| { |
| "best_metric": 3.1984846591949463, |
| "best_model_checkpoint": "./output/models/rotating-head-gp-gpt2-medium-wikitext/checkpoint-8500", |
| "epoch": 5.0, |
| "eval_steps": 500, |
| "global_step": 8910, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.05611672278338945, |
| "grad_norm": 1.7026984691619873, |
| "learning_rate": 1.1223344556677892e-05, |
| "loss": 8.993, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.1122334455667789, |
| "grad_norm": 1.4564013481140137, |
| "learning_rate": 2.2446689113355783e-05, |
| "loss": 7.3838, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.16835016835016836, |
| "grad_norm": 1.6578171253204346, |
| "learning_rate": 3.3670033670033675e-05, |
| "loss": 6.5796, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.2244668911335578, |
| "grad_norm": 2.0035881996154785, |
| "learning_rate": 4.4893378226711566e-05, |
| "loss": 6.1995, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.28058361391694725, |
| "grad_norm": 1.3597742319107056, |
| "learning_rate": 5.611672278338945e-05, |
| "loss": 5.9062, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.28058361391694725, |
| "eval_accuracy": 0.22335966032779805, |
| "eval_bleu": 0.0492964548196352, |
| "eval_loss": 5.746989727020264, |
| "eval_perplexity": 313.2462827607658, |
| "eval_runtime": 12.1113, |
| "eval_samples_per_second": 94.209, |
| "eval_steps_per_second": 1.486, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.3367003367003367, |
| "grad_norm": 1.2420822381973267, |
| "learning_rate": 6.734006734006735e-05, |
| "loss": 5.6583, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.39281705948372614, |
| "grad_norm": 1.7358133792877197, |
| "learning_rate": 7.856341189674523e-05, |
| "loss": 5.4354, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.4489337822671156, |
| "grad_norm": 1.1559091806411743, |
| "learning_rate": 8.978675645342313e-05, |
| "loss": 5.252, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.5050505050505051, |
| "grad_norm": 1.1872116327285767, |
| "learning_rate": 9.988776655443322e-05, |
| "loss": 5.0452, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.5611672278338945, |
| "grad_norm": 1.0503571033477783, |
| "learning_rate": 9.864072827035791e-05, |
| "loss": 4.8598, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.5611672278338945, |
| "eval_accuracy": 0.28109999374594663, |
| "eval_bleu": 0.06981793198064049, |
| "eval_loss": 4.74280309677124, |
| "eval_perplexity": 114.75542181664323, |
| "eval_runtime": 12.1357, |
| "eval_samples_per_second": 94.02, |
| "eval_steps_per_second": 1.483, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.6172839506172839, |
| "grad_norm": 0.9331828951835632, |
| "learning_rate": 9.73936899862826e-05, |
| "loss": 4.7116, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.6734006734006734, |
| "grad_norm": 0.8568651676177979, |
| "learning_rate": 9.614665170220725e-05, |
| "loss": 4.5871, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.7295173961840629, |
| "grad_norm": 0.7943041920661926, |
| "learning_rate": 9.489961341813194e-05, |
| "loss": 4.48, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.7856341189674523, |
| "grad_norm": 0.929481029510498, |
| "learning_rate": 9.365257513405662e-05, |
| "loss": 4.3936, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.8417508417508418, |
| "grad_norm": 0.9010036587715149, |
| "learning_rate": 9.24055368499813e-05, |
| "loss": 4.3025, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.8417508417508418, |
| "eval_accuracy": 0.31702567503082046, |
| "eval_bleu": 0.08337643356737867, |
| "eval_loss": 4.23293399810791, |
| "eval_perplexity": 68.91914446331879, |
| "eval_runtime": 12.1629, |
| "eval_samples_per_second": 93.809, |
| "eval_steps_per_second": 1.48, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.8978675645342312, |
| "grad_norm": 0.8791123628616333, |
| "learning_rate": 9.115849856590598e-05, |
| "loss": 4.2455, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.9539842873176206, |
| "grad_norm": 0.938328742980957, |
| "learning_rate": 8.991146028183066e-05, |
| "loss": 4.1605, |
| "step": 1700 |
| }, |
| { |
| "epoch": 1.0101010101010102, |
| "grad_norm": 1.0388487577438354, |
| "learning_rate": 8.866442199775533e-05, |
| "loss": 4.1011, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.0662177328843996, |
| "grad_norm": 0.9663709998130798, |
| "learning_rate": 8.741738371368002e-05, |
| "loss": 4.0148, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.122334455667789, |
| "grad_norm": 0.8401734828948975, |
| "learning_rate": 8.617034542960469e-05, |
| "loss": 3.9635, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.122334455667789, |
| "eval_accuracy": 0.34535482328872397, |
| "eval_bleu": 0.09320946829041966, |
| "eval_loss": 3.9290566444396973, |
| "eval_perplexity": 50.85897693319932, |
| "eval_runtime": 12.2142, |
| "eval_samples_per_second": 93.416, |
| "eval_steps_per_second": 1.474, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.1784511784511784, |
| "grad_norm": 1.1741111278533936, |
| "learning_rate": 8.492330714552937e-05, |
| "loss": 3.9219, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.2345679012345678, |
| "grad_norm": 1.1291131973266602, |
| "learning_rate": 8.367626886145406e-05, |
| "loss": 3.876, |
| "step": 2200 |
| }, |
| { |
| "epoch": 1.2906846240179575, |
| "grad_norm": 1.0407253503799438, |
| "learning_rate": 8.242923057737873e-05, |
| "loss": 3.8557, |
| "step": 2300 |
| }, |
| { |
| "epoch": 1.3468013468013469, |
| "grad_norm": 1.0766143798828125, |
| "learning_rate": 8.11821922933034e-05, |
| "loss": 3.8141, |
| "step": 2400 |
| }, |
| { |
| "epoch": 1.4029180695847363, |
| "grad_norm": 1.42284095287323, |
| "learning_rate": 7.993515400922809e-05, |
| "loss": 3.7769, |
| "step": 2500 |
| }, |
| { |
| "epoch": 1.4029180695847363, |
| "eval_accuracy": 0.36358581717774274, |
| "eval_bleu": 0.10197996736893164, |
| "eval_loss": 3.7426531314849854, |
| "eval_perplexity": 42.209829965401774, |
| "eval_runtime": 12.1768, |
| "eval_samples_per_second": 93.703, |
| "eval_steps_per_second": 1.478, |
| "step": 2500 |
| }, |
| { |
| "epoch": 1.4590347923681257, |
| "grad_norm": 1.3685775995254517, |
| "learning_rate": 7.868811572515277e-05, |
| "loss": 3.7592, |
| "step": 2600 |
| }, |
| { |
| "epoch": 1.5151515151515151, |
| "grad_norm": 1.6901907920837402, |
| "learning_rate": 7.744107744107744e-05, |
| "loss": 3.7311, |
| "step": 2700 |
| }, |
| { |
| "epoch": 1.5712682379349046, |
| "grad_norm": 1.2826731204986572, |
| "learning_rate": 7.619403915700213e-05, |
| "loss": 3.6989, |
| "step": 2800 |
| }, |
| { |
| "epoch": 1.627384960718294, |
| "grad_norm": 1.1707065105438232, |
| "learning_rate": 7.49470008729268e-05, |
| "loss": 3.6858, |
| "step": 2900 |
| }, |
| { |
| "epoch": 1.6835016835016834, |
| "grad_norm": 1.8538202047348022, |
| "learning_rate": 7.369996258885148e-05, |
| "loss": 3.6738, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.6835016835016834, |
| "eval_accuracy": 0.3753888436255347, |
| "eval_bleu": 0.1065937514211335, |
| "eval_loss": 3.6224589347839355, |
| "eval_perplexity": 37.429491437089155, |
| "eval_runtime": 12.1708, |
| "eval_samples_per_second": 93.749, |
| "eval_steps_per_second": 1.479, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.7396184062850728, |
| "grad_norm": 1.2428492307662964, |
| "learning_rate": 7.245292430477615e-05, |
| "loss": 3.6346, |
| "step": 3100 |
| }, |
| { |
| "epoch": 1.7957351290684624, |
| "grad_norm": 1.5533177852630615, |
| "learning_rate": 7.120588602070084e-05, |
| "loss": 3.6228, |
| "step": 3200 |
| }, |
| { |
| "epoch": 1.8518518518518519, |
| "grad_norm": 1.538769006729126, |
| "learning_rate": 6.995884773662552e-05, |
| "loss": 3.5964, |
| "step": 3300 |
| }, |
| { |
| "epoch": 1.9079685746352413, |
| "grad_norm": 0.9837027788162231, |
| "learning_rate": 6.871180945255021e-05, |
| "loss": 3.5916, |
| "step": 3400 |
| }, |
| { |
| "epoch": 1.964085297418631, |
| "grad_norm": 1.3107187747955322, |
| "learning_rate": 6.746477116847487e-05, |
| "loss": 3.5744, |
| "step": 3500 |
| }, |
| { |
| "epoch": 1.964085297418631, |
| "eval_accuracy": 0.38449063305584186, |
| "eval_bleu": 0.11184212819864767, |
| "eval_loss": 3.5325236320495605, |
| "eval_perplexity": 34.21019270752698, |
| "eval_runtime": 12.1937, |
| "eval_samples_per_second": 93.573, |
| "eval_steps_per_second": 1.476, |
| "step": 3500 |
| }, |
| { |
| "epoch": 2.0202020202020203, |
| "grad_norm": 1.3989722728729248, |
| "learning_rate": 6.621773288439955e-05, |
| "loss": 3.5381, |
| "step": 3600 |
| }, |
| { |
| "epoch": 2.0763187429854097, |
| "grad_norm": 1.5801029205322266, |
| "learning_rate": 6.497069460032424e-05, |
| "loss": 3.4865, |
| "step": 3700 |
| }, |
| { |
| "epoch": 2.132435465768799, |
| "grad_norm": 1.2122889757156372, |
| "learning_rate": 6.372365631624892e-05, |
| "loss": 3.4665, |
| "step": 3800 |
| }, |
| { |
| "epoch": 2.1885521885521886, |
| "grad_norm": 2.2837603092193604, |
| "learning_rate": 6.247661803217359e-05, |
| "loss": 3.4675, |
| "step": 3900 |
| }, |
| { |
| "epoch": 2.244668911335578, |
| "grad_norm": 1.1615939140319824, |
| "learning_rate": 6.122957974809826e-05, |
| "loss": 3.456, |
| "step": 4000 |
| }, |
| { |
| "epoch": 2.244668911335578, |
| "eval_accuracy": 0.3902143769549271, |
| "eval_bleu": 0.11388067737811994, |
| "eval_loss": 3.4704020023345947, |
| "eval_perplexity": 32.149664087333434, |
| "eval_runtime": 12.121, |
| "eval_samples_per_second": 94.134, |
| "eval_steps_per_second": 1.485, |
| "step": 4000 |
| }, |
| { |
| "epoch": 2.3007856341189674, |
| "grad_norm": 0.9502741694450378, |
| "learning_rate": 5.998254146402295e-05, |
| "loss": 3.4467, |
| "step": 4100 |
| }, |
| { |
| "epoch": 2.356902356902357, |
| "grad_norm": 1.7536747455596924, |
| "learning_rate": 5.8735503179947625e-05, |
| "loss": 3.4298, |
| "step": 4200 |
| }, |
| { |
| "epoch": 2.4130190796857462, |
| "grad_norm": 1.4674713611602783, |
| "learning_rate": 5.748846489587231e-05, |
| "loss": 3.4221, |
| "step": 4300 |
| }, |
| { |
| "epoch": 2.4691358024691357, |
| "grad_norm": 2.0394678115844727, |
| "learning_rate": 5.624142661179699e-05, |
| "loss": 3.4072, |
| "step": 4400 |
| }, |
| { |
| "epoch": 2.525252525252525, |
| "grad_norm": 2.8717079162597656, |
| "learning_rate": 5.4994388327721666e-05, |
| "loss": 3.3972, |
| "step": 4500 |
| }, |
| { |
| "epoch": 2.525252525252525, |
| "eval_accuracy": 0.3955106177548291, |
| "eval_bleu": 0.12298741482321522, |
| "eval_loss": 3.4189839363098145, |
| "eval_perplexity": 30.53837032278329, |
| "eval_runtime": 12.1708, |
| "eval_samples_per_second": 93.749, |
| "eval_steps_per_second": 1.479, |
| "step": 4500 |
| }, |
| { |
| "epoch": 2.581369248035915, |
| "grad_norm": 2.2189624309539795, |
| "learning_rate": 5.374735004364634e-05, |
| "loss": 3.4006, |
| "step": 4600 |
| }, |
| { |
| "epoch": 2.637485970819304, |
| "grad_norm": 1.444754719734192, |
| "learning_rate": 5.250031175957102e-05, |
| "loss": 3.3886, |
| "step": 4700 |
| }, |
| { |
| "epoch": 2.6936026936026938, |
| "grad_norm": 1.8333204984664917, |
| "learning_rate": 5.12532734754957e-05, |
| "loss": 3.3813, |
| "step": 4800 |
| }, |
| { |
| "epoch": 2.749719416386083, |
| "grad_norm": 2.1033811569213867, |
| "learning_rate": 5.000623519142038e-05, |
| "loss": 3.372, |
| "step": 4900 |
| }, |
| { |
| "epoch": 2.8058361391694726, |
| "grad_norm": 1.8956849575042725, |
| "learning_rate": 4.8759196907345056e-05, |
| "loss": 3.3654, |
| "step": 5000 |
| }, |
| { |
| "epoch": 2.8058361391694726, |
| "eval_accuracy": 0.40071690299277873, |
| "eval_bleu": 0.12304297750670024, |
| "eval_loss": 3.368644952774048, |
| "eval_perplexity": 29.039150964630583, |
| "eval_runtime": 12.2072, |
| "eval_samples_per_second": 93.469, |
| "eval_steps_per_second": 1.475, |
| "step": 5000 |
| }, |
| { |
| "epoch": 2.861952861952862, |
| "grad_norm": 2.0555260181427, |
| "learning_rate": 4.751215862326974e-05, |
| "loss": 3.3622, |
| "step": 5100 |
| }, |
| { |
| "epoch": 2.9180695847362514, |
| "grad_norm": 1.1492657661437988, |
| "learning_rate": 4.626512033919442e-05, |
| "loss": 3.3413, |
| "step": 5200 |
| }, |
| { |
| "epoch": 2.974186307519641, |
| "grad_norm": 2.6185925006866455, |
| "learning_rate": 4.5018082055119096e-05, |
| "loss": 3.3452, |
| "step": 5300 |
| }, |
| { |
| "epoch": 3.0303030303030303, |
| "grad_norm": 1.4890856742858887, |
| "learning_rate": 4.3771043771043774e-05, |
| "loss": 3.2908, |
| "step": 5400 |
| }, |
| { |
| "epoch": 3.0864197530864197, |
| "grad_norm": 2.316535711288452, |
| "learning_rate": 4.252400548696845e-05, |
| "loss": 3.247, |
| "step": 5500 |
| }, |
| { |
| "epoch": 3.0864197530864197, |
| "eval_accuracy": 0.40426800589080425, |
| "eval_bleu": 0.1247222065482489, |
| "eval_loss": 3.3328051567077637, |
| "eval_perplexity": 28.016823154790686, |
| "eval_runtime": 12.2011, |
| "eval_samples_per_second": 93.516, |
| "eval_steps_per_second": 1.475, |
| "step": 5500 |
| }, |
| { |
| "epoch": 3.142536475869809, |
| "grad_norm": 1.5401073694229126, |
| "learning_rate": 4.127696720289313e-05, |
| "loss": 3.253, |
| "step": 5600 |
| }, |
| { |
| "epoch": 3.1986531986531985, |
| "grad_norm": 1.508957028388977, |
| "learning_rate": 4.002992891881781e-05, |
| "loss": 3.2477, |
| "step": 5700 |
| }, |
| { |
| "epoch": 3.254769921436588, |
| "grad_norm": 1.5511479377746582, |
| "learning_rate": 3.8782890634742486e-05, |
| "loss": 3.2456, |
| "step": 5800 |
| }, |
| { |
| "epoch": 3.3108866442199774, |
| "grad_norm": 1.5875085592269897, |
| "learning_rate": 3.7535852350667164e-05, |
| "loss": 3.2472, |
| "step": 5900 |
| }, |
| { |
| "epoch": 3.3670033670033668, |
| "grad_norm": 1.1592992544174194, |
| "learning_rate": 3.628881406659185e-05, |
| "loss": 3.2403, |
| "step": 6000 |
| }, |
| { |
| "epoch": 3.3670033670033668, |
| "eval_accuracy": 0.40832457337503847, |
| "eval_bleu": 0.129799477764246, |
| "eval_loss": 3.298476457595825, |
| "eval_perplexity": 27.07136311627636, |
| "eval_runtime": 12.1774, |
| "eval_samples_per_second": 93.699, |
| "eval_steps_per_second": 1.478, |
| "step": 6000 |
| }, |
| { |
| "epoch": 3.4231200897867566, |
| "grad_norm": 1.2478611469268799, |
| "learning_rate": 3.504177578251652e-05, |
| "loss": 3.2187, |
| "step": 6100 |
| }, |
| { |
| "epoch": 3.479236812570146, |
| "grad_norm": 1.8688626289367676, |
| "learning_rate": 3.3794737498441205e-05, |
| "loss": 3.2313, |
| "step": 6200 |
| }, |
| { |
| "epoch": 3.5353535353535355, |
| "grad_norm": 1.8166719675064087, |
| "learning_rate": 3.254769921436588e-05, |
| "loss": 3.2192, |
| "step": 6300 |
| }, |
| { |
| "epoch": 3.591470258136925, |
| "grad_norm": 1.6677237749099731, |
| "learning_rate": 3.130066093029056e-05, |
| "loss": 3.221, |
| "step": 6400 |
| }, |
| { |
| "epoch": 3.6475869809203143, |
| "grad_norm": 1.4927235841751099, |
| "learning_rate": 3.0053622646215242e-05, |
| "loss": 3.2167, |
| "step": 6500 |
| }, |
| { |
| "epoch": 3.6475869809203143, |
| "eval_accuracy": 0.4111534616185319, |
| "eval_bleu": 0.1288262427188711, |
| "eval_loss": 3.269272565841675, |
| "eval_perplexity": 26.292206536048766, |
| "eval_runtime": 12.1618, |
| "eval_samples_per_second": 93.819, |
| "eval_steps_per_second": 1.48, |
| "step": 6500 |
| }, |
| { |
| "epoch": 3.7037037037037037, |
| "grad_norm": 1.9368300437927246, |
| "learning_rate": 2.880658436213992e-05, |
| "loss": 3.2084, |
| "step": 6600 |
| }, |
| { |
| "epoch": 3.759820426487093, |
| "grad_norm": 1.1655679941177368, |
| "learning_rate": 2.7559546078064598e-05, |
| "loss": 3.1997, |
| "step": 6700 |
| }, |
| { |
| "epoch": 3.8159371492704826, |
| "grad_norm": 1.2166575193405151, |
| "learning_rate": 2.6312507793989276e-05, |
| "loss": 3.1889, |
| "step": 6800 |
| }, |
| { |
| "epoch": 3.872053872053872, |
| "grad_norm": 1.2495133876800537, |
| "learning_rate": 2.5065469509913957e-05, |
| "loss": 3.1814, |
| "step": 6900 |
| }, |
| { |
| "epoch": 3.9281705948372614, |
| "grad_norm": 1.4042552709579468, |
| "learning_rate": 2.3818431225838632e-05, |
| "loss": 3.1903, |
| "step": 7000 |
| }, |
| { |
| "epoch": 3.9281705948372614, |
| "eval_accuracy": 0.41343918961176035, |
| "eval_bleu": 0.13053470151276936, |
| "eval_loss": 3.2455883026123047, |
| "eval_perplexity": 25.67681135292233, |
| "eval_runtime": 12.1742, |
| "eval_samples_per_second": 93.723, |
| "eval_steps_per_second": 1.479, |
| "step": 7000 |
| }, |
| { |
| "epoch": 3.984287317620651, |
| "grad_norm": 1.472741723060608, |
| "learning_rate": 2.2571392941763313e-05, |
| "loss": 3.1863, |
| "step": 7100 |
| }, |
| { |
| "epoch": 4.040404040404041, |
| "grad_norm": 1.0730637311935425, |
| "learning_rate": 2.132435465768799e-05, |
| "loss": 3.1197, |
| "step": 7200 |
| }, |
| { |
| "epoch": 4.09652076318743, |
| "grad_norm": 1.275009036064148, |
| "learning_rate": 2.007731637361267e-05, |
| "loss": 3.1142, |
| "step": 7300 |
| }, |
| { |
| "epoch": 4.1526374859708195, |
| "grad_norm": 1.156083583831787, |
| "learning_rate": 1.883027808953735e-05, |
| "loss": 3.1174, |
| "step": 7400 |
| }, |
| { |
| "epoch": 4.2087542087542085, |
| "grad_norm": 1.0556427240371704, |
| "learning_rate": 1.758323980546203e-05, |
| "loss": 3.1212, |
| "step": 7500 |
| }, |
| { |
| "epoch": 4.2087542087542085, |
| "eval_accuracy": 0.4161310027132311, |
| "eval_bleu": 0.1325477390562919, |
| "eval_loss": 3.2261738777160645, |
| "eval_perplexity": 25.1831187134555, |
| "eval_runtime": 12.1902, |
| "eval_samples_per_second": 93.6, |
| "eval_steps_per_second": 1.477, |
| "step": 7500 |
| }, |
| { |
| "epoch": 4.264870931537598, |
| "grad_norm": 2.7228903770446777, |
| "learning_rate": 1.6336201521386706e-05, |
| "loss": 3.1073, |
| "step": 7600 |
| }, |
| { |
| "epoch": 4.320987654320987, |
| "grad_norm": 1.526665210723877, |
| "learning_rate": 1.5089163237311384e-05, |
| "loss": 3.0936, |
| "step": 7700 |
| }, |
| { |
| "epoch": 4.377104377104377, |
| "grad_norm": 1.0413861274719238, |
| "learning_rate": 1.3842124953236066e-05, |
| "loss": 3.104, |
| "step": 7800 |
| }, |
| { |
| "epoch": 4.433221099887767, |
| "grad_norm": 1.394884467124939, |
| "learning_rate": 1.2595086669160744e-05, |
| "loss": 3.1018, |
| "step": 7900 |
| }, |
| { |
| "epoch": 4.489337822671156, |
| "grad_norm": 1.2792820930480957, |
| "learning_rate": 1.1348048385085423e-05, |
| "loss": 3.0816, |
| "step": 8000 |
| }, |
| { |
| "epoch": 4.489337822671156, |
| "eval_accuracy": 0.4176114142470762, |
| "eval_bleu": 0.13073000122523024, |
| "eval_loss": 3.2127764225006104, |
| "eval_perplexity": 24.847979030639497, |
| "eval_runtime": 12.2049, |
| "eval_samples_per_second": 93.487, |
| "eval_steps_per_second": 1.475, |
| "step": 8000 |
| }, |
| { |
| "epoch": 4.545454545454545, |
| "grad_norm": 1.173828363418579, |
| "learning_rate": 1.0101010101010101e-05, |
| "loss": 3.0993, |
| "step": 8100 |
| }, |
| { |
| "epoch": 4.601571268237935, |
| "grad_norm": 1.09369957447052, |
| "learning_rate": 8.853971816934781e-06, |
| "loss": 3.0905, |
| "step": 8200 |
| }, |
| { |
| "epoch": 4.657687991021325, |
| "grad_norm": 1.0657522678375244, |
| "learning_rate": 7.606933532859459e-06, |
| "loss": 3.097, |
| "step": 8300 |
| }, |
| { |
| "epoch": 4.713804713804714, |
| "grad_norm": 1.1146267652511597, |
| "learning_rate": 6.359895248784138e-06, |
| "loss": 3.0896, |
| "step": 8400 |
| }, |
| { |
| "epoch": 4.7699214365881035, |
| "grad_norm": 2.1886322498321533, |
| "learning_rate": 5.112856964708817e-06, |
| "loss": 3.0917, |
| "step": 8500 |
| }, |
| { |
| "epoch": 4.7699214365881035, |
| "eval_accuracy": 0.4195613081423491, |
| "eval_bleu": 0.13385511134755296, |
| "eval_loss": 3.1984846591949463, |
| "eval_perplexity": 24.49538320533492, |
| "eval_runtime": 12.1678, |
| "eval_samples_per_second": 93.772, |
| "eval_steps_per_second": 1.479, |
| "step": 8500 |
| }, |
| { |
| "epoch": 4.8260381593714925, |
| "grad_norm": 1.1058974266052246, |
| "learning_rate": 3.865818680633495e-06, |
| "loss": 3.0822, |
| "step": 8600 |
| }, |
| { |
| "epoch": 4.882154882154882, |
| "grad_norm": 1.3942530155181885, |
| "learning_rate": 2.6187803965581742e-06, |
| "loss": 3.0923, |
| "step": 8700 |
| }, |
| { |
| "epoch": 4.938271604938271, |
| "grad_norm": 1.3995310068130493, |
| "learning_rate": 1.3717421124828533e-06, |
| "loss": 3.088, |
| "step": 8800 |
| }, |
| { |
| "epoch": 4.994388327721661, |
| "grad_norm": 1.1968387365341187, |
| "learning_rate": 1.2470382840753213e-07, |
| "loss": 3.0789, |
| "step": 8900 |
| }, |
| { |
| "epoch": 5.0, |
| "step": 8910, |
| "total_flos": 1.0587061010143642e+18, |
| "train_loss": 3.767289323945907, |
| "train_runtime": 15738.6672, |
| "train_samples_per_second": 36.215, |
| "train_steps_per_second": 0.566 |
| } |
| ], |
| "logging_steps": 100, |
| "max_steps": 8910, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "EarlyStoppingCallback": { |
| "args": { |
| "early_stopping_patience": 2, |
| "early_stopping_threshold": 0.0 |
| }, |
| "attributes": { |
| "early_stopping_patience_counter": 0 |
| } |
| }, |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.0587061010143642e+18, |
| "train_batch_size": 64, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|