| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.8040275275968778, |
| "eval_steps": 1024, |
| "global_step": 17408, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.011823934229365849, |
| "grad_norm": 0.7295829057693481, |
| "learning_rate": 0.000498046875, |
| "loss": 1.8308284282684326, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.023647868458731697, |
| "grad_norm": 0.8052719831466675, |
| "learning_rate": 0.000998046875, |
| "loss": 1.8338860273361206, |
| "step": 512 |
| }, |
| { |
| "epoch": 0.03547180268809755, |
| "grad_norm": 0.8997901678085327, |
| "learning_rate": 0.000999640996023194, |
| "loss": 1.8343064785003662, |
| "step": 768 |
| }, |
| { |
| "epoch": 0.047295736917463395, |
| "grad_norm": 0.7250568270683289, |
| "learning_rate": 0.0009985588674043958, |
| "loss": 1.8296632766723633, |
| "step": 1024 |
| }, |
| { |
| "epoch": 0.047295736917463395, |
| "eval_beta_ce_loss": 0.6351920716022248, |
| "eval_bleu": 0.4998693474981933, |
| "eval_loss": 1.8211502122552428, |
| "eval_uni_ce_loss": 0.5507660694590443, |
| "step": 1024 |
| }, |
| { |
| "epoch": 0.047295736917463395, |
| "eval_beta_ce_loss": 0.6351920716022248, |
| "eval_bleu": 0.4998693474981933, |
| "eval_loss": 1.8211502122552428, |
| "eval_runtime": 139.8967, |
| "eval_samples_per_second": 200.098, |
| "eval_steps_per_second": 3.131, |
| "eval_uni_ce_loss": 0.5507660694590443, |
| "step": 1024 |
| }, |
| { |
| "epoch": 0.05911967114682925, |
| "grad_norm": 0.7675038576126099, |
| "learning_rate": 0.0009967551747861387, |
| "loss": 1.8191157579421997, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.0709436053761951, |
| "grad_norm": 0.8503870368003845, |
| "learning_rate": 0.000994232528651847, |
| "loss": 1.8177354335784912, |
| "step": 1536 |
| }, |
| { |
| "epoch": 0.08276753960556095, |
| "grad_norm": 0.8022050261497498, |
| "learning_rate": 0.0009909945800260092, |
| "loss": 1.807867407798767, |
| "step": 1792 |
| }, |
| { |
| "epoch": 0.09459147383492679, |
| "grad_norm": 0.9533084034919739, |
| "learning_rate": 0.0009870460151900522, |
| "loss": 1.8013904094696045, |
| "step": 2048 |
| }, |
| { |
| "epoch": 0.09459147383492679, |
| "eval_beta_ce_loss": 0.6264337543483194, |
| "eval_bleu": 0.49771606091961523, |
| "eval_loss": 1.7982131136606818, |
| "eval_uni_ce_loss": 0.545345604555792, |
| "step": 2048 |
| }, |
| { |
| "epoch": 0.09459147383492679, |
| "eval_beta_ce_loss": 0.6264337543483194, |
| "eval_bleu": 0.49771606091961523, |
| "eval_loss": 1.7982131136606818, |
| "eval_runtime": 133.1265, |
| "eval_samples_per_second": 210.274, |
| "eval_steps_per_second": 3.29, |
| "eval_uni_ce_loss": 0.545345604555792, |
| "step": 2048 |
| }, |
| { |
| "epoch": 0.10641540806429264, |
| "grad_norm": 0.9694677591323853, |
| "learning_rate": 0.0009823925488998885, |
| "loss": 1.8004993200302124, |
| "step": 2304 |
| }, |
| { |
| "epoch": 0.1182393422936585, |
| "grad_norm": 0.8076265454292297, |
| "learning_rate": 0.0009770409161149525, |
| "loss": 1.7975847721099854, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.13006327652302435, |
| "grad_norm": 0.7418428063392639, |
| "learning_rate": 0.0009709988622506973, |
| "loss": 1.789136528968811, |
| "step": 2816 |
| }, |
| { |
| "epoch": 0.1418872107523902, |
| "grad_norm": 0.7103869915008545, |
| "learning_rate": 0.000964275131968659, |
| "loss": 1.7754944562911987, |
| "step": 3072 |
| }, |
| { |
| "epoch": 0.1418872107523902, |
| "eval_beta_ce_loss": 0.620368564918161, |
| "eval_bleu": 0.49887627167388116, |
| "eval_loss": 1.7806649703413384, |
| "eval_uni_ce_loss": 0.5399278415936858, |
| "step": 3072 |
| }, |
| { |
| "epoch": 0.1418872107523902, |
| "eval_beta_ce_loss": 0.620368564918161, |
| "eval_bleu": 0.49887627167388116, |
| "eval_loss": 1.7806649703413384, |
| "eval_runtime": 134.7634, |
| "eval_samples_per_second": 207.72, |
| "eval_steps_per_second": 3.25, |
| "eval_uni_ce_loss": 0.5399278415936858, |
| "step": 3072 |
| }, |
| { |
| "epoch": 0.15371114498175603, |
| "grad_norm": 0.7851372957229614, |
| "learning_rate": 0.0009568794565203123, |
| "loss": 1.7792595624923706, |
| "step": 3328 |
| }, |
| { |
| "epoch": 0.1655350792111219, |
| "grad_norm": 0.7782461047172546, |
| "learning_rate": 0.0009488225396630347, |
| "loss": 1.7753657102584839, |
| "step": 3584 |
| }, |
| { |
| "epoch": 0.17735901344048774, |
| "grad_norm": 0.7312053442001343, |
| "learning_rate": 0.0009401160421685646, |
| "loss": 1.769174575805664, |
| "step": 3840 |
| }, |
| { |
| "epoch": 0.18918294766985358, |
| "grad_norm": 0.8110184669494629, |
| "learning_rate": 0.0009307725649463714, |
| "loss": 1.769570231437683, |
| "step": 4096 |
| }, |
| { |
| "epoch": 0.18918294766985358, |
| "eval_beta_ce_loss": 0.6151743780260217, |
| "eval_bleu": 0.5008352932587944, |
| "eval_loss": 1.766069833818636, |
| "eval_uni_ce_loss": 0.5357210769500906, |
| "step": 4096 |
| }, |
| { |
| "epoch": 0.18918294766985358, |
| "eval_beta_ce_loss": 0.6151743780260217, |
| "eval_bleu": 0.5008352932587944, |
| "eval_loss": 1.766069833818636, |
| "eval_runtime": 135.334, |
| "eval_samples_per_second": 206.844, |
| "eval_steps_per_second": 3.236, |
| "eval_uni_ce_loss": 0.5357210769500906, |
| "step": 4096 |
| }, |
| { |
| "epoch": 0.20100688189921945, |
| "grad_norm": 0.9300932288169861, |
| "learning_rate": 0.0009208056308063659, |
| "loss": 1.7624573707580566, |
| "step": 4352 |
| }, |
| { |
| "epoch": 0.2128308161285853, |
| "grad_norm": 0.7691043019294739, |
| "learning_rate": 0.0009102296648873445, |
| "loss": 1.7650220394134521, |
| "step": 4608 |
| }, |
| { |
| "epoch": 0.22465475035795113, |
| "grad_norm": 0.8132173418998718, |
| "learning_rate": 0.0008990599737794927, |
| "loss": 1.761474370956421, |
| "step": 4864 |
| }, |
| { |
| "epoch": 0.236478684587317, |
| "grad_norm": 0.7635871171951294, |
| "learning_rate": 0.0008873127233711644, |
| "loss": 1.7527934312820435, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.236478684587317, |
| "eval_beta_ce_loss": 0.611639610028158, |
| "eval_bleu": 0.5024511500541371, |
| "eval_loss": 1.753186773763944, |
| "eval_uni_ce_loss": 0.5299075521426658, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.236478684587317, |
| "eval_beta_ce_loss": 0.611639610028158, |
| "eval_bleu": 0.5024511500541371, |
| "eval_loss": 1.753186773763944, |
| "eval_runtime": 135.0739, |
| "eval_samples_per_second": 207.242, |
| "eval_steps_per_second": 3.243, |
| "eval_uni_ce_loss": 0.5299075521426658, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.24830261881668284, |
| "grad_norm": 0.7438808083534241, |
| "learning_rate": 0.0008750049154520011, |
| "loss": 1.7492367029190063, |
| "step": 5376 |
| }, |
| { |
| "epoch": 0.2601265530460487, |
| "grad_norm": 0.7799262404441833, |
| "learning_rate": 0.0008621543631062487, |
| "loss": 1.744637131690979, |
| "step": 5632 |
| }, |
| { |
| "epoch": 0.27195048727541454, |
| "grad_norm": 0.7694032192230225, |
| "learning_rate": 0.0008487796649318904, |
| "loss": 1.7494758367538452, |
| "step": 5888 |
| }, |
| { |
| "epoch": 0.2837744215047804, |
| "grad_norm": 0.7369454503059387, |
| "learning_rate": 0.0008349001781229053, |
| "loss": 1.7494639158248901, |
| "step": 6144 |
| }, |
| { |
| "epoch": 0.2837744215047804, |
| "eval_beta_ce_loss": 0.6058613158524309, |
| "eval_bleu": 0.5028032064859724, |
| "eval_loss": 1.739504432841523, |
| "eval_uni_ce_loss": 0.5277818014088287, |
| "step": 6144 |
| }, |
| { |
| "epoch": 0.2837744215047804, |
| "eval_beta_ce_loss": 0.6058613158524309, |
| "eval_bleu": 0.5028032064859724, |
| "eval_loss": 1.739504432841523, |
| "eval_runtime": 136.5059, |
| "eval_samples_per_second": 205.068, |
| "eval_steps_per_second": 3.209, |
| "eval_uni_ce_loss": 0.5277818014088287, |
| "step": 6144 |
| }, |
| { |
| "epoch": 0.2955983557341462, |
| "grad_norm": 0.8735672235488892, |
| "learning_rate": 0.0008205359904536107, |
| "loss": 1.7374849319458008, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.30742228996351206, |
| "grad_norm": 1.0139427185058594, |
| "learning_rate": 0.0008057078912056363, |
| "loss": 1.7405836582183838, |
| "step": 6656 |
| }, |
| { |
| "epoch": 0.3192462241928779, |
| "grad_norm": 1.0008057355880737, |
| "learning_rate": 0.0007904373410796086, |
| "loss": 1.7392584085464478, |
| "step": 6912 |
| }, |
| { |
| "epoch": 0.3310701584222438, |
| "grad_norm": 0.8208015561103821, |
| "learning_rate": 0.0007747464411350876, |
| "loss": 1.7316131591796875, |
| "step": 7168 |
| }, |
| { |
| "epoch": 0.3310701584222438, |
| "eval_beta_ce_loss": 0.6033149586148459, |
| "eval_bleu": 0.5017084521574802, |
| "eval_loss": 1.7307289455034962, |
| "eval_uni_ce_loss": 0.5240990275253444, |
| "step": 7168 |
| }, |
| { |
| "epoch": 0.3310701584222438, |
| "eval_beta_ce_loss": 0.6033149586148459, |
| "eval_bleu": 0.5017084521574802, |
| "eval_loss": 1.7307289455034962, |
| "eval_runtime": 136.4837, |
| "eval_samples_per_second": 205.101, |
| "eval_steps_per_second": 3.209, |
| "eval_uni_ce_loss": 0.5240990275253444, |
| "step": 7168 |
| }, |
| { |
| "epoch": 0.34289409265160964, |
| "grad_norm": 0.9193022847175598, |
| "learning_rate": 0.000758657900803716, |
| "loss": 1.7337878942489624, |
| "step": 7424 |
| }, |
| { |
| "epoch": 0.3547180268809755, |
| "grad_norm": 0.7078978419303894, |
| "learning_rate": 0.000742195005021869, |
| "loss": 1.7295589447021484, |
| "step": 7680 |
| }, |
| { |
| "epoch": 0.3665419611103413, |
| "grad_norm": 0.7894092798233032, |
| "learning_rate": 0.0007253815805303786, |
| "loss": 1.723784327507019, |
| "step": 7936 |
| }, |
| { |
| "epoch": 0.37836589533970716, |
| "grad_norm": 0.7812065482139587, |
| "learning_rate": 0.0007082419613901028, |
| "loss": 1.7162002325057983, |
| "step": 8192 |
| }, |
| { |
| "epoch": 0.37836589533970716, |
| "eval_beta_ce_loss": 0.6003280380000807, |
| "eval_bleu": 0.5041936063198118, |
| "eval_loss": 1.7231203064526597, |
| "eval_uni_ce_loss": 0.5224642294318709, |
| "step": 8192 |
| }, |
| { |
| "epoch": 0.37836589533970716, |
| "eval_beta_ce_loss": 0.6003280380000807, |
| "eval_bleu": 0.5041936063198118, |
| "eval_loss": 1.7231203064526597, |
| "eval_runtime": 136.1645, |
| "eval_samples_per_second": 205.582, |
| "eval_steps_per_second": 3.217, |
| "eval_uni_ce_loss": 0.5224642294318709, |
| "step": 8192 |
| }, |
| { |
| "epoch": 0.390189829569073, |
| "grad_norm": 0.6728302836418152, |
| "learning_rate": 0.0006908009537632514, |
| "loss": 1.7215343713760376, |
| "step": 8448 |
| }, |
| { |
| "epoch": 0.4020137637984389, |
| "grad_norm": 0.6643648743629456, |
| "learning_rate": 0.0006730838000114403, |
| "loss": 1.7175226211547852, |
| "step": 8704 |
| }, |
| { |
| "epoch": 0.41383769802780473, |
| "grad_norm": 0.6722604036331177, |
| "learning_rate": 0.0006551161421624341, |
| "loss": 1.7141364812850952, |
| "step": 8960 |
| }, |
| { |
| "epoch": 0.4256616322571706, |
| "grad_norm": 0.7661899328231812, |
| "learning_rate": 0.0006369239847984517, |
| "loss": 1.7067075967788696, |
| "step": 9216 |
| }, |
| { |
| "epoch": 0.4256616322571706, |
| "eval_beta_ce_loss": 0.5960702291906697, |
| "eval_bleu": 0.508842331362903, |
| "eval_loss": 1.7111802789718593, |
| "eval_uni_ce_loss": 0.5190398215431057, |
| "step": 9216 |
| }, |
| { |
| "epoch": 0.4256616322571706, |
| "eval_beta_ce_loss": 0.5960702291906697, |
| "eval_bleu": 0.508842331362903, |
| "eval_loss": 1.7111802789718593, |
| "eval_runtime": 135.4442, |
| "eval_samples_per_second": 206.676, |
| "eval_steps_per_second": 3.234, |
| "eval_uni_ce_loss": 0.5190398215431057, |
| "step": 9216 |
| }, |
| { |
| "epoch": 0.4374855664865364, |
| "grad_norm": 0.7040799260139465, |
| "learning_rate": 0.0006185336574197479, |
| "loss": 1.7019784450531006, |
| "step": 9472 |
| }, |
| { |
| "epoch": 0.44930950071590225, |
| "grad_norm": 0.7697860598564148, |
| "learning_rate": 0.0005999717763379407, |
| "loss": 1.7126665115356445, |
| "step": 9728 |
| }, |
| { |
| "epoch": 0.4611334349452681, |
| "grad_norm": 0.7353936433792114, |
| "learning_rate": 0.0005812652061542363, |
| "loss": 1.7056387662887573, |
| "step": 9984 |
| }, |
| { |
| "epoch": 0.472957369174634, |
| "grad_norm": 0.8408072590827942, |
| "learning_rate": 0.0005624410208783071, |
| "loss": 1.7016122341156006, |
| "step": 10240 |
| }, |
| { |
| "epoch": 0.472957369174634, |
| "eval_beta_ce_loss": 0.5948017802276567, |
| "eval_bleu": 0.5119743719435753, |
| "eval_loss": 1.705481736083009, |
| "eval_uni_ce_loss": 0.5158781752874863, |
| "step": 10240 |
| }, |
| { |
| "epoch": 0.472957369174634, |
| "eval_beta_ce_loss": 0.5948017802276567, |
| "eval_bleu": 0.5119743719435753, |
| "eval_loss": 1.705481736083009, |
| "eval_runtime": 134.7944, |
| "eval_samples_per_second": 207.672, |
| "eval_steps_per_second": 3.249, |
| "eval_uni_ce_loss": 0.5158781752874863, |
| "step": 10240 |
| }, |
| { |
| "epoch": 0.48478130340399983, |
| "grad_norm": 0.6450205445289612, |
| "learning_rate": 0.0005435264647440881, |
| "loss": 1.7052643299102783, |
| "step": 10496 |
| }, |
| { |
| "epoch": 0.49660523763336567, |
| "grad_norm": 0.6514373421669006, |
| "learning_rate": 0.000524548912779213, |
| "loss": 1.69474458694458, |
| "step": 10752 |
| }, |
| { |
| "epoch": 0.5084291718627315, |
| "grad_norm": 0.7065825462341309, |
| "learning_rate": 0.0005055358311851499, |
| "loss": 1.7007395029067993, |
| "step": 11008 |
| }, |
| { |
| "epoch": 0.5202531060920974, |
| "grad_norm": 0.8027070164680481, |
| "learning_rate": 0.0004865147375853812, |
| "loss": 1.696798324584961, |
| "step": 11264 |
| }, |
| { |
| "epoch": 0.5202531060920974, |
| "eval_beta_ce_loss": 0.5911326601625033, |
| "eval_bleu": 0.5082277536286182, |
| "eval_loss": 1.6985954277591617, |
| "eval_uni_ce_loss": 0.5163301084547827, |
| "step": 11264 |
| }, |
| { |
| "epoch": 0.5202531060920974, |
| "eval_beta_ce_loss": 0.5911326601625033, |
| "eval_bleu": 0.5082277536286182, |
| "eval_loss": 1.6985954277591617, |
| "eval_runtime": 138.4679, |
| "eval_samples_per_second": 202.162, |
| "eval_steps_per_second": 3.163, |
| "eval_uni_ce_loss": 0.5163301084547827, |
| "step": 11264 |
| }, |
| { |
| "epoch": 0.5320770403214632, |
| "grad_norm": 0.7604843974113464, |
| "learning_rate": 0.0004675131611991607, |
| "loss": 1.6915127038955688, |
| "step": 11520 |
| }, |
| { |
| "epoch": 0.5439009745508291, |
| "grad_norm": 0.6604936718940735, |
| "learning_rate": 0.0004485586029984899, |
| "loss": 1.692893147468567, |
| "step": 11776 |
| }, |
| { |
| "epoch": 0.5557249087801949, |
| "grad_norm": 0.7172213792800903, |
| "learning_rate": 0.00042967849590597266, |
| "loss": 1.6868789196014404, |
| "step": 12032 |
| }, |
| { |
| "epoch": 0.5675488430095608, |
| "grad_norm": 0.7629127502441406, |
| "learning_rate": 0.0004109001650911621, |
| "loss": 1.6939620971679688, |
| "step": 12288 |
| }, |
| { |
| "epoch": 0.5675488430095608, |
| "eval_beta_ce_loss": 0.5891192779965597, |
| "eval_bleu": 0.5114094116436015, |
| "eval_loss": 1.6906098777845027, |
| "eval_uni_ce_loss": 0.5123713216552995, |
| "step": 12288 |
| }, |
| { |
| "epoch": 0.5675488430095608, |
| "eval_beta_ce_loss": 0.5891192779965597, |
| "eval_bleu": 0.5114094116436015, |
| "eval_loss": 1.6906098777845027, |
| "eval_runtime": 135.3599, |
| "eval_samples_per_second": 206.804, |
| "eval_steps_per_second": 3.236, |
| "eval_uni_ce_loss": 0.5123713216552995, |
| "step": 12288 |
| }, |
| { |
| "epoch": 0.5793727772389267, |
| "grad_norm": 0.6822903156280518, |
| "learning_rate": 0.0003922507884228551, |
| "loss": 1.6874535083770752, |
| "step": 12544 |
| }, |
| { |
| "epoch": 0.5911967114682924, |
| "grad_norm": 0.7348518371582031, |
| "learning_rate": 0.00037375735713457723, |
| "loss": 1.6786881685256958, |
| "step": 12800 |
| }, |
| { |
| "epoch": 0.6030206456976583, |
| "grad_norm": 0.797898530960083, |
| "learning_rate": 0.00035544663676018276, |
| "loss": 1.6823772192001343, |
| "step": 13056 |
| }, |
| { |
| "epoch": 0.6148445799270241, |
| "grad_norm": 0.666439950466156, |
| "learning_rate": 0.00033734512839611255, |
| "loss": 1.6804336309432983, |
| "step": 13312 |
| }, |
| { |
| "epoch": 0.6148445799270241, |
| "eval_beta_ce_loss": 0.5856027225517246, |
| "eval_bleu": 0.5110656031508146, |
| "eval_loss": 1.6819111756538148, |
| "eval_uni_ce_loss": 0.5107057281689013, |
| "step": 13312 |
| }, |
| { |
| "epoch": 0.6148445799270241, |
| "eval_beta_ce_loss": 0.5856027225517246, |
| "eval_bleu": 0.5110656031508146, |
| "eval_loss": 1.6819111756538148, |
| "eval_runtime": 135.0756, |
| "eval_samples_per_second": 207.24, |
| "eval_steps_per_second": 3.243, |
| "eval_uni_ce_loss": 0.5107057281689013, |
| "step": 13312 |
| }, |
| { |
| "epoch": 0.62666851415639, |
| "grad_norm": 0.9536261558532715, |
| "learning_rate": 0.0003194790303463687, |
| "loss": 1.6719762086868286, |
| "step": 13568 |
| }, |
| { |
| "epoch": 0.6384924483857558, |
| "grad_norm": 0.7538688778877258, |
| "learning_rate": 0.00030187420020572406, |
| "loss": 1.6798793077468872, |
| "step": 13824 |
| }, |
| { |
| "epoch": 0.6503163826151217, |
| "grad_norm": 0.6934835910797119, |
| "learning_rate": 0.00028455611743603626, |
| "loss": 1.6709473133087158, |
| "step": 14080 |
| }, |
| { |
| "epoch": 0.6621403168444876, |
| "grad_norm": 0.7217169404029846, |
| "learning_rate": 0.0002675498464898373, |
| "loss": 1.6749387979507446, |
| "step": 14336 |
| }, |
| { |
| "epoch": 0.6621403168444876, |
| "eval_beta_ce_loss": 0.5853756803627972, |
| "eval_bleu": 0.5112546154669725, |
| "eval_loss": 1.6793161832034316, |
| "eval_uni_ce_loss": 0.5085648222737116, |
| "step": 14336 |
| }, |
| { |
| "epoch": 0.6621403168444876, |
| "eval_beta_ce_loss": 0.5853756803627972, |
| "eval_bleu": 0.5112546154669725, |
| "eval_loss": 1.6793161832034316, |
| "eval_runtime": 137.7212, |
| "eval_samples_per_second": 203.258, |
| "eval_steps_per_second": 3.18, |
| "eval_uni_ce_loss": 0.5085648222737116, |
| "step": 14336 |
| }, |
| { |
| "epoch": 0.6739642510738534, |
| "grad_norm": 0.8353447914123535, |
| "learning_rate": 0.0002508800005345623, |
| "loss": 1.679425835609436, |
| "step": 14592 |
| }, |
| { |
| "epoch": 0.6857881853032193, |
| "grad_norm": 0.6642665266990662, |
| "learning_rate": 0.00023457070582992562, |
| "loss": 1.6819074153900146, |
| "step": 14848 |
| }, |
| { |
| "epoch": 0.6976121195325851, |
| "grad_norm": 0.6584897637367249, |
| "learning_rate": 0.00021864556680999692, |
| "loss": 1.6745400428771973, |
| "step": 15104 |
| }, |
| { |
| "epoch": 0.709436053761951, |
| "grad_norm": 0.6884040236473083, |
| "learning_rate": 0.0002031276319205152, |
| "loss": 1.6730715036392212, |
| "step": 15360 |
| }, |
| { |
| "epoch": 0.709436053761951, |
| "eval_beta_ce_loss": 0.582520013870714, |
| "eval_bleu": 0.5116472036246329, |
| "eval_loss": 1.6724328953925878, |
| "eval_uni_ce_loss": 0.5073928678552854, |
| "step": 15360 |
| }, |
| { |
| "epoch": 0.709436053761951, |
| "eval_beta_ce_loss": 0.582520013870714, |
| "eval_bleu": 0.5116472036246329, |
| "eval_loss": 1.6724328953925878, |
| "eval_runtime": 135.5849, |
| "eval_samples_per_second": 206.461, |
| "eval_steps_per_second": 3.23, |
| "eval_uni_ce_loss": 0.5073928678552854, |
| "step": 15360 |
| }, |
| { |
| "epoch": 0.7212599879913169, |
| "grad_norm": 0.7419734597206116, |
| "learning_rate": 0.00018803936026088542, |
| "loss": 1.6732308864593506, |
| "step": 15616 |
| }, |
| { |
| "epoch": 0.7330839222206826, |
| "grad_norm": 0.6793375611305237, |
| "learning_rate": 0.00017340258907913464, |
| "loss": 1.6713887453079224, |
| "step": 15872 |
| }, |
| { |
| "epoch": 0.7449078564500485, |
| "grad_norm": 0.6444223523139954, |
| "learning_rate": 0.0001592385021668743, |
| "loss": 1.664468765258789, |
| "step": 16128 |
| }, |
| { |
| "epoch": 0.7567317906794143, |
| "grad_norm": 0.7465987205505371, |
| "learning_rate": 0.0001455675992000087, |
| "loss": 1.6718043088912964, |
| "step": 16384 |
| }, |
| { |
| "epoch": 0.7567317906794143, |
| "eval_beta_ce_loss": 0.5802708933614704, |
| "eval_bleu": 0.5137912099965332, |
| "eval_loss": 1.6668929272590707, |
| "eval_uni_ce_loss": 0.5063511407402552, |
| "step": 16384 |
| }, |
| { |
| "epoch": 0.7567317906794143, |
| "eval_beta_ce_loss": 0.5802708933614704, |
| "eval_bleu": 0.5137912099965332, |
| "eval_loss": 1.6668929272590707, |
| "eval_runtime": 137.3458, |
| "eval_samples_per_second": 203.814, |
| "eval_steps_per_second": 3.189, |
| "eval_uni_ce_loss": 0.5063511407402552, |
| "step": 16384 |
| }, |
| { |
| "epoch": 0.7685557249087802, |
| "grad_norm": 0.7398092746734619, |
| "learning_rate": 0.000132409666069565, |
| "loss": 1.6554030179977417, |
| "step": 16640 |
| }, |
| { |
| "epoch": 0.780379659138146, |
| "grad_norm": 0.7583303451538086, |
| "learning_rate": 0.0001197837462455823, |
| "loss": 1.6627554893493652, |
| "step": 16896 |
| }, |
| { |
| "epoch": 0.7922035933675119, |
| "grad_norm": 0.7260330319404602, |
| "learning_rate": 0.00010770811321550749, |
| "loss": 1.6633129119873047, |
| "step": 17152 |
| }, |
| { |
| "epoch": 0.8040275275968778, |
| "grad_norm": 0.6960119605064392, |
| "learning_rate": 9.620024403698591e-05, |
| "loss": 1.6641393899917603, |
| "step": 17408 |
| }, |
| { |
| "epoch": 0.8040275275968778, |
| "eval_beta_ce_loss": 0.580827721654008, |
| "eval_bleu": 0.5137603294282832, |
| "eval_loss": 1.6667931901809832, |
| "eval_uni_ce_loss": 0.5051377474853437, |
| "step": 17408 |
| }, |
| { |
| "epoch": 0.8040275275968778, |
| "eval_beta_ce_loss": 0.580827721654008, |
| "eval_bleu": 0.5137603294282832, |
| "eval_loss": 1.6667931901809832, |
| "eval_runtime": 135.8623, |
| "eval_samples_per_second": 206.039, |
| "eval_steps_per_second": 3.224, |
| "eval_uni_ce_loss": 0.5051377474853437, |
| "step": 17408 |
| } |
| ], |
| "logging_steps": 256, |
| "max_steps": 21651, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 1024, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 64, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|