{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8040275275968778, "eval_steps": 1024, "global_step": 17408, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011823934229365849, "grad_norm": 0.7295829057693481, "learning_rate": 0.000498046875, "loss": 1.8308284282684326, "step": 256 }, { "epoch": 0.023647868458731697, "grad_norm": 0.8052719831466675, "learning_rate": 0.000998046875, "loss": 1.8338860273361206, "step": 512 }, { "epoch": 0.03547180268809755, "grad_norm": 0.8997901678085327, "learning_rate": 0.000999640996023194, "loss": 1.8343064785003662, "step": 768 }, { "epoch": 0.047295736917463395, "grad_norm": 0.7250568270683289, "learning_rate": 0.0009985588674043958, "loss": 1.8296632766723633, "step": 1024 }, { "epoch": 0.047295736917463395, "eval_beta_ce_loss": 0.6351920716022248, "eval_bleu": 0.4998693474981933, "eval_loss": 1.8211502122552428, "eval_uni_ce_loss": 0.5507660694590443, "step": 1024 }, { "epoch": 0.047295736917463395, "eval_beta_ce_loss": 0.6351920716022248, "eval_bleu": 0.4998693474981933, "eval_loss": 1.8211502122552428, "eval_runtime": 139.8967, "eval_samples_per_second": 200.098, "eval_steps_per_second": 3.131, "eval_uni_ce_loss": 0.5507660694590443, "step": 1024 }, { "epoch": 0.05911967114682925, "grad_norm": 0.7675038576126099, "learning_rate": 0.0009967551747861387, "loss": 1.8191157579421997, "step": 1280 }, { "epoch": 0.0709436053761951, "grad_norm": 0.8503870368003845, "learning_rate": 0.000994232528651847, "loss": 1.8177354335784912, "step": 1536 }, { "epoch": 0.08276753960556095, "grad_norm": 0.8022050261497498, "learning_rate": 0.0009909945800260092, "loss": 1.807867407798767, "step": 1792 }, { "epoch": 0.09459147383492679, "grad_norm": 0.9533084034919739, "learning_rate": 0.0009870460151900522, "loss": 1.8013904094696045, "step": 2048 }, { "epoch": 0.09459147383492679, "eval_beta_ce_loss": 0.6264337543483194, "eval_bleu": 0.49771606091961523, "eval_loss": 1.7982131136606818, "eval_uni_ce_loss": 0.545345604555792, "step": 2048 }, { "epoch": 0.09459147383492679, "eval_beta_ce_loss": 0.6264337543483194, "eval_bleu": 0.49771606091961523, "eval_loss": 1.7982131136606818, "eval_runtime": 133.1265, "eval_samples_per_second": 210.274, "eval_steps_per_second": 3.29, "eval_uni_ce_loss": 0.545345604555792, "step": 2048 }, { "epoch": 0.10641540806429264, "grad_norm": 0.9694677591323853, "learning_rate": 0.0009823925488998885, "loss": 1.8004993200302124, "step": 2304 }, { "epoch": 0.1182393422936585, "grad_norm": 0.8076265454292297, "learning_rate": 0.0009770409161149525, "loss": 1.7975847721099854, "step": 2560 }, { "epoch": 0.13006327652302435, "grad_norm": 0.7418428063392639, "learning_rate": 0.0009709988622506973, "loss": 1.789136528968811, "step": 2816 }, { "epoch": 0.1418872107523902, "grad_norm": 0.7103869915008545, "learning_rate": 0.000964275131968659, "loss": 1.7754944562911987, "step": 3072 }, { "epoch": 0.1418872107523902, "eval_beta_ce_loss": 0.620368564918161, "eval_bleu": 0.49887627167388116, "eval_loss": 1.7806649703413384, "eval_uni_ce_loss": 0.5399278415936858, "step": 3072 }, { "epoch": 0.1418872107523902, "eval_beta_ce_loss": 0.620368564918161, "eval_bleu": 0.49887627167388116, "eval_loss": 1.7806649703413384, "eval_runtime": 134.7634, "eval_samples_per_second": 207.72, "eval_steps_per_second": 3.25, "eval_uni_ce_loss": 0.5399278415936858, "step": 3072 }, { "epoch": 0.15371114498175603, "grad_norm": 0.7851372957229614, "learning_rate": 0.0009568794565203123, "loss": 1.7792595624923706, "step": 3328 }, { "epoch": 0.1655350792111219, "grad_norm": 0.7782461047172546, "learning_rate": 0.0009488225396630347, "loss": 1.7753657102584839, "step": 3584 }, { "epoch": 0.17735901344048774, "grad_norm": 0.7312053442001343, "learning_rate": 0.0009401160421685646, "loss": 1.769174575805664, "step": 3840 }, { "epoch": 0.18918294766985358, "grad_norm": 0.8110184669494629, "learning_rate": 0.0009307725649463714, "loss": 1.769570231437683, "step": 4096 }, { "epoch": 0.18918294766985358, "eval_beta_ce_loss": 0.6151743780260217, "eval_bleu": 0.5008352932587944, "eval_loss": 1.766069833818636, "eval_uni_ce_loss": 0.5357210769500906, "step": 4096 }, { "epoch": 0.18918294766985358, "eval_beta_ce_loss": 0.6151743780260217, "eval_bleu": 0.5008352932587944, "eval_loss": 1.766069833818636, "eval_runtime": 135.334, "eval_samples_per_second": 206.844, "eval_steps_per_second": 3.236, "eval_uni_ce_loss": 0.5357210769500906, "step": 4096 }, { "epoch": 0.20100688189921945, "grad_norm": 0.9300932288169861, "learning_rate": 0.0009208056308063659, "loss": 1.7624573707580566, "step": 4352 }, { "epoch": 0.2128308161285853, "grad_norm": 0.7691043019294739, "learning_rate": 0.0009102296648873445, "loss": 1.7650220394134521, "step": 4608 }, { "epoch": 0.22465475035795113, "grad_norm": 0.8132173418998718, "learning_rate": 0.0008990599737794927, "loss": 1.761474370956421, "step": 4864 }, { "epoch": 0.236478684587317, "grad_norm": 0.7635871171951294, "learning_rate": 0.0008873127233711644, "loss": 1.7527934312820435, "step": 5120 }, { "epoch": 0.236478684587317, "eval_beta_ce_loss": 0.611639610028158, "eval_bleu": 0.5024511500541371, "eval_loss": 1.753186773763944, "eval_uni_ce_loss": 0.5299075521426658, "step": 5120 }, { "epoch": 0.236478684587317, "eval_beta_ce_loss": 0.611639610028158, "eval_bleu": 0.5024511500541371, "eval_loss": 1.753186773763944, "eval_runtime": 135.0739, "eval_samples_per_second": 207.242, "eval_steps_per_second": 3.243, "eval_uni_ce_loss": 0.5299075521426658, "step": 5120 }, { "epoch": 0.24830261881668284, "grad_norm": 0.7438808083534241, "learning_rate": 0.0008750049154520011, "loss": 1.7492367029190063, "step": 5376 }, { "epoch": 0.2601265530460487, "grad_norm": 0.7799262404441833, "learning_rate": 0.0008621543631062487, "loss": 1.744637131690979, "step": 5632 }, { "epoch": 0.27195048727541454, "grad_norm": 0.7694032192230225, "learning_rate": 0.0008487796649318904, "loss": 1.7494758367538452, "step": 5888 }, { "epoch": 0.2837744215047804, "grad_norm": 0.7369454503059387, "learning_rate": 0.0008349001781229053, "loss": 1.7494639158248901, "step": 6144 }, { "epoch": 0.2837744215047804, "eval_beta_ce_loss": 0.6058613158524309, "eval_bleu": 0.5028032064859724, "eval_loss": 1.739504432841523, "eval_uni_ce_loss": 0.5277818014088287, "step": 6144 }, { "epoch": 0.2837744215047804, "eval_beta_ce_loss": 0.6058613158524309, "eval_bleu": 0.5028032064859724, "eval_loss": 1.739504432841523, "eval_runtime": 136.5059, "eval_samples_per_second": 205.068, "eval_steps_per_second": 3.209, "eval_uni_ce_loss": 0.5277818014088287, "step": 6144 }, { "epoch": 0.2955983557341462, "grad_norm": 0.8735672235488892, "learning_rate": 0.0008205359904536107, "loss": 1.7374849319458008, "step": 6400 }, { "epoch": 0.30742228996351206, "grad_norm": 1.0139427185058594, "learning_rate": 0.0008057078912056363, "loss": 1.7405836582183838, "step": 6656 }, { "epoch": 0.3192462241928779, "grad_norm": 1.0008057355880737, "learning_rate": 0.0007904373410796086, "loss": 1.7392584085464478, "step": 6912 }, { "epoch": 0.3310701584222438, "grad_norm": 0.8208015561103821, "learning_rate": 0.0007747464411350876, "loss": 1.7316131591796875, "step": 7168 }, { "epoch": 0.3310701584222438, "eval_beta_ce_loss": 0.6033149586148459, "eval_bleu": 0.5017084521574802, "eval_loss": 1.7307289455034962, "eval_uni_ce_loss": 0.5240990275253444, "step": 7168 }, { "epoch": 0.3310701584222438, "eval_beta_ce_loss": 0.6033149586148459, "eval_bleu": 0.5017084521574802, "eval_loss": 1.7307289455034962, "eval_runtime": 136.4837, "eval_samples_per_second": 205.101, "eval_steps_per_second": 3.209, "eval_uni_ce_loss": 0.5240990275253444, "step": 7168 }, { "epoch": 0.34289409265160964, "grad_norm": 0.9193022847175598, "learning_rate": 0.000758657900803716, "loss": 1.7337878942489624, "step": 7424 }, { "epoch": 0.3547180268809755, "grad_norm": 0.7078978419303894, "learning_rate": 0.000742195005021869, "loss": 1.7295589447021484, "step": 7680 }, { "epoch": 0.3665419611103413, "grad_norm": 0.7894092798233032, "learning_rate": 0.0007253815805303786, "loss": 1.723784327507019, "step": 7936 }, { "epoch": 0.37836589533970716, "grad_norm": 0.7812065482139587, "learning_rate": 0.0007082419613901028, "loss": 1.7162002325057983, "step": 8192 }, { "epoch": 0.37836589533970716, "eval_beta_ce_loss": 0.6003280380000807, "eval_bleu": 0.5041936063198118, "eval_loss": 1.7231203064526597, "eval_uni_ce_loss": 0.5224642294318709, "step": 8192 }, { "epoch": 0.37836589533970716, "eval_beta_ce_loss": 0.6003280380000807, "eval_bleu": 0.5041936063198118, "eval_loss": 1.7231203064526597, "eval_runtime": 136.1645, "eval_samples_per_second": 205.582, "eval_steps_per_second": 3.217, "eval_uni_ce_loss": 0.5224642294318709, "step": 8192 }, { "epoch": 0.390189829569073, "grad_norm": 0.6728302836418152, "learning_rate": 0.0006908009537632514, "loss": 1.7215343713760376, "step": 8448 }, { "epoch": 0.4020137637984389, "grad_norm": 0.6643648743629456, "learning_rate": 0.0006730838000114403, "loss": 1.7175226211547852, "step": 8704 }, { "epoch": 0.41383769802780473, "grad_norm": 0.6722604036331177, "learning_rate": 0.0006551161421624341, "loss": 1.7141364812850952, "step": 8960 }, { "epoch": 0.4256616322571706, "grad_norm": 0.7661899328231812, "learning_rate": 0.0006369239847984517, "loss": 1.7067075967788696, "step": 9216 }, { "epoch": 0.4256616322571706, "eval_beta_ce_loss": 0.5960702291906697, "eval_bleu": 0.508842331362903, "eval_loss": 1.7111802789718593, "eval_uni_ce_loss": 0.5190398215431057, "step": 9216 }, { "epoch": 0.4256616322571706, "eval_beta_ce_loss": 0.5960702291906697, "eval_bleu": 0.508842331362903, "eval_loss": 1.7111802789718593, "eval_runtime": 135.4442, "eval_samples_per_second": 206.676, "eval_steps_per_second": 3.234, "eval_uni_ce_loss": 0.5190398215431057, "step": 9216 }, { "epoch": 0.4374855664865364, "grad_norm": 0.7040799260139465, "learning_rate": 0.0006185336574197479, "loss": 1.7019784450531006, "step": 9472 }, { "epoch": 0.44930950071590225, "grad_norm": 0.7697860598564148, "learning_rate": 0.0005999717763379407, "loss": 1.7126665115356445, "step": 9728 }, { "epoch": 0.4611334349452681, "grad_norm": 0.7353936433792114, "learning_rate": 0.0005812652061542363, "loss": 1.7056387662887573, "step": 9984 }, { "epoch": 0.472957369174634, "grad_norm": 0.8408072590827942, "learning_rate": 0.0005624410208783071, "loss": 1.7016122341156006, "step": 10240 }, { "epoch": 0.472957369174634, "eval_beta_ce_loss": 0.5948017802276567, "eval_bleu": 0.5119743719435753, "eval_loss": 1.705481736083009, "eval_uni_ce_loss": 0.5158781752874863, "step": 10240 }, { "epoch": 0.472957369174634, "eval_beta_ce_loss": 0.5948017802276567, "eval_bleu": 0.5119743719435753, "eval_loss": 1.705481736083009, "eval_runtime": 134.7944, "eval_samples_per_second": 207.672, "eval_steps_per_second": 3.249, "eval_uni_ce_loss": 0.5158781752874863, "step": 10240 }, { "epoch": 0.48478130340399983, "grad_norm": 0.6450205445289612, "learning_rate": 0.0005435264647440881, "loss": 1.7052643299102783, "step": 10496 }, { "epoch": 0.49660523763336567, "grad_norm": 0.6514373421669006, "learning_rate": 0.000524548912779213, "loss": 1.69474458694458, "step": 10752 }, { "epoch": 0.5084291718627315, "grad_norm": 0.7065825462341309, "learning_rate": 0.0005055358311851499, "loss": 1.7007395029067993, "step": 11008 }, { "epoch": 0.5202531060920974, "grad_norm": 0.8027070164680481, "learning_rate": 0.0004865147375853812, "loss": 1.696798324584961, "step": 11264 }, { "epoch": 0.5202531060920974, "eval_beta_ce_loss": 0.5911326601625033, "eval_bleu": 0.5082277536286182, "eval_loss": 1.6985954277591617, "eval_uni_ce_loss": 0.5163301084547827, "step": 11264 }, { "epoch": 0.5202531060920974, "eval_beta_ce_loss": 0.5911326601625033, "eval_bleu": 0.5082277536286182, "eval_loss": 1.6985954277591617, "eval_runtime": 138.4679, "eval_samples_per_second": 202.162, "eval_steps_per_second": 3.163, "eval_uni_ce_loss": 0.5163301084547827, "step": 11264 }, { "epoch": 0.5320770403214632, "grad_norm": 0.7604843974113464, "learning_rate": 0.0004675131611991607, "loss": 1.6915127038955688, "step": 11520 }, { "epoch": 0.5439009745508291, "grad_norm": 0.6604936718940735, "learning_rate": 0.0004485586029984899, "loss": 1.692893147468567, "step": 11776 }, { "epoch": 0.5557249087801949, "grad_norm": 0.7172213792800903, "learning_rate": 0.00042967849590597266, "loss": 1.6868789196014404, "step": 12032 }, { "epoch": 0.5675488430095608, "grad_norm": 0.7629127502441406, "learning_rate": 0.0004109001650911621, "loss": 1.6939620971679688, "step": 12288 }, { "epoch": 0.5675488430095608, "eval_beta_ce_loss": 0.5891192779965597, "eval_bleu": 0.5114094116436015, "eval_loss": 1.6906098777845027, "eval_uni_ce_loss": 0.5123713216552995, "step": 12288 }, { "epoch": 0.5675488430095608, "eval_beta_ce_loss": 0.5891192779965597, "eval_bleu": 0.5114094116436015, "eval_loss": 1.6906098777845027, "eval_runtime": 135.3599, "eval_samples_per_second": 206.804, "eval_steps_per_second": 3.236, "eval_uni_ce_loss": 0.5123713216552995, "step": 12288 }, { "epoch": 0.5793727772389267, "grad_norm": 0.6822903156280518, "learning_rate": 0.0003922507884228551, "loss": 1.6874535083770752, "step": 12544 }, { "epoch": 0.5911967114682924, "grad_norm": 0.7348518371582031, "learning_rate": 0.00037375735713457723, "loss": 1.6786881685256958, "step": 12800 }, { "epoch": 0.6030206456976583, "grad_norm": 0.797898530960083, "learning_rate": 0.00035544663676018276, "loss": 1.6823772192001343, "step": 13056 }, { "epoch": 0.6148445799270241, "grad_norm": 0.666439950466156, "learning_rate": 0.00033734512839611255, "loss": 1.6804336309432983, "step": 13312 }, { "epoch": 0.6148445799270241, "eval_beta_ce_loss": 0.5856027225517246, "eval_bleu": 0.5110656031508146, "eval_loss": 1.6819111756538148, "eval_uni_ce_loss": 0.5107057281689013, "step": 13312 }, { "epoch": 0.6148445799270241, "eval_beta_ce_loss": 0.5856027225517246, "eval_bleu": 0.5110656031508146, "eval_loss": 1.6819111756538148, "eval_runtime": 135.0756, "eval_samples_per_second": 207.24, "eval_steps_per_second": 3.243, "eval_uni_ce_loss": 0.5107057281689013, "step": 13312 }, { "epoch": 0.62666851415639, "grad_norm": 0.9536261558532715, "learning_rate": 0.0003194790303463687, "loss": 1.6719762086868286, "step": 13568 }, { "epoch": 0.6384924483857558, "grad_norm": 0.7538688778877258, "learning_rate": 0.00030187420020572406, "loss": 1.6798793077468872, "step": 13824 }, { "epoch": 0.6503163826151217, "grad_norm": 0.6934835910797119, "learning_rate": 0.00028455611743603626, "loss": 1.6709473133087158, "step": 14080 }, { "epoch": 0.6621403168444876, "grad_norm": 0.7217169404029846, "learning_rate": 0.0002675498464898373, "loss": 1.6749387979507446, "step": 14336 }, { "epoch": 0.6621403168444876, "eval_beta_ce_loss": 0.5853756803627972, "eval_bleu": 0.5112546154669725, "eval_loss": 1.6793161832034316, "eval_uni_ce_loss": 0.5085648222737116, "step": 14336 }, { "epoch": 0.6621403168444876, "eval_beta_ce_loss": 0.5853756803627972, "eval_bleu": 0.5112546154669725, "eval_loss": 1.6793161832034316, "eval_runtime": 137.7212, "eval_samples_per_second": 203.258, "eval_steps_per_second": 3.18, "eval_uni_ce_loss": 0.5085648222737116, "step": 14336 }, { "epoch": 0.6739642510738534, "grad_norm": 0.8353447914123535, "learning_rate": 0.0002508800005345623, "loss": 1.679425835609436, "step": 14592 }, { "epoch": 0.6857881853032193, "grad_norm": 0.6642665266990662, "learning_rate": 0.00023457070582992562, "loss": 1.6819074153900146, "step": 14848 }, { "epoch": 0.6976121195325851, "grad_norm": 0.6584897637367249, "learning_rate": 0.00021864556680999692, "loss": 1.6745400428771973, "step": 15104 }, { "epoch": 0.709436053761951, "grad_norm": 0.6884040236473083, "learning_rate": 0.0002031276319205152, "loss": 1.6730715036392212, "step": 15360 }, { "epoch": 0.709436053761951, "eval_beta_ce_loss": 0.582520013870714, "eval_bleu": 0.5116472036246329, "eval_loss": 1.6724328953925878, "eval_uni_ce_loss": 0.5073928678552854, "step": 15360 }, { "epoch": 0.709436053761951, "eval_beta_ce_loss": 0.582520013870714, "eval_bleu": 0.5116472036246329, "eval_loss": 1.6724328953925878, "eval_runtime": 135.5849, "eval_samples_per_second": 206.461, "eval_steps_per_second": 3.23, "eval_uni_ce_loss": 0.5073928678552854, "step": 15360 }, { "epoch": 0.7212599879913169, "grad_norm": 0.7419734597206116, "learning_rate": 0.00018803936026088542, "loss": 1.6732308864593506, "step": 15616 }, { "epoch": 0.7330839222206826, "grad_norm": 0.6793375611305237, "learning_rate": 0.00017340258907913464, "loss": 1.6713887453079224, "step": 15872 }, { "epoch": 0.7449078564500485, "grad_norm": 0.6444223523139954, "learning_rate": 0.0001592385021668743, "loss": 1.664468765258789, "step": 16128 }, { "epoch": 0.7567317906794143, "grad_norm": 0.7465987205505371, "learning_rate": 0.0001455675992000087, "loss": 1.6718043088912964, "step": 16384 }, { "epoch": 0.7567317906794143, "eval_beta_ce_loss": 0.5802708933614704, "eval_bleu": 0.5137912099965332, "eval_loss": 1.6668929272590707, "eval_uni_ce_loss": 0.5063511407402552, "step": 16384 }, { "epoch": 0.7567317906794143, "eval_beta_ce_loss": 0.5802708933614704, "eval_bleu": 0.5137912099965332, "eval_loss": 1.6668929272590707, "eval_runtime": 137.3458, "eval_samples_per_second": 203.814, "eval_steps_per_second": 3.189, "eval_uni_ce_loss": 0.5063511407402552, "step": 16384 }, { "epoch": 0.7685557249087802, "grad_norm": 0.7398092746734619, "learning_rate": 0.000132409666069565, "loss": 1.6554030179977417, "step": 16640 }, { "epoch": 0.780379659138146, "grad_norm": 0.7583303451538086, "learning_rate": 0.0001197837462455823, "loss": 1.6627554893493652, "step": 16896 }, { "epoch": 0.7922035933675119, "grad_norm": 0.7260330319404602, "learning_rate": 0.00010770811321550749, "loss": 1.6633129119873047, "step": 17152 }, { "epoch": 0.8040275275968778, "grad_norm": 0.6960119605064392, "learning_rate": 9.620024403698591e-05, "loss": 1.6641393899917603, "step": 17408 }, { "epoch": 0.8040275275968778, "eval_beta_ce_loss": 0.580827721654008, "eval_bleu": 0.5137603294282832, "eval_loss": 1.6667931901809832, "eval_uni_ce_loss": 0.5051377474853437, "step": 17408 }, { "epoch": 0.8040275275968778, "eval_beta_ce_loss": 0.580827721654008, "eval_bleu": 0.5137603294282832, "eval_loss": 1.6667931901809832, "eval_runtime": 135.8623, "eval_samples_per_second": 206.039, "eval_steps_per_second": 3.224, "eval_uni_ce_loss": 0.5051377474853437, "step": 17408 } ], "logging_steps": 256, "max_steps": 21651, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1024, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }