{ "best_global_step": 2128, "best_metric": 0.08128391951322556, "best_model_checkpoint": "tmp/out/1536-96-r2_common_channel_fcmCtx4_fcmLayers6_fcmChMixingTrue_stride24_bs512_lr0.001_a7e3/checkpoint-2128", "epoch": 76.0, "eval_steps": 500, "global_step": 2128, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "grad_norm": 0.18818920850753784, "learning_rate": 0.000999997705696395, "loss": 0.1949, "step": 28 }, { "epoch": 1.0, "eval_loss": 0.13799113035202026, "eval_runtime": 13.9945, "eval_samples_per_second": 388.011, "eval_steps_per_second": 0.786, "step": 28 }, { "epoch": 2.0, "grad_norm": 0.17697054147720337, "learning_rate": 0.0009999904797644875, "loss": 0.1397, "step": 56 }, { "epoch": 2.0, "eval_loss": 0.1200891062617302, "eval_runtime": 13.7764, "eval_samples_per_second": 394.151, "eval_steps_per_second": 0.798, "step": 56 }, { "epoch": 3.0, "grad_norm": 0.12785276770591736, "learning_rate": 0.0009999783191283983, "loss": 0.1239, "step": 84 }, { "epoch": 3.0, "eval_loss": 0.11045144498348236, "eval_runtime": 13.7689, "eval_samples_per_second": 394.367, "eval_steps_per_second": 0.799, "step": 84 }, { "epoch": 4.0, "grad_norm": 0.15085896849632263, "learning_rate": 0.0009999612239081498, "loss": 0.1156, "step": 112 }, { "epoch": 4.0, "eval_loss": 0.10991678386926651, "eval_runtime": 14.5903, "eval_samples_per_second": 372.166, "eval_steps_per_second": 0.754, "step": 112 }, { "epoch": 5.0, "grad_norm": 0.1445561647415161, "learning_rate": 0.0009999391942724632, "loss": 0.1106, "step": 140 }, { "epoch": 5.0, "eval_loss": 0.1034143716096878, "eval_runtime": 12.8666, "eval_samples_per_second": 422.022, "eval_steps_per_second": 0.855, "step": 140 }, { "epoch": 6.0, "grad_norm": 0.13405078649520874, "learning_rate": 0.000999912230438763, "loss": 0.1072, "step": 168 }, { "epoch": 6.0, "eval_loss": 0.10323869436979294, "eval_runtime": 13.5764, "eval_samples_per_second": 399.957, "eval_steps_per_second": 0.81, "step": 168 }, { "epoch": 7.0, "grad_norm": 0.13994024693965912, "learning_rate": 0.0009998803326731703, "loss": 0.1042, "step": 196 }, { "epoch": 7.0, "eval_loss": 0.10014788061380386, "eval_runtime": 14.0652, "eval_samples_per_second": 386.058, "eval_steps_per_second": 0.782, "step": 196 }, { "epoch": 8.0, "grad_norm": 0.13133035600185394, "learning_rate": 0.0009998435012905044, "loss": 0.1025, "step": 224 }, { "epoch": 8.0, "eval_loss": 0.1004691943526268, "eval_runtime": 13.771, "eval_samples_per_second": 394.307, "eval_steps_per_second": 0.799, "step": 224 }, { "epoch": 9.0, "grad_norm": 0.1343812197446823, "learning_rate": 0.0009998017366542756, "loss": 0.1008, "step": 252 }, { "epoch": 9.0, "eval_loss": 0.09979160130023956, "eval_runtime": 14.0034, "eval_samples_per_second": 387.762, "eval_steps_per_second": 0.786, "step": 252 }, { "epoch": 10.0, "grad_norm": 0.1553795337677002, "learning_rate": 0.0009997550391766844, "loss": 0.0991, "step": 280 }, { "epoch": 10.0, "eval_loss": 0.098526231944561, "eval_runtime": 13.3534, "eval_samples_per_second": 406.638, "eval_steps_per_second": 0.824, "step": 280 }, { "epoch": 11.0, "grad_norm": 0.1313014179468155, "learning_rate": 0.000999703409318616, "loss": 0.0977, "step": 308 }, { "epoch": 11.0, "eval_loss": 0.09693964570760727, "eval_runtime": 14.1264, "eval_samples_per_second": 384.385, "eval_steps_per_second": 0.779, "step": 308 }, { "epoch": 12.0, "grad_norm": 0.14203286170959473, "learning_rate": 0.0009996468475896355, "loss": 0.0964, "step": 336 }, { "epoch": 12.0, "eval_loss": 0.0984538123011589, "eval_runtime": 13.8783, "eval_samples_per_second": 391.259, "eval_steps_per_second": 0.793, "step": 336 }, { "epoch": 13.0, "grad_norm": 0.16649910807609558, "learning_rate": 0.0009995853545479853, "loss": 0.0955, "step": 364 }, { "epoch": 13.0, "eval_loss": 0.09544174373149872, "eval_runtime": 13.7858, "eval_samples_per_second": 393.884, "eval_steps_per_second": 0.798, "step": 364 }, { "epoch": 14.0, "grad_norm": 0.11850055307149887, "learning_rate": 0.0009995189308005762, "loss": 0.0945, "step": 392 }, { "epoch": 14.0, "eval_loss": 0.09443824738264084, "eval_runtime": 14.0327, "eval_samples_per_second": 386.953, "eval_steps_per_second": 0.784, "step": 392 }, { "epoch": 15.0, "grad_norm": 0.1464478075504303, "learning_rate": 0.0009994475770029841, "loss": 0.0926, "step": 420 }, { "epoch": 15.0, "eval_loss": 0.09755795449018478, "eval_runtime": 13.5982, "eval_samples_per_second": 399.319, "eval_steps_per_second": 0.809, "step": 420 }, { "epoch": 16.0, "grad_norm": 0.13713239133358002, "learning_rate": 0.0009993712938594424, "loss": 0.0922, "step": 448 }, { "epoch": 16.0, "eval_loss": 0.09337516874074936, "eval_runtime": 13.7553, "eval_samples_per_second": 394.757, "eval_steps_per_second": 0.8, "step": 448 }, { "epoch": 17.0, "grad_norm": 0.17032985389232635, "learning_rate": 0.0009992900821228345, "loss": 0.0914, "step": 476 }, { "epoch": 17.0, "eval_loss": 0.09800300747156143, "eval_runtime": 13.4874, "eval_samples_per_second": 402.597, "eval_steps_per_second": 0.816, "step": 476 }, { "epoch": 18.0, "grad_norm": 0.11109838634729385, "learning_rate": 0.000999203942594687, "loss": 0.091, "step": 504 }, { "epoch": 18.0, "eval_loss": 0.09393668174743652, "eval_runtime": 13.6194, "eval_samples_per_second": 398.697, "eval_steps_per_second": 0.808, "step": 504 }, { "epoch": 19.0, "grad_norm": 0.13168472051620483, "learning_rate": 0.0009991128761251632, "loss": 0.0894, "step": 532 }, { "epoch": 19.0, "eval_loss": 0.09444674849510193, "eval_runtime": 13.817, "eval_samples_per_second": 392.994, "eval_steps_per_second": 0.796, "step": 532 }, { "epoch": 20.0, "grad_norm": 0.12797316908836365, "learning_rate": 0.0009990168836130527, "loss": 0.0892, "step": 560 }, { "epoch": 20.0, "eval_loss": 0.09130553901195526, "eval_runtime": 12.8149, "eval_samples_per_second": 423.724, "eval_steps_per_second": 0.858, "step": 560 }, { "epoch": 21.0, "grad_norm": 0.12064854055643082, "learning_rate": 0.0009989159660057615, "loss": 0.0878, "step": 588 }, { "epoch": 21.0, "eval_loss": 0.09315093606710434, "eval_runtime": 13.0593, "eval_samples_per_second": 415.796, "eval_steps_per_second": 0.842, "step": 588 }, { "epoch": 22.0, "grad_norm": 0.11039382964372635, "learning_rate": 0.0009988101242993065, "loss": 0.0873, "step": 616 }, { "epoch": 22.0, "eval_loss": 0.09174513071775436, "eval_runtime": 13.6345, "eval_samples_per_second": 398.254, "eval_steps_per_second": 0.807, "step": 616 }, { "epoch": 23.0, "grad_norm": 0.11381122469902039, "learning_rate": 0.000998699359538303, "loss": 0.0862, "step": 644 }, { "epoch": 23.0, "eval_loss": 0.09421718120574951, "eval_runtime": 13.1437, "eval_samples_per_second": 413.126, "eval_steps_per_second": 0.837, "step": 644 }, { "epoch": 24.0, "grad_norm": 0.13068965077400208, "learning_rate": 0.0009985836728159524, "loss": 0.0862, "step": 672 }, { "epoch": 24.0, "eval_loss": 0.0908147320151329, "eval_runtime": 13.6756, "eval_samples_per_second": 397.059, "eval_steps_per_second": 0.804, "step": 672 }, { "epoch": 25.0, "grad_norm": 0.19063422083854675, "learning_rate": 0.0009984630652740383, "loss": 0.0861, "step": 700 }, { "epoch": 25.0, "eval_loss": 0.09025771915912628, "eval_runtime": 12.5305, "eval_samples_per_second": 433.342, "eval_steps_per_second": 0.878, "step": 700 }, { "epoch": 26.0, "grad_norm": 0.1407005786895752, "learning_rate": 0.0009983375381029088, "loss": 0.0854, "step": 728 }, { "epoch": 26.0, "eval_loss": 0.08990131318569183, "eval_runtime": 13.5474, "eval_samples_per_second": 400.814, "eval_steps_per_second": 0.812, "step": 728 }, { "epoch": 27.0, "grad_norm": 0.1301293671131134, "learning_rate": 0.0009982070925414637, "loss": 0.0847, "step": 756 }, { "epoch": 27.0, "eval_loss": 0.08932916820049286, "eval_runtime": 13.7031, "eval_samples_per_second": 396.259, "eval_steps_per_second": 0.803, "step": 756 }, { "epoch": 28.0, "grad_norm": 0.15950527787208557, "learning_rate": 0.0009980717298771495, "loss": 0.0836, "step": 784 }, { "epoch": 28.0, "eval_loss": 0.08872799575328827, "eval_runtime": 12.7264, "eval_samples_per_second": 426.672, "eval_steps_per_second": 0.864, "step": 784 }, { "epoch": 29.0, "grad_norm": 0.14679257571697235, "learning_rate": 0.000997931451445941, "loss": 0.0831, "step": 812 }, { "epoch": 29.0, "eval_loss": 0.09064222127199173, "eval_runtime": 13.2879, "eval_samples_per_second": 408.642, "eval_steps_per_second": 0.828, "step": 812 }, { "epoch": 30.0, "grad_norm": 0.1313679814338684, "learning_rate": 0.0009977862586323298, "loss": 0.0827, "step": 840 }, { "epoch": 30.0, "eval_loss": 0.09007434546947479, "eval_runtime": 13.714, "eval_samples_per_second": 395.947, "eval_steps_per_second": 0.802, "step": 840 }, { "epoch": 31.0, "grad_norm": 0.1265498697757721, "learning_rate": 0.0009976361528693104, "loss": 0.0819, "step": 868 }, { "epoch": 31.0, "eval_loss": 0.08951247483491898, "eval_runtime": 13.4697, "eval_samples_per_second": 403.128, "eval_steps_per_second": 0.817, "step": 868 }, { "epoch": 32.0, "grad_norm": 0.1129317358136177, "learning_rate": 0.0009974811356383668, "loss": 0.0819, "step": 896 }, { "epoch": 32.0, "eval_loss": 0.09060715138912201, "eval_runtime": 13.0035, "eval_samples_per_second": 417.579, "eval_steps_per_second": 0.846, "step": 896 }, { "epoch": 33.0, "grad_norm": 0.12748871743679047, "learning_rate": 0.0009973212084694557, "loss": 0.0812, "step": 924 }, { "epoch": 33.0, "eval_loss": 0.0907522663474083, "eval_runtime": 14.0967, "eval_samples_per_second": 385.196, "eval_steps_per_second": 0.78, "step": 924 }, { "epoch": 34.0, "grad_norm": 0.10055958479642868, "learning_rate": 0.000997156372940993, "loss": 0.0815, "step": 952 }, { "epoch": 34.0, "eval_loss": 0.08895213901996613, "eval_runtime": 13.4637, "eval_samples_per_second": 403.308, "eval_steps_per_second": 0.817, "step": 952 }, { "epoch": 35.0, "grad_norm": 0.12697456777095795, "learning_rate": 0.00099698663067984, "loss": 0.0805, "step": 980 }, { "epoch": 35.0, "eval_loss": 0.08886592090129852, "eval_runtime": 13.6328, "eval_samples_per_second": 398.305, "eval_steps_per_second": 0.807, "step": 980 }, { "epoch": 36.0, "grad_norm": 0.20513515174388885, "learning_rate": 0.0009968119833612843, "loss": 0.0804, "step": 1008 }, { "epoch": 36.0, "eval_loss": 0.08658694475889206, "eval_runtime": 13.2186, "eval_samples_per_second": 410.785, "eval_steps_per_second": 0.832, "step": 1008 }, { "epoch": 37.0, "grad_norm": 0.1370176076889038, "learning_rate": 0.000996632432709024, "loss": 0.0803, "step": 1036 }, { "epoch": 37.0, "eval_loss": 0.08886294066905975, "eval_runtime": 13.4889, "eval_samples_per_second": 402.554, "eval_steps_per_second": 0.815, "step": 1036 }, { "epoch": 38.0, "grad_norm": 0.13838233053684235, "learning_rate": 0.0009964479804951505, "loss": 0.0795, "step": 1064 }, { "epoch": 38.0, "eval_loss": 0.0883188471198082, "eval_runtime": 13.4375, "eval_samples_per_second": 404.093, "eval_steps_per_second": 0.819, "step": 1064 }, { "epoch": 39.0, "grad_norm": 0.1312042474746704, "learning_rate": 0.000996258628540135, "loss": 0.0786, "step": 1092 }, { "epoch": 39.0, "eval_loss": 0.0872192457318306, "eval_runtime": 13.5729, "eval_samples_per_second": 400.063, "eval_steps_per_second": 0.81, "step": 1092 }, { "epoch": 40.0, "grad_norm": 0.1502840369939804, "learning_rate": 0.0009960643787128027, "loss": 0.0783, "step": 1120 }, { "epoch": 40.0, "eval_loss": 0.08811386674642563, "eval_runtime": 13.7911, "eval_samples_per_second": 393.732, "eval_steps_per_second": 0.798, "step": 1120 }, { "epoch": 41.0, "grad_norm": 0.10295199602842331, "learning_rate": 0.0009958652329303218, "loss": 0.0779, "step": 1148 }, { "epoch": 41.0, "eval_loss": 0.08700462430715561, "eval_runtime": 12.9613, "eval_samples_per_second": 418.94, "eval_steps_per_second": 0.849, "step": 1148 }, { "epoch": 42.0, "grad_norm": 0.14093568921089172, "learning_rate": 0.0009956611931581812, "loss": 0.0779, "step": 1176 }, { "epoch": 42.0, "eval_loss": 0.08765130490064621, "eval_runtime": 13.6238, "eval_samples_per_second": 398.567, "eval_steps_per_second": 0.807, "step": 1176 }, { "epoch": 43.0, "grad_norm": 0.12385249137878418, "learning_rate": 0.00099545226141017, "loss": 0.0771, "step": 1204 }, { "epoch": 43.0, "eval_loss": 0.08630397915840149, "eval_runtime": 13.9701, "eval_samples_per_second": 388.687, "eval_steps_per_second": 0.787, "step": 1204 }, { "epoch": 44.0, "grad_norm": 0.12912018597126007, "learning_rate": 0.000995238439748361, "loss": 0.0765, "step": 1232 }, { "epoch": 44.0, "eval_loss": 0.08651801943778992, "eval_runtime": 14.0705, "eval_samples_per_second": 385.913, "eval_steps_per_second": 0.782, "step": 1232 }, { "epoch": 45.0, "grad_norm": 0.13436101377010345, "learning_rate": 0.000995019730283088, "loss": 0.0768, "step": 1260 }, { "epoch": 45.0, "eval_loss": 0.08759938925504684, "eval_runtime": 13.8608, "eval_samples_per_second": 391.754, "eval_steps_per_second": 0.794, "step": 1260 }, { "epoch": 46.0, "grad_norm": 0.1179327666759491, "learning_rate": 0.000994796135172924, "loss": 0.076, "step": 1288 }, { "epoch": 46.0, "eval_loss": 0.08753702789545059, "eval_runtime": 13.1322, "eval_samples_per_second": 413.488, "eval_steps_per_second": 0.838, "step": 1288 }, { "epoch": 47.0, "grad_norm": 0.12396474927663803, "learning_rate": 0.0009945676566246633, "loss": 0.0758, "step": 1316 }, { "epoch": 47.0, "eval_loss": 0.0872747004032135, "eval_runtime": 13.9301, "eval_samples_per_second": 389.803, "eval_steps_per_second": 0.79, "step": 1316 }, { "epoch": 48.0, "grad_norm": 0.13964654505252838, "learning_rate": 0.0009943342968932972, "loss": 0.0753, "step": 1344 }, { "epoch": 48.0, "eval_loss": 0.08765023946762085, "eval_runtime": 14.1213, "eval_samples_per_second": 384.525, "eval_steps_per_second": 0.779, "step": 1344 }, { "epoch": 49.0, "grad_norm": 0.14741012454032898, "learning_rate": 0.0009940960582819915, "loss": 0.0751, "step": 1372 }, { "epoch": 49.0, "eval_loss": 0.08702078461647034, "eval_runtime": 14.1874, "eval_samples_per_second": 382.734, "eval_steps_per_second": 0.775, "step": 1372 }, { "epoch": 50.0, "grad_norm": 0.1287020742893219, "learning_rate": 0.0009938529431420646, "loss": 0.075, "step": 1400 }, { "epoch": 50.0, "eval_loss": 0.08901604264974594, "eval_runtime": 13.9742, "eval_samples_per_second": 388.575, "eval_steps_per_second": 0.787, "step": 1400 }, { "epoch": 51.0, "grad_norm": 0.12099709361791611, "learning_rate": 0.0009936049538729656, "loss": 0.0748, "step": 1428 }, { "epoch": 51.0, "eval_loss": 0.08469326049089432, "eval_runtime": 13.9824, "eval_samples_per_second": 388.345, "eval_steps_per_second": 0.787, "step": 1428 }, { "epoch": 52.0, "grad_norm": 0.10210338979959488, "learning_rate": 0.0009933520929222485, "loss": 0.0742, "step": 1456 }, { "epoch": 52.0, "eval_loss": 0.08571095019578934, "eval_runtime": 14.305, "eval_samples_per_second": 379.588, "eval_steps_per_second": 0.769, "step": 1456 }, { "epoch": 53.0, "grad_norm": 0.21403531730175018, "learning_rate": 0.0009930943627855485, "loss": 0.0738, "step": 1484 }, { "epoch": 53.0, "eval_loss": 0.08666189014911652, "eval_runtime": 13.2505, "eval_samples_per_second": 409.795, "eval_steps_per_second": 0.83, "step": 1484 }, { "epoch": 54.0, "grad_norm": 0.15750914812088013, "learning_rate": 0.0009928317660065577, "loss": 0.073, "step": 1512 }, { "epoch": 54.0, "eval_loss": 0.08511006087064743, "eval_runtime": 13.0651, "eval_samples_per_second": 415.611, "eval_steps_per_second": 0.842, "step": 1512 }, { "epoch": 55.0, "grad_norm": 0.13378620147705078, "learning_rate": 0.000992564305177001, "loss": 0.0731, "step": 1540 }, { "epoch": 55.0, "eval_loss": 0.08461768925189972, "eval_runtime": 13.8593, "eval_samples_per_second": 391.796, "eval_steps_per_second": 0.794, "step": 1540 }, { "epoch": 56.0, "grad_norm": 0.10955790430307388, "learning_rate": 0.0009922919829366086, "loss": 0.0731, "step": 1568 }, { "epoch": 56.0, "eval_loss": 0.0852380245923996, "eval_runtime": 13.9377, "eval_samples_per_second": 389.59, "eval_steps_per_second": 0.789, "step": 1568 }, { "epoch": 57.0, "grad_norm": 0.13638441264629364, "learning_rate": 0.0009920148019730913, "loss": 0.072, "step": 1596 }, { "epoch": 57.0, "eval_loss": 0.08537213504314423, "eval_runtime": 13.8736, "eval_samples_per_second": 391.392, "eval_steps_per_second": 0.793, "step": 1596 }, { "epoch": 58.0, "grad_norm": 0.1874396950006485, "learning_rate": 0.0009917327650221124, "loss": 0.0719, "step": 1624 }, { "epoch": 58.0, "eval_loss": 0.0842534676194191, "eval_runtime": 14.039, "eval_samples_per_second": 386.779, "eval_steps_per_second": 0.784, "step": 1624 }, { "epoch": 59.0, "grad_norm": 0.12774085998535156, "learning_rate": 0.0009914458748672634, "loss": 0.0717, "step": 1652 }, { "epoch": 59.0, "eval_loss": 0.08502174913883209, "eval_runtime": 14.0351, "eval_samples_per_second": 386.886, "eval_steps_per_second": 0.784, "step": 1652 }, { "epoch": 60.0, "grad_norm": 0.13431188464164734, "learning_rate": 0.000991154134340034, "loss": 0.071, "step": 1680 }, { "epoch": 60.0, "eval_loss": 0.08464298397302628, "eval_runtime": 14.5396, "eval_samples_per_second": 373.462, "eval_steps_per_second": 0.757, "step": 1680 }, { "epoch": 61.0, "grad_norm": 0.12022250145673752, "learning_rate": 0.0009908575463197854, "loss": 0.071, "step": 1708 }, { "epoch": 61.0, "eval_loss": 0.0845947265625, "eval_runtime": 13.51, "eval_samples_per_second": 401.924, "eval_steps_per_second": 0.814, "step": 1708 }, { "epoch": 62.0, "grad_norm": 0.14051498472690582, "learning_rate": 0.0009905561137337224, "loss": 0.0706, "step": 1736 }, { "epoch": 62.0, "eval_loss": 0.08576343953609467, "eval_runtime": 13.5918, "eval_samples_per_second": 399.506, "eval_steps_per_second": 0.809, "step": 1736 }, { "epoch": 63.0, "grad_norm": 0.12124724686145782, "learning_rate": 0.0009902498395568619, "loss": 0.0701, "step": 1764 }, { "epoch": 63.0, "eval_loss": 0.08432195335626602, "eval_runtime": 13.7606, "eval_samples_per_second": 394.605, "eval_steps_per_second": 0.799, "step": 1764 }, { "epoch": 64.0, "grad_norm": 0.15731370449066162, "learning_rate": 0.0009899387268120072, "loss": 0.0701, "step": 1792 }, { "epoch": 64.0, "eval_loss": 0.08528588712215424, "eval_runtime": 13.8086, "eval_samples_per_second": 393.233, "eval_steps_per_second": 0.797, "step": 1792 }, { "epoch": 65.0, "grad_norm": 0.12616483867168427, "learning_rate": 0.0009896227785697153, "loss": 0.0697, "step": 1820 }, { "epoch": 65.0, "eval_loss": 0.08514665067195892, "eval_runtime": 13.7015, "eval_samples_per_second": 396.308, "eval_steps_per_second": 0.803, "step": 1820 }, { "epoch": 66.0, "grad_norm": 0.12217582017183304, "learning_rate": 0.0009893019979482674, "loss": 0.0697, "step": 1848 }, { "epoch": 66.0, "eval_loss": 0.08603604882955551, "eval_runtime": 13.8552, "eval_samples_per_second": 391.91, "eval_steps_per_second": 0.794, "step": 1848 }, { "epoch": 67.0, "grad_norm": 0.1277308613061905, "learning_rate": 0.0009889763881136386, "loss": 0.0693, "step": 1876 }, { "epoch": 67.0, "eval_loss": 0.08344025909900665, "eval_runtime": 13.4274, "eval_samples_per_second": 404.396, "eval_steps_per_second": 0.819, "step": 1876 }, { "epoch": 68.0, "grad_norm": 0.18230511248111725, "learning_rate": 0.0009886459522794678, "loss": 0.0692, "step": 1904 }, { "epoch": 68.0, "eval_loss": 0.08314071595668793, "eval_runtime": 13.346, "eval_samples_per_second": 406.863, "eval_steps_per_second": 0.824, "step": 1904 }, { "epoch": 69.0, "grad_norm": 0.17872166633605957, "learning_rate": 0.0009883106937070216, "loss": 0.0694, "step": 1932 }, { "epoch": 69.0, "eval_loss": 0.08490563184022903, "eval_runtime": 14.1692, "eval_samples_per_second": 383.224, "eval_steps_per_second": 0.776, "step": 1932 }, { "epoch": 70.0, "grad_norm": 0.1801847666501999, "learning_rate": 0.000987970615705167, "loss": 0.0691, "step": 1960 }, { "epoch": 70.0, "eval_loss": 0.08387839794158936, "eval_runtime": 13.4426, "eval_samples_per_second": 403.94, "eval_steps_per_second": 0.818, "step": 1960 }, { "epoch": 71.0, "grad_norm": 0.1893129050731659, "learning_rate": 0.0009876257216303382, "loss": 0.0693, "step": 1988 }, { "epoch": 71.0, "eval_loss": 0.08392627537250519, "eval_runtime": 13.6535, "eval_samples_per_second": 397.7, "eval_steps_per_second": 0.806, "step": 1988 }, { "epoch": 72.0, "grad_norm": 0.10252843797206879, "learning_rate": 0.0009872760148864983, "loss": 0.0678, "step": 2016 }, { "epoch": 72.0, "eval_loss": 0.08312556147575378, "eval_runtime": 13.2392, "eval_samples_per_second": 410.145, "eval_steps_per_second": 0.831, "step": 2016 }, { "epoch": 73.0, "grad_norm": 0.10509613156318665, "learning_rate": 0.0009869214989251126, "loss": 0.0675, "step": 2044 }, { "epoch": 73.0, "eval_loss": 0.08445987850427628, "eval_runtime": 12.9266, "eval_samples_per_second": 420.065, "eval_steps_per_second": 0.851, "step": 2044 }, { "epoch": 74.0, "grad_norm": 0.12325132638216019, "learning_rate": 0.0009865621772451112, "loss": 0.0676, "step": 2072 }, { "epoch": 74.0, "eval_loss": 0.08372888714075089, "eval_runtime": 13.3229, "eval_samples_per_second": 407.57, "eval_steps_per_second": 0.826, "step": 2072 }, { "epoch": 75.0, "grad_norm": 0.1851738691329956, "learning_rate": 0.000986198053392854, "loss": 0.0675, "step": 2100 }, { "epoch": 75.0, "eval_loss": 0.08268258720636368, "eval_runtime": 13.5981, "eval_samples_per_second": 399.322, "eval_steps_per_second": 0.809, "step": 2100 }, { "epoch": 76.0, "grad_norm": 0.16348977386951447, "learning_rate": 0.0009858291309620953, "loss": 0.0673, "step": 2128 }, { "epoch": 76.0, "eval_loss": 0.08128391951322556, "eval_runtime": 13.6413, "eval_samples_per_second": 398.055, "eval_steps_per_second": 0.806, "step": 2128 } ], "logging_steps": 500, "max_steps": 28000, "num_input_tokens_seen": 0, "num_train_epochs": 1000, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 10, "early_stopping_threshold": 1e-05 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.500471276978176e+17, "train_batch_size": 512, "trial_name": null, "trial_params": null }