| { | |
| "best_global_step": 2128, | |
| "best_metric": 0.08128391951322556, | |
| "best_model_checkpoint": "tmp/out/1536-96-r2_common_channel_fcmCtx4_fcmLayers6_fcmChMixingTrue_stride24_bs512_lr0.001_a7e3/checkpoint-2128", | |
| "epoch": 76.0, | |
| "eval_steps": 500, | |
| "global_step": 2128, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.18818920850753784, | |
| "learning_rate": 0.000999997705696395, | |
| "loss": 0.1949, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.13799113035202026, | |
| "eval_runtime": 13.9945, | |
| "eval_samples_per_second": 388.011, | |
| "eval_steps_per_second": 0.786, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.17697054147720337, | |
| "learning_rate": 0.0009999904797644875, | |
| "loss": 0.1397, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.1200891062617302, | |
| "eval_runtime": 13.7764, | |
| "eval_samples_per_second": 394.151, | |
| "eval_steps_per_second": 0.798, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.12785276770591736, | |
| "learning_rate": 0.0009999783191283983, | |
| "loss": 0.1239, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 0.11045144498348236, | |
| "eval_runtime": 13.7689, | |
| "eval_samples_per_second": 394.367, | |
| "eval_steps_per_second": 0.799, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 0.15085896849632263, | |
| "learning_rate": 0.0009999612239081498, | |
| "loss": 0.1156, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 0.10991678386926651, | |
| "eval_runtime": 14.5903, | |
| "eval_samples_per_second": 372.166, | |
| "eval_steps_per_second": 0.754, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 0.1445561647415161, | |
| "learning_rate": 0.0009999391942724632, | |
| "loss": 0.1106, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 0.1034143716096878, | |
| "eval_runtime": 12.8666, | |
| "eval_samples_per_second": 422.022, | |
| "eval_steps_per_second": 0.855, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "grad_norm": 0.13405078649520874, | |
| "learning_rate": 0.000999912230438763, | |
| "loss": 0.1072, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_loss": 0.10323869436979294, | |
| "eval_runtime": 13.5764, | |
| "eval_samples_per_second": 399.957, | |
| "eval_steps_per_second": 0.81, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "grad_norm": 0.13994024693965912, | |
| "learning_rate": 0.0009998803326731703, | |
| "loss": 0.1042, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "eval_loss": 0.10014788061380386, | |
| "eval_runtime": 14.0652, | |
| "eval_samples_per_second": 386.058, | |
| "eval_steps_per_second": 0.782, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "grad_norm": 0.13133035600185394, | |
| "learning_rate": 0.0009998435012905044, | |
| "loss": 0.1025, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_loss": 0.1004691943526268, | |
| "eval_runtime": 13.771, | |
| "eval_samples_per_second": 394.307, | |
| "eval_steps_per_second": 0.799, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "grad_norm": 0.1343812197446823, | |
| "learning_rate": 0.0009998017366542756, | |
| "loss": 0.1008, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_loss": 0.09979160130023956, | |
| "eval_runtime": 14.0034, | |
| "eval_samples_per_second": 387.762, | |
| "eval_steps_per_second": 0.786, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 0.1553795337677002, | |
| "learning_rate": 0.0009997550391766844, | |
| "loss": 0.0991, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_loss": 0.098526231944561, | |
| "eval_runtime": 13.3534, | |
| "eval_samples_per_second": 406.638, | |
| "eval_steps_per_second": 0.824, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "grad_norm": 0.1313014179468155, | |
| "learning_rate": 0.000999703409318616, | |
| "loss": 0.0977, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "eval_loss": 0.09693964570760727, | |
| "eval_runtime": 14.1264, | |
| "eval_samples_per_second": 384.385, | |
| "eval_steps_per_second": 0.779, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "grad_norm": 0.14203286170959473, | |
| "learning_rate": 0.0009996468475896355, | |
| "loss": 0.0964, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "eval_loss": 0.0984538123011589, | |
| "eval_runtime": 13.8783, | |
| "eval_samples_per_second": 391.259, | |
| "eval_steps_per_second": 0.793, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "grad_norm": 0.16649910807609558, | |
| "learning_rate": 0.0009995853545479853, | |
| "loss": 0.0955, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "eval_loss": 0.09544174373149872, | |
| "eval_runtime": 13.7858, | |
| "eval_samples_per_second": 393.884, | |
| "eval_steps_per_second": 0.798, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "grad_norm": 0.11850055307149887, | |
| "learning_rate": 0.0009995189308005762, | |
| "loss": 0.0945, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "eval_loss": 0.09443824738264084, | |
| "eval_runtime": 14.0327, | |
| "eval_samples_per_second": 386.953, | |
| "eval_steps_per_second": 0.784, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "grad_norm": 0.1464478075504303, | |
| "learning_rate": 0.0009994475770029841, | |
| "loss": 0.0926, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "eval_loss": 0.09755795449018478, | |
| "eval_runtime": 13.5982, | |
| "eval_samples_per_second": 399.319, | |
| "eval_steps_per_second": 0.809, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "grad_norm": 0.13713239133358002, | |
| "learning_rate": 0.0009993712938594424, | |
| "loss": 0.0922, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "eval_loss": 0.09337516874074936, | |
| "eval_runtime": 13.7553, | |
| "eval_samples_per_second": 394.757, | |
| "eval_steps_per_second": 0.8, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "grad_norm": 0.17032985389232635, | |
| "learning_rate": 0.0009992900821228345, | |
| "loss": 0.0914, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "eval_loss": 0.09800300747156143, | |
| "eval_runtime": 13.4874, | |
| "eval_samples_per_second": 402.597, | |
| "eval_steps_per_second": 0.816, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "grad_norm": 0.11109838634729385, | |
| "learning_rate": 0.000999203942594687, | |
| "loss": 0.091, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "eval_loss": 0.09393668174743652, | |
| "eval_runtime": 13.6194, | |
| "eval_samples_per_second": 398.697, | |
| "eval_steps_per_second": 0.808, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "grad_norm": 0.13168472051620483, | |
| "learning_rate": 0.0009991128761251632, | |
| "loss": 0.0894, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "eval_loss": 0.09444674849510193, | |
| "eval_runtime": 13.817, | |
| "eval_samples_per_second": 392.994, | |
| "eval_steps_per_second": 0.796, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "grad_norm": 0.12797316908836365, | |
| "learning_rate": 0.0009990168836130527, | |
| "loss": 0.0892, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "eval_loss": 0.09130553901195526, | |
| "eval_runtime": 12.8149, | |
| "eval_samples_per_second": 423.724, | |
| "eval_steps_per_second": 0.858, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 21.0, | |
| "grad_norm": 0.12064854055643082, | |
| "learning_rate": 0.0009989159660057615, | |
| "loss": 0.0878, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 21.0, | |
| "eval_loss": 0.09315093606710434, | |
| "eval_runtime": 13.0593, | |
| "eval_samples_per_second": 415.796, | |
| "eval_steps_per_second": 0.842, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 22.0, | |
| "grad_norm": 0.11039382964372635, | |
| "learning_rate": 0.0009988101242993065, | |
| "loss": 0.0873, | |
| "step": 616 | |
| }, | |
| { | |
| "epoch": 22.0, | |
| "eval_loss": 0.09174513071775436, | |
| "eval_runtime": 13.6345, | |
| "eval_samples_per_second": 398.254, | |
| "eval_steps_per_second": 0.807, | |
| "step": 616 | |
| }, | |
| { | |
| "epoch": 23.0, | |
| "grad_norm": 0.11381122469902039, | |
| "learning_rate": 0.000998699359538303, | |
| "loss": 0.0862, | |
| "step": 644 | |
| }, | |
| { | |
| "epoch": 23.0, | |
| "eval_loss": 0.09421718120574951, | |
| "eval_runtime": 13.1437, | |
| "eval_samples_per_second": 413.126, | |
| "eval_steps_per_second": 0.837, | |
| "step": 644 | |
| }, | |
| { | |
| "epoch": 24.0, | |
| "grad_norm": 0.13068965077400208, | |
| "learning_rate": 0.0009985836728159524, | |
| "loss": 0.0862, | |
| "step": 672 | |
| }, | |
| { | |
| "epoch": 24.0, | |
| "eval_loss": 0.0908147320151329, | |
| "eval_runtime": 13.6756, | |
| "eval_samples_per_second": 397.059, | |
| "eval_steps_per_second": 0.804, | |
| "step": 672 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "grad_norm": 0.19063422083854675, | |
| "learning_rate": 0.0009984630652740383, | |
| "loss": 0.0861, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "eval_loss": 0.09025771915912628, | |
| "eval_runtime": 12.5305, | |
| "eval_samples_per_second": 433.342, | |
| "eval_steps_per_second": 0.878, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 26.0, | |
| "grad_norm": 0.1407005786895752, | |
| "learning_rate": 0.0009983375381029088, | |
| "loss": 0.0854, | |
| "step": 728 | |
| }, | |
| { | |
| "epoch": 26.0, | |
| "eval_loss": 0.08990131318569183, | |
| "eval_runtime": 13.5474, | |
| "eval_samples_per_second": 400.814, | |
| "eval_steps_per_second": 0.812, | |
| "step": 728 | |
| }, | |
| { | |
| "epoch": 27.0, | |
| "grad_norm": 0.1301293671131134, | |
| "learning_rate": 0.0009982070925414637, | |
| "loss": 0.0847, | |
| "step": 756 | |
| }, | |
| { | |
| "epoch": 27.0, | |
| "eval_loss": 0.08932916820049286, | |
| "eval_runtime": 13.7031, | |
| "eval_samples_per_second": 396.259, | |
| "eval_steps_per_second": 0.803, | |
| "step": 756 | |
| }, | |
| { | |
| "epoch": 28.0, | |
| "grad_norm": 0.15950527787208557, | |
| "learning_rate": 0.0009980717298771495, | |
| "loss": 0.0836, | |
| "step": 784 | |
| }, | |
| { | |
| "epoch": 28.0, | |
| "eval_loss": 0.08872799575328827, | |
| "eval_runtime": 12.7264, | |
| "eval_samples_per_second": 426.672, | |
| "eval_steps_per_second": 0.864, | |
| "step": 784 | |
| }, | |
| { | |
| "epoch": 29.0, | |
| "grad_norm": 0.14679257571697235, | |
| "learning_rate": 0.000997931451445941, | |
| "loss": 0.0831, | |
| "step": 812 | |
| }, | |
| { | |
| "epoch": 29.0, | |
| "eval_loss": 0.09064222127199173, | |
| "eval_runtime": 13.2879, | |
| "eval_samples_per_second": 408.642, | |
| "eval_steps_per_second": 0.828, | |
| "step": 812 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "grad_norm": 0.1313679814338684, | |
| "learning_rate": 0.0009977862586323298, | |
| "loss": 0.0827, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "eval_loss": 0.09007434546947479, | |
| "eval_runtime": 13.714, | |
| "eval_samples_per_second": 395.947, | |
| "eval_steps_per_second": 0.802, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 31.0, | |
| "grad_norm": 0.1265498697757721, | |
| "learning_rate": 0.0009976361528693104, | |
| "loss": 0.0819, | |
| "step": 868 | |
| }, | |
| { | |
| "epoch": 31.0, | |
| "eval_loss": 0.08951247483491898, | |
| "eval_runtime": 13.4697, | |
| "eval_samples_per_second": 403.128, | |
| "eval_steps_per_second": 0.817, | |
| "step": 868 | |
| }, | |
| { | |
| "epoch": 32.0, | |
| "grad_norm": 0.1129317358136177, | |
| "learning_rate": 0.0009974811356383668, | |
| "loss": 0.0819, | |
| "step": 896 | |
| }, | |
| { | |
| "epoch": 32.0, | |
| "eval_loss": 0.09060715138912201, | |
| "eval_runtime": 13.0035, | |
| "eval_samples_per_second": 417.579, | |
| "eval_steps_per_second": 0.846, | |
| "step": 896 | |
| }, | |
| { | |
| "epoch": 33.0, | |
| "grad_norm": 0.12748871743679047, | |
| "learning_rate": 0.0009973212084694557, | |
| "loss": 0.0812, | |
| "step": 924 | |
| }, | |
| { | |
| "epoch": 33.0, | |
| "eval_loss": 0.0907522663474083, | |
| "eval_runtime": 14.0967, | |
| "eval_samples_per_second": 385.196, | |
| "eval_steps_per_second": 0.78, | |
| "step": 924 | |
| }, | |
| { | |
| "epoch": 34.0, | |
| "grad_norm": 0.10055958479642868, | |
| "learning_rate": 0.000997156372940993, | |
| "loss": 0.0815, | |
| "step": 952 | |
| }, | |
| { | |
| "epoch": 34.0, | |
| "eval_loss": 0.08895213901996613, | |
| "eval_runtime": 13.4637, | |
| "eval_samples_per_second": 403.308, | |
| "eval_steps_per_second": 0.817, | |
| "step": 952 | |
| }, | |
| { | |
| "epoch": 35.0, | |
| "grad_norm": 0.12697456777095795, | |
| "learning_rate": 0.00099698663067984, | |
| "loss": 0.0805, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 35.0, | |
| "eval_loss": 0.08886592090129852, | |
| "eval_runtime": 13.6328, | |
| "eval_samples_per_second": 398.305, | |
| "eval_steps_per_second": 0.807, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 36.0, | |
| "grad_norm": 0.20513515174388885, | |
| "learning_rate": 0.0009968119833612843, | |
| "loss": 0.0804, | |
| "step": 1008 | |
| }, | |
| { | |
| "epoch": 36.0, | |
| "eval_loss": 0.08658694475889206, | |
| "eval_runtime": 13.2186, | |
| "eval_samples_per_second": 410.785, | |
| "eval_steps_per_second": 0.832, | |
| "step": 1008 | |
| }, | |
| { | |
| "epoch": 37.0, | |
| "grad_norm": 0.1370176076889038, | |
| "learning_rate": 0.000996632432709024, | |
| "loss": 0.0803, | |
| "step": 1036 | |
| }, | |
| { | |
| "epoch": 37.0, | |
| "eval_loss": 0.08886294066905975, | |
| "eval_runtime": 13.4889, | |
| "eval_samples_per_second": 402.554, | |
| "eval_steps_per_second": 0.815, | |
| "step": 1036 | |
| }, | |
| { | |
| "epoch": 38.0, | |
| "grad_norm": 0.13838233053684235, | |
| "learning_rate": 0.0009964479804951505, | |
| "loss": 0.0795, | |
| "step": 1064 | |
| }, | |
| { | |
| "epoch": 38.0, | |
| "eval_loss": 0.0883188471198082, | |
| "eval_runtime": 13.4375, | |
| "eval_samples_per_second": 404.093, | |
| "eval_steps_per_second": 0.819, | |
| "step": 1064 | |
| }, | |
| { | |
| "epoch": 39.0, | |
| "grad_norm": 0.1312042474746704, | |
| "learning_rate": 0.000996258628540135, | |
| "loss": 0.0786, | |
| "step": 1092 | |
| }, | |
| { | |
| "epoch": 39.0, | |
| "eval_loss": 0.0872192457318306, | |
| "eval_runtime": 13.5729, | |
| "eval_samples_per_second": 400.063, | |
| "eval_steps_per_second": 0.81, | |
| "step": 1092 | |
| }, | |
| { | |
| "epoch": 40.0, | |
| "grad_norm": 0.1502840369939804, | |
| "learning_rate": 0.0009960643787128027, | |
| "loss": 0.0783, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 40.0, | |
| "eval_loss": 0.08811386674642563, | |
| "eval_runtime": 13.7911, | |
| "eval_samples_per_second": 393.732, | |
| "eval_steps_per_second": 0.798, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 41.0, | |
| "grad_norm": 0.10295199602842331, | |
| "learning_rate": 0.0009958652329303218, | |
| "loss": 0.0779, | |
| "step": 1148 | |
| }, | |
| { | |
| "epoch": 41.0, | |
| "eval_loss": 0.08700462430715561, | |
| "eval_runtime": 12.9613, | |
| "eval_samples_per_second": 418.94, | |
| "eval_steps_per_second": 0.849, | |
| "step": 1148 | |
| }, | |
| { | |
| "epoch": 42.0, | |
| "grad_norm": 0.14093568921089172, | |
| "learning_rate": 0.0009956611931581812, | |
| "loss": 0.0779, | |
| "step": 1176 | |
| }, | |
| { | |
| "epoch": 42.0, | |
| "eval_loss": 0.08765130490064621, | |
| "eval_runtime": 13.6238, | |
| "eval_samples_per_second": 398.567, | |
| "eval_steps_per_second": 0.807, | |
| "step": 1176 | |
| }, | |
| { | |
| "epoch": 43.0, | |
| "grad_norm": 0.12385249137878418, | |
| "learning_rate": 0.00099545226141017, | |
| "loss": 0.0771, | |
| "step": 1204 | |
| }, | |
| { | |
| "epoch": 43.0, | |
| "eval_loss": 0.08630397915840149, | |
| "eval_runtime": 13.9701, | |
| "eval_samples_per_second": 388.687, | |
| "eval_steps_per_second": 0.787, | |
| "step": 1204 | |
| }, | |
| { | |
| "epoch": 44.0, | |
| "grad_norm": 0.12912018597126007, | |
| "learning_rate": 0.000995238439748361, | |
| "loss": 0.0765, | |
| "step": 1232 | |
| }, | |
| { | |
| "epoch": 44.0, | |
| "eval_loss": 0.08651801943778992, | |
| "eval_runtime": 14.0705, | |
| "eval_samples_per_second": 385.913, | |
| "eval_steps_per_second": 0.782, | |
| "step": 1232 | |
| }, | |
| { | |
| "epoch": 45.0, | |
| "grad_norm": 0.13436101377010345, | |
| "learning_rate": 0.000995019730283088, | |
| "loss": 0.0768, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 45.0, | |
| "eval_loss": 0.08759938925504684, | |
| "eval_runtime": 13.8608, | |
| "eval_samples_per_second": 391.754, | |
| "eval_steps_per_second": 0.794, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 46.0, | |
| "grad_norm": 0.1179327666759491, | |
| "learning_rate": 0.000994796135172924, | |
| "loss": 0.076, | |
| "step": 1288 | |
| }, | |
| { | |
| "epoch": 46.0, | |
| "eval_loss": 0.08753702789545059, | |
| "eval_runtime": 13.1322, | |
| "eval_samples_per_second": 413.488, | |
| "eval_steps_per_second": 0.838, | |
| "step": 1288 | |
| }, | |
| { | |
| "epoch": 47.0, | |
| "grad_norm": 0.12396474927663803, | |
| "learning_rate": 0.0009945676566246633, | |
| "loss": 0.0758, | |
| "step": 1316 | |
| }, | |
| { | |
| "epoch": 47.0, | |
| "eval_loss": 0.0872747004032135, | |
| "eval_runtime": 13.9301, | |
| "eval_samples_per_second": 389.803, | |
| "eval_steps_per_second": 0.79, | |
| "step": 1316 | |
| }, | |
| { | |
| "epoch": 48.0, | |
| "grad_norm": 0.13964654505252838, | |
| "learning_rate": 0.0009943342968932972, | |
| "loss": 0.0753, | |
| "step": 1344 | |
| }, | |
| { | |
| "epoch": 48.0, | |
| "eval_loss": 0.08765023946762085, | |
| "eval_runtime": 14.1213, | |
| "eval_samples_per_second": 384.525, | |
| "eval_steps_per_second": 0.779, | |
| "step": 1344 | |
| }, | |
| { | |
| "epoch": 49.0, | |
| "grad_norm": 0.14741012454032898, | |
| "learning_rate": 0.0009940960582819915, | |
| "loss": 0.0751, | |
| "step": 1372 | |
| }, | |
| { | |
| "epoch": 49.0, | |
| "eval_loss": 0.08702078461647034, | |
| "eval_runtime": 14.1874, | |
| "eval_samples_per_second": 382.734, | |
| "eval_steps_per_second": 0.775, | |
| "step": 1372 | |
| }, | |
| { | |
| "epoch": 50.0, | |
| "grad_norm": 0.1287020742893219, | |
| "learning_rate": 0.0009938529431420646, | |
| "loss": 0.075, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 50.0, | |
| "eval_loss": 0.08901604264974594, | |
| "eval_runtime": 13.9742, | |
| "eval_samples_per_second": 388.575, | |
| "eval_steps_per_second": 0.787, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 51.0, | |
| "grad_norm": 0.12099709361791611, | |
| "learning_rate": 0.0009936049538729656, | |
| "loss": 0.0748, | |
| "step": 1428 | |
| }, | |
| { | |
| "epoch": 51.0, | |
| "eval_loss": 0.08469326049089432, | |
| "eval_runtime": 13.9824, | |
| "eval_samples_per_second": 388.345, | |
| "eval_steps_per_second": 0.787, | |
| "step": 1428 | |
| }, | |
| { | |
| "epoch": 52.0, | |
| "grad_norm": 0.10210338979959488, | |
| "learning_rate": 0.0009933520929222485, | |
| "loss": 0.0742, | |
| "step": 1456 | |
| }, | |
| { | |
| "epoch": 52.0, | |
| "eval_loss": 0.08571095019578934, | |
| "eval_runtime": 14.305, | |
| "eval_samples_per_second": 379.588, | |
| "eval_steps_per_second": 0.769, | |
| "step": 1456 | |
| }, | |
| { | |
| "epoch": 53.0, | |
| "grad_norm": 0.21403531730175018, | |
| "learning_rate": 0.0009930943627855485, | |
| "loss": 0.0738, | |
| "step": 1484 | |
| }, | |
| { | |
| "epoch": 53.0, | |
| "eval_loss": 0.08666189014911652, | |
| "eval_runtime": 13.2505, | |
| "eval_samples_per_second": 409.795, | |
| "eval_steps_per_second": 0.83, | |
| "step": 1484 | |
| }, | |
| { | |
| "epoch": 54.0, | |
| "grad_norm": 0.15750914812088013, | |
| "learning_rate": 0.0009928317660065577, | |
| "loss": 0.073, | |
| "step": 1512 | |
| }, | |
| { | |
| "epoch": 54.0, | |
| "eval_loss": 0.08511006087064743, | |
| "eval_runtime": 13.0651, | |
| "eval_samples_per_second": 415.611, | |
| "eval_steps_per_second": 0.842, | |
| "step": 1512 | |
| }, | |
| { | |
| "epoch": 55.0, | |
| "grad_norm": 0.13378620147705078, | |
| "learning_rate": 0.000992564305177001, | |
| "loss": 0.0731, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 55.0, | |
| "eval_loss": 0.08461768925189972, | |
| "eval_runtime": 13.8593, | |
| "eval_samples_per_second": 391.796, | |
| "eval_steps_per_second": 0.794, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 56.0, | |
| "grad_norm": 0.10955790430307388, | |
| "learning_rate": 0.0009922919829366086, | |
| "loss": 0.0731, | |
| "step": 1568 | |
| }, | |
| { | |
| "epoch": 56.0, | |
| "eval_loss": 0.0852380245923996, | |
| "eval_runtime": 13.9377, | |
| "eval_samples_per_second": 389.59, | |
| "eval_steps_per_second": 0.789, | |
| "step": 1568 | |
| }, | |
| { | |
| "epoch": 57.0, | |
| "grad_norm": 0.13638441264629364, | |
| "learning_rate": 0.0009920148019730913, | |
| "loss": 0.072, | |
| "step": 1596 | |
| }, | |
| { | |
| "epoch": 57.0, | |
| "eval_loss": 0.08537213504314423, | |
| "eval_runtime": 13.8736, | |
| "eval_samples_per_second": 391.392, | |
| "eval_steps_per_second": 0.793, | |
| "step": 1596 | |
| }, | |
| { | |
| "epoch": 58.0, | |
| "grad_norm": 0.1874396950006485, | |
| "learning_rate": 0.0009917327650221124, | |
| "loss": 0.0719, | |
| "step": 1624 | |
| }, | |
| { | |
| "epoch": 58.0, | |
| "eval_loss": 0.0842534676194191, | |
| "eval_runtime": 14.039, | |
| "eval_samples_per_second": 386.779, | |
| "eval_steps_per_second": 0.784, | |
| "step": 1624 | |
| }, | |
| { | |
| "epoch": 59.0, | |
| "grad_norm": 0.12774085998535156, | |
| "learning_rate": 0.0009914458748672634, | |
| "loss": 0.0717, | |
| "step": 1652 | |
| }, | |
| { | |
| "epoch": 59.0, | |
| "eval_loss": 0.08502174913883209, | |
| "eval_runtime": 14.0351, | |
| "eval_samples_per_second": 386.886, | |
| "eval_steps_per_second": 0.784, | |
| "step": 1652 | |
| }, | |
| { | |
| "epoch": 60.0, | |
| "grad_norm": 0.13431188464164734, | |
| "learning_rate": 0.000991154134340034, | |
| "loss": 0.071, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 60.0, | |
| "eval_loss": 0.08464298397302628, | |
| "eval_runtime": 14.5396, | |
| "eval_samples_per_second": 373.462, | |
| "eval_steps_per_second": 0.757, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 61.0, | |
| "grad_norm": 0.12022250145673752, | |
| "learning_rate": 0.0009908575463197854, | |
| "loss": 0.071, | |
| "step": 1708 | |
| }, | |
| { | |
| "epoch": 61.0, | |
| "eval_loss": 0.0845947265625, | |
| "eval_runtime": 13.51, | |
| "eval_samples_per_second": 401.924, | |
| "eval_steps_per_second": 0.814, | |
| "step": 1708 | |
| }, | |
| { | |
| "epoch": 62.0, | |
| "grad_norm": 0.14051498472690582, | |
| "learning_rate": 0.0009905561137337224, | |
| "loss": 0.0706, | |
| "step": 1736 | |
| }, | |
| { | |
| "epoch": 62.0, | |
| "eval_loss": 0.08576343953609467, | |
| "eval_runtime": 13.5918, | |
| "eval_samples_per_second": 399.506, | |
| "eval_steps_per_second": 0.809, | |
| "step": 1736 | |
| }, | |
| { | |
| "epoch": 63.0, | |
| "grad_norm": 0.12124724686145782, | |
| "learning_rate": 0.0009902498395568619, | |
| "loss": 0.0701, | |
| "step": 1764 | |
| }, | |
| { | |
| "epoch": 63.0, | |
| "eval_loss": 0.08432195335626602, | |
| "eval_runtime": 13.7606, | |
| "eval_samples_per_second": 394.605, | |
| "eval_steps_per_second": 0.799, | |
| "step": 1764 | |
| }, | |
| { | |
| "epoch": 64.0, | |
| "grad_norm": 0.15731370449066162, | |
| "learning_rate": 0.0009899387268120072, | |
| "loss": 0.0701, | |
| "step": 1792 | |
| }, | |
| { | |
| "epoch": 64.0, | |
| "eval_loss": 0.08528588712215424, | |
| "eval_runtime": 13.8086, | |
| "eval_samples_per_second": 393.233, | |
| "eval_steps_per_second": 0.797, | |
| "step": 1792 | |
| }, | |
| { | |
| "epoch": 65.0, | |
| "grad_norm": 0.12616483867168427, | |
| "learning_rate": 0.0009896227785697153, | |
| "loss": 0.0697, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 65.0, | |
| "eval_loss": 0.08514665067195892, | |
| "eval_runtime": 13.7015, | |
| "eval_samples_per_second": 396.308, | |
| "eval_steps_per_second": 0.803, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 66.0, | |
| "grad_norm": 0.12217582017183304, | |
| "learning_rate": 0.0009893019979482674, | |
| "loss": 0.0697, | |
| "step": 1848 | |
| }, | |
| { | |
| "epoch": 66.0, | |
| "eval_loss": 0.08603604882955551, | |
| "eval_runtime": 13.8552, | |
| "eval_samples_per_second": 391.91, | |
| "eval_steps_per_second": 0.794, | |
| "step": 1848 | |
| }, | |
| { | |
| "epoch": 67.0, | |
| "grad_norm": 0.1277308613061905, | |
| "learning_rate": 0.0009889763881136386, | |
| "loss": 0.0693, | |
| "step": 1876 | |
| }, | |
| { | |
| "epoch": 67.0, | |
| "eval_loss": 0.08344025909900665, | |
| "eval_runtime": 13.4274, | |
| "eval_samples_per_second": 404.396, | |
| "eval_steps_per_second": 0.819, | |
| "step": 1876 | |
| }, | |
| { | |
| "epoch": 68.0, | |
| "grad_norm": 0.18230511248111725, | |
| "learning_rate": 0.0009886459522794678, | |
| "loss": 0.0692, | |
| "step": 1904 | |
| }, | |
| { | |
| "epoch": 68.0, | |
| "eval_loss": 0.08314071595668793, | |
| "eval_runtime": 13.346, | |
| "eval_samples_per_second": 406.863, | |
| "eval_steps_per_second": 0.824, | |
| "step": 1904 | |
| }, | |
| { | |
| "epoch": 69.0, | |
| "grad_norm": 0.17872166633605957, | |
| "learning_rate": 0.0009883106937070216, | |
| "loss": 0.0694, | |
| "step": 1932 | |
| }, | |
| { | |
| "epoch": 69.0, | |
| "eval_loss": 0.08490563184022903, | |
| "eval_runtime": 14.1692, | |
| "eval_samples_per_second": 383.224, | |
| "eval_steps_per_second": 0.776, | |
| "step": 1932 | |
| }, | |
| { | |
| "epoch": 70.0, | |
| "grad_norm": 0.1801847666501999, | |
| "learning_rate": 0.000987970615705167, | |
| "loss": 0.0691, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 70.0, | |
| "eval_loss": 0.08387839794158936, | |
| "eval_runtime": 13.4426, | |
| "eval_samples_per_second": 403.94, | |
| "eval_steps_per_second": 0.818, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 71.0, | |
| "grad_norm": 0.1893129050731659, | |
| "learning_rate": 0.0009876257216303382, | |
| "loss": 0.0693, | |
| "step": 1988 | |
| }, | |
| { | |
| "epoch": 71.0, | |
| "eval_loss": 0.08392627537250519, | |
| "eval_runtime": 13.6535, | |
| "eval_samples_per_second": 397.7, | |
| "eval_steps_per_second": 0.806, | |
| "step": 1988 | |
| }, | |
| { | |
| "epoch": 72.0, | |
| "grad_norm": 0.10252843797206879, | |
| "learning_rate": 0.0009872760148864983, | |
| "loss": 0.0678, | |
| "step": 2016 | |
| }, | |
| { | |
| "epoch": 72.0, | |
| "eval_loss": 0.08312556147575378, | |
| "eval_runtime": 13.2392, | |
| "eval_samples_per_second": 410.145, | |
| "eval_steps_per_second": 0.831, | |
| "step": 2016 | |
| }, | |
| { | |
| "epoch": 73.0, | |
| "grad_norm": 0.10509613156318665, | |
| "learning_rate": 0.0009869214989251126, | |
| "loss": 0.0675, | |
| "step": 2044 | |
| }, | |
| { | |
| "epoch": 73.0, | |
| "eval_loss": 0.08445987850427628, | |
| "eval_runtime": 12.9266, | |
| "eval_samples_per_second": 420.065, | |
| "eval_steps_per_second": 0.851, | |
| "step": 2044 | |
| }, | |
| { | |
| "epoch": 74.0, | |
| "grad_norm": 0.12325132638216019, | |
| "learning_rate": 0.0009865621772451112, | |
| "loss": 0.0676, | |
| "step": 2072 | |
| }, | |
| { | |
| "epoch": 74.0, | |
| "eval_loss": 0.08372888714075089, | |
| "eval_runtime": 13.3229, | |
| "eval_samples_per_second": 407.57, | |
| "eval_steps_per_second": 0.826, | |
| "step": 2072 | |
| }, | |
| { | |
| "epoch": 75.0, | |
| "grad_norm": 0.1851738691329956, | |
| "learning_rate": 0.000986198053392854, | |
| "loss": 0.0675, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 75.0, | |
| "eval_loss": 0.08268258720636368, | |
| "eval_runtime": 13.5981, | |
| "eval_samples_per_second": 399.322, | |
| "eval_steps_per_second": 0.809, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 76.0, | |
| "grad_norm": 0.16348977386951447, | |
| "learning_rate": 0.0009858291309620953, | |
| "loss": 0.0673, | |
| "step": 2128 | |
| }, | |
| { | |
| "epoch": 76.0, | |
| "eval_loss": 0.08128391951322556, | |
| "eval_runtime": 13.6413, | |
| "eval_samples_per_second": 398.055, | |
| "eval_steps_per_second": 0.806, | |
| "step": 2128 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 28000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1000, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 10, | |
| "early_stopping_threshold": 1e-05 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 0 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.500471276978176e+17, | |
| "train_batch_size": 512, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |