{ "best_global_step": 138718, "best_metric": 0.9915470627263667, "best_model_checkpoint": "/home/skwon01/scratch/sibal/finetuned_models/serengeti_camera_ready/checkpoint-138718", "epoch": 2.0, "eval_steps": 1000.0, "global_step": 138718, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007208869793393791, "grad_norm": 2.880587577819824, "learning_rate": 1.9985611095892387e-05, "loss": 3.675, "step": 500 }, { "epoch": 0.014417739586787583, "grad_norm": 3.1965484619140625, "learning_rate": 1.99711933563056e-05, "loss": 1.3703, "step": 1000 }, { "epoch": 0.021626609380181374, "grad_norm": 3.587383270263672, "learning_rate": 1.9956775616718814e-05, "loss": 0.7317, "step": 1500 }, { "epoch": 0.028835479173575165, "grad_norm": 2.73246169090271, "learning_rate": 1.9942357877132026e-05, "loss": 0.4764, "step": 2000 }, { "epoch": 0.03604434896696896, "grad_norm": 3.9599311351776123, "learning_rate": 1.9927940137545237e-05, "loss": 0.3488, "step": 2500 }, { "epoch": 0.04325321876036275, "grad_norm": 3.690446138381958, "learning_rate": 1.991352239795845e-05, "loss": 0.2729, "step": 3000 }, { "epoch": 0.05046208855375654, "grad_norm": 3.0428125858306885, "learning_rate": 1.989910465837166e-05, "loss": 0.2249, "step": 3500 }, { "epoch": 0.05767095834715033, "grad_norm": 2.6362786293029785, "learning_rate": 1.9884686918784876e-05, "loss": 0.1907, "step": 4000 }, { "epoch": 0.06487982814054413, "grad_norm": 4.072872161865234, "learning_rate": 1.9870269179198087e-05, "loss": 0.1695, "step": 4500 }, { "epoch": 0.07208869793393792, "grad_norm": 2.4177143573760986, "learning_rate": 1.98558514396113e-05, "loss": 0.1535, "step": 5000 }, { "epoch": 0.07929756772733171, "grad_norm": 2.4438438415527344, "learning_rate": 1.9841433700024514e-05, "loss": 0.1429, "step": 5500 }, { "epoch": 0.0865064375207255, "grad_norm": 1.9982225894927979, "learning_rate": 1.9827015960437722e-05, "loss": 0.1348, "step": 6000 }, { "epoch": 0.0937153073141193, "grad_norm": 2.988769769668579, "learning_rate": 1.9812598220850938e-05, "loss": 0.1226, "step": 6500 }, { "epoch": 0.10092417710751309, "grad_norm": 2.386380672454834, "learning_rate": 1.979818048126415e-05, "loss": 0.1168, "step": 7000 }, { "epoch": 0.10813304690090687, "grad_norm": 1.9924527406692505, "learning_rate": 1.978376274167736e-05, "loss": 0.1082, "step": 7500 }, { "epoch": 0.11534191669430066, "grad_norm": 1.9020510911941528, "learning_rate": 1.9769345002090573e-05, "loss": 0.1064, "step": 8000 }, { "epoch": 0.12255078648769446, "grad_norm": 2.333510160446167, "learning_rate": 1.9754927262503788e-05, "loss": 0.1029, "step": 8500 }, { "epoch": 0.12975965628108826, "grad_norm": 2.677407741546631, "learning_rate": 1.9740509522917e-05, "loss": 0.0995, "step": 9000 }, { "epoch": 0.13696852607448204, "grad_norm": 1.5480279922485352, "learning_rate": 1.972609178333021e-05, "loss": 0.0948, "step": 9500 }, { "epoch": 0.14417739586787584, "grad_norm": 2.630037546157837, "learning_rate": 1.9711674043743423e-05, "loss": 0.0937, "step": 10000 }, { "epoch": 0.15138626566126961, "grad_norm": 2.267946243286133, "learning_rate": 1.9697256304156634e-05, "loss": 0.0909, "step": 10500 }, { "epoch": 0.15859513545466342, "grad_norm": 2.932375907897949, "learning_rate": 1.968283856456985e-05, "loss": 0.0889, "step": 11000 }, { "epoch": 0.16580400524805722, "grad_norm": 2.69350528717041, "learning_rate": 1.966842082498306e-05, "loss": 0.086, "step": 11500 }, { "epoch": 0.173012875041451, "grad_norm": 2.1316378116607666, "learning_rate": 1.9654003085396273e-05, "loss": 0.0843, "step": 12000 }, { "epoch": 0.1802217448348448, "grad_norm": 2.52103853225708, "learning_rate": 1.9639585345809488e-05, "loss": 0.0828, "step": 12500 }, { "epoch": 0.1874306146282386, "grad_norm": 1.939334511756897, "learning_rate": 1.9625167606222696e-05, "loss": 0.0795, "step": 13000 }, { "epoch": 0.19463948442163237, "grad_norm": 2.3057949542999268, "learning_rate": 1.961074986663591e-05, "loss": 0.0786, "step": 13500 }, { "epoch": 0.20184835421502617, "grad_norm": 2.0021777153015137, "learning_rate": 1.9596332127049123e-05, "loss": 0.0773, "step": 14000 }, { "epoch": 0.20905722400841997, "grad_norm": 2.276421546936035, "learning_rate": 1.9581914387462335e-05, "loss": 0.0772, "step": 14500 }, { "epoch": 0.21626609380181375, "grad_norm": 2.426966428756714, "learning_rate": 1.9567496647875546e-05, "loss": 0.0746, "step": 15000 }, { "epoch": 0.22347496359520755, "grad_norm": 1.984330415725708, "learning_rate": 1.955307890828876e-05, "loss": 0.074, "step": 15500 }, { "epoch": 0.23068383338860132, "grad_norm": 2.1131157875061035, "learning_rate": 1.9538661168701973e-05, "loss": 0.0754, "step": 16000 }, { "epoch": 0.23789270318199512, "grad_norm": 2.672717332839966, "learning_rate": 1.9524243429115185e-05, "loss": 0.0719, "step": 16500 }, { "epoch": 0.24510157297538893, "grad_norm": 1.4720840454101562, "learning_rate": 1.9509825689528396e-05, "loss": 0.0689, "step": 17000 }, { "epoch": 0.25231044276878273, "grad_norm": 1.7824233770370483, "learning_rate": 1.9495407949941608e-05, "loss": 0.0711, "step": 17500 }, { "epoch": 0.25951931256217653, "grad_norm": 1.7139828205108643, "learning_rate": 1.9480990210354823e-05, "loss": 0.067, "step": 18000 }, { "epoch": 0.2667281823555703, "grad_norm": 2.2731082439422607, "learning_rate": 1.9466572470768035e-05, "loss": 0.0678, "step": 18500 }, { "epoch": 0.2739370521489641, "grad_norm": 2.2537448406219482, "learning_rate": 1.9452154731181247e-05, "loss": 0.0657, "step": 19000 }, { "epoch": 0.2811459219423579, "grad_norm": 3.0216615200042725, "learning_rate": 1.943773699159446e-05, "loss": 0.0656, "step": 19500 }, { "epoch": 0.2883547917357517, "grad_norm": 1.4544578790664673, "learning_rate": 1.942331925200767e-05, "loss": 0.0658, "step": 20000 }, { "epoch": 0.2955636615291455, "grad_norm": 2.4549198150634766, "learning_rate": 1.9408901512420885e-05, "loss": 0.0641, "step": 20500 }, { "epoch": 0.30277253132253923, "grad_norm": 1.514060616493225, "learning_rate": 1.9394483772834097e-05, "loss": 0.0633, "step": 21000 }, { "epoch": 0.30998140111593303, "grad_norm": 2.4346635341644287, "learning_rate": 1.9380066033247308e-05, "loss": 0.0627, "step": 21500 }, { "epoch": 0.31719027090932683, "grad_norm": 1.432133436203003, "learning_rate": 1.9365648293660523e-05, "loss": 0.0616, "step": 22000 }, { "epoch": 0.32439914070272063, "grad_norm": 1.2359411716461182, "learning_rate": 1.9351230554073735e-05, "loss": 0.0628, "step": 22500 }, { "epoch": 0.33160801049611444, "grad_norm": 2.1902575492858887, "learning_rate": 1.9336812814486947e-05, "loss": 0.0628, "step": 23000 }, { "epoch": 0.33881688028950824, "grad_norm": 1.7415978908538818, "learning_rate": 1.932239507490016e-05, "loss": 0.0616, "step": 23500 }, { "epoch": 0.346025750082902, "grad_norm": 1.401383399963379, "learning_rate": 1.930797733531337e-05, "loss": 0.0589, "step": 24000 }, { "epoch": 0.3532346198762958, "grad_norm": 1.5828105211257935, "learning_rate": 1.9293559595726582e-05, "loss": 0.0604, "step": 24500 }, { "epoch": 0.3604434896696896, "grad_norm": 0.8541142344474792, "learning_rate": 1.9279141856139797e-05, "loss": 0.0599, "step": 25000 }, { "epoch": 0.3676523594630834, "grad_norm": 2.8157145977020264, "learning_rate": 1.926472411655301e-05, "loss": 0.0593, "step": 25500 }, { "epoch": 0.3748612292564772, "grad_norm": 2.129725217819214, "learning_rate": 1.925030637696622e-05, "loss": 0.0578, "step": 26000 }, { "epoch": 0.38207009904987094, "grad_norm": 2.5838279724121094, "learning_rate": 1.9235888637379435e-05, "loss": 0.0574, "step": 26500 }, { "epoch": 0.38927896884326474, "grad_norm": 1.7000998258590698, "learning_rate": 1.9221470897792647e-05, "loss": 0.0553, "step": 27000 }, { "epoch": 0.39648783863665854, "grad_norm": 1.2641727924346924, "learning_rate": 1.920705315820586e-05, "loss": 0.0549, "step": 27500 }, { "epoch": 0.40369670843005234, "grad_norm": 1.7529101371765137, "learning_rate": 1.919263541861907e-05, "loss": 0.0562, "step": 28000 }, { "epoch": 0.41090557822344614, "grad_norm": 1.4027022123336792, "learning_rate": 1.9178217679032282e-05, "loss": 0.0552, "step": 28500 }, { "epoch": 0.41811444801683995, "grad_norm": 1.6767141819000244, "learning_rate": 1.9163799939445497e-05, "loss": 0.0572, "step": 29000 }, { "epoch": 0.4253233178102337, "grad_norm": 0.8946545720100403, "learning_rate": 1.914938219985871e-05, "loss": 0.0556, "step": 29500 }, { "epoch": 0.4325321876036275, "grad_norm": 2.469862937927246, "learning_rate": 1.913496446027192e-05, "loss": 0.0546, "step": 30000 }, { "epoch": 0.4397410573970213, "grad_norm": 3.368171215057373, "learning_rate": 1.9120546720685132e-05, "loss": 0.0527, "step": 30500 }, { "epoch": 0.4469499271904151, "grad_norm": 2.107477903366089, "learning_rate": 1.9106128981098344e-05, "loss": 0.0538, "step": 31000 }, { "epoch": 0.4541587969838089, "grad_norm": 1.8676276206970215, "learning_rate": 1.9091711241511555e-05, "loss": 0.0529, "step": 31500 }, { "epoch": 0.46136766677720265, "grad_norm": 1.8789501190185547, "learning_rate": 1.907729350192477e-05, "loss": 0.0525, "step": 32000 }, { "epoch": 0.46857653657059645, "grad_norm": 1.8588016033172607, "learning_rate": 1.9062875762337982e-05, "loss": 0.0519, "step": 32500 }, { "epoch": 0.47578540636399025, "grad_norm": 1.6721725463867188, "learning_rate": 1.9048458022751194e-05, "loss": 0.0508, "step": 33000 }, { "epoch": 0.48299427615738405, "grad_norm": 1.9724555015563965, "learning_rate": 1.903404028316441e-05, "loss": 0.0502, "step": 33500 }, { "epoch": 0.49020314595077785, "grad_norm": 1.9921311140060425, "learning_rate": 1.901962254357762e-05, "loss": 0.051, "step": 34000 }, { "epoch": 0.49741201574417165, "grad_norm": 2.889782190322876, "learning_rate": 1.9005204803990832e-05, "loss": 0.0518, "step": 34500 }, { "epoch": 0.5046208855375655, "grad_norm": 1.7622694969177246, "learning_rate": 1.8990787064404044e-05, "loss": 0.0494, "step": 35000 }, { "epoch": 0.5118297553309592, "grad_norm": 1.713699460029602, "learning_rate": 1.8976369324817256e-05, "loss": 0.0493, "step": 35500 }, { "epoch": 0.5190386251243531, "grad_norm": 1.262862205505371, "learning_rate": 1.896195158523047e-05, "loss": 0.0496, "step": 36000 }, { "epoch": 0.5262474949177468, "grad_norm": 2.085010051727295, "learning_rate": 1.8947533845643682e-05, "loss": 0.0509, "step": 36500 }, { "epoch": 0.5334563647111406, "grad_norm": 1.6257765293121338, "learning_rate": 1.8933116106056894e-05, "loss": 0.0498, "step": 37000 }, { "epoch": 0.5406652345045344, "grad_norm": 0.6558777093887329, "learning_rate": 1.8918698366470106e-05, "loss": 0.0484, "step": 37500 }, { "epoch": 0.5478741042979282, "grad_norm": 1.7351698875427246, "learning_rate": 1.8904280626883318e-05, "loss": 0.0496, "step": 38000 }, { "epoch": 0.555082974091322, "grad_norm": 0.915392279624939, "learning_rate": 1.888986288729653e-05, "loss": 0.0467, "step": 38500 }, { "epoch": 0.5622918438847158, "grad_norm": 0.9719710350036621, "learning_rate": 1.8875445147709744e-05, "loss": 0.0491, "step": 39000 }, { "epoch": 0.5695007136781095, "grad_norm": 0.4347970485687256, "learning_rate": 1.8861027408122956e-05, "loss": 0.0478, "step": 39500 }, { "epoch": 0.5767095834715034, "grad_norm": 1.4013206958770752, "learning_rate": 1.8846609668536168e-05, "loss": 0.0482, "step": 40000 }, { "epoch": 0.5839184532648971, "grad_norm": 1.6916135549545288, "learning_rate": 1.8832191928949383e-05, "loss": 0.0487, "step": 40500 }, { "epoch": 0.591127323058291, "grad_norm": 1.1497479677200317, "learning_rate": 1.8817774189362594e-05, "loss": 0.0473, "step": 41000 }, { "epoch": 0.5983361928516847, "grad_norm": 2.1202707290649414, "learning_rate": 1.8803356449775806e-05, "loss": 0.046, "step": 41500 }, { "epoch": 0.6055450626450785, "grad_norm": 1.8288294076919556, "learning_rate": 1.8788938710189018e-05, "loss": 0.0473, "step": 42000 }, { "epoch": 0.6127539324384723, "grad_norm": 0.8600142598152161, "learning_rate": 1.877452097060223e-05, "loss": 0.0452, "step": 42500 }, { "epoch": 0.6199628022318661, "grad_norm": 2.8069839477539062, "learning_rate": 1.8760103231015445e-05, "loss": 0.048, "step": 43000 }, { "epoch": 0.6271716720252599, "grad_norm": 0.8850429058074951, "learning_rate": 1.8745685491428656e-05, "loss": 0.0474, "step": 43500 }, { "epoch": 0.6343805418186537, "grad_norm": 1.063219666481018, "learning_rate": 1.8731267751841868e-05, "loss": 0.0446, "step": 44000 }, { "epoch": 0.6415894116120474, "grad_norm": 1.3925724029541016, "learning_rate": 1.871685001225508e-05, "loss": 0.0468, "step": 44500 }, { "epoch": 0.6487982814054413, "grad_norm": 0.9575428366661072, "learning_rate": 1.870243227266829e-05, "loss": 0.0447, "step": 45000 }, { "epoch": 0.656007151198835, "grad_norm": 2.547752618789673, "learning_rate": 1.8688014533081503e-05, "loss": 0.0456, "step": 45500 }, { "epoch": 0.6632160209922289, "grad_norm": 0.6029974222183228, "learning_rate": 1.8673596793494718e-05, "loss": 0.0464, "step": 46000 }, { "epoch": 0.6704248907856226, "grad_norm": 0.27106812596321106, "learning_rate": 1.865917905390793e-05, "loss": 0.0437, "step": 46500 }, { "epoch": 0.6776337605790165, "grad_norm": 1.3233801126480103, "learning_rate": 1.864476131432114e-05, "loss": 0.0447, "step": 47000 }, { "epoch": 0.6848426303724102, "grad_norm": 0.38903898000717163, "learning_rate": 1.8630343574734356e-05, "loss": 0.0455, "step": 47500 }, { "epoch": 0.692051500165804, "grad_norm": 1.247036337852478, "learning_rate": 1.8615925835147568e-05, "loss": 0.044, "step": 48000 }, { "epoch": 0.6992603699591978, "grad_norm": 0.9771102666854858, "learning_rate": 1.860150809556078e-05, "loss": 0.0446, "step": 48500 }, { "epoch": 0.7064692397525916, "grad_norm": 1.6191680431365967, "learning_rate": 1.858709035597399e-05, "loss": 0.0455, "step": 49000 }, { "epoch": 0.7136781095459854, "grad_norm": 0.9542379975318909, "learning_rate": 1.8572672616387203e-05, "loss": 0.0426, "step": 49500 }, { "epoch": 0.7208869793393792, "grad_norm": 1.6160619258880615, "learning_rate": 1.8558254876800418e-05, "loss": 0.0433, "step": 50000 }, { "epoch": 0.7280958491327729, "grad_norm": 1.1810977458953857, "learning_rate": 1.854383713721363e-05, "loss": 0.0443, "step": 50500 }, { "epoch": 0.7353047189261668, "grad_norm": 1.4848960638046265, "learning_rate": 1.852941939762684e-05, "loss": 0.0442, "step": 51000 }, { "epoch": 0.7425135887195605, "grad_norm": 1.2140188217163086, "learning_rate": 1.8515001658040053e-05, "loss": 0.0436, "step": 51500 }, { "epoch": 0.7497224585129544, "grad_norm": 0.6803346276283264, "learning_rate": 1.8500583918453265e-05, "loss": 0.0416, "step": 52000 }, { "epoch": 0.7569313283063481, "grad_norm": 2.847879409790039, "learning_rate": 1.8486166178866477e-05, "loss": 0.0401, "step": 52500 }, { "epoch": 0.7641401980997419, "grad_norm": 1.3574286699295044, "learning_rate": 1.8471748439279692e-05, "loss": 0.0426, "step": 53000 }, { "epoch": 0.7713490678931357, "grad_norm": 1.5763428211212158, "learning_rate": 1.8457330699692903e-05, "loss": 0.0416, "step": 53500 }, { "epoch": 0.7785579376865295, "grad_norm": 2.006143808364868, "learning_rate": 1.8442912960106115e-05, "loss": 0.0423, "step": 54000 }, { "epoch": 0.7857668074799233, "grad_norm": 2.0041260719299316, "learning_rate": 1.842849522051933e-05, "loss": 0.043, "step": 54500 }, { "epoch": 0.7929756772733171, "grad_norm": 1.0083436965942383, "learning_rate": 1.8414077480932542e-05, "loss": 0.0428, "step": 55000 }, { "epoch": 0.8001845470667108, "grad_norm": 1.2364863157272339, "learning_rate": 1.8399659741345754e-05, "loss": 0.0431, "step": 55500 }, { "epoch": 0.8073934168601047, "grad_norm": 1.1397020816802979, "learning_rate": 1.8385242001758965e-05, "loss": 0.0408, "step": 56000 }, { "epoch": 0.8146022866534984, "grad_norm": 1.046647071838379, "learning_rate": 1.8370824262172177e-05, "loss": 0.0424, "step": 56500 }, { "epoch": 0.8218111564468923, "grad_norm": 0.7180289626121521, "learning_rate": 1.8356406522585392e-05, "loss": 0.0417, "step": 57000 }, { "epoch": 0.829020026240286, "grad_norm": 1.866095781326294, "learning_rate": 1.8341988782998604e-05, "loss": 0.0406, "step": 57500 }, { "epoch": 0.8362288960336799, "grad_norm": 1.7192025184631348, "learning_rate": 1.8327571043411815e-05, "loss": 0.042, "step": 58000 }, { "epoch": 0.8434377658270736, "grad_norm": 1.3043447732925415, "learning_rate": 1.8313153303825027e-05, "loss": 0.0419, "step": 58500 }, { "epoch": 0.8506466356204674, "grad_norm": 2.372190237045288, "learning_rate": 1.829873556423824e-05, "loss": 0.0421, "step": 59000 }, { "epoch": 0.8578555054138612, "grad_norm": 0.9028930068016052, "learning_rate": 1.828431782465145e-05, "loss": 0.0396, "step": 59500 }, { "epoch": 0.865064375207255, "grad_norm": 1.2869058847427368, "learning_rate": 1.8269900085064665e-05, "loss": 0.0401, "step": 60000 }, { "epoch": 0.8722732450006488, "grad_norm": 2.214855670928955, "learning_rate": 1.8255482345477877e-05, "loss": 0.04, "step": 60500 }, { "epoch": 0.8794821147940426, "grad_norm": 0.9826574325561523, "learning_rate": 1.824106460589109e-05, "loss": 0.0397, "step": 61000 }, { "epoch": 0.8866909845874363, "grad_norm": 0.7741074562072754, "learning_rate": 1.8226646866304304e-05, "loss": 0.0397, "step": 61500 }, { "epoch": 0.8938998543808302, "grad_norm": 1.2778081893920898, "learning_rate": 1.8212229126717516e-05, "loss": 0.0396, "step": 62000 }, { "epoch": 0.9011087241742239, "grad_norm": 0.7415226697921753, "learning_rate": 1.8197811387130727e-05, "loss": 0.0398, "step": 62500 }, { "epoch": 0.9083175939676178, "grad_norm": 2.152737617492676, "learning_rate": 1.818339364754394e-05, "loss": 0.0395, "step": 63000 }, { "epoch": 0.9155264637610115, "grad_norm": 0.9719590544700623, "learning_rate": 1.816897590795715e-05, "loss": 0.0387, "step": 63500 }, { "epoch": 0.9227353335544053, "grad_norm": 1.4587551355361938, "learning_rate": 1.8154558168370366e-05, "loss": 0.0395, "step": 64000 }, { "epoch": 0.9299442033477991, "grad_norm": 1.4218809604644775, "learning_rate": 1.8140140428783577e-05, "loss": 0.0375, "step": 64500 }, { "epoch": 0.9371530731411929, "grad_norm": 1.8009737730026245, "learning_rate": 1.812572268919679e-05, "loss": 0.0387, "step": 65000 }, { "epoch": 0.9443619429345868, "grad_norm": 1.2379016876220703, "learning_rate": 1.811130494961e-05, "loss": 0.0386, "step": 65500 }, { "epoch": 0.9515708127279805, "grad_norm": 1.1901589632034302, "learning_rate": 1.8096887210023216e-05, "loss": 0.0381, "step": 66000 }, { "epoch": 0.9587796825213742, "grad_norm": 1.0341569185256958, "learning_rate": 1.8082469470436424e-05, "loss": 0.0402, "step": 66500 }, { "epoch": 0.9659885523147681, "grad_norm": 1.4235957860946655, "learning_rate": 1.806805173084964e-05, "loss": 0.0382, "step": 67000 }, { "epoch": 0.9731974221081618, "grad_norm": 1.095893383026123, "learning_rate": 1.805363399126285e-05, "loss": 0.0396, "step": 67500 }, { "epoch": 0.9804062919015557, "grad_norm": 1.8859561681747437, "learning_rate": 1.8039216251676063e-05, "loss": 0.038, "step": 68000 }, { "epoch": 0.9876151616949495, "grad_norm": 1.8770360946655273, "learning_rate": 1.8024798512089278e-05, "loss": 0.039, "step": 68500 }, { "epoch": 0.9948240314883433, "grad_norm": 1.870827555656433, "learning_rate": 1.801038077250249e-05, "loss": 0.038, "step": 69000 }, { "epoch": 1.0, "eval_f1": 0.9895049158009324, "eval_loss": 0.034001659601926804, "eval_runtime": 683.1241, "eval_samples_per_second": 1528.989, "eval_steps_per_second": 47.782, "step": 69359 }, { "epoch": 1.002032901281737, "grad_norm": 0.4856395125389099, "learning_rate": 1.79959630329157e-05, "loss": 0.0352, "step": 69500 }, { "epoch": 1.009241771075131, "grad_norm": 1.8835086822509766, "learning_rate": 1.7981545293328913e-05, "loss": 0.0287, "step": 70000 }, { "epoch": 1.0164506408685245, "grad_norm": 1.941490888595581, "learning_rate": 1.7967127553742124e-05, "loss": 0.0307, "step": 70500 }, { "epoch": 1.0236595106619184, "grad_norm": 1.525707483291626, "learning_rate": 1.795270981415534e-05, "loss": 0.03, "step": 71000 }, { "epoch": 1.0308683804553123, "grad_norm": 0.6174446940422058, "learning_rate": 1.793829207456855e-05, "loss": 0.029, "step": 71500 }, { "epoch": 1.0380772502487061, "grad_norm": 1.043771505355835, "learning_rate": 1.7923874334981763e-05, "loss": 0.0311, "step": 72000 }, { "epoch": 1.0452861200420998, "grad_norm": 0.28765255212783813, "learning_rate": 1.7909456595394978e-05, "loss": 0.0291, "step": 72500 }, { "epoch": 1.0524949898354936, "grad_norm": 0.8367669582366943, "learning_rate": 1.789503885580819e-05, "loss": 0.0307, "step": 73000 }, { "epoch": 1.0597038596288875, "grad_norm": 0.8930952548980713, "learning_rate": 1.7880621116221398e-05, "loss": 0.0297, "step": 73500 }, { "epoch": 1.066912729422281, "grad_norm": 1.0413399934768677, "learning_rate": 1.7866203376634613e-05, "loss": 0.03, "step": 74000 }, { "epoch": 1.074121599215675, "grad_norm": 1.1929751634597778, "learning_rate": 1.7851785637047825e-05, "loss": 0.0287, "step": 74500 }, { "epoch": 1.0813304690090688, "grad_norm": 0.8676954507827759, "learning_rate": 1.7837367897461036e-05, "loss": 0.0307, "step": 75000 }, { "epoch": 1.0885393388024625, "grad_norm": 0.733383059501648, "learning_rate": 1.782295015787425e-05, "loss": 0.029, "step": 75500 }, { "epoch": 1.0957482085958563, "grad_norm": 1.005913257598877, "learning_rate": 1.7808532418287463e-05, "loss": 0.0288, "step": 76000 }, { "epoch": 1.1029570783892502, "grad_norm": 1.4946510791778564, "learning_rate": 1.7794114678700675e-05, "loss": 0.0294, "step": 76500 }, { "epoch": 1.110165948182644, "grad_norm": 0.966665506362915, "learning_rate": 1.7779696939113886e-05, "loss": 0.0311, "step": 77000 }, { "epoch": 1.1173748179760377, "grad_norm": 0.8129379749298096, "learning_rate": 1.7765279199527098e-05, "loss": 0.0301, "step": 77500 }, { "epoch": 1.1245836877694315, "grad_norm": 1.1672717332839966, "learning_rate": 1.7750861459940313e-05, "loss": 0.0297, "step": 78000 }, { "epoch": 1.1317925575628254, "grad_norm": 1.0149409770965576, "learning_rate": 1.7736443720353525e-05, "loss": 0.031, "step": 78500 }, { "epoch": 1.139001427356219, "grad_norm": 1.3319754600524902, "learning_rate": 1.7722025980766736e-05, "loss": 0.0294, "step": 79000 }, { "epoch": 1.1462102971496129, "grad_norm": 3.036787509918213, "learning_rate": 1.770760824117995e-05, "loss": 0.0294, "step": 79500 }, { "epoch": 1.1534191669430067, "grad_norm": 0.6281238198280334, "learning_rate": 1.7693190501593163e-05, "loss": 0.0312, "step": 80000 }, { "epoch": 1.1606280367364006, "grad_norm": 1.39284086227417, "learning_rate": 1.767877276200637e-05, "loss": 0.0299, "step": 80500 }, { "epoch": 1.1678369065297942, "grad_norm": 2.4636764526367188, "learning_rate": 1.7664355022419587e-05, "loss": 0.0304, "step": 81000 }, { "epoch": 1.175045776323188, "grad_norm": 1.0513309240341187, "learning_rate": 1.7649937282832798e-05, "loss": 0.0293, "step": 81500 }, { "epoch": 1.182254646116582, "grad_norm": 0.739205539226532, "learning_rate": 1.763551954324601e-05, "loss": 0.0297, "step": 82000 }, { "epoch": 1.1894635159099756, "grad_norm": 1.1646817922592163, "learning_rate": 1.7621101803659225e-05, "loss": 0.0281, "step": 82500 }, { "epoch": 1.1966723857033694, "grad_norm": 1.6882481575012207, "learning_rate": 1.7606684064072437e-05, "loss": 0.0308, "step": 83000 }, { "epoch": 1.2038812554967633, "grad_norm": 2.1905980110168457, "learning_rate": 1.759226632448565e-05, "loss": 0.0301, "step": 83500 }, { "epoch": 1.211090125290157, "grad_norm": 0.4102253317832947, "learning_rate": 1.757784858489886e-05, "loss": 0.0296, "step": 84000 }, { "epoch": 1.2182989950835508, "grad_norm": 1.5355827808380127, "learning_rate": 1.7563430845312072e-05, "loss": 0.031, "step": 84500 }, { "epoch": 1.2255078648769446, "grad_norm": 0.4144400954246521, "learning_rate": 1.7549013105725287e-05, "loss": 0.0303, "step": 85000 }, { "epoch": 1.2327167346703383, "grad_norm": 0.5286178588867188, "learning_rate": 1.75345953661385e-05, "loss": 0.0311, "step": 85500 }, { "epoch": 1.2399256044637321, "grad_norm": 1.3401720523834229, "learning_rate": 1.752017762655171e-05, "loss": 0.0303, "step": 86000 }, { "epoch": 1.247134474257126, "grad_norm": 1.5546993017196655, "learning_rate": 1.7505759886964925e-05, "loss": 0.0296, "step": 86500 }, { "epoch": 1.2543433440505198, "grad_norm": 1.7993361949920654, "learning_rate": 1.7491342147378137e-05, "loss": 0.03, "step": 87000 }, { "epoch": 1.2615522138439135, "grad_norm": 1.058311939239502, "learning_rate": 1.7476924407791345e-05, "loss": 0.0283, "step": 87500 }, { "epoch": 1.2687610836373073, "grad_norm": 1.1616915464401245, "learning_rate": 1.746250666820456e-05, "loss": 0.0306, "step": 88000 }, { "epoch": 1.2759699534307012, "grad_norm": 1.5120762586593628, "learning_rate": 1.7448088928617772e-05, "loss": 0.0296, "step": 88500 }, { "epoch": 1.283178823224095, "grad_norm": 1.033087134361267, "learning_rate": 1.7433671189030984e-05, "loss": 0.0296, "step": 89000 }, { "epoch": 1.2903876930174887, "grad_norm": 0.9456692337989807, "learning_rate": 1.74192534494442e-05, "loss": 0.0293, "step": 89500 }, { "epoch": 1.2975965628108825, "grad_norm": 0.4252309799194336, "learning_rate": 1.740483570985741e-05, "loss": 0.0287, "step": 90000 }, { "epoch": 1.3048054326042764, "grad_norm": 1.4315825700759888, "learning_rate": 1.7390417970270622e-05, "loss": 0.0314, "step": 90500 }, { "epoch": 1.31201430239767, "grad_norm": 0.9023242592811584, "learning_rate": 1.7376000230683834e-05, "loss": 0.0296, "step": 91000 }, { "epoch": 1.3192231721910639, "grad_norm": 1.8055963516235352, "learning_rate": 1.7361582491097045e-05, "loss": 0.0289, "step": 91500 }, { "epoch": 1.3264320419844577, "grad_norm": 1.2063618898391724, "learning_rate": 1.734716475151026e-05, "loss": 0.03, "step": 92000 }, { "epoch": 1.3336409117778514, "grad_norm": 2.5645272731781006, "learning_rate": 1.7332747011923472e-05, "loss": 0.0289, "step": 92500 }, { "epoch": 1.3408497815712452, "grad_norm": 1.9335203170776367, "learning_rate": 1.7318329272336684e-05, "loss": 0.0285, "step": 93000 }, { "epoch": 1.348058651364639, "grad_norm": 0.8842147588729858, "learning_rate": 1.73039115327499e-05, "loss": 0.0287, "step": 93500 }, { "epoch": 1.3552675211580327, "grad_norm": 1.2006937265396118, "learning_rate": 1.728949379316311e-05, "loss": 0.0288, "step": 94000 }, { "epoch": 1.3624763909514266, "grad_norm": 1.1261006593704224, "learning_rate": 1.7275076053576322e-05, "loss": 0.0293, "step": 94500 }, { "epoch": 1.3696852607448204, "grad_norm": 1.2065215110778809, "learning_rate": 1.7260658313989534e-05, "loss": 0.0282, "step": 95000 }, { "epoch": 1.3768941305382143, "grad_norm": 1.8486534357070923, "learning_rate": 1.7246240574402746e-05, "loss": 0.029, "step": 95500 }, { "epoch": 1.384103000331608, "grad_norm": 0.8908069729804993, "learning_rate": 1.7231822834815957e-05, "loss": 0.0294, "step": 96000 }, { "epoch": 1.3913118701250018, "grad_norm": 0.6375325918197632, "learning_rate": 1.7217405095229172e-05, "loss": 0.0287, "step": 96500 }, { "epoch": 1.3985207399183957, "grad_norm": 1.9673434495925903, "learning_rate": 1.7202987355642384e-05, "loss": 0.0282, "step": 97000 }, { "epoch": 1.4057296097117895, "grad_norm": 1.1606006622314453, "learning_rate": 1.7188569616055596e-05, "loss": 0.0284, "step": 97500 }, { "epoch": 1.4129384795051831, "grad_norm": 1.003493309020996, "learning_rate": 1.7174151876468807e-05, "loss": 0.0283, "step": 98000 }, { "epoch": 1.420147349298577, "grad_norm": 0.9186868071556091, "learning_rate": 1.715973413688202e-05, "loss": 0.0277, "step": 98500 }, { "epoch": 1.4273562190919709, "grad_norm": 1.3305683135986328, "learning_rate": 1.7145316397295234e-05, "loss": 0.0292, "step": 99000 }, { "epoch": 1.4345650888853645, "grad_norm": 1.3776835203170776, "learning_rate": 1.7130898657708446e-05, "loss": 0.0286, "step": 99500 }, { "epoch": 1.4417739586787583, "grad_norm": 1.6687921285629272, "learning_rate": 1.7116480918121658e-05, "loss": 0.029, "step": 100000 }, { "epoch": 1.4489828284721522, "grad_norm": 1.9249308109283447, "learning_rate": 1.7102063178534873e-05, "loss": 0.0262, "step": 100500 }, { "epoch": 1.4561916982655458, "grad_norm": 1.1834752559661865, "learning_rate": 1.7087645438948084e-05, "loss": 0.0294, "step": 101000 }, { "epoch": 1.4634005680589397, "grad_norm": 2.1350696086883545, "learning_rate": 1.7073227699361296e-05, "loss": 0.0276, "step": 101500 }, { "epoch": 1.4706094378523336, "grad_norm": 2.563725709915161, "learning_rate": 1.7058809959774508e-05, "loss": 0.0276, "step": 102000 }, { "epoch": 1.4778183076457272, "grad_norm": 0.9226647019386292, "learning_rate": 1.704439222018772e-05, "loss": 0.0284, "step": 102500 }, { "epoch": 1.485027177439121, "grad_norm": 0.34231990575790405, "learning_rate": 1.702997448060093e-05, "loss": 0.0281, "step": 103000 }, { "epoch": 1.492236047232515, "grad_norm": 2.339191436767578, "learning_rate": 1.7015556741014146e-05, "loss": 0.029, "step": 103500 }, { "epoch": 1.4994449170259085, "grad_norm": 1.7756520509719849, "learning_rate": 1.7001139001427358e-05, "loss": 0.0288, "step": 104000 }, { "epoch": 1.5066537868193026, "grad_norm": 2.0807387828826904, "learning_rate": 1.698672126184057e-05, "loss": 0.0281, "step": 104500 }, { "epoch": 1.5138626566126963, "grad_norm": 1.4787542819976807, "learning_rate": 1.6972303522253785e-05, "loss": 0.0284, "step": 105000 }, { "epoch": 1.52107152640609, "grad_norm": 1.719581961631775, "learning_rate": 1.6957885782666993e-05, "loss": 0.0287, "step": 105500 }, { "epoch": 1.528280396199484, "grad_norm": 0.8158332109451294, "learning_rate": 1.6943468043080208e-05, "loss": 0.029, "step": 106000 }, { "epoch": 1.5354892659928776, "grad_norm": 0.10212863981723785, "learning_rate": 1.692905030349342e-05, "loss": 0.0275, "step": 106500 }, { "epoch": 1.5426981357862715, "grad_norm": 1.0970171689987183, "learning_rate": 1.691463256390663e-05, "loss": 0.0282, "step": 107000 }, { "epoch": 1.5499070055796653, "grad_norm": 0.4221758246421814, "learning_rate": 1.6900214824319846e-05, "loss": 0.0285, "step": 107500 }, { "epoch": 1.557115875373059, "grad_norm": 1.5400525331497192, "learning_rate": 1.6885797084733058e-05, "loss": 0.0282, "step": 108000 }, { "epoch": 1.5643247451664528, "grad_norm": 1.6638318300247192, "learning_rate": 1.687137934514627e-05, "loss": 0.0301, "step": 108500 }, { "epoch": 1.5715336149598467, "grad_norm": 1.3407906293869019, "learning_rate": 1.685696160555948e-05, "loss": 0.0276, "step": 109000 }, { "epoch": 1.5787424847532403, "grad_norm": 0.8864063024520874, "learning_rate": 1.6842543865972693e-05, "loss": 0.0273, "step": 109500 }, { "epoch": 1.5859513545466342, "grad_norm": 1.5699615478515625, "learning_rate": 1.6828126126385905e-05, "loss": 0.0267, "step": 110000 }, { "epoch": 1.593160224340028, "grad_norm": 0.20337066054344177, "learning_rate": 1.681370838679912e-05, "loss": 0.0285, "step": 110500 }, { "epoch": 1.6003690941334217, "grad_norm": 0.7260587811470032, "learning_rate": 1.679929064721233e-05, "loss": 0.028, "step": 111000 }, { "epoch": 1.6075779639268155, "grad_norm": 0.434865266084671, "learning_rate": 1.6784872907625543e-05, "loss": 0.027, "step": 111500 }, { "epoch": 1.6147868337202094, "grad_norm": 1.0067859888076782, "learning_rate": 1.677045516803876e-05, "loss": 0.0276, "step": 112000 }, { "epoch": 1.621995703513603, "grad_norm": 1.7014882564544678, "learning_rate": 1.6756037428451967e-05, "loss": 0.0276, "step": 112500 }, { "epoch": 1.629204573306997, "grad_norm": 1.2809230089187622, "learning_rate": 1.674161968886518e-05, "loss": 0.0276, "step": 113000 }, { "epoch": 1.6364134431003907, "grad_norm": 1.2574232816696167, "learning_rate": 1.6727201949278393e-05, "loss": 0.0284, "step": 113500 }, { "epoch": 1.6436223128937844, "grad_norm": 1.3797274827957153, "learning_rate": 1.6712784209691605e-05, "loss": 0.0282, "step": 114000 }, { "epoch": 1.6508311826871784, "grad_norm": 0.32101693749427795, "learning_rate": 1.669836647010482e-05, "loss": 0.0274, "step": 114500 }, { "epoch": 1.658040052480572, "grad_norm": 0.41121360659599304, "learning_rate": 1.6683948730518032e-05, "loss": 0.0286, "step": 115000 }, { "epoch": 1.665248922273966, "grad_norm": 0.5161770582199097, "learning_rate": 1.6669530990931243e-05, "loss": 0.0271, "step": 115500 }, { "epoch": 1.6724577920673598, "grad_norm": 1.153785228729248, "learning_rate": 1.6655113251344455e-05, "loss": 0.0264, "step": 116000 }, { "epoch": 1.6796666618607534, "grad_norm": 1.5621336698532104, "learning_rate": 1.6640695511757667e-05, "loss": 0.0272, "step": 116500 }, { "epoch": 1.6868755316541473, "grad_norm": 2.4250948429107666, "learning_rate": 1.662627777217088e-05, "loss": 0.0282, "step": 117000 }, { "epoch": 1.6940844014475411, "grad_norm": 0.24833956360816956, "learning_rate": 1.6611860032584094e-05, "loss": 0.0279, "step": 117500 }, { "epoch": 1.7012932712409348, "grad_norm": 2.7739059925079346, "learning_rate": 1.6597442292997305e-05, "loss": 0.0283, "step": 118000 }, { "epoch": 1.7085021410343286, "grad_norm": 0.29604852199554443, "learning_rate": 1.6583024553410517e-05, "loss": 0.0271, "step": 118500 }, { "epoch": 1.7157110108277225, "grad_norm": 1.0948668718338013, "learning_rate": 1.6568606813823732e-05, "loss": 0.0269, "step": 119000 }, { "epoch": 1.7229198806211161, "grad_norm": 0.20236891508102417, "learning_rate": 1.655418907423694e-05, "loss": 0.0264, "step": 119500 }, { "epoch": 1.73012875041451, "grad_norm": 0.9090920090675354, "learning_rate": 1.6539771334650155e-05, "loss": 0.0282, "step": 120000 }, { "epoch": 1.7373376202079038, "grad_norm": 2.128474473953247, "learning_rate": 1.6525353595063367e-05, "loss": 0.0283, "step": 120500 }, { "epoch": 1.7445464900012975, "grad_norm": 1.6552634239196777, "learning_rate": 1.651093585547658e-05, "loss": 0.0272, "step": 121000 }, { "epoch": 1.7517553597946915, "grad_norm": 0.7921839356422424, "learning_rate": 1.6496518115889794e-05, "loss": 0.0301, "step": 121500 }, { "epoch": 1.7589642295880852, "grad_norm": 0.8467416763305664, "learning_rate": 1.6482100376303006e-05, "loss": 0.0266, "step": 122000 }, { "epoch": 1.7661730993814788, "grad_norm": 1.4604544639587402, "learning_rate": 1.6467682636716217e-05, "loss": 0.0253, "step": 122500 }, { "epoch": 1.773381969174873, "grad_norm": 0.677890956401825, "learning_rate": 1.645326489712943e-05, "loss": 0.0266, "step": 123000 }, { "epoch": 1.7805908389682665, "grad_norm": 0.2728472352027893, "learning_rate": 1.643884715754264e-05, "loss": 0.027, "step": 123500 }, { "epoch": 1.7877997087616604, "grad_norm": 1.2005136013031006, "learning_rate": 1.6424429417955852e-05, "loss": 0.0265, "step": 124000 }, { "epoch": 1.7950085785550542, "grad_norm": 2.1395583152770996, "learning_rate": 1.6410011678369067e-05, "loss": 0.0285, "step": 124500 }, { "epoch": 1.8022174483484479, "grad_norm": 1.5524953603744507, "learning_rate": 1.639559393878228e-05, "loss": 0.026, "step": 125000 }, { "epoch": 1.8094263181418417, "grad_norm": 1.5434062480926514, "learning_rate": 1.638117619919549e-05, "loss": 0.0272, "step": 125500 }, { "epoch": 1.8166351879352356, "grad_norm": 1.4732664823532104, "learning_rate": 1.6366758459608706e-05, "loss": 0.0264, "step": 126000 }, { "epoch": 1.8238440577286292, "grad_norm": 0.5316962599754333, "learning_rate": 1.6352340720021914e-05, "loss": 0.0262, "step": 126500 }, { "epoch": 1.831052927522023, "grad_norm": 0.09009312838315964, "learning_rate": 1.633792298043513e-05, "loss": 0.0272, "step": 127000 }, { "epoch": 1.838261797315417, "grad_norm": 1.211990475654602, "learning_rate": 1.632350524084834e-05, "loss": 0.0272, "step": 127500 }, { "epoch": 1.8454706671088106, "grad_norm": 1.1306172609329224, "learning_rate": 1.6309087501261552e-05, "loss": 0.0268, "step": 128000 }, { "epoch": 1.8526795369022044, "grad_norm": 1.8232672214508057, "learning_rate": 1.6294669761674768e-05, "loss": 0.0282, "step": 128500 }, { "epoch": 1.8598884066955983, "grad_norm": 2.736703395843506, "learning_rate": 1.628025202208798e-05, "loss": 0.0271, "step": 129000 }, { "epoch": 1.867097276488992, "grad_norm": 2.2017531394958496, "learning_rate": 1.626583428250119e-05, "loss": 0.0264, "step": 129500 }, { "epoch": 1.874306146282386, "grad_norm": 0.6630580425262451, "learning_rate": 1.6251416542914403e-05, "loss": 0.0268, "step": 130000 }, { "epoch": 1.8815150160757796, "grad_norm": 0.2576875388622284, "learning_rate": 1.6236998803327614e-05, "loss": 0.0275, "step": 130500 }, { "epoch": 1.8887238858691733, "grad_norm": 0.625859260559082, "learning_rate": 1.6222581063740826e-05, "loss": 0.0263, "step": 131000 }, { "epoch": 1.8959327556625674, "grad_norm": 2.3079171180725098, "learning_rate": 1.620816332415404e-05, "loss": 0.0266, "step": 131500 }, { "epoch": 1.903141625455961, "grad_norm": 0.8551648259162903, "learning_rate": 1.6193745584567253e-05, "loss": 0.0268, "step": 132000 }, { "epoch": 1.9103504952493549, "grad_norm": 1.2068754434585571, "learning_rate": 1.6179327844980464e-05, "loss": 0.0276, "step": 132500 }, { "epoch": 1.9175593650427487, "grad_norm": 0.4594031274318695, "learning_rate": 1.616491010539368e-05, "loss": 0.0271, "step": 133000 }, { "epoch": 1.9247682348361423, "grad_norm": 0.5821360945701599, "learning_rate": 1.6150492365806888e-05, "loss": 0.0267, "step": 133500 }, { "epoch": 1.9319771046295362, "grad_norm": 0.5188286304473877, "learning_rate": 1.6136074626220103e-05, "loss": 0.027, "step": 134000 }, { "epoch": 1.93918597442293, "grad_norm": 1.6506882905960083, "learning_rate": 1.6121656886633315e-05, "loss": 0.026, "step": 134500 }, { "epoch": 1.9463948442163237, "grad_norm": 1.5678963661193848, "learning_rate": 1.6107239147046526e-05, "loss": 0.0264, "step": 135000 }, { "epoch": 1.9536037140097176, "grad_norm": 0.3626735210418701, "learning_rate": 1.609282140745974e-05, "loss": 0.0264, "step": 135500 }, { "epoch": 1.9608125838031114, "grad_norm": 0.48542195558547974, "learning_rate": 1.6078403667872953e-05, "loss": 0.0257, "step": 136000 }, { "epoch": 1.968021453596505, "grad_norm": 0.93156498670578, "learning_rate": 1.6063985928286165e-05, "loss": 0.0274, "step": 136500 }, { "epoch": 1.975230323389899, "grad_norm": 0.6599089503288269, "learning_rate": 1.6049568188699376e-05, "loss": 0.0253, "step": 137000 }, { "epoch": 1.9824391931832928, "grad_norm": 2.511162519454956, "learning_rate": 1.6035150449112588e-05, "loss": 0.0264, "step": 137500 }, { "epoch": 1.9896480629766864, "grad_norm": 0.7365297675132751, "learning_rate": 1.6020732709525803e-05, "loss": 0.0263, "step": 138000 }, { "epoch": 1.9968569327700805, "grad_norm": 0.9106433391571045, "learning_rate": 1.6006314969939015e-05, "loss": 0.027, "step": 138500 }, { "epoch": 2.0, "eval_f1": 0.9915470627263667, "eval_loss": 0.02749801054596901, "eval_runtime": 1640.2112, "eval_samples_per_second": 636.802, "eval_steps_per_second": 19.9, "step": 138718 } ], "logging_steps": 500, "max_steps": 693590, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 1000.0, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.005 }, "attributes": { "early_stopping_patience_counter": 1 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.35149119187216e+18, "train_batch_size": 64, "trial_name": null, "trial_params": null }