Instructions to use Gege24/lion_8bit with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use Gege24/lion_8bit with PEFT:
Base model is not found.
- Notebooks
- Google Colab
- Kaggle
| { | |
| "best_global_step": 1044, | |
| "best_metric": 0.5790691375732422, | |
| "best_model_checkpoint": "/workspace/scripts/soutputs/8ca8a9ea-9ae3-4938-9713-015819984d61/checkpoint-1044", | |
| "epoch": 2.9914040114613183, | |
| "eval_steps": 500, | |
| "global_step": 1044, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.014326647564469915, | |
| "grad_norm": 1.6428219079971313, | |
| "learning_rate": 1.4026021586989397e-06, | |
| "loss": 0.9723, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.02865329512893983, | |
| "grad_norm": 1.3679360151290894, | |
| "learning_rate": 3.155854857072614e-06, | |
| "loss": 0.9416, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.04297994269340974, | |
| "grad_norm": 1.0384185314178467, | |
| "learning_rate": 4.9091075554462895e-06, | |
| "loss": 0.8955, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.05730659025787966, | |
| "grad_norm": 0.6389966607093811, | |
| "learning_rate": 6.662360253819964e-06, | |
| "loss": 0.8219, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.07163323782234957, | |
| "grad_norm": 0.46849510073661804, | |
| "learning_rate": 8.415612952193638e-06, | |
| "loss": 0.7583, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.08595988538681948, | |
| "grad_norm": 0.5466313362121582, | |
| "learning_rate": 1.0168865650567315e-05, | |
| "loss": 0.7283, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.10028653295128939, | |
| "grad_norm": 0.46641281247138977, | |
| "learning_rate": 1.1922118348940989e-05, | |
| "loss": 0.708, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.11461318051575932, | |
| "grad_norm": 0.5155534744262695, | |
| "learning_rate": 1.2272343115538091e-05, | |
| "loss": 0.7074, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.12893982808022922, | |
| "grad_norm": 0.45078691840171814, | |
| "learning_rate": 1.2270613524924088e-05, | |
| "loss": 0.6699, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.14326647564469913, | |
| "grad_norm": 0.4526143968105316, | |
| "learning_rate": 1.2267553922326047e-05, | |
| "loss": 0.6663, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.15759312320916904, | |
| "grad_norm": 0.44320717453956604, | |
| "learning_rate": 1.2263165044858593e-05, | |
| "loss": 0.6612, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.17191977077363896, | |
| "grad_norm": 0.4532703757286072, | |
| "learning_rate": 1.2257447949883163e-05, | |
| "loss": 0.6705, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.18624641833810887, | |
| "grad_norm": 0.3655495345592499, | |
| "learning_rate": 1.2250404014753254e-05, | |
| "loss": 0.6574, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.20057306590257878, | |
| "grad_norm": 0.3733099102973938, | |
| "learning_rate": 1.2242034936482603e-05, | |
| "loss": 0.6834, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.2148997134670487, | |
| "grad_norm": 0.355129599571228, | |
| "learning_rate": 1.2232342731336339e-05, | |
| "loss": 0.6645, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.22922636103151864, | |
| "grad_norm": 0.358656108379364, | |
| "learning_rate": 1.222132973434523e-05, | |
| "loss": 0.653, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.24355300859598855, | |
| "grad_norm": 0.29975712299346924, | |
| "learning_rate": 1.2208998598743134e-05, | |
| "loss": 0.6719, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.25787965616045844, | |
| "grad_norm": 0.32437002658843994, | |
| "learning_rate": 1.2195352295327777e-05, | |
| "loss": 0.6661, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.2722063037249284, | |
| "grad_norm": 0.28565841913223267, | |
| "learning_rate": 1.2180394111745045e-05, | |
| "loss": 0.6515, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.28653295128939826, | |
| "grad_norm": 0.28558802604675293, | |
| "learning_rate": 1.2164127651696922e-05, | |
| "loss": 0.6448, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.28653295128939826, | |
| "eval_loss": 0.6602650880813599, | |
| "eval_runtime": 2.8798, | |
| "eval_samples_per_second": 14.584, | |
| "eval_steps_per_second": 14.584, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.3008595988538682, | |
| "grad_norm": 0.3361125886440277, | |
| "learning_rate": 1.214655683407329e-05, | |
| "loss": 0.6516, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.3151862464183381, | |
| "grad_norm": 0.2776224613189697, | |
| "learning_rate": 1.2127685892007806e-05, | |
| "loss": 0.6592, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.32951289398280803, | |
| "grad_norm": 0.26801374554634094, | |
| "learning_rate": 1.2107519371858048e-05, | |
| "loss": 0.6565, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.3438395415472779, | |
| "grad_norm": 0.297080934047699, | |
| "learning_rate": 1.2086062132110227e-05, | |
| "loss": 0.642, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.35816618911174786, | |
| "grad_norm": 0.28340891003608704, | |
| "learning_rate": 1.2063319342208684e-05, | |
| "loss": 0.6478, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.37249283667621774, | |
| "grad_norm": 0.2782769203186035, | |
| "learning_rate": 1.2039296481310471e-05, | |
| "loss": 0.6368, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.3868194842406877, | |
| "grad_norm": 0.292530357837677, | |
| "learning_rate": 1.2013999336965322e-05, | |
| "loss": 0.6153, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.40114613180515757, | |
| "grad_norm": 0.24663622677326202, | |
| "learning_rate": 1.1987434003721335e-05, | |
| "loss": 0.6424, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.4154727793696275, | |
| "grad_norm": 0.2681853473186493, | |
| "learning_rate": 1.195960688165667e-05, | |
| "loss": 0.6348, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.4297994269340974, | |
| "grad_norm": 0.2627250850200653, | |
| "learning_rate": 1.1930524674837664e-05, | |
| "loss": 0.6249, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.44412607449856734, | |
| "grad_norm": 0.24072442948818207, | |
| "learning_rate": 1.1900194389703684e-05, | |
| "loss": 0.6391, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.4584527220630373, | |
| "grad_norm": 0.25336554646492004, | |
| "learning_rate": 1.1868623333379166e-05, | |
| "loss": 0.6298, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.47277936962750716, | |
| "grad_norm": 0.2672167718410492, | |
| "learning_rate": 1.1835819111913174e-05, | |
| "loss": 0.6368, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.4871060171919771, | |
| "grad_norm": 0.2560673952102661, | |
| "learning_rate": 1.1801789628446977e-05, | |
| "loss": 0.6318, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.501432664756447, | |
| "grad_norm": 0.27951574325561523, | |
| "learning_rate": 1.1766543081310029e-05, | |
| "loss": 0.6109, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.5157593123209169, | |
| "grad_norm": 0.25252604484558105, | |
| "learning_rate": 1.1730087962044844e-05, | |
| "loss": 0.6273, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.5300859598853869, | |
| "grad_norm": 0.25956350564956665, | |
| "learning_rate": 1.1692433053361224e-05, | |
| "loss": 0.6133, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.5444126074498568, | |
| "grad_norm": 0.2530823349952698, | |
| "learning_rate": 1.165358742702035e-05, | |
| "loss": 0.6214, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.5587392550143266, | |
| "grad_norm": 0.2583998143672943, | |
| "learning_rate": 1.1613560441649214e-05, | |
| "loss": 0.6105, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.5730659025787965, | |
| "grad_norm": 0.27742502093315125, | |
| "learning_rate": 1.1572361740485967e-05, | |
| "loss": 0.6349, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.5730659025787965, | |
| "eval_loss": 0.6322649717330933, | |
| "eval_runtime": 2.88, | |
| "eval_samples_per_second": 14.583, | |
| "eval_steps_per_second": 14.583, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.5873925501432665, | |
| "grad_norm": 0.2662568688392639, | |
| "learning_rate": 1.1530001249056676e-05, | |
| "loss": 0.6299, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.6017191977077364, | |
| "grad_norm": 0.2614499032497406, | |
| "learning_rate": 1.148648917278409e-05, | |
| "loss": 0.6005, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.6160458452722063, | |
| "grad_norm": 0.26987332105636597, | |
| "learning_rate": 1.1441835994528954e-05, | |
| "loss": 0.6214, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.6303724928366762, | |
| "grad_norm": 0.24090726673603058, | |
| "learning_rate": 1.1396052472064512e-05, | |
| "loss": 0.6245, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.6446991404011462, | |
| "grad_norm": 0.2746104300022125, | |
| "learning_rate": 1.1349149635484741e-05, | |
| "loss": 0.6222, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.6590257879656161, | |
| "grad_norm": 0.26875993609428406, | |
| "learning_rate": 1.1301138784547013e-05, | |
| "loss": 0.6092, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.673352435530086, | |
| "grad_norm": 0.2399819940328598, | |
| "learning_rate": 1.1252031485949773e-05, | |
| "loss": 0.6177, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.6876790830945558, | |
| "grad_norm": 0.27088305354118347, | |
| "learning_rate": 1.1201839570545898e-05, | |
| "loss": 0.6024, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.7020057306590258, | |
| "grad_norm": 0.2598998248577118, | |
| "learning_rate": 1.1150575130492442e-05, | |
| "loss": 0.6068, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.7163323782234957, | |
| "grad_norm": 0.26509082317352295, | |
| "learning_rate": 1.1098250516337403e-05, | |
| "loss": 0.6128, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.7306590257879656, | |
| "grad_norm": 0.23148998618125916, | |
| "learning_rate": 1.1044878334044251e-05, | |
| "loss": 0.6225, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.7449856733524355, | |
| "grad_norm": 0.23298867046833038, | |
| "learning_rate": 1.0990471441954915e-05, | |
| "loss": 0.6176, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.7593123209169055, | |
| "grad_norm": 0.25643882155418396, | |
| "learning_rate": 1.093504294769198e-05, | |
| "loss": 0.6132, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.7736389684813754, | |
| "grad_norm": 0.2456223964691162, | |
| "learning_rate": 1.087860620500081e-05, | |
| "loss": 0.6083, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.7879656160458453, | |
| "grad_norm": 0.24799339473247528, | |
| "learning_rate": 1.0821174810532391e-05, | |
| "loss": 0.6064, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.8022922636103151, | |
| "grad_norm": 0.24989920854568481, | |
| "learning_rate": 1.076276260056765e-05, | |
| "loss": 0.6063, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.8166189111747851, | |
| "grad_norm": 0.253239244222641, | |
| "learning_rate": 1.0703383647684028e-05, | |
| "loss": 0.6071, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.830945558739255, | |
| "grad_norm": 0.24544061720371246, | |
| "learning_rate": 1.064305225736515e-05, | |
| "loss": 0.611, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.8452722063037249, | |
| "grad_norm": 0.24104644358158112, | |
| "learning_rate": 1.0581782964554359e-05, | |
| "loss": 0.5985, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.8595988538681948, | |
| "grad_norm": 0.23256933689117432, | |
| "learning_rate": 1.0519590530152995e-05, | |
| "loss": 0.5887, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.8595988538681948, | |
| "eval_loss": 0.6149212718009949, | |
| "eval_runtime": 2.8878, | |
| "eval_samples_per_second": 14.544, | |
| "eval_steps_per_second": 14.544, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.8739255014326648, | |
| "grad_norm": 0.26569247245788574, | |
| "learning_rate": 1.0456489937464206e-05, | |
| "loss": 0.5988, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.8882521489971347, | |
| "grad_norm": 0.2356170415878296, | |
| "learning_rate": 1.0392496388583203e-05, | |
| "loss": 0.6133, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.9025787965616046, | |
| "grad_norm": 0.25165677070617676, | |
| "learning_rate": 1.0327625300734795e-05, | |
| "loss": 0.6022, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.9169054441260746, | |
| "grad_norm": 0.2422744333744049, | |
| "learning_rate": 1.0261892302559097e-05, | |
| "loss": 0.6209, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.9312320916905444, | |
| "grad_norm": 0.2504790723323822, | |
| "learning_rate": 1.019531323034629e-05, | |
| "loss": 0.5836, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.9455587392550143, | |
| "grad_norm": 0.23083172738552094, | |
| "learning_rate": 1.0127904124221387e-05, | |
| "loss": 0.6036, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.9598853868194842, | |
| "grad_norm": 0.23841316998004913, | |
| "learning_rate": 1.0059681224279856e-05, | |
| "loss": 0.6028, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.9742120343839542, | |
| "grad_norm": 0.2634727656841278, | |
| "learning_rate": 9.990660966675092e-06, | |
| "loss": 0.6074, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.9885386819484241, | |
| "grad_norm": 0.22332459688186646, | |
| "learning_rate": 9.920859979658633e-06, | |
| "loss": 0.6061, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.997134670487106, | |
| "eval_loss": 0.6086744070053101, | |
| "eval_runtime": 2.8877, | |
| "eval_samples_per_second": 14.544, | |
| "eval_steps_per_second": 14.544, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.6092488169670105, | |
| "eval_runtime": 2.8916, | |
| "eval_samples_per_second": 14.525, | |
| "eval_steps_per_second": 14.525, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 1.002865329512894, | |
| "grad_norm": 0.23956461250782013, | |
| "learning_rate": 9.85029507957412e-06, | |
| "loss": 0.5824, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.0171919770773639, | |
| "grad_norm": 0.2437165081501007, | |
| "learning_rate": 9.77898326680592e-06, | |
| "loss": 0.5803, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 1.0315186246418337, | |
| "grad_norm": 0.2500912845134735, | |
| "learning_rate": 9.706941721683432e-06, | |
| "loss": 0.5957, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.0458452722063036, | |
| "grad_norm": 0.2493949979543686, | |
| "learning_rate": 9.634187800342016e-06, | |
| "loss": 0.5911, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 1.0601719197707737, | |
| "grad_norm": 0.23148047924041748, | |
| "learning_rate": 9.56073903054159e-06, | |
| "loss": 0.5688, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.0744985673352436, | |
| "grad_norm": 0.23534221947193146, | |
| "learning_rate": 9.486613107443863e-06, | |
| "loss": 0.5938, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 1.0888252148997135, | |
| "grad_norm": 0.23032759130001068, | |
| "learning_rate": 9.411827889349254e-06, | |
| "loss": 0.5675, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.1031518624641834, | |
| "grad_norm": 0.23191657662391663, | |
| "learning_rate": 9.336401393394483e-06, | |
| "loss": 0.5899, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 1.1174785100286533, | |
| "grad_norm": 0.2217395305633545, | |
| "learning_rate": 9.260351791211929e-06, | |
| "loss": 0.5726, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.1318051575931232, | |
| "grad_norm": 0.2425890415906906, | |
| "learning_rate": 9.183697404551733e-06, | |
| "loss": 0.5762, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 1.146131805157593, | |
| "grad_norm": 0.2324853092432022, | |
| "learning_rate": 9.106456700867764e-06, | |
| "loss": 0.596, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.146131805157593, | |
| "eval_loss": 0.6035182476043701, | |
| "eval_runtime": 2.8972, | |
| "eval_samples_per_second": 14.497, | |
| "eval_steps_per_second": 14.497, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.1604584527220632, | |
| "grad_norm": 0.23952153325080872, | |
| "learning_rate": 9.028648288868459e-06, | |
| "loss": 0.5904, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 1.174785100286533, | |
| "grad_norm": 0.23701021075248718, | |
| "learning_rate": 8.950290914033645e-06, | |
| "loss": 0.5785, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.189111747851003, | |
| "grad_norm": 0.2227863371372223, | |
| "learning_rate": 8.871403454098416e-06, | |
| "loss": 0.5724, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 1.2034383954154728, | |
| "grad_norm": 0.2232217639684677, | |
| "learning_rate": 8.792004914505126e-06, | |
| "loss": 0.5727, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.2177650429799427, | |
| "grad_norm": 0.24012598395347595, | |
| "learning_rate": 8.712114423824633e-06, | |
| "loss": 0.589, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 1.2320916905444126, | |
| "grad_norm": 0.2352171540260315, | |
| "learning_rate": 8.631751229147881e-06, | |
| "loss": 0.5667, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.2464183381088825, | |
| "grad_norm": 0.23246026039123535, | |
| "learning_rate": 8.550934691448907e-06, | |
| "loss": 0.5927, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 1.2607449856733524, | |
| "grad_norm": 0.24500536918640137, | |
| "learning_rate": 8.469684280920438e-06, | |
| "loss": 0.5831, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.2750716332378222, | |
| "grad_norm": 0.22870078682899475, | |
| "learning_rate": 8.388019572283156e-06, | |
| "loss": 0.5851, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 1.2893982808022924, | |
| "grad_norm": 0.22906720638275146, | |
| "learning_rate": 8.305960240069795e-06, | |
| "loss": 0.586, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.3037249283667622, | |
| "grad_norm": 0.22709061205387115, | |
| "learning_rate": 8.223526053885171e-06, | |
| "loss": 0.5719, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 1.3180515759312321, | |
| "grad_norm": 0.2257590889930725, | |
| "learning_rate": 8.140736873643331e-06, | |
| "loss": 0.5718, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.332378223495702, | |
| "grad_norm": 0.22583012282848358, | |
| "learning_rate": 8.05761264478293e-06, | |
| "loss": 0.5754, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 1.346704871060172, | |
| "grad_norm": 0.22651982307434082, | |
| "learning_rate": 7.974173393462007e-06, | |
| "loss": 0.5651, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.3610315186246418, | |
| "grad_norm": 0.24124553799629211, | |
| "learning_rate": 7.890439221733317e-06, | |
| "loss": 0.5826, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 1.3753581661891117, | |
| "grad_norm": 0.22888998687267303, | |
| "learning_rate": 7.806430302701367e-06, | |
| "loss": 0.5705, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.3896848137535818, | |
| "grad_norm": 0.21681609749794006, | |
| "learning_rate": 7.722166875662358e-06, | |
| "loss": 0.5814, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 1.4040114613180517, | |
| "grad_norm": 0.2206772118806839, | |
| "learning_rate": 7.63766924122816e-06, | |
| "loss": 0.5844, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.4183381088825215, | |
| "grad_norm": 0.22052349150180817, | |
| "learning_rate": 7.552957756435512e-06, | |
| "loss": 0.5674, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 1.4326647564469914, | |
| "grad_norm": 0.24319517612457275, | |
| "learning_rate": 7.468052829841645e-06, | |
| "loss": 0.5813, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.4326647564469914, | |
| "eval_loss": 0.5956406593322754, | |
| "eval_runtime": 2.8806, | |
| "eval_samples_per_second": 14.581, | |
| "eval_steps_per_second": 14.581, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.4469914040114613, | |
| "grad_norm": 0.2275008261203766, | |
| "learning_rate": 7.382974916607492e-06, | |
| "loss": 0.5853, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 1.4613180515759312, | |
| "grad_norm": 0.23689113557338715, | |
| "learning_rate": 7.297744513569644e-06, | |
| "loss": 0.5796, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.475644699140401, | |
| "grad_norm": 0.23207077383995056, | |
| "learning_rate": 7.2123821543023e-06, | |
| "loss": 0.5832, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 1.4899713467048712, | |
| "grad_norm": 0.237880676984787, | |
| "learning_rate": 7.126908404170343e-06, | |
| "loss": 0.5783, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.5042979942693409, | |
| "grad_norm": 0.22841981053352356, | |
| "learning_rate": 7.041343855374771e-06, | |
| "loss": 0.5623, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 1.518624641833811, | |
| "grad_norm": 0.223537415266037, | |
| "learning_rate": 6.955709121991649e-06, | |
| "loss": 0.574, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.5329512893982808, | |
| "grad_norm": 0.22695119678974152, | |
| "learning_rate": 6.870024835005807e-06, | |
| "loss": 0.5592, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 1.5472779369627507, | |
| "grad_norm": 0.22849540412425995, | |
| "learning_rate": 6.784311637340442e-06, | |
| "loss": 0.5613, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.5616045845272206, | |
| "grad_norm": 0.2229369729757309, | |
| "learning_rate": 6.6985901788838775e-06, | |
| "loss": 0.566, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 1.5759312320916905, | |
| "grad_norm": 0.21880346536636353, | |
| "learning_rate": 6.612881111514604e-06, | |
| "loss": 0.5767, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.5902578796561606, | |
| "grad_norm": 0.21992699801921844, | |
| "learning_rate": 6.527205084125875e-06, | |
| "loss": 0.5711, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 1.6045845272206303, | |
| "grad_norm": 0.23056058585643768, | |
| "learning_rate": 6.441582737651007e-06, | |
| "loss": 0.5607, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.6189111747851004, | |
| "grad_norm": 0.22267192602157593, | |
| "learning_rate": 6.356034700090591e-06, | |
| "loss": 0.5549, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 1.63323782234957, | |
| "grad_norm": 0.22011469304561615, | |
| "learning_rate": 6.270581581542831e-06, | |
| "loss": 0.5821, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.6475644699140402, | |
| "grad_norm": 0.22847089171409607, | |
| "learning_rate": 6.185243969238195e-06, | |
| "loss": 0.5821, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 1.66189111747851, | |
| "grad_norm": 0.22488202154636383, | |
| "learning_rate": 6.10004242257957e-06, | |
| "loss": 0.5585, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.67621776504298, | |
| "grad_norm": 0.22973030805587769, | |
| "learning_rate": 6.01499746818912e-06, | |
| "loss": 0.5715, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 1.6905444126074498, | |
| "grad_norm": 0.22791410982608795, | |
| "learning_rate": 5.930129594963047e-06, | |
| "loss": 0.5709, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.7048710601719197, | |
| "grad_norm": 0.2369392067193985, | |
| "learning_rate": 5.845459249135437e-06, | |
| "loss": 0.5712, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 1.7191977077363898, | |
| "grad_norm": 0.22787928581237793, | |
| "learning_rate": 5.7610068293523925e-06, | |
| "loss": 0.5806, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.7191977077363898, | |
| "eval_loss": 0.589396595954895, | |
| "eval_runtime": 2.8838, | |
| "eval_samples_per_second": 14.564, | |
| "eval_steps_per_second": 14.564, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.7335243553008595, | |
| "grad_norm": 0.2262052297592163, | |
| "learning_rate": 5.676792681757612e-06, | |
| "loss": 0.5653, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 1.7478510028653296, | |
| "grad_norm": 0.2277483344078064, | |
| "learning_rate": 5.5928370950906355e-06, | |
| "loss": 0.5634, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.7621776504297995, | |
| "grad_norm": 0.2228267937898636, | |
| "learning_rate": 5.5091602957989115e-06, | |
| "loss": 0.5472, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 1.7765042979942693, | |
| "grad_norm": 0.22168482840061188, | |
| "learning_rate": 5.425782443164878e-06, | |
| "loss": 0.5565, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.7908309455587392, | |
| "grad_norm": 0.22628583014011383, | |
| "learning_rate": 5.342723624449211e-06, | |
| "loss": 0.558, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 1.8051575931232091, | |
| "grad_norm": 0.22420856356620789, | |
| "learning_rate": 5.260003850051442e-06, | |
| "loss": 0.5721, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.8194842406876792, | |
| "grad_norm": 0.22148585319519043, | |
| "learning_rate": 5.177643048689078e-06, | |
| "loss": 0.5688, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 1.8338108882521489, | |
| "grad_norm": 0.21723760664463043, | |
| "learning_rate": 5.095661062596411e-06, | |
| "loss": 0.5719, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.848137535816619, | |
| "grad_norm": 0.22150275111198425, | |
| "learning_rate": 5.014077642744153e-06, | |
| "loss": 0.5486, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 1.8624641833810889, | |
| "grad_norm": 0.21508848667144775, | |
| "learning_rate": 4.932912444081069e-06, | |
| "loss": 0.555, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.8767908309455588, | |
| "grad_norm": 0.2276742309331894, | |
| "learning_rate": 4.852185020798736e-06, | |
| "loss": 0.5527, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 1.8911174785100286, | |
| "grad_norm": 0.22282367944717407, | |
| "learning_rate": 4.771914821620574e-06, | |
| "loss": 0.5513, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.9054441260744985, | |
| "grad_norm": 0.22503264248371124, | |
| "learning_rate": 4.6921211851162955e-06, | |
| "loss": 0.5656, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 1.9197707736389686, | |
| "grad_norm": 0.22671757638454437, | |
| "learning_rate": 4.612823335042883e-06, | |
| "loss": 0.5746, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.9340974212034383, | |
| "grad_norm": 0.2195613831281662, | |
| "learning_rate": 4.534040375713239e-06, | |
| "loss": 0.5481, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 1.9484240687679084, | |
| "grad_norm": 0.2245696634054184, | |
| "learning_rate": 4.455791287393597e-06, | |
| "loss": 0.558, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.962750716332378, | |
| "grad_norm": 0.21683502197265625, | |
| "learning_rate": 4.37809492173083e-06, | |
| "loss": 0.5523, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 1.9770773638968482, | |
| "grad_norm": 0.2247258424758911, | |
| "learning_rate": 4.300969997210741e-06, | |
| "loss": 0.5735, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.991404011461318, | |
| "grad_norm": 0.22837325930595398, | |
| "learning_rate": 4.224435094648434e-06, | |
| "loss": 0.5669, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 1.994269340974212, | |
| "eval_loss": 0.5852823853492737, | |
| "eval_runtime": 2.8671, | |
| "eval_samples_per_second": 14.649, | |
| "eval_steps_per_second": 14.649, | |
| "step": 696 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.5849316716194153, | |
| "eval_runtime": 2.8768, | |
| "eval_samples_per_second": 14.6, | |
| "eval_steps_per_second": 14.6, | |
| "step": 698 | |
| }, | |
| { | |
| "epoch": 2.005730659025788, | |
| "grad_norm": 0.21968944370746613, | |
| "learning_rate": 4.148508652711858e-06, | |
| "loss": 0.5577, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 2.005730659025788, | |
| "eval_loss": 0.5852600932121277, | |
| "eval_runtime": 2.8671, | |
| "eval_samples_per_second": 14.649, | |
| "eval_steps_per_second": 14.649, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 2.020057306590258, | |
| "grad_norm": 0.22937500476837158, | |
| "learning_rate": 4.073208963479584e-06, | |
| "loss": 0.5605, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 2.0343839541547277, | |
| "grad_norm": 0.23057711124420166, | |
| "learning_rate": 3.998554168033906e-06, | |
| "loss": 0.5525, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 2.048710601719198, | |
| "grad_norm": 0.2270784080028534, | |
| "learning_rate": 3.924562252090337e-06, | |
| "loss": 0.5562, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 2.0630372492836675, | |
| "grad_norm": 0.2220994234085083, | |
| "learning_rate": 3.8512510416644995e-06, | |
| "loss": 0.5447, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 2.0773638968481376, | |
| "grad_norm": 0.23204341530799866, | |
| "learning_rate": 3.778638198777512e-06, | |
| "loss": 0.549, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 2.0916905444126073, | |
| "grad_norm": 0.22262004017829895, | |
| "learning_rate": 3.706741217200896e-06, | |
| "loss": 0.5499, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 2.1060171919770774, | |
| "grad_norm": 0.22019214928150177, | |
| "learning_rate": 3.6355774182419905e-06, | |
| "loss": 0.55, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 2.1203438395415475, | |
| "grad_norm": 0.22234179079532623, | |
| "learning_rate": 3.5651639465709426e-06, | |
| "loss": 0.5524, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 2.134670487106017, | |
| "grad_norm": 0.22449831664562225, | |
| "learning_rate": 3.495517766090224e-06, | |
| "loss": 0.5459, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 2.1489971346704873, | |
| "grad_norm": 0.23554570972919464, | |
| "learning_rate": 3.426655655847724e-06, | |
| "loss": 0.5617, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 2.163323782234957, | |
| "grad_norm": 0.23134228587150574, | |
| "learning_rate": 3.3585942059943785e-06, | |
| "loss": 0.5523, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 2.177650429799427, | |
| "grad_norm": 0.2272178828716278, | |
| "learning_rate": 3.291349813787276e-06, | |
| "loss": 0.5506, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 2.1919770773638967, | |
| "grad_norm": 0.22482511401176453, | |
| "learning_rate": 3.2249386796392656e-06, | |
| "loss": 0.5451, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 2.206303724928367, | |
| "grad_norm": 0.2274748831987381, | |
| "learning_rate": 3.159376803215985e-06, | |
| "loss": 0.5531, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 2.2206303724928365, | |
| "grad_norm": 0.2227988839149475, | |
| "learning_rate": 3.0946799795812396e-06, | |
| "loss": 0.5489, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 2.2349570200573066, | |
| "grad_norm": 0.22400720417499542, | |
| "learning_rate": 3.030863795391684e-06, | |
| "loss": 0.5456, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 2.2492836676217767, | |
| "grad_norm": 0.2268913835287094, | |
| "learning_rate": 2.9679436251417016e-06, | |
| "loss": 0.5394, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 2.2636103151862463, | |
| "grad_norm": 0.22335706651210785, | |
| "learning_rate": 2.9059346274594124e-06, | |
| "loss": 0.5377, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 2.2779369627507164, | |
| "grad_norm": 0.22807373106479645, | |
| "learning_rate": 2.8448517414546884e-06, | |
| "loss": 0.5484, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 2.292263610315186, | |
| "grad_norm": 0.22118327021598816, | |
| "learning_rate": 2.7847096831200282e-06, | |
| "loss": 0.5419, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 2.292263610315186, | |
| "eval_loss": 0.5827357769012451, | |
| "eval_runtime": 2.9066, | |
| "eval_samples_per_second": 14.45, | |
| "eval_steps_per_second": 14.45, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 2.306590257879656, | |
| "grad_norm": 0.22792136669158936, | |
| "learning_rate": 2.7255229417852123e-06, | |
| "loss": 0.5496, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 2.3209169054441263, | |
| "grad_norm": 0.22095544636249542, | |
| "learning_rate": 2.667305776626566e-06, | |
| "loss": 0.554, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 2.335243553008596, | |
| "grad_norm": 0.22290435433387756, | |
| "learning_rate": 2.6100722132316454e-06, | |
| "loss": 0.5492, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 2.349570200573066, | |
| "grad_norm": 0.23009058833122253, | |
| "learning_rate": 2.553836040220221e-06, | |
| "loss": 0.5473, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 2.3638968481375358, | |
| "grad_norm": 0.22500832378864288, | |
| "learning_rate": 2.49861080592235e-06, | |
| "loss": 0.5586, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 2.378223495702006, | |
| "grad_norm": 0.22200486063957214, | |
| "learning_rate": 2.4444098151143295e-06, | |
| "loss": 0.5358, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 2.3925501432664755, | |
| "grad_norm": 0.22904905676841736, | |
| "learning_rate": 2.391246125813331e-06, | |
| "loss": 0.5524, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 2.4068767908309456, | |
| "grad_norm": 0.23062781989574432, | |
| "learning_rate": 2.339132546131483e-06, | |
| "loss": 0.5404, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 2.4212034383954153, | |
| "grad_norm": 0.22324807941913605, | |
| "learning_rate": 2.288081631190158e-06, | |
| "loss": 0.5377, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 2.4355300859598854, | |
| "grad_norm": 0.22595882415771484, | |
| "learning_rate": 2.2381056800952273e-06, | |
| "loss": 0.5465, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 2.4498567335243555, | |
| "grad_norm": 0.23639383912086487, | |
| "learning_rate": 2.189216732973958e-06, | |
| "loss": 0.5518, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 2.464183381088825, | |
| "grad_norm": 0.23035073280334473, | |
| "learning_rate": 2.1414265680743383e-06, | |
| "loss": 0.5444, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 2.4785100286532953, | |
| "grad_norm": 0.22556614875793457, | |
| "learning_rate": 2.0947466989274793e-06, | |
| "loss": 0.5519, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 2.492836676217765, | |
| "grad_norm": 0.22614265978336334, | |
| "learning_rate": 2.0491883715737904e-06, | |
| "loss": 0.5526, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 2.507163323782235, | |
| "grad_norm": 0.22689661383628845, | |
| "learning_rate": 2.0047625618536037e-06, | |
| "loss": 0.5489, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 2.5214899713467047, | |
| "grad_norm": 0.22763052582740784, | |
| "learning_rate": 1.961479972762888e-06, | |
| "loss": 0.5397, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 2.535816618911175, | |
| "grad_norm": 0.22761483490467072, | |
| "learning_rate": 1.919351031874699e-06, | |
| "loss": 0.5452, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 2.5501432664756445, | |
| "grad_norm": 0.22768139839172363, | |
| "learning_rate": 1.8783858888269978e-06, | |
| "loss": 0.5522, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 2.5644699140401146, | |
| "grad_norm": 0.23226258158683777, | |
| "learning_rate": 1.8385944128773981e-06, | |
| "loss": 0.521, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 2.5787965616045847, | |
| "grad_norm": 0.2272603064775467, | |
| "learning_rate": 1.7999861905254893e-06, | |
| "loss": 0.5526, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.5787965616045847, | |
| "eval_loss": 0.5810644030570984, | |
| "eval_runtime": 2.9211, | |
| "eval_samples_per_second": 14.378, | |
| "eval_steps_per_second": 14.378, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.5931232091690544, | |
| "grad_norm": 0.22808772325515747, | |
| "learning_rate": 1.7625705232032741e-06, | |
| "loss": 0.5573, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 2.6074498567335245, | |
| "grad_norm": 0.22595611214637756, | |
| "learning_rate": 1.726356425034279e-06, | |
| "loss": 0.5378, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 2.621776504297994, | |
| "grad_norm": 0.22707025706768036, | |
| "learning_rate": 1.6913526206618854e-06, | |
| "loss": 0.5243, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 2.6361031518624642, | |
| "grad_norm": 0.2284831553697586, | |
| "learning_rate": 1.6575675431474023e-06, | |
| "loss": 0.5411, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 2.6504297994269344, | |
| "grad_norm": 0.22921448945999146, | |
| "learning_rate": 1.6250093319383871e-06, | |
| "loss": 0.5411, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 2.664756446991404, | |
| "grad_norm": 0.2303130179643631, | |
| "learning_rate": 1.5936858309077084e-06, | |
| "loss": 0.546, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 2.6790830945558737, | |
| "grad_norm": 0.2226521223783493, | |
| "learning_rate": 1.5636045864637997e-06, | |
| "loss": 0.5378, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 2.693409742120344, | |
| "grad_norm": 0.22775433957576752, | |
| "learning_rate": 1.5347728457326013e-06, | |
| "loss": 0.5341, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 2.707736389684814, | |
| "grad_norm": 0.23151849210262299, | |
| "learning_rate": 1.507197554811592e-06, | |
| "loss": 0.5411, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 2.7220630372492836, | |
| "grad_norm": 0.22131632268428802, | |
| "learning_rate": 1.480885357096343e-06, | |
| "loss": 0.5322, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 2.7363896848137537, | |
| "grad_norm": 0.22514161467552185, | |
| "learning_rate": 1.4558425916800066e-06, | |
| "loss": 0.5287, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 2.7507163323782233, | |
| "grad_norm": 0.22741974890232086, | |
| "learning_rate": 1.4320752918261058e-06, | |
| "loss": 0.5467, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 2.7650429799426934, | |
| "grad_norm": 0.22180503606796265, | |
| "learning_rate": 1.4095891835150126e-06, | |
| "loss": 0.5398, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 2.7793696275071635, | |
| "grad_norm": 0.2328280508518219, | |
| "learning_rate": 1.3883896840644583e-06, | |
| "loss": 0.5347, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 2.793696275071633, | |
| "grad_norm": 0.22877122461795807, | |
| "learning_rate": 1.3684819008243952e-06, | |
| "loss": 0.5453, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 2.8080229226361033, | |
| "grad_norm": 0.22728435695171356, | |
| "learning_rate": 1.3498706299465446e-06, | |
| "loss": 0.5356, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 2.822349570200573, | |
| "grad_norm": 0.22559645771980286, | |
| "learning_rate": 1.3325603552289166e-06, | |
| "loss": 0.5432, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 2.836676217765043, | |
| "grad_norm": 0.2304041087627411, | |
| "learning_rate": 1.3165552470355781e-06, | |
| "loss": 0.5441, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 2.8510028653295127, | |
| "grad_norm": 0.22864393889904022, | |
| "learning_rate": 1.301859161291938e-06, | |
| "loss": 0.5417, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 2.865329512893983, | |
| "grad_norm": 0.22412388026714325, | |
| "learning_rate": 1.2884756385557813e-06, | |
| "loss": 0.5374, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 2.865329512893983, | |
| "eval_loss": 0.5795248746871948, | |
| "eval_runtime": 2.889, | |
| "eval_samples_per_second": 14.538, | |
| "eval_steps_per_second": 14.538, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 2.8796561604584525, | |
| "grad_norm": 0.22551295161247253, | |
| "learning_rate": 1.2764079031642852e-06, | |
| "loss": 0.5425, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 2.8939828080229226, | |
| "grad_norm": 0.22314225137233734, | |
| "learning_rate": 1.265658862457217e-06, | |
| "loss": 0.5405, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 2.9083094555873927, | |
| "grad_norm": 0.22527816891670227, | |
| "learning_rate": 1.2562311060765001e-06, | |
| "loss": 0.5436, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 2.9226361031518624, | |
| "grad_norm": 0.22648297250270844, | |
| "learning_rate": 1.248126905342324e-06, | |
| "loss": 0.5497, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 2.9369627507163325, | |
| "grad_norm": 0.2278534471988678, | |
| "learning_rate": 1.2413482127059402e-06, | |
| "loss": 0.5391, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 2.951289398280802, | |
| "grad_norm": 0.2279985249042511, | |
| "learning_rate": 1.2358966612792807e-06, | |
| "loss": 0.5398, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 2.9656160458452723, | |
| "grad_norm": 0.23118627071380615, | |
| "learning_rate": 1.2317735644415136e-06, | |
| "loss": 0.5517, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 2.9799426934097424, | |
| "grad_norm": 0.22241578996181488, | |
| "learning_rate": 1.228979915522621e-06, | |
| "loss": 0.5407, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 2.9914040114613183, | |
| "eval_loss": 0.5790691375732422, | |
| "eval_runtime": 2.8699, | |
| "eval_samples_per_second": 14.635, | |
| "eval_steps_per_second": 14.635, | |
| "step": 1044 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1047, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.1950747837551084e+18, | |
| "train_batch_size": 64, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |