{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1599, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018788163457022077, "grad_norm": 38.902539143522475, "learning_rate": 5.625e-07, "loss": 1.3022, "step": 10 }, { "epoch": 0.03757632691404415, "grad_norm": 25.841390543115104, "learning_rate": 1.1875e-06, "loss": 0.9208, "step": 20 }, { "epoch": 0.05636449037106623, "grad_norm": 12.835104883604552, "learning_rate": 1.8125e-06, "loss": 0.6368, "step": 30 }, { "epoch": 0.0751526538280883, "grad_norm": 14.597848294473495, "learning_rate": 2.4375e-06, "loss": 0.4864, "step": 40 }, { "epoch": 0.09394081728511038, "grad_norm": 10.147065718110532, "learning_rate": 3.0625000000000003e-06, "loss": 0.413, "step": 50 }, { "epoch": 0.11272898074213246, "grad_norm": 7.953678589468881, "learning_rate": 3.6875000000000007e-06, "loss": 0.3552, "step": 60 }, { "epoch": 0.13151714419915453, "grad_norm": 15.497275455908772, "learning_rate": 4.312500000000001e-06, "loss": 0.3232, "step": 70 }, { "epoch": 0.1503053076561766, "grad_norm": 10.684325762714469, "learning_rate": 4.937500000000001e-06, "loss": 0.3995, "step": 80 }, { "epoch": 0.1690934711131987, "grad_norm": 5.578373425613698, "learning_rate": 5.5625000000000005e-06, "loss": 0.383, "step": 90 }, { "epoch": 0.18788163457022075, "grad_norm": 6.8251720225201336, "learning_rate": 6.1875000000000005e-06, "loss": 0.4042, "step": 100 }, { "epoch": 0.20666979802724283, "grad_norm": 6.671521431411895, "learning_rate": 6.8125e-06, "loss": 0.2883, "step": 110 }, { "epoch": 0.22545796148426492, "grad_norm": 5.055948701380774, "learning_rate": 7.437500000000001e-06, "loss": 0.3389, "step": 120 }, { "epoch": 0.244246124941287, "grad_norm": 6.217541233533783, "learning_rate": 8.062500000000001e-06, "loss": 0.2971, "step": 130 }, { "epoch": 0.26303428839830906, "grad_norm": 4.948590364279342, "learning_rate": 8.687500000000001e-06, "loss": 0.3087, "step": 140 }, { "epoch": 0.28182245185533117, "grad_norm": 9.515982957654412, "learning_rate": 9.312500000000001e-06, "loss": 0.328, "step": 150 }, { "epoch": 0.3006106153123532, "grad_norm": 5.588586595769071, "learning_rate": 9.937500000000001e-06, "loss": 0.3183, "step": 160 }, { "epoch": 0.3193987787693753, "grad_norm": 6.382027734743243, "learning_rate": 9.999034862449997e-06, "loss": 0.3427, "step": 170 }, { "epoch": 0.3381869422263974, "grad_norm": 7.190427289350578, "learning_rate": 9.995699062853814e-06, "loss": 0.3567, "step": 180 }, { "epoch": 0.35697510568341945, "grad_norm": 7.34935653943741, "learning_rate": 9.989982275421674e-06, "loss": 0.2315, "step": 190 }, { "epoch": 0.3757632691404415, "grad_norm": 6.295235795739874, "learning_rate": 9.981887224817565e-06, "loss": 0.3465, "step": 200 }, { "epoch": 0.3945514325974636, "grad_norm": 6.275130440505698, "learning_rate": 9.971417769203639e-06, "loss": 0.3036, "step": 210 }, { "epoch": 0.41333959605448567, "grad_norm": 7.6581869277864465, "learning_rate": 9.958578898401365e-06, "loss": 0.314, "step": 220 }, { "epoch": 0.4321277595115077, "grad_norm": 4.7865555589430535, "learning_rate": 9.943376731513364e-06, "loss": 0.3888, "step": 230 }, { "epoch": 0.45091592296852984, "grad_norm": 5.5592205355180155, "learning_rate": 9.92581851400698e-06, "loss": 0.3072, "step": 240 }, { "epoch": 0.4697040864255519, "grad_norm": 7.370996615746405, "learning_rate": 9.90591261426105e-06, "loss": 0.3344, "step": 250 }, { "epoch": 0.488492249882574, "grad_norm": 5.867505322765833, "learning_rate": 9.883668519577464e-06, "loss": 0.2927, "step": 260 }, { "epoch": 0.5072804133395961, "grad_norm": 5.191413529518597, "learning_rate": 9.85909683165945e-06, "loss": 0.2952, "step": 270 }, { "epoch": 0.5260685767966181, "grad_norm": 19.97325422873732, "learning_rate": 9.832209261558707e-06, "loss": 0.2545, "step": 280 }, { "epoch": 0.5448567402536402, "grad_norm": 4.341682760199477, "learning_rate": 9.803018624093859e-06, "loss": 0.2789, "step": 290 }, { "epoch": 0.5636449037106623, "grad_norm": 5.155418355116268, "learning_rate": 9.771538831742785e-06, "loss": 0.3209, "step": 300 }, { "epoch": 0.5824330671676844, "grad_norm": 5.585775233082983, "learning_rate": 9.737784888011847e-06, "loss": 0.2721, "step": 310 }, { "epoch": 0.6012212306247064, "grad_norm": 6.011924955720608, "learning_rate": 9.701772880285098e-06, "loss": 0.2369, "step": 320 }, { "epoch": 0.6200093940817285, "grad_norm": 3.464693811617661, "learning_rate": 9.663519972156919e-06, "loss": 0.327, "step": 330 }, { "epoch": 0.6387975575387506, "grad_norm": 3.9209881919217473, "learning_rate": 9.623044395251709e-06, "loss": 0.3057, "step": 340 }, { "epoch": 0.6575857209957726, "grad_norm": 3.4697639872647668, "learning_rate": 9.580365440534567e-06, "loss": 0.2397, "step": 350 }, { "epoch": 0.6763738844527948, "grad_norm": 3.7401214735406234, "learning_rate": 9.535503449117067e-06, "loss": 0.2472, "step": 360 }, { "epoch": 0.6951620479098168, "grad_norm": 6.346162811263442, "learning_rate": 9.488479802562535e-06, "loss": 0.2861, "step": 370 }, { "epoch": 0.7139502113668389, "grad_norm": 3.159937104618782, "learning_rate": 9.439316912695433e-06, "loss": 0.263, "step": 380 }, { "epoch": 0.732738374823861, "grad_norm": 4.840970300357461, "learning_rate": 9.388038210919706e-06, "loss": 0.317, "step": 390 }, { "epoch": 0.751526538280883, "grad_norm": 3.7912950448134284, "learning_rate": 9.334668137051213e-06, "loss": 0.2399, "step": 400 }, { "epoch": 0.7703147017379052, "grad_norm": 3.6544447000330536, "learning_rate": 9.279232127669519e-06, "loss": 0.2855, "step": 410 }, { "epoch": 0.7891028651949272, "grad_norm": 3.415036837035855, "learning_rate": 9.221756603994622e-06, "loss": 0.2258, "step": 420 }, { "epoch": 0.8078910286519493, "grad_norm": 4.076375098278762, "learning_rate": 9.162268959294421e-06, "loss": 0.2501, "step": 430 }, { "epoch": 0.8266791921089713, "grad_norm": 3.496486007224735, "learning_rate": 9.10079754582885e-06, "loss": 0.1867, "step": 440 }, { "epoch": 0.8454673555659934, "grad_norm": 3.469220654012647, "learning_rate": 9.037371661337006e-06, "loss": 0.2623, "step": 450 }, { "epoch": 0.8642555190230155, "grad_norm": 5.445833530786857, "learning_rate": 8.972021535073605e-06, "loss": 0.2986, "step": 460 }, { "epoch": 0.8830436824800376, "grad_norm": 2.807672252814633, "learning_rate": 8.904778313401497e-06, "loss": 0.2536, "step": 470 }, { "epoch": 0.9018318459370597, "grad_norm": 7.9424075184306755, "learning_rate": 8.835674044947078e-06, "loss": 0.3049, "step": 480 }, { "epoch": 0.9206200093940817, "grad_norm": 4.713848985031599, "learning_rate": 8.764741665325672e-06, "loss": 0.2319, "step": 490 }, { "epoch": 0.9394081728511038, "grad_norm": 4.413441646714263, "learning_rate": 8.692014981444166e-06, "loss": 0.2843, "step": 500 }, { "epoch": 0.9394081728511038, "eval_loss": 0.23845727741718292, "eval_runtime": 139.1371, "eval_samples_per_second": 6.799, "eval_steps_per_second": 1.703, "step": 500 }, { "epoch": 0.9581963363081258, "grad_norm": 1.9799881256140055, "learning_rate": 8.617528655388384e-06, "loss": 0.2351, "step": 510 }, { "epoch": 0.976984499765148, "grad_norm": 5.12487715924031, "learning_rate": 8.541318187902879e-06, "loss": 0.233, "step": 520 }, { "epoch": 0.9957726632221701, "grad_norm": 3.19651743083849, "learning_rate": 8.463419901471002e-06, "loss": 0.2415, "step": 530 }, { "epoch": 1.0131517144199154, "grad_norm": 3.1257162430007224, "learning_rate": 8.383870923003345e-06, "loss": 0.2149, "step": 540 }, { "epoch": 1.0319398778769375, "grad_norm": 3.45631296377837, "learning_rate": 8.302709166142765e-06, "loss": 0.1915, "step": 550 }, { "epoch": 1.0507280413339597, "grad_norm": 2.286180863936669, "learning_rate": 8.219973313194461e-06, "loss": 0.1317, "step": 560 }, { "epoch": 1.0695162047909816, "grad_norm": 2.9540247439256673, "learning_rate": 8.135702796689693e-06, "loss": 0.1626, "step": 570 }, { "epoch": 1.0883043682480038, "grad_norm": 2.1567801477380066, "learning_rate": 8.049937780591944e-06, "loss": 0.1284, "step": 580 }, { "epoch": 1.1070925317050258, "grad_norm": 3.3038946448126256, "learning_rate": 7.962719141154469e-06, "loss": 0.15, "step": 590 }, { "epoch": 1.125880695162048, "grad_norm": 3.0873807975498635, "learning_rate": 7.874088447438366e-06, "loss": 0.139, "step": 600 }, { "epoch": 1.1446688586190699, "grad_norm": 2.051043815446488, "learning_rate": 7.784087941500446e-06, "loss": 0.1519, "step": 610 }, { "epoch": 1.163457022076092, "grad_norm": 2.4617892609619143, "learning_rate": 7.692760518260355e-06, "loss": 0.1084, "step": 620 }, { "epoch": 1.1822451855331142, "grad_norm": 1.7623637548791922, "learning_rate": 7.6001497050565256e-06, "loss": 0.1348, "step": 630 }, { "epoch": 1.2010333489901361, "grad_norm": 3.2507782557395206, "learning_rate": 7.506299640900725e-06, "loss": 0.1598, "step": 640 }, { "epoch": 1.2198215124471583, "grad_norm": 4.2496165791641864, "learning_rate": 7.411255055441064e-06, "loss": 0.1691, "step": 650 }, { "epoch": 1.2386096759041805, "grad_norm": 6.841696105552823, "learning_rate": 7.315061247643518e-06, "loss": 0.1755, "step": 660 }, { "epoch": 1.2573978393612024, "grad_norm": 2.5825812551037224, "learning_rate": 7.2177640642020875e-06, "loss": 0.1706, "step": 670 }, { "epoch": 1.2761860028182246, "grad_norm": 3.371098422024725, "learning_rate": 7.119409877687923e-06, "loss": 0.1444, "step": 680 }, { "epoch": 1.2949741662752465, "grad_norm": 2.590100923291357, "learning_rate": 7.0200455644478105e-06, "loss": 0.1028, "step": 690 }, { "epoch": 1.3137623297322687, "grad_norm": 2.481043190681829, "learning_rate": 6.91971848226255e-06, "loss": 0.1614, "step": 700 }, { "epoch": 1.3325504931892906, "grad_norm": 2.329271785947211, "learning_rate": 6.818476447775873e-06, "loss": 0.14, "step": 710 }, { "epoch": 1.3513386566463128, "grad_norm": 2.646993250936894, "learning_rate": 6.7163677137046855e-06, "loss": 0.1737, "step": 720 }, { "epoch": 1.370126820103335, "grad_norm": 5.413980223824179, "learning_rate": 6.6134409458414415e-06, "loss": 0.1878, "step": 730 }, { "epoch": 1.388914983560357, "grad_norm": 2.1714036144692934, "learning_rate": 6.50974519985967e-06, "loss": 0.1495, "step": 740 }, { "epoch": 1.407703147017379, "grad_norm": 3.133663498685042, "learning_rate": 6.405329897933669e-06, "loss": 0.1128, "step": 750 }, { "epoch": 1.4264913104744013, "grad_norm": 6.167890536629415, "learning_rate": 6.300244805183524e-06, "loss": 0.1226, "step": 760 }, { "epoch": 1.4452794739314232, "grad_norm": 2.676375029014761, "learning_rate": 6.194540005956675e-06, "loss": 0.1484, "step": 770 }, { "epoch": 1.4640676373884451, "grad_norm": 2.475835214596877, "learning_rate": 6.088265879957345e-06, "loss": 0.1491, "step": 780 }, { "epoch": 1.4828558008454673, "grad_norm": 2.2436437435088723, "learning_rate": 5.981473078235186e-06, "loss": 0.1166, "step": 790 }, { "epoch": 1.5016439643024895, "grad_norm": 2.132099788544184, "learning_rate": 5.874212499044609e-06, "loss": 0.1531, "step": 800 }, { "epoch": 1.5204321277595114, "grad_norm": 4.079361340018406, "learning_rate": 5.7665352635862945e-06, "loss": 0.1398, "step": 810 }, { "epoch": 1.5392202912165336, "grad_norm": 1.9003215983018618, "learning_rate": 5.658492691642443e-06, "loss": 0.1391, "step": 820 }, { "epoch": 1.5580084546735558, "grad_norm": 2.6377367761402586, "learning_rate": 5.550136277117375e-06, "loss": 0.1418, "step": 830 }, { "epoch": 1.5767966181305777, "grad_norm": 3.1017028365243995, "learning_rate": 5.4415176634951515e-06, "loss": 0.1381, "step": 840 }, { "epoch": 1.5955847815875999, "grad_norm": 2.6515166581373504, "learning_rate": 5.332688619225903e-06, "loss": 0.1238, "step": 850 }, { "epoch": 1.614372945044622, "grad_norm": 1.922819738710662, "learning_rate": 5.22370101305259e-06, "loss": 0.1119, "step": 860 }, { "epoch": 1.633161108501644, "grad_norm": 2.529225306976089, "learning_rate": 5.114606789289973e-06, "loss": 0.1622, "step": 870 }, { "epoch": 1.651949271958666, "grad_norm": 1.288640339335927, "learning_rate": 5.005457943067561e-06, "loss": 0.1192, "step": 880 }, { "epoch": 1.670737435415688, "grad_norm": 3.4037089339443782, "learning_rate": 4.896306495548334e-06, "loss": 0.1039, "step": 890 }, { "epoch": 1.6895255988727103, "grad_norm": 3.9649233321606805, "learning_rate": 4.7872044691350735e-06, "loss": 0.1375, "step": 900 }, { "epoch": 1.7083137623297322, "grad_norm": 2.5965801580755286, "learning_rate": 4.678203862676091e-06, "loss": 0.1092, "step": 910 }, { "epoch": 1.7271019257867544, "grad_norm": 2.5786254690111265, "learning_rate": 4.569356626682181e-06, "loss": 0.1239, "step": 920 }, { "epoch": 1.7458900892437765, "grad_norm": 3.8696436706505435, "learning_rate": 4.4607146385666145e-06, "loss": 0.1387, "step": 930 }, { "epoch": 1.7646782527007985, "grad_norm": 2.851127249549094, "learning_rate": 4.352329677919983e-06, "loss": 0.1595, "step": 940 }, { "epoch": 1.7834664161578204, "grad_norm": 2.2742219632834795, "learning_rate": 4.244253401831646e-06, "loss": 0.11, "step": 950 }, { "epoch": 1.8022545796148428, "grad_norm": 1.9123730074854848, "learning_rate": 4.136537320269571e-06, "loss": 0.1205, "step": 960 }, { "epoch": 1.8210427430718648, "grad_norm": 2.5825751840603277, "learning_rate": 4.029232771530306e-06, "loss": 0.1134, "step": 970 }, { "epoch": 1.8398309065288867, "grad_norm": 2.471433944974579, "learning_rate": 3.92239089777075e-06, "loss": 0.1123, "step": 980 }, { "epoch": 1.8586190699859089, "grad_norm": 3.5281114838659686, "learning_rate": 3.816062620633414e-06, "loss": 0.1188, "step": 990 }, { "epoch": 1.877407233442931, "grad_norm": 1.9634958487628664, "learning_rate": 3.7102986169767954e-06, "loss": 0.1062, "step": 1000 }, { "epoch": 1.877407233442931, "eval_loss": 0.19623179733753204, "eval_runtime": 139.2435, "eval_samples_per_second": 6.794, "eval_steps_per_second": 1.702, "step": 1000 }, { "epoch": 1.896195396899953, "grad_norm": 3.6853787421863164, "learning_rate": 3.605149294722392e-06, "loss": 0.1055, "step": 1010 }, { "epoch": 1.9149835603569751, "grad_norm": 3.7066247620003168, "learning_rate": 3.500664768829908e-06, "loss": 0.125, "step": 1020 }, { "epoch": 1.9337717238139973, "grad_norm": 1.9487521917157127, "learning_rate": 3.3968948374120958e-06, "loss": 0.1046, "step": 1030 }, { "epoch": 1.9525598872710193, "grad_norm": 1.7787524143692979, "learning_rate": 3.2938889580005932e-06, "loss": 0.1308, "step": 1040 }, { "epoch": 1.9713480507280412, "grad_norm": 2.4115181227326383, "learning_rate": 3.191696223974084e-06, "loss": 0.1195, "step": 1050 }, { "epoch": 1.9901362141850634, "grad_norm": 1.8676954461633957, "learning_rate": 3.090365341160041e-06, "loss": 0.119, "step": 1060 }, { "epoch": 2.0075152653828088, "grad_norm": 2.0403175416536157, "learning_rate": 2.989944604621148e-06, "loss": 0.1082, "step": 1070 }, { "epoch": 2.0263034288398307, "grad_norm": 1.8796687540004382, "learning_rate": 2.8904818756375076e-06, "loss": 0.0649, "step": 1080 }, { "epoch": 2.045091592296853, "grad_norm": 1.6678868897065664, "learning_rate": 2.792024558895606e-06, "loss": 0.0671, "step": 1090 }, { "epoch": 2.063879755753875, "grad_norm": 3.1485507384660547, "learning_rate": 2.6946195798948755e-06, "loss": 0.0597, "step": 1100 }, { "epoch": 2.082667919210897, "grad_norm": 0.966514290803291, "learning_rate": 2.598313362582639e-06, "loss": 0.0582, "step": 1110 }, { "epoch": 2.1014560826679194, "grad_norm": 1.3610544972984766, "learning_rate": 2.5031518072281236e-06, "loss": 0.0609, "step": 1120 }, { "epoch": 2.1202442461249413, "grad_norm": 1.5921330796712605, "learning_rate": 2.4091802685460336e-06, "loss": 0.0664, "step": 1130 }, { "epoch": 2.1390324095819633, "grad_norm": 2.3312224294157025, "learning_rate": 2.3164435340801574e-06, "loss": 0.0675, "step": 1140 }, { "epoch": 2.1578205730389852, "grad_norm": 1.7103466660833229, "learning_rate": 2.224985802857284e-06, "loss": 0.0453, "step": 1150 }, { "epoch": 2.1766087364960076, "grad_norm": 2.060481882429091, "learning_rate": 2.134850664321617e-06, "loss": 0.0625, "step": 1160 }, { "epoch": 2.1953968999530296, "grad_norm": 0.9427782782833426, "learning_rate": 2.046081077559707e-06, "loss": 0.0565, "step": 1170 }, { "epoch": 2.2141850634100515, "grad_norm": 1.9240741575863745, "learning_rate": 1.9587193508258415e-06, "loss": 0.051, "step": 1180 }, { "epoch": 2.232973226867074, "grad_norm": 1.2001636500769781, "learning_rate": 1.8728071213776028e-06, "loss": 0.048, "step": 1190 }, { "epoch": 2.251761390324096, "grad_norm": 1.8036945101434407, "learning_rate": 1.7883853356312375e-06, "loss": 0.0575, "step": 1200 }, { "epoch": 2.270549553781118, "grad_norm": 1.2865058971634646, "learning_rate": 1.7054942296462895e-06, "loss": 0.0708, "step": 1210 }, { "epoch": 2.2893377172381397, "grad_norm": 1.9144408525509076, "learning_rate": 1.6241733099487888e-06, "loss": 0.0513, "step": 1220 }, { "epoch": 2.308125880695162, "grad_norm": 1.7106138783392042, "learning_rate": 1.5444613347021392e-06, "loss": 0.0562, "step": 1230 }, { "epoch": 2.326914044152184, "grad_norm": 1.2128475292175165, "learning_rate": 1.4663962952346938e-06, "loss": 0.0507, "step": 1240 }, { "epoch": 2.3457022076092064, "grad_norm": 1.321730273201786, "learning_rate": 1.3900153979327951e-06, "loss": 0.0577, "step": 1250 }, { "epoch": 2.3644903710662284, "grad_norm": 2.7875274955613745, "learning_rate": 1.315355046507934e-06, "loss": 0.0648, "step": 1260 }, { "epoch": 2.3832785345232503, "grad_norm": 1.8288963759953032, "learning_rate": 1.2424508246464635e-06, "loss": 0.0558, "step": 1270 }, { "epoch": 2.4020666979802723, "grad_norm": 1.537482323897175, "learning_rate": 1.171337479050148e-06, "loss": 0.061, "step": 1280 }, { "epoch": 2.4208548614372947, "grad_norm": 2.1795553061277357, "learning_rate": 1.1020489028756243e-06, "loss": 0.0521, "step": 1290 }, { "epoch": 2.4396430248943166, "grad_norm": 1.75277898274252, "learning_rate": 1.0346181195806614e-06, "loss": 0.0583, "step": 1300 }, { "epoch": 2.4584311883513386, "grad_norm": 1.7605887217641596, "learning_rate": 9.690772671849403e-07, "loss": 0.0546, "step": 1310 }, { "epoch": 2.477219351808361, "grad_norm": 1.447955784872353, "learning_rate": 9.054575829528251e-07, "loss": 0.0551, "step": 1320 }, { "epoch": 2.496007515265383, "grad_norm": 5.225313476562762, "learning_rate": 8.437893885054504e-07, "loss": 0.0517, "step": 1330 }, { "epoch": 2.514795678722405, "grad_norm": 1.5717503369334935, "learning_rate": 7.841020753692058e-07, "loss": 0.0545, "step": 1340 }, { "epoch": 2.533583842179427, "grad_norm": 2.6343667579863963, "learning_rate": 7.264240909675174e-07, "loss": 0.0472, "step": 1350 }, { "epoch": 2.552372005636449, "grad_norm": 2.023557390851674, "learning_rate": 6.707829250625825e-07, "loss": 0.0446, "step": 1360 }, { "epoch": 2.571160169093471, "grad_norm": 1.6290921730514096, "learning_rate": 6.172050966535514e-07, "loss": 0.0542, "step": 1370 }, { "epoch": 2.589948332550493, "grad_norm": 1.712949974396625, "learning_rate": 5.65716141337368e-07, "loss": 0.0469, "step": 1380 }, { "epoch": 2.6087364960075154, "grad_norm": 1.3027380061751708, "learning_rate": 5.163405991383114e-07, "loss": 0.0425, "step": 1390 }, { "epoch": 2.6275246594645374, "grad_norm": 1.065265515773574, "learning_rate": 4.6910200281203523e-07, "loss": 0.0504, "step": 1400 }, { "epoch": 2.6463128229215593, "grad_norm": 1.9478720946650225, "learning_rate": 4.240228666296825e-07, "loss": 0.0634, "step": 1410 }, { "epoch": 2.6651009863785813, "grad_norm": 2.5865600012088326, "learning_rate": 3.8112467564740796e-07, "loss": 0.0573, "step": 1420 }, { "epoch": 2.6838891498356037, "grad_norm": 2.0098060853151702, "learning_rate": 3.4042787546644305e-07, "loss": 0.07, "step": 1430 }, { "epoch": 2.7026773132926256, "grad_norm": 1.1444888015196577, "learning_rate": 3.0195186248856866e-07, "loss": 0.0448, "step": 1440 }, { "epoch": 2.7214654767496476, "grad_norm": 1.0223331183810753, "learning_rate": 2.6571497467164033e-07, "loss": 0.058, "step": 1450 }, { "epoch": 2.74025364020667, "grad_norm": 1.4130933630125946, "learning_rate": 2.3173448278958178e-07, "loss": 0.0654, "step": 1460 }, { "epoch": 2.759041803663692, "grad_norm": 1.2949137507027924, "learning_rate": 2.0002658220100334e-07, "loss": 0.0556, "step": 1470 }, { "epoch": 2.777829967120714, "grad_norm": 1.5805556155196185, "learning_rate": 1.7060638513037076e-07, "loss": 0.0558, "step": 1480 }, { "epoch": 2.796618130577736, "grad_norm": 2.0964534267823725, "learning_rate": 1.434879134654077e-07, "loss": 0.0496, "step": 1490 }, { "epoch": 2.815406294034758, "grad_norm": 1.464233077890015, "learning_rate": 1.186840920741561e-07, "loss": 0.0573, "step": 1500 }, { "epoch": 2.815406294034758, "eval_loss": 0.19711866974830627, "eval_runtime": 140.2242, "eval_samples_per_second": 6.746, "eval_steps_per_second": 1.69, "step": 1500 }, { "epoch": 2.83419445749178, "grad_norm": 0.969822959813251, "learning_rate": 9.620674264488594e-08, "loss": 0.0548, "step": 1510 }, { "epoch": 2.8529826209488025, "grad_norm": 1.8413122290273847, "learning_rate": 7.606657805179274e-08, "loss": 0.063, "step": 1520 }, { "epoch": 2.8717707844058245, "grad_norm": 0.7665376382039885, "learning_rate": 5.827319724915959e-08, "loss": 0.0514, "step": 1530 }, { "epoch": 2.8905589478628464, "grad_norm": 0.8215067982809356, "learning_rate": 4.283508069641951e-08, "loss": 0.0467, "step": 1540 }, { "epoch": 2.9093471113198683, "grad_norm": 2.021014987559278, "learning_rate": 2.975958631631082e-08, "loss": 0.0483, "step": 1550 }, { "epoch": 2.9281352747768903, "grad_norm": 1.4695240828377725, "learning_rate": 1.9052945988030648e-08, "loss": 0.0427, "step": 1560 }, { "epoch": 2.9469234382339127, "grad_norm": 1.1451618059955273, "learning_rate": 1.0720262577076923e-08, "loss": 0.0493, "step": 1570 }, { "epoch": 2.9657116016909346, "grad_norm": 2.9129055521839895, "learning_rate": 4.76550750318383e-09, "loss": 0.0542, "step": 1580 }, { "epoch": 2.984499765147957, "grad_norm": 1.7988477080352803, "learning_rate": 1.1915188475125627e-09, "loss": 0.04, "step": 1590 }, { "epoch": 3.0, "step": 1599, "total_flos": 141952390594560.0, "train_loss": 0.17646967794389706, "train_runtime": 16767.1291, "train_samples_per_second": 1.523, "train_steps_per_second": 0.095 } ], "logging_steps": 10, "max_steps": 1599, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 141952390594560.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }