{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 1599,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.018788163457022077,
      "grad_norm": 38.902539143522475,
      "learning_rate": 5.625e-07,
      "loss": 1.3022,
      "step": 10
    },
    {
      "epoch": 0.03757632691404415,
      "grad_norm": 25.841390543115104,
      "learning_rate": 1.1875e-06,
      "loss": 0.9208,
      "step": 20
    },
    {
      "epoch": 0.05636449037106623,
      "grad_norm": 12.835104883604552,
      "learning_rate": 1.8125e-06,
      "loss": 0.6368,
      "step": 30
    },
    {
      "epoch": 0.0751526538280883,
      "grad_norm": 14.597848294473495,
      "learning_rate": 2.4375e-06,
      "loss": 0.4864,
      "step": 40
    },
    {
      "epoch": 0.09394081728511038,
      "grad_norm": 10.147065718110532,
      "learning_rate": 3.0625000000000003e-06,
      "loss": 0.413,
      "step": 50
    },
    {
      "epoch": 0.11272898074213246,
      "grad_norm": 7.953678589468881,
      "learning_rate": 3.6875000000000007e-06,
      "loss": 0.3552,
      "step": 60
    },
    {
      "epoch": 0.13151714419915453,
      "grad_norm": 15.497275455908772,
      "learning_rate": 4.312500000000001e-06,
      "loss": 0.3232,
      "step": 70
    },
    {
      "epoch": 0.1503053076561766,
      "grad_norm": 10.684325762714469,
      "learning_rate": 4.937500000000001e-06,
      "loss": 0.3995,
      "step": 80
    },
    {
      "epoch": 0.1690934711131987,
      "grad_norm": 5.578373425613698,
      "learning_rate": 5.5625000000000005e-06,
      "loss": 0.383,
      "step": 90
    },
    {
      "epoch": 0.18788163457022075,
      "grad_norm": 6.8251720225201336,
      "learning_rate": 6.1875000000000005e-06,
      "loss": 0.4042,
      "step": 100
    },
    {
      "epoch": 0.20666979802724283,
      "grad_norm": 6.671521431411895,
      "learning_rate": 6.8125e-06,
      "loss": 0.2883,
      "step": 110
    },
    {
      "epoch": 0.22545796148426492,
      "grad_norm": 5.055948701380774,
      "learning_rate": 7.437500000000001e-06,
      "loss": 0.3389,
      "step": 120
    },
    {
      "epoch": 0.244246124941287,
      "grad_norm": 6.217541233533783,
      "learning_rate": 8.062500000000001e-06,
      "loss": 0.2971,
      "step": 130
    },
    {
      "epoch": 0.26303428839830906,
      "grad_norm": 4.948590364279342,
      "learning_rate": 8.687500000000001e-06,
      "loss": 0.3087,
      "step": 140
    },
    {
      "epoch": 0.28182245185533117,
      "grad_norm": 9.515982957654412,
      "learning_rate": 9.312500000000001e-06,
      "loss": 0.328,
      "step": 150
    },
    {
      "epoch": 0.3006106153123532,
      "grad_norm": 5.588586595769071,
      "learning_rate": 9.937500000000001e-06,
      "loss": 0.3183,
      "step": 160
    },
    {
      "epoch": 0.3193987787693753,
      "grad_norm": 6.382027734743243,
      "learning_rate": 9.999034862449997e-06,
      "loss": 0.3427,
      "step": 170
    },
    {
      "epoch": 0.3381869422263974,
      "grad_norm": 7.190427289350578,
      "learning_rate": 9.995699062853814e-06,
      "loss": 0.3567,
      "step": 180
    },
    {
      "epoch": 0.35697510568341945,
      "grad_norm": 7.34935653943741,
      "learning_rate": 9.989982275421674e-06,
      "loss": 0.2315,
      "step": 190
    },
    {
      "epoch": 0.3757632691404415,
      "grad_norm": 6.295235795739874,
      "learning_rate": 9.981887224817565e-06,
      "loss": 0.3465,
      "step": 200
    },
    {
      "epoch": 0.3945514325974636,
      "grad_norm": 6.275130440505698,
      "learning_rate": 9.971417769203639e-06,
      "loss": 0.3036,
      "step": 210
    },
    {
      "epoch": 0.41333959605448567,
      "grad_norm": 7.6581869277864465,
      "learning_rate": 9.958578898401365e-06,
      "loss": 0.314,
      "step": 220
    },
    {
      "epoch": 0.4321277595115077,
      "grad_norm": 4.7865555589430535,
      "learning_rate": 9.943376731513364e-06,
      "loss": 0.3888,
      "step": 230
    },
    {
      "epoch": 0.45091592296852984,
      "grad_norm": 5.5592205355180155,
      "learning_rate": 9.92581851400698e-06,
      "loss": 0.3072,
      "step": 240
    },
    {
      "epoch": 0.4697040864255519,
      "grad_norm": 7.370996615746405,
      "learning_rate": 9.90591261426105e-06,
      "loss": 0.3344,
      "step": 250
    },
    {
      "epoch": 0.488492249882574,
      "grad_norm": 5.867505322765833,
      "learning_rate": 9.883668519577464e-06,
      "loss": 0.2927,
      "step": 260
    },
    {
      "epoch": 0.5072804133395961,
      "grad_norm": 5.191413529518597,
      "learning_rate": 9.85909683165945e-06,
      "loss": 0.2952,
      "step": 270
    },
    {
      "epoch": 0.5260685767966181,
      "grad_norm": 19.97325422873732,
      "learning_rate": 9.832209261558707e-06,
      "loss": 0.2545,
      "step": 280
    },
    {
      "epoch": 0.5448567402536402,
      "grad_norm": 4.341682760199477,
      "learning_rate": 9.803018624093859e-06,
      "loss": 0.2789,
      "step": 290
    },
    {
      "epoch": 0.5636449037106623,
      "grad_norm": 5.155418355116268,
      "learning_rate": 9.771538831742785e-06,
      "loss": 0.3209,
      "step": 300
    },
    {
      "epoch": 0.5824330671676844,
      "grad_norm": 5.585775233082983,
      "learning_rate": 9.737784888011847e-06,
      "loss": 0.2721,
      "step": 310
    },
    {
      "epoch": 0.6012212306247064,
      "grad_norm": 6.011924955720608,
      "learning_rate": 9.701772880285098e-06,
      "loss": 0.2369,
      "step": 320
    },
    {
      "epoch": 0.6200093940817285,
      "grad_norm": 3.464693811617661,
      "learning_rate": 9.663519972156919e-06,
      "loss": 0.327,
      "step": 330
    },
    {
      "epoch": 0.6387975575387506,
      "grad_norm": 3.9209881919217473,
      "learning_rate": 9.623044395251709e-06,
      "loss": 0.3057,
      "step": 340
    },
    {
      "epoch": 0.6575857209957726,
      "grad_norm": 3.4697639872647668,
      "learning_rate": 9.580365440534567e-06,
      "loss": 0.2397,
      "step": 350
    },
    {
      "epoch": 0.6763738844527948,
      "grad_norm": 3.7401214735406234,
      "learning_rate": 9.535503449117067e-06,
      "loss": 0.2472,
      "step": 360
    },
    {
      "epoch": 0.6951620479098168,
      "grad_norm": 6.346162811263442,
      "learning_rate": 9.488479802562535e-06,
      "loss": 0.2861,
      "step": 370
    },
    {
      "epoch": 0.7139502113668389,
      "grad_norm": 3.159937104618782,
      "learning_rate": 9.439316912695433e-06,
      "loss": 0.263,
      "step": 380
    },
    {
      "epoch": 0.732738374823861,
      "grad_norm": 4.840970300357461,
      "learning_rate": 9.388038210919706e-06,
      "loss": 0.317,
      "step": 390
    },
    {
      "epoch": 0.751526538280883,
      "grad_norm": 3.7912950448134284,
      "learning_rate": 9.334668137051213e-06,
      "loss": 0.2399,
      "step": 400
    },
    {
      "epoch": 0.7703147017379052,
      "grad_norm": 3.6544447000330536,
      "learning_rate": 9.279232127669519e-06,
      "loss": 0.2855,
      "step": 410
    },
    {
      "epoch": 0.7891028651949272,
      "grad_norm": 3.415036837035855,
      "learning_rate": 9.221756603994622e-06,
      "loss": 0.2258,
      "step": 420
    },
    {
      "epoch": 0.8078910286519493,
      "grad_norm": 4.076375098278762,
      "learning_rate": 9.162268959294421e-06,
      "loss": 0.2501,
      "step": 430
    },
    {
      "epoch": 0.8266791921089713,
      "grad_norm": 3.496486007224735,
      "learning_rate": 9.10079754582885e-06,
      "loss": 0.1867,
      "step": 440
    },
    {
      "epoch": 0.8454673555659934,
      "grad_norm": 3.469220654012647,
      "learning_rate": 9.037371661337006e-06,
      "loss": 0.2623,
      "step": 450
    },
    {
      "epoch": 0.8642555190230155,
      "grad_norm": 5.445833530786857,
      "learning_rate": 8.972021535073605e-06,
      "loss": 0.2986,
      "step": 460
    },
    {
      "epoch": 0.8830436824800376,
      "grad_norm": 2.807672252814633,
      "learning_rate": 8.904778313401497e-06,
      "loss": 0.2536,
      "step": 470
    },
    {
      "epoch": 0.9018318459370597,
      "grad_norm": 7.9424075184306755,
      "learning_rate": 8.835674044947078e-06,
      "loss": 0.3049,
      "step": 480
    },
    {
      "epoch": 0.9206200093940817,
      "grad_norm": 4.713848985031599,
      "learning_rate": 8.764741665325672e-06,
      "loss": 0.2319,
      "step": 490
    },
    {
      "epoch": 0.9394081728511038,
      "grad_norm": 4.413441646714263,
      "learning_rate": 8.692014981444166e-06,
      "loss": 0.2843,
      "step": 500
    },
    {
      "epoch": 0.9394081728511038,
      "eval_loss": 0.23845727741718292,
      "eval_runtime": 139.1371,
      "eval_samples_per_second": 6.799,
      "eval_steps_per_second": 1.703,
      "step": 500
    },
    {
      "epoch": 0.9581963363081258,
      "grad_norm": 1.9799881256140055,
      "learning_rate": 8.617528655388384e-06,
      "loss": 0.2351,
      "step": 510
    },
    {
      "epoch": 0.976984499765148,
      "grad_norm": 5.12487715924031,
      "learning_rate": 8.541318187902879e-06,
      "loss": 0.233,
      "step": 520
    },
    {
      "epoch": 0.9957726632221701,
      "grad_norm": 3.19651743083849,
      "learning_rate": 8.463419901471002e-06,
      "loss": 0.2415,
      "step": 530
    },
    {
      "epoch": 1.0131517144199154,
      "grad_norm": 3.1257162430007224,
      "learning_rate": 8.383870923003345e-06,
      "loss": 0.2149,
      "step": 540
    },
    {
      "epoch": 1.0319398778769375,
      "grad_norm": 3.45631296377837,
      "learning_rate": 8.302709166142765e-06,
      "loss": 0.1915,
      "step": 550
    },
    {
      "epoch": 1.0507280413339597,
      "grad_norm": 2.286180863936669,
      "learning_rate": 8.219973313194461e-06,
      "loss": 0.1317,
      "step": 560
    },
    {
      "epoch": 1.0695162047909816,
      "grad_norm": 2.9540247439256673,
      "learning_rate": 8.135702796689693e-06,
      "loss": 0.1626,
      "step": 570
    },
    {
      "epoch": 1.0883043682480038,
      "grad_norm": 2.1567801477380066,
      "learning_rate": 8.049937780591944e-06,
      "loss": 0.1284,
      "step": 580
    },
    {
      "epoch": 1.1070925317050258,
      "grad_norm": 3.3038946448126256,
      "learning_rate": 7.962719141154469e-06,
      "loss": 0.15,
      "step": 590
    },
    {
      "epoch": 1.125880695162048,
      "grad_norm": 3.0873807975498635,
      "learning_rate": 7.874088447438366e-06,
      "loss": 0.139,
      "step": 600
    },
    {
      "epoch": 1.1446688586190699,
      "grad_norm": 2.051043815446488,
      "learning_rate": 7.784087941500446e-06,
      "loss": 0.1519,
      "step": 610
    },
    {
      "epoch": 1.163457022076092,
      "grad_norm": 2.4617892609619143,
      "learning_rate": 7.692760518260355e-06,
      "loss": 0.1084,
      "step": 620
    },
    {
      "epoch": 1.1822451855331142,
      "grad_norm": 1.7623637548791922,
      "learning_rate": 7.6001497050565256e-06,
      "loss": 0.1348,
      "step": 630
    },
    {
      "epoch": 1.2010333489901361,
      "grad_norm": 3.2507782557395206,
      "learning_rate": 7.506299640900725e-06,
      "loss": 0.1598,
      "step": 640
    },
    {
      "epoch": 1.2198215124471583,
      "grad_norm": 4.2496165791641864,
      "learning_rate": 7.411255055441064e-06,
      "loss": 0.1691,
      "step": 650
    },
    {
      "epoch": 1.2386096759041805,
      "grad_norm": 6.841696105552823,
      "learning_rate": 7.315061247643518e-06,
      "loss": 0.1755,
      "step": 660
    },
    {
      "epoch": 1.2573978393612024,
      "grad_norm": 2.5825812551037224,
      "learning_rate": 7.2177640642020875e-06,
      "loss": 0.1706,
      "step": 670
    },
    {
      "epoch": 1.2761860028182246,
      "grad_norm": 3.371098422024725,
      "learning_rate": 7.119409877687923e-06,
      "loss": 0.1444,
      "step": 680
    },
    {
      "epoch": 1.2949741662752465,
      "grad_norm": 2.590100923291357,
      "learning_rate": 7.0200455644478105e-06,
      "loss": 0.1028,
      "step": 690
    },
    {
      "epoch": 1.3137623297322687,
      "grad_norm": 2.481043190681829,
      "learning_rate": 6.91971848226255e-06,
      "loss": 0.1614,
      "step": 700
    },
    {
      "epoch": 1.3325504931892906,
      "grad_norm": 2.329271785947211,
      "learning_rate": 6.818476447775873e-06,
      "loss": 0.14,
      "step": 710
    },
    {
      "epoch": 1.3513386566463128,
      "grad_norm": 2.646993250936894,
      "learning_rate": 6.7163677137046855e-06,
      "loss": 0.1737,
      "step": 720
    },
    {
      "epoch": 1.370126820103335,
      "grad_norm": 5.413980223824179,
      "learning_rate": 6.6134409458414415e-06,
      "loss": 0.1878,
      "step": 730
    },
    {
      "epoch": 1.388914983560357,
      "grad_norm": 2.1714036144692934,
      "learning_rate": 6.50974519985967e-06,
      "loss": 0.1495,
      "step": 740
    },
    {
      "epoch": 1.407703147017379,
      "grad_norm": 3.133663498685042,
      "learning_rate": 6.405329897933669e-06,
      "loss": 0.1128,
      "step": 750
    },
    {
      "epoch": 1.4264913104744013,
      "grad_norm": 6.167890536629415,
      "learning_rate": 6.300244805183524e-06,
      "loss": 0.1226,
      "step": 760
    },
    {
      "epoch": 1.4452794739314232,
      "grad_norm": 2.676375029014761,
      "learning_rate": 6.194540005956675e-06,
      "loss": 0.1484,
      "step": 770
    },
    {
      "epoch": 1.4640676373884451,
      "grad_norm": 2.475835214596877,
      "learning_rate": 6.088265879957345e-06,
      "loss": 0.1491,
      "step": 780
    },
    {
      "epoch": 1.4828558008454673,
      "grad_norm": 2.2436437435088723,
      "learning_rate": 5.981473078235186e-06,
      "loss": 0.1166,
      "step": 790
    },
    {
      "epoch": 1.5016439643024895,
      "grad_norm": 2.132099788544184,
      "learning_rate": 5.874212499044609e-06,
      "loss": 0.1531,
      "step": 800
    },
    {
      "epoch": 1.5204321277595114,
      "grad_norm": 4.079361340018406,
      "learning_rate": 5.7665352635862945e-06,
      "loss": 0.1398,
      "step": 810
    },
    {
      "epoch": 1.5392202912165336,
      "grad_norm": 1.9003215983018618,
      "learning_rate": 5.658492691642443e-06,
      "loss": 0.1391,
      "step": 820
    },
    {
      "epoch": 1.5580084546735558,
      "grad_norm": 2.6377367761402586,
      "learning_rate": 5.550136277117375e-06,
      "loss": 0.1418,
      "step": 830
    },
    {
      "epoch": 1.5767966181305777,
      "grad_norm": 3.1017028365243995,
      "learning_rate": 5.4415176634951515e-06,
      "loss": 0.1381,
      "step": 840
    },
    {
      "epoch": 1.5955847815875999,
      "grad_norm": 2.6515166581373504,
      "learning_rate": 5.332688619225903e-06,
      "loss": 0.1238,
      "step": 850
    },
    {
      "epoch": 1.614372945044622,
      "grad_norm": 1.922819738710662,
      "learning_rate": 5.22370101305259e-06,
      "loss": 0.1119,
      "step": 860
    },
    {
      "epoch": 1.633161108501644,
      "grad_norm": 2.529225306976089,
      "learning_rate": 5.114606789289973e-06,
      "loss": 0.1622,
      "step": 870
    },
    {
      "epoch": 1.651949271958666,
      "grad_norm": 1.288640339335927,
      "learning_rate": 5.005457943067561e-06,
      "loss": 0.1192,
      "step": 880
    },
    {
      "epoch": 1.670737435415688,
      "grad_norm": 3.4037089339443782,
      "learning_rate": 4.896306495548334e-06,
      "loss": 0.1039,
      "step": 890
    },
    {
      "epoch": 1.6895255988727103,
      "grad_norm": 3.9649233321606805,
      "learning_rate": 4.7872044691350735e-06,
      "loss": 0.1375,
      "step": 900
    },
    {
      "epoch": 1.7083137623297322,
      "grad_norm": 2.5965801580755286,
      "learning_rate": 4.678203862676091e-06,
      "loss": 0.1092,
      "step": 910
    },
    {
      "epoch": 1.7271019257867544,
      "grad_norm": 2.5786254690111265,
      "learning_rate": 4.569356626682181e-06,
      "loss": 0.1239,
      "step": 920
    },
    {
      "epoch": 1.7458900892437765,
      "grad_norm": 3.8696436706505435,
      "learning_rate": 4.4607146385666145e-06,
      "loss": 0.1387,
      "step": 930
    },
    {
      "epoch": 1.7646782527007985,
      "grad_norm": 2.851127249549094,
      "learning_rate": 4.352329677919983e-06,
      "loss": 0.1595,
      "step": 940
    },
    {
      "epoch": 1.7834664161578204,
      "grad_norm": 2.2742219632834795,
      "learning_rate": 4.244253401831646e-06,
      "loss": 0.11,
      "step": 950
    },
    {
      "epoch": 1.8022545796148428,
      "grad_norm": 1.9123730074854848,
      "learning_rate": 4.136537320269571e-06,
      "loss": 0.1205,
      "step": 960
    },
    {
      "epoch": 1.8210427430718648,
      "grad_norm": 2.5825751840603277,
      "learning_rate": 4.029232771530306e-06,
      "loss": 0.1134,
      "step": 970
    },
    {
      "epoch": 1.8398309065288867,
      "grad_norm": 2.471433944974579,
      "learning_rate": 3.92239089777075e-06,
      "loss": 0.1123,
      "step": 980
    },
    {
      "epoch": 1.8586190699859089,
      "grad_norm": 3.5281114838659686,
      "learning_rate": 3.816062620633414e-06,
      "loss": 0.1188,
      "step": 990
    },
    {
      "epoch": 1.877407233442931,
      "grad_norm": 1.9634958487628664,
      "learning_rate": 3.7102986169767954e-06,
      "loss": 0.1062,
      "step": 1000
    },
    {
      "epoch": 1.877407233442931,
      "eval_loss": 0.19623179733753204,
      "eval_runtime": 139.2435,
      "eval_samples_per_second": 6.794,
      "eval_steps_per_second": 1.702,
      "step": 1000
    },
    {
      "epoch": 1.896195396899953,
      "grad_norm": 3.6853787421863164,
      "learning_rate": 3.605149294722392e-06,
      "loss": 0.1055,
      "step": 1010
    },
    {
      "epoch": 1.9149835603569751,
      "grad_norm": 3.7066247620003168,
      "learning_rate": 3.500664768829908e-06,
      "loss": 0.125,
      "step": 1020
    },
    {
      "epoch": 1.9337717238139973,
      "grad_norm": 1.9487521917157127,
      "learning_rate": 3.3968948374120958e-06,
      "loss": 0.1046,
      "step": 1030
    },
    {
      "epoch": 1.9525598872710193,
      "grad_norm": 1.7787524143692979,
      "learning_rate": 3.2938889580005932e-06,
      "loss": 0.1308,
      "step": 1040
    },
    {
      "epoch": 1.9713480507280412,
      "grad_norm": 2.4115181227326383,
      "learning_rate": 3.191696223974084e-06,
      "loss": 0.1195,
      "step": 1050
    },
    {
      "epoch": 1.9901362141850634,
      "grad_norm": 1.8676954461633957,
      "learning_rate": 3.090365341160041e-06,
      "loss": 0.119,
      "step": 1060
    },
    {
      "epoch": 2.0075152653828088,
      "grad_norm": 2.0403175416536157,
      "learning_rate": 2.989944604621148e-06,
      "loss": 0.1082,
      "step": 1070
    },
    {
      "epoch": 2.0263034288398307,
      "grad_norm": 1.8796687540004382,
      "learning_rate": 2.8904818756375076e-06,
      "loss": 0.0649,
      "step": 1080
    },
    {
      "epoch": 2.045091592296853,
      "grad_norm": 1.6678868897065664,
      "learning_rate": 2.792024558895606e-06,
      "loss": 0.0671,
      "step": 1090
    },
    {
      "epoch": 2.063879755753875,
      "grad_norm": 3.1485507384660547,
      "learning_rate": 2.6946195798948755e-06,
      "loss": 0.0597,
      "step": 1100
    },
    {
      "epoch": 2.082667919210897,
      "grad_norm": 0.966514290803291,
      "learning_rate": 2.598313362582639e-06,
      "loss": 0.0582,
      "step": 1110
    },
    {
      "epoch": 2.1014560826679194,
      "grad_norm": 1.3610544972984766,
      "learning_rate": 2.5031518072281236e-06,
      "loss": 0.0609,
      "step": 1120
    },
    {
      "epoch": 2.1202442461249413,
      "grad_norm": 1.5921330796712605,
      "learning_rate": 2.4091802685460336e-06,
      "loss": 0.0664,
      "step": 1130
    },
    {
      "epoch": 2.1390324095819633,
      "grad_norm": 2.3312224294157025,
      "learning_rate": 2.3164435340801574e-06,
      "loss": 0.0675,
      "step": 1140
    },
    {
      "epoch": 2.1578205730389852,
      "grad_norm": 1.7103466660833229,
      "learning_rate": 2.224985802857284e-06,
      "loss": 0.0453,
      "step": 1150
    },
    {
      "epoch": 2.1766087364960076,
      "grad_norm": 2.060481882429091,
      "learning_rate": 2.134850664321617e-06,
      "loss": 0.0625,
      "step": 1160
    },
    {
      "epoch": 2.1953968999530296,
      "grad_norm": 0.9427782782833426,
      "learning_rate": 2.046081077559707e-06,
      "loss": 0.0565,
      "step": 1170
    },
    {
      "epoch": 2.2141850634100515,
      "grad_norm": 1.9240741575863745,
      "learning_rate": 1.9587193508258415e-06,
      "loss": 0.051,
      "step": 1180
    },
    {
      "epoch": 2.232973226867074,
      "grad_norm": 1.2001636500769781,
      "learning_rate": 1.8728071213776028e-06,
      "loss": 0.048,
      "step": 1190
    },
    {
      "epoch": 2.251761390324096,
      "grad_norm": 1.8036945101434407,
      "learning_rate": 1.7883853356312375e-06,
      "loss": 0.0575,
      "step": 1200
    },
    {
      "epoch": 2.270549553781118,
      "grad_norm": 1.2865058971634646,
      "learning_rate": 1.7054942296462895e-06,
      "loss": 0.0708,
      "step": 1210
    },
    {
      "epoch": 2.2893377172381397,
      "grad_norm": 1.9144408525509076,
      "learning_rate": 1.6241733099487888e-06,
      "loss": 0.0513,
      "step": 1220
    },
    {
      "epoch": 2.308125880695162,
      "grad_norm": 1.7106138783392042,
      "learning_rate": 1.5444613347021392e-06,
      "loss": 0.0562,
      "step": 1230
    },
    {
      "epoch": 2.326914044152184,
      "grad_norm": 1.2128475292175165,
      "learning_rate": 1.4663962952346938e-06,
      "loss": 0.0507,
      "step": 1240
    },
    {
      "epoch": 2.3457022076092064,
      "grad_norm": 1.321730273201786,
      "learning_rate": 1.3900153979327951e-06,
      "loss": 0.0577,
      "step": 1250
    },
    {
      "epoch": 2.3644903710662284,
      "grad_norm": 2.7875274955613745,
      "learning_rate": 1.315355046507934e-06,
      "loss": 0.0648,
      "step": 1260
    },
    {
      "epoch": 2.3832785345232503,
      "grad_norm": 1.8288963759953032,
      "learning_rate": 1.2424508246464635e-06,
      "loss": 0.0558,
      "step": 1270
    },
    {
      "epoch": 2.4020666979802723,
      "grad_norm": 1.537482323897175,
      "learning_rate": 1.171337479050148e-06,
      "loss": 0.061,
      "step": 1280
    },
    {
      "epoch": 2.4208548614372947,
      "grad_norm": 2.1795553061277357,
      "learning_rate": 1.1020489028756243e-06,
      "loss": 0.0521,
      "step": 1290
    },
    {
      "epoch": 2.4396430248943166,
      "grad_norm": 1.75277898274252,
      "learning_rate": 1.0346181195806614e-06,
      "loss": 0.0583,
      "step": 1300
    },
    {
      "epoch": 2.4584311883513386,
      "grad_norm": 1.7605887217641596,
      "learning_rate": 9.690772671849403e-07,
      "loss": 0.0546,
      "step": 1310
    },
    {
      "epoch": 2.477219351808361,
      "grad_norm": 1.447955784872353,
      "learning_rate": 9.054575829528251e-07,
      "loss": 0.0551,
      "step": 1320
    },
    {
      "epoch": 2.496007515265383,
      "grad_norm": 5.225313476562762,
      "learning_rate": 8.437893885054504e-07,
      "loss": 0.0517,
      "step": 1330
    },
    {
      "epoch": 2.514795678722405,
      "grad_norm": 1.5717503369334935,
      "learning_rate": 7.841020753692058e-07,
      "loss": 0.0545,
      "step": 1340
    },
    {
      "epoch": 2.533583842179427,
      "grad_norm": 2.6343667579863963,
      "learning_rate": 7.264240909675174e-07,
      "loss": 0.0472,
      "step": 1350
    },
    {
      "epoch": 2.552372005636449,
      "grad_norm": 2.023557390851674,
      "learning_rate": 6.707829250625825e-07,
      "loss": 0.0446,
      "step": 1360
    },
    {
      "epoch": 2.571160169093471,
      "grad_norm": 1.6290921730514096,
      "learning_rate": 6.172050966535514e-07,
      "loss": 0.0542,
      "step": 1370
    },
    {
      "epoch": 2.589948332550493,
      "grad_norm": 1.712949974396625,
      "learning_rate": 5.65716141337368e-07,
      "loss": 0.0469,
      "step": 1380
    },
    {
      "epoch": 2.6087364960075154,
      "grad_norm": 1.3027380061751708,
      "learning_rate": 5.163405991383114e-07,
      "loss": 0.0425,
      "step": 1390
    },
    {
      "epoch": 2.6275246594645374,
      "grad_norm": 1.065265515773574,
      "learning_rate": 4.6910200281203523e-07,
      "loss": 0.0504,
      "step": 1400
    },
    {
      "epoch": 2.6463128229215593,
      "grad_norm": 1.9478720946650225,
      "learning_rate": 4.240228666296825e-07,
      "loss": 0.0634,
      "step": 1410
    },
    {
      "epoch": 2.6651009863785813,
      "grad_norm": 2.5865600012088326,
      "learning_rate": 3.8112467564740796e-07,
      "loss": 0.0573,
      "step": 1420
    },
    {
      "epoch": 2.6838891498356037,
      "grad_norm": 2.0098060853151702,
      "learning_rate": 3.4042787546644305e-07,
      "loss": 0.07,
      "step": 1430
    },
    {
      "epoch": 2.7026773132926256,
      "grad_norm": 1.1444888015196577,
      "learning_rate": 3.0195186248856866e-07,
      "loss": 0.0448,
      "step": 1440
    },
    {
      "epoch": 2.7214654767496476,
      "grad_norm": 1.0223331183810753,
      "learning_rate": 2.6571497467164033e-07,
      "loss": 0.058,
      "step": 1450
    },
    {
      "epoch": 2.74025364020667,
      "grad_norm": 1.4130933630125946,
      "learning_rate": 2.3173448278958178e-07,
      "loss": 0.0654,
      "step": 1460
    },
    {
      "epoch": 2.759041803663692,
      "grad_norm": 1.2949137507027924,
      "learning_rate": 2.0002658220100334e-07,
      "loss": 0.0556,
      "step": 1470
    },
    {
      "epoch": 2.777829967120714,
      "grad_norm": 1.5805556155196185,
      "learning_rate": 1.7060638513037076e-07,
      "loss": 0.0558,
      "step": 1480
    },
    {
      "epoch": 2.796618130577736,
      "grad_norm": 2.0964534267823725,
      "learning_rate": 1.434879134654077e-07,
      "loss": 0.0496,
      "step": 1490
    },
    {
      "epoch": 2.815406294034758,
      "grad_norm": 1.464233077890015,
      "learning_rate": 1.186840920741561e-07,
      "loss": 0.0573,
      "step": 1500
    },
    {
      "epoch": 2.815406294034758,
      "eval_loss": 0.19711866974830627,
      "eval_runtime": 140.2242,
      "eval_samples_per_second": 6.746,
      "eval_steps_per_second": 1.69,
      "step": 1500
    },
    {
      "epoch": 2.83419445749178,
      "grad_norm": 0.969822959813251,
      "learning_rate": 9.620674264488594e-08,
      "loss": 0.0548,
      "step": 1510
    },
    {
      "epoch": 2.8529826209488025,
      "grad_norm": 1.8413122290273847,
      "learning_rate": 7.606657805179274e-08,
      "loss": 0.063,
      "step": 1520
    },
    {
      "epoch": 2.8717707844058245,
      "grad_norm": 0.7665376382039885,
      "learning_rate": 5.827319724915959e-08,
      "loss": 0.0514,
      "step": 1530
    },
    {
      "epoch": 2.8905589478628464,
      "grad_norm": 0.8215067982809356,
      "learning_rate": 4.283508069641951e-08,
      "loss": 0.0467,
      "step": 1540
    },
    {
      "epoch": 2.9093471113198683,
      "grad_norm": 2.021014987559278,
      "learning_rate": 2.975958631631082e-08,
      "loss": 0.0483,
      "step": 1550
    },
    {
      "epoch": 2.9281352747768903,
      "grad_norm": 1.4695240828377725,
      "learning_rate": 1.9052945988030648e-08,
      "loss": 0.0427,
      "step": 1560
    },
    {
      "epoch": 2.9469234382339127,
      "grad_norm": 1.1451618059955273,
      "learning_rate": 1.0720262577076923e-08,
      "loss": 0.0493,
      "step": 1570
    },
    {
      "epoch": 2.9657116016909346,
      "grad_norm": 2.9129055521839895,
      "learning_rate": 4.76550750318383e-09,
      "loss": 0.0542,
      "step": 1580
    },
    {
      "epoch": 2.984499765147957,
      "grad_norm": 1.7988477080352803,
      "learning_rate": 1.1915188475125627e-09,
      "loss": 0.04,
      "step": 1590
    },
    {
      "epoch": 3.0,
      "step": 1599,
      "total_flos": 141952390594560.0,
      "train_loss": 0.17646967794389706,
      "train_runtime": 16767.1291,
      "train_samples_per_second": 1.523,
      "train_steps_per_second": 0.095
    }
  ],
  "logging_steps": 10,
  "max_steps": 1599,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 141952390594560.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}