| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.5068423720223011, |
| "eval_steps": 500, |
| "global_step": 3000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.00016894745734076703, |
| "grad_norm": 6.621600151062012, |
| "learning_rate": 0.0, |
| "loss": 10.540443420410156, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.0016894745734076701, |
| "grad_norm": 6.755760669708252, |
| "learning_rate": 1.3499999999999998e-06, |
| "loss": 10.498290167914497, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0033789491468153403, |
| "grad_norm": 5.475676536560059, |
| "learning_rate": 2.85e-06, |
| "loss": 10.216492462158204, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.00506842372022301, |
| "grad_norm": 2.3924124240875244, |
| "learning_rate": 4.35e-06, |
| "loss": 9.751193237304687, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.0067578982936306806, |
| "grad_norm": 2.018623113632202, |
| "learning_rate": 5.85e-06, |
| "loss": 9.445990753173827, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.00844737286703835, |
| "grad_norm": 1.15117609500885, |
| "learning_rate": 7.35e-06, |
| "loss": 9.269255065917969, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.01013684744044602, |
| "grad_norm": 0.9644901752471924, |
| "learning_rate": 8.849999999999998e-06, |
| "loss": 9.1482421875, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.011826322013853691, |
| "grad_norm": 0.9443461894989014, |
| "learning_rate": 1.035e-05, |
| "loss": 9.042950439453126, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.013515796587261361, |
| "grad_norm": 0.8729987144470215, |
| "learning_rate": 1.1849999999999998e-05, |
| "loss": 8.95867462158203, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.015205271160669031, |
| "grad_norm": 0.7886430621147156, |
| "learning_rate": 1.3349999999999998e-05, |
| "loss": 8.854803466796875, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.0168947457340767, |
| "grad_norm": 0.8168472647666931, |
| "learning_rate": 1.485e-05, |
| "loss": 8.736968231201171, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.018584220307484373, |
| "grad_norm": 0.6868988275527954, |
| "learning_rate": 1.6349999999999998e-05, |
| "loss": 8.659466552734376, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.02027369488089204, |
| "grad_norm": 0.6943208575248718, |
| "learning_rate": 1.7849999999999997e-05, |
| "loss": 8.55049819946289, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.021963169454299714, |
| "grad_norm": 0.6718711853027344, |
| "learning_rate": 1.935e-05, |
| "loss": 8.454410552978516, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.023652644027707382, |
| "grad_norm": 0.5750948190689087, |
| "learning_rate": 2.085e-05, |
| "loss": 8.361714172363282, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.025342118601115054, |
| "grad_norm": 0.545462965965271, |
| "learning_rate": 2.2349999999999998e-05, |
| "loss": 8.286084747314453, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.027031593174522722, |
| "grad_norm": 0.6024239659309387, |
| "learning_rate": 2.3849999999999997e-05, |
| "loss": 8.2337158203125, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.028721067747930394, |
| "grad_norm": 0.5649603605270386, |
| "learning_rate": 2.535e-05, |
| "loss": 8.169093322753906, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.030410542321338063, |
| "grad_norm": 0.42989474534988403, |
| "learning_rate": 2.6849999999999995e-05, |
| "loss": 8.12140121459961, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.03210001689474573, |
| "grad_norm": 0.5494393110275269, |
| "learning_rate": 2.8349999999999998e-05, |
| "loss": 8.079795837402344, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.0337894914681534, |
| "grad_norm": 0.48430609703063965, |
| "learning_rate": 2.985e-05, |
| "loss": 8.047111511230469, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.035478966041561075, |
| "grad_norm": 0.45849987864494324, |
| "learning_rate": 3.1349999999999996e-05, |
| "loss": 7.997683715820313, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.03716844061496875, |
| "grad_norm": 0.3945513665676117, |
| "learning_rate": 3.285e-05, |
| "loss": 7.981051635742188, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.03885791518837641, |
| "grad_norm": 0.4015548825263977, |
| "learning_rate": 3.435e-05, |
| "loss": 7.976935577392578, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.04054738976178408, |
| "grad_norm": 0.6867141127586365, |
| "learning_rate": 3.585e-05, |
| "loss": 7.935280609130859, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.042236864335191755, |
| "grad_norm": 0.47820013761520386, |
| "learning_rate": 3.735e-05, |
| "loss": 7.920246124267578, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.04392633890859943, |
| "grad_norm": 0.44240179657936096, |
| "learning_rate": 3.8849999999999996e-05, |
| "loss": 7.917320251464844, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.0456158134820071, |
| "grad_norm": 0.49838986992836, |
| "learning_rate": 4.035e-05, |
| "loss": 7.892914581298828, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.047305288055414764, |
| "grad_norm": 0.41489648818969727, |
| "learning_rate": 4.185e-05, |
| "loss": 7.873424530029297, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.048994762628822436, |
| "grad_norm": 0.3524978756904602, |
| "learning_rate": 4.334999999999999e-05, |
| "loss": 7.852528381347656, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.05068423720223011, |
| "grad_norm": 0.46830496191978455, |
| "learning_rate": 4.484999999999999e-05, |
| "loss": 7.83319091796875, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.05237371177563778, |
| "grad_norm": 0.5290191173553467, |
| "learning_rate": 4.6349999999999995e-05, |
| "loss": 7.8142448425292965, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.054063186349045445, |
| "grad_norm": 0.4697173535823822, |
| "learning_rate": 4.785e-05, |
| "loss": 7.787006378173828, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.055752660922453116, |
| "grad_norm": 0.5864154100418091, |
| "learning_rate": 4.935e-05, |
| "loss": 7.743091583251953, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.05744213549586079, |
| "grad_norm": 0.5467583537101746, |
| "learning_rate": 5.0849999999999996e-05, |
| "loss": 7.7266998291015625, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.05913161006926846, |
| "grad_norm": 0.5317718982696533, |
| "learning_rate": 5.234999999999999e-05, |
| "loss": 7.715788269042969, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.060821084642676125, |
| "grad_norm": 0.45864003896713257, |
| "learning_rate": 5.3849999999999994e-05, |
| "loss": 7.676011657714843, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.0625105592160838, |
| "grad_norm": 0.6899635195732117, |
| "learning_rate": 5.535e-05, |
| "loss": 7.6587471008300785, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.06420003378949146, |
| "grad_norm": 0.4785831868648529, |
| "learning_rate": 5.684999999999999e-05, |
| "loss": 7.648049163818359, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.06588950836289914, |
| "grad_norm": 0.42162397503852844, |
| "learning_rate": 5.8349999999999995e-05, |
| "loss": 7.612094116210938, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.0675789829363068, |
| "grad_norm": 0.6696052551269531, |
| "learning_rate": 5.985e-05, |
| "loss": 7.605092620849609, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.06926845750971448, |
| "grad_norm": 0.5291442275047302, |
| "learning_rate": 6.134999999999999e-05, |
| "loss": 7.5769294738769535, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.07095793208312215, |
| "grad_norm": 0.6115548014640808, |
| "learning_rate": 6.285e-05, |
| "loss": 7.553981781005859, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.07264740665652981, |
| "grad_norm": 0.5771138668060303, |
| "learning_rate": 6.434999999999999e-05, |
| "loss": 7.541645812988281, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.0743368812299375, |
| "grad_norm": 0.647227942943573, |
| "learning_rate": 6.584999999999999e-05, |
| "loss": 7.522480010986328, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.07602635580334516, |
| "grad_norm": 0.6701403856277466, |
| "learning_rate": 6.735e-05, |
| "loss": 7.49889907836914, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.07771583037675282, |
| "grad_norm": 0.844932496547699, |
| "learning_rate": 6.884999999999999e-05, |
| "loss": 7.476522064208984, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.0794053049501605, |
| "grad_norm": 0.5116700530052185, |
| "learning_rate": 7.034999999999999e-05, |
| "loss": 7.456998443603515, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.08109477952356817, |
| "grad_norm": 0.5343000292778015, |
| "learning_rate": 7.184999999999998e-05, |
| "loss": 7.443318939208984, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.08278425409697585, |
| "grad_norm": 0.6147258281707764, |
| "learning_rate": 7.335e-05, |
| "loss": 7.403359985351562, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.08447372867038351, |
| "grad_norm": 0.6813654899597168, |
| "learning_rate": 7.484999999999999e-05, |
| "loss": 7.403208160400391, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.08447372867038351, |
| "eval_loss": 7.390021324157715, |
| "eval_runtime": 4.0235, |
| "eval_samples_per_second": 248.538, |
| "eval_steps_per_second": 5.219, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.08616320324379118, |
| "grad_norm": 0.6618097424507141, |
| "learning_rate": 7.635e-05, |
| "loss": 7.392906188964844, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.08785267781719885, |
| "grad_norm": 0.6140709519386292, |
| "learning_rate": 7.785e-05, |
| "loss": 7.364067840576172, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.08954215239060652, |
| "grad_norm": 0.6116703748703003, |
| "learning_rate": 7.934999999999999e-05, |
| "loss": 7.337810516357422, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.0912316269640142, |
| "grad_norm": 0.8000091314315796, |
| "learning_rate": 8.085e-05, |
| "loss": 7.299466705322265, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.09292110153742186, |
| "grad_norm": 0.5890388488769531, |
| "learning_rate": 8.235e-05, |
| "loss": 7.308570098876953, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.09461057611082953, |
| "grad_norm": 1.0396614074707031, |
| "learning_rate": 8.385e-05, |
| "loss": 7.27392349243164, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.0963000506842372, |
| "grad_norm": 0.5742290019989014, |
| "learning_rate": 8.534999999999999e-05, |
| "loss": 7.271208953857422, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.09798952525764487, |
| "grad_norm": 0.684992790222168, |
| "learning_rate": 8.684999999999998e-05, |
| "loss": 7.2550514221191404, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.09967899983105254, |
| "grad_norm": 1.2290043830871582, |
| "learning_rate": 8.834999999999999e-05, |
| "loss": 7.2304443359375, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.10136847440446022, |
| "grad_norm": 0.7645843029022217, |
| "learning_rate": 8.984999999999999e-05, |
| "loss": 7.205104064941406, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.10305794897786788, |
| "grad_norm": 0.730484664440155, |
| "learning_rate": 9.134999999999998e-05, |
| "loss": 7.210204315185547, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.10474742355127556, |
| "grad_norm": 0.7423863410949707, |
| "learning_rate": 9.285e-05, |
| "loss": 7.166588592529297, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.10643689812468322, |
| "grad_norm": 0.888006329536438, |
| "learning_rate": 9.434999999999999e-05, |
| "loss": 7.162047576904297, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.10812637269809089, |
| "grad_norm": 0.9920506477355957, |
| "learning_rate": 9.585e-05, |
| "loss": 7.145941925048828, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.10981584727149857, |
| "grad_norm": 0.8996961712837219, |
| "learning_rate": 9.735e-05, |
| "loss": 7.110871124267578, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.11150532184490623, |
| "grad_norm": 0.7783015370368958, |
| "learning_rate": 9.884999999999999e-05, |
| "loss": 7.120133972167968, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.1131947964183139, |
| "grad_norm": 0.8237811923027039, |
| "learning_rate": 0.00010035, |
| "loss": 7.082501220703125, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.11488427099172158, |
| "grad_norm": 0.8586721420288086, |
| "learning_rate": 0.00010185, |
| "loss": 7.066880798339843, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.11657374556512924, |
| "grad_norm": 0.9714040160179138, |
| "learning_rate": 0.00010334999999999998, |
| "loss": 7.058338928222656, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.11826322013853692, |
| "grad_norm": 0.8379534482955933, |
| "learning_rate": 0.00010484999999999999, |
| "loss": 7.033222198486328, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.11995269471194459, |
| "grad_norm": 1.147356629371643, |
| "learning_rate": 0.00010634999999999998, |
| "loss": 7.022679138183594, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.12164216928535225, |
| "grad_norm": 0.946237325668335, |
| "learning_rate": 0.00010784999999999999, |
| "loss": 6.991328430175781, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.12333164385875993, |
| "grad_norm": 1.0189383029937744, |
| "learning_rate": 0.00010934999999999999, |
| "loss": 6.9945930480957035, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.1250211184321676, |
| "grad_norm": 1.0500218868255615, |
| "learning_rate": 0.00011084999999999998, |
| "loss": 6.963920593261719, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.12671059300557527, |
| "grad_norm": 0.9184631109237671, |
| "learning_rate": 0.00011235, |
| "loss": 6.966143798828125, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.12840006757898292, |
| "grad_norm": 0.7820301651954651, |
| "learning_rate": 0.00011384999999999999, |
| "loss": 6.938487243652344, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.1300895421523906, |
| "grad_norm": 0.861544668674469, |
| "learning_rate": 0.00011535, |
| "loss": 6.920912170410157, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.13177901672579828, |
| "grad_norm": 0.7475805878639221, |
| "learning_rate": 0.00011685, |
| "loss": 6.9139961242675785, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.13346849129920596, |
| "grad_norm": 1.1264002323150635, |
| "learning_rate": 0.00011834999999999999, |
| "loss": 6.893434143066406, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.1351579658726136, |
| "grad_norm": 0.869057834148407, |
| "learning_rate": 0.00011985, |
| "loss": 6.887288665771484, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.1368474404460213, |
| "grad_norm": 0.831230878829956, |
| "learning_rate": 0.00012135, |
| "loss": 6.863740539550781, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.13853691501942897, |
| "grad_norm": 1.068192720413208, |
| "learning_rate": 0.00012284999999999998, |
| "loss": 6.873618316650391, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.14022638959283662, |
| "grad_norm": 0.9752544164657593, |
| "learning_rate": 0.00012435, |
| "loss": 6.84736328125, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.1419158641662443, |
| "grad_norm": 0.9449293613433838, |
| "learning_rate": 0.00012585, |
| "loss": 6.829524230957031, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.14360533873965198, |
| "grad_norm": 1.104444980621338, |
| "learning_rate": 0.00012734999999999998, |
| "loss": 6.8291679382324215, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.14529481331305963, |
| "grad_norm": 0.9382540583610535, |
| "learning_rate": 0.00012885, |
| "loss": 6.8139289855957035, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.1469842878864673, |
| "grad_norm": 0.7313889861106873, |
| "learning_rate": 0.00013035, |
| "loss": 6.798196411132812, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.148673762459875, |
| "grad_norm": 0.7715932130813599, |
| "learning_rate": 0.00013184999999999998, |
| "loss": 6.805503845214844, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.15036323703328264, |
| "grad_norm": 1.0334839820861816, |
| "learning_rate": 0.00013335, |
| "loss": 6.745892333984375, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.15205271160669032, |
| "grad_norm": 1.1189385652542114, |
| "learning_rate": 0.00013485, |
| "loss": 6.769204711914062, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.153742186180098, |
| "grad_norm": 1.289933681488037, |
| "learning_rate": 0.00013634999999999998, |
| "loss": 6.737556457519531, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.15543166075350565, |
| "grad_norm": 1.0107234716415405, |
| "learning_rate": 0.00013785, |
| "loss": 6.7412353515625, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.15712113532691332, |
| "grad_norm": 0.9233148097991943, |
| "learning_rate": 0.00013935, |
| "loss": 6.707360076904297, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.158810609900321, |
| "grad_norm": 0.9500652551651001, |
| "learning_rate": 0.00014084999999999998, |
| "loss": 6.697336578369141, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.16050008447372868, |
| "grad_norm": 1.0929033756256104, |
| "learning_rate": 0.00014235, |
| "loss": 6.673794555664062, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.16218955904713633, |
| "grad_norm": 1.0387179851531982, |
| "learning_rate": 0.00014384999999999997, |
| "loss": 6.667636871337891, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.163879033620544, |
| "grad_norm": 1.1298182010650635, |
| "learning_rate": 0.00014534999999999998, |
| "loss": 6.645402526855468, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.1655685081939517, |
| "grad_norm": 0.9608763456344604, |
| "learning_rate": 0.00014685, |
| "loss": 6.658983612060547, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.16725798276735934, |
| "grad_norm": 1.2303314208984375, |
| "learning_rate": 0.00014834999999999997, |
| "loss": 6.633333587646485, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.16894745734076702, |
| "grad_norm": 0.9978023767471313, |
| "learning_rate": 0.00014984999999999998, |
| "loss": 6.6367958068847654, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.16894745734076702, |
| "eval_loss": 6.617567539215088, |
| "eval_runtime": 3.6651, |
| "eval_samples_per_second": 272.845, |
| "eval_steps_per_second": 5.73, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.1706369319141747, |
| "grad_norm": 0.9963025450706482, |
| "learning_rate": 0.00015134999999999997, |
| "loss": 6.613154602050781, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.17232640648758235, |
| "grad_norm": 0.872097909450531, |
| "learning_rate": 0.00015284999999999997, |
| "loss": 6.613529968261719, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.17401588106099003, |
| "grad_norm": 1.2607650756835938, |
| "learning_rate": 0.00015434999999999998, |
| "loss": 6.587220001220703, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.1757053556343977, |
| "grad_norm": 1.0194809436798096, |
| "learning_rate": 0.00015584999999999997, |
| "loss": 6.585498046875, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.17739483020780536, |
| "grad_norm": 0.9153720736503601, |
| "learning_rate": 0.00015734999999999998, |
| "loss": 6.5845489501953125, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.17908430478121304, |
| "grad_norm": 1.1903005838394165, |
| "learning_rate": 0.00015884999999999999, |
| "loss": 6.566903686523437, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.18077377935462072, |
| "grad_norm": 0.9262056350708008, |
| "learning_rate": 0.00016034999999999997, |
| "loss": 6.520059204101562, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.1824632539280284, |
| "grad_norm": 1.0881860256195068, |
| "learning_rate": 0.00016184999999999998, |
| "loss": 6.543362426757812, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.18415272850143605, |
| "grad_norm": 0.9753679633140564, |
| "learning_rate": 0.00016334999999999999, |
| "loss": 6.528910064697266, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.18584220307484373, |
| "grad_norm": 1.2809370756149292, |
| "learning_rate": 0.00016485, |
| "loss": 6.49705810546875, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.1875316776482514, |
| "grad_norm": 1.0647395849227905, |
| "learning_rate": 0.00016634999999999998, |
| "loss": 6.508152008056641, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.18922115222165906, |
| "grad_norm": 0.9427017569541931, |
| "learning_rate": 0.00016785, |
| "loss": 6.492857360839844, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.19091062679506673, |
| "grad_norm": 1.1307021379470825, |
| "learning_rate": 0.00016935, |
| "loss": 6.474656677246093, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.1926001013684744, |
| "grad_norm": 1.182411789894104, |
| "learning_rate": 0.00017084999999999998, |
| "loss": 6.457868194580078, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.19428957594188206, |
| "grad_norm": 1.1442158222198486, |
| "learning_rate": 0.00017235, |
| "loss": 6.443910217285156, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.19597905051528974, |
| "grad_norm": 1.2637932300567627, |
| "learning_rate": 0.00017385, |
| "loss": 6.428031158447266, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.19766852508869742, |
| "grad_norm": 1.334306001663208, |
| "learning_rate": 0.00017534999999999998, |
| "loss": 6.415740966796875, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.19935799966210507, |
| "grad_norm": 0.882560670375824, |
| "learning_rate": 0.00017685, |
| "loss": 6.413926696777343, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.20104747423551275, |
| "grad_norm": 0.9657256603240967, |
| "learning_rate": 0.00017835, |
| "loss": 6.425054931640625, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.20273694880892043, |
| "grad_norm": 1.0196014642715454, |
| "learning_rate": 0.00017984999999999998, |
| "loss": 6.391595077514649, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.2044264233823281, |
| "grad_norm": 1.297837257385254, |
| "learning_rate": 0.00018135, |
| "loss": 6.382472991943359, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.20611589795573576, |
| "grad_norm": 1.1288139820098877, |
| "learning_rate": 0.00018285, |
| "loss": 6.358099746704101, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.20780537252914344, |
| "grad_norm": 0.9396995306015015, |
| "learning_rate": 0.00018435, |
| "loss": 6.355449676513672, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.20949484710255112, |
| "grad_norm": 1.1936787366867065, |
| "learning_rate": 0.00018585, |
| "loss": 6.356659698486328, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.21118432167595877, |
| "grad_norm": 0.9550564289093018, |
| "learning_rate": 0.00018735, |
| "loss": 6.337493515014648, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.21287379624936645, |
| "grad_norm": 1.2012646198272705, |
| "learning_rate": 0.00018884999999999996, |
| "loss": 6.317781829833985, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.21456327082277413, |
| "grad_norm": 1.0816755294799805, |
| "learning_rate": 0.00019034999999999996, |
| "loss": 6.316750335693359, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.21625274539618178, |
| "grad_norm": 1.3777987957000732, |
| "learning_rate": 0.00019184999999999997, |
| "loss": 6.3194934844970705, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.21794221996958946, |
| "grad_norm": 1.187603235244751, |
| "learning_rate": 0.00019334999999999998, |
| "loss": 6.30432357788086, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.21963169454299714, |
| "grad_norm": 1.0069150924682617, |
| "learning_rate": 0.00019484999999999997, |
| "loss": 6.2757713317871096, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.2213211691164048, |
| "grad_norm": 1.2410210371017456, |
| "learning_rate": 0.00019634999999999998, |
| "loss": 6.2698211669921875, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.22301064368981247, |
| "grad_norm": 1.1892989873886108, |
| "learning_rate": 0.00019784999999999998, |
| "loss": 6.2431591033935545, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.22470011826322014, |
| "grad_norm": 1.1054743528366089, |
| "learning_rate": 0.00019934999999999997, |
| "loss": 6.26300163269043, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.2263895928366278, |
| "grad_norm": 1.145757794380188, |
| "learning_rate": 0.00020084999999999998, |
| "loss": 6.226350021362305, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.22807906741003547, |
| "grad_norm": 1.0067166090011597, |
| "learning_rate": 0.00020234999999999999, |
| "loss": 6.2175750732421875, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.22976854198344315, |
| "grad_norm": 1.5041327476501465, |
| "learning_rate": 0.00020384999999999997, |
| "loss": 6.191579055786133, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.23145801655685083, |
| "grad_norm": 1.2780109643936157, |
| "learning_rate": 0.00020534999999999998, |
| "loss": 6.204021835327149, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.23314749113025848, |
| "grad_norm": 1.1531580686569214, |
| "learning_rate": 0.00020684999999999999, |
| "loss": 6.191404342651367, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.23483696570366616, |
| "grad_norm": 1.056857705116272, |
| "learning_rate": 0.00020835, |
| "loss": 6.17081298828125, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.23652644027707384, |
| "grad_norm": 1.1238850355148315, |
| "learning_rate": 0.00020984999999999998, |
| "loss": 6.153195190429687, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.2382159148504815, |
| "grad_norm": 1.2115790843963623, |
| "learning_rate": 0.00021135, |
| "loss": 6.157797622680664, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.23990538942388917, |
| "grad_norm": 1.1303883790969849, |
| "learning_rate": 0.00021285, |
| "loss": 6.119416809082031, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.24159486399729685, |
| "grad_norm": 1.2523441314697266, |
| "learning_rate": 0.00021434999999999998, |
| "loss": 6.133832550048828, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.2432843385707045, |
| "grad_norm": 1.1120916604995728, |
| "learning_rate": 0.00021585, |
| "loss": 6.122848129272461, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.24497381314411218, |
| "grad_norm": 1.239675521850586, |
| "learning_rate": 0.00021735, |
| "loss": 6.106191253662109, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.24666328771751986, |
| "grad_norm": 1.1382733583450317, |
| "learning_rate": 0.00021884999999999998, |
| "loss": 6.0912620544433596, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.2483527622909275, |
| "grad_norm": 1.3199714422225952, |
| "learning_rate": 0.00022035, |
| "loss": 6.09831428527832, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.2500422368643352, |
| "grad_norm": 1.2705349922180176, |
| "learning_rate": 0.00022185, |
| "loss": 6.078111267089843, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.25173171143774287, |
| "grad_norm": 1.436306357383728, |
| "learning_rate": 0.00022335, |
| "loss": 6.058963012695313, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.25342118601115055, |
| "grad_norm": 1.179898977279663, |
| "learning_rate": 0.00022485, |
| "loss": 6.029299545288086, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.25342118601115055, |
| "eval_loss": 6.033608436584473, |
| "eval_runtime": 3.6064, |
| "eval_samples_per_second": 277.282, |
| "eval_steps_per_second": 5.823, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.2551106605845582, |
| "grad_norm": 1.3389363288879395, |
| "learning_rate": 0.00022634999999999997, |
| "loss": 6.027260589599609, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.25680013515796585, |
| "grad_norm": 1.2689851522445679, |
| "learning_rate": 0.00022784999999999995, |
| "loss": 6.00293083190918, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.2584896097313735, |
| "grad_norm": 1.4860210418701172, |
| "learning_rate": 0.00022934999999999996, |
| "loss": 5.998868942260742, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.2601790843047812, |
| "grad_norm": 1.2490425109863281, |
| "learning_rate": 0.00023084999999999997, |
| "loss": 5.984478759765625, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.2618685588781889, |
| "grad_norm": 1.5586382150650024, |
| "learning_rate": 0.00023234999999999998, |
| "loss": 5.9672401428222654, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.26355803345159656, |
| "grad_norm": 1.3526853322982788, |
| "learning_rate": 0.00023384999999999997, |
| "loss": 5.982438278198242, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.26524750802500424, |
| "grad_norm": 1.3406753540039062, |
| "learning_rate": 0.00023534999999999997, |
| "loss": 5.938652801513672, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.2669369825984119, |
| "grad_norm": 1.0397038459777832, |
| "learning_rate": 0.00023684999999999998, |
| "loss": 5.920218658447266, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.26862645717181954, |
| "grad_norm": 1.7000986337661743, |
| "learning_rate": 0.00023834999999999997, |
| "loss": 5.896316146850586, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.2703159317452272, |
| "grad_norm": 1.1729341745376587, |
| "learning_rate": 0.00023984999999999998, |
| "loss": 5.8752281188964846, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.2720054063186349, |
| "grad_norm": 1.3115921020507812, |
| "learning_rate": 0.00024134999999999998, |
| "loss": 5.877028274536133, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.2736948808920426, |
| "grad_norm": 1.5481823682785034, |
| "learning_rate": 0.00024284999999999997, |
| "loss": 5.863247299194336, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.27538435546545026, |
| "grad_norm": 1.4173649549484253, |
| "learning_rate": 0.00024435, |
| "loss": 5.848538970947265, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.27707383003885794, |
| "grad_norm": 1.2587963342666626, |
| "learning_rate": 0.00024585, |
| "loss": 5.841713333129883, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.27876330461226556, |
| "grad_norm": 1.0922702550888062, |
| "learning_rate": 0.00024734999999999997, |
| "loss": 5.8486980438232425, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.28045277918567324, |
| "grad_norm": 1.6068239212036133, |
| "learning_rate": 0.00024885, |
| "loss": 5.819171142578125, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.2821422537590809, |
| "grad_norm": 1.5260576009750366, |
| "learning_rate": 0.00025035, |
| "loss": 5.809968566894531, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.2838317283324886, |
| "grad_norm": 1.2246356010437012, |
| "learning_rate": 0.00025184999999999997, |
| "loss": 5.788796997070312, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.2855212029058963, |
| "grad_norm": 1.0366030931472778, |
| "learning_rate": 0.00025335, |
| "loss": 5.78180160522461, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.28721067747930396, |
| "grad_norm": 1.2072358131408691, |
| "learning_rate": 0.00025485, |
| "loss": 5.770789337158203, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.28890015205271163, |
| "grad_norm": 1.3359684944152832, |
| "learning_rate": 0.00025634999999999997, |
| "loss": 5.737417221069336, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.29058962662611926, |
| "grad_norm": 1.355406403541565, |
| "learning_rate": 0.00025785, |
| "loss": 5.725430297851562, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.29227910119952694, |
| "grad_norm": 1.1998307704925537, |
| "learning_rate": 0.00025935, |
| "loss": 5.723165130615234, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.2939685757729346, |
| "grad_norm": 1.0525386333465576, |
| "learning_rate": 0.00026084999999999997, |
| "loss": 5.720573043823242, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.2956580503463423, |
| "grad_norm": 1.2880501747131348, |
| "learning_rate": 0.00026235, |
| "loss": 5.684521102905274, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.29734752491975, |
| "grad_norm": 1.2246838808059692, |
| "learning_rate": 0.00026384999999999994, |
| "loss": 5.670655059814453, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.29903699949315765, |
| "grad_norm": 1.2167463302612305, |
| "learning_rate": 0.00026534999999999997, |
| "loss": 5.690992736816407, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.3007264740665653, |
| "grad_norm": 1.2467341423034668, |
| "learning_rate": 0.00026684999999999995, |
| "loss": 5.694464492797851, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.30241594863997295, |
| "grad_norm": 1.2740100622177124, |
| "learning_rate": 0.00026835, |
| "loss": 5.679082870483398, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.30410542321338063, |
| "grad_norm": 1.2217073440551758, |
| "learning_rate": 0.00026984999999999997, |
| "loss": 5.650615692138672, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.3057948977867883, |
| "grad_norm": 1.1172698736190796, |
| "learning_rate": 0.00027134999999999995, |
| "loss": 5.651753234863281, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.307484372360196, |
| "grad_norm": 1.1706960201263428, |
| "learning_rate": 0.00027285, |
| "loss": 5.6512096405029295, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.30917384693360367, |
| "grad_norm": 0.91384357213974, |
| "learning_rate": 0.00027435, |
| "loss": 5.63836784362793, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.3108633215070113, |
| "grad_norm": 1.1929048299789429, |
| "learning_rate": 0.00027584999999999996, |
| "loss": 5.628775787353516, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.31255279608041897, |
| "grad_norm": 1.023672103881836, |
| "learning_rate": 0.00027735, |
| "loss": 5.616031265258789, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.31424227065382665, |
| "grad_norm": 1.1450271606445312, |
| "learning_rate": 0.00027885, |
| "loss": 5.612253952026367, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.31593174522723433, |
| "grad_norm": 1.0316193103790283, |
| "learning_rate": 0.00028034999999999996, |
| "loss": 5.577928161621093, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.317621219800642, |
| "grad_norm": 1.1516318321228027, |
| "learning_rate": 0.00028185, |
| "loss": 5.589142227172852, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.3193106943740497, |
| "grad_norm": 1.426249384880066, |
| "learning_rate": 0.00028335, |
| "loss": 5.594329071044922, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.32100016894745736, |
| "grad_norm": 1.0666186809539795, |
| "learning_rate": 0.00028484999999999996, |
| "loss": 5.582658386230468, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.322689643520865, |
| "grad_norm": 0.8879145979881287, |
| "learning_rate": 0.00028635, |
| "loss": 5.542075347900391, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.32437911809427267, |
| "grad_norm": 1.2985228300094604, |
| "learning_rate": 0.00028785, |
| "loss": 5.572188949584961, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.32606859266768035, |
| "grad_norm": 1.1801198720932007, |
| "learning_rate": 0.00028934999999999996, |
| "loss": 5.531465530395508, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.327758067241088, |
| "grad_norm": 1.3345341682434082, |
| "learning_rate": 0.00029085, |
| "loss": 5.5121315002441404, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.3294475418144957, |
| "grad_norm": 0.9832890629768372, |
| "learning_rate": 0.00029235, |
| "loss": 5.515644073486328, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.3311370163879034, |
| "grad_norm": 1.379388689994812, |
| "learning_rate": 0.00029384999999999996, |
| "loss": 5.5223854064941404, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.332826490961311, |
| "grad_norm": 1.0441769361495972, |
| "learning_rate": 0.00029535, |
| "loss": 5.502047729492188, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.3345159655347187, |
| "grad_norm": 1.0386887788772583, |
| "learning_rate": 0.00029685, |
| "loss": 5.521197128295898, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.33620544010812636, |
| "grad_norm": 0.8223176598548889, |
| "learning_rate": 0.00029835, |
| "loss": 5.479276275634765, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.33789491468153404, |
| "grad_norm": 1.2531520128250122, |
| "learning_rate": 0.00029985, |
| "loss": 5.487053298950196, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.33789491468153404, |
| "eval_loss": 5.460203170776367, |
| "eval_runtime": 3.9099, |
| "eval_samples_per_second": 255.761, |
| "eval_steps_per_second": 5.371, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.3395843892549417, |
| "grad_norm": 1.0625675916671753, |
| "learning_rate": 0.0002999993805131495, |
| "loss": 5.482983016967774, |
| "step": 2010 |
| }, |
| { |
| "epoch": 0.3412738638283494, |
| "grad_norm": 0.9310702681541443, |
| "learning_rate": 0.00029999723908369233, |
| "loss": 5.477756500244141, |
| "step": 2020 |
| }, |
| { |
| "epoch": 0.3429633384017571, |
| "grad_norm": 0.8275931477546692, |
| "learning_rate": 0.0002999935680854744, |
| "loss": 5.4467018127441404, |
| "step": 2030 |
| }, |
| { |
| "epoch": 0.3446528129751647, |
| "grad_norm": 0.8972215056419373, |
| "learning_rate": 0.00029998836755593, |
| "loss": 5.415990829467773, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.3463422875485724, |
| "grad_norm": 1.0727229118347168, |
| "learning_rate": 0.00029998163754809044, |
| "loss": 5.403407287597656, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.34803176212198006, |
| "grad_norm": 1.0068520307540894, |
| "learning_rate": 0.0002999733781305839, |
| "loss": 5.4188987731933596, |
| "step": 2060 |
| }, |
| { |
| "epoch": 0.34972123669538774, |
| "grad_norm": 0.9327341914176941, |
| "learning_rate": 0.00029996358938763406, |
| "loss": 5.406315612792969, |
| "step": 2070 |
| }, |
| { |
| "epoch": 0.3514107112687954, |
| "grad_norm": 1.022828221321106, |
| "learning_rate": 0.0002999522714190599, |
| "loss": 5.410961532592774, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.3531001858422031, |
| "grad_norm": 0.8379955887794495, |
| "learning_rate": 0.0002999394243402743, |
| "loss": 5.411350250244141, |
| "step": 2090 |
| }, |
| { |
| "epoch": 0.3547896604156107, |
| "grad_norm": 0.8905497193336487, |
| "learning_rate": 0.00029992504828228283, |
| "loss": 5.384899520874024, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.3564791349890184, |
| "grad_norm": 0.7869957685470581, |
| "learning_rate": 0.00029990914339168286, |
| "loss": 5.391331481933594, |
| "step": 2110 |
| }, |
| { |
| "epoch": 0.3581686095624261, |
| "grad_norm": 0.7781967520713806, |
| "learning_rate": 0.00029989170983066126, |
| "loss": 5.365080261230469, |
| "step": 2120 |
| }, |
| { |
| "epoch": 0.35985808413583376, |
| "grad_norm": 0.8611620664596558, |
| "learning_rate": 0.0002998727477769937, |
| "loss": 5.367116546630859, |
| "step": 2130 |
| }, |
| { |
| "epoch": 0.36154755870924143, |
| "grad_norm": 0.8369846940040588, |
| "learning_rate": 0.0002998522574240421, |
| "loss": 5.361904525756836, |
| "step": 2140 |
| }, |
| { |
| "epoch": 0.3632370332826491, |
| "grad_norm": 0.893395721912384, |
| "learning_rate": 0.00029983023898075305, |
| "loss": 5.338259887695313, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.3649265078560568, |
| "grad_norm": 0.9806540012359619, |
| "learning_rate": 0.00029980669267165545, |
| "loss": 5.33393440246582, |
| "step": 2160 |
| }, |
| { |
| "epoch": 0.3666159824294644, |
| "grad_norm": 0.789153516292572, |
| "learning_rate": 0.0002997816187368584, |
| "loss": 5.347314834594727, |
| "step": 2170 |
| }, |
| { |
| "epoch": 0.3683054570028721, |
| "grad_norm": 0.731369137763977, |
| "learning_rate": 0.00029975501743204866, |
| "loss": 5.322664260864258, |
| "step": 2180 |
| }, |
| { |
| "epoch": 0.3699949315762798, |
| "grad_norm": 0.6811886429786682, |
| "learning_rate": 0.00029972688902848803, |
| "loss": 5.326079177856445, |
| "step": 2190 |
| }, |
| { |
| "epoch": 0.37168440614968745, |
| "grad_norm": 0.8143295645713806, |
| "learning_rate": 0.0002996972338130106, |
| "loss": 5.30379638671875, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.37337388072309513, |
| "grad_norm": 0.8854978680610657, |
| "learning_rate": 0.00029966605208801996, |
| "loss": 5.301242828369141, |
| "step": 2210 |
| }, |
| { |
| "epoch": 0.3750633552965028, |
| "grad_norm": 0.77631014585495, |
| "learning_rate": 0.0002996333441714859, |
| "loss": 5.294522476196289, |
| "step": 2220 |
| }, |
| { |
| "epoch": 0.37675282986991043, |
| "grad_norm": 0.7743359208106995, |
| "learning_rate": 0.00029959911039694127, |
| "loss": 5.313030624389649, |
| "step": 2230 |
| }, |
| { |
| "epoch": 0.3784423044433181, |
| "grad_norm": 0.8531479239463806, |
| "learning_rate": 0.00029956335111347855, |
| "loss": 5.275916671752929, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.3801317790167258, |
| "grad_norm": 0.722363293170929, |
| "learning_rate": 0.0002995260666857463, |
| "loss": 5.2906639099121096, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.38182125359013347, |
| "grad_norm": 0.7797225713729858, |
| "learning_rate": 0.00029948725749394563, |
| "loss": 5.2658641815185545, |
| "step": 2260 |
| }, |
| { |
| "epoch": 0.38351072816354115, |
| "grad_norm": 0.8231165409088135, |
| "learning_rate": 0.00029944692393382586, |
| "loss": 5.2770263671875, |
| "step": 2270 |
| }, |
| { |
| "epoch": 0.3852002027369488, |
| "grad_norm": 0.8083261847496033, |
| "learning_rate": 0.000299405066416681, |
| "loss": 5.277169799804687, |
| "step": 2280 |
| }, |
| { |
| "epoch": 0.3868896773103565, |
| "grad_norm": 0.8675849437713623, |
| "learning_rate": 0.0002993616853693452, |
| "loss": 5.258210754394531, |
| "step": 2290 |
| }, |
| { |
| "epoch": 0.38857915188376413, |
| "grad_norm": 0.7585932016372681, |
| "learning_rate": 0.0002993167812341886, |
| "loss": 5.252765655517578, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.3902686264571718, |
| "grad_norm": 0.8213605284690857, |
| "learning_rate": 0.0002992703544691127, |
| "loss": 5.222419357299804, |
| "step": 2310 |
| }, |
| { |
| "epoch": 0.3919581010305795, |
| "grad_norm": 0.7984234690666199, |
| "learning_rate": 0.00029922240554754577, |
| "loss": 5.227847671508789, |
| "step": 2320 |
| }, |
| { |
| "epoch": 0.39364757560398717, |
| "grad_norm": 0.8216149806976318, |
| "learning_rate": 0.00029917293495843793, |
| "loss": 5.215268325805664, |
| "step": 2330 |
| }, |
| { |
| "epoch": 0.39533705017739484, |
| "grad_norm": 0.7992113828659058, |
| "learning_rate": 0.0002991219432062562, |
| "loss": 5.251160049438477, |
| "step": 2340 |
| }, |
| { |
| "epoch": 0.3970265247508025, |
| "grad_norm": 0.7669650316238403, |
| "learning_rate": 0.0002990694308109795, |
| "loss": 5.255714797973633, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.39871599932421015, |
| "grad_norm": 0.7685340046882629, |
| "learning_rate": 0.0002990153983080932, |
| "loss": 5.2186332702636715, |
| "step": 2360 |
| }, |
| { |
| "epoch": 0.4004054738976178, |
| "grad_norm": 0.8289806246757507, |
| "learning_rate": 0.0002989598462485835, |
| "loss": 5.2316020965576175, |
| "step": 2370 |
| }, |
| { |
| "epoch": 0.4020949484710255, |
| "grad_norm": 0.7260857224464417, |
| "learning_rate": 0.00029890277519893215, |
| "loss": 5.210884857177734, |
| "step": 2380 |
| }, |
| { |
| "epoch": 0.4037844230444332, |
| "grad_norm": 0.6450658440589905, |
| "learning_rate": 0.0002988441857411106, |
| "loss": 5.194115066528321, |
| "step": 2390 |
| }, |
| { |
| "epoch": 0.40547389761784086, |
| "grad_norm": 0.723818838596344, |
| "learning_rate": 0.0002987840784725737, |
| "loss": 5.197711563110351, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.40716337219124854, |
| "grad_norm": 0.8113153576850891, |
| "learning_rate": 0.0002987224540062542, |
| "loss": 5.196290588378906, |
| "step": 2410 |
| }, |
| { |
| "epoch": 0.4088528467646562, |
| "grad_norm": 0.8224965929985046, |
| "learning_rate": 0.00029865931297055605, |
| "loss": 5.174480819702149, |
| "step": 2420 |
| }, |
| { |
| "epoch": 0.41054232133806384, |
| "grad_norm": 0.9786369204521179, |
| "learning_rate": 0.00029859465600934814, |
| "loss": 5.19611701965332, |
| "step": 2430 |
| }, |
| { |
| "epoch": 0.4122317959114715, |
| "grad_norm": 0.8020685911178589, |
| "learning_rate": 0.0002985284837819577, |
| "loss": 5.181368637084961, |
| "step": 2440 |
| }, |
| { |
| "epoch": 0.4139212704848792, |
| "grad_norm": 0.7282792329788208, |
| "learning_rate": 0.0002984607969631636, |
| "loss": 5.1728168487548825, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.4156107450582869, |
| "grad_norm": 0.6869542598724365, |
| "learning_rate": 0.00029839159624318954, |
| "loss": 5.172641372680664, |
| "step": 2460 |
| }, |
| { |
| "epoch": 0.41730021963169456, |
| "grad_norm": 0.8235262632369995, |
| "learning_rate": 0.00029832088232769694, |
| "loss": 5.165771484375, |
| "step": 2470 |
| }, |
| { |
| "epoch": 0.41898969420510224, |
| "grad_norm": 0.7626176476478577, |
| "learning_rate": 0.0002982486559377776, |
| "loss": 5.175928115844727, |
| "step": 2480 |
| }, |
| { |
| "epoch": 0.42067916877850986, |
| "grad_norm": 0.636053740978241, |
| "learning_rate": 0.0002981749178099467, |
| "loss": 5.135253143310547, |
| "step": 2490 |
| }, |
| { |
| "epoch": 0.42236864335191754, |
| "grad_norm": 0.6814470291137695, |
| "learning_rate": 0.000298099668696135, |
| "loss": 5.177354049682617, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.42236864335191754, |
| "eval_loss": 5.138686656951904, |
| "eval_runtime": 3.9981, |
| "eval_samples_per_second": 250.119, |
| "eval_steps_per_second": 5.253, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.4240581179253252, |
| "grad_norm": 0.786521315574646, |
| "learning_rate": 0.0002980229093636812, |
| "loss": 5.136567687988281, |
| "step": 2510 |
| }, |
| { |
| "epoch": 0.4257475924987329, |
| "grad_norm": 0.7561874389648438, |
| "learning_rate": 0.00029794464059532426, |
| "loss": 5.145055770874023, |
| "step": 2520 |
| }, |
| { |
| "epoch": 0.4274370670721406, |
| "grad_norm": 0.6505213975906372, |
| "learning_rate": 0.0002978648631891952, |
| "loss": 5.145381164550781, |
| "step": 2530 |
| }, |
| { |
| "epoch": 0.42912654164554825, |
| "grad_norm": 0.7278615832328796, |
| "learning_rate": 0.0002977835779588093, |
| "loss": 5.112863540649414, |
| "step": 2540 |
| }, |
| { |
| "epoch": 0.4308160162189559, |
| "grad_norm": 0.6332527995109558, |
| "learning_rate": 0.0002977007857330575, |
| "loss": 5.129104995727539, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.43250549079236356, |
| "grad_norm": 0.669188380241394, |
| "learning_rate": 0.0002976164873561979, |
| "loss": 5.100088500976563, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.43419496536577123, |
| "grad_norm": 0.6842843294143677, |
| "learning_rate": 0.0002975306836878474, |
| "loss": 5.092770004272461, |
| "step": 2570 |
| }, |
| { |
| "epoch": 0.4358844399391789, |
| "grad_norm": 0.7057438492774963, |
| "learning_rate": 0.000297443375602973, |
| "loss": 5.1130115509033205, |
| "step": 2580 |
| }, |
| { |
| "epoch": 0.4375739145125866, |
| "grad_norm": 0.6845251321792603, |
| "learning_rate": 0.0002973545639918824, |
| "loss": 5.112728500366211, |
| "step": 2590 |
| }, |
| { |
| "epoch": 0.43926338908599427, |
| "grad_norm": 0.6881667971611023, |
| "learning_rate": 0.00029726424976021543, |
| "loss": 5.095853042602539, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.44095286365940195, |
| "grad_norm": 0.6859349608421326, |
| "learning_rate": 0.0002971724338289346, |
| "loss": 5.099851989746094, |
| "step": 2610 |
| }, |
| { |
| "epoch": 0.4426423382328096, |
| "grad_norm": 0.6879841089248657, |
| "learning_rate": 0.0002970791171343156, |
| "loss": 5.113912582397461, |
| "step": 2620 |
| }, |
| { |
| "epoch": 0.44433181280621725, |
| "grad_norm": 0.711805522441864, |
| "learning_rate": 0.000296984300627938, |
| "loss": 5.081494903564453, |
| "step": 2630 |
| }, |
| { |
| "epoch": 0.44602128737962493, |
| "grad_norm": 0.675470232963562, |
| "learning_rate": 0.00029688798527667537, |
| "loss": 5.089406585693359, |
| "step": 2640 |
| }, |
| { |
| "epoch": 0.4477107619530326, |
| "grad_norm": 0.6627302169799805, |
| "learning_rate": 0.00029679017206268545, |
| "loss": 5.071472930908203, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.4494002365264403, |
| "grad_norm": 0.6572045087814331, |
| "learning_rate": 0.00029669086198340014, |
| "loss": 5.081936645507812, |
| "step": 2660 |
| }, |
| { |
| "epoch": 0.45108971109984797, |
| "grad_norm": 0.8288828730583191, |
| "learning_rate": 0.0002965900560515155, |
| "loss": 5.082733535766602, |
| "step": 2670 |
| }, |
| { |
| "epoch": 0.4527791856732556, |
| "grad_norm": 0.6581189036369324, |
| "learning_rate": 0.00029648775529498103, |
| "loss": 5.069281387329101, |
| "step": 2680 |
| }, |
| { |
| "epoch": 0.45446866024666327, |
| "grad_norm": 0.737130880355835, |
| "learning_rate": 0.00029638396075698953, |
| "loss": 5.066775894165039, |
| "step": 2690 |
| }, |
| { |
| "epoch": 0.45615813482007095, |
| "grad_norm": 0.7000970244407654, |
| "learning_rate": 0.00029627867349596654, |
| "loss": 5.027889251708984, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.4578476093934786, |
| "grad_norm": 0.6418822407722473, |
| "learning_rate": 0.000296171894585559, |
| "loss": 5.060458374023438, |
| "step": 2710 |
| }, |
| { |
| "epoch": 0.4595370839668863, |
| "grad_norm": 0.6689320802688599, |
| "learning_rate": 0.00029606362511462494, |
| "loss": 5.073564910888672, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.461226558540294, |
| "grad_norm": 0.7149254083633423, |
| "learning_rate": 0.000295953866187222, |
| "loss": 5.058617782592774, |
| "step": 2730 |
| }, |
| { |
| "epoch": 0.46291603311370166, |
| "grad_norm": 0.6966880559921265, |
| "learning_rate": 0.00029584261892259627, |
| "loss": 5.050143432617188, |
| "step": 2740 |
| }, |
| { |
| "epoch": 0.4646055076871093, |
| "grad_norm": 0.6495580077171326, |
| "learning_rate": 0.00029572988445517094, |
| "loss": 5.034864807128907, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.46629498226051697, |
| "grad_norm": 0.6543110609054565, |
| "learning_rate": 0.0002956156639345346, |
| "loss": 5.027247619628906, |
| "step": 2760 |
| }, |
| { |
| "epoch": 0.46798445683392464, |
| "grad_norm": 0.6335380673408508, |
| "learning_rate": 0.00029549995852542967, |
| "loss": 5.0187946319580075, |
| "step": 2770 |
| }, |
| { |
| "epoch": 0.4696739314073323, |
| "grad_norm": 0.6705760359764099, |
| "learning_rate": 0.00029538276940774044, |
| "loss": 5.034427261352539, |
| "step": 2780 |
| }, |
| { |
| "epoch": 0.47136340598074, |
| "grad_norm": 0.6140398979187012, |
| "learning_rate": 0.0002952640977764808, |
| "loss": 5.027993011474609, |
| "step": 2790 |
| }, |
| { |
| "epoch": 0.4730528805541477, |
| "grad_norm": 0.6979998350143433, |
| "learning_rate": 0.00029514394484178266, |
| "loss": 5.034260940551758, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.4747423551275553, |
| "grad_norm": 0.6220052242279053, |
| "learning_rate": 0.00029502231182888306, |
| "loss": 5.024603652954101, |
| "step": 2810 |
| }, |
| { |
| "epoch": 0.476431829700963, |
| "grad_norm": 0.6017596125602722, |
| "learning_rate": 0.0002948991999781118, |
| "loss": 5.012111663818359, |
| "step": 2820 |
| }, |
| { |
| "epoch": 0.47812130427437066, |
| "grad_norm": 0.6071211695671082, |
| "learning_rate": 0.000294774610544879, |
| "loss": 5.029761886596679, |
| "step": 2830 |
| }, |
| { |
| "epoch": 0.47981077884777834, |
| "grad_norm": 0.6241064071655273, |
| "learning_rate": 0.0002946485447996621, |
| "loss": 5.060077667236328, |
| "step": 2840 |
| }, |
| { |
| "epoch": 0.481500253421186, |
| "grad_norm": 0.6929198503494263, |
| "learning_rate": 0.0002945210040279928, |
| "loss": 4.980299758911133, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.4831897279945937, |
| "grad_norm": 0.6135720014572144, |
| "learning_rate": 0.0002943919895304443, |
| "loss": 4.994546508789062, |
| "step": 2860 |
| }, |
| { |
| "epoch": 0.4848792025680014, |
| "grad_norm": 0.6381633281707764, |
| "learning_rate": 0.0002942615026226179, |
| "loss": 4.993935012817383, |
| "step": 2870 |
| }, |
| { |
| "epoch": 0.486568677141409, |
| "grad_norm": 0.5814259648323059, |
| "learning_rate": 0.0002941295446351292, |
| "loss": 4.992059326171875, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.4882581517148167, |
| "grad_norm": 0.5999816060066223, |
| "learning_rate": 0.00029399611691359527, |
| "loss": 4.977694320678711, |
| "step": 2890 |
| }, |
| { |
| "epoch": 0.48994762628822436, |
| "grad_norm": 0.6875694990158081, |
| "learning_rate": 0.0002938612208186202, |
| "loss": 4.999196243286133, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.49163710086163204, |
| "grad_norm": 0.6184036135673523, |
| "learning_rate": 0.0002937248577257817, |
| "loss": 5.010132217407227, |
| "step": 2910 |
| }, |
| { |
| "epoch": 0.4933265754350397, |
| "grad_norm": 0.7426770329475403, |
| "learning_rate": 0.0002935870290256169, |
| "loss": 4.990754699707031, |
| "step": 2920 |
| }, |
| { |
| "epoch": 0.4950160500084474, |
| "grad_norm": 0.6430733799934387, |
| "learning_rate": 0.0002934477361236081, |
| "loss": 4.980986404418945, |
| "step": 2930 |
| }, |
| { |
| "epoch": 0.496705524581855, |
| "grad_norm": 0.6040016412734985, |
| "learning_rate": 0.0002933069804401687, |
| "loss": 5.0005535125732425, |
| "step": 2940 |
| }, |
| { |
| "epoch": 0.4983949991552627, |
| "grad_norm": 0.6449369788169861, |
| "learning_rate": 0.0002931647634106282, |
| "loss": 4.974679946899414, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.5000844737286704, |
| "grad_norm": 0.5843121409416199, |
| "learning_rate": 0.0002930210864852184, |
| "loss": 4.985787963867187, |
| "step": 2960 |
| }, |
| { |
| "epoch": 0.501773948302078, |
| "grad_norm": 0.6128187775611877, |
| "learning_rate": 0.00029287595112905773, |
| "loss": 4.969168090820313, |
| "step": 2970 |
| }, |
| { |
| "epoch": 0.5034634228754857, |
| "grad_norm": 0.6031991839408875, |
| "learning_rate": 0.00029272935882213675, |
| "loss": 4.946027374267578, |
| "step": 2980 |
| }, |
| { |
| "epoch": 0.5051528974488934, |
| "grad_norm": 0.7001163959503174, |
| "learning_rate": 0.00029258131105930314, |
| "loss": 4.9540660858154295, |
| "step": 2990 |
| }, |
| { |
| "epoch": 0.5068423720223011, |
| "grad_norm": 0.5723311305046082, |
| "learning_rate": 0.0002924318093502462, |
| "loss": 4.953271865844727, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.5068423720223011, |
| "eval_loss": 4.9452409744262695, |
| "eval_runtime": 3.6082, |
| "eval_samples_per_second": 277.146, |
| "eval_steps_per_second": 5.82, |
| "step": 3000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 11838, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.00337046257664e+17, |
| "train_batch_size": 48, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|