| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.0, | |
| "eval_steps": 500, | |
| "global_step": 11838, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.00016894745734076703, | |
| "grad_norm": 6.621600151062012, | |
| "learning_rate": 0.0, | |
| "loss": 10.540443420410156, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.0016894745734076701, | |
| "grad_norm": 6.755760669708252, | |
| "learning_rate": 1.3499999999999998e-06, | |
| "loss": 10.498290167914497, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0033789491468153403, | |
| "grad_norm": 5.475676536560059, | |
| "learning_rate": 2.85e-06, | |
| "loss": 10.216492462158204, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.00506842372022301, | |
| "grad_norm": 2.3924124240875244, | |
| "learning_rate": 4.35e-06, | |
| "loss": 9.751193237304687, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.0067578982936306806, | |
| "grad_norm": 2.018623113632202, | |
| "learning_rate": 5.85e-06, | |
| "loss": 9.445990753173827, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.00844737286703835, | |
| "grad_norm": 1.15117609500885, | |
| "learning_rate": 7.35e-06, | |
| "loss": 9.269255065917969, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.01013684744044602, | |
| "grad_norm": 0.9644901752471924, | |
| "learning_rate": 8.849999999999998e-06, | |
| "loss": 9.1482421875, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.011826322013853691, | |
| "grad_norm": 0.9443461894989014, | |
| "learning_rate": 1.035e-05, | |
| "loss": 9.042950439453126, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.013515796587261361, | |
| "grad_norm": 0.8729987144470215, | |
| "learning_rate": 1.1849999999999998e-05, | |
| "loss": 8.95867462158203, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.015205271160669031, | |
| "grad_norm": 0.7886430621147156, | |
| "learning_rate": 1.3349999999999998e-05, | |
| "loss": 8.854803466796875, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.0168947457340767, | |
| "grad_norm": 0.8168472647666931, | |
| "learning_rate": 1.485e-05, | |
| "loss": 8.736968231201171, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.018584220307484373, | |
| "grad_norm": 0.6868988275527954, | |
| "learning_rate": 1.6349999999999998e-05, | |
| "loss": 8.659466552734376, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.02027369488089204, | |
| "grad_norm": 0.6943208575248718, | |
| "learning_rate": 1.7849999999999997e-05, | |
| "loss": 8.55049819946289, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.021963169454299714, | |
| "grad_norm": 0.6718711853027344, | |
| "learning_rate": 1.935e-05, | |
| "loss": 8.454410552978516, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.023652644027707382, | |
| "grad_norm": 0.5750948190689087, | |
| "learning_rate": 2.085e-05, | |
| "loss": 8.361714172363282, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.025342118601115054, | |
| "grad_norm": 0.545462965965271, | |
| "learning_rate": 2.2349999999999998e-05, | |
| "loss": 8.286084747314453, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.027031593174522722, | |
| "grad_norm": 0.6024239659309387, | |
| "learning_rate": 2.3849999999999997e-05, | |
| "loss": 8.2337158203125, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.028721067747930394, | |
| "grad_norm": 0.5649603605270386, | |
| "learning_rate": 2.535e-05, | |
| "loss": 8.169093322753906, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.030410542321338063, | |
| "grad_norm": 0.42989474534988403, | |
| "learning_rate": 2.6849999999999995e-05, | |
| "loss": 8.12140121459961, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.03210001689474573, | |
| "grad_norm": 0.5494393110275269, | |
| "learning_rate": 2.8349999999999998e-05, | |
| "loss": 8.079795837402344, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.0337894914681534, | |
| "grad_norm": 0.48430609703063965, | |
| "learning_rate": 2.985e-05, | |
| "loss": 8.047111511230469, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.035478966041561075, | |
| "grad_norm": 0.45849987864494324, | |
| "learning_rate": 3.1349999999999996e-05, | |
| "loss": 7.997683715820313, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.03716844061496875, | |
| "grad_norm": 0.3945513665676117, | |
| "learning_rate": 3.285e-05, | |
| "loss": 7.981051635742188, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.03885791518837641, | |
| "grad_norm": 0.4015548825263977, | |
| "learning_rate": 3.435e-05, | |
| "loss": 7.976935577392578, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.04054738976178408, | |
| "grad_norm": 0.6867141127586365, | |
| "learning_rate": 3.585e-05, | |
| "loss": 7.935280609130859, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.042236864335191755, | |
| "grad_norm": 0.47820013761520386, | |
| "learning_rate": 3.735e-05, | |
| "loss": 7.920246124267578, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.04392633890859943, | |
| "grad_norm": 0.44240179657936096, | |
| "learning_rate": 3.8849999999999996e-05, | |
| "loss": 7.917320251464844, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.0456158134820071, | |
| "grad_norm": 0.49838986992836, | |
| "learning_rate": 4.035e-05, | |
| "loss": 7.892914581298828, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.047305288055414764, | |
| "grad_norm": 0.41489648818969727, | |
| "learning_rate": 4.185e-05, | |
| "loss": 7.873424530029297, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.048994762628822436, | |
| "grad_norm": 0.3524978756904602, | |
| "learning_rate": 4.334999999999999e-05, | |
| "loss": 7.852528381347656, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.05068423720223011, | |
| "grad_norm": 0.46830496191978455, | |
| "learning_rate": 4.484999999999999e-05, | |
| "loss": 7.83319091796875, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.05237371177563778, | |
| "grad_norm": 0.5290191173553467, | |
| "learning_rate": 4.6349999999999995e-05, | |
| "loss": 7.8142448425292965, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.054063186349045445, | |
| "grad_norm": 0.4697173535823822, | |
| "learning_rate": 4.785e-05, | |
| "loss": 7.787006378173828, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.055752660922453116, | |
| "grad_norm": 0.5864154100418091, | |
| "learning_rate": 4.935e-05, | |
| "loss": 7.743091583251953, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.05744213549586079, | |
| "grad_norm": 0.5467583537101746, | |
| "learning_rate": 5.0849999999999996e-05, | |
| "loss": 7.7266998291015625, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.05913161006926846, | |
| "grad_norm": 0.5317718982696533, | |
| "learning_rate": 5.234999999999999e-05, | |
| "loss": 7.715788269042969, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.060821084642676125, | |
| "grad_norm": 0.45864003896713257, | |
| "learning_rate": 5.3849999999999994e-05, | |
| "loss": 7.676011657714843, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.0625105592160838, | |
| "grad_norm": 0.6899635195732117, | |
| "learning_rate": 5.535e-05, | |
| "loss": 7.6587471008300785, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.06420003378949146, | |
| "grad_norm": 0.4785831868648529, | |
| "learning_rate": 5.684999999999999e-05, | |
| "loss": 7.648049163818359, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.06588950836289914, | |
| "grad_norm": 0.42162397503852844, | |
| "learning_rate": 5.8349999999999995e-05, | |
| "loss": 7.612094116210938, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.0675789829363068, | |
| "grad_norm": 0.6696052551269531, | |
| "learning_rate": 5.985e-05, | |
| "loss": 7.605092620849609, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.06926845750971448, | |
| "grad_norm": 0.5291442275047302, | |
| "learning_rate": 6.134999999999999e-05, | |
| "loss": 7.5769294738769535, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.07095793208312215, | |
| "grad_norm": 0.6115548014640808, | |
| "learning_rate": 6.285e-05, | |
| "loss": 7.553981781005859, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.07264740665652981, | |
| "grad_norm": 0.5771138668060303, | |
| "learning_rate": 6.434999999999999e-05, | |
| "loss": 7.541645812988281, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.0743368812299375, | |
| "grad_norm": 0.647227942943573, | |
| "learning_rate": 6.584999999999999e-05, | |
| "loss": 7.522480010986328, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.07602635580334516, | |
| "grad_norm": 0.6701403856277466, | |
| "learning_rate": 6.735e-05, | |
| "loss": 7.49889907836914, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.07771583037675282, | |
| "grad_norm": 0.844932496547699, | |
| "learning_rate": 6.884999999999999e-05, | |
| "loss": 7.476522064208984, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.0794053049501605, | |
| "grad_norm": 0.5116700530052185, | |
| "learning_rate": 7.034999999999999e-05, | |
| "loss": 7.456998443603515, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.08109477952356817, | |
| "grad_norm": 0.5343000292778015, | |
| "learning_rate": 7.184999999999998e-05, | |
| "loss": 7.443318939208984, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.08278425409697585, | |
| "grad_norm": 0.6147258281707764, | |
| "learning_rate": 7.335e-05, | |
| "loss": 7.403359985351562, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.08447372867038351, | |
| "grad_norm": 0.6813654899597168, | |
| "learning_rate": 7.484999999999999e-05, | |
| "loss": 7.403208160400391, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.08447372867038351, | |
| "eval_loss": 7.390021324157715, | |
| "eval_runtime": 4.0235, | |
| "eval_samples_per_second": 248.538, | |
| "eval_steps_per_second": 5.219, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.08616320324379118, | |
| "grad_norm": 0.6618097424507141, | |
| "learning_rate": 7.635e-05, | |
| "loss": 7.392906188964844, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.08785267781719885, | |
| "grad_norm": 0.6140709519386292, | |
| "learning_rate": 7.785e-05, | |
| "loss": 7.364067840576172, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.08954215239060652, | |
| "grad_norm": 0.6116703748703003, | |
| "learning_rate": 7.934999999999999e-05, | |
| "loss": 7.337810516357422, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.0912316269640142, | |
| "grad_norm": 0.8000091314315796, | |
| "learning_rate": 8.085e-05, | |
| "loss": 7.299466705322265, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.09292110153742186, | |
| "grad_norm": 0.5890388488769531, | |
| "learning_rate": 8.235e-05, | |
| "loss": 7.308570098876953, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.09461057611082953, | |
| "grad_norm": 1.0396614074707031, | |
| "learning_rate": 8.385e-05, | |
| "loss": 7.27392349243164, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.0963000506842372, | |
| "grad_norm": 0.5742290019989014, | |
| "learning_rate": 8.534999999999999e-05, | |
| "loss": 7.271208953857422, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.09798952525764487, | |
| "grad_norm": 0.684992790222168, | |
| "learning_rate": 8.684999999999998e-05, | |
| "loss": 7.2550514221191404, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.09967899983105254, | |
| "grad_norm": 1.2290043830871582, | |
| "learning_rate": 8.834999999999999e-05, | |
| "loss": 7.2304443359375, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.10136847440446022, | |
| "grad_norm": 0.7645843029022217, | |
| "learning_rate": 8.984999999999999e-05, | |
| "loss": 7.205104064941406, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.10305794897786788, | |
| "grad_norm": 0.730484664440155, | |
| "learning_rate": 9.134999999999998e-05, | |
| "loss": 7.210204315185547, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.10474742355127556, | |
| "grad_norm": 0.7423863410949707, | |
| "learning_rate": 9.285e-05, | |
| "loss": 7.166588592529297, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.10643689812468322, | |
| "grad_norm": 0.888006329536438, | |
| "learning_rate": 9.434999999999999e-05, | |
| "loss": 7.162047576904297, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.10812637269809089, | |
| "grad_norm": 0.9920506477355957, | |
| "learning_rate": 9.585e-05, | |
| "loss": 7.145941925048828, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.10981584727149857, | |
| "grad_norm": 0.8996961712837219, | |
| "learning_rate": 9.735e-05, | |
| "loss": 7.110871124267578, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.11150532184490623, | |
| "grad_norm": 0.7783015370368958, | |
| "learning_rate": 9.884999999999999e-05, | |
| "loss": 7.120133972167968, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.1131947964183139, | |
| "grad_norm": 0.8237811923027039, | |
| "learning_rate": 0.00010035, | |
| "loss": 7.082501220703125, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.11488427099172158, | |
| "grad_norm": 0.8586721420288086, | |
| "learning_rate": 0.00010185, | |
| "loss": 7.066880798339843, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.11657374556512924, | |
| "grad_norm": 0.9714040160179138, | |
| "learning_rate": 0.00010334999999999998, | |
| "loss": 7.058338928222656, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.11826322013853692, | |
| "grad_norm": 0.8379534482955933, | |
| "learning_rate": 0.00010484999999999999, | |
| "loss": 7.033222198486328, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.11995269471194459, | |
| "grad_norm": 1.147356629371643, | |
| "learning_rate": 0.00010634999999999998, | |
| "loss": 7.022679138183594, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.12164216928535225, | |
| "grad_norm": 0.946237325668335, | |
| "learning_rate": 0.00010784999999999999, | |
| "loss": 6.991328430175781, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.12333164385875993, | |
| "grad_norm": 1.0189383029937744, | |
| "learning_rate": 0.00010934999999999999, | |
| "loss": 6.9945930480957035, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.1250211184321676, | |
| "grad_norm": 1.0500218868255615, | |
| "learning_rate": 0.00011084999999999998, | |
| "loss": 6.963920593261719, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.12671059300557527, | |
| "grad_norm": 0.9184631109237671, | |
| "learning_rate": 0.00011235, | |
| "loss": 6.966143798828125, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.12840006757898292, | |
| "grad_norm": 0.7820301651954651, | |
| "learning_rate": 0.00011384999999999999, | |
| "loss": 6.938487243652344, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.1300895421523906, | |
| "grad_norm": 0.861544668674469, | |
| "learning_rate": 0.00011535, | |
| "loss": 6.920912170410157, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.13177901672579828, | |
| "grad_norm": 0.7475805878639221, | |
| "learning_rate": 0.00011685, | |
| "loss": 6.9139961242675785, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.13346849129920596, | |
| "grad_norm": 1.1264002323150635, | |
| "learning_rate": 0.00011834999999999999, | |
| "loss": 6.893434143066406, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.1351579658726136, | |
| "grad_norm": 0.869057834148407, | |
| "learning_rate": 0.00011985, | |
| "loss": 6.887288665771484, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.1368474404460213, | |
| "grad_norm": 0.831230878829956, | |
| "learning_rate": 0.00012135, | |
| "loss": 6.863740539550781, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.13853691501942897, | |
| "grad_norm": 1.068192720413208, | |
| "learning_rate": 0.00012284999999999998, | |
| "loss": 6.873618316650391, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.14022638959283662, | |
| "grad_norm": 0.9752544164657593, | |
| "learning_rate": 0.00012435, | |
| "loss": 6.84736328125, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.1419158641662443, | |
| "grad_norm": 0.9449293613433838, | |
| "learning_rate": 0.00012585, | |
| "loss": 6.829524230957031, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.14360533873965198, | |
| "grad_norm": 1.104444980621338, | |
| "learning_rate": 0.00012734999999999998, | |
| "loss": 6.8291679382324215, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.14529481331305963, | |
| "grad_norm": 0.9382540583610535, | |
| "learning_rate": 0.00012885, | |
| "loss": 6.8139289855957035, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.1469842878864673, | |
| "grad_norm": 0.7313889861106873, | |
| "learning_rate": 0.00013035, | |
| "loss": 6.798196411132812, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.148673762459875, | |
| "grad_norm": 0.7715932130813599, | |
| "learning_rate": 0.00013184999999999998, | |
| "loss": 6.805503845214844, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.15036323703328264, | |
| "grad_norm": 1.0334839820861816, | |
| "learning_rate": 0.00013335, | |
| "loss": 6.745892333984375, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.15205271160669032, | |
| "grad_norm": 1.1189385652542114, | |
| "learning_rate": 0.00013485, | |
| "loss": 6.769204711914062, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.153742186180098, | |
| "grad_norm": 1.289933681488037, | |
| "learning_rate": 0.00013634999999999998, | |
| "loss": 6.737556457519531, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.15543166075350565, | |
| "grad_norm": 1.0107234716415405, | |
| "learning_rate": 0.00013785, | |
| "loss": 6.7412353515625, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.15712113532691332, | |
| "grad_norm": 0.9233148097991943, | |
| "learning_rate": 0.00013935, | |
| "loss": 6.707360076904297, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.158810609900321, | |
| "grad_norm": 0.9500652551651001, | |
| "learning_rate": 0.00014084999999999998, | |
| "loss": 6.697336578369141, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.16050008447372868, | |
| "grad_norm": 1.0929033756256104, | |
| "learning_rate": 0.00014235, | |
| "loss": 6.673794555664062, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.16218955904713633, | |
| "grad_norm": 1.0387179851531982, | |
| "learning_rate": 0.00014384999999999997, | |
| "loss": 6.667636871337891, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.163879033620544, | |
| "grad_norm": 1.1298182010650635, | |
| "learning_rate": 0.00014534999999999998, | |
| "loss": 6.645402526855468, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.1655685081939517, | |
| "grad_norm": 0.9608763456344604, | |
| "learning_rate": 0.00014685, | |
| "loss": 6.658983612060547, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.16725798276735934, | |
| "grad_norm": 1.2303314208984375, | |
| "learning_rate": 0.00014834999999999997, | |
| "loss": 6.633333587646485, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.16894745734076702, | |
| "grad_norm": 0.9978023767471313, | |
| "learning_rate": 0.00014984999999999998, | |
| "loss": 6.6367958068847654, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.16894745734076702, | |
| "eval_loss": 6.617567539215088, | |
| "eval_runtime": 3.6651, | |
| "eval_samples_per_second": 272.845, | |
| "eval_steps_per_second": 5.73, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.1706369319141747, | |
| "grad_norm": 0.9963025450706482, | |
| "learning_rate": 0.00015134999999999997, | |
| "loss": 6.613154602050781, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.17232640648758235, | |
| "grad_norm": 0.872097909450531, | |
| "learning_rate": 0.00015284999999999997, | |
| "loss": 6.613529968261719, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.17401588106099003, | |
| "grad_norm": 1.2607650756835938, | |
| "learning_rate": 0.00015434999999999998, | |
| "loss": 6.587220001220703, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.1757053556343977, | |
| "grad_norm": 1.0194809436798096, | |
| "learning_rate": 0.00015584999999999997, | |
| "loss": 6.585498046875, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.17739483020780536, | |
| "grad_norm": 0.9153720736503601, | |
| "learning_rate": 0.00015734999999999998, | |
| "loss": 6.5845489501953125, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.17908430478121304, | |
| "grad_norm": 1.1903005838394165, | |
| "learning_rate": 0.00015884999999999999, | |
| "loss": 6.566903686523437, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.18077377935462072, | |
| "grad_norm": 0.9262056350708008, | |
| "learning_rate": 0.00016034999999999997, | |
| "loss": 6.520059204101562, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.1824632539280284, | |
| "grad_norm": 1.0881860256195068, | |
| "learning_rate": 0.00016184999999999998, | |
| "loss": 6.543362426757812, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.18415272850143605, | |
| "grad_norm": 0.9753679633140564, | |
| "learning_rate": 0.00016334999999999999, | |
| "loss": 6.528910064697266, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.18584220307484373, | |
| "grad_norm": 1.2809370756149292, | |
| "learning_rate": 0.00016485, | |
| "loss": 6.49705810546875, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.1875316776482514, | |
| "grad_norm": 1.0647395849227905, | |
| "learning_rate": 0.00016634999999999998, | |
| "loss": 6.508152008056641, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.18922115222165906, | |
| "grad_norm": 0.9427017569541931, | |
| "learning_rate": 0.00016785, | |
| "loss": 6.492857360839844, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.19091062679506673, | |
| "grad_norm": 1.1307021379470825, | |
| "learning_rate": 0.00016935, | |
| "loss": 6.474656677246093, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.1926001013684744, | |
| "grad_norm": 1.182411789894104, | |
| "learning_rate": 0.00017084999999999998, | |
| "loss": 6.457868194580078, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.19428957594188206, | |
| "grad_norm": 1.1442158222198486, | |
| "learning_rate": 0.00017235, | |
| "loss": 6.443910217285156, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.19597905051528974, | |
| "grad_norm": 1.2637932300567627, | |
| "learning_rate": 0.00017385, | |
| "loss": 6.428031158447266, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.19766852508869742, | |
| "grad_norm": 1.334306001663208, | |
| "learning_rate": 0.00017534999999999998, | |
| "loss": 6.415740966796875, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.19935799966210507, | |
| "grad_norm": 0.882560670375824, | |
| "learning_rate": 0.00017685, | |
| "loss": 6.413926696777343, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.20104747423551275, | |
| "grad_norm": 0.9657256603240967, | |
| "learning_rate": 0.00017835, | |
| "loss": 6.425054931640625, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.20273694880892043, | |
| "grad_norm": 1.0196014642715454, | |
| "learning_rate": 0.00017984999999999998, | |
| "loss": 6.391595077514649, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.2044264233823281, | |
| "grad_norm": 1.297837257385254, | |
| "learning_rate": 0.00018135, | |
| "loss": 6.382472991943359, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.20611589795573576, | |
| "grad_norm": 1.1288139820098877, | |
| "learning_rate": 0.00018285, | |
| "loss": 6.358099746704101, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.20780537252914344, | |
| "grad_norm": 0.9396995306015015, | |
| "learning_rate": 0.00018435, | |
| "loss": 6.355449676513672, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.20949484710255112, | |
| "grad_norm": 1.1936787366867065, | |
| "learning_rate": 0.00018585, | |
| "loss": 6.356659698486328, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.21118432167595877, | |
| "grad_norm": 0.9550564289093018, | |
| "learning_rate": 0.00018735, | |
| "loss": 6.337493515014648, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.21287379624936645, | |
| "grad_norm": 1.2012646198272705, | |
| "learning_rate": 0.00018884999999999996, | |
| "loss": 6.317781829833985, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.21456327082277413, | |
| "grad_norm": 1.0816755294799805, | |
| "learning_rate": 0.00019034999999999996, | |
| "loss": 6.316750335693359, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.21625274539618178, | |
| "grad_norm": 1.3777987957000732, | |
| "learning_rate": 0.00019184999999999997, | |
| "loss": 6.3194934844970705, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.21794221996958946, | |
| "grad_norm": 1.187603235244751, | |
| "learning_rate": 0.00019334999999999998, | |
| "loss": 6.30432357788086, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.21963169454299714, | |
| "grad_norm": 1.0069150924682617, | |
| "learning_rate": 0.00019484999999999997, | |
| "loss": 6.2757713317871096, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.2213211691164048, | |
| "grad_norm": 1.2410210371017456, | |
| "learning_rate": 0.00019634999999999998, | |
| "loss": 6.2698211669921875, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.22301064368981247, | |
| "grad_norm": 1.1892989873886108, | |
| "learning_rate": 0.00019784999999999998, | |
| "loss": 6.2431591033935545, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.22470011826322014, | |
| "grad_norm": 1.1054743528366089, | |
| "learning_rate": 0.00019934999999999997, | |
| "loss": 6.26300163269043, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.2263895928366278, | |
| "grad_norm": 1.145757794380188, | |
| "learning_rate": 0.00020084999999999998, | |
| "loss": 6.226350021362305, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.22807906741003547, | |
| "grad_norm": 1.0067166090011597, | |
| "learning_rate": 0.00020234999999999999, | |
| "loss": 6.2175750732421875, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.22976854198344315, | |
| "grad_norm": 1.5041327476501465, | |
| "learning_rate": 0.00020384999999999997, | |
| "loss": 6.191579055786133, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.23145801655685083, | |
| "grad_norm": 1.2780109643936157, | |
| "learning_rate": 0.00020534999999999998, | |
| "loss": 6.204021835327149, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.23314749113025848, | |
| "grad_norm": 1.1531580686569214, | |
| "learning_rate": 0.00020684999999999999, | |
| "loss": 6.191404342651367, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.23483696570366616, | |
| "grad_norm": 1.056857705116272, | |
| "learning_rate": 0.00020835, | |
| "loss": 6.17081298828125, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.23652644027707384, | |
| "grad_norm": 1.1238850355148315, | |
| "learning_rate": 0.00020984999999999998, | |
| "loss": 6.153195190429687, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.2382159148504815, | |
| "grad_norm": 1.2115790843963623, | |
| "learning_rate": 0.00021135, | |
| "loss": 6.157797622680664, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.23990538942388917, | |
| "grad_norm": 1.1303883790969849, | |
| "learning_rate": 0.00021285, | |
| "loss": 6.119416809082031, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.24159486399729685, | |
| "grad_norm": 1.2523441314697266, | |
| "learning_rate": 0.00021434999999999998, | |
| "loss": 6.133832550048828, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.2432843385707045, | |
| "grad_norm": 1.1120916604995728, | |
| "learning_rate": 0.00021585, | |
| "loss": 6.122848129272461, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.24497381314411218, | |
| "grad_norm": 1.239675521850586, | |
| "learning_rate": 0.00021735, | |
| "loss": 6.106191253662109, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.24666328771751986, | |
| "grad_norm": 1.1382733583450317, | |
| "learning_rate": 0.00021884999999999998, | |
| "loss": 6.0912620544433596, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.2483527622909275, | |
| "grad_norm": 1.3199714422225952, | |
| "learning_rate": 0.00022035, | |
| "loss": 6.09831428527832, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.2500422368643352, | |
| "grad_norm": 1.2705349922180176, | |
| "learning_rate": 0.00022185, | |
| "loss": 6.078111267089843, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.25173171143774287, | |
| "grad_norm": 1.436306357383728, | |
| "learning_rate": 0.00022335, | |
| "loss": 6.058963012695313, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.25342118601115055, | |
| "grad_norm": 1.179898977279663, | |
| "learning_rate": 0.00022485, | |
| "loss": 6.029299545288086, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.25342118601115055, | |
| "eval_loss": 6.033608436584473, | |
| "eval_runtime": 3.6064, | |
| "eval_samples_per_second": 277.282, | |
| "eval_steps_per_second": 5.823, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.2551106605845582, | |
| "grad_norm": 1.3389363288879395, | |
| "learning_rate": 0.00022634999999999997, | |
| "loss": 6.027260589599609, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.25680013515796585, | |
| "grad_norm": 1.2689851522445679, | |
| "learning_rate": 0.00022784999999999995, | |
| "loss": 6.00293083190918, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.2584896097313735, | |
| "grad_norm": 1.4860210418701172, | |
| "learning_rate": 0.00022934999999999996, | |
| "loss": 5.998868942260742, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.2601790843047812, | |
| "grad_norm": 1.2490425109863281, | |
| "learning_rate": 0.00023084999999999997, | |
| "loss": 5.984478759765625, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.2618685588781889, | |
| "grad_norm": 1.5586382150650024, | |
| "learning_rate": 0.00023234999999999998, | |
| "loss": 5.9672401428222654, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.26355803345159656, | |
| "grad_norm": 1.3526853322982788, | |
| "learning_rate": 0.00023384999999999997, | |
| "loss": 5.982438278198242, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.26524750802500424, | |
| "grad_norm": 1.3406753540039062, | |
| "learning_rate": 0.00023534999999999997, | |
| "loss": 5.938652801513672, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.2669369825984119, | |
| "grad_norm": 1.0397038459777832, | |
| "learning_rate": 0.00023684999999999998, | |
| "loss": 5.920218658447266, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.26862645717181954, | |
| "grad_norm": 1.7000986337661743, | |
| "learning_rate": 0.00023834999999999997, | |
| "loss": 5.896316146850586, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.2703159317452272, | |
| "grad_norm": 1.1729341745376587, | |
| "learning_rate": 0.00023984999999999998, | |
| "loss": 5.8752281188964846, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.2720054063186349, | |
| "grad_norm": 1.3115921020507812, | |
| "learning_rate": 0.00024134999999999998, | |
| "loss": 5.877028274536133, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.2736948808920426, | |
| "grad_norm": 1.5481823682785034, | |
| "learning_rate": 0.00024284999999999997, | |
| "loss": 5.863247299194336, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.27538435546545026, | |
| "grad_norm": 1.4173649549484253, | |
| "learning_rate": 0.00024435, | |
| "loss": 5.848538970947265, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.27707383003885794, | |
| "grad_norm": 1.2587963342666626, | |
| "learning_rate": 0.00024585, | |
| "loss": 5.841713333129883, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.27876330461226556, | |
| "grad_norm": 1.0922702550888062, | |
| "learning_rate": 0.00024734999999999997, | |
| "loss": 5.8486980438232425, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.28045277918567324, | |
| "grad_norm": 1.6068239212036133, | |
| "learning_rate": 0.00024885, | |
| "loss": 5.819171142578125, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.2821422537590809, | |
| "grad_norm": 1.5260576009750366, | |
| "learning_rate": 0.00025035, | |
| "loss": 5.809968566894531, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.2838317283324886, | |
| "grad_norm": 1.2246356010437012, | |
| "learning_rate": 0.00025184999999999997, | |
| "loss": 5.788796997070312, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.2855212029058963, | |
| "grad_norm": 1.0366030931472778, | |
| "learning_rate": 0.00025335, | |
| "loss": 5.78180160522461, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.28721067747930396, | |
| "grad_norm": 1.2072358131408691, | |
| "learning_rate": 0.00025485, | |
| "loss": 5.770789337158203, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.28890015205271163, | |
| "grad_norm": 1.3359684944152832, | |
| "learning_rate": 0.00025634999999999997, | |
| "loss": 5.737417221069336, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.29058962662611926, | |
| "grad_norm": 1.355406403541565, | |
| "learning_rate": 0.00025785, | |
| "loss": 5.725430297851562, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.29227910119952694, | |
| "grad_norm": 1.1998307704925537, | |
| "learning_rate": 0.00025935, | |
| "loss": 5.723165130615234, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.2939685757729346, | |
| "grad_norm": 1.0525386333465576, | |
| "learning_rate": 0.00026084999999999997, | |
| "loss": 5.720573043823242, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.2956580503463423, | |
| "grad_norm": 1.2880501747131348, | |
| "learning_rate": 0.00026235, | |
| "loss": 5.684521102905274, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.29734752491975, | |
| "grad_norm": 1.2246838808059692, | |
| "learning_rate": 0.00026384999999999994, | |
| "loss": 5.670655059814453, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.29903699949315765, | |
| "grad_norm": 1.2167463302612305, | |
| "learning_rate": 0.00026534999999999997, | |
| "loss": 5.690992736816407, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.3007264740665653, | |
| "grad_norm": 1.2467341423034668, | |
| "learning_rate": 0.00026684999999999995, | |
| "loss": 5.694464492797851, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.30241594863997295, | |
| "grad_norm": 1.2740100622177124, | |
| "learning_rate": 0.00026835, | |
| "loss": 5.679082870483398, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.30410542321338063, | |
| "grad_norm": 1.2217073440551758, | |
| "learning_rate": 0.00026984999999999997, | |
| "loss": 5.650615692138672, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.3057948977867883, | |
| "grad_norm": 1.1172698736190796, | |
| "learning_rate": 0.00027134999999999995, | |
| "loss": 5.651753234863281, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.307484372360196, | |
| "grad_norm": 1.1706960201263428, | |
| "learning_rate": 0.00027285, | |
| "loss": 5.6512096405029295, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.30917384693360367, | |
| "grad_norm": 0.91384357213974, | |
| "learning_rate": 0.00027435, | |
| "loss": 5.63836784362793, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.3108633215070113, | |
| "grad_norm": 1.1929048299789429, | |
| "learning_rate": 0.00027584999999999996, | |
| "loss": 5.628775787353516, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.31255279608041897, | |
| "grad_norm": 1.023672103881836, | |
| "learning_rate": 0.00027735, | |
| "loss": 5.616031265258789, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.31424227065382665, | |
| "grad_norm": 1.1450271606445312, | |
| "learning_rate": 0.00027885, | |
| "loss": 5.612253952026367, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.31593174522723433, | |
| "grad_norm": 1.0316193103790283, | |
| "learning_rate": 0.00028034999999999996, | |
| "loss": 5.577928161621093, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.317621219800642, | |
| "grad_norm": 1.1516318321228027, | |
| "learning_rate": 0.00028185, | |
| "loss": 5.589142227172852, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.3193106943740497, | |
| "grad_norm": 1.426249384880066, | |
| "learning_rate": 0.00028335, | |
| "loss": 5.594329071044922, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.32100016894745736, | |
| "grad_norm": 1.0666186809539795, | |
| "learning_rate": 0.00028484999999999996, | |
| "loss": 5.582658386230468, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.322689643520865, | |
| "grad_norm": 0.8879145979881287, | |
| "learning_rate": 0.00028635, | |
| "loss": 5.542075347900391, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.32437911809427267, | |
| "grad_norm": 1.2985228300094604, | |
| "learning_rate": 0.00028785, | |
| "loss": 5.572188949584961, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.32606859266768035, | |
| "grad_norm": 1.1801198720932007, | |
| "learning_rate": 0.00028934999999999996, | |
| "loss": 5.531465530395508, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.327758067241088, | |
| "grad_norm": 1.3345341682434082, | |
| "learning_rate": 0.00029085, | |
| "loss": 5.5121315002441404, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.3294475418144957, | |
| "grad_norm": 0.9832890629768372, | |
| "learning_rate": 0.00029235, | |
| "loss": 5.515644073486328, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.3311370163879034, | |
| "grad_norm": 1.379388689994812, | |
| "learning_rate": 0.00029384999999999996, | |
| "loss": 5.5223854064941404, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.332826490961311, | |
| "grad_norm": 1.0441769361495972, | |
| "learning_rate": 0.00029535, | |
| "loss": 5.502047729492188, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.3345159655347187, | |
| "grad_norm": 1.0386887788772583, | |
| "learning_rate": 0.00029685, | |
| "loss": 5.521197128295898, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.33620544010812636, | |
| "grad_norm": 0.8223176598548889, | |
| "learning_rate": 0.00029835, | |
| "loss": 5.479276275634765, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.33789491468153404, | |
| "grad_norm": 1.2531520128250122, | |
| "learning_rate": 0.00029985, | |
| "loss": 5.487053298950196, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.33789491468153404, | |
| "eval_loss": 5.460203170776367, | |
| "eval_runtime": 3.9099, | |
| "eval_samples_per_second": 255.761, | |
| "eval_steps_per_second": 5.371, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.3395843892549417, | |
| "grad_norm": 1.0625675916671753, | |
| "learning_rate": 0.0002999993805131495, | |
| "loss": 5.482983016967774, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.3412738638283494, | |
| "grad_norm": 0.9310702681541443, | |
| "learning_rate": 0.00029999723908369233, | |
| "loss": 5.477756500244141, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.3429633384017571, | |
| "grad_norm": 0.8275931477546692, | |
| "learning_rate": 0.0002999935680854744, | |
| "loss": 5.4467018127441404, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.3446528129751647, | |
| "grad_norm": 0.8972215056419373, | |
| "learning_rate": 0.00029998836755593, | |
| "loss": 5.415990829467773, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.3463422875485724, | |
| "grad_norm": 1.0727229118347168, | |
| "learning_rate": 0.00029998163754809044, | |
| "loss": 5.403407287597656, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.34803176212198006, | |
| "grad_norm": 1.0068520307540894, | |
| "learning_rate": 0.0002999733781305839, | |
| "loss": 5.4188987731933596, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.34972123669538774, | |
| "grad_norm": 0.9327341914176941, | |
| "learning_rate": 0.00029996358938763406, | |
| "loss": 5.406315612792969, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.3514107112687954, | |
| "grad_norm": 1.022828221321106, | |
| "learning_rate": 0.0002999522714190599, | |
| "loss": 5.410961532592774, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.3531001858422031, | |
| "grad_norm": 0.8379955887794495, | |
| "learning_rate": 0.0002999394243402743, | |
| "loss": 5.411350250244141, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.3547896604156107, | |
| "grad_norm": 0.8905497193336487, | |
| "learning_rate": 0.00029992504828228283, | |
| "loss": 5.384899520874024, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.3564791349890184, | |
| "grad_norm": 0.7869957685470581, | |
| "learning_rate": 0.00029990914339168286, | |
| "loss": 5.391331481933594, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.3581686095624261, | |
| "grad_norm": 0.7781967520713806, | |
| "learning_rate": 0.00029989170983066126, | |
| "loss": 5.365080261230469, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.35985808413583376, | |
| "grad_norm": 0.8611620664596558, | |
| "learning_rate": 0.0002998727477769937, | |
| "loss": 5.367116546630859, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.36154755870924143, | |
| "grad_norm": 0.8369846940040588, | |
| "learning_rate": 0.0002998522574240421, | |
| "loss": 5.361904525756836, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.3632370332826491, | |
| "grad_norm": 0.893395721912384, | |
| "learning_rate": 0.00029983023898075305, | |
| "loss": 5.338259887695313, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.3649265078560568, | |
| "grad_norm": 0.9806540012359619, | |
| "learning_rate": 0.00029980669267165545, | |
| "loss": 5.33393440246582, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.3666159824294644, | |
| "grad_norm": 0.789153516292572, | |
| "learning_rate": 0.0002997816187368584, | |
| "loss": 5.347314834594727, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.3683054570028721, | |
| "grad_norm": 0.731369137763977, | |
| "learning_rate": 0.00029975501743204866, | |
| "loss": 5.322664260864258, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.3699949315762798, | |
| "grad_norm": 0.6811886429786682, | |
| "learning_rate": 0.00029972688902848803, | |
| "loss": 5.326079177856445, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.37168440614968745, | |
| "grad_norm": 0.8143295645713806, | |
| "learning_rate": 0.0002996972338130106, | |
| "loss": 5.30379638671875, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.37337388072309513, | |
| "grad_norm": 0.8854978680610657, | |
| "learning_rate": 0.00029966605208801996, | |
| "loss": 5.301242828369141, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.3750633552965028, | |
| "grad_norm": 0.77631014585495, | |
| "learning_rate": 0.0002996333441714859, | |
| "loss": 5.294522476196289, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.37675282986991043, | |
| "grad_norm": 0.7743359208106995, | |
| "learning_rate": 0.00029959911039694127, | |
| "loss": 5.313030624389649, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.3784423044433181, | |
| "grad_norm": 0.8531479239463806, | |
| "learning_rate": 0.00029956335111347855, | |
| "loss": 5.275916671752929, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.3801317790167258, | |
| "grad_norm": 0.722363293170929, | |
| "learning_rate": 0.0002995260666857463, | |
| "loss": 5.2906639099121096, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.38182125359013347, | |
| "grad_norm": 0.7797225713729858, | |
| "learning_rate": 0.00029948725749394563, | |
| "loss": 5.2658641815185545, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.38351072816354115, | |
| "grad_norm": 0.8231165409088135, | |
| "learning_rate": 0.00029944692393382586, | |
| "loss": 5.2770263671875, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.3852002027369488, | |
| "grad_norm": 0.8083261847496033, | |
| "learning_rate": 0.000299405066416681, | |
| "loss": 5.277169799804687, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.3868896773103565, | |
| "grad_norm": 0.8675849437713623, | |
| "learning_rate": 0.0002993616853693452, | |
| "loss": 5.258210754394531, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.38857915188376413, | |
| "grad_norm": 0.7585932016372681, | |
| "learning_rate": 0.0002993167812341886, | |
| "loss": 5.252765655517578, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.3902686264571718, | |
| "grad_norm": 0.8213605284690857, | |
| "learning_rate": 0.0002992703544691127, | |
| "loss": 5.222419357299804, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.3919581010305795, | |
| "grad_norm": 0.7984234690666199, | |
| "learning_rate": 0.00029922240554754577, | |
| "loss": 5.227847671508789, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.39364757560398717, | |
| "grad_norm": 0.8216149806976318, | |
| "learning_rate": 0.00029917293495843793, | |
| "loss": 5.215268325805664, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.39533705017739484, | |
| "grad_norm": 0.7992113828659058, | |
| "learning_rate": 0.0002991219432062562, | |
| "loss": 5.251160049438477, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.3970265247508025, | |
| "grad_norm": 0.7669650316238403, | |
| "learning_rate": 0.0002990694308109795, | |
| "loss": 5.255714797973633, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.39871599932421015, | |
| "grad_norm": 0.7685340046882629, | |
| "learning_rate": 0.0002990153983080932, | |
| "loss": 5.2186332702636715, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.4004054738976178, | |
| "grad_norm": 0.8289806246757507, | |
| "learning_rate": 0.0002989598462485835, | |
| "loss": 5.2316020965576175, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.4020949484710255, | |
| "grad_norm": 0.7260857224464417, | |
| "learning_rate": 0.00029890277519893215, | |
| "loss": 5.210884857177734, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.4037844230444332, | |
| "grad_norm": 0.6450658440589905, | |
| "learning_rate": 0.0002988441857411106, | |
| "loss": 5.194115066528321, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.40547389761784086, | |
| "grad_norm": 0.723818838596344, | |
| "learning_rate": 0.0002987840784725737, | |
| "loss": 5.197711563110351, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.40716337219124854, | |
| "grad_norm": 0.8113153576850891, | |
| "learning_rate": 0.0002987224540062542, | |
| "loss": 5.196290588378906, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.4088528467646562, | |
| "grad_norm": 0.8224965929985046, | |
| "learning_rate": 0.00029865931297055605, | |
| "loss": 5.174480819702149, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.41054232133806384, | |
| "grad_norm": 0.9786369204521179, | |
| "learning_rate": 0.00029859465600934814, | |
| "loss": 5.19611701965332, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.4122317959114715, | |
| "grad_norm": 0.8020685911178589, | |
| "learning_rate": 0.0002985284837819577, | |
| "loss": 5.181368637084961, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.4139212704848792, | |
| "grad_norm": 0.7282792329788208, | |
| "learning_rate": 0.0002984607969631636, | |
| "loss": 5.1728168487548825, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.4156107450582869, | |
| "grad_norm": 0.6869542598724365, | |
| "learning_rate": 0.00029839159624318954, | |
| "loss": 5.172641372680664, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.41730021963169456, | |
| "grad_norm": 0.8235262632369995, | |
| "learning_rate": 0.00029832088232769694, | |
| "loss": 5.165771484375, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.41898969420510224, | |
| "grad_norm": 0.7626176476478577, | |
| "learning_rate": 0.0002982486559377776, | |
| "loss": 5.175928115844727, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.42067916877850986, | |
| "grad_norm": 0.636053740978241, | |
| "learning_rate": 0.0002981749178099467, | |
| "loss": 5.135253143310547, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.42236864335191754, | |
| "grad_norm": 0.6814470291137695, | |
| "learning_rate": 0.000298099668696135, | |
| "loss": 5.177354049682617, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.42236864335191754, | |
| "eval_loss": 5.138686656951904, | |
| "eval_runtime": 3.9981, | |
| "eval_samples_per_second": 250.119, | |
| "eval_steps_per_second": 5.253, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.4240581179253252, | |
| "grad_norm": 0.786521315574646, | |
| "learning_rate": 0.0002980229093636812, | |
| "loss": 5.136567687988281, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.4257475924987329, | |
| "grad_norm": 0.7561874389648438, | |
| "learning_rate": 0.00029794464059532426, | |
| "loss": 5.145055770874023, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.4274370670721406, | |
| "grad_norm": 0.6505213975906372, | |
| "learning_rate": 0.0002978648631891952, | |
| "loss": 5.145381164550781, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.42912654164554825, | |
| "grad_norm": 0.7278615832328796, | |
| "learning_rate": 0.0002977835779588093, | |
| "loss": 5.112863540649414, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.4308160162189559, | |
| "grad_norm": 0.6332527995109558, | |
| "learning_rate": 0.0002977007857330575, | |
| "loss": 5.129104995727539, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.43250549079236356, | |
| "grad_norm": 0.669188380241394, | |
| "learning_rate": 0.0002976164873561979, | |
| "loss": 5.100088500976563, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.43419496536577123, | |
| "grad_norm": 0.6842843294143677, | |
| "learning_rate": 0.0002975306836878474, | |
| "loss": 5.092770004272461, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.4358844399391789, | |
| "grad_norm": 0.7057438492774963, | |
| "learning_rate": 0.000297443375602973, | |
| "loss": 5.1130115509033205, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.4375739145125866, | |
| "grad_norm": 0.6845251321792603, | |
| "learning_rate": 0.0002973545639918824, | |
| "loss": 5.112728500366211, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.43926338908599427, | |
| "grad_norm": 0.6881667971611023, | |
| "learning_rate": 0.00029726424976021543, | |
| "loss": 5.095853042602539, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.44095286365940195, | |
| "grad_norm": 0.6859349608421326, | |
| "learning_rate": 0.0002971724338289346, | |
| "loss": 5.099851989746094, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.4426423382328096, | |
| "grad_norm": 0.6879841089248657, | |
| "learning_rate": 0.0002970791171343156, | |
| "loss": 5.113912582397461, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.44433181280621725, | |
| "grad_norm": 0.711805522441864, | |
| "learning_rate": 0.000296984300627938, | |
| "loss": 5.081494903564453, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.44602128737962493, | |
| "grad_norm": 0.675470232963562, | |
| "learning_rate": 0.00029688798527667537, | |
| "loss": 5.089406585693359, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.4477107619530326, | |
| "grad_norm": 0.6627302169799805, | |
| "learning_rate": 0.00029679017206268545, | |
| "loss": 5.071472930908203, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.4494002365264403, | |
| "grad_norm": 0.6572045087814331, | |
| "learning_rate": 0.00029669086198340014, | |
| "loss": 5.081936645507812, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.45108971109984797, | |
| "grad_norm": 0.8288828730583191, | |
| "learning_rate": 0.0002965900560515155, | |
| "loss": 5.082733535766602, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.4527791856732556, | |
| "grad_norm": 0.6581189036369324, | |
| "learning_rate": 0.00029648775529498103, | |
| "loss": 5.069281387329101, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.45446866024666327, | |
| "grad_norm": 0.737130880355835, | |
| "learning_rate": 0.00029638396075698953, | |
| "loss": 5.066775894165039, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.45615813482007095, | |
| "grad_norm": 0.7000970244407654, | |
| "learning_rate": 0.00029627867349596654, | |
| "loss": 5.027889251708984, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.4578476093934786, | |
| "grad_norm": 0.6418822407722473, | |
| "learning_rate": 0.000296171894585559, | |
| "loss": 5.060458374023438, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.4595370839668863, | |
| "grad_norm": 0.6689320802688599, | |
| "learning_rate": 0.00029606362511462494, | |
| "loss": 5.073564910888672, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.461226558540294, | |
| "grad_norm": 0.7149254083633423, | |
| "learning_rate": 0.000295953866187222, | |
| "loss": 5.058617782592774, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.46291603311370166, | |
| "grad_norm": 0.6966880559921265, | |
| "learning_rate": 0.00029584261892259627, | |
| "loss": 5.050143432617188, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.4646055076871093, | |
| "grad_norm": 0.6495580077171326, | |
| "learning_rate": 0.00029572988445517094, | |
| "loss": 5.034864807128907, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.46629498226051697, | |
| "grad_norm": 0.6543110609054565, | |
| "learning_rate": 0.0002956156639345346, | |
| "loss": 5.027247619628906, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.46798445683392464, | |
| "grad_norm": 0.6335380673408508, | |
| "learning_rate": 0.00029549995852542967, | |
| "loss": 5.0187946319580075, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.4696739314073323, | |
| "grad_norm": 0.6705760359764099, | |
| "learning_rate": 0.00029538276940774044, | |
| "loss": 5.034427261352539, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.47136340598074, | |
| "grad_norm": 0.6140398979187012, | |
| "learning_rate": 0.0002952640977764808, | |
| "loss": 5.027993011474609, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.4730528805541477, | |
| "grad_norm": 0.6979998350143433, | |
| "learning_rate": 0.00029514394484178266, | |
| "loss": 5.034260940551758, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.4747423551275553, | |
| "grad_norm": 0.6220052242279053, | |
| "learning_rate": 0.00029502231182888306, | |
| "loss": 5.024603652954101, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.476431829700963, | |
| "grad_norm": 0.6017596125602722, | |
| "learning_rate": 0.0002948991999781118, | |
| "loss": 5.012111663818359, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.47812130427437066, | |
| "grad_norm": 0.6071211695671082, | |
| "learning_rate": 0.000294774610544879, | |
| "loss": 5.029761886596679, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.47981077884777834, | |
| "grad_norm": 0.6241064071655273, | |
| "learning_rate": 0.0002946485447996621, | |
| "loss": 5.060077667236328, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.481500253421186, | |
| "grad_norm": 0.6929198503494263, | |
| "learning_rate": 0.0002945210040279928, | |
| "loss": 4.980299758911133, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.4831897279945937, | |
| "grad_norm": 0.6135720014572144, | |
| "learning_rate": 0.0002943919895304443, | |
| "loss": 4.994546508789062, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.4848792025680014, | |
| "grad_norm": 0.6381633281707764, | |
| "learning_rate": 0.0002942615026226179, | |
| "loss": 4.993935012817383, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.486568677141409, | |
| "grad_norm": 0.5814259648323059, | |
| "learning_rate": 0.0002941295446351292, | |
| "loss": 4.992059326171875, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.4882581517148167, | |
| "grad_norm": 0.5999816060066223, | |
| "learning_rate": 0.00029399611691359527, | |
| "loss": 4.977694320678711, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.48994762628822436, | |
| "grad_norm": 0.6875694990158081, | |
| "learning_rate": 0.0002938612208186202, | |
| "loss": 4.999196243286133, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.49163710086163204, | |
| "grad_norm": 0.6184036135673523, | |
| "learning_rate": 0.0002937248577257817, | |
| "loss": 5.010132217407227, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.4933265754350397, | |
| "grad_norm": 0.7426770329475403, | |
| "learning_rate": 0.0002935870290256169, | |
| "loss": 4.990754699707031, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.4950160500084474, | |
| "grad_norm": 0.6430733799934387, | |
| "learning_rate": 0.0002934477361236081, | |
| "loss": 4.980986404418945, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.496705524581855, | |
| "grad_norm": 0.6040016412734985, | |
| "learning_rate": 0.0002933069804401687, | |
| "loss": 5.0005535125732425, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.4983949991552627, | |
| "grad_norm": 0.6449369788169861, | |
| "learning_rate": 0.0002931647634106282, | |
| "loss": 4.974679946899414, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.5000844737286704, | |
| "grad_norm": 0.5843121409416199, | |
| "learning_rate": 0.0002930210864852184, | |
| "loss": 4.985787963867187, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.501773948302078, | |
| "grad_norm": 0.6128187775611877, | |
| "learning_rate": 0.00029287595112905773, | |
| "loss": 4.969168090820313, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.5034634228754857, | |
| "grad_norm": 0.6031991839408875, | |
| "learning_rate": 0.00029272935882213675, | |
| "loss": 4.946027374267578, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.5051528974488934, | |
| "grad_norm": 0.7001163959503174, | |
| "learning_rate": 0.00029258131105930314, | |
| "loss": 4.9540660858154295, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.5068423720223011, | |
| "grad_norm": 0.5723311305046082, | |
| "learning_rate": 0.0002924318093502462, | |
| "loss": 4.953271865844727, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.5068423720223011, | |
| "eval_loss": 4.9452409744262695, | |
| "eval_runtime": 3.6082, | |
| "eval_samples_per_second": 277.146, | |
| "eval_steps_per_second": 5.82, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.5085318465957087, | |
| "grad_norm": 0.6639596819877625, | |
| "learning_rate": 0.00029228085521948167, | |
| "loss": 4.964737319946289, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 0.5102213211691164, | |
| "grad_norm": 0.6372225284576416, | |
| "learning_rate": 0.000292128450206336, | |
| "loss": 4.964575576782226, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.5119107957425241, | |
| "grad_norm": 0.6287721991539001, | |
| "learning_rate": 0.00029197459586493077, | |
| "loss": 4.968457794189453, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 0.5136002703159317, | |
| "grad_norm": 0.596134603023529, | |
| "learning_rate": 0.0002918192937641668, | |
| "loss": 4.937327194213867, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.5152897448893394, | |
| "grad_norm": 0.6308782696723938, | |
| "learning_rate": 0.00029166254548770827, | |
| "loss": 4.955413055419922, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.516979219462747, | |
| "grad_norm": 0.6965638399124146, | |
| "learning_rate": 0.0002915043526339663, | |
| "loss": 4.951367568969727, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.5186686940361548, | |
| "grad_norm": 0.5890870690345764, | |
| "learning_rate": 0.00029134471681608286, | |
| "loss": 4.956174087524414, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 0.5203581686095624, | |
| "grad_norm": 0.6104860901832581, | |
| "learning_rate": 0.00029118363966191445, | |
| "loss": 4.943370056152344, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.5220476431829701, | |
| "grad_norm": 0.5911648869514465, | |
| "learning_rate": 0.00029102112281401507, | |
| "loss": 4.944694900512696, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 0.5237371177563778, | |
| "grad_norm": 0.565938413143158, | |
| "learning_rate": 0.0002908571679296199, | |
| "loss": 4.952177047729492, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.5254265923297854, | |
| "grad_norm": 0.5907305479049683, | |
| "learning_rate": 0.00029069177668062816, | |
| "loss": 4.945420837402343, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 0.5271160669031931, | |
| "grad_norm": 0.6035214066505432, | |
| "learning_rate": 0.00029052495075358617, | |
| "loss": 4.937542343139649, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 0.5288055414766007, | |
| "grad_norm": 0.6489094495773315, | |
| "learning_rate": 0.0002903566918496701, | |
| "loss": 4.9244636535644535, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 0.5304950160500085, | |
| "grad_norm": 0.6260411739349365, | |
| "learning_rate": 0.0002901870016846685, | |
| "loss": 4.8926746368408205, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 0.5321844906234161, | |
| "grad_norm": 0.6731085777282715, | |
| "learning_rate": 0.00029001588198896523, | |
| "loss": 4.926407241821289, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.5338739651968238, | |
| "grad_norm": 0.7428810596466064, | |
| "learning_rate": 0.0002898433345075212, | |
| "loss": 4.938412857055664, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 0.5355634397702315, | |
| "grad_norm": 0.6765902042388916, | |
| "learning_rate": 0.0002896693609998571, | |
| "loss": 4.913272094726563, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 0.5372529143436391, | |
| "grad_norm": 0.7018482089042664, | |
| "learning_rate": 0.0002894939632400352, | |
| "loss": 4.8967021942138675, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 0.5389423889170468, | |
| "grad_norm": 0.6272666454315186, | |
| "learning_rate": 0.0002893171430166413, | |
| "loss": 4.921631622314453, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 0.5406318634904544, | |
| "grad_norm": 0.6220508217811584, | |
| "learning_rate": 0.00028913890213276664, | |
| "loss": 4.878177642822266, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.5423213380638622, | |
| "grad_norm": 0.6996421813964844, | |
| "learning_rate": 0.0002889592424059891, | |
| "loss": 4.867180633544922, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 0.5440108126372698, | |
| "grad_norm": 0.6159099340438843, | |
| "learning_rate": 0.0002887781656683551, | |
| "loss": 4.913734436035156, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 0.5457002872106774, | |
| "grad_norm": 0.6269960403442383, | |
| "learning_rate": 0.0002885956737663609, | |
| "loss": 4.884205627441406, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 0.5473897617840852, | |
| "grad_norm": 0.6038281321525574, | |
| "learning_rate": 0.00028841176856093346, | |
| "loss": 4.866396713256836, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 0.5490792363574928, | |
| "grad_norm": 0.587129533290863, | |
| "learning_rate": 0.0002882264519274116, | |
| "loss": 4.902432632446289, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.5507687109309005, | |
| "grad_norm": 0.58821040391922, | |
| "learning_rate": 0.0002880397257555271, | |
| "loss": 4.8774364471435545, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 0.5524581855043081, | |
| "grad_norm": 0.5894278287887573, | |
| "learning_rate": 0.000287851591949385, | |
| "loss": 4.882402038574218, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 0.5541476600777159, | |
| "grad_norm": 0.6682536005973816, | |
| "learning_rate": 0.0002876620524274447, | |
| "loss": 4.875947570800781, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 0.5558371346511235, | |
| "grad_norm": 0.56892329454422, | |
| "learning_rate": 0.0002874711091224999, | |
| "loss": 4.8791545867919925, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 0.5575266092245311, | |
| "grad_norm": 0.6786117553710938, | |
| "learning_rate": 0.0002872787639816593, | |
| "loss": 4.891143417358398, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.5592160837979389, | |
| "grad_norm": 0.5622825622558594, | |
| "learning_rate": 0.00028708501896632636, | |
| "loss": 4.864010620117187, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 0.5609055583713465, | |
| "grad_norm": 0.5666744709014893, | |
| "learning_rate": 0.0002868898760521796, | |
| "loss": 4.874967956542969, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.5625950329447542, | |
| "grad_norm": 0.5970442295074463, | |
| "learning_rate": 0.00028669333722915245, | |
| "loss": 4.873799514770508, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 0.5642845075181618, | |
| "grad_norm": 0.6412243843078613, | |
| "learning_rate": 0.0002864954045014126, | |
| "loss": 4.852934646606445, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 0.5659739820915696, | |
| "grad_norm": 0.6244974732398987, | |
| "learning_rate": 0.00028629607988734214, | |
| "loss": 4.876249694824219, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.5676634566649772, | |
| "grad_norm": 0.6246965527534485, | |
| "learning_rate": 0.00028609536541951636, | |
| "loss": 4.86097412109375, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.5693529312383848, | |
| "grad_norm": 0.6178351044654846, | |
| "learning_rate": 0.00028589326314468357, | |
| "loss": 4.864706420898438, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 0.5710424058117926, | |
| "grad_norm": 0.5810765624046326, | |
| "learning_rate": 0.0002856897751237439, | |
| "loss": 4.842799377441406, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 0.5727318803852002, | |
| "grad_norm": 0.5823137760162354, | |
| "learning_rate": 0.0002854849034317282, | |
| "loss": 4.846402740478515, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 0.5744213549586079, | |
| "grad_norm": 0.5997573137283325, | |
| "learning_rate": 0.0002852786501577773, | |
| "loss": 4.83245849609375, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.5761108295320155, | |
| "grad_norm": 0.587899386882782, | |
| "learning_rate": 0.0002850710174051204, | |
| "loss": 4.871646118164063, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 0.5778003041054233, | |
| "grad_norm": 0.6415139436721802, | |
| "learning_rate": 0.0002848620072910535, | |
| "loss": 4.83592414855957, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 0.5794897786788309, | |
| "grad_norm": 0.5660437345504761, | |
| "learning_rate": 0.0002846516219469181, | |
| "loss": 4.855052947998047, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 0.5811792532522385, | |
| "grad_norm": 0.5609036087989807, | |
| "learning_rate": 0.0002844398635180794, | |
| "loss": 4.856648254394531, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.5828687278256462, | |
| "grad_norm": 0.5897738933563232, | |
| "learning_rate": 0.00028422673416390437, | |
| "loss": 4.862053298950196, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.5845582023990539, | |
| "grad_norm": 0.5655834674835205, | |
| "learning_rate": 0.00028401223605773964, | |
| "loss": 4.845742416381836, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 0.5862476769724616, | |
| "grad_norm": 0.604269802570343, | |
| "learning_rate": 0.00028379637138688945, | |
| "loss": 4.828603363037109, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 0.5879371515458692, | |
| "grad_norm": 0.5728313326835632, | |
| "learning_rate": 0.0002835791423525935, | |
| "loss": 4.821448516845703, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.5896266261192769, | |
| "grad_norm": 0.6384554505348206, | |
| "learning_rate": 0.00028336055117000403, | |
| "loss": 4.838501358032227, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 0.5913161006926846, | |
| "grad_norm": 0.5678889155387878, | |
| "learning_rate": 0.0002831406000681638, | |
| "loss": 4.827870178222656, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.5913161006926846, | |
| "eval_loss": 4.812247276306152, | |
| "eval_runtime": 3.7454, | |
| "eval_samples_per_second": 266.992, | |
| "eval_steps_per_second": 5.607, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.5930055752660922, | |
| "grad_norm": 0.6172225475311279, | |
| "learning_rate": 0.00028291929128998293, | |
| "loss": 4.818932342529297, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 0.5946950498395, | |
| "grad_norm": 0.6093600392341614, | |
| "learning_rate": 0.00028269662709221635, | |
| "loss": 4.821521759033203, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.5963845244129076, | |
| "grad_norm": 0.5978082418441772, | |
| "learning_rate": 0.00028247260974544037, | |
| "loss": 4.8287818908691404, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 0.5980739989863153, | |
| "grad_norm": 0.6068260073661804, | |
| "learning_rate": 0.00028224724153403015, | |
| "loss": 4.830426025390625, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 0.5997634735597229, | |
| "grad_norm": 0.5650802850723267, | |
| "learning_rate": 0.0002820205247561356, | |
| "loss": 4.82833251953125, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.6014529481331305, | |
| "grad_norm": 0.5627108812332153, | |
| "learning_rate": 0.0002817924617236587, | |
| "loss": 4.835605239868164, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 0.6031424227065383, | |
| "grad_norm": 0.5983113050460815, | |
| "learning_rate": 0.00028156305476222966, | |
| "loss": 4.8316596984863285, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 0.6048318972799459, | |
| "grad_norm": 0.6292333006858826, | |
| "learning_rate": 0.0002813323062111828, | |
| "loss": 4.8177978515625, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 0.6065213718533536, | |
| "grad_norm": 0.5784122943878174, | |
| "learning_rate": 0.0002811002184235334, | |
| "loss": 4.801444625854492, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 0.6082108464267613, | |
| "grad_norm": 0.598548173904419, | |
| "learning_rate": 0.00028086679376595314, | |
| "loss": 4.825639343261718, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.609900321000169, | |
| "grad_norm": 0.6459059715270996, | |
| "learning_rate": 0.00028063203461874635, | |
| "loss": 4.819043350219727, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 0.6115897955735766, | |
| "grad_norm": 0.6150253415107727, | |
| "learning_rate": 0.0002803959433758254, | |
| "loss": 4.791939926147461, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 0.6132792701469842, | |
| "grad_norm": 0.6212024092674255, | |
| "learning_rate": 0.0002801585224446866, | |
| "loss": 4.81633415222168, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 0.614968744720392, | |
| "grad_norm": 0.677480936050415, | |
| "learning_rate": 0.0002799197742463854, | |
| "loss": 4.800606918334961, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 0.6166582192937996, | |
| "grad_norm": 0.5394392609596252, | |
| "learning_rate": 0.0002796797012155118, | |
| "loss": 4.792086791992188, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.6183476938672073, | |
| "grad_norm": 0.6263740658760071, | |
| "learning_rate": 0.0002794383058001657, | |
| "loss": 4.8161579132080075, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 0.620037168440615, | |
| "grad_norm": 0.5984665751457214, | |
| "learning_rate": 0.00027919559046193156, | |
| "loss": 4.80903434753418, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 0.6217266430140226, | |
| "grad_norm": 0.570600688457489, | |
| "learning_rate": 0.0002789515576758536, | |
| "loss": 4.814385986328125, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 0.6234161175874303, | |
| "grad_norm": 0.6285440325737, | |
| "learning_rate": 0.00027870620993041055, | |
| "loss": 4.766680908203125, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 0.6251055921608379, | |
| "grad_norm": 0.6015235185623169, | |
| "learning_rate": 0.00027845954972749004, | |
| "loss": 4.784580230712891, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.6267950667342457, | |
| "grad_norm": 0.5941788554191589, | |
| "learning_rate": 0.0002782115795823633, | |
| "loss": 4.8005828857421875, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 0.6284845413076533, | |
| "grad_norm": 0.5614147782325745, | |
| "learning_rate": 0.0002779623020236594, | |
| "loss": 4.785182952880859, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 0.630174015881061, | |
| "grad_norm": 0.6022568941116333, | |
| "learning_rate": 0.00027771171959333976, | |
| "loss": 4.825439453125, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 0.6318634904544687, | |
| "grad_norm": 0.5763116478919983, | |
| "learning_rate": 0.00027745983484667164, | |
| "loss": 4.781736373901367, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 0.6335529650278763, | |
| "grad_norm": 0.5323732495307922, | |
| "learning_rate": 0.0002772066503522026, | |
| "loss": 4.793933868408203, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.635242439601284, | |
| "grad_norm": 0.6131083369255066, | |
| "learning_rate": 0.00027695216869173415, | |
| "loss": 4.761587905883789, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 0.6369319141746916, | |
| "grad_norm": 0.6051084399223328, | |
| "learning_rate": 0.0002766963924602953, | |
| "loss": 4.781452941894531, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 0.6386213887480994, | |
| "grad_norm": 0.5932102203369141, | |
| "learning_rate": 0.00027643932426611647, | |
| "loss": 4.7832183837890625, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 0.640310863321507, | |
| "grad_norm": 0.5552225708961487, | |
| "learning_rate": 0.0002761809667306022, | |
| "loss": 4.767021560668946, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 0.6420003378949147, | |
| "grad_norm": 0.6563674211502075, | |
| "learning_rate": 0.00027592132248830526, | |
| "loss": 4.7745105743408205, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.6436898124683224, | |
| "grad_norm": 0.5460559725761414, | |
| "learning_rate": 0.00027566039418689905, | |
| "loss": 4.758267593383789, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 0.64537928704173, | |
| "grad_norm": 0.5483755469322205, | |
| "learning_rate": 0.00027539818448715124, | |
| "loss": 4.763653182983399, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 0.6470687616151377, | |
| "grad_norm": 0.5985057950019836, | |
| "learning_rate": 0.000275134696062896, | |
| "loss": 4.773738098144531, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 0.6487582361885453, | |
| "grad_norm": 0.5881696939468384, | |
| "learning_rate": 0.0002748699316010073, | |
| "loss": 4.750997161865234, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 0.6504477107619531, | |
| "grad_norm": 0.5993970632553101, | |
| "learning_rate": 0.000274603893801371, | |
| "loss": 4.757858657836914, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.6521371853353607, | |
| "grad_norm": 0.5355719327926636, | |
| "learning_rate": 0.000274336585376858, | |
| "loss": 4.756174468994141, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 0.6538266599087684, | |
| "grad_norm": 0.5604658722877502, | |
| "learning_rate": 0.0002740680090532958, | |
| "loss": 4.771471786499023, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 0.655516134482176, | |
| "grad_norm": 0.5580816268920898, | |
| "learning_rate": 0.0002737981675694411, | |
| "loss": 4.767171859741211, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 0.6572056090555837, | |
| "grad_norm": 0.5741564035415649, | |
| "learning_rate": 0.00027352706367695203, | |
| "loss": 4.755613708496094, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 0.6588950836289914, | |
| "grad_norm": 0.5679869055747986, | |
| "learning_rate": 0.00027325470014035965, | |
| "loss": 4.768374252319336, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.660584558202399, | |
| "grad_norm": 0.5516665577888489, | |
| "learning_rate": 0.0002729810797370402, | |
| "loss": 4.754274368286133, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 0.6622740327758068, | |
| "grad_norm": 0.572651743888855, | |
| "learning_rate": 0.00027270620525718647, | |
| "loss": 4.740098190307617, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 0.6639635073492144, | |
| "grad_norm": 0.5252317190170288, | |
| "learning_rate": 0.0002724300795037796, | |
| "loss": 4.781079864501953, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 0.665652981922622, | |
| "grad_norm": 0.578183114528656, | |
| "learning_rate": 0.00027215270529256015, | |
| "loss": 4.738787460327148, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 0.6673424564960297, | |
| "grad_norm": 0.6071267127990723, | |
| "learning_rate": 0.00027187408545199977, | |
| "loss": 4.73607177734375, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.6690319310694374, | |
| "grad_norm": 0.5916706919670105, | |
| "learning_rate": 0.00027159422282327204, | |
| "loss": 4.747200775146484, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 0.6707214056428451, | |
| "grad_norm": 0.5538118481636047, | |
| "learning_rate": 0.0002713131202602238, | |
| "loss": 4.765713119506836, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 0.6724108802162527, | |
| "grad_norm": 0.5321049094200134, | |
| "learning_rate": 0.0002710307806293458, | |
| "loss": 4.7143207550048825, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 0.6741003547896605, | |
| "grad_norm": 0.5859444737434387, | |
| "learning_rate": 0.0002707472068097435, | |
| "loss": 4.749985122680664, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 0.6757898293630681, | |
| "grad_norm": 0.520622730255127, | |
| "learning_rate": 0.0002704624016931079, | |
| "loss": 4.7440532684326175, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.6757898293630681, | |
| "eval_loss": 4.7194108963012695, | |
| "eval_runtime": 3.7971, | |
| "eval_samples_per_second": 263.356, | |
| "eval_steps_per_second": 5.53, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.6774793039364757, | |
| "grad_norm": 0.5953722596168518, | |
| "learning_rate": 0.00027017636818368575, | |
| "loss": 4.737245559692383, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 0.6791687785098834, | |
| "grad_norm": 0.6203189492225647, | |
| "learning_rate": 0.0002698891091982504, | |
| "loss": 4.716173934936523, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 0.6808582530832911, | |
| "grad_norm": 0.5239487886428833, | |
| "learning_rate": 0.00026960062766607135, | |
| "loss": 4.735467529296875, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 0.6825477276566988, | |
| "grad_norm": 0.5474298000335693, | |
| "learning_rate": 0.0002693109265288851, | |
| "loss": 4.725514984130859, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 0.6842372022301064, | |
| "grad_norm": 0.5452102422714233, | |
| "learning_rate": 0.0002690200087408648, | |
| "loss": 4.726776885986328, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.6859266768035142, | |
| "grad_norm": 0.6271504759788513, | |
| "learning_rate": 0.00026872787726859004, | |
| "loss": 4.71842041015625, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 0.6876161513769218, | |
| "grad_norm": 0.5585569143295288, | |
| "learning_rate": 0.0002684345350910169, | |
| "loss": 4.728883361816406, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 0.6893056259503294, | |
| "grad_norm": 0.544662594795227, | |
| "learning_rate": 0.0002681399851994472, | |
| "loss": 4.729270553588867, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 0.6909951005237371, | |
| "grad_norm": 0.5363122820854187, | |
| "learning_rate": 0.00026784423059749845, | |
| "loss": 4.726214599609375, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 0.6926845750971448, | |
| "grad_norm": 0.5298801064491272, | |
| "learning_rate": 0.0002675472743010727, | |
| "loss": 4.697872924804687, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.6943740496705525, | |
| "grad_norm": 0.5710757374763489, | |
| "learning_rate": 0.0002672491193383263, | |
| "loss": 4.723146438598633, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 0.6960635242439601, | |
| "grad_norm": 0.5484883785247803, | |
| "learning_rate": 0.00026694976874963854, | |
| "loss": 4.738557052612305, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 0.6977529988173679, | |
| "grad_norm": 0.5273333191871643, | |
| "learning_rate": 0.00026664922558758105, | |
| "loss": 4.700592803955078, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 0.6994424733907755, | |
| "grad_norm": 0.5574657320976257, | |
| "learning_rate": 0.00026634749291688646, | |
| "loss": 4.729513168334961, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 0.7011319479641831, | |
| "grad_norm": 0.5571582317352295, | |
| "learning_rate": 0.00026604457381441715, | |
| "loss": 4.706221389770508, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.7028214225375908, | |
| "grad_norm": 0.6286988258361816, | |
| "learning_rate": 0.00026574047136913403, | |
| "loss": 4.701080322265625, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 0.7045108971109985, | |
| "grad_norm": 0.5314433574676514, | |
| "learning_rate": 0.0002654351886820648, | |
| "loss": 4.714921188354492, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 0.7062003716844062, | |
| "grad_norm": 0.539644718170166, | |
| "learning_rate": 0.0002651287288662724, | |
| "loss": 4.722955703735352, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 0.7078898462578138, | |
| "grad_norm": 0.5164220333099365, | |
| "learning_rate": 0.0002648210950468236, | |
| "loss": 4.7029579162597654, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 0.7095793208312214, | |
| "grad_norm": 0.5345500111579895, | |
| "learning_rate": 0.0002645122903607566, | |
| "loss": 4.696025085449219, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.7112687954046292, | |
| "grad_norm": 0.5561880469322205, | |
| "learning_rate": 0.0002642023179570493, | |
| "loss": 4.696010971069336, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 0.7129582699780368, | |
| "grad_norm": 0.5260653495788574, | |
| "learning_rate": 0.0002638911809965874, | |
| "loss": 4.705658721923828, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 0.7146477445514445, | |
| "grad_norm": 0.517846941947937, | |
| "learning_rate": 0.0002635788826521316, | |
| "loss": 4.690948104858398, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 0.7163372191248522, | |
| "grad_norm": 0.5815365314483643, | |
| "learning_rate": 0.00026326542610828597, | |
| "loss": 4.702710723876953, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 0.7180266936982599, | |
| "grad_norm": 0.5511707067489624, | |
| "learning_rate": 0.00026295081456146485, | |
| "loss": 4.713930130004883, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.7197161682716675, | |
| "grad_norm": 0.5390937924385071, | |
| "learning_rate": 0.0002626350512198606, | |
| "loss": 4.694212341308594, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 0.7214056428450751, | |
| "grad_norm": 0.5410081744194031, | |
| "learning_rate": 0.0002623181393034108, | |
| "loss": 4.696395492553711, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 0.7230951174184829, | |
| "grad_norm": 0.5272055268287659, | |
| "learning_rate": 0.00026200008204376525, | |
| "loss": 4.715652847290039, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 0.7247845919918905, | |
| "grad_norm": 0.5485383868217468, | |
| "learning_rate": 0.00026168088268425346, | |
| "loss": 4.689223861694336, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 0.7264740665652982, | |
| "grad_norm": 0.4974030554294586, | |
| "learning_rate": 0.00026136054447985105, | |
| "loss": 4.6958671569824215, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.7281635411387058, | |
| "grad_norm": 0.5421763062477112, | |
| "learning_rate": 0.00026103907069714694, | |
| "loss": 4.706072235107422, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 0.7298530157121136, | |
| "grad_norm": 0.5402170419692993, | |
| "learning_rate": 0.0002607164646143098, | |
| "loss": 4.684348297119141, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 0.7315424902855212, | |
| "grad_norm": 0.5388095378875732, | |
| "learning_rate": 0.0002603927295210547, | |
| "loss": 4.681607818603515, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 0.7332319648589288, | |
| "grad_norm": 0.5691295266151428, | |
| "learning_rate": 0.00026006786871860975, | |
| "loss": 4.659119033813477, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 0.7349214394323366, | |
| "grad_norm": 0.5657386183738708, | |
| "learning_rate": 0.00025974188551968207, | |
| "loss": 4.707662963867188, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.7366109140057442, | |
| "grad_norm": 0.5887618660926819, | |
| "learning_rate": 0.0002594147832484243, | |
| "loss": 4.678396606445313, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 0.7383003885791519, | |
| "grad_norm": 0.5618587136268616, | |
| "learning_rate": 0.0002590865652404007, | |
| "loss": 4.6809638977050785, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 0.7399898631525595, | |
| "grad_norm": 0.5673303604125977, | |
| "learning_rate": 0.0002587572348425529, | |
| "loss": 4.683576583862305, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 0.7416793377259672, | |
| "grad_norm": 0.5109097361564636, | |
| "learning_rate": 0.0002584267954131659, | |
| "loss": 4.674320983886719, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 0.7433688122993749, | |
| "grad_norm": 0.5133926272392273, | |
| "learning_rate": 0.000258095250321834, | |
| "loss": 4.676524353027344, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.7450582868727825, | |
| "grad_norm": 0.5628970265388489, | |
| "learning_rate": 0.00025776260294942615, | |
| "loss": 4.688607025146484, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 0.7467477614461903, | |
| "grad_norm": 0.5761396884918213, | |
| "learning_rate": 0.0002574288566880517, | |
| "loss": 4.666116333007812, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 0.7484372360195979, | |
| "grad_norm": 0.5518139004707336, | |
| "learning_rate": 0.0002570940149410256, | |
| "loss": 4.665610504150391, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 0.7501267105930056, | |
| "grad_norm": 0.5176488757133484, | |
| "learning_rate": 0.00025675808112283387, | |
| "loss": 4.673014831542969, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 0.7518161851664132, | |
| "grad_norm": 0.5482094287872314, | |
| "learning_rate": 0.00025642105865909874, | |
| "loss": 4.665557098388672, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.7535056597398209, | |
| "grad_norm": 0.7407347559928894, | |
| "learning_rate": 0.0002560829509865437, | |
| "loss": 4.660655975341797, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 0.7551951343132286, | |
| "grad_norm": 0.5341119766235352, | |
| "learning_rate": 0.00025574376155295845, | |
| "loss": 4.670759582519532, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 0.7568846088866362, | |
| "grad_norm": 0.5163617134094238, | |
| "learning_rate": 0.00025540349381716367, | |
| "loss": 4.689437484741211, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 0.758574083460044, | |
| "grad_norm": 0.6329180598258972, | |
| "learning_rate": 0.00025506215124897593, | |
| "loss": 4.6677288055419925, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 0.7602635580334516, | |
| "grad_norm": 0.5871708393096924, | |
| "learning_rate": 0.0002547197373291721, | |
| "loss": 4.678330993652343, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.7602635580334516, | |
| "eval_loss": 4.647042751312256, | |
| "eval_runtime": 3.6169, | |
| "eval_samples_per_second": 276.482, | |
| "eval_steps_per_second": 5.806, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.7619530326068593, | |
| "grad_norm": 0.5751690864562988, | |
| "learning_rate": 0.0002543762555494541, | |
| "loss": 4.659806823730468, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 0.7636425071802669, | |
| "grad_norm": 0.5488387942314148, | |
| "learning_rate": 0.0002540317094124131, | |
| "loss": 4.675619888305664, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 0.7653319817536746, | |
| "grad_norm": 0.5706210136413574, | |
| "learning_rate": 0.0002536861024314936, | |
| "loss": 4.647731018066406, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 0.7670214563270823, | |
| "grad_norm": 0.5262100100517273, | |
| "learning_rate": 0.0002533394381309583, | |
| "loss": 4.629973220825195, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 0.7687109309004899, | |
| "grad_norm": 0.5438910126686096, | |
| "learning_rate": 0.00025299172004585144, | |
| "loss": 4.680305099487304, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.7704004054738977, | |
| "grad_norm": 0.5125553011894226, | |
| "learning_rate": 0.00025264295172196304, | |
| "loss": 4.6679943084716795, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 0.7720898800473053, | |
| "grad_norm": 0.5525355339050293, | |
| "learning_rate": 0.0002522931367157928, | |
| "loss": 4.6561134338378904, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 0.773779354620713, | |
| "grad_norm": 0.5133577585220337, | |
| "learning_rate": 0.00025194227859451384, | |
| "loss": 4.66561279296875, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 0.7754688291941206, | |
| "grad_norm": 0.5095699429512024, | |
| "learning_rate": 0.00025159038093593606, | |
| "loss": 4.678707122802734, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 0.7771583037675283, | |
| "grad_norm": 0.5241293907165527, | |
| "learning_rate": 0.0002512374473284699, | |
| "loss": 4.642659759521484, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.778847778340936, | |
| "grad_norm": 0.557011067867279, | |
| "learning_rate": 0.00025088348137108983, | |
| "loss": 4.642984771728516, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 0.7805372529143436, | |
| "grad_norm": 0.5290088653564453, | |
| "learning_rate": 0.0002505284866732974, | |
| "loss": 4.668995666503906, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 0.7822267274877513, | |
| "grad_norm": 0.519223153591156, | |
| "learning_rate": 0.0002501724668550846, | |
| "loss": 4.627962112426758, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 0.783916202061159, | |
| "grad_norm": 0.5338088274002075, | |
| "learning_rate": 0.00024981542554689684, | |
| "loss": 4.67579231262207, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 0.7856056766345666, | |
| "grad_norm": 0.5252251625061035, | |
| "learning_rate": 0.000249457366389596, | |
| "loss": 4.656952285766602, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.7872951512079743, | |
| "grad_norm": 0.5428206324577332, | |
| "learning_rate": 0.0002490982930344233, | |
| "loss": 4.646731185913086, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 0.788984625781382, | |
| "grad_norm": 0.5392381548881531, | |
| "learning_rate": 0.0002487382091429621, | |
| "loss": 4.663632583618164, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 0.7906741003547897, | |
| "grad_norm": 0.51649409532547, | |
| "learning_rate": 0.00024837711838710035, | |
| "loss": 4.620084762573242, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 0.7923635749281973, | |
| "grad_norm": 0.5248917937278748, | |
| "learning_rate": 0.00024801502444899353, | |
| "loss": 4.66024169921875, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 0.794053049501605, | |
| "grad_norm": 0.5321633219718933, | |
| "learning_rate": 0.00024765193102102676, | |
| "loss": 4.647469329833984, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.7957425240750127, | |
| "grad_norm": 0.5236574411392212, | |
| "learning_rate": 0.0002472878418057772, | |
| "loss": 4.6667522430419925, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 0.7974319986484203, | |
| "grad_norm": 0.5166000127792358, | |
| "learning_rate": 0.0002469227605159766, | |
| "loss": 4.6316486358642575, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 0.799121473221828, | |
| "grad_norm": 0.6069431304931641, | |
| "learning_rate": 0.0002465566908744729, | |
| "loss": 4.614125442504883, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 0.8008109477952357, | |
| "grad_norm": 0.5319153666496277, | |
| "learning_rate": 0.00024618963661419285, | |
| "loss": 4.649255752563477, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 0.8025004223686434, | |
| "grad_norm": 0.4894997477531433, | |
| "learning_rate": 0.0002458216014781035, | |
| "loss": 4.621485900878906, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.804189896942051, | |
| "grad_norm": 0.516018807888031, | |
| "learning_rate": 0.00024545258921917416, | |
| "loss": 4.630000305175781, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 0.8058793715154587, | |
| "grad_norm": 0.5458150506019592, | |
| "learning_rate": 0.0002450826036003384, | |
| "loss": 4.635307312011719, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 0.8075688460888664, | |
| "grad_norm": 0.5067882537841797, | |
| "learning_rate": 0.00024471164839445526, | |
| "loss": 4.636883163452149, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 0.809258320662274, | |
| "grad_norm": 0.4767204523086548, | |
| "learning_rate": 0.0002443397273842709, | |
| "loss": 4.645626831054687, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 0.8109477952356817, | |
| "grad_norm": 0.5159788727760315, | |
| "learning_rate": 0.00024396684436238025, | |
| "loss": 4.605623626708985, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.8126372698090893, | |
| "grad_norm": 0.5320490598678589, | |
| "learning_rate": 0.00024359300313118814, | |
| "loss": 4.638732147216797, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 0.8143267443824971, | |
| "grad_norm": 0.5451418161392212, | |
| "learning_rate": 0.00024321820750287045, | |
| "loss": 4.6449028015136715, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 0.8160162189559047, | |
| "grad_norm": 0.5369979739189148, | |
| "learning_rate": 0.00024284246129933543, | |
| "loss": 4.602875518798828, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 0.8177056935293124, | |
| "grad_norm": 0.5349618196487427, | |
| "learning_rate": 0.0002424657683521847, | |
| "loss": 4.624568939208984, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 0.8193951681027201, | |
| "grad_norm": 0.5187742114067078, | |
| "learning_rate": 0.00024208813250267404, | |
| "loss": 4.621414566040039, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.8210846426761277, | |
| "grad_norm": 0.49689674377441406, | |
| "learning_rate": 0.00024170955760167436, | |
| "loss": 4.63438606262207, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 0.8227741172495354, | |
| "grad_norm": 0.5191966891288757, | |
| "learning_rate": 0.0002413300475096322, | |
| "loss": 4.629247665405273, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 0.824463591822943, | |
| "grad_norm": 0.5321470499038696, | |
| "learning_rate": 0.00024094960609653078, | |
| "loss": 4.630535507202149, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 0.8261530663963508, | |
| "grad_norm": 0.577171802520752, | |
| "learning_rate": 0.00024056823724185014, | |
| "loss": 4.614957809448242, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 0.8278425409697584, | |
| "grad_norm": 0.5203391313552856, | |
| "learning_rate": 0.00024018594483452783, | |
| "loss": 4.597796630859375, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.829532015543166, | |
| "grad_norm": 0.568663477897644, | |
| "learning_rate": 0.00023980273277291893, | |
| "loss": 4.630698394775391, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 0.8312214901165738, | |
| "grad_norm": 0.5214170813560486, | |
| "learning_rate": 0.00023941860496475687, | |
| "loss": 4.6348930358886715, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 0.8329109646899814, | |
| "grad_norm": 0.5391976237297058, | |
| "learning_rate": 0.00023903356532711296, | |
| "loss": 4.6155132293701175, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 0.8346004392633891, | |
| "grad_norm": 0.4739229381084442, | |
| "learning_rate": 0.0002386476177863568, | |
| "loss": 4.622202301025391, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 0.8362899138367967, | |
| "grad_norm": 0.5011942386627197, | |
| "learning_rate": 0.00023826076627811628, | |
| "loss": 4.608601379394531, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.8379793884102045, | |
| "grad_norm": 0.5716709494590759, | |
| "learning_rate": 0.0002378730147472371, | |
| "loss": 4.581511306762695, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 0.8396688629836121, | |
| "grad_norm": 0.5052880644798279, | |
| "learning_rate": 0.00023748436714774294, | |
| "loss": 4.649203491210938, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 0.8413583375570197, | |
| "grad_norm": 0.512668788433075, | |
| "learning_rate": 0.00023709482744279492, | |
| "loss": 4.621175765991211, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 0.8430478121304275, | |
| "grad_norm": 0.5231815576553345, | |
| "learning_rate": 0.00023670439960465128, | |
| "loss": 4.606881713867187, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 0.8447372867038351, | |
| "grad_norm": 0.5233691930770874, | |
| "learning_rate": 0.00023631308761462677, | |
| "loss": 4.614410018920898, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.8447372867038351, | |
| "eval_loss": 4.584611415863037, | |
| "eval_runtime": 3.6357, | |
| "eval_samples_per_second": 275.05, | |
| "eval_steps_per_second": 5.776, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.8464267612772428, | |
| "grad_norm": 0.5504565238952637, | |
| "learning_rate": 0.00023592089546305216, | |
| "loss": 4.576148986816406, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 0.8481162358506504, | |
| "grad_norm": 0.5207253098487854, | |
| "learning_rate": 0.00023552782714923343, | |
| "loss": 4.615359497070313, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 0.8498057104240582, | |
| "grad_norm": 0.5456526875495911, | |
| "learning_rate": 0.00023513388668141118, | |
| "loss": 4.583608627319336, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 0.8514951849974658, | |
| "grad_norm": 0.5371212959289551, | |
| "learning_rate": 0.00023473907807671952, | |
| "loss": 4.605810546875, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 0.8531846595708734, | |
| "grad_norm": 0.5273321270942688, | |
| "learning_rate": 0.00023434340536114531, | |
| "loss": 4.596974945068359, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.8548741341442812, | |
| "grad_norm": 0.5454714894294739, | |
| "learning_rate": 0.00023394687256948697, | |
| "loss": 4.595716094970703, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 0.8565636087176888, | |
| "grad_norm": 0.6011702418327332, | |
| "learning_rate": 0.00023354948374531344, | |
| "loss": 4.590705108642578, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 0.8582530832910965, | |
| "grad_norm": 0.5225823521614075, | |
| "learning_rate": 0.00023315124294092277, | |
| "loss": 4.578453063964844, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 0.8599425578645041, | |
| "grad_norm": 0.5181743502616882, | |
| "learning_rate": 0.000232752154217301, | |
| "loss": 4.5722908020019535, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 0.8616320324379118, | |
| "grad_norm": 0.5235112309455872, | |
| "learning_rate": 0.00023235222164408076, | |
| "loss": 4.600410461425781, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.8633215070113195, | |
| "grad_norm": 0.5427247881889343, | |
| "learning_rate": 0.00023195144929949953, | |
| "loss": 4.576435089111328, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 0.8650109815847271, | |
| "grad_norm": 0.5017905235290527, | |
| "learning_rate": 0.00023154984127035823, | |
| "loss": 4.6031841278076175, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 0.8667004561581348, | |
| "grad_norm": 0.5279256105422974, | |
| "learning_rate": 0.00023114740165197957, | |
| "loss": 4.570458221435547, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 0.8683899307315425, | |
| "grad_norm": 0.5026883482933044, | |
| "learning_rate": 0.00023074413454816619, | |
| "loss": 4.587477493286133, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 0.8700794053049502, | |
| "grad_norm": 0.5021783709526062, | |
| "learning_rate": 0.0002303400440711589, | |
| "loss": 4.580776977539062, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.8717688798783578, | |
| "grad_norm": 0.5208005309104919, | |
| "learning_rate": 0.00022993513434159464, | |
| "loss": 4.606272125244141, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 0.8734583544517655, | |
| "grad_norm": 0.4933724105358124, | |
| "learning_rate": 0.0002295294094884646, | |
| "loss": 4.598735046386719, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 0.8751478290251732, | |
| "grad_norm": 0.4844622015953064, | |
| "learning_rate": 0.00022912287364907204, | |
| "loss": 4.577612686157226, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 0.8768373035985808, | |
| "grad_norm": 0.49681806564331055, | |
| "learning_rate": 0.00022871553096899, | |
| "loss": 4.6206306457519535, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 0.8785267781719885, | |
| "grad_norm": 0.5069138407707214, | |
| "learning_rate": 0.00022830738560201911, | |
| "loss": 4.576866149902344, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.8802162527453962, | |
| "grad_norm": 0.49277958273887634, | |
| "learning_rate": 0.00022789844171014557, | |
| "loss": 4.570761489868164, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 0.8819057273188039, | |
| "grad_norm": 0.5152326822280884, | |
| "learning_rate": 0.00022748870346349796, | |
| "loss": 4.591669082641602, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 0.8835952018922115, | |
| "grad_norm": 0.5280734896659851, | |
| "learning_rate": 0.00022707817504030538, | |
| "loss": 4.600007629394531, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 0.8852846764656191, | |
| "grad_norm": 0.5109785795211792, | |
| "learning_rate": 0.0002266668606268545, | |
| "loss": 4.551007461547852, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 0.8869741510390269, | |
| "grad_norm": 0.511035144329071, | |
| "learning_rate": 0.00022625476441744715, | |
| "loss": 4.596772766113281, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.8886636256124345, | |
| "grad_norm": 0.5007238984107971, | |
| "learning_rate": 0.00022584189061435725, | |
| "loss": 4.5646717071533205, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 0.8903531001858422, | |
| "grad_norm": 0.517419159412384, | |
| "learning_rate": 0.00022542824342778806, | |
| "loss": 4.561199188232422, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 0.8920425747592499, | |
| "grad_norm": 0.5943387746810913, | |
| "learning_rate": 0.0002250138270758293, | |
| "loss": 4.576548385620117, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 0.8937320493326576, | |
| "grad_norm": 0.5131561160087585, | |
| "learning_rate": 0.00022459864578441415, | |
| "loss": 4.587300109863281, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 0.8954215239060652, | |
| "grad_norm": 0.5333006381988525, | |
| "learning_rate": 0.0002241827037872761, | |
| "loss": 4.5638988494873045, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.8971109984794728, | |
| "grad_norm": 0.46661046147346497, | |
| "learning_rate": 0.00022376600532590578, | |
| "loss": 4.5343585968017575, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 0.8988004730528806, | |
| "grad_norm": 0.4886866509914398, | |
| "learning_rate": 0.00022334855464950775, | |
| "loss": 4.5834095001220705, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 0.9004899476262882, | |
| "grad_norm": 0.5262774229049683, | |
| "learning_rate": 0.00022293035601495708, | |
| "loss": 4.579534912109375, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 0.9021794221996959, | |
| "grad_norm": 0.5163218975067139, | |
| "learning_rate": 0.00022251141368675607, | |
| "loss": 4.577048492431641, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 0.9038688967731036, | |
| "grad_norm": 0.5345433950424194, | |
| "learning_rate": 0.00022209173193699067, | |
| "loss": 4.582790374755859, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.9055583713465112, | |
| "grad_norm": 0.5151252150535583, | |
| "learning_rate": 0.00022167131504528695, | |
| "loss": 4.594097900390625, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 0.9072478459199189, | |
| "grad_norm": 0.47062498331069946, | |
| "learning_rate": 0.00022125016729876743, | |
| "loss": 4.574803161621094, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 0.9089373204933265, | |
| "grad_norm": 0.49667978286743164, | |
| "learning_rate": 0.00022082829299200743, | |
| "loss": 4.580567932128906, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 0.9106267950667343, | |
| "grad_norm": 0.48394060134887695, | |
| "learning_rate": 0.00022040569642699112, | |
| "loss": 4.555598449707031, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 0.9123162696401419, | |
| "grad_norm": 0.48837390542030334, | |
| "learning_rate": 0.00021998238191306798, | |
| "loss": 4.534821319580078, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.9140057442135496, | |
| "grad_norm": 0.5261453986167908, | |
| "learning_rate": 0.00021955835376690841, | |
| "loss": 4.546956634521484, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 0.9156952187869573, | |
| "grad_norm": 0.5199710130691528, | |
| "learning_rate": 0.00021913361631246004, | |
| "loss": 4.561407852172851, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 0.9173846933603649, | |
| "grad_norm": 0.5369474291801453, | |
| "learning_rate": 0.0002187081738809036, | |
| "loss": 4.550098419189453, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 0.9190741679337726, | |
| "grad_norm": 0.5480945110321045, | |
| "learning_rate": 0.00021828203081060858, | |
| "loss": 4.559786224365235, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 0.9207636425071802, | |
| "grad_norm": 0.5149338245391846, | |
| "learning_rate": 0.00021785519144708912, | |
| "loss": 4.534018325805664, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.922453117080588, | |
| "grad_norm": 0.5365586280822754, | |
| "learning_rate": 0.00021742766014295976, | |
| "loss": 4.546533584594727, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 0.9241425916539956, | |
| "grad_norm": 0.5260055661201477, | |
| "learning_rate": 0.00021699944125789096, | |
| "loss": 4.534712600708008, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 0.9258320662274033, | |
| "grad_norm": 0.4802268147468567, | |
| "learning_rate": 0.00021657053915856455, | |
| "loss": 4.560755920410156, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 0.927521540800811, | |
| "grad_norm": 0.4982668459415436, | |
| "learning_rate": 0.0002161409582186294, | |
| "loss": 4.584963989257813, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 0.9292110153742186, | |
| "grad_norm": 0.49544209241867065, | |
| "learning_rate": 0.0002157107028186567, | |
| "loss": 4.547665786743164, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.9292110153742186, | |
| "eval_loss": 4.52970552444458, | |
| "eval_runtime": 3.6346, | |
| "eval_samples_per_second": 275.136, | |
| "eval_steps_per_second": 5.778, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.9309004899476263, | |
| "grad_norm": 0.4983241558074951, | |
| "learning_rate": 0.00021527977734609537, | |
| "loss": 4.547625732421875, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 0.9325899645210339, | |
| "grad_norm": 0.5012770295143127, | |
| "learning_rate": 0.00021484818619522722, | |
| "loss": 4.557040023803711, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 0.9342794390944417, | |
| "grad_norm": 0.5078200101852417, | |
| "learning_rate": 0.00021441593376712224, | |
| "loss": 4.553184890747071, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 0.9359689136678493, | |
| "grad_norm": 0.48705384135246277, | |
| "learning_rate": 0.0002139830244695935, | |
| "loss": 4.5813232421875, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 0.937658388241257, | |
| "grad_norm": 0.5023474097251892, | |
| "learning_rate": 0.00021354946271715265, | |
| "loss": 4.552815628051758, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 0.9393478628146646, | |
| "grad_norm": 0.5058281421661377, | |
| "learning_rate": 0.00021311525293096444, | |
| "loss": 4.541165924072265, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 0.9410373373880723, | |
| "grad_norm": 0.5129496455192566, | |
| "learning_rate": 0.00021268039953880184, | |
| "loss": 4.529154968261719, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 0.94272681196148, | |
| "grad_norm": 0.5097109079360962, | |
| "learning_rate": 0.00021224490697500088, | |
| "loss": 4.535088348388672, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 0.9444162865348876, | |
| "grad_norm": 0.5103420615196228, | |
| "learning_rate": 0.00021180877968041552, | |
| "loss": 4.553527069091797, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 0.9461057611082954, | |
| "grad_norm": 0.4936409294605255, | |
| "learning_rate": 0.00021137202210237213, | |
| "loss": 4.54007568359375, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.947795235681703, | |
| "grad_norm": 0.5701144933700562, | |
| "learning_rate": 0.0002109346386946243, | |
| "loss": 4.558887100219726, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 0.9494847102551106, | |
| "grad_norm": 0.4890182912349701, | |
| "learning_rate": 0.00021049663391730752, | |
| "loss": 4.543179702758789, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 0.9511741848285183, | |
| "grad_norm": 0.5074143409729004, | |
| "learning_rate": 0.00021005801223689344, | |
| "loss": 4.5704292297363285, | |
| "step": 5630 | |
| }, | |
| { | |
| "epoch": 0.952863659401926, | |
| "grad_norm": 0.4767675995826721, | |
| "learning_rate": 0.00020961877812614458, | |
| "loss": 4.569948196411133, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 0.9545531339753337, | |
| "grad_norm": 0.5034293532371521, | |
| "learning_rate": 0.00020917893606406843, | |
| "loss": 4.524322128295898, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 0.9562426085487413, | |
| "grad_norm": 0.5619840621948242, | |
| "learning_rate": 0.0002087384905358722, | |
| "loss": 4.528865051269531, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 0.9579320831221491, | |
| "grad_norm": 0.5692474842071533, | |
| "learning_rate": 0.00020829744603291663, | |
| "loss": 4.5155292510986325, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 0.9596215576955567, | |
| "grad_norm": 0.504224419593811, | |
| "learning_rate": 0.00020785580705267047, | |
| "loss": 4.559905624389648, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 0.9613110322689643, | |
| "grad_norm": 0.563014805316925, | |
| "learning_rate": 0.00020741357809866447, | |
| "loss": 4.556017303466797, | |
| "step": 5690 | |
| }, | |
| { | |
| "epoch": 0.963000506842372, | |
| "grad_norm": 0.4872301518917084, | |
| "learning_rate": 0.0002069707636804457, | |
| "loss": 4.550839233398437, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.9646899814157797, | |
| "grad_norm": 0.5135483145713806, | |
| "learning_rate": 0.0002065273683135312, | |
| "loss": 4.550697708129883, | |
| "step": 5710 | |
| }, | |
| { | |
| "epoch": 0.9663794559891874, | |
| "grad_norm": 0.4852290451526642, | |
| "learning_rate": 0.00020608339651936224, | |
| "loss": 4.531842422485352, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 0.968068930562595, | |
| "grad_norm": 0.5045028924942017, | |
| "learning_rate": 0.00020563885282525802, | |
| "loss": 4.532521057128906, | |
| "step": 5730 | |
| }, | |
| { | |
| "epoch": 0.9697584051360028, | |
| "grad_norm": 0.530616044998169, | |
| "learning_rate": 0.00020519374176436968, | |
| "loss": 4.546891403198242, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 0.9714478797094104, | |
| "grad_norm": 0.49565091729164124, | |
| "learning_rate": 0.00020474806787563392, | |
| "loss": 4.533766555786133, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.973137354282818, | |
| "grad_norm": 0.5225724577903748, | |
| "learning_rate": 0.0002043018357037267, | |
| "loss": 4.542942810058594, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 0.9748268288562257, | |
| "grad_norm": 0.49189162254333496, | |
| "learning_rate": 0.00020385504979901712, | |
| "loss": 4.545899200439453, | |
| "step": 5770 | |
| }, | |
| { | |
| "epoch": 0.9765163034296334, | |
| "grad_norm": 0.5116291642189026, | |
| "learning_rate": 0.00020340771471752078, | |
| "loss": 4.532541656494141, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 0.9782057780030411, | |
| "grad_norm": 0.5132644772529602, | |
| "learning_rate": 0.0002029598350208534, | |
| "loss": 4.524928283691406, | |
| "step": 5790 | |
| }, | |
| { | |
| "epoch": 0.9798952525764487, | |
| "grad_norm": 0.4904372990131378, | |
| "learning_rate": 0.00020251141527618434, | |
| "loss": 4.532776641845703, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.9815847271498563, | |
| "grad_norm": 0.48598089814186096, | |
| "learning_rate": 0.00020206246005618998, | |
| "loss": 4.519465637207031, | |
| "step": 5810 | |
| }, | |
| { | |
| "epoch": 0.9832742017232641, | |
| "grad_norm": 0.5415476560592651, | |
| "learning_rate": 0.00020161297393900713, | |
| "loss": 4.512179565429688, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 0.9849636762966717, | |
| "grad_norm": 0.5061231255531311, | |
| "learning_rate": 0.00020116296150818623, | |
| "loss": 4.534863662719727, | |
| "step": 5830 | |
| }, | |
| { | |
| "epoch": 0.9866531508700794, | |
| "grad_norm": 0.5157834887504578, | |
| "learning_rate": 0.0002007124273526449, | |
| "loss": 4.50738639831543, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 0.988342625443487, | |
| "grad_norm": 0.509292483329773, | |
| "learning_rate": 0.00020026137606662077, | |
| "loss": 4.5266845703125, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 0.9900321000168948, | |
| "grad_norm": 0.5107020139694214, | |
| "learning_rate": 0.0001998098122496249, | |
| "loss": 4.533035659790039, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 0.9917215745903024, | |
| "grad_norm": 0.5432437062263489, | |
| "learning_rate": 0.00019935774050639472, | |
| "loss": 4.518278884887695, | |
| "step": 5870 | |
| }, | |
| { | |
| "epoch": 0.99341104916371, | |
| "grad_norm": 0.5360410213470459, | |
| "learning_rate": 0.0001989051654468473, | |
| "loss": 4.502675628662109, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 0.9951005237371178, | |
| "grad_norm": 0.5418276786804199, | |
| "learning_rate": 0.00019845209168603195, | |
| "loss": 4.5235343933105465, | |
| "step": 5890 | |
| }, | |
| { | |
| "epoch": 0.9967899983105254, | |
| "grad_norm": 0.5157185792922974, | |
| "learning_rate": 0.00019799852384408355, | |
| "loss": 4.524081420898438, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.9984794728839331, | |
| "grad_norm": 0.5043293237686157, | |
| "learning_rate": 0.00019754446654617527, | |
| "loss": 4.508223342895508, | |
| "step": 5910 | |
| }, | |
| { | |
| "epoch": 1.0001689474573408, | |
| "grad_norm": 0.5386601090431213, | |
| "learning_rate": 0.00019708992442247136, | |
| "loss": 4.5236083984375, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 1.0018584220307485, | |
| "grad_norm": 0.5341511368751526, | |
| "learning_rate": 0.0001966349021080801, | |
| "loss": 4.459320068359375, | |
| "step": 5930 | |
| }, | |
| { | |
| "epoch": 1.003547896604156, | |
| "grad_norm": 0.5038416981697083, | |
| "learning_rate": 0.0001961794042430062, | |
| "loss": 4.505880355834961, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 1.0052373711775637, | |
| "grad_norm": 0.47585076093673706, | |
| "learning_rate": 0.000195723435472104, | |
| "loss": 4.477125930786133, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 1.0069268457509715, | |
| "grad_norm": 0.49405696988105774, | |
| "learning_rate": 0.00019526700044502956, | |
| "loss": 4.483388137817383, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 1.0086163203243792, | |
| "grad_norm": 0.47832658886909485, | |
| "learning_rate": 0.0001948101038161937, | |
| "loss": 4.474266052246094, | |
| "step": 5970 | |
| }, | |
| { | |
| "epoch": 1.0103057948977867, | |
| "grad_norm": 0.470113068819046, | |
| "learning_rate": 0.0001943527502447141, | |
| "loss": 4.483303833007812, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 1.0119952694711944, | |
| "grad_norm": 0.4839136004447937, | |
| "learning_rate": 0.00019389494439436836, | |
| "loss": 4.453615188598633, | |
| "step": 5990 | |
| }, | |
| { | |
| "epoch": 1.0136847440446022, | |
| "grad_norm": 0.482327401638031, | |
| "learning_rate": 0.0001934366909335458, | |
| "loss": 4.491983413696289, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.0136847440446022, | |
| "eval_loss": 4.487085819244385, | |
| "eval_runtime": 4.7973, | |
| "eval_samples_per_second": 208.452, | |
| "eval_steps_per_second": 4.377, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.0153742186180097, | |
| "grad_norm": 0.5339483022689819, | |
| "learning_rate": 0.00019297799453520028, | |
| "loss": 4.500830459594726, | |
| "step": 6010 | |
| }, | |
| { | |
| "epoch": 1.0170636931914174, | |
| "grad_norm": 0.5642256736755371, | |
| "learning_rate": 0.00019251885987680252, | |
| "loss": 4.485746002197265, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 1.0187531677648252, | |
| "grad_norm": 0.5060975551605225, | |
| "learning_rate": 0.00019205929164029217, | |
| "loss": 4.475402450561523, | |
| "step": 6030 | |
| }, | |
| { | |
| "epoch": 1.020442642338233, | |
| "grad_norm": 0.49786120653152466, | |
| "learning_rate": 0.00019159929451203033, | |
| "loss": 4.486777114868164, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 1.0221321169116404, | |
| "grad_norm": 0.506598949432373, | |
| "learning_rate": 0.00019113887318275149, | |
| "loss": 4.489146041870117, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 1.0238215914850481, | |
| "grad_norm": 0.48270103335380554, | |
| "learning_rate": 0.00019067803234751603, | |
| "loss": 4.474691009521484, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 1.0255110660584559, | |
| "grad_norm": 0.48239970207214355, | |
| "learning_rate": 0.00019021677670566208, | |
| "loss": 4.4708606719970705, | |
| "step": 6070 | |
| }, | |
| { | |
| "epoch": 1.0272005406318634, | |
| "grad_norm": 0.4966093599796295, | |
| "learning_rate": 0.00018975511096075762, | |
| "loss": 4.505655670166016, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 1.0288900152052711, | |
| "grad_norm": 0.5429375767707825, | |
| "learning_rate": 0.00018929303982055272, | |
| "loss": 4.499347305297851, | |
| "step": 6090 | |
| }, | |
| { | |
| "epoch": 1.0305794897786789, | |
| "grad_norm": 0.4981507360935211, | |
| "learning_rate": 0.00018883056799693125, | |
| "loss": 4.461819839477539, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 1.0322689643520866, | |
| "grad_norm": 0.5121614336967468, | |
| "learning_rate": 0.00018836770020586315, | |
| "loss": 4.478689956665039, | |
| "step": 6110 | |
| }, | |
| { | |
| "epoch": 1.033958438925494, | |
| "grad_norm": 0.4835728406906128, | |
| "learning_rate": 0.00018790444116735595, | |
| "loss": 4.47772216796875, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 1.0356479134989018, | |
| "grad_norm": 0.4881206154823303, | |
| "learning_rate": 0.00018744079560540695, | |
| "loss": 4.479801177978516, | |
| "step": 6130 | |
| }, | |
| { | |
| "epoch": 1.0373373880723096, | |
| "grad_norm": 0.47434431314468384, | |
| "learning_rate": 0.000186976768247955, | |
| "loss": 4.480235290527344, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 1.039026862645717, | |
| "grad_norm": 0.48258504271507263, | |
| "learning_rate": 0.00018651236382683225, | |
| "loss": 4.469864273071289, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 1.0407163372191248, | |
| "grad_norm": 0.5025637745857239, | |
| "learning_rate": 0.0001860475870777157, | |
| "loss": 4.472750091552735, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 1.0424058117925326, | |
| "grad_norm": 0.4636594355106354, | |
| "learning_rate": 0.0001855824427400793, | |
| "loss": 4.450835418701172, | |
| "step": 6170 | |
| }, | |
| { | |
| "epoch": 1.0440952863659403, | |
| "grad_norm": 0.4901501536369324, | |
| "learning_rate": 0.00018511693555714535, | |
| "loss": 4.490735626220703, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 1.0457847609393478, | |
| "grad_norm": 0.5198561549186707, | |
| "learning_rate": 0.00018465107027583615, | |
| "loss": 4.474180221557617, | |
| "step": 6190 | |
| }, | |
| { | |
| "epoch": 1.0474742355127555, | |
| "grad_norm": 0.4723539352416992, | |
| "learning_rate": 0.00018418485164672574, | |
| "loss": 4.4745361328125, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 1.0491637100861633, | |
| "grad_norm": 0.5074954628944397, | |
| "learning_rate": 0.00018371828442399128, | |
| "loss": 4.469810485839844, | |
| "step": 6210 | |
| }, | |
| { | |
| "epoch": 1.0508531846595708, | |
| "grad_norm": 0.49918699264526367, | |
| "learning_rate": 0.00018325137336536464, | |
| "loss": 4.442096710205078, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 1.0525426592329785, | |
| "grad_norm": 0.5088530778884888, | |
| "learning_rate": 0.00018278412323208392, | |
| "loss": 4.484762573242188, | |
| "step": 6230 | |
| }, | |
| { | |
| "epoch": 1.0542321338063863, | |
| "grad_norm": 0.506341814994812, | |
| "learning_rate": 0.00018231653878884486, | |
| "loss": 4.486656188964844, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 1.055921608379794, | |
| "grad_norm": 0.5262649059295654, | |
| "learning_rate": 0.00018184862480375233, | |
| "loss": 4.455668640136719, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 1.0576110829532015, | |
| "grad_norm": 0.5115051865577698, | |
| "learning_rate": 0.00018138038604827153, | |
| "loss": 4.479043960571289, | |
| "step": 6260 | |
| }, | |
| { | |
| "epoch": 1.0593005575266092, | |
| "grad_norm": 0.50110924243927, | |
| "learning_rate": 0.0001809118272971795, | |
| "loss": 4.446685409545898, | |
| "step": 6270 | |
| }, | |
| { | |
| "epoch": 1.060990032100017, | |
| "grad_norm": 0.5022484660148621, | |
| "learning_rate": 0.0001804429533285164, | |
| "loss": 4.4593353271484375, | |
| "step": 6280 | |
| }, | |
| { | |
| "epoch": 1.0626795066734245, | |
| "grad_norm": 0.492165744304657, | |
| "learning_rate": 0.00017997376892353668, | |
| "loss": 4.496971511840821, | |
| "step": 6290 | |
| }, | |
| { | |
| "epoch": 1.0643689812468322, | |
| "grad_norm": 0.5134599208831787, | |
| "learning_rate": 0.0001795042788666605, | |
| "loss": 4.465629196166992, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 1.06605845582024, | |
| "grad_norm": 0.5151488184928894, | |
| "learning_rate": 0.00017903448794542488, | |
| "loss": 4.454899597167969, | |
| "step": 6310 | |
| }, | |
| { | |
| "epoch": 1.0677479303936477, | |
| "grad_norm": 0.5240500569343567, | |
| "learning_rate": 0.00017856440095043464, | |
| "loss": 4.481625747680664, | |
| "step": 6320 | |
| }, | |
| { | |
| "epoch": 1.0694374049670552, | |
| "grad_norm": 0.5187123417854309, | |
| "learning_rate": 0.00017809402267531405, | |
| "loss": 4.437789535522461, | |
| "step": 6330 | |
| }, | |
| { | |
| "epoch": 1.071126879540463, | |
| "grad_norm": 0.4693409502506256, | |
| "learning_rate": 0.00017762335791665735, | |
| "loss": 4.450423812866211, | |
| "step": 6340 | |
| }, | |
| { | |
| "epoch": 1.0728163541138707, | |
| "grad_norm": 0.5061246752738953, | |
| "learning_rate": 0.00017715241147398035, | |
| "loss": 4.46313705444336, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 1.0745058286872782, | |
| "grad_norm": 0.47927796840667725, | |
| "learning_rate": 0.00017668118814967126, | |
| "loss": 4.446915817260742, | |
| "step": 6360 | |
| }, | |
| { | |
| "epoch": 1.076195303260686, | |
| "grad_norm": 0.47587907314300537, | |
| "learning_rate": 0.00017620969274894163, | |
| "loss": 4.461398696899414, | |
| "step": 6370 | |
| }, | |
| { | |
| "epoch": 1.0778847778340936, | |
| "grad_norm": 0.5091392397880554, | |
| "learning_rate": 0.00017573793007977763, | |
| "loss": 4.450970458984375, | |
| "step": 6380 | |
| }, | |
| { | |
| "epoch": 1.0795742524075012, | |
| "grad_norm": 0.5105127692222595, | |
| "learning_rate": 0.0001752659049528906, | |
| "loss": 4.458657455444336, | |
| "step": 6390 | |
| }, | |
| { | |
| "epoch": 1.081263726980909, | |
| "grad_norm": 0.5196726322174072, | |
| "learning_rate": 0.00017479362218166854, | |
| "loss": 4.444008636474609, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 1.0829532015543166, | |
| "grad_norm": 0.4891359210014343, | |
| "learning_rate": 0.0001743210865821265, | |
| "loss": 4.436445236206055, | |
| "step": 6410 | |
| }, | |
| { | |
| "epoch": 1.0846426761277244, | |
| "grad_norm": 0.5141095519065857, | |
| "learning_rate": 0.0001738483029728578, | |
| "loss": 4.455481338500976, | |
| "step": 6420 | |
| }, | |
| { | |
| "epoch": 1.0863321507011319, | |
| "grad_norm": 0.5223525166511536, | |
| "learning_rate": 0.00017337527617498474, | |
| "loss": 4.485405731201172, | |
| "step": 6430 | |
| }, | |
| { | |
| "epoch": 1.0880216252745396, | |
| "grad_norm": 0.4939091205596924, | |
| "learning_rate": 0.0001729020110121096, | |
| "loss": 4.448784255981446, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 1.0897110998479473, | |
| "grad_norm": 0.49695253372192383, | |
| "learning_rate": 0.0001724285123102652, | |
| "loss": 4.4587146759033205, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 1.091400574421355, | |
| "grad_norm": 0.4882517158985138, | |
| "learning_rate": 0.00017195478489786593, | |
| "loss": 4.43580207824707, | |
| "step": 6460 | |
| }, | |
| { | |
| "epoch": 1.0930900489947626, | |
| "grad_norm": 0.4971882998943329, | |
| "learning_rate": 0.00017148083360565836, | |
| "loss": 4.436479949951172, | |
| "step": 6470 | |
| }, | |
| { | |
| "epoch": 1.0947795235681703, | |
| "grad_norm": 0.4835260808467865, | |
| "learning_rate": 0.00017100666326667202, | |
| "loss": 4.476963043212891, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 1.096468998141578, | |
| "grad_norm": 0.4847490191459656, | |
| "learning_rate": 0.00017053227871617027, | |
| "loss": 4.449015426635742, | |
| "step": 6490 | |
| }, | |
| { | |
| "epoch": 1.0981584727149856, | |
| "grad_norm": 0.5305824279785156, | |
| "learning_rate": 0.00017005768479160064, | |
| "loss": 4.452330780029297, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.0981584727149856, | |
| "eval_loss": 4.447469711303711, | |
| "eval_runtime": 4.0239, | |
| "eval_samples_per_second": 248.518, | |
| "eval_steps_per_second": 5.219, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.0998479472883933, | |
| "grad_norm": 0.4943171739578247, | |
| "learning_rate": 0.0001695828863325459, | |
| "loss": 4.467470932006836, | |
| "step": 6510 | |
| }, | |
| { | |
| "epoch": 1.101537421861801, | |
| "grad_norm": 0.474933385848999, | |
| "learning_rate": 0.00016910788818067434, | |
| "loss": 4.4371185302734375, | |
| "step": 6520 | |
| }, | |
| { | |
| "epoch": 1.1032268964352085, | |
| "grad_norm": 0.5118041634559631, | |
| "learning_rate": 0.0001686326951796907, | |
| "loss": 4.451096725463867, | |
| "step": 6530 | |
| }, | |
| { | |
| "epoch": 1.1049163710086163, | |
| "grad_norm": 0.5289651155471802, | |
| "learning_rate": 0.00016815731217528667, | |
| "loss": 4.448075485229492, | |
| "step": 6540 | |
| }, | |
| { | |
| "epoch": 1.106605845582024, | |
| "grad_norm": 0.5182890295982361, | |
| "learning_rate": 0.00016768174401509143, | |
| "loss": 4.467396926879883, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 1.1082953201554318, | |
| "grad_norm": 0.5209820866584778, | |
| "learning_rate": 0.0001672059955486223, | |
| "loss": 4.459186172485351, | |
| "step": 6560 | |
| }, | |
| { | |
| "epoch": 1.1099847947288393, | |
| "grad_norm": 0.48584309220314026, | |
| "learning_rate": 0.000166730071627235, | |
| "loss": 4.46546516418457, | |
| "step": 6570 | |
| }, | |
| { | |
| "epoch": 1.111674269302247, | |
| "grad_norm": 0.5017306804656982, | |
| "learning_rate": 0.00016625397710407487, | |
| "loss": 4.452592086791992, | |
| "step": 6580 | |
| }, | |
| { | |
| "epoch": 1.1133637438756547, | |
| "grad_norm": 0.46485376358032227, | |
| "learning_rate": 0.00016577771683402647, | |
| "loss": 4.46324348449707, | |
| "step": 6590 | |
| }, | |
| { | |
| "epoch": 1.1150532184490622, | |
| "grad_norm": 0.5154596567153931, | |
| "learning_rate": 0.00016530129567366483, | |
| "loss": 4.457768249511719, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 1.11674269302247, | |
| "grad_norm": 0.49490463733673096, | |
| "learning_rate": 0.0001648247184812054, | |
| "loss": 4.427638244628906, | |
| "step": 6610 | |
| }, | |
| { | |
| "epoch": 1.1184321675958777, | |
| "grad_norm": 0.4721022844314575, | |
| "learning_rate": 0.00016434799011645507, | |
| "loss": 4.4389793395996096, | |
| "step": 6620 | |
| }, | |
| { | |
| "epoch": 1.1201216421692854, | |
| "grad_norm": 0.4648183286190033, | |
| "learning_rate": 0.00016387111544076193, | |
| "loss": 4.460124969482422, | |
| "step": 6630 | |
| }, | |
| { | |
| "epoch": 1.121811116742693, | |
| "grad_norm": 0.5035665035247803, | |
| "learning_rate": 0.00016339409931696625, | |
| "loss": 4.439287185668945, | |
| "step": 6640 | |
| }, | |
| { | |
| "epoch": 1.1235005913161007, | |
| "grad_norm": 0.4910880923271179, | |
| "learning_rate": 0.00016291694660935065, | |
| "loss": 4.456634140014648, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 1.1251900658895084, | |
| "grad_norm": 0.48906245827674866, | |
| "learning_rate": 0.00016243966218359047, | |
| "loss": 4.428804016113281, | |
| "step": 6660 | |
| }, | |
| { | |
| "epoch": 1.126879540462916, | |
| "grad_norm": 0.5756556391716003, | |
| "learning_rate": 0.00016196225090670435, | |
| "loss": 4.411157608032227, | |
| "step": 6670 | |
| }, | |
| { | |
| "epoch": 1.1285690150363237, | |
| "grad_norm": 0.49011167883872986, | |
| "learning_rate": 0.0001614847176470043, | |
| "loss": 4.435109329223633, | |
| "step": 6680 | |
| }, | |
| { | |
| "epoch": 1.1302584896097314, | |
| "grad_norm": 0.4775542616844177, | |
| "learning_rate": 0.00016100706727404645, | |
| "loss": 4.428675842285156, | |
| "step": 6690 | |
| }, | |
| { | |
| "epoch": 1.131947964183139, | |
| "grad_norm": 0.5201391577720642, | |
| "learning_rate": 0.00016052930465858094, | |
| "loss": 4.4389808654785154, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 1.1336374387565467, | |
| "grad_norm": 0.49004724621772766, | |
| "learning_rate": 0.00016005143467250267, | |
| "loss": 4.459021377563476, | |
| "step": 6710 | |
| }, | |
| { | |
| "epoch": 1.1353269133299544, | |
| "grad_norm": 0.49011871218681335, | |
| "learning_rate": 0.00015957346218880124, | |
| "loss": 4.455972290039062, | |
| "step": 6720 | |
| }, | |
| { | |
| "epoch": 1.1370163879033621, | |
| "grad_norm": 0.5173168182373047, | |
| "learning_rate": 0.0001590953920815117, | |
| "loss": 4.443459701538086, | |
| "step": 6730 | |
| }, | |
| { | |
| "epoch": 1.1387058624767696, | |
| "grad_norm": 0.47700756788253784, | |
| "learning_rate": 0.00015861722922566436, | |
| "loss": 4.435110473632813, | |
| "step": 6740 | |
| }, | |
| { | |
| "epoch": 1.1403953370501774, | |
| "grad_norm": 0.5626063942909241, | |
| "learning_rate": 0.00015813897849723544, | |
| "loss": 4.432453536987305, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 1.142084811623585, | |
| "grad_norm": 0.49542316794395447, | |
| "learning_rate": 0.0001576606447730972, | |
| "loss": 4.4374950408935545, | |
| "step": 6760 | |
| }, | |
| { | |
| "epoch": 1.1437742861969928, | |
| "grad_norm": 0.5116281509399414, | |
| "learning_rate": 0.0001571822329309682, | |
| "loss": 4.423119354248047, | |
| "step": 6770 | |
| }, | |
| { | |
| "epoch": 1.1454637607704004, | |
| "grad_norm": 0.4868847131729126, | |
| "learning_rate": 0.00015670374784936371, | |
| "loss": 4.4402107238769535, | |
| "step": 6780 | |
| }, | |
| { | |
| "epoch": 1.147153235343808, | |
| "grad_norm": 0.4938635230064392, | |
| "learning_rate": 0.00015622519440754566, | |
| "loss": 4.424631881713867, | |
| "step": 6790 | |
| }, | |
| { | |
| "epoch": 1.1488427099172158, | |
| "grad_norm": 0.5740174651145935, | |
| "learning_rate": 0.0001557465774854732, | |
| "loss": 4.450838470458985, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 1.1505321844906233, | |
| "grad_norm": 0.4828670918941498, | |
| "learning_rate": 0.0001552679019637528, | |
| "loss": 4.438276290893555, | |
| "step": 6810 | |
| }, | |
| { | |
| "epoch": 1.152221659064031, | |
| "grad_norm": 0.4659689664840698, | |
| "learning_rate": 0.00015478917272358848, | |
| "loss": 4.426282501220703, | |
| "step": 6820 | |
| }, | |
| { | |
| "epoch": 1.1539111336374388, | |
| "grad_norm": 0.4927656352519989, | |
| "learning_rate": 0.000154310394646732, | |
| "loss": 4.464373016357422, | |
| "step": 6830 | |
| }, | |
| { | |
| "epoch": 1.1556006082108463, | |
| "grad_norm": 0.5161291360855103, | |
| "learning_rate": 0.00015383157261543318, | |
| "loss": 4.416297531127929, | |
| "step": 6840 | |
| }, | |
| { | |
| "epoch": 1.157290082784254, | |
| "grad_norm": 0.4933563768863678, | |
| "learning_rate": 0.00015335271151239, | |
| "loss": 4.420982742309571, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 1.1589795573576618, | |
| "grad_norm": 0.4847005307674408, | |
| "learning_rate": 0.00015287381622069892, | |
| "loss": 4.416022872924804, | |
| "step": 6860 | |
| }, | |
| { | |
| "epoch": 1.1606690319310695, | |
| "grad_norm": 0.4981960654258728, | |
| "learning_rate": 0.00015239489162380504, | |
| "loss": 4.422767639160156, | |
| "step": 6870 | |
| }, | |
| { | |
| "epoch": 1.162358506504477, | |
| "grad_norm": 0.5001937747001648, | |
| "learning_rate": 0.0001519159426054522, | |
| "loss": 4.4368339538574215, | |
| "step": 6880 | |
| }, | |
| { | |
| "epoch": 1.1640479810778848, | |
| "grad_norm": 0.5044972896575928, | |
| "learning_rate": 0.0001514369740496334, | |
| "loss": 4.411078643798828, | |
| "step": 6890 | |
| }, | |
| { | |
| "epoch": 1.1657374556512925, | |
| "grad_norm": 0.4734691083431244, | |
| "learning_rate": 0.00015095799084054073, | |
| "loss": 4.438079071044922, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 1.1674269302247002, | |
| "grad_norm": 0.49377161264419556, | |
| "learning_rate": 0.00015047899786251587, | |
| "loss": 4.442370986938476, | |
| "step": 6910 | |
| }, | |
| { | |
| "epoch": 1.1691164047981077, | |
| "grad_norm": 0.5010132193565369, | |
| "learning_rate": 0.00015, | |
| "loss": 4.442108917236328, | |
| "step": 6920 | |
| }, | |
| { | |
| "epoch": 1.1708058793715155, | |
| "grad_norm": 0.5035766959190369, | |
| "learning_rate": 0.0001495210021374841, | |
| "loss": 4.430604553222656, | |
| "step": 6930 | |
| }, | |
| { | |
| "epoch": 1.1724953539449232, | |
| "grad_norm": 0.4899141788482666, | |
| "learning_rate": 0.00014904200915945927, | |
| "loss": 4.435578918457031, | |
| "step": 6940 | |
| }, | |
| { | |
| "epoch": 1.1741848285183307, | |
| "grad_norm": 0.4718686044216156, | |
| "learning_rate": 0.00014856302595036663, | |
| "loss": 4.429093551635742, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 1.1758743030917385, | |
| "grad_norm": 0.4881162941455841, | |
| "learning_rate": 0.00014808405739454776, | |
| "loss": 4.408749008178711, | |
| "step": 6960 | |
| }, | |
| { | |
| "epoch": 1.1775637776651462, | |
| "grad_norm": 0.46740713715553284, | |
| "learning_rate": 0.00014760510837619493, | |
| "loss": 4.419464492797852, | |
| "step": 6970 | |
| }, | |
| { | |
| "epoch": 1.1792532522385537, | |
| "grad_norm": 0.4737609922885895, | |
| "learning_rate": 0.00014712618377930105, | |
| "loss": 4.421468353271484, | |
| "step": 6980 | |
| }, | |
| { | |
| "epoch": 1.1809427268119614, | |
| "grad_norm": 0.4975055754184723, | |
| "learning_rate": 0.00014664728848760996, | |
| "loss": 4.422280502319336, | |
| "step": 6990 | |
| }, | |
| { | |
| "epoch": 1.1826322013853692, | |
| "grad_norm": 0.4839191734790802, | |
| "learning_rate": 0.00014616842738456682, | |
| "loss": 4.395424652099609, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.1826322013853692, | |
| "eval_loss": 4.412718772888184, | |
| "eval_runtime": 4.0717, | |
| "eval_samples_per_second": 245.6, | |
| "eval_steps_per_second": 5.158, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.184321675958777, | |
| "grad_norm": 0.49278581142425537, | |
| "learning_rate": 0.000145689605353268, | |
| "loss": 4.424203109741211, | |
| "step": 7010 | |
| }, | |
| { | |
| "epoch": 1.1860111505321844, | |
| "grad_norm": 0.49644234776496887, | |
| "learning_rate": 0.00014521082727641152, | |
| "loss": 4.395336151123047, | |
| "step": 7020 | |
| }, | |
| { | |
| "epoch": 1.1877006251055922, | |
| "grad_norm": 0.483456552028656, | |
| "learning_rate": 0.0001447320980362472, | |
| "loss": 4.440347671508789, | |
| "step": 7030 | |
| }, | |
| { | |
| "epoch": 1.189390099679, | |
| "grad_norm": 0.5150992274284363, | |
| "learning_rate": 0.00014425342251452679, | |
| "loss": 4.393960571289062, | |
| "step": 7040 | |
| }, | |
| { | |
| "epoch": 1.1910795742524076, | |
| "grad_norm": 0.47316014766693115, | |
| "learning_rate": 0.00014377480559245434, | |
| "loss": 4.433261108398438, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 1.1927690488258151, | |
| "grad_norm": 0.5043189525604248, | |
| "learning_rate": 0.00014329625215063629, | |
| "loss": 4.437650680541992, | |
| "step": 7060 | |
| }, | |
| { | |
| "epoch": 1.1944585233992229, | |
| "grad_norm": 0.49998390674591064, | |
| "learning_rate": 0.00014281776706903177, | |
| "loss": 4.40019416809082, | |
| "step": 7070 | |
| }, | |
| { | |
| "epoch": 1.1961479979726306, | |
| "grad_norm": 0.5133141279220581, | |
| "learning_rate": 0.0001423393552269028, | |
| "loss": 4.417116928100586, | |
| "step": 7080 | |
| }, | |
| { | |
| "epoch": 1.1978374725460381, | |
| "grad_norm": 0.513031005859375, | |
| "learning_rate": 0.00014186102150276454, | |
| "loss": 4.438409805297852, | |
| "step": 7090 | |
| }, | |
| { | |
| "epoch": 1.1995269471194459, | |
| "grad_norm": 0.4915519058704376, | |
| "learning_rate": 0.00014138277077433567, | |
| "loss": 4.4253074645996096, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 1.2012164216928536, | |
| "grad_norm": 0.5202800035476685, | |
| "learning_rate": 0.00014090460791848827, | |
| "loss": 4.41809310913086, | |
| "step": 7110 | |
| }, | |
| { | |
| "epoch": 1.202905896266261, | |
| "grad_norm": 0.49077826738357544, | |
| "learning_rate": 0.00014042653781119868, | |
| "loss": 4.397499465942383, | |
| "step": 7120 | |
| }, | |
| { | |
| "epoch": 1.2045953708396688, | |
| "grad_norm": 0.4648706912994385, | |
| "learning_rate": 0.0001399485653274973, | |
| "loss": 4.408271026611328, | |
| "step": 7130 | |
| }, | |
| { | |
| "epoch": 1.2062848454130766, | |
| "grad_norm": 0.4614482820034027, | |
| "learning_rate": 0.00013947069534141904, | |
| "loss": 4.425214004516602, | |
| "step": 7140 | |
| }, | |
| { | |
| "epoch": 1.207974319986484, | |
| "grad_norm": 0.4744400084018707, | |
| "learning_rate": 0.00013899293272595355, | |
| "loss": 4.440077590942383, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 1.2096637945598918, | |
| "grad_norm": 0.46556323766708374, | |
| "learning_rate": 0.0001385152823529957, | |
| "loss": 4.412957382202149, | |
| "step": 7160 | |
| }, | |
| { | |
| "epoch": 1.2113532691332995, | |
| "grad_norm": 0.4939349591732025, | |
| "learning_rate": 0.00013803774909329567, | |
| "loss": 4.405846023559571, | |
| "step": 7170 | |
| }, | |
| { | |
| "epoch": 1.2130427437067073, | |
| "grad_norm": 0.47055721282958984, | |
| "learning_rate": 0.0001375603378164095, | |
| "loss": 4.382000350952149, | |
| "step": 7180 | |
| }, | |
| { | |
| "epoch": 1.2147322182801148, | |
| "grad_norm": 0.47987523674964905, | |
| "learning_rate": 0.00013708305339064933, | |
| "loss": 4.415153121948242, | |
| "step": 7190 | |
| }, | |
| { | |
| "epoch": 1.2164216928535225, | |
| "grad_norm": 0.4784037470817566, | |
| "learning_rate": 0.00013660590068303373, | |
| "loss": 4.4415229797363285, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 1.2181111674269303, | |
| "grad_norm": 0.500056803226471, | |
| "learning_rate": 0.00013612888455923804, | |
| "loss": 4.416479873657226, | |
| "step": 7210 | |
| }, | |
| { | |
| "epoch": 1.219800642000338, | |
| "grad_norm": 0.4778987169265747, | |
| "learning_rate": 0.0001356520098835449, | |
| "loss": 4.442354583740235, | |
| "step": 7220 | |
| }, | |
| { | |
| "epoch": 1.2214901165737455, | |
| "grad_norm": 0.5005702376365662, | |
| "learning_rate": 0.00013517528151879457, | |
| "loss": 4.411639404296875, | |
| "step": 7230 | |
| }, | |
| { | |
| "epoch": 1.2231795911471532, | |
| "grad_norm": 0.4689568281173706, | |
| "learning_rate": 0.0001346987043263352, | |
| "loss": 4.414199447631836, | |
| "step": 7240 | |
| }, | |
| { | |
| "epoch": 1.224869065720561, | |
| "grad_norm": 0.4993502199649811, | |
| "learning_rate": 0.00013422228316597356, | |
| "loss": 4.432155609130859, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 1.2265585402939685, | |
| "grad_norm": 0.4782608151435852, | |
| "learning_rate": 0.00013374602289592508, | |
| "loss": 4.431560897827149, | |
| "step": 7260 | |
| }, | |
| { | |
| "epoch": 1.2282480148673762, | |
| "grad_norm": 0.5125144124031067, | |
| "learning_rate": 0.00013326992837276494, | |
| "loss": 4.405394744873047, | |
| "step": 7270 | |
| }, | |
| { | |
| "epoch": 1.229937489440784, | |
| "grad_norm": 0.48408523201942444, | |
| "learning_rate": 0.0001327940044513777, | |
| "loss": 4.4137004852294925, | |
| "step": 7280 | |
| }, | |
| { | |
| "epoch": 1.2316269640141915, | |
| "grad_norm": 0.4888753294944763, | |
| "learning_rate": 0.00013231825598490854, | |
| "loss": 4.409386062622071, | |
| "step": 7290 | |
| }, | |
| { | |
| "epoch": 1.2333164385875992, | |
| "grad_norm": 0.47923538088798523, | |
| "learning_rate": 0.0001318426878247133, | |
| "loss": 4.4191631317138675, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 1.235005913161007, | |
| "grad_norm": 0.46775540709495544, | |
| "learning_rate": 0.00013136730482030928, | |
| "loss": 4.423541259765625, | |
| "step": 7310 | |
| }, | |
| { | |
| "epoch": 1.2366953877344147, | |
| "grad_norm": 0.48620909452438354, | |
| "learning_rate": 0.0001308921118193257, | |
| "loss": 4.431262969970703, | |
| "step": 7320 | |
| }, | |
| { | |
| "epoch": 1.2383848623078222, | |
| "grad_norm": 0.5028111338615417, | |
| "learning_rate": 0.00013041711366745408, | |
| "loss": 4.423612976074219, | |
| "step": 7330 | |
| }, | |
| { | |
| "epoch": 1.24007433688123, | |
| "grad_norm": 0.4982888996601105, | |
| "learning_rate": 0.00012994231520839934, | |
| "loss": 4.428596878051758, | |
| "step": 7340 | |
| }, | |
| { | |
| "epoch": 1.2417638114546377, | |
| "grad_norm": 0.5141102075576782, | |
| "learning_rate": 0.0001294677212838297, | |
| "loss": 4.398578262329101, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 1.2434532860280454, | |
| "grad_norm": 0.48103561997413635, | |
| "learning_rate": 0.00012899333673332795, | |
| "loss": 4.439675140380859, | |
| "step": 7360 | |
| }, | |
| { | |
| "epoch": 1.245142760601453, | |
| "grad_norm": 0.5084096789360046, | |
| "learning_rate": 0.00012851916639434164, | |
| "loss": 4.3824302673339846, | |
| "step": 7370 | |
| }, | |
| { | |
| "epoch": 1.2468322351748606, | |
| "grad_norm": 0.4776511788368225, | |
| "learning_rate": 0.00012804521510213407, | |
| "loss": 4.402749633789062, | |
| "step": 7380 | |
| }, | |
| { | |
| "epoch": 1.2485217097482684, | |
| "grad_norm": 0.499318391084671, | |
| "learning_rate": 0.00012757148768973483, | |
| "loss": 4.405498886108399, | |
| "step": 7390 | |
| }, | |
| { | |
| "epoch": 1.2502111843216759, | |
| "grad_norm": 0.4898117184638977, | |
| "learning_rate": 0.00012709798898789042, | |
| "loss": 4.4396411895751955, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 1.2519006588950836, | |
| "grad_norm": 0.4777224361896515, | |
| "learning_rate": 0.00012662472382501524, | |
| "loss": 4.409711074829102, | |
| "step": 7410 | |
| }, | |
| { | |
| "epoch": 1.2535901334684914, | |
| "grad_norm": 0.48530757427215576, | |
| "learning_rate": 0.0001261516970271422, | |
| "loss": 4.4214935302734375, | |
| "step": 7420 | |
| }, | |
| { | |
| "epoch": 1.2552796080418989, | |
| "grad_norm": 0.48434415459632874, | |
| "learning_rate": 0.0001256789134178735, | |
| "loss": 4.438081741333008, | |
| "step": 7430 | |
| }, | |
| { | |
| "epoch": 1.2569690826153066, | |
| "grad_norm": 0.4974631071090698, | |
| "learning_rate": 0.00012520637781833144, | |
| "loss": 4.407797622680664, | |
| "step": 7440 | |
| }, | |
| { | |
| "epoch": 1.2586585571887143, | |
| "grad_norm": 0.4732743799686432, | |
| "learning_rate": 0.0001247340950471094, | |
| "loss": 4.418028259277344, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 1.2603480317621218, | |
| "grad_norm": 0.5003547072410583, | |
| "learning_rate": 0.0001242620699202224, | |
| "loss": 4.400883483886719, | |
| "step": 7460 | |
| }, | |
| { | |
| "epoch": 1.2620375063355296, | |
| "grad_norm": 0.49987900257110596, | |
| "learning_rate": 0.00012379030725105837, | |
| "loss": 4.402442169189453, | |
| "step": 7470 | |
| }, | |
| { | |
| "epoch": 1.2637269809089373, | |
| "grad_norm": 0.49416637420654297, | |
| "learning_rate": 0.00012331881185032872, | |
| "loss": 4.388990020751953, | |
| "step": 7480 | |
| }, | |
| { | |
| "epoch": 1.265416455482345, | |
| "grad_norm": 0.5343226194381714, | |
| "learning_rate": 0.00012284758852601962, | |
| "loss": 4.411848449707032, | |
| "step": 7490 | |
| }, | |
| { | |
| "epoch": 1.2671059300557528, | |
| "grad_norm": 0.5128340125083923, | |
| "learning_rate": 0.00012237664208334263, | |
| "loss": 4.403173446655273, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 1.2671059300557528, | |
| "eval_loss": 4.382744789123535, | |
| "eval_runtime": 3.7472, | |
| "eval_samples_per_second": 266.869, | |
| "eval_steps_per_second": 5.604, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 1.2687954046291603, | |
| "grad_norm": 0.48278650641441345, | |
| "learning_rate": 0.00012190597732468595, | |
| "loss": 4.407323837280273, | |
| "step": 7510 | |
| }, | |
| { | |
| "epoch": 1.270484879202568, | |
| "grad_norm": 0.48528528213500977, | |
| "learning_rate": 0.00012143559904956533, | |
| "loss": 4.389751815795899, | |
| "step": 7520 | |
| }, | |
| { | |
| "epoch": 1.2721743537759758, | |
| "grad_norm": 0.4944697320461273, | |
| "learning_rate": 0.00012096551205457511, | |
| "loss": 4.385165786743164, | |
| "step": 7530 | |
| }, | |
| { | |
| "epoch": 1.2738638283493833, | |
| "grad_norm": 0.5002730488777161, | |
| "learning_rate": 0.00012049572113333949, | |
| "loss": 4.374062347412109, | |
| "step": 7540 | |
| }, | |
| { | |
| "epoch": 1.275553302922791, | |
| "grad_norm": 0.46715047955513, | |
| "learning_rate": 0.00012002623107646327, | |
| "loss": 4.394298553466797, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 1.2772427774961987, | |
| "grad_norm": 0.4903099834918976, | |
| "learning_rate": 0.00011955704667148361, | |
| "loss": 4.400055694580078, | |
| "step": 7560 | |
| }, | |
| { | |
| "epoch": 1.2789322520696063, | |
| "grad_norm": 0.5333164930343628, | |
| "learning_rate": 0.00011908817270282048, | |
| "loss": 4.424139404296875, | |
| "step": 7570 | |
| }, | |
| { | |
| "epoch": 1.280621726643014, | |
| "grad_norm": 0.47946473956108093, | |
| "learning_rate": 0.00011861961395172844, | |
| "loss": 4.419405746459961, | |
| "step": 7580 | |
| }, | |
| { | |
| "epoch": 1.2823112012164217, | |
| "grad_norm": 0.4778226315975189, | |
| "learning_rate": 0.00011815137519624767, | |
| "loss": 4.414478302001953, | |
| "step": 7590 | |
| }, | |
| { | |
| "epoch": 1.2840006757898292, | |
| "grad_norm": 0.4878886342048645, | |
| "learning_rate": 0.0001176834612111551, | |
| "loss": 4.384803009033203, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 1.285690150363237, | |
| "grad_norm": 0.4819967746734619, | |
| "learning_rate": 0.0001172158767679161, | |
| "loss": 4.3730110168457035, | |
| "step": 7610 | |
| }, | |
| { | |
| "epoch": 1.2873796249366447, | |
| "grad_norm": 0.4928823411464691, | |
| "learning_rate": 0.00011674862663463538, | |
| "loss": 4.3778236389160154, | |
| "step": 7620 | |
| }, | |
| { | |
| "epoch": 1.2890690995100524, | |
| "grad_norm": 0.4724312722682953, | |
| "learning_rate": 0.00011628171557600869, | |
| "loss": 4.387655639648438, | |
| "step": 7630 | |
| }, | |
| { | |
| "epoch": 1.2907585740834602, | |
| "grad_norm": 0.5023632049560547, | |
| "learning_rate": 0.0001158151483532742, | |
| "loss": 4.366682052612305, | |
| "step": 7640 | |
| }, | |
| { | |
| "epoch": 1.2924480486568677, | |
| "grad_norm": 0.47042906284332275, | |
| "learning_rate": 0.00011534892972416382, | |
| "loss": 4.3992149353027346, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 1.2941375232302754, | |
| "grad_norm": 0.5019961595535278, | |
| "learning_rate": 0.00011488306444285465, | |
| "loss": 4.408546829223633, | |
| "step": 7660 | |
| }, | |
| { | |
| "epoch": 1.2958269978036832, | |
| "grad_norm": 0.4686186909675598, | |
| "learning_rate": 0.0001144175572599207, | |
| "loss": 4.392362976074219, | |
| "step": 7670 | |
| }, | |
| { | |
| "epoch": 1.2975164723770907, | |
| "grad_norm": 0.5097217559814453, | |
| "learning_rate": 0.00011395241292228435, | |
| "loss": 4.350882339477539, | |
| "step": 7680 | |
| }, | |
| { | |
| "epoch": 1.2992059469504984, | |
| "grad_norm": 0.5009888410568237, | |
| "learning_rate": 0.00011348763617316781, | |
| "loss": 4.407807159423828, | |
| "step": 7690 | |
| }, | |
| { | |
| "epoch": 1.3008954215239061, | |
| "grad_norm": 0.4623536765575409, | |
| "learning_rate": 0.00011302323175204497, | |
| "loss": 4.383738708496094, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 1.3025848960973136, | |
| "grad_norm": 0.49098923802375793, | |
| "learning_rate": 0.00011255920439459302, | |
| "loss": 4.3777015686035154, | |
| "step": 7710 | |
| }, | |
| { | |
| "epoch": 1.3042743706707214, | |
| "grad_norm": 0.47158893942832947, | |
| "learning_rate": 0.00011209555883264406, | |
| "loss": 4.398603439331055, | |
| "step": 7720 | |
| }, | |
| { | |
| "epoch": 1.3059638452441291, | |
| "grad_norm": 0.4723564684391022, | |
| "learning_rate": 0.00011163229979413685, | |
| "loss": 4.379953384399414, | |
| "step": 7730 | |
| }, | |
| { | |
| "epoch": 1.3076533198175366, | |
| "grad_norm": 0.478575199842453, | |
| "learning_rate": 0.00011116943200306871, | |
| "loss": 4.369690322875977, | |
| "step": 7740 | |
| }, | |
| { | |
| "epoch": 1.3093427943909444, | |
| "grad_norm": 0.4801791310310364, | |
| "learning_rate": 0.00011070696017944728, | |
| "loss": 4.421099853515625, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 1.311032268964352, | |
| "grad_norm": 0.5147274732589722, | |
| "learning_rate": 0.00011024488903924235, | |
| "loss": 4.396934127807617, | |
| "step": 7760 | |
| }, | |
| { | |
| "epoch": 1.3127217435377598, | |
| "grad_norm": 0.4905327558517456, | |
| "learning_rate": 0.00010978322329433796, | |
| "loss": 4.368836975097656, | |
| "step": 7770 | |
| }, | |
| { | |
| "epoch": 1.3144112181111673, | |
| "grad_norm": 0.47583821415901184, | |
| "learning_rate": 0.00010932196765248396, | |
| "loss": 4.351024627685547, | |
| "step": 7780 | |
| }, | |
| { | |
| "epoch": 1.316100692684575, | |
| "grad_norm": 0.4749636650085449, | |
| "learning_rate": 0.0001088611268172485, | |
| "loss": 4.381603622436524, | |
| "step": 7790 | |
| }, | |
| { | |
| "epoch": 1.3177901672579828, | |
| "grad_norm": 0.47106119990348816, | |
| "learning_rate": 0.00010840070548796967, | |
| "loss": 4.386127471923828, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 1.3194796418313905, | |
| "grad_norm": 0.49278977513313293, | |
| "learning_rate": 0.00010794070835970782, | |
| "loss": 4.393439865112304, | |
| "step": 7810 | |
| }, | |
| { | |
| "epoch": 1.321169116404798, | |
| "grad_norm": 0.49596497416496277, | |
| "learning_rate": 0.00010748114012319747, | |
| "loss": 4.369705581665039, | |
| "step": 7820 | |
| }, | |
| { | |
| "epoch": 1.3228585909782058, | |
| "grad_norm": 0.48959940671920776, | |
| "learning_rate": 0.0001070220054647997, | |
| "loss": 4.353339767456054, | |
| "step": 7830 | |
| }, | |
| { | |
| "epoch": 1.3245480655516135, | |
| "grad_norm": 0.4975447952747345, | |
| "learning_rate": 0.00010656330906645422, | |
| "loss": 4.378279113769532, | |
| "step": 7840 | |
| }, | |
| { | |
| "epoch": 1.326237540125021, | |
| "grad_norm": 0.48734408617019653, | |
| "learning_rate": 0.00010610505560563163, | |
| "loss": 4.365981674194336, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 1.3279270146984288, | |
| "grad_norm": 0.4985700845718384, | |
| "learning_rate": 0.00010564724975528584, | |
| "loss": 4.384627151489258, | |
| "step": 7860 | |
| }, | |
| { | |
| "epoch": 1.3296164892718365, | |
| "grad_norm": 0.48617759346961975, | |
| "learning_rate": 0.00010518989618380632, | |
| "loss": 4.387208938598633, | |
| "step": 7870 | |
| }, | |
| { | |
| "epoch": 1.331305963845244, | |
| "grad_norm": 0.479184091091156, | |
| "learning_rate": 0.00010473299955497044, | |
| "loss": 4.39497184753418, | |
| "step": 7880 | |
| }, | |
| { | |
| "epoch": 1.3329954384186518, | |
| "grad_norm": 0.5024631023406982, | |
| "learning_rate": 0.000104276564527896, | |
| "loss": 4.341180801391602, | |
| "step": 7890 | |
| }, | |
| { | |
| "epoch": 1.3346849129920595, | |
| "grad_norm": 0.5147078633308411, | |
| "learning_rate": 0.0001038205957569938, | |
| "loss": 4.36151008605957, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 1.336374387565467, | |
| "grad_norm": 0.4864480197429657, | |
| "learning_rate": 0.00010336509789191994, | |
| "loss": 4.3700817108154295, | |
| "step": 7910 | |
| }, | |
| { | |
| "epoch": 1.3380638621388747, | |
| "grad_norm": 0.48009052872657776, | |
| "learning_rate": 0.00010291007557752861, | |
| "loss": 4.372967910766602, | |
| "step": 7920 | |
| }, | |
| { | |
| "epoch": 1.3397533367122825, | |
| "grad_norm": 0.4770645499229431, | |
| "learning_rate": 0.00010245553345382467, | |
| "loss": 4.361065673828125, | |
| "step": 7930 | |
| }, | |
| { | |
| "epoch": 1.3414428112856902, | |
| "grad_norm": 0.47222378849983215, | |
| "learning_rate": 0.00010200147615591643, | |
| "loss": 4.3356986999511715, | |
| "step": 7940 | |
| }, | |
| { | |
| "epoch": 1.343132285859098, | |
| "grad_norm": 0.513080894947052, | |
| "learning_rate": 0.00010154790831396805, | |
| "loss": 4.402030181884766, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 1.3448217604325055, | |
| "grad_norm": 0.48416030406951904, | |
| "learning_rate": 0.00010109483455315269, | |
| "loss": 4.381985855102539, | |
| "step": 7960 | |
| }, | |
| { | |
| "epoch": 1.3465112350059132, | |
| "grad_norm": 0.46342408657073975, | |
| "learning_rate": 0.00010064225949360525, | |
| "loss": 4.364437103271484, | |
| "step": 7970 | |
| }, | |
| { | |
| "epoch": 1.348200709579321, | |
| "grad_norm": 0.4690420925617218, | |
| "learning_rate": 0.00010019018775037509, | |
| "loss": 4.399689102172852, | |
| "step": 7980 | |
| }, | |
| { | |
| "epoch": 1.3498901841527284, | |
| "grad_norm": 0.47876372933387756, | |
| "learning_rate": 9.973862393337925e-05, | |
| "loss": 4.388835144042969, | |
| "step": 7990 | |
| }, | |
| { | |
| "epoch": 1.3515796587261362, | |
| "grad_norm": 0.48350629210472107, | |
| "learning_rate": 9.928757264735506e-05, | |
| "loss": 4.405188751220703, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.3515796587261362, | |
| "eval_loss": 4.357097148895264, | |
| "eval_runtime": 3.734, | |
| "eval_samples_per_second": 267.812, | |
| "eval_steps_per_second": 5.624, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.353269133299544, | |
| "grad_norm": 0.463106632232666, | |
| "learning_rate": 9.883703849181374e-05, | |
| "loss": 4.368831634521484, | |
| "step": 8010 | |
| }, | |
| { | |
| "epoch": 1.3549586078729514, | |
| "grad_norm": 0.4774092137813568, | |
| "learning_rate": 9.838702606099289e-05, | |
| "loss": 4.350126647949219, | |
| "step": 8020 | |
| }, | |
| { | |
| "epoch": 1.3566480824463591, | |
| "grad_norm": 0.5083175897598267, | |
| "learning_rate": 9.793753994381003e-05, | |
| "loss": 4.375761032104492, | |
| "step": 8030 | |
| }, | |
| { | |
| "epoch": 1.3583375570197669, | |
| "grad_norm": 0.493473619222641, | |
| "learning_rate": 9.748858472381567e-05, | |
| "loss": 4.382857894897461, | |
| "step": 8040 | |
| }, | |
| { | |
| "epoch": 1.3600270315931744, | |
| "grad_norm": 0.47200217843055725, | |
| "learning_rate": 9.704016497914657e-05, | |
| "loss": 4.363901901245117, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 1.3617165061665821, | |
| "grad_norm": 0.47423017024993896, | |
| "learning_rate": 9.659228528247923e-05, | |
| "loss": 4.352508544921875, | |
| "step": 8060 | |
| }, | |
| { | |
| "epoch": 1.3634059807399899, | |
| "grad_norm": 0.49032631516456604, | |
| "learning_rate": 9.614495020098284e-05, | |
| "loss": 4.386605834960937, | |
| "step": 8070 | |
| }, | |
| { | |
| "epoch": 1.3650954553133976, | |
| "grad_norm": 0.5129415392875671, | |
| "learning_rate": 9.569816429627329e-05, | |
| "loss": 4.370170211791992, | |
| "step": 8080 | |
| }, | |
| { | |
| "epoch": 1.3667849298868053, | |
| "grad_norm": 0.47328782081604004, | |
| "learning_rate": 9.525193212436607e-05, | |
| "loss": 4.394309616088867, | |
| "step": 8090 | |
| }, | |
| { | |
| "epoch": 1.3684744044602128, | |
| "grad_norm": 0.5091307759284973, | |
| "learning_rate": 9.480625823563032e-05, | |
| "loss": 4.353821182250977, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 1.3701638790336206, | |
| "grad_norm": 0.49530673027038574, | |
| "learning_rate": 9.436114717474197e-05, | |
| "loss": 4.374178314208985, | |
| "step": 8110 | |
| }, | |
| { | |
| "epoch": 1.3718533536070283, | |
| "grad_norm": 0.5062808394432068, | |
| "learning_rate": 9.391660348063778e-05, | |
| "loss": 4.366446685791016, | |
| "step": 8120 | |
| }, | |
| { | |
| "epoch": 1.3735428281804358, | |
| "grad_norm": 0.4893403947353363, | |
| "learning_rate": 9.347263168646881e-05, | |
| "loss": 4.377128601074219, | |
| "step": 8130 | |
| }, | |
| { | |
| "epoch": 1.3752323027538436, | |
| "grad_norm": 0.49352315068244934, | |
| "learning_rate": 9.30292363195543e-05, | |
| "loss": 4.390756988525391, | |
| "step": 8140 | |
| }, | |
| { | |
| "epoch": 1.3769217773272513, | |
| "grad_norm": 0.4956866502761841, | |
| "learning_rate": 9.258642190133548e-05, | |
| "loss": 4.364201354980469, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 1.3786112519006588, | |
| "grad_norm": 0.4806705415248871, | |
| "learning_rate": 9.21441929473295e-05, | |
| "loss": 4.336410140991211, | |
| "step": 8160 | |
| }, | |
| { | |
| "epoch": 1.3803007264740665, | |
| "grad_norm": 0.503070056438446, | |
| "learning_rate": 9.170255396708336e-05, | |
| "loss": 4.363087463378906, | |
| "step": 8170 | |
| }, | |
| { | |
| "epoch": 1.3819902010474743, | |
| "grad_norm": 0.4839601218700409, | |
| "learning_rate": 9.126150946412775e-05, | |
| "loss": 4.353903961181641, | |
| "step": 8180 | |
| }, | |
| { | |
| "epoch": 1.3836796756208818, | |
| "grad_norm": 0.4867366552352905, | |
| "learning_rate": 9.082106393593153e-05, | |
| "loss": 4.347708892822266, | |
| "step": 8190 | |
| }, | |
| { | |
| "epoch": 1.3853691501942895, | |
| "grad_norm": 0.4875339865684509, | |
| "learning_rate": 9.038122187385543e-05, | |
| "loss": 4.371865844726562, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 1.3870586247676973, | |
| "grad_norm": 0.49478384852409363, | |
| "learning_rate": 8.994198776310652e-05, | |
| "loss": 4.368743133544922, | |
| "step": 8210 | |
| }, | |
| { | |
| "epoch": 1.388748099341105, | |
| "grad_norm": 0.4815446734428406, | |
| "learning_rate": 8.950336608269243e-05, | |
| "loss": 4.383320999145508, | |
| "step": 8220 | |
| }, | |
| { | |
| "epoch": 1.3904375739145125, | |
| "grad_norm": 0.4883415997028351, | |
| "learning_rate": 8.906536130537566e-05, | |
| "loss": 4.368521881103516, | |
| "step": 8230 | |
| }, | |
| { | |
| "epoch": 1.3921270484879202, | |
| "grad_norm": 0.5107654929161072, | |
| "learning_rate": 8.862797789762785e-05, | |
| "loss": 4.353972244262695, | |
| "step": 8240 | |
| }, | |
| { | |
| "epoch": 1.393816523061328, | |
| "grad_norm": 0.46853381395339966, | |
| "learning_rate": 8.819122031958446e-05, | |
| "loss": 4.374198150634766, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 1.3955059976347357, | |
| "grad_norm": 0.49264970421791077, | |
| "learning_rate": 8.77550930249991e-05, | |
| "loss": 4.353750228881836, | |
| "step": 8260 | |
| }, | |
| { | |
| "epoch": 1.3971954722081432, | |
| "grad_norm": 0.49197956919670105, | |
| "learning_rate": 8.731960046119819e-05, | |
| "loss": 4.378075408935547, | |
| "step": 8270 | |
| }, | |
| { | |
| "epoch": 1.398884946781551, | |
| "grad_norm": 0.48225274682044983, | |
| "learning_rate": 8.688474706903554e-05, | |
| "loss": 4.360022735595703, | |
| "step": 8280 | |
| }, | |
| { | |
| "epoch": 1.4005744213549587, | |
| "grad_norm": 0.4796869456768036, | |
| "learning_rate": 8.645053728284734e-05, | |
| "loss": 4.351276779174805, | |
| "step": 8290 | |
| }, | |
| { | |
| "epoch": 1.4022638959283662, | |
| "grad_norm": 0.46706125140190125, | |
| "learning_rate": 8.601697553040645e-05, | |
| "loss": 4.367401885986328, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 1.403953370501774, | |
| "grad_norm": 0.4695565104484558, | |
| "learning_rate": 8.55840662328778e-05, | |
| "loss": 4.338150405883789, | |
| "step": 8310 | |
| }, | |
| { | |
| "epoch": 1.4056428450751817, | |
| "grad_norm": 0.4987981915473938, | |
| "learning_rate": 8.515181380477273e-05, | |
| "loss": 4.369682693481446, | |
| "step": 8320 | |
| }, | |
| { | |
| "epoch": 1.4073323196485892, | |
| "grad_norm": 0.4853006899356842, | |
| "learning_rate": 8.47202226539046e-05, | |
| "loss": 4.392825698852539, | |
| "step": 8330 | |
| }, | |
| { | |
| "epoch": 1.409021794221997, | |
| "grad_norm": 0.48891976475715637, | |
| "learning_rate": 8.428929718134331e-05, | |
| "loss": 4.3820442199707035, | |
| "step": 8340 | |
| }, | |
| { | |
| "epoch": 1.4107112687954046, | |
| "grad_norm": 0.48374229669570923, | |
| "learning_rate": 8.385904178137061e-05, | |
| "loss": 4.367736053466797, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 1.4124007433688122, | |
| "grad_norm": 0.4966294765472412, | |
| "learning_rate": 8.342946084143546e-05, | |
| "loss": 4.336813354492188, | |
| "step": 8360 | |
| }, | |
| { | |
| "epoch": 1.41409021794222, | |
| "grad_norm": 0.4939606487751007, | |
| "learning_rate": 8.300055874210903e-05, | |
| "loss": 4.390798568725586, | |
| "step": 8370 | |
| }, | |
| { | |
| "epoch": 1.4157796925156276, | |
| "grad_norm": 0.48403191566467285, | |
| "learning_rate": 8.257233985704021e-05, | |
| "loss": 4.3521678924560545, | |
| "step": 8380 | |
| }, | |
| { | |
| "epoch": 1.4174691670890354, | |
| "grad_norm": 0.4766407012939453, | |
| "learning_rate": 8.214480855291084e-05, | |
| "loss": 4.337980651855469, | |
| "step": 8390 | |
| }, | |
| { | |
| "epoch": 1.419158641662443, | |
| "grad_norm": 0.469018816947937, | |
| "learning_rate": 8.171796918939142e-05, | |
| "loss": 4.341955184936523, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 1.4208481162358506, | |
| "grad_norm": 0.4855271875858307, | |
| "learning_rate": 8.129182611909642e-05, | |
| "loss": 4.353343963623047, | |
| "step": 8410 | |
| }, | |
| { | |
| "epoch": 1.4225375908092583, | |
| "grad_norm": 0.4870193898677826, | |
| "learning_rate": 8.086638368753993e-05, | |
| "loss": 4.374142074584961, | |
| "step": 8420 | |
| }, | |
| { | |
| "epoch": 1.424227065382666, | |
| "grad_norm": 0.4896891415119171, | |
| "learning_rate": 8.04416462330916e-05, | |
| "loss": 4.367203140258789, | |
| "step": 8430 | |
| }, | |
| { | |
| "epoch": 1.4259165399560736, | |
| "grad_norm": 0.46844348311424255, | |
| "learning_rate": 8.0017618086932e-05, | |
| "loss": 4.35595817565918, | |
| "step": 8440 | |
| }, | |
| { | |
| "epoch": 1.4276060145294813, | |
| "grad_norm": 0.4512944519519806, | |
| "learning_rate": 7.959430357300885e-05, | |
| "loss": 4.3400733947753904, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 1.429295489102889, | |
| "grad_norm": 0.4732443392276764, | |
| "learning_rate": 7.917170700799256e-05, | |
| "loss": 4.333652114868164, | |
| "step": 8460 | |
| }, | |
| { | |
| "epoch": 1.4309849636762966, | |
| "grad_norm": 0.4684848487377167, | |
| "learning_rate": 7.874983270123254e-05, | |
| "loss": 4.352918243408203, | |
| "step": 8470 | |
| }, | |
| { | |
| "epoch": 1.4326744382497043, | |
| "grad_norm": 0.506878137588501, | |
| "learning_rate": 7.832868495471306e-05, | |
| "loss": 4.357436752319336, | |
| "step": 8480 | |
| }, | |
| { | |
| "epoch": 1.434363912823112, | |
| "grad_norm": 0.5020336508750916, | |
| "learning_rate": 7.790826806300928e-05, | |
| "loss": 4.359925079345703, | |
| "step": 8490 | |
| }, | |
| { | |
| "epoch": 1.4360533873965196, | |
| "grad_norm": 0.4732269048690796, | |
| "learning_rate": 7.748858631324393e-05, | |
| "loss": 4.356634902954101, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 1.4360533873965196, | |
| "eval_loss": 4.3328938484191895, | |
| "eval_runtime": 3.6888, | |
| "eval_samples_per_second": 271.089, | |
| "eval_steps_per_second": 5.693, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 1.4377428619699273, | |
| "grad_norm": 0.4925293028354645, | |
| "learning_rate": 7.706964398504293e-05, | |
| "loss": 4.376210403442383, | |
| "step": 8510 | |
| }, | |
| { | |
| "epoch": 1.439432336543335, | |
| "grad_norm": 0.4719123840332031, | |
| "learning_rate": 7.665144535049224e-05, | |
| "loss": 4.338931274414063, | |
| "step": 8520 | |
| }, | |
| { | |
| "epoch": 1.4411218111167428, | |
| "grad_norm": 0.4722173511981964, | |
| "learning_rate": 7.623399467409416e-05, | |
| "loss": 4.352537536621094, | |
| "step": 8530 | |
| }, | |
| { | |
| "epoch": 1.4428112856901505, | |
| "grad_norm": 0.4844585955142975, | |
| "learning_rate": 7.581729621272386e-05, | |
| "loss": 4.332356262207031, | |
| "step": 8540 | |
| }, | |
| { | |
| "epoch": 1.444500760263558, | |
| "grad_norm": 0.49630841612815857, | |
| "learning_rate": 7.540135421558585e-05, | |
| "loss": 4.3133392333984375, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 1.4461902348369657, | |
| "grad_norm": 0.472133994102478, | |
| "learning_rate": 7.498617292417074e-05, | |
| "loss": 4.3697349548339846, | |
| "step": 8560 | |
| }, | |
| { | |
| "epoch": 1.4478797094103735, | |
| "grad_norm": 0.48327624797821045, | |
| "learning_rate": 7.457175657221194e-05, | |
| "loss": 4.366666030883789, | |
| "step": 8570 | |
| }, | |
| { | |
| "epoch": 1.449569183983781, | |
| "grad_norm": 0.4768034815788269, | |
| "learning_rate": 7.415810938564277e-05, | |
| "loss": 4.33704719543457, | |
| "step": 8580 | |
| }, | |
| { | |
| "epoch": 1.4512586585571887, | |
| "grad_norm": 0.4592680037021637, | |
| "learning_rate": 7.37452355825528e-05, | |
| "loss": 4.343940734863281, | |
| "step": 8590 | |
| }, | |
| { | |
| "epoch": 1.4529481331305965, | |
| "grad_norm": 0.4643280804157257, | |
| "learning_rate": 7.333313937314548e-05, | |
| "loss": 4.346873474121094, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 1.454637607704004, | |
| "grad_norm": 0.4980602264404297, | |
| "learning_rate": 7.292182495969462e-05, | |
| "loss": 4.370085525512695, | |
| "step": 8610 | |
| }, | |
| { | |
| "epoch": 1.4563270822774117, | |
| "grad_norm": 0.4845782518386841, | |
| "learning_rate": 7.251129653650206e-05, | |
| "loss": 4.3420463562011715, | |
| "step": 8620 | |
| }, | |
| { | |
| "epoch": 1.4580165568508194, | |
| "grad_norm": 0.47701558470726013, | |
| "learning_rate": 7.210155828985447e-05, | |
| "loss": 4.333865356445313, | |
| "step": 8630 | |
| }, | |
| { | |
| "epoch": 1.459706031424227, | |
| "grad_norm": 0.4681967794895172, | |
| "learning_rate": 7.169261439798083e-05, | |
| "loss": 4.315822982788086, | |
| "step": 8640 | |
| }, | |
| { | |
| "epoch": 1.4613955059976347, | |
| "grad_norm": 0.48438313603401184, | |
| "learning_rate": 7.128446903101004e-05, | |
| "loss": 4.31340446472168, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 1.4630849805710424, | |
| "grad_norm": 0.4675985872745514, | |
| "learning_rate": 7.087712635092802e-05, | |
| "loss": 4.347599792480469, | |
| "step": 8660 | |
| }, | |
| { | |
| "epoch": 1.4647744551444501, | |
| "grad_norm": 0.5026019215583801, | |
| "learning_rate": 7.047059051153538e-05, | |
| "loss": 4.3385356903076175, | |
| "step": 8670 | |
| }, | |
| { | |
| "epoch": 1.4664639297178579, | |
| "grad_norm": 0.4908424913883209, | |
| "learning_rate": 7.006486565840532e-05, | |
| "loss": 4.337771224975586, | |
| "step": 8680 | |
| }, | |
| { | |
| "epoch": 1.4681534042912654, | |
| "grad_norm": 0.47692814469337463, | |
| "learning_rate": 6.96599559288411e-05, | |
| "loss": 4.350002288818359, | |
| "step": 8690 | |
| }, | |
| { | |
| "epoch": 1.4698428788646731, | |
| "grad_norm": 0.4985916316509247, | |
| "learning_rate": 6.925586545183383e-05, | |
| "loss": 4.357270812988281, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 1.4715323534380809, | |
| "grad_norm": 0.4779921770095825, | |
| "learning_rate": 6.885259834802042e-05, | |
| "loss": 4.3343353271484375, | |
| "step": 8710 | |
| }, | |
| { | |
| "epoch": 1.4732218280114884, | |
| "grad_norm": 0.4964430630207062, | |
| "learning_rate": 6.845015872964179e-05, | |
| "loss": 4.345649337768554, | |
| "step": 8720 | |
| }, | |
| { | |
| "epoch": 1.4749113025848961, | |
| "grad_norm": 0.4816732108592987, | |
| "learning_rate": 6.80485507005005e-05, | |
| "loss": 4.349812316894531, | |
| "step": 8730 | |
| }, | |
| { | |
| "epoch": 1.4766007771583038, | |
| "grad_norm": 0.4839925765991211, | |
| "learning_rate": 6.764777835591921e-05, | |
| "loss": 4.342644119262696, | |
| "step": 8740 | |
| }, | |
| { | |
| "epoch": 1.4782902517317114, | |
| "grad_norm": 0.5161303877830505, | |
| "learning_rate": 6.724784578269892e-05, | |
| "loss": 4.322945022583008, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 1.479979726305119, | |
| "grad_norm": 0.4845769703388214, | |
| "learning_rate": 6.684875705907722e-05, | |
| "loss": 4.33643798828125, | |
| "step": 8760 | |
| }, | |
| { | |
| "epoch": 1.4816692008785268, | |
| "grad_norm": 0.48371464014053345, | |
| "learning_rate": 6.645051625468657e-05, | |
| "loss": 4.319810104370117, | |
| "step": 8770 | |
| }, | |
| { | |
| "epoch": 1.4833586754519343, | |
| "grad_norm": 0.4810192286968231, | |
| "learning_rate": 6.605312743051297e-05, | |
| "loss": 4.350659561157227, | |
| "step": 8780 | |
| }, | |
| { | |
| "epoch": 1.485048150025342, | |
| "grad_norm": 0.4886019825935364, | |
| "learning_rate": 6.565659463885467e-05, | |
| "loss": 4.340823364257813, | |
| "step": 8790 | |
| }, | |
| { | |
| "epoch": 1.4867376245987498, | |
| "grad_norm": 0.4922144114971161, | |
| "learning_rate": 6.526092192328048e-05, | |
| "loss": 4.337167358398437, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 1.4884270991721575, | |
| "grad_norm": 0.47720760107040405, | |
| "learning_rate": 6.486611331858879e-05, | |
| "loss": 4.330669403076172, | |
| "step": 8810 | |
| }, | |
| { | |
| "epoch": 1.490116573745565, | |
| "grad_norm": 0.45629069209098816, | |
| "learning_rate": 6.447217285076651e-05, | |
| "loss": 4.354007339477539, | |
| "step": 8820 | |
| }, | |
| { | |
| "epoch": 1.4918060483189728, | |
| "grad_norm": 0.4794461727142334, | |
| "learning_rate": 6.407910453694782e-05, | |
| "loss": 4.356667327880859, | |
| "step": 8830 | |
| }, | |
| { | |
| "epoch": 1.4934955228923805, | |
| "grad_norm": 0.4836932420730591, | |
| "learning_rate": 6.368691238537321e-05, | |
| "loss": 4.3167163848876955, | |
| "step": 8840 | |
| }, | |
| { | |
| "epoch": 1.4951849974657883, | |
| "grad_norm": 0.5060141086578369, | |
| "learning_rate": 6.329560039534874e-05, | |
| "loss": 4.362548828125, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 1.4968744720391958, | |
| "grad_norm": 0.48216700553894043, | |
| "learning_rate": 6.290517255720505e-05, | |
| "loss": 4.3512012481689455, | |
| "step": 8860 | |
| }, | |
| { | |
| "epoch": 1.4985639466126035, | |
| "grad_norm": 0.46019911766052246, | |
| "learning_rate": 6.251563285225707e-05, | |
| "loss": 4.32593002319336, | |
| "step": 8870 | |
| }, | |
| { | |
| "epoch": 1.5002534211860112, | |
| "grad_norm": 0.4773600697517395, | |
| "learning_rate": 6.212698525276294e-05, | |
| "loss": 4.345823287963867, | |
| "step": 8880 | |
| }, | |
| { | |
| "epoch": 1.5019428957594188, | |
| "grad_norm": 0.4903421401977539, | |
| "learning_rate": 6.173923372188372e-05, | |
| "loss": 4.330167770385742, | |
| "step": 8890 | |
| }, | |
| { | |
| "epoch": 1.5036323703328265, | |
| "grad_norm": 0.47027841210365295, | |
| "learning_rate": 6.135238221364313e-05, | |
| "loss": 4.352994155883789, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 1.5053218449062342, | |
| "grad_norm": 0.4893588125705719, | |
| "learning_rate": 6.096643467288703e-05, | |
| "loss": 4.3315269470214846, | |
| "step": 8910 | |
| }, | |
| { | |
| "epoch": 1.5070113194796417, | |
| "grad_norm": 0.4835808277130127, | |
| "learning_rate": 6.058139503524314e-05, | |
| "loss": 4.349056625366211, | |
| "step": 8920 | |
| }, | |
| { | |
| "epoch": 1.5087007940530495, | |
| "grad_norm": 0.4750809967517853, | |
| "learning_rate": 6.019726722708104e-05, | |
| "loss": 4.325545120239258, | |
| "step": 8930 | |
| }, | |
| { | |
| "epoch": 1.5103902686264572, | |
| "grad_norm": 0.4945700466632843, | |
| "learning_rate": 5.981405516547222e-05, | |
| "loss": 4.312815093994141, | |
| "step": 8940 | |
| }, | |
| { | |
| "epoch": 1.5120797431998647, | |
| "grad_norm": 0.4704221487045288, | |
| "learning_rate": 5.9431762758149875e-05, | |
| "loss": 4.328189849853516, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 1.5137692177732727, | |
| "grad_norm": 0.48752453923225403, | |
| "learning_rate": 5.9050393903469215e-05, | |
| "loss": 4.324124145507812, | |
| "step": 8960 | |
| }, | |
| { | |
| "epoch": 1.5154586923466802, | |
| "grad_norm": 0.5149093270301819, | |
| "learning_rate": 5.866995249036775e-05, | |
| "loss": 4.334346771240234, | |
| "step": 8970 | |
| }, | |
| { | |
| "epoch": 1.5171481669200877, | |
| "grad_norm": 0.49064958095550537, | |
| "learning_rate": 5.829044239832564e-05, | |
| "loss": 4.324323654174805, | |
| "step": 8980 | |
| }, | |
| { | |
| "epoch": 1.5188376414934956, | |
| "grad_norm": 0.486092746257782, | |
| "learning_rate": 5.791186749732594e-05, | |
| "loss": 4.346895599365235, | |
| "step": 8990 | |
| }, | |
| { | |
| "epoch": 1.5205271160669032, | |
| "grad_norm": 0.48512768745422363, | |
| "learning_rate": 5.7534231647815244e-05, | |
| "loss": 4.350548934936524, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.5205271160669032, | |
| "eval_loss": 4.312350273132324, | |
| "eval_runtime": 4.1596, | |
| "eval_samples_per_second": 240.409, | |
| "eval_steps_per_second": 5.049, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.522216590640311, | |
| "grad_norm": 0.48225322365760803, | |
| "learning_rate": 5.715753870066455e-05, | |
| "loss": 4.3221698760986325, | |
| "step": 9010 | |
| }, | |
| { | |
| "epoch": 1.5239060652137186, | |
| "grad_norm": 0.5018514394760132, | |
| "learning_rate": 5.67817924971296e-05, | |
| "loss": 4.321614456176758, | |
| "step": 9020 | |
| }, | |
| { | |
| "epoch": 1.5255955397871261, | |
| "grad_norm": 0.5176340341567993, | |
| "learning_rate": 5.6406996868811885e-05, | |
| "loss": 4.335358810424805, | |
| "step": 9030 | |
| }, | |
| { | |
| "epoch": 1.5272850143605339, | |
| "grad_norm": 0.48670732975006104, | |
| "learning_rate": 5.60331556376197e-05, | |
| "loss": 4.33309326171875, | |
| "step": 9040 | |
| }, | |
| { | |
| "epoch": 1.5289744889339416, | |
| "grad_norm": 0.49554112553596497, | |
| "learning_rate": 5.566027261572907e-05, | |
| "loss": 4.316144943237305, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 1.5306639635073491, | |
| "grad_norm": 0.5128636956214905, | |
| "learning_rate": 5.528835160554475e-05, | |
| "loss": 4.335286712646484, | |
| "step": 9060 | |
| }, | |
| { | |
| "epoch": 1.5323534380807569, | |
| "grad_norm": 0.4977918863296509, | |
| "learning_rate": 5.491739639966153e-05, | |
| "loss": 4.342447662353516, | |
| "step": 9070 | |
| }, | |
| { | |
| "epoch": 1.5340429126541646, | |
| "grad_norm": 0.5133760571479797, | |
| "learning_rate": 5.454741078082578e-05, | |
| "loss": 4.342383956909179, | |
| "step": 9080 | |
| }, | |
| { | |
| "epoch": 1.535732387227572, | |
| "grad_norm": 0.4748549461364746, | |
| "learning_rate": 5.417839852189653e-05, | |
| "loss": 4.369438171386719, | |
| "step": 9090 | |
| }, | |
| { | |
| "epoch": 1.53742186180098, | |
| "grad_norm": 0.4682454466819763, | |
| "learning_rate": 5.381036338580718e-05, | |
| "loss": 4.323982238769531, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 1.5391113363743876, | |
| "grad_norm": 0.48015832901000977, | |
| "learning_rate": 5.344330912552703e-05, | |
| "loss": 4.3247119903564455, | |
| "step": 9110 | |
| }, | |
| { | |
| "epoch": 1.540800810947795, | |
| "grad_norm": 0.4660989046096802, | |
| "learning_rate": 5.3077239484023385e-05, | |
| "loss": 4.338339614868164, | |
| "step": 9120 | |
| }, | |
| { | |
| "epoch": 1.542490285521203, | |
| "grad_norm": 0.48378968238830566, | |
| "learning_rate": 5.271215819422277e-05, | |
| "loss": 4.342069244384765, | |
| "step": 9130 | |
| }, | |
| { | |
| "epoch": 1.5441797600946106, | |
| "grad_norm": 0.4721354842185974, | |
| "learning_rate": 5.234806897897328e-05, | |
| "loss": 4.35260009765625, | |
| "step": 9140 | |
| }, | |
| { | |
| "epoch": 1.5458692346680183, | |
| "grad_norm": 0.47296905517578125, | |
| "learning_rate": 5.1984975551006434e-05, | |
| "loss": 4.3343055725097654, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 1.547558709241426, | |
| "grad_norm": 0.49029457569122314, | |
| "learning_rate": 5.1622881612899635e-05, | |
| "loss": 4.331478881835937, | |
| "step": 9160 | |
| }, | |
| { | |
| "epoch": 1.5492481838148335, | |
| "grad_norm": 0.4729316234588623, | |
| "learning_rate": 5.126179085703794e-05, | |
| "loss": 4.309265899658203, | |
| "step": 9170 | |
| }, | |
| { | |
| "epoch": 1.5509376583882413, | |
| "grad_norm": 0.4636003375053406, | |
| "learning_rate": 5.090170696557667e-05, | |
| "loss": 4.332284164428711, | |
| "step": 9180 | |
| }, | |
| { | |
| "epoch": 1.552627132961649, | |
| "grad_norm": 0.4683416783809662, | |
| "learning_rate": 5.054263361040395e-05, | |
| "loss": 4.323814392089844, | |
| "step": 9190 | |
| }, | |
| { | |
| "epoch": 1.5543166075350565, | |
| "grad_norm": 0.48071300983428955, | |
| "learning_rate": 5.018457445310313e-05, | |
| "loss": 4.331411743164063, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 1.5560060821084643, | |
| "grad_norm": 0.48741987347602844, | |
| "learning_rate": 4.9827533144915384e-05, | |
| "loss": 4.315482711791992, | |
| "step": 9210 | |
| }, | |
| { | |
| "epoch": 1.557695556681872, | |
| "grad_norm": 0.47064927220344543, | |
| "learning_rate": 4.9471513326702544e-05, | |
| "loss": 4.333251571655273, | |
| "step": 9220 | |
| }, | |
| { | |
| "epoch": 1.5593850312552795, | |
| "grad_norm": 0.48281940817832947, | |
| "learning_rate": 4.911651862891014e-05, | |
| "loss": 4.332812118530273, | |
| "step": 9230 | |
| }, | |
| { | |
| "epoch": 1.5610745058286872, | |
| "grad_norm": 0.4713364541530609, | |
| "learning_rate": 4.876255267153011e-05, | |
| "loss": 4.334049224853516, | |
| "step": 9240 | |
| }, | |
| { | |
| "epoch": 1.562763980402095, | |
| "grad_norm": 0.47604429721832275, | |
| "learning_rate": 4.8409619064063965e-05, | |
| "loss": 4.322870254516602, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 1.5644534549755025, | |
| "grad_norm": 0.4858945608139038, | |
| "learning_rate": 4.805772140548613e-05, | |
| "loss": 4.333293914794922, | |
| "step": 9260 | |
| }, | |
| { | |
| "epoch": 1.5661429295489104, | |
| "grad_norm": 0.4747396409511566, | |
| "learning_rate": 4.770686328420713e-05, | |
| "loss": 4.309678649902343, | |
| "step": 9270 | |
| }, | |
| { | |
| "epoch": 1.567832404122318, | |
| "grad_norm": 0.46066993474960327, | |
| "learning_rate": 4.7357048278036944e-05, | |
| "loss": 4.335137176513672, | |
| "step": 9280 | |
| }, | |
| { | |
| "epoch": 1.5695218786957257, | |
| "grad_norm": 0.48828113079071045, | |
| "learning_rate": 4.700827995414853e-05, | |
| "loss": 4.319439315795899, | |
| "step": 9290 | |
| }, | |
| { | |
| "epoch": 1.5712113532691334, | |
| "grad_norm": 0.48410648107528687, | |
| "learning_rate": 4.666056186904168e-05, | |
| "loss": 4.3514057159423825, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 1.572900827842541, | |
| "grad_norm": 0.4797396957874298, | |
| "learning_rate": 4.63138975685064e-05, | |
| "loss": 4.323817443847656, | |
| "step": 9310 | |
| }, | |
| { | |
| "epoch": 1.5745903024159487, | |
| "grad_norm": 0.46725404262542725, | |
| "learning_rate": 4.596829058758694e-05, | |
| "loss": 4.341088104248047, | |
| "step": 9320 | |
| }, | |
| { | |
| "epoch": 1.5762797769893564, | |
| "grad_norm": 0.4728842079639435, | |
| "learning_rate": 4.5623744450545846e-05, | |
| "loss": 4.356753540039063, | |
| "step": 9330 | |
| }, | |
| { | |
| "epoch": 1.577969251562764, | |
| "grad_norm": 0.4818381071090698, | |
| "learning_rate": 4.528026267082786e-05, | |
| "loss": 4.344687652587891, | |
| "step": 9340 | |
| }, | |
| { | |
| "epoch": 1.5796587261361716, | |
| "grad_norm": 0.47536230087280273, | |
| "learning_rate": 4.493784875102409e-05, | |
| "loss": 4.327443695068359, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 1.5813482007095794, | |
| "grad_norm": 0.5105261206626892, | |
| "learning_rate": 4.45965061828363e-05, | |
| "loss": 4.336804962158203, | |
| "step": 9360 | |
| }, | |
| { | |
| "epoch": 1.583037675282987, | |
| "grad_norm": 0.46770450472831726, | |
| "learning_rate": 4.4256238447041556e-05, | |
| "loss": 4.3366447448730465, | |
| "step": 9370 | |
| }, | |
| { | |
| "epoch": 1.5847271498563946, | |
| "grad_norm": 0.508904218673706, | |
| "learning_rate": 4.39170490134563e-05, | |
| "loss": 4.325738143920899, | |
| "step": 9380 | |
| }, | |
| { | |
| "epoch": 1.5864166244298024, | |
| "grad_norm": 0.46618375182151794, | |
| "learning_rate": 4.3578941340901274e-05, | |
| "loss": 4.311971282958984, | |
| "step": 9390 | |
| }, | |
| { | |
| "epoch": 1.5881060990032099, | |
| "grad_norm": 0.4693259596824646, | |
| "learning_rate": 4.324191887716612e-05, | |
| "loss": 4.320106124877929, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 1.5897955735766178, | |
| "grad_norm": 0.4754733145236969, | |
| "learning_rate": 4.290598505897439e-05, | |
| "loss": 4.334828948974609, | |
| "step": 9410 | |
| }, | |
| { | |
| "epoch": 1.5914850481500253, | |
| "grad_norm": 0.4678189158439636, | |
| "learning_rate": 4.25711433119483e-05, | |
| "loss": 4.344146347045898, | |
| "step": 9420 | |
| }, | |
| { | |
| "epoch": 1.5931745227234329, | |
| "grad_norm": 0.48414650559425354, | |
| "learning_rate": 4.223739705057384e-05, | |
| "loss": 4.333245849609375, | |
| "step": 9430 | |
| }, | |
| { | |
| "epoch": 1.5948639972968408, | |
| "grad_norm": 0.4806137979030609, | |
| "learning_rate": 4.1904749678165965e-05, | |
| "loss": 4.320773315429688, | |
| "step": 9440 | |
| }, | |
| { | |
| "epoch": 1.5965534718702483, | |
| "grad_norm": 0.46673110127449036, | |
| "learning_rate": 4.157320458683409e-05, | |
| "loss": 4.282149887084961, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 1.598242946443656, | |
| "grad_norm": 0.48379746079444885, | |
| "learning_rate": 4.124276515744713e-05, | |
| "loss": 4.316770935058594, | |
| "step": 9460 | |
| }, | |
| { | |
| "epoch": 1.5999324210170638, | |
| "grad_norm": 0.467012882232666, | |
| "learning_rate": 4.091343475959928e-05, | |
| "loss": 4.311006164550781, | |
| "step": 9470 | |
| }, | |
| { | |
| "epoch": 1.6016218955904713, | |
| "grad_norm": 0.45999497175216675, | |
| "learning_rate": 4.058521675157563e-05, | |
| "loss": 4.31392822265625, | |
| "step": 9480 | |
| }, | |
| { | |
| "epoch": 1.603311370163879, | |
| "grad_norm": 0.49554598331451416, | |
| "learning_rate": 4.025811448031792e-05, | |
| "loss": 4.317913818359375, | |
| "step": 9490 | |
| }, | |
| { | |
| "epoch": 1.6050008447372868, | |
| "grad_norm": 0.4512559771537781, | |
| "learning_rate": 3.993213128139027e-05, | |
| "loss": 4.320844650268555, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 1.6050008447372868, | |
| "eval_loss": 4.294473648071289, | |
| "eval_runtime": 7.7128, | |
| "eval_samples_per_second": 129.654, | |
| "eval_steps_per_second": 2.723, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 1.6066903193106943, | |
| "grad_norm": 0.4606820344924927, | |
| "learning_rate": 3.960727047894527e-05, | |
| "loss": 4.359199523925781, | |
| "step": 9510 | |
| }, | |
| { | |
| "epoch": 1.608379793884102, | |
| "grad_norm": 0.48804476857185364, | |
| "learning_rate": 3.928353538569023e-05, | |
| "loss": 4.32340087890625, | |
| "step": 9520 | |
| }, | |
| { | |
| "epoch": 1.6100692684575098, | |
| "grad_norm": 0.4648666977882385, | |
| "learning_rate": 3.8960929302853074e-05, | |
| "loss": 4.31898078918457, | |
| "step": 9530 | |
| }, | |
| { | |
| "epoch": 1.6117587430309173, | |
| "grad_norm": 0.48212724924087524, | |
| "learning_rate": 3.863945552014892e-05, | |
| "loss": 4.320017242431641, | |
| "step": 9540 | |
| }, | |
| { | |
| "epoch": 1.6134482176043252, | |
| "grad_norm": 0.46979817748069763, | |
| "learning_rate": 3.831911731574648e-05, | |
| "loss": 4.365304946899414, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 1.6151376921777327, | |
| "grad_norm": 0.47188496589660645, | |
| "learning_rate": 3.799991795623471e-05, | |
| "loss": 4.329359817504883, | |
| "step": 9560 | |
| }, | |
| { | |
| "epoch": 1.6168271667511402, | |
| "grad_norm": 0.47442197799682617, | |
| "learning_rate": 3.7681860696589216e-05, | |
| "loss": 4.333200836181641, | |
| "step": 9570 | |
| }, | |
| { | |
| "epoch": 1.6185166413245482, | |
| "grad_norm": 0.46460849046707153, | |
| "learning_rate": 3.7364948780139344e-05, | |
| "loss": 4.2955772399902346, | |
| "step": 9580 | |
| }, | |
| { | |
| "epoch": 1.6202061158979557, | |
| "grad_norm": 0.4687038064002991, | |
| "learning_rate": 3.70491854385351e-05, | |
| "loss": 4.287596893310547, | |
| "step": 9590 | |
| }, | |
| { | |
| "epoch": 1.6218955904713634, | |
| "grad_norm": 0.4717998802661896, | |
| "learning_rate": 3.673457389171401e-05, | |
| "loss": 4.3026374816894535, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 1.6235850650447712, | |
| "grad_norm": 0.47237226366996765, | |
| "learning_rate": 3.642111734786833e-05, | |
| "loss": 4.3385662078857425, | |
| "step": 9610 | |
| }, | |
| { | |
| "epoch": 1.6252745396181787, | |
| "grad_norm": 0.48337623476982117, | |
| "learning_rate": 3.610881900341261e-05, | |
| "loss": 4.29266357421875, | |
| "step": 9620 | |
| }, | |
| { | |
| "epoch": 1.6269640141915864, | |
| "grad_norm": 0.46639102697372437, | |
| "learning_rate": 3.579768204295063e-05, | |
| "loss": 4.3327476501464846, | |
| "step": 9630 | |
| }, | |
| { | |
| "epoch": 1.6286534887649942, | |
| "grad_norm": 0.4697898030281067, | |
| "learning_rate": 3.54877096392434e-05, | |
| "loss": 4.336753463745117, | |
| "step": 9640 | |
| }, | |
| { | |
| "epoch": 1.6303429633384017, | |
| "grad_norm": 0.46316251158714294, | |
| "learning_rate": 3.5178904953176354e-05, | |
| "loss": 4.306925964355469, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 1.6320324379118094, | |
| "grad_norm": 0.4708452820777893, | |
| "learning_rate": 3.487127113372755e-05, | |
| "loss": 4.326674270629883, | |
| "step": 9660 | |
| }, | |
| { | |
| "epoch": 1.6337219124852171, | |
| "grad_norm": 0.4727766811847687, | |
| "learning_rate": 3.4564811317935235e-05, | |
| "loss": 4.304772186279297, | |
| "step": 9670 | |
| }, | |
| { | |
| "epoch": 1.6354113870586247, | |
| "grad_norm": 0.47584787011146545, | |
| "learning_rate": 3.4259528630865995e-05, | |
| "loss": 4.3285400390625, | |
| "step": 9680 | |
| }, | |
| { | |
| "epoch": 1.6371008616320324, | |
| "grad_norm": 0.4718579947948456, | |
| "learning_rate": 3.3955426185582826e-05, | |
| "loss": 4.310879135131836, | |
| "step": 9690 | |
| }, | |
| { | |
| "epoch": 1.6387903362054401, | |
| "grad_norm": 0.466880738735199, | |
| "learning_rate": 3.365250708311352e-05, | |
| "loss": 4.325877380371094, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 1.6404798107788476, | |
| "grad_norm": 0.46377378702163696, | |
| "learning_rate": 3.335077441241895e-05, | |
| "loss": 4.307848358154297, | |
| "step": 9710 | |
| }, | |
| { | |
| "epoch": 1.6421692853522556, | |
| "grad_norm": 0.718170166015625, | |
| "learning_rate": 3.305023125036148e-05, | |
| "loss": 4.313734436035157, | |
| "step": 9720 | |
| }, | |
| { | |
| "epoch": 1.643858759925663, | |
| "grad_norm": 0.463375985622406, | |
| "learning_rate": 3.275088066167369e-05, | |
| "loss": 4.3089752197265625, | |
| "step": 9730 | |
| }, | |
| { | |
| "epoch": 1.6455482344990708, | |
| "grad_norm": 0.47580841183662415, | |
| "learning_rate": 3.245272569892727e-05, | |
| "loss": 4.3522186279296875, | |
| "step": 9740 | |
| }, | |
| { | |
| "epoch": 1.6472377090724786, | |
| "grad_norm": 0.46081092953681946, | |
| "learning_rate": 3.215576940250155e-05, | |
| "loss": 4.3113548278808596, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 1.648927183645886, | |
| "grad_norm": 0.47329118847846985, | |
| "learning_rate": 3.1860014800552734e-05, | |
| "loss": 4.3111930847167965, | |
| "step": 9760 | |
| }, | |
| { | |
| "epoch": 1.6506166582192938, | |
| "grad_norm": 0.4813630282878876, | |
| "learning_rate": 3.15654649089831e-05, | |
| "loss": 4.312236404418945, | |
| "step": 9770 | |
| }, | |
| { | |
| "epoch": 1.6523061327927016, | |
| "grad_norm": 0.5134222507476807, | |
| "learning_rate": 3.1272122731409916e-05, | |
| "loss": 4.3267356872558596, | |
| "step": 9780 | |
| }, | |
| { | |
| "epoch": 1.653995607366109, | |
| "grad_norm": 0.4687715768814087, | |
| "learning_rate": 3.097999125913518e-05, | |
| "loss": 4.311066055297852, | |
| "step": 9790 | |
| }, | |
| { | |
| "epoch": 1.6556850819395168, | |
| "grad_norm": 0.4736403524875641, | |
| "learning_rate": 3.068907347111485e-05, | |
| "loss": 4.3107654571533205, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 1.6573745565129245, | |
| "grad_norm": 0.4813496172428131, | |
| "learning_rate": 3.0399372333928644e-05, | |
| "loss": 4.314376449584961, | |
| "step": 9810 | |
| }, | |
| { | |
| "epoch": 1.659064031086332, | |
| "grad_norm": 0.49036741256713867, | |
| "learning_rate": 3.0110890801749627e-05, | |
| "loss": 4.307826995849609, | |
| "step": 9820 | |
| }, | |
| { | |
| "epoch": 1.6607535056597398, | |
| "grad_norm": 0.4669703543186188, | |
| "learning_rate": 2.982363181631418e-05, | |
| "loss": 4.303530883789063, | |
| "step": 9830 | |
| }, | |
| { | |
| "epoch": 1.6624429802331475, | |
| "grad_norm": 0.4788713753223419, | |
| "learning_rate": 2.9537598306892103e-05, | |
| "loss": 4.308844375610351, | |
| "step": 9840 | |
| }, | |
| { | |
| "epoch": 1.664132454806555, | |
| "grad_norm": 0.5307414531707764, | |
| "learning_rate": 2.9252793190256447e-05, | |
| "loss": 4.285565567016602, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 1.665821929379963, | |
| "grad_norm": 0.4659578502178192, | |
| "learning_rate": 2.896921937065419e-05, | |
| "loss": 4.313910675048828, | |
| "step": 9860 | |
| }, | |
| { | |
| "epoch": 1.6675114039533705, | |
| "grad_norm": 0.46300381422042847, | |
| "learning_rate": 2.8686879739776137e-05, | |
| "loss": 4.31811408996582, | |
| "step": 9870 | |
| }, | |
| { | |
| "epoch": 1.669200878526778, | |
| "grad_norm": 0.4717971086502075, | |
| "learning_rate": 2.8405777176727924e-05, | |
| "loss": 4.318044662475586, | |
| "step": 9880 | |
| }, | |
| { | |
| "epoch": 1.670890353100186, | |
| "grad_norm": 0.45347994565963745, | |
| "learning_rate": 2.8125914548000243e-05, | |
| "loss": 4.295824432373047, | |
| "step": 9890 | |
| }, | |
| { | |
| "epoch": 1.6725798276735935, | |
| "grad_norm": 0.4703952670097351, | |
| "learning_rate": 2.7847294707439828e-05, | |
| "loss": 4.28874626159668, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 1.6742693022470012, | |
| "grad_norm": 0.4726548194885254, | |
| "learning_rate": 2.7569920496220398e-05, | |
| "loss": 4.304931259155273, | |
| "step": 9910 | |
| }, | |
| { | |
| "epoch": 1.675958776820409, | |
| "grad_norm": 0.47394225001335144, | |
| "learning_rate": 2.729379474281352e-05, | |
| "loss": 4.3050182342529295, | |
| "step": 9920 | |
| }, | |
| { | |
| "epoch": 1.6776482513938165, | |
| "grad_norm": 0.49833500385284424, | |
| "learning_rate": 2.701892026295979e-05, | |
| "loss": 4.331858062744141, | |
| "step": 9930 | |
| }, | |
| { | |
| "epoch": 1.6793377259672242, | |
| "grad_norm": 0.4709710478782654, | |
| "learning_rate": 2.6745299859640318e-05, | |
| "loss": 4.332807159423828, | |
| "step": 9940 | |
| }, | |
| { | |
| "epoch": 1.681027200540632, | |
| "grad_norm": 0.48379939794540405, | |
| "learning_rate": 2.6472936323047972e-05, | |
| "loss": 4.311476516723633, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 1.6827166751140394, | |
| "grad_norm": 0.475941926240921, | |
| "learning_rate": 2.6201832430558866e-05, | |
| "loss": 4.314311599731445, | |
| "step": 9960 | |
| }, | |
| { | |
| "epoch": 1.6844061496874472, | |
| "grad_norm": 0.4633561372756958, | |
| "learning_rate": 2.5931990946704206e-05, | |
| "loss": 4.312783050537109, | |
| "step": 9970 | |
| }, | |
| { | |
| "epoch": 1.686095624260855, | |
| "grad_norm": 0.4624374806880951, | |
| "learning_rate": 2.5663414623141943e-05, | |
| "loss": 4.315936279296875, | |
| "step": 9980 | |
| }, | |
| { | |
| "epoch": 1.6877850988342624, | |
| "grad_norm": 0.46104687452316284, | |
| "learning_rate": 2.5396106198628947e-05, | |
| "loss": 4.317576217651367, | |
| "step": 9990 | |
| }, | |
| { | |
| "epoch": 1.6894745734076704, | |
| "grad_norm": 0.46486878395080566, | |
| "learning_rate": 2.5130068398992716e-05, | |
| "loss": 4.3148681640625, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.6894745734076704, | |
| "eval_loss": 4.282918930053711, | |
| "eval_runtime": 3.8826, | |
| "eval_samples_per_second": 257.563, | |
| "eval_steps_per_second": 5.409, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.691164047981078, | |
| "grad_norm": 0.4754472076892853, | |
| "learning_rate": 2.4865303937104007e-05, | |
| "loss": 4.285601425170898, | |
| "step": 10010 | |
| }, | |
| { | |
| "epoch": 1.6928535225544854, | |
| "grad_norm": 0.4589325785636902, | |
| "learning_rate": 2.460181551284876e-05, | |
| "loss": 4.316444396972656, | |
| "step": 10020 | |
| }, | |
| { | |
| "epoch": 1.6945429971278934, | |
| "grad_norm": 0.4686416685581207, | |
| "learning_rate": 2.433960581310091e-05, | |
| "loss": 4.296805191040039, | |
| "step": 10030 | |
| }, | |
| { | |
| "epoch": 1.6962324717013009, | |
| "grad_norm": 0.44992297887802124, | |
| "learning_rate": 2.4078677511694776e-05, | |
| "loss": 4.326528930664063, | |
| "step": 10040 | |
| }, | |
| { | |
| "epoch": 1.6979219462747086, | |
| "grad_norm": 0.460001677274704, | |
| "learning_rate": 2.381903326939777e-05, | |
| "loss": 4.270325088500977, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 1.6996114208481163, | |
| "grad_norm": 0.45742112398147583, | |
| "learning_rate": 2.356067573388355e-05, | |
| "loss": 4.311310958862305, | |
| "step": 10060 | |
| }, | |
| { | |
| "epoch": 1.7013008954215239, | |
| "grad_norm": 0.5051885843276978, | |
| "learning_rate": 2.3303607539704628e-05, | |
| "loss": 4.305488586425781, | |
| "step": 10070 | |
| }, | |
| { | |
| "epoch": 1.7029903699949316, | |
| "grad_norm": 0.460809588432312, | |
| "learning_rate": 2.3047831308265845e-05, | |
| "loss": 4.284737777709961, | |
| "step": 10080 | |
| }, | |
| { | |
| "epoch": 1.7046798445683393, | |
| "grad_norm": 0.48899003863334656, | |
| "learning_rate": 2.2793349647797372e-05, | |
| "loss": 4.308148956298828, | |
| "step": 10090 | |
| }, | |
| { | |
| "epoch": 1.7063693191417468, | |
| "grad_norm": 0.47210270166397095, | |
| "learning_rate": 2.2540165153328345e-05, | |
| "loss": 4.300167465209961, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 1.7080587937151546, | |
| "grad_norm": 0.4561355710029602, | |
| "learning_rate": 2.2288280406660237e-05, | |
| "loss": 4.295189285278321, | |
| "step": 10110 | |
| }, | |
| { | |
| "epoch": 1.7097482682885623, | |
| "grad_norm": 0.4685342013835907, | |
| "learning_rate": 2.2037697976340525e-05, | |
| "loss": 4.32569465637207, | |
| "step": 10120 | |
| }, | |
| { | |
| "epoch": 1.7114377428619698, | |
| "grad_norm": 0.4782038629055023, | |
| "learning_rate": 2.1788420417636704e-05, | |
| "loss": 4.281776046752929, | |
| "step": 10130 | |
| }, | |
| { | |
| "epoch": 1.7131272174353775, | |
| "grad_norm": 0.45496320724487305, | |
| "learning_rate": 2.1540450272509986e-05, | |
| "loss": 4.289628219604492, | |
| "step": 10140 | |
| }, | |
| { | |
| "epoch": 1.7148166920087853, | |
| "grad_norm": 0.4686676263809204, | |
| "learning_rate": 2.129379006958944e-05, | |
| "loss": 4.304840087890625, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 1.7165061665821928, | |
| "grad_norm": 0.45078393816947937, | |
| "learning_rate": 2.104844232414634e-05, | |
| "loss": 4.333132934570313, | |
| "step": 10160 | |
| }, | |
| { | |
| "epoch": 1.7181956411556008, | |
| "grad_norm": 0.4551495313644409, | |
| "learning_rate": 2.080440953806844e-05, | |
| "loss": 4.313465118408203, | |
| "step": 10170 | |
| }, | |
| { | |
| "epoch": 1.7198851157290083, | |
| "grad_norm": 0.46409592032432556, | |
| "learning_rate": 2.056169419983432e-05, | |
| "loss": 4.303678131103515, | |
| "step": 10180 | |
| }, | |
| { | |
| "epoch": 1.721574590302416, | |
| "grad_norm": 0.46051809191703796, | |
| "learning_rate": 2.0320298784488177e-05, | |
| "loss": 4.297393798828125, | |
| "step": 10190 | |
| }, | |
| { | |
| "epoch": 1.7232640648758237, | |
| "grad_norm": 0.541107714176178, | |
| "learning_rate": 2.008022575361464e-05, | |
| "loss": 4.302070617675781, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 1.7249535394492312, | |
| "grad_norm": 0.46840059757232666, | |
| "learning_rate": 1.9841477555313428e-05, | |
| "loss": 4.290169143676758, | |
| "step": 10210 | |
| }, | |
| { | |
| "epoch": 1.726643014022639, | |
| "grad_norm": 0.46939900517463684, | |
| "learning_rate": 1.960405662417458e-05, | |
| "loss": 4.315706634521485, | |
| "step": 10220 | |
| }, | |
| { | |
| "epoch": 1.7283324885960467, | |
| "grad_norm": 0.4771457016468048, | |
| "learning_rate": 1.9367965381253632e-05, | |
| "loss": 4.289263534545898, | |
| "step": 10230 | |
| }, | |
| { | |
| "epoch": 1.7300219631694542, | |
| "grad_norm": 0.48085805773735046, | |
| "learning_rate": 1.9133206234046833e-05, | |
| "loss": 4.3228507995605465, | |
| "step": 10240 | |
| }, | |
| { | |
| "epoch": 1.731711437742862, | |
| "grad_norm": 0.4604587256908417, | |
| "learning_rate": 1.8899781576466605e-05, | |
| "loss": 4.296081924438477, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 1.7334009123162697, | |
| "grad_norm": 0.4865635633468628, | |
| "learning_rate": 1.86676937888172e-05, | |
| "loss": 4.302744674682617, | |
| "step": 10260 | |
| }, | |
| { | |
| "epoch": 1.7350903868896772, | |
| "grad_norm": 0.4594942033290863, | |
| "learning_rate": 1.8436945237770347e-05, | |
| "loss": 4.3057910919189455, | |
| "step": 10270 | |
| }, | |
| { | |
| "epoch": 1.736779861463085, | |
| "grad_norm": 0.4511856734752655, | |
| "learning_rate": 1.8207538276341255e-05, | |
| "loss": 4.311210632324219, | |
| "step": 10280 | |
| }, | |
| { | |
| "epoch": 1.7384693360364927, | |
| "grad_norm": 0.46823564171791077, | |
| "learning_rate": 1.7979475243864422e-05, | |
| "loss": 4.291423797607422, | |
| "step": 10290 | |
| }, | |
| { | |
| "epoch": 1.7401588106099002, | |
| "grad_norm": 0.456841379404068, | |
| "learning_rate": 1.7752758465969835e-05, | |
| "loss": 4.291481781005859, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 1.7418482851833081, | |
| "grad_norm": 0.464433491230011, | |
| "learning_rate": 1.7527390254559564e-05, | |
| "loss": 4.306121826171875, | |
| "step": 10310 | |
| }, | |
| { | |
| "epoch": 1.7435377597567157, | |
| "grad_norm": 0.43991556763648987, | |
| "learning_rate": 1.7303372907783646e-05, | |
| "loss": 4.288319778442383, | |
| "step": 10320 | |
| }, | |
| { | |
| "epoch": 1.7452272343301232, | |
| "grad_norm": 0.4612221121788025, | |
| "learning_rate": 1.708070871001704e-05, | |
| "loss": 4.296160125732422, | |
| "step": 10330 | |
| }, | |
| { | |
| "epoch": 1.7469167089035311, | |
| "grad_norm": 0.4536151587963104, | |
| "learning_rate": 1.6859399931836182e-05, | |
| "loss": 4.302063751220703, | |
| "step": 10340 | |
| }, | |
| { | |
| "epoch": 1.7486061834769386, | |
| "grad_norm": 0.47430509328842163, | |
| "learning_rate": 1.663944882999596e-05, | |
| "loss": 4.320109176635742, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 1.7502956580503464, | |
| "grad_norm": 0.4570671021938324, | |
| "learning_rate": 1.6420857647406533e-05, | |
| "loss": 4.309846115112305, | |
| "step": 10360 | |
| }, | |
| { | |
| "epoch": 1.751985132623754, | |
| "grad_norm": 0.46541541814804077, | |
| "learning_rate": 1.6203628613110513e-05, | |
| "loss": 4.321808242797852, | |
| "step": 10370 | |
| }, | |
| { | |
| "epoch": 1.7536746071971616, | |
| "grad_norm": 0.4689694941043854, | |
| "learning_rate": 1.598776394226035e-05, | |
| "loss": 4.342444992065429, | |
| "step": 10380 | |
| }, | |
| { | |
| "epoch": 1.7553640817705694, | |
| "grad_norm": 0.4656012952327728, | |
| "learning_rate": 1.5773265836095615e-05, | |
| "loss": 4.284120178222656, | |
| "step": 10390 | |
| }, | |
| { | |
| "epoch": 1.757053556343977, | |
| "grad_norm": 0.44993332028388977, | |
| "learning_rate": 1.5560136481920583e-05, | |
| "loss": 4.305658340454102, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 1.7587430309173846, | |
| "grad_norm": 0.45734935998916626, | |
| "learning_rate": 1.5348378053081885e-05, | |
| "loss": 4.28479232788086, | |
| "step": 10410 | |
| }, | |
| { | |
| "epoch": 1.7604325054907923, | |
| "grad_norm": 0.48875826597213745, | |
| "learning_rate": 1.5137992708946522e-05, | |
| "loss": 4.30067024230957, | |
| "step": 10420 | |
| }, | |
| { | |
| "epoch": 1.7621219800642, | |
| "grad_norm": 0.4599165618419647, | |
| "learning_rate": 1.4928982594879602e-05, | |
| "loss": 4.302487564086914, | |
| "step": 10430 | |
| }, | |
| { | |
| "epoch": 1.7638114546376076, | |
| "grad_norm": 0.45845454931259155, | |
| "learning_rate": 1.4721349842222623e-05, | |
| "loss": 4.285428619384765, | |
| "step": 10440 | |
| }, | |
| { | |
| "epoch": 1.7655009292110155, | |
| "grad_norm": 0.4705585539340973, | |
| "learning_rate": 1.4515096568271728e-05, | |
| "loss": 4.30066032409668, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 1.767190403784423, | |
| "grad_norm": 0.47239530086517334, | |
| "learning_rate": 1.4310224876256071e-05, | |
| "loss": 4.3203174591064455, | |
| "step": 10460 | |
| }, | |
| { | |
| "epoch": 1.7688798783578306, | |
| "grad_norm": 0.4652308225631714, | |
| "learning_rate": 1.410673685531638e-05, | |
| "loss": 4.307133483886719, | |
| "step": 10470 | |
| }, | |
| { | |
| "epoch": 1.7705693529312385, | |
| "grad_norm": 0.4624398946762085, | |
| "learning_rate": 1.390463458048357e-05, | |
| "loss": 4.315113830566406, | |
| "step": 10480 | |
| }, | |
| { | |
| "epoch": 1.772258827504646, | |
| "grad_norm": 0.46324899792671204, | |
| "learning_rate": 1.3703920112657856e-05, | |
| "loss": 4.304290771484375, | |
| "step": 10490 | |
| }, | |
| { | |
| "epoch": 1.7739483020780538, | |
| "grad_norm": 0.4670204222202301, | |
| "learning_rate": 1.3504595498587378e-05, | |
| "loss": 4.301520919799804, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 1.7739483020780538, | |
| "eval_loss": 4.273873805999756, | |
| "eval_runtime": 4.0508, | |
| "eval_samples_per_second": 246.863, | |
| "eval_steps_per_second": 5.184, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 1.7756377766514615, | |
| "grad_norm": 0.45778968930244446, | |
| "learning_rate": 1.330666277084756e-05, | |
| "loss": 4.324074172973633, | |
| "step": 10510 | |
| }, | |
| { | |
| "epoch": 1.777327251224869, | |
| "grad_norm": 0.44530189037323, | |
| "learning_rate": 1.3110123947820345e-05, | |
| "loss": 4.296671295166016, | |
| "step": 10520 | |
| }, | |
| { | |
| "epoch": 1.7790167257982767, | |
| "grad_norm": 0.4516686797142029, | |
| "learning_rate": 1.2914981033673616e-05, | |
| "loss": 4.3019359588623045, | |
| "step": 10530 | |
| }, | |
| { | |
| "epoch": 1.7807062003716845, | |
| "grad_norm": 0.4497534930706024, | |
| "learning_rate": 1.2721236018340675e-05, | |
| "loss": 4.252984237670899, | |
| "step": 10540 | |
| }, | |
| { | |
| "epoch": 1.782395674945092, | |
| "grad_norm": 0.4479978680610657, | |
| "learning_rate": 1.2528890877500025e-05, | |
| "loss": 4.305055618286133, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 1.7840851495184997, | |
| "grad_norm": 0.462827205657959, | |
| "learning_rate": 1.2337947572555257e-05, | |
| "loss": 4.314754486083984, | |
| "step": 10560 | |
| }, | |
| { | |
| "epoch": 1.7857746240919075, | |
| "grad_norm": 0.4561219811439514, | |
| "learning_rate": 1.2148408050614961e-05, | |
| "loss": 4.2755790710449215, | |
| "step": 10570 | |
| }, | |
| { | |
| "epoch": 1.787464098665315, | |
| "grad_norm": 0.4636087119579315, | |
| "learning_rate": 1.1960274244472928e-05, | |
| "loss": 4.280724716186524, | |
| "step": 10580 | |
| }, | |
| { | |
| "epoch": 1.7891535732387227, | |
| "grad_norm": 0.4560607373714447, | |
| "learning_rate": 1.1773548072588352e-05, | |
| "loss": 4.296182632446289, | |
| "step": 10590 | |
| }, | |
| { | |
| "epoch": 1.7908430478121304, | |
| "grad_norm": 0.46516045928001404, | |
| "learning_rate": 1.158823143906652e-05, | |
| "loss": 4.301852416992188, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 1.792532522385538, | |
| "grad_norm": 0.4671533703804016, | |
| "learning_rate": 1.1404326233639056e-05, | |
| "loss": 4.321551132202148, | |
| "step": 10610 | |
| }, | |
| { | |
| "epoch": 1.794221996958946, | |
| "grad_norm": 0.46711355447769165, | |
| "learning_rate": 1.1221834331644857e-05, | |
| "loss": 4.292984390258789, | |
| "step": 10620 | |
| }, | |
| { | |
| "epoch": 1.7959114715323534, | |
| "grad_norm": 0.46830058097839355, | |
| "learning_rate": 1.1040757594010908e-05, | |
| "loss": 4.294471740722656, | |
| "step": 10630 | |
| }, | |
| { | |
| "epoch": 1.7976009461057612, | |
| "grad_norm": 0.45422518253326416, | |
| "learning_rate": 1.0861097867233375e-05, | |
| "loss": 4.302399444580078, | |
| "step": 10640 | |
| }, | |
| { | |
| "epoch": 1.799290420679169, | |
| "grad_norm": 0.46243947744369507, | |
| "learning_rate": 1.0682856983358645e-05, | |
| "loss": 4.300415420532227, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 1.8009798952525764, | |
| "grad_norm": 0.45327311754226685, | |
| "learning_rate": 1.050603675996477e-05, | |
| "loss": 4.294659042358399, | |
| "step": 10660 | |
| }, | |
| { | |
| "epoch": 1.8026693698259841, | |
| "grad_norm": 0.4481427073478699, | |
| "learning_rate": 1.0330639000142877e-05, | |
| "loss": 4.29761962890625, | |
| "step": 10670 | |
| }, | |
| { | |
| "epoch": 1.8043588443993919, | |
| "grad_norm": 0.45235884189605713, | |
| "learning_rate": 1.0156665492478794e-05, | |
| "loss": 4.2950092315673825, | |
| "step": 10680 | |
| }, | |
| { | |
| "epoch": 1.8060483189727994, | |
| "grad_norm": 0.4574648439884186, | |
| "learning_rate": 9.984118011034787e-06, | |
| "loss": 4.296451187133789, | |
| "step": 10690 | |
| }, | |
| { | |
| "epoch": 1.8077377935462071, | |
| "grad_norm": 0.44989126920700073, | |
| "learning_rate": 9.812998315331449e-06, | |
| "loss": 4.295338821411133, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 1.8094272681196149, | |
| "grad_norm": 0.44825267791748047, | |
| "learning_rate": 9.64330815032991e-06, | |
| "loss": 4.29632682800293, | |
| "step": 10710 | |
| }, | |
| { | |
| "epoch": 1.8111167426930224, | |
| "grad_norm": 0.44391629099845886, | |
| "learning_rate": 9.475049246413801e-06, | |
| "loss": 4.282930374145508, | |
| "step": 10720 | |
| }, | |
| { | |
| "epoch": 1.81280621726643, | |
| "grad_norm": 0.4501837491989136, | |
| "learning_rate": 9.308223319371789e-06, | |
| "loss": 4.3113666534423825, | |
| "step": 10730 | |
| }, | |
| { | |
| "epoch": 1.8144956918398378, | |
| "grad_norm": 0.45159661769866943, | |
| "learning_rate": 9.142832070380051e-06, | |
| "loss": 4.275300979614258, | |
| "step": 10740 | |
| }, | |
| { | |
| "epoch": 1.8161851664132453, | |
| "grad_norm": 0.4615607261657715, | |
| "learning_rate": 8.978877185984895e-06, | |
| "loss": 4.27879753112793, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 1.8178746409866533, | |
| "grad_norm": 0.4554959535598755, | |
| "learning_rate": 8.816360338085537e-06, | |
| "loss": 4.320524597167969, | |
| "step": 10760 | |
| }, | |
| { | |
| "epoch": 1.8195641155600608, | |
| "grad_norm": 0.4588150978088379, | |
| "learning_rate": 8.655283183917094e-06, | |
| "loss": 4.305972671508789, | |
| "step": 10770 | |
| }, | |
| { | |
| "epoch": 1.8212535901334683, | |
| "grad_norm": 0.4642908275127411, | |
| "learning_rate": 8.495647366033708e-06, | |
| "loss": 4.303414154052734, | |
| "step": 10780 | |
| }, | |
| { | |
| "epoch": 1.8229430647068763, | |
| "grad_norm": 0.44532260298728943, | |
| "learning_rate": 8.33745451229173e-06, | |
| "loss": 4.2890056610107425, | |
| "step": 10790 | |
| }, | |
| { | |
| "epoch": 1.8246325392802838, | |
| "grad_norm": 0.44421857595443726, | |
| "learning_rate": 8.180706235833162e-06, | |
| "loss": 4.27965087890625, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 1.8263220138536915, | |
| "grad_norm": 0.4534235894680023, | |
| "learning_rate": 8.025404135069207e-06, | |
| "loss": 4.3062583923339846, | |
| "step": 10810 | |
| }, | |
| { | |
| "epoch": 1.8280114884270993, | |
| "grad_norm": 0.44868797063827515, | |
| "learning_rate": 7.871549793663985e-06, | |
| "loss": 4.286159896850586, | |
| "step": 10820 | |
| }, | |
| { | |
| "epoch": 1.8297009630005068, | |
| "grad_norm": 0.4559250771999359, | |
| "learning_rate": 7.719144780518315e-06, | |
| "loss": 4.280204391479492, | |
| "step": 10830 | |
| }, | |
| { | |
| "epoch": 1.8313904375739145, | |
| "grad_norm": 0.4582137167453766, | |
| "learning_rate": 7.568190649753753e-06, | |
| "loss": 4.293819427490234, | |
| "step": 10840 | |
| }, | |
| { | |
| "epoch": 1.8330799121473222, | |
| "grad_norm": 0.44784441590309143, | |
| "learning_rate": 7.418688940696843e-06, | |
| "loss": 4.301911163330078, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 1.8347693867207298, | |
| "grad_norm": 0.4547264575958252, | |
| "learning_rate": 7.270641177863251e-06, | |
| "loss": 4.318780136108399, | |
| "step": 10860 | |
| }, | |
| { | |
| "epoch": 1.8364588612941375, | |
| "grad_norm": 0.44876977801322937, | |
| "learning_rate": 7.124048870942301e-06, | |
| "loss": 4.305691528320312, | |
| "step": 10870 | |
| }, | |
| { | |
| "epoch": 1.8381483358675452, | |
| "grad_norm": 0.4435437321662903, | |
| "learning_rate": 6.97891351478157e-06, | |
| "loss": 4.285098648071289, | |
| "step": 10880 | |
| }, | |
| { | |
| "epoch": 1.8398378104409527, | |
| "grad_norm": 0.4529848098754883, | |
| "learning_rate": 6.83523658937174e-06, | |
| "loss": 4.30163345336914, | |
| "step": 10890 | |
| }, | |
| { | |
| "epoch": 1.8415272850143607, | |
| "grad_norm": 0.44488754868507385, | |
| "learning_rate": 6.693019559831319e-06, | |
| "loss": 4.272104644775391, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 1.8432167595877682, | |
| "grad_norm": 0.44506925344467163, | |
| "learning_rate": 6.552263876391878e-06, | |
| "loss": 4.296164703369141, | |
| "step": 10910 | |
| }, | |
| { | |
| "epoch": 1.8449062341611757, | |
| "grad_norm": 0.453124076128006, | |
| "learning_rate": 6.412970974383069e-06, | |
| "loss": 4.268503189086914, | |
| "step": 10920 | |
| }, | |
| { | |
| "epoch": 1.8465957087345837, | |
| "grad_norm": 0.45630943775177, | |
| "learning_rate": 6.275142274218264e-06, | |
| "loss": 4.276957702636719, | |
| "step": 10930 | |
| }, | |
| { | |
| "epoch": 1.8482851833079912, | |
| "grad_norm": 0.4438062012195587, | |
| "learning_rate": 6.138779181379777e-06, | |
| "loss": 4.31237564086914, | |
| "step": 10940 | |
| }, | |
| { | |
| "epoch": 1.849974657881399, | |
| "grad_norm": 0.4586540460586548, | |
| "learning_rate": 6.003883086404709e-06, | |
| "loss": 4.296250915527343, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 1.8516641324548067, | |
| "grad_norm": 0.4555058777332306, | |
| "learning_rate": 5.870455364870747e-06, | |
| "loss": 4.289797973632813, | |
| "step": 10960 | |
| }, | |
| { | |
| "epoch": 1.8533536070282142, | |
| "grad_norm": 0.4580257833003998, | |
| "learning_rate": 5.738497377382117e-06, | |
| "loss": 4.288161849975586, | |
| "step": 10970 | |
| }, | |
| { | |
| "epoch": 1.855043081601622, | |
| "grad_norm": 0.44520384073257446, | |
| "learning_rate": 5.608010469555674e-06, | |
| "loss": 4.309579467773437, | |
| "step": 10980 | |
| }, | |
| { | |
| "epoch": 1.8567325561750296, | |
| "grad_norm": 0.4458165168762207, | |
| "learning_rate": 5.4789959720071995e-06, | |
| "loss": 4.300251007080078, | |
| "step": 10990 | |
| }, | |
| { | |
| "epoch": 1.8584220307484371, | |
| "grad_norm": 0.4537349343299866, | |
| "learning_rate": 5.3514552003379395e-06, | |
| "loss": 4.293206024169922, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 1.8584220307484371, | |
| "eval_loss": 4.268224716186523, | |
| "eval_runtime": 3.8046, | |
| "eval_samples_per_second": 262.842, | |
| "eval_steps_per_second": 5.52, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 1.8601115053218449, | |
| "grad_norm": 0.455091655254364, | |
| "learning_rate": 5.225389455120976e-06, | |
| "loss": 4.29366455078125, | |
| "step": 11010 | |
| }, | |
| { | |
| "epoch": 1.8618009798952526, | |
| "grad_norm": 0.46570661664009094, | |
| "learning_rate": 5.1008000218881576e-06, | |
| "loss": 4.286912536621093, | |
| "step": 11020 | |
| }, | |
| { | |
| "epoch": 1.8634904544686601, | |
| "grad_norm": 0.4428755044937134, | |
| "learning_rate": 4.977688171116923e-06, | |
| "loss": 4.300152206420899, | |
| "step": 11030 | |
| }, | |
| { | |
| "epoch": 1.8651799290420679, | |
| "grad_norm": 0.4450303316116333, | |
| "learning_rate": 4.856055158217298e-06, | |
| "loss": 4.289414978027343, | |
| "step": 11040 | |
| }, | |
| { | |
| "epoch": 1.8668694036154756, | |
| "grad_norm": 0.45025452971458435, | |
| "learning_rate": 4.735902223519173e-06, | |
| "loss": 4.273600006103516, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 1.868558878188883, | |
| "grad_norm": 0.4395146071910858, | |
| "learning_rate": 4.6172305922595746e-06, | |
| "loss": 4.288070297241211, | |
| "step": 11060 | |
| }, | |
| { | |
| "epoch": 1.870248352762291, | |
| "grad_norm": 0.45017367601394653, | |
| "learning_rate": 4.500041474570265e-06, | |
| "loss": 4.298558044433594, | |
| "step": 11070 | |
| }, | |
| { | |
| "epoch": 1.8719378273356986, | |
| "grad_norm": 0.45083948969841003, | |
| "learning_rate": 4.384336065465349e-06, | |
| "loss": 4.278664398193359, | |
| "step": 11080 | |
| }, | |
| { | |
| "epoch": 1.8736273019091063, | |
| "grad_norm": 0.4492949843406677, | |
| "learning_rate": 4.270115544829017e-06, | |
| "loss": 4.304440307617187, | |
| "step": 11090 | |
| }, | |
| { | |
| "epoch": 1.875316776482514, | |
| "grad_norm": 0.4543094336986542, | |
| "learning_rate": 4.1573810774037044e-06, | |
| "loss": 4.284811401367188, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 1.8770062510559216, | |
| "grad_norm": 0.44232332706451416, | |
| "learning_rate": 4.046133812777985e-06, | |
| "loss": 4.275522232055664, | |
| "step": 11110 | |
| }, | |
| { | |
| "epoch": 1.8786957256293293, | |
| "grad_norm": 0.4532018005847931, | |
| "learning_rate": 3.936374885375049e-06, | |
| "loss": 4.325132751464844, | |
| "step": 11120 | |
| }, | |
| { | |
| "epoch": 1.880385200202737, | |
| "grad_norm": 0.44870230555534363, | |
| "learning_rate": 3.828105414440974e-06, | |
| "loss": 4.293384170532226, | |
| "step": 11130 | |
| }, | |
| { | |
| "epoch": 1.8820746747761445, | |
| "grad_norm": 0.4408150017261505, | |
| "learning_rate": 3.7213265040334394e-06, | |
| "loss": 4.296081161499023, | |
| "step": 11140 | |
| }, | |
| { | |
| "epoch": 1.8837641493495523, | |
| "grad_norm": 0.44336998462677, | |
| "learning_rate": 3.616039243010399e-06, | |
| "loss": 4.299095916748047, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 1.88545362392296, | |
| "grad_norm": 0.4412024915218353, | |
| "learning_rate": 3.5122447050189573e-06, | |
| "loss": 4.285486221313477, | |
| "step": 11160 | |
| }, | |
| { | |
| "epoch": 1.8871430984963675, | |
| "grad_norm": 0.45283156633377075, | |
| "learning_rate": 3.4099439484844947e-06, | |
| "loss": 4.294749069213867, | |
| "step": 11170 | |
| }, | |
| { | |
| "epoch": 1.8888325730697753, | |
| "grad_norm": 0.4460100829601288, | |
| "learning_rate": 3.3091380165998103e-06, | |
| "loss": 4.310376739501953, | |
| "step": 11180 | |
| }, | |
| { | |
| "epoch": 1.890522047643183, | |
| "grad_norm": 0.44468414783477783, | |
| "learning_rate": 3.2098279373145463e-06, | |
| "loss": 4.327771377563477, | |
| "step": 11190 | |
| }, | |
| { | |
| "epoch": 1.8922115222165905, | |
| "grad_norm": 0.4378024637699127, | |
| "learning_rate": 3.1120147233246463e-06, | |
| "loss": 4.273694610595703, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 1.8939009967899985, | |
| "grad_norm": 0.44141000509262085, | |
| "learning_rate": 3.0156993720619804e-06, | |
| "loss": 4.287034606933593, | |
| "step": 11210 | |
| }, | |
| { | |
| "epoch": 1.895590471363406, | |
| "grad_norm": 0.4455374479293823, | |
| "learning_rate": 2.9208828656843876e-06, | |
| "loss": 4.320920181274414, | |
| "step": 11220 | |
| }, | |
| { | |
| "epoch": 1.8972799459368137, | |
| "grad_norm": 0.45368343591690063, | |
| "learning_rate": 2.827566171065415e-06, | |
| "loss": 4.285198974609375, | |
| "step": 11230 | |
| }, | |
| { | |
| "epoch": 1.8989694205102214, | |
| "grad_norm": 0.44222062826156616, | |
| "learning_rate": 2.7357502397845454e-06, | |
| "loss": 4.296764755249024, | |
| "step": 11240 | |
| }, | |
| { | |
| "epoch": 1.900658895083629, | |
| "grad_norm": 0.45191657543182373, | |
| "learning_rate": 2.645436008117602e-06, | |
| "loss": 4.27384033203125, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 1.9023483696570367, | |
| "grad_norm": 0.4548667371273041, | |
| "learning_rate": 2.5566243970270073e-06, | |
| "loss": 4.297956085205078, | |
| "step": 11260 | |
| }, | |
| { | |
| "epoch": 1.9040378442304444, | |
| "grad_norm": 0.4443969130516052, | |
| "learning_rate": 2.469316312152575e-06, | |
| "loss": 4.291641616821289, | |
| "step": 11270 | |
| }, | |
| { | |
| "epoch": 1.905727318803852, | |
| "grad_norm": 0.4367770850658417, | |
| "learning_rate": 2.3835126438021156e-06, | |
| "loss": 4.266088485717773, | |
| "step": 11280 | |
| }, | |
| { | |
| "epoch": 1.9074167933772597, | |
| "grad_norm": 0.4365804195404053, | |
| "learning_rate": 2.299214266942495e-06, | |
| "loss": 4.263021850585938, | |
| "step": 11290 | |
| }, | |
| { | |
| "epoch": 1.9091062679506674, | |
| "grad_norm": 0.4369988441467285, | |
| "learning_rate": 2.2164220411906407e-06, | |
| "loss": 4.288222122192383, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 1.910795742524075, | |
| "grad_norm": 0.44547080993652344, | |
| "learning_rate": 2.1351368108047495e-06, | |
| "loss": 4.26991081237793, | |
| "step": 11310 | |
| }, | |
| { | |
| "epoch": 1.9124852170974826, | |
| "grad_norm": 0.45165297389030457, | |
| "learning_rate": 2.0553594046757438e-06, | |
| "loss": 4.2671764373779295, | |
| "step": 11320 | |
| }, | |
| { | |
| "epoch": 1.9141746916708904, | |
| "grad_norm": 0.4523044526576996, | |
| "learning_rate": 1.9770906363187787e-06, | |
| "loss": 4.28791618347168, | |
| "step": 11330 | |
| }, | |
| { | |
| "epoch": 1.915864166244298, | |
| "grad_norm": 0.43898409605026245, | |
| "learning_rate": 1.9003313038649826e-06, | |
| "loss": 4.301726150512695, | |
| "step": 11340 | |
| }, | |
| { | |
| "epoch": 1.9175536408177059, | |
| "grad_norm": 0.44454851746559143, | |
| "learning_rate": 1.825082190053262e-06, | |
| "loss": 4.280124664306641, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 1.9192431153911134, | |
| "grad_norm": 0.44016656279563904, | |
| "learning_rate": 1.7513440622223762e-06, | |
| "loss": 4.312954330444336, | |
| "step": 11360 | |
| }, | |
| { | |
| "epoch": 1.9209325899645209, | |
| "grad_norm": 0.439481645822525, | |
| "learning_rate": 1.6791176723030763e-06, | |
| "loss": 4.291484069824219, | |
| "step": 11370 | |
| }, | |
| { | |
| "epoch": 1.9226220645379288, | |
| "grad_norm": 0.44403141736984253, | |
| "learning_rate": 1.608403756810428e-06, | |
| "loss": 4.297753524780274, | |
| "step": 11380 | |
| }, | |
| { | |
| "epoch": 1.9243115391113363, | |
| "grad_norm": 0.4380677342414856, | |
| "learning_rate": 1.5392030368363839e-06, | |
| "loss": 4.311534881591797, | |
| "step": 11390 | |
| }, | |
| { | |
| "epoch": 1.926001013684744, | |
| "grad_norm": 0.4598468542098999, | |
| "learning_rate": 1.4715162180422902e-06, | |
| "loss": 4.272250747680664, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 1.9276904882581518, | |
| "grad_norm": 0.44107797741889954, | |
| "learning_rate": 1.405343990651825e-06, | |
| "loss": 4.283835601806641, | |
| "step": 11410 | |
| }, | |
| { | |
| "epoch": 1.9293799628315593, | |
| "grad_norm": 0.43588972091674805, | |
| "learning_rate": 1.3406870294438876e-06, | |
| "loss": 4.273925399780273, | |
| "step": 11420 | |
| }, | |
| { | |
| "epoch": 1.931069437404967, | |
| "grad_norm": 0.4393414258956909, | |
| "learning_rate": 1.2775459937457544e-06, | |
| "loss": 4.295301055908203, | |
| "step": 11430 | |
| }, | |
| { | |
| "epoch": 1.9327589119783748, | |
| "grad_norm": 0.44228672981262207, | |
| "learning_rate": 1.2159215274262834e-06, | |
| "loss": 4.273171997070312, | |
| "step": 11440 | |
| }, | |
| { | |
| "epoch": 1.9344483865517823, | |
| "grad_norm": 0.4422619640827179, | |
| "learning_rate": 1.155814258889437e-06, | |
| "loss": 4.285517883300781, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 1.93613786112519, | |
| "grad_norm": 0.4427924156188965, | |
| "learning_rate": 1.0972248010678365e-06, | |
| "loss": 4.312974548339843, | |
| "step": 11460 | |
| }, | |
| { | |
| "epoch": 1.9378273356985978, | |
| "grad_norm": 0.44030192494392395, | |
| "learning_rate": 1.040153751416517e-06, | |
| "loss": 4.302379989624024, | |
| "step": 11470 | |
| }, | |
| { | |
| "epoch": 1.9395168102720053, | |
| "grad_norm": 0.44400596618652344, | |
| "learning_rate": 9.846016919068167e-07, | |
| "loss": 4.280198287963867, | |
| "step": 11480 | |
| }, | |
| { | |
| "epoch": 1.941206284845413, | |
| "grad_norm": 0.4478650689125061, | |
| "learning_rate": 9.305691890204469e-07, | |
| "loss": 4.281633758544922, | |
| "step": 11490 | |
| }, | |
| { | |
| "epoch": 1.9428957594188208, | |
| "grad_norm": 0.4458984434604645, | |
| "learning_rate": 8.780567937437644e-07, | |
| "loss": 4.278944396972657, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 1.9428957594188208, | |
| "eval_loss": 4.265942573547363, | |
| "eval_runtime": 4.0629, | |
| "eval_samples_per_second": 246.128, | |
| "eval_steps_per_second": 5.169, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 1.9445852339922283, | |
| "grad_norm": 0.4352650046348572, | |
| "learning_rate": 8.270650415620584e-07, | |
| "loss": 4.2965538024902346, | |
| "step": 11510 | |
| }, | |
| { | |
| "epoch": 1.9462747085656362, | |
| "grad_norm": 0.43585142493247986, | |
| "learning_rate": 7.775944524542055e-07, | |
| "loss": 4.270129776000976, | |
| "step": 11520 | |
| }, | |
| { | |
| "epoch": 1.9479641831390437, | |
| "grad_norm": 0.4469541907310486, | |
| "learning_rate": 7.296455308872406e-07, | |
| "loss": 4.283909606933594, | |
| "step": 11530 | |
| }, | |
| { | |
| "epoch": 1.9496536577124515, | |
| "grad_norm": 0.4361380636692047, | |
| "learning_rate": 6.832187658113441e-07, | |
| "loss": 4.296160125732422, | |
| "step": 11540 | |
| }, | |
| { | |
| "epoch": 1.9513431322858592, | |
| "grad_norm": 0.44409504532814026, | |
| "learning_rate": 6.383146306547626e-07, | |
| "loss": 4.304541778564453, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 1.9530326068592667, | |
| "grad_norm": 0.45062074065208435, | |
| "learning_rate": 5.949335833189628e-07, | |
| "loss": 4.3281913757324215, | |
| "step": 11560 | |
| }, | |
| { | |
| "epoch": 1.9547220814326745, | |
| "grad_norm": 0.45208507776260376, | |
| "learning_rate": 5.530760661741018e-07, | |
| "loss": 4.3035846710205075, | |
| "step": 11570 | |
| }, | |
| { | |
| "epoch": 1.9564115560060822, | |
| "grad_norm": 0.44333794713020325, | |
| "learning_rate": 5.127425060543478e-07, | |
| "loss": 4.278887939453125, | |
| "step": 11580 | |
| }, | |
| { | |
| "epoch": 1.9581010305794897, | |
| "grad_norm": 0.44367748498916626, | |
| "learning_rate": 4.7393331425364943e-07, | |
| "loss": 4.281793594360352, | |
| "step": 11590 | |
| }, | |
| { | |
| "epoch": 1.9597905051528974, | |
| "grad_norm": 0.4411092698574066, | |
| "learning_rate": 4.3664888652144017e-07, | |
| "loss": 4.278807067871094, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 1.9614799797263052, | |
| "grad_norm": 0.44609910249710083, | |
| "learning_rate": 4.008896030587072e-07, | |
| "loss": 4.270274353027344, | |
| "step": 11610 | |
| }, | |
| { | |
| "epoch": 1.9631694542997127, | |
| "grad_norm": 0.43740522861480713, | |
| "learning_rate": 3.6665582851406195e-07, | |
| "loss": 4.296014785766602, | |
| "step": 11620 | |
| }, | |
| { | |
| "epoch": 1.9648589288731204, | |
| "grad_norm": 0.44448962807655334, | |
| "learning_rate": 3.3394791198000927e-07, | |
| "loss": 4.282284927368164, | |
| "step": 11630 | |
| }, | |
| { | |
| "epoch": 1.9665484034465281, | |
| "grad_norm": 0.45065152645111084, | |
| "learning_rate": 3.027661869893672e-07, | |
| "loss": 4.2820892333984375, | |
| "step": 11640 | |
| }, | |
| { | |
| "epoch": 1.9682378780199357, | |
| "grad_norm": 0.4398045539855957, | |
| "learning_rate": 2.731109715119861e-07, | |
| "loss": 4.281244277954102, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 1.9699273525933436, | |
| "grad_norm": 0.4467960000038147, | |
| "learning_rate": 2.4498256795135173e-07, | |
| "loss": 4.307322311401367, | |
| "step": 11660 | |
| }, | |
| { | |
| "epoch": 1.9716168271667511, | |
| "grad_norm": 0.4327242970466614, | |
| "learning_rate": 2.183812631415871e-07, | |
| "loss": 4.275672149658203, | |
| "step": 11670 | |
| }, | |
| { | |
| "epoch": 1.9733063017401589, | |
| "grad_norm": 0.43306484818458557, | |
| "learning_rate": 1.933073283445219e-07, | |
| "loss": 4.291437149047852, | |
| "step": 11680 | |
| }, | |
| { | |
| "epoch": 1.9749957763135666, | |
| "grad_norm": 0.4464097023010254, | |
| "learning_rate": 1.697610192469112e-07, | |
| "loss": 4.312542343139649, | |
| "step": 11690 | |
| }, | |
| { | |
| "epoch": 1.976685250886974, | |
| "grad_norm": 0.4436480700969696, | |
| "learning_rate": 1.4774257595783766e-07, | |
| "loss": 4.300673294067383, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 1.9783747254603818, | |
| "grad_norm": 0.44450485706329346, | |
| "learning_rate": 1.272522230062467e-07, | |
| "loss": 4.290340805053711, | |
| "step": 11710 | |
| }, | |
| { | |
| "epoch": 1.9800642000337896, | |
| "grad_norm": 0.4362986981868744, | |
| "learning_rate": 1.0829016933869838e-07, | |
| "loss": 4.2894245147705075, | |
| "step": 11720 | |
| }, | |
| { | |
| "epoch": 1.981753674607197, | |
| "grad_norm": 0.43450725078582764, | |
| "learning_rate": 9.085660831715247e-08, | |
| "loss": 4.298795700073242, | |
| "step": 11730 | |
| }, | |
| { | |
| "epoch": 1.9834431491806048, | |
| "grad_norm": 0.44246765971183777, | |
| "learning_rate": 7.495171771710328e-08, | |
| "loss": 4.293585968017578, | |
| "step": 11740 | |
| }, | |
| { | |
| "epoch": 1.9851326237540126, | |
| "grad_norm": 0.43929263949394226, | |
| "learning_rate": 6.057565972568123e-08, | |
| "loss": 4.293174743652344, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 1.98682209832742, | |
| "grad_norm": 0.4450415372848511, | |
| "learning_rate": 4.772858094005405e-08, | |
| "loss": 4.3004913330078125, | |
| "step": 11760 | |
| }, | |
| { | |
| "epoch": 1.9885115729008278, | |
| "grad_norm": 0.4472520053386688, | |
| "learning_rate": 3.641061236591136e-08, | |
| "loss": 4.2836250305175785, | |
| "step": 11770 | |
| }, | |
| { | |
| "epoch": 1.9902010474742355, | |
| "grad_norm": 0.44302183389663696, | |
| "learning_rate": 2.6621869416099118e-08, | |
| "loss": 4.290175247192383, | |
| "step": 11780 | |
| }, | |
| { | |
| "epoch": 1.991890522047643, | |
| "grad_norm": 0.4414844512939453, | |
| "learning_rate": 1.8362451909520458e-08, | |
| "loss": 4.286873245239258, | |
| "step": 11790 | |
| }, | |
| { | |
| "epoch": 1.993579996621051, | |
| "grad_norm": 0.44598934054374695, | |
| "learning_rate": 1.16324440700033e-08, | |
| "loss": 4.297615051269531, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 1.9952694711944585, | |
| "grad_norm": 0.4523853361606598, | |
| "learning_rate": 6.431914525567572e-09, | |
| "loss": 4.289733505249023, | |
| "step": 11810 | |
| }, | |
| { | |
| "epoch": 1.996958945767866, | |
| "grad_norm": 0.4494129419326782, | |
| "learning_rate": 2.760916307625871e-09, | |
| "loss": 4.304800415039063, | |
| "step": 11820 | |
| }, | |
| { | |
| "epoch": 1.998648420341274, | |
| "grad_norm": 0.4344528913497925, | |
| "learning_rate": 6.194868504838524e-10, | |
| "loss": 4.279055786132813, | |
| "step": 11830 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 11838, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.959258038224814e+17, | |
| "train_batch_size": 48, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |