| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 15.748031496062993, | |
| "eval_steps": 500, | |
| "global_step": 2000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.07874015748031496, | |
| "grad_norm": 5.119835376739502, | |
| "learning_rate": 1.8e-06, | |
| "loss": 0.8008, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.15748031496062992, | |
| "grad_norm": 2.7846670150756836, | |
| "learning_rate": 3.8e-06, | |
| "loss": 0.6753, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.23622047244094488, | |
| "grad_norm": 0.9429484605789185, | |
| "learning_rate": 5.8e-06, | |
| "loss": 0.3854, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.31496062992125984, | |
| "grad_norm": 0.4289324879646301, | |
| "learning_rate": 7.8e-06, | |
| "loss": 0.2374, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.3937007874015748, | |
| "grad_norm": 0.7212084531784058, | |
| "learning_rate": 9.800000000000001e-06, | |
| "loss": 0.1937, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.47244094488188976, | |
| "grad_norm": 0.3743739426136017, | |
| "learning_rate": 1.18e-05, | |
| "loss": 0.169, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.5511811023622047, | |
| "grad_norm": 0.33906233310699463, | |
| "learning_rate": 1.3800000000000002e-05, | |
| "loss": 0.1513, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.6299212598425197, | |
| "grad_norm": 0.4938454031944275, | |
| "learning_rate": 1.58e-05, | |
| "loss": 0.1333, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.7086614173228346, | |
| "grad_norm": 0.3791239261627197, | |
| "learning_rate": 1.78e-05, | |
| "loss": 0.1209, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.7874015748031497, | |
| "grad_norm": 0.31115081906318665, | |
| "learning_rate": 1.9800000000000004e-05, | |
| "loss": 0.108, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.8661417322834646, | |
| "grad_norm": 0.3266952633857727, | |
| "learning_rate": 2.18e-05, | |
| "loss": 0.105, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.9448818897637795, | |
| "grad_norm": 0.42559316754341125, | |
| "learning_rate": 2.38e-05, | |
| "loss": 0.0931, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.0236220472440944, | |
| "grad_norm": 0.40235716104507446, | |
| "learning_rate": 2.58e-05, | |
| "loss": 0.0867, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.1023622047244095, | |
| "grad_norm": 0.3455784022808075, | |
| "learning_rate": 2.7800000000000005e-05, | |
| "loss": 0.0843, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.1811023622047245, | |
| "grad_norm": 0.39895790815353394, | |
| "learning_rate": 2.98e-05, | |
| "loss": 0.0803, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.2598425196850394, | |
| "grad_norm": 0.32412493228912354, | |
| "learning_rate": 3.18e-05, | |
| "loss": 0.075, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.3385826771653544, | |
| "grad_norm": 0.3238581717014313, | |
| "learning_rate": 3.38e-05, | |
| "loss": 0.069, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.4173228346456692, | |
| "grad_norm": 0.35713112354278564, | |
| "learning_rate": 3.58e-05, | |
| "loss": 0.0632, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.4960629921259843, | |
| "grad_norm": 0.31078988313674927, | |
| "learning_rate": 3.7800000000000004e-05, | |
| "loss": 0.0606, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.574803149606299, | |
| "grad_norm": 0.23168951272964478, | |
| "learning_rate": 3.9800000000000005e-05, | |
| "loss": 0.0566, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.6535433070866141, | |
| "grad_norm": 0.2528112828731537, | |
| "learning_rate": 4.18e-05, | |
| "loss": 0.0553, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.7322834645669292, | |
| "grad_norm": 0.31132972240448, | |
| "learning_rate": 4.38e-05, | |
| "loss": 0.0516, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.811023622047244, | |
| "grad_norm": 0.3506482243537903, | |
| "learning_rate": 4.58e-05, | |
| "loss": 0.051, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.889763779527559, | |
| "grad_norm": 0.30420321226119995, | |
| "learning_rate": 4.78e-05, | |
| "loss": 0.0498, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.968503937007874, | |
| "grad_norm": 0.27608105540275574, | |
| "learning_rate": 4.9800000000000004e-05, | |
| "loss": 0.0425, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 2.047244094488189, | |
| "grad_norm": 0.24153359234333038, | |
| "learning_rate": 5.1800000000000005e-05, | |
| "loss": 0.0425, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 2.1259842519685037, | |
| "grad_norm": 0.2684983015060425, | |
| "learning_rate": 5.380000000000001e-05, | |
| "loss": 0.0397, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 2.204724409448819, | |
| "grad_norm": 0.2812291979789734, | |
| "learning_rate": 5.580000000000001e-05, | |
| "loss": 0.0347, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 2.283464566929134, | |
| "grad_norm": 0.272079199552536, | |
| "learning_rate": 5.7799999999999995e-05, | |
| "loss": 0.0339, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 2.362204724409449, | |
| "grad_norm": 0.30601683259010315, | |
| "learning_rate": 5.9800000000000003e-05, | |
| "loss": 0.0359, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 2.440944881889764, | |
| "grad_norm": 0.3129172921180725, | |
| "learning_rate": 6.18e-05, | |
| "loss": 0.0351, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 2.5196850393700787, | |
| "grad_norm": 0.27252131700515747, | |
| "learning_rate": 6.38e-05, | |
| "loss": 0.0287, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 2.5984251968503935, | |
| "grad_norm": 0.2653070390224457, | |
| "learning_rate": 6.58e-05, | |
| "loss": 0.0313, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 2.677165354330709, | |
| "grad_norm": 0.35808777809143066, | |
| "learning_rate": 6.780000000000001e-05, | |
| "loss": 0.0356, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 2.7559055118110236, | |
| "grad_norm": 0.26742085814476013, | |
| "learning_rate": 6.98e-05, | |
| "loss": 0.0299, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 2.8346456692913384, | |
| "grad_norm": 0.4106348156929016, | |
| "learning_rate": 7.18e-05, | |
| "loss": 0.0324, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 2.9133858267716537, | |
| "grad_norm": 0.213535338640213, | |
| "learning_rate": 7.38e-05, | |
| "loss": 0.0273, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 2.9921259842519685, | |
| "grad_norm": 0.26808497309684753, | |
| "learning_rate": 7.58e-05, | |
| "loss": 0.0254, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 3.0708661417322833, | |
| "grad_norm": 0.18177832663059235, | |
| "learning_rate": 7.780000000000001e-05, | |
| "loss": 0.0261, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 3.1496062992125986, | |
| "grad_norm": 0.2706851065158844, | |
| "learning_rate": 7.98e-05, | |
| "loss": 0.0268, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 3.2283464566929134, | |
| "grad_norm": 0.29524528980255127, | |
| "learning_rate": 8.18e-05, | |
| "loss": 0.0285, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 3.3070866141732282, | |
| "grad_norm": 0.21399272978305817, | |
| "learning_rate": 8.38e-05, | |
| "loss": 0.0252, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 3.3858267716535435, | |
| "grad_norm": 0.290097177028656, | |
| "learning_rate": 8.58e-05, | |
| "loss": 0.0279, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 3.4645669291338583, | |
| "grad_norm": 0.2789689004421234, | |
| "learning_rate": 8.78e-05, | |
| "loss": 0.0236, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 3.543307086614173, | |
| "grad_norm": 0.3307545781135559, | |
| "learning_rate": 8.98e-05, | |
| "loss": 0.0256, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 3.622047244094488, | |
| "grad_norm": 0.2919306457042694, | |
| "learning_rate": 9.180000000000001e-05, | |
| "loss": 0.0227, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 3.7007874015748032, | |
| "grad_norm": 0.27534034848213196, | |
| "learning_rate": 9.38e-05, | |
| "loss": 0.0219, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 3.779527559055118, | |
| "grad_norm": 0.26348116993904114, | |
| "learning_rate": 9.58e-05, | |
| "loss": 0.0242, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 3.8582677165354333, | |
| "grad_norm": 0.29468125104904175, | |
| "learning_rate": 9.78e-05, | |
| "loss": 0.0224, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 3.937007874015748, | |
| "grad_norm": 0.20534993708133698, | |
| "learning_rate": 9.98e-05, | |
| "loss": 0.0235, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 4.015748031496063, | |
| "grad_norm": 0.2911393642425537, | |
| "learning_rate": 9.9999778549206e-05, | |
| "loss": 0.0206, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 4.094488188976378, | |
| "grad_norm": 0.2478438913822174, | |
| "learning_rate": 9.999901304280685e-05, | |
| "loss": 0.0219, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 4.173228346456693, | |
| "grad_norm": 0.32605063915252686, | |
| "learning_rate": 9.999770075521164e-05, | |
| "loss": 0.0232, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 4.251968503937007, | |
| "grad_norm": 0.22585000097751617, | |
| "learning_rate": 9.99958417007713e-05, | |
| "loss": 0.0228, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 4.330708661417323, | |
| "grad_norm": 0.3227289319038391, | |
| "learning_rate": 9.999343589981615e-05, | |
| "loss": 0.018, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 4.409448818897638, | |
| "grad_norm": 0.2523372769355774, | |
| "learning_rate": 9.999048337865568e-05, | |
| "loss": 0.0215, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 4.488188976377953, | |
| "grad_norm": 0.34720173478126526, | |
| "learning_rate": 9.998698416957815e-05, | |
| "loss": 0.0242, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 4.566929133858268, | |
| "grad_norm": 0.24005654454231262, | |
| "learning_rate": 9.998293831085037e-05, | |
| "loss": 0.0213, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 4.645669291338582, | |
| "grad_norm": 0.28940242528915405, | |
| "learning_rate": 9.997834584671719e-05, | |
| "loss": 0.0204, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 4.724409448818898, | |
| "grad_norm": 0.2654191255569458, | |
| "learning_rate": 9.997320682740107e-05, | |
| "loss": 0.0217, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 4.803149606299213, | |
| "grad_norm": 0.2912241816520691, | |
| "learning_rate": 9.996752130910149e-05, | |
| "loss": 0.0197, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 4.881889763779528, | |
| "grad_norm": 0.23718924820423126, | |
| "learning_rate": 9.99612893539944e-05, | |
| "loss": 0.0209, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 4.960629921259843, | |
| "grad_norm": 0.2647818326950073, | |
| "learning_rate": 9.995451103023144e-05, | |
| "loss": 0.0222, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 5.039370078740157, | |
| "grad_norm": 0.35324886441230774, | |
| "learning_rate": 9.994718641193928e-05, | |
| "loss": 0.0224, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 5.118110236220472, | |
| "grad_norm": 0.2671961188316345, | |
| "learning_rate": 9.993931557921874e-05, | |
| "loss": 0.0219, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 5.196850393700787, | |
| "grad_norm": 0.2596529722213745, | |
| "learning_rate": 9.993089861814402e-05, | |
| "loss": 0.0203, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 5.275590551181103, | |
| "grad_norm": 0.25885483622550964, | |
| "learning_rate": 9.992193562076166e-05, | |
| "loss": 0.0188, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 5.354330708661418, | |
| "grad_norm": 0.24976016581058502, | |
| "learning_rate": 9.991242668508954e-05, | |
| "loss": 0.0175, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 5.433070866141732, | |
| "grad_norm": 0.24121227860450745, | |
| "learning_rate": 9.990237191511587e-05, | |
| "loss": 0.0158, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 5.511811023622047, | |
| "grad_norm": 0.22227917611598969, | |
| "learning_rate": 9.989177142079802e-05, | |
| "loss": 0.0177, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 5.590551181102362, | |
| "grad_norm": 0.231464222073555, | |
| "learning_rate": 9.988062531806126e-05, | |
| "loss": 0.0183, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 5.669291338582677, | |
| "grad_norm": 0.16609017550945282, | |
| "learning_rate": 9.986893372879762e-05, | |
| "loss": 0.018, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 5.748031496062993, | |
| "grad_norm": 0.19624024629592896, | |
| "learning_rate": 9.985669678086443e-05, | |
| "loss": 0.018, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 5.826771653543307, | |
| "grad_norm": 0.22255055606365204, | |
| "learning_rate": 9.984391460808298e-05, | |
| "loss": 0.0199, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 5.905511811023622, | |
| "grad_norm": 0.22765639424324036, | |
| "learning_rate": 9.983058735023709e-05, | |
| "loss": 0.0191, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 5.984251968503937, | |
| "grad_norm": 0.23915418982505798, | |
| "learning_rate": 9.98167151530715e-05, | |
| "loss": 0.0178, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 6.062992125984252, | |
| "grad_norm": 0.2489311248064041, | |
| "learning_rate": 9.980229816829034e-05, | |
| "loss": 0.0202, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 6.141732283464567, | |
| "grad_norm": 0.22865547239780426, | |
| "learning_rate": 9.978733655355544e-05, | |
| "loss": 0.0187, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 6.2204724409448815, | |
| "grad_norm": 0.19393905997276306, | |
| "learning_rate": 9.977183047248464e-05, | |
| "loss": 0.0168, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 6.299212598425197, | |
| "grad_norm": 0.20525363087654114, | |
| "learning_rate": 9.975578009464992e-05, | |
| "loss": 0.018, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 6.377952755905512, | |
| "grad_norm": 0.2537108063697815, | |
| "learning_rate": 9.97391855955757e-05, | |
| "loss": 0.0143, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 6.456692913385827, | |
| "grad_norm": 0.2665018141269684, | |
| "learning_rate": 9.972204715673669e-05, | |
| "loss": 0.0165, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 6.535433070866142, | |
| "grad_norm": 0.18383699655532837, | |
| "learning_rate": 9.970436496555617e-05, | |
| "loss": 0.0164, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 6.6141732283464565, | |
| "grad_norm": 0.3430931270122528, | |
| "learning_rate": 9.968613921540373e-05, | |
| "loss": 0.0176, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 6.692913385826771, | |
| "grad_norm": 0.2601425349712372, | |
| "learning_rate": 9.966737010559326e-05, | |
| "loss": 0.0175, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 6.771653543307087, | |
| "grad_norm": 0.19988982379436493, | |
| "learning_rate": 9.964805784138072e-05, | |
| "loss": 0.0172, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 6.850393700787402, | |
| "grad_norm": 0.18660953640937805, | |
| "learning_rate": 9.962820263396195e-05, | |
| "loss": 0.0158, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 6.929133858267717, | |
| "grad_norm": 0.22756962478160858, | |
| "learning_rate": 9.960780470047033e-05, | |
| "loss": 0.0185, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 7.0078740157480315, | |
| "grad_norm": 0.14548353850841522, | |
| "learning_rate": 9.958686426397437e-05, | |
| "loss": 0.0164, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 7.086614173228346, | |
| "grad_norm": 0.20737145841121674, | |
| "learning_rate": 9.956538155347534e-05, | |
| "loss": 0.0182, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 7.165354330708661, | |
| "grad_norm": 0.20689648389816284, | |
| "learning_rate": 9.95433568039047e-05, | |
| "loss": 0.0145, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 7.244094488188976, | |
| "grad_norm": 0.26220783591270447, | |
| "learning_rate": 9.952079025612162e-05, | |
| "loss": 0.0145, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 7.322834645669292, | |
| "grad_norm": 0.23523452877998352, | |
| "learning_rate": 9.949768215691022e-05, | |
| "loss": 0.0168, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 7.4015748031496065, | |
| "grad_norm": 0.207063227891922, | |
| "learning_rate": 9.9474032758977e-05, | |
| "loss": 0.0154, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 7.480314960629921, | |
| "grad_norm": 0.2092580646276474, | |
| "learning_rate": 9.944984232094794e-05, | |
| "loss": 0.0169, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 7.559055118110236, | |
| "grad_norm": 0.1808154582977295, | |
| "learning_rate": 9.942511110736584e-05, | |
| "loss": 0.0157, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 7.637795275590551, | |
| "grad_norm": 0.2190985083580017, | |
| "learning_rate": 9.939983938868726e-05, | |
| "loss": 0.0155, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 7.716535433070866, | |
| "grad_norm": 0.1607908308506012, | |
| "learning_rate": 9.93740274412797e-05, | |
| "loss": 0.0136, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 7.7952755905511815, | |
| "grad_norm": 0.20882774889469147, | |
| "learning_rate": 9.934767554741846e-05, | |
| "loss": 0.0192, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 7.874015748031496, | |
| "grad_norm": 0.18141894042491913, | |
| "learning_rate": 9.932078399528361e-05, | |
| "loss": 0.0134, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 7.952755905511811, | |
| "grad_norm": 0.1842644363641739, | |
| "learning_rate": 9.929335307895689e-05, | |
| "loss": 0.0145, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 8.031496062992126, | |
| "grad_norm": 0.19102592766284943, | |
| "learning_rate": 9.926538309841839e-05, | |
| "loss": 0.0179, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 8.11023622047244, | |
| "grad_norm": 0.2554001212120056, | |
| "learning_rate": 9.923687435954334e-05, | |
| "loss": 0.0145, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 8.188976377952756, | |
| "grad_norm": 0.2188219279050827, | |
| "learning_rate": 9.920782717409873e-05, | |
| "loss": 0.0133, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 8.26771653543307, | |
| "grad_norm": 0.19668325781822205, | |
| "learning_rate": 9.917824185973994e-05, | |
| "loss": 0.013, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 8.346456692913385, | |
| "grad_norm": 0.19224300980567932, | |
| "learning_rate": 9.914811874000723e-05, | |
| "loss": 0.012, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 8.4251968503937, | |
| "grad_norm": 0.2617517113685608, | |
| "learning_rate": 9.911745814432218e-05, | |
| "loss": 0.0144, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 8.503937007874015, | |
| "grad_norm": 0.340850293636322, | |
| "learning_rate": 9.90862604079842e-05, | |
| "loss": 0.0163, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 8.582677165354331, | |
| "grad_norm": 0.24036389589309692, | |
| "learning_rate": 9.90545258721667e-05, | |
| "loss": 0.0143, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 8.661417322834646, | |
| "grad_norm": 0.2523621916770935, | |
| "learning_rate": 9.90222548839135e-05, | |
| "loss": 0.0137, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 8.740157480314961, | |
| "grad_norm": 0.25303855538368225, | |
| "learning_rate": 9.898944779613495e-05, | |
| "loss": 0.0124, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 8.818897637795276, | |
| "grad_norm": 0.2672367990016937, | |
| "learning_rate": 9.89561049676041e-05, | |
| "loss": 0.0135, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 8.89763779527559, | |
| "grad_norm": 0.22292408347129822, | |
| "learning_rate": 9.89222267629528e-05, | |
| "loss": 0.0155, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 8.976377952755906, | |
| "grad_norm": 0.2113981992006302, | |
| "learning_rate": 9.888781355266763e-05, | |
| "loss": 0.0139, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 9.05511811023622, | |
| "grad_norm": 0.16752807796001434, | |
| "learning_rate": 9.885286571308598e-05, | |
| "loss": 0.0124, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 9.133858267716535, | |
| "grad_norm": 0.1773703545331955, | |
| "learning_rate": 9.881738362639182e-05, | |
| "loss": 0.015, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 9.21259842519685, | |
| "grad_norm": 0.26974138617515564, | |
| "learning_rate": 9.878136768061154e-05, | |
| "loss": 0.0162, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 9.291338582677165, | |
| "grad_norm": 0.2184063196182251, | |
| "learning_rate": 9.874481826960979e-05, | |
| "loss": 0.0148, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 9.37007874015748, | |
| "grad_norm": 0.1977306753396988, | |
| "learning_rate": 9.870773579308503e-05, | |
| "loss": 0.0123, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 9.448818897637794, | |
| "grad_norm": 0.1981269121170044, | |
| "learning_rate": 9.867012065656533e-05, | |
| "loss": 0.0152, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 9.527559055118111, | |
| "grad_norm": 0.17817805707454681, | |
| "learning_rate": 9.863197327140376e-05, | |
| "loss": 0.0123, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 9.606299212598426, | |
| "grad_norm": 0.23420843482017517, | |
| "learning_rate": 9.859329405477403e-05, | |
| "loss": 0.0129, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 9.68503937007874, | |
| "grad_norm": 0.25216200947761536, | |
| "learning_rate": 9.855408342966585e-05, | |
| "loss": 0.0138, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 9.763779527559056, | |
| "grad_norm": 0.1990588754415512, | |
| "learning_rate": 9.851434182488033e-05, | |
| "loss": 0.0129, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 9.84251968503937, | |
| "grad_norm": 0.27837619185447693, | |
| "learning_rate": 9.84740696750253e-05, | |
| "loss": 0.0124, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 9.921259842519685, | |
| "grad_norm": 0.21090054512023926, | |
| "learning_rate": 9.843326742051055e-05, | |
| "loss": 0.013, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 0.19581645727157593, | |
| "learning_rate": 9.839193550754297e-05, | |
| "loss": 0.0126, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 10.078740157480315, | |
| "grad_norm": 0.21251627802848816, | |
| "learning_rate": 9.835007438812177e-05, | |
| "loss": 0.0148, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 10.15748031496063, | |
| "grad_norm": 0.18511821329593658, | |
| "learning_rate": 9.830768452003341e-05, | |
| "loss": 0.0133, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 10.236220472440944, | |
| "grad_norm": 0.18811464309692383, | |
| "learning_rate": 9.826476636684671e-05, | |
| "loss": 0.0126, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 10.31496062992126, | |
| "grad_norm": 0.18782231211662292, | |
| "learning_rate": 9.822132039790773e-05, | |
| "loss": 0.0117, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 10.393700787401574, | |
| "grad_norm": 0.16824057698249817, | |
| "learning_rate": 9.817734708833461e-05, | |
| "loss": 0.0106, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 10.472440944881889, | |
| "grad_norm": 0.1814710795879364, | |
| "learning_rate": 9.813284691901243e-05, | |
| "loss": 0.0162, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 10.551181102362206, | |
| "grad_norm": 0.2217687964439392, | |
| "learning_rate": 9.808782037658792e-05, | |
| "loss": 0.0155, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 10.62992125984252, | |
| "grad_norm": 0.19781896471977234, | |
| "learning_rate": 9.804226795346411e-05, | |
| "loss": 0.0133, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 10.708661417322835, | |
| "grad_norm": 0.24714171886444092, | |
| "learning_rate": 9.799619014779503e-05, | |
| "loss": 0.0129, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 10.78740157480315, | |
| "grad_norm": 0.16805458068847656, | |
| "learning_rate": 9.794958746348013e-05, | |
| "loss": 0.0125, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 10.866141732283465, | |
| "grad_norm": 0.18694327771663666, | |
| "learning_rate": 9.790246041015896e-05, | |
| "loss": 0.0112, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 10.94488188976378, | |
| "grad_norm": 0.21768535673618317, | |
| "learning_rate": 9.785480950320538e-05, | |
| "loss": 0.0121, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 11.023622047244094, | |
| "grad_norm": 0.16912485659122467, | |
| "learning_rate": 9.78066352637221e-05, | |
| "loss": 0.0109, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 11.10236220472441, | |
| "grad_norm": 0.15913233160972595, | |
| "learning_rate": 9.775793821853488e-05, | |
| "loss": 0.0115, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 11.181102362204724, | |
| "grad_norm": 0.15250848233699799, | |
| "learning_rate": 9.77087189001868e-05, | |
| "loss": 0.0123, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 11.259842519685039, | |
| "grad_norm": 0.17317131161689758, | |
| "learning_rate": 9.765897784693243e-05, | |
| "loss": 0.0117, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 11.338582677165354, | |
| "grad_norm": 0.23304998874664307, | |
| "learning_rate": 9.760871560273197e-05, | |
| "loss": 0.0107, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 11.417322834645669, | |
| "grad_norm": 0.2260117381811142, | |
| "learning_rate": 9.755793271724526e-05, | |
| "loss": 0.0113, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 11.496062992125985, | |
| "grad_norm": 0.20854035019874573, | |
| "learning_rate": 9.750662974582584e-05, | |
| "loss": 0.0156, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 11.5748031496063, | |
| "grad_norm": 0.18729598820209503, | |
| "learning_rate": 9.745480724951473e-05, | |
| "loss": 0.0115, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 11.653543307086615, | |
| "grad_norm": 0.1489574909210205, | |
| "learning_rate": 9.740246579503447e-05, | |
| "loss": 0.0122, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 11.73228346456693, | |
| "grad_norm": 0.16865724325180054, | |
| "learning_rate": 9.734960595478284e-05, | |
| "loss": 0.0121, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 11.811023622047244, | |
| "grad_norm": 0.1705121397972107, | |
| "learning_rate": 9.729622830682657e-05, | |
| "loss": 0.0117, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 11.88976377952756, | |
| "grad_norm": 0.12779462337493896, | |
| "learning_rate": 9.724233343489504e-05, | |
| "loss": 0.013, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 11.968503937007874, | |
| "grad_norm": 0.21109400689601898, | |
| "learning_rate": 9.718792192837396e-05, | |
| "loss": 0.0105, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 12.047244094488189, | |
| "grad_norm": 0.17350123822689056, | |
| "learning_rate": 9.713299438229886e-05, | |
| "loss": 0.0129, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 12.125984251968504, | |
| "grad_norm": 0.19555015861988068, | |
| "learning_rate": 9.707755139734855e-05, | |
| "loss": 0.0131, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 12.204724409448819, | |
| "grad_norm": 0.22949132323265076, | |
| "learning_rate": 9.702159357983866e-05, | |
| "loss": 0.0122, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 12.283464566929133, | |
| "grad_norm": 0.21299389004707336, | |
| "learning_rate": 9.696512154171492e-05, | |
| "loss": 0.013, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 12.362204724409448, | |
| "grad_norm": 0.2029636800289154, | |
| "learning_rate": 9.690813590054645e-05, | |
| "loss": 0.0127, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 12.440944881889763, | |
| "grad_norm": 0.2509428858757019, | |
| "learning_rate": 9.685063727951914e-05, | |
| "loss": 0.0115, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 12.519685039370078, | |
| "grad_norm": 0.17952832579612732, | |
| "learning_rate": 9.679262630742865e-05, | |
| "loss": 0.0123, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 12.598425196850394, | |
| "grad_norm": 0.17356553673744202, | |
| "learning_rate": 9.673410361867373e-05, | |
| "loss": 0.0133, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 12.67716535433071, | |
| "grad_norm": 0.2649160921573639, | |
| "learning_rate": 9.667506985324909e-05, | |
| "loss": 0.0116, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 12.755905511811024, | |
| "grad_norm": 0.209790900349617, | |
| "learning_rate": 9.661552565673855e-05, | |
| "loss": 0.0104, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 12.834645669291339, | |
| "grad_norm": 0.21641805768013, | |
| "learning_rate": 9.655547168030789e-05, | |
| "loss": 0.0129, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 12.913385826771654, | |
| "grad_norm": 0.2454116940498352, | |
| "learning_rate": 9.649490858069777e-05, | |
| "loss": 0.0104, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 12.992125984251969, | |
| "grad_norm": 0.21532991528511047, | |
| "learning_rate": 9.643383702021658e-05, | |
| "loss": 0.0111, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 13.070866141732283, | |
| "grad_norm": 0.22105859220027924, | |
| "learning_rate": 9.637225766673307e-05, | |
| "loss": 0.0098, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 13.149606299212598, | |
| "grad_norm": 0.16493800282478333, | |
| "learning_rate": 9.631017119366922e-05, | |
| "loss": 0.0107, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 13.228346456692913, | |
| "grad_norm": 0.18128368258476257, | |
| "learning_rate": 9.624757827999273e-05, | |
| "loss": 0.0117, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 13.307086614173228, | |
| "grad_norm": 0.2412339746952057, | |
| "learning_rate": 9.618447961020971e-05, | |
| "loss": 0.015, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 13.385826771653543, | |
| "grad_norm": 0.23829780519008636, | |
| "learning_rate": 9.612087587435707e-05, | |
| "loss": 0.0146, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 13.464566929133857, | |
| "grad_norm": 0.17435379326343536, | |
| "learning_rate": 9.605676776799508e-05, | |
| "loss": 0.012, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 13.543307086614174, | |
| "grad_norm": 0.26677370071411133, | |
| "learning_rate": 9.599215599219973e-05, | |
| "loss": 0.0119, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 13.622047244094489, | |
| "grad_norm": 0.17352107167243958, | |
| "learning_rate": 9.592704125355505e-05, | |
| "loss": 0.0119, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 13.700787401574804, | |
| "grad_norm": 0.1817910224199295, | |
| "learning_rate": 9.586142426414538e-05, | |
| "loss": 0.011, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 13.779527559055119, | |
| "grad_norm": 0.26779067516326904, | |
| "learning_rate": 9.57953057415476e-05, | |
| "loss": 0.0137, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 13.858267716535433, | |
| "grad_norm": 0.16992807388305664, | |
| "learning_rate": 9.572868640882328e-05, | |
| "loss": 0.0116, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 13.937007874015748, | |
| "grad_norm": 0.2475721836090088, | |
| "learning_rate": 9.56615669945108e-05, | |
| "loss": 0.013, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 14.015748031496063, | |
| "grad_norm": 0.30210572481155396, | |
| "learning_rate": 9.55939482326173e-05, | |
| "loss": 0.0124, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 14.094488188976378, | |
| "grad_norm": 0.19526968896389008, | |
| "learning_rate": 9.552583086261069e-05, | |
| "loss": 0.0135, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 14.173228346456693, | |
| "grad_norm": 0.1772489845752716, | |
| "learning_rate": 9.545721562941168e-05, | |
| "loss": 0.0119, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 14.251968503937007, | |
| "grad_norm": 0.20985430479049683, | |
| "learning_rate": 9.538810328338543e-05, | |
| "loss": 0.0106, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 14.330708661417322, | |
| "grad_norm": 0.2288864552974701, | |
| "learning_rate": 9.531849458033349e-05, | |
| "loss": 0.0121, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 14.409448818897637, | |
| "grad_norm": 0.14826878905296326, | |
| "learning_rate": 9.524839028148547e-05, | |
| "loss": 0.0109, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 14.488188976377952, | |
| "grad_norm": 0.24729447066783905, | |
| "learning_rate": 9.517779115349077e-05, | |
| "loss": 0.0122, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 14.566929133858268, | |
| "grad_norm": 0.23712359368801117, | |
| "learning_rate": 9.510669796841014e-05, | |
| "loss": 0.012, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 14.645669291338583, | |
| "grad_norm": 0.24393972754478455, | |
| "learning_rate": 9.503511150370727e-05, | |
| "loss": 0.014, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 14.724409448818898, | |
| "grad_norm": 0.16620883345603943, | |
| "learning_rate": 9.496303254224024e-05, | |
| "loss": 0.0124, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 14.803149606299213, | |
| "grad_norm": 0.19335606694221497, | |
| "learning_rate": 9.489046187225306e-05, | |
| "loss": 0.0116, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 14.881889763779528, | |
| "grad_norm": 0.17978379130363464, | |
| "learning_rate": 9.481740028736692e-05, | |
| "loss": 0.0127, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 14.960629921259843, | |
| "grad_norm": 0.15655072033405304, | |
| "learning_rate": 9.474384858657164e-05, | |
| "loss": 0.0112, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 15.039370078740157, | |
| "grad_norm": 0.13158245384693146, | |
| "learning_rate": 9.466980757421679e-05, | |
| "loss": 0.011, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 15.118110236220472, | |
| "grad_norm": 0.21858836710453033, | |
| "learning_rate": 9.459527806000305e-05, | |
| "loss": 0.0116, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 15.196850393700787, | |
| "grad_norm": 0.19522692263126373, | |
| "learning_rate": 9.452026085897325e-05, | |
| "loss": 0.0118, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 15.275590551181102, | |
| "grad_norm": 0.20890626311302185, | |
| "learning_rate": 9.444475679150348e-05, | |
| "loss": 0.0111, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 15.354330708661417, | |
| "grad_norm": 0.20746995508670807, | |
| "learning_rate": 9.436876668329411e-05, | |
| "loss": 0.0107, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 15.433070866141732, | |
| "grad_norm": 0.18878526985645294, | |
| "learning_rate": 9.429229136536079e-05, | |
| "loss": 0.0105, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 15.511811023622048, | |
| "grad_norm": 0.18786223232746124, | |
| "learning_rate": 9.421533167402534e-05, | |
| "loss": 0.0112, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 15.590551181102363, | |
| "grad_norm": 0.12698164582252502, | |
| "learning_rate": 9.413788845090666e-05, | |
| "loss": 0.011, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 15.669291338582678, | |
| "grad_norm": 0.22439169883728027, | |
| "learning_rate": 9.405996254291136e-05, | |
| "loss": 0.0113, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 15.748031496062993, | |
| "grad_norm": 0.16835476458072662, | |
| "learning_rate": 9.398155480222474e-05, | |
| "loss": 0.0111, | |
| "step": 2000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 10000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 79, | |
| "save_steps": 2000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 64, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |