| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 2481, |
| "global_step": 14888, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.006717270101430779, |
| "grad_norm": 4.896524906158447, |
| "learning_rate": 5.480984340044743e-06, |
| "loss": 5.1811, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.013434540202861557, |
| "grad_norm": 2.687243700027466, |
| "learning_rate": 1.1073825503355706e-05, |
| "loss": 4.4733, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.020151810304292337, |
| "grad_norm": 1.7079490423202515, |
| "learning_rate": 1.6666666666666667e-05, |
| "loss": 3.8, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.026869080405723115, |
| "grad_norm": 1.7120026350021362, |
| "learning_rate": 2.225950782997763e-05, |
| "loss": 3.3653, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.03358635050715389, |
| "grad_norm": 1.4521100521087646, |
| "learning_rate": 2.785234899328859e-05, |
| "loss": 3.1308, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.040303620608584674, |
| "grad_norm": 1.4319677352905273, |
| "learning_rate": 3.3445190156599555e-05, |
| "loss": 2.9757, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.04702089071001545, |
| "grad_norm": 1.053918719291687, |
| "learning_rate": 3.903803131991052e-05, |
| "loss": 2.8591, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.05373816081144623, |
| "grad_norm": 0.9901153445243835, |
| "learning_rate": 4.463087248322148e-05, |
| "loss": 2.7635, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.06045543091287701, |
| "grad_norm": 1.0325074195861816, |
| "learning_rate": 4.999999763367056e-05, |
| "loss": 2.6867, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.06717270101430778, |
| "grad_norm": 0.7236190438270569, |
| "learning_rate": 4.999840037833225e-05, |
| "loss": 2.6204, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.07388997111573857, |
| "grad_norm": 0.7032467126846313, |
| "learning_rate": 4.9993845429571105e-05, |
| "loss": 2.5692, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.08060724121716935, |
| "grad_norm": 0.5874780416488647, |
| "learning_rate": 4.9986333326307276e-05, |
| "loss": 2.5271, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.08732451131860013, |
| "grad_norm": 0.5380228161811829, |
| "learning_rate": 4.997586495733758e-05, |
| "loss": 2.498, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.0940417814200309, |
| "grad_norm": 0.5316600203514099, |
| "learning_rate": 4.996244156123031e-05, |
| "loss": 2.4672, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.10075905152146168, |
| "grad_norm": 0.4045695662498474, |
| "learning_rate": 4.994606472617869e-05, |
| "loss": 2.4403, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.10747632162289246, |
| "grad_norm": 0.4827907979488373, |
| "learning_rate": 4.9926736389813e-05, |
| "loss": 2.4176, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.11419359172432324, |
| "grad_norm": 0.4095781743526459, |
| "learning_rate": 4.99044588389713e-05, |
| "loss": 2.4055, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.12091086182575402, |
| "grad_norm": 0.3878771662712097, |
| "learning_rate": 4.9879234709428855e-05, |
| "loss": 2.3713, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.12762813192718478, |
| "grad_norm": 0.3336116671562195, |
| "learning_rate": 4.9851066985586316e-05, |
| "loss": 2.3656, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.13434540202861556, |
| "grad_norm": 0.3157025873661041, |
| "learning_rate": 4.981995900011657e-05, |
| "loss": 2.3446, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.14106267213004636, |
| "grad_norm": 0.305578351020813, |
| "learning_rate": 4.978591443357048e-05, |
| "loss": 2.3291, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.14777994223147714, |
| "grad_norm": 0.34396952390670776, |
| "learning_rate": 4.9748937313941414e-05, |
| "loss": 2.3196, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.15449721233290792, |
| "grad_norm": 0.32559117674827576, |
| "learning_rate": 4.970903201618863e-05, |
| "loss": 2.3039, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.1612144824343387, |
| "grad_norm": 0.28197962045669556, |
| "learning_rate": 4.966620326171969e-05, |
| "loss": 2.2978, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.16793175253576947, |
| "grad_norm": 0.3120637536048889, |
| "learning_rate": 4.962045611783186e-05, |
| "loss": 2.2814, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.17464902263720025, |
| "grad_norm": 0.31267744302749634, |
| "learning_rate": 4.9571795997112506e-05, |
| "loss": 2.2717, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.18136629273863103, |
| "grad_norm": 0.3214496374130249, |
| "learning_rate": 4.9520228656798784e-05, |
| "loss": 2.2573, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.1880835628400618, |
| "grad_norm": 0.36649203300476074, |
| "learning_rate": 4.946576019809639e-05, |
| "loss": 2.246, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.19480083294149259, |
| "grad_norm": 0.31962043046951294, |
| "learning_rate": 4.940839706545777e-05, |
| "loss": 2.2388, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.20151810304292336, |
| "grad_norm": 0.395917683839798, |
| "learning_rate": 4.9348146045819585e-05, |
| "loss": 2.2331, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.20823537314435414, |
| "grad_norm": 0.3968123495578766, |
| "learning_rate": 4.928501426779974e-05, |
| "loss": 2.2212, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.21495264324578492, |
| "grad_norm": 0.3795957863330841, |
| "learning_rate": 4.921900920085394e-05, |
| "loss": 2.2153, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.2216699133472157, |
| "grad_norm": 0.39966660737991333, |
| "learning_rate": 4.915013865439197e-05, |
| "loss": 2.2015, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.22838718344864647, |
| "grad_norm": 0.3845181465148926, |
| "learning_rate": 4.907841077685372e-05, |
| "loss": 2.204, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.23510445355007725, |
| "grad_norm": 0.48438650369644165, |
| "learning_rate": 4.900383405474503e-05, |
| "loss": 2.1953, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.24182172365150803, |
| "grad_norm": 0.4857676923274994, |
| "learning_rate": 4.892641731163372e-05, |
| "loss": 2.1815, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.2485389937529388, |
| "grad_norm": 0.6407597661018372, |
| "learning_rate": 4.8846169707105525e-05, |
| "loss": 2.1755, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.25525626385436956, |
| "grad_norm": 0.5669896006584167, |
| "learning_rate": 4.8763100735680445e-05, |
| "loss": 2.1698, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.26197353395580036, |
| "grad_norm": 0.5839398503303528, |
| "learning_rate": 4.867722022568936e-05, |
| "loss": 2.165, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.2686908040572311, |
| "grad_norm": 0.6011441946029663, |
| "learning_rate": 4.858853833811119e-05, |
| "loss": 2.1574, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.2754080741586619, |
| "grad_norm": 0.562146008014679, |
| "learning_rate": 4.849706556537074e-05, |
| "loss": 2.146, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.2821253442600927, |
| "grad_norm": 0.5889083743095398, |
| "learning_rate": 4.840281273009719e-05, |
| "loss": 2.1465, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.2888426143615235, |
| "grad_norm": 0.6204415559768677, |
| "learning_rate": 4.8305790983843744e-05, |
| "loss": 2.1457, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.2955598844629543, |
| "grad_norm": 0.6884430050849915, |
| "learning_rate": 4.820601180576811e-05, |
| "loss": 2.1367, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.30227715456438503, |
| "grad_norm": 0.6785121560096741, |
| "learning_rate": 4.810348700127441e-05, |
| "loss": 2.1377, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.30899442466581584, |
| "grad_norm": 0.667234480381012, |
| "learning_rate": 4.7998228700616384e-05, |
| "loss": 2.1279, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.3157116947672466, |
| "grad_norm": 0.5685933232307434, |
| "learning_rate": 4.789024935746223e-05, |
| "loss": 2.1165, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.3224289648686774, |
| "grad_norm": 0.7456786036491394, |
| "learning_rate": 4.7779561747421106e-05, |
| "loss": 2.1148, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.32914623497010814, |
| "grad_norm": 0.7704935669898987, |
| "learning_rate": 4.766617896653162e-05, |
| "loss": 2.1035, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.33331094243299525, |
| "eval_loss": 2.031871795654297, |
| "eval_runtime": 1653.4784, |
| "eval_samples_per_second": 72.217, |
| "eval_steps_per_second": 9.028, |
| "step": 2481 |
| }, |
| { |
| "epoch": 0.33586350507153895, |
| "grad_norm": 0.6733186841011047, |
| "learning_rate": 4.755011442971233e-05, |
| "loss": 2.1033, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.3425807751729697, |
| "grad_norm": 0.7341210246086121, |
| "learning_rate": 4.7431381869174574e-05, |
| "loss": 2.1, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.3492980452744005, |
| "grad_norm": 0.7517989277839661, |
| "learning_rate": 4.730999533279775e-05, |
| "loss": 2.0965, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.35601531537583125, |
| "grad_norm": 0.7430043816566467, |
| "learning_rate": 4.71859691824672e-05, |
| "loss": 2.0893, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.36273258547726206, |
| "grad_norm": 0.7112200856208801, |
| "learning_rate": 4.7059318092375016e-05, |
| "loss": 2.0856, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.3694498555786928, |
| "grad_norm": 0.8947717547416687, |
| "learning_rate": 4.693005704728384e-05, |
| "loss": 2.0832, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.3761671256801236, |
| "grad_norm": 0.8591889142990112, |
| "learning_rate": 4.679820134075395e-05, |
| "loss": 2.0761, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.38288439578155437, |
| "grad_norm": 0.933829665184021, |
| "learning_rate": 4.666376657333379e-05, |
| "loss": 2.0781, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.38960166588298517, |
| "grad_norm": 0.8115527629852295, |
| "learning_rate": 4.652676865071417e-05, |
| "loss": 2.0758, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.3963189359844159, |
| "grad_norm": 0.7666231989860535, |
| "learning_rate": 4.638722378184641e-05, |
| "loss": 2.0693, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.4030362060858467, |
| "grad_norm": 0.9425594806671143, |
| "learning_rate": 4.624514847702454e-05, |
| "loss": 2.0592, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.4097534761872775, |
| "grad_norm": 0.8913134336471558, |
| "learning_rate": 4.610055954593192e-05, |
| "loss": 2.0595, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.4164707462887083, |
| "grad_norm": 0.9630874991416931, |
| "learning_rate": 4.595347409565237e-05, |
| "loss": 2.051, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.42318801639013903, |
| "grad_norm": 0.7468298673629761, |
| "learning_rate": 4.5803909528646125e-05, |
| "loss": 2.0541, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.42990528649156984, |
| "grad_norm": 0.9688680768013, |
| "learning_rate": 4.565188354069091e-05, |
| "loss": 2.0416, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.4366225565930006, |
| "grad_norm": 0.9443581700325012, |
| "learning_rate": 4.549741411878819e-05, |
| "loss": 2.0356, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.4433398266944314, |
| "grad_norm": 0.8630342483520508, |
| "learning_rate": 4.534051953903511e-05, |
| "loss": 2.0455, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.45005709679586214, |
| "grad_norm": 0.8253282904624939, |
| "learning_rate": 4.518121836446206e-05, |
| "loss": 2.0378, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.45677436689729295, |
| "grad_norm": 0.8722043633460999, |
| "learning_rate": 4.501952944283647e-05, |
| "loss": 2.0345, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.4634916369987237, |
| "grad_norm": 0.833014965057373, |
| "learning_rate": 4.4855471904432804e-05, |
| "loss": 2.0393, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.4702089071001545, |
| "grad_norm": 0.8858036994934082, |
| "learning_rate": 4.468906515976912e-05, |
| "loss": 2.0342, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.47692617720158526, |
| "grad_norm": 0.7766168713569641, |
| "learning_rate": 4.452032889731056e-05, |
| "loss": 2.0255, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.48364344730301606, |
| "grad_norm": 1.0143591165542603, |
| "learning_rate": 4.434928308113986e-05, |
| "loss": 2.0183, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.4903607174044468, |
| "grad_norm": 0.8125320672988892, |
| "learning_rate": 4.417594794859533e-05, |
| "loss": 2.0202, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.4970779875058776, |
| "grad_norm": 0.7722158432006836, |
| "learning_rate": 4.4000344007876444e-05, |
| "loss": 2.0114, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.5037952576073084, |
| "grad_norm": 0.9473446607589722, |
| "learning_rate": 4.3822492035617404e-05, |
| "loss": 2.0125, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.5105125277087391, |
| "grad_norm": 0.854096531867981, |
| "learning_rate": 4.3642413074428964e-05, |
| "loss": 2.0131, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.51722979781017, |
| "grad_norm": 0.9034811854362488, |
| "learning_rate": 4.346012843040877e-05, |
| "loss": 2.0056, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.5239470679116007, |
| "grad_norm": 0.8732491135597229, |
| "learning_rate": 4.327565967062048e-05, |
| "loss": 2.0074, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.5306643380130315, |
| "grad_norm": 1.0670524835586548, |
| "learning_rate": 4.3089028620542094e-05, |
| "loss": 2.0069, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.5373816081144622, |
| "grad_norm": 0.8823853135108948, |
| "learning_rate": 4.2900257361483666e-05, |
| "loss": 2.0071, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.5440988782158931, |
| "grad_norm": 0.9907485842704773, |
| "learning_rate": 4.2709368227974724e-05, |
| "loss": 2.0083, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.5508161483173238, |
| "grad_norm": 1.0381219387054443, |
| "learning_rate": 4.251638380512174e-05, |
| "loss": 2.0028, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.5575334184187546, |
| "grad_norm": 0.8755475878715515, |
| "learning_rate": 4.232132692593602e-05, |
| "loss": 1.9974, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.5642506885201855, |
| "grad_norm": 1.062873125076294, |
| "learning_rate": 4.212422066863218e-05, |
| "loss": 1.9859, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.5709679586216162, |
| "grad_norm": 0.9898818731307983, |
| "learning_rate": 4.19250883538976e-05, |
| "loss": 1.9922, |
| "step": 4250 |
| }, |
| { |
| "epoch": 0.577685228723047, |
| "grad_norm": 0.9990688562393188, |
| "learning_rate": 4.172395354213331e-05, |
| "loss": 1.9935, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.5844024988244777, |
| "grad_norm": 0.8213431239128113, |
| "learning_rate": 4.152084003066636e-05, |
| "loss": 1.9867, |
| "step": 4350 |
| }, |
| { |
| "epoch": 0.5911197689259086, |
| "grad_norm": 0.913465142250061, |
| "learning_rate": 4.1315771850934295e-05, |
| "loss": 1.9809, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.5978370390273393, |
| "grad_norm": 0.84449303150177, |
| "learning_rate": 4.110877326564179e-05, |
| "loss": 1.9707, |
| "step": 4450 |
| }, |
| { |
| "epoch": 0.6045543091287701, |
| "grad_norm": 0.9786863327026367, |
| "learning_rate": 4.08998687658901e-05, |
| "loss": 1.9753, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.6112715792302008, |
| "grad_norm": 1.060813069343567, |
| "learning_rate": 4.06890830682793e-05, |
| "loss": 1.9804, |
| "step": 4550 |
| }, |
| { |
| "epoch": 0.6179888493316317, |
| "grad_norm": 0.9997825026512146, |
| "learning_rate": 4.047644111198398e-05, |
| "loss": 1.974, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.6247061194330624, |
| "grad_norm": 0.8858147859573364, |
| "learning_rate": 4.026196805580253e-05, |
| "loss": 1.9705, |
| "step": 4650 |
| }, |
| { |
| "epoch": 0.6314233895344932, |
| "grad_norm": 1.018781065940857, |
| "learning_rate": 4.004568927518054e-05, |
| "loss": 1.9666, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.6381406596359239, |
| "grad_norm": 0.9801031947135925, |
| "learning_rate": 3.982763035920836e-05, |
| "loss": 1.9634, |
| "step": 4750 |
| }, |
| { |
| "epoch": 0.6448579297373548, |
| "grad_norm": 0.9946137070655823, |
| "learning_rate": 3.960781710759365e-05, |
| "loss": 1.9661, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.6515751998387855, |
| "grad_norm": 0.9870684146881104, |
| "learning_rate": 3.9386275527608845e-05, |
| "loss": 1.9616, |
| "step": 4850 |
| }, |
| { |
| "epoch": 0.6582924699402163, |
| "grad_norm": 0.9816562533378601, |
| "learning_rate": 3.916303183101405e-05, |
| "loss": 1.969, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.665009740041647, |
| "grad_norm": 0.9809496402740479, |
| "learning_rate": 3.8938112430955834e-05, |
| "loss": 1.964, |
| "step": 4950 |
| }, |
| { |
| "epoch": 0.6666218848659905, |
| "eval_loss": 1.8941781520843506, |
| "eval_runtime": 1652.0834, |
| "eval_samples_per_second": 72.278, |
| "eval_steps_per_second": 9.035, |
| "step": 4962 |
| }, |
| { |
| "epoch": 0.6717270101430779, |
| "grad_norm": 0.8745808005332947, |
| "learning_rate": 3.871154393884212e-05, |
| "loss": 1.9555, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.6784442802445086, |
| "grad_norm": 0.97087562084198, |
| "learning_rate": 3.848335316119369e-05, |
| "loss": 1.9614, |
| "step": 5050 |
| }, |
| { |
| "epoch": 0.6851615503459394, |
| "grad_norm": 1.2281529903411865, |
| "learning_rate": 3.825356709647252e-05, |
| "loss": 1.9554, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.6918788204473701, |
| "grad_norm": 0.9577364921569824, |
| "learning_rate": 3.802221293188748e-05, |
| "loss": 1.9569, |
| "step": 5150 |
| }, |
| { |
| "epoch": 0.698596090548801, |
| "grad_norm": 1.0820568799972534, |
| "learning_rate": 3.7789318040177636e-05, |
| "loss": 1.9489, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.7053133606502318, |
| "grad_norm": 1.0203666687011719, |
| "learning_rate": 3.7554909976373685e-05, |
| "loss": 1.9484, |
| "step": 5250 |
| }, |
| { |
| "epoch": 0.7120306307516625, |
| "grad_norm": 0.9432654976844788, |
| "learning_rate": 3.731901647453772e-05, |
| "loss": 1.9529, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.7187479008530933, |
| "grad_norm": 1.0531628131866455, |
| "learning_rate": 3.708166544448189e-05, |
| "loss": 1.9436, |
| "step": 5350 |
| }, |
| { |
| "epoch": 0.7254651709545241, |
| "grad_norm": 0.9235082864761353, |
| "learning_rate": 3.6842884968466276e-05, |
| "loss": 1.9427, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.7321824410559549, |
| "grad_norm": 1.1586941480636597, |
| "learning_rate": 3.6602703297876276e-05, |
| "loss": 1.9427, |
| "step": 5450 |
| }, |
| { |
| "epoch": 0.7388997111573856, |
| "grad_norm": 1.0403027534484863, |
| "learning_rate": 3.636114884988004e-05, |
| "loss": 1.9445, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.7456169812588164, |
| "grad_norm": 0.9487152099609375, |
| "learning_rate": 3.611825020406631e-05, |
| "loss": 1.9377, |
| "step": 5550 |
| }, |
| { |
| "epoch": 0.7523342513602472, |
| "grad_norm": 0.9370021224021912, |
| "learning_rate": 3.5874036099063025e-05, |
| "loss": 1.939, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.759051521461678, |
| "grad_norm": 1.0592392683029175, |
| "learning_rate": 3.562853542913706e-05, |
| "loss": 1.9331, |
| "step": 5650 |
| }, |
| { |
| "epoch": 0.7657687915631087, |
| "grad_norm": 0.9699749946594238, |
| "learning_rate": 3.538177724077562e-05, |
| "loss": 1.9332, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.7724860616645395, |
| "grad_norm": 1.1636828184127808, |
| "learning_rate": 3.5133790729249585e-05, |
| "loss": 1.9382, |
| "step": 5750 |
| }, |
| { |
| "epoch": 0.7792033317659703, |
| "grad_norm": 0.9427869915962219, |
| "learning_rate": 3.488460523515927e-05, |
| "loss": 1.9295, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.7859206018674011, |
| "grad_norm": 1.0565423965454102, |
| "learning_rate": 3.4634250240963e-05, |
| "loss": 1.9335, |
| "step": 5850 |
| }, |
| { |
| "epoch": 0.7926378719688318, |
| "grad_norm": 1.0577119588851929, |
| "learning_rate": 3.4382755367488845e-05, |
| "loss": 1.9296, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.7993551420702626, |
| "grad_norm": 1.0106233358383179, |
| "learning_rate": 3.413015037043003e-05, |
| "loss": 1.9314, |
| "step": 5950 |
| }, |
| { |
| "epoch": 0.8060724121716935, |
| "grad_norm": 1.0226566791534424, |
| "learning_rate": 3.387646513682442e-05, |
| "loss": 1.9265, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.8127896822731242, |
| "grad_norm": 0.9026387333869934, |
| "learning_rate": 3.362172968151838e-05, |
| "loss": 1.9248, |
| "step": 6050 |
| }, |
| { |
| "epoch": 0.819506952374555, |
| "grad_norm": 0.881735622882843, |
| "learning_rate": 3.33659741436156e-05, |
| "loss": 1.9239, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.8262242224759858, |
| "grad_norm": 1.0480873584747314, |
| "learning_rate": 3.3109228782911125e-05, |
| "loss": 1.9172, |
| "step": 6150 |
| }, |
| { |
| "epoch": 0.8329414925774166, |
| "grad_norm": 0.9652755260467529, |
| "learning_rate": 3.2851523976311214e-05, |
| "loss": 1.9212, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.8396587626788473, |
| "grad_norm": 0.9710230231285095, |
| "learning_rate": 3.2592890214239254e-05, |
| "loss": 1.9129, |
| "step": 6250 |
| }, |
| { |
| "epoch": 0.8463760327802781, |
| "grad_norm": 1.0008606910705566, |
| "learning_rate": 3.2333358097028284e-05, |
| "loss": 1.9194, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.8530933028817089, |
| "grad_norm": 0.9385756850242615, |
| "learning_rate": 3.207295833130049e-05, |
| "loss": 1.9222, |
| "step": 6350 |
| }, |
| { |
| "epoch": 0.8598105729831397, |
| "grad_norm": 1.0019196271896362, |
| "learning_rate": 3.18117217263342e-05, |
| "loss": 1.9175, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.8665278430845704, |
| "grad_norm": 0.9140141010284424, |
| "learning_rate": 3.154967919041859e-05, |
| "loss": 1.9108, |
| "step": 6450 |
| }, |
| { |
| "epoch": 0.8732451131860012, |
| "grad_norm": 0.8793635368347168, |
| "learning_rate": 3.128686172719684e-05, |
| "loss": 1.908, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.879962383287432, |
| "grad_norm": 1.009559988975525, |
| "learning_rate": 3.102330043199787e-05, |
| "loss": 1.9126, |
| "step": 6550 |
| }, |
| { |
| "epoch": 0.8866796533888628, |
| "grad_norm": 0.9987242817878723, |
| "learning_rate": 3.07590264881573e-05, |
| "loss": 1.915, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.8933969234902935, |
| "grad_norm": 1.0023444890975952, |
| "learning_rate": 3.049407116332802e-05, |
| "loss": 1.9125, |
| "step": 6650 |
| }, |
| { |
| "epoch": 0.9001141935917243, |
| "grad_norm": 0.9578864574432373, |
| "learning_rate": 3.022846580578071e-05, |
| "loss": 1.9116, |
| "step": 6700 |
| }, |
| { |
| "epoch": 0.9068314636931551, |
| "grad_norm": 0.9548916220664978, |
| "learning_rate": 2.9962241840694872e-05, |
| "loss": 1.9059, |
| "step": 6750 |
| }, |
| { |
| "epoch": 0.9135487337945859, |
| "grad_norm": 1.1118357181549072, |
| "learning_rate": 2.9695430766440736e-05, |
| "loss": 1.9027, |
| "step": 6800 |
| }, |
| { |
| "epoch": 0.9202660038960166, |
| "grad_norm": 1.0644875764846802, |
| "learning_rate": 2.942806415085255e-05, |
| "loss": 1.9093, |
| "step": 6850 |
| }, |
| { |
| "epoch": 0.9269832739974474, |
| "grad_norm": 0.9042929410934448, |
| "learning_rate": 2.9160173627493603e-05, |
| "loss": 1.9007, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.9337005440988783, |
| "grad_norm": 0.9487243890762329, |
| "learning_rate": 2.889179089191349e-05, |
| "loss": 1.9017, |
| "step": 6950 |
| }, |
| { |
| "epoch": 0.940417814200309, |
| "grad_norm": 1.0207464694976807, |
| "learning_rate": 2.862294769789804e-05, |
| "loss": 1.9048, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.9471350843017398, |
| "grad_norm": 0.8947831392288208, |
| "learning_rate": 2.8353675853712365e-05, |
| "loss": 1.8949, |
| "step": 7050 |
| }, |
| { |
| "epoch": 0.9538523544031705, |
| "grad_norm": 1.0309513807296753, |
| "learning_rate": 2.8084007218337467e-05, |
| "loss": 1.8972, |
| "step": 7100 |
| }, |
| { |
| "epoch": 0.9605696245046014, |
| "grad_norm": 0.9352649450302124, |
| "learning_rate": 2.7813973697700813e-05, |
| "loss": 1.9065, |
| "step": 7150 |
| }, |
| { |
| "epoch": 0.9672868946060321, |
| "grad_norm": 0.9510485529899597, |
| "learning_rate": 2.754360724090137e-05, |
| "loss": 1.8943, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.9740041647074629, |
| "grad_norm": 0.9971142411231995, |
| "learning_rate": 2.7272939836429563e-05, |
| "loss": 1.8983, |
| "step": 7250 |
| }, |
| { |
| "epoch": 0.9807214348088936, |
| "grad_norm": 0.9219502806663513, |
| "learning_rate": 2.700200350838253e-05, |
| "loss": 1.9019, |
| "step": 7300 |
| }, |
| { |
| "epoch": 0.9874387049103245, |
| "grad_norm": 1.0496368408203125, |
| "learning_rate": 2.6730830312675182e-05, |
| "loss": 1.8948, |
| "step": 7350 |
| }, |
| { |
| "epoch": 0.9941559750117552, |
| "grad_norm": 0.9791837930679321, |
| "learning_rate": 2.6459452333247497e-05, |
| "loss": 1.8907, |
| "step": 7400 |
| }, |
| { |
| "epoch": 0.9999328272989857, |
| "eval_loss": 1.831755518913269, |
| "eval_runtime": 1652.2475, |
| "eval_samples_per_second": 72.271, |
| "eval_steps_per_second": 9.034, |
| "step": 7443 |
| }, |
| { |
| "epoch": 1.0008060724121717, |
| "grad_norm": 0.8881465196609497, |
| "learning_rate": 2.618790167826851e-05, |
| "loss": 1.8913, |
| "step": 7450 |
| }, |
| { |
| "epoch": 1.0075233425136025, |
| "grad_norm": 0.9141615033149719, |
| "learning_rate": 2.5916210476337416e-05, |
| "loss": 1.8922, |
| "step": 7500 |
| }, |
| { |
| "epoch": 1.0142406126150332, |
| "grad_norm": 0.9290676116943359, |
| "learning_rate": 2.5644410872682262e-05, |
| "loss": 1.8921, |
| "step": 7550 |
| }, |
| { |
| "epoch": 1.020957882716464, |
| "grad_norm": 0.9989870190620422, |
| "learning_rate": 2.5372535025356674e-05, |
| "loss": 1.8913, |
| "step": 7600 |
| }, |
| { |
| "epoch": 1.027675152817895, |
| "grad_norm": 0.8889585137367249, |
| "learning_rate": 2.5100615101435078e-05, |
| "loss": 1.8875, |
| "step": 7650 |
| }, |
| { |
| "epoch": 1.0343924229193255, |
| "grad_norm": 0.906791627407074, |
| "learning_rate": 2.4828683273206837e-05, |
| "loss": 1.8811, |
| "step": 7700 |
| }, |
| { |
| "epoch": 1.0411096930207564, |
| "grad_norm": 0.933408260345459, |
| "learning_rate": 2.4556771714369775e-05, |
| "loss": 1.8829, |
| "step": 7750 |
| }, |
| { |
| "epoch": 1.047826963122187, |
| "grad_norm": 0.9643042683601379, |
| "learning_rate": 2.4284912596223532e-05, |
| "loss": 1.8757, |
| "step": 7800 |
| }, |
| { |
| "epoch": 1.054544233223618, |
| "grad_norm": 1.0680428743362427, |
| "learning_rate": 2.4013138083863217e-05, |
| "loss": 1.8896, |
| "step": 7850 |
| }, |
| { |
| "epoch": 1.0612615033250488, |
| "grad_norm": 0.9255633354187012, |
| "learning_rate": 2.3741480332373772e-05, |
| "loss": 1.885, |
| "step": 7900 |
| }, |
| { |
| "epoch": 1.0679787734264794, |
| "grad_norm": 0.9109947681427002, |
| "learning_rate": 2.346997148302555e-05, |
| "loss": 1.8732, |
| "step": 7950 |
| }, |
| { |
| "epoch": 1.0746960435279103, |
| "grad_norm": 0.9063991904258728, |
| "learning_rate": 2.3198643659471493e-05, |
| "loss": 1.8901, |
| "step": 8000 |
| }, |
| { |
| "epoch": 1.0814133136293411, |
| "grad_norm": 1.0128731727600098, |
| "learning_rate": 2.2927528963946435e-05, |
| "loss": 1.8882, |
| "step": 8050 |
| }, |
| { |
| "epoch": 1.0881305837307718, |
| "grad_norm": 0.8854324221611023, |
| "learning_rate": 2.2656659473468877e-05, |
| "loss": 1.8848, |
| "step": 8100 |
| }, |
| { |
| "epoch": 1.0948478538322026, |
| "grad_norm": 1.0122106075286865, |
| "learning_rate": 2.238606723604583e-05, |
| "loss": 1.8738, |
| "step": 8150 |
| }, |
| { |
| "epoch": 1.1015651239336335, |
| "grad_norm": 1.0020389556884766, |
| "learning_rate": 2.2115784266881022e-05, |
| "loss": 1.8758, |
| "step": 8200 |
| }, |
| { |
| "epoch": 1.1082823940350641, |
| "grad_norm": 1.013856291770935, |
| "learning_rate": 2.1845842544587014e-05, |
| "loss": 1.879, |
| "step": 8250 |
| }, |
| { |
| "epoch": 1.114999664136495, |
| "grad_norm": 1.0340920686721802, |
| "learning_rate": 2.157627400740161e-05, |
| "loss": 1.874, |
| "step": 8300 |
| }, |
| { |
| "epoch": 1.1217169342379256, |
| "grad_norm": 0.9874864220619202, |
| "learning_rate": 2.1307110549409143e-05, |
| "loss": 1.8789, |
| "step": 8350 |
| }, |
| { |
| "epoch": 1.1284342043393565, |
| "grad_norm": 0.9384549856185913, |
| "learning_rate": 2.1038384016766856e-05, |
| "loss": 1.8736, |
| "step": 8400 |
| }, |
| { |
| "epoch": 1.1351514744407873, |
| "grad_norm": 1.1180434226989746, |
| "learning_rate": 2.0770126203937057e-05, |
| "loss": 1.8709, |
| "step": 8450 |
| }, |
| { |
| "epoch": 1.141868744542218, |
| "grad_norm": 1.0236446857452393, |
| "learning_rate": 2.0502368849925268e-05, |
| "loss": 1.8701, |
| "step": 8500 |
| }, |
| { |
| "epoch": 1.1485860146436488, |
| "grad_norm": 0.8483227491378784, |
| "learning_rate": 2.0235143634525144e-05, |
| "loss": 1.8744, |
| "step": 8550 |
| }, |
| { |
| "epoch": 1.1553032847450795, |
| "grad_norm": 1.0728415250778198, |
| "learning_rate": 1.9968482174570154e-05, |
| "loss": 1.8707, |
| "step": 8600 |
| }, |
| { |
| "epoch": 1.1620205548465103, |
| "grad_norm": 0.9894828796386719, |
| "learning_rate": 1.970241602019288e-05, |
| "loss": 1.8722, |
| "step": 8650 |
| }, |
| { |
| "epoch": 1.1687378249479412, |
| "grad_norm": 0.9536831378936768, |
| "learning_rate": 1.9436976651092144e-05, |
| "loss": 1.875, |
| "step": 8700 |
| }, |
| { |
| "epoch": 1.1754550950493718, |
| "grad_norm": 0.8805471658706665, |
| "learning_rate": 1.9172195472808457e-05, |
| "loss": 1.8671, |
| "step": 8750 |
| }, |
| { |
| "epoch": 1.1821723651508027, |
| "grad_norm": 0.9444993734359741, |
| "learning_rate": 1.890810381300831e-05, |
| "loss": 1.8639, |
| "step": 8800 |
| }, |
| { |
| "epoch": 1.1888896352522336, |
| "grad_norm": 0.9312206506729126, |
| "learning_rate": 1.8644732917777578e-05, |
| "loss": 1.8704, |
| "step": 8850 |
| }, |
| { |
| "epoch": 1.1956069053536642, |
| "grad_norm": 0.8733024001121521, |
| "learning_rate": 1.838211394792468e-05, |
| "loss": 1.8661, |
| "step": 8900 |
| }, |
| { |
| "epoch": 1.202324175455095, |
| "grad_norm": 0.9139480590820312, |
| "learning_rate": 1.812027797529372e-05, |
| "loss": 1.8733, |
| "step": 8950 |
| }, |
| { |
| "epoch": 1.209041445556526, |
| "grad_norm": 0.8931354880332947, |
| "learning_rate": 1.7859255979088268e-05, |
| "loss": 1.8613, |
| "step": 9000 |
| }, |
| { |
| "epoch": 1.2157587156579566, |
| "grad_norm": 0.8285694122314453, |
| "learning_rate": 1.7599078842206024e-05, |
| "loss": 1.8596, |
| "step": 9050 |
| }, |
| { |
| "epoch": 1.2224759857593874, |
| "grad_norm": 0.9247407913208008, |
| "learning_rate": 1.7339777347584896e-05, |
| "loss": 1.8789, |
| "step": 9100 |
| }, |
| { |
| "epoch": 1.229193255860818, |
| "grad_norm": 1.0588265657424927, |
| "learning_rate": 1.708138217456088e-05, |
| "loss": 1.8627, |
| "step": 9150 |
| }, |
| { |
| "epoch": 1.235910525962249, |
| "grad_norm": 0.8769893646240234, |
| "learning_rate": 1.6823923895238303e-05, |
| "loss": 1.8663, |
| "step": 9200 |
| }, |
| { |
| "epoch": 1.2426277960636798, |
| "grad_norm": 0.9970062375068665, |
| "learning_rate": 1.6567432970872587e-05, |
| "loss": 1.8632, |
| "step": 9250 |
| }, |
| { |
| "epoch": 1.2493450661651104, |
| "grad_norm": 0.9387636780738831, |
| "learning_rate": 1.6311939748266282e-05, |
| "loss": 1.8658, |
| "step": 9300 |
| }, |
| { |
| "epoch": 1.2560623362665413, |
| "grad_norm": 0.9504996538162231, |
| "learning_rate": 1.605747445617851e-05, |
| "loss": 1.8655, |
| "step": 9350 |
| }, |
| { |
| "epoch": 1.262779606367972, |
| "grad_norm": 0.9407314658164978, |
| "learning_rate": 1.5804067201748526e-05, |
| "loss": 1.863, |
| "step": 9400 |
| }, |
| { |
| "epoch": 1.2694968764694028, |
| "grad_norm": 0.8854203224182129, |
| "learning_rate": 1.55517479669335e-05, |
| "loss": 1.8636, |
| "step": 9450 |
| }, |
| { |
| "epoch": 1.2762141465708337, |
| "grad_norm": 0.9684790372848511, |
| "learning_rate": 1.530054660496125e-05, |
| "loss": 1.8571, |
| "step": 9500 |
| }, |
| { |
| "epoch": 1.2829314166722643, |
| "grad_norm": 0.896827757358551, |
| "learning_rate": 1.5050492836798091e-05, |
| "loss": 1.8549, |
| "step": 9550 |
| }, |
| { |
| "epoch": 1.2896486867736952, |
| "grad_norm": 0.8551830649375916, |
| "learning_rate": 1.4801616247632455e-05, |
| "loss": 1.8664, |
| "step": 9600 |
| }, |
| { |
| "epoch": 1.296365956875126, |
| "grad_norm": 0.911526620388031, |
| "learning_rate": 1.4553946283374475e-05, |
| "loss": 1.8639, |
| "step": 9650 |
| }, |
| { |
| "epoch": 1.3030832269765567, |
| "grad_norm": 0.8315178155899048, |
| "learning_rate": 1.4307512247172077e-05, |
| "loss": 1.854, |
| "step": 9700 |
| }, |
| { |
| "epoch": 1.3098004970779875, |
| "grad_norm": 0.8541605472564697, |
| "learning_rate": 1.4062343295943998e-05, |
| "loss": 1.8609, |
| "step": 9750 |
| }, |
| { |
| "epoch": 1.3165177671794184, |
| "grad_norm": 0.8474893569946289, |
| "learning_rate": 1.381846843693002e-05, |
| "loss": 1.8603, |
| "step": 9800 |
| }, |
| { |
| "epoch": 1.323235037280849, |
| "grad_norm": 0.8238804340362549, |
| "learning_rate": 1.357591652425904e-05, |
| "loss": 1.8509, |
| "step": 9850 |
| }, |
| { |
| "epoch": 1.3299523073822799, |
| "grad_norm": 0.8796683549880981, |
| "learning_rate": 1.3334716255535146e-05, |
| "loss": 1.8595, |
| "step": 9900 |
| }, |
| { |
| "epoch": 1.3331765970309666, |
| "eval_loss": 1.7995295524597168, |
| "eval_runtime": 1651.9195, |
| "eval_samples_per_second": 72.285, |
| "eval_steps_per_second": 9.036, |
| "step": 9924 |
| }, |
| { |
| "epoch": 1.3366695774837107, |
| "grad_norm": 0.8274658918380737, |
| "learning_rate": 1.309489616844225e-05, |
| "loss": 1.8547, |
| "step": 9950 |
| }, |
| { |
| "epoch": 1.3433868475851414, |
| "grad_norm": 1.110520362854004, |
| "learning_rate": 1.2856484637367655e-05, |
| "loss": 1.8543, |
| "step": 10000 |
| }, |
| { |
| "epoch": 1.3501041176865722, |
| "grad_norm": 0.991307258605957, |
| "learning_rate": 1.2619509870044926e-05, |
| "loss": 1.8542, |
| "step": 10050 |
| }, |
| { |
| "epoch": 1.3568213877880029, |
| "grad_norm": 1.0729873180389404, |
| "learning_rate": 1.2383999904216485e-05, |
| "loss": 1.855, |
| "step": 10100 |
| }, |
| { |
| "epoch": 1.3635386578894337, |
| "grad_norm": 0.9162331819534302, |
| "learning_rate": 1.2149982604316311e-05, |
| "loss": 1.862, |
| "step": 10150 |
| }, |
| { |
| "epoch": 1.3702559279908644, |
| "grad_norm": 1.0340936183929443, |
| "learning_rate": 1.1917485658173145e-05, |
| "loss": 1.845, |
| "step": 10200 |
| }, |
| { |
| "epoch": 1.3769731980922952, |
| "grad_norm": 0.9109688401222229, |
| "learning_rate": 1.1686536573734625e-05, |
| "loss": 1.854, |
| "step": 10250 |
| }, |
| { |
| "epoch": 1.383690468193726, |
| "grad_norm": 0.901867151260376, |
| "learning_rate": 1.1457162675812647e-05, |
| "loss": 1.8607, |
| "step": 10300 |
| }, |
| { |
| "epoch": 1.3904077382951567, |
| "grad_norm": 0.9229015111923218, |
| "learning_rate": 1.1229391102850428e-05, |
| "loss": 1.8508, |
| "step": 10350 |
| }, |
| { |
| "epoch": 1.3971250083965876, |
| "grad_norm": 0.8118278384208679, |
| "learning_rate": 1.1003248803711625e-05, |
| "loss": 1.8478, |
| "step": 10400 |
| }, |
| { |
| "epoch": 1.4038422784980185, |
| "grad_norm": 0.8655158281326294, |
| "learning_rate": 1.0778762534491849e-05, |
| "loss": 1.8583, |
| "step": 10450 |
| }, |
| { |
| "epoch": 1.410559548599449, |
| "grad_norm": 0.8829045295715332, |
| "learning_rate": 1.0555958855353029e-05, |
| "loss": 1.8536, |
| "step": 10500 |
| }, |
| { |
| "epoch": 1.41727681870088, |
| "grad_norm": 0.8320277333259583, |
| "learning_rate": 1.0334864127380931e-05, |
| "loss": 1.8546, |
| "step": 10550 |
| }, |
| { |
| "epoch": 1.4239940888023108, |
| "grad_norm": 0.8472936749458313, |
| "learning_rate": 1.0115504509466244e-05, |
| "loss": 1.8528, |
| "step": 10600 |
| }, |
| { |
| "epoch": 1.4307113589037415, |
| "grad_norm": 0.9551612734794617, |
| "learning_rate": 9.89790595520956e-06, |
| "loss": 1.846, |
| "step": 10650 |
| }, |
| { |
| "epoch": 1.4374286290051723, |
| "grad_norm": 0.8682100772857666, |
| "learning_rate": 9.6820942098507e-06, |
| "loss": 1.8541, |
| "step": 10700 |
| }, |
| { |
| "epoch": 1.4441458991066032, |
| "grad_norm": 0.7923955321311951, |
| "learning_rate": 9.468094807222633e-06, |
| "loss": 1.8502, |
| "step": 10750 |
| }, |
| { |
| "epoch": 1.4508631692080338, |
| "grad_norm": 0.7568480968475342, |
| "learning_rate": 9.255933066730449e-06, |
| "loss": 1.8622, |
| "step": 10800 |
| }, |
| { |
| "epoch": 1.4575804393094647, |
| "grad_norm": 0.9057785868644714, |
| "learning_rate": 9.045634090355667e-06, |
| "loss": 1.852, |
| "step": 10850 |
| }, |
| { |
| "epoch": 1.4642977094108955, |
| "grad_norm": 0.8364023566246033, |
| "learning_rate": 8.837222759686306e-06, |
| "loss": 1.8495, |
| "step": 10900 |
| }, |
| { |
| "epoch": 1.4710149795123262, |
| "grad_norm": 0.9518344402313232, |
| "learning_rate": 8.630723732972998e-06, |
| "loss": 1.8545, |
| "step": 10950 |
| }, |
| { |
| "epoch": 1.477732249613757, |
| "grad_norm": 0.8617038130760193, |
| "learning_rate": 8.426161442211552e-06, |
| "loss": 1.8491, |
| "step": 11000 |
| }, |
| { |
| "epoch": 1.4844495197151877, |
| "grad_norm": 0.9056522846221924, |
| "learning_rate": 8.22356009025225e-06, |
| "loss": 1.8539, |
| "step": 11050 |
| }, |
| { |
| "epoch": 1.4911667898166185, |
| "grad_norm": 0.7397828102111816, |
| "learning_rate": 8.022943647936315e-06, |
| "loss": 1.8494, |
| "step": 11100 |
| }, |
| { |
| "epoch": 1.4978840599180492, |
| "grad_norm": 0.7917934656143188, |
| "learning_rate": 7.82433585125977e-06, |
| "loss": 1.8504, |
| "step": 11150 |
| }, |
| { |
| "epoch": 1.50460133001948, |
| "grad_norm": 0.8637117147445679, |
| "learning_rate": 7.627760198565112e-06, |
| "loss": 1.8502, |
| "step": 11200 |
| }, |
| { |
| "epoch": 1.511318600120911, |
| "grad_norm": 1.025888204574585, |
| "learning_rate": 7.433239947761095e-06, |
| "loss": 1.8503, |
| "step": 11250 |
| }, |
| { |
| "epoch": 1.5180358702223415, |
| "grad_norm": 0.7819210886955261, |
| "learning_rate": 7.2407981135709735e-06, |
| "loss": 1.8465, |
| "step": 11300 |
| }, |
| { |
| "epoch": 1.5247531403237724, |
| "grad_norm": 0.9122748374938965, |
| "learning_rate": 7.050457464809495e-06, |
| "loss": 1.8467, |
| "step": 11350 |
| }, |
| { |
| "epoch": 1.5314704104252033, |
| "grad_norm": 0.915520429611206, |
| "learning_rate": 6.862240521689011e-06, |
| "loss": 1.8435, |
| "step": 11400 |
| }, |
| { |
| "epoch": 1.538187680526634, |
| "grad_norm": 0.7819466590881348, |
| "learning_rate": 6.676169553154993e-06, |
| "loss": 1.8476, |
| "step": 11450 |
| }, |
| { |
| "epoch": 1.5449049506280648, |
| "grad_norm": 0.9129722714424133, |
| "learning_rate": 6.492266574251249e-06, |
| "loss": 1.8534, |
| "step": 11500 |
| }, |
| { |
| "epoch": 1.5516222207294956, |
| "grad_norm": 0.834613561630249, |
| "learning_rate": 6.310553343515249e-06, |
| "loss": 1.8522, |
| "step": 11550 |
| }, |
| { |
| "epoch": 1.5583394908309263, |
| "grad_norm": 0.8443191647529602, |
| "learning_rate": 6.131051360403731e-06, |
| "loss": 1.844, |
| "step": 11600 |
| }, |
| { |
| "epoch": 1.5650567609323571, |
| "grad_norm": 0.858828604221344, |
| "learning_rate": 5.953781862748983e-06, |
| "loss": 1.8446, |
| "step": 11650 |
| }, |
| { |
| "epoch": 1.571774031033788, |
| "grad_norm": 0.7843521237373352, |
| "learning_rate": 5.778765824246099e-06, |
| "loss": 1.8465, |
| "step": 11700 |
| }, |
| { |
| "epoch": 1.5784913011352186, |
| "grad_norm": 0.9034683704376221, |
| "learning_rate": 5.6060239519714565e-06, |
| "loss": 1.8485, |
| "step": 11750 |
| }, |
| { |
| "epoch": 1.5852085712366493, |
| "grad_norm": 0.8224745392799377, |
| "learning_rate": 5.435576683932758e-06, |
| "loss": 1.844, |
| "step": 11800 |
| }, |
| { |
| "epoch": 1.5919258413380803, |
| "grad_norm": 0.8487153649330139, |
| "learning_rate": 5.267444186650908e-06, |
| "loss": 1.8399, |
| "step": 11850 |
| }, |
| { |
| "epoch": 1.598643111439511, |
| "grad_norm": 0.9923579692840576, |
| "learning_rate": 5.101646352773973e-06, |
| "loss": 1.8438, |
| "step": 11900 |
| }, |
| { |
| "epoch": 1.6053603815409416, |
| "grad_norm": 1.0088372230529785, |
| "learning_rate": 4.938202798723632e-06, |
| "loss": 1.8533, |
| "step": 11950 |
| }, |
| { |
| "epoch": 1.6120776516423725, |
| "grad_norm": 0.9827088713645935, |
| "learning_rate": 4.777132862374201e-06, |
| "loss": 1.8482, |
| "step": 12000 |
| }, |
| { |
| "epoch": 1.6187949217438034, |
| "grad_norm": 0.7875204086303711, |
| "learning_rate": 4.618455600764701e-06, |
| "loss": 1.846, |
| "step": 12050 |
| }, |
| { |
| "epoch": 1.625512191845234, |
| "grad_norm": 0.8284263610839844, |
| "learning_rate": 4.462189787844101e-06, |
| "loss": 1.8462, |
| "step": 12100 |
| }, |
| { |
| "epoch": 1.6322294619466649, |
| "grad_norm": 0.9832016229629517, |
| "learning_rate": 4.308353912250077e-06, |
| "loss": 1.8467, |
| "step": 12150 |
| }, |
| { |
| "epoch": 1.6389467320480957, |
| "grad_norm": 0.8272432088851929, |
| "learning_rate": 4.156966175121524e-06, |
| "loss": 1.8507, |
| "step": 12200 |
| }, |
| { |
| "epoch": 1.6456640021495264, |
| "grad_norm": 0.8643631935119629, |
| "learning_rate": 4.008044487945087e-06, |
| "loss": 1.851, |
| "step": 12250 |
| }, |
| { |
| "epoch": 1.6523812722509572, |
| "grad_norm": 0.821950376033783, |
| "learning_rate": 3.861606470435939e-06, |
| "loss": 1.8422, |
| "step": 12300 |
| }, |
| { |
| "epoch": 1.659098542352388, |
| "grad_norm": 0.8796666860580444, |
| "learning_rate": 3.717669448453126e-06, |
| "loss": 1.8456, |
| "step": 12350 |
| }, |
| { |
| "epoch": 1.6658158124538187, |
| "grad_norm": 0.8530693650245667, |
| "learning_rate": 3.5762504519496365e-06, |
| "loss": 1.8408, |
| "step": 12400 |
| }, |
| { |
| "epoch": 1.666487539463962, |
| "eval_loss": 1.7879396677017212, |
| "eval_runtime": 1652.1563, |
| "eval_samples_per_second": 72.275, |
| "eval_steps_per_second": 9.035, |
| "step": 12405 |
| }, |
| { |
| "epoch": 1.6725330825552496, |
| "grad_norm": 0.836791455745697, |
| "learning_rate": 3.437366212957502e-06, |
| "loss": 1.8465, |
| "step": 12450 |
| }, |
| { |
| "epoch": 1.6792503526566804, |
| "grad_norm": 0.7426266670227051, |
| "learning_rate": 3.3010331636081387e-06, |
| "loss": 1.8475, |
| "step": 12500 |
| }, |
| { |
| "epoch": 1.685967622758111, |
| "grad_norm": 0.8120975494384766, |
| "learning_rate": 3.167267434188173e-06, |
| "loss": 1.847, |
| "step": 12550 |
| }, |
| { |
| "epoch": 1.692684892859542, |
| "grad_norm": 0.7585613131523132, |
| "learning_rate": 3.0360848512309887e-06, |
| "loss": 1.8452, |
| "step": 12600 |
| }, |
| { |
| "epoch": 1.6994021629609728, |
| "grad_norm": 0.8330528736114502, |
| "learning_rate": 2.907500935644203e-06, |
| "loss": 1.8455, |
| "step": 12650 |
| }, |
| { |
| "epoch": 1.7061194330624034, |
| "grad_norm": 0.998671293258667, |
| "learning_rate": 2.781530900873305e-06, |
| "loss": 1.8499, |
| "step": 12700 |
| }, |
| { |
| "epoch": 1.712836703163834, |
| "grad_norm": 0.8430862426757812, |
| "learning_rate": 2.6581896511016614e-06, |
| "loss": 1.8425, |
| "step": 12750 |
| }, |
| { |
| "epoch": 1.7195539732652652, |
| "grad_norm": 0.9203202724456787, |
| "learning_rate": 2.537491779487147e-06, |
| "loss": 1.8374, |
| "step": 12800 |
| }, |
| { |
| "epoch": 1.7262712433666958, |
| "grad_norm": 1.0353131294250488, |
| "learning_rate": 2.419451566435532e-06, |
| "loss": 1.8439, |
| "step": 12850 |
| }, |
| { |
| "epoch": 1.7329885134681264, |
| "grad_norm": 0.8823174834251404, |
| "learning_rate": 2.3040829779108985e-06, |
| "loss": 1.8408, |
| "step": 12900 |
| }, |
| { |
| "epoch": 1.7397057835695573, |
| "grad_norm": 0.8599026799201965, |
| "learning_rate": 2.19139966378325e-06, |
| "loss": 1.8518, |
| "step": 12950 |
| }, |
| { |
| "epoch": 1.7464230536709882, |
| "grad_norm": 0.8370893597602844, |
| "learning_rate": 2.081414956213526e-06, |
| "loss": 1.8489, |
| "step": 13000 |
| }, |
| { |
| "epoch": 1.7531403237724188, |
| "grad_norm": 0.7799704670906067, |
| "learning_rate": 1.9741418680762013e-06, |
| "loss": 1.845, |
| "step": 13050 |
| }, |
| { |
| "epoch": 1.7598575938738497, |
| "grad_norm": 0.8190951943397522, |
| "learning_rate": 1.8695930914196664e-06, |
| "loss": 1.8458, |
| "step": 13100 |
| }, |
| { |
| "epoch": 1.7665748639752805, |
| "grad_norm": 0.8451455235481262, |
| "learning_rate": 1.7677809959645548e-06, |
| "loss": 1.8446, |
| "step": 13150 |
| }, |
| { |
| "epoch": 1.7732921340767112, |
| "grad_norm": 0.8454949855804443, |
| "learning_rate": 1.6687176276402261e-06, |
| "loss": 1.8373, |
| "step": 13200 |
| }, |
| { |
| "epoch": 1.780009404178142, |
| "grad_norm": 0.9424586296081543, |
| "learning_rate": 1.572414707159553e-06, |
| "loss": 1.8441, |
| "step": 13250 |
| }, |
| { |
| "epoch": 1.7867266742795729, |
| "grad_norm": 0.8610557913780212, |
| "learning_rate": 1.4788836286321606e-06, |
| "loss": 1.8398, |
| "step": 13300 |
| }, |
| { |
| "epoch": 1.7934439443810035, |
| "grad_norm": 0.9130797982215881, |
| "learning_rate": 1.3881354582163525e-06, |
| "loss": 1.8398, |
| "step": 13350 |
| }, |
| { |
| "epoch": 1.8001612144824344, |
| "grad_norm": 0.9039607048034668, |
| "learning_rate": 1.3001809328097914e-06, |
| "loss": 1.8472, |
| "step": 13400 |
| }, |
| { |
| "epoch": 1.8068784845838652, |
| "grad_norm": 0.8576995730400085, |
| "learning_rate": 1.2150304587791873e-06, |
| "loss": 1.8481, |
| "step": 13450 |
| }, |
| { |
| "epoch": 1.8135957546852959, |
| "grad_norm": 0.9080734252929688, |
| "learning_rate": 1.1326941107290351e-06, |
| "loss": 1.8402, |
| "step": 13500 |
| }, |
| { |
| "epoch": 1.8203130247867265, |
| "grad_norm": 0.8428635597229004, |
| "learning_rate": 1.053181630309666e-06, |
| "loss": 1.846, |
| "step": 13550 |
| }, |
| { |
| "epoch": 1.8270302948881576, |
| "grad_norm": 0.890152633190155, |
| "learning_rate": 9.765024250646238e-07, |
| "loss": 1.8529, |
| "step": 13600 |
| }, |
| { |
| "epoch": 1.8337475649895882, |
| "grad_norm": 0.803615391254425, |
| "learning_rate": 9.026655673176454e-07, |
| "loss": 1.8444, |
| "step": 13650 |
| }, |
| { |
| "epoch": 1.8404648350910189, |
| "grad_norm": 0.8416168093681335, |
| "learning_rate": 8.316797930992465e-07, |
| "loss": 1.8413, |
| "step": 13700 |
| }, |
| { |
| "epoch": 1.8471821051924497, |
| "grad_norm": 0.9183295369148254, |
| "learning_rate": 7.635535011131178e-07, |
| "loss": 1.8438, |
| "step": 13750 |
| }, |
| { |
| "epoch": 1.8538993752938806, |
| "grad_norm": 0.8079262375831604, |
| "learning_rate": 6.982947517424315e-07, |
| "loss": 1.8446, |
| "step": 13800 |
| }, |
| { |
| "epoch": 1.8606166453953112, |
| "grad_norm": 0.7838028073310852, |
| "learning_rate": 6.35911266096173e-07, |
| "loss": 1.845, |
| "step": 13850 |
| }, |
| { |
| "epoch": 1.867333915496742, |
| "grad_norm": 0.8391366004943848, |
| "learning_rate": 5.764104250956165e-07, |
| "loss": 1.8483, |
| "step": 13900 |
| }, |
| { |
| "epoch": 1.874051185598173, |
| "grad_norm": 0.9014426469802856, |
| "learning_rate": 5.197992686010511e-07, |
| "loss": 1.8438, |
| "step": 13950 |
| }, |
| { |
| "epoch": 1.8807684556996036, |
| "grad_norm": 0.9228802919387817, |
| "learning_rate": 4.660844945788501e-07, |
| "loss": 1.8474, |
| "step": 14000 |
| }, |
| { |
| "epoch": 1.8874857258010345, |
| "grad_norm": 0.7464848756790161, |
| "learning_rate": 4.1527245830901563e-07, |
| "loss": 1.8436, |
| "step": 14050 |
| }, |
| { |
| "epoch": 1.8942029959024653, |
| "grad_norm": 0.771722674369812, |
| "learning_rate": 3.6736917163322505e-07, |
| "loss": 1.8438, |
| "step": 14100 |
| }, |
| { |
| "epoch": 1.900920266003896, |
| "grad_norm": 0.7891733050346375, |
| "learning_rate": 3.2238030224356897e-07, |
| "loss": 1.8423, |
| "step": 14150 |
| }, |
| { |
| "epoch": 1.9076375361053268, |
| "grad_norm": 0.8450707197189331, |
| "learning_rate": 2.803111730119545e-07, |
| "loss": 1.843, |
| "step": 14200 |
| }, |
| { |
| "epoch": 1.9143548062067577, |
| "grad_norm": 0.9002699255943298, |
| "learning_rate": 2.4116676136033135e-07, |
| "loss": 1.8433, |
| "step": 14250 |
| }, |
| { |
| "epoch": 1.9210720763081883, |
| "grad_norm": 0.8603649735450745, |
| "learning_rate": 2.049516986717931e-07, |
| "loss": 1.8396, |
| "step": 14300 |
| }, |
| { |
| "epoch": 1.927789346409619, |
| "grad_norm": 0.8522045612335205, |
| "learning_rate": 1.7167026974261313e-07, |
| "loss": 1.8498, |
| "step": 14350 |
| }, |
| { |
| "epoch": 1.93450661651105, |
| "grad_norm": 0.8563119769096375, |
| "learning_rate": 1.4132641227528054e-07, |
| "loss": 1.841, |
| "step": 14400 |
| }, |
| { |
| "epoch": 1.9412238866124807, |
| "grad_norm": 0.8482009172439575, |
| "learning_rate": 1.1392371641262001e-07, |
| "loss": 1.8502, |
| "step": 14450 |
| }, |
| { |
| "epoch": 1.9479411567139113, |
| "grad_norm": 0.8035847544670105, |
| "learning_rate": 8.946542431300942e-08, |
| "loss": 1.8467, |
| "step": 14500 |
| }, |
| { |
| "epoch": 1.9546584268153424, |
| "grad_norm": 0.8062044382095337, |
| "learning_rate": 6.795442976679501e-08, |
| "loss": 1.8438, |
| "step": 14550 |
| }, |
| { |
| "epoch": 1.961375696916773, |
| "grad_norm": 0.7858136296272278, |
| "learning_rate": 4.939327785390691e-08, |
| "loss": 1.8431, |
| "step": 14600 |
| }, |
| { |
| "epoch": 1.9680929670182037, |
| "grad_norm": 0.7691812515258789, |
| "learning_rate": 3.37841646427417e-08, |
| "loss": 1.8442, |
| "step": 14650 |
| }, |
| { |
| "epoch": 1.9748102371196345, |
| "grad_norm": 0.8699848055839539, |
| "learning_rate": 2.1128936930320254e-08, |
| "loss": 1.8388, |
| "step": 14700 |
| }, |
| { |
| "epoch": 1.9815275072210654, |
| "grad_norm": 0.7812600135803223, |
| "learning_rate": 1.142909202380138e-08, |
| "loss": 1.8403, |
| "step": 14750 |
| }, |
| { |
| "epoch": 1.988244777322496, |
| "grad_norm": 0.856472373008728, |
| "learning_rate": 4.6857775633152305e-09, |
| "loss": 1.837, |
| "step": 14800 |
| }, |
| { |
| "epoch": 1.994962047423927, |
| "grad_norm": 0.7702723741531372, |
| "learning_rate": 8.997913861857888e-10, |
| "loss": 1.8449, |
| "step": 14850 |
| }, |
| { |
| "epoch": 1.9997984818969572, |
| "eval_loss": 1.786035418510437, |
| "eval_runtime": 1652.2214, |
| "eval_samples_per_second": 72.272, |
| "eval_steps_per_second": 9.035, |
| "step": 14886 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 14888, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 2481, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.539345213296214e+18, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |