Invalid JSON: Unexpected token 'I', ..."ad_norm": Infinity,
"... is not valid JSON
| { | |
| "best_global_step": 65000, | |
| "best_metric": 3.520042657852173, | |
| "best_model_checkpoint": "/scratch/cl5625/exceptions/models/cost_to_drop_frequency_3591/checkpoint-40000", | |
| "epoch": 20.0, | |
| "eval_steps": 1000, | |
| "global_step": 68700, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.014559431599790344, | |
| "grad_norm": 1.4049561023712158, | |
| "learning_rate": 0.000294, | |
| "loss": 8.4124, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.029118863199580687, | |
| "grad_norm": 0.8360756635665894, | |
| "learning_rate": 0.0005939999999999999, | |
| "loss": 6.7276, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.043678294799371034, | |
| "grad_norm": 0.42227354645729065, | |
| "learning_rate": 0.0005995714285714286, | |
| "loss": 6.3402, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.058237726399161374, | |
| "grad_norm": 0.9324970841407776, | |
| "learning_rate": 0.0005991341107871719, | |
| "loss": 6.1609, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.07279715799895171, | |
| "grad_norm": 0.4407173991203308, | |
| "learning_rate": 0.0005986967930029154, | |
| "loss": 6.0089, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.08735658959874207, | |
| "grad_norm": 0.4540535807609558, | |
| "learning_rate": 0.0005982594752186589, | |
| "loss": 5.8627, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.10191602119853241, | |
| "grad_norm": 0.4887666404247284, | |
| "learning_rate": 0.0005978221574344022, | |
| "loss": 5.74, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.11647545279832275, | |
| "grad_norm": 0.5809242129325867, | |
| "learning_rate": 0.0005973848396501457, | |
| "loss": 5.6281, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.1310348843981131, | |
| "grad_norm": 0.4683547914028168, | |
| "learning_rate": 0.0005969475218658892, | |
| "loss": 5.5265, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.14559431599790343, | |
| "grad_norm": 0.4089968502521515, | |
| "learning_rate": 0.0005965102040816326, | |
| "loss": 5.4214, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.1601537475976938, | |
| "grad_norm": 0.5466117858886719, | |
| "learning_rate": 0.000596072886297376, | |
| "loss": 5.3411, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.17471317919748414, | |
| "grad_norm": 0.3846788704395294, | |
| "learning_rate": 0.0005956355685131195, | |
| "loss": 5.2665, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.18927261079727448, | |
| "grad_norm": 0.4610619843006134, | |
| "learning_rate": 0.0005951982507288629, | |
| "loss": 5.2078, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.20383204239706482, | |
| "grad_norm": 0.41991209983825684, | |
| "learning_rate": 0.0005947609329446064, | |
| "loss": 5.1301, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.21839147399685516, | |
| "grad_norm": 0.4753279685974121, | |
| "learning_rate": 0.0005943236151603498, | |
| "loss": 5.0702, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.2329509055966455, | |
| "grad_norm": 0.4781185984611511, | |
| "learning_rate": 0.0005938862973760932, | |
| "loss": 5.0195, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.24751033719643586, | |
| "grad_norm": 0.41803014278411865, | |
| "learning_rate": 0.0005934489795918367, | |
| "loss": 4.971, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.2620697687962262, | |
| "grad_norm": 0.444289892911911, | |
| "learning_rate": 0.0005930116618075802, | |
| "loss": 4.9305, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.2766292003960165, | |
| "grad_norm": 0.4531804025173187, | |
| "learning_rate": 0.0005925743440233235, | |
| "loss": 4.8862, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.29118863199580686, | |
| "grad_norm": 0.4998404085636139, | |
| "learning_rate": 0.000592137026239067, | |
| "loss": 4.8266, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.29118863199580686, | |
| "eval_accuracy": 0.25396983481710367, | |
| "eval_loss": 4.760892868041992, | |
| "eval_runtime": 179.1934, | |
| "eval_samples_per_second": 92.877, | |
| "eval_steps_per_second": 5.809, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.30574806359559725, | |
| "grad_norm": 0.44976159930229187, | |
| "learning_rate": 0.0005916997084548104, | |
| "loss": 4.7891, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.3203074951953876, | |
| "grad_norm": 0.38453996181488037, | |
| "learning_rate": 0.0005912623906705539, | |
| "loss": 4.7294, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.33486692679517793, | |
| "grad_norm": 0.4332706928253174, | |
| "learning_rate": 0.0005908250728862974, | |
| "loss": 4.7002, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.3494263583949683, | |
| "grad_norm": 0.42371395230293274, | |
| "learning_rate": 0.0005903877551020407, | |
| "loss": 4.6808, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.3639857899947586, | |
| "grad_norm": 0.45705753564834595, | |
| "learning_rate": 0.0005899504373177842, | |
| "loss": 4.6327, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.37854522159454895, | |
| "grad_norm": 0.42063650488853455, | |
| "learning_rate": 0.0005895131195335277, | |
| "loss": 4.6117, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.3931046531943393, | |
| "grad_norm": 0.43308427929878235, | |
| "learning_rate": 0.0005890758017492711, | |
| "loss": 4.5751, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.40766408479412963, | |
| "grad_norm": 0.43480074405670166, | |
| "learning_rate": 0.0005886384839650145, | |
| "loss": 4.5591, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.42222351639392, | |
| "grad_norm": 0.45868223905563354, | |
| "learning_rate": 0.000588201166180758, | |
| "loss": 4.5263, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.4367829479937103, | |
| "grad_norm": 0.41984814405441284, | |
| "learning_rate": 0.0005877638483965014, | |
| "loss": 4.5044, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.45134237959350065, | |
| "grad_norm": 0.4139959216117859, | |
| "learning_rate": 0.0005873265306122449, | |
| "loss": 4.4968, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.465901811193291, | |
| "grad_norm": 0.38750138878822327, | |
| "learning_rate": 0.0005868892128279882, | |
| "loss": 4.4646, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.48046124279308133, | |
| "grad_norm": 0.41930243372917175, | |
| "learning_rate": 0.0005864518950437317, | |
| "loss": 4.4529, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.49502067439287173, | |
| "grad_norm": 0.41106143593788147, | |
| "learning_rate": 0.0005860145772594752, | |
| "loss": 4.4362, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.509580105992662, | |
| "grad_norm": 0.39897602796554565, | |
| "learning_rate": 0.0005855772594752186, | |
| "loss": 4.4112, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.5241395375924524, | |
| "grad_norm": 0.4214461147785187, | |
| "learning_rate": 0.000585139941690962, | |
| "loss": 4.404, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.5386989691922427, | |
| "grad_norm": 0.3887820541858673, | |
| "learning_rate": 0.0005847026239067055, | |
| "loss": 4.3787, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.553258400792033, | |
| "grad_norm": 0.3768806755542755, | |
| "learning_rate": 0.0005842653061224489, | |
| "loss": 4.3711, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.5678178323918234, | |
| "grad_norm": 0.3779532313346863, | |
| "learning_rate": 0.0005838279883381924, | |
| "loss": 4.3456, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.5823772639916137, | |
| "grad_norm": 0.3921726942062378, | |
| "learning_rate": 0.0005833906705539359, | |
| "loss": 4.3399, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.5823772639916137, | |
| "eval_accuracy": 0.2996934707950652, | |
| "eval_loss": 4.28386926651001, | |
| "eval_runtime": 179.6428, | |
| "eval_samples_per_second": 92.645, | |
| "eval_steps_per_second": 5.795, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.5969366955914042, | |
| "grad_norm": 0.38071900606155396, | |
| "learning_rate": 0.0005829533527696792, | |
| "loss": 4.3206, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.6114961271911945, | |
| "grad_norm": 0.4333866536617279, | |
| "learning_rate": 0.0005825160349854227, | |
| "loss": 4.316, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.6260555587909848, | |
| "grad_norm": 0.3910558223724365, | |
| "learning_rate": 0.0005820787172011661, | |
| "loss": 4.2961, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.6406149903907752, | |
| "grad_norm": 0.3819257318973541, | |
| "learning_rate": 0.0005816413994169096, | |
| "loss": 4.2951, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.6551744219905655, | |
| "grad_norm": 0.4080394506454468, | |
| "learning_rate": 0.000581204081632653, | |
| "loss": 4.2756, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.6697338535903559, | |
| "grad_norm": 0.37072518467903137, | |
| "learning_rate": 0.0005807667638483965, | |
| "loss": 4.2638, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.6842932851901462, | |
| "grad_norm": 0.3981825113296509, | |
| "learning_rate": 0.0005803294460641399, | |
| "loss": 4.2662, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.6988527167899365, | |
| "grad_norm": 0.384818971157074, | |
| "learning_rate": 0.0005798921282798834, | |
| "loss": 4.2509, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.7134121483897269, | |
| "grad_norm": 0.43530362844467163, | |
| "learning_rate": 0.0005794548104956267, | |
| "loss": 4.2352, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.7279715799895172, | |
| "grad_norm": 0.3544856607913971, | |
| "learning_rate": 0.0005790174927113702, | |
| "loss": 4.2268, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.7425310115893076, | |
| "grad_norm": 0.38703247904777527, | |
| "learning_rate": 0.0005785801749271137, | |
| "loss": 4.2107, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.7570904431890979, | |
| "grad_norm": 0.37904635071754456, | |
| "learning_rate": 0.000578142857142857, | |
| "loss": 4.1982, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.7716498747888882, | |
| "grad_norm": 0.41309526562690735, | |
| "learning_rate": 0.0005777055393586005, | |
| "loss": 4.1833, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.7862093063886786, | |
| "grad_norm": 0.42821475863456726, | |
| "learning_rate": 0.000577268221574344, | |
| "loss": 4.1892, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.8007687379884689, | |
| "grad_norm": 0.4209707975387573, | |
| "learning_rate": 0.0005768309037900874, | |
| "loss": 4.1834, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.8153281695882593, | |
| "grad_norm": 0.3531130254268646, | |
| "learning_rate": 0.0005763935860058308, | |
| "loss": 4.1801, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.8298876011880496, | |
| "grad_norm": 0.34633395075798035, | |
| "learning_rate": 0.0005759562682215744, | |
| "loss": 4.1681, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.84444703278784, | |
| "grad_norm": 0.3938649892807007, | |
| "learning_rate": 0.0005755189504373177, | |
| "loss": 4.1636, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.8590064643876303, | |
| "grad_norm": 0.3613823652267456, | |
| "learning_rate": 0.0005750816326530612, | |
| "loss": 4.1578, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.8735658959874206, | |
| "grad_norm": 0.3491958677768707, | |
| "learning_rate": 0.0005746443148688046, | |
| "loss": 4.1452, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.8735658959874206, | |
| "eval_accuracy": 0.31544864157201075, | |
| "eval_loss": 4.095163822174072, | |
| "eval_runtime": 179.6171, | |
| "eval_samples_per_second": 92.658, | |
| "eval_steps_per_second": 5.796, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.888125327587211, | |
| "grad_norm": 0.3552567958831787, | |
| "learning_rate": 0.000574206997084548, | |
| "loss": 4.1285, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.9026847591870013, | |
| "grad_norm": 0.35991519689559937, | |
| "learning_rate": 0.0005737696793002915, | |
| "loss": 4.132, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.9172441907867916, | |
| "grad_norm": 0.3861224949359894, | |
| "learning_rate": 0.000573332361516035, | |
| "loss": 4.1214, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.931803622386582, | |
| "grad_norm": 0.3921383023262024, | |
| "learning_rate": 0.0005728950437317784, | |
| "loss": 4.1157, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.9463630539863723, | |
| "grad_norm": 0.3566656708717346, | |
| "learning_rate": 0.0005724577259475218, | |
| "loss": 4.1088, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.9609224855861627, | |
| "grad_norm": 0.3769164979457855, | |
| "learning_rate": 0.0005720204081632652, | |
| "loss": 4.0963, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.975481917185953, | |
| "grad_norm": 0.3577769100666046, | |
| "learning_rate": 0.0005715830903790087, | |
| "loss": 4.1067, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.9900413487857435, | |
| "grad_norm": 0.35588538646698, | |
| "learning_rate": 0.0005711457725947522, | |
| "loss": 4.0912, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.0043678294799372, | |
| "grad_norm": 0.3580274283885956, | |
| "learning_rate": 0.0005707084548104955, | |
| "loss": 4.0849, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.0189272610797275, | |
| "grad_norm": 0.3520485758781433, | |
| "learning_rate": 0.000570271137026239, | |
| "loss": 4.0188, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.0334866926795179, | |
| "grad_norm": 0.3421690762042999, | |
| "learning_rate": 0.0005698338192419825, | |
| "loss": 4.0128, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 1.0480461242793082, | |
| "grad_norm": 0.3418625593185425, | |
| "learning_rate": 0.0005693965014577259, | |
| "loss": 4.0056, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.0626055558790986, | |
| "grad_norm": 0.34526926279067993, | |
| "learning_rate": 0.0005689591836734693, | |
| "loss": 4.0084, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 1.077164987478889, | |
| "grad_norm": 0.35390642285346985, | |
| "learning_rate": 0.0005685218658892128, | |
| "loss": 4.0061, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.0917244190786792, | |
| "grad_norm": 0.3431430459022522, | |
| "learning_rate": 0.0005680845481049562, | |
| "loss": 3.9994, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 1.1062838506784696, | |
| "grad_norm": 0.357334166765213, | |
| "learning_rate": 0.0005676472303206997, | |
| "loss": 4.0071, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.12084328227826, | |
| "grad_norm": 0.3587090075016022, | |
| "learning_rate": 0.000567209912536443, | |
| "loss": 3.985, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.1354027138780503, | |
| "grad_norm": 0.3586151599884033, | |
| "learning_rate": 0.0005667725947521865, | |
| "loss": 4.0047, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.1499621454778406, | |
| "grad_norm": 0.37636685371398926, | |
| "learning_rate": 0.00056633527696793, | |
| "loss": 3.9987, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.164521577077631, | |
| "grad_norm": 0.35518568754196167, | |
| "learning_rate": 0.0005658979591836735, | |
| "loss": 3.9904, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.164521577077631, | |
| "eval_accuracy": 0.32538388464653073, | |
| "eval_loss": 3.9894351959228516, | |
| "eval_runtime": 179.567, | |
| "eval_samples_per_second": 92.684, | |
| "eval_steps_per_second": 5.797, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.1790810086774213, | |
| "grad_norm": 0.3445068895816803, | |
| "learning_rate": 0.0005654606413994169, | |
| "loss": 3.9831, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.1936404402772116, | |
| "grad_norm": 0.3411754369735718, | |
| "learning_rate": 0.0005650233236151603, | |
| "loss": 3.9741, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.208199871877002, | |
| "grad_norm": 0.3622643053531647, | |
| "learning_rate": 0.0005645860058309037, | |
| "loss": 3.9812, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.2227593034767923, | |
| "grad_norm": 0.35340210795402527, | |
| "learning_rate": 0.0005641486880466472, | |
| "loss": 3.9853, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.2373187350765826, | |
| "grad_norm": 0.34644776582717896, | |
| "learning_rate": 0.0005637113702623907, | |
| "loss": 3.9733, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.251878166676373, | |
| "grad_norm": 0.33221983909606934, | |
| "learning_rate": 0.000563274052478134, | |
| "loss": 3.9601, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.2664375982761633, | |
| "grad_norm": 0.3372167646884918, | |
| "learning_rate": 0.0005628367346938775, | |
| "loss": 3.9708, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 1.2809970298759537, | |
| "grad_norm": 0.3629266321659088, | |
| "learning_rate": 0.0005623994169096209, | |
| "loss": 3.9556, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.295556461475744, | |
| "grad_norm": 0.31815558671951294, | |
| "learning_rate": 0.0005619620991253644, | |
| "loss": 3.9644, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 1.3101158930755343, | |
| "grad_norm": 0.3518199622631073, | |
| "learning_rate": 0.0005615247813411078, | |
| "loss": 3.9551, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.3246753246753247, | |
| "grad_norm": 0.3197888135910034, | |
| "learning_rate": 0.0005610874635568513, | |
| "loss": 3.9556, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 1.339234756275115, | |
| "grad_norm": 0.35236433148384094, | |
| "learning_rate": 0.0005606501457725947, | |
| "loss": 3.9573, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.3537941878749054, | |
| "grad_norm": 0.3366566002368927, | |
| "learning_rate": 0.0005602128279883382, | |
| "loss": 3.9619, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 1.3683536194746957, | |
| "grad_norm": 0.3635067939758301, | |
| "learning_rate": 0.0005597755102040816, | |
| "loss": 3.9568, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.382913051074486, | |
| "grad_norm": 0.3495481610298157, | |
| "learning_rate": 0.000559338192419825, | |
| "loss": 3.935, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 1.3974724826742764, | |
| "grad_norm": 0.34598347544670105, | |
| "learning_rate": 0.0005589008746355685, | |
| "loss": 3.9463, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.4120319142740667, | |
| "grad_norm": 0.32707110047340393, | |
| "learning_rate": 0.0005584635568513118, | |
| "loss": 3.9388, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 1.426591345873857, | |
| "grad_norm": 0.35207509994506836, | |
| "learning_rate": 0.0005580262390670554, | |
| "loss": 3.9363, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.4411507774736474, | |
| "grad_norm": 0.33082953095436096, | |
| "learning_rate": 0.0005575889212827988, | |
| "loss": 3.9443, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 1.4557102090734377, | |
| "grad_norm": 0.36195048689842224, | |
| "learning_rate": 0.0005571516034985422, | |
| "loss": 3.934, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.4557102090734377, | |
| "eval_accuracy": 0.3320231362585752, | |
| "eval_loss": 3.9117023944854736, | |
| "eval_runtime": 179.5118, | |
| "eval_samples_per_second": 92.713, | |
| "eval_steps_per_second": 5.799, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.470269640673228, | |
| "grad_norm": 0.3603370487689972, | |
| "learning_rate": 0.0005567142857142856, | |
| "loss": 3.9232, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 1.4848290722730184, | |
| "grad_norm": 0.3303501307964325, | |
| "learning_rate": 0.0005562769679300292, | |
| "loss": 3.929, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.4993885038728088, | |
| "grad_norm": 0.34812071919441223, | |
| "learning_rate": 0.0005558396501457725, | |
| "loss": 3.9186, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 1.5139479354725993, | |
| "grad_norm": 0.3245297372341156, | |
| "learning_rate": 0.000555402332361516, | |
| "loss": 3.9281, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.5285073670723897, | |
| "grad_norm": 0.32848072052001953, | |
| "learning_rate": 0.0005549650145772595, | |
| "loss": 3.9081, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 1.54306679867218, | |
| "grad_norm": 0.3524268865585327, | |
| "learning_rate": 0.0005545276967930028, | |
| "loss": 3.9169, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.5576262302719703, | |
| "grad_norm": 0.3273775279521942, | |
| "learning_rate": 0.0005540903790087463, | |
| "loss": 3.9057, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 1.5721856618717607, | |
| "grad_norm": 0.33142444491386414, | |
| "learning_rate": 0.0005536530612244898, | |
| "loss": 3.9117, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.586745093471551, | |
| "grad_norm": 0.35404613614082336, | |
| "learning_rate": 0.0005532157434402332, | |
| "loss": 3.9, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 1.6013045250713414, | |
| "grad_norm": 0.3326050341129303, | |
| "learning_rate": 0.0005527784256559766, | |
| "loss": 3.9023, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.6158639566711317, | |
| "grad_norm": 0.32253944873809814, | |
| "learning_rate": 0.00055234110787172, | |
| "loss": 3.9036, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 1.630423388270922, | |
| "grad_norm": 0.40896502137184143, | |
| "learning_rate": 0.0005519037900874635, | |
| "loss": 3.892, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.6449828198707124, | |
| "grad_norm": 0.33099985122680664, | |
| "learning_rate": 0.000551466472303207, | |
| "loss": 3.8921, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 1.6595422514705027, | |
| "grad_norm": 0.3134934902191162, | |
| "learning_rate": 0.0005510291545189503, | |
| "loss": 3.8986, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.674101683070293, | |
| "grad_norm": 0.32286426424980164, | |
| "learning_rate": 0.0005505918367346938, | |
| "loss": 3.8705, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 1.6886611146700834, | |
| "grad_norm": 0.3152390122413635, | |
| "learning_rate": 0.0005501545189504373, | |
| "loss": 3.8843, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 1.7032205462698737, | |
| "grad_norm": 0.3241208493709564, | |
| "learning_rate": 0.0005497172011661807, | |
| "loss": 3.8915, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 1.717779977869664, | |
| "grad_norm": 0.3297117054462433, | |
| "learning_rate": 0.0005492798833819241, | |
| "loss": 3.8959, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 1.7323394094694544, | |
| "grad_norm": 0.34585368633270264, | |
| "learning_rate": 0.0005488425655976676, | |
| "loss": 3.8631, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 1.7468988410692448, | |
| "grad_norm": 0.32093173265457153, | |
| "learning_rate": 0.000548405247813411, | |
| "loss": 3.8774, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.7468988410692448, | |
| "eval_accuracy": 0.3372265721042079, | |
| "eval_loss": 3.8566107749938965, | |
| "eval_runtime": 179.5862, | |
| "eval_samples_per_second": 92.674, | |
| "eval_steps_per_second": 5.797, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.761458272669035, | |
| "grad_norm": 0.3342028260231018, | |
| "learning_rate": 0.0005479679300291545, | |
| "loss": 3.8767, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 1.7760177042688254, | |
| "grad_norm": 0.331476628780365, | |
| "learning_rate": 0.000547530612244898, | |
| "loss": 3.8741, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 1.7905771358686158, | |
| "grad_norm": 0.3178947865962982, | |
| "learning_rate": 0.0005470932944606413, | |
| "loss": 3.8753, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 1.8051365674684061, | |
| "grad_norm": 0.33139607310295105, | |
| "learning_rate": 0.0005466559766763848, | |
| "loss": 3.8686, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 1.8196959990681965, | |
| "grad_norm": 0.35270482301712036, | |
| "learning_rate": 0.0005462186588921283, | |
| "loss": 3.8577, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 1.8342554306679868, | |
| "grad_norm": 0.3247964382171631, | |
| "learning_rate": 0.0005457813411078717, | |
| "loss": 3.8574, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 1.8488148622677771, | |
| "grad_norm": 0.33985435962677, | |
| "learning_rate": 0.0005453440233236151, | |
| "loss": 3.8546, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 1.8633742938675675, | |
| "grad_norm": 0.33400237560272217, | |
| "learning_rate": 0.0005449067055393585, | |
| "loss": 3.8636, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 1.8779337254673578, | |
| "grad_norm": 0.3367692232131958, | |
| "learning_rate": 0.0005444693877551019, | |
| "loss": 3.8718, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 1.8924931570671482, | |
| "grad_norm": 0.3267197608947754, | |
| "learning_rate": 0.0005440320699708455, | |
| "loss": 3.8507, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.9070525886669385, | |
| "grad_norm": 0.3389538824558258, | |
| "learning_rate": 0.0005435947521865888, | |
| "loss": 3.8546, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 1.9216120202667288, | |
| "grad_norm": 0.32694804668426514, | |
| "learning_rate": 0.0005431574344023323, | |
| "loss": 3.8391, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 1.9361714518665192, | |
| "grad_norm": 0.3353123366832733, | |
| "learning_rate": 0.0005427201166180758, | |
| "loss": 3.8435, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 1.9507308834663095, | |
| "grad_norm": 0.32406482100486755, | |
| "learning_rate": 0.0005422827988338192, | |
| "loss": 3.8409, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 1.9652903150660999, | |
| "grad_norm": 0.3334747850894928, | |
| "learning_rate": 0.0005418454810495626, | |
| "loss": 3.8506, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 1.9798497466658902, | |
| "grad_norm": 0.33217740058898926, | |
| "learning_rate": 0.0005414081632653061, | |
| "loss": 3.8396, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 1.9944091782656805, | |
| "grad_norm": 0.33468008041381836, | |
| "learning_rate": 0.0005409708454810495, | |
| "loss": 3.8407, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 2.0087356589598744, | |
| "grad_norm": 0.3196060359477997, | |
| "learning_rate": 0.0005405335276967929, | |
| "loss": 3.7913, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 2.0232950905596647, | |
| "grad_norm": 0.3573300540447235, | |
| "learning_rate": 0.0005400962099125365, | |
| "loss": 3.7409, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 2.037854522159455, | |
| "grad_norm": 0.3402981460094452, | |
| "learning_rate": 0.0005396588921282798, | |
| "loss": 3.7556, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.037854522159455, | |
| "eval_accuracy": 0.34190017535271905, | |
| "eval_loss": 3.809979200363159, | |
| "eval_runtime": 179.6501, | |
| "eval_samples_per_second": 92.641, | |
| "eval_steps_per_second": 5.795, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.0524139537592454, | |
| "grad_norm": 0.3510541319847107, | |
| "learning_rate": 0.0005392215743440233, | |
| "loss": 3.7422, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 2.0669733853590357, | |
| "grad_norm": 0.31116750836372375, | |
| "learning_rate": 0.0005387842565597666, | |
| "loss": 3.7475, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 2.081532816958826, | |
| "grad_norm": 0.3254874050617218, | |
| "learning_rate": 0.0005383469387755102, | |
| "loss": 3.7546, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 2.0960922485586164, | |
| "grad_norm": 0.3147241771221161, | |
| "learning_rate": 0.0005379096209912536, | |
| "loss": 3.7518, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 2.1106516801584068, | |
| "grad_norm": 0.3199782073497772, | |
| "learning_rate": 0.000537472303206997, | |
| "loss": 3.7659, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 2.125211111758197, | |
| "grad_norm": 0.3094785809516907, | |
| "learning_rate": 0.0005370349854227405, | |
| "loss": 3.7481, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 2.1397705433579874, | |
| "grad_norm": 0.3172190189361572, | |
| "learning_rate": 0.0005365976676384839, | |
| "loss": 3.7408, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 2.154329974957778, | |
| "grad_norm": 0.3381129801273346, | |
| "learning_rate": 0.0005361603498542273, | |
| "loss": 3.7448, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 2.168889406557568, | |
| "grad_norm": 0.3302014470100403, | |
| "learning_rate": 0.0005357230320699708, | |
| "loss": 3.7451, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 2.1834488381573585, | |
| "grad_norm": 0.34532982110977173, | |
| "learning_rate": 0.0005352857142857143, | |
| "loss": 3.7459, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.198008269757149, | |
| "grad_norm": 0.3262939751148224, | |
| "learning_rate": 0.0005348483965014576, | |
| "loss": 3.7466, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 2.212567701356939, | |
| "grad_norm": 0.33892711997032166, | |
| "learning_rate": 0.0005344110787172011, | |
| "loss": 3.7505, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 2.2271271329567295, | |
| "grad_norm": 0.3445602059364319, | |
| "learning_rate": 0.0005339737609329446, | |
| "loss": 3.7429, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 2.24168656455652, | |
| "grad_norm": 0.3161507248878479, | |
| "learning_rate": 0.000533536443148688, | |
| "loss": 3.7541, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 2.25624599615631, | |
| "grad_norm": 0.31178775429725647, | |
| "learning_rate": 0.0005330991253644314, | |
| "loss": 3.7447, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 2.2708054277561005, | |
| "grad_norm": 0.3178870975971222, | |
| "learning_rate": 0.0005326618075801749, | |
| "loss": 3.7506, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 2.285364859355891, | |
| "grad_norm": 0.3333457112312317, | |
| "learning_rate": 0.0005322244897959183, | |
| "loss": 3.7494, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 2.299924290955681, | |
| "grad_norm": 0.3204410672187805, | |
| "learning_rate": 0.0005317871720116618, | |
| "loss": 3.7474, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 2.3144837225554715, | |
| "grad_norm": 0.31767410039901733, | |
| "learning_rate": 0.0005313498542274051, | |
| "loss": 3.7368, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 2.329043154155262, | |
| "grad_norm": 0.33374062180519104, | |
| "learning_rate": 0.0005309125364431486, | |
| "loss": 3.7524, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.329043154155262, | |
| "eval_accuracy": 0.34463388108962084, | |
| "eval_loss": 3.7798807621002197, | |
| "eval_runtime": 179.8167, | |
| "eval_samples_per_second": 92.555, | |
| "eval_steps_per_second": 5.789, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.343602585755052, | |
| "grad_norm": 0.32286617159843445, | |
| "learning_rate": 0.0005304752186588921, | |
| "loss": 3.7492, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 2.3581620173548425, | |
| "grad_norm": 0.33228906989097595, | |
| "learning_rate": 0.0005300379008746355, | |
| "loss": 3.764, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 2.372721448954633, | |
| "grad_norm": 0.33857783675193787, | |
| "learning_rate": 0.000529600583090379, | |
| "loss": 3.759, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 2.3872808805544232, | |
| "grad_norm": 0.3177933394908905, | |
| "learning_rate": 0.0005291632653061224, | |
| "loss": 3.7536, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 2.4018403121542136, | |
| "grad_norm": 0.3171054720878601, | |
| "learning_rate": 0.0005287259475218658, | |
| "loss": 3.7403, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 2.416399743754004, | |
| "grad_norm": 0.32724741101264954, | |
| "learning_rate": 0.0005282886297376093, | |
| "loss": 3.7446, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 2.4309591753537942, | |
| "grad_norm": 0.3406330347061157, | |
| "learning_rate": 0.0005278513119533528, | |
| "loss": 3.7441, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 2.4455186069535846, | |
| "grad_norm": 0.3245644271373749, | |
| "learning_rate": 0.0005274139941690961, | |
| "loss": 3.7317, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 2.460078038553375, | |
| "grad_norm": 0.3408276438713074, | |
| "learning_rate": 0.0005269766763848396, | |
| "loss": 3.7373, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 2.4746374701531653, | |
| "grad_norm": 0.31394264101982117, | |
| "learning_rate": 0.0005265393586005831, | |
| "loss": 3.732, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 2.4891969017529556, | |
| "grad_norm": 0.3347412645816803, | |
| "learning_rate": 0.0005261020408163265, | |
| "loss": 3.7266, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 2.503756333352746, | |
| "grad_norm": 0.32223114371299744, | |
| "learning_rate": 0.0005256647230320699, | |
| "loss": 3.7293, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 2.5183157649525363, | |
| "grad_norm": 0.3145173490047455, | |
| "learning_rate": 0.0005252274052478134, | |
| "loss": 3.7471, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 2.5328751965523266, | |
| "grad_norm": 0.31143006682395935, | |
| "learning_rate": 0.0005247900874635568, | |
| "loss": 3.7394, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 2.547434628152117, | |
| "grad_norm": 0.3238007724285126, | |
| "learning_rate": 0.0005243527696793003, | |
| "loss": 3.7283, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 2.5619940597519073, | |
| "grad_norm": 0.3301667869091034, | |
| "learning_rate": 0.0005239154518950436, | |
| "loss": 3.7463, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 2.5765534913516976, | |
| "grad_norm": 0.32847797870635986, | |
| "learning_rate": 0.0005234781341107871, | |
| "loss": 3.7397, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 2.591112922951488, | |
| "grad_norm": 0.32561489939689636, | |
| "learning_rate": 0.0005230408163265306, | |
| "loss": 3.7437, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 2.6056723545512783, | |
| "grad_norm": 0.30937111377716064, | |
| "learning_rate": 0.000522603498542274, | |
| "loss": 3.7399, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 2.6202317861510687, | |
| "grad_norm": 0.32154905796051025, | |
| "learning_rate": 0.0005221661807580175, | |
| "loss": 3.7339, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 2.6202317861510687, | |
| "eval_accuracy": 0.3472925683629266, | |
| "eval_loss": 3.7506699562072754, | |
| "eval_runtime": 179.8206, | |
| "eval_samples_per_second": 92.553, | |
| "eval_steps_per_second": 5.789, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 2.634791217750859, | |
| "grad_norm": 0.317490816116333, | |
| "learning_rate": 0.0005217288629737609, | |
| "loss": 3.7263, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 2.6493506493506493, | |
| "grad_norm": 0.32750970125198364, | |
| "learning_rate": 0.0005212915451895043, | |
| "loss": 3.7324, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 2.6639100809504397, | |
| "grad_norm": 0.3290070593357086, | |
| "learning_rate": 0.0005208542274052477, | |
| "loss": 3.7314, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 2.67846951255023, | |
| "grad_norm": 0.34482887387275696, | |
| "learning_rate": 0.0005204169096209913, | |
| "loss": 3.7192, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 2.6930289441500204, | |
| "grad_norm": 0.31812381744384766, | |
| "learning_rate": 0.0005199795918367346, | |
| "loss": 3.7308, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 2.7075883757498107, | |
| "grad_norm": 0.33570706844329834, | |
| "learning_rate": 0.0005195422740524781, | |
| "loss": 3.7338, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 2.722147807349601, | |
| "grad_norm": 0.3004995584487915, | |
| "learning_rate": 0.0005191049562682216, | |
| "loss": 3.7224, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 2.7367072389493914, | |
| "grad_norm": 0.3277261555194855, | |
| "learning_rate": 0.000518667638483965, | |
| "loss": 3.7313, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 2.7512666705491817, | |
| "grad_norm": 0.3260866701602936, | |
| "learning_rate": 0.0005182303206997084, | |
| "loss": 3.7252, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 2.765826102148972, | |
| "grad_norm": 0.30772513151168823, | |
| "learning_rate": 0.0005177930029154519, | |
| "loss": 3.7263, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 2.7803855337487624, | |
| "grad_norm": 0.3158465027809143, | |
| "learning_rate": 0.0005173556851311953, | |
| "loss": 3.728, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 2.7949449653485527, | |
| "grad_norm": 0.31197673082351685, | |
| "learning_rate": 0.0005169183673469387, | |
| "loss": 3.7135, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 2.809504396948343, | |
| "grad_norm": 0.33720263838768005, | |
| "learning_rate": 0.0005164810495626821, | |
| "loss": 3.7205, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 2.8240638285481334, | |
| "grad_norm": 0.3222922086715698, | |
| "learning_rate": 0.0005160437317784256, | |
| "loss": 3.7212, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 2.8386232601479238, | |
| "grad_norm": 0.32163000106811523, | |
| "learning_rate": 0.0005156064139941691, | |
| "loss": 3.7303, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 2.853182691747714, | |
| "grad_norm": 0.29815468192100525, | |
| "learning_rate": 0.0005151690962099124, | |
| "loss": 3.7143, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 2.8677421233475044, | |
| "grad_norm": 0.3258896768093109, | |
| "learning_rate": 0.000514731778425656, | |
| "loss": 3.7076, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 2.882301554947295, | |
| "grad_norm": 0.32969552278518677, | |
| "learning_rate": 0.0005142944606413994, | |
| "loss": 3.7269, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 2.896860986547085, | |
| "grad_norm": 0.31835922598838806, | |
| "learning_rate": 0.0005138571428571428, | |
| "loss": 3.7207, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 2.9114204181468755, | |
| "grad_norm": 0.3245142102241516, | |
| "learning_rate": 0.0005134198250728862, | |
| "loss": 3.7167, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 2.9114204181468755, | |
| "eval_accuracy": 0.3495557037372717, | |
| "eval_loss": 3.7238857746124268, | |
| "eval_runtime": 179.8397, | |
| "eval_samples_per_second": 92.544, | |
| "eval_steps_per_second": 5.788, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 2.925979849746666, | |
| "grad_norm": 0.32630476355552673, | |
| "learning_rate": 0.0005129825072886297, | |
| "loss": 3.7083, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 2.940539281346456, | |
| "grad_norm": 0.3315964341163635, | |
| "learning_rate": 0.0005125451895043731, | |
| "loss": 3.7064, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 2.9550987129462465, | |
| "grad_norm": 0.31410086154937744, | |
| "learning_rate": 0.0005121078717201166, | |
| "loss": 3.7236, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 2.969658144546037, | |
| "grad_norm": 0.33839717507362366, | |
| "learning_rate": 0.0005116705539358601, | |
| "loss": 3.7078, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 2.984217576145827, | |
| "grad_norm": 0.32319313287734985, | |
| "learning_rate": 0.0005112332361516034, | |
| "loss": 3.7141, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 2.9987770077456175, | |
| "grad_norm": 0.3235074579715729, | |
| "learning_rate": 0.0005107959183673469, | |
| "loss": 3.7063, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 3.0131034884398114, | |
| "grad_norm": 0.314828097820282, | |
| "learning_rate": 0.0005103586005830903, | |
| "loss": 3.6245, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 3.0276629200396017, | |
| "grad_norm": 0.31607604026794434, | |
| "learning_rate": 0.0005099212827988338, | |
| "loss": 3.6112, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 3.042222351639392, | |
| "grad_norm": 0.35359737277030945, | |
| "learning_rate": 0.0005094839650145772, | |
| "loss": 3.609, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 3.0567817832391824, | |
| "grad_norm": 0.32654085755348206, | |
| "learning_rate": 0.0005090466472303206, | |
| "loss": 3.6166, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 3.0713412148389727, | |
| "grad_norm": 0.3420456051826477, | |
| "learning_rate": 0.0005086093294460641, | |
| "loss": 3.6039, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 3.085900646438763, | |
| "grad_norm": 0.32927215099334717, | |
| "learning_rate": 0.0005081720116618076, | |
| "loss": 3.6076, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 3.1004600780385534, | |
| "grad_norm": 0.32174116373062134, | |
| "learning_rate": 0.0005077346938775509, | |
| "loss": 3.62, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 3.1150195096383437, | |
| "grad_norm": 0.32081031799316406, | |
| "learning_rate": 0.0005072973760932944, | |
| "loss": 3.6198, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 3.129578941238134, | |
| "grad_norm": 0.3233294188976288, | |
| "learning_rate": 0.0005068600583090379, | |
| "loss": 3.6221, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 3.1441383728379244, | |
| "grad_norm": 0.3179484009742737, | |
| "learning_rate": 0.0005064227405247813, | |
| "loss": 3.6265, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 3.1586978044377148, | |
| "grad_norm": 0.3125128746032715, | |
| "learning_rate": 0.0005059854227405247, | |
| "loss": 3.6316, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 3.173257236037505, | |
| "grad_norm": 0.32463568449020386, | |
| "learning_rate": 0.0005055481049562682, | |
| "loss": 3.6245, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 3.1878166676372954, | |
| "grad_norm": 0.31310543417930603, | |
| "learning_rate": 0.0005051107871720116, | |
| "loss": 3.6185, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 3.2023760992370858, | |
| "grad_norm": 0.3464823067188263, | |
| "learning_rate": 0.0005046734693877551, | |
| "loss": 3.6204, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.2023760992370858, | |
| "eval_accuracy": 0.3516197697403503, | |
| "eval_loss": 3.7100203037261963, | |
| "eval_runtime": 180.2504, | |
| "eval_samples_per_second": 92.333, | |
| "eval_steps_per_second": 5.775, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.216935530836876, | |
| "grad_norm": 0.3277588486671448, | |
| "learning_rate": 0.0005042361516034986, | |
| "loss": 3.6268, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 3.2314949624366665, | |
| "grad_norm": 0.32706061005592346, | |
| "learning_rate": 0.0005037988338192419, | |
| "loss": 3.6153, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 3.246054394036457, | |
| "grad_norm": 0.31766435503959656, | |
| "learning_rate": 0.0005033615160349854, | |
| "loss": 3.6336, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 3.260613825636247, | |
| "grad_norm": 0.3006264269351959, | |
| "learning_rate": 0.0005029241982507288, | |
| "loss": 3.6275, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 3.2751732572360375, | |
| "grad_norm": 0.32919037342071533, | |
| "learning_rate": 0.0005024868804664723, | |
| "loss": 3.6301, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 3.289732688835828, | |
| "grad_norm": 0.3155740797519684, | |
| "learning_rate": 0.0005020495626822157, | |
| "loss": 3.6203, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 3.304292120435618, | |
| "grad_norm": 0.3527681529521942, | |
| "learning_rate": 0.0005016122448979591, | |
| "loss": 3.6288, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 3.3188515520354085, | |
| "grad_norm": 0.3135804831981659, | |
| "learning_rate": 0.0005011749271137026, | |
| "loss": 3.6415, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 3.333410983635199, | |
| "grad_norm": 0.3078667223453522, | |
| "learning_rate": 0.0005007376093294461, | |
| "loss": 3.6284, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 3.347970415234989, | |
| "grad_norm": 0.319755494594574, | |
| "learning_rate": 0.0005003002915451894, | |
| "loss": 3.6314, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 3.3625298468347795, | |
| "grad_norm": 0.32641854882240295, | |
| "learning_rate": 0.0004998629737609329, | |
| "loss": 3.629, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 3.37708927843457, | |
| "grad_norm": 0.3268803060054779, | |
| "learning_rate": 0.0004994256559766764, | |
| "loss": 3.6372, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 3.39164871003436, | |
| "grad_norm": 0.32382065057754517, | |
| "learning_rate": 0.0004989883381924198, | |
| "loss": 3.6286, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 3.4062081416341505, | |
| "grad_norm": 0.3158361613750458, | |
| "learning_rate": 0.0004985510204081632, | |
| "loss": 3.6329, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 3.420767573233941, | |
| "grad_norm": 0.31245240569114685, | |
| "learning_rate": 0.0004981137026239067, | |
| "loss": 3.6428, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 3.435327004833731, | |
| "grad_norm": 0.3362303078174591, | |
| "learning_rate": 0.0004976763848396501, | |
| "loss": 3.6369, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 3.4498864364335216, | |
| "grad_norm": 0.3208737373352051, | |
| "learning_rate": 0.0004972390670553935, | |
| "loss": 3.6428, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 3.464445868033312, | |
| "grad_norm": 0.3163570761680603, | |
| "learning_rate": 0.000496801749271137, | |
| "loss": 3.6239, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 3.4790052996331022, | |
| "grad_norm": 0.3181529641151428, | |
| "learning_rate": 0.0004963644314868804, | |
| "loss": 3.6303, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 3.4935647312328926, | |
| "grad_norm": 0.33231833577156067, | |
| "learning_rate": 0.0004959271137026239, | |
| "loss": 3.6358, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 3.4935647312328926, | |
| "eval_accuracy": 0.35356460577150667, | |
| "eval_loss": 3.6901774406433105, | |
| "eval_runtime": 180.1137, | |
| "eval_samples_per_second": 92.403, | |
| "eval_steps_per_second": 5.78, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 3.508124162832683, | |
| "grad_norm": 0.3368173837661743, | |
| "learning_rate": 0.0004954897959183672, | |
| "loss": 3.6389, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 3.5226835944324733, | |
| "grad_norm": 0.33402830362319946, | |
| "learning_rate": 0.0004950524781341108, | |
| "loss": 3.645, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 3.5372430260322636, | |
| "grad_norm": 0.33064502477645874, | |
| "learning_rate": 0.0004946151603498542, | |
| "loss": 3.6336, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 3.551802457632054, | |
| "grad_norm": 0.31694450974464417, | |
| "learning_rate": 0.0004941778425655976, | |
| "loss": 3.6325, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 3.5663618892318443, | |
| "grad_norm": 0.3069068193435669, | |
| "learning_rate": 0.0004937405247813411, | |
| "loss": 3.6246, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 3.5809213208316346, | |
| "grad_norm": 0.3142222464084625, | |
| "learning_rate": 0.0004933032069970845, | |
| "loss": 3.6453, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 3.595480752431425, | |
| "grad_norm": 0.3237994909286499, | |
| "learning_rate": 0.0004928658892128279, | |
| "loss": 3.6295, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 3.6100401840312153, | |
| "grad_norm": 0.30255311727523804, | |
| "learning_rate": 0.0004924285714285714, | |
| "loss": 3.6468, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 3.6245996156310056, | |
| "grad_norm": 0.3128635883331299, | |
| "learning_rate": 0.0004919912536443149, | |
| "loss": 3.6346, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 3.639159047230796, | |
| "grad_norm": 0.31057000160217285, | |
| "learning_rate": 0.0004915539358600582, | |
| "loss": 3.622, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 3.6537184788305863, | |
| "grad_norm": 0.3453236520290375, | |
| "learning_rate": 0.0004911166180758017, | |
| "loss": 3.6354, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 3.6682779104303767, | |
| "grad_norm": 0.3358878493309021, | |
| "learning_rate": 0.0004906793002915452, | |
| "loss": 3.6276, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 3.682837342030167, | |
| "grad_norm": 0.3207370638847351, | |
| "learning_rate": 0.0004902419825072886, | |
| "loss": 3.6358, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 3.6973967736299573, | |
| "grad_norm": 0.31057843565940857, | |
| "learning_rate": 0.000489804664723032, | |
| "loss": 3.6433, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 3.7119562052297477, | |
| "grad_norm": 0.32829779386520386, | |
| "learning_rate": 0.0004893673469387754, | |
| "loss": 3.6282, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 3.726515636829538, | |
| "grad_norm": 0.32469305396080017, | |
| "learning_rate": 0.0004889300291545189, | |
| "loss": 3.6353, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 3.7410750684293284, | |
| "grad_norm": 0.32968953251838684, | |
| "learning_rate": 0.0004884927113702624, | |
| "loss": 3.6307, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 3.755634500029119, | |
| "grad_norm": 0.3125181198120117, | |
| "learning_rate": 0.0004880553935860058, | |
| "loss": 3.6321, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 3.770193931628909, | |
| "grad_norm": 0.31494152545928955, | |
| "learning_rate": 0.0004876180758017492, | |
| "loss": 3.6359, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 3.7847533632287, | |
| "grad_norm": 0.32235443592071533, | |
| "learning_rate": 0.0004871807580174927, | |
| "loss": 3.6319, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 3.7847533632287, | |
| "eval_accuracy": 0.35539115250113085, | |
| "eval_loss": 3.6755480766296387, | |
| "eval_runtime": 179.9398, | |
| "eval_samples_per_second": 92.492, | |
| "eval_steps_per_second": 5.785, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 3.7993127948284897, | |
| "grad_norm": 0.3097991645336151, | |
| "learning_rate": 0.00048674344023323613, | |
| "loss": 3.6393, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 3.8138722264282805, | |
| "grad_norm": 0.3186699450016022, | |
| "learning_rate": 0.00048630612244897955, | |
| "loss": 3.6318, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 3.8284316580280704, | |
| "grad_norm": 0.3037383556365967, | |
| "learning_rate": 0.00048586880466472296, | |
| "loss": 3.6293, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 3.842991089627861, | |
| "grad_norm": 0.32788893580436707, | |
| "learning_rate": 0.0004854314868804664, | |
| "loss": 3.6152, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 3.857550521227651, | |
| "grad_norm": 0.3229829967021942, | |
| "learning_rate": 0.0004849941690962099, | |
| "loss": 3.6483, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 3.872109952827442, | |
| "grad_norm": 0.3292683959007263, | |
| "learning_rate": 0.0004845568513119533, | |
| "loss": 3.6381, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 3.8866693844272318, | |
| "grad_norm": 0.3210625648498535, | |
| "learning_rate": 0.00048411953352769677, | |
| "loss": 3.6269, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 3.9012288160270225, | |
| "grad_norm": 0.31549862027168274, | |
| "learning_rate": 0.0004836822157434402, | |
| "loss": 3.6213, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 3.9157882476268124, | |
| "grad_norm": 0.30793866515159607, | |
| "learning_rate": 0.00048324489795918365, | |
| "loss": 3.6309, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 3.930347679226603, | |
| "grad_norm": 0.3356075882911682, | |
| "learning_rate": 0.00048280758017492706, | |
| "loss": 3.6262, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 3.944907110826393, | |
| "grad_norm": 0.32913827896118164, | |
| "learning_rate": 0.00048237026239067053, | |
| "loss": 3.6213, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 3.959466542426184, | |
| "grad_norm": 0.3327690362930298, | |
| "learning_rate": 0.00048193294460641394, | |
| "loss": 3.6438, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 3.974025974025974, | |
| "grad_norm": 0.3101835250854492, | |
| "learning_rate": 0.00048149562682215735, | |
| "loss": 3.6296, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 3.9885854056257646, | |
| "grad_norm": 0.327761709690094, | |
| "learning_rate": 0.0004810583090379009, | |
| "loss": 3.6235, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 4.002911886319958, | |
| "grad_norm": 0.32982325553894043, | |
| "learning_rate": 0.0004806209912536443, | |
| "loss": 3.6143, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 4.017471317919749, | |
| "grad_norm": 0.32604551315307617, | |
| "learning_rate": 0.0004801836734693877, | |
| "loss": 3.511, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 4.032030749519539, | |
| "grad_norm": 0.3375633955001831, | |
| "learning_rate": 0.0004797463556851311, | |
| "loss": 3.5208, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 4.046590181119329, | |
| "grad_norm": 0.3174295127391815, | |
| "learning_rate": 0.00047930903790087463, | |
| "loss": 3.5217, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 4.061149612719119, | |
| "grad_norm": 0.3297431170940399, | |
| "learning_rate": 0.00047887172011661805, | |
| "loss": 3.5218, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 4.07570904431891, | |
| "grad_norm": 0.33495378494262695, | |
| "learning_rate": 0.00047843440233236146, | |
| "loss": 3.5213, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 4.07570904431891, | |
| "eval_accuracy": 0.35668316328168387, | |
| "eval_loss": 3.6667861938476562, | |
| "eval_runtime": 179.8727, | |
| "eval_samples_per_second": 92.527, | |
| "eval_steps_per_second": 5.787, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 4.0902684759187, | |
| "grad_norm": 0.3054860234260559, | |
| "learning_rate": 0.0004779970845481049, | |
| "loss": 3.5277, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 4.104827907518491, | |
| "grad_norm": 0.3263727128505707, | |
| "learning_rate": 0.00047755976676384834, | |
| "loss": 3.5327, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 4.119387339118281, | |
| "grad_norm": 0.3170093894004822, | |
| "learning_rate": 0.0004771224489795918, | |
| "loss": 3.5418, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 4.1339467707180715, | |
| "grad_norm": 0.33194735646247864, | |
| "learning_rate": 0.00047668513119533527, | |
| "loss": 3.5364, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 4.148506202317861, | |
| "grad_norm": 0.32043886184692383, | |
| "learning_rate": 0.0004762478134110787, | |
| "loss": 3.544, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 4.163065633917652, | |
| "grad_norm": 0.32483235001564026, | |
| "learning_rate": 0.0004758104956268221, | |
| "loss": 3.5385, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 4.177625065517442, | |
| "grad_norm": 0.3203752934932709, | |
| "learning_rate": 0.0004753731778425656, | |
| "loss": 3.5636, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 4.192184497117233, | |
| "grad_norm": 0.3080170452594757, | |
| "learning_rate": 0.00047493586005830903, | |
| "loss": 3.5453, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 4.206743928717023, | |
| "grad_norm": 0.3298153877258301, | |
| "learning_rate": 0.00047449854227405244, | |
| "loss": 3.5491, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 4.2213033603168135, | |
| "grad_norm": 0.3529611825942993, | |
| "learning_rate": 0.00047406122448979585, | |
| "loss": 3.544, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 4.235862791916603, | |
| "grad_norm": 0.3273563086986542, | |
| "learning_rate": 0.00047362390670553926, | |
| "loss": 3.5498, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 4.250422223516394, | |
| "grad_norm": 0.313999205827713, | |
| "learning_rate": 0.0004731865889212828, | |
| "loss": 3.5526, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 4.264981655116184, | |
| "grad_norm": 0.30790430307388306, | |
| "learning_rate": 0.0004727492711370262, | |
| "loss": 3.5544, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 4.279541086715975, | |
| "grad_norm": 0.33186236023902893, | |
| "learning_rate": 0.0004723119533527696, | |
| "loss": 3.5506, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 4.294100518315765, | |
| "grad_norm": 0.32786890864372253, | |
| "learning_rate": 0.0004718746355685131, | |
| "loss": 3.5475, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 4.308659949915556, | |
| "grad_norm": 0.3234544098377228, | |
| "learning_rate": 0.0004714373177842565, | |
| "loss": 3.5549, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 4.3232193815153455, | |
| "grad_norm": 0.31056949496269226, | |
| "learning_rate": 0.00047099999999999996, | |
| "loss": 3.5447, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 4.337778813115136, | |
| "grad_norm": 0.3284071087837219, | |
| "learning_rate": 0.0004705626822157434, | |
| "loss": 3.5585, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 4.352338244714926, | |
| "grad_norm": 0.32166486978530884, | |
| "learning_rate": 0.00047012536443148683, | |
| "loss": 3.5546, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 4.366897676314717, | |
| "grad_norm": 0.3296414613723755, | |
| "learning_rate": 0.00046968804664723025, | |
| "loss": 3.5562, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 4.366897676314717, | |
| "eval_accuracy": 0.3575879706129867, | |
| "eval_loss": 3.6574151515960693, | |
| "eval_runtime": 179.9318, | |
| "eval_samples_per_second": 92.496, | |
| "eval_steps_per_second": 5.786, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 4.381457107914507, | |
| "grad_norm": 0.31862205266952515, | |
| "learning_rate": 0.00046925072886297377, | |
| "loss": 3.5609, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 4.396016539514298, | |
| "grad_norm": 0.321135938167572, | |
| "learning_rate": 0.0004688134110787172, | |
| "loss": 3.5592, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 4.4105759711140875, | |
| "grad_norm": 0.34049704670906067, | |
| "learning_rate": 0.0004683760932944606, | |
| "loss": 3.5666, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 4.425135402713878, | |
| "grad_norm": 0.32759514451026917, | |
| "learning_rate": 0.000467938775510204, | |
| "loss": 3.5645, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 4.439694834313668, | |
| "grad_norm": 0.31559038162231445, | |
| "learning_rate": 0.00046750145772594747, | |
| "loss": 3.5424, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 4.454254265913459, | |
| "grad_norm": 0.31429657340049744, | |
| "learning_rate": 0.00046706413994169094, | |
| "loss": 3.5577, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 4.468813697513249, | |
| "grad_norm": 0.32119688391685486, | |
| "learning_rate": 0.00046662682215743435, | |
| "loss": 3.5645, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 4.48337312911304, | |
| "grad_norm": 0.32725510001182556, | |
| "learning_rate": 0.0004661895043731778, | |
| "loss": 3.558, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 4.4979325607128295, | |
| "grad_norm": 0.3302425742149353, | |
| "learning_rate": 0.00046575218658892123, | |
| "loss": 3.5645, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 4.51249199231262, | |
| "grad_norm": 0.33752188086509705, | |
| "learning_rate": 0.0004653148688046647, | |
| "loss": 3.5654, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 4.52705142391241, | |
| "grad_norm": 0.3348866105079651, | |
| "learning_rate": 0.0004648775510204081, | |
| "loss": 3.5587, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 4.541610855512201, | |
| "grad_norm": 0.33069008588790894, | |
| "learning_rate": 0.0004644402332361516, | |
| "loss": 3.5564, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 4.556170287111991, | |
| "grad_norm": 0.36258620023727417, | |
| "learning_rate": 0.000464002915451895, | |
| "loss": 3.5586, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 4.570729718711782, | |
| "grad_norm": 0.3146510422229767, | |
| "learning_rate": 0.0004635655976676384, | |
| "loss": 3.5612, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 4.585289150311572, | |
| "grad_norm": 0.3268812298774719, | |
| "learning_rate": 0.0004631282798833819, | |
| "loss": 3.5536, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 4.599848581911362, | |
| "grad_norm": 0.31493905186653137, | |
| "learning_rate": 0.00046269096209912533, | |
| "loss": 3.5717, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 4.614408013511152, | |
| "grad_norm": 0.3173486590385437, | |
| "learning_rate": 0.00046225364431486875, | |
| "loss": 3.5678, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 4.628967445110943, | |
| "grad_norm": 0.32398083806037903, | |
| "learning_rate": 0.00046181632653061216, | |
| "loss": 3.557, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 4.643526876710733, | |
| "grad_norm": 0.31683549284935, | |
| "learning_rate": 0.0004613790087463557, | |
| "loss": 3.5652, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 4.658086308310524, | |
| "grad_norm": 0.3226284682750702, | |
| "learning_rate": 0.0004609416909620991, | |
| "loss": 3.5583, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 4.658086308310524, | |
| "eval_accuracy": 0.35886363724551484, | |
| "eval_loss": 3.641108989715576, | |
| "eval_runtime": 179.9045, | |
| "eval_samples_per_second": 92.51, | |
| "eval_steps_per_second": 5.786, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 4.672645739910314, | |
| "grad_norm": 0.3244362771511078, | |
| "learning_rate": 0.0004605043731778425, | |
| "loss": 3.5653, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 4.687205171510104, | |
| "grad_norm": 0.3218280076980591, | |
| "learning_rate": 0.00046006705539358597, | |
| "loss": 3.5573, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 4.701764603109894, | |
| "grad_norm": 0.31557270884513855, | |
| "learning_rate": 0.0004596297376093294, | |
| "loss": 3.5697, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 4.716324034709685, | |
| "grad_norm": 0.32409724593162537, | |
| "learning_rate": 0.00045919241982507285, | |
| "loss": 3.5727, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 4.730883466309475, | |
| "grad_norm": 0.32196715474128723, | |
| "learning_rate": 0.0004587551020408163, | |
| "loss": 3.5696, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 4.745442897909266, | |
| "grad_norm": 0.3190127909183502, | |
| "learning_rate": 0.00045831778425655973, | |
| "loss": 3.5589, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 4.760002329509056, | |
| "grad_norm": 0.3492906391620636, | |
| "learning_rate": 0.00045788046647230314, | |
| "loss": 3.576, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 4.7745617611088464, | |
| "grad_norm": 0.3227944076061249, | |
| "learning_rate": 0.00045744314868804666, | |
| "loss": 3.5511, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 4.789121192708636, | |
| "grad_norm": 0.3169122040271759, | |
| "learning_rate": 0.0004570058309037901, | |
| "loss": 3.5684, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 4.803680624308427, | |
| "grad_norm": 0.31213343143463135, | |
| "learning_rate": 0.0004565685131195335, | |
| "loss": 3.5667, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 4.818240055908217, | |
| "grad_norm": 0.32593971490859985, | |
| "learning_rate": 0.0004561311953352769, | |
| "loss": 3.5661, | |
| "step": 16550 | |
| }, | |
| { | |
| "epoch": 4.832799487508008, | |
| "grad_norm": 0.33425310254096985, | |
| "learning_rate": 0.0004556938775510203, | |
| "loss": 3.5688, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 4.847358919107798, | |
| "grad_norm": 0.32003140449523926, | |
| "learning_rate": 0.00045525655976676383, | |
| "loss": 3.5601, | |
| "step": 16650 | |
| }, | |
| { | |
| "epoch": 4.8619183507075885, | |
| "grad_norm": 0.3596481382846832, | |
| "learning_rate": 0.00045481924198250724, | |
| "loss": 3.5688, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 4.876477782307378, | |
| "grad_norm": 0.3375333547592163, | |
| "learning_rate": 0.00045438192419825066, | |
| "loss": 3.5685, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 4.891037213907169, | |
| "grad_norm": 0.31676721572875977, | |
| "learning_rate": 0.0004539446064139941, | |
| "loss": 3.5581, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 4.905596645506959, | |
| "grad_norm": 0.3257509469985962, | |
| "learning_rate": 0.0004535072886297376, | |
| "loss": 3.5537, | |
| "step": 16850 | |
| }, | |
| { | |
| "epoch": 4.92015607710675, | |
| "grad_norm": 0.3176610767841339, | |
| "learning_rate": 0.000453069970845481, | |
| "loss": 3.5678, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 4.93471550870654, | |
| "grad_norm": 0.3168198764324188, | |
| "learning_rate": 0.00045263265306122447, | |
| "loss": 3.5499, | |
| "step": 16950 | |
| }, | |
| { | |
| "epoch": 4.9492749403063305, | |
| "grad_norm": 0.31883013248443604, | |
| "learning_rate": 0.0004521953352769679, | |
| "loss": 3.5668, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 4.9492749403063305, | |
| "eval_accuracy": 0.360385171601208, | |
| "eval_loss": 3.626793146133423, | |
| "eval_runtime": 180.0874, | |
| "eval_samples_per_second": 92.416, | |
| "eval_steps_per_second": 5.781, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 4.96383437190612, | |
| "grad_norm": 0.3429825007915497, | |
| "learning_rate": 0.0004517580174927113, | |
| "loss": 3.5693, | |
| "step": 17050 | |
| }, | |
| { | |
| "epoch": 4.978393803505911, | |
| "grad_norm": 0.31468144059181213, | |
| "learning_rate": 0.0004513206997084548, | |
| "loss": 3.5648, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 4.992953235105701, | |
| "grad_norm": 0.3186092972755432, | |
| "learning_rate": 0.0004508833819241982, | |
| "loss": 3.5611, | |
| "step": 17150 | |
| }, | |
| { | |
| "epoch": 5.007279715799895, | |
| "grad_norm": 0.32911449670791626, | |
| "learning_rate": 0.00045044606413994164, | |
| "loss": 3.5003, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 5.021839147399685, | |
| "grad_norm": 0.32932335138320923, | |
| "learning_rate": 0.00045000874635568505, | |
| "loss": 3.4462, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 5.036398578999476, | |
| "grad_norm": 0.3199908435344696, | |
| "learning_rate": 0.00044957142857142857, | |
| "loss": 3.4569, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 5.050958010599266, | |
| "grad_norm": 0.33716824650764465, | |
| "learning_rate": 0.000449134110787172, | |
| "loss": 3.4669, | |
| "step": 17350 | |
| }, | |
| { | |
| "epoch": 5.065517442199057, | |
| "grad_norm": 0.32985949516296387, | |
| "learning_rate": 0.0004486967930029154, | |
| "loss": 3.4787, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 5.080076873798847, | |
| "grad_norm": 0.3227981925010681, | |
| "learning_rate": 0.00044825947521865886, | |
| "loss": 3.46, | |
| "step": 17450 | |
| }, | |
| { | |
| "epoch": 5.094636305398637, | |
| "grad_norm": 0.32830196619033813, | |
| "learning_rate": 0.0004478221574344023, | |
| "loss": 3.4714, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 5.109195736998427, | |
| "grad_norm": 0.33184128999710083, | |
| "learning_rate": 0.00044738483965014574, | |
| "loss": 3.4636, | |
| "step": 17550 | |
| }, | |
| { | |
| "epoch": 5.123755168598218, | |
| "grad_norm": 0.3285403251647949, | |
| "learning_rate": 0.00044694752186588915, | |
| "loss": 3.4711, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 5.138314600198008, | |
| "grad_norm": 0.3541177809238434, | |
| "learning_rate": 0.0004465102040816326, | |
| "loss": 3.4806, | |
| "step": 17650 | |
| }, | |
| { | |
| "epoch": 5.152874031797799, | |
| "grad_norm": 0.3223034143447876, | |
| "learning_rate": 0.00044607288629737603, | |
| "loss": 3.4813, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 5.167433463397589, | |
| "grad_norm": 0.3218257427215576, | |
| "learning_rate": 0.0004456355685131195, | |
| "loss": 3.4826, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 5.1819928949973795, | |
| "grad_norm": 0.3309643566608429, | |
| "learning_rate": 0.00044519825072886297, | |
| "loss": 3.4888, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 5.196552326597169, | |
| "grad_norm": 0.3264036774635315, | |
| "learning_rate": 0.0004447609329446064, | |
| "loss": 3.483, | |
| "step": 17850 | |
| }, | |
| { | |
| "epoch": 5.21111175819696, | |
| "grad_norm": 0.324790358543396, | |
| "learning_rate": 0.0004443236151603498, | |
| "loss": 3.4896, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 5.22567118979675, | |
| "grad_norm": 0.3223564922809601, | |
| "learning_rate": 0.0004438862973760932, | |
| "loss": 3.4711, | |
| "step": 17950 | |
| }, | |
| { | |
| "epoch": 5.240230621396541, | |
| "grad_norm": 0.33800962567329407, | |
| "learning_rate": 0.0004434489795918367, | |
| "loss": 3.4813, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 5.240230621396541, | |
| "eval_accuracy": 0.36069406424049744, | |
| "eval_loss": 3.6294045448303223, | |
| "eval_runtime": 179.9519, | |
| "eval_samples_per_second": 92.486, | |
| "eval_steps_per_second": 5.785, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 5.254790052996331, | |
| "grad_norm": 0.3145381510257721, | |
| "learning_rate": 0.00044301166180758014, | |
| "loss": 3.4916, | |
| "step": 18050 | |
| }, | |
| { | |
| "epoch": 5.2693494845961215, | |
| "grad_norm": 0.33484500646591187, | |
| "learning_rate": 0.00044257434402332355, | |
| "loss": 3.5041, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 5.283908916195911, | |
| "grad_norm": 0.3385532796382904, | |
| "learning_rate": 0.000442137026239067, | |
| "loss": 3.4968, | |
| "step": 18150 | |
| }, | |
| { | |
| "epoch": 5.298468347795702, | |
| "grad_norm": 0.32390275597572327, | |
| "learning_rate": 0.0004416997084548105, | |
| "loss": 3.4945, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 5.313027779395492, | |
| "grad_norm": 0.3384283483028412, | |
| "learning_rate": 0.0004412623906705539, | |
| "loss": 3.4902, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 5.327587210995283, | |
| "grad_norm": 0.3334660530090332, | |
| "learning_rate": 0.00044082507288629736, | |
| "loss": 3.4961, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 5.342146642595073, | |
| "grad_norm": 0.32754096388816833, | |
| "learning_rate": 0.0004403877551020408, | |
| "loss": 3.5051, | |
| "step": 18350 | |
| }, | |
| { | |
| "epoch": 5.3567060741948636, | |
| "grad_norm": 0.3279802203178406, | |
| "learning_rate": 0.0004399504373177842, | |
| "loss": 3.5005, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 5.3712655057946534, | |
| "grad_norm": 0.3342030346393585, | |
| "learning_rate": 0.0004395131195335277, | |
| "loss": 3.4931, | |
| "step": 18450 | |
| }, | |
| { | |
| "epoch": 5.385824937394444, | |
| "grad_norm": 0.3162689805030823, | |
| "learning_rate": 0.0004390758017492711, | |
| "loss": 3.4952, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 5.400384368994234, | |
| "grad_norm": 0.32114139199256897, | |
| "learning_rate": 0.00043863848396501453, | |
| "loss": 3.4999, | |
| "step": 18550 | |
| }, | |
| { | |
| "epoch": 5.414943800594025, | |
| "grad_norm": 0.32540494203567505, | |
| "learning_rate": 0.00043820116618075794, | |
| "loss": 3.5038, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 5.429503232193815, | |
| "grad_norm": 0.33477944135665894, | |
| "learning_rate": 0.00043776384839650147, | |
| "loss": 3.4941, | |
| "step": 18650 | |
| }, | |
| { | |
| "epoch": 5.444062663793606, | |
| "grad_norm": 0.3286316990852356, | |
| "learning_rate": 0.0004373265306122449, | |
| "loss": 3.502, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 5.4586220953933955, | |
| "grad_norm": 0.3312956690788269, | |
| "learning_rate": 0.0004368892128279883, | |
| "loss": 3.5054, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 5.473181526993186, | |
| "grad_norm": 0.33420413732528687, | |
| "learning_rate": 0.0004364518950437317, | |
| "loss": 3.5176, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 5.487740958592976, | |
| "grad_norm": 0.312959223985672, | |
| "learning_rate": 0.00043601457725947517, | |
| "loss": 3.5112, | |
| "step": 18850 | |
| }, | |
| { | |
| "epoch": 5.502300390192767, | |
| "grad_norm": 0.31764182448387146, | |
| "learning_rate": 0.00043557725947521864, | |
| "loss": 3.4987, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 5.516859821792557, | |
| "grad_norm": 0.3139015734195709, | |
| "learning_rate": 0.00043513994169096205, | |
| "loss": 3.4989, | |
| "step": 18950 | |
| }, | |
| { | |
| "epoch": 5.531419253392348, | |
| "grad_norm": 0.32444003224372864, | |
| "learning_rate": 0.0004347026239067055, | |
| "loss": 3.4996, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 5.531419253392348, | |
| "eval_accuracy": 0.3617237455660619, | |
| "eval_loss": 3.6197915077209473, | |
| "eval_runtime": 180.0468, | |
| "eval_samples_per_second": 92.437, | |
| "eval_steps_per_second": 5.782, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 5.5459786849921375, | |
| "grad_norm": 0.32666343450546265, | |
| "learning_rate": 0.0004342653061224489, | |
| "loss": 3.5084, | |
| "step": 19050 | |
| }, | |
| { | |
| "epoch": 5.560538116591928, | |
| "grad_norm": 0.32281461358070374, | |
| "learning_rate": 0.0004338279883381924, | |
| "loss": 3.5143, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 5.575097548191718, | |
| "grad_norm": 0.3272330164909363, | |
| "learning_rate": 0.00043339067055393586, | |
| "loss": 3.5108, | |
| "step": 19150 | |
| }, | |
| { | |
| "epoch": 5.589656979791509, | |
| "grad_norm": 0.31538012623786926, | |
| "learning_rate": 0.00043295335276967927, | |
| "loss": 3.504, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 5.604216411391299, | |
| "grad_norm": 0.34619444608688354, | |
| "learning_rate": 0.0004325160349854227, | |
| "loss": 3.4962, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 5.61877584299109, | |
| "grad_norm": 0.33601802587509155, | |
| "learning_rate": 0.0004320787172011661, | |
| "loss": 3.5107, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 5.6333352745908805, | |
| "grad_norm": 0.32709893584251404, | |
| "learning_rate": 0.0004316413994169096, | |
| "loss": 3.5085, | |
| "step": 19350 | |
| }, | |
| { | |
| "epoch": 5.64789470619067, | |
| "grad_norm": 0.332736611366272, | |
| "learning_rate": 0.00043120408163265303, | |
| "loss": 3.5027, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 5.66245413779046, | |
| "grad_norm": 0.32013025879859924, | |
| "learning_rate": 0.00043076676384839644, | |
| "loss": 3.507, | |
| "step": 19450 | |
| }, | |
| { | |
| "epoch": 5.677013569390251, | |
| "grad_norm": 0.34380871057510376, | |
| "learning_rate": 0.0004303294460641399, | |
| "loss": 3.5091, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 5.691573000990042, | |
| "grad_norm": 0.3146701455116272, | |
| "learning_rate": 0.0004298921282798834, | |
| "loss": 3.5085, | |
| "step": 19550 | |
| }, | |
| { | |
| "epoch": 5.706132432589832, | |
| "grad_norm": 0.3258221447467804, | |
| "learning_rate": 0.0004294548104956268, | |
| "loss": 3.513, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 5.720691864189622, | |
| "grad_norm": 0.3335384726524353, | |
| "learning_rate": 0.0004290174927113702, | |
| "loss": 3.5065, | |
| "step": 19650 | |
| }, | |
| { | |
| "epoch": 5.735251295789412, | |
| "grad_norm": 0.333322674036026, | |
| "learning_rate": 0.00042858017492711367, | |
| "loss": 3.5061, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 5.749810727389203, | |
| "grad_norm": 0.3227587342262268, | |
| "learning_rate": 0.0004281428571428571, | |
| "loss": 3.5212, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 5.764370158988993, | |
| "grad_norm": 0.3334672152996063, | |
| "learning_rate": 0.00042770553935860055, | |
| "loss": 3.4976, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 5.778929590588783, | |
| "grad_norm": 0.3109551966190338, | |
| "learning_rate": 0.000427268221574344, | |
| "loss": 3.5159, | |
| "step": 19850 | |
| }, | |
| { | |
| "epoch": 5.793489022188574, | |
| "grad_norm": 0.3229271471500397, | |
| "learning_rate": 0.0004268309037900874, | |
| "loss": 3.511, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 5.8080484537883645, | |
| "grad_norm": 0.31595003604888916, | |
| "learning_rate": 0.00042639358600583084, | |
| "loss": 3.5083, | |
| "step": 19950 | |
| }, | |
| { | |
| "epoch": 5.822607885388154, | |
| "grad_norm": 0.3313562273979187, | |
| "learning_rate": 0.00042595626822157436, | |
| "loss": 3.5032, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 5.822607885388154, | |
| "eval_accuracy": 0.3630212827851557, | |
| "eval_loss": 3.6066231727600098, | |
| "eval_runtime": 180.0689, | |
| "eval_samples_per_second": 92.426, | |
| "eval_steps_per_second": 5.781, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 5.837167316987944, | |
| "grad_norm": 0.32517609000205994, | |
| "learning_rate": 0.00042551895043731777, | |
| "loss": 3.5075, | |
| "step": 20050 | |
| }, | |
| { | |
| "epoch": 5.851726748587735, | |
| "grad_norm": 0.3312103748321533, | |
| "learning_rate": 0.0004250816326530612, | |
| "loss": 3.5038, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 5.866286180187526, | |
| "grad_norm": 0.3302570879459381, | |
| "learning_rate": 0.0004246443148688046, | |
| "loss": 3.5185, | |
| "step": 20150 | |
| }, | |
| { | |
| "epoch": 5.880845611787316, | |
| "grad_norm": 0.3184104561805725, | |
| "learning_rate": 0.00042420699708454806, | |
| "loss": 3.5069, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 5.895405043387106, | |
| "grad_norm": 0.31885817646980286, | |
| "learning_rate": 0.00042376967930029153, | |
| "loss": 3.5122, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 5.9099644749868965, | |
| "grad_norm": 0.3231607973575592, | |
| "learning_rate": 0.00042333236151603494, | |
| "loss": 3.522, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 5.924523906586687, | |
| "grad_norm": 0.3280011713504791, | |
| "learning_rate": 0.0004228950437317784, | |
| "loss": 3.5191, | |
| "step": 20350 | |
| }, | |
| { | |
| "epoch": 5.939083338186477, | |
| "grad_norm": 0.32695943117141724, | |
| "learning_rate": 0.0004224577259475218, | |
| "loss": 3.5189, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 5.953642769786267, | |
| "grad_norm": 0.31571418046951294, | |
| "learning_rate": 0.0004220204081632653, | |
| "loss": 3.506, | |
| "step": 20450 | |
| }, | |
| { | |
| "epoch": 5.968202201386058, | |
| "grad_norm": 0.3223441243171692, | |
| "learning_rate": 0.0004215830903790087, | |
| "loss": 3.5298, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 5.982761632985849, | |
| "grad_norm": 0.3090570569038391, | |
| "learning_rate": 0.00042114577259475217, | |
| "loss": 3.5086, | |
| "step": 20550 | |
| }, | |
| { | |
| "epoch": 5.9973210645856385, | |
| "grad_norm": 0.32136136293411255, | |
| "learning_rate": 0.0004207084548104956, | |
| "loss": 3.5194, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 6.011647545279832, | |
| "grad_norm": 0.33823925256729126, | |
| "learning_rate": 0.000420271137026239, | |
| "loss": 3.4225, | |
| "step": 20650 | |
| }, | |
| { | |
| "epoch": 6.026206976879623, | |
| "grad_norm": 0.31168079376220703, | |
| "learning_rate": 0.0004198338192419825, | |
| "loss": 3.4097, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 6.040766408479413, | |
| "grad_norm": 0.33379727602005005, | |
| "learning_rate": 0.0004193965014577259, | |
| "loss": 3.3974, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 6.055325840079203, | |
| "grad_norm": 0.3196876645088196, | |
| "learning_rate": 0.00041895918367346934, | |
| "loss": 3.4086, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 6.069885271678993, | |
| "grad_norm": 0.3263348937034607, | |
| "learning_rate": 0.00041852186588921275, | |
| "loss": 3.4206, | |
| "step": 20850 | |
| }, | |
| { | |
| "epoch": 6.084444703278784, | |
| "grad_norm": 0.3413217067718506, | |
| "learning_rate": 0.00041808454810495627, | |
| "loss": 3.4143, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 6.099004134878574, | |
| "grad_norm": 0.3205811381340027, | |
| "learning_rate": 0.0004176472303206997, | |
| "loss": 3.4236, | |
| "step": 20950 | |
| }, | |
| { | |
| "epoch": 6.113563566478365, | |
| "grad_norm": 0.3402191996574402, | |
| "learning_rate": 0.0004172099125364431, | |
| "loss": 3.4196, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 6.113563566478365, | |
| "eval_accuracy": 0.36323646113684954, | |
| "eval_loss": 3.610100030899048, | |
| "eval_runtime": 179.9878, | |
| "eval_samples_per_second": 92.467, | |
| "eval_steps_per_second": 5.784, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 6.128122998078155, | |
| "grad_norm": 0.35015061497688293, | |
| "learning_rate": 0.00041677259475218656, | |
| "loss": 3.4323, | |
| "step": 21050 | |
| }, | |
| { | |
| "epoch": 6.142682429677945, | |
| "grad_norm": 0.3365619480609894, | |
| "learning_rate": 0.00041633527696792997, | |
| "loss": 3.4316, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 6.157241861277735, | |
| "grad_norm": 0.32558462023735046, | |
| "learning_rate": 0.00041589795918367344, | |
| "loss": 3.4261, | |
| "step": 21150 | |
| }, | |
| { | |
| "epoch": 6.171801292877526, | |
| "grad_norm": 0.3229493498802185, | |
| "learning_rate": 0.0004154606413994169, | |
| "loss": 3.4273, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 6.186360724477316, | |
| "grad_norm": 0.3373366594314575, | |
| "learning_rate": 0.0004150233236151603, | |
| "loss": 3.4241, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 6.200920156077107, | |
| "grad_norm": 0.33470067381858826, | |
| "learning_rate": 0.00041458600583090373, | |
| "loss": 3.436, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 6.215479587676897, | |
| "grad_norm": 0.33129194378852844, | |
| "learning_rate": 0.00041414868804664725, | |
| "loss": 3.4464, | |
| "step": 21350 | |
| }, | |
| { | |
| "epoch": 6.2300390192766875, | |
| "grad_norm": 0.3305993676185608, | |
| "learning_rate": 0.00041371137026239066, | |
| "loss": 3.44, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 6.244598450876477, | |
| "grad_norm": 0.3288079500198364, | |
| "learning_rate": 0.0004132740524781341, | |
| "loss": 3.4417, | |
| "step": 21450 | |
| }, | |
| { | |
| "epoch": 6.259157882476268, | |
| "grad_norm": 0.33732712268829346, | |
| "learning_rate": 0.0004128367346938775, | |
| "loss": 3.445, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 6.273717314076059, | |
| "grad_norm": 0.3398957848548889, | |
| "learning_rate": 0.0004123994169096209, | |
| "loss": 3.4473, | |
| "step": 21550 | |
| }, | |
| { | |
| "epoch": 6.288276745675849, | |
| "grad_norm": 0.3353675305843353, | |
| "learning_rate": 0.0004119620991253644, | |
| "loss": 3.4428, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 6.302836177275639, | |
| "grad_norm": 0.3312719464302063, | |
| "learning_rate": 0.00041152478134110783, | |
| "loss": 3.4346, | |
| "step": 21650 | |
| }, | |
| { | |
| "epoch": 6.3173956088754295, | |
| "grad_norm": 0.32870662212371826, | |
| "learning_rate": 0.00041108746355685125, | |
| "loss": 3.4397, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 6.33195504047522, | |
| "grad_norm": 0.3326077461242676, | |
| "learning_rate": 0.0004106501457725947, | |
| "loss": 3.4494, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 6.34651447207501, | |
| "grad_norm": 0.32431626319885254, | |
| "learning_rate": 0.0004102128279883382, | |
| "loss": 3.435, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 6.3610739036748, | |
| "grad_norm": 0.32606053352355957, | |
| "learning_rate": 0.0004097755102040816, | |
| "loss": 3.4515, | |
| "step": 21850 | |
| }, | |
| { | |
| "epoch": 6.375633335274591, | |
| "grad_norm": 0.33837705850601196, | |
| "learning_rate": 0.00040933819241982506, | |
| "loss": 3.4578, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 6.390192766874382, | |
| "grad_norm": 0.35296231508255005, | |
| "learning_rate": 0.00040890087463556847, | |
| "loss": 3.4563, | |
| "step": 21950 | |
| }, | |
| { | |
| "epoch": 6.4047521984741715, | |
| "grad_norm": 0.3277094066143036, | |
| "learning_rate": 0.0004084635568513119, | |
| "loss": 3.4499, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 6.4047521984741715, | |
| "eval_accuracy": 0.3639197405913266, | |
| "eval_loss": 3.6030333042144775, | |
| "eval_runtime": 180.1326, | |
| "eval_samples_per_second": 92.393, | |
| "eval_steps_per_second": 5.779, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 6.419311630073962, | |
| "grad_norm": 0.3193458020687103, | |
| "learning_rate": 0.0004080262390670554, | |
| "loss": 3.4559, | |
| "step": 22050 | |
| }, | |
| { | |
| "epoch": 6.433871061673752, | |
| "grad_norm": 0.3288237452507019, | |
| "learning_rate": 0.0004075889212827988, | |
| "loss": 3.4597, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 6.448430493273543, | |
| "grad_norm": 0.3396027088165283, | |
| "learning_rate": 0.00040715160349854223, | |
| "loss": 3.4572, | |
| "step": 22150 | |
| }, | |
| { | |
| "epoch": 6.462989924873333, | |
| "grad_norm": 0.3147648572921753, | |
| "learning_rate": 0.00040671428571428564, | |
| "loss": 3.4541, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 6.477549356473124, | |
| "grad_norm": 0.31667134165763855, | |
| "learning_rate": 0.00040627696793002916, | |
| "loss": 3.4621, | |
| "step": 22250 | |
| }, | |
| { | |
| "epoch": 6.492108788072914, | |
| "grad_norm": 0.34657663106918335, | |
| "learning_rate": 0.0004058396501457726, | |
| "loss": 3.4604, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 6.506668219672704, | |
| "grad_norm": 0.32261285185813904, | |
| "learning_rate": 0.000405402332361516, | |
| "loss": 3.4612, | |
| "step": 22350 | |
| }, | |
| { | |
| "epoch": 6.521227651272494, | |
| "grad_norm": 0.3422505855560303, | |
| "learning_rate": 0.00040496501457725945, | |
| "loss": 3.4559, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 6.535787082872285, | |
| "grad_norm": 0.3316167891025543, | |
| "learning_rate": 0.00040452769679300287, | |
| "loss": 3.4496, | |
| "step": 22450 | |
| }, | |
| { | |
| "epoch": 6.550346514472075, | |
| "grad_norm": 0.3352113962173462, | |
| "learning_rate": 0.00040409037900874633, | |
| "loss": 3.4656, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 6.564905946071866, | |
| "grad_norm": 0.3302208185195923, | |
| "learning_rate": 0.00040365306122448974, | |
| "loss": 3.444, | |
| "step": 22550 | |
| }, | |
| { | |
| "epoch": 6.579465377671656, | |
| "grad_norm": 0.3382601737976074, | |
| "learning_rate": 0.0004032157434402332, | |
| "loss": 3.4749, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 6.594024809271446, | |
| "grad_norm": 0.32733502984046936, | |
| "learning_rate": 0.0004027784256559766, | |
| "loss": 3.4676, | |
| "step": 22650 | |
| }, | |
| { | |
| "epoch": 6.608584240871236, | |
| "grad_norm": 0.3271522521972656, | |
| "learning_rate": 0.0004023411078717201, | |
| "loss": 3.4625, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 6.623143672471027, | |
| "grad_norm": 0.35525378584861755, | |
| "learning_rate": 0.00040190379008746356, | |
| "loss": 3.47, | |
| "step": 22750 | |
| }, | |
| { | |
| "epoch": 6.637703104070817, | |
| "grad_norm": 0.34130969643592834, | |
| "learning_rate": 0.00040146647230320697, | |
| "loss": 3.4531, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 6.652262535670608, | |
| "grad_norm": 0.3281981647014618, | |
| "learning_rate": 0.0004010291545189504, | |
| "loss": 3.4675, | |
| "step": 22850 | |
| }, | |
| { | |
| "epoch": 6.666821967270398, | |
| "grad_norm": 0.3403642475605011, | |
| "learning_rate": 0.0004005918367346938, | |
| "loss": 3.4779, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 6.6813813988701884, | |
| "grad_norm": 0.35402730107307434, | |
| "learning_rate": 0.0004001545189504373, | |
| "loss": 3.4748, | |
| "step": 22950 | |
| }, | |
| { | |
| "epoch": 6.695940830469978, | |
| "grad_norm": 0.3444899618625641, | |
| "learning_rate": 0.0003997172011661807, | |
| "loss": 3.4726, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 6.695940830469978, | |
| "eval_accuracy": 0.36462571371896035, | |
| "eval_loss": 3.593517541885376, | |
| "eval_runtime": 180.0181, | |
| "eval_samples_per_second": 92.452, | |
| "eval_steps_per_second": 5.783, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 6.710500262069769, | |
| "grad_norm": 0.3251613676548004, | |
| "learning_rate": 0.00039927988338192414, | |
| "loss": 3.4531, | |
| "step": 23050 | |
| }, | |
| { | |
| "epoch": 6.725059693669559, | |
| "grad_norm": 0.3224335312843323, | |
| "learning_rate": 0.0003988425655976676, | |
| "loss": 3.4575, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 6.73961912526935, | |
| "grad_norm": 0.3301301598548889, | |
| "learning_rate": 0.00039840524781341107, | |
| "loss": 3.4608, | |
| "step": 23150 | |
| }, | |
| { | |
| "epoch": 6.75417855686914, | |
| "grad_norm": 0.3267367482185364, | |
| "learning_rate": 0.0003979679300291545, | |
| "loss": 3.4722, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 6.7687379884689305, | |
| "grad_norm": 0.34791019558906555, | |
| "learning_rate": 0.00039753061224489795, | |
| "loss": 3.4737, | |
| "step": 23250 | |
| }, | |
| { | |
| "epoch": 6.78329742006872, | |
| "grad_norm": 0.3289180099964142, | |
| "learning_rate": 0.00039709329446064136, | |
| "loss": 3.4727, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 6.797856851668511, | |
| "grad_norm": 0.326933890581131, | |
| "learning_rate": 0.0003966559766763848, | |
| "loss": 3.4499, | |
| "step": 23350 | |
| }, | |
| { | |
| "epoch": 6.812416283268301, | |
| "grad_norm": 0.3207686245441437, | |
| "learning_rate": 0.0003962186588921283, | |
| "loss": 3.4833, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 6.826975714868092, | |
| "grad_norm": 0.34312567114830017, | |
| "learning_rate": 0.0003957813411078717, | |
| "loss": 3.4743, | |
| "step": 23450 | |
| }, | |
| { | |
| "epoch": 6.841535146467882, | |
| "grad_norm": 0.32261767983436584, | |
| "learning_rate": 0.0003953440233236151, | |
| "loss": 3.4717, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 6.8560945780676725, | |
| "grad_norm": 0.3239823579788208, | |
| "learning_rate": 0.00039490670553935853, | |
| "loss": 3.4745, | |
| "step": 23550 | |
| }, | |
| { | |
| "epoch": 6.870654009667462, | |
| "grad_norm": 0.3287251889705658, | |
| "learning_rate": 0.00039446938775510195, | |
| "loss": 3.4706, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 6.885213441267253, | |
| "grad_norm": 0.33744481205940247, | |
| "learning_rate": 0.00039403206997084547, | |
| "loss": 3.4712, | |
| "step": 23650 | |
| }, | |
| { | |
| "epoch": 6.899772872867043, | |
| "grad_norm": 0.33784157037734985, | |
| "learning_rate": 0.0003935947521865889, | |
| "loss": 3.4744, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 6.914332304466834, | |
| "grad_norm": 0.33039554953575134, | |
| "learning_rate": 0.0003931574344023323, | |
| "loss": 3.4635, | |
| "step": 23750 | |
| }, | |
| { | |
| "epoch": 6.928891736066624, | |
| "grad_norm": 0.3252994418144226, | |
| "learning_rate": 0.00039272011661807576, | |
| "loss": 3.4699, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 6.943451167666415, | |
| "grad_norm": 0.33139488101005554, | |
| "learning_rate": 0.0003922827988338192, | |
| "loss": 3.4763, | |
| "step": 23850 | |
| }, | |
| { | |
| "epoch": 6.9580105992662045, | |
| "grad_norm": 0.3324434757232666, | |
| "learning_rate": 0.00039184548104956264, | |
| "loss": 3.467, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 6.972570030865995, | |
| "grad_norm": 0.32796579599380493, | |
| "learning_rate": 0.0003914081632653061, | |
| "loss": 3.4705, | |
| "step": 23950 | |
| }, | |
| { | |
| "epoch": 6.987129462465785, | |
| "grad_norm": 0.32173773646354675, | |
| "learning_rate": 0.0003909708454810495, | |
| "loss": 3.4734, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 6.987129462465785, | |
| "eval_accuracy": 0.36566327315904046, | |
| "eval_loss": 3.5826520919799805, | |
| "eval_runtime": 179.8608, | |
| "eval_samples_per_second": 92.533, | |
| "eval_steps_per_second": 5.788, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 7.001455943159979, | |
| "grad_norm": 0.3372614085674286, | |
| "learning_rate": 0.00039053352769679293, | |
| "loss": 3.4591, | |
| "step": 24050 | |
| }, | |
| { | |
| "epoch": 7.016015374759769, | |
| "grad_norm": 0.3403565585613251, | |
| "learning_rate": 0.00039009620991253645, | |
| "loss": 3.3491, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 7.03057480635956, | |
| "grad_norm": 0.34883126616477966, | |
| "learning_rate": 0.00038965889212827986, | |
| "loss": 3.3559, | |
| "step": 24150 | |
| }, | |
| { | |
| "epoch": 7.04513423795935, | |
| "grad_norm": 0.3384236693382263, | |
| "learning_rate": 0.0003892215743440233, | |
| "loss": 3.3671, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 7.059693669559141, | |
| "grad_norm": 0.33237436413764954, | |
| "learning_rate": 0.0003887842565597667, | |
| "loss": 3.3692, | |
| "step": 24250 | |
| }, | |
| { | |
| "epoch": 7.074253101158931, | |
| "grad_norm": 0.35221633315086365, | |
| "learning_rate": 0.0003883469387755102, | |
| "loss": 3.3795, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 7.0888125327587215, | |
| "grad_norm": 0.33727243542671204, | |
| "learning_rate": 0.0003879096209912536, | |
| "loss": 3.3848, | |
| "step": 24350 | |
| }, | |
| { | |
| "epoch": 7.103371964358511, | |
| "grad_norm": 0.34708696603775024, | |
| "learning_rate": 0.00038747230320699703, | |
| "loss": 3.3819, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 7.117931395958302, | |
| "grad_norm": 0.3307049572467804, | |
| "learning_rate": 0.0003870349854227405, | |
| "loss": 3.3914, | |
| "step": 24450 | |
| }, | |
| { | |
| "epoch": 7.132490827558092, | |
| "grad_norm": 0.3246367871761322, | |
| "learning_rate": 0.0003865976676384839, | |
| "loss": 3.3867, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 7.147050259157883, | |
| "grad_norm": 0.33385294675827026, | |
| "learning_rate": 0.0003861603498542274, | |
| "loss": 3.3878, | |
| "step": 24550 | |
| }, | |
| { | |
| "epoch": 7.161609690757673, | |
| "grad_norm": 0.35358157753944397, | |
| "learning_rate": 0.0003857230320699708, | |
| "loss": 3.3848, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 7.1761691223574635, | |
| "grad_norm": 0.3381134271621704, | |
| "learning_rate": 0.00038528571428571426, | |
| "loss": 3.381, | |
| "step": 24650 | |
| }, | |
| { | |
| "epoch": 7.190728553957253, | |
| "grad_norm": 0.33539456129074097, | |
| "learning_rate": 0.00038484839650145767, | |
| "loss": 3.391, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 7.205287985557044, | |
| "grad_norm": 0.3288535475730896, | |
| "learning_rate": 0.00038441107871720114, | |
| "loss": 3.3846, | |
| "step": 24750 | |
| }, | |
| { | |
| "epoch": 7.219847417156834, | |
| "grad_norm": 0.3503969609737396, | |
| "learning_rate": 0.0003839737609329446, | |
| "loss": 3.3942, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 7.234406848756625, | |
| "grad_norm": 0.34089216589927673, | |
| "learning_rate": 0.000383536443148688, | |
| "loss": 3.4039, | |
| "step": 24850 | |
| }, | |
| { | |
| "epoch": 7.248966280356415, | |
| "grad_norm": 0.33822911977767944, | |
| "learning_rate": 0.00038309912536443143, | |
| "loss": 3.3938, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 7.2635257119562056, | |
| "grad_norm": 0.34553372859954834, | |
| "learning_rate": 0.00038266180758017484, | |
| "loss": 3.4007, | |
| "step": 24950 | |
| }, | |
| { | |
| "epoch": 7.2780851435559955, | |
| "grad_norm": 0.34171566367149353, | |
| "learning_rate": 0.00038222448979591836, | |
| "loss": 3.4074, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 7.2780851435559955, | |
| "eval_accuracy": 0.3654761973352454, | |
| "eval_loss": 3.590606212615967, | |
| "eval_runtime": 179.9086, | |
| "eval_samples_per_second": 92.508, | |
| "eval_steps_per_second": 5.786, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 7.292644575155786, | |
| "grad_norm": 0.33524560928344727, | |
| "learning_rate": 0.00038178717201166177, | |
| "loss": 3.4011, | |
| "step": 25050 | |
| }, | |
| { | |
| "epoch": 7.307204006755576, | |
| "grad_norm": 0.3223832845687866, | |
| "learning_rate": 0.0003813498542274052, | |
| "loss": 3.4039, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 7.321763438355367, | |
| "grad_norm": 0.33097726106643677, | |
| "learning_rate": 0.00038091253644314865, | |
| "loss": 3.4045, | |
| "step": 25150 | |
| }, | |
| { | |
| "epoch": 7.336322869955157, | |
| "grad_norm": 0.3478914201259613, | |
| "learning_rate": 0.0003804752186588921, | |
| "loss": 3.4058, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 7.350882301554948, | |
| "grad_norm": 0.36241742968559265, | |
| "learning_rate": 0.00038003790087463553, | |
| "loss": 3.4074, | |
| "step": 25250 | |
| }, | |
| { | |
| "epoch": 7.3654417331547375, | |
| "grad_norm": 0.3258671164512634, | |
| "learning_rate": 0.000379600583090379, | |
| "loss": 3.4111, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 7.380001164754528, | |
| "grad_norm": 0.34953588247299194, | |
| "learning_rate": 0.0003791632653061224, | |
| "loss": 3.4106, | |
| "step": 25350 | |
| }, | |
| { | |
| "epoch": 7.394560596354318, | |
| "grad_norm": 0.33248066902160645, | |
| "learning_rate": 0.0003787259475218658, | |
| "loss": 3.4044, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 7.409120027954109, | |
| "grad_norm": 0.3584959805011749, | |
| "learning_rate": 0.00037828862973760934, | |
| "loss": 3.4181, | |
| "step": 25450 | |
| }, | |
| { | |
| "epoch": 7.423679459553899, | |
| "grad_norm": 0.3388174772262573, | |
| "learning_rate": 0.00037785131195335276, | |
| "loss": 3.4183, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 7.43823889115369, | |
| "grad_norm": 0.3261428773403168, | |
| "learning_rate": 0.00037741399416909617, | |
| "loss": 3.421, | |
| "step": 25550 | |
| }, | |
| { | |
| "epoch": 7.4527983227534795, | |
| "grad_norm": 0.33287757635116577, | |
| "learning_rate": 0.0003769766763848396, | |
| "loss": 3.4161, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 7.46735775435327, | |
| "grad_norm": 0.3487517237663269, | |
| "learning_rate": 0.0003765393586005831, | |
| "loss": 3.4124, | |
| "step": 25650 | |
| }, | |
| { | |
| "epoch": 7.48191718595306, | |
| "grad_norm": 0.3360964357852936, | |
| "learning_rate": 0.0003761020408163265, | |
| "loss": 3.4114, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 7.496476617552851, | |
| "grad_norm": 0.339276522397995, | |
| "learning_rate": 0.0003756647230320699, | |
| "loss": 3.4243, | |
| "step": 25750 | |
| }, | |
| { | |
| "epoch": 7.511036049152641, | |
| "grad_norm": 0.34367257356643677, | |
| "learning_rate": 0.00037522740524781334, | |
| "loss": 3.422, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 7.525595480752432, | |
| "grad_norm": 0.33407968282699585, | |
| "learning_rate": 0.0003747900874635568, | |
| "loss": 3.4223, | |
| "step": 25850 | |
| }, | |
| { | |
| "epoch": 7.540154912352222, | |
| "grad_norm": 0.3418211042881012, | |
| "learning_rate": 0.00037435276967930027, | |
| "loss": 3.4312, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 7.554714343952012, | |
| "grad_norm": 0.3525235950946808, | |
| "learning_rate": 0.0003739154518950437, | |
| "loss": 3.436, | |
| "step": 25950 | |
| }, | |
| { | |
| "epoch": 7.569273775551802, | |
| "grad_norm": 0.35437750816345215, | |
| "learning_rate": 0.00037347813411078715, | |
| "loss": 3.4173, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 7.569273775551802, | |
| "eval_accuracy": 0.36623061498796, | |
| "eval_loss": 3.5847508907318115, | |
| "eval_runtime": 179.8238, | |
| "eval_samples_per_second": 92.552, | |
| "eval_steps_per_second": 5.789, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 7.583833207151593, | |
| "grad_norm": 0.36680832505226135, | |
| "learning_rate": 0.00037304081632653056, | |
| "loss": 3.4282, | |
| "step": 26050 | |
| }, | |
| { | |
| "epoch": 7.598392638751383, | |
| "grad_norm": 0.32975292205810547, | |
| "learning_rate": 0.00037260349854227403, | |
| "loss": 3.4187, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 7.612952070351174, | |
| "grad_norm": 0.33258336782455444, | |
| "learning_rate": 0.0003721661807580175, | |
| "loss": 3.419, | |
| "step": 26150 | |
| }, | |
| { | |
| "epoch": 7.627511501950964, | |
| "grad_norm": 0.3520626723766327, | |
| "learning_rate": 0.0003717288629737609, | |
| "loss": 3.4317, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 7.642070933550754, | |
| "grad_norm": 0.34615185856819153, | |
| "learning_rate": 0.0003712915451895043, | |
| "loss": 3.4397, | |
| "step": 26250 | |
| }, | |
| { | |
| "epoch": 7.656630365150544, | |
| "grad_norm": 0.3472108542919159, | |
| "learning_rate": 0.00037085422740524773, | |
| "loss": 3.4183, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 7.671189796750335, | |
| "grad_norm": 0.3401790261268616, | |
| "learning_rate": 0.00037041690962099125, | |
| "loss": 3.4274, | |
| "step": 26350 | |
| }, | |
| { | |
| "epoch": 7.685749228350125, | |
| "grad_norm": 0.34616005420684814, | |
| "learning_rate": 0.00036997959183673467, | |
| "loss": 3.432, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 7.700308659949916, | |
| "grad_norm": 0.35238298773765564, | |
| "learning_rate": 0.0003695422740524781, | |
| "loss": 3.4308, | |
| "step": 26450 | |
| }, | |
| { | |
| "epoch": 7.714868091549706, | |
| "grad_norm": 0.3595859408378601, | |
| "learning_rate": 0.00036910495626822154, | |
| "loss": 3.4368, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 7.729427523149496, | |
| "grad_norm": 0.3455177843570709, | |
| "learning_rate": 0.000368667638483965, | |
| "loss": 3.4389, | |
| "step": 26550 | |
| }, | |
| { | |
| "epoch": 7.743986954749286, | |
| "grad_norm": 0.3548458516597748, | |
| "learning_rate": 0.0003682303206997084, | |
| "loss": 3.427, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 7.758546386349077, | |
| "grad_norm": 0.34287944436073303, | |
| "learning_rate": 0.00036779300291545184, | |
| "loss": 3.4268, | |
| "step": 26650 | |
| }, | |
| { | |
| "epoch": 7.773105817948867, | |
| "grad_norm": 0.3392084836959839, | |
| "learning_rate": 0.0003673556851311953, | |
| "loss": 3.4348, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 7.787665249548658, | |
| "grad_norm": 0.3404330015182495, | |
| "learning_rate": 0.0003669183673469387, | |
| "loss": 3.4162, | |
| "step": 26750 | |
| }, | |
| { | |
| "epoch": 7.802224681148448, | |
| "grad_norm": 0.34505772590637207, | |
| "learning_rate": 0.0003664810495626822, | |
| "loss": 3.4267, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 7.8167841127482385, | |
| "grad_norm": 0.34470146894454956, | |
| "learning_rate": 0.00036604373177842565, | |
| "loss": 3.444, | |
| "step": 26850 | |
| }, | |
| { | |
| "epoch": 7.831343544348028, | |
| "grad_norm": 0.33952096104621887, | |
| "learning_rate": 0.00036560641399416906, | |
| "loss": 3.4317, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 7.845902975947819, | |
| "grad_norm": 0.35519036650657654, | |
| "learning_rate": 0.0003651690962099125, | |
| "loss": 3.4335, | |
| "step": 26950 | |
| }, | |
| { | |
| "epoch": 7.860462407547609, | |
| "grad_norm": 0.3539314568042755, | |
| "learning_rate": 0.000364731778425656, | |
| "loss": 3.4318, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 7.860462407547609, | |
| "eval_accuracy": 0.36703947393949116, | |
| "eval_loss": 3.572652578353882, | |
| "eval_runtime": 179.9477, | |
| "eval_samples_per_second": 92.488, | |
| "eval_steps_per_second": 5.785, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 7.8750218391474, | |
| "grad_norm": 0.3434327244758606, | |
| "learning_rate": 0.0003642944606413994, | |
| "loss": 3.4372, | |
| "step": 27050 | |
| }, | |
| { | |
| "epoch": 7.88958127074719, | |
| "grad_norm": 0.3285406231880188, | |
| "learning_rate": 0.0003638571428571428, | |
| "loss": 3.4238, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 7.9041407023469805, | |
| "grad_norm": 0.3453764021396637, | |
| "learning_rate": 0.00036341982507288623, | |
| "loss": 3.4369, | |
| "step": 27150 | |
| }, | |
| { | |
| "epoch": 7.91870013394677, | |
| "grad_norm": 0.32807591557502747, | |
| "learning_rate": 0.0003629825072886297, | |
| "loss": 3.4333, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 7.933259565546561, | |
| "grad_norm": 0.33627721667289734, | |
| "learning_rate": 0.00036254518950437316, | |
| "loss": 3.4378, | |
| "step": 27250 | |
| }, | |
| { | |
| "epoch": 7.947818997146351, | |
| "grad_norm": 0.3372686207294464, | |
| "learning_rate": 0.0003621078717201166, | |
| "loss": 3.4311, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 7.962378428746142, | |
| "grad_norm": 0.3440007269382477, | |
| "learning_rate": 0.00036167055393586004, | |
| "loss": 3.4387, | |
| "step": 27350 | |
| }, | |
| { | |
| "epoch": 7.976937860345932, | |
| "grad_norm": 0.3503531813621521, | |
| "learning_rate": 0.00036123323615160346, | |
| "loss": 3.435, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 7.991497291945723, | |
| "grad_norm": 0.32801827788352966, | |
| "learning_rate": 0.0003607959183673469, | |
| "loss": 3.4472, | |
| "step": 27450 | |
| }, | |
| { | |
| "epoch": 8.005823772639916, | |
| "grad_norm": 0.35048505663871765, | |
| "learning_rate": 0.0003603586005830904, | |
| "loss": 3.3812, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 8.020383204239707, | |
| "grad_norm": 0.3340410888195038, | |
| "learning_rate": 0.0003599212827988338, | |
| "loss": 3.3375, | |
| "step": 27550 | |
| }, | |
| { | |
| "epoch": 8.034942635839498, | |
| "grad_norm": 0.34667739272117615, | |
| "learning_rate": 0.0003594839650145772, | |
| "loss": 3.3225, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 8.049502067439287, | |
| "grad_norm": 0.3594329059123993, | |
| "learning_rate": 0.0003590466472303206, | |
| "loss": 3.335, | |
| "step": 27650 | |
| }, | |
| { | |
| "epoch": 8.064061499039077, | |
| "grad_norm": 0.3574943244457245, | |
| "learning_rate": 0.00035860932944606415, | |
| "loss": 3.3357, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 8.078620930638868, | |
| "grad_norm": 0.3481893539428711, | |
| "learning_rate": 0.00035817201166180756, | |
| "loss": 3.3318, | |
| "step": 27750 | |
| }, | |
| { | |
| "epoch": 8.093180362238659, | |
| "grad_norm": 0.3395717144012451, | |
| "learning_rate": 0.00035773469387755097, | |
| "loss": 3.3532, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 8.107739793838448, | |
| "grad_norm": 0.33678963780403137, | |
| "learning_rate": 0.0003572973760932944, | |
| "loss": 3.3457, | |
| "step": 27850 | |
| }, | |
| { | |
| "epoch": 8.122299225438239, | |
| "grad_norm": 0.3517054617404938, | |
| "learning_rate": 0.0003568600583090379, | |
| "loss": 3.3469, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 8.13685865703803, | |
| "grad_norm": 0.3345564603805542, | |
| "learning_rate": 0.0003564227405247813, | |
| "loss": 3.3386, | |
| "step": 27950 | |
| }, | |
| { | |
| "epoch": 8.15141808863782, | |
| "grad_norm": 0.3494488596916199, | |
| "learning_rate": 0.00035598542274052473, | |
| "loss": 3.3639, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 8.15141808863782, | |
| "eval_accuracy": 0.3670094900708125, | |
| "eval_loss": 3.580695867538452, | |
| "eval_runtime": 179.627, | |
| "eval_samples_per_second": 92.653, | |
| "eval_steps_per_second": 5.795, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 8.16597752023761, | |
| "grad_norm": 0.33706334233283997, | |
| "learning_rate": 0.0003555481049562682, | |
| "loss": 3.369, | |
| "step": 28050 | |
| }, | |
| { | |
| "epoch": 8.1805369518374, | |
| "grad_norm": 0.34689971804618835, | |
| "learning_rate": 0.0003551107871720116, | |
| "loss": 3.3533, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 8.19509638343719, | |
| "grad_norm": 0.34161534905433655, | |
| "learning_rate": 0.0003546734693877551, | |
| "loss": 3.375, | |
| "step": 28150 | |
| }, | |
| { | |
| "epoch": 8.209655815036982, | |
| "grad_norm": 0.36119794845581055, | |
| "learning_rate": 0.00035423615160349854, | |
| "loss": 3.3611, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 8.22421524663677, | |
| "grad_norm": 0.3473355174064636, | |
| "learning_rate": 0.00035379883381924195, | |
| "loss": 3.3731, | |
| "step": 28250 | |
| }, | |
| { | |
| "epoch": 8.238774678236561, | |
| "grad_norm": 0.33798742294311523, | |
| "learning_rate": 0.00035336151603498537, | |
| "loss": 3.3692, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 8.253334109836352, | |
| "grad_norm": 0.3432019352912903, | |
| "learning_rate": 0.0003529241982507289, | |
| "loss": 3.3687, | |
| "step": 28350 | |
| }, | |
| { | |
| "epoch": 8.267893541436143, | |
| "grad_norm": 0.35700732469558716, | |
| "learning_rate": 0.0003524868804664723, | |
| "loss": 3.3723, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 8.282452973035932, | |
| "grad_norm": 0.348431795835495, | |
| "learning_rate": 0.0003520495626822157, | |
| "loss": 3.3593, | |
| "step": 28450 | |
| }, | |
| { | |
| "epoch": 8.297012404635723, | |
| "grad_norm": 0.34419500827789307, | |
| "learning_rate": 0.0003516122448979591, | |
| "loss": 3.3778, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 8.311571836235514, | |
| "grad_norm": 0.34864479303359985, | |
| "learning_rate": 0.0003511749271137026, | |
| "loss": 3.3809, | |
| "step": 28550 | |
| }, | |
| { | |
| "epoch": 8.326131267835304, | |
| "grad_norm": 0.35667717456817627, | |
| "learning_rate": 0.00035073760932944606, | |
| "loss": 3.363, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 8.340690699435093, | |
| "grad_norm": 0.3501654863357544, | |
| "learning_rate": 0.00035030029154518947, | |
| "loss": 3.378, | |
| "step": 28650 | |
| }, | |
| { | |
| "epoch": 8.355250131034884, | |
| "grad_norm": 0.3490404486656189, | |
| "learning_rate": 0.0003498629737609329, | |
| "loss": 3.3855, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 8.369809562634675, | |
| "grad_norm": 0.358019083738327, | |
| "learning_rate": 0.00034942565597667635, | |
| "loss": 3.3784, | |
| "step": 28750 | |
| }, | |
| { | |
| "epoch": 8.384368994234466, | |
| "grad_norm": 0.33226025104522705, | |
| "learning_rate": 0.0003489883381924198, | |
| "loss": 3.3714, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 8.398928425834255, | |
| "grad_norm": 0.3402322828769684, | |
| "learning_rate": 0.00034855102040816323, | |
| "loss": 3.3785, | |
| "step": 28850 | |
| }, | |
| { | |
| "epoch": 8.413487857434045, | |
| "grad_norm": 0.36141642928123474, | |
| "learning_rate": 0.0003481137026239067, | |
| "loss": 3.3745, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 8.428047289033836, | |
| "grad_norm": 0.36371850967407227, | |
| "learning_rate": 0.0003476763848396501, | |
| "loss": 3.3874, | |
| "step": 28950 | |
| }, | |
| { | |
| "epoch": 8.442606720633627, | |
| "grad_norm": 0.3497146666049957, | |
| "learning_rate": 0.0003472390670553935, | |
| "loss": 3.3846, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 8.442606720633627, | |
| "eval_accuracy": 0.3678150566759789, | |
| "eval_loss": 3.572382688522339, | |
| "eval_runtime": 179.6973, | |
| "eval_samples_per_second": 92.617, | |
| "eval_steps_per_second": 5.793, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 8.457166152233416, | |
| "grad_norm": 0.3523021936416626, | |
| "learning_rate": 0.00034680174927113704, | |
| "loss": 3.3833, | |
| "step": 29050 | |
| }, | |
| { | |
| "epoch": 8.471725583833207, | |
| "grad_norm": 0.3318672180175781, | |
| "learning_rate": 0.00034636443148688045, | |
| "loss": 3.3856, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 8.486285015432998, | |
| "grad_norm": 0.34436580538749695, | |
| "learning_rate": 0.00034592711370262386, | |
| "loss": 3.392, | |
| "step": 29150 | |
| }, | |
| { | |
| "epoch": 8.500844447032788, | |
| "grad_norm": 0.3374488651752472, | |
| "learning_rate": 0.0003454897959183673, | |
| "loss": 3.3784, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 8.515403878632577, | |
| "grad_norm": 0.3651833236217499, | |
| "learning_rate": 0.0003450524781341108, | |
| "loss": 3.3695, | |
| "step": 29250 | |
| }, | |
| { | |
| "epoch": 8.529963310232368, | |
| "grad_norm": 0.33650752902030945, | |
| "learning_rate": 0.0003446151603498542, | |
| "loss": 3.3818, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 8.544522741832159, | |
| "grad_norm": 0.3391404449939728, | |
| "learning_rate": 0.0003441778425655976, | |
| "loss": 3.3997, | |
| "step": 29350 | |
| }, | |
| { | |
| "epoch": 8.55908217343195, | |
| "grad_norm": 0.3535376787185669, | |
| "learning_rate": 0.0003437405247813411, | |
| "loss": 3.3867, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 8.573641605031739, | |
| "grad_norm": 0.3420208692550659, | |
| "learning_rate": 0.0003433032069970845, | |
| "loss": 3.3834, | |
| "step": 29450 | |
| }, | |
| { | |
| "epoch": 8.58820103663153, | |
| "grad_norm": 0.3331069052219391, | |
| "learning_rate": 0.00034286588921282797, | |
| "loss": 3.3903, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 8.60276046823132, | |
| "grad_norm": 0.3587231934070587, | |
| "learning_rate": 0.00034242857142857143, | |
| "loss": 3.3888, | |
| "step": 29550 | |
| }, | |
| { | |
| "epoch": 8.617319899831111, | |
| "grad_norm": 0.35839417576789856, | |
| "learning_rate": 0.00034199125364431485, | |
| "loss": 3.404, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 8.6318793314309, | |
| "grad_norm": 0.3896600902080536, | |
| "learning_rate": 0.00034155393586005826, | |
| "loss": 3.3906, | |
| "step": 29650 | |
| }, | |
| { | |
| "epoch": 8.646438763030691, | |
| "grad_norm": 0.35471850633621216, | |
| "learning_rate": 0.0003411166180758017, | |
| "loss": 3.3922, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 8.660998194630482, | |
| "grad_norm": 0.3513423800468445, | |
| "learning_rate": 0.0003406793002915452, | |
| "loss": 3.3873, | |
| "step": 29750 | |
| }, | |
| { | |
| "epoch": 8.675557626230272, | |
| "grad_norm": 0.34752732515335083, | |
| "learning_rate": 0.0003402419825072886, | |
| "loss": 3.3855, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 8.690117057830061, | |
| "grad_norm": 0.32745492458343506, | |
| "learning_rate": 0.000339804664723032, | |
| "loss": 3.3934, | |
| "step": 29850 | |
| }, | |
| { | |
| "epoch": 8.704676489429852, | |
| "grad_norm": 0.3485073745250702, | |
| "learning_rate": 0.00033936734693877543, | |
| "loss": 3.3878, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 8.719235921029643, | |
| "grad_norm": 0.3374342620372772, | |
| "learning_rate": 0.00033893002915451895, | |
| "loss": 3.388, | |
| "step": 29950 | |
| }, | |
| { | |
| "epoch": 8.733795352629434, | |
| "grad_norm": 0.3508179187774658, | |
| "learning_rate": 0.00033849271137026236, | |
| "loss": 3.3893, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 8.733795352629434, | |
| "eval_accuracy": 0.3683345418988114, | |
| "eval_loss": 3.5642998218536377, | |
| "eval_runtime": 179.3326, | |
| "eval_samples_per_second": 92.805, | |
| "eval_steps_per_second": 5.805, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 8.748354784229225, | |
| "grad_norm": 0.3522128760814667, | |
| "learning_rate": 0.0003380553935860058, | |
| "loss": 3.4013, | |
| "step": 30050 | |
| }, | |
| { | |
| "epoch": 8.762914215829014, | |
| "grad_norm": 0.3406279385089874, | |
| "learning_rate": 0.00033761807580174924, | |
| "loss": 3.4014, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 8.777473647428804, | |
| "grad_norm": 0.33040550351142883, | |
| "learning_rate": 0.0003371807580174927, | |
| "loss": 3.392, | |
| "step": 30150 | |
| }, | |
| { | |
| "epoch": 8.792033079028595, | |
| "grad_norm": 0.35470637679100037, | |
| "learning_rate": 0.0003367434402332361, | |
| "loss": 3.3986, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 8.806592510628384, | |
| "grad_norm": 0.35664665699005127, | |
| "learning_rate": 0.0003363061224489796, | |
| "loss": 3.4054, | |
| "step": 30250 | |
| }, | |
| { | |
| "epoch": 8.821151942228175, | |
| "grad_norm": 0.35443365573883057, | |
| "learning_rate": 0.000335868804664723, | |
| "loss": 3.3916, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 8.835711373827966, | |
| "grad_norm": 0.3552112579345703, | |
| "learning_rate": 0.0003354314868804664, | |
| "loss": 3.4106, | |
| "step": 30350 | |
| }, | |
| { | |
| "epoch": 8.850270805427757, | |
| "grad_norm": 0.3517363667488098, | |
| "learning_rate": 0.00033499416909620993, | |
| "loss": 3.3959, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 8.864830237027547, | |
| "grad_norm": 0.3412357568740845, | |
| "learning_rate": 0.00033455685131195335, | |
| "loss": 3.3977, | |
| "step": 30450 | |
| }, | |
| { | |
| "epoch": 8.879389668627336, | |
| "grad_norm": 0.3659086227416992, | |
| "learning_rate": 0.00033411953352769676, | |
| "loss": 3.4041, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 8.893949100227127, | |
| "grad_norm": 0.3394777476787567, | |
| "learning_rate": 0.00033368221574344017, | |
| "loss": 3.3925, | |
| "step": 30550 | |
| }, | |
| { | |
| "epoch": 8.908508531826918, | |
| "grad_norm": 0.3358438014984131, | |
| "learning_rate": 0.0003332448979591837, | |
| "loss": 3.3954, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 8.923067963426707, | |
| "grad_norm": 0.3618221879005432, | |
| "learning_rate": 0.0003328075801749271, | |
| "loss": 3.3963, | |
| "step": 30650 | |
| }, | |
| { | |
| "epoch": 8.937627395026498, | |
| "grad_norm": 0.35156282782554626, | |
| "learning_rate": 0.0003323702623906705, | |
| "loss": 3.3973, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 8.952186826626289, | |
| "grad_norm": 0.3404799997806549, | |
| "learning_rate": 0.00033193294460641393, | |
| "loss": 3.4064, | |
| "step": 30750 | |
| }, | |
| { | |
| "epoch": 8.96674625822608, | |
| "grad_norm": 0.3573434352874756, | |
| "learning_rate": 0.0003314956268221574, | |
| "loss": 3.3962, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 8.98130568982587, | |
| "grad_norm": 0.3326402008533478, | |
| "learning_rate": 0.00033105830903790086, | |
| "loss": 3.3904, | |
| "step": 30850 | |
| }, | |
| { | |
| "epoch": 8.995865121425659, | |
| "grad_norm": 0.3333438038825989, | |
| "learning_rate": 0.0003306209912536443, | |
| "loss": 3.4072, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 9.010191602119853, | |
| "grad_norm": 0.35445913672447205, | |
| "learning_rate": 0.00033018367346938774, | |
| "loss": 3.3354, | |
| "step": 30950 | |
| }, | |
| { | |
| "epoch": 9.024751033719644, | |
| "grad_norm": 0.3602832555770874, | |
| "learning_rate": 0.00032974635568513115, | |
| "loss": 3.2997, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 9.024751033719644, | |
| "eval_accuracy": 0.3683871018568481, | |
| "eval_loss": 3.56929087638855, | |
| "eval_runtime": 180.7113, | |
| "eval_samples_per_second": 92.097, | |
| "eval_steps_per_second": 5.761, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 9.039310465319433, | |
| "grad_norm": 0.33829203248023987, | |
| "learning_rate": 0.0003293090379008746, | |
| "loss": 3.2994, | |
| "step": 31050 | |
| }, | |
| { | |
| "epoch": 9.053869896919224, | |
| "grad_norm": 0.36634117364883423, | |
| "learning_rate": 0.0003288717201166181, | |
| "loss": 3.3039, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 9.068429328519015, | |
| "grad_norm": 0.34743866324424744, | |
| "learning_rate": 0.0003284344023323615, | |
| "loss": 3.3133, | |
| "step": 31150 | |
| }, | |
| { | |
| "epoch": 9.082988760118806, | |
| "grad_norm": 0.3573026657104492, | |
| "learning_rate": 0.0003279970845481049, | |
| "loss": 3.3067, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 9.097548191718595, | |
| "grad_norm": 0.3499259650707245, | |
| "learning_rate": 0.0003275597667638483, | |
| "loss": 3.311, | |
| "step": 31250 | |
| }, | |
| { | |
| "epoch": 9.112107623318385, | |
| "grad_norm": 0.3550528287887573, | |
| "learning_rate": 0.00032712244897959184, | |
| "loss": 3.3146, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 9.126667054918176, | |
| "grad_norm": 0.3766951262950897, | |
| "learning_rate": 0.00032668513119533526, | |
| "loss": 3.3203, | |
| "step": 31350 | |
| }, | |
| { | |
| "epoch": 9.141226486517967, | |
| "grad_norm": 0.3506350517272949, | |
| "learning_rate": 0.00032624781341107867, | |
| "loss": 3.3383, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 9.155785918117756, | |
| "grad_norm": 0.36587440967559814, | |
| "learning_rate": 0.00032581049562682213, | |
| "loss": 3.3249, | |
| "step": 31450 | |
| }, | |
| { | |
| "epoch": 9.170345349717547, | |
| "grad_norm": 0.3548264503479004, | |
| "learning_rate": 0.0003253731778425656, | |
| "loss": 3.3173, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 9.184904781317337, | |
| "grad_norm": 0.3574599325656891, | |
| "learning_rate": 0.000324935860058309, | |
| "loss": 3.3277, | |
| "step": 31550 | |
| }, | |
| { | |
| "epoch": 9.199464212917128, | |
| "grad_norm": 0.3559187948703766, | |
| "learning_rate": 0.0003244985422740524, | |
| "loss": 3.3302, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 9.214023644516917, | |
| "grad_norm": 0.3626471757888794, | |
| "learning_rate": 0.0003240612244897959, | |
| "loss": 3.3249, | |
| "step": 31650 | |
| }, | |
| { | |
| "epoch": 9.228583076116708, | |
| "grad_norm": 0.34642550349235535, | |
| "learning_rate": 0.0003236239067055393, | |
| "loss": 3.3288, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 9.243142507716499, | |
| "grad_norm": 0.3562052249908447, | |
| "learning_rate": 0.00032318658892128277, | |
| "loss": 3.3381, | |
| "step": 31750 | |
| }, | |
| { | |
| "epoch": 9.25770193931629, | |
| "grad_norm": 0.35299643874168396, | |
| "learning_rate": 0.00032274927113702624, | |
| "loss": 3.3398, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 9.272261370916079, | |
| "grad_norm": 0.3579034209251404, | |
| "learning_rate": 0.00032231195335276965, | |
| "loss": 3.3376, | |
| "step": 31850 | |
| }, | |
| { | |
| "epoch": 9.28682080251587, | |
| "grad_norm": 0.3582768738269806, | |
| "learning_rate": 0.00032187463556851306, | |
| "loss": 3.3417, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 9.30138023411566, | |
| "grad_norm": 0.3462630808353424, | |
| "learning_rate": 0.0003214373177842565, | |
| "loss": 3.3388, | |
| "step": 31950 | |
| }, | |
| { | |
| "epoch": 9.315939665715451, | |
| "grad_norm": 0.35994312167167664, | |
| "learning_rate": 0.000321, | |
| "loss": 3.3417, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 9.315939665715451, | |
| "eval_accuracy": 0.3688413280713799, | |
| "eval_loss": 3.5689337253570557, | |
| "eval_runtime": 181.1092, | |
| "eval_samples_per_second": 91.895, | |
| "eval_steps_per_second": 5.748, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 9.33049909731524, | |
| "grad_norm": 0.34968388080596924, | |
| "learning_rate": 0.0003205626822157434, | |
| "loss": 3.3376, | |
| "step": 32050 | |
| }, | |
| { | |
| "epoch": 9.34505852891503, | |
| "grad_norm": 0.35291755199432373, | |
| "learning_rate": 0.0003201253644314868, | |
| "loss": 3.3412, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 9.359617960514822, | |
| "grad_norm": 0.3643549978733063, | |
| "learning_rate": 0.0003196880466472303, | |
| "loss": 3.3411, | |
| "step": 32150 | |
| }, | |
| { | |
| "epoch": 9.374177392114612, | |
| "grad_norm": 0.3537770211696625, | |
| "learning_rate": 0.00031925072886297375, | |
| "loss": 3.3642, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 9.388736823714403, | |
| "grad_norm": 0.3553234338760376, | |
| "learning_rate": 0.00031881341107871717, | |
| "loss": 3.353, | |
| "step": 32250 | |
| }, | |
| { | |
| "epoch": 9.403296255314192, | |
| "grad_norm": 0.35173216462135315, | |
| "learning_rate": 0.00031837609329446063, | |
| "loss": 3.3433, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 9.417855686913983, | |
| "grad_norm": 0.3561984598636627, | |
| "learning_rate": 0.00031793877551020405, | |
| "loss": 3.3459, | |
| "step": 32350 | |
| }, | |
| { | |
| "epoch": 9.432415118513774, | |
| "grad_norm": 0.3734908699989319, | |
| "learning_rate": 0.00031750145772594746, | |
| "loss": 3.3495, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 9.446974550113563, | |
| "grad_norm": 0.3848966658115387, | |
| "learning_rate": 0.000317064139941691, | |
| "loss": 3.3486, | |
| "step": 32450 | |
| }, | |
| { | |
| "epoch": 9.461533981713353, | |
| "grad_norm": 0.36939284205436707, | |
| "learning_rate": 0.0003166268221574344, | |
| "loss": 3.3527, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 9.476093413313144, | |
| "grad_norm": 0.3429546654224396, | |
| "learning_rate": 0.0003161895043731778, | |
| "loss": 3.3402, | |
| "step": 32550 | |
| }, | |
| { | |
| "epoch": 9.490652844912935, | |
| "grad_norm": 0.34233972430229187, | |
| "learning_rate": 0.0003157521865889212, | |
| "loss": 3.3508, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 9.505212276512726, | |
| "grad_norm": 0.3572950065135956, | |
| "learning_rate": 0.00031531486880466474, | |
| "loss": 3.3599, | |
| "step": 32650 | |
| }, | |
| { | |
| "epoch": 9.519771708112515, | |
| "grad_norm": 0.34846094250679016, | |
| "learning_rate": 0.00031487755102040815, | |
| "loss": 3.3585, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 9.534331139712306, | |
| "grad_norm": 0.3666765093803406, | |
| "learning_rate": 0.00031444023323615156, | |
| "loss": 3.3535, | |
| "step": 32750 | |
| }, | |
| { | |
| "epoch": 9.548890571312096, | |
| "grad_norm": 0.3483474850654602, | |
| "learning_rate": 0.000314002915451895, | |
| "loss": 3.3492, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 9.563450002911885, | |
| "grad_norm": 0.3478499948978424, | |
| "learning_rate": 0.00031356559766763844, | |
| "loss": 3.3664, | |
| "step": 32850 | |
| }, | |
| { | |
| "epoch": 9.578009434511676, | |
| "grad_norm": 0.3615437150001526, | |
| "learning_rate": 0.0003131282798833819, | |
| "loss": 3.3739, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 9.592568866111467, | |
| "grad_norm": 0.35250964760780334, | |
| "learning_rate": 0.0003126909620991253, | |
| "loss": 3.3745, | |
| "step": 32950 | |
| }, | |
| { | |
| "epoch": 9.607128297711258, | |
| "grad_norm": 0.35164180397987366, | |
| "learning_rate": 0.0003122536443148688, | |
| "loss": 3.3562, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 9.607128297711258, | |
| "eval_accuracy": 0.3692384085597243, | |
| "eval_loss": 3.5602471828460693, | |
| "eval_runtime": 180.7731, | |
| "eval_samples_per_second": 92.066, | |
| "eval_steps_per_second": 5.759, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 9.621687729311049, | |
| "grad_norm": 0.37672215700149536, | |
| "learning_rate": 0.0003118163265306122, | |
| "loss": 3.3735, | |
| "step": 33050 | |
| }, | |
| { | |
| "epoch": 9.636247160910838, | |
| "grad_norm": 0.36633849143981934, | |
| "learning_rate": 0.00031137900874635566, | |
| "loss": 3.3498, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 9.650806592510628, | |
| "grad_norm": 0.3514011800289154, | |
| "learning_rate": 0.00031094169096209913, | |
| "loss": 3.3604, | |
| "step": 33150 | |
| }, | |
| { | |
| "epoch": 9.66536602411042, | |
| "grad_norm": 0.35586225986480713, | |
| "learning_rate": 0.00031050437317784254, | |
| "loss": 3.3574, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 9.67992545571021, | |
| "grad_norm": 0.33317190408706665, | |
| "learning_rate": 0.00031006705539358596, | |
| "loss": 3.3546, | |
| "step": 33250 | |
| }, | |
| { | |
| "epoch": 9.694484887309999, | |
| "grad_norm": 0.35271352529525757, | |
| "learning_rate": 0.00030962973760932937, | |
| "loss": 3.3632, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 9.70904431890979, | |
| "grad_norm": 0.3521358370780945, | |
| "learning_rate": 0.0003091924198250729, | |
| "loss": 3.3584, | |
| "step": 33350 | |
| }, | |
| { | |
| "epoch": 9.72360375050958, | |
| "grad_norm": 0.3574683666229248, | |
| "learning_rate": 0.0003087551020408163, | |
| "loss": 3.3581, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 9.738163182109371, | |
| "grad_norm": 0.3643791377544403, | |
| "learning_rate": 0.0003083177842565597, | |
| "loss": 3.3691, | |
| "step": 33450 | |
| }, | |
| { | |
| "epoch": 9.75272261370916, | |
| "grad_norm": 0.35385361313819885, | |
| "learning_rate": 0.0003078804664723032, | |
| "loss": 3.3547, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 9.767282045308951, | |
| "grad_norm": 0.35955286026000977, | |
| "learning_rate": 0.00030744314868804665, | |
| "loss": 3.3496, | |
| "step": 33550 | |
| }, | |
| { | |
| "epoch": 9.781841476908742, | |
| "grad_norm": 0.3493342697620392, | |
| "learning_rate": 0.00030700583090379006, | |
| "loss": 3.3629, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 9.796400908508533, | |
| "grad_norm": 0.3883078396320343, | |
| "learning_rate": 0.00030656851311953347, | |
| "loss": 3.3643, | |
| "step": 33650 | |
| }, | |
| { | |
| "epoch": 9.810960340108322, | |
| "grad_norm": 0.34926533699035645, | |
| "learning_rate": 0.00030613119533527694, | |
| "loss": 3.3662, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 9.825519771708112, | |
| "grad_norm": 0.37770354747772217, | |
| "learning_rate": 0.00030569387755102035, | |
| "loss": 3.3813, | |
| "step": 33750 | |
| }, | |
| { | |
| "epoch": 9.840079203307903, | |
| "grad_norm": 0.3666662275791168, | |
| "learning_rate": 0.0003052565597667638, | |
| "loss": 3.3669, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 9.854638634907694, | |
| "grad_norm": 0.3690825402736664, | |
| "learning_rate": 0.0003048192419825073, | |
| "loss": 3.3718, | |
| "step": 33850 | |
| }, | |
| { | |
| "epoch": 9.869198066507483, | |
| "grad_norm": 0.3668816387653351, | |
| "learning_rate": 0.0003043819241982507, | |
| "loss": 3.3572, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 9.883757498107274, | |
| "grad_norm": 0.35026848316192627, | |
| "learning_rate": 0.0003039446064139941, | |
| "loss": 3.3714, | |
| "step": 33950 | |
| }, | |
| { | |
| "epoch": 9.898316929707065, | |
| "grad_norm": 0.36591610312461853, | |
| "learning_rate": 0.00030350728862973763, | |
| "loss": 3.3759, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 9.898316929707065, | |
| "eval_accuracy": 0.36981339333556196, | |
| "eval_loss": 3.553095817565918, | |
| "eval_runtime": 180.8581, | |
| "eval_samples_per_second": 92.022, | |
| "eval_steps_per_second": 5.756, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 9.912876361306855, | |
| "grad_norm": 0.3765810430049896, | |
| "learning_rate": 0.00030306997084548104, | |
| "loss": 3.363, | |
| "step": 34050 | |
| }, | |
| { | |
| "epoch": 9.927435792906644, | |
| "grad_norm": 0.3594549000263214, | |
| "learning_rate": 0.00030263265306122445, | |
| "loss": 3.3671, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 9.941995224506435, | |
| "grad_norm": 0.35946381092071533, | |
| "learning_rate": 0.00030219533527696787, | |
| "loss": 3.3735, | |
| "step": 34150 | |
| }, | |
| { | |
| "epoch": 9.956554656106226, | |
| "grad_norm": 0.37179645895957947, | |
| "learning_rate": 0.00030175801749271133, | |
| "loss": 3.3874, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 9.971114087706017, | |
| "grad_norm": 0.36117124557495117, | |
| "learning_rate": 0.0003013206997084548, | |
| "loss": 3.3806, | |
| "step": 34250 | |
| }, | |
| { | |
| "epoch": 9.985673519305806, | |
| "grad_norm": 0.34759020805358887, | |
| "learning_rate": 0.0003008833819241982, | |
| "loss": 3.3681, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": null, | |
| "learning_rate": 0.0003004460641399417, | |
| "loss": 3.3643, | |
| "step": 34350 | |
| }, | |
| { | |
| "epoch": 10.01455943159979, | |
| "grad_norm": 0.35527414083480835, | |
| "learning_rate": 0.0003000087463556851, | |
| "loss": 3.2614, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 10.029118863199582, | |
| "grad_norm": 0.3797459304332733, | |
| "learning_rate": 0.00029957142857142856, | |
| "loss": 3.269, | |
| "step": 34450 | |
| }, | |
| { | |
| "epoch": 10.04367829479937, | |
| "grad_norm": 0.36752596497535706, | |
| "learning_rate": 0.000299134110787172, | |
| "loss": 3.2838, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 10.058237726399161, | |
| "grad_norm": 0.34516459703445435, | |
| "learning_rate": 0.00029869679300291544, | |
| "loss": 3.272, | |
| "step": 34550 | |
| }, | |
| { | |
| "epoch": 10.072797157998952, | |
| "grad_norm": 0.3728445768356323, | |
| "learning_rate": 0.00029825947521865885, | |
| "loss": 3.2696, | |
| "step": 34600 | |
| }, | |
| { | |
| "epoch": 10.087356589598743, | |
| "grad_norm": 0.3747389018535614, | |
| "learning_rate": 0.0002978221574344023, | |
| "loss": 3.2888, | |
| "step": 34650 | |
| }, | |
| { | |
| "epoch": 10.101916021198532, | |
| "grad_norm": 0.34447789192199707, | |
| "learning_rate": 0.00029738483965014573, | |
| "loss": 3.2916, | |
| "step": 34700 | |
| }, | |
| { | |
| "epoch": 10.116475452798323, | |
| "grad_norm": 0.35870856046676636, | |
| "learning_rate": 0.0002969475218658892, | |
| "loss": 3.2913, | |
| "step": 34750 | |
| }, | |
| { | |
| "epoch": 10.131034884398114, | |
| "grad_norm": 0.35672426223754883, | |
| "learning_rate": 0.0002965102040816326, | |
| "loss": 3.2963, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 10.145594315997904, | |
| "grad_norm": 0.36722877621650696, | |
| "learning_rate": 0.0002960728862973761, | |
| "loss": 3.2886, | |
| "step": 34850 | |
| }, | |
| { | |
| "epoch": 10.160153747597693, | |
| "grad_norm": 0.3597167432308197, | |
| "learning_rate": 0.0002956355685131195, | |
| "loss": 3.3118, | |
| "step": 34900 | |
| }, | |
| { | |
| "epoch": 10.174713179197484, | |
| "grad_norm": 0.3561251759529114, | |
| "learning_rate": 0.00029519825072886295, | |
| "loss": 3.2997, | |
| "step": 34950 | |
| }, | |
| { | |
| "epoch": 10.189272610797275, | |
| "grad_norm": 0.37824273109436035, | |
| "learning_rate": 0.00029476093294460637, | |
| "loss": 3.3017, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 10.189272610797275, | |
| "eval_accuracy": 0.3693485845791435, | |
| "eval_loss": 3.5645644664764404, | |
| "eval_runtime": 180.1256, | |
| "eval_samples_per_second": 92.397, | |
| "eval_steps_per_second": 5.779, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 10.203832042397066, | |
| "grad_norm": 0.37217044830322266, | |
| "learning_rate": 0.00029432361516034983, | |
| "loss": 3.3035, | |
| "step": 35050 | |
| }, | |
| { | |
| "epoch": 10.218391473996855, | |
| "grad_norm": 0.3471571207046509, | |
| "learning_rate": 0.0002938862973760933, | |
| "loss": 3.3116, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 10.232950905596645, | |
| "grad_norm": 0.3539142310619354, | |
| "learning_rate": 0.0002934489795918367, | |
| "loss": 3.296, | |
| "step": 35150 | |
| }, | |
| { | |
| "epoch": 10.247510337196436, | |
| "grad_norm": 0.36773473024368286, | |
| "learning_rate": 0.0002930116618075802, | |
| "loss": 3.3028, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 10.262069768796227, | |
| "grad_norm": 0.3689476549625397, | |
| "learning_rate": 0.0002925743440233236, | |
| "loss": 3.3111, | |
| "step": 35250 | |
| }, | |
| { | |
| "epoch": 10.276629200396016, | |
| "grad_norm": 0.3640798032283783, | |
| "learning_rate": 0.00029213702623906706, | |
| "loss": 3.3157, | |
| "step": 35300 | |
| }, | |
| { | |
| "epoch": 10.291188631995807, | |
| "grad_norm": 0.3602818250656128, | |
| "learning_rate": 0.00029169970845481047, | |
| "loss": 3.3138, | |
| "step": 35350 | |
| }, | |
| { | |
| "epoch": 10.305748063595598, | |
| "grad_norm": 0.38390350341796875, | |
| "learning_rate": 0.00029126239067055394, | |
| "loss": 3.3077, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 10.320307495195388, | |
| "grad_norm": 0.36689597368240356, | |
| "learning_rate": 0.00029082507288629735, | |
| "loss": 3.309, | |
| "step": 35450 | |
| }, | |
| { | |
| "epoch": 10.334866926795177, | |
| "grad_norm": 0.3611031770706177, | |
| "learning_rate": 0.00029038775510204076, | |
| "loss": 3.3119, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 10.349426358394968, | |
| "grad_norm": 0.36774659156799316, | |
| "learning_rate": 0.0002899504373177842, | |
| "loss": 3.3189, | |
| "step": 35550 | |
| }, | |
| { | |
| "epoch": 10.363985789994759, | |
| "grad_norm": 0.36395514011383057, | |
| "learning_rate": 0.00028951311953352764, | |
| "loss": 3.3109, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 10.37854522159455, | |
| "grad_norm": 0.362166166305542, | |
| "learning_rate": 0.0002890758017492711, | |
| "loss": 3.3192, | |
| "step": 35650 | |
| }, | |
| { | |
| "epoch": 10.393104653194339, | |
| "grad_norm": 0.3618522882461548, | |
| "learning_rate": 0.0002886384839650145, | |
| "loss": 3.3183, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 10.40766408479413, | |
| "grad_norm": 0.3681625723838806, | |
| "learning_rate": 0.000288201166180758, | |
| "loss": 3.318, | |
| "step": 35750 | |
| }, | |
| { | |
| "epoch": 10.42222351639392, | |
| "grad_norm": 0.3899301588535309, | |
| "learning_rate": 0.00028776384839650145, | |
| "loss": 3.3182, | |
| "step": 35800 | |
| }, | |
| { | |
| "epoch": 10.436782947993711, | |
| "grad_norm": 0.35318905115127563, | |
| "learning_rate": 0.00028732653061224486, | |
| "loss": 3.3284, | |
| "step": 35850 | |
| }, | |
| { | |
| "epoch": 10.4513423795935, | |
| "grad_norm": 0.38061952590942383, | |
| "learning_rate": 0.00028688921282798833, | |
| "loss": 3.3173, | |
| "step": 35900 | |
| }, | |
| { | |
| "epoch": 10.46590181119329, | |
| "grad_norm": 0.3645211160182953, | |
| "learning_rate": 0.00028645189504373174, | |
| "loss": 3.3272, | |
| "step": 35950 | |
| }, | |
| { | |
| "epoch": 10.480461242793082, | |
| "grad_norm": 0.36433538794517517, | |
| "learning_rate": 0.0002860145772594752, | |
| "loss": 3.3164, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 10.480461242793082, | |
| "eval_accuracy": 0.37010747041621017, | |
| "eval_loss": 3.556314706802368, | |
| "eval_runtime": 180.1676, | |
| "eval_samples_per_second": 92.375, | |
| "eval_steps_per_second": 5.778, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 10.495020674392872, | |
| "grad_norm": 0.36834511160850525, | |
| "learning_rate": 0.0002855772594752186, | |
| "loss": 3.3291, | |
| "step": 36050 | |
| }, | |
| { | |
| "epoch": 10.509580105992661, | |
| "grad_norm": 0.3711186945438385, | |
| "learning_rate": 0.0002851399416909621, | |
| "loss": 3.3161, | |
| "step": 36100 | |
| }, | |
| { | |
| "epoch": 10.524139537592452, | |
| "grad_norm": 0.356585294008255, | |
| "learning_rate": 0.0002847026239067055, | |
| "loss": 3.3329, | |
| "step": 36150 | |
| }, | |
| { | |
| "epoch": 10.538698969192243, | |
| "grad_norm": 0.36765870451927185, | |
| "learning_rate": 0.00028426530612244897, | |
| "loss": 3.3352, | |
| "step": 36200 | |
| }, | |
| { | |
| "epoch": 10.553258400792034, | |
| "grad_norm": 0.3481246531009674, | |
| "learning_rate": 0.0002838279883381924, | |
| "loss": 3.3271, | |
| "step": 36250 | |
| }, | |
| { | |
| "epoch": 10.567817832391823, | |
| "grad_norm": 0.35420429706573486, | |
| "learning_rate": 0.00028339067055393585, | |
| "loss": 3.3268, | |
| "step": 36300 | |
| }, | |
| { | |
| "epoch": 10.582377263991614, | |
| "grad_norm": 0.3609519302845001, | |
| "learning_rate": 0.00028295335276967926, | |
| "loss": 3.3295, | |
| "step": 36350 | |
| }, | |
| { | |
| "epoch": 10.596936695591404, | |
| "grad_norm": 0.3677191138267517, | |
| "learning_rate": 0.0002825160349854227, | |
| "loss": 3.3333, | |
| "step": 36400 | |
| }, | |
| { | |
| "epoch": 10.611496127191195, | |
| "grad_norm": 0.37628525495529175, | |
| "learning_rate": 0.00028207871720116614, | |
| "loss": 3.3399, | |
| "step": 36450 | |
| }, | |
| { | |
| "epoch": 10.626055558790984, | |
| "grad_norm": 0.3637225925922394, | |
| "learning_rate": 0.0002816413994169096, | |
| "loss": 3.3349, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 10.640614990390775, | |
| "grad_norm": 0.3519335091114044, | |
| "learning_rate": 0.00028120408163265307, | |
| "loss": 3.3346, | |
| "step": 36550 | |
| }, | |
| { | |
| "epoch": 10.655174421990566, | |
| "grad_norm": 0.348203182220459, | |
| "learning_rate": 0.0002807667638483965, | |
| "loss": 3.3289, | |
| "step": 36600 | |
| }, | |
| { | |
| "epoch": 10.669733853590357, | |
| "grad_norm": 0.36233091354370117, | |
| "learning_rate": 0.00028032944606413995, | |
| "loss": 3.3264, | |
| "step": 36650 | |
| }, | |
| { | |
| "epoch": 10.684293285190146, | |
| "grad_norm": 0.3718380630016327, | |
| "learning_rate": 0.00027989212827988336, | |
| "loss": 3.3317, | |
| "step": 36700 | |
| }, | |
| { | |
| "epoch": 10.698852716789936, | |
| "grad_norm": 0.35991501808166504, | |
| "learning_rate": 0.00027945481049562683, | |
| "loss": 3.3365, | |
| "step": 36750 | |
| }, | |
| { | |
| "epoch": 10.713412148389727, | |
| "grad_norm": 0.37417152523994446, | |
| "learning_rate": 0.00027901749271137024, | |
| "loss": 3.3266, | |
| "step": 36800 | |
| }, | |
| { | |
| "epoch": 10.727971579989518, | |
| "grad_norm": 0.3618806004524231, | |
| "learning_rate": 0.00027858017492711365, | |
| "loss": 3.3338, | |
| "step": 36850 | |
| }, | |
| { | |
| "epoch": 10.742531011589307, | |
| "grad_norm": 0.3808761239051819, | |
| "learning_rate": 0.0002781428571428571, | |
| "loss": 3.3322, | |
| "step": 36900 | |
| }, | |
| { | |
| "epoch": 10.757090443189098, | |
| "grad_norm": 0.35829290747642517, | |
| "learning_rate": 0.00027770553935860053, | |
| "loss": 3.3405, | |
| "step": 36950 | |
| }, | |
| { | |
| "epoch": 10.771649874788888, | |
| "grad_norm": 0.35556626319885254, | |
| "learning_rate": 0.000277268221574344, | |
| "loss": 3.3349, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 10.771649874788888, | |
| "eval_accuracy": 0.3708527165326231, | |
| "eval_loss": 3.548063039779663, | |
| "eval_runtime": 180.2516, | |
| "eval_samples_per_second": 92.332, | |
| "eval_steps_per_second": 5.775, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 10.78620930638868, | |
| "grad_norm": 0.36781635880470276, | |
| "learning_rate": 0.0002768309037900874, | |
| "loss": 3.3393, | |
| "step": 37050 | |
| }, | |
| { | |
| "epoch": 10.800768737988468, | |
| "grad_norm": 0.3739968538284302, | |
| "learning_rate": 0.0002763935860058309, | |
| "loss": 3.3362, | |
| "step": 37100 | |
| }, | |
| { | |
| "epoch": 10.815328169588259, | |
| "grad_norm": 0.37725409865379333, | |
| "learning_rate": 0.0002759562682215743, | |
| "loss": 3.343, | |
| "step": 37150 | |
| }, | |
| { | |
| "epoch": 10.82988760118805, | |
| "grad_norm": 0.3467895984649658, | |
| "learning_rate": 0.00027551895043731776, | |
| "loss": 3.3424, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 10.84444703278784, | |
| "grad_norm": 0.3589009940624237, | |
| "learning_rate": 0.0002750816326530612, | |
| "loss": 3.3149, | |
| "step": 37250 | |
| }, | |
| { | |
| "epoch": 10.85900646438763, | |
| "grad_norm": 0.36413517594337463, | |
| "learning_rate": 0.00027464431486880464, | |
| "loss": 3.3448, | |
| "step": 37300 | |
| }, | |
| { | |
| "epoch": 10.87356589598742, | |
| "grad_norm": 0.3594954311847687, | |
| "learning_rate": 0.0002742069970845481, | |
| "loss": 3.3452, | |
| "step": 37350 | |
| }, | |
| { | |
| "epoch": 10.888125327587211, | |
| "grad_norm": 0.36977705359458923, | |
| "learning_rate": 0.0002737696793002915, | |
| "loss": 3.3387, | |
| "step": 37400 | |
| }, | |
| { | |
| "epoch": 10.902684759187002, | |
| "grad_norm": 0.3728332817554474, | |
| "learning_rate": 0.000273332361516035, | |
| "loss": 3.3554, | |
| "step": 37450 | |
| }, | |
| { | |
| "epoch": 10.917244190786791, | |
| "grad_norm": 0.3603312075138092, | |
| "learning_rate": 0.0002728950437317784, | |
| "loss": 3.3495, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 10.931803622386582, | |
| "grad_norm": 0.37357112765312195, | |
| "learning_rate": 0.00027245772594752186, | |
| "loss": 3.3509, | |
| "step": 37550 | |
| }, | |
| { | |
| "epoch": 10.946363053986373, | |
| "grad_norm": 0.3870396316051483, | |
| "learning_rate": 0.00027202040816326527, | |
| "loss": 3.3451, | |
| "step": 37600 | |
| }, | |
| { | |
| "epoch": 10.960922485586163, | |
| "grad_norm": 0.36924847960472107, | |
| "learning_rate": 0.00027158309037900874, | |
| "loss": 3.3482, | |
| "step": 37650 | |
| }, | |
| { | |
| "epoch": 10.975481917185952, | |
| "grad_norm": 0.3659966289997101, | |
| "learning_rate": 0.00027114577259475215, | |
| "loss": 3.3429, | |
| "step": 37700 | |
| }, | |
| { | |
| "epoch": 10.990041348785743, | |
| "grad_norm": 0.3750581741333008, | |
| "learning_rate": 0.00027070845481049556, | |
| "loss": 3.3467, | |
| "step": 37750 | |
| }, | |
| { | |
| "epoch": 11.004367829479937, | |
| "grad_norm": 0.3540584444999695, | |
| "learning_rate": 0.00027027113702623903, | |
| "loss": 3.3111, | |
| "step": 37800 | |
| }, | |
| { | |
| "epoch": 11.018927261079728, | |
| "grad_norm": 0.36422842741012573, | |
| "learning_rate": 0.0002698338192419825, | |
| "loss": 3.2475, | |
| "step": 37850 | |
| }, | |
| { | |
| "epoch": 11.033486692679517, | |
| "grad_norm": 0.36595383286476135, | |
| "learning_rate": 0.0002693965014577259, | |
| "loss": 3.2508, | |
| "step": 37900 | |
| }, | |
| { | |
| "epoch": 11.048046124279308, | |
| "grad_norm": 0.3714156448841095, | |
| "learning_rate": 0.0002689591836734694, | |
| "loss": 3.2548, | |
| "step": 37950 | |
| }, | |
| { | |
| "epoch": 11.062605555879099, | |
| "grad_norm": 0.38618841767311096, | |
| "learning_rate": 0.00026852186588921284, | |
| "loss": 3.2621, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 11.062605555879099, | |
| "eval_accuracy": 0.370781343166788, | |
| "eval_loss": 3.5563323497772217, | |
| "eval_runtime": 179.9746, | |
| "eval_samples_per_second": 92.474, | |
| "eval_steps_per_second": 5.784, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 11.07716498747889, | |
| "grad_norm": 0.3659396469593048, | |
| "learning_rate": 0.00026808454810495625, | |
| "loss": 3.2429, | |
| "step": 38050 | |
| }, | |
| { | |
| "epoch": 11.091724419078679, | |
| "grad_norm": 0.3699627220630646, | |
| "learning_rate": 0.0002676472303206997, | |
| "loss": 3.2574, | |
| "step": 38100 | |
| }, | |
| { | |
| "epoch": 11.10628385067847, | |
| "grad_norm": 0.371509313583374, | |
| "learning_rate": 0.00026720991253644313, | |
| "loss": 3.2581, | |
| "step": 38150 | |
| }, | |
| { | |
| "epoch": 11.12084328227826, | |
| "grad_norm": 0.3545081317424774, | |
| "learning_rate": 0.00026677259475218655, | |
| "loss": 3.2631, | |
| "step": 38200 | |
| }, | |
| { | |
| "epoch": 11.135402713878051, | |
| "grad_norm": 0.36414968967437744, | |
| "learning_rate": 0.00026633527696793, | |
| "loss": 3.2715, | |
| "step": 38250 | |
| }, | |
| { | |
| "epoch": 11.14996214547784, | |
| "grad_norm": 0.36221858859062195, | |
| "learning_rate": 0.0002658979591836734, | |
| "loss": 3.2702, | |
| "step": 38300 | |
| }, | |
| { | |
| "epoch": 11.16452157707763, | |
| "grad_norm": 0.35454094409942627, | |
| "learning_rate": 0.0002654606413994169, | |
| "loss": 3.2636, | |
| "step": 38350 | |
| }, | |
| { | |
| "epoch": 11.179081008677421, | |
| "grad_norm": 0.38314637541770935, | |
| "learning_rate": 0.0002650233236151603, | |
| "loss": 3.2647, | |
| "step": 38400 | |
| }, | |
| { | |
| "epoch": 11.193640440277212, | |
| "grad_norm": 0.36567234992980957, | |
| "learning_rate": 0.00026458600583090377, | |
| "loss": 3.2764, | |
| "step": 38450 | |
| }, | |
| { | |
| "epoch": 11.208199871877001, | |
| "grad_norm": 0.36688846349716187, | |
| "learning_rate": 0.0002641486880466472, | |
| "loss": 3.2722, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 11.222759303476792, | |
| "grad_norm": 0.38438311219215393, | |
| "learning_rate": 0.00026371137026239065, | |
| "loss": 3.2675, | |
| "step": 38550 | |
| }, | |
| { | |
| "epoch": 11.237318735076583, | |
| "grad_norm": 0.3896602392196655, | |
| "learning_rate": 0.0002632740524781341, | |
| "loss": 3.2684, | |
| "step": 38600 | |
| }, | |
| { | |
| "epoch": 11.251878166676374, | |
| "grad_norm": 0.3787361681461334, | |
| "learning_rate": 0.00026283673469387753, | |
| "loss": 3.2774, | |
| "step": 38650 | |
| }, | |
| { | |
| "epoch": 11.266437598276163, | |
| "grad_norm": 0.36523544788360596, | |
| "learning_rate": 0.000262399416909621, | |
| "loss": 3.2731, | |
| "step": 38700 | |
| }, | |
| { | |
| "epoch": 11.280997029875953, | |
| "grad_norm": 0.38595035672187805, | |
| "learning_rate": 0.0002619620991253644, | |
| "loss": 3.2802, | |
| "step": 38750 | |
| }, | |
| { | |
| "epoch": 11.295556461475744, | |
| "grad_norm": 0.3597980737686157, | |
| "learning_rate": 0.0002615247813411079, | |
| "loss": 3.2836, | |
| "step": 38800 | |
| }, | |
| { | |
| "epoch": 11.310115893075535, | |
| "grad_norm": 0.375411719083786, | |
| "learning_rate": 0.0002610874635568513, | |
| "loss": 3.2809, | |
| "step": 38850 | |
| }, | |
| { | |
| "epoch": 11.324675324675324, | |
| "grad_norm": 0.3683791756629944, | |
| "learning_rate": 0.00026065014577259475, | |
| "loss": 3.2832, | |
| "step": 38900 | |
| }, | |
| { | |
| "epoch": 11.339234756275115, | |
| "grad_norm": 0.36232179403305054, | |
| "learning_rate": 0.00026021282798833817, | |
| "loss": 3.279, | |
| "step": 38950 | |
| }, | |
| { | |
| "epoch": 11.353794187874906, | |
| "grad_norm": 0.3584194779396057, | |
| "learning_rate": 0.0002597755102040816, | |
| "loss": 3.2832, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 11.353794187874906, | |
| "eval_accuracy": 0.37091233151858416, | |
| "eval_loss": 3.553581953048706, | |
| "eval_runtime": 179.822, | |
| "eval_samples_per_second": 92.553, | |
| "eval_steps_per_second": 5.789, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 11.368353619474696, | |
| "grad_norm": 0.36148691177368164, | |
| "learning_rate": 0.00025933819241982504, | |
| "loss": 3.2926, | |
| "step": 39050 | |
| }, | |
| { | |
| "epoch": 11.382913051074485, | |
| "grad_norm": 0.36825209856033325, | |
| "learning_rate": 0.00025890087463556846, | |
| "loss": 3.2974, | |
| "step": 39100 | |
| }, | |
| { | |
| "epoch": 11.397472482674276, | |
| "grad_norm": 0.3690287470817566, | |
| "learning_rate": 0.0002584635568513119, | |
| "loss": 3.285, | |
| "step": 39150 | |
| }, | |
| { | |
| "epoch": 11.412031914274067, | |
| "grad_norm": 0.37193694710731506, | |
| "learning_rate": 0.00025802623906705534, | |
| "loss": 3.3057, | |
| "step": 39200 | |
| }, | |
| { | |
| "epoch": 11.426591345873858, | |
| "grad_norm": 0.3798997700214386, | |
| "learning_rate": 0.0002575889212827988, | |
| "loss": 3.2901, | |
| "step": 39250 | |
| }, | |
| { | |
| "epoch": 11.441150777473647, | |
| "grad_norm": 0.3867810368537903, | |
| "learning_rate": 0.00025715160349854227, | |
| "loss": 3.2994, | |
| "step": 39300 | |
| }, | |
| { | |
| "epoch": 11.455710209073438, | |
| "grad_norm": 0.3750901520252228, | |
| "learning_rate": 0.0002567142857142857, | |
| "loss": 3.2932, | |
| "step": 39350 | |
| }, | |
| { | |
| "epoch": 11.470269640673228, | |
| "grad_norm": 0.35880762338638306, | |
| "learning_rate": 0.00025627696793002915, | |
| "loss": 3.2873, | |
| "step": 39400 | |
| }, | |
| { | |
| "epoch": 11.484829072273019, | |
| "grad_norm": 0.3917964994907379, | |
| "learning_rate": 0.00025583965014577256, | |
| "loss": 3.2952, | |
| "step": 39450 | |
| }, | |
| { | |
| "epoch": 11.499388503872808, | |
| "grad_norm": 0.3772904574871063, | |
| "learning_rate": 0.000255402332361516, | |
| "loss": 3.3044, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 11.513947935472599, | |
| "grad_norm": 0.3691461980342865, | |
| "learning_rate": 0.00025496501457725944, | |
| "loss": 3.3156, | |
| "step": 39550 | |
| }, | |
| { | |
| "epoch": 11.52850736707239, | |
| "grad_norm": 0.36424410343170166, | |
| "learning_rate": 0.0002545276967930029, | |
| "loss": 3.3019, | |
| "step": 39600 | |
| }, | |
| { | |
| "epoch": 11.54306679867218, | |
| "grad_norm": 0.3689974844455719, | |
| "learning_rate": 0.0002540903790087463, | |
| "loss": 3.3016, | |
| "step": 39650 | |
| }, | |
| { | |
| "epoch": 11.55762623027197, | |
| "grad_norm": 0.38458317518234253, | |
| "learning_rate": 0.0002536530612244898, | |
| "loss": 3.3063, | |
| "step": 39700 | |
| }, | |
| { | |
| "epoch": 11.57218566187176, | |
| "grad_norm": 0.3871372640132904, | |
| "learning_rate": 0.0002532157434402332, | |
| "loss": 3.3034, | |
| "step": 39750 | |
| }, | |
| { | |
| "epoch": 11.586745093471551, | |
| "grad_norm": 0.3936833143234253, | |
| "learning_rate": 0.00025277842565597666, | |
| "loss": 3.3101, | |
| "step": 39800 | |
| }, | |
| { | |
| "epoch": 11.601304525071342, | |
| "grad_norm": 0.3917473256587982, | |
| "learning_rate": 0.0002523411078717201, | |
| "loss": 3.3225, | |
| "step": 39850 | |
| }, | |
| { | |
| "epoch": 11.61586395667113, | |
| "grad_norm": 0.3640928864479065, | |
| "learning_rate": 0.00025190379008746354, | |
| "loss": 3.304, | |
| "step": 39900 | |
| }, | |
| { | |
| "epoch": 11.630423388270922, | |
| "grad_norm": 0.4092429578304291, | |
| "learning_rate": 0.00025146647230320696, | |
| "loss": 3.3123, | |
| "step": 39950 | |
| }, | |
| { | |
| "epoch": 11.644982819870712, | |
| "grad_norm": 0.3751949071884155, | |
| "learning_rate": 0.0002510291545189504, | |
| "loss": 3.3111, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 11.644982819870712, | |
| "eval_accuracy": 0.371191240289195, | |
| "eval_loss": 3.5466434955596924, | |
| "eval_runtime": 179.6345, | |
| "eval_samples_per_second": 92.649, | |
| "eval_steps_per_second": 5.795, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 11.659542251470503, | |
| "grad_norm": 0.3745579719543457, | |
| "learning_rate": 0.0002505918367346939, | |
| "loss": 3.3036, | |
| "step": 40050 | |
| }, | |
| { | |
| "epoch": 11.674101683070292, | |
| "grad_norm": 0.3651117980480194, | |
| "learning_rate": 0.0002501545189504373, | |
| "loss": 3.3111, | |
| "step": 40100 | |
| }, | |
| { | |
| "epoch": 11.688661114670083, | |
| "grad_norm": 0.3845028579235077, | |
| "learning_rate": 0.00024971720116618077, | |
| "loss": 3.3049, | |
| "step": 40150 | |
| }, | |
| { | |
| "epoch": 11.703220546269874, | |
| "grad_norm": 0.3631397783756256, | |
| "learning_rate": 0.0002492798833819242, | |
| "loss": 3.3031, | |
| "step": 40200 | |
| }, | |
| { | |
| "epoch": 11.717779977869665, | |
| "grad_norm": 0.37985455989837646, | |
| "learning_rate": 0.00024884256559766765, | |
| "loss": 3.3111, | |
| "step": 40250 | |
| }, | |
| { | |
| "epoch": 11.732339409469454, | |
| "grad_norm": 0.3810136616230011, | |
| "learning_rate": 0.00024840524781341106, | |
| "loss": 3.304, | |
| "step": 40300 | |
| }, | |
| { | |
| "epoch": 11.746898841069244, | |
| "grad_norm": 0.37321019172668457, | |
| "learning_rate": 0.00024796793002915447, | |
| "loss": 3.3037, | |
| "step": 40350 | |
| }, | |
| { | |
| "epoch": 11.761458272669035, | |
| "grad_norm": 0.37630370259284973, | |
| "learning_rate": 0.00024753061224489794, | |
| "loss": 3.3086, | |
| "step": 40400 | |
| }, | |
| { | |
| "epoch": 11.776017704268826, | |
| "grad_norm": 0.36770492792129517, | |
| "learning_rate": 0.00024709329446064135, | |
| "loss": 3.3181, | |
| "step": 40450 | |
| }, | |
| { | |
| "epoch": 11.790577135868615, | |
| "grad_norm": 0.3986623287200928, | |
| "learning_rate": 0.0002466559766763848, | |
| "loss": 3.3203, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 11.805136567468406, | |
| "grad_norm": 0.3720547556877136, | |
| "learning_rate": 0.00024621865889212823, | |
| "loss": 3.3183, | |
| "step": 40550 | |
| }, | |
| { | |
| "epoch": 11.819695999068196, | |
| "grad_norm": 0.37636956572532654, | |
| "learning_rate": 0.0002457813411078717, | |
| "loss": 3.3105, | |
| "step": 40600 | |
| }, | |
| { | |
| "epoch": 11.834255430667987, | |
| "grad_norm": 0.3576502501964569, | |
| "learning_rate": 0.00024534402332361516, | |
| "loss": 3.3248, | |
| "step": 40650 | |
| }, | |
| { | |
| "epoch": 11.848814862267776, | |
| "grad_norm": 0.3808489739894867, | |
| "learning_rate": 0.0002449067055393586, | |
| "loss": 3.3177, | |
| "step": 40700 | |
| }, | |
| { | |
| "epoch": 11.863374293867567, | |
| "grad_norm": 0.3698633313179016, | |
| "learning_rate": 0.00024446938775510204, | |
| "loss": 3.3259, | |
| "step": 40750 | |
| }, | |
| { | |
| "epoch": 11.877933725467358, | |
| "grad_norm": 0.3820943534374237, | |
| "learning_rate": 0.00024403206997084545, | |
| "loss": 3.3181, | |
| "step": 40800 | |
| }, | |
| { | |
| "epoch": 11.892493157067149, | |
| "grad_norm": 0.385206401348114, | |
| "learning_rate": 0.0002435947521865889, | |
| "loss": 3.3164, | |
| "step": 40850 | |
| }, | |
| { | |
| "epoch": 11.90705258866694, | |
| "grad_norm": 0.3971438705921173, | |
| "learning_rate": 0.00024315743440233233, | |
| "loss": 3.3099, | |
| "step": 40900 | |
| }, | |
| { | |
| "epoch": 11.921612020266728, | |
| "grad_norm": 0.38719749450683594, | |
| "learning_rate": 0.0002427201166180758, | |
| "loss": 3.3164, | |
| "step": 40950 | |
| }, | |
| { | |
| "epoch": 11.93617145186652, | |
| "grad_norm": 0.3685580790042877, | |
| "learning_rate": 0.0002422827988338192, | |
| "loss": 3.326, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 11.93617145186652, | |
| "eval_accuracy": 0.37202020607031067, | |
| "eval_loss": 3.5386884212493896, | |
| "eval_runtime": 180.0955, | |
| "eval_samples_per_second": 92.412, | |
| "eval_steps_per_second": 5.78, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 11.95073088346631, | |
| "grad_norm": 0.37299463152885437, | |
| "learning_rate": 0.00024184548104956268, | |
| "loss": 3.3102, | |
| "step": 41050 | |
| }, | |
| { | |
| "epoch": 11.965290315066099, | |
| "grad_norm": 0.40013447403907776, | |
| "learning_rate": 0.0002414081632653061, | |
| "loss": 3.3126, | |
| "step": 41100 | |
| }, | |
| { | |
| "epoch": 11.97984974666589, | |
| "grad_norm": 0.3682084083557129, | |
| "learning_rate": 0.00024097084548104956, | |
| "loss": 3.3236, | |
| "step": 41150 | |
| }, | |
| { | |
| "epoch": 11.99440917826568, | |
| "grad_norm": 0.37663114070892334, | |
| "learning_rate": 0.00024053352769679297, | |
| "loss": 3.3187, | |
| "step": 41200 | |
| }, | |
| { | |
| "epoch": 12.008735658959875, | |
| "grad_norm": 0.3868860900402069, | |
| "learning_rate": 0.0002400962099125364, | |
| "loss": 3.2557, | |
| "step": 41250 | |
| }, | |
| { | |
| "epoch": 12.023295090559664, | |
| "grad_norm": 0.39282509684562683, | |
| "learning_rate": 0.00023965889212827988, | |
| "loss": 3.2263, | |
| "step": 41300 | |
| }, | |
| { | |
| "epoch": 12.037854522159455, | |
| "grad_norm": 0.38918325304985046, | |
| "learning_rate": 0.0002392215743440233, | |
| "loss": 3.2284, | |
| "step": 41350 | |
| }, | |
| { | |
| "epoch": 12.052413953759245, | |
| "grad_norm": 0.37995877861976624, | |
| "learning_rate": 0.00023878425655976675, | |
| "loss": 3.232, | |
| "step": 41400 | |
| }, | |
| { | |
| "epoch": 12.066973385359036, | |
| "grad_norm": 0.38068705797195435, | |
| "learning_rate": 0.00023834693877551017, | |
| "loss": 3.2244, | |
| "step": 41450 | |
| }, | |
| { | |
| "epoch": 12.081532816958825, | |
| "grad_norm": 0.3786507844924927, | |
| "learning_rate": 0.00023790962099125363, | |
| "loss": 3.2388, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 12.096092248558616, | |
| "grad_norm": 0.39703622460365295, | |
| "learning_rate": 0.00023747230320699707, | |
| "loss": 3.2361, | |
| "step": 41550 | |
| }, | |
| { | |
| "epoch": 12.110651680158407, | |
| "grad_norm": 0.3827173709869385, | |
| "learning_rate": 0.0002370349854227405, | |
| "loss": 3.2446, | |
| "step": 41600 | |
| }, | |
| { | |
| "epoch": 12.125211111758198, | |
| "grad_norm": 0.38849908113479614, | |
| "learning_rate": 0.00023659766763848395, | |
| "loss": 3.2449, | |
| "step": 41650 | |
| }, | |
| { | |
| "epoch": 12.139770543357987, | |
| "grad_norm": 0.3723032772541046, | |
| "learning_rate": 0.00023616034985422736, | |
| "loss": 3.2473, | |
| "step": 41700 | |
| }, | |
| { | |
| "epoch": 12.154329974957777, | |
| "grad_norm": 0.3933330774307251, | |
| "learning_rate": 0.00023572303206997083, | |
| "loss": 3.2458, | |
| "step": 41750 | |
| }, | |
| { | |
| "epoch": 12.168889406557568, | |
| "grad_norm": 0.3799208998680115, | |
| "learning_rate": 0.00023528571428571424, | |
| "loss": 3.2397, | |
| "step": 41800 | |
| }, | |
| { | |
| "epoch": 12.183448838157359, | |
| "grad_norm": 0.37628769874572754, | |
| "learning_rate": 0.0002348483965014577, | |
| "loss": 3.2561, | |
| "step": 41850 | |
| }, | |
| { | |
| "epoch": 12.198008269757148, | |
| "grad_norm": 0.39289426803588867, | |
| "learning_rate": 0.00023441107871720115, | |
| "loss": 3.2559, | |
| "step": 41900 | |
| }, | |
| { | |
| "epoch": 12.212567701356939, | |
| "grad_norm": 0.38283148407936096, | |
| "learning_rate": 0.0002339737609329446, | |
| "loss": 3.2436, | |
| "step": 41950 | |
| }, | |
| { | |
| "epoch": 12.22712713295673, | |
| "grad_norm": 0.3877449929714203, | |
| "learning_rate": 0.00023353644314868803, | |
| "loss": 3.2475, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 12.22712713295673, | |
| "eval_accuracy": 0.3715757393110742, | |
| "eval_loss": 3.5504579544067383, | |
| "eval_runtime": 180.0981, | |
| "eval_samples_per_second": 92.411, | |
| "eval_steps_per_second": 5.78, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 12.24168656455652, | |
| "grad_norm": 0.382993221282959, | |
| "learning_rate": 0.0002330991253644315, | |
| "loss": 3.2507, | |
| "step": 42050 | |
| }, | |
| { | |
| "epoch": 12.25624599615631, | |
| "grad_norm": 0.3783535063266754, | |
| "learning_rate": 0.0002326618075801749, | |
| "loss": 3.2515, | |
| "step": 42100 | |
| }, | |
| { | |
| "epoch": 12.2708054277561, | |
| "grad_norm": 0.3950106203556061, | |
| "learning_rate": 0.00023222448979591835, | |
| "loss": 3.2695, | |
| "step": 42150 | |
| }, | |
| { | |
| "epoch": 12.28536485935589, | |
| "grad_norm": 0.368351548910141, | |
| "learning_rate": 0.00023178717201166179, | |
| "loss": 3.264, | |
| "step": 42200 | |
| }, | |
| { | |
| "epoch": 12.299924290955682, | |
| "grad_norm": 0.39170610904693604, | |
| "learning_rate": 0.00023134985422740523, | |
| "loss": 3.2618, | |
| "step": 42250 | |
| }, | |
| { | |
| "epoch": 12.31448372255547, | |
| "grad_norm": 0.3892439901828766, | |
| "learning_rate": 0.00023091253644314866, | |
| "loss": 3.2575, | |
| "step": 42300 | |
| }, | |
| { | |
| "epoch": 12.329043154155261, | |
| "grad_norm": 0.40385738015174866, | |
| "learning_rate": 0.0002304752186588921, | |
| "loss": 3.2627, | |
| "step": 42350 | |
| }, | |
| { | |
| "epoch": 12.343602585755052, | |
| "grad_norm": 0.3920726776123047, | |
| "learning_rate": 0.00023003790087463557, | |
| "loss": 3.2548, | |
| "step": 42400 | |
| }, | |
| { | |
| "epoch": 12.358162017354843, | |
| "grad_norm": 0.389912873506546, | |
| "learning_rate": 0.00022960058309037898, | |
| "loss": 3.2589, | |
| "step": 42450 | |
| }, | |
| { | |
| "epoch": 12.372721448954632, | |
| "grad_norm": 0.3855544924736023, | |
| "learning_rate": 0.00022916326530612245, | |
| "loss": 3.277, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 12.387280880554423, | |
| "grad_norm": 0.4056563973426819, | |
| "learning_rate": 0.00022872594752186586, | |
| "loss": 3.2563, | |
| "step": 42550 | |
| }, | |
| { | |
| "epoch": 12.401840312154214, | |
| "grad_norm": 0.3610406816005707, | |
| "learning_rate": 0.0002282886297376093, | |
| "loss": 3.2695, | |
| "step": 42600 | |
| }, | |
| { | |
| "epoch": 12.416399743754004, | |
| "grad_norm": 0.3934992253780365, | |
| "learning_rate": 0.00022785131195335277, | |
| "loss": 3.2654, | |
| "step": 42650 | |
| }, | |
| { | |
| "epoch": 12.430959175353793, | |
| "grad_norm": 0.3924916386604309, | |
| "learning_rate": 0.00022741399416909618, | |
| "loss": 3.2659, | |
| "step": 42700 | |
| }, | |
| { | |
| "epoch": 12.445518606953584, | |
| "grad_norm": 0.3711550533771515, | |
| "learning_rate": 0.00022697667638483965, | |
| "loss": 3.2705, | |
| "step": 42750 | |
| }, | |
| { | |
| "epoch": 12.460078038553375, | |
| "grad_norm": 0.38350147008895874, | |
| "learning_rate": 0.00022653935860058306, | |
| "loss": 3.2869, | |
| "step": 42800 | |
| }, | |
| { | |
| "epoch": 12.474637470153166, | |
| "grad_norm": 0.38396674394607544, | |
| "learning_rate": 0.00022610204081632653, | |
| "loss": 3.2721, | |
| "step": 42850 | |
| }, | |
| { | |
| "epoch": 12.489196901752955, | |
| "grad_norm": 0.41561633348464966, | |
| "learning_rate": 0.00022566472303206994, | |
| "loss": 3.2791, | |
| "step": 42900 | |
| }, | |
| { | |
| "epoch": 12.503756333352746, | |
| "grad_norm": 0.384716659784317, | |
| "learning_rate": 0.00022522740524781338, | |
| "loss": 3.259, | |
| "step": 42950 | |
| }, | |
| { | |
| "epoch": 12.518315764952536, | |
| "grad_norm": 0.36449480056762695, | |
| "learning_rate": 0.00022479008746355684, | |
| "loss": 3.2651, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 12.518315764952536, | |
| "eval_accuracy": 0.3719537712240227, | |
| "eval_loss": 3.545558214187622, | |
| "eval_runtime": 179.9007, | |
| "eval_samples_per_second": 92.512, | |
| "eval_steps_per_second": 5.787, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 12.532875196552327, | |
| "grad_norm": 0.36965641379356384, | |
| "learning_rate": 0.00022435276967930026, | |
| "loss": 3.2749, | |
| "step": 43050 | |
| }, | |
| { | |
| "epoch": 12.547434628152118, | |
| "grad_norm": 0.3777758777141571, | |
| "learning_rate": 0.00022391545189504372, | |
| "loss": 3.2745, | |
| "step": 43100 | |
| }, | |
| { | |
| "epoch": 12.561994059751907, | |
| "grad_norm": 0.37569695711135864, | |
| "learning_rate": 0.00022347813411078714, | |
| "loss": 3.2759, | |
| "step": 43150 | |
| }, | |
| { | |
| "epoch": 12.576553491351698, | |
| "grad_norm": 0.40388092398643494, | |
| "learning_rate": 0.0002230408163265306, | |
| "loss": 3.2815, | |
| "step": 43200 | |
| }, | |
| { | |
| "epoch": 12.591112922951488, | |
| "grad_norm": 0.3894331455230713, | |
| "learning_rate": 0.00022260349854227402, | |
| "loss": 3.2916, | |
| "step": 43250 | |
| }, | |
| { | |
| "epoch": 12.605672354551277, | |
| "grad_norm": 0.4040619432926178, | |
| "learning_rate": 0.00022216618075801748, | |
| "loss": 3.2824, | |
| "step": 43300 | |
| }, | |
| { | |
| "epoch": 12.620231786151068, | |
| "grad_norm": 0.3677726089954376, | |
| "learning_rate": 0.00022172886297376092, | |
| "loss": 3.2909, | |
| "step": 43350 | |
| }, | |
| { | |
| "epoch": 12.634791217750859, | |
| "grad_norm": 0.4069276750087738, | |
| "learning_rate": 0.00022129154518950433, | |
| "loss": 3.2797, | |
| "step": 43400 | |
| }, | |
| { | |
| "epoch": 12.64935064935065, | |
| "grad_norm": 0.37287694215774536, | |
| "learning_rate": 0.0002208542274052478, | |
| "loss": 3.2765, | |
| "step": 43450 | |
| }, | |
| { | |
| "epoch": 12.66391008095044, | |
| "grad_norm": 0.3790263831615448, | |
| "learning_rate": 0.0002204169096209912, | |
| "loss": 3.2883, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 12.67846951255023, | |
| "grad_norm": 0.3825741708278656, | |
| "learning_rate": 0.00021997959183673468, | |
| "loss": 3.2682, | |
| "step": 43550 | |
| }, | |
| { | |
| "epoch": 12.69302894415002, | |
| "grad_norm": 0.3945907652378082, | |
| "learning_rate": 0.00021954227405247812, | |
| "loss": 3.2781, | |
| "step": 43600 | |
| }, | |
| { | |
| "epoch": 12.707588375749811, | |
| "grad_norm": 0.3808131515979767, | |
| "learning_rate": 0.00021910495626822156, | |
| "loss": 3.28, | |
| "step": 43650 | |
| }, | |
| { | |
| "epoch": 12.7221478073496, | |
| "grad_norm": 0.3769155442714691, | |
| "learning_rate": 0.000218667638483965, | |
| "loss": 3.2763, | |
| "step": 43700 | |
| }, | |
| { | |
| "epoch": 12.736707238949391, | |
| "grad_norm": 0.39110711216926575, | |
| "learning_rate": 0.00021823032069970844, | |
| "loss": 3.288, | |
| "step": 43750 | |
| }, | |
| { | |
| "epoch": 12.751266670549182, | |
| "grad_norm": 0.3744281828403473, | |
| "learning_rate": 0.00021779300291545188, | |
| "loss": 3.2741, | |
| "step": 43800 | |
| }, | |
| { | |
| "epoch": 12.765826102148973, | |
| "grad_norm": 0.3719866871833801, | |
| "learning_rate": 0.0002173556851311953, | |
| "loss": 3.2686, | |
| "step": 43850 | |
| }, | |
| { | |
| "epoch": 12.780385533748763, | |
| "grad_norm": 0.39358916878700256, | |
| "learning_rate": 0.00021691836734693876, | |
| "loss": 3.2931, | |
| "step": 43900 | |
| }, | |
| { | |
| "epoch": 12.794944965348552, | |
| "grad_norm": 0.3743979334831238, | |
| "learning_rate": 0.0002164810495626822, | |
| "loss": 3.2879, | |
| "step": 43950 | |
| }, | |
| { | |
| "epoch": 12.809504396948343, | |
| "grad_norm": 0.3975497782230377, | |
| "learning_rate": 0.00021604373177842563, | |
| "loss": 3.2914, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 12.809504396948343, | |
| "eval_accuracy": 0.3726891403013414, | |
| "eval_loss": 3.5382912158966064, | |
| "eval_runtime": 179.7659, | |
| "eval_samples_per_second": 92.582, | |
| "eval_steps_per_second": 5.791, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 12.824063828548134, | |
| "grad_norm": 0.3927364647388458, | |
| "learning_rate": 0.00021560641399416907, | |
| "loss": 3.2988, | |
| "step": 44050 | |
| }, | |
| { | |
| "epoch": 12.838623260147925, | |
| "grad_norm": 0.38748300075531006, | |
| "learning_rate": 0.00021516909620991254, | |
| "loss": 3.2933, | |
| "step": 44100 | |
| }, | |
| { | |
| "epoch": 12.853182691747714, | |
| "grad_norm": 0.3822428584098816, | |
| "learning_rate": 0.00021473177842565595, | |
| "loss": 3.2882, | |
| "step": 44150 | |
| }, | |
| { | |
| "epoch": 12.867742123347504, | |
| "grad_norm": 0.3862573504447937, | |
| "learning_rate": 0.00021429446064139942, | |
| "loss": 3.2869, | |
| "step": 44200 | |
| }, | |
| { | |
| "epoch": 12.882301554947295, | |
| "grad_norm": 0.37098950147628784, | |
| "learning_rate": 0.00021385714285714283, | |
| "loss": 3.2866, | |
| "step": 44250 | |
| }, | |
| { | |
| "epoch": 12.896860986547086, | |
| "grad_norm": 0.3941670358181, | |
| "learning_rate": 0.00021341982507288627, | |
| "loss": 3.293, | |
| "step": 44300 | |
| }, | |
| { | |
| "epoch": 12.911420418146875, | |
| "grad_norm": 0.36743009090423584, | |
| "learning_rate": 0.0002129825072886297, | |
| "loss": 3.296, | |
| "step": 44350 | |
| }, | |
| { | |
| "epoch": 12.925979849746666, | |
| "grad_norm": 0.38341784477233887, | |
| "learning_rate": 0.00021254518950437315, | |
| "loss": 3.2897, | |
| "step": 44400 | |
| }, | |
| { | |
| "epoch": 12.940539281346457, | |
| "grad_norm": 0.36301809549331665, | |
| "learning_rate": 0.00021210787172011662, | |
| "loss": 3.3029, | |
| "step": 44450 | |
| }, | |
| { | |
| "epoch": 12.955098712946247, | |
| "grad_norm": 0.38180243968963623, | |
| "learning_rate": 0.00021167055393586003, | |
| "loss": 3.286, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 12.969658144546036, | |
| "grad_norm": 0.3855466842651367, | |
| "learning_rate": 0.0002112332361516035, | |
| "loss": 3.2899, | |
| "step": 44550 | |
| }, | |
| { | |
| "epoch": 12.984217576145827, | |
| "grad_norm": 0.38620680570602417, | |
| "learning_rate": 0.0002107959183673469, | |
| "loss": 3.3077, | |
| "step": 44600 | |
| }, | |
| { | |
| "epoch": 12.998777007745618, | |
| "grad_norm": 0.38125503063201904, | |
| "learning_rate": 0.00021035860058309037, | |
| "loss": 3.2907, | |
| "step": 44650 | |
| }, | |
| { | |
| "epoch": 13.01310348843981, | |
| "grad_norm": 0.3893320858478546, | |
| "learning_rate": 0.00020992128279883381, | |
| "loss": 3.199, | |
| "step": 44700 | |
| }, | |
| { | |
| "epoch": 13.027662920039601, | |
| "grad_norm": 0.4010103642940521, | |
| "learning_rate": 0.00020948396501457723, | |
| "loss": 3.2042, | |
| "step": 44750 | |
| }, | |
| { | |
| "epoch": 13.042222351639392, | |
| "grad_norm": 0.40282300114631653, | |
| "learning_rate": 0.0002090466472303207, | |
| "loss": 3.212, | |
| "step": 44800 | |
| }, | |
| { | |
| "epoch": 13.056781783239183, | |
| "grad_norm": 0.39462268352508545, | |
| "learning_rate": 0.0002086093294460641, | |
| "loss": 3.2093, | |
| "step": 44850 | |
| }, | |
| { | |
| "epoch": 13.071341214838972, | |
| "grad_norm": 0.4072793126106262, | |
| "learning_rate": 0.00020817201166180757, | |
| "loss": 3.209, | |
| "step": 44900 | |
| }, | |
| { | |
| "epoch": 13.085900646438763, | |
| "grad_norm": 0.40736645460128784, | |
| "learning_rate": 0.00020773469387755098, | |
| "loss": 3.2128, | |
| "step": 44950 | |
| }, | |
| { | |
| "epoch": 13.100460078038553, | |
| "grad_norm": 0.4051344096660614, | |
| "learning_rate": 0.00020729737609329445, | |
| "loss": 3.2055, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 13.100460078038553, | |
| "eval_accuracy": 0.37210968734115163, | |
| "eval_loss": 3.5462467670440674, | |
| "eval_runtime": 179.64, | |
| "eval_samples_per_second": 92.646, | |
| "eval_steps_per_second": 5.795, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 13.115019509638344, | |
| "grad_norm": 0.3923526406288147, | |
| "learning_rate": 0.0002068600583090379, | |
| "loss": 3.2176, | |
| "step": 45050 | |
| }, | |
| { | |
| "epoch": 13.129578941238133, | |
| "grad_norm": 0.4024291932582855, | |
| "learning_rate": 0.00020642274052478133, | |
| "loss": 3.2117, | |
| "step": 45100 | |
| }, | |
| { | |
| "epoch": 13.144138372837924, | |
| "grad_norm": 0.3926296532154083, | |
| "learning_rate": 0.00020598542274052477, | |
| "loss": 3.2315, | |
| "step": 45150 | |
| }, | |
| { | |
| "epoch": 13.158697804437715, | |
| "grad_norm": 0.39263248443603516, | |
| "learning_rate": 0.00020554810495626818, | |
| "loss": 3.2206, | |
| "step": 45200 | |
| }, | |
| { | |
| "epoch": 13.173257236037506, | |
| "grad_norm": 0.389493852853775, | |
| "learning_rate": 0.00020511078717201165, | |
| "loss": 3.2258, | |
| "step": 45250 | |
| }, | |
| { | |
| "epoch": 13.187816667637296, | |
| "grad_norm": 0.41162851452827454, | |
| "learning_rate": 0.00020467346938775506, | |
| "loss": 3.2259, | |
| "step": 45300 | |
| }, | |
| { | |
| "epoch": 13.202376099237085, | |
| "grad_norm": 0.42013758420944214, | |
| "learning_rate": 0.00020423615160349853, | |
| "loss": 3.2225, | |
| "step": 45350 | |
| }, | |
| { | |
| "epoch": 13.216935530836876, | |
| "grad_norm": 0.37507620453834534, | |
| "learning_rate": 0.00020379883381924197, | |
| "loss": 3.2382, | |
| "step": 45400 | |
| }, | |
| { | |
| "epoch": 13.231494962436667, | |
| "grad_norm": 0.39401939511299133, | |
| "learning_rate": 0.0002033615160349854, | |
| "loss": 3.2174, | |
| "step": 45450 | |
| }, | |
| { | |
| "epoch": 13.246054394036458, | |
| "grad_norm": 0.39347732067108154, | |
| "learning_rate": 0.00020292419825072885, | |
| "loss": 3.2407, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 13.260613825636247, | |
| "grad_norm": 0.41897672414779663, | |
| "learning_rate": 0.0002024868804664723, | |
| "loss": 3.239, | |
| "step": 45550 | |
| }, | |
| { | |
| "epoch": 13.275173257236037, | |
| "grad_norm": 0.3950546681880951, | |
| "learning_rate": 0.00020204956268221572, | |
| "loss": 3.236, | |
| "step": 45600 | |
| }, | |
| { | |
| "epoch": 13.289732688835828, | |
| "grad_norm": 0.38286805152893066, | |
| "learning_rate": 0.00020161224489795916, | |
| "loss": 3.2309, | |
| "step": 45650 | |
| }, | |
| { | |
| "epoch": 13.304292120435619, | |
| "grad_norm": 0.39312657713890076, | |
| "learning_rate": 0.0002011749271137026, | |
| "loss": 3.2248, | |
| "step": 45700 | |
| }, | |
| { | |
| "epoch": 13.318851552035408, | |
| "grad_norm": 0.4044516980648041, | |
| "learning_rate": 0.00020073760932944604, | |
| "loss": 3.2353, | |
| "step": 45750 | |
| }, | |
| { | |
| "epoch": 13.333410983635199, | |
| "grad_norm": 0.3935719132423401, | |
| "learning_rate": 0.00020030029154518948, | |
| "loss": 3.2494, | |
| "step": 45800 | |
| }, | |
| { | |
| "epoch": 13.34797041523499, | |
| "grad_norm": 0.39138996601104736, | |
| "learning_rate": 0.00019986297376093292, | |
| "loss": 3.2334, | |
| "step": 45850 | |
| }, | |
| { | |
| "epoch": 13.36252984683478, | |
| "grad_norm": 0.40938061475753784, | |
| "learning_rate": 0.0001994256559766764, | |
| "loss": 3.2537, | |
| "step": 45900 | |
| }, | |
| { | |
| "epoch": 13.37708927843457, | |
| "grad_norm": 0.3899918496608734, | |
| "learning_rate": 0.0001989883381924198, | |
| "loss": 3.2348, | |
| "step": 45950 | |
| }, | |
| { | |
| "epoch": 13.39164871003436, | |
| "grad_norm": 0.39088475704193115, | |
| "learning_rate": 0.00019855102040816327, | |
| "loss": 3.2443, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 13.39164871003436, | |
| "eval_accuracy": 0.3726506903991535, | |
| "eval_loss": 3.5426511764526367, | |
| "eval_runtime": 179.8521, | |
| "eval_samples_per_second": 92.537, | |
| "eval_steps_per_second": 5.788, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 13.406208141634151, | |
| "grad_norm": 0.3731467127799988, | |
| "learning_rate": 0.00019811370262390668, | |
| "loss": 3.2468, | |
| "step": 46050 | |
| }, | |
| { | |
| "epoch": 13.420767573233942, | |
| "grad_norm": 0.4023798704147339, | |
| "learning_rate": 0.00019767638483965012, | |
| "loss": 3.2414, | |
| "step": 46100 | |
| }, | |
| { | |
| "epoch": 13.43532700483373, | |
| "grad_norm": 0.40062659978866577, | |
| "learning_rate": 0.00019723906705539359, | |
| "loss": 3.2514, | |
| "step": 46150 | |
| }, | |
| { | |
| "epoch": 13.449886436433522, | |
| "grad_norm": 0.41188135743141174, | |
| "learning_rate": 0.000196801749271137, | |
| "loss": 3.2714, | |
| "step": 46200 | |
| }, | |
| { | |
| "epoch": 13.464445868033312, | |
| "grad_norm": 0.3918535113334656, | |
| "learning_rate": 0.00019636443148688047, | |
| "loss": 3.2557, | |
| "step": 46250 | |
| }, | |
| { | |
| "epoch": 13.479005299633103, | |
| "grad_norm": 0.39971768856048584, | |
| "learning_rate": 0.00019592711370262388, | |
| "loss": 3.2398, | |
| "step": 46300 | |
| }, | |
| { | |
| "epoch": 13.493564731232892, | |
| "grad_norm": 0.39411237835884094, | |
| "learning_rate": 0.00019548979591836734, | |
| "loss": 3.2663, | |
| "step": 46350 | |
| }, | |
| { | |
| "epoch": 13.508124162832683, | |
| "grad_norm": 0.38199934363365173, | |
| "learning_rate": 0.00019505247813411076, | |
| "loss": 3.2564, | |
| "step": 46400 | |
| }, | |
| { | |
| "epoch": 13.522683594432474, | |
| "grad_norm": 0.3960552513599396, | |
| "learning_rate": 0.00019461516034985422, | |
| "loss": 3.2518, | |
| "step": 46450 | |
| }, | |
| { | |
| "epoch": 13.537243026032264, | |
| "grad_norm": 0.397707462310791, | |
| "learning_rate": 0.00019417784256559766, | |
| "loss": 3.2318, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 13.551802457632053, | |
| "grad_norm": 0.38950562477111816, | |
| "learning_rate": 0.00019374052478134108, | |
| "loss": 3.2526, | |
| "step": 46550 | |
| }, | |
| { | |
| "epoch": 13.566361889231844, | |
| "grad_norm": 0.40381306409835815, | |
| "learning_rate": 0.00019330320699708454, | |
| "loss": 3.2461, | |
| "step": 46600 | |
| }, | |
| { | |
| "epoch": 13.580921320831635, | |
| "grad_norm": 0.3885638117790222, | |
| "learning_rate": 0.00019286588921282795, | |
| "loss": 3.2479, | |
| "step": 46650 | |
| }, | |
| { | |
| "epoch": 13.595480752431426, | |
| "grad_norm": 0.38914021849632263, | |
| "learning_rate": 0.00019242857142857142, | |
| "loss": 3.2615, | |
| "step": 46700 | |
| }, | |
| { | |
| "epoch": 13.610040184031215, | |
| "grad_norm": 0.38882437348365784, | |
| "learning_rate": 0.00019199125364431483, | |
| "loss": 3.2557, | |
| "step": 46750 | |
| }, | |
| { | |
| "epoch": 13.624599615631006, | |
| "grad_norm": 0.4011668264865875, | |
| "learning_rate": 0.0001915539358600583, | |
| "loss": 3.2575, | |
| "step": 46800 | |
| }, | |
| { | |
| "epoch": 13.639159047230796, | |
| "grad_norm": 0.38090625405311584, | |
| "learning_rate": 0.00019111661807580174, | |
| "loss": 3.2482, | |
| "step": 46850 | |
| }, | |
| { | |
| "epoch": 13.653718478830587, | |
| "grad_norm": 0.3797299861907959, | |
| "learning_rate": 0.00019067930029154515, | |
| "loss": 3.2633, | |
| "step": 46900 | |
| }, | |
| { | |
| "epoch": 13.668277910430376, | |
| "grad_norm": 0.42316409945487976, | |
| "learning_rate": 0.00019024198250728862, | |
| "loss": 3.2572, | |
| "step": 46950 | |
| }, | |
| { | |
| "epoch": 13.682837342030167, | |
| "grad_norm": 0.3918803930282593, | |
| "learning_rate": 0.00018980466472303203, | |
| "loss": 3.2584, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 13.682837342030167, | |
| "eval_accuracy": 0.3731333718929804, | |
| "eval_loss": 3.5378518104553223, | |
| "eval_runtime": 179.8803, | |
| "eval_samples_per_second": 92.523, | |
| "eval_steps_per_second": 5.787, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 13.697396773629958, | |
| "grad_norm": 0.4246753752231598, | |
| "learning_rate": 0.0001893673469387755, | |
| "loss": 3.2589, | |
| "step": 47050 | |
| }, | |
| { | |
| "epoch": 13.711956205229749, | |
| "grad_norm": 0.3818982243537903, | |
| "learning_rate": 0.00018893002915451894, | |
| "loss": 3.2612, | |
| "step": 47100 | |
| }, | |
| { | |
| "epoch": 13.726515636829538, | |
| "grad_norm": 0.3882550597190857, | |
| "learning_rate": 0.00018849271137026238, | |
| "loss": 3.268, | |
| "step": 47150 | |
| }, | |
| { | |
| "epoch": 13.741075068429328, | |
| "grad_norm": 0.4041791260242462, | |
| "learning_rate": 0.00018805539358600582, | |
| "loss": 3.2534, | |
| "step": 47200 | |
| }, | |
| { | |
| "epoch": 13.75563450002912, | |
| "grad_norm": 0.37502363324165344, | |
| "learning_rate": 0.00018761807580174925, | |
| "loss": 3.2593, | |
| "step": 47250 | |
| }, | |
| { | |
| "epoch": 13.77019393162891, | |
| "grad_norm": 0.38316377997398376, | |
| "learning_rate": 0.0001871807580174927, | |
| "loss": 3.2434, | |
| "step": 47300 | |
| }, | |
| { | |
| "epoch": 13.784753363228699, | |
| "grad_norm": 0.3869149386882782, | |
| "learning_rate": 0.0001867434402332361, | |
| "loss": 3.2641, | |
| "step": 47350 | |
| }, | |
| { | |
| "epoch": 13.79931279482849, | |
| "grad_norm": 0.38152143359184265, | |
| "learning_rate": 0.00018630612244897957, | |
| "loss": 3.2716, | |
| "step": 47400 | |
| }, | |
| { | |
| "epoch": 13.81387222642828, | |
| "grad_norm": 0.4002053439617157, | |
| "learning_rate": 0.000185868804664723, | |
| "loss": 3.2533, | |
| "step": 47450 | |
| }, | |
| { | |
| "epoch": 13.828431658028071, | |
| "grad_norm": 0.3985072374343872, | |
| "learning_rate": 0.00018543148688046645, | |
| "loss": 3.2614, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 13.84299108962786, | |
| "grad_norm": 0.4056275486946106, | |
| "learning_rate": 0.0001849941690962099, | |
| "loss": 3.2618, | |
| "step": 47550 | |
| }, | |
| { | |
| "epoch": 13.857550521227651, | |
| "grad_norm": 0.3932760953903198, | |
| "learning_rate": 0.00018455685131195336, | |
| "loss": 3.2764, | |
| "step": 47600 | |
| }, | |
| { | |
| "epoch": 13.872109952827442, | |
| "grad_norm": 0.41119423508644104, | |
| "learning_rate": 0.00018411953352769677, | |
| "loss": 3.2654, | |
| "step": 47650 | |
| }, | |
| { | |
| "epoch": 13.886669384427233, | |
| "grad_norm": 0.40140846371650696, | |
| "learning_rate": 0.00018368221574344024, | |
| "loss": 3.2602, | |
| "step": 47700 | |
| }, | |
| { | |
| "epoch": 13.901228816027022, | |
| "grad_norm": 0.3891817033290863, | |
| "learning_rate": 0.00018324489795918365, | |
| "loss": 3.2599, | |
| "step": 47750 | |
| }, | |
| { | |
| "epoch": 13.915788247626812, | |
| "grad_norm": 0.39611101150512695, | |
| "learning_rate": 0.0001828075801749271, | |
| "loss": 3.2673, | |
| "step": 47800 | |
| }, | |
| { | |
| "epoch": 13.930347679226603, | |
| "grad_norm": 0.41129791736602783, | |
| "learning_rate": 0.00018237026239067053, | |
| "loss": 3.2628, | |
| "step": 47850 | |
| }, | |
| { | |
| "epoch": 13.944907110826394, | |
| "grad_norm": 0.39136409759521484, | |
| "learning_rate": 0.00018193294460641397, | |
| "loss": 3.2689, | |
| "step": 47900 | |
| }, | |
| { | |
| "epoch": 13.959466542426183, | |
| "grad_norm": 0.38141852617263794, | |
| "learning_rate": 0.00018149562682215743, | |
| "loss": 3.2768, | |
| "step": 47950 | |
| }, | |
| { | |
| "epoch": 13.974025974025974, | |
| "grad_norm": 0.4137849509716034, | |
| "learning_rate": 0.00018105830903790085, | |
| "loss": 3.2518, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 13.974025974025974, | |
| "eval_accuracy": 0.37369154218559825, | |
| "eval_loss": 3.529177665710449, | |
| "eval_runtime": 180.2712, | |
| "eval_samples_per_second": 92.322, | |
| "eval_steps_per_second": 5.775, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 13.988585405625765, | |
| "grad_norm": 0.3934214413166046, | |
| "learning_rate": 0.00018062099125364431, | |
| "loss": 3.2683, | |
| "step": 48050 | |
| }, | |
| { | |
| "epoch": 14.002911886319959, | |
| "grad_norm": 0.4021332561969757, | |
| "learning_rate": 0.00018018367346938773, | |
| "loss": 3.2537, | |
| "step": 48100 | |
| }, | |
| { | |
| "epoch": 14.017471317919748, | |
| "grad_norm": 0.41304075717926025, | |
| "learning_rate": 0.0001797463556851312, | |
| "loss": 3.186, | |
| "step": 48150 | |
| }, | |
| { | |
| "epoch": 14.032030749519539, | |
| "grad_norm": 0.41855913400650024, | |
| "learning_rate": 0.00017930903790087463, | |
| "loss": 3.1729, | |
| "step": 48200 | |
| }, | |
| { | |
| "epoch": 14.04659018111933, | |
| "grad_norm": 0.38180437684059143, | |
| "learning_rate": 0.00017887172011661804, | |
| "loss": 3.1865, | |
| "step": 48250 | |
| }, | |
| { | |
| "epoch": 14.06114961271912, | |
| "grad_norm": 0.4037982225418091, | |
| "learning_rate": 0.0001784344023323615, | |
| "loss": 3.1946, | |
| "step": 48300 | |
| }, | |
| { | |
| "epoch": 14.07570904431891, | |
| "grad_norm": 0.4038594663143158, | |
| "learning_rate": 0.00017799708454810492, | |
| "loss": 3.1867, | |
| "step": 48350 | |
| }, | |
| { | |
| "epoch": 14.0902684759187, | |
| "grad_norm": 0.4171351492404938, | |
| "learning_rate": 0.0001775597667638484, | |
| "loss": 3.2073, | |
| "step": 48400 | |
| }, | |
| { | |
| "epoch": 14.10482790751849, | |
| "grad_norm": 0.40714016556739807, | |
| "learning_rate": 0.0001771224489795918, | |
| "loss": 3.2059, | |
| "step": 48450 | |
| }, | |
| { | |
| "epoch": 14.119387339118282, | |
| "grad_norm": 0.40129297971725464, | |
| "learning_rate": 0.00017668513119533527, | |
| "loss": 3.1958, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 14.13394677071807, | |
| "grad_norm": 0.4317340850830078, | |
| "learning_rate": 0.0001762478134110787, | |
| "loss": 3.2103, | |
| "step": 48550 | |
| }, | |
| { | |
| "epoch": 14.148506202317861, | |
| "grad_norm": 0.4058551788330078, | |
| "learning_rate": 0.00017581049562682215, | |
| "loss": 3.1942, | |
| "step": 48600 | |
| }, | |
| { | |
| "epoch": 14.163065633917652, | |
| "grad_norm": 0.4167109429836273, | |
| "learning_rate": 0.0001753731778425656, | |
| "loss": 3.1981, | |
| "step": 48650 | |
| }, | |
| { | |
| "epoch": 14.177625065517443, | |
| "grad_norm": 0.3935483694076538, | |
| "learning_rate": 0.000174935860058309, | |
| "loss": 3.202, | |
| "step": 48700 | |
| }, | |
| { | |
| "epoch": 14.192184497117232, | |
| "grad_norm": 0.4171966016292572, | |
| "learning_rate": 0.00017449854227405247, | |
| "loss": 3.2034, | |
| "step": 48750 | |
| }, | |
| { | |
| "epoch": 14.206743928717023, | |
| "grad_norm": 0.3995937705039978, | |
| "learning_rate": 0.00017406122448979588, | |
| "loss": 3.2011, | |
| "step": 48800 | |
| }, | |
| { | |
| "epoch": 14.221303360316814, | |
| "grad_norm": 0.38564836978912354, | |
| "learning_rate": 0.00017362390670553935, | |
| "loss": 3.2065, | |
| "step": 48850 | |
| }, | |
| { | |
| "epoch": 14.235862791916604, | |
| "grad_norm": 0.4166134297847748, | |
| "learning_rate": 0.00017318658892128278, | |
| "loss": 3.2106, | |
| "step": 48900 | |
| }, | |
| { | |
| "epoch": 14.250422223516393, | |
| "grad_norm": 0.4112185537815094, | |
| "learning_rate": 0.00017274927113702622, | |
| "loss": 3.2138, | |
| "step": 48950 | |
| }, | |
| { | |
| "epoch": 14.264981655116184, | |
| "grad_norm": 0.39878368377685547, | |
| "learning_rate": 0.00017231195335276966, | |
| "loss": 3.2007, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 14.264981655116184, | |
| "eval_accuracy": 0.3732468402587643, | |
| "eval_loss": 3.5399742126464844, | |
| "eval_runtime": 179.8028, | |
| "eval_samples_per_second": 92.562, | |
| "eval_steps_per_second": 5.79, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 14.279541086715975, | |
| "grad_norm": 0.39703792333602905, | |
| "learning_rate": 0.00017187463556851313, | |
| "loss": 3.2176, | |
| "step": 49050 | |
| }, | |
| { | |
| "epoch": 14.294100518315766, | |
| "grad_norm": 0.3934576213359833, | |
| "learning_rate": 0.00017143731778425654, | |
| "loss": 3.2121, | |
| "step": 49100 | |
| }, | |
| { | |
| "epoch": 14.308659949915555, | |
| "grad_norm": 0.3929370045661926, | |
| "learning_rate": 0.00017099999999999998, | |
| "loss": 3.2031, | |
| "step": 49150 | |
| }, | |
| { | |
| "epoch": 14.323219381515345, | |
| "grad_norm": 0.4261437654495239, | |
| "learning_rate": 0.00017056268221574342, | |
| "loss": 3.2201, | |
| "step": 49200 | |
| }, | |
| { | |
| "epoch": 14.337778813115136, | |
| "grad_norm": 0.3950461447238922, | |
| "learning_rate": 0.00017012536443148686, | |
| "loss": 3.2297, | |
| "step": 49250 | |
| }, | |
| { | |
| "epoch": 14.352338244714927, | |
| "grad_norm": 0.40033966302871704, | |
| "learning_rate": 0.0001696880466472303, | |
| "loss": 3.2149, | |
| "step": 49300 | |
| }, | |
| { | |
| "epoch": 14.366897676314716, | |
| "grad_norm": 0.4025183618068695, | |
| "learning_rate": 0.00016925072886297374, | |
| "loss": 3.2214, | |
| "step": 49350 | |
| }, | |
| { | |
| "epoch": 14.381457107914507, | |
| "grad_norm": 0.4048652648925781, | |
| "learning_rate": 0.0001688134110787172, | |
| "loss": 3.224, | |
| "step": 49400 | |
| }, | |
| { | |
| "epoch": 14.396016539514298, | |
| "grad_norm": 0.3995135426521301, | |
| "learning_rate": 0.00016837609329446062, | |
| "loss": 3.212, | |
| "step": 49450 | |
| }, | |
| { | |
| "epoch": 14.410575971114088, | |
| "grad_norm": 0.40483996272087097, | |
| "learning_rate": 0.00016793877551020409, | |
| "loss": 3.2175, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 14.425135402713877, | |
| "grad_norm": 0.40723147988319397, | |
| "learning_rate": 0.0001675014577259475, | |
| "loss": 3.2251, | |
| "step": 49550 | |
| }, | |
| { | |
| "epoch": 14.439694834313668, | |
| "grad_norm": 0.41353291273117065, | |
| "learning_rate": 0.00016706413994169094, | |
| "loss": 3.2274, | |
| "step": 49600 | |
| }, | |
| { | |
| "epoch": 14.454254265913459, | |
| "grad_norm": 0.4138233959674835, | |
| "learning_rate": 0.0001666268221574344, | |
| "loss": 3.2189, | |
| "step": 49650 | |
| }, | |
| { | |
| "epoch": 14.46881369751325, | |
| "grad_norm": 0.41592147946357727, | |
| "learning_rate": 0.00016618950437317782, | |
| "loss": 3.2207, | |
| "step": 49700 | |
| }, | |
| { | |
| "epoch": 14.483373129113039, | |
| "grad_norm": 0.41638514399528503, | |
| "learning_rate": 0.00016575218658892128, | |
| "loss": 3.2294, | |
| "step": 49750 | |
| }, | |
| { | |
| "epoch": 14.49793256071283, | |
| "grad_norm": 0.40838560461997986, | |
| "learning_rate": 0.0001653148688046647, | |
| "loss": 3.226, | |
| "step": 49800 | |
| }, | |
| { | |
| "epoch": 14.51249199231262, | |
| "grad_norm": 0.42148107290267944, | |
| "learning_rate": 0.00016487755102040816, | |
| "loss": 3.2231, | |
| "step": 49850 | |
| }, | |
| { | |
| "epoch": 14.527051423912411, | |
| "grad_norm": 0.3983476161956787, | |
| "learning_rate": 0.00016444023323615157, | |
| "loss": 3.2314, | |
| "step": 49900 | |
| }, | |
| { | |
| "epoch": 14.5416108555122, | |
| "grad_norm": 0.40091437101364136, | |
| "learning_rate": 0.00016400291545189504, | |
| "loss": 3.2257, | |
| "step": 49950 | |
| }, | |
| { | |
| "epoch": 14.556170287111991, | |
| "grad_norm": 0.4068416953086853, | |
| "learning_rate": 0.00016356559766763848, | |
| "loss": 3.2079, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 14.556170287111991, | |
| "eval_accuracy": 0.373755390188314, | |
| "eval_loss": 3.536447763442993, | |
| "eval_runtime": 179.8111, | |
| "eval_samples_per_second": 92.558, | |
| "eval_steps_per_second": 5.789, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 14.570729718711782, | |
| "grad_norm": 0.400130957365036, | |
| "learning_rate": 0.0001631282798833819, | |
| "loss": 3.2299, | |
| "step": 50050 | |
| }, | |
| { | |
| "epoch": 14.585289150311572, | |
| "grad_norm": 0.397758424282074, | |
| "learning_rate": 0.00016269096209912536, | |
| "loss": 3.2291, | |
| "step": 50100 | |
| }, | |
| { | |
| "epoch": 14.599848581911361, | |
| "grad_norm": 0.41178637742996216, | |
| "learning_rate": 0.00016225364431486877, | |
| "loss": 3.23, | |
| "step": 50150 | |
| }, | |
| { | |
| "epoch": 14.614408013511152, | |
| "grad_norm": 0.4252997040748596, | |
| "learning_rate": 0.00016181632653061224, | |
| "loss": 3.2266, | |
| "step": 50200 | |
| }, | |
| { | |
| "epoch": 14.628967445110943, | |
| "grad_norm": 0.3951346278190613, | |
| "learning_rate": 0.00016137900874635568, | |
| "loss": 3.2399, | |
| "step": 50250 | |
| }, | |
| { | |
| "epoch": 14.643526876710734, | |
| "grad_norm": 0.40295711159706116, | |
| "learning_rate": 0.00016094169096209912, | |
| "loss": 3.2332, | |
| "step": 50300 | |
| }, | |
| { | |
| "epoch": 14.658086308310523, | |
| "grad_norm": 0.39642152190208435, | |
| "learning_rate": 0.00016050437317784256, | |
| "loss": 3.2316, | |
| "step": 50350 | |
| }, | |
| { | |
| "epoch": 14.672645739910314, | |
| "grad_norm": 0.38787510991096497, | |
| "learning_rate": 0.000160067055393586, | |
| "loss": 3.2442, | |
| "step": 50400 | |
| }, | |
| { | |
| "epoch": 14.687205171510104, | |
| "grad_norm": 0.4102849066257477, | |
| "learning_rate": 0.00015962973760932944, | |
| "loss": 3.232, | |
| "step": 50450 | |
| }, | |
| { | |
| "epoch": 14.701764603109895, | |
| "grad_norm": 0.40455612540245056, | |
| "learning_rate": 0.00015919241982507285, | |
| "loss": 3.2336, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 14.716324034709684, | |
| "grad_norm": 0.4081745147705078, | |
| "learning_rate": 0.00015875510204081631, | |
| "loss": 3.2318, | |
| "step": 50550 | |
| }, | |
| { | |
| "epoch": 14.730883466309475, | |
| "grad_norm": 0.39961087703704834, | |
| "learning_rate": 0.00015831778425655975, | |
| "loss": 3.2325, | |
| "step": 50600 | |
| }, | |
| { | |
| "epoch": 14.745442897909266, | |
| "grad_norm": 0.39770328998565674, | |
| "learning_rate": 0.0001578804664723032, | |
| "loss": 3.2374, | |
| "step": 50650 | |
| }, | |
| { | |
| "epoch": 14.760002329509057, | |
| "grad_norm": 0.4101422131061554, | |
| "learning_rate": 0.00015744314868804663, | |
| "loss": 3.2367, | |
| "step": 50700 | |
| }, | |
| { | |
| "epoch": 14.774561761108846, | |
| "grad_norm": 0.40297961235046387, | |
| "learning_rate": 0.00015700583090379007, | |
| "loss": 3.2426, | |
| "step": 50750 | |
| }, | |
| { | |
| "epoch": 14.789121192708636, | |
| "grad_norm": 0.39780429005622864, | |
| "learning_rate": 0.0001565685131195335, | |
| "loss": 3.2377, | |
| "step": 50800 | |
| }, | |
| { | |
| "epoch": 14.803680624308427, | |
| "grad_norm": 0.4247409403324127, | |
| "learning_rate": 0.00015613119533527698, | |
| "loss": 3.2329, | |
| "step": 50850 | |
| }, | |
| { | |
| "epoch": 14.818240055908218, | |
| "grad_norm": 0.4056137800216675, | |
| "learning_rate": 0.0001556938775510204, | |
| "loss": 3.2377, | |
| "step": 50900 | |
| }, | |
| { | |
| "epoch": 14.832799487508007, | |
| "grad_norm": 0.3977799117565155, | |
| "learning_rate": 0.00015525655976676383, | |
| "loss": 3.2334, | |
| "step": 50950 | |
| }, | |
| { | |
| "epoch": 14.847358919107798, | |
| "grad_norm": 0.40999510884284973, | |
| "learning_rate": 0.00015481924198250727, | |
| "loss": 3.245, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 14.847358919107798, | |
| "eval_accuracy": 0.37410919983872204, | |
| "eval_loss": 3.528653860092163, | |
| "eval_runtime": 179.8528, | |
| "eval_samples_per_second": 92.537, | |
| "eval_steps_per_second": 5.788, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 14.861918350707588, | |
| "grad_norm": 0.42608842253685, | |
| "learning_rate": 0.0001543819241982507, | |
| "loss": 3.2402, | |
| "step": 51050 | |
| }, | |
| { | |
| "epoch": 14.87647778230738, | |
| "grad_norm": 0.3977234363555908, | |
| "learning_rate": 0.00015394460641399418, | |
| "loss": 3.2373, | |
| "step": 51100 | |
| }, | |
| { | |
| "epoch": 14.891037213907168, | |
| "grad_norm": 0.4012013077735901, | |
| "learning_rate": 0.0001535072886297376, | |
| "loss": 3.2312, | |
| "step": 51150 | |
| }, | |
| { | |
| "epoch": 14.905596645506959, | |
| "grad_norm": 0.3960820734500885, | |
| "learning_rate": 0.00015306997084548106, | |
| "loss": 3.2527, | |
| "step": 51200 | |
| }, | |
| { | |
| "epoch": 14.92015607710675, | |
| "grad_norm": 0.4106712341308594, | |
| "learning_rate": 0.00015263265306122447, | |
| "loss": 3.2379, | |
| "step": 51250 | |
| }, | |
| { | |
| "epoch": 14.93471550870654, | |
| "grad_norm": 0.4187733829021454, | |
| "learning_rate": 0.0001521953352769679, | |
| "loss": 3.24, | |
| "step": 51300 | |
| }, | |
| { | |
| "epoch": 14.94927494030633, | |
| "grad_norm": 0.4008129835128784, | |
| "learning_rate": 0.00015175801749271135, | |
| "loss": 3.241, | |
| "step": 51350 | |
| }, | |
| { | |
| "epoch": 14.96383437190612, | |
| "grad_norm": 0.42290031909942627, | |
| "learning_rate": 0.00015132069970845479, | |
| "loss": 3.2498, | |
| "step": 51400 | |
| }, | |
| { | |
| "epoch": 14.978393803505911, | |
| "grad_norm": 0.39067307114601135, | |
| "learning_rate": 0.00015088338192419825, | |
| "loss": 3.2471, | |
| "step": 51450 | |
| }, | |
| { | |
| "epoch": 14.992953235105702, | |
| "grad_norm": 0.3999600112438202, | |
| "learning_rate": 0.00015044606413994167, | |
| "loss": 3.249, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 15.007279715799895, | |
| "grad_norm": 0.41343411803245544, | |
| "learning_rate": 0.00015000874635568513, | |
| "loss": 3.2085, | |
| "step": 51550 | |
| }, | |
| { | |
| "epoch": 15.021839147399685, | |
| "grad_norm": 0.429661363363266, | |
| "learning_rate": 0.00014957142857142854, | |
| "loss": 3.1666, | |
| "step": 51600 | |
| }, | |
| { | |
| "epoch": 15.036398578999476, | |
| "grad_norm": 0.4097861647605896, | |
| "learning_rate": 0.00014913411078717198, | |
| "loss": 3.1655, | |
| "step": 51650 | |
| }, | |
| { | |
| "epoch": 15.050958010599267, | |
| "grad_norm": 0.4214050769805908, | |
| "learning_rate": 0.00014869679300291545, | |
| "loss": 3.1668, | |
| "step": 51700 | |
| }, | |
| { | |
| "epoch": 15.065517442199056, | |
| "grad_norm": 0.4203738868236542, | |
| "learning_rate": 0.0001482594752186589, | |
| "loss": 3.1648, | |
| "step": 51750 | |
| }, | |
| { | |
| "epoch": 15.080076873798847, | |
| "grad_norm": 0.41821858286857605, | |
| "learning_rate": 0.00014782215743440233, | |
| "loss": 3.1647, | |
| "step": 51800 | |
| }, | |
| { | |
| "epoch": 15.094636305398637, | |
| "grad_norm": 0.4323025941848755, | |
| "learning_rate": 0.00014738483965014577, | |
| "loss": 3.1814, | |
| "step": 51850 | |
| }, | |
| { | |
| "epoch": 15.109195736998428, | |
| "grad_norm": 0.39484599232673645, | |
| "learning_rate": 0.0001469475218658892, | |
| "loss": 3.1772, | |
| "step": 51900 | |
| }, | |
| { | |
| "epoch": 15.123755168598217, | |
| "grad_norm": 0.41107800602912903, | |
| "learning_rate": 0.00014651020408163265, | |
| "loss": 3.1784, | |
| "step": 51950 | |
| }, | |
| { | |
| "epoch": 15.138314600198008, | |
| "grad_norm": 0.4029431939125061, | |
| "learning_rate": 0.0001460728862973761, | |
| "loss": 3.1718, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 15.138314600198008, | |
| "eval_accuracy": 0.37368789708783734, | |
| "eval_loss": 3.5386550426483154, | |
| "eval_runtime": 179.8512, | |
| "eval_samples_per_second": 92.538, | |
| "eval_steps_per_second": 5.788, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 15.152874031797799, | |
| "grad_norm": 0.40105974674224854, | |
| "learning_rate": 0.00014563556851311953, | |
| "loss": 3.1775, | |
| "step": 52050 | |
| }, | |
| { | |
| "epoch": 15.16743346339759, | |
| "grad_norm": 0.42404162883758545, | |
| "learning_rate": 0.00014519825072886297, | |
| "loss": 3.1747, | |
| "step": 52100 | |
| }, | |
| { | |
| "epoch": 15.181992894997379, | |
| "grad_norm": 0.4054589867591858, | |
| "learning_rate": 0.0001447609329446064, | |
| "loss": 3.1894, | |
| "step": 52150 | |
| }, | |
| { | |
| "epoch": 15.19655232659717, | |
| "grad_norm": 0.41971468925476074, | |
| "learning_rate": 0.00014432361516034984, | |
| "loss": 3.1836, | |
| "step": 52200 | |
| }, | |
| { | |
| "epoch": 15.21111175819696, | |
| "grad_norm": 0.4210222661495209, | |
| "learning_rate": 0.00014388629737609328, | |
| "loss": 3.1732, | |
| "step": 52250 | |
| }, | |
| { | |
| "epoch": 15.225671189796751, | |
| "grad_norm": 0.42625147104263306, | |
| "learning_rate": 0.00014344897959183672, | |
| "loss": 3.1857, | |
| "step": 52300 | |
| }, | |
| { | |
| "epoch": 15.24023062139654, | |
| "grad_norm": 0.4069248139858246, | |
| "learning_rate": 0.00014301166180758016, | |
| "loss": 3.1868, | |
| "step": 52350 | |
| }, | |
| { | |
| "epoch": 15.25479005299633, | |
| "grad_norm": 0.4041866660118103, | |
| "learning_rate": 0.0001425743440233236, | |
| "loss": 3.1881, | |
| "step": 52400 | |
| }, | |
| { | |
| "epoch": 15.269349484596122, | |
| "grad_norm": 0.40314981341362, | |
| "learning_rate": 0.00014213702623906704, | |
| "loss": 3.191, | |
| "step": 52450 | |
| }, | |
| { | |
| "epoch": 15.283908916195912, | |
| "grad_norm": 0.4245761036872864, | |
| "learning_rate": 0.00014169970845481048, | |
| "loss": 3.1888, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 15.298468347795701, | |
| "grad_norm": 0.40877848863601685, | |
| "learning_rate": 0.00014126239067055392, | |
| "loss": 3.1917, | |
| "step": 52550 | |
| }, | |
| { | |
| "epoch": 15.313027779395492, | |
| "grad_norm": 0.4157589375972748, | |
| "learning_rate": 0.00014082507288629736, | |
| "loss": 3.2, | |
| "step": 52600 | |
| }, | |
| { | |
| "epoch": 15.327587210995283, | |
| "grad_norm": 0.39100512862205505, | |
| "learning_rate": 0.0001403877551020408, | |
| "loss": 3.2019, | |
| "step": 52650 | |
| }, | |
| { | |
| "epoch": 15.342146642595074, | |
| "grad_norm": 0.4026622474193573, | |
| "learning_rate": 0.00013995043731778424, | |
| "loss": 3.1937, | |
| "step": 52700 | |
| }, | |
| { | |
| "epoch": 15.356706074194863, | |
| "grad_norm": 0.4168094992637634, | |
| "learning_rate": 0.00013951311953352768, | |
| "loss": 3.2028, | |
| "step": 52750 | |
| }, | |
| { | |
| "epoch": 15.371265505794653, | |
| "grad_norm": 0.41785821318626404, | |
| "learning_rate": 0.00013907580174927112, | |
| "loss": 3.1985, | |
| "step": 52800 | |
| }, | |
| { | |
| "epoch": 15.385824937394444, | |
| "grad_norm": 0.4225537180900574, | |
| "learning_rate": 0.00013863848396501456, | |
| "loss": 3.1917, | |
| "step": 52850 | |
| }, | |
| { | |
| "epoch": 15.400384368994235, | |
| "grad_norm": 0.421794056892395, | |
| "learning_rate": 0.000138201166180758, | |
| "loss": 3.2067, | |
| "step": 52900 | |
| }, | |
| { | |
| "epoch": 15.414943800594024, | |
| "grad_norm": 0.42855533957481384, | |
| "learning_rate": 0.00013776384839650144, | |
| "loss": 3.1923, | |
| "step": 52950 | |
| }, | |
| { | |
| "epoch": 15.429503232193815, | |
| "grad_norm": 0.4089881181716919, | |
| "learning_rate": 0.00013732653061224488, | |
| "loss": 3.1904, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 15.429503232193815, | |
| "eval_accuracy": 0.3743012141820642, | |
| "eval_loss": 3.535125732421875, | |
| "eval_runtime": 179.8043, | |
| "eval_samples_per_second": 92.562, | |
| "eval_steps_per_second": 5.79, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 15.444062663793606, | |
| "grad_norm": 0.4249016046524048, | |
| "learning_rate": 0.00013688921282798832, | |
| "loss": 3.1893, | |
| "step": 53050 | |
| }, | |
| { | |
| "epoch": 15.458622095393396, | |
| "grad_norm": 0.4056449234485626, | |
| "learning_rate": 0.00013645189504373176, | |
| "loss": 3.1973, | |
| "step": 53100 | |
| }, | |
| { | |
| "epoch": 15.473181526993185, | |
| "grad_norm": 0.411278635263443, | |
| "learning_rate": 0.00013601457725947522, | |
| "loss": 3.2089, | |
| "step": 53150 | |
| }, | |
| { | |
| "epoch": 15.487740958592976, | |
| "grad_norm": 0.4109366536140442, | |
| "learning_rate": 0.00013557725947521866, | |
| "loss": 3.2041, | |
| "step": 53200 | |
| }, | |
| { | |
| "epoch": 15.502300390192767, | |
| "grad_norm": 0.42039617896080017, | |
| "learning_rate": 0.0001351399416909621, | |
| "loss": 3.2103, | |
| "step": 53250 | |
| }, | |
| { | |
| "epoch": 15.516859821792558, | |
| "grad_norm": 0.4206897020339966, | |
| "learning_rate": 0.0001347026239067055, | |
| "loss": 3.2148, | |
| "step": 53300 | |
| }, | |
| { | |
| "epoch": 15.531419253392347, | |
| "grad_norm": 0.4062785506248474, | |
| "learning_rate": 0.00013426530612244895, | |
| "loss": 3.2114, | |
| "step": 53350 | |
| }, | |
| { | |
| "epoch": 15.545978684992138, | |
| "grad_norm": 0.42714810371398926, | |
| "learning_rate": 0.0001338279883381924, | |
| "loss": 3.1939, | |
| "step": 53400 | |
| }, | |
| { | |
| "epoch": 15.560538116591928, | |
| "grad_norm": 0.39876991510391235, | |
| "learning_rate": 0.00013339067055393586, | |
| "loss": 3.1901, | |
| "step": 53450 | |
| }, | |
| { | |
| "epoch": 15.575097548191719, | |
| "grad_norm": 0.397652804851532, | |
| "learning_rate": 0.0001329533527696793, | |
| "loss": 3.2233, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 15.58965697979151, | |
| "grad_norm": 0.40526559948921204, | |
| "learning_rate": 0.00013251603498542274, | |
| "loss": 3.214, | |
| "step": 53550 | |
| }, | |
| { | |
| "epoch": 15.604216411391299, | |
| "grad_norm": 0.40763163566589355, | |
| "learning_rate": 0.00013207871720116618, | |
| "loss": 3.2026, | |
| "step": 53600 | |
| }, | |
| { | |
| "epoch": 15.61877584299109, | |
| "grad_norm": 0.4108482003211975, | |
| "learning_rate": 0.00013164139941690962, | |
| "loss": 3.1983, | |
| "step": 53650 | |
| }, | |
| { | |
| "epoch": 15.63333527459088, | |
| "grad_norm": 0.42270195484161377, | |
| "learning_rate": 0.00013120408163265306, | |
| "loss": 3.2072, | |
| "step": 53700 | |
| }, | |
| { | |
| "epoch": 15.64789470619067, | |
| "grad_norm": 0.4099654257297516, | |
| "learning_rate": 0.0001307667638483965, | |
| "loss": 3.2059, | |
| "step": 53750 | |
| }, | |
| { | |
| "epoch": 15.66245413779046, | |
| "grad_norm": 0.416862428188324, | |
| "learning_rate": 0.00013032944606413994, | |
| "loss": 3.2046, | |
| "step": 53800 | |
| }, | |
| { | |
| "epoch": 15.677013569390251, | |
| "grad_norm": 0.41440367698669434, | |
| "learning_rate": 0.00012989212827988337, | |
| "loss": 3.206, | |
| "step": 53850 | |
| }, | |
| { | |
| "epoch": 15.691573000990042, | |
| "grad_norm": 0.4144529402256012, | |
| "learning_rate": 0.00012945481049562681, | |
| "loss": 3.206, | |
| "step": 53900 | |
| }, | |
| { | |
| "epoch": 15.706132432589833, | |
| "grad_norm": 0.41030511260032654, | |
| "learning_rate": 0.00012901749271137025, | |
| "loss": 3.2183, | |
| "step": 53950 | |
| }, | |
| { | |
| "epoch": 15.720691864189622, | |
| "grad_norm": 0.4152276813983917, | |
| "learning_rate": 0.0001285801749271137, | |
| "loss": 3.2151, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 15.720691864189622, | |
| "eval_accuracy": 0.37434131025743445, | |
| "eval_loss": 3.5293774604797363, | |
| "eval_runtime": 179.8109, | |
| "eval_samples_per_second": 92.558, | |
| "eval_steps_per_second": 5.789, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 15.735251295789412, | |
| "grad_norm": 0.40948715806007385, | |
| "learning_rate": 0.00012814285714285713, | |
| "loss": 3.2193, | |
| "step": 54050 | |
| }, | |
| { | |
| "epoch": 15.749810727389203, | |
| "grad_norm": 0.3965863883495331, | |
| "learning_rate": 0.00012770553935860057, | |
| "loss": 3.2169, | |
| "step": 54100 | |
| }, | |
| { | |
| "epoch": 15.764370158988992, | |
| "grad_norm": 0.4163205325603485, | |
| "learning_rate": 0.000127268221574344, | |
| "loss": 3.2179, | |
| "step": 54150 | |
| }, | |
| { | |
| "epoch": 15.778929590588783, | |
| "grad_norm": 0.4208901524543762, | |
| "learning_rate": 0.00012683090379008745, | |
| "loss": 3.2119, | |
| "step": 54200 | |
| }, | |
| { | |
| "epoch": 15.793489022188574, | |
| "grad_norm": 0.4129723012447357, | |
| "learning_rate": 0.0001263935860058309, | |
| "loss": 3.2213, | |
| "step": 54250 | |
| }, | |
| { | |
| "epoch": 15.808048453788365, | |
| "grad_norm": 0.4082995653152466, | |
| "learning_rate": 0.00012595626822157433, | |
| "loss": 3.2165, | |
| "step": 54300 | |
| }, | |
| { | |
| "epoch": 15.822607885388155, | |
| "grad_norm": 0.40925273299217224, | |
| "learning_rate": 0.00012551895043731777, | |
| "loss": 3.2099, | |
| "step": 54350 | |
| }, | |
| { | |
| "epoch": 15.837167316987944, | |
| "grad_norm": 0.41905489563941956, | |
| "learning_rate": 0.0001250816326530612, | |
| "loss": 3.2053, | |
| "step": 54400 | |
| }, | |
| { | |
| "epoch": 15.851726748587735, | |
| "grad_norm": 0.40665403008461, | |
| "learning_rate": 0.00012464431486880465, | |
| "loss": 3.2119, | |
| "step": 54450 | |
| }, | |
| { | |
| "epoch": 15.866286180187526, | |
| "grad_norm": 0.41052520275115967, | |
| "learning_rate": 0.0001242069970845481, | |
| "loss": 3.2157, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 15.880845611787315, | |
| "grad_norm": 0.42254316806793213, | |
| "learning_rate": 0.00012376967930029153, | |
| "loss": 3.223, | |
| "step": 54550 | |
| }, | |
| { | |
| "epoch": 15.895405043387106, | |
| "grad_norm": 0.40757060050964355, | |
| "learning_rate": 0.00012333236151603497, | |
| "loss": 3.2128, | |
| "step": 54600 | |
| }, | |
| { | |
| "epoch": 15.909964474986896, | |
| "grad_norm": 0.4044566750526428, | |
| "learning_rate": 0.0001228950437317784, | |
| "loss": 3.2277, | |
| "step": 54650 | |
| }, | |
| { | |
| "epoch": 15.924523906586687, | |
| "grad_norm": 0.40794673562049866, | |
| "learning_rate": 0.00012245772594752185, | |
| "loss": 3.2132, | |
| "step": 54700 | |
| }, | |
| { | |
| "epoch": 15.939083338186478, | |
| "grad_norm": 0.41059088706970215, | |
| "learning_rate": 0.0001220204081632653, | |
| "loss": 3.2337, | |
| "step": 54750 | |
| }, | |
| { | |
| "epoch": 15.953642769786267, | |
| "grad_norm": 0.40359726548194885, | |
| "learning_rate": 0.00012158309037900874, | |
| "loss": 3.2256, | |
| "step": 54800 | |
| }, | |
| { | |
| "epoch": 15.968202201386058, | |
| "grad_norm": 0.4164576828479767, | |
| "learning_rate": 0.00012114577259475218, | |
| "loss": 3.2193, | |
| "step": 54850 | |
| }, | |
| { | |
| "epoch": 15.982761632985849, | |
| "grad_norm": 0.41666367650032043, | |
| "learning_rate": 0.00012070845481049562, | |
| "loss": 3.1938, | |
| "step": 54900 | |
| }, | |
| { | |
| "epoch": 15.99732106458564, | |
| "grad_norm": 0.41611310839653015, | |
| "learning_rate": 0.00012027113702623906, | |
| "loss": 3.215, | |
| "step": 54950 | |
| }, | |
| { | |
| "epoch": 16.011647545279832, | |
| "grad_norm": 0.4197295606136322, | |
| "learning_rate": 0.00011983381924198251, | |
| "loss": 3.1627, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 16.011647545279832, | |
| "eval_accuracy": 0.3746472633017554, | |
| "eval_loss": 3.531348943710327, | |
| "eval_runtime": 179.744, | |
| "eval_samples_per_second": 92.593, | |
| "eval_steps_per_second": 5.792, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 16.02620697687962, | |
| "grad_norm": 0.4169209897518158, | |
| "learning_rate": 0.00011939650145772594, | |
| "loss": 3.1447, | |
| "step": 55050 | |
| }, | |
| { | |
| "epoch": 16.040766408479413, | |
| "grad_norm": 0.4214211702346802, | |
| "learning_rate": 0.00011895918367346938, | |
| "loss": 3.1481, | |
| "step": 55100 | |
| }, | |
| { | |
| "epoch": 16.055325840079202, | |
| "grad_norm": 0.4274289309978485, | |
| "learning_rate": 0.00011852186588921281, | |
| "loss": 3.1546, | |
| "step": 55150 | |
| }, | |
| { | |
| "epoch": 16.069885271678995, | |
| "grad_norm": 0.4178299009799957, | |
| "learning_rate": 0.00011808454810495625, | |
| "loss": 3.1519, | |
| "step": 55200 | |
| }, | |
| { | |
| "epoch": 16.084444703278784, | |
| "grad_norm": 0.42355021834373474, | |
| "learning_rate": 0.0001176472303206997, | |
| "loss": 3.157, | |
| "step": 55250 | |
| }, | |
| { | |
| "epoch": 16.099004134878573, | |
| "grad_norm": 0.40021342039108276, | |
| "learning_rate": 0.00011720991253644315, | |
| "loss": 3.1528, | |
| "step": 55300 | |
| }, | |
| { | |
| "epoch": 16.113563566478366, | |
| "grad_norm": 0.42021602392196655, | |
| "learning_rate": 0.00011677259475218659, | |
| "loss": 3.1428, | |
| "step": 55350 | |
| }, | |
| { | |
| "epoch": 16.128122998078155, | |
| "grad_norm": 0.42361438274383545, | |
| "learning_rate": 0.00011633527696793003, | |
| "loss": 3.1672, | |
| "step": 55400 | |
| }, | |
| { | |
| "epoch": 16.142682429677944, | |
| "grad_norm": 0.4197485148906708, | |
| "learning_rate": 0.00011589795918367347, | |
| "loss": 3.1628, | |
| "step": 55450 | |
| }, | |
| { | |
| "epoch": 16.157241861277736, | |
| "grad_norm": 0.4117751717567444, | |
| "learning_rate": 0.00011546064139941689, | |
| "loss": 3.1614, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 16.171801292877525, | |
| "grad_norm": 0.4112408459186554, | |
| "learning_rate": 0.00011502332361516033, | |
| "loss": 3.1468, | |
| "step": 55550 | |
| }, | |
| { | |
| "epoch": 16.186360724477318, | |
| "grad_norm": 0.4194517731666565, | |
| "learning_rate": 0.00011458600583090377, | |
| "loss": 3.1663, | |
| "step": 55600 | |
| }, | |
| { | |
| "epoch": 16.200920156077107, | |
| "grad_norm": 0.40641582012176514, | |
| "learning_rate": 0.00011414868804664722, | |
| "loss": 3.1604, | |
| "step": 55650 | |
| }, | |
| { | |
| "epoch": 16.215479587676896, | |
| "grad_norm": 0.41321802139282227, | |
| "learning_rate": 0.00011371137026239066, | |
| "loss": 3.1541, | |
| "step": 55700 | |
| }, | |
| { | |
| "epoch": 16.23003901927669, | |
| "grad_norm": 0.40778738260269165, | |
| "learning_rate": 0.0001132740524781341, | |
| "loss": 3.1585, | |
| "step": 55750 | |
| }, | |
| { | |
| "epoch": 16.244598450876477, | |
| "grad_norm": 0.41870421171188354, | |
| "learning_rate": 0.00011283673469387754, | |
| "loss": 3.1611, | |
| "step": 55800 | |
| }, | |
| { | |
| "epoch": 16.259157882476266, | |
| "grad_norm": 0.4245312809944153, | |
| "learning_rate": 0.00011239941690962098, | |
| "loss": 3.1824, | |
| "step": 55850 | |
| }, | |
| { | |
| "epoch": 16.27371731407606, | |
| "grad_norm": 0.4253610074520111, | |
| "learning_rate": 0.00011196209912536443, | |
| "loss": 3.1707, | |
| "step": 55900 | |
| }, | |
| { | |
| "epoch": 16.288276745675848, | |
| "grad_norm": 0.42540794610977173, | |
| "learning_rate": 0.00011152478134110786, | |
| "loss": 3.164, | |
| "step": 55950 | |
| }, | |
| { | |
| "epoch": 16.30283617727564, | |
| "grad_norm": 0.4161224663257599, | |
| "learning_rate": 0.0001110874635568513, | |
| "loss": 3.1717, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 16.30283617727564, | |
| "eval_accuracy": 0.37458894173758045, | |
| "eval_loss": 3.533280372619629, | |
| "eval_runtime": 179.7993, | |
| "eval_samples_per_second": 92.564, | |
| "eval_steps_per_second": 5.79, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 16.31739560887543, | |
| "grad_norm": 0.4027422368526459, | |
| "learning_rate": 0.00011065014577259474, | |
| "loss": 3.1687, | |
| "step": 56050 | |
| }, | |
| { | |
| "epoch": 16.33195504047522, | |
| "grad_norm": 0.43346402049064636, | |
| "learning_rate": 0.00011021282798833818, | |
| "loss": 3.1666, | |
| "step": 56100 | |
| }, | |
| { | |
| "epoch": 16.34651447207501, | |
| "grad_norm": 0.4163694381713867, | |
| "learning_rate": 0.00010977551020408162, | |
| "loss": 3.176, | |
| "step": 56150 | |
| }, | |
| { | |
| "epoch": 16.3610739036748, | |
| "grad_norm": 0.41778096556663513, | |
| "learning_rate": 0.00010933819241982507, | |
| "loss": 3.1565, | |
| "step": 56200 | |
| }, | |
| { | |
| "epoch": 16.375633335274593, | |
| "grad_norm": 0.43710342049598694, | |
| "learning_rate": 0.00010890087463556851, | |
| "loss": 3.1731, | |
| "step": 56250 | |
| }, | |
| { | |
| "epoch": 16.39019276687438, | |
| "grad_norm": 0.4328019320964813, | |
| "learning_rate": 0.00010846355685131195, | |
| "loss": 3.1798, | |
| "step": 56300 | |
| }, | |
| { | |
| "epoch": 16.40475219847417, | |
| "grad_norm": 0.41606712341308594, | |
| "learning_rate": 0.00010802623906705539, | |
| "loss": 3.1892, | |
| "step": 56350 | |
| }, | |
| { | |
| "epoch": 16.419311630073963, | |
| "grad_norm": 0.4367011487483978, | |
| "learning_rate": 0.00010758892128279882, | |
| "loss": 3.1866, | |
| "step": 56400 | |
| }, | |
| { | |
| "epoch": 16.433871061673752, | |
| "grad_norm": 0.41168850660324097, | |
| "learning_rate": 0.00010715160349854226, | |
| "loss": 3.184, | |
| "step": 56450 | |
| }, | |
| { | |
| "epoch": 16.44843049327354, | |
| "grad_norm": 0.4158318042755127, | |
| "learning_rate": 0.00010671428571428571, | |
| "loss": 3.177, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 16.462989924873334, | |
| "grad_norm": 0.41305047273635864, | |
| "learning_rate": 0.00010627696793002915, | |
| "loss": 3.1765, | |
| "step": 56550 | |
| }, | |
| { | |
| "epoch": 16.477549356473123, | |
| "grad_norm": 0.4143592119216919, | |
| "learning_rate": 0.00010583965014577259, | |
| "loss": 3.1872, | |
| "step": 56600 | |
| }, | |
| { | |
| "epoch": 16.492108788072915, | |
| "grad_norm": 0.427137553691864, | |
| "learning_rate": 0.00010540233236151603, | |
| "loss": 3.1752, | |
| "step": 56650 | |
| }, | |
| { | |
| "epoch": 16.506668219672704, | |
| "grad_norm": 0.42718973755836487, | |
| "learning_rate": 0.00010496501457725947, | |
| "loss": 3.1832, | |
| "step": 56700 | |
| }, | |
| { | |
| "epoch": 16.521227651272493, | |
| "grad_norm": 0.4194405972957611, | |
| "learning_rate": 0.00010452769679300292, | |
| "loss": 3.1749, | |
| "step": 56750 | |
| }, | |
| { | |
| "epoch": 16.535787082872286, | |
| "grad_norm": 0.4275641441345215, | |
| "learning_rate": 0.00010409037900874634, | |
| "loss": 3.1785, | |
| "step": 56800 | |
| }, | |
| { | |
| "epoch": 16.550346514472075, | |
| "grad_norm": 0.42118945717811584, | |
| "learning_rate": 0.00010365306122448978, | |
| "loss": 3.1807, | |
| "step": 56850 | |
| }, | |
| { | |
| "epoch": 16.564905946071864, | |
| "grad_norm": 0.42112356424331665, | |
| "learning_rate": 0.00010321574344023322, | |
| "loss": 3.185, | |
| "step": 56900 | |
| }, | |
| { | |
| "epoch": 16.579465377671657, | |
| "grad_norm": 0.40832531452178955, | |
| "learning_rate": 0.00010277842565597666, | |
| "loss": 3.1863, | |
| "step": 56950 | |
| }, | |
| { | |
| "epoch": 16.594024809271446, | |
| "grad_norm": 0.41556891798973083, | |
| "learning_rate": 0.0001023411078717201, | |
| "loss": 3.1887, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 16.594024809271446, | |
| "eval_accuracy": 0.375009891737069, | |
| "eval_loss": 3.5277862548828125, | |
| "eval_runtime": 179.8457, | |
| "eval_samples_per_second": 92.54, | |
| "eval_steps_per_second": 5.788, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 16.608584240871238, | |
| "grad_norm": 0.4110031723976135, | |
| "learning_rate": 0.00010190379008746356, | |
| "loss": 3.1741, | |
| "step": 57050 | |
| }, | |
| { | |
| "epoch": 16.623143672471027, | |
| "grad_norm": 0.4251270890235901, | |
| "learning_rate": 0.000101466472303207, | |
| "loss": 3.1853, | |
| "step": 57100 | |
| }, | |
| { | |
| "epoch": 16.637703104070816, | |
| "grad_norm": 0.4084523320198059, | |
| "learning_rate": 0.00010102915451895043, | |
| "loss": 3.1895, | |
| "step": 57150 | |
| }, | |
| { | |
| "epoch": 16.65226253567061, | |
| "grad_norm": 0.43704357743263245, | |
| "learning_rate": 0.00010059183673469387, | |
| "loss": 3.2, | |
| "step": 57200 | |
| }, | |
| { | |
| "epoch": 16.666821967270398, | |
| "grad_norm": 0.45471593737602234, | |
| "learning_rate": 0.0001001545189504373, | |
| "loss": 3.1835, | |
| "step": 57250 | |
| }, | |
| { | |
| "epoch": 16.681381398870187, | |
| "grad_norm": 0.41394278407096863, | |
| "learning_rate": 9.971720116618074e-05, | |
| "loss": 3.1846, | |
| "step": 57300 | |
| }, | |
| { | |
| "epoch": 16.69594083046998, | |
| "grad_norm": 0.4269932210445404, | |
| "learning_rate": 9.927988338192418e-05, | |
| "loss": 3.1975, | |
| "step": 57350 | |
| }, | |
| { | |
| "epoch": 16.71050026206977, | |
| "grad_norm": 0.41786158084869385, | |
| "learning_rate": 9.884256559766763e-05, | |
| "loss": 3.2013, | |
| "step": 57400 | |
| }, | |
| { | |
| "epoch": 16.72505969366956, | |
| "grad_norm": 0.41637122631073, | |
| "learning_rate": 9.840524781341107e-05, | |
| "loss": 3.1928, | |
| "step": 57450 | |
| }, | |
| { | |
| "epoch": 16.73961912526935, | |
| "grad_norm": 0.42277225852012634, | |
| "learning_rate": 9.796793002915451e-05, | |
| "loss": 3.1766, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 16.75417855686914, | |
| "grad_norm": 0.421393483877182, | |
| "learning_rate": 9.753061224489795e-05, | |
| "loss": 3.1925, | |
| "step": 57550 | |
| }, | |
| { | |
| "epoch": 16.76873798846893, | |
| "grad_norm": 0.4143362045288086, | |
| "learning_rate": 9.709329446064139e-05, | |
| "loss": 3.195, | |
| "step": 57600 | |
| }, | |
| { | |
| "epoch": 16.78329742006872, | |
| "grad_norm": 0.4214780628681183, | |
| "learning_rate": 9.665597667638484e-05, | |
| "loss": 3.1782, | |
| "step": 57650 | |
| }, | |
| { | |
| "epoch": 16.79785685166851, | |
| "grad_norm": 0.43336132168769836, | |
| "learning_rate": 9.621865889212827e-05, | |
| "loss": 3.1895, | |
| "step": 57700 | |
| }, | |
| { | |
| "epoch": 16.812416283268302, | |
| "grad_norm": 0.4261871874332428, | |
| "learning_rate": 9.578134110787171e-05, | |
| "loss": 3.1901, | |
| "step": 57750 | |
| }, | |
| { | |
| "epoch": 16.82697571486809, | |
| "grad_norm": 0.43283653259277344, | |
| "learning_rate": 9.534402332361515e-05, | |
| "loss": 3.1966, | |
| "step": 57800 | |
| }, | |
| { | |
| "epoch": 16.841535146467884, | |
| "grad_norm": 0.4269659221172333, | |
| "learning_rate": 9.490670553935859e-05, | |
| "loss": 3.1832, | |
| "step": 57850 | |
| }, | |
| { | |
| "epoch": 16.856094578067673, | |
| "grad_norm": 0.417172372341156, | |
| "learning_rate": 9.446938775510203e-05, | |
| "loss": 3.1962, | |
| "step": 57900 | |
| }, | |
| { | |
| "epoch": 16.87065400966746, | |
| "grad_norm": 0.41687533259391785, | |
| "learning_rate": 9.403206997084548e-05, | |
| "loss": 3.187, | |
| "step": 57950 | |
| }, | |
| { | |
| "epoch": 16.885213441267254, | |
| "grad_norm": 0.40989968180656433, | |
| "learning_rate": 9.359475218658892e-05, | |
| "loss": 3.1901, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 16.885213441267254, | |
| "eval_accuracy": 0.375465411373387, | |
| "eval_loss": 3.5243453979492188, | |
| "eval_runtime": 179.7012, | |
| "eval_samples_per_second": 92.615, | |
| "eval_steps_per_second": 5.793, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 16.899772872867043, | |
| "grad_norm": 0.42806559801101685, | |
| "learning_rate": 9.315743440233236e-05, | |
| "loss": 3.1907, | |
| "step": 58050 | |
| }, | |
| { | |
| "epoch": 16.914332304466832, | |
| "grad_norm": 0.40956610441207886, | |
| "learning_rate": 9.27201166180758e-05, | |
| "loss": 3.1856, | |
| "step": 58100 | |
| }, | |
| { | |
| "epoch": 16.928891736066625, | |
| "grad_norm": 0.41996699571609497, | |
| "learning_rate": 9.228279883381922e-05, | |
| "loss": 3.2155, | |
| "step": 58150 | |
| }, | |
| { | |
| "epoch": 16.943451167666414, | |
| "grad_norm": 0.4105769693851471, | |
| "learning_rate": 9.184548104956266e-05, | |
| "loss": 3.1813, | |
| "step": 58200 | |
| }, | |
| { | |
| "epoch": 16.958010599266206, | |
| "grad_norm": 0.4150310754776001, | |
| "learning_rate": 9.140816326530612e-05, | |
| "loss": 3.1908, | |
| "step": 58250 | |
| }, | |
| { | |
| "epoch": 16.972570030865995, | |
| "grad_norm": 0.42143964767456055, | |
| "learning_rate": 9.097084548104956e-05, | |
| "loss": 3.1859, | |
| "step": 58300 | |
| }, | |
| { | |
| "epoch": 16.987129462465784, | |
| "grad_norm": 0.42836061120033264, | |
| "learning_rate": 9.0533527696793e-05, | |
| "loss": 3.1989, | |
| "step": 58350 | |
| }, | |
| { | |
| "epoch": 17.00145594315998, | |
| "grad_norm": 0.413256973028183, | |
| "learning_rate": 9.009620991253644e-05, | |
| "loss": 3.1796, | |
| "step": 58400 | |
| }, | |
| { | |
| "epoch": 17.01601537475977, | |
| "grad_norm": 0.43588265776634216, | |
| "learning_rate": 8.965889212827987e-05, | |
| "loss": 3.1193, | |
| "step": 58450 | |
| }, | |
| { | |
| "epoch": 17.03057480635956, | |
| "grad_norm": 0.4288026690483093, | |
| "learning_rate": 8.922157434402333e-05, | |
| "loss": 3.1271, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 17.04513423795935, | |
| "grad_norm": 0.42423388361930847, | |
| "learning_rate": 8.878425655976677e-05, | |
| "loss": 3.1333, | |
| "step": 58550 | |
| }, | |
| { | |
| "epoch": 17.05969366955914, | |
| "grad_norm": 0.41696009039878845, | |
| "learning_rate": 8.83469387755102e-05, | |
| "loss": 3.134, | |
| "step": 58600 | |
| }, | |
| { | |
| "epoch": 17.07425310115893, | |
| "grad_norm": 0.4192046523094177, | |
| "learning_rate": 8.790962099125363e-05, | |
| "loss": 3.1377, | |
| "step": 58650 | |
| }, | |
| { | |
| "epoch": 17.08881253275872, | |
| "grad_norm": 0.43218058347702026, | |
| "learning_rate": 8.747230320699707e-05, | |
| "loss": 3.1403, | |
| "step": 58700 | |
| }, | |
| { | |
| "epoch": 17.103371964358512, | |
| "grad_norm": 0.4226701855659485, | |
| "learning_rate": 8.703498542274051e-05, | |
| "loss": 3.1419, | |
| "step": 58750 | |
| }, | |
| { | |
| "epoch": 17.1179313959583, | |
| "grad_norm": 0.4426027536392212, | |
| "learning_rate": 8.659766763848396e-05, | |
| "loss": 3.1454, | |
| "step": 58800 | |
| }, | |
| { | |
| "epoch": 17.132490827558094, | |
| "grad_norm": 0.43630558252334595, | |
| "learning_rate": 8.61603498542274e-05, | |
| "loss": 3.1482, | |
| "step": 58850 | |
| }, | |
| { | |
| "epoch": 17.147050259157883, | |
| "grad_norm": 0.43396347761154175, | |
| "learning_rate": 8.572303206997084e-05, | |
| "loss": 3.1412, | |
| "step": 58900 | |
| }, | |
| { | |
| "epoch": 17.161609690757672, | |
| "grad_norm": 0.42278262972831726, | |
| "learning_rate": 8.528571428571428e-05, | |
| "loss": 3.1413, | |
| "step": 58950 | |
| }, | |
| { | |
| "epoch": 17.176169122357464, | |
| "grad_norm": 0.4185219705104828, | |
| "learning_rate": 8.484839650145771e-05, | |
| "loss": 3.1498, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 17.176169122357464, | |
| "eval_accuracy": 0.37519579172287665, | |
| "eval_loss": 3.5312256813049316, | |
| "eval_runtime": 179.692, | |
| "eval_samples_per_second": 92.62, | |
| "eval_steps_per_second": 5.793, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 17.190728553957253, | |
| "grad_norm": 0.437406063079834, | |
| "learning_rate": 8.441107871720115e-05, | |
| "loss": 3.1484, | |
| "step": 59050 | |
| }, | |
| { | |
| "epoch": 17.205287985557042, | |
| "grad_norm": 0.41340211033821106, | |
| "learning_rate": 8.397376093294459e-05, | |
| "loss": 3.1348, | |
| "step": 59100 | |
| }, | |
| { | |
| "epoch": 17.219847417156835, | |
| "grad_norm": 0.43353283405303955, | |
| "learning_rate": 8.353644314868804e-05, | |
| "loss": 3.1418, | |
| "step": 59150 | |
| }, | |
| { | |
| "epoch": 17.234406848756624, | |
| "grad_norm": 0.430377721786499, | |
| "learning_rate": 8.309912536443148e-05, | |
| "loss": 3.1397, | |
| "step": 59200 | |
| }, | |
| { | |
| "epoch": 17.248966280356417, | |
| "grad_norm": 0.4174799919128418, | |
| "learning_rate": 8.266180758017492e-05, | |
| "loss": 3.1419, | |
| "step": 59250 | |
| }, | |
| { | |
| "epoch": 17.263525711956206, | |
| "grad_norm": 0.4215719699859619, | |
| "learning_rate": 8.222448979591836e-05, | |
| "loss": 3.1431, | |
| "step": 59300 | |
| }, | |
| { | |
| "epoch": 17.278085143555995, | |
| "grad_norm": 0.42813974618911743, | |
| "learning_rate": 8.17871720116618e-05, | |
| "loss": 3.1442, | |
| "step": 59350 | |
| }, | |
| { | |
| "epoch": 17.292644575155787, | |
| "grad_norm": 0.4232740104198456, | |
| "learning_rate": 8.134985422740525e-05, | |
| "loss": 3.1537, | |
| "step": 59400 | |
| }, | |
| { | |
| "epoch": 17.307204006755576, | |
| "grad_norm": 0.42498260736465454, | |
| "learning_rate": 8.091253644314868e-05, | |
| "loss": 3.1549, | |
| "step": 59450 | |
| }, | |
| { | |
| "epoch": 17.321763438355365, | |
| "grad_norm": 0.42179837822914124, | |
| "learning_rate": 8.047521865889212e-05, | |
| "loss": 3.1431, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 17.336322869955158, | |
| "grad_norm": 0.44017553329467773, | |
| "learning_rate": 8.003790087463556e-05, | |
| "loss": 3.154, | |
| "step": 59550 | |
| }, | |
| { | |
| "epoch": 17.350882301554947, | |
| "grad_norm": 0.41716912388801575, | |
| "learning_rate": 7.9600583090379e-05, | |
| "loss": 3.1716, | |
| "step": 59600 | |
| }, | |
| { | |
| "epoch": 17.36544173315474, | |
| "grad_norm": 0.41841381788253784, | |
| "learning_rate": 7.916326530612244e-05, | |
| "loss": 3.1608, | |
| "step": 59650 | |
| }, | |
| { | |
| "epoch": 17.38000116475453, | |
| "grad_norm": 0.42180711030960083, | |
| "learning_rate": 7.872594752186589e-05, | |
| "loss": 3.1682, | |
| "step": 59700 | |
| }, | |
| { | |
| "epoch": 17.394560596354317, | |
| "grad_norm": 0.42136502265930176, | |
| "learning_rate": 7.828862973760933e-05, | |
| "loss": 3.152, | |
| "step": 59750 | |
| }, | |
| { | |
| "epoch": 17.40912002795411, | |
| "grad_norm": 0.4209064245223999, | |
| "learning_rate": 7.785131195335277e-05, | |
| "loss": 3.146, | |
| "step": 59800 | |
| }, | |
| { | |
| "epoch": 17.4236794595539, | |
| "grad_norm": 0.4188046157360077, | |
| "learning_rate": 7.741399416909621e-05, | |
| "loss": 3.1496, | |
| "step": 59850 | |
| }, | |
| { | |
| "epoch": 17.438238891153688, | |
| "grad_norm": 0.4277164041996002, | |
| "learning_rate": 7.697667638483963e-05, | |
| "loss": 3.1616, | |
| "step": 59900 | |
| }, | |
| { | |
| "epoch": 17.45279832275348, | |
| "grad_norm": 0.44717901945114136, | |
| "learning_rate": 7.653935860058307e-05, | |
| "loss": 3.161, | |
| "step": 59950 | |
| }, | |
| { | |
| "epoch": 17.46735775435327, | |
| "grad_norm": 0.40624383091926575, | |
| "learning_rate": 7.610204081632653e-05, | |
| "loss": 3.153, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 17.46735775435327, | |
| "eval_accuracy": 0.37553701990681954, | |
| "eval_loss": 3.5280678272247314, | |
| "eval_runtime": 179.5658, | |
| "eval_samples_per_second": 92.685, | |
| "eval_steps_per_second": 5.797, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 17.481917185953062, | |
| "grad_norm": 0.44909200072288513, | |
| "learning_rate": 7.566472303206997e-05, | |
| "loss": 3.1741, | |
| "step": 60050 | |
| }, | |
| { | |
| "epoch": 17.49647661755285, | |
| "grad_norm": 0.43057718873023987, | |
| "learning_rate": 7.52274052478134e-05, | |
| "loss": 3.1605, | |
| "step": 60100 | |
| }, | |
| { | |
| "epoch": 17.51103604915264, | |
| "grad_norm": 0.41980892419815063, | |
| "learning_rate": 7.479008746355684e-05, | |
| "loss": 3.1542, | |
| "step": 60150 | |
| }, | |
| { | |
| "epoch": 17.525595480752433, | |
| "grad_norm": 0.41728129982948303, | |
| "learning_rate": 7.435276967930028e-05, | |
| "loss": 3.1517, | |
| "step": 60200 | |
| }, | |
| { | |
| "epoch": 17.54015491235222, | |
| "grad_norm": 0.41326892375946045, | |
| "learning_rate": 7.391545189504372e-05, | |
| "loss": 3.1509, | |
| "step": 60250 | |
| }, | |
| { | |
| "epoch": 17.55471434395201, | |
| "grad_norm": 0.43065664172172546, | |
| "learning_rate": 7.347813411078716e-05, | |
| "loss": 3.1619, | |
| "step": 60300 | |
| }, | |
| { | |
| "epoch": 17.569273775551803, | |
| "grad_norm": 0.40984541177749634, | |
| "learning_rate": 7.30408163265306e-05, | |
| "loss": 3.1531, | |
| "step": 60350 | |
| }, | |
| { | |
| "epoch": 17.583833207151592, | |
| "grad_norm": 0.4190158247947693, | |
| "learning_rate": 7.260349854227406e-05, | |
| "loss": 3.1643, | |
| "step": 60400 | |
| }, | |
| { | |
| "epoch": 17.598392638751385, | |
| "grad_norm": 0.4273621141910553, | |
| "learning_rate": 7.216618075801748e-05, | |
| "loss": 3.1627, | |
| "step": 60450 | |
| }, | |
| { | |
| "epoch": 17.612952070351174, | |
| "grad_norm": 0.41709256172180176, | |
| "learning_rate": 7.172886297376092e-05, | |
| "loss": 3.1575, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 17.627511501950963, | |
| "grad_norm": 0.42943283915519714, | |
| "learning_rate": 7.129154518950437e-05, | |
| "loss": 3.1634, | |
| "step": 60550 | |
| }, | |
| { | |
| "epoch": 17.642070933550755, | |
| "grad_norm": 0.43262234330177307, | |
| "learning_rate": 7.085422740524781e-05, | |
| "loss": 3.1558, | |
| "step": 60600 | |
| }, | |
| { | |
| "epoch": 17.656630365150544, | |
| "grad_norm": 0.4381544589996338, | |
| "learning_rate": 7.041690962099124e-05, | |
| "loss": 3.1505, | |
| "step": 60650 | |
| }, | |
| { | |
| "epoch": 17.671189796750333, | |
| "grad_norm": 0.4137042164802551, | |
| "learning_rate": 6.997959183673469e-05, | |
| "loss": 3.1699, | |
| "step": 60700 | |
| }, | |
| { | |
| "epoch": 17.685749228350126, | |
| "grad_norm": 0.4235377311706543, | |
| "learning_rate": 6.954227405247813e-05, | |
| "loss": 3.169, | |
| "step": 60750 | |
| }, | |
| { | |
| "epoch": 17.700308659949915, | |
| "grad_norm": 0.4289599061012268, | |
| "learning_rate": 6.910495626822157e-05, | |
| "loss": 3.1644, | |
| "step": 60800 | |
| }, | |
| { | |
| "epoch": 17.714868091549707, | |
| "grad_norm": 0.4362151622772217, | |
| "learning_rate": 6.866763848396501e-05, | |
| "loss": 3.1613, | |
| "step": 60850 | |
| }, | |
| { | |
| "epoch": 17.729427523149496, | |
| "grad_norm": 0.42833998799324036, | |
| "learning_rate": 6.823032069970845e-05, | |
| "loss": 3.1587, | |
| "step": 60900 | |
| }, | |
| { | |
| "epoch": 17.743986954749285, | |
| "grad_norm": 0.4399029314517975, | |
| "learning_rate": 6.779300291545189e-05, | |
| "loss": 3.1602, | |
| "step": 60950 | |
| }, | |
| { | |
| "epoch": 17.758546386349078, | |
| "grad_norm": 0.41309642791748047, | |
| "learning_rate": 6.735568513119533e-05, | |
| "loss": 3.1645, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 17.758546386349078, | |
| "eval_accuracy": 0.37569305360774724, | |
| "eval_loss": 3.5239369869232178, | |
| "eval_runtime": 179.7036, | |
| "eval_samples_per_second": 92.614, | |
| "eval_steps_per_second": 5.793, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 17.773105817948867, | |
| "grad_norm": 0.4099070727825165, | |
| "learning_rate": 6.691836734693877e-05, | |
| "loss": 3.165, | |
| "step": 61050 | |
| }, | |
| { | |
| "epoch": 17.787665249548656, | |
| "grad_norm": 0.4193074405193329, | |
| "learning_rate": 6.648104956268221e-05, | |
| "loss": 3.1705, | |
| "step": 61100 | |
| }, | |
| { | |
| "epoch": 17.80222468114845, | |
| "grad_norm": 0.42403796315193176, | |
| "learning_rate": 6.604373177842565e-05, | |
| "loss": 3.1646, | |
| "step": 61150 | |
| }, | |
| { | |
| "epoch": 17.816784112748238, | |
| "grad_norm": 0.4121449589729309, | |
| "learning_rate": 6.560641399416909e-05, | |
| "loss": 3.1642, | |
| "step": 61200 | |
| }, | |
| { | |
| "epoch": 17.83134354434803, | |
| "grad_norm": 0.4212753176689148, | |
| "learning_rate": 6.516909620991253e-05, | |
| "loss": 3.163, | |
| "step": 61250 | |
| }, | |
| { | |
| "epoch": 17.84590297594782, | |
| "grad_norm": 0.43732142448425293, | |
| "learning_rate": 6.473177842565598e-05, | |
| "loss": 3.1773, | |
| "step": 61300 | |
| }, | |
| { | |
| "epoch": 17.860462407547608, | |
| "grad_norm": 0.4187993109226227, | |
| "learning_rate": 6.42944606413994e-05, | |
| "loss": 3.1628, | |
| "step": 61350 | |
| }, | |
| { | |
| "epoch": 17.8750218391474, | |
| "grad_norm": 0.4270228445529938, | |
| "learning_rate": 6.385714285714284e-05, | |
| "loss": 3.1583, | |
| "step": 61400 | |
| }, | |
| { | |
| "epoch": 17.88958127074719, | |
| "grad_norm": 0.42085975408554077, | |
| "learning_rate": 6.34198250728863e-05, | |
| "loss": 3.157, | |
| "step": 61450 | |
| }, | |
| { | |
| "epoch": 17.90414070234698, | |
| "grad_norm": 0.4262135326862335, | |
| "learning_rate": 6.298250728862974e-05, | |
| "loss": 3.1748, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 17.91870013394677, | |
| "grad_norm": 0.43082737922668457, | |
| "learning_rate": 6.254518950437316e-05, | |
| "loss": 3.1695, | |
| "step": 61550 | |
| }, | |
| { | |
| "epoch": 17.93325956554656, | |
| "grad_norm": 0.43583792448043823, | |
| "learning_rate": 6.210787172011662e-05, | |
| "loss": 3.1674, | |
| "step": 61600 | |
| }, | |
| { | |
| "epoch": 17.947818997146353, | |
| "grad_norm": 0.4230196177959442, | |
| "learning_rate": 6.167055393586006e-05, | |
| "loss": 3.1615, | |
| "step": 61650 | |
| }, | |
| { | |
| "epoch": 17.962378428746142, | |
| "grad_norm": 0.41631415486335754, | |
| "learning_rate": 6.12332361516035e-05, | |
| "loss": 3.1596, | |
| "step": 61700 | |
| }, | |
| { | |
| "epoch": 17.97693786034593, | |
| "grad_norm": 0.42340517044067383, | |
| "learning_rate": 6.079591836734693e-05, | |
| "loss": 3.1593, | |
| "step": 61750 | |
| }, | |
| { | |
| "epoch": 17.991497291945723, | |
| "grad_norm": 0.4355524182319641, | |
| "learning_rate": 6.0358600583090374e-05, | |
| "loss": 3.1761, | |
| "step": 61800 | |
| }, | |
| { | |
| "epoch": 18.005823772639918, | |
| "grad_norm": 0.4226396083831787, | |
| "learning_rate": 5.9921282798833814e-05, | |
| "loss": 3.1571, | |
| "step": 61850 | |
| }, | |
| { | |
| "epoch": 18.020383204239707, | |
| "grad_norm": 0.42405375838279724, | |
| "learning_rate": 5.948396501457725e-05, | |
| "loss": 3.1209, | |
| "step": 61900 | |
| }, | |
| { | |
| "epoch": 18.034942635839496, | |
| "grad_norm": 0.42336833477020264, | |
| "learning_rate": 5.90466472303207e-05, | |
| "loss": 3.1297, | |
| "step": 61950 | |
| }, | |
| { | |
| "epoch": 18.04950206743929, | |
| "grad_norm": 0.43224647641181946, | |
| "learning_rate": 5.860932944606413e-05, | |
| "loss": 3.1118, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 18.04950206743929, | |
| "eval_accuracy": 0.37571586486470276, | |
| "eval_loss": 3.526365041732788, | |
| "eval_runtime": 179.6025, | |
| "eval_samples_per_second": 92.666, | |
| "eval_steps_per_second": 5.796, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 18.064061499039077, | |
| "grad_norm": 0.44077619910240173, | |
| "learning_rate": 5.817201166180757e-05, | |
| "loss": 3.1216, | |
| "step": 62050 | |
| }, | |
| { | |
| "epoch": 18.078620930638866, | |
| "grad_norm": 0.4180721342563629, | |
| "learning_rate": 5.773469387755102e-05, | |
| "loss": 3.1149, | |
| "step": 62100 | |
| }, | |
| { | |
| "epoch": 18.09318036223866, | |
| "grad_norm": 0.4361351430416107, | |
| "learning_rate": 5.729737609329446e-05, | |
| "loss": 3.1209, | |
| "step": 62150 | |
| }, | |
| { | |
| "epoch": 18.107739793838448, | |
| "grad_norm": 0.4234081208705902, | |
| "learning_rate": 5.686005830903789e-05, | |
| "loss": 3.1161, | |
| "step": 62200 | |
| }, | |
| { | |
| "epoch": 18.12229922543824, | |
| "grad_norm": 0.4476962387561798, | |
| "learning_rate": 5.6422740524781336e-05, | |
| "loss": 3.1178, | |
| "step": 62250 | |
| }, | |
| { | |
| "epoch": 18.13685865703803, | |
| "grad_norm": 0.4251765012741089, | |
| "learning_rate": 5.5985422740524776e-05, | |
| "loss": 3.1241, | |
| "step": 62300 | |
| }, | |
| { | |
| "epoch": 18.15141808863782, | |
| "grad_norm": 0.4326912462711334, | |
| "learning_rate": 5.554810495626822e-05, | |
| "loss": 3.1365, | |
| "step": 62350 | |
| }, | |
| { | |
| "epoch": 18.16597752023761, | |
| "grad_norm": 0.41611161828041077, | |
| "learning_rate": 5.5110787172011655e-05, | |
| "loss": 3.1302, | |
| "step": 62400 | |
| }, | |
| { | |
| "epoch": 18.1805369518374, | |
| "grad_norm": 0.424152672290802, | |
| "learning_rate": 5.4673469387755094e-05, | |
| "loss": 3.1176, | |
| "step": 62450 | |
| }, | |
| { | |
| "epoch": 18.19509638343719, | |
| "grad_norm": 0.41061875224113464, | |
| "learning_rate": 5.4236151603498534e-05, | |
| "loss": 3.121, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 18.20965581503698, | |
| "grad_norm": 0.4476122260093689, | |
| "learning_rate": 5.379883381924198e-05, | |
| "loss": 3.1174, | |
| "step": 62550 | |
| }, | |
| { | |
| "epoch": 18.22421524663677, | |
| "grad_norm": 0.4128943979740143, | |
| "learning_rate": 5.336151603498542e-05, | |
| "loss": 3.139, | |
| "step": 62600 | |
| }, | |
| { | |
| "epoch": 18.238774678236563, | |
| "grad_norm": 0.43339040875434875, | |
| "learning_rate": 5.292419825072885e-05, | |
| "loss": 3.1296, | |
| "step": 62650 | |
| }, | |
| { | |
| "epoch": 18.253334109836352, | |
| "grad_norm": 0.4117395877838135, | |
| "learning_rate": 5.24868804664723e-05, | |
| "loss": 3.1261, | |
| "step": 62700 | |
| }, | |
| { | |
| "epoch": 18.26789354143614, | |
| "grad_norm": 0.4270617961883545, | |
| "learning_rate": 5.204956268221574e-05, | |
| "loss": 3.1174, | |
| "step": 62750 | |
| }, | |
| { | |
| "epoch": 18.282452973035934, | |
| "grad_norm": 0.4476640820503235, | |
| "learning_rate": 5.1612244897959184e-05, | |
| "loss": 3.1307, | |
| "step": 62800 | |
| }, | |
| { | |
| "epoch": 18.297012404635723, | |
| "grad_norm": 0.4141584038734436, | |
| "learning_rate": 5.117492711370262e-05, | |
| "loss": 3.1366, | |
| "step": 62850 | |
| }, | |
| { | |
| "epoch": 18.31157183623551, | |
| "grad_norm": 0.4253668189048767, | |
| "learning_rate": 5.0737609329446057e-05, | |
| "loss": 3.1301, | |
| "step": 62900 | |
| }, | |
| { | |
| "epoch": 18.326131267835304, | |
| "grad_norm": 0.4373990595340729, | |
| "learning_rate": 5.03002915451895e-05, | |
| "loss": 3.1225, | |
| "step": 62950 | |
| }, | |
| { | |
| "epoch": 18.340690699435093, | |
| "grad_norm": 0.41809549927711487, | |
| "learning_rate": 4.986297376093294e-05, | |
| "loss": 3.1376, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 18.340690699435093, | |
| "eval_accuracy": 0.3758768370851776, | |
| "eval_loss": 3.526252031326294, | |
| "eval_runtime": 179.5814, | |
| "eval_samples_per_second": 92.677, | |
| "eval_steps_per_second": 5.797, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 18.355250131034886, | |
| "grad_norm": 0.42304909229278564, | |
| "learning_rate": 4.942565597667638e-05, | |
| "loss": 3.1323, | |
| "step": 63050 | |
| }, | |
| { | |
| "epoch": 18.369809562634675, | |
| "grad_norm": 0.42781421542167664, | |
| "learning_rate": 4.898833819241982e-05, | |
| "loss": 3.134, | |
| "step": 63100 | |
| }, | |
| { | |
| "epoch": 18.384368994234464, | |
| "grad_norm": 0.42817869782447815, | |
| "learning_rate": 4.855102040816326e-05, | |
| "loss": 3.1311, | |
| "step": 63150 | |
| }, | |
| { | |
| "epoch": 18.398928425834256, | |
| "grad_norm": 0.43528953194618225, | |
| "learning_rate": 4.81137026239067e-05, | |
| "loss": 3.1295, | |
| "step": 63200 | |
| }, | |
| { | |
| "epoch": 18.413487857434045, | |
| "grad_norm": 0.4256283938884735, | |
| "learning_rate": 4.7676384839650146e-05, | |
| "loss": 3.1365, | |
| "step": 63250 | |
| }, | |
| { | |
| "epoch": 18.428047289033834, | |
| "grad_norm": 0.42997053265571594, | |
| "learning_rate": 4.723906705539358e-05, | |
| "loss": 3.138, | |
| "step": 63300 | |
| }, | |
| { | |
| "epoch": 18.442606720633627, | |
| "grad_norm": 0.4277358651161194, | |
| "learning_rate": 4.680174927113702e-05, | |
| "loss": 3.1319, | |
| "step": 63350 | |
| }, | |
| { | |
| "epoch": 18.457166152233416, | |
| "grad_norm": 0.43694230914115906, | |
| "learning_rate": 4.6364431486880465e-05, | |
| "loss": 3.1376, | |
| "step": 63400 | |
| }, | |
| { | |
| "epoch": 18.47172558383321, | |
| "grad_norm": 0.427422434091568, | |
| "learning_rate": 4.5927113702623904e-05, | |
| "loss": 3.1373, | |
| "step": 63450 | |
| }, | |
| { | |
| "epoch": 18.486285015432998, | |
| "grad_norm": 0.43156328797340393, | |
| "learning_rate": 4.548979591836734e-05, | |
| "loss": 3.1221, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 18.500844447032787, | |
| "grad_norm": 0.42131608724594116, | |
| "learning_rate": 4.505247813411078e-05, | |
| "loss": 3.1361, | |
| "step": 63550 | |
| }, | |
| { | |
| "epoch": 18.51540387863258, | |
| "grad_norm": 0.44428882002830505, | |
| "learning_rate": 4.461516034985422e-05, | |
| "loss": 3.1381, | |
| "step": 63600 | |
| }, | |
| { | |
| "epoch": 18.529963310232368, | |
| "grad_norm": 0.4337320327758789, | |
| "learning_rate": 4.417784256559766e-05, | |
| "loss": 3.1281, | |
| "step": 63650 | |
| }, | |
| { | |
| "epoch": 18.544522741832157, | |
| "grad_norm": 0.4291313588619232, | |
| "learning_rate": 4.374052478134111e-05, | |
| "loss": 3.1439, | |
| "step": 63700 | |
| }, | |
| { | |
| "epoch": 18.55908217343195, | |
| "grad_norm": 0.42422613501548767, | |
| "learning_rate": 4.330320699708454e-05, | |
| "loss": 3.1322, | |
| "step": 63750 | |
| }, | |
| { | |
| "epoch": 18.57364160503174, | |
| "grad_norm": 0.4216996729373932, | |
| "learning_rate": 4.286588921282798e-05, | |
| "loss": 3.1422, | |
| "step": 63800 | |
| }, | |
| { | |
| "epoch": 18.58820103663153, | |
| "grad_norm": 0.4323543608188629, | |
| "learning_rate": 4.242857142857143e-05, | |
| "loss": 3.1405, | |
| "step": 63850 | |
| }, | |
| { | |
| "epoch": 18.60276046823132, | |
| "grad_norm": 0.4178192615509033, | |
| "learning_rate": 4.1991253644314866e-05, | |
| "loss": 3.1384, | |
| "step": 63900 | |
| }, | |
| { | |
| "epoch": 18.61731989983111, | |
| "grad_norm": 0.4237135052680969, | |
| "learning_rate": 4.15539358600583e-05, | |
| "loss": 3.1346, | |
| "step": 63950 | |
| }, | |
| { | |
| "epoch": 18.631879331430902, | |
| "grad_norm": 0.42646321654319763, | |
| "learning_rate": 4.1116618075801745e-05, | |
| "loss": 3.1384, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 18.631879331430902, | |
| "eval_accuracy": 0.3760594447246205, | |
| "eval_loss": 3.5238232612609863, | |
| "eval_runtime": 179.56, | |
| "eval_samples_per_second": 92.688, | |
| "eval_steps_per_second": 5.798, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 18.64643876303069, | |
| "grad_norm": 0.43727925419807434, | |
| "learning_rate": 4.0679300291545185e-05, | |
| "loss": 3.1504, | |
| "step": 64050 | |
| }, | |
| { | |
| "epoch": 18.66099819463048, | |
| "grad_norm": 0.43674325942993164, | |
| "learning_rate": 4.024198250728863e-05, | |
| "loss": 3.1456, | |
| "step": 64100 | |
| }, | |
| { | |
| "epoch": 18.675557626230272, | |
| "grad_norm": 0.4338884949684143, | |
| "learning_rate": 3.980466472303207e-05, | |
| "loss": 3.1343, | |
| "step": 64150 | |
| }, | |
| { | |
| "epoch": 18.69011705783006, | |
| "grad_norm": 0.4316923916339874, | |
| "learning_rate": 3.93673469387755e-05, | |
| "loss": 3.1403, | |
| "step": 64200 | |
| }, | |
| { | |
| "epoch": 18.704676489429854, | |
| "grad_norm": 0.4154166281223297, | |
| "learning_rate": 3.893002915451895e-05, | |
| "loss": 3.134, | |
| "step": 64250 | |
| }, | |
| { | |
| "epoch": 18.719235921029643, | |
| "grad_norm": 0.43093207478523254, | |
| "learning_rate": 3.849271137026239e-05, | |
| "loss": 3.1324, | |
| "step": 64300 | |
| }, | |
| { | |
| "epoch": 18.733795352629432, | |
| "grad_norm": 0.42517316341400146, | |
| "learning_rate": 3.805539358600583e-05, | |
| "loss": 3.138, | |
| "step": 64350 | |
| }, | |
| { | |
| "epoch": 18.748354784229225, | |
| "grad_norm": 0.4249558448791504, | |
| "learning_rate": 3.761807580174926e-05, | |
| "loss": 3.1348, | |
| "step": 64400 | |
| }, | |
| { | |
| "epoch": 18.762914215829014, | |
| "grad_norm": 0.4178551137447357, | |
| "learning_rate": 3.718075801749271e-05, | |
| "loss": 3.1347, | |
| "step": 64450 | |
| }, | |
| { | |
| "epoch": 18.777473647428806, | |
| "grad_norm": 0.43049004673957825, | |
| "learning_rate": 3.674344023323615e-05, | |
| "loss": 3.1401, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 18.792033079028595, | |
| "grad_norm": 0.4242251515388489, | |
| "learning_rate": 3.6306122448979586e-05, | |
| "loss": 3.1359, | |
| "step": 64550 | |
| }, | |
| { | |
| "epoch": 18.806592510628384, | |
| "grad_norm": 0.41446393728256226, | |
| "learning_rate": 3.5868804664723026e-05, | |
| "loss": 3.1467, | |
| "step": 64600 | |
| }, | |
| { | |
| "epoch": 18.821151942228177, | |
| "grad_norm": 0.4318779408931732, | |
| "learning_rate": 3.543148688046647e-05, | |
| "loss": 3.1429, | |
| "step": 64650 | |
| }, | |
| { | |
| "epoch": 18.835711373827966, | |
| "grad_norm": 0.4431349039077759, | |
| "learning_rate": 3.499416909620991e-05, | |
| "loss": 3.1446, | |
| "step": 64700 | |
| }, | |
| { | |
| "epoch": 18.850270805427755, | |
| "grad_norm": 0.44169580936431885, | |
| "learning_rate": 3.455685131195335e-05, | |
| "loss": 3.1393, | |
| "step": 64750 | |
| }, | |
| { | |
| "epoch": 18.864830237027547, | |
| "grad_norm": 0.42799896001815796, | |
| "learning_rate": 3.411953352769679e-05, | |
| "loss": 3.1429, | |
| "step": 64800 | |
| }, | |
| { | |
| "epoch": 18.879389668627336, | |
| "grad_norm": 0.42514893412590027, | |
| "learning_rate": 3.368221574344023e-05, | |
| "loss": 3.1263, | |
| "step": 64850 | |
| }, | |
| { | |
| "epoch": 18.893949100227125, | |
| "grad_norm": 0.4253270626068115, | |
| "learning_rate": 3.324489795918367e-05, | |
| "loss": 3.1494, | |
| "step": 64900 | |
| }, | |
| { | |
| "epoch": 18.908508531826918, | |
| "grad_norm": 0.4338584244251251, | |
| "learning_rate": 3.280758017492711e-05, | |
| "loss": 3.1512, | |
| "step": 64950 | |
| }, | |
| { | |
| "epoch": 18.923067963426707, | |
| "grad_norm": 0.42179763317108154, | |
| "learning_rate": 3.237026239067055e-05, | |
| "loss": 3.1365, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 18.923067963426707, | |
| "eval_accuracy": 0.3764943871961591, | |
| "eval_loss": 3.520042657852173, | |
| "eval_runtime": 179.62, | |
| "eval_samples_per_second": 92.657, | |
| "eval_steps_per_second": 5.796, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 18.9376273950265, | |
| "grad_norm": 0.4421677589416504, | |
| "learning_rate": 3.1932944606413995e-05, | |
| "loss": 3.1499, | |
| "step": 65050 | |
| }, | |
| { | |
| "epoch": 18.95218682662629, | |
| "grad_norm": 0.4211215674877167, | |
| "learning_rate": 3.149562682215743e-05, | |
| "loss": 3.1447, | |
| "step": 65100 | |
| }, | |
| { | |
| "epoch": 18.966746258226078, | |
| "grad_norm": 0.42026522755622864, | |
| "learning_rate": 3.1058309037900874e-05, | |
| "loss": 3.1472, | |
| "step": 65150 | |
| }, | |
| { | |
| "epoch": 18.98130568982587, | |
| "grad_norm": 0.42903971672058105, | |
| "learning_rate": 3.062099125364431e-05, | |
| "loss": 3.1352, | |
| "step": 65200 | |
| }, | |
| { | |
| "epoch": 18.99586512142566, | |
| "grad_norm": 0.4268266558647156, | |
| "learning_rate": 3.0183673469387753e-05, | |
| "loss": 3.1464, | |
| "step": 65250 | |
| }, | |
| { | |
| "epoch": 19.010191602119853, | |
| "grad_norm": 0.43615421652793884, | |
| "learning_rate": 2.9746355685131196e-05, | |
| "loss": 3.1255, | |
| "step": 65300 | |
| }, | |
| { | |
| "epoch": 19.024751033719642, | |
| "grad_norm": 0.43458208441734314, | |
| "learning_rate": 2.9309037900874632e-05, | |
| "loss": 3.1095, | |
| "step": 65350 | |
| }, | |
| { | |
| "epoch": 19.039310465319435, | |
| "grad_norm": 0.4229234457015991, | |
| "learning_rate": 2.8871720116618075e-05, | |
| "loss": 3.119, | |
| "step": 65400 | |
| }, | |
| { | |
| "epoch": 19.053869896919224, | |
| "grad_norm": 0.42766526341438293, | |
| "learning_rate": 2.8434402332361514e-05, | |
| "loss": 3.1083, | |
| "step": 65450 | |
| }, | |
| { | |
| "epoch": 19.068429328519013, | |
| "grad_norm": 0.4272925853729248, | |
| "learning_rate": 2.7997084548104954e-05, | |
| "loss": 3.0968, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 19.082988760118806, | |
| "grad_norm": 0.43676096200942993, | |
| "learning_rate": 2.7559766763848393e-05, | |
| "loss": 3.1023, | |
| "step": 65550 | |
| }, | |
| { | |
| "epoch": 19.097548191718595, | |
| "grad_norm": 0.4115598797798157, | |
| "learning_rate": 2.7122448979591836e-05, | |
| "loss": 3.1031, | |
| "step": 65600 | |
| }, | |
| { | |
| "epoch": 19.112107623318387, | |
| "grad_norm": 0.4189133048057556, | |
| "learning_rate": 2.6685131195335272e-05, | |
| "loss": 3.0964, | |
| "step": 65650 | |
| }, | |
| { | |
| "epoch": 19.126667054918176, | |
| "grad_norm": 0.4264732003211975, | |
| "learning_rate": 2.6247813411078715e-05, | |
| "loss": 3.0977, | |
| "step": 65700 | |
| }, | |
| { | |
| "epoch": 19.141226486517965, | |
| "grad_norm": 0.4251263439655304, | |
| "learning_rate": 2.5810495626822158e-05, | |
| "loss": 3.1168, | |
| "step": 65750 | |
| }, | |
| { | |
| "epoch": 19.155785918117758, | |
| "grad_norm": 0.41753003001213074, | |
| "learning_rate": 2.5373177842565594e-05, | |
| "loss": 3.1148, | |
| "step": 65800 | |
| }, | |
| { | |
| "epoch": 19.170345349717547, | |
| "grad_norm": 0.4261728525161743, | |
| "learning_rate": 2.4935860058309037e-05, | |
| "loss": 3.1159, | |
| "step": 65850 | |
| }, | |
| { | |
| "epoch": 19.184904781317336, | |
| "grad_norm": 0.4432784914970398, | |
| "learning_rate": 2.4498542274052476e-05, | |
| "loss": 3.1098, | |
| "step": 65900 | |
| }, | |
| { | |
| "epoch": 19.19946421291713, | |
| "grad_norm": 0.4324243366718292, | |
| "learning_rate": 2.406122448979592e-05, | |
| "loss": 3.1032, | |
| "step": 65950 | |
| }, | |
| { | |
| "epoch": 19.214023644516917, | |
| "grad_norm": 0.42517364025115967, | |
| "learning_rate": 2.3623906705539355e-05, | |
| "loss": 3.1048, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 19.214023644516917, | |
| "eval_accuracy": 0.37641925114876434, | |
| "eval_loss": 3.522754430770874, | |
| "eval_runtime": 179.766, | |
| "eval_samples_per_second": 92.581, | |
| "eval_steps_per_second": 5.791, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 19.22858307611671, | |
| "grad_norm": 0.43855613470077515, | |
| "learning_rate": 2.3186588921282798e-05, | |
| "loss": 3.1086, | |
| "step": 66050 | |
| }, | |
| { | |
| "epoch": 19.2431425077165, | |
| "grad_norm": 0.42424947023391724, | |
| "learning_rate": 2.2749271137026234e-05, | |
| "loss": 3.1104, | |
| "step": 66100 | |
| }, | |
| { | |
| "epoch": 19.257701939316288, | |
| "grad_norm": 0.4278988540172577, | |
| "learning_rate": 2.2311953352769677e-05, | |
| "loss": 3.1145, | |
| "step": 66150 | |
| }, | |
| { | |
| "epoch": 19.27226137091608, | |
| "grad_norm": 0.4234248399734497, | |
| "learning_rate": 2.1874635568513116e-05, | |
| "loss": 3.1091, | |
| "step": 66200 | |
| }, | |
| { | |
| "epoch": 19.28682080251587, | |
| "grad_norm": 0.42990297079086304, | |
| "learning_rate": 2.143731778425656e-05, | |
| "loss": 3.1092, | |
| "step": 66250 | |
| }, | |
| { | |
| "epoch": 19.30138023411566, | |
| "grad_norm": 0.4154321849346161, | |
| "learning_rate": 2.1e-05, | |
| "loss": 3.1191, | |
| "step": 66300 | |
| }, | |
| { | |
| "epoch": 19.31593966571545, | |
| "grad_norm": 0.42328861355781555, | |
| "learning_rate": 2.056268221574344e-05, | |
| "loss": 3.1237, | |
| "step": 66350 | |
| }, | |
| { | |
| "epoch": 19.33049909731524, | |
| "grad_norm": 0.4259921610355377, | |
| "learning_rate": 2.012536443148688e-05, | |
| "loss": 3.1164, | |
| "step": 66400 | |
| }, | |
| { | |
| "epoch": 19.345058528915033, | |
| "grad_norm": 0.4307624101638794, | |
| "learning_rate": 1.9688046647230317e-05, | |
| "loss": 3.1103, | |
| "step": 66450 | |
| }, | |
| { | |
| "epoch": 19.35961796051482, | |
| "grad_norm": 0.43328267335891724, | |
| "learning_rate": 1.925072886297376e-05, | |
| "loss": 3.1119, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 19.37417739211461, | |
| "grad_norm": 0.4142916798591614, | |
| "learning_rate": 1.88134110787172e-05, | |
| "loss": 3.1167, | |
| "step": 66550 | |
| }, | |
| { | |
| "epoch": 19.388736823714403, | |
| "grad_norm": 0.4262329041957855, | |
| "learning_rate": 1.837609329446064e-05, | |
| "loss": 3.1089, | |
| "step": 66600 | |
| }, | |
| { | |
| "epoch": 19.403296255314192, | |
| "grad_norm": 0.4410352110862732, | |
| "learning_rate": 1.7938775510204082e-05, | |
| "loss": 3.1179, | |
| "step": 66650 | |
| }, | |
| { | |
| "epoch": 19.41785568691398, | |
| "grad_norm": 0.4364875853061676, | |
| "learning_rate": 1.750145772594752e-05, | |
| "loss": 3.1097, | |
| "step": 66700 | |
| }, | |
| { | |
| "epoch": 19.432415118513774, | |
| "grad_norm": 0.4255106449127197, | |
| "learning_rate": 1.706413994169096e-05, | |
| "loss": 3.1238, | |
| "step": 66750 | |
| }, | |
| { | |
| "epoch": 19.446974550113563, | |
| "grad_norm": 0.4275910258293152, | |
| "learning_rate": 1.66268221574344e-05, | |
| "loss": 3.1268, | |
| "step": 66800 | |
| }, | |
| { | |
| "epoch": 19.461533981713355, | |
| "grad_norm": 0.42917412519454956, | |
| "learning_rate": 1.618950437317784e-05, | |
| "loss": 3.1079, | |
| "step": 66850 | |
| }, | |
| { | |
| "epoch": 19.476093413313144, | |
| "grad_norm": 0.4198929965496063, | |
| "learning_rate": 1.5752186588921283e-05, | |
| "loss": 3.1104, | |
| "step": 66900 | |
| }, | |
| { | |
| "epoch": 19.490652844912933, | |
| "grad_norm": 0.43348103761672974, | |
| "learning_rate": 1.5314868804664722e-05, | |
| "loss": 3.1199, | |
| "step": 66950 | |
| }, | |
| { | |
| "epoch": 19.505212276512726, | |
| "grad_norm": 0.4270970821380615, | |
| "learning_rate": 1.4877551020408162e-05, | |
| "loss": 3.1252, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 19.505212276512726, | |
| "eval_accuracy": 0.3765385987044852, | |
| "eval_loss": 3.52213716506958, | |
| "eval_runtime": 179.576, | |
| "eval_samples_per_second": 92.679, | |
| "eval_steps_per_second": 5.797, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 19.519771708112515, | |
| "grad_norm": 0.42685285210609436, | |
| "learning_rate": 1.4440233236151601e-05, | |
| "loss": 3.112, | |
| "step": 67050 | |
| }, | |
| { | |
| "epoch": 19.534331139712307, | |
| "grad_norm": 0.4240424633026123, | |
| "learning_rate": 1.400291545189504e-05, | |
| "loss": 3.115, | |
| "step": 67100 | |
| }, | |
| { | |
| "epoch": 19.548890571312096, | |
| "grad_norm": 0.41864222288131714, | |
| "learning_rate": 1.3565597667638484e-05, | |
| "loss": 3.1075, | |
| "step": 67150 | |
| }, | |
| { | |
| "epoch": 19.563450002911885, | |
| "grad_norm": 0.430819571018219, | |
| "learning_rate": 1.3128279883381923e-05, | |
| "loss": 3.102, | |
| "step": 67200 | |
| }, | |
| { | |
| "epoch": 19.578009434511678, | |
| "grad_norm": 0.45127809047698975, | |
| "learning_rate": 1.2690962099125364e-05, | |
| "loss": 3.1151, | |
| "step": 67250 | |
| }, | |
| { | |
| "epoch": 19.592568866111467, | |
| "grad_norm": 0.4324467182159424, | |
| "learning_rate": 1.2253644314868804e-05, | |
| "loss": 3.1218, | |
| "step": 67300 | |
| }, | |
| { | |
| "epoch": 19.607128297711256, | |
| "grad_norm": 0.41157129406929016, | |
| "learning_rate": 1.1816326530612243e-05, | |
| "loss": 3.1168, | |
| "step": 67350 | |
| }, | |
| { | |
| "epoch": 19.62168772931105, | |
| "grad_norm": 0.4352055788040161, | |
| "learning_rate": 1.1379008746355684e-05, | |
| "loss": 3.1135, | |
| "step": 67400 | |
| }, | |
| { | |
| "epoch": 19.636247160910838, | |
| "grad_norm": 0.4236614406108856, | |
| "learning_rate": 1.0941690962099124e-05, | |
| "loss": 3.1209, | |
| "step": 67450 | |
| }, | |
| { | |
| "epoch": 19.650806592510627, | |
| "grad_norm": 0.4268696904182434, | |
| "learning_rate": 1.0504373177842565e-05, | |
| "loss": 3.119, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 19.66536602411042, | |
| "grad_norm": 0.42168357968330383, | |
| "learning_rate": 1.0067055393586005e-05, | |
| "loss": 3.092, | |
| "step": 67550 | |
| }, | |
| { | |
| "epoch": 19.679925455710208, | |
| "grad_norm": 0.4286148250102997, | |
| "learning_rate": 9.629737609329444e-06, | |
| "loss": 3.1094, | |
| "step": 67600 | |
| }, | |
| { | |
| "epoch": 19.69448488731, | |
| "grad_norm": 0.4331967830657959, | |
| "learning_rate": 9.192419825072885e-06, | |
| "loss": 3.1227, | |
| "step": 67650 | |
| }, | |
| { | |
| "epoch": 19.70904431890979, | |
| "grad_norm": 0.4273418188095093, | |
| "learning_rate": 8.755102040816326e-06, | |
| "loss": 3.1184, | |
| "step": 67700 | |
| }, | |
| { | |
| "epoch": 19.72360375050958, | |
| "grad_norm": 0.44387224316596985, | |
| "learning_rate": 8.317784256559766e-06, | |
| "loss": 3.1179, | |
| "step": 67750 | |
| }, | |
| { | |
| "epoch": 19.73816318210937, | |
| "grad_norm": 0.4357852637767792, | |
| "learning_rate": 7.880466472303207e-06, | |
| "loss": 3.1163, | |
| "step": 67800 | |
| }, | |
| { | |
| "epoch": 19.75272261370916, | |
| "grad_norm": 0.41910186409950256, | |
| "learning_rate": 7.443148688046647e-06, | |
| "loss": 3.1157, | |
| "step": 67850 | |
| }, | |
| { | |
| "epoch": 19.767282045308953, | |
| "grad_norm": 0.4404396414756775, | |
| "learning_rate": 7.005830903790087e-06, | |
| "loss": 3.1209, | |
| "step": 67900 | |
| }, | |
| { | |
| "epoch": 19.781841476908742, | |
| "grad_norm": 0.4255041182041168, | |
| "learning_rate": 6.568513119533527e-06, | |
| "loss": 3.1139, | |
| "step": 67950 | |
| }, | |
| { | |
| "epoch": 19.79640090850853, | |
| "grad_norm": 0.42811915278434753, | |
| "learning_rate": 6.1311953352769675e-06, | |
| "loss": 3.1198, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 19.79640090850853, | |
| "eval_accuracy": 0.3765822222938177, | |
| "eval_loss": 3.520904064178467, | |
| "eval_runtime": 179.5918, | |
| "eval_samples_per_second": 92.671, | |
| "eval_steps_per_second": 5.796, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 19.810960340108323, | |
| "grad_norm": 0.43425923585891724, | |
| "learning_rate": 5.693877551020407e-06, | |
| "loss": 3.1135, | |
| "step": 68050 | |
| }, | |
| { | |
| "epoch": 19.825519771708112, | |
| "grad_norm": 0.4284214377403259, | |
| "learning_rate": 5.256559766763848e-06, | |
| "loss": 3.1227, | |
| "step": 68100 | |
| }, | |
| { | |
| "epoch": 19.8400792033079, | |
| "grad_norm": 0.43212711811065674, | |
| "learning_rate": 4.8192419825072884e-06, | |
| "loss": 3.1076, | |
| "step": 68150 | |
| }, | |
| { | |
| "epoch": 19.854638634907694, | |
| "grad_norm": 0.41544848680496216, | |
| "learning_rate": 4.381924198250729e-06, | |
| "loss": 3.1172, | |
| "step": 68200 | |
| }, | |
| { | |
| "epoch": 19.869198066507483, | |
| "grad_norm": 0.41702643036842346, | |
| "learning_rate": 3.944606413994168e-06, | |
| "loss": 3.1174, | |
| "step": 68250 | |
| }, | |
| { | |
| "epoch": 19.883757498107276, | |
| "grad_norm": 0.432802677154541, | |
| "learning_rate": 3.5072886297376094e-06, | |
| "loss": 3.1218, | |
| "step": 68300 | |
| }, | |
| { | |
| "epoch": 19.898316929707065, | |
| "grad_norm": 0.4204798936843872, | |
| "learning_rate": 3.0699708454810493e-06, | |
| "loss": 3.1128, | |
| "step": 68350 | |
| }, | |
| { | |
| "epoch": 19.912876361306854, | |
| "grad_norm": 0.43337222933769226, | |
| "learning_rate": 2.6326530612244892e-06, | |
| "loss": 3.1171, | |
| "step": 68400 | |
| }, | |
| { | |
| "epoch": 19.927435792906646, | |
| "grad_norm": 0.4287244975566864, | |
| "learning_rate": 2.1953352769679296e-06, | |
| "loss": 3.1083, | |
| "step": 68450 | |
| }, | |
| { | |
| "epoch": 19.941995224506435, | |
| "grad_norm": 0.4293520152568817, | |
| "learning_rate": 1.7580174927113701e-06, | |
| "loss": 3.1071, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 19.956554656106224, | |
| "grad_norm": 0.43255236744880676, | |
| "learning_rate": 1.3206997084548104e-06, | |
| "loss": 3.1085, | |
| "step": 68550 | |
| }, | |
| { | |
| "epoch": 19.971114087706017, | |
| "grad_norm": 0.42116379737854004, | |
| "learning_rate": 8.833819241982507e-07, | |
| "loss": 3.1196, | |
| "step": 68600 | |
| }, | |
| { | |
| "epoch": 19.985673519305806, | |
| "grad_norm": 0.4275839328765869, | |
| "learning_rate": 4.4606413994169093e-07, | |
| "loss": 3.1015, | |
| "step": 68650 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "grad_norm": 1.8701350688934326, | |
| "learning_rate": 8.746355685131195e-09, | |
| "loss": 3.1135, | |
| "step": 68700 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "step": 68700, | |
| "total_flos": 1.43566384398336e+18, | |
| "train_loss": 3.438475851533715, | |
| "train_runtime": 136533.177, | |
| "train_samples_per_second": 40.243, | |
| "train_steps_per_second": 0.503 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 68700, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 20, | |
| "save_steps": 10000, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 20, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 3 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.43566384398336e+18, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |