{
  "best_metric": 3.746429681777954,
  "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M_low_2000_634/checkpoint-10000",
  "epoch": 1.0781671159029649,
  "eval_steps": 1000,
  "global_step": 10000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005390835579514825,
      "grad_norm": 1.4145963191986084,
      "learning_rate": 0.0003,
      "loss": 8.6259,
      "step": 50
    },
    {
      "epoch": 0.01078167115902965,
      "grad_norm": 3.1651010513305664,
      "learning_rate": 0.0006,
      "loss": 6.9317,
      "step": 100
    },
    {
      "epoch": 0.016172506738544475,
      "grad_norm": 2.394423723220825,
      "learning_rate": 0.0005996762007555315,
      "loss": 6.4779,
      "step": 150
    },
    {
      "epoch": 0.0215633423180593,
      "grad_norm": 1.727589726448059,
      "learning_rate": 0.000599352401511063,
      "loss": 6.2178,
      "step": 200
    },
    {
      "epoch": 0.026954177897574125,
      "grad_norm": 1.7129422426223755,
      "learning_rate": 0.0005990286022665946,
      "loss": 6.0805,
      "step": 250
    },
    {
      "epoch": 0.03234501347708895,
      "grad_norm": 1.352902889251709,
      "learning_rate": 0.0005987048030221263,
      "loss": 5.9783,
      "step": 300
    },
    {
      "epoch": 0.03773584905660377,
      "grad_norm": 2.4004905223846436,
      "learning_rate": 0.0005983810037776578,
      "loss": 5.8715,
      "step": 350
    },
    {
      "epoch": 0.0431266846361186,
      "grad_norm": 1.5558369159698486,
      "learning_rate": 0.0005980572045331894,
      "loss": 5.7796,
      "step": 400
    },
    {
      "epoch": 0.04851752021563342,
      "grad_norm": 1.405604362487793,
      "learning_rate": 0.0005977334052887209,
      "loss": 5.7133,
      "step": 450
    },
    {
      "epoch": 0.05390835579514825,
      "grad_norm": 1.7021785974502563,
      "learning_rate": 0.0005974096060442526,
      "loss": 5.6534,
      "step": 500
    },
    {
      "epoch": 0.05929919137466307,
      "grad_norm": 1.6914323568344116,
      "learning_rate": 0.0005970858067997841,
      "loss": 5.5752,
      "step": 550
    },
    {
      "epoch": 0.0646900269541779,
      "grad_norm": 1.8869633674621582,
      "learning_rate": 0.0005967620075553157,
      "loss": 5.5078,
      "step": 600
    },
    {
      "epoch": 0.07008086253369272,
      "grad_norm": 0.8667061924934387,
      "learning_rate": 0.0005964382083108472,
      "loss": 5.418,
      "step": 650
    },
    {
      "epoch": 0.07547169811320754,
      "grad_norm": 1.124100685119629,
      "learning_rate": 0.0005961144090663788,
      "loss": 5.3656,
      "step": 700
    },
    {
      "epoch": 0.08086253369272237,
      "grad_norm": 1.3503167629241943,
      "learning_rate": 0.0005957906098219104,
      "loss": 5.2977,
      "step": 750
    },
    {
      "epoch": 0.0862533692722372,
      "grad_norm": 1.0930562019348145,
      "learning_rate": 0.0005954668105774419,
      "loss": 5.2523,
      "step": 800
    },
    {
      "epoch": 0.09164420485175202,
      "grad_norm": 0.9786685705184937,
      "learning_rate": 0.0005951430113329735,
      "loss": 5.1895,
      "step": 850
    },
    {
      "epoch": 0.09703504043126684,
      "grad_norm": 1.2884083986282349,
      "learning_rate": 0.0005948192120885051,
      "loss": 5.168,
      "step": 900
    },
    {
      "epoch": 0.10242587601078167,
      "grad_norm": 0.9569465517997742,
      "learning_rate": 0.0005944954128440366,
      "loss": 5.1311,
      "step": 950
    },
    {
      "epoch": 0.1078167115902965,
      "grad_norm": 1.0159235000610352,
      "learning_rate": 0.0005941716135995682,
      "loss": 5.0746,
      "step": 1000
    },
    {
      "epoch": 0.1078167115902965,
      "eval_accuracy": 0.22805328468001887,
      "eval_loss": 5.009091377258301,
      "eval_runtime": 181.7061,
      "eval_samples_per_second": 99.122,
      "eval_steps_per_second": 6.197,
      "step": 1000
    },
    {
      "epoch": 0.11320754716981132,
      "grad_norm": 1.0572841167449951,
      "learning_rate": 0.0005938478143550997,
      "loss": 5.0385,
      "step": 1050
    },
    {
      "epoch": 0.11859838274932614,
      "grad_norm": 1.0152630805969238,
      "learning_rate": 0.0005935240151106314,
      "loss": 5.0085,
      "step": 1100
    },
    {
      "epoch": 0.12398921832884097,
      "grad_norm": 1.1099059581756592,
      "learning_rate": 0.0005932002158661629,
      "loss": 4.9873,
      "step": 1150
    },
    {
      "epoch": 0.1293800539083558,
      "grad_norm": 0.9290792346000671,
      "learning_rate": 0.0005928764166216945,
      "loss": 4.9231,
      "step": 1200
    },
    {
      "epoch": 0.1347708894878706,
      "grad_norm": 0.9619135856628418,
      "learning_rate": 0.000592552617377226,
      "loss": 4.901,
      "step": 1250
    },
    {
      "epoch": 0.14016172506738545,
      "grad_norm": 1.0886056423187256,
      "learning_rate": 0.0005922288181327577,
      "loss": 4.8781,
      "step": 1300
    },
    {
      "epoch": 0.14555256064690028,
      "grad_norm": 0.8178435564041138,
      "learning_rate": 0.0005919050188882893,
      "loss": 4.8423,
      "step": 1350
    },
    {
      "epoch": 0.1509433962264151,
      "grad_norm": 0.95741206407547,
      "learning_rate": 0.0005915812196438207,
      "loss": 4.8349,
      "step": 1400
    },
    {
      "epoch": 0.15633423180592992,
      "grad_norm": 1.099839448928833,
      "learning_rate": 0.0005912574203993524,
      "loss": 4.8422,
      "step": 1450
    },
    {
      "epoch": 0.16172506738544473,
      "grad_norm": 0.8492836952209473,
      "learning_rate": 0.0005909336211548839,
      "loss": 4.8014,
      "step": 1500
    },
    {
      "epoch": 0.16711590296495957,
      "grad_norm": 0.9975847601890564,
      "learning_rate": 0.0005906098219104155,
      "loss": 4.7623,
      "step": 1550
    },
    {
      "epoch": 0.1725067385444744,
      "grad_norm": 0.9260163307189941,
      "learning_rate": 0.000590286022665947,
      "loss": 4.7475,
      "step": 1600
    },
    {
      "epoch": 0.1778975741239892,
      "grad_norm": 0.8640759587287903,
      "learning_rate": 0.0005899622234214787,
      "loss": 4.7228,
      "step": 1650
    },
    {
      "epoch": 0.18328840970350405,
      "grad_norm": 1.009647011756897,
      "learning_rate": 0.0005896384241770102,
      "loss": 4.6961,
      "step": 1700
    },
    {
      "epoch": 0.18867924528301888,
      "grad_norm": 0.8946645259857178,
      "learning_rate": 0.0005893146249325418,
      "loss": 4.705,
      "step": 1750
    },
    {
      "epoch": 0.1940700808625337,
      "grad_norm": 1.1306875944137573,
      "learning_rate": 0.0005889908256880733,
      "loss": 4.6712,
      "step": 1800
    },
    {
      "epoch": 0.19946091644204852,
      "grad_norm": 1.0593204498291016,
      "learning_rate": 0.0005886670264436049,
      "loss": 4.6409,
      "step": 1850
    },
    {
      "epoch": 0.20485175202156333,
      "grad_norm": 0.8434944748878479,
      "learning_rate": 0.0005883432271991365,
      "loss": 4.6297,
      "step": 1900
    },
    {
      "epoch": 0.21024258760107817,
      "grad_norm": 0.8913785815238953,
      "learning_rate": 0.0005880194279546681,
      "loss": 4.6276,
      "step": 1950
    },
    {
      "epoch": 0.215633423180593,
      "grad_norm": 0.7697395086288452,
      "learning_rate": 0.0005876956287101996,
      "loss": 4.5779,
      "step": 2000
    },
    {
      "epoch": 0.215633423180593,
      "eval_accuracy": 0.2714229540742714,
      "eval_loss": 4.502364158630371,
      "eval_runtime": 181.696,
      "eval_samples_per_second": 99.127,
      "eval_steps_per_second": 6.197,
      "step": 2000
    },
    {
      "epoch": 0.2210242587601078,
      "grad_norm": 0.9948697686195374,
      "learning_rate": 0.0005873718294657312,
      "loss": 4.5556,
      "step": 2050
    },
    {
      "epoch": 0.22641509433962265,
      "grad_norm": 0.8819608688354492,
      "learning_rate": 0.0005870480302212628,
      "loss": 4.5434,
      "step": 2100
    },
    {
      "epoch": 0.23180592991913745,
      "grad_norm": 0.8973624110221863,
      "learning_rate": 0.0005867242309767943,
      "loss": 4.5261,
      "step": 2150
    },
    {
      "epoch": 0.2371967654986523,
      "grad_norm": 0.8225829601287842,
      "learning_rate": 0.0005864004317323259,
      "loss": 4.4894,
      "step": 2200
    },
    {
      "epoch": 0.24258760107816713,
      "grad_norm": 0.7101904153823853,
      "learning_rate": 0.0005860766324878575,
      "loss": 4.5017,
      "step": 2250
    },
    {
      "epoch": 0.24797843665768193,
      "grad_norm": 0.8717466592788696,
      "learning_rate": 0.000585752833243389,
      "loss": 4.4742,
      "step": 2300
    },
    {
      "epoch": 0.25336927223719674,
      "grad_norm": 0.9836375117301941,
      "learning_rate": 0.0005854290339989206,
      "loss": 4.4495,
      "step": 2350
    },
    {
      "epoch": 0.2587601078167116,
      "grad_norm": 0.9477341175079346,
      "learning_rate": 0.0005851052347544521,
      "loss": 4.4442,
      "step": 2400
    },
    {
      "epoch": 0.2641509433962264,
      "grad_norm": 0.7532225847244263,
      "learning_rate": 0.0005847814355099838,
      "loss": 4.4306,
      "step": 2450
    },
    {
      "epoch": 0.2695417789757412,
      "grad_norm": 0.6514259576797485,
      "learning_rate": 0.0005844576362655154,
      "loss": 4.4197,
      "step": 2500
    },
    {
      "epoch": 0.2749326145552561,
      "grad_norm": 0.9490562081336975,
      "learning_rate": 0.0005841338370210469,
      "loss": 4.3958,
      "step": 2550
    },
    {
      "epoch": 0.2803234501347709,
      "grad_norm": 0.856418788433075,
      "learning_rate": 0.0005838100377765785,
      "loss": 4.3722,
      "step": 2600
    },
    {
      "epoch": 0.2857142857142857,
      "grad_norm": 0.8336089849472046,
      "learning_rate": 0.0005834862385321101,
      "loss": 4.3709,
      "step": 2650
    },
    {
      "epoch": 0.29110512129380056,
      "grad_norm": 0.8533580899238586,
      "learning_rate": 0.0005831624392876417,
      "loss": 4.3617,
      "step": 2700
    },
    {
      "epoch": 0.29649595687331537,
      "grad_norm": 1.0777424573898315,
      "learning_rate": 0.0005828386400431731,
      "loss": 4.3696,
      "step": 2750
    },
    {
      "epoch": 0.3018867924528302,
      "grad_norm": 0.7660175561904907,
      "learning_rate": 0.0005825148407987048,
      "loss": 4.3629,
      "step": 2800
    },
    {
      "epoch": 0.30727762803234504,
      "grad_norm": 0.8766116499900818,
      "learning_rate": 0.0005821910415542363,
      "loss": 4.3306,
      "step": 2850
    },
    {
      "epoch": 0.31266846361185985,
      "grad_norm": 0.9792775511741638,
      "learning_rate": 0.0005818672423097679,
      "loss": 4.3132,
      "step": 2900
    },
    {
      "epoch": 0.31805929919137466,
      "grad_norm": 0.8375799059867859,
      "learning_rate": 0.0005815434430652994,
      "loss": 4.3457,
      "step": 2950
    },
    {
      "epoch": 0.32345013477088946,
      "grad_norm": 0.7174794673919678,
      "learning_rate": 0.0005812196438208311,
      "loss": 4.3057,
      "step": 3000
    },
    {
      "epoch": 0.32345013477088946,
      "eval_accuracy": 0.29961283716485065,
      "eval_loss": 4.228518486022949,
      "eval_runtime": 181.6163,
      "eval_samples_per_second": 99.171,
      "eval_steps_per_second": 6.2,
      "step": 3000
    },
    {
      "epoch": 0.3288409703504043,
      "grad_norm": 0.6986973285675049,
      "learning_rate": 0.0005808958445763626,
      "loss": 4.2999,
      "step": 3050
    },
    {
      "epoch": 0.33423180592991913,
      "grad_norm": 0.7411074042320251,
      "learning_rate": 0.0005805720453318942,
      "loss": 4.2815,
      "step": 3100
    },
    {
      "epoch": 0.33962264150943394,
      "grad_norm": 0.8088811635971069,
      "learning_rate": 0.0005802482460874257,
      "loss": 4.2689,
      "step": 3150
    },
    {
      "epoch": 0.3450134770889488,
      "grad_norm": 0.80063396692276,
      "learning_rate": 0.0005799244468429573,
      "loss": 4.2754,
      "step": 3200
    },
    {
      "epoch": 0.3504043126684636,
      "grad_norm": 0.8073013424873352,
      "learning_rate": 0.0005796006475984889,
      "loss": 4.2485,
      "step": 3250
    },
    {
      "epoch": 0.3557951482479784,
      "grad_norm": 0.7534303665161133,
      "learning_rate": 0.0005792768483540205,
      "loss": 4.2577,
      "step": 3300
    },
    {
      "epoch": 0.3611859838274933,
      "grad_norm": 0.9090972542762756,
      "learning_rate": 0.000578953049109552,
      "loss": 4.2544,
      "step": 3350
    },
    {
      "epoch": 0.3665768194070081,
      "grad_norm": 0.741401731967926,
      "learning_rate": 0.0005786292498650836,
      "loss": 4.231,
      "step": 3400
    },
    {
      "epoch": 0.3719676549865229,
      "grad_norm": 0.9652010202407837,
      "learning_rate": 0.0005783054506206152,
      "loss": 4.223,
      "step": 3450
    },
    {
      "epoch": 0.37735849056603776,
      "grad_norm": 0.6787908673286438,
      "learning_rate": 0.0005779816513761467,
      "loss": 4.2163,
      "step": 3500
    },
    {
      "epoch": 0.38274932614555257,
      "grad_norm": 0.63587486743927,
      "learning_rate": 0.0005776578521316782,
      "loss": 4.1965,
      "step": 3550
    },
    {
      "epoch": 0.3881401617250674,
      "grad_norm": 0.8312519788742065,
      "learning_rate": 0.0005773340528872099,
      "loss": 4.2034,
      "step": 3600
    },
    {
      "epoch": 0.3935309973045822,
      "grad_norm": 0.82889324426651,
      "learning_rate": 0.0005770102536427414,
      "loss": 4.2196,
      "step": 3650
    },
    {
      "epoch": 0.39892183288409705,
      "grad_norm": 0.6806962490081787,
      "learning_rate": 0.000576686454398273,
      "loss": 4.2082,
      "step": 3700
    },
    {
      "epoch": 0.40431266846361186,
      "grad_norm": 0.7568163871765137,
      "learning_rate": 0.0005763626551538045,
      "loss": 4.1913,
      "step": 3750
    },
    {
      "epoch": 0.40970350404312667,
      "grad_norm": 0.6309810876846313,
      "learning_rate": 0.0005760388559093362,
      "loss": 4.1834,
      "step": 3800
    },
    {
      "epoch": 0.41509433962264153,
      "grad_norm": 0.6354190707206726,
      "learning_rate": 0.0005757150566648678,
      "loss": 4.1777,
      "step": 3850
    },
    {
      "epoch": 0.42048517520215634,
      "grad_norm": 0.8854219317436218,
      "learning_rate": 0.0005753912574203993,
      "loss": 4.1682,
      "step": 3900
    },
    {
      "epoch": 0.42587601078167114,
      "grad_norm": 0.7516558766365051,
      "learning_rate": 0.0005750674581759309,
      "loss": 4.1666,
      "step": 3950
    },
    {
      "epoch": 0.431266846361186,
      "grad_norm": 0.8482111692428589,
      "learning_rate": 0.0005747436589314624,
      "loss": 4.154,
      "step": 4000
    },
    {
      "epoch": 0.431266846361186,
      "eval_accuracy": 0.3124987437009846,
      "eval_loss": 4.088163375854492,
      "eval_runtime": 181.3707,
      "eval_samples_per_second": 99.305,
      "eval_steps_per_second": 6.208,
      "step": 4000
    },
    {
      "epoch": 0.4366576819407008,
      "grad_norm": 0.7016038298606873,
      "learning_rate": 0.0005744198596869941,
      "loss": 4.1634,
      "step": 4050
    },
    {
      "epoch": 0.4420485175202156,
      "grad_norm": 0.5946022868156433,
      "learning_rate": 0.0005740960604425255,
      "loss": 4.1587,
      "step": 4100
    },
    {
      "epoch": 0.4474393530997305,
      "grad_norm": 0.6194251775741577,
      "learning_rate": 0.0005737722611980572,
      "loss": 4.1437,
      "step": 4150
    },
    {
      "epoch": 0.4528301886792453,
      "grad_norm": 0.6552414894104004,
      "learning_rate": 0.0005734484619535887,
      "loss": 4.1145,
      "step": 4200
    },
    {
      "epoch": 0.4582210242587601,
      "grad_norm": 0.6080745458602905,
      "learning_rate": 0.0005731246627091203,
      "loss": 4.1491,
      "step": 4250
    },
    {
      "epoch": 0.4636118598382749,
      "grad_norm": 0.6025022864341736,
      "learning_rate": 0.0005728008634646518,
      "loss": 4.1221,
      "step": 4300
    },
    {
      "epoch": 0.46900269541778977,
      "grad_norm": 0.7329853177070618,
      "learning_rate": 0.0005724770642201835,
      "loss": 4.1343,
      "step": 4350
    },
    {
      "epoch": 0.4743935309973046,
      "grad_norm": 0.6319265365600586,
      "learning_rate": 0.000572153264975715,
      "loss": 4.1239,
      "step": 4400
    },
    {
      "epoch": 0.4797843665768194,
      "grad_norm": 0.5935447216033936,
      "learning_rate": 0.0005718294657312466,
      "loss": 4.0961,
      "step": 4450
    },
    {
      "epoch": 0.48517520215633425,
      "grad_norm": 0.6037242412567139,
      "learning_rate": 0.0005715056664867781,
      "loss": 4.0931,
      "step": 4500
    },
    {
      "epoch": 0.49056603773584906,
      "grad_norm": 0.7396536469459534,
      "learning_rate": 0.0005711818672423097,
      "loss": 4.0852,
      "step": 4550
    },
    {
      "epoch": 0.49595687331536387,
      "grad_norm": 0.6765477061271667,
      "learning_rate": 0.0005708580679978413,
      "loss": 4.0949,
      "step": 4600
    },
    {
      "epoch": 0.5013477088948787,
      "grad_norm": 0.7021300196647644,
      "learning_rate": 0.0005705342687533729,
      "loss": 4.087,
      "step": 4650
    },
    {
      "epoch": 0.5067385444743935,
      "grad_norm": 0.6416023969650269,
      "learning_rate": 0.0005702104695089044,
      "loss": 4.069,
      "step": 4700
    },
    {
      "epoch": 0.5121293800539084,
      "grad_norm": 0.6855347752571106,
      "learning_rate": 0.000569886670264436,
      "loss": 4.0833,
      "step": 4750
    },
    {
      "epoch": 0.5175202156334232,
      "grad_norm": 0.5605803728103638,
      "learning_rate": 0.0005695628710199675,
      "loss": 4.071,
      "step": 4800
    },
    {
      "epoch": 0.522911051212938,
      "grad_norm": 0.6609882116317749,
      "learning_rate": 0.0005692390717754991,
      "loss": 4.0662,
      "step": 4850
    },
    {
      "epoch": 0.5283018867924528,
      "grad_norm": 0.7520987391471863,
      "learning_rate": 0.0005689152725310306,
      "loss": 4.0593,
      "step": 4900
    },
    {
      "epoch": 0.5336927223719676,
      "grad_norm": 0.7330611348152161,
      "learning_rate": 0.0005685914732865623,
      "loss": 4.0614,
      "step": 4950
    },
    {
      "epoch": 0.5390835579514824,
      "grad_norm": 0.5883002877235413,
      "learning_rate": 0.0005682676740420939,
      "loss": 4.0678,
      "step": 5000
    },
    {
      "epoch": 0.5390835579514824,
      "eval_accuracy": 0.32201423765711346,
      "eval_loss": 3.987787961959839,
      "eval_runtime": 181.5162,
      "eval_samples_per_second": 99.225,
      "eval_steps_per_second": 6.203,
      "step": 5000
    },
    {
      "epoch": 0.5444743935309974,
      "grad_norm": 0.7446970343589783,
      "learning_rate": 0.0005679438747976254,
      "loss": 4.0443,
      "step": 5050
    },
    {
      "epoch": 0.5498652291105122,
      "grad_norm": 0.5801444053649902,
      "learning_rate": 0.000567620075553157,
      "loss": 4.0443,
      "step": 5100
    },
    {
      "epoch": 0.555256064690027,
      "grad_norm": 0.6403396129608154,
      "learning_rate": 0.0005672962763086886,
      "loss": 4.048,
      "step": 5150
    },
    {
      "epoch": 0.5606469002695418,
      "grad_norm": 0.6608542799949646,
      "learning_rate": 0.0005669724770642202,
      "loss": 4.0321,
      "step": 5200
    },
    {
      "epoch": 0.5660377358490566,
      "grad_norm": 0.6073299050331116,
      "learning_rate": 0.0005666486778197517,
      "loss": 4.0307,
      "step": 5250
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 0.6488654017448425,
      "learning_rate": 0.0005663248785752833,
      "loss": 4.0401,
      "step": 5300
    },
    {
      "epoch": 0.5768194070080862,
      "grad_norm": 0.6238702535629272,
      "learning_rate": 0.0005660010793308148,
      "loss": 4.0249,
      "step": 5350
    },
    {
      "epoch": 0.5822102425876011,
      "grad_norm": 0.6104423999786377,
      "learning_rate": 0.0005656772800863465,
      "loss": 4.0438,
      "step": 5400
    },
    {
      "epoch": 0.5876010781671159,
      "grad_norm": 0.5740390419960022,
      "learning_rate": 0.0005653534808418779,
      "loss": 4.0193,
      "step": 5450
    },
    {
      "epoch": 0.5929919137466307,
      "grad_norm": 0.6076253056526184,
      "learning_rate": 0.0005650296815974096,
      "loss": 4.023,
      "step": 5500
    },
    {
      "epoch": 0.5983827493261455,
      "grad_norm": 0.5750396251678467,
      "learning_rate": 0.0005647058823529411,
      "loss": 4.0152,
      "step": 5550
    },
    {
      "epoch": 0.6037735849056604,
      "grad_norm": 0.5481240153312683,
      "learning_rate": 0.0005643820831084727,
      "loss": 4.0053,
      "step": 5600
    },
    {
      "epoch": 0.6091644204851752,
      "grad_norm": 0.7279238104820251,
      "learning_rate": 0.0005640582838640042,
      "loss": 4.0191,
      "step": 5650
    },
    {
      "epoch": 0.6145552560646901,
      "grad_norm": 0.7080326676368713,
      "learning_rate": 0.0005637344846195358,
      "loss": 4.0134,
      "step": 5700
    },
    {
      "epoch": 0.6199460916442049,
      "grad_norm": 0.588047206401825,
      "learning_rate": 0.0005634106853750674,
      "loss": 4.0209,
      "step": 5750
    },
    {
      "epoch": 0.6253369272237197,
      "grad_norm": 0.7702767252922058,
      "learning_rate": 0.000563086886130599,
      "loss": 3.9901,
      "step": 5800
    },
    {
      "epoch": 0.6307277628032345,
      "grad_norm": 0.7166429162025452,
      "learning_rate": 0.0005627630868861305,
      "loss": 3.9905,
      "step": 5850
    },
    {
      "epoch": 0.6361185983827493,
      "grad_norm": 0.5853614807128906,
      "learning_rate": 0.0005624392876416621,
      "loss": 3.9772,
      "step": 5900
    },
    {
      "epoch": 0.6415094339622641,
      "grad_norm": 0.6194868087768555,
      "learning_rate": 0.0005621154883971937,
      "loss": 3.9817,
      "step": 5950
    },
    {
      "epoch": 0.6469002695417789,
      "grad_norm": 0.6746755242347717,
      "learning_rate": 0.0005617916891527253,
      "loss": 3.9609,
      "step": 6000
    },
    {
      "epoch": 0.6469002695417789,
      "eval_accuracy": 0.3282789458627208,
      "eval_loss": 3.917991876602173,
      "eval_runtime": 181.6023,
      "eval_samples_per_second": 99.178,
      "eval_steps_per_second": 6.2,
      "step": 6000
    },
    {
      "epoch": 0.6522911051212938,
      "grad_norm": 0.6664725542068481,
      "learning_rate": 0.0005614678899082568,
      "loss": 3.995,
      "step": 6050
    },
    {
      "epoch": 0.6576819407008087,
      "grad_norm": 0.5693532228469849,
      "learning_rate": 0.0005611440906637884,
      "loss": 3.9818,
      "step": 6100
    },
    {
      "epoch": 0.6630727762803235,
      "grad_norm": 0.8059385418891907,
      "learning_rate": 0.00056082029141932,
      "loss": 3.9733,
      "step": 6150
    },
    {
      "epoch": 0.6684636118598383,
      "grad_norm": 0.6112721562385559,
      "learning_rate": 0.0005604964921748515,
      "loss": 3.9865,
      "step": 6200
    },
    {
      "epoch": 0.6738544474393531,
      "grad_norm": 0.6212737560272217,
      "learning_rate": 0.000560172692930383,
      "loss": 3.9479,
      "step": 6250
    },
    {
      "epoch": 0.6792452830188679,
      "grad_norm": 0.5734388828277588,
      "learning_rate": 0.0005598488936859147,
      "loss": 3.9591,
      "step": 6300
    },
    {
      "epoch": 0.6846361185983828,
      "grad_norm": 0.7219184041023254,
      "learning_rate": 0.0005595250944414463,
      "loss": 3.9504,
      "step": 6350
    },
    {
      "epoch": 0.6900269541778976,
      "grad_norm": 0.6439206004142761,
      "learning_rate": 0.0005592012951969778,
      "loss": 3.9434,
      "step": 6400
    },
    {
      "epoch": 0.6954177897574124,
      "grad_norm": 0.6976845860481262,
      "learning_rate": 0.0005588774959525094,
      "loss": 3.9712,
      "step": 6450
    },
    {
      "epoch": 0.7008086253369272,
      "grad_norm": 0.5976806282997131,
      "learning_rate": 0.000558553696708041,
      "loss": 3.9546,
      "step": 6500
    },
    {
      "epoch": 0.706199460916442,
      "grad_norm": 0.5768620371818542,
      "learning_rate": 0.0005582298974635726,
      "loss": 3.9514,
      "step": 6550
    },
    {
      "epoch": 0.7115902964959568,
      "grad_norm": 0.6398442983627319,
      "learning_rate": 0.0005579060982191041,
      "loss": 3.949,
      "step": 6600
    },
    {
      "epoch": 0.7169811320754716,
      "grad_norm": 0.6361405849456787,
      "learning_rate": 0.0005575822989746357,
      "loss": 3.9437,
      "step": 6650
    },
    {
      "epoch": 0.7223719676549866,
      "grad_norm": 0.5549120903015137,
      "learning_rate": 0.0005572584997301672,
      "loss": 3.9364,
      "step": 6700
    },
    {
      "epoch": 0.7277628032345014,
      "grad_norm": 0.8019936084747314,
      "learning_rate": 0.0005569347004856989,
      "loss": 3.9399,
      "step": 6750
    },
    {
      "epoch": 0.7331536388140162,
      "grad_norm": 0.6074087023735046,
      "learning_rate": 0.0005566109012412303,
      "loss": 3.9487,
      "step": 6800
    },
    {
      "epoch": 0.738544474393531,
      "grad_norm": 0.5666708946228027,
      "learning_rate": 0.000556287101996762,
      "loss": 3.9318,
      "step": 6850
    },
    {
      "epoch": 0.7439353099730458,
      "grad_norm": 0.6135334968566895,
      "learning_rate": 0.0005559633027522935,
      "loss": 3.9286,
      "step": 6900
    },
    {
      "epoch": 0.7493261455525606,
      "grad_norm": 0.6896835565567017,
      "learning_rate": 0.0005556395035078251,
      "loss": 3.9139,
      "step": 6950
    },
    {
      "epoch": 0.7547169811320755,
      "grad_norm": 0.536331057548523,
      "learning_rate": 0.0005553157042633566,
      "loss": 3.9291,
      "step": 7000
    },
    {
      "epoch": 0.7547169811320755,
      "eval_accuracy": 0.33301186565591956,
      "eval_loss": 3.8630003929138184,
      "eval_runtime": 181.6998,
      "eval_samples_per_second": 99.125,
      "eval_steps_per_second": 6.197,
      "step": 7000
    },
    {
      "epoch": 0.7601078167115903,
      "grad_norm": 0.5458576083183289,
      "learning_rate": 0.0005549919050188882,
      "loss": 3.9324,
      "step": 7050
    },
    {
      "epoch": 0.7654986522911051,
      "grad_norm": 0.5892027020454407,
      "learning_rate": 0.0005546681057744198,
      "loss": 3.9325,
      "step": 7100
    },
    {
      "epoch": 0.77088948787062,
      "grad_norm": 0.6889460682868958,
      "learning_rate": 0.0005543443065299514,
      "loss": 3.9066,
      "step": 7150
    },
    {
      "epoch": 0.7762803234501348,
      "grad_norm": 0.6623647212982178,
      "learning_rate": 0.000554020507285483,
      "loss": 3.9168,
      "step": 7200
    },
    {
      "epoch": 0.7816711590296496,
      "grad_norm": 0.6174417734146118,
      "learning_rate": 0.0005536967080410145,
      "loss": 3.9172,
      "step": 7250
    },
    {
      "epoch": 0.7870619946091644,
      "grad_norm": 0.5966586470603943,
      "learning_rate": 0.0005533729087965462,
      "loss": 3.9069,
      "step": 7300
    },
    {
      "epoch": 0.7924528301886793,
      "grad_norm": 0.5991404056549072,
      "learning_rate": 0.0005530491095520777,
      "loss": 3.9153,
      "step": 7350
    },
    {
      "epoch": 0.7978436657681941,
      "grad_norm": 0.6612049341201782,
      "learning_rate": 0.0005527253103076093,
      "loss": 3.8796,
      "step": 7400
    },
    {
      "epoch": 0.8032345013477089,
      "grad_norm": 0.5842414498329163,
      "learning_rate": 0.0005524015110631408,
      "loss": 3.89,
      "step": 7450
    },
    {
      "epoch": 0.8086253369272237,
      "grad_norm": 0.7356441617012024,
      "learning_rate": 0.0005520777118186724,
      "loss": 3.9037,
      "step": 7500
    },
    {
      "epoch": 0.8140161725067385,
      "grad_norm": 0.5401440858840942,
      "learning_rate": 0.0005517539125742039,
      "loss": 3.8968,
      "step": 7550
    },
    {
      "epoch": 0.8194070080862533,
      "grad_norm": 0.6095986366271973,
      "learning_rate": 0.0005514301133297355,
      "loss": 3.9171,
      "step": 7600
    },
    {
      "epoch": 0.8247978436657682,
      "grad_norm": 0.5622900724411011,
      "learning_rate": 0.0005511063140852671,
      "loss": 3.8997,
      "step": 7650
    },
    {
      "epoch": 0.8301886792452831,
      "grad_norm": 0.6393899917602539,
      "learning_rate": 0.0005507825148407987,
      "loss": 3.8719,
      "step": 7700
    },
    {
      "epoch": 0.8355795148247979,
      "grad_norm": 0.6330990791320801,
      "learning_rate": 0.0005504587155963302,
      "loss": 3.8863,
      "step": 7750
    },
    {
      "epoch": 0.8409703504043127,
      "grad_norm": 0.555696427822113,
      "learning_rate": 0.0005501349163518618,
      "loss": 3.8994,
      "step": 7800
    },
    {
      "epoch": 0.8463611859838275,
      "grad_norm": Infinity,
      "learning_rate": 0.0005498175930922827,
      "loss": 3.8864,
      "step": 7850
    },
    {
      "epoch": 0.8517520215633423,
      "grad_norm": 0.6537197232246399,
      "learning_rate": 0.0005494937938478143,
      "loss": 3.8855,
      "step": 7900
    },
    {
      "epoch": 0.8571428571428571,
      "grad_norm": 0.5033941268920898,
      "learning_rate": 0.0005491699946033459,
      "loss": 3.8855,
      "step": 7950
    },
    {
      "epoch": 0.862533692722372,
      "grad_norm": 0.5997176766395569,
      "learning_rate": 0.0005488461953588775,
      "loss": 3.8673,
      "step": 8000
    },
    {
      "epoch": 0.862533692722372,
      "eval_accuracy": 0.33755735921763835,
      "eval_loss": 3.8170533180236816,
      "eval_runtime": 181.3255,
      "eval_samples_per_second": 99.33,
      "eval_steps_per_second": 6.21,
      "step": 8000
    },
    {
      "epoch": 0.8679245283018868,
      "grad_norm": 0.618726909160614,
      "learning_rate": 0.000548522396114409,
      "loss": 3.8841,
      "step": 8050
    },
    {
      "epoch": 0.8733153638814016,
      "grad_norm": 0.7399865984916687,
      "learning_rate": 0.0005481985968699406,
      "loss": 3.861,
      "step": 8100
    },
    {
      "epoch": 0.8787061994609164,
      "grad_norm": 0.5887700915336609,
      "learning_rate": 0.0005478747976254721,
      "loss": 3.8717,
      "step": 8150
    },
    {
      "epoch": 0.8840970350404312,
      "grad_norm": 0.5937870144844055,
      "learning_rate": 0.0005475509983810037,
      "loss": 3.864,
      "step": 8200
    },
    {
      "epoch": 0.889487870619946,
      "grad_norm": 0.5936031937599182,
      "learning_rate": 0.0005472271991365352,
      "loss": 3.8783,
      "step": 8250
    },
    {
      "epoch": 0.894878706199461,
      "grad_norm": 0.5343406200408936,
      "learning_rate": 0.0005469033998920669,
      "loss": 3.8528,
      "step": 8300
    },
    {
      "epoch": 0.9002695417789758,
      "grad_norm": 0.5612528324127197,
      "learning_rate": 0.0005465796006475984,
      "loss": 3.8672,
      "step": 8350
    },
    {
      "epoch": 0.9056603773584906,
      "grad_norm": 0.5439274311065674,
      "learning_rate": 0.00054625580140313,
      "loss": 3.8675,
      "step": 8400
    },
    {
      "epoch": 0.9110512129380054,
      "grad_norm": 0.6031448245048523,
      "learning_rate": 0.0005459320021586615,
      "loss": 3.8391,
      "step": 8450
    },
    {
      "epoch": 0.9164420485175202,
      "grad_norm": 0.5453014969825745,
      "learning_rate": 0.0005456082029141932,
      "loss": 3.8505,
      "step": 8500
    },
    {
      "epoch": 0.921832884097035,
      "grad_norm": 0.5259692668914795,
      "learning_rate": 0.0005452844036697248,
      "loss": 3.8519,
      "step": 8550
    },
    {
      "epoch": 0.9272237196765498,
      "grad_norm": 0.6086834669113159,
      "learning_rate": 0.0005449606044252563,
      "loss": 3.8812,
      "step": 8600
    },
    {
      "epoch": 0.9326145552560647,
      "grad_norm": 0.5958359837532043,
      "learning_rate": 0.0005446368051807879,
      "loss": 3.8548,
      "step": 8650
    },
    {
      "epoch": 0.9380053908355795,
      "grad_norm": 0.5887793898582458,
      "learning_rate": 0.0005443130059363194,
      "loss": 3.8635,
      "step": 8700
    },
    {
      "epoch": 0.9433962264150944,
      "grad_norm": 0.5301551818847656,
      "learning_rate": 0.0005439892066918511,
      "loss": 3.8457,
      "step": 8750
    },
    {
      "epoch": 0.9487870619946092,
      "grad_norm": 0.6073969602584839,
      "learning_rate": 0.0005436654074473825,
      "loss": 3.8448,
      "step": 8800
    },
    {
      "epoch": 0.954177897574124,
      "grad_norm": 0.530245840549469,
      "learning_rate": 0.0005433416082029142,
      "loss": 3.8465,
      "step": 8850
    },
    {
      "epoch": 0.9595687331536388,
      "grad_norm": 0.5945138931274414,
      "learning_rate": 0.0005430178089584457,
      "loss": 3.8472,
      "step": 8900
    },
    {
      "epoch": 0.9649595687331537,
      "grad_norm": 0.9510626792907715,
      "learning_rate": 0.0005426940097139773,
      "loss": 3.8487,
      "step": 8950
    },
    {
      "epoch": 0.9703504043126685,
      "grad_norm": 0.5848538875579834,
      "learning_rate": 0.0005423702104695088,
      "loss": 3.8645,
      "step": 9000
    },
    {
      "epoch": 0.9703504043126685,
      "eval_accuracy": 0.34104250924717566,
      "eval_loss": 3.7781906127929688,
      "eval_runtime": 181.7189,
      "eval_samples_per_second": 99.115,
      "eval_steps_per_second": 6.196,
      "step": 9000
    },
    {
      "epoch": 0.9757412398921833,
      "grad_norm": 0.7085767984390259,
      "learning_rate": 0.0005420464112250404,
      "loss": 3.8406,
      "step": 9050
    },
    {
      "epoch": 0.9811320754716981,
      "grad_norm": 0.6005401015281677,
      "learning_rate": 0.000541722611980572,
      "loss": 3.8315,
      "step": 9100
    },
    {
      "epoch": 0.9865229110512129,
      "grad_norm": 0.5444480776786804,
      "learning_rate": 0.0005413988127361036,
      "loss": 3.8438,
      "step": 9150
    },
    {
      "epoch": 0.9919137466307277,
      "grad_norm": 0.5543394684791565,
      "learning_rate": 0.0005410750134916351,
      "loss": 3.8471,
      "step": 9200
    },
    {
      "epoch": 0.9973045822102425,
      "grad_norm": 0.530593991279602,
      "learning_rate": 0.0005407512142471667,
      "loss": 3.8383,
      "step": 9250
    },
    {
      "epoch": 1.0026954177897573,
      "grad_norm": 0.6533575057983398,
      "learning_rate": 0.0005404274150026983,
      "loss": 3.8157,
      "step": 9300
    },
    {
      "epoch": 1.0080862533692723,
      "grad_norm": 0.5836777091026306,
      "learning_rate": 0.0005401036157582299,
      "loss": 3.7826,
      "step": 9350
    },
    {
      "epoch": 1.013477088948787,
      "grad_norm": 0.5416027307510376,
      "learning_rate": 0.0005397798165137614,
      "loss": 3.7923,
      "step": 9400
    },
    {
      "epoch": 1.0188679245283019,
      "grad_norm": 0.5336276888847351,
      "learning_rate": 0.000539456017269293,
      "loss": 3.7841,
      "step": 9450
    },
    {
      "epoch": 1.0242587601078168,
      "grad_norm": 0.5434985160827637,
      "learning_rate": 0.0005391322180248245,
      "loss": 3.7778,
      "step": 9500
    },
    {
      "epoch": 1.0296495956873315,
      "grad_norm": 0.5847581624984741,
      "learning_rate": 0.0005388084187803561,
      "loss": 3.7874,
      "step": 9550
    },
    {
      "epoch": 1.0350404312668464,
      "grad_norm": 0.5592184066772461,
      "learning_rate": 0.0005384846195358876,
      "loss": 3.7689,
      "step": 9600
    },
    {
      "epoch": 1.0404312668463611,
      "grad_norm": 0.5576586127281189,
      "learning_rate": 0.0005381608202914193,
      "loss": 3.7629,
      "step": 9650
    },
    {
      "epoch": 1.045822102425876,
      "grad_norm": 0.5352767109870911,
      "learning_rate": 0.0005378370210469509,
      "loss": 3.7724,
      "step": 9700
    },
    {
      "epoch": 1.0512129380053907,
      "grad_norm": 0.5537406206130981,
      "learning_rate": 0.0005375132218024824,
      "loss": 3.7647,
      "step": 9750
    },
    {
      "epoch": 1.0566037735849056,
      "grad_norm": 0.6254362463951111,
      "learning_rate": 0.000537189422558014,
      "loss": 3.771,
      "step": 9800
    },
    {
      "epoch": 1.0619946091644206,
      "grad_norm": 0.5927165150642395,
      "learning_rate": 0.0005368656233135455,
      "loss": 3.7559,
      "step": 9850
    },
    {
      "epoch": 1.0673854447439353,
      "grad_norm": 0.5955566763877869,
      "learning_rate": 0.0005365418240690772,
      "loss": 3.7656,
      "step": 9900
    },
    {
      "epoch": 1.0727762803234502,
      "grad_norm": 0.5806096792221069,
      "learning_rate": 0.0005362180248246087,
      "loss": 3.7619,
      "step": 9950
    },
    {
      "epoch": 1.0781671159029649,
      "grad_norm": 0.6050166487693787,
      "learning_rate": 0.0005358942255801403,
      "loss": 3.767,
      "step": 10000
    },
    {
      "epoch": 1.0781671159029649,
      "eval_accuracy": 0.3450713583273366,
      "eval_loss": 3.746429681777954,
      "eval_runtime": 181.6002,
      "eval_samples_per_second": 99.179,
      "eval_steps_per_second": 6.2,
      "step": 10000
    }
  ],
  "logging_steps": 50,
  "max_steps": 92750,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 10000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 8.36069179392e+16,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}