diff --git "a/cost_to_hit_frequency_40817/checkpoint-50000/trainer_state.json" "b/cost_to_hit_frequency_40817/checkpoint-50000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/cost_to_hit_frequency_40817/checkpoint-50000/trainer_state.json" @@ -0,0 +1,7493 @@ +{ + "best_global_step": 49000, + "best_metric": 3.5516107082366943, + "best_model_checkpoint": "/scratch/cl5625/exceptions/models/cost_to_hit_frequency_40817/checkpoint-20000", + "epoch": 15.114900822447993, + "eval_steps": 1000, + "global_step": 50000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.015118529269472665, + "grad_norm": 0.6569534540176392, + "learning_rate": 0.000294, + "loss": 8.48, + "step": 50 + }, + { + "epoch": 0.03023705853894533, + "grad_norm": 0.9759527444839478, + "learning_rate": 0.0005939999999999999, + "loss": 6.7387, + "step": 100 + }, + { + "epoch": 0.045355587808417995, + "grad_norm": 0.4822550415992737, + "learning_rate": 0.0005995549500454132, + "loss": 6.3406, + "step": 150 + }, + { + "epoch": 0.06047411707789066, + "grad_norm": 0.4631986618041992, + "learning_rate": 0.000599100817438692, + "loss": 6.137, + "step": 200 + }, + { + "epoch": 0.07559264634736333, + "grad_norm": 0.4041077494621277, + "learning_rate": 0.0005986466848319708, + "loss": 5.9853, + "step": 250 + }, + { + "epoch": 0.09071117561683599, + "grad_norm": 0.3731299042701721, + "learning_rate": 0.0005981925522252497, + "loss": 5.8362, + "step": 300 + }, + { + "epoch": 0.10582970488630866, + "grad_norm": 0.44940078258514404, + "learning_rate": 0.0005977384196185286, + "loss": 5.7301, + "step": 350 + }, + { + "epoch": 0.12094823415578132, + "grad_norm": 0.4535612165927887, + "learning_rate": 0.0005972842870118073, + "loss": 5.6198, + "step": 400 + }, + { + "epoch": 0.13606676342525398, + "grad_norm": 0.47181379795074463, + "learning_rate": 0.0005968301544050862, + "loss": 5.5126, + "step": 450 + }, + { + "epoch": 0.15118529269472666, + "grad_norm": 0.47652333974838257, + "learning_rate": 0.0005963760217983651, + "loss": 5.4013, + "step": 500 + }, + { + "epoch": 0.16630382196419932, + "grad_norm": 0.45077475905418396, + "learning_rate": 0.0005959218891916439, + "loss": 5.3135, + "step": 550 + }, + { + "epoch": 0.18142235123367198, + "grad_norm": 0.4728659987449646, + "learning_rate": 0.0005954677565849228, + "loss": 5.2565, + "step": 600 + }, + { + "epoch": 0.19654088050314467, + "grad_norm": 0.447640985250473, + "learning_rate": 0.0005950136239782016, + "loss": 5.1786, + "step": 650 + }, + { + "epoch": 0.21165940977261732, + "grad_norm": 0.4660065174102783, + "learning_rate": 0.0005945594913714804, + "loss": 5.1181, + "step": 700 + }, + { + "epoch": 0.22677793904208998, + "grad_norm": 0.4138477146625519, + "learning_rate": 0.0005941053587647593, + "loss": 5.0609, + "step": 750 + }, + { + "epoch": 0.24189646831156264, + "grad_norm": 0.43173298239707947, + "learning_rate": 0.0005936512261580381, + "loss": 5.0094, + "step": 800 + }, + { + "epoch": 0.2570149975810353, + "grad_norm": 0.41261234879493713, + "learning_rate": 0.0005931970935513169, + "loss": 4.9896, + "step": 850 + }, + { + "epoch": 0.27213352685050796, + "grad_norm": 0.4491656720638275, + "learning_rate": 0.0005927429609445958, + "loss": 4.9266, + "step": 900 + }, + { + "epoch": 0.28725205611998067, + "grad_norm": 0.4212245047092438, + "learning_rate": 0.0005922888283378746, + "loss": 4.8791, + "step": 950 + }, + { + "epoch": 0.30237058538945333, + "grad_norm": 0.4332761764526367, + "learning_rate": 0.0005918346957311534, + "loss": 4.8382, + "step": 1000 + }, + { + "epoch": 0.30237058538945333, + "eval_accuracy": 0.2525349543872818, + "eval_loss": 4.76720666885376, + "eval_runtime": 182.6488, + "eval_samples_per_second": 91.126, + "eval_steps_per_second": 5.699, + "step": 1000 + }, + { + "epoch": 0.317489114658926, + "grad_norm": 0.5782724022865295, + "learning_rate": 0.0005913805631244322, + "loss": 4.7861, + "step": 1050 + }, + { + "epoch": 0.33260764392839864, + "grad_norm": 0.5215025544166565, + "learning_rate": 0.0005909264305177112, + "loss": 4.7449, + "step": 1100 + }, + { + "epoch": 0.3477261731978713, + "grad_norm": 0.5074264407157898, + "learning_rate": 0.0005904722979109899, + "loss": 4.7092, + "step": 1150 + }, + { + "epoch": 0.36284470246734396, + "grad_norm": 0.4659997820854187, + "learning_rate": 0.0005900181653042687, + "loss": 4.6644, + "step": 1200 + }, + { + "epoch": 0.3779632317368166, + "grad_norm": 0.46156764030456543, + "learning_rate": 0.0005895640326975477, + "loss": 4.6383, + "step": 1250 + }, + { + "epoch": 0.39308176100628933, + "grad_norm": 0.43247443437576294, + "learning_rate": 0.0005891099000908265, + "loss": 4.5939, + "step": 1300 + }, + { + "epoch": 0.408200290275762, + "grad_norm": 0.43695470690727234, + "learning_rate": 0.0005886557674841053, + "loss": 4.545, + "step": 1350 + }, + { + "epoch": 0.42331881954523465, + "grad_norm": 0.4135619103908539, + "learning_rate": 0.0005882016348773842, + "loss": 4.5432, + "step": 1400 + }, + { + "epoch": 0.4384373488147073, + "grad_norm": 0.3893912136554718, + "learning_rate": 0.000587747502270663, + "loss": 4.5358, + "step": 1450 + }, + { + "epoch": 0.45355587808417996, + "grad_norm": 0.4318796694278717, + "learning_rate": 0.0005872933696639418, + "loss": 4.5063, + "step": 1500 + }, + { + "epoch": 0.4686744073536526, + "grad_norm": 0.404938668012619, + "learning_rate": 0.0005868392370572207, + "loss": 4.4833, + "step": 1550 + }, + { + "epoch": 0.4837929366231253, + "grad_norm": 0.39407220482826233, + "learning_rate": 0.0005863851044504995, + "loss": 4.4573, + "step": 1600 + }, + { + "epoch": 0.498911465892598, + "grad_norm": 0.4312690496444702, + "learning_rate": 0.0005859309718437783, + "loss": 4.4493, + "step": 1650 + }, + { + "epoch": 0.5140299951620706, + "grad_norm": 0.42534348368644714, + "learning_rate": 0.0005854768392370572, + "loss": 4.4358, + "step": 1700 + }, + { + "epoch": 0.5291485244315433, + "grad_norm": 0.3989129066467285, + "learning_rate": 0.000585022706630336, + "loss": 4.4004, + "step": 1750 + }, + { + "epoch": 0.5442670537010159, + "grad_norm": 0.4206331968307495, + "learning_rate": 0.0005845685740236149, + "loss": 4.3903, + "step": 1800 + }, + { + "epoch": 0.5593855829704887, + "grad_norm": 0.39128994941711426, + "learning_rate": 0.0005841144414168936, + "loss": 4.3716, + "step": 1850 + }, + { + "epoch": 0.5745041122399613, + "grad_norm": 0.4127030670642853, + "learning_rate": 0.0005836603088101725, + "loss": 4.3646, + "step": 1900 + }, + { + "epoch": 0.589622641509434, + "grad_norm": 0.47868236899375916, + "learning_rate": 0.0005832061762034513, + "loss": 4.3474, + "step": 1950 + }, + { + "epoch": 0.6047411707789067, + "grad_norm": 0.41013041138648987, + "learning_rate": 0.0005827520435967303, + "loss": 4.3297, + "step": 2000 + }, + { + "epoch": 0.6047411707789067, + "eval_accuracy": 0.29954742363508696, + "eval_loss": 4.283868312835693, + "eval_runtime": 182.004, + "eval_samples_per_second": 91.449, + "eval_steps_per_second": 5.72, + "step": 2000 + }, + { + "epoch": 0.6198597000483793, + "grad_norm": 0.3787741959095001, + "learning_rate": 0.0005822979109900091, + "loss": 4.3142, + "step": 2050 + }, + { + "epoch": 0.634978229317852, + "grad_norm": 0.3691158890724182, + "learning_rate": 0.0005818437783832879, + "loss": 4.3103, + "step": 2100 + }, + { + "epoch": 0.6500967585873246, + "grad_norm": 0.38457924127578735, + "learning_rate": 0.0005813896457765666, + "loss": 4.2851, + "step": 2150 + }, + { + "epoch": 0.6652152878567973, + "grad_norm": 0.40187668800354004, + "learning_rate": 0.0005809355131698456, + "loss": 4.2813, + "step": 2200 + }, + { + "epoch": 0.68033381712627, + "grad_norm": 0.38091689348220825, + "learning_rate": 0.0005804813805631244, + "loss": 4.2651, + "step": 2250 + }, + { + "epoch": 0.6954523463957426, + "grad_norm": 0.4001523554325104, + "learning_rate": 0.0005800272479564032, + "loss": 4.2552, + "step": 2300 + }, + { + "epoch": 0.7105708756652153, + "grad_norm": 0.3821207880973816, + "learning_rate": 0.0005795731153496821, + "loss": 4.2479, + "step": 2350 + }, + { + "epoch": 0.7256894049346879, + "grad_norm": 0.3614497482776642, + "learning_rate": 0.0005791189827429609, + "loss": 4.2514, + "step": 2400 + }, + { + "epoch": 0.7408079342041606, + "grad_norm": 0.40655964612960815, + "learning_rate": 0.0005786648501362397, + "loss": 4.2292, + "step": 2450 + }, + { + "epoch": 0.7559264634736332, + "grad_norm": 0.3501703441143036, + "learning_rate": 0.0005782107175295186, + "loss": 4.2267, + "step": 2500 + }, + { + "epoch": 0.771044992743106, + "grad_norm": 0.41427701711654663, + "learning_rate": 0.0005777565849227974, + "loss": 4.2247, + "step": 2550 + }, + { + "epoch": 0.7861635220125787, + "grad_norm": 0.44100141525268555, + "learning_rate": 0.0005773024523160762, + "loss": 4.2089, + "step": 2600 + }, + { + "epoch": 0.8012820512820513, + "grad_norm": 0.3879771828651428, + "learning_rate": 0.000576848319709355, + "loss": 4.2073, + "step": 2650 + }, + { + "epoch": 0.816400580551524, + "grad_norm": 0.3788333833217621, + "learning_rate": 0.0005763941871026339, + "loss": 4.2015, + "step": 2700 + }, + { + "epoch": 0.8315191098209966, + "grad_norm": 0.366184800863266, + "learning_rate": 0.0005759400544959128, + "loss": 4.1719, + "step": 2750 + }, + { + "epoch": 0.8466376390904693, + "grad_norm": 0.3733968436717987, + "learning_rate": 0.0005754859218891917, + "loss": 4.1749, + "step": 2800 + }, + { + "epoch": 0.861756168359942, + "grad_norm": 0.37004509568214417, + "learning_rate": 0.0005750317892824704, + "loss": 4.1642, + "step": 2850 + }, + { + "epoch": 0.8768746976294146, + "grad_norm": 0.36916959285736084, + "learning_rate": 0.0005745776566757492, + "loss": 4.1666, + "step": 2900 + }, + { + "epoch": 0.8919932268988873, + "grad_norm": 0.39038950204849243, + "learning_rate": 0.0005741235240690281, + "loss": 4.1523, + "step": 2950 + }, + { + "epoch": 0.9071117561683599, + "grad_norm": 0.3550504744052887, + "learning_rate": 0.000573669391462307, + "loss": 4.1437, + "step": 3000 + }, + { + "epoch": 0.9071117561683599, + "eval_accuracy": 0.3157440890648464, + "eval_loss": 4.091999530792236, + "eval_runtime": 182.3379, + "eval_samples_per_second": 91.281, + "eval_steps_per_second": 5.709, + "step": 3000 + }, + { + "epoch": 0.9222302854378326, + "grad_norm": 0.3827008903026581, + "learning_rate": 0.0005732152588555858, + "loss": 4.1333, + "step": 3050 + }, + { + "epoch": 0.9373488147073052, + "grad_norm": 0.379339337348938, + "learning_rate": 0.0005727611262488646, + "loss": 4.1309, + "step": 3100 + }, + { + "epoch": 0.9524673439767779, + "grad_norm": 0.38624098896980286, + "learning_rate": 0.0005723069936421435, + "loss": 4.1218, + "step": 3150 + }, + { + "epoch": 0.9675858732462506, + "grad_norm": 0.36264657974243164, + "learning_rate": 0.0005718528610354223, + "loss": 4.1127, + "step": 3200 + }, + { + "epoch": 0.9827044025157232, + "grad_norm": 0.3813261389732361, + "learning_rate": 0.0005713987284287011, + "loss": 4.1095, + "step": 3250 + }, + { + "epoch": 0.997822931785196, + "grad_norm": 0.46720796823501587, + "learning_rate": 0.00057094459582198, + "loss": 4.0923, + "step": 3300 + }, + { + "epoch": 1.012699564586357, + "grad_norm": 0.3777443766593933, + "learning_rate": 0.0005704904632152588, + "loss": 4.0438, + "step": 3350 + }, + { + "epoch": 1.0278180938558297, + "grad_norm": 0.35427603125572205, + "learning_rate": 0.0005700363306085376, + "loss": 4.0293, + "step": 3400 + }, + { + "epoch": 1.0429366231253023, + "grad_norm": 0.36464598774909973, + "learning_rate": 0.0005695821980018164, + "loss": 4.0156, + "step": 3450 + }, + { + "epoch": 1.058055152394775, + "grad_norm": 0.39198657870292664, + "learning_rate": 0.0005691280653950954, + "loss": 4.0147, + "step": 3500 + }, + { + "epoch": 1.0731736816642476, + "grad_norm": 0.3432103395462036, + "learning_rate": 0.0005686739327883741, + "loss": 4.0026, + "step": 3550 + }, + { + "epoch": 1.0882922109337203, + "grad_norm": 0.370108962059021, + "learning_rate": 0.0005682198001816529, + "loss": 4.0078, + "step": 3600 + }, + { + "epoch": 1.103410740203193, + "grad_norm": 0.3692167103290558, + "learning_rate": 0.0005677656675749318, + "loss": 4.0147, + "step": 3650 + }, + { + "epoch": 1.1185292694726656, + "grad_norm": 0.354033887386322, + "learning_rate": 0.0005673115349682107, + "loss": 3.9974, + "step": 3700 + }, + { + "epoch": 1.1336477987421383, + "grad_norm": 0.3713250160217285, + "learning_rate": 0.0005668574023614895, + "loss": 3.9928, + "step": 3750 + }, + { + "epoch": 1.148766328011611, + "grad_norm": 0.364036500453949, + "learning_rate": 0.0005664032697547684, + "loss": 4.0023, + "step": 3800 + }, + { + "epoch": 1.1638848572810838, + "grad_norm": 0.3609767258167267, + "learning_rate": 0.0005659491371480471, + "loss": 3.9908, + "step": 3850 + }, + { + "epoch": 1.1790033865505563, + "grad_norm": 0.3580855131149292, + "learning_rate": 0.000565495004541326, + "loss": 3.9918, + "step": 3900 + }, + { + "epoch": 1.1941219158200291, + "grad_norm": 0.35853010416030884, + "learning_rate": 0.0005650408719346049, + "loss": 3.9891, + "step": 3950 + }, + { + "epoch": 1.2092404450895016, + "grad_norm": 0.3546011745929718, + "learning_rate": 0.0005645867393278837, + "loss": 3.9999, + "step": 4000 + }, + { + "epoch": 1.2092404450895016, + "eval_accuracy": 0.32564334461599675, + "eval_loss": 3.9847805500030518, + "eval_runtime": 182.3722, + "eval_samples_per_second": 91.264, + "eval_steps_per_second": 5.708, + "step": 4000 + }, + { + "epoch": 1.2243589743589745, + "grad_norm": 0.3494962453842163, + "learning_rate": 0.0005641326067211625, + "loss": 3.9874, + "step": 4050 + }, + { + "epoch": 1.239477503628447, + "grad_norm": 0.37261122465133667, + "learning_rate": 0.0005636784741144414, + "loss": 3.9849, + "step": 4100 + }, + { + "epoch": 1.2545960328979198, + "grad_norm": 0.3606380224227905, + "learning_rate": 0.0005632243415077202, + "loss": 3.9697, + "step": 4150 + }, + { + "epoch": 1.2697145621673924, + "grad_norm": 0.3712509572505951, + "learning_rate": 0.000562770208900999, + "loss": 3.9746, + "step": 4200 + }, + { + "epoch": 1.284833091436865, + "grad_norm": 0.3441880941390991, + "learning_rate": 0.000562316076294278, + "loss": 3.9746, + "step": 4250 + }, + { + "epoch": 1.2999516207063377, + "grad_norm": 0.37153229117393494, + "learning_rate": 0.0005618619436875567, + "loss": 3.9607, + "step": 4300 + }, + { + "epoch": 1.3150701499758104, + "grad_norm": 0.3481687605381012, + "learning_rate": 0.0005614078110808355, + "loss": 3.9699, + "step": 4350 + }, + { + "epoch": 1.330188679245283, + "grad_norm": 0.3434485197067261, + "learning_rate": 0.0005609536784741143, + "loss": 3.955, + "step": 4400 + }, + { + "epoch": 1.3453072085147557, + "grad_norm": 0.36444127559661865, + "learning_rate": 0.0005604995458673933, + "loss": 3.959, + "step": 4450 + }, + { + "epoch": 1.3604257377842284, + "grad_norm": 0.3475540280342102, + "learning_rate": 0.0005600454132606721, + "loss": 3.9587, + "step": 4500 + }, + { + "epoch": 1.375544267053701, + "grad_norm": 0.35136139392852783, + "learning_rate": 0.0005595912806539509, + "loss": 3.9493, + "step": 4550 + }, + { + "epoch": 1.3906627963231737, + "grad_norm": 0.3854037821292877, + "learning_rate": 0.0005591371480472297, + "loss": 3.9523, + "step": 4600 + }, + { + "epoch": 1.4057813255926463, + "grad_norm": 0.32536762952804565, + "learning_rate": 0.0005586830154405086, + "loss": 3.9458, + "step": 4650 + }, + { + "epoch": 1.420899854862119, + "grad_norm": 0.33678901195526123, + "learning_rate": 0.0005582288828337874, + "loss": 3.9349, + "step": 4700 + }, + { + "epoch": 1.4360183841315917, + "grad_norm": 0.38056421279907227, + "learning_rate": 0.0005577747502270663, + "loss": 3.9357, + "step": 4750 + }, + { + "epoch": 1.4511369134010643, + "grad_norm": 0.3556210994720459, + "learning_rate": 0.0005573206176203451, + "loss": 3.9439, + "step": 4800 + }, + { + "epoch": 1.466255442670537, + "grad_norm": 0.3364984691143036, + "learning_rate": 0.0005568664850136239, + "loss": 3.9258, + "step": 4850 + }, + { + "epoch": 1.4813739719400096, + "grad_norm": 0.34836122393608093, + "learning_rate": 0.0005564123524069028, + "loss": 3.9111, + "step": 4900 + }, + { + "epoch": 1.4964925012094823, + "grad_norm": 0.32042673230171204, + "learning_rate": 0.0005559582198001816, + "loss": 3.9368, + "step": 4950 + }, + { + "epoch": 1.511611030478955, + "grad_norm": 0.34929823875427246, + "learning_rate": 0.0005555040871934604, + "loss": 3.9251, + "step": 5000 + }, + { + "epoch": 1.511611030478955, + "eval_accuracy": 0.33236250223983677, + "eval_loss": 3.909000873565674, + "eval_runtime": 182.2679, + "eval_samples_per_second": 91.316, + "eval_steps_per_second": 5.711, + "step": 5000 + }, + { + "epoch": 1.5267295597484276, + "grad_norm": 0.3525310158729553, + "learning_rate": 0.0005550499545867392, + "loss": 3.9314, + "step": 5050 + }, + { + "epoch": 1.5418480890179005, + "grad_norm": 0.34068188071250916, + "learning_rate": 0.0005545958219800181, + "loss": 3.9343, + "step": 5100 + }, + { + "epoch": 1.556966618287373, + "grad_norm": 0.3572525084018707, + "learning_rate": 0.0005541416893732969, + "loss": 3.9272, + "step": 5150 + }, + { + "epoch": 1.5720851475568458, + "grad_norm": 0.3401585519313812, + "learning_rate": 0.0005536875567665758, + "loss": 3.9159, + "step": 5200 + }, + { + "epoch": 1.5872036768263182, + "grad_norm": 0.3368934094905853, + "learning_rate": 0.0005532334241598547, + "loss": 3.9024, + "step": 5250 + }, + { + "epoch": 1.6023222060957911, + "grad_norm": 0.33803030848503113, + "learning_rate": 0.0005527792915531334, + "loss": 3.9133, + "step": 5300 + }, + { + "epoch": 1.6174407353652636, + "grad_norm": 0.3491084575653076, + "learning_rate": 0.0005523251589464122, + "loss": 3.9049, + "step": 5350 + }, + { + "epoch": 1.6325592646347364, + "grad_norm": 0.3191988170146942, + "learning_rate": 0.0005518710263396912, + "loss": 3.897, + "step": 5400 + }, + { + "epoch": 1.6476777939042089, + "grad_norm": 0.33832481503486633, + "learning_rate": 0.00055141689373297, + "loss": 3.8915, + "step": 5450 + }, + { + "epoch": 1.6627963231736818, + "grad_norm": 0.3492324650287628, + "learning_rate": 0.0005509627611262488, + "loss": 3.8975, + "step": 5500 + }, + { + "epoch": 1.6779148524431542, + "grad_norm": 0.3200375735759735, + "learning_rate": 0.0005505086285195277, + "loss": 3.8861, + "step": 5550 + }, + { + "epoch": 1.693033381712627, + "grad_norm": 0.3539038896560669, + "learning_rate": 0.0005500544959128065, + "loss": 3.8972, + "step": 5600 + }, + { + "epoch": 1.7081519109820995, + "grad_norm": 0.34983664751052856, + "learning_rate": 0.0005496003633060853, + "loss": 3.876, + "step": 5650 + }, + { + "epoch": 1.7232704402515724, + "grad_norm": 0.33722037076950073, + "learning_rate": 0.0005491462306993642, + "loss": 3.8811, + "step": 5700 + }, + { + "epoch": 1.738388969521045, + "grad_norm": 0.34622326493263245, + "learning_rate": 0.000548692098092643, + "loss": 3.8913, + "step": 5750 + }, + { + "epoch": 1.7535074987905177, + "grad_norm": 0.3671897351741791, + "learning_rate": 0.0005482379654859218, + "loss": 3.8912, + "step": 5800 + }, + { + "epoch": 1.7686260280599904, + "grad_norm": 0.3275216817855835, + "learning_rate": 0.0005477838328792006, + "loss": 3.8837, + "step": 5850 + }, + { + "epoch": 1.783744557329463, + "grad_norm": 0.3239729404449463, + "learning_rate": 0.0005473297002724795, + "loss": 3.8796, + "step": 5900 + }, + { + "epoch": 1.7988630865989357, + "grad_norm": 0.3572208881378174, + "learning_rate": 0.0005468755676657584, + "loss": 3.8735, + "step": 5950 + }, + { + "epoch": 1.8139816158684083, + "grad_norm": 0.3316764831542969, + "learning_rate": 0.0005464214350590371, + "loss": 3.8666, + "step": 6000 + }, + { + "epoch": 1.8139816158684083, + "eval_accuracy": 0.33760983430616326, + "eval_loss": 3.853510856628418, + "eval_runtime": 182.1185, + "eval_samples_per_second": 91.391, + "eval_steps_per_second": 5.716, + "step": 6000 + }, + { + "epoch": 1.829100145137881, + "grad_norm": 0.3352029025554657, + "learning_rate": 0.000545967302452316, + "loss": 3.8716, + "step": 6050 + }, + { + "epoch": 1.8442186744073537, + "grad_norm": 0.33704429864883423, + "learning_rate": 0.0005455131698455948, + "loss": 3.8742, + "step": 6100 + }, + { + "epoch": 1.8593372036768263, + "grad_norm": 0.32570329308509827, + "learning_rate": 0.0005450590372388737, + "loss": 3.8545, + "step": 6150 + }, + { + "epoch": 1.874455732946299, + "grad_norm": 0.3355255424976349, + "learning_rate": 0.0005446049046321526, + "loss": 3.8584, + "step": 6200 + }, + { + "epoch": 1.8895742622157716, + "grad_norm": 0.3264237642288208, + "learning_rate": 0.0005441507720254314, + "loss": 3.8647, + "step": 6250 + }, + { + "epoch": 1.9046927914852443, + "grad_norm": 0.34331536293029785, + "learning_rate": 0.0005436966394187102, + "loss": 3.8587, + "step": 6300 + }, + { + "epoch": 1.919811320754717, + "grad_norm": 0.3500491976737976, + "learning_rate": 0.0005432425068119891, + "loss": 3.853, + "step": 6350 + }, + { + "epoch": 1.9349298500241896, + "grad_norm": 0.33102309703826904, + "learning_rate": 0.0005427883742052679, + "loss": 3.8585, + "step": 6400 + }, + { + "epoch": 1.9500483792936623, + "grad_norm": 0.3271898329257965, + "learning_rate": 0.0005423342415985467, + "loss": 3.8577, + "step": 6450 + }, + { + "epoch": 1.965166908563135, + "grad_norm": 0.328890323638916, + "learning_rate": 0.0005418801089918256, + "loss": 3.8379, + "step": 6500 + }, + { + "epoch": 1.9802854378326078, + "grad_norm": 0.31617918610572815, + "learning_rate": 0.0005414259763851044, + "loss": 3.8392, + "step": 6550 + }, + { + "epoch": 1.9954039671020802, + "grad_norm": 0.324520081281662, + "learning_rate": 0.0005409718437783832, + "loss": 3.8461, + "step": 6600 + }, + { + "epoch": 2.0102805999032416, + "grad_norm": 0.324241042137146, + "learning_rate": 0.000540517711171662, + "loss": 3.7744, + "step": 6650 + }, + { + "epoch": 2.025399129172714, + "grad_norm": 0.36148601770401, + "learning_rate": 0.000540063578564941, + "loss": 3.7479, + "step": 6700 + }, + { + "epoch": 2.040517658442187, + "grad_norm": 0.32816359400749207, + "learning_rate": 0.0005396094459582197, + "loss": 3.74, + "step": 6750 + }, + { + "epoch": 2.0556361877116593, + "grad_norm": 0.33710262179374695, + "learning_rate": 0.0005391553133514985, + "loss": 3.7454, + "step": 6800 + }, + { + "epoch": 2.0707547169811322, + "grad_norm": 0.3189140558242798, + "learning_rate": 0.0005387011807447775, + "loss": 3.7555, + "step": 6850 + }, + { + "epoch": 2.0858732462506047, + "grad_norm": 0.34328320622444153, + "learning_rate": 0.0005382470481380563, + "loss": 3.7461, + "step": 6900 + }, + { + "epoch": 2.1009917755200775, + "grad_norm": 0.3447844982147217, + "learning_rate": 0.0005377929155313351, + "loss": 3.7688, + "step": 6950 + }, + { + "epoch": 2.11611030478955, + "grad_norm": 0.3423329293727875, + "learning_rate": 0.000537338782924614, + "loss": 3.748, + "step": 7000 + }, + { + "epoch": 2.11611030478955, + "eval_accuracy": 0.3412945715762478, + "eval_loss": 3.8153622150421143, + "eval_runtime": 182.3197, + "eval_samples_per_second": 91.29, + "eval_steps_per_second": 5.71, + "step": 7000 + }, + { + "epoch": 2.131228834059023, + "grad_norm": 0.3441483974456787, + "learning_rate": 0.0005368846503178928, + "loss": 3.7481, + "step": 7050 + }, + { + "epoch": 2.1463473633284953, + "grad_norm": 0.3269549608230591, + "learning_rate": 0.0005364305177111716, + "loss": 3.7588, + "step": 7100 + }, + { + "epoch": 2.161465892597968, + "grad_norm": 0.330601304769516, + "learning_rate": 0.0005359763851044505, + "loss": 3.7418, + "step": 7150 + }, + { + "epoch": 2.1765844218674406, + "grad_norm": 0.31895920634269714, + "learning_rate": 0.0005355222524977293, + "loss": 3.7563, + "step": 7200 + }, + { + "epoch": 2.1917029511369135, + "grad_norm": 0.33730292320251465, + "learning_rate": 0.0005350681198910081, + "loss": 3.7622, + "step": 7250 + }, + { + "epoch": 2.206821480406386, + "grad_norm": 0.32292884588241577, + "learning_rate": 0.000534613987284287, + "loss": 3.7546, + "step": 7300 + }, + { + "epoch": 2.221940009675859, + "grad_norm": 0.33400362730026245, + "learning_rate": 0.0005341598546775658, + "loss": 3.7655, + "step": 7350 + }, + { + "epoch": 2.2370585389453312, + "grad_norm": 0.3238161504268646, + "learning_rate": 0.0005337057220708446, + "loss": 3.7483, + "step": 7400 + }, + { + "epoch": 2.252177068214804, + "grad_norm": 0.3304459750652313, + "learning_rate": 0.0005332515894641234, + "loss": 3.7577, + "step": 7450 + }, + { + "epoch": 2.2672955974842766, + "grad_norm": 0.34201282262802124, + "learning_rate": 0.0005327974568574023, + "loss": 3.7453, + "step": 7500 + }, + { + "epoch": 2.2824141267537494, + "grad_norm": 0.3679095208644867, + "learning_rate": 0.0005323433242506811, + "loss": 3.7635, + "step": 7550 + }, + { + "epoch": 2.297532656023222, + "grad_norm": 0.3323363959789276, + "learning_rate": 0.00053188919164396, + "loss": 3.7442, + "step": 7600 + }, + { + "epoch": 2.3126511852926948, + "grad_norm": 0.3315403163433075, + "learning_rate": 0.0005314350590372389, + "loss": 3.751, + "step": 7650 + }, + { + "epoch": 2.3277697145621676, + "grad_norm": 0.35335952043533325, + "learning_rate": 0.0005309809264305177, + "loss": 3.7569, + "step": 7700 + }, + { + "epoch": 2.34288824383164, + "grad_norm": 0.33651435375213623, + "learning_rate": 0.0005305267938237964, + "loss": 3.7466, + "step": 7750 + }, + { + "epoch": 2.3580067731011125, + "grad_norm": 0.31409308314323425, + "learning_rate": 0.0005300726612170754, + "loss": 3.7599, + "step": 7800 + }, + { + "epoch": 2.3731253023705854, + "grad_norm": 0.3148058354854584, + "learning_rate": 0.0005296185286103542, + "loss": 3.7423, + "step": 7850 + }, + { + "epoch": 2.3882438316400583, + "grad_norm": 0.34514766931533813, + "learning_rate": 0.000529164396003633, + "loss": 3.7253, + "step": 7900 + }, + { + "epoch": 2.4033623609095307, + "grad_norm": 0.32518208026885986, + "learning_rate": 0.0005287102633969119, + "loss": 3.7365, + "step": 7950 + }, + { + "epoch": 2.418480890179003, + "grad_norm": 0.3367462158203125, + "learning_rate": 0.0005282561307901907, + "loss": 3.7526, + "step": 8000 + }, + { + "epoch": 2.418480890179003, + "eval_accuracy": 0.3448259887850608, + "eval_loss": 3.7824463844299316, + "eval_runtime": 182.5201, + "eval_samples_per_second": 91.19, + "eval_steps_per_second": 5.703, + "step": 8000 + }, + { + "epoch": 2.433599419448476, + "grad_norm": 0.3492235541343689, + "learning_rate": 0.0005278019981834695, + "loss": 3.7511, + "step": 8050 + }, + { + "epoch": 2.448717948717949, + "grad_norm": 0.3320801556110382, + "learning_rate": 0.0005273478655767484, + "loss": 3.7335, + "step": 8100 + }, + { + "epoch": 2.4638364779874213, + "grad_norm": 0.32247304916381836, + "learning_rate": 0.0005268937329700272, + "loss": 3.7359, + "step": 8150 + }, + { + "epoch": 2.478955007256894, + "grad_norm": 0.3158959448337555, + "learning_rate": 0.000526439600363306, + "loss": 3.7384, + "step": 8200 + }, + { + "epoch": 2.4940735365263667, + "grad_norm": 0.3267774283885956, + "learning_rate": 0.0005259854677565848, + "loss": 3.7429, + "step": 8250 + }, + { + "epoch": 2.5091920657958395, + "grad_norm": 0.34227943420410156, + "learning_rate": 0.0005255313351498637, + "loss": 3.7212, + "step": 8300 + }, + { + "epoch": 2.524310595065312, + "grad_norm": 0.3359690010547638, + "learning_rate": 0.0005250772025431426, + "loss": 3.7497, + "step": 8350 + }, + { + "epoch": 2.539429124334785, + "grad_norm": 0.3244568407535553, + "learning_rate": 0.0005246230699364214, + "loss": 3.738, + "step": 8400 + }, + { + "epoch": 2.5545476536042573, + "grad_norm": 0.3494398891925812, + "learning_rate": 0.0005241689373297002, + "loss": 3.7482, + "step": 8450 + }, + { + "epoch": 2.56966618287373, + "grad_norm": 0.33279120922088623, + "learning_rate": 0.000523714804722979, + "loss": 3.7474, + "step": 8500 + }, + { + "epoch": 2.5847847121432026, + "grad_norm": 0.3192063868045807, + "learning_rate": 0.0005232606721162579, + "loss": 3.7344, + "step": 8550 + }, + { + "epoch": 2.5999032414126755, + "grad_norm": 0.32627859711647034, + "learning_rate": 0.0005228065395095368, + "loss": 3.7326, + "step": 8600 + }, + { + "epoch": 2.615021770682148, + "grad_norm": 0.32838842272758484, + "learning_rate": 0.0005223524069028156, + "loss": 3.7382, + "step": 8650 + }, + { + "epoch": 2.630140299951621, + "grad_norm": 0.30039238929748535, + "learning_rate": 0.0005218982742960944, + "loss": 3.7325, + "step": 8700 + }, + { + "epoch": 2.6452588292210932, + "grad_norm": 0.35245180130004883, + "learning_rate": 0.0005214441416893733, + "loss": 3.7375, + "step": 8750 + }, + { + "epoch": 2.660377358490566, + "grad_norm": 0.3139996826648712, + "learning_rate": 0.0005209900090826521, + "loss": 3.7314, + "step": 8800 + }, + { + "epoch": 2.6754958877600385, + "grad_norm": 0.32654982805252075, + "learning_rate": 0.0005205358764759309, + "loss": 3.7347, + "step": 8850 + }, + { + "epoch": 2.6906144170295114, + "grad_norm": 0.3103334307670593, + "learning_rate": 0.0005200817438692098, + "loss": 3.7339, + "step": 8900 + }, + { + "epoch": 2.7057329462989843, + "grad_norm": 0.3158484101295471, + "learning_rate": 0.0005196276112624886, + "loss": 3.7294, + "step": 8950 + }, + { + "epoch": 2.7208514755684567, + "grad_norm": 0.3339202404022217, + "learning_rate": 0.0005191734786557674, + "loss": 3.7241, + "step": 9000 + }, + { + "epoch": 2.7208514755684567, + "eval_accuracy": 0.3476167901457528, + "eval_loss": 3.7497520446777344, + "eval_runtime": 182.7073, + "eval_samples_per_second": 91.097, + "eval_steps_per_second": 5.698, + "step": 9000 + }, + { + "epoch": 2.735970004837929, + "grad_norm": 0.33506032824516296, + "learning_rate": 0.0005187193460490462, + "loss": 3.7369, + "step": 9050 + }, + { + "epoch": 2.751088534107402, + "grad_norm": 0.34870266914367676, + "learning_rate": 0.0005182652134423252, + "loss": 3.7235, + "step": 9100 + }, + { + "epoch": 2.766207063376875, + "grad_norm": 0.33400285243988037, + "learning_rate": 0.000517811080835604, + "loss": 3.7238, + "step": 9150 + }, + { + "epoch": 2.7813255926463474, + "grad_norm": 0.32307007908821106, + "learning_rate": 0.0005173569482288827, + "loss": 3.7244, + "step": 9200 + }, + { + "epoch": 2.79644412191582, + "grad_norm": 0.3319950997829437, + "learning_rate": 0.0005169028156221616, + "loss": 3.7157, + "step": 9250 + }, + { + "epoch": 2.8115626511852927, + "grad_norm": 0.3344840705394745, + "learning_rate": 0.0005164486830154405, + "loss": 3.7192, + "step": 9300 + }, + { + "epoch": 2.8266811804547656, + "grad_norm": 0.30677810311317444, + "learning_rate": 0.0005159945504087193, + "loss": 3.7245, + "step": 9350 + }, + { + "epoch": 2.841799709724238, + "grad_norm": 0.3162977695465088, + "learning_rate": 0.0005155404178019982, + "loss": 3.7228, + "step": 9400 + }, + { + "epoch": 2.8569182389937104, + "grad_norm": 0.3243446946144104, + "learning_rate": 0.000515086285195277, + "loss": 3.716, + "step": 9450 + }, + { + "epoch": 2.8720367682631833, + "grad_norm": 0.3386097252368927, + "learning_rate": 0.0005146321525885558, + "loss": 3.7195, + "step": 9500 + }, + { + "epoch": 2.887155297532656, + "grad_norm": 0.3524998128414154, + "learning_rate": 0.0005141780199818347, + "loss": 3.7071, + "step": 9550 + }, + { + "epoch": 2.9022738268021286, + "grad_norm": 0.3099507689476013, + "learning_rate": 0.0005137238873751135, + "loss": 3.7196, + "step": 9600 + }, + { + "epoch": 2.917392356071601, + "grad_norm": 0.31758880615234375, + "learning_rate": 0.0005132697547683923, + "loss": 3.7128, + "step": 9650 + }, + { + "epoch": 2.932510885341074, + "grad_norm": 0.3277053236961365, + "learning_rate": 0.0005128156221616712, + "loss": 3.7313, + "step": 9700 + }, + { + "epoch": 2.947629414610547, + "grad_norm": 0.30629098415374756, + "learning_rate": 0.00051236148955495, + "loss": 3.7135, + "step": 9750 + }, + { + "epoch": 2.9627479438800193, + "grad_norm": 0.3403231203556061, + "learning_rate": 0.0005119073569482288, + "loss": 3.7087, + "step": 9800 + }, + { + "epoch": 2.977866473149492, + "grad_norm": 0.3153614103794098, + "learning_rate": 0.0005114532243415078, + "loss": 3.7156, + "step": 9850 + }, + { + "epoch": 2.9929850024189646, + "grad_norm": 0.3428856134414673, + "learning_rate": 0.0005109990917347865, + "loss": 3.7188, + "step": 9900 + }, + { + "epoch": 3.007861635220126, + "grad_norm": 0.32290196418762207, + "learning_rate": 0.0005105449591280653, + "loss": 3.657, + "step": 9950 + }, + { + "epoch": 3.0229801644895984, + "grad_norm": 0.3294488787651062, + "learning_rate": 0.0005100908265213441, + "loss": 3.5917, + "step": 10000 + }, + { + "epoch": 3.0229801644895984, + "eval_accuracy": 0.35003781267768785, + "eval_loss": 3.730295181274414, + "eval_runtime": 182.2568, + "eval_samples_per_second": 91.322, + "eval_steps_per_second": 5.712, + "step": 10000 + }, + { + "epoch": 3.0380986937590713, + "grad_norm": 0.3343505561351776, + "learning_rate": 0.0005096366939146231, + "loss": 3.5981, + "step": 10050 + }, + { + "epoch": 3.0532172230285437, + "grad_norm": 0.32677552103996277, + "learning_rate": 0.0005091825613079019, + "loss": 3.6168, + "step": 10100 + }, + { + "epoch": 3.0683357522980166, + "grad_norm": 0.32819923758506775, + "learning_rate": 0.0005087284287011807, + "loss": 3.6285, + "step": 10150 + }, + { + "epoch": 3.083454281567489, + "grad_norm": 0.34550225734710693, + "learning_rate": 0.0005082742960944595, + "loss": 3.6147, + "step": 10200 + }, + { + "epoch": 3.098572810836962, + "grad_norm": 0.3343297839164734, + "learning_rate": 0.0005078201634877384, + "loss": 3.6216, + "step": 10250 + }, + { + "epoch": 3.1136913401064343, + "grad_norm": 0.3187820315361023, + "learning_rate": 0.0005073660308810172, + "loss": 3.6035, + "step": 10300 + }, + { + "epoch": 3.128809869375907, + "grad_norm": 0.31732386350631714, + "learning_rate": 0.0005069118982742961, + "loss": 3.6181, + "step": 10350 + }, + { + "epoch": 3.1439283986453797, + "grad_norm": 0.33218568563461304, + "learning_rate": 0.0005064577656675749, + "loss": 3.6207, + "step": 10400 + }, + { + "epoch": 3.1590469279148525, + "grad_norm": 0.3314531147480011, + "learning_rate": 0.0005060036330608537, + "loss": 3.6108, + "step": 10450 + }, + { + "epoch": 3.174165457184325, + "grad_norm": 0.3125026226043701, + "learning_rate": 0.0005055495004541326, + "loss": 3.6393, + "step": 10500 + }, + { + "epoch": 3.189283986453798, + "grad_norm": 0.3300221562385559, + "learning_rate": 0.0005050953678474114, + "loss": 3.6234, + "step": 10550 + }, + { + "epoch": 3.2044025157232703, + "grad_norm": 0.3192563056945801, + "learning_rate": 0.0005046412352406902, + "loss": 3.6309, + "step": 10600 + }, + { + "epoch": 3.219521044992743, + "grad_norm": 0.3273380994796753, + "learning_rate": 0.000504187102633969, + "loss": 3.6277, + "step": 10650 + }, + { + "epoch": 3.2346395742622156, + "grad_norm": 0.3166305422782898, + "learning_rate": 0.0005037329700272479, + "loss": 3.6222, + "step": 10700 + }, + { + "epoch": 3.2497581035316885, + "grad_norm": 0.34870728850364685, + "learning_rate": 0.0005032788374205267, + "loss": 3.6306, + "step": 10750 + }, + { + "epoch": 3.264876632801161, + "grad_norm": 0.3375926911830902, + "learning_rate": 0.0005028247048138056, + "loss": 3.6342, + "step": 10800 + }, + { + "epoch": 3.279995162070634, + "grad_norm": 0.3100663721561432, + "learning_rate": 0.0005023705722070845, + "loss": 3.6279, + "step": 10850 + }, + { + "epoch": 3.2951136913401067, + "grad_norm": 0.31706514954566956, + "learning_rate": 0.0005019164396003632, + "loss": 3.6229, + "step": 10900 + }, + { + "epoch": 3.310232220609579, + "grad_norm": 0.32962343096733093, + "learning_rate": 0.000501462306993642, + "loss": 3.6332, + "step": 10950 + }, + { + "epoch": 3.3253507498790515, + "grad_norm": 0.33387279510498047, + "learning_rate": 0.000501008174386921, + "loss": 3.6298, + "step": 11000 + }, + { + "epoch": 3.3253507498790515, + "eval_accuracy": 0.3517179842080337, + "eval_loss": 3.7141380310058594, + "eval_runtime": 182.4417, + "eval_samples_per_second": 91.229, + "eval_steps_per_second": 5.706, + "step": 11000 + }, + { + "epoch": 3.3404692791485244, + "grad_norm": 0.31837111711502075, + "learning_rate": 0.0005005540417801998, + "loss": 3.637, + "step": 11050 + }, + { + "epoch": 3.3555878084179973, + "grad_norm": 0.33772799372673035, + "learning_rate": 0.0005000999091734786, + "loss": 3.6324, + "step": 11100 + }, + { + "epoch": 3.3707063376874697, + "grad_norm": 0.3386647403240204, + "learning_rate": 0.0004996457765667575, + "loss": 3.629, + "step": 11150 + }, + { + "epoch": 3.385824866956942, + "grad_norm": 0.31823232769966125, + "learning_rate": 0.0004991916439600363, + "loss": 3.6385, + "step": 11200 + }, + { + "epoch": 3.400943396226415, + "grad_norm": 0.32417285442352295, + "learning_rate": 0.0004987375113533151, + "loss": 3.6336, + "step": 11250 + }, + { + "epoch": 3.416061925495888, + "grad_norm": 0.34577304124832153, + "learning_rate": 0.000498283378746594, + "loss": 3.6371, + "step": 11300 + }, + { + "epoch": 3.4311804547653604, + "grad_norm": 0.35057663917541504, + "learning_rate": 0.0004978292461398728, + "loss": 3.6232, + "step": 11350 + }, + { + "epoch": 3.4462989840348333, + "grad_norm": 0.33663854002952576, + "learning_rate": 0.0004973751135331516, + "loss": 3.6322, + "step": 11400 + }, + { + "epoch": 3.4614175133043057, + "grad_norm": 0.32272565364837646, + "learning_rate": 0.0004969209809264304, + "loss": 3.6351, + "step": 11450 + }, + { + "epoch": 3.4765360425737786, + "grad_norm": 0.34051501750946045, + "learning_rate": 0.0004964668483197093, + "loss": 3.6287, + "step": 11500 + }, + { + "epoch": 3.491654571843251, + "grad_norm": 0.3307649493217468, + "learning_rate": 0.0004960127157129882, + "loss": 3.6389, + "step": 11550 + }, + { + "epoch": 3.506773101112724, + "grad_norm": 0.3179891109466553, + "learning_rate": 0.0004955585831062669, + "loss": 3.6281, + "step": 11600 + }, + { + "epoch": 3.5218916303821963, + "grad_norm": 0.32227689027786255, + "learning_rate": 0.0004951044504995458, + "loss": 3.6423, + "step": 11650 + }, + { + "epoch": 3.537010159651669, + "grad_norm": 0.35890889167785645, + "learning_rate": 0.0004946503178928246, + "loss": 3.6468, + "step": 11700 + }, + { + "epoch": 3.5521286889211416, + "grad_norm": 0.36508727073669434, + "learning_rate": 0.0004941961852861035, + "loss": 3.6366, + "step": 11750 + }, + { + "epoch": 3.5672472181906145, + "grad_norm": 0.32733410596847534, + "learning_rate": 0.0004937420526793824, + "loss": 3.6355, + "step": 11800 + }, + { + "epoch": 3.582365747460087, + "grad_norm": 0.3346957564353943, + "learning_rate": 0.0004932879200726612, + "loss": 3.6388, + "step": 11850 + }, + { + "epoch": 3.59748427672956, + "grad_norm": 0.32891422510147095, + "learning_rate": 0.00049283378746594, + "loss": 3.629, + "step": 11900 + }, + { + "epoch": 3.6126028059990323, + "grad_norm": 0.3372475206851959, + "learning_rate": 0.0004923796548592189, + "loss": 3.6244, + "step": 11950 + }, + { + "epoch": 3.627721335268505, + "grad_norm": 0.3311806917190552, + "learning_rate": 0.0004919255222524977, + "loss": 3.6253, + "step": 12000 + }, + { + "epoch": 3.627721335268505, + "eval_accuracy": 0.35409750215283, + "eval_loss": 3.6940906047821045, + "eval_runtime": 182.4197, + "eval_samples_per_second": 91.24, + "eval_steps_per_second": 5.707, + "step": 12000 + }, + { + "epoch": 3.6428398645379776, + "grad_norm": 0.31895172595977783, + "learning_rate": 0.0004914713896457765, + "loss": 3.6391, + "step": 12050 + }, + { + "epoch": 3.6579583938074505, + "grad_norm": 0.3537571430206299, + "learning_rate": 0.0004910172570390554, + "loss": 3.6328, + "step": 12100 + }, + { + "epoch": 3.6730769230769234, + "grad_norm": 0.336915522813797, + "learning_rate": 0.0004905631244323342, + "loss": 3.6389, + "step": 12150 + }, + { + "epoch": 3.688195452346396, + "grad_norm": 0.33801692724227905, + "learning_rate": 0.000490108991825613, + "loss": 3.6309, + "step": 12200 + }, + { + "epoch": 3.7033139816158682, + "grad_norm": 0.3218366801738739, + "learning_rate": 0.0004896548592188918, + "loss": 3.625, + "step": 12250 + }, + { + "epoch": 3.718432510885341, + "grad_norm": 0.33120447397232056, + "learning_rate": 0.0004892007266121708, + "loss": 3.6343, + "step": 12300 + }, + { + "epoch": 3.733551040154814, + "grad_norm": 0.32918858528137207, + "learning_rate": 0.0004887465940054495, + "loss": 3.6272, + "step": 12350 + }, + { + "epoch": 3.7486695694242864, + "grad_norm": 0.3185270130634308, + "learning_rate": 0.0004882924613987283, + "loss": 3.6325, + "step": 12400 + }, + { + "epoch": 3.763788098693759, + "grad_norm": 0.31657835841178894, + "learning_rate": 0.00048783832879200717, + "loss": 3.6286, + "step": 12450 + }, + { + "epoch": 3.7789066279632317, + "grad_norm": 0.31327733397483826, + "learning_rate": 0.00048738419618528606, + "loss": 3.6213, + "step": 12500 + }, + { + "epoch": 3.7940251572327046, + "grad_norm": 0.32215988636016846, + "learning_rate": 0.0004869300635785649, + "loss": 3.6203, + "step": 12550 + }, + { + "epoch": 3.809143686502177, + "grad_norm": 0.3287181556224823, + "learning_rate": 0.0004864759309718437, + "loss": 3.6269, + "step": 12600 + }, + { + "epoch": 3.8242622157716495, + "grad_norm": 0.3348924219608307, + "learning_rate": 0.0004860217983651226, + "loss": 3.6146, + "step": 12650 + }, + { + "epoch": 3.8393807450411224, + "grad_norm": 0.3488834500312805, + "learning_rate": 0.00048556766575840143, + "loss": 3.6344, + "step": 12700 + }, + { + "epoch": 3.8544992743105952, + "grad_norm": 0.32925039529800415, + "learning_rate": 0.00048511353315168026, + "loss": 3.6386, + "step": 12750 + }, + { + "epoch": 3.8696178035800677, + "grad_norm": 0.33648958802223206, + "learning_rate": 0.00048465940054495904, + "loss": 3.6341, + "step": 12800 + }, + { + "epoch": 3.8847363328495406, + "grad_norm": 0.3223407566547394, + "learning_rate": 0.0004842052679382379, + "loss": 3.6252, + "step": 12850 + }, + { + "epoch": 3.899854862119013, + "grad_norm": 0.3572065830230713, + "learning_rate": 0.00048375113533151676, + "loss": 3.6246, + "step": 12900 + }, + { + "epoch": 3.914973391388486, + "grad_norm": 0.34701117873191833, + "learning_rate": 0.0004832970027247956, + "loss": 3.6167, + "step": 12950 + }, + { + "epoch": 3.9300919206579583, + "grad_norm": 0.33134695887565613, + "learning_rate": 0.0004828428701180744, + "loss": 3.6323, + "step": 13000 + }, + { + "epoch": 3.9300919206579583, + "eval_accuracy": 0.3554904337217598, + "eval_loss": 3.676191568374634, + "eval_runtime": 182.5829, + "eval_samples_per_second": 91.159, + "eval_steps_per_second": 5.702, + "step": 13000 + }, + { + "epoch": 3.945210449927431, + "grad_norm": 0.3298724293708801, + "learning_rate": 0.0004823887375113533, + "loss": 3.6291, + "step": 13050 + }, + { + "epoch": 3.9603289791969036, + "grad_norm": 0.3415639102458954, + "learning_rate": 0.00048193460490463213, + "loss": 3.6256, + "step": 13100 + }, + { + "epoch": 3.9754475084663765, + "grad_norm": 0.3238424062728882, + "learning_rate": 0.0004814804722979109, + "loss": 3.6304, + "step": 13150 + }, + { + "epoch": 3.990566037735849, + "grad_norm": 0.31136244535446167, + "learning_rate": 0.00048102633969118974, + "loss": 3.6211, + "step": 13200 + }, + { + "epoch": 4.00544267053701, + "grad_norm": 0.3414509892463684, + "learning_rate": 0.0004805722070844686, + "loss": 3.5841, + "step": 13250 + }, + { + "epoch": 4.020561199806483, + "grad_norm": 0.3479083776473999, + "learning_rate": 0.00048011807447774746, + "loss": 3.5103, + "step": 13300 + }, + { + "epoch": 4.035679729075955, + "grad_norm": 0.32956740260124207, + "learning_rate": 0.0004796639418710263, + "loss": 3.5115, + "step": 13350 + }, + { + "epoch": 4.050798258345428, + "grad_norm": 0.32761216163635254, + "learning_rate": 0.00047920980926430517, + "loss": 3.5252, + "step": 13400 + }, + { + "epoch": 4.065916787614901, + "grad_norm": 0.32798823714256287, + "learning_rate": 0.000478755676657584, + "loss": 3.529, + "step": 13450 + }, + { + "epoch": 4.081035316884374, + "grad_norm": 0.3446704149246216, + "learning_rate": 0.00047830154405086283, + "loss": 3.5224, + "step": 13500 + }, + { + "epoch": 4.096153846153846, + "grad_norm": 0.3277835547924042, + "learning_rate": 0.0004778474114441416, + "loss": 3.5262, + "step": 13550 + }, + { + "epoch": 4.111272375423319, + "grad_norm": 0.3266860544681549, + "learning_rate": 0.0004773932788374205, + "loss": 3.5234, + "step": 13600 + }, + { + "epoch": 4.126390904692792, + "grad_norm": 0.33199459314346313, + "learning_rate": 0.0004769391462306993, + "loss": 3.5329, + "step": 13650 + }, + { + "epoch": 4.1415094339622645, + "grad_norm": 0.3167458176612854, + "learning_rate": 0.00047648501362397816, + "loss": 3.5315, + "step": 13700 + }, + { + "epoch": 4.1566279632317364, + "grad_norm": 0.33163997530937195, + "learning_rate": 0.000476030881017257, + "loss": 3.539, + "step": 13750 + }, + { + "epoch": 4.171746492501209, + "grad_norm": 0.3242467939853668, + "learning_rate": 0.00047557674841053587, + "loss": 3.5322, + "step": 13800 + }, + { + "epoch": 4.186865021770682, + "grad_norm": 0.3438900113105774, + "learning_rate": 0.0004751226158038147, + "loss": 3.5399, + "step": 13850 + }, + { + "epoch": 4.201983551040155, + "grad_norm": 0.34759825468063354, + "learning_rate": 0.0004746684831970935, + "loss": 3.5452, + "step": 13900 + }, + { + "epoch": 4.217102080309627, + "grad_norm": 0.3428172171115875, + "learning_rate": 0.0004742143505903723, + "loss": 3.5468, + "step": 13950 + }, + { + "epoch": 4.2322206095791, + "grad_norm": 0.3243090808391571, + "learning_rate": 0.0004737602179836512, + "loss": 3.5399, + "step": 14000 + }, + { + "epoch": 4.2322206095791, + "eval_accuracy": 0.35642869605990957, + "eval_loss": 3.6728808879852295, + "eval_runtime": 182.5974, + "eval_samples_per_second": 91.151, + "eval_steps_per_second": 5.701, + "step": 14000 + }, + { + "epoch": 4.247339138848573, + "grad_norm": 0.3331063985824585, + "learning_rate": 0.00047330608537693, + "loss": 3.5417, + "step": 14050 + }, + { + "epoch": 4.262457668118046, + "grad_norm": 0.32661089301109314, + "learning_rate": 0.00047285195277020886, + "loss": 3.5406, + "step": 14100 + }, + { + "epoch": 4.277576197387518, + "grad_norm": 0.33398354053497314, + "learning_rate": 0.00047239782016348774, + "loss": 3.5467, + "step": 14150 + }, + { + "epoch": 4.292694726656991, + "grad_norm": 0.3145262897014618, + "learning_rate": 0.00047194368755676657, + "loss": 3.5434, + "step": 14200 + }, + { + "epoch": 4.3078132559264635, + "grad_norm": 0.3353433609008789, + "learning_rate": 0.00047148955495004535, + "loss": 3.5485, + "step": 14250 + }, + { + "epoch": 4.322931785195936, + "grad_norm": 0.33363988995552063, + "learning_rate": 0.0004710354223433242, + "loss": 3.5426, + "step": 14300 + }, + { + "epoch": 4.338050314465409, + "grad_norm": 0.33351442217826843, + "learning_rate": 0.00047058128973660306, + "loss": 3.5549, + "step": 14350 + }, + { + "epoch": 4.353168843734881, + "grad_norm": 0.3099449872970581, + "learning_rate": 0.0004701271571298819, + "loss": 3.5518, + "step": 14400 + }, + { + "epoch": 4.368287373004354, + "grad_norm": 0.3316916823387146, + "learning_rate": 0.0004696730245231607, + "loss": 3.557, + "step": 14450 + }, + { + "epoch": 4.383405902273827, + "grad_norm": 0.32294154167175293, + "learning_rate": 0.00046921889191643956, + "loss": 3.5462, + "step": 14500 + }, + { + "epoch": 4.3985244315433, + "grad_norm": 0.32864052057266235, + "learning_rate": 0.00046876475930971844, + "loss": 3.5644, + "step": 14550 + }, + { + "epoch": 4.413642960812772, + "grad_norm": 0.3232649564743042, + "learning_rate": 0.0004683106267029972, + "loss": 3.5485, + "step": 14600 + }, + { + "epoch": 4.428761490082245, + "grad_norm": 0.33318766951560974, + "learning_rate": 0.00046785649409627605, + "loss": 3.5564, + "step": 14650 + }, + { + "epoch": 4.443880019351718, + "grad_norm": 0.3175857663154602, + "learning_rate": 0.0004674023614895549, + "loss": 3.5574, + "step": 14700 + }, + { + "epoch": 4.4589985486211905, + "grad_norm": 0.3410588502883911, + "learning_rate": 0.00046694822888283376, + "loss": 3.5549, + "step": 14750 + }, + { + "epoch": 4.4741170778906625, + "grad_norm": 0.33438578248023987, + "learning_rate": 0.0004664940962761126, + "loss": 3.5644, + "step": 14800 + }, + { + "epoch": 4.489235607160135, + "grad_norm": 0.3205425441265106, + "learning_rate": 0.0004660399636693914, + "loss": 3.5554, + "step": 14850 + }, + { + "epoch": 4.504354136429608, + "grad_norm": 0.3268115222454071, + "learning_rate": 0.0004655858310626703, + "loss": 3.568, + "step": 14900 + }, + { + "epoch": 4.519472665699081, + "grad_norm": 0.32924684882164, + "learning_rate": 0.0004651316984559491, + "loss": 3.5596, + "step": 14950 + }, + { + "epoch": 4.534591194968553, + "grad_norm": 0.3317072093486786, + "learning_rate": 0.0004646775658492279, + "loss": 3.5538, + "step": 15000 + }, + { + "epoch": 4.534591194968553, + "eval_accuracy": 0.35780751842074693, + "eval_loss": 3.6592884063720703, + "eval_runtime": 182.5425, + "eval_samples_per_second": 91.179, + "eval_steps_per_second": 5.703, + "step": 15000 + }, + { + "epoch": 4.549709724238026, + "grad_norm": 0.3319750428199768, + "learning_rate": 0.00046422343324250675, + "loss": 3.5586, + "step": 15050 + }, + { + "epoch": 4.564828253507499, + "grad_norm": 0.33184248208999634, + "learning_rate": 0.00046376930063578563, + "loss": 3.5611, + "step": 15100 + }, + { + "epoch": 4.579946782776972, + "grad_norm": 0.3535155653953552, + "learning_rate": 0.00046331516802906446, + "loss": 3.56, + "step": 15150 + }, + { + "epoch": 4.595065312046444, + "grad_norm": 0.322519987821579, + "learning_rate": 0.0004628610354223433, + "loss": 3.5594, + "step": 15200 + }, + { + "epoch": 4.610183841315917, + "grad_norm": 0.31665462255477905, + "learning_rate": 0.00046240690281562207, + "loss": 3.5673, + "step": 15250 + }, + { + "epoch": 4.6253023705853895, + "grad_norm": 0.341170072555542, + "learning_rate": 0.00046195277020890096, + "loss": 3.5536, + "step": 15300 + }, + { + "epoch": 4.640420899854862, + "grad_norm": 0.3396996557712555, + "learning_rate": 0.0004614986376021798, + "loss": 3.5653, + "step": 15350 + }, + { + "epoch": 4.655539429124335, + "grad_norm": 0.33313286304473877, + "learning_rate": 0.0004610445049954586, + "loss": 3.5667, + "step": 15400 + }, + { + "epoch": 4.670657958393807, + "grad_norm": 0.3346698582172394, + "learning_rate": 0.00046059037238873745, + "loss": 3.5552, + "step": 15450 + }, + { + "epoch": 4.68577648766328, + "grad_norm": 0.32503342628479004, + "learning_rate": 0.00046013623978201633, + "loss": 3.5628, + "step": 15500 + }, + { + "epoch": 4.700895016932753, + "grad_norm": 0.3317323327064514, + "learning_rate": 0.00045968210717529516, + "loss": 3.5598, + "step": 15550 + }, + { + "epoch": 4.716013546202225, + "grad_norm": 0.31406593322753906, + "learning_rate": 0.00045922797456857394, + "loss": 3.5652, + "step": 15600 + }, + { + "epoch": 4.731132075471698, + "grad_norm": 0.35221582651138306, + "learning_rate": 0.0004587738419618528, + "loss": 3.5625, + "step": 15650 + }, + { + "epoch": 4.746250604741171, + "grad_norm": 0.3168243169784546, + "learning_rate": 0.00045831970935513166, + "loss": 3.5446, + "step": 15700 + }, + { + "epoch": 4.761369134010644, + "grad_norm": 0.3266247510910034, + "learning_rate": 0.0004578655767484105, + "loss": 3.5824, + "step": 15750 + }, + { + "epoch": 4.7764876632801165, + "grad_norm": 0.3345303535461426, + "learning_rate": 0.0004574114441416893, + "loss": 3.5577, + "step": 15800 + }, + { + "epoch": 4.7916061925495885, + "grad_norm": 0.36775925755500793, + "learning_rate": 0.0004569573115349682, + "loss": 3.5736, + "step": 15850 + }, + { + "epoch": 4.806724721819061, + "grad_norm": 0.31279629468917847, + "learning_rate": 0.00045650317892824703, + "loss": 3.5719, + "step": 15900 + }, + { + "epoch": 4.821843251088534, + "grad_norm": 0.32946261763572693, + "learning_rate": 0.0004560490463215258, + "loss": 3.5472, + "step": 15950 + }, + { + "epoch": 4.836961780358006, + "grad_norm": 0.3353503942489624, + "learning_rate": 0.00045559491371480464, + "loss": 3.5512, + "step": 16000 + }, + { + "epoch": 4.836961780358006, + "eval_accuracy": 0.3589908106727694, + "eval_loss": 3.6440577507019043, + "eval_runtime": 182.5579, + "eval_samples_per_second": 91.171, + "eval_steps_per_second": 5.702, + "step": 16000 + }, + { + "epoch": 4.852080309627479, + "grad_norm": 0.3155944347381592, + "learning_rate": 0.0004551407811080835, + "loss": 3.5697, + "step": 16050 + }, + { + "epoch": 4.867198838896952, + "grad_norm": 0.34022998809814453, + "learning_rate": 0.00045468664850136235, + "loss": 3.5635, + "step": 16100 + }, + { + "epoch": 4.882317368166425, + "grad_norm": 0.349436491727829, + "learning_rate": 0.0004542325158946412, + "loss": 3.5606, + "step": 16150 + }, + { + "epoch": 4.897435897435898, + "grad_norm": 0.32045167684555054, + "learning_rate": 0.00045377838328792, + "loss": 3.5483, + "step": 16200 + }, + { + "epoch": 4.91255442670537, + "grad_norm": 0.3273170292377472, + "learning_rate": 0.0004533242506811989, + "loss": 3.561, + "step": 16250 + }, + { + "epoch": 4.927672955974843, + "grad_norm": 0.3492163121700287, + "learning_rate": 0.00045287011807447773, + "loss": 3.5538, + "step": 16300 + }, + { + "epoch": 4.9427914852443156, + "grad_norm": 0.3319571018218994, + "learning_rate": 0.0004524159854677565, + "loss": 3.5678, + "step": 16350 + }, + { + "epoch": 4.957910014513788, + "grad_norm": 0.3193274438381195, + "learning_rate": 0.0004519618528610354, + "loss": 3.5604, + "step": 16400 + }, + { + "epoch": 4.97302854378326, + "grad_norm": 0.3205724060535431, + "learning_rate": 0.0004515077202543142, + "loss": 3.5501, + "step": 16450 + }, + { + "epoch": 4.988147073052733, + "grad_norm": 0.32597634196281433, + "learning_rate": 0.00045105358764759305, + "loss": 3.5648, + "step": 16500 + }, + { + "epoch": 5.003023705853894, + "grad_norm": 0.3266983926296234, + "learning_rate": 0.0004505994550408719, + "loss": 3.5279, + "step": 16550 + }, + { + "epoch": 5.018142235123367, + "grad_norm": 0.32244426012039185, + "learning_rate": 0.00045014532243415077, + "loss": 3.4542, + "step": 16600 + }, + { + "epoch": 5.03326076439284, + "grad_norm": 0.32874855399131775, + "learning_rate": 0.0004496911898274296, + "loss": 3.4671, + "step": 16650 + }, + { + "epoch": 5.048379293662313, + "grad_norm": 0.34664833545684814, + "learning_rate": 0.0004492370572207084, + "loss": 3.4582, + "step": 16700 + }, + { + "epoch": 5.063497822931785, + "grad_norm": 0.3226701021194458, + "learning_rate": 0.0004487829246139872, + "loss": 3.4471, + "step": 16750 + }, + { + "epoch": 5.078616352201258, + "grad_norm": 0.3484433889389038, + "learning_rate": 0.0004483287920072661, + "loss": 3.4685, + "step": 16800 + }, + { + "epoch": 5.093734881470731, + "grad_norm": 0.338821142911911, + "learning_rate": 0.0004478746594005449, + "loss": 3.4738, + "step": 16850 + }, + { + "epoch": 5.1088534107402035, + "grad_norm": 0.3581444323062897, + "learning_rate": 0.00044742052679382375, + "loss": 3.4712, + "step": 16900 + }, + { + "epoch": 5.1239719400096755, + "grad_norm": 0.34923458099365234, + "learning_rate": 0.00044696639418710264, + "loss": 3.4607, + "step": 16950 + }, + { + "epoch": 5.139090469279148, + "grad_norm": 0.33646735548973083, + "learning_rate": 0.00044651226158038147, + "loss": 3.4722, + "step": 17000 + }, + { + "epoch": 5.139090469279148, + "eval_accuracy": 0.35967545999545686, + "eval_loss": 3.646660327911377, + "eval_runtime": 182.5899, + "eval_samples_per_second": 91.155, + "eval_steps_per_second": 5.701, + "step": 17000 + }, + { + "epoch": 5.154208998548621, + "grad_norm": 0.33612239360809326, + "learning_rate": 0.00044605812897366025, + "loss": 3.4686, + "step": 17050 + }, + { + "epoch": 5.169327527818094, + "grad_norm": 0.3267473876476288, + "learning_rate": 0.0004456039963669391, + "loss": 3.474, + "step": 17100 + }, + { + "epoch": 5.184446057087566, + "grad_norm": 0.33728793263435364, + "learning_rate": 0.00044514986376021796, + "loss": 3.4742, + "step": 17150 + }, + { + "epoch": 5.199564586357039, + "grad_norm": 0.33321622014045715, + "learning_rate": 0.0004446957311534968, + "loss": 3.4825, + "step": 17200 + }, + { + "epoch": 5.214683115626512, + "grad_norm": 0.3262038230895996, + "learning_rate": 0.0004442415985467756, + "loss": 3.4769, + "step": 17250 + }, + { + "epoch": 5.229801644895985, + "grad_norm": 0.3380536139011383, + "learning_rate": 0.00044378746594005445, + "loss": 3.4779, + "step": 17300 + }, + { + "epoch": 5.244920174165458, + "grad_norm": 0.32373255491256714, + "learning_rate": 0.00044333333333333334, + "loss": 3.4721, + "step": 17350 + }, + { + "epoch": 5.26003870343493, + "grad_norm": 0.34807828068733215, + "learning_rate": 0.0004428792007266121, + "loss": 3.4817, + "step": 17400 + }, + { + "epoch": 5.2751572327044025, + "grad_norm": 0.34097328782081604, + "learning_rate": 0.00044242506811989095, + "loss": 3.4845, + "step": 17450 + }, + { + "epoch": 5.290275761973875, + "grad_norm": 0.33616551756858826, + "learning_rate": 0.0004419709355131698, + "loss": 3.5016, + "step": 17500 + }, + { + "epoch": 5.305394291243347, + "grad_norm": 0.3330867886543274, + "learning_rate": 0.00044151680290644866, + "loss": 3.4844, + "step": 17550 + }, + { + "epoch": 5.32051282051282, + "grad_norm": 0.3476291298866272, + "learning_rate": 0.0004410626702997275, + "loss": 3.4877, + "step": 17600 + }, + { + "epoch": 5.335631349782293, + "grad_norm": 0.3363373577594757, + "learning_rate": 0.0004406085376930063, + "loss": 3.4931, + "step": 17650 + }, + { + "epoch": 5.350749879051766, + "grad_norm": 0.35095226764678955, + "learning_rate": 0.0004401544050862852, + "loss": 3.4904, + "step": 17700 + }, + { + "epoch": 5.365868408321239, + "grad_norm": 0.35196179151535034, + "learning_rate": 0.000439700272479564, + "loss": 3.5003, + "step": 17750 + }, + { + "epoch": 5.380986937590711, + "grad_norm": 0.3490413427352905, + "learning_rate": 0.0004392461398728428, + "loss": 3.4914, + "step": 17800 + }, + { + "epoch": 5.396105466860184, + "grad_norm": 0.3287891447544098, + "learning_rate": 0.00043879200726612165, + "loss": 3.482, + "step": 17850 + }, + { + "epoch": 5.411223996129657, + "grad_norm": 0.3373957574367523, + "learning_rate": 0.00043833787465940053, + "loss": 3.4928, + "step": 17900 + }, + { + "epoch": 5.4263425253991295, + "grad_norm": 0.3305628299713135, + "learning_rate": 0.00043788374205267936, + "loss": 3.4918, + "step": 17950 + }, + { + "epoch": 5.4414610546686015, + "grad_norm": 0.33179208636283875, + "learning_rate": 0.0004374296094459582, + "loss": 3.5072, + "step": 18000 + }, + { + "epoch": 5.4414610546686015, + "eval_accuracy": 0.36049544013909796, + "eval_loss": 3.6349031925201416, + "eval_runtime": 182.6046, + "eval_samples_per_second": 91.148, + "eval_steps_per_second": 5.701, + "step": 18000 + }, + { + "epoch": 5.456579583938074, + "grad_norm": 0.3481210172176361, + "learning_rate": 0.00043697547683923697, + "loss": 3.5004, + "step": 18050 + }, + { + "epoch": 5.471698113207547, + "grad_norm": 0.34053078293800354, + "learning_rate": 0.00043652134423251585, + "loss": 3.4979, + "step": 18100 + }, + { + "epoch": 5.48681664247702, + "grad_norm": 0.3321893811225891, + "learning_rate": 0.0004360672116257947, + "loss": 3.5042, + "step": 18150 + }, + { + "epoch": 5.501935171746492, + "grad_norm": 0.3595956861972809, + "learning_rate": 0.0004356130790190735, + "loss": 3.4927, + "step": 18200 + }, + { + "epoch": 5.517053701015965, + "grad_norm": 0.33346107602119446, + "learning_rate": 0.00043515894641235235, + "loss": 3.4992, + "step": 18250 + }, + { + "epoch": 5.532172230285438, + "grad_norm": 0.3272981345653534, + "learning_rate": 0.00043470481380563123, + "loss": 3.4941, + "step": 18300 + }, + { + "epoch": 5.547290759554911, + "grad_norm": 0.3254975378513336, + "learning_rate": 0.00043425068119891006, + "loss": 3.5031, + "step": 18350 + }, + { + "epoch": 5.562409288824383, + "grad_norm": 0.3381115198135376, + "learning_rate": 0.00043379654859218884, + "loss": 3.5026, + "step": 18400 + }, + { + "epoch": 5.577527818093856, + "grad_norm": 0.35261309146881104, + "learning_rate": 0.0004333424159854677, + "loss": 3.5006, + "step": 18450 + }, + { + "epoch": 5.5926463473633286, + "grad_norm": 0.3541949987411499, + "learning_rate": 0.00043288828337874655, + "loss": 3.5055, + "step": 18500 + }, + { + "epoch": 5.607764876632801, + "grad_norm": 0.34113311767578125, + "learning_rate": 0.0004324341507720254, + "loss": 3.5077, + "step": 18550 + }, + { + "epoch": 5.622883405902273, + "grad_norm": 0.32559531927108765, + "learning_rate": 0.0004319800181653042, + "loss": 3.509, + "step": 18600 + }, + { + "epoch": 5.638001935171746, + "grad_norm": 0.3446948528289795, + "learning_rate": 0.0004315258855585831, + "loss": 3.504, + "step": 18650 + }, + { + "epoch": 5.653120464441219, + "grad_norm": 0.33571872115135193, + "learning_rate": 0.00043107175295186193, + "loss": 3.4972, + "step": 18700 + }, + { + "epoch": 5.668238993710692, + "grad_norm": 0.331429660320282, + "learning_rate": 0.0004306176203451407, + "loss": 3.4938, + "step": 18750 + }, + { + "epoch": 5.683357522980165, + "grad_norm": 0.3403501510620117, + "learning_rate": 0.00043016348773841954, + "loss": 3.5087, + "step": 18800 + }, + { + "epoch": 5.698476052249637, + "grad_norm": 0.3406428098678589, + "learning_rate": 0.0004297093551316984, + "loss": 3.5111, + "step": 18850 + }, + { + "epoch": 5.71359458151911, + "grad_norm": 0.33290889859199524, + "learning_rate": 0.00042925522252497725, + "loss": 3.4946, + "step": 18900 + }, + { + "epoch": 5.728713110788583, + "grad_norm": 0.3506568670272827, + "learning_rate": 0.0004288010899182561, + "loss": 3.4977, + "step": 18950 + }, + { + "epoch": 5.743831640058055, + "grad_norm": 0.32340604066848755, + "learning_rate": 0.0004283469573115349, + "loss": 3.501, + "step": 19000 + }, + { + "epoch": 5.743831640058055, + "eval_accuracy": 0.3617825526473342, + "eval_loss": 3.6229019165039062, + "eval_runtime": 182.4602, + "eval_samples_per_second": 91.22, + "eval_steps_per_second": 5.705, + "step": 19000 + }, + { + "epoch": 5.758950169327528, + "grad_norm": 0.3353550434112549, + "learning_rate": 0.0004278928247048138, + "loss": 3.4941, + "step": 19050 + }, + { + "epoch": 5.7740686985970004, + "grad_norm": 0.33119282126426697, + "learning_rate": 0.00042743869209809263, + "loss": 3.5137, + "step": 19100 + }, + { + "epoch": 5.789187227866473, + "grad_norm": 0.33732518553733826, + "learning_rate": 0.0004269845594913714, + "loss": 3.5052, + "step": 19150 + }, + { + "epoch": 5.804305757135946, + "grad_norm": 0.32491031289100647, + "learning_rate": 0.0004265304268846503, + "loss": 3.505, + "step": 19200 + }, + { + "epoch": 5.819424286405418, + "grad_norm": 0.328688383102417, + "learning_rate": 0.0004260762942779291, + "loss": 3.5022, + "step": 19250 + }, + { + "epoch": 5.834542815674891, + "grad_norm": 0.33090704679489136, + "learning_rate": 0.00042562216167120795, + "loss": 3.5023, + "step": 19300 + }, + { + "epoch": 5.849661344944364, + "grad_norm": 0.3403943181037903, + "learning_rate": 0.0004251680290644868, + "loss": 3.4987, + "step": 19350 + }, + { + "epoch": 5.864779874213837, + "grad_norm": 0.33841657638549805, + "learning_rate": 0.00042471389645776567, + "loss": 3.5105, + "step": 19400 + }, + { + "epoch": 5.879898403483309, + "grad_norm": 0.31852486729621887, + "learning_rate": 0.0004242597638510445, + "loss": 3.5051, + "step": 19450 + }, + { + "epoch": 5.895016932752782, + "grad_norm": 0.3277070224285126, + "learning_rate": 0.0004238056312443233, + "loss": 3.4959, + "step": 19500 + }, + { + "epoch": 5.910135462022255, + "grad_norm": 0.32635408639907837, + "learning_rate": 0.0004233514986376021, + "loss": 3.5042, + "step": 19550 + }, + { + "epoch": 5.9252539912917275, + "grad_norm": 0.32446935772895813, + "learning_rate": 0.000422897366030881, + "loss": 3.5092, + "step": 19600 + }, + { + "epoch": 5.9403725205611995, + "grad_norm": 0.33674439787864685, + "learning_rate": 0.0004224432334241598, + "loss": 3.4991, + "step": 19650 + }, + { + "epoch": 5.955491049830672, + "grad_norm": 0.32446402311325073, + "learning_rate": 0.00042198910081743865, + "loss": 3.5077, + "step": 19700 + }, + { + "epoch": 5.970609579100145, + "grad_norm": 0.3281049132347107, + "learning_rate": 0.0004215349682107175, + "loss": 3.5125, + "step": 19750 + }, + { + "epoch": 5.985728108369618, + "grad_norm": 0.33700883388519287, + "learning_rate": 0.00042108083560399637, + "loss": 3.5084, + "step": 19800 + }, + { + "epoch": 6.000604741170779, + "grad_norm": 0.35103023052215576, + "learning_rate": 0.00042062670299727515, + "loss": 3.5109, + "step": 19850 + }, + { + "epoch": 6.015723270440252, + "grad_norm": 0.33696576952934265, + "learning_rate": 0.000420172570390554, + "loss": 3.4014, + "step": 19900 + }, + { + "epoch": 6.030841799709724, + "grad_norm": 0.3408699631690979, + "learning_rate": 0.00041971843778383286, + "loss": 3.3988, + "step": 19950 + }, + { + "epoch": 6.045960328979197, + "grad_norm": 0.340057909488678, + "learning_rate": 0.0004192643051771117, + "loss": 3.411, + "step": 20000 + }, + { + "epoch": 6.045960328979197, + "eval_accuracy": 0.3621478635601953, + "eval_loss": 3.6226511001586914, + "eval_runtime": 181.7969, + "eval_samples_per_second": 91.553, + "eval_steps_per_second": 5.726, + "step": 20000 + }, + { + "epoch": 6.06107885824867, + "grad_norm": 0.36212554574012756, + "learning_rate": 0.0004188101725703905, + "loss": 3.4062, + "step": 20050 + }, + { + "epoch": 6.0761973875181425, + "grad_norm": 0.35196641087532043, + "learning_rate": 0.00041835603996366935, + "loss": 3.4145, + "step": 20100 + }, + { + "epoch": 6.0913159167876145, + "grad_norm": 0.35778287053108215, + "learning_rate": 0.00041790190735694824, + "loss": 3.4204, + "step": 20150 + }, + { + "epoch": 6.106434446057087, + "grad_norm": 0.34076035022735596, + "learning_rate": 0.000417447774750227, + "loss": 3.4187, + "step": 20200 + }, + { + "epoch": 6.12155297532656, + "grad_norm": 0.3436349034309387, + "learning_rate": 0.00041699364214350585, + "loss": 3.4213, + "step": 20250 + }, + { + "epoch": 6.136671504596033, + "grad_norm": 0.32700756192207336, + "learning_rate": 0.0004165395095367847, + "loss": 3.4231, + "step": 20300 + }, + { + "epoch": 6.151790033865505, + "grad_norm": 0.3409166932106018, + "learning_rate": 0.00041608537693006356, + "loss": 3.408, + "step": 20350 + }, + { + "epoch": 6.166908563134978, + "grad_norm": 0.3603594899177551, + "learning_rate": 0.0004156312443233424, + "loss": 3.4209, + "step": 20400 + }, + { + "epoch": 6.182027092404451, + "grad_norm": 0.3426113426685333, + "learning_rate": 0.0004151771117166212, + "loss": 3.4202, + "step": 20450 + }, + { + "epoch": 6.197145621673924, + "grad_norm": 0.3426719009876251, + "learning_rate": 0.0004147229791099, + "loss": 3.4261, + "step": 20500 + }, + { + "epoch": 6.212264150943396, + "grad_norm": 0.34170880913734436, + "learning_rate": 0.0004142688465031789, + "loss": 3.423, + "step": 20550 + }, + { + "epoch": 6.227382680212869, + "grad_norm": 0.33598586916923523, + "learning_rate": 0.0004138147138964577, + "loss": 3.4305, + "step": 20600 + }, + { + "epoch": 6.2425012094823416, + "grad_norm": 0.331582635641098, + "learning_rate": 0.00041336058128973655, + "loss": 3.4281, + "step": 20650 + }, + { + "epoch": 6.257619738751814, + "grad_norm": 0.33808058500289917, + "learning_rate": 0.00041290644868301543, + "loss": 3.4321, + "step": 20700 + }, + { + "epoch": 6.272738268021287, + "grad_norm": 0.34110236167907715, + "learning_rate": 0.00041245231607629426, + "loss": 3.4264, + "step": 20750 + }, + { + "epoch": 6.287856797290759, + "grad_norm": 0.34383150935173035, + "learning_rate": 0.0004119981834695731, + "loss": 3.438, + "step": 20800 + }, + { + "epoch": 6.302975326560232, + "grad_norm": 0.34354880452156067, + "learning_rate": 0.00041154405086285187, + "loss": 3.4231, + "step": 20850 + }, + { + "epoch": 6.318093855829705, + "grad_norm": 0.3443782329559326, + "learning_rate": 0.00041108991825613075, + "loss": 3.4324, + "step": 20900 + }, + { + "epoch": 6.333212385099178, + "grad_norm": 0.3713175654411316, + "learning_rate": 0.0004106357856494096, + "loss": 3.4445, + "step": 20950 + }, + { + "epoch": 6.34833091436865, + "grad_norm": 0.32927024364471436, + "learning_rate": 0.0004101816530426884, + "loss": 3.4363, + "step": 21000 + }, + { + "epoch": 6.34833091436865, + "eval_accuracy": 0.36282722192984806, + "eval_loss": 3.619049310684204, + "eval_runtime": 181.9714, + "eval_samples_per_second": 91.465, + "eval_steps_per_second": 5.721, + "step": 21000 + }, + { + "epoch": 6.363449443638123, + "grad_norm": 0.33876073360443115, + "learning_rate": 0.00040972752043596725, + "loss": 3.4475, + "step": 21050 + }, + { + "epoch": 6.378567972907596, + "grad_norm": 0.3428070843219757, + "learning_rate": 0.00040927338782924613, + "loss": 3.4501, + "step": 21100 + }, + { + "epoch": 6.393686502177069, + "grad_norm": 0.33244138956069946, + "learning_rate": 0.00040881925522252496, + "loss": 3.4431, + "step": 21150 + }, + { + "epoch": 6.408805031446541, + "grad_norm": 0.3557736575603485, + "learning_rate": 0.00040836512261580374, + "loss": 3.4562, + "step": 21200 + }, + { + "epoch": 6.4239235607160134, + "grad_norm": 0.3500126600265503, + "learning_rate": 0.00040791099000908257, + "loss": 3.451, + "step": 21250 + }, + { + "epoch": 6.439042089985486, + "grad_norm": 0.3625536561012268, + "learning_rate": 0.00040745685740236145, + "loss": 3.4482, + "step": 21300 + }, + { + "epoch": 6.454160619254959, + "grad_norm": 0.34508174657821655, + "learning_rate": 0.0004070027247956403, + "loss": 3.4541, + "step": 21350 + }, + { + "epoch": 6.469279148524431, + "grad_norm": 0.3331073224544525, + "learning_rate": 0.0004065485921889191, + "loss": 3.4521, + "step": 21400 + }, + { + "epoch": 6.484397677793904, + "grad_norm": 0.332691490650177, + "learning_rate": 0.000406094459582198, + "loss": 3.4507, + "step": 21450 + }, + { + "epoch": 6.499516207063377, + "grad_norm": 0.32340583205223083, + "learning_rate": 0.00040564032697547683, + "loss": 3.4496, + "step": 21500 + }, + { + "epoch": 6.51463473633285, + "grad_norm": 0.33922895789146423, + "learning_rate": 0.0004051861943687556, + "loss": 3.4528, + "step": 21550 + }, + { + "epoch": 6.529753265602322, + "grad_norm": 0.342859148979187, + "learning_rate": 0.00040473206176203444, + "loss": 3.4543, + "step": 21600 + }, + { + "epoch": 6.544871794871795, + "grad_norm": 0.3467361032962799, + "learning_rate": 0.0004042779291553133, + "loss": 3.4578, + "step": 21650 + }, + { + "epoch": 6.559990324141268, + "grad_norm": 0.3421052396297455, + "learning_rate": 0.00040382379654859215, + "loss": 3.4531, + "step": 21700 + }, + { + "epoch": 6.5751088534107405, + "grad_norm": 0.3430230915546417, + "learning_rate": 0.000403369663941871, + "loss": 3.4529, + "step": 21750 + }, + { + "epoch": 6.590227382680213, + "grad_norm": 0.3446337878704071, + "learning_rate": 0.0004029155313351498, + "loss": 3.4538, + "step": 21800 + }, + { + "epoch": 6.605345911949685, + "grad_norm": 0.3413420617580414, + "learning_rate": 0.0004024613987284287, + "loss": 3.4459, + "step": 21850 + }, + { + "epoch": 6.620464441219158, + "grad_norm": 0.36404335498809814, + "learning_rate": 0.00040200726612170753, + "loss": 3.4481, + "step": 21900 + }, + { + "epoch": 6.635582970488631, + "grad_norm": 0.3348482549190521, + "learning_rate": 0.0004015531335149863, + "loss": 3.4477, + "step": 21950 + }, + { + "epoch": 6.650701499758103, + "grad_norm": 0.33620691299438477, + "learning_rate": 0.0004010990009082652, + "loss": 3.4492, + "step": 22000 + }, + { + "epoch": 6.650701499758103, + "eval_accuracy": 0.3638837664625064, + "eval_loss": 3.6092684268951416, + "eval_runtime": 181.7457, + "eval_samples_per_second": 91.579, + "eval_steps_per_second": 5.728, + "step": 22000 + }, + { + "epoch": 6.665820029027576, + "grad_norm": 0.3584924042224884, + "learning_rate": 0.000400644868301544, + "loss": 3.4621, + "step": 22050 + }, + { + "epoch": 6.680938558297049, + "grad_norm": 0.342899352312088, + "learning_rate": 0.00040019073569482285, + "loss": 3.4605, + "step": 22100 + }, + { + "epoch": 6.696057087566522, + "grad_norm": 0.3478437066078186, + "learning_rate": 0.0003997366030881017, + "loss": 3.4763, + "step": 22150 + }, + { + "epoch": 6.711175616835995, + "grad_norm": 0.3423757553100586, + "learning_rate": 0.00039928247048138057, + "loss": 3.4587, + "step": 22200 + }, + { + "epoch": 6.726294146105467, + "grad_norm": 0.3426273465156555, + "learning_rate": 0.0003988283378746594, + "loss": 3.453, + "step": 22250 + }, + { + "epoch": 6.7414126753749395, + "grad_norm": 0.3257652223110199, + "learning_rate": 0.0003983742052679382, + "loss": 3.4499, + "step": 22300 + }, + { + "epoch": 6.756531204644412, + "grad_norm": 0.32835474610328674, + "learning_rate": 0.000397920072661217, + "loss": 3.4683, + "step": 22350 + }, + { + "epoch": 6.771649733913884, + "grad_norm": 0.3534278869628906, + "learning_rate": 0.0003974659400544959, + "loss": 3.461, + "step": 22400 + }, + { + "epoch": 6.786768263183357, + "grad_norm": 0.3387344479560852, + "learning_rate": 0.0003970118074477747, + "loss": 3.4594, + "step": 22450 + }, + { + "epoch": 6.80188679245283, + "grad_norm": 0.3469718396663666, + "learning_rate": 0.00039655767484105355, + "loss": 3.4633, + "step": 22500 + }, + { + "epoch": 6.817005321722303, + "grad_norm": 0.34182408452033997, + "learning_rate": 0.0003961035422343324, + "loss": 3.4561, + "step": 22550 + }, + { + "epoch": 6.832123850991776, + "grad_norm": 0.35019853711128235, + "learning_rate": 0.00039564940962761127, + "loss": 3.4721, + "step": 22600 + }, + { + "epoch": 6.847242380261248, + "grad_norm": 0.32239803671836853, + "learning_rate": 0.00039519527702089005, + "loss": 3.4599, + "step": 22650 + }, + { + "epoch": 6.862360909530721, + "grad_norm": 0.35213080048561096, + "learning_rate": 0.0003947411444141689, + "loss": 3.4652, + "step": 22700 + }, + { + "epoch": 6.877479438800194, + "grad_norm": 0.3404783010482788, + "learning_rate": 0.00039428701180744776, + "loss": 3.4592, + "step": 22750 + }, + { + "epoch": 6.8925979680696665, + "grad_norm": 0.33358967304229736, + "learning_rate": 0.0003938328792007266, + "loss": 3.4612, + "step": 22800 + }, + { + "epoch": 6.9077164973391385, + "grad_norm": 0.3438171446323395, + "learning_rate": 0.0003933787465940054, + "loss": 3.4415, + "step": 22850 + }, + { + "epoch": 6.922835026608611, + "grad_norm": 0.3446924090385437, + "learning_rate": 0.00039292461398728425, + "loss": 3.4612, + "step": 22900 + }, + { + "epoch": 6.937953555878084, + "grad_norm": 0.34483805298805237, + "learning_rate": 0.00039247048138056314, + "loss": 3.475, + "step": 22950 + }, + { + "epoch": 6.953072085147557, + "grad_norm": 0.3411277234554291, + "learning_rate": 0.0003920163487738419, + "loss": 3.4632, + "step": 23000 + }, + { + "epoch": 6.953072085147557, + "eval_accuracy": 0.36477981875311283, + "eval_loss": 3.597496271133423, + "eval_runtime": 181.6718, + "eval_samples_per_second": 91.616, + "eval_steps_per_second": 5.73, + "step": 23000 + }, + { + "epoch": 6.968190614417029, + "grad_norm": 0.3497656583786011, + "learning_rate": 0.00039156221616712075, + "loss": 3.4725, + "step": 23050 + }, + { + "epoch": 6.983309143686502, + "grad_norm": 0.3395549952983856, + "learning_rate": 0.0003911080835603996, + "loss": 3.4574, + "step": 23100 + }, + { + "epoch": 6.998427672955975, + "grad_norm": 0.35077255964279175, + "learning_rate": 0.00039065395095367846, + "loss": 3.4717, + "step": 23150 + }, + { + "epoch": 7.013304305757136, + "grad_norm": 0.3488810956478119, + "learning_rate": 0.0003901998183469573, + "loss": 3.3635, + "step": 23200 + }, + { + "epoch": 7.028422835026609, + "grad_norm": 0.3391869068145752, + "learning_rate": 0.0003897456857402361, + "loss": 3.3509, + "step": 23250 + }, + { + "epoch": 7.043541364296082, + "grad_norm": 0.3469946086406708, + "learning_rate": 0.0003892915531335149, + "loss": 3.3675, + "step": 23300 + }, + { + "epoch": 7.058659893565554, + "grad_norm": 0.36358246207237244, + "learning_rate": 0.0003888374205267938, + "loss": 3.3506, + "step": 23350 + }, + { + "epoch": 7.0737784228350264, + "grad_norm": 0.33452320098876953, + "learning_rate": 0.0003883832879200726, + "loss": 3.3665, + "step": 23400 + }, + { + "epoch": 7.088896952104499, + "grad_norm": 0.34786656498908997, + "learning_rate": 0.00038792915531335145, + "loss": 3.3834, + "step": 23450 + }, + { + "epoch": 7.104015481373972, + "grad_norm": 0.36504653096199036, + "learning_rate": 0.00038747502270663033, + "loss": 3.3682, + "step": 23500 + }, + { + "epoch": 7.119134010643444, + "grad_norm": 0.345797061920166, + "learning_rate": 0.00038702089009990916, + "loss": 3.3713, + "step": 23550 + }, + { + "epoch": 7.134252539912917, + "grad_norm": 0.3616042137145996, + "learning_rate": 0.000386566757493188, + "loss": 3.3793, + "step": 23600 + }, + { + "epoch": 7.14937106918239, + "grad_norm": 0.33136123418807983, + "learning_rate": 0.00038611262488646677, + "loss": 3.3777, + "step": 23650 + }, + { + "epoch": 7.164489598451863, + "grad_norm": 0.36124685406684875, + "learning_rate": 0.00038565849227974565, + "loss": 3.3797, + "step": 23700 + }, + { + "epoch": 7.179608127721336, + "grad_norm": 0.3644281327724457, + "learning_rate": 0.0003852043596730245, + "loss": 3.3808, + "step": 23750 + }, + { + "epoch": 7.194726656990808, + "grad_norm": 0.3486228287220001, + "learning_rate": 0.0003847502270663033, + "loss": 3.3862, + "step": 23800 + }, + { + "epoch": 7.209845186260281, + "grad_norm": 0.350700706243515, + "learning_rate": 0.00038429609445958215, + "loss": 3.3776, + "step": 23850 + }, + { + "epoch": 7.2249637155297535, + "grad_norm": 0.35353735089302063, + "learning_rate": 0.00038384196185286103, + "loss": 3.3948, + "step": 23900 + }, + { + "epoch": 7.240082244799226, + "grad_norm": 0.34329304099082947, + "learning_rate": 0.00038338782924613986, + "loss": 3.3934, + "step": 23950 + }, + { + "epoch": 7.255200774068698, + "grad_norm": 0.37160131335258484, + "learning_rate": 0.00038293369663941864, + "loss": 3.3846, + "step": 24000 + }, + { + "epoch": 7.255200774068698, + "eval_accuracy": 0.36480168802565616, + "eval_loss": 3.606091260910034, + "eval_runtime": 181.8857, + "eval_samples_per_second": 91.508, + "eval_steps_per_second": 5.723, + "step": 24000 + }, + { + "epoch": 7.270319303338171, + "grad_norm": 0.36367154121398926, + "learning_rate": 0.00038247956403269747, + "loss": 3.3927, + "step": 24050 + }, + { + "epoch": 7.285437832607644, + "grad_norm": 0.3284398913383484, + "learning_rate": 0.00038202543142597635, + "loss": 3.4085, + "step": 24100 + }, + { + "epoch": 7.300556361877117, + "grad_norm": 0.36100390553474426, + "learning_rate": 0.0003815712988192552, + "loss": 3.4062, + "step": 24150 + }, + { + "epoch": 7.315674891146589, + "grad_norm": 0.3739434778690338, + "learning_rate": 0.000381117166212534, + "loss": 3.3941, + "step": 24200 + }, + { + "epoch": 7.330793420416062, + "grad_norm": 0.3636663854122162, + "learning_rate": 0.0003806630336058129, + "loss": 3.3991, + "step": 24250 + }, + { + "epoch": 7.345911949685535, + "grad_norm": 0.3598420023918152, + "learning_rate": 0.00038020890099909173, + "loss": 3.4083, + "step": 24300 + }, + { + "epoch": 7.361030478955008, + "grad_norm": 0.37003642320632935, + "learning_rate": 0.0003797547683923705, + "loss": 3.4077, + "step": 24350 + }, + { + "epoch": 7.37614900822448, + "grad_norm": 0.34832462668418884, + "learning_rate": 0.00037930063578564934, + "loss": 3.3993, + "step": 24400 + }, + { + "epoch": 7.3912675374939525, + "grad_norm": 0.3355371356010437, + "learning_rate": 0.0003788465031789282, + "loss": 3.3961, + "step": 24450 + }, + { + "epoch": 7.406386066763425, + "grad_norm": 0.3435608148574829, + "learning_rate": 0.00037839237057220705, + "loss": 3.4002, + "step": 24500 + }, + { + "epoch": 7.421504596032898, + "grad_norm": 0.3562789559364319, + "learning_rate": 0.0003779382379654859, + "loss": 3.4164, + "step": 24550 + }, + { + "epoch": 7.43662312530237, + "grad_norm": 0.33656221628189087, + "learning_rate": 0.0003774841053587647, + "loss": 3.4101, + "step": 24600 + }, + { + "epoch": 7.451741654571843, + "grad_norm": 0.3509146273136139, + "learning_rate": 0.0003770299727520436, + "loss": 3.4069, + "step": 24650 + }, + { + "epoch": 7.466860183841316, + "grad_norm": 0.3496881425380707, + "learning_rate": 0.00037657584014532243, + "loss": 3.4107, + "step": 24700 + }, + { + "epoch": 7.481978713110789, + "grad_norm": 0.3623145520687103, + "learning_rate": 0.0003761217075386012, + "loss": 3.4143, + "step": 24750 + }, + { + "epoch": 7.497097242380261, + "grad_norm": 0.3448486924171448, + "learning_rate": 0.00037566757493188004, + "loss": 3.4059, + "step": 24800 + }, + { + "epoch": 7.512215771649734, + "grad_norm": 0.35689592361450195, + "learning_rate": 0.0003752134423251589, + "loss": 3.4122, + "step": 24850 + }, + { + "epoch": 7.527334300919207, + "grad_norm": 0.36233118176460266, + "learning_rate": 0.00037475930971843775, + "loss": 3.4118, + "step": 24900 + }, + { + "epoch": 7.5424528301886795, + "grad_norm": 0.3479229211807251, + "learning_rate": 0.0003743051771117166, + "loss": 3.4052, + "step": 24950 + }, + { + "epoch": 7.5575713594581515, + "grad_norm": 0.338264524936676, + "learning_rate": 0.00037385104450499547, + "loss": 3.4205, + "step": 25000 + }, + { + "epoch": 7.5575713594581515, + "eval_accuracy": 0.36550926481149393, + "eval_loss": 3.5999324321746826, + "eval_runtime": 182.6684, + "eval_samples_per_second": 91.116, + "eval_steps_per_second": 5.699, + "step": 25000 + }, + { + "epoch": 7.572689888727624, + "grad_norm": 0.3575372099876404, + "learning_rate": 0.0003733969118982743, + "loss": 3.4177, + "step": 25050 + }, + { + "epoch": 7.587808417997097, + "grad_norm": 0.356697678565979, + "learning_rate": 0.0003729427792915531, + "loss": 3.4052, + "step": 25100 + }, + { + "epoch": 7.60292694726657, + "grad_norm": 0.340265154838562, + "learning_rate": 0.0003724886466848319, + "loss": 3.4063, + "step": 25150 + }, + { + "epoch": 7.618045476536043, + "grad_norm": 0.36485445499420166, + "learning_rate": 0.0003720345140781108, + "loss": 3.4231, + "step": 25200 + }, + { + "epoch": 7.633164005805515, + "grad_norm": 0.3679625988006592, + "learning_rate": 0.0003715803814713896, + "loss": 3.4219, + "step": 25250 + }, + { + "epoch": 7.648282535074988, + "grad_norm": 0.36495083570480347, + "learning_rate": 0.00037112624886466845, + "loss": 3.4118, + "step": 25300 + }, + { + "epoch": 7.663401064344461, + "grad_norm": 0.3572041094303131, + "learning_rate": 0.0003706721162579473, + "loss": 3.4184, + "step": 25350 + }, + { + "epoch": 7.678519593613933, + "grad_norm": 0.3356753885746002, + "learning_rate": 0.00037021798365122617, + "loss": 3.4145, + "step": 25400 + }, + { + "epoch": 7.693638122883406, + "grad_norm": 0.34812238812446594, + "learning_rate": 0.00036976385104450495, + "loss": 3.4273, + "step": 25450 + }, + { + "epoch": 7.7087566521528785, + "grad_norm": 0.35833728313446045, + "learning_rate": 0.0003693097184377838, + "loss": 3.4167, + "step": 25500 + }, + { + "epoch": 7.723875181422351, + "grad_norm": 0.35254955291748047, + "learning_rate": 0.0003688555858310626, + "loss": 3.4091, + "step": 25550 + }, + { + "epoch": 7.738993710691824, + "grad_norm": 0.36359110474586487, + "learning_rate": 0.0003684014532243415, + "loss": 3.4176, + "step": 25600 + }, + { + "epoch": 7.754112239961296, + "grad_norm": 0.3410344421863556, + "learning_rate": 0.0003679473206176203, + "loss": 3.4118, + "step": 25650 + }, + { + "epoch": 7.769230769230769, + "grad_norm": 0.3466271162033081, + "learning_rate": 0.00036749318801089915, + "loss": 3.4288, + "step": 25700 + }, + { + "epoch": 7.784349298500242, + "grad_norm": 0.34525296092033386, + "learning_rate": 0.00036703905540417804, + "loss": 3.4064, + "step": 25750 + }, + { + "epoch": 7.799467827769715, + "grad_norm": 0.34507691860198975, + "learning_rate": 0.0003665849227974568, + "loss": 3.4162, + "step": 25800 + }, + { + "epoch": 7.814586357039187, + "grad_norm": 0.3572472333908081, + "learning_rate": 0.00036613079019073565, + "loss": 3.4211, + "step": 25850 + }, + { + "epoch": 7.82970488630866, + "grad_norm": 0.3335019052028656, + "learning_rate": 0.0003656766575840145, + "loss": 3.4183, + "step": 25900 + }, + { + "epoch": 7.844823415578133, + "grad_norm": 0.34102049469947815, + "learning_rate": 0.00036522252497729336, + "loss": 3.4322, + "step": 25950 + }, + { + "epoch": 7.8599419448476056, + "grad_norm": 0.3497759699821472, + "learning_rate": 0.0003647683923705722, + "loss": 3.4231, + "step": 26000 + }, + { + "epoch": 7.8599419448476056, + "eval_accuracy": 0.3660651676103375, + "eval_loss": 3.5885255336761475, + "eval_runtime": 182.172, + "eval_samples_per_second": 91.364, + "eval_steps_per_second": 5.714, + "step": 26000 + }, + { + "epoch": 7.8750604741170775, + "grad_norm": 0.35175567865371704, + "learning_rate": 0.000364314259763851, + "loss": 3.4266, + "step": 26050 + }, + { + "epoch": 7.89017900338655, + "grad_norm": 0.3355766534805298, + "learning_rate": 0.0003638601271571298, + "loss": 3.4316, + "step": 26100 + }, + { + "epoch": 7.905297532656023, + "grad_norm": 0.36761733889579773, + "learning_rate": 0.0003634059945504087, + "loss": 3.4175, + "step": 26150 + }, + { + "epoch": 7.920416061925496, + "grad_norm": 0.35893869400024414, + "learning_rate": 0.0003629518619436875, + "loss": 3.4332, + "step": 26200 + }, + { + "epoch": 7.935534591194968, + "grad_norm": 0.3662582337856293, + "learning_rate": 0.00036249772933696635, + "loss": 3.4186, + "step": 26250 + }, + { + "epoch": 7.950653120464441, + "grad_norm": 0.36315032839775085, + "learning_rate": 0.0003620435967302452, + "loss": 3.4248, + "step": 26300 + }, + { + "epoch": 7.965771649733914, + "grad_norm": 0.3308250308036804, + "learning_rate": 0.00036158946412352406, + "loss": 3.4402, + "step": 26350 + }, + { + "epoch": 7.980890179003387, + "grad_norm": 0.3648611307144165, + "learning_rate": 0.0003611353315168029, + "loss": 3.4168, + "step": 26400 + }, + { + "epoch": 7.996008708272859, + "grad_norm": 0.342036634683609, + "learning_rate": 0.00036068119891008167, + "loss": 3.4285, + "step": 26450 + }, + { + "epoch": 8.01088534107402, + "grad_norm": 0.37517133355140686, + "learning_rate": 0.00036022706630336055, + "loss": 3.3465, + "step": 26500 + }, + { + "epoch": 8.026003870343493, + "grad_norm": 0.346508651971817, + "learning_rate": 0.0003597729336966394, + "loss": 3.3161, + "step": 26550 + }, + { + "epoch": 8.041122399612966, + "grad_norm": 0.37528109550476074, + "learning_rate": 0.0003593188010899182, + "loss": 3.3215, + "step": 26600 + }, + { + "epoch": 8.056240928882438, + "grad_norm": 0.3728856146335602, + "learning_rate": 0.00035886466848319705, + "loss": 3.3283, + "step": 26650 + }, + { + "epoch": 8.07135945815191, + "grad_norm": 0.3430670201778412, + "learning_rate": 0.00035841053587647593, + "loss": 3.3318, + "step": 26700 + }, + { + "epoch": 8.086477987421384, + "grad_norm": 0.3535607159137726, + "learning_rate": 0.00035795640326975476, + "loss": 3.3282, + "step": 26750 + }, + { + "epoch": 8.101596516690856, + "grad_norm": 0.3596191108226776, + "learning_rate": 0.00035750227066303354, + "loss": 3.3194, + "step": 26800 + }, + { + "epoch": 8.116715045960328, + "grad_norm": 0.36048582196235657, + "learning_rate": 0.00035704813805631237, + "loss": 3.3234, + "step": 26850 + }, + { + "epoch": 8.131833575229802, + "grad_norm": 0.3535524904727936, + "learning_rate": 0.00035659400544959125, + "loss": 3.3467, + "step": 26900 + }, + { + "epoch": 8.146952104499274, + "grad_norm": 0.3580203950405121, + "learning_rate": 0.0003561398728428701, + "loss": 3.3281, + "step": 26950 + }, + { + "epoch": 8.162070633768748, + "grad_norm": 0.36240702867507935, + "learning_rate": 0.0003556857402361489, + "loss": 3.349, + "step": 27000 + }, + { + "epoch": 8.162070633768748, + "eval_accuracy": 0.36599462156987517, + "eval_loss": 3.598177671432495, + "eval_runtime": 182.0613, + "eval_samples_per_second": 91.42, + "eval_steps_per_second": 5.718, + "step": 27000 + }, + { + "epoch": 8.17718916303822, + "grad_norm": 0.36847227811813354, + "learning_rate": 0.0003552316076294278, + "loss": 3.3383, + "step": 27050 + }, + { + "epoch": 8.192307692307692, + "grad_norm": 0.3737618029117584, + "learning_rate": 0.00035477747502270663, + "loss": 3.3452, + "step": 27100 + }, + { + "epoch": 8.207426221577165, + "grad_norm": 0.36412081122398376, + "learning_rate": 0.00035432334241598546, + "loss": 3.3605, + "step": 27150 + }, + { + "epoch": 8.222544750846637, + "grad_norm": 0.3439239263534546, + "learning_rate": 0.00035386920980926424, + "loss": 3.3609, + "step": 27200 + }, + { + "epoch": 8.237663280116111, + "grad_norm": 0.35170575976371765, + "learning_rate": 0.0003534150772025431, + "loss": 3.3691, + "step": 27250 + }, + { + "epoch": 8.252781809385583, + "grad_norm": 0.36688730120658875, + "learning_rate": 0.00035296094459582195, + "loss": 3.3521, + "step": 27300 + }, + { + "epoch": 8.267900338655055, + "grad_norm": 0.3887461721897125, + "learning_rate": 0.0003525068119891008, + "loss": 3.3552, + "step": 27350 + }, + { + "epoch": 8.283018867924529, + "grad_norm": 0.37762829661369324, + "learning_rate": 0.0003520526793823796, + "loss": 3.3567, + "step": 27400 + }, + { + "epoch": 8.298137397194001, + "grad_norm": 0.3905569911003113, + "learning_rate": 0.0003515985467756585, + "loss": 3.3724, + "step": 27450 + }, + { + "epoch": 8.313255926463473, + "grad_norm": 0.3466584384441376, + "learning_rate": 0.00035114441416893733, + "loss": 3.3576, + "step": 27500 + }, + { + "epoch": 8.328374455732947, + "grad_norm": 0.3696681559085846, + "learning_rate": 0.0003506902815622161, + "loss": 3.3526, + "step": 27550 + }, + { + "epoch": 8.343492985002419, + "grad_norm": 0.3654014468193054, + "learning_rate": 0.00035023614895549494, + "loss": 3.3696, + "step": 27600 + }, + { + "epoch": 8.358611514271892, + "grad_norm": 0.35293635725975037, + "learning_rate": 0.0003497820163487738, + "loss": 3.3705, + "step": 27650 + }, + { + "epoch": 8.373730043541364, + "grad_norm": 0.35331737995147705, + "learning_rate": 0.00034932788374205265, + "loss": 3.3732, + "step": 27700 + }, + { + "epoch": 8.388848572810836, + "grad_norm": 0.35930585861206055, + "learning_rate": 0.0003488737511353315, + "loss": 3.3677, + "step": 27750 + }, + { + "epoch": 8.40396710208031, + "grad_norm": 0.3438582122325897, + "learning_rate": 0.00034841961852861037, + "loss": 3.3588, + "step": 27800 + }, + { + "epoch": 8.419085631349782, + "grad_norm": 0.34405338764190674, + "learning_rate": 0.0003479654859218892, + "loss": 3.3703, + "step": 27850 + }, + { + "epoch": 8.434204160619254, + "grad_norm": 0.34330499172210693, + "learning_rate": 0.000347511353315168, + "loss": 3.3554, + "step": 27900 + }, + { + "epoch": 8.449322689888728, + "grad_norm": 0.38266411423683167, + "learning_rate": 0.0003470572207084468, + "loss": 3.3699, + "step": 27950 + }, + { + "epoch": 8.4644412191582, + "grad_norm": 0.3567814528942108, + "learning_rate": 0.0003466030881017257, + "loss": 3.3654, + "step": 28000 + }, + { + "epoch": 8.4644412191582, + "eval_accuracy": 0.3666786830088921, + "eval_loss": 3.588674306869507, + "eval_runtime": 182.8049, + "eval_samples_per_second": 91.048, + "eval_steps_per_second": 5.695, + "step": 28000 + }, + { + "epoch": 8.479559748427674, + "grad_norm": 0.3407067358493805, + "learning_rate": 0.0003461489554950045, + "loss": 3.3759, + "step": 28050 + }, + { + "epoch": 8.494678277697146, + "grad_norm": 0.36050164699554443, + "learning_rate": 0.00034569482288828335, + "loss": 3.3732, + "step": 28100 + }, + { + "epoch": 8.509796806966618, + "grad_norm": 0.34035587310791016, + "learning_rate": 0.0003452406902815622, + "loss": 3.3653, + "step": 28150 + }, + { + "epoch": 8.524915336236091, + "grad_norm": 0.3827550709247589, + "learning_rate": 0.00034478655767484107, + "loss": 3.3745, + "step": 28200 + }, + { + "epoch": 8.540033865505563, + "grad_norm": 0.367535799741745, + "learning_rate": 0.00034433242506811984, + "loss": 3.3773, + "step": 28250 + }, + { + "epoch": 8.555152394775035, + "grad_norm": 0.35586029291152954, + "learning_rate": 0.0003438782924613987, + "loss": 3.381, + "step": 28300 + }, + { + "epoch": 8.57027092404451, + "grad_norm": 0.3650664687156677, + "learning_rate": 0.0003434241598546775, + "loss": 3.379, + "step": 28350 + }, + { + "epoch": 8.585389453313981, + "grad_norm": 0.35350197553634644, + "learning_rate": 0.0003429700272479564, + "loss": 3.3727, + "step": 28400 + }, + { + "epoch": 8.600507982583455, + "grad_norm": 0.3827017545700073, + "learning_rate": 0.0003425158946412352, + "loss": 3.3785, + "step": 28450 + }, + { + "epoch": 8.615626511852927, + "grad_norm": 0.3796612024307251, + "learning_rate": 0.00034206176203451405, + "loss": 3.3805, + "step": 28500 + }, + { + "epoch": 8.630745041122399, + "grad_norm": 0.36361196637153625, + "learning_rate": 0.00034160762942779294, + "loss": 3.3769, + "step": 28550 + }, + { + "epoch": 8.645863570391873, + "grad_norm": 0.365169882774353, + "learning_rate": 0.0003411534968210717, + "loss": 3.3815, + "step": 28600 + }, + { + "epoch": 8.660982099661345, + "grad_norm": 0.3632723391056061, + "learning_rate": 0.00034069936421435054, + "loss": 3.3771, + "step": 28650 + }, + { + "epoch": 8.676100628930818, + "grad_norm": 0.3668626546859741, + "learning_rate": 0.0003402452316076294, + "loss": 3.3811, + "step": 28700 + }, + { + "epoch": 8.69121915820029, + "grad_norm": 0.35274365544319153, + "learning_rate": 0.00033979109900090826, + "loss": 3.3836, + "step": 28750 + }, + { + "epoch": 8.706337687469762, + "grad_norm": 0.35408350825309753, + "learning_rate": 0.0003393369663941871, + "loss": 3.3814, + "step": 28800 + }, + { + "epoch": 8.721456216739236, + "grad_norm": 0.34329164028167725, + "learning_rate": 0.0003388828337874659, + "loss": 3.3864, + "step": 28850 + }, + { + "epoch": 8.736574746008708, + "grad_norm": 0.3636920750141144, + "learning_rate": 0.0003384287011807447, + "loss": 3.3855, + "step": 28900 + }, + { + "epoch": 8.75169327527818, + "grad_norm": 0.36424097418785095, + "learning_rate": 0.0003379745685740236, + "loss": 3.3914, + "step": 28950 + }, + { + "epoch": 8.766811804547654, + "grad_norm": 0.3490569293498993, + "learning_rate": 0.0003375204359673024, + "loss": 3.3779, + "step": 29000 + }, + { + "epoch": 8.766811804547654, + "eval_accuracy": 0.3673816743020998, + "eval_loss": 3.579127073287964, + "eval_runtime": 181.9617, + "eval_samples_per_second": 91.47, + "eval_steps_per_second": 5.721, + "step": 29000 + }, + { + "epoch": 8.781930333817126, + "grad_norm": 0.36123400926589966, + "learning_rate": 0.00033706630336058124, + "loss": 3.389, + "step": 29050 + }, + { + "epoch": 8.7970488630866, + "grad_norm": 0.34295931458473206, + "learning_rate": 0.0003366121707538601, + "loss": 3.3975, + "step": 29100 + }, + { + "epoch": 8.812167392356072, + "grad_norm": 0.38402318954467773, + "learning_rate": 0.00033615803814713896, + "loss": 3.3888, + "step": 29150 + }, + { + "epoch": 8.827285921625544, + "grad_norm": 0.3944171667098999, + "learning_rate": 0.0003357039055404178, + "loss": 3.3868, + "step": 29200 + }, + { + "epoch": 8.842404450895017, + "grad_norm": 0.3558756113052368, + "learning_rate": 0.00033524977293369657, + "loss": 3.3993, + "step": 29250 + }, + { + "epoch": 8.85752298016449, + "grad_norm": 0.3514937162399292, + "learning_rate": 0.00033479564032697545, + "loss": 3.3866, + "step": 29300 + }, + { + "epoch": 8.872641509433961, + "grad_norm": 0.363704651594162, + "learning_rate": 0.0003343415077202543, + "loss": 3.3843, + "step": 29350 + }, + { + "epoch": 8.887760038703435, + "grad_norm": 0.35803845524787903, + "learning_rate": 0.0003338873751135331, + "loss": 3.398, + "step": 29400 + }, + { + "epoch": 8.902878567972907, + "grad_norm": 0.39813512563705444, + "learning_rate": 0.00033343324250681194, + "loss": 3.4079, + "step": 29450 + }, + { + "epoch": 8.917997097242381, + "grad_norm": 0.34561270475387573, + "learning_rate": 0.00033297910990009083, + "loss": 3.3967, + "step": 29500 + }, + { + "epoch": 8.933115626511853, + "grad_norm": 0.377175509929657, + "learning_rate": 0.00033252497729336966, + "loss": 3.3931, + "step": 29550 + }, + { + "epoch": 8.948234155781325, + "grad_norm": 0.3505743145942688, + "learning_rate": 0.00033207084468664844, + "loss": 3.3993, + "step": 29600 + }, + { + "epoch": 8.963352685050799, + "grad_norm": 0.3531405031681061, + "learning_rate": 0.00033161671207992727, + "loss": 3.3932, + "step": 29650 + }, + { + "epoch": 8.97847121432027, + "grad_norm": 0.3554518222808838, + "learning_rate": 0.00033116257947320615, + "loss": 3.3915, + "step": 29700 + }, + { + "epoch": 8.993589743589745, + "grad_norm": 0.34890449047088623, + "learning_rate": 0.000330708446866485, + "loss": 3.3907, + "step": 29750 + }, + { + "epoch": 9.008466376390905, + "grad_norm": 0.38579925894737244, + "learning_rate": 0.0003302543142597638, + "loss": 3.3333, + "step": 29800 + }, + { + "epoch": 9.023584905660377, + "grad_norm": 0.3463684618473053, + "learning_rate": 0.00032980018165304264, + "loss": 3.2836, + "step": 29850 + }, + { + "epoch": 9.03870343492985, + "grad_norm": 0.3551153838634491, + "learning_rate": 0.00032934604904632153, + "loss": 3.2839, + "step": 29900 + }, + { + "epoch": 9.053821964199322, + "grad_norm": 0.36974823474884033, + "learning_rate": 0.00032889191643960036, + "loss": 3.2932, + "step": 29950 + }, + { + "epoch": 9.068940493468796, + "grad_norm": 0.3919062316417694, + "learning_rate": 0.00032843778383287914, + "loss": 3.2906, + "step": 30000 + }, + { + "epoch": 9.068940493468796, + "eval_accuracy": 0.3672455204440074, + "eval_loss": 3.5892333984375, + "eval_runtime": 182.034, + "eval_samples_per_second": 91.433, + "eval_steps_per_second": 5.719, + "step": 30000 + }, + { + "epoch": 9.084059022738268, + "grad_norm": 0.3697023093700409, + "learning_rate": 0.000327983651226158, + "loss": 3.2994, + "step": 30050 + }, + { + "epoch": 9.09917755200774, + "grad_norm": 0.35424408316612244, + "learning_rate": 0.00032752951861943685, + "loss": 3.3047, + "step": 30100 + }, + { + "epoch": 9.114296081277214, + "grad_norm": 0.3747372329235077, + "learning_rate": 0.0003270753860127157, + "loss": 3.3119, + "step": 30150 + }, + { + "epoch": 9.129414610546686, + "grad_norm": 0.36294251680374146, + "learning_rate": 0.0003266212534059945, + "loss": 3.3163, + "step": 30200 + }, + { + "epoch": 9.144533139816158, + "grad_norm": 0.3802247941493988, + "learning_rate": 0.0003261671207992734, + "loss": 3.3031, + "step": 30250 + }, + { + "epoch": 9.159651669085632, + "grad_norm": 0.3664504885673523, + "learning_rate": 0.00032571298819255223, + "loss": 3.3157, + "step": 30300 + }, + { + "epoch": 9.174770198355104, + "grad_norm": 0.3593531548976898, + "learning_rate": 0.000325258855585831, + "loss": 3.3118, + "step": 30350 + }, + { + "epoch": 9.189888727624577, + "grad_norm": 0.38480937480926514, + "learning_rate": 0.00032480472297910984, + "loss": 3.3224, + "step": 30400 + }, + { + "epoch": 9.20500725689405, + "grad_norm": 0.3553849458694458, + "learning_rate": 0.0003243505903723887, + "loss": 3.3005, + "step": 30450 + }, + { + "epoch": 9.220125786163521, + "grad_norm": 0.3630408048629761, + "learning_rate": 0.00032389645776566755, + "loss": 3.3213, + "step": 30500 + }, + { + "epoch": 9.235244315432995, + "grad_norm": 0.3702227473258972, + "learning_rate": 0.0003234423251589464, + "loss": 3.3157, + "step": 30550 + }, + { + "epoch": 9.250362844702467, + "grad_norm": 0.3907235860824585, + "learning_rate": 0.0003229881925522252, + "loss": 3.3162, + "step": 30600 + }, + { + "epoch": 9.26548137397194, + "grad_norm": 0.35727185010910034, + "learning_rate": 0.0003225340599455041, + "loss": 3.3156, + "step": 30650 + }, + { + "epoch": 9.280599903241413, + "grad_norm": 0.3772910237312317, + "learning_rate": 0.0003220799273387829, + "loss": 3.3272, + "step": 30700 + }, + { + "epoch": 9.295718432510885, + "grad_norm": 0.38522136211395264, + "learning_rate": 0.0003216257947320617, + "loss": 3.3221, + "step": 30750 + }, + { + "epoch": 9.310836961780359, + "grad_norm": 0.37482699751853943, + "learning_rate": 0.0003211716621253406, + "loss": 3.3235, + "step": 30800 + }, + { + "epoch": 9.32595549104983, + "grad_norm": 0.3574056029319763, + "learning_rate": 0.0003207175295186194, + "loss": 3.3391, + "step": 30850 + }, + { + "epoch": 9.341074020319303, + "grad_norm": 0.37357887625694275, + "learning_rate": 0.00032026339691189825, + "loss": 3.3356, + "step": 30900 + }, + { + "epoch": 9.356192549588776, + "grad_norm": 0.3630390763282776, + "learning_rate": 0.0003198092643051771, + "loss": 3.3334, + "step": 30950 + }, + { + "epoch": 9.371311078858248, + "grad_norm": 0.3472421169281006, + "learning_rate": 0.00031935513169845597, + "loss": 3.3281, + "step": 31000 + }, + { + "epoch": 9.371311078858248, + "eval_accuracy": 0.3679117102194405, + "eval_loss": 3.584568977355957, + "eval_runtime": 182.5198, + "eval_samples_per_second": 91.19, + "eval_steps_per_second": 5.703, + "step": 31000 + }, + { + "epoch": 9.386429608127722, + "grad_norm": 0.3653644919395447, + "learning_rate": 0.00031890099909173474, + "loss": 3.3286, + "step": 31050 + }, + { + "epoch": 9.401548137397194, + "grad_norm": 0.3632447123527527, + "learning_rate": 0.0003184468664850136, + "loss": 3.3314, + "step": 31100 + }, + { + "epoch": 9.416666666666666, + "grad_norm": 0.37565043568611145, + "learning_rate": 0.0003179927338782924, + "loss": 3.3458, + "step": 31150 + }, + { + "epoch": 9.43178519593614, + "grad_norm": 0.3509560525417328, + "learning_rate": 0.0003175386012715713, + "loss": 3.3354, + "step": 31200 + }, + { + "epoch": 9.446903725205612, + "grad_norm": 0.38907599449157715, + "learning_rate": 0.0003170844686648501, + "loss": 3.3462, + "step": 31250 + }, + { + "epoch": 9.462022254475084, + "grad_norm": 0.3524637222290039, + "learning_rate": 0.00031663033605812895, + "loss": 3.3425, + "step": 31300 + }, + { + "epoch": 9.477140783744558, + "grad_norm": 0.3674623370170593, + "learning_rate": 0.00031617620345140773, + "loss": 3.3356, + "step": 31350 + }, + { + "epoch": 9.49225931301403, + "grad_norm": 0.3574652373790741, + "learning_rate": 0.0003157220708446866, + "loss": 3.3486, + "step": 31400 + }, + { + "epoch": 9.507377842283503, + "grad_norm": 0.36924028396606445, + "learning_rate": 0.00031526793823796544, + "loss": 3.3381, + "step": 31450 + }, + { + "epoch": 9.522496371552975, + "grad_norm": 0.3748309910297394, + "learning_rate": 0.0003148138056312443, + "loss": 3.3494, + "step": 31500 + }, + { + "epoch": 9.537614900822447, + "grad_norm": 0.35588720440864563, + "learning_rate": 0.00031435967302452316, + "loss": 3.3437, + "step": 31550 + }, + { + "epoch": 9.552733430091921, + "grad_norm": 0.36605000495910645, + "learning_rate": 0.000313905540417802, + "loss": 3.3367, + "step": 31600 + }, + { + "epoch": 9.567851959361393, + "grad_norm": 0.358410120010376, + "learning_rate": 0.0003134514078110808, + "loss": 3.3494, + "step": 31650 + }, + { + "epoch": 9.582970488630867, + "grad_norm": 0.35448575019836426, + "learning_rate": 0.0003129972752043596, + "loss": 3.3329, + "step": 31700 + }, + { + "epoch": 9.598089017900339, + "grad_norm": 0.36444327235221863, + "learning_rate": 0.0003125431425976385, + "loss": 3.352, + "step": 31750 + }, + { + "epoch": 9.61320754716981, + "grad_norm": 0.3727281093597412, + "learning_rate": 0.0003120890099909173, + "loss": 3.3421, + "step": 31800 + }, + { + "epoch": 9.628326076439285, + "grad_norm": 0.3686982989311218, + "learning_rate": 0.00031163487738419614, + "loss": 3.3528, + "step": 31850 + }, + { + "epoch": 9.643444605708757, + "grad_norm": 0.3870965838432312, + "learning_rate": 0.000311180744777475, + "loss": 3.3532, + "step": 31900 + }, + { + "epoch": 9.658563134978229, + "grad_norm": 0.3674597144126892, + "learning_rate": 0.00031072661217075386, + "loss": 3.3457, + "step": 31950 + }, + { + "epoch": 9.673681664247702, + "grad_norm": 0.3584219813346863, + "learning_rate": 0.0003102724795640327, + "loss": 3.3594, + "step": 32000 + }, + { + "epoch": 9.673681664247702, + "eval_accuracy": 0.36860141534169444, + "eval_loss": 3.5766985416412354, + "eval_runtime": 182.1545, + "eval_samples_per_second": 91.373, + "eval_steps_per_second": 5.715, + "step": 32000 + }, + { + "epoch": 9.688800193517174, + "grad_norm": 0.37279021739959717, + "learning_rate": 0.00030981834695731147, + "loss": 3.3629, + "step": 32050 + }, + { + "epoch": 9.703918722786648, + "grad_norm": 0.35782763361930847, + "learning_rate": 0.00030936421435059035, + "loss": 3.3531, + "step": 32100 + }, + { + "epoch": 9.71903725205612, + "grad_norm": 0.35830315947532654, + "learning_rate": 0.0003089100817438692, + "loss": 3.3649, + "step": 32150 + }, + { + "epoch": 9.734155781325592, + "grad_norm": 0.3557308614253998, + "learning_rate": 0.000308455949137148, + "loss": 3.3552, + "step": 32200 + }, + { + "epoch": 9.749274310595066, + "grad_norm": 0.36622321605682373, + "learning_rate": 0.00030800181653042684, + "loss": 3.3478, + "step": 32250 + }, + { + "epoch": 9.764392839864538, + "grad_norm": 0.38028189539909363, + "learning_rate": 0.00030754768392370573, + "loss": 3.3565, + "step": 32300 + }, + { + "epoch": 9.77951136913401, + "grad_norm": 0.36501839756965637, + "learning_rate": 0.00030709355131698456, + "loss": 3.3522, + "step": 32350 + }, + { + "epoch": 9.794629898403484, + "grad_norm": 0.3978039026260376, + "learning_rate": 0.00030663941871026334, + "loss": 3.3553, + "step": 32400 + }, + { + "epoch": 9.809748427672956, + "grad_norm": 0.36402684450149536, + "learning_rate": 0.00030618528610354217, + "loss": 3.3593, + "step": 32450 + }, + { + "epoch": 9.82486695694243, + "grad_norm": 0.38092276453971863, + "learning_rate": 0.00030573115349682105, + "loss": 3.3608, + "step": 32500 + }, + { + "epoch": 9.839985486211901, + "grad_norm": 0.3872188329696655, + "learning_rate": 0.0003052770208900999, + "loss": 3.3612, + "step": 32550 + }, + { + "epoch": 9.855104015481373, + "grad_norm": 0.3871103525161743, + "learning_rate": 0.0003048228882833787, + "loss": 3.353, + "step": 32600 + }, + { + "epoch": 9.870222544750847, + "grad_norm": 0.3618019223213196, + "learning_rate": 0.00030436875567665754, + "loss": 3.3555, + "step": 32650 + }, + { + "epoch": 9.88534107402032, + "grad_norm": 0.3572273552417755, + "learning_rate": 0.00030391462306993643, + "loss": 3.359, + "step": 32700 + }, + { + "epoch": 9.900459603289793, + "grad_norm": 0.3581383228302002, + "learning_rate": 0.00030346049046321526, + "loss": 3.361, + "step": 32750 + }, + { + "epoch": 9.915578132559265, + "grad_norm": 0.3652154505252838, + "learning_rate": 0.00030300635785649404, + "loss": 3.3456, + "step": 32800 + }, + { + "epoch": 9.930696661828737, + "grad_norm": 0.3734304904937744, + "learning_rate": 0.0003025522252497729, + "loss": 3.3664, + "step": 32850 + }, + { + "epoch": 9.94581519109821, + "grad_norm": 0.3810529112815857, + "learning_rate": 0.00030209809264305175, + "loss": 3.3742, + "step": 32900 + }, + { + "epoch": 9.960933720367683, + "grad_norm": 0.3573470115661621, + "learning_rate": 0.0003016439600363306, + "loss": 3.3652, + "step": 32950 + }, + { + "epoch": 9.976052249637155, + "grad_norm": 0.3516159951686859, + "learning_rate": 0.0003011898274296094, + "loss": 3.3589, + "step": 33000 + }, + { + "epoch": 9.976052249637155, + "eval_accuracy": 0.36912145723663636, + "eval_loss": 3.5680832862854004, + "eval_runtime": 182.3106, + "eval_samples_per_second": 91.295, + "eval_steps_per_second": 5.71, + "step": 33000 + }, + { + "epoch": 9.991170778906628, + "grad_norm": 0.35868018865585327, + "learning_rate": 0.0003007356948228883, + "loss": 3.3603, + "step": 33050 + }, + { + "epoch": 10.006047411707788, + "grad_norm": 0.386972576379776, + "learning_rate": 0.00030028156221616713, + "loss": 3.3157, + "step": 33100 + }, + { + "epoch": 10.021165940977262, + "grad_norm": 0.37675729393959045, + "learning_rate": 0.0002998274296094459, + "loss": 3.2594, + "step": 33150 + }, + { + "epoch": 10.036284470246734, + "grad_norm": 0.3801371157169342, + "learning_rate": 0.0002993732970027248, + "loss": 3.2623, + "step": 33200 + }, + { + "epoch": 10.051402999516206, + "grad_norm": 0.38906607031822205, + "learning_rate": 0.0002989191643960036, + "loss": 3.264, + "step": 33250 + }, + { + "epoch": 10.06652152878568, + "grad_norm": 0.38562434911727905, + "learning_rate": 0.00029846503178928245, + "loss": 3.2688, + "step": 33300 + }, + { + "epoch": 10.081640058055152, + "grad_norm": 0.37735775113105774, + "learning_rate": 0.0002980108991825613, + "loss": 3.2636, + "step": 33350 + }, + { + "epoch": 10.096758587324626, + "grad_norm": 0.3712925314903259, + "learning_rate": 0.0002975567665758401, + "loss": 3.2732, + "step": 33400 + }, + { + "epoch": 10.111877116594098, + "grad_norm": 0.37154871225357056, + "learning_rate": 0.00029710263396911894, + "loss": 3.2768, + "step": 33450 + }, + { + "epoch": 10.12699564586357, + "grad_norm": 0.3917189836502075, + "learning_rate": 0.0002966485013623978, + "loss": 3.2761, + "step": 33500 + }, + { + "epoch": 10.142114175133043, + "grad_norm": 0.3951186239719391, + "learning_rate": 0.00029619436875567666, + "loss": 3.2847, + "step": 33550 + }, + { + "epoch": 10.157232704402515, + "grad_norm": 0.3763997256755829, + "learning_rate": 0.0002957402361489555, + "loss": 3.2954, + "step": 33600 + }, + { + "epoch": 10.17235123367199, + "grad_norm": 0.3606552183628082, + "learning_rate": 0.0002952861035422343, + "loss": 3.2845, + "step": 33650 + }, + { + "epoch": 10.187469762941461, + "grad_norm": 0.37124159932136536, + "learning_rate": 0.00029483197093551315, + "loss": 3.2954, + "step": 33700 + }, + { + "epoch": 10.202588292210933, + "grad_norm": 0.3961988389492035, + "learning_rate": 0.000294377838328792, + "loss": 3.28, + "step": 33750 + }, + { + "epoch": 10.217706821480407, + "grad_norm": 0.3912711441516876, + "learning_rate": 0.0002939237057220708, + "loss": 3.2891, + "step": 33800 + }, + { + "epoch": 10.232825350749879, + "grad_norm": 0.3611540198326111, + "learning_rate": 0.00029346957311534964, + "loss": 3.2862, + "step": 33850 + }, + { + "epoch": 10.247943880019351, + "grad_norm": 0.3812921345233917, + "learning_rate": 0.0002930154405086285, + "loss": 3.2914, + "step": 33900 + }, + { + "epoch": 10.263062409288825, + "grad_norm": 0.3808566927909851, + "learning_rate": 0.00029256130790190736, + "loss": 3.3022, + "step": 33950 + }, + { + "epoch": 10.278180938558297, + "grad_norm": 0.3736206889152527, + "learning_rate": 0.00029210717529518614, + "loss": 3.2886, + "step": 34000 + }, + { + "epoch": 10.278180938558297, + "eval_accuracy": 0.36839671424761944, + "eval_loss": 3.5810015201568604, + "eval_runtime": 182.0653, + "eval_samples_per_second": 91.418, + "eval_steps_per_second": 5.718, + "step": 34000 + }, + { + "epoch": 10.29329946782777, + "grad_norm": 0.36434870958328247, + "learning_rate": 0.000291653042688465, + "loss": 3.2954, + "step": 34050 + }, + { + "epoch": 10.308417997097242, + "grad_norm": 0.38643673062324524, + "learning_rate": 0.00029119891008174385, + "loss": 3.2943, + "step": 34100 + }, + { + "epoch": 10.323536526366714, + "grad_norm": 0.38649046421051025, + "learning_rate": 0.0002907447774750227, + "loss": 3.3119, + "step": 34150 + }, + { + "epoch": 10.338655055636188, + "grad_norm": 0.3665493130683899, + "learning_rate": 0.0002902906448683015, + "loss": 3.3019, + "step": 34200 + }, + { + "epoch": 10.35377358490566, + "grad_norm": 0.3971233069896698, + "learning_rate": 0.00028983651226158034, + "loss": 3.2944, + "step": 34250 + }, + { + "epoch": 10.368892114175132, + "grad_norm": 0.3803875148296356, + "learning_rate": 0.00028938237965485923, + "loss": 3.3025, + "step": 34300 + }, + { + "epoch": 10.384010643444606, + "grad_norm": 0.3761271834373474, + "learning_rate": 0.000288928247048138, + "loss": 3.2975, + "step": 34350 + }, + { + "epoch": 10.399129172714078, + "grad_norm": 0.3758532702922821, + "learning_rate": 0.0002884741144414169, + "loss": 3.3038, + "step": 34400 + }, + { + "epoch": 10.414247701983552, + "grad_norm": 0.3741230368614197, + "learning_rate": 0.0002880199818346957, + "loss": 3.2986, + "step": 34450 + }, + { + "epoch": 10.429366231253024, + "grad_norm": 0.371698260307312, + "learning_rate": 0.00028756584922797455, + "loss": 3.3093, + "step": 34500 + }, + { + "epoch": 10.444484760522496, + "grad_norm": 0.3635827898979187, + "learning_rate": 0.0002871117166212534, + "loss": 3.3104, + "step": 34550 + }, + { + "epoch": 10.45960328979197, + "grad_norm": 0.3689658045768738, + "learning_rate": 0.0002866575840145322, + "loss": 3.3083, + "step": 34600 + }, + { + "epoch": 10.474721819061442, + "grad_norm": 0.37572813034057617, + "learning_rate": 0.00028620345140781104, + "loss": 3.3092, + "step": 34650 + }, + { + "epoch": 10.489840348330915, + "grad_norm": 0.37176594138145447, + "learning_rate": 0.0002857493188010899, + "loss": 3.3139, + "step": 34700 + }, + { + "epoch": 10.504958877600387, + "grad_norm": 0.38343092799186707, + "learning_rate": 0.0002852951861943687, + "loss": 3.3042, + "step": 34750 + }, + { + "epoch": 10.52007740686986, + "grad_norm": 0.3622838258743286, + "learning_rate": 0.0002848410535876476, + "loss": 3.3099, + "step": 34800 + }, + { + "epoch": 10.535195936139333, + "grad_norm": 0.3808203339576721, + "learning_rate": 0.00028438692098092637, + "loss": 3.3134, + "step": 34850 + }, + { + "epoch": 10.550314465408805, + "grad_norm": 0.3763934373855591, + "learning_rate": 0.00028393278837420525, + "loss": 3.3119, + "step": 34900 + }, + { + "epoch": 10.565432994678277, + "grad_norm": 0.39781829714775085, + "learning_rate": 0.0002834786557674841, + "loss": 3.3192, + "step": 34950 + }, + { + "epoch": 10.58055152394775, + "grad_norm": 0.3610644042491913, + "learning_rate": 0.0002830245231607629, + "loss": 3.3164, + "step": 35000 + }, + { + "epoch": 10.58055152394775, + "eval_accuracy": 0.3692526728718964, + "eval_loss": 3.5734996795654297, + "eval_runtime": 182.1137, + "eval_samples_per_second": 91.393, + "eval_steps_per_second": 5.716, + "step": 35000 + }, + { + "epoch": 10.595670053217223, + "grad_norm": 0.3807249665260315, + "learning_rate": 0.00028257039055404174, + "loss": 3.3209, + "step": 35050 + }, + { + "epoch": 10.610788582486695, + "grad_norm": 0.3662697672843933, + "learning_rate": 0.0002821162579473206, + "loss": 3.3197, + "step": 35100 + }, + { + "epoch": 10.625907111756169, + "grad_norm": 0.3839268386363983, + "learning_rate": 0.00028166212534059946, + "loss": 3.3253, + "step": 35150 + }, + { + "epoch": 10.64102564102564, + "grad_norm": 0.41075611114501953, + "learning_rate": 0.00028120799273387824, + "loss": 3.3404, + "step": 35200 + }, + { + "epoch": 10.656144170295114, + "grad_norm": 0.37889009714126587, + "learning_rate": 0.0002807538601271571, + "loss": 3.3136, + "step": 35250 + }, + { + "epoch": 10.671262699564586, + "grad_norm": 0.37822359800338745, + "learning_rate": 0.00028029972752043595, + "loss": 3.3198, + "step": 35300 + }, + { + "epoch": 10.686381228834058, + "grad_norm": 0.3874446749687195, + "learning_rate": 0.0002798455949137148, + "loss": 3.3341, + "step": 35350 + }, + { + "epoch": 10.701499758103532, + "grad_norm": 0.36894306540489197, + "learning_rate": 0.0002793914623069936, + "loss": 3.3199, + "step": 35400 + }, + { + "epoch": 10.716618287373004, + "grad_norm": 0.377370148897171, + "learning_rate": 0.00027893732970027244, + "loss": 3.3164, + "step": 35450 + }, + { + "epoch": 10.731736816642478, + "grad_norm": 0.4032292366027832, + "learning_rate": 0.0002784831970935513, + "loss": 3.3203, + "step": 35500 + }, + { + "epoch": 10.74685534591195, + "grad_norm": 0.38730841875076294, + "learning_rate": 0.00027802906448683016, + "loss": 3.3291, + "step": 35550 + }, + { + "epoch": 10.761973875181422, + "grad_norm": 0.3680964708328247, + "learning_rate": 0.00027757493188010894, + "loss": 3.3402, + "step": 35600 + }, + { + "epoch": 10.777092404450896, + "grad_norm": 0.3801586925983429, + "learning_rate": 0.0002771207992733878, + "loss": 3.3207, + "step": 35650 + }, + { + "epoch": 10.792210933720368, + "grad_norm": 0.3695020079612732, + "learning_rate": 0.00027666666666666665, + "loss": 3.3398, + "step": 35700 + }, + { + "epoch": 10.80732946298984, + "grad_norm": 0.37439194321632385, + "learning_rate": 0.0002762125340599455, + "loss": 3.3204, + "step": 35750 + }, + { + "epoch": 10.822447992259313, + "grad_norm": 0.38966503739356995, + "learning_rate": 0.0002757584014532243, + "loss": 3.321, + "step": 35800 + }, + { + "epoch": 10.837566521528785, + "grad_norm": 0.3790731430053711, + "learning_rate": 0.00027530426884650314, + "loss": 3.3157, + "step": 35850 + }, + { + "epoch": 10.852685050798259, + "grad_norm": 0.3855128586292267, + "learning_rate": 0.00027485013623978203, + "loss": 3.3352, + "step": 35900 + }, + { + "epoch": 10.867803580067731, + "grad_norm": 0.36516350507736206, + "learning_rate": 0.0002743960036330608, + "loss": 3.3265, + "step": 35950 + }, + { + "epoch": 10.882922109337203, + "grad_norm": 0.3645329177379608, + "learning_rate": 0.0002739418710263397, + "loss": 3.333, + "step": 36000 + }, + { + "epoch": 10.882922109337203, + "eval_accuracy": 0.3697523739918383, + "eval_loss": 3.5634424686431885, + "eval_runtime": 182.1916, + "eval_samples_per_second": 91.354, + "eval_steps_per_second": 5.714, + "step": 36000 + }, + { + "epoch": 10.898040638606677, + "grad_norm": 0.3843611180782318, + "learning_rate": 0.0002734877384196185, + "loss": 3.3418, + "step": 36050 + }, + { + "epoch": 10.913159167876149, + "grad_norm": 0.3862791061401367, + "learning_rate": 0.00027303360581289735, + "loss": 3.3377, + "step": 36100 + }, + { + "epoch": 10.92827769714562, + "grad_norm": 0.39486852288246155, + "learning_rate": 0.0002725794732061762, + "loss": 3.3234, + "step": 36150 + }, + { + "epoch": 10.943396226415095, + "grad_norm": 0.38032087683677673, + "learning_rate": 0.000272125340599455, + "loss": 3.3271, + "step": 36200 + }, + { + "epoch": 10.958514755684567, + "grad_norm": 0.3659936189651489, + "learning_rate": 0.00027167120799273384, + "loss": 3.3357, + "step": 36250 + }, + { + "epoch": 10.97363328495404, + "grad_norm": 0.37582048773765564, + "learning_rate": 0.0002712170753860127, + "loss": 3.3281, + "step": 36300 + }, + { + "epoch": 10.988751814223512, + "grad_norm": 0.3983675241470337, + "learning_rate": 0.0002707629427792915, + "loss": 3.3304, + "step": 36350 + }, + { + "epoch": 11.003628447024674, + "grad_norm": 0.37397173047065735, + "learning_rate": 0.0002703088101725704, + "loss": 3.3006, + "step": 36400 + }, + { + "epoch": 11.018746976294146, + "grad_norm": 0.3784228265285492, + "learning_rate": 0.0002698546775658492, + "loss": 3.2259, + "step": 36450 + }, + { + "epoch": 11.033865505563618, + "grad_norm": 0.3985145390033722, + "learning_rate": 0.00026940054495912805, + "loss": 3.2336, + "step": 36500 + }, + { + "epoch": 11.048984034833092, + "grad_norm": 0.3934805989265442, + "learning_rate": 0.0002689464123524069, + "loss": 3.2294, + "step": 36550 + }, + { + "epoch": 11.064102564102564, + "grad_norm": 0.3866509199142456, + "learning_rate": 0.0002684922797456857, + "loss": 3.2465, + "step": 36600 + }, + { + "epoch": 11.079221093372038, + "grad_norm": 0.38315460085868835, + "learning_rate": 0.00026803814713896454, + "loss": 3.2519, + "step": 36650 + }, + { + "epoch": 11.09433962264151, + "grad_norm": 0.37440481781959534, + "learning_rate": 0.0002675840145322434, + "loss": 3.2416, + "step": 36700 + }, + { + "epoch": 11.109458151910982, + "grad_norm": 0.3933398425579071, + "learning_rate": 0.00026712988192552226, + "loss": 3.2557, + "step": 36750 + }, + { + "epoch": 11.124576681180455, + "grad_norm": 0.3803589940071106, + "learning_rate": 0.00026667574931880104, + "loss": 3.252, + "step": 36800 + }, + { + "epoch": 11.139695210449927, + "grad_norm": 0.38710469007492065, + "learning_rate": 0.0002662216167120799, + "loss": 3.2331, + "step": 36850 + }, + { + "epoch": 11.1548137397194, + "grad_norm": 0.38195425271987915, + "learning_rate": 0.00026576748410535875, + "loss": 3.2622, + "step": 36900 + }, + { + "epoch": 11.169932268988873, + "grad_norm": 0.3998967409133911, + "learning_rate": 0.0002653133514986376, + "loss": 3.2556, + "step": 36950 + }, + { + "epoch": 11.185050798258345, + "grad_norm": 0.398590624332428, + "learning_rate": 0.0002648592188919164, + "loss": 3.2511, + "step": 37000 + }, + { + "epoch": 11.185050798258345, + "eval_accuracy": 0.3694328004285437, + "eval_loss": 3.575971841812134, + "eval_runtime": 181.6699, + "eval_samples_per_second": 91.617, + "eval_steps_per_second": 5.73, + "step": 37000 + }, + { + "epoch": 11.200169327527819, + "grad_norm": 0.40366700291633606, + "learning_rate": 0.00026440508628519524, + "loss": 3.26, + "step": 37050 + }, + { + "epoch": 11.215287856797291, + "grad_norm": 0.39497584104537964, + "learning_rate": 0.0002639509536784741, + "loss": 3.2659, + "step": 37100 + }, + { + "epoch": 11.230406386066763, + "grad_norm": 0.39283204078674316, + "learning_rate": 0.0002634968210717529, + "loss": 3.2634, + "step": 37150 + }, + { + "epoch": 11.245524915336237, + "grad_norm": 0.39168092608451843, + "learning_rate": 0.0002630426884650318, + "loss": 3.2622, + "step": 37200 + }, + { + "epoch": 11.260643444605709, + "grad_norm": 0.37125763297080994, + "learning_rate": 0.0002625885558583106, + "loss": 3.2655, + "step": 37250 + }, + { + "epoch": 11.27576197387518, + "grad_norm": 0.3719472289085388, + "learning_rate": 0.00026213442325158945, + "loss": 3.2694, + "step": 37300 + }, + { + "epoch": 11.290880503144654, + "grad_norm": 0.39197173714637756, + "learning_rate": 0.0002616802906448683, + "loss": 3.2616, + "step": 37350 + }, + { + "epoch": 11.305999032414126, + "grad_norm": 0.3874538540840149, + "learning_rate": 0.0002612261580381471, + "loss": 3.2726, + "step": 37400 + }, + { + "epoch": 11.3211175616836, + "grad_norm": 0.39033427834510803, + "learning_rate": 0.00026077202543142594, + "loss": 3.2733, + "step": 37450 + }, + { + "epoch": 11.336236090953072, + "grad_norm": 0.38132426142692566, + "learning_rate": 0.0002603178928247048, + "loss": 3.2841, + "step": 37500 + }, + { + "epoch": 11.351354620222544, + "grad_norm": 0.39073213934898376, + "learning_rate": 0.0002598637602179836, + "loss": 3.2753, + "step": 37550 + }, + { + "epoch": 11.366473149492018, + "grad_norm": 0.4000060558319092, + "learning_rate": 0.0002594096276112625, + "loss": 3.2635, + "step": 37600 + }, + { + "epoch": 11.38159167876149, + "grad_norm": 0.39490681886672974, + "learning_rate": 0.00025895549500454127, + "loss": 3.2764, + "step": 37650 + }, + { + "epoch": 11.396710208030962, + "grad_norm": 0.4136489927768707, + "learning_rate": 0.00025850136239782015, + "loss": 3.2757, + "step": 37700 + }, + { + "epoch": 11.411828737300436, + "grad_norm": 0.3896493911743164, + "learning_rate": 0.000258047229791099, + "loss": 3.2851, + "step": 37750 + }, + { + "epoch": 11.426947266569908, + "grad_norm": 0.3810581862926483, + "learning_rate": 0.0002575930971843778, + "loss": 3.285, + "step": 37800 + }, + { + "epoch": 11.442065795839381, + "grad_norm": 0.38600221276283264, + "learning_rate": 0.00025713896457765664, + "loss": 3.2936, + "step": 37850 + }, + { + "epoch": 11.457184325108853, + "grad_norm": 0.3670991361141205, + "learning_rate": 0.0002566848319709355, + "loss": 3.2807, + "step": 37900 + }, + { + "epoch": 11.472302854378325, + "grad_norm": 0.3892036974430084, + "learning_rate": 0.00025623069936421436, + "loss": 3.2759, + "step": 37950 + }, + { + "epoch": 11.4874213836478, + "grad_norm": 0.4121161997318268, + "learning_rate": 0.00025577656675749314, + "loss": 3.2867, + "step": 38000 + }, + { + "epoch": 11.4874213836478, + "eval_accuracy": 0.3699749467494971, + "eval_loss": 3.5709855556488037, + "eval_runtime": 181.4502, + "eval_samples_per_second": 91.728, + "eval_steps_per_second": 5.737, + "step": 38000 + }, + { + "epoch": 11.502539912917271, + "grad_norm": 0.3651583790779114, + "learning_rate": 0.000255322434150772, + "loss": 3.2857, + "step": 38050 + }, + { + "epoch": 11.517658442186743, + "grad_norm": 0.39698708057403564, + "learning_rate": 0.00025486830154405085, + "loss": 3.2755, + "step": 38100 + }, + { + "epoch": 11.532776971456217, + "grad_norm": 0.3846209943294525, + "learning_rate": 0.0002544141689373297, + "loss": 3.2946, + "step": 38150 + }, + { + "epoch": 11.547895500725689, + "grad_norm": 0.395131915807724, + "learning_rate": 0.0002539600363306085, + "loss": 3.2884, + "step": 38200 + }, + { + "epoch": 11.563014029995163, + "grad_norm": 0.4034663736820221, + "learning_rate": 0.00025350590372388734, + "loss": 3.2797, + "step": 38250 + }, + { + "epoch": 11.578132559264635, + "grad_norm": 0.3774363398551941, + "learning_rate": 0.0002530517711171662, + "loss": 3.2811, + "step": 38300 + }, + { + "epoch": 11.593251088534107, + "grad_norm": 0.386628121137619, + "learning_rate": 0.00025259763851044506, + "loss": 3.2884, + "step": 38350 + }, + { + "epoch": 11.60836961780358, + "grad_norm": 0.39379242062568665, + "learning_rate": 0.00025214350590372384, + "loss": 3.2942, + "step": 38400 + }, + { + "epoch": 11.623488147073052, + "grad_norm": 0.403767466545105, + "learning_rate": 0.0002516893732970027, + "loss": 3.2802, + "step": 38450 + }, + { + "epoch": 11.638606676342526, + "grad_norm": 0.37925538420677185, + "learning_rate": 0.00025123524069028155, + "loss": 3.2897, + "step": 38500 + }, + { + "epoch": 11.653725205611998, + "grad_norm": 0.38747406005859375, + "learning_rate": 0.0002507811080835604, + "loss": 3.2969, + "step": 38550 + }, + { + "epoch": 11.66884373488147, + "grad_norm": 0.39923471212387085, + "learning_rate": 0.0002503269754768392, + "loss": 3.2988, + "step": 38600 + }, + { + "epoch": 11.683962264150944, + "grad_norm": 0.3835216164588928, + "learning_rate": 0.00024987284287011804, + "loss": 3.2925, + "step": 38650 + }, + { + "epoch": 11.699080793420416, + "grad_norm": 0.37910664081573486, + "learning_rate": 0.00024941871026339693, + "loss": 3.2974, + "step": 38700 + }, + { + "epoch": 11.714199322689888, + "grad_norm": 0.3880641758441925, + "learning_rate": 0.0002489645776566757, + "loss": 3.2975, + "step": 38750 + }, + { + "epoch": 11.729317851959362, + "grad_norm": 0.4259456694126129, + "learning_rate": 0.0002485104450499546, + "loss": 3.3015, + "step": 38800 + }, + { + "epoch": 11.744436381228834, + "grad_norm": 0.387437105178833, + "learning_rate": 0.0002480563124432334, + "loss": 3.3009, + "step": 38850 + }, + { + "epoch": 11.759554910498307, + "grad_norm": 0.39015689492225647, + "learning_rate": 0.00024760217983651225, + "loss": 3.2999, + "step": 38900 + }, + { + "epoch": 11.77467343976778, + "grad_norm": 0.41513824462890625, + "learning_rate": 0.0002471480472297911, + "loss": 3.2961, + "step": 38950 + }, + { + "epoch": 11.789791969037251, + "grad_norm": 0.38026106357574463, + "learning_rate": 0.0002466939146230699, + "loss": 3.2981, + "step": 39000 + }, + { + "epoch": 11.789791969037251, + "eval_accuracy": 0.3703576590190056, + "eval_loss": 3.5617599487304688, + "eval_runtime": 181.747, + "eval_samples_per_second": 91.578, + "eval_steps_per_second": 5.728, + "step": 39000 + }, + { + "epoch": 11.804910498306725, + "grad_norm": 0.3747030794620514, + "learning_rate": 0.00024623978201634874, + "loss": 3.3024, + "step": 39050 + }, + { + "epoch": 11.820029027576197, + "grad_norm": 0.37710049748420715, + "learning_rate": 0.0002457856494096276, + "loss": 3.3002, + "step": 39100 + }, + { + "epoch": 11.83514755684567, + "grad_norm": 0.3922806978225708, + "learning_rate": 0.0002453315168029064, + "loss": 3.313, + "step": 39150 + }, + { + "epoch": 11.850266086115143, + "grad_norm": 0.4067835509777069, + "learning_rate": 0.0002448773841961853, + "loss": 3.3049, + "step": 39200 + }, + { + "epoch": 11.865384615384615, + "grad_norm": 0.3902236223220825, + "learning_rate": 0.00024442325158946407, + "loss": 3.2974, + "step": 39250 + }, + { + "epoch": 11.880503144654089, + "grad_norm": 0.39979732036590576, + "learning_rate": 0.00024396911898274295, + "loss": 3.3028, + "step": 39300 + }, + { + "epoch": 11.89562167392356, + "grad_norm": 0.3815559446811676, + "learning_rate": 0.00024351498637602178, + "loss": 3.3034, + "step": 39350 + }, + { + "epoch": 11.910740203193033, + "grad_norm": 0.39621683955192566, + "learning_rate": 0.0002430608537693006, + "loss": 3.3031, + "step": 39400 + }, + { + "epoch": 11.925858732462506, + "grad_norm": 0.3927263915538788, + "learning_rate": 0.00024260672116257947, + "loss": 3.3094, + "step": 39450 + }, + { + "epoch": 11.940977261731978, + "grad_norm": 0.386884868144989, + "learning_rate": 0.00024215258855585827, + "loss": 3.3016, + "step": 39500 + }, + { + "epoch": 11.956095791001452, + "grad_norm": 0.3932025134563446, + "learning_rate": 0.00024169845594913713, + "loss": 3.2988, + "step": 39550 + }, + { + "epoch": 11.971214320270924, + "grad_norm": 0.38946929574012756, + "learning_rate": 0.00024124432334241596, + "loss": 3.2982, + "step": 39600 + }, + { + "epoch": 11.986332849540396, + "grad_norm": 0.386614054441452, + "learning_rate": 0.00024079019073569482, + "loss": 3.3105, + "step": 39650 + }, + { + "epoch": 12.001209482341558, + "grad_norm": 0.4196450710296631, + "learning_rate": 0.00024033605812897362, + "loss": 3.3075, + "step": 39700 + }, + { + "epoch": 12.01632801161103, + "grad_norm": 0.38088303804397583, + "learning_rate": 0.00023988192552225248, + "loss": 3.1953, + "step": 39750 + }, + { + "epoch": 12.031446540880504, + "grad_norm": 0.4057537615299225, + "learning_rate": 0.0002394277929155313, + "loss": 3.2045, + "step": 39800 + }, + { + "epoch": 12.046565070149976, + "grad_norm": 0.4013504683971405, + "learning_rate": 0.00023897366030881014, + "loss": 3.2117, + "step": 39850 + }, + { + "epoch": 12.061683599419448, + "grad_norm": 0.4028356075286865, + "learning_rate": 0.00023851952770208897, + "loss": 3.2161, + "step": 39900 + }, + { + "epoch": 12.076802128688922, + "grad_norm": 0.39186498522758484, + "learning_rate": 0.00023806539509536783, + "loss": 3.2256, + "step": 39950 + }, + { + "epoch": 12.091920657958394, + "grad_norm": 0.40833449363708496, + "learning_rate": 0.00023761126248864664, + "loss": 3.2113, + "step": 40000 + }, + { + "epoch": 12.091920657958394, + "eval_accuracy": 0.37015895433836987, + "eval_loss": 3.572819471359253, + "eval_runtime": 181.5667, + "eval_samples_per_second": 91.669, + "eval_steps_per_second": 5.733, + "step": 40000 + }, + { + "epoch": 12.107039187227867, + "grad_norm": 0.39534837007522583, + "learning_rate": 0.0002371571298819255, + "loss": 3.2207, + "step": 40050 + }, + { + "epoch": 12.12215771649734, + "grad_norm": 0.4102253317832947, + "learning_rate": 0.00023670299727520435, + "loss": 3.21, + "step": 40100 + }, + { + "epoch": 12.137276245766811, + "grad_norm": 0.41309070587158203, + "learning_rate": 0.00023624886466848318, + "loss": 3.2275, + "step": 40150 + }, + { + "epoch": 12.152394775036285, + "grad_norm": 0.3820253908634186, + "learning_rate": 0.00023579473206176204, + "loss": 3.2241, + "step": 40200 + }, + { + "epoch": 12.167513304305757, + "grad_norm": 0.4166340231895447, + "learning_rate": 0.00023534059945504084, + "loss": 3.2359, + "step": 40250 + }, + { + "epoch": 12.182631833575229, + "grad_norm": 0.38069966435432434, + "learning_rate": 0.0002348864668483197, + "loss": 3.2177, + "step": 40300 + }, + { + "epoch": 12.197750362844703, + "grad_norm": 0.4127219617366791, + "learning_rate": 0.00023443233424159853, + "loss": 3.2377, + "step": 40350 + }, + { + "epoch": 12.212868892114175, + "grad_norm": 0.3831850588321686, + "learning_rate": 0.00023397820163487736, + "loss": 3.2367, + "step": 40400 + }, + { + "epoch": 12.227987421383649, + "grad_norm": 0.3869633078575134, + "learning_rate": 0.0002335240690281562, + "loss": 3.2443, + "step": 40450 + }, + { + "epoch": 12.24310595065312, + "grad_norm": 0.4016641676425934, + "learning_rate": 0.00023306993642143505, + "loss": 3.2473, + "step": 40500 + }, + { + "epoch": 12.258224479922593, + "grad_norm": 0.4102655053138733, + "learning_rate": 0.00023261580381471385, + "loss": 3.2412, + "step": 40550 + }, + { + "epoch": 12.273343009192066, + "grad_norm": 0.396590918302536, + "learning_rate": 0.0002321616712079927, + "loss": 3.2485, + "step": 40600 + }, + { + "epoch": 12.288461538461538, + "grad_norm": 0.39611926674842834, + "learning_rate": 0.00023170753860127154, + "loss": 3.2371, + "step": 40650 + }, + { + "epoch": 12.30358006773101, + "grad_norm": 0.3962503969669342, + "learning_rate": 0.0002312534059945504, + "loss": 3.2564, + "step": 40700 + }, + { + "epoch": 12.318698597000484, + "grad_norm": 0.3942914605140686, + "learning_rate": 0.00023079927338782923, + "loss": 3.2442, + "step": 40750 + }, + { + "epoch": 12.333817126269956, + "grad_norm": 0.3998517096042633, + "learning_rate": 0.00023034514078110806, + "loss": 3.2515, + "step": 40800 + }, + { + "epoch": 12.34893565553943, + "grad_norm": 0.3914017081260681, + "learning_rate": 0.00022989100817438692, + "loss": 3.2522, + "step": 40850 + }, + { + "epoch": 12.364054184808902, + "grad_norm": 0.3937617242336273, + "learning_rate": 0.00022943687556766572, + "loss": 3.2534, + "step": 40900 + }, + { + "epoch": 12.379172714078374, + "grad_norm": 0.39187556505203247, + "learning_rate": 0.00022898274296094458, + "loss": 3.2531, + "step": 40950 + }, + { + "epoch": 12.394291243347848, + "grad_norm": 0.40456128120422363, + "learning_rate": 0.0002285286103542234, + "loss": 3.2584, + "step": 41000 + }, + { + "epoch": 12.394291243347848, + "eval_accuracy": 0.370464418693572, + "eval_loss": 3.568147659301758, + "eval_runtime": 181.5665, + "eval_samples_per_second": 91.669, + "eval_steps_per_second": 5.733, + "step": 41000 + }, + { + "epoch": 12.40940977261732, + "grad_norm": 0.4108814597129822, + "learning_rate": 0.00022807447774750227, + "loss": 3.2602, + "step": 41050 + }, + { + "epoch": 12.424528301886792, + "grad_norm": 0.40934261679649353, + "learning_rate": 0.00022762034514078107, + "loss": 3.2471, + "step": 41100 + }, + { + "epoch": 12.439646831156265, + "grad_norm": 0.4114815592765808, + "learning_rate": 0.00022716621253405993, + "loss": 3.2563, + "step": 41150 + }, + { + "epoch": 12.454765360425737, + "grad_norm": 0.41049736738204956, + "learning_rate": 0.00022671207992733876, + "loss": 3.2481, + "step": 41200 + }, + { + "epoch": 12.469883889695211, + "grad_norm": 0.40271687507629395, + "learning_rate": 0.0002262579473206176, + "loss": 3.2655, + "step": 41250 + }, + { + "epoch": 12.485002418964683, + "grad_norm": 0.39302605390548706, + "learning_rate": 0.00022580381471389642, + "loss": 3.2559, + "step": 41300 + }, + { + "epoch": 12.500120948234155, + "grad_norm": 0.4222753643989563, + "learning_rate": 0.00022534968210717528, + "loss": 3.2646, + "step": 41350 + }, + { + "epoch": 12.515239477503629, + "grad_norm": 0.3878752887248993, + "learning_rate": 0.00022489554950045408, + "loss": 3.2597, + "step": 41400 + }, + { + "epoch": 12.5303580067731, + "grad_norm": 0.3960876762866974, + "learning_rate": 0.00022444141689373294, + "loss": 3.2597, + "step": 41450 + }, + { + "epoch": 12.545476536042575, + "grad_norm": 0.3885483741760254, + "learning_rate": 0.0002239872842870118, + "loss": 3.2664, + "step": 41500 + }, + { + "epoch": 12.560595065312047, + "grad_norm": 0.37746551632881165, + "learning_rate": 0.00022353315168029063, + "loss": 3.27, + "step": 41550 + }, + { + "epoch": 12.575713594581519, + "grad_norm": 0.4066789150238037, + "learning_rate": 0.0002230790190735695, + "loss": 3.262, + "step": 41600 + }, + { + "epoch": 12.590832123850992, + "grad_norm": 0.4039228558540344, + "learning_rate": 0.0002226248864668483, + "loss": 3.2678, + "step": 41650 + }, + { + "epoch": 12.605950653120464, + "grad_norm": 0.3782307803630829, + "learning_rate": 0.00022217075386012715, + "loss": 3.272, + "step": 41700 + }, + { + "epoch": 12.621069182389936, + "grad_norm": 0.40933120250701904, + "learning_rate": 0.00022171662125340598, + "loss": 3.271, + "step": 41750 + }, + { + "epoch": 12.63618771165941, + "grad_norm": 0.422473669052124, + "learning_rate": 0.0002212624886466848, + "loss": 3.2552, + "step": 41800 + }, + { + "epoch": 12.651306240928882, + "grad_norm": 0.39664652943611145, + "learning_rate": 0.00022080835603996364, + "loss": 3.2603, + "step": 41850 + }, + { + "epoch": 12.666424770198356, + "grad_norm": 0.4060044586658478, + "learning_rate": 0.0002203542234332425, + "loss": 3.268, + "step": 41900 + }, + { + "epoch": 12.681543299467828, + "grad_norm": 0.4117172956466675, + "learning_rate": 0.0002199000908265213, + "loss": 3.2788, + "step": 41950 + }, + { + "epoch": 12.6966618287373, + "grad_norm": 0.3968527317047119, + "learning_rate": 0.00021944595821980016, + "loss": 3.2717, + "step": 42000 + }, + { + "epoch": 12.6966618287373, + "eval_accuracy": 0.37127358177767555, + "eval_loss": 3.560342788696289, + "eval_runtime": 181.731, + "eval_samples_per_second": 91.586, + "eval_steps_per_second": 5.728, + "step": 42000 + }, + { + "epoch": 12.711780358006774, + "grad_norm": 0.3838765025138855, + "learning_rate": 0.000218991825613079, + "loss": 3.269, + "step": 42050 + }, + { + "epoch": 12.726898887276246, + "grad_norm": 0.40154194831848145, + "learning_rate": 0.00021853769300635785, + "loss": 3.2805, + "step": 42100 + }, + { + "epoch": 12.742017416545718, + "grad_norm": 0.41265416145324707, + "learning_rate": 0.00021808356039963665, + "loss": 3.2806, + "step": 42150 + }, + { + "epoch": 12.757135945815191, + "grad_norm": 0.40578603744506836, + "learning_rate": 0.0002176294277929155, + "loss": 3.2676, + "step": 42200 + }, + { + "epoch": 12.772254475084663, + "grad_norm": 0.4076383113861084, + "learning_rate": 0.00021717529518619437, + "loss": 3.273, + "step": 42250 + }, + { + "epoch": 12.787373004354137, + "grad_norm": 0.38173574209213257, + "learning_rate": 0.00021672116257947317, + "loss": 3.2639, + "step": 42300 + }, + { + "epoch": 12.80249153362361, + "grad_norm": 0.4017915427684784, + "learning_rate": 0.00021626702997275203, + "loss": 3.2709, + "step": 42350 + }, + { + "epoch": 12.817610062893081, + "grad_norm": 0.400341659784317, + "learning_rate": 0.00021581289736603086, + "loss": 3.2604, + "step": 42400 + }, + { + "epoch": 12.832728592162555, + "grad_norm": 0.40442341566085815, + "learning_rate": 0.00021535876475930972, + "loss": 3.2677, + "step": 42450 + }, + { + "epoch": 12.847847121432027, + "grad_norm": 0.39479678869247437, + "learning_rate": 0.00021490463215258852, + "loss": 3.2707, + "step": 42500 + }, + { + "epoch": 12.8629656507015, + "grad_norm": 0.3941473364830017, + "learning_rate": 0.00021445049954586738, + "loss": 3.2603, + "step": 42550 + }, + { + "epoch": 12.878084179970973, + "grad_norm": 0.3957080841064453, + "learning_rate": 0.0002139963669391462, + "loss": 3.2705, + "step": 42600 + }, + { + "epoch": 12.893202709240445, + "grad_norm": 0.3981188237667084, + "learning_rate": 0.00021354223433242504, + "loss": 3.2811, + "step": 42650 + }, + { + "epoch": 12.908321238509918, + "grad_norm": 0.40485623478889465, + "learning_rate": 0.00021308810172570387, + "loss": 3.2726, + "step": 42700 + }, + { + "epoch": 12.92343976777939, + "grad_norm": 0.42019230127334595, + "learning_rate": 0.00021263396911898273, + "loss": 3.273, + "step": 42750 + }, + { + "epoch": 12.938558297048862, + "grad_norm": 0.40033140778541565, + "learning_rate": 0.00021217983651226153, + "loss": 3.2837, + "step": 42800 + }, + { + "epoch": 12.953676826318336, + "grad_norm": 0.3891710638999939, + "learning_rate": 0.0002117257039055404, + "loss": 3.2718, + "step": 42850 + }, + { + "epoch": 12.968795355587808, + "grad_norm": 0.38506442308425903, + "learning_rate": 0.00021127157129881925, + "loss": 3.2775, + "step": 42900 + }, + { + "epoch": 12.98391388485728, + "grad_norm": 0.38258466124534607, + "learning_rate": 0.00021081743869209808, + "loss": 3.2817, + "step": 42950 + }, + { + "epoch": 12.999032414126754, + "grad_norm": 0.3776659667491913, + "learning_rate": 0.00021036330608537694, + "loss": 3.2805, + "step": 43000 + }, + { + "epoch": 12.999032414126754, + "eval_accuracy": 0.3715297814812881, + "eval_loss": 3.5555002689361572, + "eval_runtime": 181.8803, + "eval_samples_per_second": 91.511, + "eval_steps_per_second": 5.724, + "step": 43000 + }, + { + "epoch": 13.013909046927916, + "grad_norm": 0.3910834789276123, + "learning_rate": 0.00020990917347865574, + "loss": 3.1917, + "step": 43050 + }, + { + "epoch": 13.029027576197388, + "grad_norm": 0.3958907425403595, + "learning_rate": 0.0002094550408719346, + "loss": 3.188, + "step": 43100 + }, + { + "epoch": 13.04414610546686, + "grad_norm": 0.3842446506023407, + "learning_rate": 0.00020900090826521343, + "loss": 3.175, + "step": 43150 + }, + { + "epoch": 13.059264634736333, + "grad_norm": 0.40903329849243164, + "learning_rate": 0.00020854677565849226, + "loss": 3.188, + "step": 43200 + }, + { + "epoch": 13.074383164005805, + "grad_norm": 0.42529699206352234, + "learning_rate": 0.0002080926430517711, + "loss": 3.1999, + "step": 43250 + }, + { + "epoch": 13.089501693275277, + "grad_norm": 0.38891202211380005, + "learning_rate": 0.00020763851044504995, + "loss": 3.1974, + "step": 43300 + }, + { + "epoch": 13.104620222544751, + "grad_norm": 0.4020247459411621, + "learning_rate": 0.00020718437783832875, + "loss": 3.2018, + "step": 43350 + }, + { + "epoch": 13.119738751814223, + "grad_norm": 0.4016630947589874, + "learning_rate": 0.0002067302452316076, + "loss": 3.206, + "step": 43400 + }, + { + "epoch": 13.134857281083697, + "grad_norm": 0.4027242958545685, + "learning_rate": 0.00020627611262488644, + "loss": 3.1896, + "step": 43450 + }, + { + "epoch": 13.149975810353169, + "grad_norm": 0.4166494607925415, + "learning_rate": 0.0002058219800181653, + "loss": 3.215, + "step": 43500 + }, + { + "epoch": 13.165094339622641, + "grad_norm": 0.42354124784469604, + "learning_rate": 0.0002053678474114441, + "loss": 3.2164, + "step": 43550 + }, + { + "epoch": 13.180212868892115, + "grad_norm": 0.387747198343277, + "learning_rate": 0.00020491371480472296, + "loss": 3.2104, + "step": 43600 + }, + { + "epoch": 13.195331398161587, + "grad_norm": 0.411479115486145, + "learning_rate": 0.00020445958219800182, + "loss": 3.2221, + "step": 43650 + }, + { + "epoch": 13.210449927431059, + "grad_norm": 0.42029666900634766, + "learning_rate": 0.00020400544959128062, + "loss": 3.2243, + "step": 43700 + }, + { + "epoch": 13.225568456700532, + "grad_norm": 0.42034053802490234, + "learning_rate": 0.00020355131698455948, + "loss": 3.2211, + "step": 43750 + }, + { + "epoch": 13.240686985970004, + "grad_norm": 0.4205058217048645, + "learning_rate": 0.0002030971843778383, + "loss": 3.2086, + "step": 43800 + }, + { + "epoch": 13.255805515239478, + "grad_norm": 0.4254988431930542, + "learning_rate": 0.00020264305177111717, + "loss": 3.2186, + "step": 43850 + }, + { + "epoch": 13.27092404450895, + "grad_norm": 0.43356308341026306, + "learning_rate": 0.00020218891916439597, + "loss": 3.2186, + "step": 43900 + }, + { + "epoch": 13.286042573778422, + "grad_norm": 0.39849865436553955, + "learning_rate": 0.00020173478655767483, + "loss": 3.2152, + "step": 43950 + }, + { + "epoch": 13.301161103047896, + "grad_norm": 0.4155410826206207, + "learning_rate": 0.00020128065395095366, + "loss": 3.2124, + "step": 44000 + }, + { + "epoch": 13.301161103047896, + "eval_accuracy": 0.37124994885412066, + "eval_loss": 3.567697048187256, + "eval_runtime": 181.5157, + "eval_samples_per_second": 91.695, + "eval_steps_per_second": 5.735, + "step": 44000 + }, + { + "epoch": 13.316279632317368, + "grad_norm": 0.3929089605808258, + "learning_rate": 0.0002008265213442325, + "loss": 3.2163, + "step": 44050 + }, + { + "epoch": 13.33139816158684, + "grad_norm": 0.4119015336036682, + "learning_rate": 0.00020037238873751132, + "loss": 3.2243, + "step": 44100 + }, + { + "epoch": 13.346516690856314, + "grad_norm": 0.3972527086734772, + "learning_rate": 0.00019991825613079018, + "loss": 3.2297, + "step": 44150 + }, + { + "epoch": 13.361635220125786, + "grad_norm": 0.4144071042537689, + "learning_rate": 0.00019946412352406898, + "loss": 3.2162, + "step": 44200 + }, + { + "epoch": 13.37675374939526, + "grad_norm": 0.39286085963249207, + "learning_rate": 0.00019900999091734784, + "loss": 3.2202, + "step": 44250 + }, + { + "epoch": 13.391872278664732, + "grad_norm": 0.39850661158561707, + "learning_rate": 0.00019855585831062667, + "loss": 3.2285, + "step": 44300 + }, + { + "epoch": 13.406990807934204, + "grad_norm": 0.3939213454723358, + "learning_rate": 0.00019810172570390553, + "loss": 3.2397, + "step": 44350 + }, + { + "epoch": 13.422109337203677, + "grad_norm": 0.4019733965396881, + "learning_rate": 0.0001976475930971844, + "loss": 3.2131, + "step": 44400 + }, + { + "epoch": 13.43722786647315, + "grad_norm": 0.40081989765167236, + "learning_rate": 0.0001971934604904632, + "loss": 3.2349, + "step": 44450 + }, + { + "epoch": 13.452346395742623, + "grad_norm": 0.41404154896736145, + "learning_rate": 0.00019673932788374205, + "loss": 3.2303, + "step": 44500 + }, + { + "epoch": 13.467464925012095, + "grad_norm": 0.4119343161582947, + "learning_rate": 0.00019628519527702088, + "loss": 3.2266, + "step": 44550 + }, + { + "epoch": 13.482583454281567, + "grad_norm": 0.41343453526496887, + "learning_rate": 0.0001958310626702997, + "loss": 3.2294, + "step": 44600 + }, + { + "epoch": 13.49770198355104, + "grad_norm": 0.3903872072696686, + "learning_rate": 0.00019537693006357854, + "loss": 3.223, + "step": 44650 + }, + { + "epoch": 13.512820512820513, + "grad_norm": 0.41756418347358704, + "learning_rate": 0.0001949227974568574, + "loss": 3.2336, + "step": 44700 + }, + { + "epoch": 13.527939042089985, + "grad_norm": 0.40313833951950073, + "learning_rate": 0.0001944686648501362, + "loss": 3.2332, + "step": 44750 + }, + { + "epoch": 13.543057571359459, + "grad_norm": 0.40880855917930603, + "learning_rate": 0.00019401453224341506, + "loss": 3.237, + "step": 44800 + }, + { + "epoch": 13.55817610062893, + "grad_norm": 0.41109326481819153, + "learning_rate": 0.0001935603996366939, + "loss": 3.2484, + "step": 44850 + }, + { + "epoch": 13.573294629898404, + "grad_norm": 0.4063083231449127, + "learning_rate": 0.00019310626702997275, + "loss": 3.2367, + "step": 44900 + }, + { + "epoch": 13.588413159167876, + "grad_norm": 0.39859169721603394, + "learning_rate": 0.00019265213442325155, + "loss": 3.2337, + "step": 44950 + }, + { + "epoch": 13.603531688437348, + "grad_norm": 0.39923831820487976, + "learning_rate": 0.0001921980018165304, + "loss": 3.2352, + "step": 45000 + }, + { + "epoch": 13.603531688437348, + "eval_accuracy": 0.371590686229554, + "eval_loss": 3.5596277713775635, + "eval_runtime": 182.4311, + "eval_samples_per_second": 91.234, + "eval_steps_per_second": 5.706, + "step": 45000 + }, + { + "epoch": 13.618650217706822, + "grad_norm": 0.40481671690940857, + "learning_rate": 0.00019174386920980924, + "loss": 3.2446, + "step": 45050 + }, + { + "epoch": 13.633768746976294, + "grad_norm": 0.4066201448440552, + "learning_rate": 0.00019128973660308807, + "loss": 3.2385, + "step": 45100 + }, + { + "epoch": 13.648887276245766, + "grad_norm": 0.411175012588501, + "learning_rate": 0.00019083560399636693, + "loss": 3.2362, + "step": 45150 + }, + { + "epoch": 13.66400580551524, + "grad_norm": 0.4065161645412445, + "learning_rate": 0.00019038147138964576, + "loss": 3.2415, + "step": 45200 + }, + { + "epoch": 13.679124334784712, + "grad_norm": 0.41124144196510315, + "learning_rate": 0.00018992733878292462, + "loss": 3.2396, + "step": 45250 + }, + { + "epoch": 13.694242864054186, + "grad_norm": 0.4290773272514343, + "learning_rate": 0.00018947320617620342, + "loss": 3.2601, + "step": 45300 + }, + { + "epoch": 13.709361393323658, + "grad_norm": 0.4111163020133972, + "learning_rate": 0.00018901907356948228, + "loss": 3.241, + "step": 45350 + }, + { + "epoch": 13.72447992259313, + "grad_norm": 0.40063321590423584, + "learning_rate": 0.0001885649409627611, + "loss": 3.2305, + "step": 45400 + }, + { + "epoch": 13.739598451862603, + "grad_norm": 0.38352343440055847, + "learning_rate": 0.00018811080835603994, + "loss": 3.2383, + "step": 45450 + }, + { + "epoch": 13.754716981132075, + "grad_norm": 0.43132641911506653, + "learning_rate": 0.00018765667574931877, + "loss": 3.2537, + "step": 45500 + }, + { + "epoch": 13.769835510401549, + "grad_norm": 0.409151166677475, + "learning_rate": 0.00018720254314259763, + "loss": 3.2405, + "step": 45550 + }, + { + "epoch": 13.784954039671021, + "grad_norm": 0.4174506366252899, + "learning_rate": 0.00018674841053587643, + "loss": 3.2594, + "step": 45600 + }, + { + "epoch": 13.800072568940493, + "grad_norm": 0.40947169065475464, + "learning_rate": 0.0001862942779291553, + "loss": 3.248, + "step": 45650 + }, + { + "epoch": 13.815191098209967, + "grad_norm": 0.4033055603504181, + "learning_rate": 0.00018584014532243412, + "loss": 3.2331, + "step": 45700 + }, + { + "epoch": 13.830309627479439, + "grad_norm": 0.4000418484210968, + "learning_rate": 0.00018538601271571298, + "loss": 3.249, + "step": 45750 + }, + { + "epoch": 13.84542815674891, + "grad_norm": 0.4021860957145691, + "learning_rate": 0.00018493188010899184, + "loss": 3.2478, + "step": 45800 + }, + { + "epoch": 13.860546686018385, + "grad_norm": 0.3957579731941223, + "learning_rate": 0.00018447774750227064, + "loss": 3.2536, + "step": 45850 + }, + { + "epoch": 13.875665215287857, + "grad_norm": 0.43215376138687134, + "learning_rate": 0.0001840236148955495, + "loss": 3.247, + "step": 45900 + }, + { + "epoch": 13.890783744557329, + "grad_norm": 0.4118666350841522, + "learning_rate": 0.00018356948228882833, + "loss": 3.2431, + "step": 45950 + }, + { + "epoch": 13.905902273826802, + "grad_norm": 0.4115840494632721, + "learning_rate": 0.00018311534968210716, + "loss": 3.2477, + "step": 46000 + }, + { + "epoch": 13.905902273826802, + "eval_accuracy": 0.3719852737492069, + "eval_loss": 3.5575146675109863, + "eval_runtime": 182.5723, + "eval_samples_per_second": 91.164, + "eval_steps_per_second": 5.702, + "step": 46000 + }, + { + "epoch": 13.921020803096274, + "grad_norm": 0.4212381839752197, + "learning_rate": 0.000182661217075386, + "loss": 3.2476, + "step": 46050 + }, + { + "epoch": 13.936139332365748, + "grad_norm": 0.4087257385253906, + "learning_rate": 0.00018220708446866485, + "loss": 3.2438, + "step": 46100 + }, + { + "epoch": 13.95125786163522, + "grad_norm": 0.39609295129776, + "learning_rate": 0.00018175295186194365, + "loss": 3.252, + "step": 46150 + }, + { + "epoch": 13.966376390904692, + "grad_norm": 0.38680288195610046, + "learning_rate": 0.0001812988192552225, + "loss": 3.2479, + "step": 46200 + }, + { + "epoch": 13.981494920174166, + "grad_norm": 0.39530307054519653, + "learning_rate": 0.00018084468664850134, + "loss": 3.2696, + "step": 46250 + }, + { + "epoch": 13.996613449443638, + "grad_norm": 0.40767961740493774, + "learning_rate": 0.0001803905540417802, + "loss": 3.2543, + "step": 46300 + }, + { + "epoch": 14.0114900822448, + "grad_norm": 0.4131290018558502, + "learning_rate": 0.000179936421435059, + "loss": 3.1936, + "step": 46350 + }, + { + "epoch": 14.026608611514272, + "grad_norm": 0.40820956230163574, + "learning_rate": 0.00017948228882833786, + "loss": 3.1675, + "step": 46400 + }, + { + "epoch": 14.041727140783745, + "grad_norm": 0.4055199921131134, + "learning_rate": 0.0001790281562216167, + "loss": 3.1683, + "step": 46450 + }, + { + "epoch": 14.056845670053217, + "grad_norm": 0.41949304938316345, + "learning_rate": 0.00017857402361489552, + "loss": 3.1716, + "step": 46500 + }, + { + "epoch": 14.07196419932269, + "grad_norm": 0.4142007529735565, + "learning_rate": 0.00017811989100817438, + "loss": 3.1763, + "step": 46550 + }, + { + "epoch": 14.087082728592163, + "grad_norm": 0.42433109879493713, + "learning_rate": 0.0001776657584014532, + "loss": 3.1737, + "step": 46600 + }, + { + "epoch": 14.102201257861635, + "grad_norm": 0.4084174931049347, + "learning_rate": 0.00017721162579473207, + "loss": 3.1795, + "step": 46650 + }, + { + "epoch": 14.117319787131107, + "grad_norm": 0.42154616117477417, + "learning_rate": 0.00017675749318801087, + "loss": 3.1796, + "step": 46700 + }, + { + "epoch": 14.132438316400581, + "grad_norm": 0.42639973759651184, + "learning_rate": 0.00017630336058128973, + "loss": 3.186, + "step": 46750 + }, + { + "epoch": 14.147556845670053, + "grad_norm": 0.40734776854515076, + "learning_rate": 0.00017584922797456856, + "loss": 3.1896, + "step": 46800 + }, + { + "epoch": 14.162675374939527, + "grad_norm": 0.44012895226478577, + "learning_rate": 0.0001753950953678474, + "loss": 3.1707, + "step": 46850 + }, + { + "epoch": 14.177793904208999, + "grad_norm": 0.42036205530166626, + "learning_rate": 0.00017494096276112622, + "loss": 3.1837, + "step": 46900 + }, + { + "epoch": 14.19291243347847, + "grad_norm": 0.4042738676071167, + "learning_rate": 0.00017448683015440508, + "loss": 3.1915, + "step": 46950 + }, + { + "epoch": 14.208030962747944, + "grad_norm": 0.41336682438850403, + "learning_rate": 0.00017403269754768388, + "loss": 3.1954, + "step": 47000 + }, + { + "epoch": 14.208030962747944, + "eval_accuracy": 0.37157375517984303, + "eval_loss": 3.564302682876587, + "eval_runtime": 182.5131, + "eval_samples_per_second": 91.193, + "eval_steps_per_second": 5.704, + "step": 47000 + }, + { + "epoch": 14.223149492017416, + "grad_norm": 0.46255195140838623, + "learning_rate": 0.00017357856494096274, + "loss": 3.1851, + "step": 47050 + }, + { + "epoch": 14.238268021286888, + "grad_norm": 0.4109150469303131, + "learning_rate": 0.00017312443233424157, + "loss": 3.1951, + "step": 47100 + }, + { + "epoch": 14.253386550556362, + "grad_norm": 0.4143703877925873, + "learning_rate": 0.00017267029972752043, + "loss": 3.2023, + "step": 47150 + }, + { + "epoch": 14.268505079825834, + "grad_norm": 0.41615933179855347, + "learning_rate": 0.00017221616712079923, + "loss": 3.196, + "step": 47200 + }, + { + "epoch": 14.283623609095308, + "grad_norm": 0.43715471029281616, + "learning_rate": 0.0001717620345140781, + "loss": 3.2061, + "step": 47250 + }, + { + "epoch": 14.29874213836478, + "grad_norm": 0.4121893048286438, + "learning_rate": 0.00017130790190735695, + "loss": 3.2027, + "step": 47300 + }, + { + "epoch": 14.313860667634252, + "grad_norm": 0.424890398979187, + "learning_rate": 0.00017085376930063578, + "loss": 3.1913, + "step": 47350 + }, + { + "epoch": 14.328979196903726, + "grad_norm": 0.4319377839565277, + "learning_rate": 0.0001703996366939146, + "loss": 3.2066, + "step": 47400 + }, + { + "epoch": 14.344097726173198, + "grad_norm": 0.4028719663619995, + "learning_rate": 0.00016994550408719344, + "loss": 3.1982, + "step": 47450 + }, + { + "epoch": 14.359216255442671, + "grad_norm": 0.4185102880001068, + "learning_rate": 0.0001694913714804723, + "loss": 3.1978, + "step": 47500 + }, + { + "epoch": 14.374334784712143, + "grad_norm": 0.4216921627521515, + "learning_rate": 0.0001690372388737511, + "loss": 3.1973, + "step": 47550 + }, + { + "epoch": 14.389453313981615, + "grad_norm": 0.40805599093437195, + "learning_rate": 0.00016858310626702996, + "loss": 3.1978, + "step": 47600 + }, + { + "epoch": 14.40457184325109, + "grad_norm": 0.42335882782936096, + "learning_rate": 0.0001681289736603088, + "loss": 3.199, + "step": 47650 + }, + { + "epoch": 14.419690372520561, + "grad_norm": 0.4133867621421814, + "learning_rate": 0.00016767484105358765, + "loss": 3.2183, + "step": 47700 + }, + { + "epoch": 14.434808901790033, + "grad_norm": 0.41599327325820923, + "learning_rate": 0.00016722070844686645, + "loss": 3.1966, + "step": 47750 + }, + { + "epoch": 14.449927431059507, + "grad_norm": 0.41563186049461365, + "learning_rate": 0.0001667665758401453, + "loss": 3.1937, + "step": 47800 + }, + { + "epoch": 14.465045960328979, + "grad_norm": 0.39862963557243347, + "learning_rate": 0.00016631244323342414, + "loss": 3.1962, + "step": 47850 + }, + { + "epoch": 14.480164489598453, + "grad_norm": 0.4591251015663147, + "learning_rate": 0.00016585831062670297, + "loss": 3.1961, + "step": 47900 + }, + { + "epoch": 14.495283018867925, + "grad_norm": 0.4092547595500946, + "learning_rate": 0.00016540417801998183, + "loss": 3.214, + "step": 47950 + }, + { + "epoch": 14.510401548137397, + "grad_norm": 0.4241653084754944, + "learning_rate": 0.00016495004541326066, + "loss": 3.2191, + "step": 48000 + }, + { + "epoch": 14.510401548137397, + "eval_accuracy": 0.37222407209617214, + "eval_loss": 3.557905673980713, + "eval_runtime": 182.5474, + "eval_samples_per_second": 91.176, + "eval_steps_per_second": 5.703, + "step": 48000 + }, + { + "epoch": 14.52552007740687, + "grad_norm": 0.4303036034107208, + "learning_rate": 0.00016449591280653952, + "loss": 3.2149, + "step": 48050 + }, + { + "epoch": 14.540638606676342, + "grad_norm": 0.4225316047668457, + "learning_rate": 0.00016404178019981832, + "loss": 3.2069, + "step": 48100 + }, + { + "epoch": 14.555757135945814, + "grad_norm": 0.44241517782211304, + "learning_rate": 0.00016358764759309718, + "loss": 3.2143, + "step": 48150 + }, + { + "epoch": 14.570875665215288, + "grad_norm": 0.44027650356292725, + "learning_rate": 0.000163133514986376, + "loss": 3.2134, + "step": 48200 + }, + { + "epoch": 14.58599419448476, + "grad_norm": 0.40961533784866333, + "learning_rate": 0.00016267938237965484, + "loss": 3.22, + "step": 48250 + }, + { + "epoch": 14.601112723754234, + "grad_norm": 0.4150550663471222, + "learning_rate": 0.00016222524977293367, + "loss": 3.2215, + "step": 48300 + }, + { + "epoch": 14.616231253023706, + "grad_norm": 0.41550400853157043, + "learning_rate": 0.00016177111716621253, + "loss": 3.2093, + "step": 48350 + }, + { + "epoch": 14.631349782293178, + "grad_norm": 0.4034069776535034, + "learning_rate": 0.00016131698455949133, + "loss": 3.2205, + "step": 48400 + }, + { + "epoch": 14.646468311562652, + "grad_norm": 0.43780672550201416, + "learning_rate": 0.0001608628519527702, + "loss": 3.2103, + "step": 48450 + }, + { + "epoch": 14.661586840832124, + "grad_norm": 0.41662082076072693, + "learning_rate": 0.00016040871934604902, + "loss": 3.2207, + "step": 48500 + }, + { + "epoch": 14.676705370101596, + "grad_norm": 0.41818660497665405, + "learning_rate": 0.00015995458673932788, + "loss": 3.219, + "step": 48550 + }, + { + "epoch": 14.69182389937107, + "grad_norm": 0.4019031524658203, + "learning_rate": 0.00015950045413260668, + "loss": 3.2011, + "step": 48600 + }, + { + "epoch": 14.706942428640541, + "grad_norm": 0.4318902790546417, + "learning_rate": 0.00015904632152588554, + "loss": 3.2212, + "step": 48650 + }, + { + "epoch": 14.722060957910015, + "grad_norm": 0.4129679203033447, + "learning_rate": 0.0001585921889191644, + "loss": 3.2094, + "step": 48700 + }, + { + "epoch": 14.737179487179487, + "grad_norm": 0.4268816411495209, + "learning_rate": 0.00015813805631244323, + "loss": 3.2194, + "step": 48750 + }, + { + "epoch": 14.75229801644896, + "grad_norm": 0.43164098262786865, + "learning_rate": 0.00015768392370572206, + "loss": 3.2072, + "step": 48800 + }, + { + "epoch": 14.767416545718433, + "grad_norm": 0.4305291771888733, + "learning_rate": 0.0001572297910990009, + "loss": 3.2239, + "step": 48850 + }, + { + "epoch": 14.782535074987905, + "grad_norm": 0.4099324941635132, + "learning_rate": 0.00015677565849227975, + "loss": 3.2257, + "step": 48900 + }, + { + "epoch": 14.797653604257377, + "grad_norm": 0.431211918592453, + "learning_rate": 0.00015632152588555855, + "loss": 3.2173, + "step": 48950 + }, + { + "epoch": 14.81277213352685, + "grad_norm": 0.43717148900032043, + "learning_rate": 0.0001558673932788374, + "loss": 3.2221, + "step": 49000 + }, + { + "epoch": 14.81277213352685, + "eval_accuracy": 0.37258820724169217, + "eval_loss": 3.5516107082366943, + "eval_runtime": 182.6239, + "eval_samples_per_second": 91.138, + "eval_steps_per_second": 5.7, + "step": 49000 + }, + { + "epoch": 14.827890662796323, + "grad_norm": 0.4287441372871399, + "learning_rate": 0.00015541326067211624, + "loss": 3.2187, + "step": 49050 + }, + { + "epoch": 14.843009192065796, + "grad_norm": 0.39929869771003723, + "learning_rate": 0.0001549591280653951, + "loss": 3.2176, + "step": 49100 + }, + { + "epoch": 14.858127721335268, + "grad_norm": 0.43470391631126404, + "learning_rate": 0.0001545049954586739, + "loss": 3.2304, + "step": 49150 + }, + { + "epoch": 14.87324625060474, + "grad_norm": 0.4141775667667389, + "learning_rate": 0.00015405086285195276, + "loss": 3.2258, + "step": 49200 + }, + { + "epoch": 14.888364779874214, + "grad_norm": 0.4020072817802429, + "learning_rate": 0.0001535967302452316, + "loss": 3.2275, + "step": 49250 + }, + { + "epoch": 14.903483309143686, + "grad_norm": 0.42552366852760315, + "learning_rate": 0.00015314259763851042, + "loss": 3.2308, + "step": 49300 + }, + { + "epoch": 14.91860183841316, + "grad_norm": 0.41669708490371704, + "learning_rate": 0.00015268846503178925, + "loss": 3.219, + "step": 49350 + }, + { + "epoch": 14.933720367682632, + "grad_norm": 0.43189725279808044, + "learning_rate": 0.0001522343324250681, + "loss": 3.2263, + "step": 49400 + }, + { + "epoch": 14.948838896952104, + "grad_norm": 0.4100758135318756, + "learning_rate": 0.00015178019981834697, + "loss": 3.2267, + "step": 49450 + }, + { + "epoch": 14.963957426221578, + "grad_norm": 0.41336727142333984, + "learning_rate": 0.00015132606721162577, + "loss": 3.2247, + "step": 49500 + }, + { + "epoch": 14.97907595549105, + "grad_norm": 0.4141419529914856, + "learning_rate": 0.00015087193460490463, + "loss": 3.213, + "step": 49550 + }, + { + "epoch": 14.994194484760522, + "grad_norm": 0.4345361888408661, + "learning_rate": 0.00015041780199818346, + "loss": 3.2258, + "step": 49600 + }, + { + "epoch": 15.009071117561684, + "grad_norm": 0.40050098299980164, + "learning_rate": 0.0001499636693914623, + "loss": 3.182, + "step": 49650 + }, + { + "epoch": 15.024189646831156, + "grad_norm": 0.41687700152397156, + "learning_rate": 0.00014950953678474112, + "loss": 3.1643, + "step": 49700 + }, + { + "epoch": 15.03930817610063, + "grad_norm": 0.43859630823135376, + "learning_rate": 0.00014905540417801998, + "loss": 3.1614, + "step": 49750 + }, + { + "epoch": 15.054426705370101, + "grad_norm": 0.4150136709213257, + "learning_rate": 0.0001486012715712988, + "loss": 3.1513, + "step": 49800 + }, + { + "epoch": 15.069545234639575, + "grad_norm": 0.41403576731681824, + "learning_rate": 0.00014814713896457764, + "loss": 3.1426, + "step": 49850 + }, + { + "epoch": 15.084663763909047, + "grad_norm": 0.4175975024700165, + "learning_rate": 0.0001476930063578565, + "loss": 3.1457, + "step": 49900 + }, + { + "epoch": 15.099782293178519, + "grad_norm": 0.4216543734073639, + "learning_rate": 0.00014723887375113533, + "loss": 3.1489, + "step": 49950 + }, + { + "epoch": 15.114900822447993, + "grad_norm": 0.4189589321613312, + "learning_rate": 0.00014678474114441416, + "loss": 3.1676, + "step": 50000 + }, + { + "epoch": 15.114900822447993, + "eval_accuracy": 0.3722348891557097, + "eval_loss": 3.562861442565918, + "eval_runtime": 182.4146, + "eval_samples_per_second": 91.243, + "eval_steps_per_second": 5.707, + "step": 50000 + } + ], + "logging_steps": 50, + "max_steps": 66160, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 10000, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 20, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 1 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.04491728764928e+18, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}