diff --git "a/checkpoint-90000/trainer_state.json" "b/checkpoint-90000/trainer_state.json" --- "a/checkpoint-90000/trainer_state.json" +++ "b/checkpoint-90000/trainer_state.json" @@ -1,5 +1,5 @@ { - "best_metric": 3.302138566970825, + "best_metric": 3.304030179977417, "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M_low_500_6910/checkpoint-90000", "epoch": 9.703504043126685, "eval_steps": 1000, @@ -10,13412 +10,13412 @@ "log_history": [ { "epoch": 0.005390835579514825, - "grad_norm": 2.9886317253112793, + "grad_norm": 4.202730655670166, "learning_rate": 0.000276, - "loss": 9.0224, + "loss": 9.0233, "step": 50 }, { "epoch": 0.01078167115902965, - "grad_norm": 0.8695021867752075, + "grad_norm": 3.1378865242004395, "learning_rate": 0.0005759999999999999, - "loss": 6.9609, + "loss": 6.9276, "step": 100 }, { "epoch": 0.016172506738544475, - "grad_norm": 1.7424172163009644, + "grad_norm": 1.2884689569473267, "learning_rate": 0.000599702104695089, - "loss": 6.5403, + "loss": 6.4839, "step": 150 }, { "epoch": 0.0215633423180593, - "grad_norm": 2.535945415496826, + "grad_norm": 1.6449090242385864, "learning_rate": 0.0005993783054506205, - "loss": 6.2529, + "loss": 6.2463, "step": 200 }, { "epoch": 0.026954177897574125, - "grad_norm": 1.765920877456665, + "grad_norm": 1.5109689235687256, "learning_rate": 0.0005990545062061521, - "loss": 6.0905, + "loss": 6.094, "step": 250 }, { "epoch": 0.03234501347708895, - "grad_norm": 1.0415717363357544, + "grad_norm": 1.170856237411499, "learning_rate": 0.0005987307069616836, - "loss": 5.9552, + "loss": 5.949, "step": 300 }, { "epoch": 0.03773584905660377, - "grad_norm": 1.3854601383209229, + "grad_norm": 2.105668783187866, "learning_rate": 0.0005984069077172153, - "loss": 5.8847, + "loss": 5.8791, "step": 350 }, { "epoch": 0.0431266846361186, - "grad_norm": 0.8729716539382935, + "grad_norm": 1.3761645555496216, "learning_rate": 0.0005980831084727469, - "loss": 5.8182, + "loss": 5.8165, "step": 400 }, { "epoch": 0.04851752021563342, - "grad_norm": 1.8327264785766602, + "grad_norm": 1.8226579427719116, "learning_rate": 0.0005977593092282784, - "loss": 5.7573, + "loss": 5.756, "step": 450 }, { "epoch": 0.05390835579514825, - "grad_norm": 1.8601025342941284, + "grad_norm": 0.8710860013961792, "learning_rate": 0.00059743550998381, - "loss": 5.6461, + "loss": 5.6467, "step": 500 }, { "epoch": 0.05929919137466307, - "grad_norm": 1.083327054977417, + "grad_norm": 0.9444846510887146, "learning_rate": 0.0005971117107393416, - "loss": 5.6014, + "loss": 5.6021, "step": 550 }, { "epoch": 0.0646900269541779, - "grad_norm": 0.8286775350570679, + "grad_norm": 1.4856325387954712, "learning_rate": 0.0005967879114948732, - "loss": 5.5275, + "loss": 5.5298, "step": 600 }, { "epoch": 0.07008086253369272, - "grad_norm": 1.0809779167175293, + "grad_norm": 1.6019251346588135, "learning_rate": 0.0005964641122504047, - "loss": 5.4639, + "loss": 5.4623, "step": 650 }, { "epoch": 0.07547169811320754, - "grad_norm": 1.215111255645752, + "grad_norm": 1.1106631755828857, "learning_rate": 0.0005961403130059363, - "loss": 5.3863, + "loss": 5.3906, "step": 700 }, { "epoch": 0.08086253369272237, - "grad_norm": 0.9550946950912476, + "grad_norm": 1.1751388311386108, "learning_rate": 0.0005958165137614678, - "loss": 5.3178, + "loss": 5.3255, "step": 750 }, { "epoch": 0.0862533692722372, - "grad_norm": 1.219592809677124, + "grad_norm": 1.028311848640442, "learning_rate": 0.0005954927145169995, - "loss": 5.2468, + "loss": 5.2479, "step": 800 }, { "epoch": 0.09164420485175202, - "grad_norm": 1.0037598609924316, + "grad_norm": 1.1774542331695557, "learning_rate": 0.0005951689152725309, - "loss": 5.2228, + "loss": 5.2316, "step": 850 }, { "epoch": 0.09703504043126684, - "grad_norm": 0.9939533472061157, + "grad_norm": 0.9043501019477844, "learning_rate": 0.0005948451160280626, - "loss": 5.167, + "loss": 5.1739, "step": 900 }, { "epoch": 0.10242587601078167, - "grad_norm": 0.8934879899024963, + "grad_norm": 0.9826597571372986, "learning_rate": 0.0005945213167835941, - "loss": 5.1051, + "loss": 5.1094, "step": 950 }, { "epoch": 0.1078167115902965, - "grad_norm": 1.4683802127838135, + "grad_norm": 1.1902110576629639, "learning_rate": 0.0005941975175391257, - "loss": 5.112, + "loss": 5.1176, "step": 1000 }, { "epoch": 0.1078167115902965, - "eval_accuracy": 0.22728771643247805, - "eval_loss": 5.021671295166016, - "eval_runtime": 185.7281, - "eval_samples_per_second": 96.975, - "eval_steps_per_second": 6.063, + "eval_accuracy": 0.22649987434293525, + "eval_loss": 5.031488418579102, + "eval_runtime": 185.546, + "eval_samples_per_second": 97.07, + "eval_steps_per_second": 6.069, "step": 1000 }, { "epoch": 0.11320754716981132, - "grad_norm": 0.9396394491195679, + "grad_norm": 1.1733556985855103, "learning_rate": 0.0005938737182946572, - "loss": 5.0399, + "loss": 5.0471, "step": 1050 }, { "epoch": 0.11859838274932614, - "grad_norm": 1.2031350135803223, + "grad_norm": 1.4435728788375854, "learning_rate": 0.0005935499190501888, - "loss": 5.0063, + "loss": 5.0089, "step": 1100 }, { "epoch": 0.12398921832884097, - "grad_norm": 1.067723274230957, + "grad_norm": 1.056373119354248, "learning_rate": 0.0005932261198057204, - "loss": 5.0062, + "loss": 5.01, "step": 1150 }, { "epoch": 0.1293800539083558, - "grad_norm": 1.0314335823059082, + "grad_norm": 0.8520533442497253, "learning_rate": 0.000592902320561252, - "loss": 4.9655, + "loss": 4.9711, "step": 1200 }, { "epoch": 0.1347708894878706, - "grad_norm": 0.7959302067756653, + "grad_norm": 0.9913970828056335, "learning_rate": 0.0005925785213167835, - "loss": 4.946, + "loss": 4.9486, "step": 1250 }, { "epoch": 0.14016172506738545, - "grad_norm": 1.1784368753433228, + "grad_norm": 1.615565299987793, "learning_rate": 0.0005922547220723151, - "loss": 4.9007, + "loss": 4.9054, "step": 1300 }, { "epoch": 0.14555256064690028, - "grad_norm": 0.9498503804206848, + "grad_norm": 1.1613094806671143, "learning_rate": 0.0005919309228278468, - "loss": 4.8504, + "loss": 4.8517, "step": 1350 }, { "epoch": 0.1509433962264151, - "grad_norm": 1.2894479036331177, + "grad_norm": 1.1608201265335083, "learning_rate": 0.0005916071235833783, - "loss": 4.8604, + "loss": 4.8605, "step": 1400 }, { "epoch": 0.15633423180592992, - "grad_norm": 0.9321544170379639, + "grad_norm": 1.0951576232910156, "learning_rate": 0.0005912833243389097, - "loss": 4.8149, + "loss": 4.8191, "step": 1450 }, { "epoch": 0.16172506738544473, - "grad_norm": 1.0761699676513672, + "grad_norm": 0.896648108959198, "learning_rate": 0.0005909595250944414, - "loss": 4.8135, + "loss": 4.8189, "step": 1500 }, { "epoch": 0.16711590296495957, - "grad_norm": 1.3190892934799194, + "grad_norm": 1.0958908796310425, "learning_rate": 0.000590635725849973, - "loss": 4.7957, + "loss": 4.8031, "step": 1550 }, { "epoch": 0.1725067385444744, - "grad_norm": 1.0538240671157837, + "grad_norm": 1.2471723556518555, "learning_rate": 0.0005903119266055045, - "loss": 4.7729, + "loss": 4.7804, "step": 1600 }, { "epoch": 0.1778975741239892, - "grad_norm": 0.9775657057762146, + "grad_norm": 1.2964743375778198, "learning_rate": 0.0005899881273610361, - "loss": 4.7419, + "loss": 4.7491, "step": 1650 }, { "epoch": 0.18328840970350405, - "grad_norm": 1.1415176391601562, + "grad_norm": 1.2858680486679077, "learning_rate": 0.0005896643281165677, - "loss": 4.7273, + "loss": 4.7306, "step": 1700 }, { "epoch": 0.18867924528301888, - "grad_norm": 1.054643154144287, + "grad_norm": 1.1489158868789673, "learning_rate": 0.0005893405288720993, - "loss": 4.676, + "loss": 4.6824, "step": 1750 }, { "epoch": 0.1940700808625337, - "grad_norm": 0.9605634212493896, + "grad_norm": 0.9289330244064331, "learning_rate": 0.0005890167296276308, - "loss": 4.6987, + "loss": 4.7103, "step": 1800 }, { "epoch": 0.19946091644204852, - "grad_norm": 1.2974011898040771, + "grad_norm": 1.142987608909607, "learning_rate": 0.0005886929303831624, - "loss": 4.6522, + "loss": 4.6613, "step": 1850 }, { "epoch": 0.20485175202156333, - "grad_norm": 0.8335526585578918, + "grad_norm": 0.80745929479599, "learning_rate": 0.0005883691311386939, - "loss": 4.6288, + "loss": 4.6385, "step": 1900 }, { "epoch": 0.21024258760107817, - "grad_norm": 0.8267828822135925, + "grad_norm": 0.6197893619537354, "learning_rate": 0.0005880453318942256, - "loss": 4.6095, + "loss": 4.6164, "step": 1950 }, { "epoch": 0.215633423180593, - "grad_norm": 0.8468182682991028, + "grad_norm": 0.6791684627532959, "learning_rate": 0.0005877215326497571, - "loss": 4.5942, + "loss": 4.6009, "step": 2000 }, { "epoch": 0.215633423180593, - "eval_accuracy": 0.2697699090390619, - "eval_loss": 4.5105061531066895, - "eval_runtime": 185.0374, - "eval_samples_per_second": 97.337, - "eval_steps_per_second": 6.085, + "eval_accuracy": 0.26871010877131946, + "eval_loss": 4.521060943603516, + "eval_runtime": 184.8086, + "eval_samples_per_second": 97.458, + "eval_steps_per_second": 6.093, "step": 2000 }, { "epoch": 0.2210242587601078, - "grad_norm": 0.8596065044403076, + "grad_norm": 0.865337073802948, "learning_rate": 0.0005873977334052887, - "loss": 4.577, + "loss": 4.5827, "step": 2050 }, { "epoch": 0.22641509433962265, - "grad_norm": 0.8865411877632141, + "grad_norm": 1.018101453781128, "learning_rate": 0.0005870739341608202, - "loss": 4.5573, + "loss": 4.5648, "step": 2100 }, { "epoch": 0.23180592991913745, - "grad_norm": 0.759678065776825, + "grad_norm": 0.7497203350067139, "learning_rate": 0.0005867501349163519, - "loss": 4.531, + "loss": 4.5389, "step": 2150 }, { "epoch": 0.2371967654986523, - "grad_norm": 0.9333673715591431, + "grad_norm": 0.9881597757339478, "learning_rate": 0.0005864263356718833, - "loss": 4.5288, + "loss": 4.5317, "step": 2200 }, { "epoch": 0.24258760107816713, - "grad_norm": 0.9579219818115234, + "grad_norm": 0.8000950813293457, "learning_rate": 0.000586102536427415, - "loss": 4.5046, + "loss": 4.5076, "step": 2250 }, { "epoch": 0.24797843665768193, - "grad_norm": 0.9088818430900574, + "grad_norm": 0.8468273878097534, "learning_rate": 0.0005857787371829465, - "loss": 4.498, + "loss": 4.4986, "step": 2300 }, { "epoch": 0.25336927223719674, - "grad_norm": 1.0662283897399902, + "grad_norm": 1.2672775983810425, "learning_rate": 0.0005854549379384781, - "loss": 4.476, + "loss": 4.4835, "step": 2350 }, { "epoch": 0.2587601078167116, - "grad_norm": 0.8263185024261475, + "grad_norm": 0.8388407826423645, "learning_rate": 0.0005851311386940096, - "loss": 4.4585, + "loss": 4.4637, "step": 2400 }, { "epoch": 0.2641509433962264, - "grad_norm": 0.7838813066482544, + "grad_norm": 0.8386316895484924, "learning_rate": 0.0005848073394495412, - "loss": 4.4501, + "loss": 4.4537, "step": 2450 }, { "epoch": 0.2695417789757412, - "grad_norm": 1.0609959363937378, + "grad_norm": 0.9034900665283203, "learning_rate": 0.0005844835402050728, - "loss": 4.4276, + "loss": 4.43, "step": 2500 }, { "epoch": 0.2749326145552561, - "grad_norm": 0.7959164381027222, + "grad_norm": 0.8214098215103149, "learning_rate": 0.0005841597409606044, - "loss": 4.3959, + "loss": 4.4001, "step": 2550 }, { "epoch": 0.2803234501347709, - "grad_norm": 0.7520493268966675, + "grad_norm": 0.803986132144928, "learning_rate": 0.000583835941716136, - "loss": 4.4101, + "loss": 4.4119, "step": 2600 }, { "epoch": 0.2857142857142857, - "grad_norm": 0.8316442370414734, + "grad_norm": 0.7659591436386108, "learning_rate": 0.0005835121424716675, - "loss": 4.3887, + "loss": 4.3984, "step": 2650 }, { "epoch": 0.29110512129380056, - "grad_norm": 0.8290981650352478, + "grad_norm": 0.6612359285354614, "learning_rate": 0.0005831883432271992, - "loss": 4.3746, + "loss": 4.3813, "step": 2700 }, { "epoch": 0.29649595687331537, - "grad_norm": 0.8543332815170288, + "grad_norm": 0.8132438659667969, "learning_rate": 0.0005828645439827307, - "loss": 4.3708, + "loss": 4.375, "step": 2750 }, { "epoch": 0.3018867924528302, - "grad_norm": 1.093960165977478, + "grad_norm": 0.734201192855835, "learning_rate": 0.0005825407447382622, - "loss": 4.329, + "loss": 4.3299, "step": 2800 }, { "epoch": 0.30727762803234504, - "grad_norm": 0.704647958278656, + "grad_norm": 0.6258540153503418, "learning_rate": 0.0005822169454937938, - "loss": 4.3542, + "loss": 4.3571, "step": 2850 }, { "epoch": 0.31266846361185985, - "grad_norm": 0.7982504963874817, + "grad_norm": 0.77208411693573, "learning_rate": 0.0005818931462493254, - "loss": 4.3236, + "loss": 4.3253, "step": 2900 }, { "epoch": 0.31805929919137466, - "grad_norm": 0.7402891516685486, + "grad_norm": 0.7789588570594788, "learning_rate": 0.0005815693470048569, - "loss": 4.3262, + "loss": 4.3301, "step": 2950 }, { "epoch": 0.32345013477088946, - "grad_norm": 0.7455512285232544, + "grad_norm": 0.6333003640174866, "learning_rate": 0.0005812455477603885, - "loss": 4.3169, + "loss": 4.3192, "step": 3000 }, { "epoch": 0.32345013477088946, - "eval_accuracy": 0.2977507439734861, - "eval_loss": 4.238898754119873, - "eval_runtime": 185.2161, - "eval_samples_per_second": 97.243, - "eval_steps_per_second": 6.079, + "eval_accuracy": 0.2970622106234057, + "eval_loss": 4.244666576385498, + "eval_runtime": 184.9563, + "eval_samples_per_second": 97.38, + "eval_steps_per_second": 6.088, "step": 3000 }, { "epoch": 0.3288409703504043, - "grad_norm": 0.7529988288879395, + "grad_norm": 0.8093159198760986, "learning_rate": 0.0005809217485159201, - "loss": 4.3159, + "loss": 4.3201, "step": 3050 }, { "epoch": 0.33423180592991913, - "grad_norm": 0.7738096117973328, + "grad_norm": 0.6877481341362, "learning_rate": 0.0005805979492714517, - "loss": 4.2958, + "loss": 4.2994, "step": 3100 }, { "epoch": 0.33962264150943394, - "grad_norm": 0.9844054579734802, + "grad_norm": 0.8910661339759827, "learning_rate": 0.0005802741500269832, - "loss": 4.2765, + "loss": 4.277, "step": 3150 }, { "epoch": 0.3450134770889488, - "grad_norm": 0.6849039793014526, + "grad_norm": 0.6189978122711182, "learning_rate": 0.0005799503507825148, - "loss": 4.2784, + "loss": 4.2794, "step": 3200 }, { "epoch": 0.3504043126684636, - "grad_norm": 0.8223270177841187, + "grad_norm": 0.7318760752677917, "learning_rate": 0.0005796265515380463, - "loss": 4.2661, + "loss": 4.2705, "step": 3250 }, { "epoch": 0.3557951482479784, - "grad_norm": 1.0759813785552979, + "grad_norm": 0.853126049041748, "learning_rate": 0.000579302752293578, - "loss": 4.2702, + "loss": 4.2752, "step": 3300 }, { "epoch": 0.3611859838274933, - "grad_norm": 0.8592544794082642, + "grad_norm": 0.7733789682388306, "learning_rate": 0.0005789789530491095, - "loss": 4.2441, + "loss": 4.245, "step": 3350 }, { "epoch": 0.3665768194070081, - "grad_norm": 0.7978523373603821, + "grad_norm": 0.7828088998794556, "learning_rate": 0.0005786551538046411, - "loss": 4.2357, + "loss": 4.2378, "step": 3400 }, { "epoch": 0.3719676549865229, - "grad_norm": 0.671616792678833, + "grad_norm": 0.6192962527275085, "learning_rate": 0.0005783313545601726, - "loss": 4.2296, + "loss": 4.2325, "step": 3450 }, { "epoch": 0.37735849056603776, - "grad_norm": 0.9814181327819824, + "grad_norm": 0.9660407304763794, "learning_rate": 0.0005780075553157043, - "loss": 4.2266, + "loss": 4.2287, "step": 3500 }, { "epoch": 0.38274932614555257, - "grad_norm": 0.6863210797309875, + "grad_norm": 0.6514875888824463, "learning_rate": 0.0005776837560712357, - "loss": 4.2231, + "loss": 4.2273, "step": 3550 }, { "epoch": 0.3881401617250674, - "grad_norm": 0.8627532720565796, + "grad_norm": 0.7967301607131958, "learning_rate": 0.0005773599568267673, - "loss": 4.213, + "loss": 4.2175, "step": 3600 }, { "epoch": 0.3935309973045822, - "grad_norm": 0.7562503218650818, + "grad_norm": 0.7382152676582336, "learning_rate": 0.0005770361575822989, - "loss": 4.2039, + "loss": 4.2058, "step": 3650 }, { "epoch": 0.39892183288409705, - "grad_norm": 0.7050570249557495, + "grad_norm": 0.6826912760734558, "learning_rate": 0.0005767123583378305, - "loss": 4.1944, + "loss": 4.1984, "step": 3700 }, { "epoch": 0.40431266846361186, - "grad_norm": 0.7500070929527283, + "grad_norm": 0.7629984617233276, "learning_rate": 0.000576388559093362, - "loss": 4.1959, + "loss": 4.1999, "step": 3750 }, { "epoch": 0.40970350404312667, - "grad_norm": 0.7112507224082947, + "grad_norm": 0.7264732122421265, "learning_rate": 0.0005760647598488936, - "loss": 4.1652, + "loss": 4.1661, "step": 3800 }, { "epoch": 0.41509433962264153, - "grad_norm": 0.6652435660362244, + "grad_norm": 0.6775606870651245, "learning_rate": 0.0005757409606044253, "loss": 4.1705, "step": 3850 }, { "epoch": 0.42048517520215634, - "grad_norm": 0.7143222093582153, + "grad_norm": 0.6371147036552429, "learning_rate": 0.0005754171613599568, - "loss": 4.1853, + "loss": 4.1898, "step": 3900 }, { "epoch": 0.42587601078167114, - "grad_norm": 1.307312250137329, + "grad_norm": 0.9279155731201172, "learning_rate": 0.0005750933621154884, - "loss": 4.155, + "loss": 4.1564, "step": 3950 }, { "epoch": 0.431266846361186, - "grad_norm": 0.7014321088790894, + "grad_norm": 0.7013996243476868, "learning_rate": 0.0005747695628710199, - "loss": 4.1568, + "loss": 4.1584, "step": 4000 }, { "epoch": 0.431266846361186, - "eval_accuracy": 0.3122283066632144, - "eval_loss": 4.091763496398926, - "eval_runtime": 184.8885, - "eval_samples_per_second": 97.415, - "eval_steps_per_second": 6.09, + "eval_accuracy": 0.3116742855882484, + "eval_loss": 4.094541549682617, + "eval_runtime": 184.9211, + "eval_samples_per_second": 97.398, + "eval_steps_per_second": 6.089, "step": 4000 }, { "epoch": 0.4366576819407008, - "grad_norm": 0.6528450846672058, + "grad_norm": 0.6695048809051514, "learning_rate": 0.0005744457636265515, - "loss": 4.1642, + "loss": 4.1673, "step": 4050 }, { "epoch": 0.4420485175202156, - "grad_norm": 0.6965686082839966, + "grad_norm": 0.7323116660118103, "learning_rate": 0.0005741219643820831, - "loss": 4.1599, + "loss": 4.1606, "step": 4100 }, { "epoch": 0.4474393530997305, - "grad_norm": 0.672761857509613, + "grad_norm": 0.6928706169128418, "learning_rate": 0.0005737981651376146, - "loss": 4.1398, + "loss": 4.1431, "step": 4150 }, { "epoch": 0.4528301886792453, - "grad_norm": 0.6647113561630249, + "grad_norm": 0.7627272009849548, "learning_rate": 0.0005734743658931462, - "loss": 4.1447, + "loss": 4.1496, "step": 4200 }, { "epoch": 0.4582210242587601, - "grad_norm": 0.5870072245597839, + "grad_norm": 0.5734204053878784, "learning_rate": 0.0005731505666486778, - "loss": 4.1341, + "loss": 4.1349, "step": 4250 }, { "epoch": 0.4636118598382749, - "grad_norm": 0.7313339710235596, + "grad_norm": 0.7368983626365662, "learning_rate": 0.0005728267674042093, - "loss": 4.1366, + "loss": 4.135, "step": 4300 }, { "epoch": 0.46900269541778977, - "grad_norm": 0.8136706352233887, + "grad_norm": 0.7932204008102417, "learning_rate": 0.0005725029681597409, - "loss": 4.124, + "loss": 4.1263, "step": 4350 }, { "epoch": 0.4743935309973046, - "grad_norm": 0.6834421753883362, + "grad_norm": 0.69672030210495, "learning_rate": 0.0005721791689152725, - "loss": 4.1078, + "loss": 4.1093, "step": 4400 }, { "epoch": 0.4797843665768194, - "grad_norm": 0.6508508920669556, + "grad_norm": 0.743208110332489, "learning_rate": 0.0005718553696708041, - "loss": 4.1152, + "loss": 4.1153, "step": 4450 }, { "epoch": 0.48517520215633425, - "grad_norm": 0.6280442476272583, + "grad_norm": 0.6159020066261292, "learning_rate": 0.0005715315704263356, - "loss": 4.1121, + "loss": 4.1113, "step": 4500 }, { "epoch": 0.49056603773584906, - "grad_norm": 0.7698476910591125, + "grad_norm": 0.7856785655021667, "learning_rate": 0.0005712077711818672, - "loss": 4.1191, + "loss": 4.1188, "step": 4550 }, { "epoch": 0.49595687331536387, - "grad_norm": 0.7502248883247375, + "grad_norm": 0.8774890899658203, "learning_rate": 0.0005708839719373987, - "loss": 4.1148, + "loss": 4.1134, "step": 4600 }, { "epoch": 0.5013477088948787, - "grad_norm": 0.6504122018814087, + "grad_norm": 0.6919339299201965, "learning_rate": 0.0005705601726929304, - "loss": 4.0719, + "loss": 4.0736, "step": 4650 }, { "epoch": 0.5067385444743935, - "grad_norm": 0.663817286491394, + "grad_norm": 0.7403694987297058, "learning_rate": 0.0005702363734484619, - "loss": 4.0717, + "loss": 4.0742, "step": 4700 }, { "epoch": 0.5121293800539084, - "grad_norm": 0.6721544861793518, + "grad_norm": 0.6341198086738586, "learning_rate": 0.0005699125742039935, - "loss": 4.0791, + "loss": 4.0777, "step": 4750 }, { "epoch": 0.5175202156334232, - "grad_norm": 0.6749738454818726, + "grad_norm": 0.6231762170791626, "learning_rate": 0.000569588774959525, "loss": 4.074, "step": 4800 }, { "epoch": 0.522911051212938, - "grad_norm": 0.640407383441925, + "grad_norm": 0.6197823286056519, "learning_rate": 0.0005692649757150567, - "loss": 4.0707, + "loss": 4.0696, "step": 4850 }, { "epoch": 0.5283018867924528, - "grad_norm": 0.7699702382087708, + "grad_norm": 0.7616347670555115, "learning_rate": 0.0005689411764705881, - "loss": 4.071, + "loss": 4.072, "step": 4900 }, { "epoch": 0.5336927223719676, - "grad_norm": 0.7380169630050659, + "grad_norm": 0.6769264340400696, "learning_rate": 0.0005686173772261197, - "loss": 4.0611, + "loss": 4.0616, "step": 4950 }, { "epoch": 0.5390835579514824, - "grad_norm": 0.7213010191917419, + "grad_norm": 0.8056491017341614, "learning_rate": 0.0005682935779816514, - "loss": 4.0605, + "loss": 4.0604, "step": 5000 }, { "epoch": 0.5390835579514824, - "eval_accuracy": 0.32159711922079365, - "eval_loss": 3.9895858764648438, - "eval_runtime": 184.9121, - "eval_samples_per_second": 97.403, - "eval_steps_per_second": 6.089, + "eval_accuracy": 0.32076407752992003, + "eval_loss": 3.9996907711029053, + "eval_runtime": 185.0096, + "eval_samples_per_second": 97.352, + "eval_steps_per_second": 6.086, "step": 5000 }, { "epoch": 0.5444743935309974, - "grad_norm": 0.6962867379188538, + "grad_norm": 0.5992985367774963, "learning_rate": 0.0005679697787371829, - "loss": 4.0683, + "loss": 4.0685, "step": 5050 }, { "epoch": 0.5498652291105122, - "grad_norm": 0.6025412678718567, + "grad_norm": 0.623742938041687, "learning_rate": 0.0005676459794927145, - "loss": 4.0577, + "loss": 4.0541, "step": 5100 }, { "epoch": 0.555256064690027, - "grad_norm": 0.7060973048210144, + "grad_norm": 0.670574963092804, "learning_rate": 0.000567322180248246, - "loss": 4.0478, + "loss": 4.049, "step": 5150 }, { "epoch": 0.5606469002695418, - "grad_norm": 0.7477322816848755, + "grad_norm": 0.6862586140632629, "learning_rate": 0.0005669983810037777, - "loss": 4.0597, + "loss": 4.0581, "step": 5200 }, { "epoch": 0.5660377358490566, - "grad_norm": 0.6687774658203125, + "grad_norm": 0.7968271374702454, "learning_rate": 0.0005666745817593092, - "loss": 4.0342, + "loss": 4.0331, "step": 5250 }, { "epoch": 0.5714285714285714, - "grad_norm": 0.5782555937767029, + "grad_norm": 0.6383443474769592, "learning_rate": 0.0005663507825148408, - "loss": 4.0336, + "loss": 4.0329, "step": 5300 }, { "epoch": 0.5768194070080862, - "grad_norm": 0.7279411554336548, + "grad_norm": 0.6728504300117493, "learning_rate": 0.0005660269832703723, - "loss": 4.023, + "loss": 4.0234, "step": 5350 }, { "epoch": 0.5822102425876011, - "grad_norm": 0.6305038332939148, + "grad_norm": 0.6788572669029236, "learning_rate": 0.0005657031840259039, - "loss": 4.0282, + "loss": 4.0272, "step": 5400 }, { "epoch": 0.5876010781671159, - "grad_norm": 0.7332347631454468, + "grad_norm": 0.7220979928970337, "learning_rate": 0.0005653793847814355, - "loss": 4.0282, + "loss": 4.0266, "step": 5450 }, { "epoch": 0.5929919137466307, - "grad_norm": 0.7261084318161011, + "grad_norm": 0.7897782921791077, "learning_rate": 0.000565055585536967, - "loss": 4.0045, + "loss": 4.004, "step": 5500 }, { "epoch": 0.5983827493261455, - "grad_norm": 0.6593828797340393, + "grad_norm": 0.6859530210494995, "learning_rate": 0.0005647317862924986, - "loss": 4.0288, + "loss": 4.0272, "step": 5550 }, { "epoch": 0.6037735849056604, - "grad_norm": 0.6685374975204468, + "grad_norm": 0.6835060715675354, "learning_rate": 0.0005644079870480302, - "loss": 3.9767, + "loss": 3.9762, "step": 5600 }, { "epoch": 0.6091644204851752, - "grad_norm": 0.6912181973457336, + "grad_norm": 0.6331577301025391, "learning_rate": 0.0005640841878035617, - "loss": 4.013, + "loss": 4.0138, "step": 5650 }, { "epoch": 0.6145552560646901, - "grad_norm": 0.7449594736099243, + "grad_norm": 0.6407967209815979, "learning_rate": 0.0005637603885590933, - "loss": 4.0117, + "loss": 4.0133, "step": 5700 }, { "epoch": 0.6199460916442049, - "grad_norm": 0.6433312892913818, + "grad_norm": 0.5845387578010559, "learning_rate": 0.0005634365893146248, - "loss": 4.012, + "loss": 4.0109, "step": 5750 }, { "epoch": 0.6253369272237197, - "grad_norm": 0.6818335652351379, + "grad_norm": 0.660351037979126, "learning_rate": 0.0005631127900701565, - "loss": 4.002, + "loss": 4.0022, "step": 5800 }, { "epoch": 0.6307277628032345, - "grad_norm": 0.6618849039077759, + "grad_norm": 0.607296347618103, "learning_rate": 0.000562788990825688, - "loss": 3.9769, + "loss": 3.9779, "step": 5850 }, { "epoch": 0.6361185983827493, - "grad_norm": 0.5509379506111145, + "grad_norm": 0.5927079916000366, "learning_rate": 0.0005624651915812196, - "loss": 3.989, + "loss": 3.9895, "step": 5900 }, { "epoch": 0.6415094339622641, - "grad_norm": 0.794357419013977, + "grad_norm": 0.6702271103858948, "learning_rate": 0.0005621413923367511, - "loss": 3.995, + "loss": 3.9958, "step": 5950 }, { "epoch": 0.6469002695417789, - "grad_norm": 0.5914753675460815, + "grad_norm": 0.6015697121620178, "learning_rate": 0.0005618175930922828, - "loss": 4.0027, + "loss": 3.9995, "step": 6000 }, { "epoch": 0.6469002695417789, - "eval_accuracy": 0.32826688539217336, - "eval_loss": 3.91684627532959, - "eval_runtime": 184.8458, - "eval_samples_per_second": 97.438, - "eval_steps_per_second": 6.092, + "eval_accuracy": 0.32856003088349683, + "eval_loss": 3.9154956340789795, + "eval_runtime": 184.9163, + "eval_samples_per_second": 97.401, + "eval_steps_per_second": 6.089, "step": 6000 }, { "epoch": 0.6522911051212938, - "grad_norm": 0.6078530550003052, + "grad_norm": 0.6032575368881226, "learning_rate": 0.0005614937938478143, - "loss": 3.9924, + "loss": 3.9936, "step": 6050 }, { "epoch": 0.6576819407008087, - "grad_norm": 0.6487135887145996, + "grad_norm": 0.67863529920578, "learning_rate": 0.0005611699946033459, - "loss": 3.9847, + "loss": 3.9843, "step": 6100 }, { "epoch": 0.6630727762803235, - "grad_norm": 0.5724067687988281, + "grad_norm": 0.5835185647010803, "learning_rate": 0.0005608461953588774, - "loss": 3.9778, + "loss": 3.9774, "step": 6150 }, { "epoch": 0.6684636118598383, - "grad_norm": 0.5982683300971985, + "grad_norm": 0.6360609531402588, "learning_rate": 0.000560522396114409, - "loss": 3.9716, + "loss": 3.9718, "step": 6200 }, { "epoch": 0.6738544474393531, - "grad_norm": 0.6258571743965149, + "grad_norm": 0.5755443572998047, "learning_rate": 0.0005601985968699405, - "loss": 3.9527, + "loss": 3.9553, "step": 6250 }, { "epoch": 0.6792452830188679, - "grad_norm": 0.6944162249565125, + "grad_norm": 0.6723095178604126, "learning_rate": 0.0005598747976254721, - "loss": 3.9692, + "loss": 3.97, "step": 6300 }, { "epoch": 0.6846361185983828, - "grad_norm": 0.5871492624282837, + "grad_norm": 0.6210877299308777, "learning_rate": 0.0005595509983810038, - "loss": 3.94, + "loss": 3.9446, "step": 6350 }, { "epoch": 0.6900269541778976, - "grad_norm": 0.6267895102500916, + "grad_norm": 0.6090090870857239, "learning_rate": 0.0005592271991365353, - "loss": 3.9551, + "loss": 3.9554, "step": 6400 }, { "epoch": 0.6954177897574124, - "grad_norm": 0.6211588978767395, + "grad_norm": 0.6561355590820312, "learning_rate": 0.0005589033998920669, - "loss": 3.9716, + "loss": 3.9706, "step": 6450 }, { "epoch": 0.7008086253369272, - "grad_norm": 0.6593815088272095, + "grad_norm": 0.6227344870567322, "learning_rate": 0.0005585796006475984, - "loss": 3.9397, + "loss": 3.9421, "step": 6500 }, { "epoch": 0.706199460916442, - "grad_norm": 0.7002833485603333, + "grad_norm": 0.679628312587738, "learning_rate": 0.0005582558014031301, - "loss": 3.9657, + "loss": 3.9669, "step": 6550 }, { "epoch": 0.7115902964959568, - "grad_norm": 0.6196280121803284, + "grad_norm": 0.5673393607139587, "learning_rate": 0.0005579320021586616, - "loss": 3.9491, + "loss": 3.9506, "step": 6600 }, { "epoch": 0.7169811320754716, - "grad_norm": 0.7528320550918579, + "grad_norm": 0.603226900100708, "learning_rate": 0.0005576082029141932, - "loss": 3.9453, + "loss": 3.9447, "step": 6650 }, { "epoch": 0.7223719676549866, - "grad_norm": 0.5507506728172302, + "grad_norm": 0.5796992778778076, "learning_rate": 0.0005572844036697247, - "loss": 3.9307, + "loss": 3.9305, "step": 6700 }, { "epoch": 0.7277628032345014, - "grad_norm": 0.5690281987190247, + "grad_norm": 0.5560516119003296, "learning_rate": 0.0005569606044252563, - "loss": 3.9365, + "loss": 3.9388, "step": 6750 }, { "epoch": 0.7331536388140162, - "grad_norm": 0.8382434844970703, + "grad_norm": 0.711595356464386, "learning_rate": 0.0005566368051807879, - "loss": 3.9536, + "loss": 3.9534, "step": 6800 }, { "epoch": 0.738544474393531, - "grad_norm": 0.7441468238830566, + "grad_norm": 0.6935557126998901, "learning_rate": 0.0005563130059363194, - "loss": 3.9389, + "loss": 3.9397, "step": 6850 }, { "epoch": 0.7439353099730458, - "grad_norm": 0.5325085520744324, + "grad_norm": 0.5889500379562378, "learning_rate": 0.000555989206691851, - "loss": 3.9368, + "loss": 3.9376, "step": 6900 }, { "epoch": 0.7493261455525606, - "grad_norm": 0.6673761606216431, + "grad_norm": 0.6055012941360474, "learning_rate": 0.0005556654074473826, - "loss": 3.942, + "loss": 3.9432, "step": 6950 }, { "epoch": 0.7547169811320755, - "grad_norm": 0.6331908702850342, + "grad_norm": 0.7356480956077576, "learning_rate": 0.0005553416082029141, - "loss": 3.9256, + "loss": 3.9243, "step": 7000 }, { "epoch": 0.7547169811320755, - "eval_accuracy": 0.33349167680850833, - "eval_loss": 3.861177682876587, - "eval_runtime": 184.9411, - "eval_samples_per_second": 97.388, - "eval_steps_per_second": 6.088, + "eval_accuracy": 0.3333837844909085, + "eval_loss": 3.861570358276367, + "eval_runtime": 185.0498, + "eval_samples_per_second": 97.331, + "eval_steps_per_second": 6.085, "step": 7000 }, { "epoch": 0.7601078167115903, - "grad_norm": 0.5495949387550354, + "grad_norm": 0.6299076676368713, "learning_rate": 0.0005550178089584457, "loss": 3.9179, "step": 7050 }, { "epoch": 0.7654986522911051, - "grad_norm": 0.5872019529342651, + "grad_norm": 0.6104505062103271, "learning_rate": 0.0005546940097139772, - "loss": 3.9178, + "loss": 3.9182, "step": 7100 }, { "epoch": 0.77088948787062, - "grad_norm": 0.6886711120605469, + "grad_norm": 0.7075724005699158, "learning_rate": 0.0005543702104695089, - "loss": 3.9203, + "loss": 3.921, "step": 7150 }, { "epoch": 0.7762803234501348, - "grad_norm": 0.5374932885169983, + "grad_norm": 0.5794752240180969, "learning_rate": 0.0005540464112250404, - "loss": 3.918, + "loss": 3.9183, "step": 7200 }, { "epoch": 0.7816711590296496, - "grad_norm": 0.6181663274765015, + "grad_norm": 0.6488854289054871, "learning_rate": 0.000553722611980572, - "loss": 3.9298, + "loss": 3.9308, "step": 7250 }, { "epoch": 0.7870619946091644, - "grad_norm": 0.5765889883041382, + "grad_norm": 0.632280707359314, "learning_rate": 0.0005533988127361035, - "loss": 3.9127, + "loss": 3.9145, "step": 7300 }, { "epoch": 0.7924528301886793, - "grad_norm": 0.6276585459709167, + "grad_norm": 0.5979216694831848, "learning_rate": 0.0005530750134916352, - "loss": 3.9291, + "loss": 3.9286, "step": 7350 }, { "epoch": 0.7978436657681941, - "grad_norm": 0.6201632618904114, + "grad_norm": 0.6133264899253845, "learning_rate": 0.0005527512142471668, - "loss": 3.9022, + "loss": 3.9042, "step": 7400 }, { "epoch": 0.8032345013477089, - "grad_norm": 0.5955806970596313, + "grad_norm": 0.6683976650238037, "learning_rate": 0.0005524274150026982, - "loss": 3.9034, + "loss": 3.9044, "step": 7450 }, { "epoch": 0.8086253369272237, - "grad_norm": 0.6247159242630005, + "grad_norm": 0.5934157967567444, "learning_rate": 0.0005521036157582299, - "loss": 3.8948, + "loss": 3.8953, "step": 7500 }, { "epoch": 0.8140161725067385, - "grad_norm": 0.6614282131195068, + "grad_norm": 0.6800050735473633, "learning_rate": 0.0005517798165137614, - "loss": 3.8995, + "loss": 3.9003, "step": 7550 }, { "epoch": 0.8194070080862533, - "grad_norm": 0.599558413028717, + "grad_norm": 0.6356220841407776, "learning_rate": 0.000551456017269293, - "loss": 3.8749, + "loss": 3.8741, "step": 7600 }, { "epoch": 0.8247978436657682, - "grad_norm": 0.5789263844490051, + "grad_norm": 0.6044333577156067, "learning_rate": 0.0005511322180248245, - "loss": 3.8905, + "loss": 3.8914, "step": 7650 }, { "epoch": 0.8301886792452831, - "grad_norm": 0.616805911064148, + "grad_norm": 0.6075876951217651, "learning_rate": 0.0005508084187803562, - "loss": 3.9028, + "loss": 3.9014, "step": 7700 }, { "epoch": 0.8355795148247979, - "grad_norm": 0.5778168439865112, + "grad_norm": 0.5683922171592712, "learning_rate": 0.0005504846195358877, - "loss": 3.8969, + "loss": 3.8975, "step": 7750 }, { "epoch": 0.8409703504043127, - "grad_norm": 0.5527185201644897, + "grad_norm": 0.6191318035125732, "learning_rate": 0.0005501608202914193, - "loss": 3.8859, + "loss": 3.8865, "step": 7800 }, { "epoch": 0.8463611859838275, - "grad_norm": 0.6867278218269348, + "grad_norm": 0.6902214288711548, "learning_rate": 0.0005498370210469508, - "loss": 3.8885, + "loss": 3.8898, "step": 7850 }, { "epoch": 0.8517520215633423, - "grad_norm": 0.6439316272735596, + "grad_norm": 0.6202863454818726, "learning_rate": 0.0005495132218024824, - "loss": 3.8741, + "loss": 3.875, "step": 7900 }, { "epoch": 0.8571428571428571, - "grad_norm": 0.5910453796386719, + "grad_norm": 0.594792902469635, "learning_rate": 0.000549189422558014, - "loss": 3.8856, + "loss": 3.887, "step": 7950 }, { "epoch": 0.862533692722372, - "grad_norm": 0.5654035210609436, + "grad_norm": 0.5793900489807129, "learning_rate": 0.0005488656233135456, - "loss": 3.8821, + "loss": 3.8798, "step": 8000 }, { "epoch": 0.862533692722372, - "eval_accuracy": 0.33826936159148663, - "eval_loss": 3.815802574157715, - "eval_runtime": 184.9876, - "eval_samples_per_second": 97.363, - "eval_steps_per_second": 6.087, + "eval_accuracy": 0.33811703024277073, + "eval_loss": 3.8144943714141846, + "eval_runtime": 184.7794, + "eval_samples_per_second": 97.473, + "eval_steps_per_second": 6.094, "step": 8000 }, { "epoch": 0.8679245283018868, - "grad_norm": 0.6305513381958008, + "grad_norm": 0.5921681523323059, "learning_rate": 0.0005485418240690771, - "loss": 3.8629, + "loss": 3.8632, "step": 8050 }, { "epoch": 0.8733153638814016, - "grad_norm": 0.6361500024795532, + "grad_norm": 0.6462425589561462, "learning_rate": 0.0005482180248246087, - "loss": 3.8813, + "loss": 3.8786, "step": 8100 }, { "epoch": 0.8787061994609164, - "grad_norm": 0.650390625, + "grad_norm": 0.5800376534461975, "learning_rate": 0.0005478942255801403, - "loss": 3.8776, + "loss": 3.8749, "step": 8150 }, { "epoch": 0.8840970350404312, - "grad_norm": 0.5771349668502808, + "grad_norm": 0.5411289930343628, "learning_rate": 0.0005475704263356718, - "loss": 3.863, + "loss": 3.8658, "step": 8200 }, { "epoch": 0.889487870619946, - "grad_norm": 0.6004456877708435, + "grad_norm": 0.5984181761741638, "learning_rate": 0.0005472466270912034, - "loss": 3.874, + "loss": 3.8742, "step": 8250 }, { "epoch": 0.894878706199461, - "grad_norm": 0.646091878414154, + "grad_norm": 0.5967395901679993, "learning_rate": 0.000546922827846735, - "loss": 3.862, + "loss": 3.8625, "step": 8300 }, { "epoch": 0.9002695417789758, - "grad_norm": 0.5574592351913452, + "grad_norm": 0.6251577138900757, "learning_rate": 0.0005465990286022665, "loss": 3.8631, "step": 8350 }, { "epoch": 0.9056603773584906, - "grad_norm": 0.6087723970413208, + "grad_norm": 0.5644067525863647, "learning_rate": 0.0005462752293577981, "loss": 3.8792, "step": 8400 }, { "epoch": 0.9110512129380054, - "grad_norm": 0.5992222428321838, + "grad_norm": 0.5981811285018921, "learning_rate": 0.0005459514301133296, - "loss": 3.8725, + "loss": 3.8723, "step": 8450 }, { "epoch": 0.9164420485175202, - "grad_norm": 0.5356247425079346, + "grad_norm": 0.5163571834564209, "learning_rate": 0.0005456276308688613, - "loss": 3.8668, + "loss": 3.8679, "step": 8500 }, { "epoch": 0.921832884097035, - "grad_norm": 0.5599382519721985, + "grad_norm": 0.5923091173171997, "learning_rate": 0.0005453038316243929, - "loss": 3.861, + "loss": 3.8621, "step": 8550 }, { "epoch": 0.9272237196765498, - "grad_norm": 0.5570565462112427, + "grad_norm": 0.5390773415565491, "learning_rate": 0.0005449800323799244, - "loss": 3.874, + "loss": 3.8713, "step": 8600 }, { "epoch": 0.9326145552560647, - "grad_norm": 0.5484678149223328, + "grad_norm": 0.5587196350097656, "learning_rate": 0.000544656233135456, - "loss": 3.8519, + "loss": 3.851, "step": 8650 }, { "epoch": 0.9380053908355795, - "grad_norm": 0.6040818095207214, + "grad_norm": 0.6344064474105835, "learning_rate": 0.0005443324338909875, - "loss": 3.8544, + "loss": 3.8547, "step": 8700 }, { "epoch": 0.9433962264150944, - "grad_norm": 0.6044812202453613, + "grad_norm": 0.6433781981468201, "learning_rate": 0.0005440086346465192, - "loss": 3.8669, + "loss": 3.8682, "step": 8750 }, { "epoch": 0.9487870619946092, - "grad_norm": 0.5890029072761536, + "grad_norm": 0.5929216146469116, "learning_rate": 0.0005436848354020506, - "loss": 3.8472, + "loss": 3.8499, "step": 8800 }, { "epoch": 0.954177897574124, - "grad_norm": 0.5707723498344421, + "grad_norm": 0.5464727282524109, "learning_rate": 0.0005433610361575823, - "loss": 3.8532, + "loss": 3.8531, "step": 8850 }, { "epoch": 0.9595687331536388, - "grad_norm": 0.5903648734092712, + "grad_norm": 0.5909833908081055, "learning_rate": 0.0005430372369131138, - "loss": 3.8367, + "loss": 3.8389, "step": 8900 }, { "epoch": 0.9649595687331537, - "grad_norm": 0.5946788191795349, + "grad_norm": 0.6431576013565063, "learning_rate": 0.0005427134376686454, - "loss": 3.85, + "loss": 3.8517, "step": 8950 }, { "epoch": 0.9703504043126685, - "grad_norm": 0.6123566031455994, + "grad_norm": 0.6032969951629639, "learning_rate": 0.0005423896384241769, - "loss": 3.8366, + "loss": 3.8389, "step": 9000 }, { "epoch": 0.9703504043126685, - "eval_accuracy": 0.3415915322893022, - "eval_loss": 3.775902509689331, - "eval_runtime": 184.7441, - "eval_samples_per_second": 97.492, - "eval_steps_per_second": 6.095, + "eval_accuracy": 0.341406605074242, + "eval_loss": 3.777738332748413, + "eval_runtime": 185.0795, + "eval_samples_per_second": 97.315, + "eval_steps_per_second": 6.084, "step": 9000 }, { "epoch": 0.9757412398921833, - "grad_norm": 0.604682445526123, + "grad_norm": 0.5933600664138794, "learning_rate": 0.0005420658391797086, - "loss": 3.8309, + "loss": 3.8341, "step": 9050 }, { "epoch": 0.9811320754716981, - "grad_norm": 0.5386356115341187, + "grad_norm": 0.5503290891647339, "learning_rate": 0.0005417420399352401, - "loss": 3.843, + "loss": 3.8429, "step": 9100 }, { "epoch": 0.9865229110512129, - "grad_norm": 0.6552569270133972, + "grad_norm": 0.667158305644989, "learning_rate": 0.0005414182406907717, - "loss": 3.8365, + "loss": 3.8374, "step": 9150 }, { "epoch": 0.9919137466307277, - "grad_norm": 0.6831485629081726, + "grad_norm": 0.6329292058944702, "learning_rate": 0.0005410944414463032, - "loss": 3.8525, + "loss": 3.8518, "step": 9200 }, { "epoch": 0.9973045822102425, - "grad_norm": 0.5169652700424194, + "grad_norm": 0.602115273475647, "learning_rate": 0.0005407706422018348, - "loss": 3.8489, + "loss": 3.8477, "step": 9250 }, { "epoch": 1.0026954177897573, - "grad_norm": 0.556510329246521, + "grad_norm": 0.5481619834899902, "learning_rate": 0.0005404468429573664, - "loss": 3.7966, + "loss": 3.7957, "step": 9300 }, { "epoch": 1.0080862533692723, - "grad_norm": 0.6710174083709717, + "grad_norm": 0.7251120805740356, "learning_rate": 0.000540123043712898, - "loss": 3.7679, + "loss": 3.7684, "step": 9350 }, { "epoch": 1.013477088948787, - "grad_norm": 0.6061473488807678, + "grad_norm": 0.6227707862854004, "learning_rate": 0.0005397992444684295, - "loss": 3.7491, + "loss": 3.7501, "step": 9400 }, { "epoch": 1.0188679245283019, - "grad_norm": 0.6734007000923157, + "grad_norm": 0.6860117316246033, "learning_rate": 0.0005394754452239611, - "loss": 3.7614, + "loss": 3.7643, "step": 9450 }, { "epoch": 1.0242587601078168, - "grad_norm": 0.5629554986953735, + "grad_norm": 0.5848320126533508, "learning_rate": 0.0005391516459794927, - "loss": 3.7738, + "loss": 3.773, "step": 9500 }, { "epoch": 1.0296495956873315, - "grad_norm": 0.5931057929992676, + "grad_norm": 0.5614584684371948, "learning_rate": 0.0005388278467350242, - "loss": 3.7805, + "loss": 3.7825, "step": 9550 }, { "epoch": 1.0350404312668464, - "grad_norm": 0.5350716710090637, + "grad_norm": 0.5323917865753174, "learning_rate": 0.0005385040474905557, - "loss": 3.7747, + "loss": 3.7755, "step": 9600 }, { "epoch": 1.0404312668463611, - "grad_norm": 0.570182204246521, + "grad_norm": 0.5698872804641724, "learning_rate": 0.0005381802482460874, - "loss": 3.785, + "loss": 3.7865, "step": 9650 }, { "epoch": 1.045822102425876, - "grad_norm": 0.5359431505203247, + "grad_norm": 0.5502426028251648, "learning_rate": 0.000537856449001619, - "loss": 3.7839, + "loss": 3.7843, "step": 9700 }, { "epoch": 1.0512129380053907, - "grad_norm": 0.590753436088562, + "grad_norm": 0.6106315851211548, "learning_rate": 0.0005375326497571505, - "loss": 3.7542, + "loss": 3.7552, "step": 9750 }, { "epoch": 1.0566037735849056, - "grad_norm": 0.6063977479934692, + "grad_norm": 0.6247063279151917, "learning_rate": 0.000537208850512682, - "loss": 3.7645, + "loss": 3.7613, "step": 9800 }, { "epoch": 1.0619946091644206, - "grad_norm": 0.6077662706375122, + "grad_norm": 0.5800876617431641, "learning_rate": 0.0005368850512682137, - "loss": 3.7599, + "loss": 3.7608, "step": 9850 }, { "epoch": 1.0673854447439353, - "grad_norm": 0.52806156873703, + "grad_norm": 0.5341230630874634, "learning_rate": 0.0005365612520237453, - "loss": 3.7725, + "loss": 3.7733, "step": 9900 }, { "epoch": 1.0727762803234502, - "grad_norm": 0.5774953365325928, + "grad_norm": 0.5820408463478088, "learning_rate": 0.0005362374527792768, - "loss": 3.7737, + "loss": 3.7746, "step": 9950 }, { "epoch": 1.0781671159029649, - "grad_norm": 0.5876293182373047, + "grad_norm": 0.5988202691078186, "learning_rate": 0.0005359136535348084, - "loss": 3.7721, + "loss": 3.7747, "step": 10000 }, { "epoch": 1.0781671159029649, - "eval_accuracy": 0.3451080830034179, - "eval_loss": 3.7522928714752197, - "eval_runtime": 184.8196, - "eval_samples_per_second": 97.452, - "eval_steps_per_second": 6.092, + "eval_accuracy": 0.34501203385058987, + "eval_loss": 3.7526464462280273, + "eval_runtime": 184.85, + "eval_samples_per_second": 97.436, + "eval_steps_per_second": 6.091, "step": 10000 }, { "epoch": 1.0835579514824798, - "grad_norm": 0.5495566129684448, + "grad_norm": 0.5564333200454712, "learning_rate": 0.0005355898542903399, - "loss": 3.7495, + "loss": 3.7497, "step": 10050 }, { "epoch": 1.0889487870619945, - "grad_norm": 0.5702345967292786, + "grad_norm": 0.6517497301101685, "learning_rate": 0.0005352660550458716, - "loss": 3.7524, + "loss": 3.7541, "step": 10100 }, { "epoch": 1.0943396226415094, - "grad_norm": 0.6201688051223755, + "grad_norm": 0.6653234958648682, "learning_rate": 0.000534942255801403, "loss": 3.7444, "step": 10150 }, { "epoch": 1.0997304582210243, - "grad_norm": 0.6020630598068237, + "grad_norm": 0.6291157007217407, "learning_rate": 0.0005346184565569347, - "loss": 3.7593, + "loss": 3.7606, "step": 10200 }, { "epoch": 1.105121293800539, - "grad_norm": 0.5759705901145935, + "grad_norm": 0.5519572496414185, "learning_rate": 0.0005342946573124662, - "loss": 3.7785, + "loss": 3.7783, "step": 10250 }, { "epoch": 1.110512129380054, - "grad_norm": 0.6400898098945618, + "grad_norm": 0.6604307889938354, "learning_rate": 0.0005339708580679978, - "loss": 3.7777, + "loss": 3.7808, "step": 10300 }, { "epoch": 1.1159029649595686, - "grad_norm": 0.587372362613678, + "grad_norm": 0.5644993185997009, "learning_rate": 0.0005336470588235293, - "loss": 3.7561, + "loss": 3.755, "step": 10350 }, { "epoch": 1.1212938005390836, - "grad_norm": 0.5996315479278564, + "grad_norm": 0.6334912776947021, "learning_rate": 0.000533323259579061, - "loss": 3.7586, + "loss": 3.7597, "step": 10400 }, { "epoch": 1.1266846361185983, - "grad_norm": 0.5710271000862122, + "grad_norm": 0.5180814266204834, "learning_rate": 0.0005329994603345925, - "loss": 3.7552, + "loss": 3.7527, "step": 10450 }, { "epoch": 1.1320754716981132, - "grad_norm": 0.5642911195755005, + "grad_norm": 0.6314664483070374, "learning_rate": 0.0005326756610901241, - "loss": 3.7533, + "loss": 3.7526, "step": 10500 }, { "epoch": 1.137466307277628, - "grad_norm": 0.5622043609619141, + "grad_norm": 0.5654125213623047, "learning_rate": 0.0005323518618456556, - "loss": 3.7532, + "loss": 3.7534, "step": 10550 }, { "epoch": 1.1428571428571428, - "grad_norm": 0.5839791893959045, + "grad_norm": 0.6075195670127869, "learning_rate": 0.0005320280626011872, - "loss": 3.7666, + "loss": 3.7658, "step": 10600 }, { "epoch": 1.1482479784366577, - "grad_norm": 0.5701001882553101, + "grad_norm": 0.6319811940193176, "learning_rate": 0.0005317042633567188, - "loss": 3.7541, + "loss": 3.7513, "step": 10650 }, { "epoch": 1.1536388140161726, - "grad_norm": 0.6332500576972961, + "grad_norm": 0.6518101096153259, "learning_rate": 0.0005313804641122504, - "loss": 3.7563, + "loss": 3.7577, "step": 10700 }, { "epoch": 1.1590296495956873, - "grad_norm": 0.5746287703514099, + "grad_norm": 0.5641539692878723, "learning_rate": 0.0005310566648677819, - "loss": 3.752, + "loss": 3.7533, "step": 10750 }, { "epoch": 1.1644204851752022, - "grad_norm": 0.5689650774002075, + "grad_norm": 0.6202017068862915, "learning_rate": 0.0005307328656233135, - "loss": 3.7422, + "loss": 3.742, "step": 10800 }, { "epoch": 1.169811320754717, - "grad_norm": 0.5871029496192932, + "grad_norm": 0.5958346724510193, "learning_rate": 0.000530409066378845, - "loss": 3.7375, + "loss": 3.7384, "step": 10850 }, { "epoch": 1.1752021563342319, - "grad_norm": 0.7632201313972473, + "grad_norm": 1.0744181871414185, "learning_rate": 0.0005300852671343766, - "loss": 3.7499, + "loss": 3.7539, "step": 10900 }, { "epoch": 1.1805929919137466, - "grad_norm": 0.7118698358535767, + "grad_norm": 0.6128881573677063, "learning_rate": 0.0005297614678899081, - "loss": 3.7436, + "loss": 3.7442, "step": 10950 }, { "epoch": 1.1859838274932615, - "grad_norm": 0.5203385353088379, + "grad_norm": 0.5362077951431274, "learning_rate": 0.0005294376686454398, - "loss": 3.733, + "loss": 3.7347, "step": 11000 }, { "epoch": 1.1859838274932615, - "eval_accuracy": 0.3484138471151735, - "eval_loss": 3.71924090385437, - "eval_runtime": 185.1007, - "eval_samples_per_second": 97.304, - "eval_steps_per_second": 6.083, + "eval_accuracy": 0.34794022917719014, + "eval_loss": 3.718404769897461, + "eval_runtime": 184.5725, + "eval_samples_per_second": 97.582, + "eval_steps_per_second": 6.101, "step": 11000 }, { "epoch": 1.1913746630727764, - "grad_norm": 0.5789610743522644, + "grad_norm": 0.5539353489875793, "learning_rate": 0.0005291138694009714, - "loss": 3.7447, + "loss": 3.7456, "step": 11050 }, { "epoch": 1.196765498652291, - "grad_norm": 0.6329853534698486, + "grad_norm": 0.6573549509048462, "learning_rate": 0.0005287900701565029, - "loss": 3.7378, + "loss": 3.7375, "step": 11100 }, { "epoch": 1.202156334231806, - "grad_norm": 0.6139509081840515, + "grad_norm": 0.5665242075920105, "learning_rate": 0.0005284662709120345, - "loss": 3.7366, + "loss": 3.7363, "step": 11150 }, { "epoch": 1.2075471698113207, - "grad_norm": 0.7028432488441467, + "grad_norm": 0.6505994200706482, "learning_rate": 0.0005281424716675661, - "loss": 3.7382, + "loss": 3.7368, "step": 11200 }, { "epoch": 1.2129380053908356, - "grad_norm": 0.6056612730026245, + "grad_norm": 0.586138904094696, "learning_rate": 0.0005278186724230977, - "loss": 3.7298, + "loss": 3.7303, "step": 11250 }, { "epoch": 1.2183288409703503, - "grad_norm": 0.5431009531021118, + "grad_norm": 0.5795910954475403, "learning_rate": 0.0005274948731786292, - "loss": 3.7267, + "loss": 3.7274, "step": 11300 }, { "epoch": 1.2237196765498652, - "grad_norm": 0.6328643560409546, + "grad_norm": 0.6727373003959656, "learning_rate": 0.0005271710739341608, - "loss": 3.7457, + "loss": 3.7443, "step": 11350 }, { "epoch": 1.2291105121293802, - "grad_norm": 0.5836541056632996, + "grad_norm": 0.5898151397705078, "learning_rate": 0.0005268472746896923, - "loss": 3.7387, + "loss": 3.7391, "step": 11400 }, { "epoch": 1.2345013477088949, - "grad_norm": 0.6509481072425842, + "grad_norm": 0.7265191674232483, "learning_rate": 0.000526523475445224, - "loss": 3.7302, + "loss": 3.7297, "step": 11450 }, { "epoch": 1.2398921832884098, - "grad_norm": 0.5459529757499695, + "grad_norm": 0.5766255259513855, "learning_rate": 0.0005261996762007554, "loss": 3.7167, "step": 11500 }, { "epoch": 1.2452830188679245, - "grad_norm": 0.575712263584137, + "grad_norm": 0.5683987140655518, "learning_rate": 0.0005258758769562871, - "loss": 3.738, + "loss": 3.7372, "step": 11550 }, { "epoch": 1.2506738544474394, - "grad_norm": 0.6626757383346558, + "grad_norm": 0.611061692237854, "learning_rate": 0.0005255520777118186, - "loss": 3.7379, + "loss": 3.7395, "step": 11600 }, { "epoch": 1.256064690026954, - "grad_norm": 0.6254045963287354, + "grad_norm": 0.5668408274650574, "learning_rate": 0.0005252282784673502, - "loss": 3.7293, + "loss": 3.7299, "step": 11650 }, { "epoch": 1.261455525606469, - "grad_norm": 0.5604497194290161, + "grad_norm": 0.5914852023124695, "learning_rate": 0.0005249044792228817, - "loss": 3.7315, + "loss": 3.7326, "step": 11700 }, { "epoch": 1.266846361185984, - "grad_norm": 0.5226693153381348, + "grad_norm": 0.5146292448043823, "learning_rate": 0.0005245806799784133, - "loss": 3.7267, + "loss": 3.7275, "step": 11750 }, { "epoch": 1.2722371967654986, - "grad_norm": 0.5691946744918823, + "grad_norm": 0.5495054721832275, "learning_rate": 0.0005242568807339449, - "loss": 3.7223, + "loss": 3.7229, "step": 11800 }, { "epoch": 1.2776280323450135, - "grad_norm": 0.5967990159988403, + "grad_norm": 0.6662834286689758, "learning_rate": 0.0005239330814894765, - "loss": 3.718, + "loss": 3.719, "step": 11850 }, { "epoch": 1.2830188679245282, - "grad_norm": 0.5392739772796631, + "grad_norm": 0.5426035523414612, "learning_rate": 0.000523609282245008, - "loss": 3.7396, + "loss": 3.738, "step": 11900 }, { "epoch": 1.2884097035040432, - "grad_norm": 0.6016465425491333, + "grad_norm": 0.638138473033905, "learning_rate": 0.0005232854830005396, - "loss": 3.7361, + "loss": 3.7367, "step": 11950 }, { "epoch": 1.2938005390835579, - "grad_norm": 0.5855374336242676, + "grad_norm": 0.5627890229225159, "learning_rate": 0.0005229616837560712, - "loss": 3.7312, + "loss": 3.7311, "step": 12000 }, { "epoch": 1.2938005390835579, - "eval_accuracy": 0.3501790219306075, - "eval_loss": 3.6982507705688477, - "eval_runtime": 185.0644, - "eval_samples_per_second": 97.323, - "eval_steps_per_second": 6.084, + "eval_accuracy": 0.3499498729902068, + "eval_loss": 3.698021411895752, + "eval_runtime": 185.1248, + "eval_samples_per_second": 97.291, + "eval_steps_per_second": 6.082, "step": 12000 }, { "epoch": 1.2991913746630728, - "grad_norm": 0.5653311610221863, + "grad_norm": 0.5502892136573792, "learning_rate": 0.0005226378845116028, - "loss": 3.7125, + "loss": 3.7115, "step": 12050 }, { "epoch": 1.3045822102425877, - "grad_norm": 0.5788018703460693, + "grad_norm": 0.5573764443397522, "learning_rate": 0.0005223140852671344, - "loss": 3.7259, + "loss": 3.7253, "step": 12100 }, { "epoch": 1.3099730458221024, - "grad_norm": 0.6671530604362488, + "grad_norm": 0.6834853887557983, "learning_rate": 0.0005219902860226659, - "loss": 3.7371, + "loss": 3.7359, "step": 12150 }, { "epoch": 1.3153638814016173, - "grad_norm": 0.5864307880401611, + "grad_norm": 0.5750380158424377, "learning_rate": 0.0005216664867781975, - "loss": 3.7351, + "loss": 3.7342, "step": 12200 }, { "epoch": 1.320754716981132, - "grad_norm": 0.655349850654602, + "grad_norm": 0.6798782348632812, "learning_rate": 0.000521342687533729, - "loss": 3.7162, + "loss": 3.715, "step": 12250 }, { "epoch": 1.326145552560647, - "grad_norm": 0.5737414956092834, + "grad_norm": 0.6055248379707336, "learning_rate": 0.0005210188882892606, - "loss": 3.7245, + "loss": 3.727, "step": 12300 }, { "epoch": 1.3315363881401616, - "grad_norm": 0.5403628945350647, + "grad_norm": 0.5729120373725891, "learning_rate": 0.0005206950890447922, - "loss": 3.7137, + "loss": 3.7154, "step": 12350 }, { "epoch": 1.3369272237196765, - "grad_norm": 0.5257432460784912, + "grad_norm": 0.505923867225647, "learning_rate": 0.0005203712898003238, "loss": 3.73, "step": 12400 }, { "epoch": 1.3423180592991915, - "grad_norm": 0.6442195177078247, + "grad_norm": 0.6457091569900513, "learning_rate": 0.0005200474905558553, - "loss": 3.7193, + "loss": 3.7195, "step": 12450 }, { "epoch": 1.3477088948787062, - "grad_norm": 0.6109151244163513, + "grad_norm": 0.6898245215415955, "learning_rate": 0.0005197236913113869, - "loss": 3.722, + "loss": 3.7196, "step": 12500 }, { "epoch": 1.353099730458221, - "grad_norm": 0.5608176589012146, + "grad_norm": 0.5175744891166687, "learning_rate": 0.0005193998920669184, - "loss": 3.7286, + "loss": 3.728, "step": 12550 }, { "epoch": 1.3584905660377358, - "grad_norm": 0.5631012916564941, + "grad_norm": 0.6096599102020264, "learning_rate": 0.0005190760928224501, - "loss": 3.7126, + "loss": 3.7121, "step": 12600 }, { "epoch": 1.3638814016172507, - "grad_norm": 0.5685343146324158, + "grad_norm": 0.5449326634407043, "learning_rate": 0.0005187522935779816, - "loss": 3.7195, + "loss": 3.7192, "step": 12650 }, { "epoch": 1.3692722371967654, - "grad_norm": 0.5835712552070618, + "grad_norm": 0.6141244173049927, "learning_rate": 0.0005184284943335132, - "loss": 3.7342, + "loss": 3.7337, "step": 12700 }, { "epoch": 1.3746630727762803, - "grad_norm": 0.5368698239326477, + "grad_norm": 0.5343438386917114, "learning_rate": 0.0005181046950890447, - "loss": 3.7183, + "loss": 3.7162, "step": 12750 }, { "epoch": 1.3800539083557952, - "grad_norm": 0.5741342306137085, + "grad_norm": 0.5652272701263428, "learning_rate": 0.0005177808958445764, - "loss": 3.7084, + "loss": 3.7075, "step": 12800 }, { "epoch": 1.38544474393531, - "grad_norm": 0.6180281043052673, + "grad_norm": 0.5998131036758423, "learning_rate": 0.0005174570966001078, - "loss": 3.7058, + "loss": 3.7052, "step": 12850 }, { "epoch": 1.3908355795148248, - "grad_norm": 0.5740448832511902, + "grad_norm": 0.5848682522773743, "learning_rate": 0.0005171332973556395, - "loss": 3.7109, + "loss": 3.7117, "step": 12900 }, { "epoch": 1.3962264150943398, - "grad_norm": 0.5574175119400024, + "grad_norm": 0.5479270219802856, "learning_rate": 0.000516809498111171, - "loss": 3.7104, + "loss": 3.7096, "step": 12950 }, { "epoch": 1.4016172506738545, - "grad_norm": 0.6154153347015381, + "grad_norm": 0.582422137260437, "learning_rate": 0.0005164856988667026, - "loss": 3.7055, + "loss": 3.705, "step": 13000 }, { "epoch": 1.4016172506738545, - "eval_accuracy": 0.35201014904894495, - "eval_loss": 3.6755025386810303, - "eval_runtime": 184.7138, - "eval_samples_per_second": 97.508, - "eval_steps_per_second": 6.096, + "eval_accuracy": 0.35188910973191967, + "eval_loss": 3.6742594242095947, + "eval_runtime": 184.8492, + "eval_samples_per_second": 97.436, + "eval_steps_per_second": 6.091, "step": 13000 }, { "epoch": 1.4070080862533692, - "grad_norm": 0.6463391780853271, + "grad_norm": 0.5786948800086975, "learning_rate": 0.0005161618996222341, - "loss": 3.7168, + "loss": 3.7174, "step": 13050 }, { "epoch": 1.412398921832884, - "grad_norm": 0.5860043168067932, + "grad_norm": 0.5718762278556824, "learning_rate": 0.0005158381003777657, - "loss": 3.7283, + "loss": 3.7291, "step": 13100 }, { "epoch": 1.417789757412399, - "grad_norm": 0.6286531686782837, + "grad_norm": 0.5814722776412964, "learning_rate": 0.0005155143011332973, - "loss": 3.7071, + "loss": 3.7076, "step": 13150 }, { "epoch": 1.4231805929919137, - "grad_norm": 0.545039713382721, + "grad_norm": 0.5653195977210999, "learning_rate": 0.0005151905018888289, - "loss": 3.7097, + "loss": 3.7109, "step": 13200 }, { "epoch": 1.4285714285714286, - "grad_norm": 0.5539329648017883, - "learning_rate": 0.0005148667026443604, - "loss": 3.7058, + "grad_norm": 0.5738582015037537, + "learning_rate": 0.0005148731786292498, + "loss": 3.7059, "step": 13250 }, { "epoch": 1.4339622641509435, - "grad_norm": 0.6011884212493896, - "learning_rate": 0.000514542903399892, - "loss": 3.7048, + "grad_norm": 0.5981550812721252, + "learning_rate": 0.0005145493793847814, + "loss": 3.7036, "step": 13300 }, { "epoch": 1.4393530997304582, - "grad_norm": 0.5569776892662048, - "learning_rate": 0.0005142191041554237, - "loss": 3.6906, + "grad_norm": 0.5781781077384949, + "learning_rate": 0.000514225580140313, + "loss": 3.6913, "step": 13350 }, { "epoch": 1.444743935309973, - "grad_norm": 0.6196861863136292, - "learning_rate": 0.0005138953049109552, - "loss": 3.7005, + "grad_norm": 0.6027509570121765, + "learning_rate": 0.0005139017808958445, + "loss": 3.7001, "step": 13400 }, { "epoch": 1.4501347708894878, - "grad_norm": 0.6113570332527161, - "learning_rate": 0.0005135715056664868, - "loss": 3.7094, + "grad_norm": 0.6018847227096558, + "learning_rate": 0.0005135779816513762, + "loss": 3.7117, "step": 13450 }, { "epoch": 1.4555256064690028, - "grad_norm": 0.528312087059021, - "learning_rate": 0.0005132477064220183, - "loss": 3.6953, + "grad_norm": 0.5335890650749207, + "learning_rate": 0.0005132541824069076, + "loss": 3.6938, "step": 13500 }, { "epoch": 1.4609164420485174, - "grad_norm": 0.6123299598693848, - "learning_rate": 0.0005129239071775499, - "loss": 3.7059, + "grad_norm": 0.5770127177238464, + "learning_rate": 0.0005129303831624393, + "loss": 3.7058, "step": 13550 }, { "epoch": 1.4663072776280324, - "grad_norm": 0.5983057618141174, - "learning_rate": 0.0005126001079330814, - "loss": 3.7097, + "grad_norm": 0.5771490335464478, + "learning_rate": 0.0005126065839179708, + "loss": 3.7089, "step": 13600 }, { "epoch": 1.4716981132075473, - "grad_norm": 0.5747043490409851, - "learning_rate": 0.000512276308688613, - "loss": 3.6826, + "grad_norm": 0.620576024055481, + "learning_rate": 0.0005122827846735024, + "loss": 3.6832, "step": 13650 }, { "epoch": 1.477088948787062, - "grad_norm": 0.5804892182350159, - "learning_rate": 0.0005119525094441446, - "loss": 3.6802, + "grad_norm": 0.5802292227745056, + "learning_rate": 0.0005119589854290339, + "loss": 3.6798, "step": 13700 }, { "epoch": 1.482479784366577, - "grad_norm": 0.5395514369010925, - "learning_rate": 0.0005116287101996762, - "loss": 3.6981, + "grad_norm": 0.603459358215332, + "learning_rate": 0.0005116351861845655, + "loss": 3.6995, "step": 13750 }, { "epoch": 1.4878706199460916, - "grad_norm": 0.635943591594696, - "learning_rate": 0.0005113049109552077, - "loss": 3.7122, + "grad_norm": 0.5906022787094116, + "learning_rate": 0.0005113113869400971, + "loss": 3.7129, "step": 13800 }, { "epoch": 1.4932614555256065, - "grad_norm": 0.6453544497489929, - "learning_rate": 0.0005109811117107393, - "loss": 3.7088, + "grad_norm": 0.580629289150238, + "learning_rate": 0.0005109875876956287, + "loss": 3.707, "step": 13850 }, { "epoch": 1.4986522911051212, - "grad_norm": 0.5905641913414001, - "learning_rate": 0.0005106573124662708, - "loss": 3.7223, + "grad_norm": 0.6544400453567505, + "learning_rate": 0.0005106637884511602, + "loss": 3.7212, "step": 13900 }, { "epoch": 1.5040431266846361, - "grad_norm": 0.5349699854850769, - "learning_rate": 0.0005103335132218025, - "loss": 3.6964, + "grad_norm": 0.5429078936576843, + "learning_rate": 0.0005103399892066918, + "loss": 3.6957, "step": 13950 }, { "epoch": 1.509433962264151, - "grad_norm": 0.6119936108589172, - "learning_rate": 0.000510009713977334, - "loss": 3.697, + "grad_norm": 0.5718664526939392, + "learning_rate": 0.0005100161899622234, + "loss": 3.6961, "step": 14000 }, { "epoch": 1.509433962264151, - "eval_accuracy": 0.35438225889571073, - "eval_loss": 3.6525440216064453, - "eval_runtime": 184.8462, - "eval_samples_per_second": 97.438, - "eval_steps_per_second": 6.092, + "eval_accuracy": 0.35413159668352273, + "eval_loss": 3.6542694568634033, + "eval_runtime": 185.2098, + "eval_samples_per_second": 97.246, + "eval_steps_per_second": 6.08, "step": 14000 }, { "epoch": 1.5148247978436657, - "grad_norm": 0.549912691116333, - "learning_rate": 0.0005096859147328656, - "loss": 3.6853, + "grad_norm": 0.530343770980835, + "learning_rate": 0.000509692390717755, + "loss": 3.6869, "step": 14050 }, { "epoch": 1.5202156334231804, - "grad_norm": 0.6812787055969238, + "grad_norm": 0.6733381152153015, "learning_rate": 0.0005093685914732865, - "loss": 3.6843, + "loss": 3.6809, "step": 14100 }, { "epoch": 1.5256064690026954, - "grad_norm": 0.5409011840820312, + "grad_norm": 0.5676352977752686, "learning_rate": 0.0005090447922288181, - "loss": 3.6802, + "loss": 3.6794, "step": 14150 }, { "epoch": 1.5309973045822103, - "grad_norm": 0.5610489845275879, + "grad_norm": 0.5399664640426636, "learning_rate": 0.0005087209929843496, - "loss": 3.7142, + "loss": 3.7143, "step": 14200 }, { "epoch": 1.536388140161725, - "grad_norm": 0.5780990123748779, + "grad_norm": 0.6497036218643188, "learning_rate": 0.0005083971937398812, - "loss": 3.6901, + "loss": 3.6866, "step": 14250 }, { "epoch": 1.54177897574124, - "grad_norm": 0.5896368026733398, + "grad_norm": 0.6060945391654968, "learning_rate": 0.0005080733944954127, - "loss": 3.6923, + "loss": 3.6915, "step": 14300 }, { "epoch": 1.5471698113207548, - "grad_norm": 0.5635419487953186, + "grad_norm": 0.6297807693481445, "learning_rate": 0.0005077495952509444, - "loss": 3.6735, + "loss": 3.6741, "step": 14350 }, { "epoch": 1.5525606469002695, - "grad_norm": 0.5706263184547424, + "grad_norm": 0.5488743782043457, "learning_rate": 0.0005074257960064759, - "loss": 3.6872, + "loss": 3.6868, "step": 14400 }, { "epoch": 1.5579514824797842, - "grad_norm": 0.5970407724380493, + "grad_norm": 0.5748883485794067, "learning_rate": 0.0005071019967620075, - "loss": 3.6924, + "loss": 3.6918, "step": 14450 }, { "epoch": 1.5633423180592994, - "grad_norm": 0.5763431787490845, + "grad_norm": 0.5836804509162903, "learning_rate": 0.000506778197517539, - "loss": 3.6854, + "loss": 3.6834, "step": 14500 }, { "epoch": 1.568733153638814, - "grad_norm": 0.6334835290908813, + "grad_norm": 0.612476110458374, "learning_rate": 0.0005064543982730707, - "loss": 3.6955, + "loss": 3.6924, "step": 14550 }, { "epoch": 1.5741239892183287, - "grad_norm": 0.5830768942832947, + "grad_norm": 0.6098917126655579, "learning_rate": 0.0005061305990286023, - "loss": 3.6875, + "loss": 3.6865, "step": 14600 }, { "epoch": 1.5795148247978437, - "grad_norm": 0.6113629341125488, + "grad_norm": 0.6164373755455017, "learning_rate": 0.0005058067997841338, - "loss": 3.6957, + "loss": 3.6947, "step": 14650 }, { "epoch": 1.5849056603773586, - "grad_norm": 0.5635071992874146, + "grad_norm": 0.5995565056800842, "learning_rate": 0.0005054830005396654, - "loss": 3.6826, + "loss": 3.6821, "step": 14700 }, { "epoch": 1.5902964959568733, - "grad_norm": 0.6076672673225403, + "grad_norm": 0.5815383791923523, "learning_rate": 0.0005051592012951969, - "loss": 3.6742, + "loss": 3.6734, "step": 14750 }, { "epoch": 1.595687331536388, - "grad_norm": 0.6091296076774597, + "grad_norm": 0.5790833830833435, "learning_rate": 0.0005048354020507286, - "loss": 3.6943, + "loss": 3.6948, "step": 14800 }, { "epoch": 1.6010781671159031, - "grad_norm": 0.6177613139152527, + "grad_norm": 0.6062554717063904, "learning_rate": 0.00050451160280626, - "loss": 3.6943, + "loss": 3.6944, "step": 14850 }, { "epoch": 1.6064690026954178, - "grad_norm": 0.5547499060630798, + "grad_norm": 0.5852131247520447, "learning_rate": 0.0005041878035617917, - "loss": 3.6845, + "loss": 3.6853, "step": 14900 }, { "epoch": 1.6118598382749325, - "grad_norm": 0.6204666495323181, + "grad_norm": 0.5513139367103577, "learning_rate": 0.0005038640043173232, - "loss": 3.6796, + "loss": 3.6793, "step": 14950 }, { "epoch": 1.6172506738544474, - "grad_norm": 0.7229707837104797, + "grad_norm": 0.6422461867332458, "learning_rate": 0.0005035402050728548, - "loss": 3.6759, + "loss": 3.6739, "step": 15000 }, { "epoch": 1.6172506738544474, - "eval_accuracy": 0.35614971542178886, - "eval_loss": 3.6349523067474365, - "eval_runtime": 184.7752, - "eval_samples_per_second": 97.475, - "eval_steps_per_second": 6.094, + "eval_accuracy": 0.35602030983240185, + "eval_loss": 3.6368651390075684, + "eval_runtime": 184.8953, + "eval_samples_per_second": 97.412, + "eval_steps_per_second": 6.09, "step": 15000 }, { "epoch": 1.6226415094339623, - "grad_norm": 0.5129961967468262, + "grad_norm": 0.5301791429519653, "learning_rate": 0.0005032164058283863, - "loss": 3.6765, + "loss": 3.6771, "step": 15050 }, { "epoch": 1.628032345013477, - "grad_norm": 0.5776543617248535, + "grad_norm": 0.5744919180870056, "learning_rate": 0.0005028926065839179, - "loss": 3.6737, + "loss": 3.6726, "step": 15100 }, { "epoch": 1.633423180592992, - "grad_norm": 0.5395156145095825, + "grad_norm": 0.6005228161811829, "learning_rate": 0.0005025688073394495, - "loss": 3.6743, + "loss": 3.6734, "step": 15150 }, { "epoch": 1.6388140161725069, - "grad_norm": 0.566416323184967, + "grad_norm": 0.5289723873138428, "learning_rate": 0.0005022450080949811, - "loss": 3.6799, + "loss": 3.681, "step": 15200 }, { "epoch": 1.6442048517520216, - "grad_norm": 0.6514060497283936, + "grad_norm": 0.6079738140106201, "learning_rate": 0.0005019212088505126, - "loss": 3.6803, + "loss": 3.6818, "step": 15250 }, { "epoch": 1.6495956873315363, - "grad_norm": 0.5592887997627258, + "grad_norm": 0.5393943190574646, "learning_rate": 0.0005015974096060442, - "loss": 3.6829, + "loss": 3.6825, "step": 15300 }, { "epoch": 1.6549865229110512, - "grad_norm": 0.5593628287315369, + "grad_norm": 0.5830407738685608, "learning_rate": 0.0005012736103615758, - "loss": 3.6681, + "loss": 3.6678, "step": 15350 }, { "epoch": 1.6603773584905661, - "grad_norm": 0.5848199129104614, + "grad_norm": 0.6033615469932556, "learning_rate": 0.0005009498111171074, - "loss": 3.6728, + "loss": 3.6727, "step": 15400 }, { "epoch": 1.6657681940700808, - "grad_norm": 0.5796011090278625, + "grad_norm": 0.5500646233558655, "learning_rate": 0.0005006260118726389, - "loss": 3.6723, + "loss": 3.67, "step": 15450 }, { "epoch": 1.6711590296495957, - "grad_norm": 0.5999243855476379, + "grad_norm": 0.5812546610832214, "learning_rate": 0.0005003022126281705, - "loss": 3.6724, + "loss": 3.6717, "step": 15500 }, { "epoch": 1.6765498652291106, - "grad_norm": 0.6494645476341248, + "grad_norm": 0.6952954530715942, "learning_rate": 0.000499978413383702, - "loss": 3.667, + "loss": 3.666, "step": 15550 }, { "epoch": 1.6819407008086253, - "grad_norm": 0.592124879360199, + "grad_norm": 0.591118335723877, "learning_rate": 0.0004996546141392336, - "loss": 3.663, + "loss": 3.6629, "step": 15600 }, { "epoch": 1.68733153638814, - "grad_norm": 0.5345093607902527, + "grad_norm": 0.5555945038795471, "learning_rate": 0.0004993308148947651, - "loss": 3.6713, + "loss": 3.6708, "step": 15650 }, { "epoch": 1.692722371967655, - "grad_norm": 0.5050610303878784, + "grad_norm": 0.5302060842514038, "learning_rate": 0.0004990070156502968, - "loss": 3.6604, + "loss": 3.6621, "step": 15700 }, { "epoch": 1.6981132075471699, - "grad_norm": 0.559531033039093, + "grad_norm": 0.5419127941131592, "learning_rate": 0.0004986832164058284, - "loss": 3.6585, + "loss": 3.6563, "step": 15750 }, { "epoch": 1.7035040431266846, - "grad_norm": 0.5738038420677185, + "grad_norm": 0.6328555941581726, "learning_rate": 0.0004983594171613599, - "loss": 3.6718, + "loss": 3.6704, "step": 15800 }, { "epoch": 1.7088948787061995, - "grad_norm": 0.573388934135437, + "grad_norm": 0.5873494744300842, "learning_rate": 0.0004980356179168915, - "loss": 3.6761, + "loss": 3.6774, "step": 15850 }, { "epoch": 1.7142857142857144, - "grad_norm": 0.5734029412269592, + "grad_norm": 0.5820769667625427, "learning_rate": 0.000497711818672423, - "loss": 3.6985, + "loss": 3.6993, "step": 15900 }, { "epoch": 1.719676549865229, - "grad_norm": 0.5339508652687073, + "grad_norm": 0.5341653227806091, "learning_rate": 0.0004973880194279547, - "loss": 3.6647, + "loss": 3.6659, "step": 15950 }, { "epoch": 1.7250673854447438, - "grad_norm": 0.58454430103302, + "grad_norm": 0.5919328927993774, "learning_rate": 0.0004970642201834862, - "loss": 3.6742, + "loss": 3.6728, "step": 16000 }, { "epoch": 1.7250673854447438, - "eval_accuracy": 0.3578629541568476, - "eval_loss": 3.6172311305999756, - "eval_runtime": 185.271, - "eval_samples_per_second": 97.214, - "eval_steps_per_second": 6.078, + "eval_accuracy": 0.357589800796882, + "eval_loss": 3.6153509616851807, + "eval_runtime": 184.7079, + "eval_samples_per_second": 97.511, + "eval_steps_per_second": 6.096, "step": 16000 }, { "epoch": 1.7304582210242587, - "grad_norm": 0.5619959831237793, + "grad_norm": 0.5269375443458557, "learning_rate": 0.0004967404209390178, - "loss": 3.6425, + "loss": 3.641, "step": 16050 }, { "epoch": 1.7358490566037736, - "grad_norm": 0.551400899887085, - "learning_rate": 0.0004964230976794387, - "loss": 3.6503, + "grad_norm": 0.5364742279052734, + "learning_rate": 0.0004964166216945493, + "loss": 3.6507, "step": 16100 }, { "epoch": 1.7412398921832883, - "grad_norm": 0.5677595734596252, - "learning_rate": 0.0004960992984349703, - "loss": 3.6675, + "grad_norm": 0.5592376589775085, + "learning_rate": 0.000496092822450081, + "loss": 3.6648, "step": 16150 }, { "epoch": 1.7466307277628033, - "grad_norm": 0.587281346321106, - "learning_rate": 0.0004957754991905018, - "loss": 3.6526, + "grad_norm": 0.6180751323699951, + "learning_rate": 0.0004957690232056125, + "loss": 3.6529, "step": 16200 }, { "epoch": 1.7520215633423182, - "grad_norm": 0.536168098449707, - "learning_rate": 0.0004954516999460334, - "loss": 3.6623, + "grad_norm": 0.5617481470108032, + "learning_rate": 0.0004954452239611441, + "loss": 3.6616, "step": 16250 }, { "epoch": 1.7574123989218329, - "grad_norm": 0.5161067843437195, - "learning_rate": 0.0004951279007015649, - "loss": 3.6526, + "grad_norm": 0.5292233824729919, + "learning_rate": 0.0004951214247166756, + "loss": 3.652, "step": 16300 }, { "epoch": 1.7628032345013476, - "grad_norm": 0.5492506623268127, - "learning_rate": 0.0004948041014570966, - "loss": 3.6495, + "grad_norm": 0.5463518500328064, + "learning_rate": 0.0004947976254722072, + "loss": 3.6471, "step": 16350 }, { "epoch": 1.7681940700808625, - "grad_norm": 0.5799858570098877, - "learning_rate": 0.0004944803022126281, - "loss": 3.6613, + "grad_norm": 0.5895617008209229, + "learning_rate": 0.0004944738262277387, + "loss": 3.6603, "step": 16400 }, { "epoch": 1.7735849056603774, - "grad_norm": 0.538158655166626, - "learning_rate": 0.0004941565029681597, - "loss": 3.6595, + "grad_norm": 0.541398286819458, + "learning_rate": 0.0004941500269832703, + "loss": 3.6592, "step": 16450 }, { "epoch": 1.778975741239892, - "grad_norm": 0.5593761205673218, - "learning_rate": 0.0004938327037236912, + "grad_norm": 0.5419638752937317, + "learning_rate": 0.0004938262277388019, "loss": 3.6481, "step": 16500 }, { "epoch": 1.784366576819407, - "grad_norm": 0.6031091809272766, - "learning_rate": 0.0004935089044792229, - "loss": 3.6445, + "grad_norm": 0.5745832324028015, + "learning_rate": 0.0004935024284943335, + "loss": 3.6433, "step": 16550 }, { "epoch": 1.789757412398922, - "grad_norm": 0.5604456067085266, - "learning_rate": 0.0004931851052347544, - "loss": 3.6512, + "grad_norm": 0.5785013437271118, + "learning_rate": 0.000493178629249865, + "loss": 3.6502, "step": 16600 }, { "epoch": 1.7951482479784366, - "grad_norm": 0.5797213315963745, - "learning_rate": 0.000492861305990286, - "loss": 3.6587, + "grad_norm": 0.5288052558898926, + "learning_rate": 0.0004928548300053966, + "loss": 3.6573, "step": 16650 }, { "epoch": 1.8005390835579513, - "grad_norm": 0.5687010884284973, - "learning_rate": 0.0004925375067458175, - "loss": 3.6505, + "grad_norm": 0.5558264255523682, + "learning_rate": 0.0004925310307609282, + "loss": 3.6499, "step": 16700 }, { "epoch": 1.8059299191374663, - "grad_norm": 0.5328428745269775, - "learning_rate": 0.0004922137075013491, - "loss": 3.6433, + "grad_norm": 0.5273551344871521, + "learning_rate": 0.0004922072315164598, + "loss": 3.6437, "step": 16750 }, { "epoch": 1.8113207547169812, - "grad_norm": 0.5607346892356873, - "learning_rate": 0.0004918899082568807, - "loss": 3.6503, + "grad_norm": 0.5901119709014893, + "learning_rate": 0.0004918834322719913, + "loss": 3.6486, "step": 16800 }, { "epoch": 1.8167115902964959, - "grad_norm": 0.5864443182945251, - "learning_rate": 0.0004915661090124123, - "loss": 3.6385, + "grad_norm": 0.5500622987747192, + "learning_rate": 0.0004915596330275229, + "loss": 3.6377, "step": 16850 }, { "epoch": 1.8221024258760108, - "grad_norm": 0.6052229404449463, - "learning_rate": 0.0004912423097679439, - "loss": 3.6449, + "grad_norm": 0.5930582284927368, + "learning_rate": 0.0004912358337830544, + "loss": 3.6429, "step": 16900 }, { "epoch": 1.8274932614555257, - "grad_norm": 0.542421281337738, - "learning_rate": 0.0004909185105234754, - "loss": 3.6512, + "grad_norm": 0.5709859728813171, + "learning_rate": 0.000490912034538586, + "loss": 3.6515, "step": 16950 }, { "epoch": 1.8328840970350404, - "grad_norm": 0.5311638712882996, - "learning_rate": 0.000490594711279007, - "loss": 3.6581, + "grad_norm": 0.5371534824371338, + "learning_rate": 0.0004905882352941175, + "loss": 3.6584, "step": 17000 }, { "epoch": 1.8328840970350404, - "eval_accuracy": 0.359163529223987, - "eval_loss": 3.602339506149292, - "eval_runtime": 184.8776, - "eval_samples_per_second": 97.421, - "eval_steps_per_second": 6.091, + "eval_accuracy": 0.35917635026474903, + "eval_loss": 3.601609230041504, + "eval_runtime": 185.0779, + "eval_samples_per_second": 97.316, + "eval_steps_per_second": 6.084, "step": 17000 }, { "epoch": 1.838274932614555, - "grad_norm": 0.5611234307289124, - "learning_rate": 0.0004902709120345385, - "loss": 3.6485, + "grad_norm": 0.580851137638092, + "learning_rate": 0.0004902644360496492, + "loss": 3.651, "step": 17050 }, { "epoch": 1.8436657681940702, - "grad_norm": 0.5618613362312317, - "learning_rate": 0.00048994711279007, - "loss": 3.6394, + "grad_norm": 0.5615604519844055, + "learning_rate": 0.0004899406368051808, + "loss": 3.6398, "step": 17100 }, { "epoch": 1.849056603773585, - "grad_norm": 0.6035332679748535, - "learning_rate": 0.0004896233135456017, - "loss": 3.6409, + "grad_norm": 0.6272096037864685, + "learning_rate": 0.0004896168375607123, + "loss": 3.6379, "step": 17150 }, { "epoch": 1.8544474393530996, - "grad_norm": 0.6086268424987793, - "learning_rate": 0.0004892995143011333, - "loss": 3.6381, + "grad_norm": 0.5532569289207458, + "learning_rate": 0.0004892930383162439, + "loss": 3.6385, "step": 17200 }, { "epoch": 1.8598382749326146, - "grad_norm": 0.5861483812332153, + "grad_norm": 0.5877029299736023, "learning_rate": 0.0004889757150566648, - "loss": 3.6353, + "loss": 3.6333, "step": 17250 }, { "epoch": 1.8652291105121295, - "grad_norm": 0.5193630456924438, + "grad_norm": 0.5302947163581848, "learning_rate": 0.0004886519158121964, - "loss": 3.6603, + "loss": 3.6591, "step": 17300 }, { "epoch": 1.8706199460916442, - "grad_norm": 0.5715697407722473, + "grad_norm": 0.5727546811103821, "learning_rate": 0.000488328116567728, - "loss": 3.6605, + "loss": 3.6621, "step": 17350 }, { "epoch": 1.8760107816711589, - "grad_norm": 0.5773372054100037, + "grad_norm": 0.6038647890090942, "learning_rate": 0.0004880043173232595, - "loss": 3.6424, + "loss": 3.6402, "step": 17400 }, { "epoch": 1.881401617250674, - "grad_norm": 0.583741307258606, + "grad_norm": 0.554803192615509, "learning_rate": 0.0004876805180787911, - "loss": 3.6527, + "loss": 3.6514, "step": 17450 }, { "epoch": 1.8867924528301887, - "grad_norm": 0.5900930762290955, + "grad_norm": 0.5694112181663513, "learning_rate": 0.0004873567188343227, - "loss": 3.6337, + "loss": 3.6342, "step": 17500 }, { "epoch": 1.8921832884097034, - "grad_norm": 0.593124508857727, + "grad_norm": 0.5965739488601685, "learning_rate": 0.0004870329195898542, - "loss": 3.6494, + "loss": 3.6502, "step": 17550 }, { "epoch": 1.8975741239892183, - "grad_norm": 0.5326477885246277, + "grad_norm": 0.5624055862426758, "learning_rate": 0.00048670912034538583, - "loss": 3.6383, + "loss": 3.6382, "step": 17600 }, { "epoch": 1.9029649595687332, - "grad_norm": 0.5563052296638489, + "grad_norm": 0.5538950562477112, "learning_rate": 0.0004863853211009174, - "loss": 3.6343, + "loss": 3.6351, "step": 17650 }, { "epoch": 1.908355795148248, - "grad_norm": 0.5851816534996033, + "grad_norm": 0.5740907788276672, "learning_rate": 0.000486061521856449, - "loss": 3.6484, + "loss": 3.6466, "step": 17700 }, { "epoch": 1.9137466307277629, - "grad_norm": 0.538767397403717, + "grad_norm": 0.5794373154640198, "learning_rate": 0.00048573772261198054, - "loss": 3.6369, + "loss": 3.6373, "step": 17750 }, { "epoch": 1.9191374663072778, - "grad_norm": 0.5974882245063782, + "grad_norm": 0.5746911764144897, "learning_rate": 0.00048541392336751214, - "loss": 3.6523, + "loss": 3.652, "step": 17800 }, { "epoch": 1.9245283018867925, - "grad_norm": 0.631157398223877, + "grad_norm": 0.5943441390991211, "learning_rate": 0.0004850901241230437, - "loss": 3.6591, + "loss": 3.6579, "step": 17850 }, { "epoch": 1.9299191374663072, - "grad_norm": 0.6010056734085083, - "learning_rate": 0.0004847663248785753, - "loss": 3.6333, + "grad_norm": 0.5650938153266907, + "learning_rate": 0.00048477280086346464, + "loss": 3.6337, "step": 17900 }, { "epoch": 1.935309973045822, - "grad_norm": 0.5925098061561584, - "learning_rate": 0.0004844425256341068, - "loss": 3.6426, + "grad_norm": 0.5545641779899597, + "learning_rate": 0.00048444900161899614, + "loss": 3.6433, "step": 17950 }, { "epoch": 1.940700808625337, - "grad_norm": 0.6020907163619995, - "learning_rate": 0.00048411872638963834, - "loss": 3.6398, + "grad_norm": 0.6059868931770325, + "learning_rate": 0.00048412520237452774, + "loss": 3.6383, "step": 18000 }, { "epoch": 1.940700808625337, - "eval_accuracy": 0.36062664901129676, - "eval_loss": 3.586404323577881, - "eval_runtime": 184.7776, - "eval_samples_per_second": 97.474, - "eval_steps_per_second": 6.094, + "eval_accuracy": 0.360640556580937, + "eval_loss": 3.586578607559204, + "eval_runtime": 184.65, + "eval_samples_per_second": 97.541, + "eval_steps_per_second": 6.098, "step": 18000 }, { "epoch": 1.9460916442048517, - "grad_norm": 0.5491907596588135, - "learning_rate": 0.00048379492714516995, + "grad_norm": 0.5713542103767395, + "learning_rate": 0.0004838014031300593, "loss": 3.6239, "step": 18050 }, { "epoch": 1.9514824797843666, - "grad_norm": 0.5421905517578125, - "learning_rate": 0.0004834711279007015, - "loss": 3.6544, + "grad_norm": 0.5506201386451721, + "learning_rate": 0.0004834776038855909, + "loss": 3.6535, "step": 18100 }, { "epoch": 1.9568733153638815, - "grad_norm": 0.5486034154891968, + "grad_norm": 0.5446757674217224, "learning_rate": 0.00048315380464112245, - "loss": 3.6525, + "loss": 3.6543, "step": 18150 }, { "epoch": 1.9622641509433962, - "grad_norm": 0.6060916185379028, + "grad_norm": 0.5964956283569336, "learning_rate": 0.00048283000539665405, - "loss": 3.6409, + "loss": 3.641, "step": 18200 }, { "epoch": 1.967654986522911, - "grad_norm": 0.5878822207450867, + "grad_norm": 0.5960961580276489, "learning_rate": 0.0004825062061521856, - "loss": 3.6446, + "loss": 3.6438, "step": 18250 }, { "epoch": 1.9730458221024259, - "grad_norm": 0.5905686020851135, + "grad_norm": 0.5544432401657104, "learning_rate": 0.00048218240690771716, - "loss": 3.6221, + "loss": 3.6236, "step": 18300 }, { "epoch": 1.9784366576819408, - "grad_norm": 0.8368579149246216, + "grad_norm": 0.766299843788147, "learning_rate": 0.00048185860766324876, - "loss": 3.6497, + "loss": 3.6485, "step": 18350 }, { "epoch": 1.9838274932614555, - "grad_norm": 0.5327203869819641, + "grad_norm": 0.5031926035881042, "learning_rate": 0.0004815348084187803, - "loss": 3.6241, + "loss": 3.6232, "step": 18400 }, { "epoch": 1.9892183288409704, - "grad_norm": 0.5534670948982239, + "grad_norm": 0.5916299223899841, "learning_rate": 0.0004812110091743119, - "loss": 3.6272, + "loss": 3.6269, "step": 18450 }, { "epoch": 1.9946091644204853, - "grad_norm": 0.5606032013893127, + "grad_norm": 0.5806682705879211, "learning_rate": 0.00048088720992984347, - "loss": 3.6188, + "loss": 3.618, "step": 18500 }, { "epoch": 2.0, - "grad_norm": 1.0810041427612305, + "grad_norm": 1.0938396453857422, "learning_rate": 0.00048056341068537507, - "loss": 3.6353, + "loss": 3.6348, "step": 18550 }, { "epoch": 2.0053908355795147, - "grad_norm": 0.6048265099525452, + "grad_norm": 0.6288138628005981, "learning_rate": 0.00048023961144090657, - "loss": 3.5478, + "loss": 3.5499, "step": 18600 }, { "epoch": 2.01078167115903, - "grad_norm": 0.6086897253990173, + "grad_norm": 0.5435187816619873, "learning_rate": 0.00047991581219643817, - "loss": 3.5393, + "loss": 3.539, "step": 18650 }, { "epoch": 2.0161725067385445, - "grad_norm": 0.5395742654800415, + "grad_norm": 0.5978911519050598, "learning_rate": 0.0004795920129519697, - "loss": 3.5481, + "loss": 3.5485, "step": 18700 }, { "epoch": 2.0215633423180592, - "grad_norm": 0.5895045399665833, + "grad_norm": 0.5855212211608887, "learning_rate": 0.0004792682137075013, - "loss": 3.5393, + "loss": 3.5387, "step": 18750 }, { "epoch": 2.026954177897574, - "grad_norm": 0.5812539458274841, + "grad_norm": 0.5721486210823059, "learning_rate": 0.0004789444144630329, - "loss": 3.5482, + "loss": 3.5468, "step": 18800 }, { "epoch": 2.032345013477089, - "grad_norm": 0.5699259638786316, + "grad_norm": 0.5630236864089966, "learning_rate": 0.00047862061521856443, - "loss": 3.5458, + "loss": 3.5456, "step": 18850 }, { "epoch": 2.0377358490566038, - "grad_norm": 0.59361332654953, + "grad_norm": 0.5774985551834106, "learning_rate": 0.00047829681597409603, - "loss": 3.5598, + "loss": 3.5596, "step": 18900 }, { "epoch": 2.0431266846361185, - "grad_norm": 0.5848776698112488, + "grad_norm": 0.6049696207046509, "learning_rate": 0.0004779730167296276, - "loss": 3.5539, + "loss": 3.5537, "step": 18950 }, { "epoch": 2.0485175202156336, - "grad_norm": 0.6138538718223572, + "grad_norm": 0.6631167531013489, "learning_rate": 0.0004776492174851592, - "loss": 3.5537, + "loss": 3.5558, "step": 19000 }, { "epoch": 2.0485175202156336, - "eval_accuracy": 0.3620915072448116, - "eval_loss": 3.577709197998047, - "eval_runtime": 184.7136, - "eval_samples_per_second": 97.508, - "eval_steps_per_second": 6.096, + "eval_accuracy": 0.36188202447710527, + "eval_loss": 3.580307960510254, + "eval_runtime": 185.1459, + "eval_samples_per_second": 97.28, + "eval_steps_per_second": 6.082, "step": 19000 }, { "epoch": 2.0539083557951483, - "grad_norm": 0.5327613949775696, + "grad_norm": 0.5472202301025391, "learning_rate": 0.00047732541824069074, - "loss": 3.5637, + "loss": 3.5642, "step": 19050 }, { "epoch": 2.059299191374663, - "grad_norm": 0.5497714281082153, + "grad_norm": 0.5591016411781311, "learning_rate": 0.0004770016189962223, - "loss": 3.5572, + "loss": 3.5543, "step": 19100 }, { "epoch": 2.0646900269541777, - "grad_norm": 0.5397418737411499, + "grad_norm": 0.545282781124115, "learning_rate": 0.0004766778197517539, - "loss": 3.5523, + "loss": 3.5512, "step": 19150 }, { "epoch": 2.070080862533693, - "grad_norm": 0.6005552411079407, + "grad_norm": 0.5512953400611877, "learning_rate": 0.0004763540205072854, - "loss": 3.5718, + "loss": 3.5699, "step": 19200 }, { "epoch": 2.0754716981132075, - "grad_norm": 0.5779317617416382, + "grad_norm": 0.5678845047950745, "learning_rate": 0.00047603022126281705, - "loss": 3.5608, + "loss": 3.5604, "step": 19250 }, { "epoch": 2.0808625336927222, - "grad_norm": 0.5900464057922363, + "grad_norm": 0.5643090605735779, "learning_rate": 0.00047570642201834855, - "loss": 3.5525, + "loss": 3.5518, "step": 19300 }, { "epoch": 2.0862533692722374, - "grad_norm": 0.5555198788642883, + "grad_norm": 0.6203526258468628, "learning_rate": 0.00047538262277388015, - "loss": 3.5668, + "loss": 3.5674, "step": 19350 }, { "epoch": 2.091644204851752, - "grad_norm": 0.5912312269210815, + "grad_norm": 0.5465493202209473, "learning_rate": 0.0004750588235294117, - "loss": 3.5583, + "loss": 3.5609, "step": 19400 }, { "epoch": 2.0970350404312668, - "grad_norm": 0.62127685546875, + "grad_norm": 0.7021563053131104, "learning_rate": 0.0004747350242849433, - "loss": 3.5424, + "loss": 3.5422, "step": 19450 }, { "epoch": 2.1024258760107815, - "grad_norm": 0.5501288175582886, + "grad_norm": 0.5742030739784241, "learning_rate": 0.00047441122504047486, - "loss": 3.5675, + "loss": 3.565, "step": 19500 }, { "epoch": 2.1078167115902966, - "grad_norm": 0.5837997198104858, + "grad_norm": 0.567764163017273, "learning_rate": 0.0004740874257960064, - "loss": 3.5581, + "loss": 3.5572, "step": 19550 }, { "epoch": 2.1132075471698113, - "grad_norm": 0.6498492360115051, + "grad_norm": 0.5916157364845276, "learning_rate": 0.000473763626551538, - "loss": 3.5545, + "loss": 3.5543, "step": 19600 }, { "epoch": 2.118598382749326, - "grad_norm": 0.5806640982627869, + "grad_norm": 0.557542622089386, "learning_rate": 0.00047343982730706956, - "loss": 3.558, + "loss": 3.5573, "step": 19650 }, { "epoch": 2.123989218328841, - "grad_norm": 0.5803947448730469, + "grad_norm": 0.5682848691940308, "learning_rate": 0.00047311602806260117, - "loss": 3.5497, + "loss": 3.5494, "step": 19700 }, { "epoch": 2.129380053908356, - "grad_norm": 0.583768904209137, + "grad_norm": 0.6219768524169922, "learning_rate": 0.0004727922288181327, - "loss": 3.5417, + "loss": 3.5406, "step": 19750 }, { "epoch": 2.1347708894878705, - "grad_norm": 0.5913686156272888, + "grad_norm": 0.6204505562782288, "learning_rate": 0.0004724684295736643, - "loss": 3.5624, + "loss": 3.5623, "step": 19800 }, { "epoch": 2.1401617250673857, - "grad_norm": 0.5883073210716248, + "grad_norm": 0.5561828017234802, "learning_rate": 0.0004721446303291959, "loss": 3.5538, "step": 19850 }, { "epoch": 2.1455525606469004, - "grad_norm": 0.5684877634048462, + "grad_norm": 0.5439440011978149, "learning_rate": 0.0004718208310847275, - "loss": 3.5582, + "loss": 3.5568, "step": 19900 }, { "epoch": 2.150943396226415, - "grad_norm": 0.5947640538215637, + "grad_norm": 0.556121289730072, "learning_rate": 0.000471497031840259, - "loss": 3.5498, + "loss": 3.5496, "step": 19950 }, { "epoch": 2.1563342318059298, - "grad_norm": 0.5927711725234985, + "grad_norm": 0.6231219172477722, "learning_rate": 0.0004711732325957905, - "loss": 3.5547, + "loss": 3.5512, "step": 20000 }, { "epoch": 2.1563342318059298, - "eval_accuracy": 0.3632744112344478, - "eval_loss": 3.5697343349456787, - "eval_runtime": 185.0798, - "eval_samples_per_second": 97.315, - "eval_steps_per_second": 6.084, + "eval_accuracy": 0.3632916870436103, + "eval_loss": 3.5682075023651123, + "eval_runtime": 185.4001, + "eval_samples_per_second": 97.147, + "eval_steps_per_second": 6.073, "step": 20000 }, { "epoch": 2.161725067385445, - "grad_norm": 0.6061388254165649, + "grad_norm": 0.5872836709022522, "learning_rate": 0.00047084943335132213, - "loss": 3.555, + "loss": 3.5553, "step": 20050 }, { "epoch": 2.1671159029649596, - "grad_norm": 0.6365026235580444, + "grad_norm": 0.6557894945144653, "learning_rate": 0.0004705256341068537, - "loss": 3.5514, + "loss": 3.5517, "step": 20100 }, { "epoch": 2.1725067385444743, - "grad_norm": 0.5569615960121155, - "learning_rate": 0.00047020831084727463, - "loss": 3.5625, + "grad_norm": 0.5851044654846191, + "learning_rate": 0.0004702018348623853, + "loss": 3.5631, "step": 20150 }, { "epoch": 2.177897574123989, - "grad_norm": 0.5334418416023254, - "learning_rate": 0.00046988451160280624, - "loss": 3.566, + "grad_norm": 0.5255047678947449, + "learning_rate": 0.00046987803561791684, + "loss": 3.5632, "step": 20200 }, { "epoch": 2.183288409703504, - "grad_norm": 0.5309886336326599, - "learning_rate": 0.0004695607123583378, - "loss": 3.5691, + "grad_norm": 0.541897177696228, + "learning_rate": 0.00046955423637344844, + "loss": 3.5704, "step": 20250 }, { "epoch": 2.188679245283019, - "grad_norm": 0.6989066004753113, - "learning_rate": 0.00046923691311386934, - "loss": 3.5521, + "grad_norm": 0.6552609205245972, + "learning_rate": 0.00046923043712898, + "loss": 3.5505, "step": 20300 }, { "epoch": 2.1940700808625335, - "grad_norm": 0.5249040722846985, - "learning_rate": 0.00046891311386940094, - "loss": 3.5447, + "grad_norm": 0.54781574010849, + "learning_rate": 0.0004689066378845116, + "loss": 3.5429, "step": 20350 }, { "epoch": 2.1994609164420487, - "grad_norm": 0.5840235352516174, - "learning_rate": 0.0004685893146249325, - "loss": 3.5579, + "grad_norm": 0.587614119052887, + "learning_rate": 0.00046858283864004315, + "loss": 3.558, "step": 20400 }, { "epoch": 2.2048517520215634, - "grad_norm": 0.8253178000450134, - "learning_rate": 0.0004682655153804641, - "loss": 3.5529, + "grad_norm": 0.7089626789093018, + "learning_rate": 0.0004682590393955747, + "loss": 3.5531, "step": 20450 }, { "epoch": 2.210242587601078, - "grad_norm": 0.5192914009094238, - "learning_rate": 0.00046794171613599565, - "loss": 3.5688, + "grad_norm": 0.571935772895813, + "learning_rate": 0.0004679352401511063, + "loss": 3.5682, "step": 20500 }, { "epoch": 2.215633423180593, - "grad_norm": 0.6052975058555603, - "learning_rate": 0.00046761791689152725, - "loss": 3.5645, + "grad_norm": 0.5847911834716797, + "learning_rate": 0.0004676114409066378, + "loss": 3.563, "step": 20550 }, { "epoch": 2.221024258760108, - "grad_norm": 0.6332587599754333, - "learning_rate": 0.00046729411764705875, - "loss": 3.5602, + "grad_norm": 0.6640413999557495, + "learning_rate": 0.00046728764166216946, + "loss": 3.5612, "step": 20600 }, { "epoch": 2.2264150943396226, - "grad_norm": 0.5721376538276672, - "learning_rate": 0.0004669703184025904, - "loss": 3.5692, + "grad_norm": 0.5688964128494263, + "learning_rate": 0.00046696384241770095, + "loss": 3.5682, "step": 20650 }, { "epoch": 2.2318059299191373, - "grad_norm": 0.5925312042236328, - "learning_rate": 0.0004666465191581219, - "loss": 3.5511, + "grad_norm": 0.6168888807296753, + "learning_rate": 0.00046664004317323256, + "loss": 3.5507, "step": 20700 }, { "epoch": 2.2371967654986524, - "grad_norm": 0.5780835151672363, - "learning_rate": 0.00046632271991365346, - "loss": 3.5705, + "grad_norm": 0.5358821749687195, + "learning_rate": 0.0004663162439287641, + "loss": 3.5699, "step": 20750 }, { "epoch": 2.242587601078167, - "grad_norm": 0.5989285111427307, - "learning_rate": 0.00046599892066918506, - "loss": 3.564, + "grad_norm": 0.6521897315979004, + "learning_rate": 0.00046599244468429566, + "loss": 3.5645, "step": 20800 }, { "epoch": 2.247978436657682, - "grad_norm": 0.5859040021896362, - "learning_rate": 0.0004656751214247166, - "loss": 3.5667, + "grad_norm": 0.580860435962677, + "learning_rate": 0.00046566864543982726, + "loss": 3.5668, "step": 20850 }, { "epoch": 2.2533692722371965, - "grad_norm": 0.5282529592514038, - "learning_rate": 0.0004653513221802482, - "loss": 3.5562, + "grad_norm": 0.5867982506752014, + "learning_rate": 0.0004653448461953588, + "loss": 3.5576, "step": 20900 }, { "epoch": 2.2587601078167117, - "grad_norm": 0.5917240381240845, - "learning_rate": 0.00046502752293577977, - "loss": 3.5576, + "grad_norm": 0.5754712224006653, + "learning_rate": 0.0004650210469508904, + "loss": 3.5556, "step": 20950 }, { "epoch": 2.2641509433962264, - "grad_norm": 0.5365268588066101, - "learning_rate": 0.00046470372369131137, - "loss": 3.5552, + "grad_norm": 0.5653140544891357, + "learning_rate": 0.00046469724770642197, + "loss": 3.5558, "step": 21000 }, { "epoch": 2.2641509433962264, - "eval_accuracy": 0.36426271790200837, - "eval_loss": 3.5592100620269775, - "eval_runtime": 184.8544, - "eval_samples_per_second": 97.433, - "eval_steps_per_second": 6.091, + "eval_accuracy": 0.3638638531508414, + "eval_loss": 3.5593972206115723, + "eval_runtime": 184.9946, + "eval_samples_per_second": 97.36, + "eval_steps_per_second": 6.087, "step": 21000 }, { "epoch": 2.269541778975741, - "grad_norm": 0.6421787142753601, - "learning_rate": 0.0004643799244468429, - "loss": 3.5506, + "grad_norm": 0.6279696226119995, + "learning_rate": 0.0004643734484619536, + "loss": 3.5504, "step": 21050 }, { "epoch": 2.274932614555256, - "grad_norm": 0.6219934821128845, - "learning_rate": 0.0004640561252023745, - "loss": 3.5415, + "grad_norm": 0.6201480627059937, + "learning_rate": 0.0004640496492174851, + "loss": 3.5406, "step": 21100 }, { "epoch": 2.280323450134771, - "grad_norm": 0.5714690089225769, - "learning_rate": 0.0004637323259579061, - "loss": 3.5694, + "grad_norm": 0.6064199805259705, + "learning_rate": 0.00046372584997301673, + "loss": 3.5679, "step": 21150 }, { "epoch": 2.2857142857142856, - "grad_norm": 0.5706064701080322, - "learning_rate": 0.0004634085267134376, - "loss": 3.548, + "grad_norm": 0.5709848999977112, + "learning_rate": 0.0004634020507285483, + "loss": 3.5475, "step": 21200 }, { "epoch": 2.2911051212938007, - "grad_norm": 0.5707243084907532, - "learning_rate": 0.00046308472746896923, + "grad_norm": 0.5898020267486572, + "learning_rate": 0.0004630782514840798, "loss": 3.5456, "step": 21250 }, { "epoch": 2.2964959568733154, - "grad_norm": 0.5919787883758545, - "learning_rate": 0.00046276092822450073, - "loss": 3.5567, + "grad_norm": 0.6298962235450745, + "learning_rate": 0.0004627544522396114, + "loss": 3.5555, "step": 21300 }, { "epoch": 2.30188679245283, - "grad_norm": 0.581691563129425, - "learning_rate": 0.00046243712898003233, - "loss": 3.5333, + "grad_norm": 0.5597742795944214, + "learning_rate": 0.00046243065299514293, + "loss": 3.5329, "step": 21350 }, { "epoch": 2.3072776280323453, - "grad_norm": 0.5467907786369324, - "learning_rate": 0.0004621133297355639, - "loss": 3.5726, + "grad_norm": 0.5654948949813843, + "learning_rate": 0.00046210685375067454, + "loss": 3.5718, "step": 21400 }, { "epoch": 2.31266846361186, - "grad_norm": 0.5869949460029602, - "learning_rate": 0.0004617895304910955, - "loss": 3.5297, + "grad_norm": 0.6111935377120972, + "learning_rate": 0.0004617830545062061, + "loss": 3.53, "step": 21450 }, { "epoch": 2.3180592991913747, - "grad_norm": 0.546911895275116, - "learning_rate": 0.00046146573124662704, - "loss": 3.5624, + "grad_norm": 0.5851426720619202, + "learning_rate": 0.0004614592552617377, + "loss": 3.5606, "step": 21500 }, { "epoch": 2.3234501347708894, - "grad_norm": 0.5730652213096619, - "learning_rate": 0.00046114193200215864, - "loss": 3.5607, + "grad_norm": 0.5756331086158752, + "learning_rate": 0.00046113545601726924, + "loss": 3.5628, "step": 21550 }, { "epoch": 2.3288409703504045, - "grad_norm": 0.5786536335945129, - "learning_rate": 0.0004608181327576902, - "loss": 3.5352, + "grad_norm": 0.5970633625984192, + "learning_rate": 0.00046081165677280085, + "loss": 3.5357, "step": 21600 }, { "epoch": 2.334231805929919, - "grad_norm": 0.5828068852424622, - "learning_rate": 0.00046049433351322175, - "loss": 3.5558, + "grad_norm": 0.6157798171043396, + "learning_rate": 0.0004604878575283324, + "loss": 3.5545, "step": 21650 }, { "epoch": 2.339622641509434, - "grad_norm": 0.57169508934021, - "learning_rate": 0.00046017053426875335, - "loss": 3.5672, + "grad_norm": 0.5729793906211853, + "learning_rate": 0.00046016405828386395, + "loss": 3.567, "step": 21700 }, { "epoch": 2.3450134770889486, - "grad_norm": 0.6271734237670898, - "learning_rate": 0.0004598467350242849, - "loss": 3.55, + "grad_norm": 0.6059099435806274, + "learning_rate": 0.00045984025903939555, + "loss": 3.5498, "step": 21750 }, { "epoch": 2.3504043126684637, - "grad_norm": 0.589015543460846, - "learning_rate": 0.0004595229357798165, - "loss": 3.5597, + "grad_norm": 0.5564419031143188, + "learning_rate": 0.0004595164597949271, + "loss": 3.561, "step": 21800 }, { "epoch": 2.3557951482479784, - "grad_norm": 0.5915622115135193, - "learning_rate": 0.00045919913653534806, - "loss": 3.561, + "grad_norm": 0.6240026950836182, + "learning_rate": 0.0004591926605504587, + "loss": 3.5613, "step": 21850 }, { "epoch": 2.361185983827493, - "grad_norm": 0.5321695804595947, - "learning_rate": 0.00045887533729087966, - "loss": 3.5536, + "grad_norm": 0.609406590461731, + "learning_rate": 0.0004588688613059902, + "loss": 3.555, "step": 21900 }, { "epoch": 2.3665768194070083, - "grad_norm": 0.5577532649040222, - "learning_rate": 0.00045855153804641116, - "loss": 3.5468, + "grad_norm": 0.5573409795761108, + "learning_rate": 0.00045854506206152186, + "loss": 3.5453, "step": 21950 }, { "epoch": 2.371967654986523, - "grad_norm": 0.5654785633087158, + "grad_norm": 0.5612466335296631, "learning_rate": 0.0004582277388019427, - "loss": 3.5389, + "loss": 3.5396, "step": 22000 }, { "epoch": 2.371967654986523, - "eval_accuracy": 0.36546517941145124, - "eval_loss": 3.5472066402435303, - "eval_runtime": 184.8497, - "eval_samples_per_second": 97.436, - "eval_steps_per_second": 6.091, + "eval_accuracy": 0.36513335349206577, + "eval_loss": 3.549255132675171, + "eval_runtime": 185.1446, + "eval_samples_per_second": 97.281, + "eval_steps_per_second": 6.082, "step": 22000 }, { "epoch": 2.3773584905660377, - "grad_norm": 0.5921724438667297, + "grad_norm": 0.6105454564094543, "learning_rate": 0.0004579039395574743, - "loss": 3.5423, + "loss": 3.5424, "step": 22050 }, { "epoch": 2.382749326145553, - "grad_norm": 0.5459533929824829, + "grad_norm": 0.5819680690765381, "learning_rate": 0.00045758014031300586, - "loss": 3.5607, + "loss": 3.5612, "step": 22100 }, { "epoch": 2.3881401617250675, - "grad_norm": 0.5743412971496582, + "grad_norm": 0.5976172089576721, "learning_rate": 0.00045725634106853747, - "loss": 3.5526, + "loss": 3.552, "step": 22150 }, { "epoch": 2.393530997304582, - "grad_norm": 0.5897930264472961, - "learning_rate": 0.0004569390178089584, - "loss": 3.5395, + "grad_norm": 0.5951831936836243, + "learning_rate": 0.000456932541824069, + "loss": 3.5414, "step": 22200 }, { "epoch": 2.398921832884097, - "grad_norm": 0.7762314677238464, - "learning_rate": 0.00045661521856448997, - "loss": 3.5346, + "grad_norm": 0.785123348236084, + "learning_rate": 0.0004566087425796006, + "loss": 3.534, "step": 22250 }, { "epoch": 2.404312668463612, - "grad_norm": 0.5821401476860046, - "learning_rate": 0.0004562914193200216, - "loss": 3.5631, + "grad_norm": 0.5873304009437561, + "learning_rate": 0.0004562849433351322, + "loss": 3.5623, "step": 22300 }, { "epoch": 2.4097035040431267, - "grad_norm": 0.6131104826927185, - "learning_rate": 0.0004559676200755531, - "loss": 3.5702, + "grad_norm": 0.5963989496231079, + "learning_rate": 0.0004559611440906638, + "loss": 3.5714, "step": 22350 }, { "epoch": 2.4150943396226414, - "grad_norm": 0.64360111951828, - "learning_rate": 0.0004556438208310847, - "loss": 3.5555, + "grad_norm": 0.6528258323669434, + "learning_rate": 0.00045563734484619533, + "loss": 3.5552, "step": 22400 }, { "epoch": 2.420485175202156, - "grad_norm": 0.5733264684677124, - "learning_rate": 0.0004553200215866163, - "loss": 3.5617, + "grad_norm": 0.5636788606643677, + "learning_rate": 0.0004553135456017269, + "loss": 3.5598, "step": 22450 }, { "epoch": 2.4258760107816713, - "grad_norm": 0.6092203855514526, - "learning_rate": 0.00045499622234214783, - "loss": 3.5574, + "grad_norm": 0.6383706331253052, + "learning_rate": 0.0004549897463572585, + "loss": 3.5585, "step": 22500 }, { "epoch": 2.431266846361186, - "grad_norm": 0.5596003532409668, - "learning_rate": 0.00045467242309767944, - "loss": 3.5368, + "grad_norm": 0.5658510327339172, + "learning_rate": 0.00045466594711279, + "loss": 3.5387, "step": 22550 }, { "epoch": 2.4366576819407006, - "grad_norm": 0.5687330961227417, - "learning_rate": 0.00045434862385321093, - "loss": 3.5544, + "grad_norm": 0.6189313530921936, + "learning_rate": 0.00045434214786832164, + "loss": 3.5571, "step": 22600 }, { "epoch": 2.442048517520216, - "grad_norm": 0.5486346483230591, - "learning_rate": 0.0004540248246087426, - "loss": 3.5514, + "grad_norm": 0.5614275336265564, + "learning_rate": 0.00045401834862385314, + "loss": 3.5504, "step": 22650 }, { "epoch": 2.4474393530997305, - "grad_norm": 0.5911766290664673, - "learning_rate": 0.0004537010253642741, - "loss": 3.558, + "grad_norm": 0.6230819225311279, + "learning_rate": 0.00045369454937938474, + "loss": 3.5575, "step": 22700 }, { "epoch": 2.452830188679245, - "grad_norm": 0.6131563186645508, - "learning_rate": 0.00045337722611980564, - "loss": 3.5475, + "grad_norm": 0.6143561005592346, + "learning_rate": 0.0004533707501349163, + "loss": 3.5467, "step": 22750 }, { "epoch": 2.4582210242587603, - "grad_norm": 0.5406696796417236, - "learning_rate": 0.00045305342687533724, - "loss": 3.5442, + "grad_norm": 0.5280505418777466, + "learning_rate": 0.0004530469508904479, + "loss": 3.5427, "step": 22800 }, { "epoch": 2.463611859838275, - "grad_norm": 0.6020655632019043, - "learning_rate": 0.0004527296276308688, - "loss": 3.5327, + "grad_norm": 0.5940893888473511, + "learning_rate": 0.00045272315164597945, + "loss": 3.535, "step": 22850 }, { "epoch": 2.4690026954177897, - "grad_norm": 0.6064804792404175, - "learning_rate": 0.0004524058283864004, - "loss": 3.5471, + "grad_norm": 0.5543529391288757, + "learning_rate": 0.000452399352401511, + "loss": 3.5475, "step": 22900 }, { "epoch": 2.4743935309973044, - "grad_norm": 0.6027748584747314, - "learning_rate": 0.00045208202914193195, - "loss": 3.5323, + "grad_norm": 0.630915641784668, + "learning_rate": 0.0004520755531570426, + "loss": 3.5332, "step": 22950 }, { "epoch": 2.4797843665768196, - "grad_norm": 0.6128622889518738, - "learning_rate": 0.00045175822989746355, - "loss": 3.5484, + "grad_norm": 0.5720409750938416, + "learning_rate": 0.00045175175391257415, + "loss": 3.55, "step": 23000 }, { "epoch": 2.4797843665768196, - "eval_accuracy": 0.3659796508352528, - "eval_loss": 3.539062261581421, - "eval_runtime": 185.3247, - "eval_samples_per_second": 97.186, - "eval_steps_per_second": 6.076, + "eval_accuracy": 0.36602463313080796, + "eval_loss": 3.5372166633605957, + "eval_runtime": 185.2165, + "eval_samples_per_second": 97.243, + "eval_steps_per_second": 6.079, "step": 23000 }, { "epoch": 2.4851752021563343, - "grad_norm": 0.6107762455940247, - "learning_rate": 0.0004514344306529951, - "loss": 3.5545, + "grad_norm": 0.6525805592536926, + "learning_rate": 0.00045142795466810576, + "loss": 3.5552, "step": 23050 }, { "epoch": 2.490566037735849, - "grad_norm": 0.5848401784896851, - "learning_rate": 0.0004511106314085267, - "loss": 3.5517, + "grad_norm": 0.5877313613891602, + "learning_rate": 0.0004511041554236373, + "loss": 3.5516, "step": 23100 }, { "epoch": 2.4959568733153636, - "grad_norm": 0.5667966604232788, - "learning_rate": 0.00045078683216405826, - "loss": 3.5641, + "grad_norm": 0.6073674559593201, + "learning_rate": 0.0004507803561791689, + "loss": 3.5651, "step": 23150 }, { "epoch": 2.501347708894879, - "grad_norm": 0.6321859359741211, - "learning_rate": 0.00045046303291958976, - "loss": 3.556, + "grad_norm": 0.6147558689117432, + "learning_rate": 0.00045045655693470046, + "loss": 3.5572, "step": 23200 }, { "epoch": 2.5067385444743935, - "grad_norm": 0.5637291669845581, - "learning_rate": 0.0004501392336751214, - "loss": 3.5556, + "grad_norm": 0.5562535524368286, + "learning_rate": 0.00045013275769023207, + "loss": 3.5547, "step": 23250 }, { "epoch": 2.512129380053908, - "grad_norm": 0.5089269280433655, - "learning_rate": 0.0004498154344306529, - "loss": 3.5528, + "grad_norm": 0.5438495874404907, + "learning_rate": 0.00044980895844576356, + "loss": 3.5517, "step": 23300 }, { "epoch": 2.5175202156334233, - "grad_norm": 0.5717602968215942, - "learning_rate": 0.0004494916351861845, - "loss": 3.5522, + "grad_norm": 0.5914117693901062, + "learning_rate": 0.0004494851592012951, + "loss": 3.5508, "step": 23350 }, { "epoch": 2.522911051212938, - "grad_norm": 0.5922253727912903, - "learning_rate": 0.00044916783594171607, - "loss": 3.5279, + "grad_norm": 0.593285322189331, + "learning_rate": 0.0004491613599568267, + "loss": 3.529, "step": 23400 }, { "epoch": 2.5283018867924527, - "grad_norm": 0.6174511909484863, - "learning_rate": 0.00044884403669724767, - "loss": 3.5454, + "grad_norm": 0.5920347571372986, + "learning_rate": 0.00044883756071235827, + "loss": 3.546, "step": 23450 }, { "epoch": 2.533692722371968, - "grad_norm": 0.5651429891586304, - "learning_rate": 0.0004485202374527792, - "loss": 3.541, + "grad_norm": 0.5992326736450195, + "learning_rate": 0.0004485137614678899, + "loss": 3.5419, "step": 23500 }, { "epoch": 2.5390835579514826, - "grad_norm": 0.5340074896812439, - "learning_rate": 0.00044819643820831083, - "loss": 3.5434, + "grad_norm": 0.5436864495277405, + "learning_rate": 0.0004481899622234214, + "loss": 3.5438, "step": 23550 }, { "epoch": 2.5444743935309972, - "grad_norm": 0.5876573920249939, - "learning_rate": 0.0004478726389638424, - "loss": 3.5491, + "grad_norm": 0.6082063913345337, + "learning_rate": 0.00044786616297895303, + "loss": 3.5472, "step": 23600 }, { "epoch": 2.5498652291105124, - "grad_norm": 0.544531524181366, - "learning_rate": 0.00044754883971937393, - "loss": 3.5358, + "grad_norm": 0.5552800893783569, + "learning_rate": 0.0004475423637344846, + "loss": 3.5362, "step": 23650 }, { "epoch": 2.555256064690027, - "grad_norm": 0.5737462043762207, - "learning_rate": 0.00044722504047490553, - "loss": 3.5247, + "grad_norm": 0.5726051926612854, + "learning_rate": 0.00044721856449001613, + "loss": 3.5254, "step": 23700 }, { "epoch": 2.560646900269542, - "grad_norm": 0.5849447846412659, - "learning_rate": 0.0004469012412304371, - "loss": 3.5321, + "grad_norm": 0.6111428737640381, + "learning_rate": 0.00044689476524554774, + "loss": 3.5348, "step": 23750 }, { "epoch": 2.5660377358490565, - "grad_norm": 0.548485517501831, - "learning_rate": 0.0004465774419859687, - "loss": 3.5518, + "grad_norm": 0.5674059987068176, + "learning_rate": 0.0004465709660010793, + "loss": 3.5519, "step": 23800 }, { "epoch": 2.571428571428571, - "grad_norm": 0.5926434993743896, - "learning_rate": 0.00044625364274150024, - "loss": 3.5384, + "grad_norm": 0.5440058708190918, + "learning_rate": 0.0004462471667566109, + "loss": 3.5381, "step": 23850 }, { "epoch": 2.5768194070080863, - "grad_norm": 0.5925818681716919, - "learning_rate": 0.00044592984349703184, - "loss": 3.5406, + "grad_norm": 0.57765132188797, + "learning_rate": 0.00044592336751214244, + "loss": 3.5399, "step": 23900 }, { "epoch": 2.582210242587601, - "grad_norm": 0.5864969491958618, - "learning_rate": 0.00044560604425256334, - "loss": 3.5313, + "grad_norm": 0.6293656826019287, + "learning_rate": 0.00044559956826767405, + "loss": 3.5326, "step": 23950 }, { "epoch": 2.5876010781671157, - "grad_norm": 0.5714327096939087, + "grad_norm": 0.5684266686439514, "learning_rate": 0.000445282245008095, - "loss": 3.5354, + "loss": 3.5369, "step": 24000 }, { "epoch": 2.5876010781671157, - "eval_accuracy": 0.36750046530599206, - "eval_loss": 3.5272812843322754, - "eval_runtime": 184.8982, - "eval_samples_per_second": 97.41, - "eval_steps_per_second": 6.09, + "eval_accuracy": 0.3672214446900845, + "eval_loss": 3.5291759967803955, + "eval_runtime": 184.7979, + "eval_samples_per_second": 97.463, + "eval_steps_per_second": 6.093, "step": 24000 }, { "epoch": 2.592991913746631, - "grad_norm": 0.5608143210411072, + "grad_norm": 0.5553839802742004, "learning_rate": 0.0004449584457636265, - "loss": 3.5616, + "loss": 3.5619, "step": 24050 }, { "epoch": 2.5983827493261455, - "grad_norm": 0.6214185357093811, + "grad_norm": 0.6244831681251526, "learning_rate": 0.00044463464651915805, - "loss": 3.5359, + "loss": 3.536, "step": 24100 }, { "epoch": 2.6037735849056602, - "grad_norm": 0.561044454574585, + "grad_norm": 0.5737437009811401, "learning_rate": 0.00044431084727468965, - "loss": 3.5475, + "loss": 3.5483, "step": 24150 }, { "epoch": 2.6091644204851754, - "grad_norm": 0.5782501101493835, - "learning_rate": 0.0004439935240151106, - "loss": 3.5338, + "grad_norm": 0.6138004660606384, + "learning_rate": 0.0004439870480302212, + "loss": 3.5341, "step": 24200 }, { "epoch": 2.61455525606469, - "grad_norm": 0.5893598198890686, - "learning_rate": 0.00044366972477064215, - "loss": 3.5526, + "grad_norm": 0.5860102772712708, + "learning_rate": 0.0004436632487857528, + "loss": 3.5505, "step": 24250 }, { "epoch": 2.6199460916442048, - "grad_norm": 0.6279098987579346, - "learning_rate": 0.00044334592552617376, - "loss": 3.5458, + "grad_norm": 0.6097062230110168, + "learning_rate": 0.00044333944954128436, + "loss": 3.5457, "step": 24300 }, { "epoch": 2.62533692722372, - "grad_norm": 0.5761608481407166, - "learning_rate": 0.0004430221262817053, - "loss": 3.5383, + "grad_norm": 0.6122974157333374, + "learning_rate": 0.00044301565029681596, + "loss": 3.538, "step": 24350 }, { "epoch": 2.6307277628032346, - "grad_norm": 0.5751183032989502, - "learning_rate": 0.00044269832703723686, - "loss": 3.5401, + "grad_norm": 0.5906981825828552, + "learning_rate": 0.0004426918510523475, + "loss": 3.5389, "step": 24400 }, { "epoch": 2.6361185983827493, - "grad_norm": 0.5772199034690857, - "learning_rate": 0.00044237452779276846, - "loss": 3.5392, + "grad_norm": 0.5892931222915649, + "learning_rate": 0.0004423680518078791, + "loss": 3.5378, "step": 24450 }, { "epoch": 2.641509433962264, - "grad_norm": 0.592406690120697, - "learning_rate": 0.0004420507285483, - "loss": 3.5474, + "grad_norm": 0.5747993588447571, + "learning_rate": 0.00044204425256341067, + "loss": 3.5483, "step": 24500 }, { "epoch": 2.6469002695417787, - "grad_norm": 0.6260506510734558, - "learning_rate": 0.0004417269293038316, - "loss": 3.5316, + "grad_norm": 0.5863109827041626, + "learning_rate": 0.0004417204533189422, + "loss": 3.5317, "step": 24550 }, { "epoch": 2.652291105121294, - "grad_norm": 0.603329062461853, - "learning_rate": 0.0004414031300593631, - "loss": 3.5403, + "grad_norm": 0.6000504493713379, + "learning_rate": 0.0004413966540744738, + "loss": 3.5408, "step": 24600 }, { "epoch": 2.6576819407008085, - "grad_norm": 0.5417729020118713, - "learning_rate": 0.0004410793308148948, - "loss": 3.5352, + "grad_norm": 0.5356788635253906, + "learning_rate": 0.0004410728548300053, + "loss": 3.536, "step": 24650 }, { "epoch": 2.6630727762803232, - "grad_norm": 0.5827444791793823, - "learning_rate": 0.00044075553157042627, - "loss": 3.5417, + "grad_norm": 0.5767205953598022, + "learning_rate": 0.0004407490555855369, + "loss": 3.5419, "step": 24700 }, { "epoch": 2.6684636118598384, - "grad_norm": 0.5908746719360352, - "learning_rate": 0.0004404317323259579, - "loss": 3.5581, + "grad_norm": 0.5705932974815369, + "learning_rate": 0.0004404252563410685, + "loss": 3.5596, "step": 24750 }, { "epoch": 2.673854447439353, - "grad_norm": 0.5514307618141174, - "learning_rate": 0.0004401079330814894, - "loss": 3.5529, + "grad_norm": 0.5411615967750549, + "learning_rate": 0.0004401014570966001, + "loss": 3.5523, "step": 24800 }, { "epoch": 2.6792452830188678, - "grad_norm": 0.5702772736549377, - "learning_rate": 0.000439784133837021, - "loss": 3.5533, + "grad_norm": 0.5571548342704773, + "learning_rate": 0.00043977765785213163, + "loss": 3.5519, "step": 24850 }, { "epoch": 2.684636118598383, - "grad_norm": 0.6261325478553772, - "learning_rate": 0.0004394603345925526, - "loss": 3.5245, + "grad_norm": 0.6669823527336121, + "learning_rate": 0.0004394538586076632, + "loss": 3.5243, "step": 24900 }, { "epoch": 2.6900269541778976, - "grad_norm": 0.602677047252655, - "learning_rate": 0.00043913653534808413, - "loss": 3.5444, + "grad_norm": 0.5549421906471252, + "learning_rate": 0.0004391300593631948, + "loss": 3.5437, "step": 24950 }, { "epoch": 2.6954177897574123, - "grad_norm": 0.5706930160522461, - "learning_rate": 0.00043881273610361574, - "loss": 3.5216, + "grad_norm": 0.6019677519798279, + "learning_rate": 0.00043880626011872634, + "loss": 3.5235, "step": 25000 }, { "epoch": 2.6954177897574123, - "eval_accuracy": 0.3683038447584923, - "eval_loss": 3.5193662643432617, - "eval_runtime": 185.1095, - "eval_samples_per_second": 97.299, - "eval_steps_per_second": 6.083, + "eval_accuracy": 0.3678405488448514, + "eval_loss": 3.520496129989624, + "eval_runtime": 184.864, + "eval_samples_per_second": 97.428, + "eval_steps_per_second": 6.091, "step": 25000 }, { "epoch": 2.7008086253369274, - "grad_norm": 0.5820638537406921, - "learning_rate": 0.0004384889368591473, - "loss": 3.537, + "grad_norm": 0.5758979916572571, + "learning_rate": 0.00043848246087425794, + "loss": 3.5348, "step": 25050 }, { "epoch": 2.706199460916442, - "grad_norm": 0.5792041420936584, - "learning_rate": 0.0004381651376146789, - "loss": 3.5356, + "grad_norm": 0.5502886176109314, + "learning_rate": 0.0004381586616297895, + "loss": 3.538, "step": 25100 }, { "epoch": 2.711590296495957, - "grad_norm": 0.5905164480209351, - "learning_rate": 0.00043784133837021044, - "loss": 3.5398, + "grad_norm": 0.5907125473022461, + "learning_rate": 0.0004378348623853211, + "loss": 3.5389, "step": 25150 }, { "epoch": 2.7169811320754715, - "grad_norm": 0.5989269018173218, - "learning_rate": 0.00043751753912574205, - "loss": 3.5395, + "grad_norm": 0.5703557729721069, + "learning_rate": 0.00043751106314085265, + "loss": 3.5389, "step": 25200 }, { "epoch": 2.7223719676549867, - "grad_norm": 0.5796598196029663, - "learning_rate": 0.0004371937398812736, - "loss": 3.5316, + "grad_norm": 0.605928361415863, + "learning_rate": 0.00043718726389638425, + "loss": 3.5305, "step": 25250 }, { "epoch": 2.7277628032345014, - "grad_norm": 0.5831441283226013, - "learning_rate": 0.0004368699406368051, - "loss": 3.54, + "grad_norm": 0.5767539739608765, + "learning_rate": 0.00043686346465191575, + "loss": 3.5402, "step": 25300 }, { "epoch": 2.733153638814016, - "grad_norm": 0.662450909614563, - "learning_rate": 0.0004365461413923367, - "loss": 3.5337, + "grad_norm": 0.6421152353286743, + "learning_rate": 0.0004365396654074473, + "loss": 3.5344, "step": 25350 }, { "epoch": 2.7385444743935308, - "grad_norm": 0.5832592844963074, - "learning_rate": 0.00043622234214786825, - "loss": 3.5294, + "grad_norm": 0.5596376061439514, + "learning_rate": 0.0004362158661629789, + "loss": 3.5272, "step": 25400 }, { "epoch": 2.743935309973046, - "grad_norm": 0.6044817566871643, - "learning_rate": 0.00043589854290339985, - "loss": 3.5455, + "grad_norm": 0.5788835883140564, + "learning_rate": 0.00043589206691851045, + "loss": 3.545, "step": 25450 }, { "epoch": 2.7493261455525606, - "grad_norm": 0.5936079025268555, - "learning_rate": 0.0004355747436589314, - "loss": 3.5239, + "grad_norm": 0.6065754294395447, + "learning_rate": 0.00043556826767404206, + "loss": 3.5221, "step": 25500 }, { "epoch": 2.7547169811320753, - "grad_norm": 0.6509273052215576, - "learning_rate": 0.000435250944414463, - "loss": 3.5151, + "grad_norm": 0.6557634472846985, + "learning_rate": 0.0004352444684295736, + "loss": 3.5164, "step": 25550 }, { "epoch": 2.7601078167115904, - "grad_norm": 0.6094829440116882, - "learning_rate": 0.00043492714516999456, - "loss": 3.5212, + "grad_norm": 0.5632269382476807, + "learning_rate": 0.0004349206691851052, + "loss": 3.5205, "step": 25600 }, { "epoch": 2.765498652291105, - "grad_norm": 0.6017758846282959, - "learning_rate": 0.0004346033459255261, - "loss": 3.5276, + "grad_norm": 0.7714040875434875, + "learning_rate": 0.00043459686994063676, + "loss": 3.5271, "step": 25650 }, { "epoch": 2.77088948787062, - "grad_norm": 0.5787220597267151, - "learning_rate": 0.0004342795466810577, - "loss": 3.5148, + "grad_norm": 0.572063684463501, + "learning_rate": 0.00043427307069616837, + "loss": 3.5135, "step": 25700 }, { "epoch": 2.776280323450135, - "grad_norm": 0.6034836769104004, - "learning_rate": 0.00043395574743658927, - "loss": 3.5171, + "grad_norm": 0.5755751132965088, + "learning_rate": 0.0004339492714516999, + "loss": 3.5176, "step": 25750 }, { "epoch": 2.7816711590296497, - "grad_norm": 0.6130497455596924, - "learning_rate": 0.00043363194819212087, - "loss": 3.5329, + "grad_norm": 0.5770022869110107, + "learning_rate": 0.00043362547220723147, + "loss": 3.5318, "step": 25800 }, { "epoch": 2.7870619946091644, - "grad_norm": 0.5641265511512756, - "learning_rate": 0.0004333081489476524, - "loss": 3.5179, + "grad_norm": 0.5711796283721924, + "learning_rate": 0.0004333016729627631, + "loss": 3.5175, "step": 25850 }, { "epoch": 2.7924528301886795, - "grad_norm": 0.5724943280220032, - "learning_rate": 0.000432984349703184, - "loss": 3.5203, + "grad_norm": 0.561193585395813, + "learning_rate": 0.0004329778737182946, + "loss": 3.5207, "step": 25900 }, { "epoch": 2.797843665768194, - "grad_norm": 0.5406250357627869, - "learning_rate": 0.0004326605504587155, - "loss": 3.5313, + "grad_norm": 0.5335695743560791, + "learning_rate": 0.00043265407447382623, + "loss": 3.5301, "step": 25950 }, { "epoch": 2.803234501347709, - "grad_norm": 0.6399490833282471, - "learning_rate": 0.0004323367512142472, - "loss": 3.5355, + "grad_norm": 0.6352400779724121, + "learning_rate": 0.0004323302752293577, + "loss": 3.5368, "step": 26000 }, { "epoch": 2.803234501347709, - "eval_accuracy": 0.3691859975546581, - "eval_loss": 3.5100622177124023, - "eval_runtime": 184.655, - "eval_samples_per_second": 97.539, - "eval_steps_per_second": 6.098, + "eval_accuracy": 0.3689961809596462, + "eval_loss": 3.5120186805725098, + "eval_runtime": 185.5238, + "eval_samples_per_second": 97.082, + "eval_steps_per_second": 6.069, "step": 26000 }, { "epoch": 2.8086253369272236, - "grad_norm": 0.6148707866668701, + "grad_norm": 0.6344072222709656, "learning_rate": 0.0004320129519697787, - "loss": 3.5351, + "loss": 3.5355, "step": 26050 }, { "epoch": 2.8140161725067383, - "grad_norm": 0.5590334534645081, + "grad_norm": 0.5527616739273071, "learning_rate": 0.00043168915272531023, - "loss": 3.5176, + "loss": 3.5154, "step": 26100 }, { "epoch": 2.8194070080862534, - "grad_norm": 0.585767388343811, + "grad_norm": 0.5766391754150391, "learning_rate": 0.00043136535348084183, - "loss": 3.5271, + "loss": 3.526, "step": 26150 }, { "epoch": 2.824797843665768, - "grad_norm": 0.6158841252326965, + "grad_norm": 0.6358306407928467, "learning_rate": 0.0004310415542363734, - "loss": 3.5225, + "loss": 3.5227, "step": 26200 }, { "epoch": 2.830188679245283, - "grad_norm": 0.6011167168617249, + "grad_norm": 0.6099495887756348, "learning_rate": 0.000430717754991905, - "loss": 3.5186, + "loss": 3.5171, "step": 26250 }, { "epoch": 2.835579514824798, - "grad_norm": 0.6300170421600342, - "learning_rate": 0.00043040043173232594, - "loss": 3.5321, + "grad_norm": 0.603064775466919, + "learning_rate": 0.00043039395574743654, + "loss": 3.5312, "step": 26300 }, { "epoch": 2.8409703504043127, - "grad_norm": 0.6018158197402954, - "learning_rate": 0.0004300766324878575, - "loss": 3.5369, + "grad_norm": 0.6099294424057007, + "learning_rate": 0.00043007015650296814, + "loss": 3.5359, "step": 26350 }, { "epoch": 2.8463611859838274, - "grad_norm": 0.6889051795005798, - "learning_rate": 0.0004297528332433891, - "loss": 3.524, + "grad_norm": 0.5590049624443054, + "learning_rate": 0.0004297463572584997, + "loss": 3.5232, "step": 26400 }, { "epoch": 2.8517520215633425, - "grad_norm": 0.5744615793228149, - "learning_rate": 0.00042942903399892065, - "loss": 3.5252, + "grad_norm": 0.5742785334587097, + "learning_rate": 0.0004294225580140313, + "loss": 3.523, "step": 26450 }, { "epoch": 2.857142857142857, - "grad_norm": 0.6640810966491699, - "learning_rate": 0.0004291052347544522, - "loss": 3.5226, + "grad_norm": 0.6284919381141663, + "learning_rate": 0.00042909875876956285, + "loss": 3.5214, "step": 26500 }, { "epoch": 2.862533692722372, - "grad_norm": 0.6115678548812866, - "learning_rate": 0.0004287814355099838, - "loss": 3.5145, + "grad_norm": 0.626808762550354, + "learning_rate": 0.0004287749595250944, + "loss": 3.5127, "step": 26550 }, { "epoch": 2.867924528301887, - "grad_norm": 0.5903047919273376, - "learning_rate": 0.0004284576362655153, - "loss": 3.5249, + "grad_norm": 0.5913968682289124, + "learning_rate": 0.000428451160280626, + "loss": 3.5246, "step": 26600 }, { "epoch": 2.8733153638814017, - "grad_norm": 0.5858241319656372, - "learning_rate": 0.00042813383702104696, - "loss": 3.5192, + "grad_norm": 0.5790668725967407, + "learning_rate": 0.0004281273610361575, + "loss": 3.5198, "step": 26650 }, { "epoch": 2.8787061994609164, - "grad_norm": 0.5560069680213928, - "learning_rate": 0.00042781003777657845, - "loss": 3.5193, + "grad_norm": 0.5755479335784912, + "learning_rate": 0.0004278035617916891, + "loss": 3.5196, "step": 26700 }, { "epoch": 2.884097035040431, - "grad_norm": 0.5569444298744202, - "learning_rate": 0.00042748623853211006, - "loss": 3.5385, + "grad_norm": 0.5627382397651672, + "learning_rate": 0.00042747976254722066, + "loss": 3.5359, "step": 26750 }, { "epoch": 2.889487870619946, - "grad_norm": 0.6125451922416687, - "learning_rate": 0.0004271624392876416, - "loss": 3.5248, + "grad_norm": 0.6645321249961853, + "learning_rate": 0.00042715596330275226, + "loss": 3.5238, "step": 26800 }, { "epoch": 2.894878706199461, - "grad_norm": 0.5783609747886658, + "grad_norm": 0.5590260028839111, "learning_rate": 0.00042683864004317316, - "loss": 3.52, + "loss": 3.517, "step": 26850 }, { "epoch": 2.9002695417789757, - "grad_norm": 0.6026281714439392, + "grad_norm": 0.5804741382598877, "learning_rate": 0.00042651484079870476, - "loss": 3.5427, + "loss": 3.5424, "step": 26900 }, { "epoch": 2.9056603773584904, - "grad_norm": 0.5993627905845642, + "grad_norm": 0.5705453157424927, "learning_rate": 0.0004261910415542363, - "loss": 3.5342, + "loss": 3.535, "step": 26950 }, { "epoch": 2.9110512129380055, - "grad_norm": 0.5433017015457153, + "grad_norm": 0.5481330156326294, "learning_rate": 0.0004258672423097679, - "loss": 3.5181, + "loss": 3.5182, "step": 27000 }, { "epoch": 2.9110512129380055, - "eval_accuracy": 0.3701280180920097, - "eval_loss": 3.5018980503082275, - "eval_runtime": 185.0799, - "eval_samples_per_second": 97.315, - "eval_steps_per_second": 6.084, + "eval_accuracy": 0.3699219035638256, + "eval_loss": 3.5017757415771484, + "eval_runtime": 185.4733, + "eval_samples_per_second": 97.108, + "eval_steps_per_second": 6.071, "step": 27000 }, { "epoch": 2.91644204851752, - "grad_norm": 0.6073448657989502, + "grad_norm": 0.5877059102058411, "learning_rate": 0.00042554344306529947, - "loss": 3.5363, + "loss": 3.536, "step": 27050 }, { "epoch": 2.921832884097035, - "grad_norm": 0.5769268274307251, + "grad_norm": 0.5492408275604248, "learning_rate": 0.0004252196438208311, - "loss": 3.5422, + "loss": 3.5394, "step": 27100 }, { "epoch": 2.92722371967655, - "grad_norm": 0.6756018400192261, + "grad_norm": 0.596275806427002, "learning_rate": 0.0004248958445763626, - "loss": 3.5257, + "loss": 3.524, "step": 27150 }, { "epoch": 2.9326145552560647, - "grad_norm": 0.6934609413146973, + "grad_norm": 0.5996169447898865, "learning_rate": 0.00042457204533189423, - "loss": 3.5172, + "loss": 3.5154, "step": 27200 }, { "epoch": 2.9380053908355794, - "grad_norm": 0.558773934841156, + "grad_norm": 0.5572853684425354, "learning_rate": 0.0004242482460874258, - "loss": 3.5289, + "loss": 3.5263, "step": 27250 }, { "epoch": 2.9433962264150946, - "grad_norm": 0.6366307139396667, + "grad_norm": 0.6095960736274719, "learning_rate": 0.0004239244468429573, - "loss": 3.5247, + "loss": 3.5228, "step": 27300 }, { "epoch": 2.9487870619946093, - "grad_norm": 0.6284673810005188, + "grad_norm": 0.6216484308242798, "learning_rate": 0.0004236006475984889, - "loss": 3.5192, + "loss": 3.5175, "step": 27350 }, { "epoch": 2.954177897574124, - "grad_norm": 0.5934546589851379, + "grad_norm": 0.5865620970726013, "learning_rate": 0.00042327684835402043, - "loss": 3.5213, + "loss": 3.5215, "step": 27400 }, { "epoch": 2.9595687331536387, - "grad_norm": 0.5586668848991394, + "grad_norm": 0.553994357585907, "learning_rate": 0.00042295304910955204, - "loss": 3.5181, + "loss": 3.5165, "step": 27450 }, { "epoch": 2.964959568733154, - "grad_norm": 0.5713486075401306, + "grad_norm": 0.557462751865387, "learning_rate": 0.0004226292498650836, - "loss": 3.5242, + "loss": 3.5238, "step": 27500 }, { "epoch": 2.9703504043126685, - "grad_norm": 0.5594163537025452, + "grad_norm": 0.5714412331581116, "learning_rate": 0.0004223054506206152, - "loss": 3.5266, + "loss": 3.5264, "step": 27550 }, { "epoch": 2.975741239892183, - "grad_norm": 0.5783873796463013, + "grad_norm": 0.5819886922836304, "learning_rate": 0.00042198165137614674, - "loss": 3.5273, + "loss": 3.5267, "step": 27600 }, { "epoch": 2.981132075471698, - "grad_norm": 0.5773907899856567, + "grad_norm": 0.5586761236190796, "learning_rate": 0.00042165785213167835, - "loss": 3.5309, + "loss": 3.5297, "step": 27650 }, { "epoch": 2.986522911051213, - "grad_norm": 0.6616211533546448, + "grad_norm": 0.6088539958000183, "learning_rate": 0.0004213340528872099, - "loss": 3.5215, + "loss": 3.52, "step": 27700 }, { "epoch": 2.9919137466307277, - "grad_norm": 0.5861223936080933, + "grad_norm": 0.597719132900238, "learning_rate": 0.00042101025364274145, - "loss": 3.5321, + "loss": 3.5318, "step": 27750 }, { "epoch": 2.9973045822102424, - "grad_norm": 0.6247793436050415, + "grad_norm": 0.6238610148429871, "learning_rate": 0.00042068645439827305, - "loss": 3.5279, + "loss": 3.5263, "step": 27800 }, { "epoch": 3.0026954177897576, - "grad_norm": 0.6077458262443542, + "grad_norm": 0.6423973441123962, "learning_rate": 0.0004203626551538046, - "loss": 3.4729, + "loss": 3.474, "step": 27850 }, { "epoch": 3.0080862533692723, - "grad_norm": 0.5924696326255798, + "grad_norm": 0.6364337801933289, "learning_rate": 0.0004200388559093362, - "loss": 3.4233, + "loss": 3.4241, "step": 27900 }, { "epoch": 3.013477088948787, - "grad_norm": 0.6317850351333618, + "grad_norm": 0.6469203233718872, "learning_rate": 0.0004197150566648677, - "loss": 3.4231, + "loss": 3.4202, "step": 27950 }, { "epoch": 3.018867924528302, - "grad_norm": 0.588413655757904, + "grad_norm": 0.557319164276123, "learning_rate": 0.00041939125742039936, - "loss": 3.4434, + "loss": 3.4403, "step": 28000 }, { "epoch": 3.018867924528302, - "eval_accuracy": 0.37104624364692984, - "eval_loss": 3.4995107650756836, - "eval_runtime": 184.7835, - "eval_samples_per_second": 97.471, - "eval_steps_per_second": 6.094, + "eval_accuracy": 0.3709357436600225, + "eval_loss": 3.4979920387268066, + "eval_runtime": 185.1473, + "eval_samples_per_second": 97.279, + "eval_steps_per_second": 6.082, "step": 28000 }, { "epoch": 3.024258760107817, - "grad_norm": 0.6233013272285461, + "grad_norm": 0.6403119564056396, "learning_rate": 0.00041906745817593086, - "loss": 3.4372, + "loss": 3.4365, "step": 28050 }, { "epoch": 3.0296495956873315, - "grad_norm": 0.6402817964553833, + "grad_norm": 0.5956209301948547, "learning_rate": 0.00041874365893146247, - "loss": 3.444, + "loss": 3.4416, "step": 28100 }, { "epoch": 3.035040431266846, - "grad_norm": 0.5920224189758301, + "grad_norm": 0.6170210838317871, "learning_rate": 0.000418419859686994, - "loss": 3.4329, + "loss": 3.4325, "step": 28150 }, { "epoch": 3.0404312668463613, - "grad_norm": 0.6073600053787231, + "grad_norm": 0.6147815585136414, "learning_rate": 0.00041809606044252557, - "loss": 3.4372, + "loss": 3.4369, "step": 28200 }, { "epoch": 3.045822102425876, - "grad_norm": 0.5874477028846741, + "grad_norm": 0.6116907000541687, "learning_rate": 0.00041777226119805717, - "loss": 3.4204, + "loss": 3.418, "step": 28250 }, { "epoch": 3.0512129380053907, - "grad_norm": 0.5746080279350281, + "grad_norm": 0.5721989870071411, "learning_rate": 0.0004174484619535887, - "loss": 3.4409, + "loss": 3.4393, "step": 28300 }, { "epoch": 3.056603773584906, - "grad_norm": 0.6111629605293274, - "learning_rate": 0.0004171311386940097, - "loss": 3.4156, + "grad_norm": 0.6529187560081482, + "learning_rate": 0.00041712466270912033, + "loss": 3.4149, "step": 28350 }, { "epoch": 3.0619946091644206, - "grad_norm": 0.6025534868240356, - "learning_rate": 0.0004168073394495413, - "loss": 3.4449, + "grad_norm": 0.5911017656326294, + "learning_rate": 0.0004168008634646519, + "loss": 3.4432, "step": 28400 }, { "epoch": 3.0673854447439353, - "grad_norm": 0.6089310050010681, - "learning_rate": 0.00041648354020507283, - "loss": 3.414, + "grad_norm": 0.595206618309021, + "learning_rate": 0.0004164770642201835, + "loss": 3.4124, "step": 28450 }, { "epoch": 3.07277628032345, - "grad_norm": 0.6200700402259827, - "learning_rate": 0.0004161597409606044, - "loss": 3.4394, + "grad_norm": 0.657249927520752, + "learning_rate": 0.00041615326497571503, + "loss": 3.4377, "step": 28500 }, { "epoch": 3.078167115902965, - "grad_norm": 0.6414934992790222, - "learning_rate": 0.000415835941716136, - "loss": 3.4516, + "grad_norm": 0.6252180933952332, + "learning_rate": 0.0004158294657312466, + "loss": 3.4501, "step": 28550 }, { "epoch": 3.08355795148248, - "grad_norm": 0.6194086670875549, - "learning_rate": 0.0004155121424716675, - "loss": 3.4361, + "grad_norm": 0.6360760927200317, + "learning_rate": 0.0004155056664867782, + "loss": 3.4355, "step": 28600 }, { "epoch": 3.0889487870619945, - "grad_norm": 0.6263707876205444, - "learning_rate": 0.00041518834322719914, - "loss": 3.4466, + "grad_norm": 0.6364948749542236, + "learning_rate": 0.0004151818672423097, + "loss": 3.445, "step": 28650 }, { "epoch": 3.0943396226415096, - "grad_norm": 0.598533570766449, - "learning_rate": 0.00041486454398273064, - "loss": 3.4456, + "grad_norm": 0.5616390109062195, + "learning_rate": 0.0004148580679978413, + "loss": 3.4433, "step": 28700 }, { "epoch": 3.0997304582210243, - "grad_norm": 0.5736040472984314, - "learning_rate": 0.00041454074473826224, - "loss": 3.4713, + "grad_norm": 0.6200069189071655, + "learning_rate": 0.00041453426875337284, + "loss": 3.4692, "step": 28750 }, { "epoch": 3.105121293800539, - "grad_norm": 0.6180632710456848, - "learning_rate": 0.0004142169454937938, - "loss": 3.4296, + "grad_norm": 0.5786812901496887, + "learning_rate": 0.00041421046950890445, + "loss": 3.4284, "step": 28800 }, { "epoch": 3.1105121293800537, - "grad_norm": 0.6590995788574219, - "learning_rate": 0.0004138931462493254, - "loss": 3.4389, + "grad_norm": 0.631637454032898, + "learning_rate": 0.000413886670264436, + "loss": 3.4383, "step": 28850 }, { "epoch": 3.115902964959569, - "grad_norm": 0.6105198264122009, - "learning_rate": 0.00041356934700485695, - "loss": 3.4346, + "grad_norm": 0.5604854822158813, + "learning_rate": 0.0004135628710199676, + "loss": 3.4327, "step": 28900 }, { "epoch": 3.1212938005390836, - "grad_norm": 0.6112799048423767, - "learning_rate": 0.0004132455477603885, - "loss": 3.4646, + "grad_norm": 0.5949662923812866, + "learning_rate": 0.00041323907177549915, + "loss": 3.4644, "step": 28950 }, { "epoch": 3.1266846361185983, - "grad_norm": 0.6425892114639282, - "learning_rate": 0.0004129217485159201, - "loss": 3.4374, + "grad_norm": 0.6039780378341675, + "learning_rate": 0.0004129152725310307, + "loss": 3.435, "step": 29000 }, { "epoch": 3.1266846361185983, - "eval_accuracy": 0.37091336116513274, - "eval_loss": 3.493210554122925, - "eval_runtime": 185.0904, - "eval_samples_per_second": 97.309, - "eval_steps_per_second": 6.084, + "eval_accuracy": 0.3711198016519802, + "eval_loss": 3.4920432567596436, + "eval_runtime": 185.3198, + "eval_samples_per_second": 97.189, + "eval_steps_per_second": 6.076, "step": 29000 }, { "epoch": 3.1320754716981134, - "grad_norm": 0.5860080718994141, - "learning_rate": 0.00041259794927145165, - "loss": 3.4543, + "grad_norm": 0.6303644776344299, + "learning_rate": 0.0004125914732865623, + "loss": 3.4524, "step": 29050 }, { "epoch": 3.137466307277628, - "grad_norm": 0.5804362893104553, - "learning_rate": 0.00041227415002698326, - "loss": 3.4552, + "grad_norm": 0.617788553237915, + "learning_rate": 0.00041226767404209386, + "loss": 3.4542, "step": 29100 }, { "epoch": 3.142857142857143, - "grad_norm": 0.6355499625205994, - "learning_rate": 0.0004119503507825148, - "loss": 3.4463, + "grad_norm": 0.5849179625511169, + "learning_rate": 0.00041194387479762546, + "loss": 3.4471, "step": 29150 }, { "epoch": 3.1482479784366575, - "grad_norm": 0.5874181389808655, - "learning_rate": 0.0004116265515380464, - "loss": 3.4556, + "grad_norm": 0.5979399681091309, + "learning_rate": 0.000411620075553157, + "loss": 3.4536, "step": 29200 }, { "epoch": 3.1536388140161726, - "grad_norm": 0.6099561452865601, - "learning_rate": 0.00041130275229357796, - "loss": 3.458, + "grad_norm": 0.6045482158660889, + "learning_rate": 0.0004112962763086886, + "loss": 3.4567, "step": 29250 }, { "epoch": 3.1590296495956873, - "grad_norm": 0.6350792646408081, - "learning_rate": 0.00041097895304910946, - "loss": 3.4627, + "grad_norm": 0.6165677905082703, + "learning_rate": 0.0004109724770642201, + "loss": 3.46, "step": 29300 }, { "epoch": 3.164420485175202, - "grad_norm": 0.634882390499115, - "learning_rate": 0.00041065515380464107, - "loss": 3.4526, + "grad_norm": 0.6779764294624329, + "learning_rate": 0.00041064867781975177, + "loss": 3.4515, "step": 29350 }, { "epoch": 3.169811320754717, - "grad_norm": 0.6106262803077698, - "learning_rate": 0.0004103313545601726, - "loss": 3.4448, + "grad_norm": 0.6538941860198975, + "learning_rate": 0.00041032487857528327, + "loss": 3.4451, "step": 29400 }, { "epoch": 3.175202156334232, - "grad_norm": 0.6202868223190308, - "learning_rate": 0.0004100075553157042, - "loss": 3.4313, + "grad_norm": 0.6297999024391174, + "learning_rate": 0.0004100010793308148, + "loss": 3.4287, "step": 29450 }, { "epoch": 3.1805929919137466, - "grad_norm": 0.5957272052764893, - "learning_rate": 0.00040968375607123577, - "loss": 3.4549, + "grad_norm": 0.6178961396217346, + "learning_rate": 0.0004096772800863464, + "loss": 3.4557, "step": 29500 }, { "epoch": 3.1859838274932613, - "grad_norm": 0.6086465716362, - "learning_rate": 0.0004093599568267674, - "loss": 3.4405, + "grad_norm": 0.582787275314331, + "learning_rate": 0.000409353480841878, + "loss": 3.4397, "step": 29550 }, { "epoch": 3.1913746630727764, - "grad_norm": 0.5802353024482727, - "learning_rate": 0.0004090361575822989, - "loss": 3.4496, + "grad_norm": 0.5640174150466919, + "learning_rate": 0.0004090296815974096, + "loss": 3.4464, "step": 29600 }, { "epoch": 3.196765498652291, - "grad_norm": 0.5590112805366516, - "learning_rate": 0.00040871235833783053, - "loss": 3.464, + "grad_norm": 0.5874976515769958, + "learning_rate": 0.00040870588235294113, + "loss": 3.4652, "step": 29650 }, { "epoch": 3.202156334231806, - "grad_norm": 0.6418723464012146, - "learning_rate": 0.0004083885590933621, - "loss": 3.4478, + "grad_norm": 0.6412698030471802, + "learning_rate": 0.00040838208310847273, + "loss": 3.4481, "step": 29700 }, { "epoch": 3.207547169811321, - "grad_norm": 0.5875370502471924, - "learning_rate": 0.00040806475984889363, - "loss": 3.4571, + "grad_norm": 0.6078434586524963, + "learning_rate": 0.0004080582838640043, + "loss": 3.4554, "step": 29750 }, { "epoch": 3.2129380053908356, - "grad_norm": 0.6010445952415466, - "learning_rate": 0.00040774096060442524, - "loss": 3.4676, + "grad_norm": 0.6255976557731628, + "learning_rate": 0.0004077344846195359, + "loss": 3.4665, "step": 29800 }, { "epoch": 3.2183288409703503, - "grad_norm": 0.6460816860198975, - "learning_rate": 0.0004074171613599568, - "loss": 3.4719, + "grad_norm": 0.6625264286994934, + "learning_rate": 0.00040741068537506744, + "loss": 3.4688, "step": 29850 }, { "epoch": 3.223719676549865, - "grad_norm": 0.6212185621261597, - "learning_rate": 0.0004070933621154884, - "loss": 3.4391, + "grad_norm": 0.6073241233825684, + "learning_rate": 0.000407086886130599, + "loss": 3.4371, "step": 29900 }, { "epoch": 3.22911051212938, - "grad_norm": 0.5984194278717041, - "learning_rate": 0.00040676956287101994, - "loss": 3.442, + "grad_norm": 0.5774091482162476, + "learning_rate": 0.0004067630868861306, + "loss": 3.4395, "step": 29950 }, { "epoch": 3.234501347708895, - "grad_norm": 0.6364345550537109, - "learning_rate": 0.00040644576362655155, - "loss": 3.4658, + "grad_norm": 0.5973829030990601, + "learning_rate": 0.0004064392876416621, + "loss": 3.4633, "step": 30000 }, { "epoch": 3.234501347708895, - "eval_accuracy": 0.3722194775295506, - "eval_loss": 3.4868123531341553, - "eval_runtime": 184.808, - "eval_samples_per_second": 97.458, - "eval_steps_per_second": 6.093, + "eval_accuracy": 0.37187917668491566, + "eval_loss": 3.487455368041992, + "eval_runtime": 185.0825, + "eval_samples_per_second": 97.313, + "eval_steps_per_second": 6.084, "step": 30000 }, { "epoch": 3.2398921832884096, - "grad_norm": 0.6432105898857117, - "learning_rate": 0.00040612196438208304, - "loss": 3.4541, + "grad_norm": 0.6407408714294434, + "learning_rate": 0.0004061154883971937, + "loss": 3.451, "step": 30050 }, { "epoch": 3.2452830188679247, - "grad_norm": 0.5993592739105225, - "learning_rate": 0.00040579816513761465, - "loss": 3.4567, + "grad_norm": 0.6154466271400452, + "learning_rate": 0.00040579168915272525, + "loss": 3.4558, "step": 30100 }, { "epoch": 3.2506738544474394, - "grad_norm": 0.6198611259460449, - "learning_rate": 0.0004054743658931462, - "loss": 3.4649, + "grad_norm": 0.5675138831138611, + "learning_rate": 0.00040546788990825685, + "loss": 3.4632, "step": 30150 }, { "epoch": 3.256064690026954, - "grad_norm": 0.5803235769271851, - "learning_rate": 0.00040515056664867775, - "loss": 3.4479, + "grad_norm": 0.6031027436256409, + "learning_rate": 0.0004051440906637884, + "loss": 3.4464, "step": 30200 }, { "epoch": 3.2614555256064692, - "grad_norm": 0.6663078665733337, - "learning_rate": 0.00040482676740420935, - "loss": 3.4527, + "grad_norm": 0.6430271863937378, + "learning_rate": 0.00040482029141931995, + "loss": 3.4514, "step": 30250 }, { "epoch": 3.266846361185984, - "grad_norm": 0.6873801350593567, - "learning_rate": 0.0004045029681597409, - "loss": 3.4399, + "grad_norm": 0.6006436944007874, + "learning_rate": 0.00040449649217485156, + "loss": 3.4387, "step": 30300 }, { "epoch": 3.2722371967654986, - "grad_norm": 0.6202357411384583, - "learning_rate": 0.00040418564490016186, - "loss": 3.4526, + "grad_norm": 0.6477491855621338, + "learning_rate": 0.0004041726929303831, + "loss": 3.4535, "step": 30350 }, { "epoch": 3.2776280323450133, - "grad_norm": 0.5937652587890625, - "learning_rate": 0.00040386184565569346, - "loss": 3.4651, + "grad_norm": 0.633296549320221, + "learning_rate": 0.0004038488936859147, + "loss": 3.4625, "step": 30400 }, { "epoch": 3.2830188679245285, - "grad_norm": 0.586410641670227, - "learning_rate": 0.000403538046411225, - "loss": 3.4547, + "grad_norm": 0.6047157049179077, + "learning_rate": 0.00040352509444144626, + "loss": 3.4539, "step": 30450 }, { "epoch": 3.288409703504043, - "grad_norm": 0.5957598090171814, - "learning_rate": 0.00040321424716675656, - "loss": 3.4752, + "grad_norm": 0.6030910015106201, + "learning_rate": 0.00040320129519697787, + "loss": 3.4739, "step": 30500 }, { "epoch": 3.293800539083558, - "grad_norm": 0.5925768613815308, - "learning_rate": 0.00040289044792228817, - "loss": 3.466, + "grad_norm": 0.5749094486236572, + "learning_rate": 0.0004028774959525094, + "loss": 3.4636, "step": 30550 }, { "epoch": 3.2991913746630726, - "grad_norm": 0.590333878993988, - "learning_rate": 0.0004025666486778197, - "loss": 3.4576, + "grad_norm": 0.6478707790374756, + "learning_rate": 0.000402553696708041, + "loss": 3.4597, "step": 30600 }, { "epoch": 3.3045822102425877, - "grad_norm": 0.5901454091072083, - "learning_rate": 0.0004022428494333513, - "loss": 3.4574, + "grad_norm": 0.6037589311599731, + "learning_rate": 0.0004022298974635726, + "loss": 3.4564, "step": 30650 }, { "epoch": 3.3099730458221024, - "grad_norm": 0.6296274065971375, - "learning_rate": 0.0004019190501888828, - "loss": 3.4482, + "grad_norm": 0.6136059761047363, + "learning_rate": 0.00040190609821910407, + "loss": 3.4472, "step": 30700 }, { "epoch": 3.315363881401617, - "grad_norm": 0.6582797765731812, - "learning_rate": 0.0004015952509444144, - "loss": 3.4542, + "grad_norm": 0.5998915433883667, + "learning_rate": 0.0004015822989746357, + "loss": 3.4531, "step": 30750 }, { "epoch": 3.3207547169811322, - "grad_norm": 0.6020658016204834, - "learning_rate": 0.000401271451699946, - "loss": 3.4564, + "grad_norm": 0.6182710528373718, + "learning_rate": 0.0004012584997301672, + "loss": 3.4565, "step": 30800 }, { "epoch": 3.326145552560647, - "grad_norm": 0.5704158544540405, - "learning_rate": 0.0004009476524554776, - "loss": 3.4486, + "grad_norm": 0.5568892359733582, + "learning_rate": 0.00040093470048569883, + "loss": 3.4465, "step": 30850 }, { "epoch": 3.3315363881401616, - "grad_norm": 0.6447783708572388, - "learning_rate": 0.0004006303291958985, - "loss": 3.4704, + "grad_norm": 0.6910562515258789, + "learning_rate": 0.0004006173772261198, + "loss": 3.4696, "step": 30900 }, { "epoch": 3.3369272237196768, - "grad_norm": 0.5830173492431641, - "learning_rate": 0.0004003065299514301, - "loss": 3.4526, + "grad_norm": 0.5722136497497559, + "learning_rate": 0.00040029357798165133, + "loss": 3.451, "step": 30950 }, { "epoch": 3.3423180592991915, - "grad_norm": 0.5701765418052673, - "learning_rate": 0.00039998273070696163, - "loss": 3.4715, + "grad_norm": 0.565493106842041, + "learning_rate": 0.00039996977873718294, + "loss": 3.4709, "step": 31000 }, { "epoch": 3.3423180592991915, - "eval_accuracy": 0.3730165551145576, - "eval_loss": 3.481886148452759, - "eval_runtime": 185.2338, - "eval_samples_per_second": 97.234, - "eval_steps_per_second": 6.079, + "eval_accuracy": 0.37296038157155753, + "eval_loss": 3.4795143604278564, + "eval_runtime": 185.0973, + "eval_samples_per_second": 97.306, + "eval_steps_per_second": 6.083, "step": 31000 }, { "epoch": 3.347708894878706, - "grad_norm": 0.6165177822113037, - "learning_rate": 0.00039965893146249324, - "loss": 3.4472, + "grad_norm": 0.6347475647926331, + "learning_rate": 0.0003996459794927145, + "loss": 3.4439, "step": 31050 }, { "epoch": 3.353099730458221, - "grad_norm": 0.6259761452674866, - "learning_rate": 0.0003993351322180248, - "loss": 3.466, + "grad_norm": 0.5855362415313721, + "learning_rate": 0.00039932218024824604, + "loss": 3.4663, "step": 31100 }, { "epoch": 3.358490566037736, - "grad_norm": 0.6147885918617249, - "learning_rate": 0.0003990113329735564, - "loss": 3.447, + "grad_norm": 0.625121533870697, + "learning_rate": 0.00039899838100377764, + "loss": 3.4442, "step": 31150 }, { "epoch": 3.3638814016172507, - "grad_norm": 0.5740400552749634, - "learning_rate": 0.00039868753372908794, - "loss": 3.4492, + "grad_norm": 0.5701158046722412, + "learning_rate": 0.0003986745817593092, + "loss": 3.4462, "step": 31200 }, { "epoch": 3.3692722371967654, - "grad_norm": 0.6359267830848694, - "learning_rate": 0.0003983637344846195, - "loss": 3.4654, + "grad_norm": 0.6462088227272034, + "learning_rate": 0.0003983507825148408, + "loss": 3.4639, "step": 31250 }, { "epoch": 3.37466307277628, - "grad_norm": 0.5735960006713867, - "learning_rate": 0.0003980399352401511, - "loss": 3.4509, + "grad_norm": 0.5862804055213928, + "learning_rate": 0.00039802698327037235, + "loss": 3.4482, "step": 31300 }, { "epoch": 3.3800539083557952, - "grad_norm": 0.6017249226570129, - "learning_rate": 0.0003977161359956826, - "loss": 3.4392, + "grad_norm": 0.592103123664856, + "learning_rate": 0.00039770318402590396, + "loss": 3.436, "step": 31350 }, { "epoch": 3.38544474393531, - "grad_norm": 0.629580020904541, - "learning_rate": 0.0003973923367512142, - "loss": 3.4468, + "grad_norm": 0.5994466543197632, + "learning_rate": 0.00039737938478143545, + "loss": 3.4432, "step": 31400 }, { "epoch": 3.3908355795148246, - "grad_norm": 0.5926845669746399, - "learning_rate": 0.00039706853750674575, - "loss": 3.4644, + "grad_norm": 0.5867735743522644, + "learning_rate": 0.000397055585536967, + "loss": 3.4651, "step": 31450 }, { "epoch": 3.3962264150943398, - "grad_norm": 0.6186175346374512, - "learning_rate": 0.00039674473826227736, - "loss": 3.4646, + "grad_norm": 0.5661984086036682, + "learning_rate": 0.0003967317862924986, + "loss": 3.4628, "step": 31500 }, { "epoch": 3.4016172506738545, - "grad_norm": 0.6381295919418335, - "learning_rate": 0.0003964209390178089, - "loss": 3.4516, + "grad_norm": 0.6031181216239929, + "learning_rate": 0.00039640798704803016, + "loss": 3.4507, "step": 31550 }, { "epoch": 3.407008086253369, - "grad_norm": 0.6201534271240234, - "learning_rate": 0.0003960971397733405, - "loss": 3.4373, + "grad_norm": 0.6224333047866821, + "learning_rate": 0.00039608418780356176, + "loss": 3.4377, "step": 31600 }, { "epoch": 3.4123989218328843, - "grad_norm": 0.6230754256248474, - "learning_rate": 0.00039577334052887206, - "loss": 3.4583, + "grad_norm": 0.6393580436706543, + "learning_rate": 0.0003957603885590933, + "loss": 3.4576, "step": 31650 }, { "epoch": 3.417789757412399, - "grad_norm": 0.6434163451194763, - "learning_rate": 0.0003954495412844036, - "loss": 3.4661, + "grad_norm": 0.6339957118034363, + "learning_rate": 0.0003954365893146249, + "loss": 3.4621, "step": 31700 }, { "epoch": 3.4231805929919137, - "grad_norm": 0.6090736389160156, - "learning_rate": 0.0003951257420399352, - "loss": 3.4617, + "grad_norm": 0.5791376829147339, + "learning_rate": 0.00039511279007015647, + "loss": 3.46, "step": 31750 }, { "epoch": 3.4285714285714284, - "grad_norm": 0.72145015001297, - "learning_rate": 0.00039480194279546677, - "loss": 3.4469, + "grad_norm": 0.6502759456634521, + "learning_rate": 0.00039478899082568807, + "loss": 3.4458, "step": 31800 }, { "epoch": 3.4339622641509435, - "grad_norm": 0.6008581519126892, - "learning_rate": 0.00039447814355099837, - "loss": 3.4511, + "grad_norm": 0.6216784715652466, + "learning_rate": 0.0003944651915812196, + "loss": 3.4494, "step": 31850 }, { "epoch": 3.439353099730458, - "grad_norm": 0.6144813895225525, - "learning_rate": 0.0003941543443065299, - "loss": 3.4622, + "grad_norm": 0.5869659185409546, + "learning_rate": 0.0003941413923367512, + "loss": 3.4608, "step": 31900 }, { "epoch": 3.444743935309973, - "grad_norm": 0.5947038531303406, - "learning_rate": 0.00039383054506206153, - "loss": 3.4469, + "grad_norm": 0.5932062864303589, + "learning_rate": 0.0003938175930922828, + "loss": 3.4443, "step": 31950 }, { "epoch": 3.450134770889488, - "grad_norm": 0.580974280834198, - "learning_rate": 0.000393506745817593, - "loss": 3.4594, + "grad_norm": 0.6314504742622375, + "learning_rate": 0.0003934937938478143, + "loss": 3.457, "step": 32000 }, { "epoch": 3.450134770889488, - "eval_accuracy": 0.3735301573152567, - "eval_loss": 3.4732370376586914, - "eval_runtime": 185.0385, - "eval_samples_per_second": 97.337, - "eval_steps_per_second": 6.085, + "eval_accuracy": 0.3736922674238759, + "eval_loss": 3.472475051879883, + "eval_runtime": 185.2489, + "eval_samples_per_second": 97.226, + "eval_steps_per_second": 6.078, "step": 32000 }, { "epoch": 3.4555256064690028, - "grad_norm": 0.6096420884132385, - "learning_rate": 0.0003931829465731247, - "loss": 3.4827, + "grad_norm": 0.5765188336372375, + "learning_rate": 0.00039317647058823523, + "loss": 3.4814, "step": 32050 }, { "epoch": 3.4609164420485174, - "grad_norm": 0.6233165264129639, - "learning_rate": 0.0003928591473286562, - "loss": 3.4725, + "grad_norm": 0.5741029381752014, + "learning_rate": 0.00039285267134376683, + "loss": 3.469, "step": 32100 }, { "epoch": 3.466307277628032, - "grad_norm": 0.6157932281494141, - "learning_rate": 0.00039253534808418773, - "loss": 3.444, + "grad_norm": 0.6119958758354187, + "learning_rate": 0.0003925288720992984, + "loss": 3.4415, "step": 32150 }, { "epoch": 3.4716981132075473, - "grad_norm": 0.5871294140815735, - "learning_rate": 0.00039221154883971933, - "loss": 3.4577, + "grad_norm": 0.5945273637771606, + "learning_rate": 0.00039220507285482993, + "loss": 3.4564, "step": 32200 }, { "epoch": 3.477088948787062, - "grad_norm": 0.6096975207328796, - "learning_rate": 0.0003918877495952509, - "loss": 3.4479, + "grad_norm": 0.6451990604400635, + "learning_rate": 0.00039188127361036154, + "loss": 3.4472, "step": 32250 }, { "epoch": 3.4824797843665767, - "grad_norm": 0.6300262212753296, - "learning_rate": 0.0003915639503507825, - "loss": 3.4509, + "grad_norm": 0.6479616761207581, + "learning_rate": 0.0003915574743658931, + "loss": 3.4494, "step": 32300 }, { "epoch": 3.487870619946092, - "grad_norm": 0.6068784594535828, - "learning_rate": 0.00039124015110631404, - "loss": 3.4435, + "grad_norm": 0.6039360761642456, + "learning_rate": 0.0003912336751214247, + "loss": 3.4409, "step": 32350 }, { "epoch": 3.4932614555256065, - "grad_norm": 0.652286946773529, - "learning_rate": 0.00039091635186184565, - "loss": 3.4495, + "grad_norm": 0.6061731576919556, + "learning_rate": 0.00039090987587695624, + "loss": 3.4462, "step": 32400 }, { "epoch": 3.498652291105121, - "grad_norm": 0.580116868019104, - "learning_rate": 0.0003905925526173772, - "loss": 3.4604, + "grad_norm": 0.5710239410400391, + "learning_rate": 0.00039058607663248785, + "loss": 3.4558, "step": 32450 }, { "epoch": 3.5040431266846364, - "grad_norm": 0.6080635190010071, - "learning_rate": 0.0003902687533729088, - "loss": 3.4637, + "grad_norm": 0.5853672623634338, + "learning_rate": 0.0003902622773880194, + "loss": 3.4638, "step": 32500 }, { "epoch": 3.509433962264151, - "grad_norm": 0.6209675073623657, - "learning_rate": 0.00038994495412844035, - "loss": 3.4659, + "grad_norm": 0.6254067420959473, + "learning_rate": 0.000389938478143551, + "loss": 3.4625, "step": 32550 }, { "epoch": 3.5148247978436657, - "grad_norm": 0.6277631521224976, - "learning_rate": 0.0003896211548839719, - "loss": 3.4535, + "grad_norm": 0.6255186796188354, + "learning_rate": 0.00038961467889908255, + "loss": 3.452, "step": 32600 }, { "epoch": 3.5202156334231804, - "grad_norm": 0.5931004285812378, - "learning_rate": 0.0003892973556395035, - "loss": 3.457, + "grad_norm": 0.5967075228691101, + "learning_rate": 0.00038929087965461405, + "loss": 3.4556, "step": 32650 }, { "epoch": 3.525606469002695, - "grad_norm": 0.6000095009803772, - "learning_rate": 0.000388973556395035, - "loss": 3.4726, + "grad_norm": 0.58425372838974, + "learning_rate": 0.00038896708041014566, + "loss": 3.4709, "step": 32700 }, { "epoch": 3.5309973045822103, - "grad_norm": 0.6495856046676636, - "learning_rate": 0.0003886497571505666, - "loss": 3.4472, + "grad_norm": 0.611172080039978, + "learning_rate": 0.0003886432811656772, + "loss": 3.4462, "step": 32750 }, { "epoch": 3.536388140161725, - "grad_norm": 0.6353185176849365, - "learning_rate": 0.00038832595790609816, - "loss": 3.4751, + "grad_norm": 0.6028887033462524, + "learning_rate": 0.0003883194819212088, + "loss": 3.473, "step": 32800 }, { "epoch": 3.5417789757412397, - "grad_norm": 0.5833432674407959, - "learning_rate": 0.00038800215866162976, - "loss": 3.475, + "grad_norm": 0.5676169991493225, + "learning_rate": 0.00038799568267674036, + "loss": 3.4729, "step": 32850 }, { "epoch": 3.547169811320755, - "grad_norm": 0.6292868256568909, - "learning_rate": 0.0003876783594171613, - "loss": 3.4682, + "grad_norm": 0.573436975479126, + "learning_rate": 0.00038767188343227197, + "loss": 3.4675, "step": 32900 }, { "epoch": 3.5525606469002695, - "grad_norm": 0.6485346555709839, - "learning_rate": 0.0003873545601726929, - "loss": 3.4554, + "grad_norm": 0.6455245018005371, + "learning_rate": 0.0003873480841878035, + "loss": 3.454, "step": 32950 }, { "epoch": 3.557951482479784, - "grad_norm": 0.63539057970047, - "learning_rate": 0.00038703076092822447, - "loss": 3.452, + "grad_norm": 0.6214224696159363, + "learning_rate": 0.0003870242849433351, + "loss": 3.4501, "step": 33000 }, { "epoch": 3.557951482479784, - "eval_accuracy": 0.3738674158790328, - "eval_loss": 3.4682674407958984, - "eval_runtime": 184.927, - "eval_samples_per_second": 97.395, - "eval_steps_per_second": 6.089, + "eval_accuracy": 0.37362457667476745, + "eval_loss": 3.466701030731201, + "eval_runtime": 185.1362, + "eval_samples_per_second": 97.285, + "eval_steps_per_second": 6.082, "step": 33000 }, { "epoch": 3.5633423180592994, - "grad_norm": 0.5895044803619385, - "learning_rate": 0.000386706961683756, - "loss": 3.4616, + "grad_norm": 0.6276306509971619, + "learning_rate": 0.00038670048569886667, + "loss": 3.4599, "step": 33050 }, { "epoch": 3.568733153638814, - "grad_norm": 0.597101092338562, - "learning_rate": 0.00038638963842417697, - "loss": 3.4612, + "grad_norm": 0.6255683898925781, + "learning_rate": 0.0003863766864543982, + "loss": 3.463, "step": 33100 }, { "epoch": 3.5741239892183287, - "grad_norm": 0.6554183959960938, - "learning_rate": 0.0003860658391797086, - "loss": 3.4665, + "grad_norm": 0.5910325050354004, + "learning_rate": 0.00038605288720992983, + "loss": 3.4655, "step": 33150 }, { "epoch": 3.579514824797844, - "grad_norm": 0.6352536678314209, - "learning_rate": 0.0003857420399352401, - "loss": 3.4622, + "grad_norm": 0.6093518137931824, + "learning_rate": 0.0003857290879654614, + "loss": 3.4621, "step": 33200 }, { "epoch": 3.5849056603773586, - "grad_norm": 0.6003103256225586, - "learning_rate": 0.00038541824069077173, - "loss": 3.4461, + "grad_norm": 0.6162254214286804, + "learning_rate": 0.000385405288720993, + "loss": 3.4442, "step": 33250 }, { "epoch": 3.5902964959568733, - "grad_norm": 0.6177358031272888, - "learning_rate": 0.0003850944414463033, - "loss": 3.4733, + "grad_norm": 0.6433115005493164, + "learning_rate": 0.00038508148947652453, + "loss": 3.4724, "step": 33300 }, { "epoch": 3.595687331536388, - "grad_norm": 0.6115977764129639, - "learning_rate": 0.0003847706422018348, - "loss": 3.4604, + "grad_norm": 0.616127073764801, + "learning_rate": 0.00038475769023205614, + "loss": 3.4594, "step": 33350 }, { "epoch": 3.601078167115903, - "grad_norm": 0.6683585047721863, - "learning_rate": 0.0003844468429573664, - "loss": 3.4566, + "grad_norm": 0.6631184220314026, + "learning_rate": 0.00038443389098758763, + "loss": 3.4554, "step": 33400 }, { "epoch": 3.606469002695418, - "grad_norm": 0.6268858909606934, - "learning_rate": 0.00038412304371289793, - "loss": 3.475, + "grad_norm": 0.6478034853935242, + "learning_rate": 0.00038411009174311924, + "loss": 3.473, "step": 33450 }, { "epoch": 3.6118598382749325, - "grad_norm": 0.56812584400177, - "learning_rate": 0.00038379924446842954, - "loss": 3.4508, + "grad_norm": 0.57593834400177, + "learning_rate": 0.0003837862924986508, + "loss": 3.4495, "step": 33500 }, { "epoch": 3.617250673854447, - "grad_norm": 0.6435724496841431, - "learning_rate": 0.0003834754452239611, - "loss": 3.449, + "grad_norm": 0.5845508575439453, + "learning_rate": 0.00038346249325418234, + "loss": 3.4489, "step": 33550 }, { "epoch": 3.6226415094339623, - "grad_norm": 0.6608956456184387, - "learning_rate": 0.0003831516459794927, - "loss": 3.4437, + "grad_norm": 0.6428506970405579, + "learning_rate": 0.00038313869400971395, + "loss": 3.4434, "step": 33600 }, { "epoch": 3.628032345013477, - "grad_norm": 0.6201449632644653, - "learning_rate": 0.00038282784673502424, - "loss": 3.4597, + "grad_norm": 0.6328097581863403, + "learning_rate": 0.0003828213707501349, + "loss": 3.4675, "step": 33650 }, { "epoch": 3.6334231805929917, - "grad_norm": 0.6143207550048828, - "learning_rate": 0.00038250404749055585, - "loss": 3.4646, + "grad_norm": 0.6197866201400757, + "learning_rate": 0.00038249757150566645, + "loss": 3.4638, "step": 33700 }, { "epoch": 3.638814016172507, - "grad_norm": 0.6062964200973511, - "learning_rate": 0.0003821802482460874, - "loss": 3.4443, + "grad_norm": 0.6091422438621521, + "learning_rate": 0.00038217377226119805, + "loss": 3.4442, "step": 33750 }, { "epoch": 3.6442048517520216, - "grad_norm": 0.6019686460494995, - "learning_rate": 0.00038185644900161895, - "loss": 3.4499, + "grad_norm": 0.6277755498886108, + "learning_rate": 0.0003818499730167296, + "loss": 3.449, "step": 33800 }, { "epoch": 3.6495956873315363, - "grad_norm": 0.5941645503044128, - "learning_rate": 0.00038153264975715056, - "loss": 3.434, + "grad_norm": 0.5895088315010071, + "learning_rate": 0.00038152617377226115, + "loss": 3.4322, "step": 33850 }, { "epoch": 3.6549865229110514, - "grad_norm": 0.6274322867393494, - "learning_rate": 0.0003812088505126821, - "loss": 3.4256, + "grad_norm": 0.6150822043418884, + "learning_rate": 0.00038120237452779276, + "loss": 3.4265, "step": 33900 }, { "epoch": 3.660377358490566, - "grad_norm": 0.6215175986289978, - "learning_rate": 0.0003808850512682137, - "loss": 3.4534, + "grad_norm": 0.6281339526176453, + "learning_rate": 0.0003808785752833243, + "loss": 3.4533, "step": 33950 }, { "epoch": 3.665768194070081, - "grad_norm": 0.6265103816986084, - "learning_rate": 0.0003805612520237452, - "loss": 3.4498, + "grad_norm": 0.6197679042816162, + "learning_rate": 0.0003805547760388559, + "loss": 3.4477, "step": 34000 }, { "epoch": 3.665768194070081, - "eval_accuracy": 0.3748275814486494, - "eval_loss": 3.4626383781433105, - "eval_runtime": 185.0442, - "eval_samples_per_second": 97.334, - "eval_steps_per_second": 6.085, + "eval_accuracy": 0.3746061468632835, + "eval_loss": 3.461383104324341, + "eval_runtime": 185.1927, + "eval_samples_per_second": 97.255, + "eval_steps_per_second": 6.08, "step": 34000 }, { "epoch": 3.671159029649596, - "grad_norm": 0.6977486610412598, - "learning_rate": 0.00038023745277927687, + "grad_norm": 0.6397350430488586, + "learning_rate": 0.0003802309767943874, "loss": 3.4512, "step": 34050 }, { "epoch": 3.6765498652291106, - "grad_norm": 0.6424899697303772, - "learning_rate": 0.00037991365353480836, - "loss": 3.4514, + "grad_norm": 0.6090516448020935, + "learning_rate": 0.000379907177549919, + "loss": 3.4506, "step": 34100 }, { "epoch": 3.6819407008086253, - "grad_norm": 0.591140627861023, - "learning_rate": 0.0003795898542903399, - "loss": 3.4467, + "grad_norm": 0.6119341254234314, + "learning_rate": 0.00037958337830545057, + "loss": 3.4448, "step": 34150 }, { "epoch": 3.68733153638814, - "grad_norm": 0.6942396759986877, - "learning_rate": 0.0003792660550458715, - "loss": 3.4584, + "grad_norm": 0.6494148969650269, + "learning_rate": 0.00037925957906098217, + "loss": 3.4572, "step": 34200 }, { "epoch": 3.6927223719676547, - "grad_norm": 0.6438764333724976, - "learning_rate": 0.00037894225580140307, - "loss": 3.4478, + "grad_norm": 0.6527368426322937, + "learning_rate": 0.0003789357798165137, + "loss": 3.4459, "step": 34250 }, { "epoch": 3.69811320754717, - "grad_norm": 0.6779077649116516, - "learning_rate": 0.00037861845655693467, - "loss": 3.4519, + "grad_norm": 0.6111875772476196, + "learning_rate": 0.00037861198057204527, + "loss": 3.4508, "step": 34300 }, { "epoch": 3.7035040431266846, - "grad_norm": 0.6349784135818481, - "learning_rate": 0.0003782946573124662, - "loss": 3.4495, + "grad_norm": 0.6030789017677307, + "learning_rate": 0.0003782881813275769, + "loss": 3.4492, "step": 34350 }, { "epoch": 3.7088948787061993, - "grad_norm": 0.6402055025100708, - "learning_rate": 0.00037797085806799783, - "loss": 3.4582, + "grad_norm": 0.6276407241821289, + "learning_rate": 0.0003779643820831084, + "loss": 3.4563, "step": 34400 }, { "epoch": 3.7142857142857144, - "grad_norm": 0.6125994920730591, - "learning_rate": 0.0003776470588235294, - "loss": 3.4278, + "grad_norm": 0.6185351014137268, + "learning_rate": 0.00037764058283864003, + "loss": 3.4279, "step": 34450 }, { "epoch": 3.719676549865229, - "grad_norm": 0.658770740032196, - "learning_rate": 0.000377323259579061, - "loss": 3.4702, + "grad_norm": 0.6208236217498779, + "learning_rate": 0.0003773167835941716, + "loss": 3.4682, "step": 34500 }, { "epoch": 3.725067385444744, - "grad_norm": 0.6636422276496887, - "learning_rate": 0.00037699946033459253, - "loss": 3.448, + "grad_norm": 0.6041314601898193, + "learning_rate": 0.0003769929843497032, + "loss": 3.4465, "step": 34550 }, { "epoch": 3.730458221024259, - "grad_norm": 0.6237731575965881, - "learning_rate": 0.0003766756610901241, - "loss": 3.453, + "grad_norm": 0.5724568963050842, + "learning_rate": 0.00037666918510523474, + "loss": 3.4498, "step": 34600 }, { "epoch": 3.7358490566037736, - "grad_norm": 0.7565545439720154, - "learning_rate": 0.0003763518618456557, - "loss": 3.4553, + "grad_norm": 0.5927848815917969, + "learning_rate": 0.00037634538586076634, + "loss": 3.4556, "step": 34650 }, { "epoch": 3.7412398921832883, - "grad_norm": 0.6257597804069519, - "learning_rate": 0.0003760280626011872, - "loss": 3.4623, + "grad_norm": 0.6263810396194458, + "learning_rate": 0.00037602158661629784, + "loss": 3.46, "step": 34700 }, { "epoch": 3.7466307277628035, - "grad_norm": 0.636355996131897, - "learning_rate": 0.0003757042633567188, - "loss": 3.4671, + "grad_norm": 0.6283044815063477, + "learning_rate": 0.0003756977873718294, + "loss": 3.4649, "step": 34750 }, { "epoch": 3.752021563342318, - "grad_norm": 0.6237640976905823, - "learning_rate": 0.00037538046411225034, - "loss": 3.4505, + "grad_norm": 0.5919647812843323, + "learning_rate": 0.000375373988127361, + "loss": 3.4495, "step": 34800 }, { "epoch": 3.757412398921833, - "grad_norm": 0.6493251919746399, - "learning_rate": 0.00037505666486778195, - "loss": 3.4604, + "grad_norm": 0.64276522397995, + "learning_rate": 0.00037505018888289254, + "loss": 3.4591, "step": 34850 }, { "epoch": 3.7628032345013476, - "grad_norm": 0.6321045756340027, - "learning_rate": 0.0003747328656233135, - "loss": 3.4504, + "grad_norm": 0.6236562132835388, + "learning_rate": 0.00037472638963842415, + "loss": 3.4511, "step": 34900 }, { "epoch": 3.7681940700808623, - "grad_norm": 0.6456063389778137, - "learning_rate": 0.0003744090663788451, - "loss": 3.4446, + "grad_norm": 0.645114004611969, + "learning_rate": 0.0003744025903939557, + "loss": 3.4443, "step": 34950 }, { "epoch": 3.7735849056603774, - "grad_norm": 0.7021212577819824, - "learning_rate": 0.00037408526713437665, - "loss": 3.4671, + "grad_norm": 0.6953789591789246, + "learning_rate": 0.0003740787911494873, + "loss": 3.4652, "step": 35000 }, { "epoch": 3.7735849056603774, - "eval_accuracy": 0.3753076099070138, - "eval_loss": 3.457233428955078, - "eval_runtime": 184.8636, - "eval_samples_per_second": 97.429, - "eval_steps_per_second": 6.091, + "eval_accuracy": 0.3752516536697893, + "eval_loss": 3.454463243484497, + "eval_runtime": 185.0118, + "eval_samples_per_second": 97.351, + "eval_steps_per_second": 6.086, "step": 35000 }, { "epoch": 3.778975741239892, - "grad_norm": 0.6232941746711731, - "learning_rate": 0.0003737614678899082, - "loss": 3.4515, + "grad_norm": 0.615135669708252, + "learning_rate": 0.00037375499190501885, + "loss": 3.4494, "step": 35050 }, { "epoch": 3.784366576819407, - "grad_norm": 0.6385236978530884, - "learning_rate": 0.0003734376686454398, - "loss": 3.4544, + "grad_norm": 0.6264244914054871, + "learning_rate": 0.0003734311926605504, + "loss": 3.4529, "step": 35100 }, { "epoch": 3.789757412398922, - "grad_norm": 0.6696701049804688, - "learning_rate": 0.00037311386940097136, - "loss": 3.4392, + "grad_norm": 0.6474460959434509, + "learning_rate": 0.000373107393416082, + "loss": 3.4373, "step": 35150 }, { "epoch": 3.7951482479784366, - "grad_norm": 0.6041948795318604, - "learning_rate": 0.00037279007015650296, - "loss": 3.4452, + "grad_norm": 0.5701635479927063, + "learning_rate": 0.00037278359417161356, + "loss": 3.4437, "step": 35200 }, { "epoch": 3.8005390835579513, - "grad_norm": 0.6724011898040771, - "learning_rate": 0.0003724662709120345, - "loss": 3.453, + "grad_norm": 0.6054803133010864, + "learning_rate": 0.00037245979492714517, + "loss": 3.4508, "step": 35250 }, { "epoch": 3.8059299191374665, - "grad_norm": 0.6799139976501465, - "learning_rate": 0.0003721424716675661, - "loss": 3.4457, + "grad_norm": 0.6329179406166077, + "learning_rate": 0.0003721359956826767, + "loss": 3.4436, "step": 35300 }, { "epoch": 3.811320754716981, - "grad_norm": 0.6157107353210449, - "learning_rate": 0.0003718186724230976, - "loss": 3.4671, + "grad_norm": 0.6441768407821655, + "learning_rate": 0.0003718121964382083, + "loss": 3.4643, "step": 35350 }, { "epoch": 3.816711590296496, - "grad_norm": 0.6035497188568115, - "learning_rate": 0.0003714948731786293, - "loss": 3.4475, + "grad_norm": 0.5799574255943298, + "learning_rate": 0.0003714883971937398, + "loss": 3.4452, "step": 35400 }, { "epoch": 3.822102425876011, - "grad_norm": 0.6183246374130249, - "learning_rate": 0.00037117107393416077, - "loss": 3.4539, + "grad_norm": 0.6319050192832947, + "learning_rate": 0.0003711645979492714, + "loss": 3.4519, "step": 35450 }, { "epoch": 3.8274932614555257, - "grad_norm": 0.6753723621368408, - "learning_rate": 0.0003708472746896923, - "loss": 3.4443, + "grad_norm": 0.6138501167297363, + "learning_rate": 0.00037084079870480297, + "loss": 3.4435, "step": 35500 }, { "epoch": 3.8328840970350404, - "grad_norm": 0.6167969107627869, - "learning_rate": 0.0003705234754452239, - "loss": 3.4475, + "grad_norm": 0.6051216721534729, + "learning_rate": 0.0003705169994603345, + "loss": 3.4481, "step": 35550 }, { "epoch": 3.838274932614555, - "grad_norm": 0.6387770175933838, - "learning_rate": 0.0003701996762007555, - "loss": 3.4591, + "grad_norm": 0.6242967844009399, + "learning_rate": 0.00037019320021586613, + "loss": 3.4572, "step": 35600 }, { "epoch": 3.8436657681940702, - "grad_norm": 0.6343386173248291, - "learning_rate": 0.0003698758769562871, - "loss": 3.4443, + "grad_norm": 0.627966582775116, + "learning_rate": 0.0003698694009713977, + "loss": 3.4424, "step": 35650 }, { "epoch": 3.849056603773585, - "grad_norm": 0.6071075201034546, - "learning_rate": 0.00036955207771181863, - "loss": 3.4436, + "grad_norm": 0.5960062146186829, + "learning_rate": 0.0003695456017269293, + "loss": 3.4426, "step": 35700 }, { "epoch": 3.8544474393530996, - "grad_norm": 0.596243143081665, - "learning_rate": 0.00036922827846735024, - "loss": 3.4529, + "grad_norm": 0.5962764024734497, + "learning_rate": 0.00036922180248246083, + "loss": 3.4543, "step": 35750 }, { "epoch": 3.8598382749326143, - "grad_norm": 0.6391444206237793, - "learning_rate": 0.0003689044792228818, - "loss": 3.4403, + "grad_norm": 0.6414475440979004, + "learning_rate": 0.00036889800323799244, + "loss": 3.4405, "step": 35800 }, { "epoch": 3.8652291105121295, - "grad_norm": 0.6206788420677185, + "grad_norm": 0.6472159028053284, "learning_rate": 0.0003685806799784134, - "loss": 3.4506, + "loss": 3.4513, "step": 35850 }, { "epoch": 3.870619946091644, - "grad_norm": 0.652442455291748, + "grad_norm": 0.63205885887146, "learning_rate": 0.00036825688073394494, - "loss": 3.4626, + "loss": 3.4625, "step": 35900 }, { "epoch": 3.876010781671159, - "grad_norm": 0.6262994408607483, + "grad_norm": 0.6096854209899902, "learning_rate": 0.0003679330814894765, - "loss": 3.4641, + "loss": 3.4624, "step": 35950 }, { "epoch": 3.881401617250674, - "grad_norm": 0.6360272765159607, + "grad_norm": 0.6115530133247375, "learning_rate": 0.0003676092822450081, - "loss": 3.4334, + "loss": 3.4316, "step": 36000 }, { "epoch": 3.881401617250674, - "eval_accuracy": 0.37577340483707444, - "eval_loss": 3.450922966003418, - "eval_runtime": 185.1357, - "eval_samples_per_second": 97.285, - "eval_steps_per_second": 6.082, + "eval_accuracy": 0.37572016492204535, + "eval_loss": 3.448432445526123, + "eval_runtime": 185.5015, + "eval_samples_per_second": 97.094, + "eval_steps_per_second": 6.07, "step": 36000 }, { "epoch": 3.8867924528301887, - "grad_norm": 0.6211079359054565, + "grad_norm": 0.6192970871925354, "learning_rate": 0.0003672854830005396, - "loss": 3.47, + "loss": 3.4669, "step": 36050 }, { "epoch": 3.8921832884097034, - "grad_norm": 0.6081632375717163, + "grad_norm": 0.6395145654678345, "learning_rate": 0.0003669616837560712, - "loss": 3.4335, + "loss": 3.4336, "step": 36100 }, { "epoch": 3.8975741239892185, - "grad_norm": 0.6606460213661194, + "grad_norm": 0.6128035187721252, "learning_rate": 0.00036663788451160275, - "loss": 3.4427, + "loss": 3.4405, "step": 36150 }, { "epoch": 3.9029649595687332, - "grad_norm": 0.6085197925567627, + "grad_norm": 0.5994486808776855, "learning_rate": 0.00036631408526713435, - "loss": 3.4481, + "loss": 3.4473, "step": 36200 }, { "epoch": 3.908355795148248, - "grad_norm": 0.5942933559417725, + "grad_norm": 0.598868191242218, "learning_rate": 0.0003659902860226659, - "loss": 3.4558, + "loss": 3.4512, "step": 36250 }, { "epoch": 3.913746630727763, - "grad_norm": 0.5989931225776672, + "grad_norm": 0.6117761731147766, "learning_rate": 0.00036566648677819745, - "loss": 3.4572, + "loss": 3.453, "step": 36300 }, { "epoch": 3.9191374663072778, - "grad_norm": 0.6112847328186035, + "grad_norm": 0.6310285329818726, "learning_rate": 0.00036534268753372906, - "loss": 3.4703, + "loss": 3.4659, "step": 36350 }, { "epoch": 3.9245283018867925, - "grad_norm": 0.5879990458488464, + "grad_norm": 0.5916323065757751, "learning_rate": 0.0003650188882892606, - "loss": 3.4374, + "loss": 3.4365, "step": 36400 }, { "epoch": 3.929919137466307, - "grad_norm": 0.6718971729278564, + "grad_norm": 0.6302815079689026, "learning_rate": 0.0003646950890447922, - "loss": 3.4428, + "loss": 3.4415, "step": 36450 }, { "epoch": 3.935309973045822, - "grad_norm": 0.5937814116477966, + "grad_norm": 0.6169242262840271, "learning_rate": 0.00036437128980032376, - "loss": 3.4565, + "loss": 3.4535, "step": 36500 }, { "epoch": 3.940700808625337, - "grad_norm": 0.6202378869056702, + "grad_norm": 0.6036542057991028, "learning_rate": 0.00036404749055585537, - "loss": 3.4696, + "loss": 3.4697, "step": 36550 }, { "epoch": 3.9460916442048517, - "grad_norm": 0.6717408895492554, + "grad_norm": 0.6433455944061279, "learning_rate": 0.0003637236913113869, - "loss": 3.4239, + "loss": 3.4215, "step": 36600 }, { "epoch": 3.9514824797843664, - "grad_norm": 0.6636652946472168, + "grad_norm": 0.6333649754524231, "learning_rate": 0.0003633998920669185, - "loss": 3.4406, + "loss": 3.439, "step": 36650 }, { "epoch": 3.9568733153638815, - "grad_norm": 0.6985486149787903, + "grad_norm": 0.6479489207267761, "learning_rate": 0.0003630760928224501, - "loss": 3.4461, + "loss": 3.4444, "step": 36700 }, { "epoch": 3.9622641509433962, - "grad_norm": 0.639927089214325, + "grad_norm": 0.5945091247558594, "learning_rate": 0.00036275229357798157, - "loss": 3.4282, + "loss": 3.4266, "step": 36750 }, { "epoch": 3.967654986522911, - "grad_norm": 0.6031926870346069, + "grad_norm": 0.5627322793006897, "learning_rate": 0.0003624284943335132, - "loss": 3.4406, + "loss": 3.439, "step": 36800 }, { "epoch": 3.973045822102426, - "grad_norm": 0.6068711876869202, + "grad_norm": 0.5948078036308289, "learning_rate": 0.00036210469508904473, - "loss": 3.45, + "loss": 3.449, "step": 36850 }, { "epoch": 3.9784366576819408, - "grad_norm": 0.6099409461021423, + "grad_norm": 0.5916422605514526, "learning_rate": 0.00036178089584457633, - "loss": 3.4486, + "loss": 3.4465, "step": 36900 }, { "epoch": 3.9838274932614555, - "grad_norm": 0.6189759373664856, + "grad_norm": 0.5898805856704712, "learning_rate": 0.0003614570966001079, - "loss": 3.4265, + "loss": 3.4251, "step": 36950 }, { "epoch": 3.9892183288409706, - "grad_norm": 0.6519210338592529, + "grad_norm": 0.5998630523681641, "learning_rate": 0.0003611332973556395, - "loss": 3.416, + "loss": 3.4147, "step": 37000 }, { "epoch": 3.9892183288409706, - "eval_accuracy": 0.3765111579453348, - "eval_loss": 3.4432153701782227, - "eval_runtime": 184.8939, - "eval_samples_per_second": 97.413, - "eval_steps_per_second": 6.09, + "eval_accuracy": 0.37643064615546423, + "eval_loss": 3.441873073577881, + "eval_runtime": 184.939, + "eval_samples_per_second": 97.389, + "eval_steps_per_second": 6.088, "step": 37000 }, { "epoch": 3.9946091644204853, - "grad_norm": 0.6266780495643616, + "grad_norm": 0.616155743598938, "learning_rate": 0.00036080949811117104, - "loss": 3.4672, + "loss": 3.4655, "step": 37050 }, { "epoch": 4.0, - "grad_norm": 1.3627803325653076, + "grad_norm": 1.3538962602615356, "learning_rate": 0.00036048569886670264, - "loss": 3.4304, + "loss": 3.4306, "step": 37100 }, { "epoch": 4.005390835579515, - "grad_norm": 0.6425639390945435, - "learning_rate": 0.00036016837560712354, - "loss": 3.3615, + "grad_norm": 0.6368972659111023, + "learning_rate": 0.0003601618996222342, + "loss": 3.3594, "step": 37150 }, { "epoch": 4.010781671159029, - "grad_norm": 0.594615638256073, - "learning_rate": 0.00035984457636265515, - "loss": 3.3569, + "grad_norm": 0.6396125555038452, + "learning_rate": 0.00035983810037776574, + "loss": 3.3545, "step": 37200 }, { "epoch": 4.0161725067385445, - "grad_norm": 0.671310544013977, - "learning_rate": 0.0003595207771181867, - "loss": 3.3522, + "grad_norm": 0.6328780055046082, + "learning_rate": 0.00035951430113329735, + "loss": 3.3507, "step": 37250 }, { "epoch": 4.02156334231806, - "grad_norm": 0.6261488795280457, - "learning_rate": 0.0003591969778737183, - "loss": 3.357, + "grad_norm": 0.5838497877120972, + "learning_rate": 0.0003591905018888289, + "loss": 3.355, "step": 37300 }, { "epoch": 4.026954177897574, - "grad_norm": 0.63107830286026, - "learning_rate": 0.00035887317862924985, - "loss": 3.3399, + "grad_norm": 0.5783930420875549, + "learning_rate": 0.0003588667026443605, + "loss": 3.336, "step": 37350 }, { "epoch": 4.032345013477089, - "grad_norm": 0.6028571128845215, - "learning_rate": 0.00035854937938478146, - "loss": 3.3433, + "grad_norm": 0.597440242767334, + "learning_rate": 0.000358542903399892, + "loss": 3.3412, "step": 37400 }, { "epoch": 4.037735849056604, - "grad_norm": 0.6417591571807861, - "learning_rate": 0.00035822558014031295, - "loss": 3.353, + "grad_norm": 0.6205726861953735, + "learning_rate": 0.0003582191041554236, + "loss": 3.3508, "step": 37450 }, { "epoch": 4.0431266846361185, - "grad_norm": 0.6593953967094421, - "learning_rate": 0.0003579017808958445, - "loss": 3.3535, + "grad_norm": 0.6443513631820679, + "learning_rate": 0.00035789530491095516, + "loss": 3.3509, "step": 37500 }, { "epoch": 4.048517520215634, - "grad_norm": 0.6569228172302246, - "learning_rate": 0.0003575779816513761, - "loss": 3.3623, + "grad_norm": 0.6111722588539124, + "learning_rate": 0.00035757150566648676, + "loss": 3.3604, "step": 37550 }, { "epoch": 4.053908355795148, - "grad_norm": 0.6970022320747375, - "learning_rate": 0.00035725418240690766, - "loss": 3.357, + "grad_norm": 0.6462409496307373, + "learning_rate": 0.0003572477064220183, + "loss": 3.3548, "step": 37600 }, { "epoch": 4.059299191374663, - "grad_norm": 0.652712345123291, - "learning_rate": 0.00035693038316243926, - "loss": 3.3645, + "grad_norm": 0.6218355298042297, + "learning_rate": 0.00035692390717754986, + "loss": 3.3616, "step": 37650 }, { "epoch": 4.064690026954178, - "grad_norm": 0.6411773562431335, - "learning_rate": 0.0003566065839179708, - "loss": 3.3643, + "grad_norm": 0.6028040647506714, + "learning_rate": 0.00035660010793308147, + "loss": 3.3614, "step": 37700 }, { "epoch": 4.070080862533692, - "grad_norm": 0.6141267418861389, - "learning_rate": 0.0003562827846735024, - "loss": 3.3716, + "grad_norm": 0.5973382592201233, + "learning_rate": 0.000356276308688613, + "loss": 3.3686, "step": 37750 }, { "epoch": 4.0754716981132075, - "grad_norm": 0.61728835105896, - "learning_rate": 0.00035595898542903397, - "loss": 3.362, + "grad_norm": 0.5949469804763794, + "learning_rate": 0.0003559525094441446, + "loss": 3.36, "step": 37800 }, { "epoch": 4.080862533692723, - "grad_norm": 0.6423006653785706, - "learning_rate": 0.0003556351861845656, - "loss": 3.368, + "grad_norm": 0.6174900531768799, + "learning_rate": 0.00035562871019967617, + "loss": 3.3676, "step": 37850 }, { "epoch": 4.086253369272237, - "grad_norm": 0.6371752619743347, - "learning_rate": 0.0003553113869400971, - "loss": 3.3778, + "grad_norm": 0.6042484641075134, + "learning_rate": 0.0003553049109552078, + "loss": 3.3756, "step": 37900 }, { "epoch": 4.091644204851752, - "grad_norm": 0.6404188871383667, - "learning_rate": 0.0003549875876956287, - "loss": 3.3784, + "grad_norm": 0.610124409198761, + "learning_rate": 0.00035498111171073933, + "loss": 3.3738, "step": 37950 }, { "epoch": 4.097035040431267, - "grad_norm": 0.6130540370941162, - "learning_rate": 0.0003546637884511603, - "loss": 3.3571, + "grad_norm": 0.6482214331626892, + "learning_rate": 0.0003546573124662708, + "loss": 3.3564, "step": 38000 }, { "epoch": 4.097035040431267, - "eval_accuracy": 0.37693935897621167, - "eval_loss": 3.4457244873046875, - "eval_runtime": 184.614, - "eval_samples_per_second": 97.56, - "eval_steps_per_second": 6.099, + "eval_accuracy": 0.37664436638579535, + "eval_loss": 3.4471352100372314, + "eval_runtime": 185.4167, + "eval_samples_per_second": 97.138, + "eval_steps_per_second": 6.073, "step": 38000 }, { "epoch": 4.1024258760107815, - "grad_norm": 0.6331332921981812, - "learning_rate": 0.0003543399892066918, - "loss": 3.3762, + "grad_norm": 0.674815833568573, + "learning_rate": 0.0003543335132218025, + "loss": 3.3728, "step": 38050 }, { "epoch": 4.107816711590297, - "grad_norm": 0.6457241177558899, - "learning_rate": 0.0003540161899622234, - "loss": 3.3599, + "grad_norm": 0.659717857837677, + "learning_rate": 0.000354009713977334, + "loss": 3.3575, "step": 38100 }, { "epoch": 4.113207547169812, - "grad_norm": 0.6348027586936951, - "learning_rate": 0.00035369239071775493, - "loss": 3.3749, + "grad_norm": 0.6627160310745239, + "learning_rate": 0.0003536859147328656, + "loss": 3.3714, "step": 38150 }, { "epoch": 4.118598382749326, - "grad_norm": 0.650617778301239, - "learning_rate": 0.00035336859147328654, - "loss": 3.3783, + "grad_norm": 0.6379284858703613, + "learning_rate": 0.00035336211548839713, + "loss": 3.3761, "step": 38200 }, { "epoch": 4.123989218328841, - "grad_norm": 0.6565114855766296, - "learning_rate": 0.0003530447922288181, - "loss": 3.366, + "grad_norm": 0.610173761844635, + "learning_rate": 0.00035303831624392874, + "loss": 3.3633, "step": 38250 }, { "epoch": 4.129380053908355, - "grad_norm": 0.6230000853538513, - "learning_rate": 0.0003527209929843497, - "loss": 3.3705, + "grad_norm": 0.6131238341331482, + "learning_rate": 0.0003527145169994603, + "loss": 3.3686, "step": 38300 }, { "epoch": 4.1347708894878705, - "grad_norm": 0.6349278092384338, - "learning_rate": 0.00035239719373988124, - "loss": 3.3536, + "grad_norm": 0.6268836259841919, + "learning_rate": 0.0003523907177549919, + "loss": 3.353, "step": 38350 }, { "epoch": 4.140161725067386, - "grad_norm": 0.6567692756652832, - "learning_rate": 0.0003520733944954128, - "loss": 3.3749, + "grad_norm": 0.6279870271682739, + "learning_rate": 0.00035206691851052345, + "loss": 3.3732, "step": 38400 }, { "epoch": 4.1455525606469, - "grad_norm": 0.6406522989273071, - "learning_rate": 0.0003517495952509444, - "loss": 3.3725, + "grad_norm": 0.624947726726532, + "learning_rate": 0.000351743119266055, + "loss": 3.37, "step": 38450 }, { "epoch": 4.150943396226415, - "grad_norm": 0.611667275428772, - "learning_rate": 0.00035142579600647595, - "loss": 3.3662, + "grad_norm": 0.6435061693191528, + "learning_rate": 0.0003514193200215866, + "loss": 3.3646, "step": 38500 }, { "epoch": 4.15633423180593, - "grad_norm": 0.608488917350769, - "learning_rate": 0.00035110199676200755, - "loss": 3.3649, + "grad_norm": 0.6154115796089172, + "learning_rate": 0.00035109552077711815, + "loss": 3.3634, "step": 38550 }, { "epoch": 4.1617250673854445, - "grad_norm": 0.6489611268043518, - "learning_rate": 0.0003507781975175391, - "loss": 3.3649, + "grad_norm": 0.6166911125183105, + "learning_rate": 0.00035077172153264976, + "loss": 3.3637, "step": 38600 }, { "epoch": 4.16711590296496, - "grad_norm": 0.7167544364929199, - "learning_rate": 0.0003504543982730707, - "loss": 3.3698, + "grad_norm": 0.6432587504386902, + "learning_rate": 0.0003504479222881813, + "loss": 3.3691, "step": 38650 }, { "epoch": 4.172506738544475, - "grad_norm": 0.6684973239898682, - "learning_rate": 0.00035013059902860226, - "loss": 3.3851, + "grad_norm": 0.6543565392494202, + "learning_rate": 0.0003501241230437129, + "loss": 3.3833, "step": 38700 }, { "epoch": 4.177897574123989, - "grad_norm": 0.6070584654808044, - "learning_rate": 0.00034980679978413375, - "loss": 3.3609, + "grad_norm": 0.6134627461433411, + "learning_rate": 0.0003498003237992444, + "loss": 3.3596, "step": 38750 }, { "epoch": 4.183288409703504, - "grad_norm": 0.6427111029624939, - "learning_rate": 0.00034948300053966536, - "loss": 3.3758, + "grad_norm": 0.5936211943626404, + "learning_rate": 0.000349476524554776, + "loss": 3.3745, "step": 38800 }, { "epoch": 4.188679245283019, - "grad_norm": 0.6557640433311462, - "learning_rate": 0.0003491592012951969, - "loss": 3.3859, + "grad_norm": 0.617832362651825, + "learning_rate": 0.00034915272531030756, + "loss": 3.384, "step": 38850 }, { "epoch": 4.1940700808625335, - "grad_norm": 0.6033419966697693, - "learning_rate": 0.0003488354020507285, - "loss": 3.4005, + "grad_norm": 0.5889009237289429, + "learning_rate": 0.0003488289260658391, + "loss": 3.3997, "step": 38900 }, { "epoch": 4.199460916442049, - "grad_norm": 0.6868317127227783, - "learning_rate": 0.00034851160280626007, - "loss": 3.3661, + "grad_norm": 0.6840178370475769, + "learning_rate": 0.0003485051268213707, + "loss": 3.3653, "step": 38950 }, { "epoch": 4.204851752021563, - "grad_norm": 0.651969313621521, - "learning_rate": 0.00034818780356179167, - "loss": 3.3826, + "grad_norm": 0.664037823677063, + "learning_rate": 0.00034818132757690227, + "loss": 3.3819, "step": 39000 }, { "epoch": 4.204851752021563, - "eval_accuracy": 0.37738494446913884, - "eval_loss": 3.4434092044830322, - "eval_runtime": 185.0308, - "eval_samples_per_second": 97.341, - "eval_steps_per_second": 6.085, + "eval_accuracy": 0.377053335855529, + "eval_loss": 3.4432272911071777, + "eval_runtime": 184.8506, + "eval_samples_per_second": 97.435, + "eval_steps_per_second": 6.091, "step": 39000 }, { "epoch": 4.210242587601078, - "grad_norm": 0.6915486454963684, - "learning_rate": 0.0003478640043173232, - "loss": 3.386, + "grad_norm": 0.6409573554992676, + "learning_rate": 0.0003478575283324339, + "loss": 3.3856, "step": 39050 }, { "epoch": 4.215633423180593, - "grad_norm": 0.639582097530365, - "learning_rate": 0.0003475402050728548, - "loss": 3.3905, + "grad_norm": 0.6263582706451416, + "learning_rate": 0.0003475337290879654, + "loss": 3.3883, "step": 39100 }, { "epoch": 4.2210242587601075, - "grad_norm": 0.6311129927635193, - "learning_rate": 0.0003472228818132757, - "loss": 3.3763, + "grad_norm": 0.6016018986701965, + "learning_rate": 0.00034720992984349703, + "loss": 3.3752, "step": 39150 }, { "epoch": 4.226415094339623, - "grad_norm": 0.7378162145614624, - "learning_rate": 0.00034689908256880733, - "loss": 3.3681, + "grad_norm": 0.70400071144104, + "learning_rate": 0.0003468861305990286, + "loss": 3.3671, "step": 39200 }, { "epoch": 4.231805929919138, - "grad_norm": 0.6093368530273438, - "learning_rate": 0.0003465752833243389, - "loss": 3.3713, + "grad_norm": 0.6123245358467102, + "learning_rate": 0.0003465623313545602, + "loss": 3.3699, "step": 39250 }, { "epoch": 4.237196765498652, - "grad_norm": 0.6382229328155518, - "learning_rate": 0.0003462514840798705, - "loss": 3.3736, + "grad_norm": 0.6317894458770752, + "learning_rate": 0.00034623853211009173, + "loss": 3.3728, "step": 39300 }, { "epoch": 4.242587601078167, - "grad_norm": 0.5930299758911133, - "learning_rate": 0.00034592768483540203, - "loss": 3.3852, + "grad_norm": 0.6047124862670898, + "learning_rate": 0.00034591473286562323, + "loss": 3.3843, "step": 39350 }, { "epoch": 4.247978436657682, - "grad_norm": 0.6670295000076294, - "learning_rate": 0.00034560388559093364, - "loss": 3.3791, + "grad_norm": 0.6490882039070129, + "learning_rate": 0.0003455909336211549, + "loss": 3.3801, "step": 39400 }, { "epoch": 4.2533692722371965, - "grad_norm": 0.6960169076919556, - "learning_rate": 0.00034528008634646514, - "loss": 3.3902, + "grad_norm": 0.6895270347595215, + "learning_rate": 0.0003452671343766864, + "loss": 3.3879, "step": 39450 }, { "epoch": 4.258760107816712, - "grad_norm": 0.6310935020446777, - "learning_rate": 0.00034495628710199674, - "loss": 3.3771, + "grad_norm": 0.6247091889381409, + "learning_rate": 0.000344943335132218, + "loss": 3.3764, "step": 39500 }, { "epoch": 4.264150943396227, - "grad_norm": 0.6121526956558228, - "learning_rate": 0.0003446324878575283, - "loss": 3.3639, + "grad_norm": 0.6222687363624573, + "learning_rate": 0.00034461953588774954, + "loss": 3.3631, "step": 39550 }, { "epoch": 4.269541778975741, - "grad_norm": 0.627255916595459, - "learning_rate": 0.00034430868861305984, - "loss": 3.3805, + "grad_norm": 0.6222398281097412, + "learning_rate": 0.00034429573664328115, + "loss": 3.3788, "step": 39600 }, { "epoch": 4.274932614555256, - "grad_norm": 0.6354991793632507, - "learning_rate": 0.00034398488936859145, - "loss": 3.3886, + "grad_norm": 0.6109077334403992, + "learning_rate": 0.0003439719373988127, + "loss": 3.3883, "step": 39650 }, { "epoch": 4.280323450134771, - "grad_norm": 0.6785920262336731, - "learning_rate": 0.000343661090124123, - "loss": 3.372, + "grad_norm": 0.6322901248931885, + "learning_rate": 0.00034364813815434425, + "loss": 3.3713, "step": 39700 }, { "epoch": 4.285714285714286, - "grad_norm": 0.6545200347900391, - "learning_rate": 0.0003433372908796546, - "loss": 3.395, + "grad_norm": 0.6394525170326233, + "learning_rate": 0.00034332433890987585, + "loss": 3.3939, "step": 39750 }, { "epoch": 4.291105121293801, - "grad_norm": 0.6170735359191895, - "learning_rate": 0.00034301349163518615, - "loss": 3.3826, + "grad_norm": 0.6161048412322998, + "learning_rate": 0.0003430005396654074, + "loss": 3.3808, "step": 39800 }, { "epoch": 4.296495956873315, - "grad_norm": 0.6551817059516907, - "learning_rate": 0.00034268969239071776, - "loss": 3.3834, + "grad_norm": 0.6096895337104797, + "learning_rate": 0.000342676740420939, + "loss": 3.3812, "step": 39850 }, { "epoch": 4.30188679245283, - "grad_norm": 0.6523256897926331, - "learning_rate": 0.0003423658931462493, - "loss": 3.384, + "grad_norm": 0.6321001648902893, + "learning_rate": 0.00034235294117647056, + "loss": 3.3828, "step": 39900 }, { "epoch": 4.307277628032345, - "grad_norm": 0.6886041164398193, - "learning_rate": 0.00034204209390178086, - "loss": 3.3928, + "grad_norm": 0.6452771425247192, + "learning_rate": 0.00034202914193200216, + "loss": 3.3883, "step": 39950 }, { "epoch": 4.3126684636118595, - "grad_norm": 0.6848386526107788, - "learning_rate": 0.00034171829465731246, - "loss": 3.3971, + "grad_norm": 0.6669614911079407, + "learning_rate": 0.0003417053426875337, + "loss": 3.3957, "step": 40000 }, { "epoch": 4.3126684636118595, - "eval_accuracy": 0.3776180049135009, - "eval_loss": 3.4378561973571777, - "eval_runtime": 185.0073, - "eval_samples_per_second": 97.353, - "eval_steps_per_second": 6.086, + "eval_accuracy": 0.37731149511697626, + "eval_loss": 3.437464952468872, + "eval_runtime": 185.0409, + "eval_samples_per_second": 97.335, + "eval_steps_per_second": 6.085, "step": 40000 }, { "epoch": 4.318059299191375, - "grad_norm": 0.6221829056739807, - "learning_rate": 0.00034139449541284396, - "loss": 3.3875, + "grad_norm": 0.6292645335197449, + "learning_rate": 0.0003413815434430653, + "loss": 3.3865, "step": 40050 }, { "epoch": 4.32345013477089, - "grad_norm": 0.6784347891807556, - "learning_rate": 0.00034107069616837556, - "loss": 3.3911, + "grad_norm": 0.6619394421577454, + "learning_rate": 0.0003410577441985968, + "loss": 3.3892, "step": 40100 }, { "epoch": 4.328840970350404, - "grad_norm": 0.6379362344741821, - "learning_rate": 0.0003407468969239071, - "loss": 3.372, + "grad_norm": 0.608463704586029, + "learning_rate": 0.00034073394495412837, + "loss": 3.3712, "step": 40150 }, { "epoch": 4.334231805929919, - "grad_norm": 0.6500546932220459, - "learning_rate": 0.0003404230976794387, - "loss": 3.4049, + "grad_norm": 0.6409887671470642, + "learning_rate": 0.00034041014570965997, + "loss": 3.4038, "step": 40200 }, { "epoch": 4.339622641509434, - "grad_norm": 0.6693824529647827, - "learning_rate": 0.00034009929843497027, - "loss": 3.4065, + "grad_norm": 0.6474012136459351, + "learning_rate": 0.0003400863464651915, + "loss": 3.4056, "step": 40250 }, { "epoch": 4.345013477088949, - "grad_norm": 0.6409181952476501, - "learning_rate": 0.0003397754991905019, - "loss": 3.3808, + "grad_norm": 0.6828320622444153, + "learning_rate": 0.0003397625472207231, + "loss": 3.3789, "step": 40300 }, { "epoch": 4.350404312668464, - "grad_norm": 0.6256203651428223, - "learning_rate": 0.0003394516999460334, - "loss": 3.392, + "grad_norm": 0.659954845905304, + "learning_rate": 0.0003394387479762547, + "loss": 3.3883, "step": 40350 }, { "epoch": 4.355795148247978, - "grad_norm": 0.6128562688827515, - "learning_rate": 0.000339127900701565, - "loss": 3.3865, + "grad_norm": 0.6052781939506531, + "learning_rate": 0.0003391149487317863, + "loss": 3.3847, "step": 40400 }, { "epoch": 4.361185983827493, - "grad_norm": 0.638848066329956, - "learning_rate": 0.0003388041014570966, - "loss": 3.3884, + "grad_norm": 0.6608632802963257, + "learning_rate": 0.00033879114948731783, + "loss": 3.3881, "step": 40450 }, { "epoch": 4.366576819407008, - "grad_norm": 0.6677626371383667, - "learning_rate": 0.00033848030221262813, - "loss": 3.3821, + "grad_norm": 0.6373227834701538, + "learning_rate": 0.00033846735024284944, + "loss": 3.3809, "step": 40500 }, { "epoch": 4.3719676549865225, - "grad_norm": 0.6529048085212708, - "learning_rate": 0.00033815650296815974, - "loss": 3.388, + "grad_norm": 0.6604944467544556, + "learning_rate": 0.000338143550998381, + "loss": 3.3869, "step": 40550 }, { "epoch": 4.377358490566038, - "grad_norm": 0.6577892303466797, - "learning_rate": 0.0003378327037236913, - "loss": 3.3945, + "grad_norm": 0.619690477848053, + "learning_rate": 0.00033781975175391254, + "loss": 3.3931, "step": 40600 }, { "epoch": 4.382749326145553, - "grad_norm": 0.6787473559379578, - "learning_rate": 0.0003375089044792229, - "loss": 3.3849, + "grad_norm": 0.6604430675506592, + "learning_rate": 0.00033749595250944414, + "loss": 3.3844, "step": 40650 }, { "epoch": 4.388140161725067, - "grad_norm": 0.6604834794998169, - "learning_rate": 0.00033718510523475444, - "loss": 3.3991, + "grad_norm": 0.6418233513832092, + "learning_rate": 0.00033717215326497564, + "loss": 3.397, "step": 40700 }, { "epoch": 4.393530997304582, - "grad_norm": 0.6834147572517395, - "learning_rate": 0.00033686130599028605, - "loss": 3.4014, + "grad_norm": 0.6358850002288818, + "learning_rate": 0.0003368483540205073, + "loss": 3.3994, "step": 40750 }, { "epoch": 4.398921832884097, - "grad_norm": 0.6034276485443115, - "learning_rate": 0.00033653750674581754, - "loss": 3.389, + "grad_norm": 0.6532427072525024, + "learning_rate": 0.0003365245547760388, + "loss": 3.3877, "step": 40800 }, { "epoch": 4.404312668463612, - "grad_norm": 0.6377930045127869, - "learning_rate": 0.0003362137075013491, - "loss": 3.3799, + "grad_norm": 0.6354030966758728, + "learning_rate": 0.0003362007555315704, + "loss": 3.3772, "step": 40850 }, { "epoch": 4.409703504043127, - "grad_norm": 0.663975179195404, - "learning_rate": 0.0003358899082568807, - "loss": 3.3802, + "grad_norm": 0.6793525815010071, + "learning_rate": 0.00033587695628710195, + "loss": 3.3799, "step": 40900 }, { "epoch": 4.415094339622642, - "grad_norm": 0.9682093858718872, - "learning_rate": 0.00033556610901241225, - "loss": 3.377, + "grad_norm": 0.9723051190376282, + "learning_rate": 0.00033555315704263355, + "loss": 3.3773, "step": 40950 }, { "epoch": 4.420485175202156, - "grad_norm": 0.6749927997589111, - "learning_rate": 0.00033524230976794385, - "loss": 3.411, + "grad_norm": 0.6340115666389465, + "learning_rate": 0.0003352293577981651, + "loss": 3.4119, "step": 41000 }, { "epoch": 4.420485175202156, - "eval_accuracy": 0.3784913568257537, - "eval_loss": 3.4337351322174072, - "eval_runtime": 184.8697, - "eval_samples_per_second": 97.425, - "eval_steps_per_second": 6.091, + "eval_accuracy": 0.3779828613107819, + "eval_loss": 3.4332971572875977, + "eval_runtime": 185.1533, + "eval_samples_per_second": 97.276, + "eval_steps_per_second": 6.081, "step": 41000 }, { "epoch": 4.425876010781671, - "grad_norm": 0.628652036190033, - "learning_rate": 0.0003349185105234754, - "loss": 3.3868, + "grad_norm": 0.6561840176582336, + "learning_rate": 0.00033490555855369665, + "loss": 3.3857, "step": 41050 }, { "epoch": 4.431266846361186, - "grad_norm": 0.6534876823425293, - "learning_rate": 0.000334594711279007, - "loss": 3.3965, + "grad_norm": 0.6575011610984802, + "learning_rate": 0.00033458175930922826, + "loss": 3.3953, "step": 41100 }, { "epoch": 4.436657681940701, - "grad_norm": 0.6127163171768188, - "learning_rate": 0.00033427091203453856, - "loss": 3.4026, + "grad_norm": 0.6666945815086365, + "learning_rate": 0.0003342579600647598, + "loss": 3.4005, "step": 41150 }, { "epoch": 4.442048517520216, - "grad_norm": 0.6120777726173401, - "learning_rate": 0.0003339535887749595, - "loss": 3.3872, + "grad_norm": 0.5999481678009033, + "learning_rate": 0.0003339341608202914, + "loss": 3.3865, "step": 41200 }, { "epoch": 4.44743935309973, - "grad_norm": 0.6257559061050415, - "learning_rate": 0.00033362978953049106, - "loss": 3.4045, + "grad_norm": 0.6154288649559021, + "learning_rate": 0.00033361036157582297, + "loss": 3.4036, "step": 41250 }, { "epoch": 4.452830188679245, - "grad_norm": 0.6158122420310974, - "learning_rate": 0.00033330599028602267, - "loss": 3.3767, + "grad_norm": 0.6239913702011108, + "learning_rate": 0.00033328656233135457, + "loss": 3.3756, "step": 41300 }, { "epoch": 4.45822102425876, - "grad_norm": 0.6703405976295471, - "learning_rate": 0.0003329821910415542, - "loss": 3.382, + "grad_norm": 0.6489045023918152, + "learning_rate": 0.0003329627630868861, + "loss": 3.3797, "step": 41350 }, { "epoch": 4.463611859838275, - "grad_norm": 0.6126678586006165, - "learning_rate": 0.0003326583917970858, - "loss": 3.4067, + "grad_norm": 0.6149885058403015, + "learning_rate": 0.0003326389638424177, + "loss": 3.4028, "step": 41400 }, { "epoch": 4.46900269541779, - "grad_norm": 0.6338332295417786, - "learning_rate": 0.0003323345925526173, - "loss": 3.3812, + "grad_norm": 0.6494463682174683, + "learning_rate": 0.0003323151645979492, + "loss": 3.3799, "step": 41450 }, { "epoch": 4.474393530997305, - "grad_norm": 1.067539095878601, - "learning_rate": 0.0003320107933081489, - "loss": 3.3943, + "grad_norm": 0.8456867933273315, + "learning_rate": 0.00033199136535348077, + "loss": 3.3919, "step": 41500 }, { "epoch": 4.479784366576819, - "grad_norm": 0.6305244565010071, - "learning_rate": 0.0003316869940636805, - "loss": 3.3855, + "grad_norm": 0.6378749012947083, + "learning_rate": 0.0003316675661090124, + "loss": 3.3854, "step": 41550 }, { "epoch": 4.485175202156334, - "grad_norm": 0.6363754868507385, - "learning_rate": 0.000331363194819212, - "loss": 3.3964, + "grad_norm": 0.6068414449691772, + "learning_rate": 0.00033134376686454393, + "loss": 3.3958, "step": 41600 }, { "epoch": 4.490566037735849, - "grad_norm": 0.6011024117469788, - "learning_rate": 0.00033103939557474363, - "loss": 3.3798, + "grad_norm": 0.6004335284233093, + "learning_rate": 0.00033101996762007553, + "loss": 3.3789, "step": 41650 }, { "epoch": 4.495956873315364, - "grad_norm": 0.6291077136993408, - "learning_rate": 0.0003307155963302752, - "loss": 3.4022, + "grad_norm": 0.6522519588470459, + "learning_rate": 0.0003306961683756071, + "loss": 3.4029, "step": 41700 }, { "epoch": 4.501347708894879, - "grad_norm": 0.6409448385238647, - "learning_rate": 0.0003303917970858068, - "loss": 3.3858, + "grad_norm": 0.645179271697998, + "learning_rate": 0.0003303723691311387, + "loss": 3.3842, "step": 41750 }, { "epoch": 4.506738544474393, - "grad_norm": 0.6451292037963867, - "learning_rate": 0.00033006799784133833, - "loss": 3.391, + "grad_norm": 0.6851235032081604, + "learning_rate": 0.00033004856988667024, + "loss": 3.3892, "step": 41800 }, { "epoch": 4.512129380053908, - "grad_norm": 0.6577849388122559, - "learning_rate": 0.00032974419859686994, - "loss": 3.3801, + "grad_norm": 0.6213834881782532, + "learning_rate": 0.0003297247706422018, + "loss": 3.3784, "step": 41850 }, { "epoch": 4.517520215633423, - "grad_norm": 0.6435655355453491, - "learning_rate": 0.0003294203993524015, - "loss": 3.3903, + "grad_norm": 0.6451056599617004, + "learning_rate": 0.00032940744738262274, + "loss": 3.3886, "step": 41900 }, { "epoch": 4.5229110512129385, - "grad_norm": 0.6412240266799927, - "learning_rate": 0.0003290966001079331, - "loss": 3.3722, + "grad_norm": 0.6113904714584351, + "learning_rate": 0.00032908364813815435, + "loss": 3.3711, "step": 41950 }, { "epoch": 4.528301886792453, - "grad_norm": 0.622590184211731, - "learning_rate": 0.00032877280086346465, - "loss": 3.386, + "grad_norm": 0.605117917060852, + "learning_rate": 0.0003287598488936859, + "loss": 3.3842, "step": 42000 }, { "epoch": 4.528301886792453, - "eval_accuracy": 0.37879612817607333, - "eval_loss": 3.4283080101013184, - "eval_runtime": 185.1161, - "eval_samples_per_second": 97.296, - "eval_steps_per_second": 6.083, + "eval_accuracy": 0.37882416062112945, + "eval_loss": 3.427164316177368, + "eval_runtime": 184.9685, + "eval_samples_per_second": 97.373, + "eval_steps_per_second": 6.088, "step": 42000 }, { "epoch": 4.533692722371968, - "grad_norm": 0.6410642266273499, - "learning_rate": 0.00032844900161899614, - "loss": 3.3828, + "grad_norm": 0.6477672457695007, + "learning_rate": 0.0003284360496492175, + "loss": 3.3816, "step": 42050 }, { "epoch": 4.539083557951482, - "grad_norm": 0.6771649718284607, - "learning_rate": 0.0003281252023745278, - "loss": 3.3897, + "grad_norm": 0.6513396501541138, + "learning_rate": 0.000328112250404749, + "loss": 3.3891, "step": 42100 }, { "epoch": 4.544474393530997, - "grad_norm": 0.6513844728469849, - "learning_rate": 0.0003278014031300593, - "loss": 3.4196, + "grad_norm": 0.6318349242210388, + "learning_rate": 0.00032778845116028066, + "loss": 3.4179, "step": 42150 }, { "epoch": 4.549865229110512, - "grad_norm": 0.6491783857345581, - "learning_rate": 0.0003274776038855909, - "loss": 3.387, + "grad_norm": 0.6026633977890015, + "learning_rate": 0.00032746465191581215, + "loss": 3.3851, "step": 42200 }, { "epoch": 4.555256064690027, - "grad_norm": 0.643081545829773, - "learning_rate": 0.00032715380464112245, - "loss": 3.3852, + "grad_norm": 0.6635650396347046, + "learning_rate": 0.0003271408526713437, + "loss": 3.3838, "step": 42250 }, { "epoch": 4.560646900269542, - "grad_norm": 0.6979756951332092, - "learning_rate": 0.00032683000539665406, - "loss": 3.3863, + "grad_norm": 0.6454138159751892, + "learning_rate": 0.0003268170534268753, + "loss": 3.3865, "step": 42300 }, { "epoch": 4.566037735849057, - "grad_norm": 0.6504450440406799, - "learning_rate": 0.0003265062061521856, - "loss": 3.3895, + "grad_norm": 0.6208834052085876, + "learning_rate": 0.00032649325418240686, + "loss": 3.3868, "step": 42350 }, { "epoch": 4.571428571428571, - "grad_norm": 0.6485116481781006, - "learning_rate": 0.0003261824069077172, - "loss": 3.378, + "grad_norm": 0.6189976930618286, + "learning_rate": 0.00032616945493793846, + "loss": 3.3757, "step": 42400 }, { "epoch": 4.576819407008086, - "grad_norm": 0.686379075050354, - "learning_rate": 0.00032585860766324876, - "loss": 3.3909, + "grad_norm": 0.6465439796447754, + "learning_rate": 0.00032584565569347, + "loss": 3.3898, "step": 42450 }, { "epoch": 4.5822102425876015, - "grad_norm": 0.624584436416626, - "learning_rate": 0.0003255348084187803, - "loss": 3.3864, + "grad_norm": 0.6091396808624268, + "learning_rate": 0.0003255218564490016, + "loss": 3.3861, "step": 42500 }, { "epoch": 4.587601078167116, - "grad_norm": 0.7264350652694702, - "learning_rate": 0.0003252110091743119, - "loss": 3.3879, + "grad_norm": 0.6565108895301819, + "learning_rate": 0.00032519805720453317, + "loss": 3.3858, "step": 42550 }, { "epoch": 4.592991913746631, - "grad_norm": 0.6314030289649963, - "learning_rate": 0.00032488720992984347, - "loss": 3.3723, + "grad_norm": 0.6114659309387207, + "learning_rate": 0.0003248742579600647, + "loss": 3.3709, "step": 42600 }, { "epoch": 4.598382749326145, - "grad_norm": 0.7158349752426147, - "learning_rate": 0.0003245634106853751, - "loss": 3.3733, + "grad_norm": 0.6839783787727356, + "learning_rate": 0.0003245504587155963, + "loss": 3.3721, "step": 42650 }, { "epoch": 4.60377358490566, - "grad_norm": 0.6618641018867493, - "learning_rate": 0.0003242396114409066, - "loss": 3.3827, + "grad_norm": 0.664556086063385, + "learning_rate": 0.0003242266594711278, + "loss": 3.3804, "step": 42700 }, { "epoch": 4.609164420485175, - "grad_norm": 0.643898606300354, - "learning_rate": 0.00032391581219643823, - "loss": 3.3991, + "grad_norm": 0.6460453271865845, + "learning_rate": 0.0003239093362115488, + "loss": 3.3993, "step": 42750 }, { "epoch": 4.6145552560646905, - "grad_norm": 0.6714193820953369, - "learning_rate": 0.0003235920129519697, - "loss": 3.3718, + "grad_norm": 0.6806046366691589, + "learning_rate": 0.00032358553696708043, + "loss": 3.371, "step": 42800 }, { "epoch": 4.619946091644205, - "grad_norm": 0.650656521320343, - "learning_rate": 0.0003232682137075013, - "loss": 3.3804, + "grad_norm": 0.6139158606529236, + "learning_rate": 0.00032326173772261193, + "loss": 3.3778, "step": 42850 }, { "epoch": 4.62533692722372, - "grad_norm": 0.6801531314849854, - "learning_rate": 0.0003229444144630329, - "loss": 3.3771, + "grad_norm": 0.6241970062255859, + "learning_rate": 0.00032293793847814353, + "loss": 3.377, "step": 42900 }, { "epoch": 4.630727762803234, - "grad_norm": 0.6805683374404907, - "learning_rate": 0.00032262061521856443, - "loss": 3.4009, + "grad_norm": 0.678596019744873, + "learning_rate": 0.0003226141392336751, + "loss": 3.3996, "step": 42950 }, { "epoch": 4.636118598382749, - "grad_norm": 0.6824569702148438, - "learning_rate": 0.00032229681597409604, - "loss": 3.4062, + "grad_norm": 0.6589379906654358, + "learning_rate": 0.00032229033998920663, + "loss": 3.4039, "step": 43000 }, { "epoch": 4.636118598382749, - "eval_accuracy": 0.3791326261696348, - "eval_loss": 3.4244093894958496, - "eval_runtime": 184.8967, - "eval_samples_per_second": 97.411, - "eval_steps_per_second": 6.09, + "eval_accuracy": 0.3791835843740197, + "eval_loss": 3.422538995742798, + "eval_runtime": 185.1391, + "eval_samples_per_second": 97.284, + "eval_steps_per_second": 6.082, "step": 43000 }, { "epoch": 4.6415094339622645, - "grad_norm": 0.6415202021598816, - "learning_rate": 0.0003219730167296276, - "loss": 3.3891, + "grad_norm": 0.6241201758384705, + "learning_rate": 0.00032196654074473824, + "loss": 3.3885, "step": 43050 }, { "epoch": 4.646900269541779, - "grad_norm": 0.6398457884788513, - "learning_rate": 0.0003216492174851592, - "loss": 3.3942, + "grad_norm": 0.6069834232330322, + "learning_rate": 0.0003216427415002698, + "loss": 3.3944, "step": 43100 }, { "epoch": 4.652291105121294, - "grad_norm": 0.7170093059539795, - "learning_rate": 0.00032132541824069074, - "loss": 3.409, + "grad_norm": 0.6450423002243042, + "learning_rate": 0.0003213189422558014, + "loss": 3.4062, "step": 43150 }, { "epoch": 4.657681940700809, - "grad_norm": 0.6552037596702576, - "learning_rate": 0.0003210080949811117, - "loss": 3.3803, + "grad_norm": 0.6572805643081665, + "learning_rate": 0.00032099514301133295, + "loss": 3.3788, "step": 43200 }, { "epoch": 4.663072776280323, - "grad_norm": 0.6536373496055603, - "learning_rate": 0.00032068429573664324, - "loss": 3.3793, + "grad_norm": 0.6346420645713806, + "learning_rate": 0.00032067134376686455, + "loss": 3.3781, "step": 43250 }, { "epoch": 4.668463611859838, - "grad_norm": 0.6051586866378784, - "learning_rate": 0.00032036049649217485, - "loss": 3.3903, + "grad_norm": 0.6331166625022888, + "learning_rate": 0.0003203475445223961, + "loss": 3.3912, "step": 43300 }, { "epoch": 4.6738544474393535, - "grad_norm": 0.665641725063324, - "learning_rate": 0.0003200366972477064, + "grad_norm": 0.6622107028961182, + "learning_rate": 0.0003200237452779277, "loss": 3.3884, "step": 43350 }, { "epoch": 4.679245283018868, - "grad_norm": 0.6546832323074341, - "learning_rate": 0.000319712898003238, - "loss": 3.4008, + "grad_norm": 0.6595224738121033, + "learning_rate": 0.00031969994603345926, + "loss": 3.4004, "step": 43400 }, { "epoch": 4.684636118598383, - "grad_norm": 0.6022357940673828, - "learning_rate": 0.0003193890987587695, - "loss": 3.3881, + "grad_norm": 0.6058201789855957, + "learning_rate": 0.00031937614678899075, + "loss": 3.3886, "step": 43450 }, { "epoch": 4.690026954177897, - "grad_norm": 0.6649525165557861, - "learning_rate": 0.0003190652995143011, + "grad_norm": 0.6582033038139343, + "learning_rate": 0.00031905234754452236, "loss": 3.4009, "step": 43500 }, { "epoch": 4.695417789757412, - "grad_norm": 0.6134843826293945, - "learning_rate": 0.00031874150026983266, - "loss": 3.3842, + "grad_norm": 0.6451115608215332, + "learning_rate": 0.0003187285483000539, + "loss": 3.384, "step": 43550 }, { "epoch": 4.7008086253369274, - "grad_norm": 0.7057274580001831, - "learning_rate": 0.0003184177010253642, - "loss": 3.3762, + "grad_norm": 0.6477821469306946, + "learning_rate": 0.0003184047490555855, + "loss": 3.3735, "step": 43600 }, { "epoch": 4.706199460916442, - "grad_norm": 0.659426748752594, - "learning_rate": 0.0003180939017808958, - "loss": 3.3923, + "grad_norm": 0.6032214164733887, + "learning_rate": 0.00031808094981111706, + "loss": 3.3908, "step": 43650 }, { "epoch": 4.711590296495957, - "grad_norm": 0.6758320331573486, - "learning_rate": 0.00031777010253642736, - "loss": 3.3784, + "grad_norm": 0.6578019261360168, + "learning_rate": 0.00031775715056664867, + "loss": 3.3762, "step": 43700 }, { "epoch": 4.716981132075472, - "grad_norm": 0.6479880213737488, - "learning_rate": 0.00031744630329195897, - "loss": 3.3966, + "grad_norm": 0.6483471393585205, + "learning_rate": 0.0003174333513221802, + "loss": 3.396, "step": 43750 }, { "epoch": 4.722371967654986, - "grad_norm": 0.6481097936630249, - "learning_rate": 0.0003171225040474905, - "loss": 3.3905, + "grad_norm": 0.6229943633079529, + "learning_rate": 0.00031710955207771177, + "loss": 3.3894, "step": 43800 }, { "epoch": 4.727762803234501, - "grad_norm": 0.8242167830467224, - "learning_rate": 0.0003167987048030221, - "loss": 3.3923, + "grad_norm": 0.6678479909896851, + "learning_rate": 0.0003167857528332434, + "loss": 3.3916, "step": 43850 }, { "epoch": 4.7331536388140165, - "grad_norm": 0.6138800978660583, - "learning_rate": 0.00031647490555855367, - "loss": 3.396, + "grad_norm": 0.5916961431503296, + "learning_rate": 0.0003164619535887749, + "loss": 3.3943, "step": 43900 }, { "epoch": 4.738544474393531, - "grad_norm": 0.6652048230171204, - "learning_rate": 0.0003161511063140853, - "loss": 3.3855, + "grad_norm": 0.6245575547218323, + "learning_rate": 0.00031613815434430653, + "loss": 3.3859, "step": 43950 }, { "epoch": 4.743935309973046, - "grad_norm": 0.6460886597633362, - "learning_rate": 0.00031582730706961683, - "loss": 3.3905, + "grad_norm": 0.6339305639266968, + "learning_rate": 0.0003158143550998381, + "loss": 3.3886, "step": 44000 }, { "epoch": 4.743935309973046, - "eval_accuracy": 0.3798925444670092, - "eval_loss": 3.4189040660858154, - "eval_runtime": 185.0852, - "eval_samples_per_second": 97.312, - "eval_steps_per_second": 6.084, + "eval_accuracy": 0.37981963837928573, + "eval_loss": 3.416712760925293, + "eval_runtime": 185.1327, + "eval_samples_per_second": 97.287, + "eval_steps_per_second": 6.082, "step": 44000 }, { "epoch": 4.74932614555256, - "grad_norm": 0.6555764675140381, - "learning_rate": 0.0003155035078251483, - "loss": 3.397, + "grad_norm": 0.64893639087677, + "learning_rate": 0.0003154905558553697, + "loss": 3.3989, "step": 44050 }, { "epoch": 4.754716981132075, - "grad_norm": 0.6533858776092529, - "learning_rate": 0.00031517970858068, - "loss": 3.3928, + "grad_norm": 0.6587449312210083, + "learning_rate": 0.0003151667566109012, + "loss": 3.3939, "step": 44100 }, { "epoch": 4.7601078167115904, - "grad_norm": 0.6652816534042358, - "learning_rate": 0.0003148559093362115, - "loss": 3.4021, + "grad_norm": 0.6603413820266724, + "learning_rate": 0.00031484295736643284, + "loss": 3.4024, "step": 44150 }, { "epoch": 4.765498652291106, - "grad_norm": 0.6321492195129395, - "learning_rate": 0.00031453858607663243, - "loss": 3.4016, + "grad_norm": 0.6770880818367004, + "learning_rate": 0.00031451915812196434, + "loss": 3.4005, "step": 44200 }, { "epoch": 4.77088948787062, - "grad_norm": 0.632645308971405, - "learning_rate": 0.00031421478683216404, - "loss": 3.3598, + "grad_norm": 0.5969046950340271, + "learning_rate": 0.0003141953588774959, + "loss": 3.3597, "step": 44250 }, { "epoch": 4.776280323450135, - "grad_norm": 0.6430423855781555, - "learning_rate": 0.0003138909875876956, - "loss": 3.387, + "grad_norm": 0.6240620017051697, + "learning_rate": 0.0003138715596330275, + "loss": 3.3875, "step": 44300 }, { "epoch": 4.781671159029649, - "grad_norm": 0.6638597249984741, - "learning_rate": 0.0003135671883432272, - "loss": 3.3917, + "grad_norm": 0.6881016492843628, + "learning_rate": 0.00031354776038855904, + "loss": 3.3925, "step": 44350 }, { "epoch": 4.787061994609164, - "grad_norm": 0.6225000023841858, - "learning_rate": 0.00031324338909875874, - "loss": 3.3763, + "grad_norm": 0.602745532989502, + "learning_rate": 0.00031322396114409065, + "loss": 3.3754, "step": 44400 }, { "epoch": 4.7924528301886795, - "grad_norm": 0.6393536329269409, - "learning_rate": 0.0003129195898542903, - "loss": 3.3982, + "grad_norm": 0.6551032066345215, + "learning_rate": 0.0003129001618996222, + "loss": 3.3987, "step": 44450 }, { "epoch": 4.797843665768194, - "grad_norm": 0.7148990035057068, - "learning_rate": 0.0003125957906098219, - "loss": 3.3954, + "grad_norm": 0.715237557888031, + "learning_rate": 0.0003125763626551538, + "loss": 3.3968, "step": 44500 }, { "epoch": 4.803234501347709, - "grad_norm": 0.6571036577224731, - "learning_rate": 0.00031227199136535345, - "loss": 3.3703, + "grad_norm": 0.6164499521255493, + "learning_rate": 0.00031225256341068535, + "loss": 3.3697, "step": 44550 }, { "epoch": 4.808625336927224, - "grad_norm": 0.6597515940666199, - "learning_rate": 0.00031194819212088505, - "loss": 3.3852, + "grad_norm": 0.6373364925384521, + "learning_rate": 0.00031192876416621696, + "loss": 3.3842, "step": 44600 }, { "epoch": 4.814016172506738, - "grad_norm": 0.6399565935134888, - "learning_rate": 0.0003116243928764166, - "loss": 3.3852, + "grad_norm": 0.6444586515426636, + "learning_rate": 0.0003116049649217485, + "loss": 3.3847, "step": 44650 }, { "epoch": 4.819407008086253, - "grad_norm": 0.6244239807128906, - "learning_rate": 0.0003113005936319482, - "loss": 3.3862, + "grad_norm": 0.6503430604934692, + "learning_rate": 0.00031128116567728, + "loss": 3.3859, "step": 44700 }, { "epoch": 4.824797843665769, - "grad_norm": 0.6903212070465088, - "learning_rate": 0.00031097679438747976, - "loss": 3.4, + "grad_norm": 0.6720568537712097, + "learning_rate": 0.00031095736643281166, + "loss": 3.4001, "step": 44750 }, { "epoch": 4.830188679245283, - "grad_norm": 0.722104549407959, - "learning_rate": 0.00031065299514301126, - "loss": 3.3689, + "grad_norm": 0.7140028476715088, + "learning_rate": 0.00031063356718834316, + "loss": 3.3695, "step": 44800 }, { "epoch": 4.835579514824798, - "grad_norm": 0.6965301036834717, - "learning_rate": 0.00031032919589854286, - "loss": 3.385, + "grad_norm": 0.6931409239768982, + "learning_rate": 0.00031030976794387476, + "loss": 3.3834, "step": 44850 }, { "epoch": 4.840970350404312, - "grad_norm": 0.6514486074447632, - "learning_rate": 0.0003100053966540744, - "loss": 3.362, + "grad_norm": 0.6551147699356079, + "learning_rate": 0.0003099859686994063, + "loss": 3.3616, "step": 44900 }, { "epoch": 4.846361185983827, - "grad_norm": 0.6319965720176697, - "learning_rate": 0.000309681597409606, - "loss": 3.4001, + "grad_norm": 0.6414363384246826, + "learning_rate": 0.0003096621694549379, + "loss": 3.3987, "step": 44950 }, { "epoch": 4.8517520215633425, - "grad_norm": 0.6779524683952332, - "learning_rate": 0.00030935779816513757, - "loss": 3.3846, + "grad_norm": 0.658860981464386, + "learning_rate": 0.00030933837021046947, + "loss": 3.3862, "step": 45000 }, { "epoch": 4.8517520215633425, - "eval_accuracy": 0.3805246869683139, - "eval_loss": 3.41042423248291, - "eval_runtime": 184.8977, - "eval_samples_per_second": 97.411, - "eval_steps_per_second": 6.09, + "eval_accuracy": 0.3804288551212615, + "eval_loss": 3.4101946353912354, + "eval_runtime": 185.1294, + "eval_samples_per_second": 97.289, + "eval_steps_per_second": 6.082, "step": 45000 }, { "epoch": 4.857142857142857, - "grad_norm": 0.6845847964286804, - "learning_rate": 0.00030903399892066917, + "grad_norm": 0.673330545425415, + "learning_rate": 0.0003090145709660011, "loss": 3.3955, "step": 45050 }, { "epoch": 4.862533692722372, - "grad_norm": 0.6388900279998779, - "learning_rate": 0.0003087101996762007, - "loss": 3.4001, + "grad_norm": 0.6617267727851868, + "learning_rate": 0.0003086907717215326, + "loss": 3.3992, "step": 45100 }, { "epoch": 4.867924528301887, - "grad_norm": 0.6666938066482544, - "learning_rate": 0.0003083864004317323, - "loss": 3.3707, + "grad_norm": 0.6610994338989258, + "learning_rate": 0.0003083669724770642, + "loss": 3.3697, "step": 45150 }, { "epoch": 4.873315363881401, - "grad_norm": 0.6598957180976868, - "learning_rate": 0.0003080626011872639, - "loss": 3.3805, + "grad_norm": 0.60479336977005, + "learning_rate": 0.0003080431732325958, + "loss": 3.3779, "step": 45200 }, { "epoch": 4.878706199460916, - "grad_norm": 0.6968781352043152, - "learning_rate": 0.00030773880194279543, - "loss": 3.3946, + "grad_norm": 0.7236313223838806, + "learning_rate": 0.00030771937398812733, + "loss": 3.3945, "step": 45250 }, { "epoch": 4.884097035040432, - "grad_norm": 0.6397393345832825, - "learning_rate": 0.00030741500269832703, + "grad_norm": 0.6632745862007141, + "learning_rate": 0.00030739557474365894, "loss": 3.4061, "step": 45300 }, { "epoch": 4.889487870619946, - "grad_norm": 0.6977748870849609, - "learning_rate": 0.0003070912034538586, - "loss": 3.3841, + "grad_norm": 0.6762586236000061, + "learning_rate": 0.0003070717754991905, + "loss": 3.3828, "step": 45350 }, { "epoch": 4.894878706199461, - "grad_norm": 0.6801537275314331, - "learning_rate": 0.0003067674042093902, - "loss": 3.3938, + "grad_norm": 0.6466242074966431, + "learning_rate": 0.0003067479762547221, + "loss": 3.3951, "step": 45400 }, { "epoch": 4.900269541778976, - "grad_norm": 0.6654176115989685, - "learning_rate": 0.0003064436049649217, - "loss": 3.392, + "grad_norm": 0.6276098489761353, + "learning_rate": 0.0003064241770102536, + "loss": 3.3896, "step": 45450 }, { "epoch": 4.90566037735849, - "grad_norm": 0.6970499157905579, - "learning_rate": 0.0003061198057204533, - "loss": 3.3884, + "grad_norm": 0.6750917434692383, + "learning_rate": 0.00030610037776578514, + "loss": 3.3901, "step": 45500 }, { "epoch": 4.9110512129380055, - "grad_norm": 0.6450934410095215, - "learning_rate": 0.00030579600647598484, - "loss": 3.3903, + "grad_norm": 0.6179811954498291, + "learning_rate": 0.00030577657852131674, + "loss": 3.3895, "step": 45550 }, { "epoch": 4.916442048517521, - "grad_norm": 0.6576349139213562, - "learning_rate": 0.00030547220723151644, - "loss": 3.3872, + "grad_norm": 0.6492365002632141, + "learning_rate": 0.0003054527792768483, + "loss": 3.3835, "step": 45600 }, { "epoch": 4.921832884097035, - "grad_norm": 0.6595627069473267, - "learning_rate": 0.000305148407987048, - "loss": 3.3894, + "grad_norm": 0.6423570513725281, + "learning_rate": 0.0003051289800323799, + "loss": 3.3883, "step": 45650 }, { "epoch": 4.92722371967655, - "grad_norm": 0.701417088508606, - "learning_rate": 0.00030482460874257955, - "loss": 3.3649, + "grad_norm": 0.6584356427192688, + "learning_rate": 0.00030480518078791145, + "loss": 3.3633, "step": 45700 }, { "epoch": 4.932614555256064, - "grad_norm": 0.6728206872940063, - "learning_rate": 0.00030450080949811115, - "loss": 3.3825, + "grad_norm": 0.6494635939598083, + "learning_rate": 0.00030448138154344305, + "loss": 3.3824, "step": 45750 }, { "epoch": 4.938005390835579, - "grad_norm": 0.7555509805679321, - "learning_rate": 0.0003041770102536427, - "loss": 3.4, + "grad_norm": 0.6741239428520203, + "learning_rate": 0.0003041575822989746, + "loss": 3.3991, "step": 45800 }, { "epoch": 4.943396226415095, - "grad_norm": 0.725534975528717, - "learning_rate": 0.0003038532110091743, - "loss": 3.3784, + "grad_norm": 0.7292436957359314, + "learning_rate": 0.0003038337830545062, + "loss": 3.3769, "step": 45850 }, { "epoch": 4.948787061994609, - "grad_norm": 0.658740222454071, - "learning_rate": 0.00030352941176470586, - "loss": 3.3868, + "grad_norm": 0.6789447665214539, + "learning_rate": 0.00030350998381003776, + "loss": 3.3864, "step": 45900 }, { "epoch": 4.954177897574124, - "grad_norm": 0.6139926314353943, - "learning_rate": 0.00030320561252023746, - "loss": 3.377, + "grad_norm": 0.6103675961494446, + "learning_rate": 0.0003031861845655693, + "loss": 3.3783, "step": 45950 }, { "epoch": 4.959568733153639, - "grad_norm": 0.6422691345214844, - "learning_rate": 0.000302881813275769, - "loss": 3.3849, + "grad_norm": 0.6245911717414856, + "learning_rate": 0.0003028623853211009, + "loss": 3.3834, "step": 46000 }, { "epoch": 4.959568733153639, - "eval_accuracy": 0.3808301102359604, - "eval_loss": 3.4091789722442627, - "eval_runtime": 185.1173, - "eval_samples_per_second": 97.295, - "eval_steps_per_second": 6.083, + "eval_accuracy": 0.38076980788322334, + "eval_loss": 3.4084885120391846, + "eval_runtime": 185.2341, + "eval_samples_per_second": 97.234, + "eval_steps_per_second": 6.079, "step": 46000 }, { "epoch": 4.964959568733153, - "grad_norm": 0.6328104138374329, - "learning_rate": 0.0003025580140313006, - "loss": 3.3936, + "grad_norm": 0.631356954574585, + "learning_rate": 0.00030253858607663247, + "loss": 3.3948, "step": 46050 }, { "epoch": 4.9703504043126685, - "grad_norm": 0.6444491147994995, - "learning_rate": 0.00030223421478683217, - "loss": 3.3657, + "grad_norm": 0.6470606923103333, + "learning_rate": 0.00030221478683216407, + "loss": 3.3663, "step": 46100 }, { "epoch": 4.975741239892184, - "grad_norm": 0.6250355243682861, - "learning_rate": 0.00030191041554236366, - "loss": 3.3731, + "grad_norm": 0.672898530960083, + "learning_rate": 0.00030189098758769557, + "loss": 3.372, "step": 46150 }, { "epoch": 4.981132075471698, - "grad_norm": 0.7009178996086121, - "learning_rate": 0.00030158661629789527, - "loss": 3.3837, + "grad_norm": 0.7242577075958252, + "learning_rate": 0.00030156718834322717, + "loss": 3.3835, "step": 46200 }, { "epoch": 4.986522911051213, - "grad_norm": 0.6598658561706543, - "learning_rate": 0.0003012628170534268, - "loss": 3.4008, + "grad_norm": 0.7056680917739868, + "learning_rate": 0.0003012433890987587, + "loss": 3.401, "step": 46250 }, { "epoch": 4.991913746630727, - "grad_norm": 0.6467294692993164, - "learning_rate": 0.0003009390178089584, - "loss": 3.3979, + "grad_norm": 0.6546418070793152, + "learning_rate": 0.0003009195898542903, + "loss": 3.395, "step": 46300 }, { "epoch": 4.997304582210242, - "grad_norm": 0.6867441534996033, - "learning_rate": 0.00030061521856449, - "loss": 3.3752, + "grad_norm": 0.6470614671707153, + "learning_rate": 0.0003005957906098219, + "loss": 3.3757, "step": 46350 }, { "epoch": 5.002695417789758, - "grad_norm": 0.6885562539100647, - "learning_rate": 0.0003002914193200216, - "loss": 3.3244, + "grad_norm": 0.6490879654884338, + "learning_rate": 0.00030027199136535343, + "loss": 3.3243, "step": 46400 }, { "epoch": 5.008086253369272, - "grad_norm": 0.7105883359909058, - "learning_rate": 0.00029996762007555313, - "loss": 3.3023, + "grad_norm": 0.6910776495933533, + "learning_rate": 0.00029994819212088503, + "loss": 3.3026, "step": 46450 }, { "epoch": 5.013477088948787, - "grad_norm": 0.6555173397064209, - "learning_rate": 0.0002996438208310847, - "loss": 3.3038, + "grad_norm": 0.6369443535804749, + "learning_rate": 0.0002996243928764166, + "loss": 3.3021, "step": 46500 }, { "epoch": 5.018867924528302, - "grad_norm": 0.6800537705421448, - "learning_rate": 0.0002993200215866163, - "loss": 3.3036, + "grad_norm": 0.7196043729782104, + "learning_rate": 0.0002993005936319482, + "loss": 3.3039, "step": 46550 }, { "epoch": 5.024258760107816, - "grad_norm": 0.6979748606681824, - "learning_rate": 0.00029899622234214783, - "loss": 3.2893, + "grad_norm": 0.6704365015029907, + "learning_rate": 0.00029897679438747974, + "loss": 3.2892, "step": 46600 }, { "epoch": 5.0296495956873315, - "grad_norm": 0.6974326372146606, - "learning_rate": 0.00029867242309767944, - "loss": 3.2922, + "grad_norm": 0.6854135990142822, + "learning_rate": 0.0002986529951430113, + "loss": 3.2931, "step": 46650 }, { "epoch": 5.035040431266847, - "grad_norm": 0.6461212635040283, - "learning_rate": 0.000298348623853211, - "loss": 3.2947, + "grad_norm": 0.6373289227485657, + "learning_rate": 0.0002983291958985429, + "loss": 3.2935, "step": 46700 }, { "epoch": 5.040431266846361, - "grad_norm": 0.672947108745575, - "learning_rate": 0.00029802482460874254, - "loss": 3.3011, + "grad_norm": 0.6976351737976074, + "learning_rate": 0.00029801187263896385, + "loss": 3.3017, "step": 46750 }, { "epoch": 5.045822102425876, - "grad_norm": 0.7461097240447998, - "learning_rate": 0.0002977010253642741, - "loss": 3.3097, + "grad_norm": 0.6690797209739685, + "learning_rate": 0.0002976880733944954, + "loss": 3.3089, "step": 46800 }, { "epoch": 5.051212938005391, - "grad_norm": 0.6764471530914307, - "learning_rate": 0.0002973772261198057, - "loss": 3.3087, + "grad_norm": 0.7078843116760254, + "learning_rate": 0.00029736427415002695, + "loss": 3.3085, "step": 46850 }, { "epoch": 5.056603773584905, - "grad_norm": 0.6477118134498596, - "learning_rate": 0.00029705342687533725, - "loss": 3.3035, + "grad_norm": 0.6783893704414368, + "learning_rate": 0.0002970404749055585, + "loss": 3.3026, "step": 46900 }, { "epoch": 5.061994609164421, - "grad_norm": 0.7262566685676575, - "learning_rate": 0.00029672962763086885, - "loss": 3.3022, + "grad_norm": 0.7261761426925659, + "learning_rate": 0.0002967166756610901, + "loss": 3.3008, "step": 46950 }, { "epoch": 5.067385444743936, - "grad_norm": 0.7069379091262817, - "learning_rate": 0.0002964058283864004, - "loss": 3.311, + "grad_norm": 0.6983113884925842, + "learning_rate": 0.00029639287641662165, + "loss": 3.3112, "step": 47000 }, { "epoch": 5.067385444743936, - "eval_accuracy": 0.38110000400929156, - "eval_loss": 3.411581039428711, - "eval_runtime": 184.7171, - "eval_samples_per_second": 97.506, - "eval_steps_per_second": 6.096, + "eval_accuracy": 0.3807572041482369, + "eval_loss": 3.411543369293213, + "eval_runtime": 184.8806, + "eval_samples_per_second": 97.42, + "eval_steps_per_second": 6.09, "step": 47000 }, { "epoch": 5.07277628032345, - "grad_norm": 0.684544026851654, - "learning_rate": 0.000296082029141932, - "loss": 3.3039, + "grad_norm": 0.7106756567955017, + "learning_rate": 0.00029606907717215326, + "loss": 3.3048, "step": 47050 }, { "epoch": 5.078167115902965, - "grad_norm": 0.6550946235656738, - "learning_rate": 0.00029575822989746356, - "loss": 3.2982, + "grad_norm": 0.6234951615333557, + "learning_rate": 0.0002957452779276848, + "loss": 3.2983, "step": 47100 }, { "epoch": 5.083557951482479, - "grad_norm": 0.6490722894668579, - "learning_rate": 0.0002954344306529951, - "loss": 3.3105, + "grad_norm": 0.6151743531227112, + "learning_rate": 0.00029542147868321636, + "loss": 3.3121, "step": 47150 }, { "epoch": 5.0889487870619945, - "grad_norm": 0.6412487030029297, - "learning_rate": 0.00029511063140852666, - "loss": 3.3115, + "grad_norm": 0.6277012825012207, + "learning_rate": 0.00029509767943874796, + "loss": 3.3105, "step": 47200 }, { "epoch": 5.09433962264151, - "grad_norm": 0.6618337035179138, - "learning_rate": 0.00029478683216405826, - "loss": 3.2947, + "grad_norm": 0.6742247343063354, + "learning_rate": 0.0002947738801942795, + "loss": 3.2964, "step": 47250 }, { "epoch": 5.099730458221024, - "grad_norm": 0.7021799087524414, - "learning_rate": 0.0002944630329195898, - "loss": 3.3195, + "grad_norm": 0.6989336609840393, + "learning_rate": 0.0002944500809498111, + "loss": 3.3197, "step": 47300 }, { "epoch": 5.105121293800539, - "grad_norm": 0.6509613394737244, - "learning_rate": 0.0002941392336751214, - "loss": 3.32, + "grad_norm": 0.6480673551559448, + "learning_rate": 0.00029412628170534267, + "loss": 3.3195, "step": 47350 }, { "epoch": 5.110512129380054, - "grad_norm": 0.6623237133026123, - "learning_rate": 0.00029381543443065297, - "loss": 3.301, + "grad_norm": 0.6779480576515198, + "learning_rate": 0.0002938024824608742, + "loss": 3.3009, "step": 47400 }, { "epoch": 5.115902964959568, - "grad_norm": 0.6974384784698486, - "learning_rate": 0.0002934916351861846, - "loss": 3.3038, + "grad_norm": 0.6613697409629822, + "learning_rate": 0.00029347868321640577, + "loss": 3.3048, "step": 47450 }, { "epoch": 5.121293800539084, - "grad_norm": 0.6917511820793152, - "learning_rate": 0.0002931678359417161, - "loss": 3.2947, + "grad_norm": 0.6762946248054504, + "learning_rate": 0.0002931548839719374, + "loss": 3.2942, "step": 47500 }, { "epoch": 5.126684636118599, - "grad_norm": 0.6955335736274719, - "learning_rate": 0.0002928440366972477, - "loss": 3.3021, + "grad_norm": 0.6452640295028687, + "learning_rate": 0.0002928310847274689, + "loss": 3.3027, "step": 47550 }, { "epoch": 5.132075471698113, - "grad_norm": 0.7200063467025757, - "learning_rate": 0.0002925202374527792, - "loss": 3.2928, + "grad_norm": 0.7308632731437683, + "learning_rate": 0.00029250728548300053, + "loss": 3.2931, "step": 47600 }, { "epoch": 5.137466307277628, - "grad_norm": 0.716500461101532, - "learning_rate": 0.00029219643820831083, - "loss": 3.308, + "grad_norm": 0.7426350712776184, + "learning_rate": 0.0002921834862385321, + "loss": 3.3076, "step": 47650 }, { "epoch": 5.142857142857143, - "grad_norm": 0.6869317889213562, - "learning_rate": 0.0002918791149487318, - "loss": 3.3217, + "grad_norm": 0.6810452342033386, + "learning_rate": 0.0002918596869940637, + "loss": 3.3227, "step": 47700 }, { "epoch": 5.1482479784366575, - "grad_norm": 0.7114856243133545, - "learning_rate": 0.00029155531570426333, - "loss": 3.3117, + "grad_norm": 0.6850289702415466, + "learning_rate": 0.0002915358877495952, + "loss": 3.3086, "step": 47750 }, { "epoch": 5.153638814016173, - "grad_norm": 0.6355905532836914, - "learning_rate": 0.00029123151645979494, - "loss": 3.2979, + "grad_norm": 0.6442118287086487, + "learning_rate": 0.0002912120885051268, + "loss": 3.2973, "step": 47800 }, { "epoch": 5.159029649595688, - "grad_norm": 0.6989576816558838, - "learning_rate": 0.0002909077172153265, - "loss": 3.3042, + "grad_norm": 0.702198326587677, + "learning_rate": 0.00029088828926065834, + "loss": 3.3049, "step": 47850 }, { "epoch": 5.164420485175202, - "grad_norm": 0.716057300567627, - "learning_rate": 0.00029058391797085804, - "loss": 3.3303, + "grad_norm": 0.692348062992096, + "learning_rate": 0.00029056449001618994, + "loss": 3.3313, "step": 47900 }, { "epoch": 5.169811320754717, - "grad_norm": 0.6772635579109192, - "learning_rate": 0.0002902601187263896, - "loss": 3.3113, + "grad_norm": 0.6892920136451721, + "learning_rate": 0.0002902406907717215, + "loss": 3.3101, "step": 47950 }, { "epoch": 5.175202156334231, - "grad_norm": 0.6645113229751587, - "learning_rate": 0.0002899363194819212, - "loss": 3.3052, + "grad_norm": 0.7259092330932617, + "learning_rate": 0.0002899168915272531, + "loss": 3.3048, "step": 48000 }, { "epoch": 5.175202156334231, - "eval_accuracy": 0.3809431092392874, - "eval_loss": 3.411557912826538, - "eval_runtime": 184.8975, - "eval_samples_per_second": 97.411, - "eval_steps_per_second": 6.09, + "eval_accuracy": 0.3808885654896046, + "eval_loss": 3.4122819900512695, + "eval_runtime": 185.1757, + "eval_samples_per_second": 97.264, + "eval_steps_per_second": 6.081, "step": 48000 }, { "epoch": 5.180592991913747, - "grad_norm": 0.6688160300254822, - "learning_rate": 0.00028961252023745274, - "loss": 3.3126, + "grad_norm": 0.6801049113273621, + "learning_rate": 0.00028959956826767405, + "loss": 3.3147, "step": 48050 }, { "epoch": 5.185983827493262, - "grad_norm": 0.6707350611686707, - "learning_rate": 0.00028928872099298435, - "loss": 3.3017, + "grad_norm": 0.7039002776145935, + "learning_rate": 0.00028927576902320555, + "loss": 3.3004, "step": 48100 }, { "epoch": 5.191374663072776, - "grad_norm": 0.6549227833747864, - "learning_rate": 0.0002889649217485159, - "loss": 3.3112, + "grad_norm": 0.6632753610610962, + "learning_rate": 0.00028895196977873715, + "loss": 3.31, "step": 48150 }, { "epoch": 5.196765498652291, - "grad_norm": 0.6969558596611023, - "learning_rate": 0.00028864112250404745, - "loss": 3.3247, + "grad_norm": 0.6418648362159729, + "learning_rate": 0.0002886281705342687, + "loss": 3.326, "step": 48200 }, { "epoch": 5.202156334231806, - "grad_norm": 0.7048711180686951, - "learning_rate": 0.00028831732325957906, - "loss": 3.317, + "grad_norm": 0.6770094037055969, + "learning_rate": 0.0002883043712898003, + "loss": 3.3181, "step": 48250 }, { "epoch": 5.2075471698113205, - "grad_norm": 0.7173691987991333, - "learning_rate": 0.0002879935240151106, - "loss": 3.3182, + "grad_norm": 0.6800588965415955, + "learning_rate": 0.00028798057204533186, + "loss": 3.3185, "step": 48300 }, { "epoch": 5.212938005390836, - "grad_norm": 0.6918731927871704, - "learning_rate": 0.00028766972477064216, - "loss": 3.3264, + "grad_norm": 0.649732768535614, + "learning_rate": 0.00028765677280086346, + "loss": 3.3252, "step": 48350 }, { "epoch": 5.218328840970351, - "grad_norm": 0.6857321262359619, - "learning_rate": 0.00028734592552617376, - "loss": 3.3196, + "grad_norm": 0.714310348033905, + "learning_rate": 0.000287332973556395, + "loss": 3.3191, "step": 48400 }, { "epoch": 5.223719676549865, - "grad_norm": 0.6688578724861145, - "learning_rate": 0.0002870221262817053, - "loss": 3.3114, + "grad_norm": 0.6347412467002869, + "learning_rate": 0.0002870091743119266, + "loss": 3.3109, "step": 48450 }, { "epoch": 5.22911051212938, - "grad_norm": 0.7027651071548462, - "learning_rate": 0.00028669832703723686, - "loss": 3.3448, + "grad_norm": 0.6917827725410461, + "learning_rate": 0.00028668537506745817, + "loss": 3.3453, "step": 48500 }, { "epoch": 5.234501347708895, - "grad_norm": 0.6578187346458435, - "learning_rate": 0.00028637452779276847, - "loss": 3.3182, + "grad_norm": 0.650833249092102, + "learning_rate": 0.0002863615758229897, + "loss": 3.3185, "step": 48550 }, { "epoch": 5.2398921832884096, - "grad_norm": 0.6483734846115112, - "learning_rate": 0.0002860507285483, - "loss": 3.3184, + "grad_norm": 0.6491532325744629, + "learning_rate": 0.00028603777657852127, + "loss": 3.3186, "step": 48600 }, { "epoch": 5.245283018867925, - "grad_norm": 0.6719345450401306, - "learning_rate": 0.0002857269293038316, - "loss": 3.3152, + "grad_norm": 0.6467685699462891, + "learning_rate": 0.0002857139773340529, + "loss": 3.3169, "step": 48650 }, { "epoch": 5.250673854447439, - "grad_norm": 0.7231610417366028, - "learning_rate": 0.0002854031300593632, - "loss": 3.3191, + "grad_norm": 0.7819921970367432, + "learning_rate": 0.0002853966540744738, + "loss": 3.3204, "step": 48700 }, { "epoch": 5.256064690026954, - "grad_norm": 0.7005861401557922, - "learning_rate": 0.0002850793308148947, - "loss": 3.3382, + "grad_norm": 0.6711828112602234, + "learning_rate": 0.0002850728548300054, + "loss": 3.3389, "step": 48750 }, { "epoch": 5.261455525606469, - "grad_norm": 0.6894075274467468, - "learning_rate": 0.0002847555315704263, - "loss": 3.3371, + "grad_norm": 0.6818621158599854, + "learning_rate": 0.000284749055585537, + "loss": 3.3368, "step": 48800 }, { "epoch": 5.2668463611859835, - "grad_norm": 0.6957369446754456, - "learning_rate": 0.0002844317323259579, - "loss": 3.3153, + "grad_norm": 0.6671029925346375, + "learning_rate": 0.0002844252563410685, + "loss": 3.3145, "step": 48850 }, { "epoch": 5.272237196765499, - "grad_norm": 0.7194564342498779, - "learning_rate": 0.00028410793308148943, - "loss": 3.313, + "grad_norm": 0.7544181942939758, + "learning_rate": 0.0002841014570966001, + "loss": 3.314, "step": 48900 }, { "epoch": 5.277628032345014, - "grad_norm": 0.677614688873291, - "learning_rate": 0.00028378413383702103, - "loss": 3.3354, + "grad_norm": 0.6877387762069702, + "learning_rate": 0.00028377765785213163, + "loss": 3.3361, "step": 48950 }, { "epoch": 5.283018867924528, - "grad_norm": 0.6809502243995667, - "learning_rate": 0.0002834603345925526, - "loss": 3.3259, + "grad_norm": 0.6355829834938049, + "learning_rate": 0.00028345385860766324, + "loss": 3.3249, "step": 49000 }, { "epoch": 5.283018867924528, - "eval_accuracy": 0.38146768538165576, - "eval_loss": 3.406904458999634, - "eval_runtime": 185.2899, - "eval_samples_per_second": 97.204, - "eval_steps_per_second": 6.077, + "eval_accuracy": 0.38155525960923425, + "eval_loss": 3.4067444801330566, + "eval_runtime": 184.7957, + "eval_samples_per_second": 97.464, + "eval_steps_per_second": 6.093, "step": 49000 }, { "epoch": 5.288409703504043, - "grad_norm": 0.661638617515564, - "learning_rate": 0.0002831365353480842, - "loss": 3.3349, + "grad_norm": 0.6431439518928528, + "learning_rate": 0.0002831300593631948, + "loss": 3.3335, "step": 49050 }, { "epoch": 5.293800539083558, - "grad_norm": 0.6964898109436035, - "learning_rate": 0.00028281273610361574, - "loss": 3.3309, + "grad_norm": 0.6457442045211792, + "learning_rate": 0.0002828062601187264, + "loss": 3.3295, "step": 49100 }, { "epoch": 5.2991913746630726, - "grad_norm": 0.6699468493461609, - "learning_rate": 0.00028248893685914734, - "loss": 3.3279, + "grad_norm": 0.7099213600158691, + "learning_rate": 0.00028248246087425794, + "loss": 3.3284, "step": 49150 }, { "epoch": 5.304582210242588, - "grad_norm": 0.6527925729751587, - "learning_rate": 0.00028216513761467884, - "loss": 3.3377, + "grad_norm": 0.6772319674491882, + "learning_rate": 0.0002821586616297895, + "loss": 3.3365, "step": 49200 }, { "epoch": 5.309973045822103, - "grad_norm": 0.6300584673881531, - "learning_rate": 0.00028184133837021045, - "loss": 3.3239, + "grad_norm": 0.6409128308296204, + "learning_rate": 0.0002818348623853211, + "loss": 3.3249, "step": 49250 }, { "epoch": 5.315363881401617, - "grad_norm": 0.727450430393219, - "learning_rate": 0.000281517539125742, - "loss": 3.3253, + "grad_norm": 0.7182660102844238, + "learning_rate": 0.00028151106314085265, + "loss": 3.3255, "step": 49300 }, { "epoch": 5.320754716981132, - "grad_norm": 0.7141486406326294, - "learning_rate": 0.0002811937398812736, - "loss": 3.3347, + "grad_norm": 0.7561542987823486, + "learning_rate": 0.0002811872638963842, + "loss": 3.3359, "step": 49350 }, { "epoch": 5.3261455525606465, - "grad_norm": 0.8665236830711365, - "learning_rate": 0.00028086994063680515, - "loss": 3.3233, + "grad_norm": 0.675321638584137, + "learning_rate": 0.0002808634646519158, + "loss": 3.3237, "step": 49400 }, { "epoch": 5.331536388140162, - "grad_norm": 0.6957372426986694, - "learning_rate": 0.00028054614139233676, - "loss": 3.3173, + "grad_norm": 0.6842005252838135, + "learning_rate": 0.00028053966540744736, + "loss": 3.3151, "step": 49450 }, { "epoch": 5.336927223719677, - "grad_norm": 0.6600592732429504, - "learning_rate": 0.0002802223421478683, - "loss": 3.3405, + "grad_norm": 0.7084938287734985, + "learning_rate": 0.0002802158661629789, + "loss": 3.3432, "step": 49500 }, { "epoch": 5.342318059299191, - "grad_norm": 0.6827237010002136, - "learning_rate": 0.00027989854290339986, - "loss": 3.3152, + "grad_norm": 0.6873745322227478, + "learning_rate": 0.0002798920669185105, + "loss": 3.3164, "step": 49550 }, { "epoch": 5.347708894878706, - "grad_norm": 0.6884130835533142, - "learning_rate": 0.0002795747436589314, - "loss": 3.3326, + "grad_norm": 0.6905918121337891, + "learning_rate": 0.00027956826767404206, + "loss": 3.3339, "step": 49600 }, { "epoch": 5.353099730458221, - "grad_norm": 0.701141357421875, - "learning_rate": 0.000279250944414463, - "loss": 3.3293, + "grad_norm": 0.6891156435012817, + "learning_rate": 0.00027924446842957367, + "loss": 3.3292, "step": 49650 }, { "epoch": 5.3584905660377355, - "grad_norm": 0.6687909960746765, - "learning_rate": 0.00027892714516999456, - "loss": 3.3375, + "grad_norm": 0.6603860855102539, + "learning_rate": 0.0002789206691851052, + "loss": 3.3368, "step": 49700 }, { "epoch": 5.363881401617251, - "grad_norm": 0.6871972680091858, - "learning_rate": 0.00027860334592552617, - "loss": 3.3208, + "grad_norm": 0.676149308681488, + "learning_rate": 0.00027859686994063677, + "loss": 3.3218, "step": 49750 }, { "epoch": 5.369272237196766, - "grad_norm": 0.712066113948822, - "learning_rate": 0.0002782795466810577, - "loss": 3.3236, + "grad_norm": 0.7008675932884216, + "learning_rate": 0.0002782730706961683, + "loss": 3.3254, "step": 49800 }, { "epoch": 5.37466307277628, - "grad_norm": 0.6459314823150635, - "learning_rate": 0.00027795574743658927, - "loss": 3.3382, + "grad_norm": 0.63350909948349, + "learning_rate": 0.0002779492714516999, + "loss": 3.3386, "step": 49850 }, { "epoch": 5.380053908355795, - "grad_norm": 0.7322667241096497, - "learning_rate": 0.0002776384241770102, - "loss": 3.3321, + "grad_norm": 0.6737155914306641, + "learning_rate": 0.00027762547220723147, + "loss": 3.3339, "step": 49900 }, { "epoch": 5.38544474393531, - "grad_norm": 0.6739450097084045, - "learning_rate": 0.00027731462493254177, - "loss": 3.3187, + "grad_norm": 0.6853422522544861, + "learning_rate": 0.0002773016729627631, + "loss": 3.3183, "step": 49950 }, { "epoch": 5.390835579514825, - "grad_norm": 0.6529873013496399, - "learning_rate": 0.0002769908256880734, - "loss": 3.3392, + "grad_norm": 0.6315034627914429, + "learning_rate": 0.00027697787371829463, + "loss": 3.339, "step": 50000 }, { "epoch": 5.390835579514825, - "eval_accuracy": 0.3821004798002873, - "eval_loss": 3.400057792663574, - "eval_runtime": 184.8218, - "eval_samples_per_second": 97.451, - "eval_steps_per_second": 6.092, + "eval_accuracy": 0.3820815741978076, + "eval_loss": 3.399562358856201, + "eval_runtime": 185.1089, + "eval_samples_per_second": 97.299, + "eval_steps_per_second": 6.083, "step": 50000 }, { "epoch": 5.39622641509434, - "grad_norm": 0.7031570076942444, - "learning_rate": 0.00027666702644360493, - "loss": 3.3487, + "grad_norm": 0.6739052534103394, + "learning_rate": 0.00027665407447382623, + "loss": 3.3508, "step": 50050 }, { "epoch": 5.401617250673855, - "grad_norm": 0.6449311375617981, - "learning_rate": 0.00027634322719913653, - "loss": 3.3411, + "grad_norm": 0.6296254992485046, + "learning_rate": 0.0002763302752293578, + "loss": 3.3394, "step": 50100 }, { "epoch": 5.407008086253369, - "grad_norm": 0.6960560083389282, - "learning_rate": 0.0002760194279546681, - "loss": 3.3318, + "grad_norm": 0.7053967118263245, + "learning_rate": 0.00027600647598488933, + "loss": 3.3309, "step": 50150 }, { "epoch": 5.412398921832884, - "grad_norm": 0.6679825186729431, - "learning_rate": 0.00027569562871019963, - "loss": 3.3136, + "grad_norm": 0.6510531902313232, + "learning_rate": 0.0002756826767404209, + "loss": 3.3147, "step": 50200 }, { "epoch": 5.4177897574123985, - "grad_norm": 0.6533336639404297, - "learning_rate": 0.00027537182946573124, - "loss": 3.3423, + "grad_norm": 0.6612011194229126, + "learning_rate": 0.0002753588774959525, + "loss": 3.3438, "step": 50250 }, { "epoch": 5.423180592991914, - "grad_norm": 0.6472018361091614, - "learning_rate": 0.0002750480302212628, - "loss": 3.3375, + "grad_norm": 0.6302815079689026, + "learning_rate": 0.00027503507825148404, + "loss": 3.3371, "step": 50300 }, { "epoch": 5.428571428571429, - "grad_norm": 0.7485306859016418, - "learning_rate": 0.0002747242309767944, - "loss": 3.325, + "grad_norm": 0.6635833382606506, + "learning_rate": 0.00027471127900701564, + "loss": 3.3237, "step": 50350 }, { "epoch": 5.433962264150943, - "grad_norm": 1.076799750328064, - "learning_rate": 0.00027440043173232594, - "loss": 3.3463, + "grad_norm": 0.7546477317810059, + "learning_rate": 0.0002743874797625472, + "loss": 3.3456, "step": 50400 }, { "epoch": 5.439353099730458, - "grad_norm": 0.7454240322113037, - "learning_rate": 0.0002740766324878575, - "loss": 3.3443, + "grad_norm": 0.73973548412323, + "learning_rate": 0.0002740636805180788, + "loss": 3.3419, "step": 50450 }, { "epoch": 5.444743935309973, - "grad_norm": 0.6593639254570007, - "learning_rate": 0.00027375283324338905, + "grad_norm": 0.6448124051094055, + "learning_rate": 0.00027373988127361035, "loss": 3.3284, "step": 50500 }, { "epoch": 5.450134770889488, - "grad_norm": 0.657081663608551, - "learning_rate": 0.00027342903399892065, - "loss": 3.324, + "grad_norm": 0.6900261044502258, + "learning_rate": 0.0002734160820291419, + "loss": 3.3253, "step": 50550 }, { "epoch": 5.455525606469003, - "grad_norm": 0.7052240371704102, - "learning_rate": 0.0002731052347544522, - "loss": 3.3394, + "grad_norm": 0.6863948106765747, + "learning_rate": 0.00027309228278467345, + "loss": 3.3399, "step": 50600 }, { "epoch": 5.460916442048518, - "grad_norm": 0.8208935260772705, - "learning_rate": 0.0002727814355099838, - "loss": 3.3323, + "grad_norm": 0.8710711598396301, + "learning_rate": 0.00027276848354020506, + "loss": 3.3339, "step": 50650 }, { "epoch": 5.466307277628032, - "grad_norm": 0.6855742931365967, - "learning_rate": 0.00027245763626551536, - "loss": 3.3259, + "grad_norm": 0.7047181129455566, + "learning_rate": 0.0002724446842957366, + "loss": 3.3241, "step": 50700 }, { "epoch": 5.471698113207547, - "grad_norm": 0.6824098229408264, - "learning_rate": 0.00027213383702104696, - "loss": 3.3114, + "grad_norm": 0.6998323202133179, + "learning_rate": 0.0002721208850512682, + "loss": 3.3125, "step": 50750 }, { "epoch": 5.4770889487870615, - "grad_norm": 0.7104268074035645, - "learning_rate": 0.00027181003777657846, - "loss": 3.3402, + "grad_norm": 0.6888628005981445, + "learning_rate": 0.00027179708580679976, + "loss": 3.3405, "step": 50800 }, { "epoch": 5.482479784366577, - "grad_norm": 0.7086119651794434, - "learning_rate": 0.00027148623853211006, - "loss": 3.3259, + "grad_norm": 0.6762120723724365, + "learning_rate": 0.0002714732865623313, + "loss": 3.3244, "step": 50850 }, { "epoch": 5.487870619946092, - "grad_norm": 0.6901703476905823, - "learning_rate": 0.0002711624392876416, - "loss": 3.3348, + "grad_norm": 0.6589097380638123, + "learning_rate": 0.0002711494873178629, + "loss": 3.335, "step": 50900 }, { "epoch": 5.493261455525606, - "grad_norm": 0.70135098695755, - "learning_rate": 0.0002708386400431732, - "loss": 3.349, + "grad_norm": 0.6884141564369202, + "learning_rate": 0.00027082568807339447, + "loss": 3.3512, "step": 50950 }, { "epoch": 5.498652291105121, - "grad_norm": 0.6997105479240417, - "learning_rate": 0.00027051484079870477, + "grad_norm": 0.6888891458511353, + "learning_rate": 0.000270501888828926, "loss": 3.3135, "step": 51000 }, { "epoch": 5.498652291105121, - "eval_accuracy": 0.3822508553970225, - "eval_loss": 3.3968350887298584, - "eval_runtime": 185.1499, - "eval_samples_per_second": 97.278, - "eval_steps_per_second": 6.082, + "eval_accuracy": 0.3822205412413223, + "eval_loss": 3.397444248199463, + "eval_runtime": 185.2714, + "eval_samples_per_second": 97.214, + "eval_steps_per_second": 6.078, "step": 51000 }, { "epoch": 5.504043126684636, - "grad_norm": 0.7074891328811646, - "learning_rate": 0.00027019104155423637, - "loss": 3.3215, + "grad_norm": 0.6714994311332703, + "learning_rate": 0.0002701780895844576, + "loss": 3.321, "step": 51050 }, { "epoch": 5.509433962264151, - "grad_norm": 0.6949896216392517, - "learning_rate": 0.0002698672423097679, - "loss": 3.3343, + "grad_norm": 0.6546480655670166, + "learning_rate": 0.0002698542903399892, + "loss": 3.3323, "step": 51100 }, { "epoch": 5.514824797843666, - "grad_norm": 0.717146635055542, - "learning_rate": 0.00026954344306529953, - "loss": 3.349, + "grad_norm": 0.696523129940033, + "learning_rate": 0.0002695304910955207, + "loss": 3.3496, "step": 51150 }, { "epoch": 5.520215633423181, - "grad_norm": 0.673259437084198, - "learning_rate": 0.0002692196438208311, - "loss": 3.3233, + "grad_norm": 0.6788317561149597, + "learning_rate": 0.00026920669185105233, + "loss": 3.3245, "step": 51200 }, { "epoch": 5.525606469002695, - "grad_norm": 0.7613580226898193, - "learning_rate": 0.00026889584457636263, - "loss": 3.3286, + "grad_norm": 0.7704463601112366, + "learning_rate": 0.0002688828926065839, + "loss": 3.3297, "step": 51250 }, { "epoch": 5.53099730458221, - "grad_norm": 0.6700328588485718, - "learning_rate": 0.0002685720453318942, - "loss": 3.3417, + "grad_norm": 0.6638984680175781, + "learning_rate": 0.0002685590933621155, + "loss": 3.3427, "step": 51300 }, { "epoch": 5.536388140161725, - "grad_norm": 0.6604465842247009, - "learning_rate": 0.0002682482460874258, - "loss": 3.321, + "grad_norm": 0.6343899369239807, + "learning_rate": 0.00026823529411764704, + "loss": 3.3216, "step": 51350 }, { "epoch": 5.54177897574124, - "grad_norm": 0.7246283888816833, - "learning_rate": 0.00026792444684295733, - "loss": 3.3297, + "grad_norm": 0.7129838466644287, + "learning_rate": 0.0002679114948731786, + "loss": 3.33, "step": 51400 }, { "epoch": 5.547169811320755, - "grad_norm": 0.7392826676368713, - "learning_rate": 0.00026760064759848894, - "loss": 3.3475, + "grad_norm": 0.717201828956604, + "learning_rate": 0.0002675876956287102, + "loss": 3.3476, "step": 51450 }, { "epoch": 5.55256064690027, - "grad_norm": 0.6978980302810669, - "learning_rate": 0.0002672768483540205, - "loss": 3.3311, + "grad_norm": 0.6938988566398621, + "learning_rate": 0.00026726389638424174, + "loss": 3.334, "step": 51500 }, { "epoch": 5.557951482479784, - "grad_norm": 0.7010505199432373, - "learning_rate": 0.00026695304910955204, - "loss": 3.3245, + "grad_norm": 0.6860924363136292, + "learning_rate": 0.0002669400971397733, + "loss": 3.3264, "step": 51550 }, { "epoch": 5.563342318059299, - "grad_norm": 0.7545852661132812, - "learning_rate": 0.00026662924986508365, - "loss": 3.3501, + "grad_norm": 0.6723926663398743, + "learning_rate": 0.0002666162978953049, + "loss": 3.3517, "step": 51600 }, { "epoch": 5.568733153638814, - "grad_norm": 0.6517020463943481, - "learning_rate": 0.0002663054506206152, - "loss": 3.3475, + "grad_norm": 0.634928822517395, + "learning_rate": 0.00026629249865083645, + "loss": 3.3487, "step": 51650 }, { "epoch": 5.574123989218329, - "grad_norm": 0.7191069722175598, - "learning_rate": 0.00026598165137614675, - "loss": 3.3301, + "grad_norm": 0.709865391254425, + "learning_rate": 0.00026596869940636805, + "loss": 3.3307, "step": 51700 }, { "epoch": 5.579514824797844, - "grad_norm": 0.6981732845306396, - "learning_rate": 0.00026565785213167835, - "loss": 3.3249, + "grad_norm": 0.6830937266349792, + "learning_rate": 0.0002656449001618996, + "loss": 3.3239, "step": 51750 }, { "epoch": 5.584905660377358, - "grad_norm": 0.7021121382713318, - "learning_rate": 0.0002653340528872099, - "loss": 3.3381, + "grad_norm": 0.7171959280967712, + "learning_rate": 0.0002653211009174312, + "loss": 3.3398, "step": 51800 }, { "epoch": 5.590296495956873, - "grad_norm": 0.7237430810928345, - "learning_rate": 0.00026501025364274145, - "loss": 3.3342, + "grad_norm": 0.6931149363517761, + "learning_rate": 0.0002649973016729627, + "loss": 3.336, "step": 51850 }, { "epoch": 5.595687331536388, - "grad_norm": 0.6834338903427124, - "learning_rate": 0.00026468645439827306, - "loss": 3.3419, + "grad_norm": 0.6825995445251465, + "learning_rate": 0.0002646735024284943, + "loss": 3.3417, "step": 51900 }, { "epoch": 5.601078167115903, - "grad_norm": 0.6926736831665039, - "learning_rate": 0.0002643626551538046, - "loss": 3.3299, + "grad_norm": 0.6648308038711548, + "learning_rate": 0.00026434970318402586, + "loss": 3.3304, "step": 51950 }, { "epoch": 5.606469002695418, - "grad_norm": 0.696631908416748, - "learning_rate": 0.0002640388559093362, - "loss": 3.3244, + "grad_norm": 0.6847530007362366, + "learning_rate": 0.00026402590393955746, + "loss": 3.3253, "step": 52000 }, { "epoch": 5.606469002695418, - "eval_accuracy": 0.38270100431123794, - "eval_loss": 3.3929481506347656, - "eval_runtime": 184.8201, - "eval_samples_per_second": 97.452, - "eval_steps_per_second": 6.092, + "eval_accuracy": 0.382617232934733, + "eval_loss": 3.393908739089966, + "eval_runtime": 185.0037, + "eval_samples_per_second": 97.355, + "eval_steps_per_second": 6.086, "step": 52000 }, { "epoch": 5.611859838274933, - "grad_norm": 0.6704844832420349, - "learning_rate": 0.00026371505666486776, - "loss": 3.3505, + "grad_norm": 0.6547165513038635, + "learning_rate": 0.000263702104695089, + "loss": 3.3494, "step": 52050 }, { "epoch": 5.617250673854447, - "grad_norm": 0.7229808568954468, - "learning_rate": 0.0002633912574203993, - "loss": 3.3271, + "grad_norm": 0.6862826347351074, + "learning_rate": 0.0002633783054506206, + "loss": 3.3276, "step": 52100 }, { "epoch": 5.622641509433962, - "grad_norm": 0.7184265851974487, - "learning_rate": 0.00026306745817593086, - "loss": 3.3306, + "grad_norm": 0.7386119365692139, + "learning_rate": 0.00026305450620615217, + "loss": 3.3311, "step": 52150 }, { "epoch": 5.628032345013477, - "grad_norm": 0.6913803219795227, - "learning_rate": 0.00026274365893146247, - "loss": 3.326, + "grad_norm": 0.6849868297576904, + "learning_rate": 0.0002627307069616837, + "loss": 3.3285, "step": 52200 }, { "epoch": 5.633423180592992, - "grad_norm": 0.6871593594551086, - "learning_rate": 0.000262419859686994, - "loss": 3.328, + "grad_norm": 0.7046032547950745, + "learning_rate": 0.0002624069077172153, + "loss": 3.3305, "step": 52250 }, { "epoch": 5.638814016172507, - "grad_norm": 0.6831674575805664, - "learning_rate": 0.0002620960604425256, - "loss": 3.3284, + "grad_norm": 0.6594682335853577, + "learning_rate": 0.0002620831084727469, + "loss": 3.3306, "step": 52300 }, { "epoch": 5.644204851752022, - "grad_norm": 0.7181283235549927, - "learning_rate": 0.0002617722611980572, + "grad_norm": 0.6664849519729614, + "learning_rate": 0.0002617593092282784, "loss": 3.3381, "step": 52350 }, { "epoch": 5.649595687331536, - "grad_norm": 0.6951797604560852, - "learning_rate": 0.0002614484619535888, - "loss": 3.3242, + "grad_norm": 0.6926441192626953, + "learning_rate": 0.00026143550998381003, + "loss": 3.3279, "step": 52400 }, { "epoch": 5.654986522911051, - "grad_norm": 0.7090012431144714, - "learning_rate": 0.00026112466270912033, - "loss": 3.354, + "grad_norm": 0.7188628315925598, + "learning_rate": 0.0002611117107393416, + "loss": 3.3547, "step": 52450 }, { "epoch": 5.660377358490566, - "grad_norm": 0.6859129071235657, - "learning_rate": 0.0002608008634646519, - "loss": 3.3293, + "grad_norm": 0.7127479314804077, + "learning_rate": 0.00026078791149487313, + "loss": 3.3288, "step": 52500 }, { "epoch": 5.665768194070081, - "grad_norm": 0.7087217569351196, - "learning_rate": 0.00026047706422018343, - "loss": 3.3462, + "grad_norm": 0.6943938732147217, + "learning_rate": 0.00026046411225040474, + "loss": 3.3464, "step": 52550 }, { "epoch": 5.671159029649596, - "grad_norm": 0.6836449503898621, - "learning_rate": 0.00026015326497571504, - "loss": 3.3587, + "grad_norm": 0.6763001084327698, + "learning_rate": 0.0002601403130059363, + "loss": 3.3593, "step": 52600 }, { "epoch": 5.67654986522911, - "grad_norm": 0.7609317302703857, - "learning_rate": 0.0002598294657312466, - "loss": 3.3302, + "grad_norm": 0.7764798998832703, + "learning_rate": 0.0002598165137614679, + "loss": 3.3311, "step": 52650 }, { "epoch": 5.681940700808625, - "grad_norm": 0.7099915146827698, - "learning_rate": 0.0002595056664867782, - "loss": 3.3192, + "grad_norm": 0.7057187557220459, + "learning_rate": 0.00025949271451699944, + "loss": 3.3212, "step": 52700 }, { "epoch": 5.6873315363881405, - "grad_norm": 0.7150067090988159, - "learning_rate": 0.00025918186724230974, - "loss": 3.3365, + "grad_norm": 0.6973100900650024, + "learning_rate": 0.000259168915272531, + "loss": 3.3382, "step": 52750 }, { "epoch": 5.692722371967655, - "grad_norm": 0.6776279211044312, - "learning_rate": 0.00025885806799784135, - "loss": 3.3202, + "grad_norm": 0.6737171411514282, + "learning_rate": 0.0002588451160280626, + "loss": 3.3209, "step": 52800 }, { "epoch": 5.69811320754717, - "grad_norm": 0.6856902837753296, - "learning_rate": 0.0002585342687533729, - "loss": 3.3162, + "grad_norm": 0.715227484703064, + "learning_rate": 0.00025852131678359415, + "loss": 3.3182, "step": 52850 }, { "epoch": 5.703504043126685, - "grad_norm": 0.7150585651397705, - "learning_rate": 0.00025821046950890445, - "loss": 3.3215, + "grad_norm": 0.7033119797706604, + "learning_rate": 0.0002581975175391257, + "loss": 3.3229, "step": 52900 }, { "epoch": 5.708894878706199, - "grad_norm": 0.7075424194335938, - "learning_rate": 0.000257886670264436, - "loss": 3.3247, + "grad_norm": 0.7333863377571106, + "learning_rate": 0.0002578737182946573, + "loss": 3.3259, "step": 52950 }, { "epoch": 5.714285714285714, - "grad_norm": 0.7501097917556763, - "learning_rate": 0.0002575628710199676, - "loss": 3.3379, + "grad_norm": 0.7076955437660217, + "learning_rate": 0.00025754991905018885, + "loss": 3.3385, "step": 53000 }, { "epoch": 5.714285714285714, - "eval_accuracy": 0.38347841572355057, - "eval_loss": 3.3866093158721924, - "eval_runtime": 185.0709, - "eval_samples_per_second": 97.319, - "eval_steps_per_second": 6.084, + "eval_accuracy": 0.38299588824876646, + "eval_loss": 3.389537811279297, + "eval_runtime": 185.2976, + "eval_samples_per_second": 97.2, + "eval_steps_per_second": 6.077, "step": 53000 }, { "epoch": 5.719676549865229, - "grad_norm": 0.6975317597389221, - "learning_rate": 0.00025724554776038856, - "loss": 3.3276, + "grad_norm": 0.6770291328430176, + "learning_rate": 0.0002572325957906098, + "loss": 3.3304, "step": 53050 }, { "epoch": 5.725067385444744, - "grad_norm": 0.7414888143539429, - "learning_rate": 0.0002569217485159201, - "loss": 3.3394, + "grad_norm": 0.7202932834625244, + "learning_rate": 0.00025690879654614136, + "loss": 3.3392, "step": 53100 }, { "epoch": 5.730458221024259, - "grad_norm": 0.6874222159385681, - "learning_rate": 0.0002565979492714517, - "loss": 3.3305, + "grad_norm": 0.6683873534202576, + "learning_rate": 0.0002565849973016729, + "loss": 3.3322, "step": 53150 }, { "epoch": 5.735849056603773, - "grad_norm": 0.6985239386558533, - "learning_rate": 0.00025627415002698326, - "loss": 3.3424, + "grad_norm": 0.7561058402061462, + "learning_rate": 0.0002562611980572045, + "loss": 3.3437, "step": 53200 }, { "epoch": 5.741239892183288, - "grad_norm": 0.7149052619934082, - "learning_rate": 0.0002559503507825148, - "loss": 3.3156, + "grad_norm": 0.651046097278595, + "learning_rate": 0.00025593739881273606, + "loss": 3.3163, "step": 53250 }, { "epoch": 5.7466307277628035, - "grad_norm": 0.7191247940063477, - "learning_rate": 0.00025562655153804636, - "loss": 3.3569, + "grad_norm": 0.7331181764602661, + "learning_rate": 0.00025561359956826767, + "loss": 3.3574, "step": 53300 }, { "epoch": 5.752021563342318, - "grad_norm": 0.6939454674720764, - "learning_rate": 0.00025530275229357797, - "loss": 3.3385, + "grad_norm": 0.6629624366760254, + "learning_rate": 0.0002552898003237992, + "loss": 3.3406, "step": 53350 }, { "epoch": 5.757412398921833, - "grad_norm": 0.6978759765625, - "learning_rate": 0.0002549789530491095, - "loss": 3.3435, + "grad_norm": 0.6959452629089355, + "learning_rate": 0.0002549660010793308, + "loss": 3.345, "step": 53400 }, { "epoch": 5.762803234501348, - "grad_norm": 0.6790081858634949, - "learning_rate": 0.0002546551538046411, - "loss": 3.3399, + "grad_norm": 0.6279221773147583, + "learning_rate": 0.0002546422018348624, + "loss": 3.3411, "step": 53450 }, { "epoch": 5.768194070080862, - "grad_norm": 0.7253314852714539, - "learning_rate": 0.0002543313545601727, - "loss": 3.321, + "grad_norm": 0.6780901551246643, + "learning_rate": 0.0002543184025903939, + "loss": 3.3237, "step": 53500 }, { "epoch": 5.773584905660377, - "grad_norm": 0.6867673397064209, - "learning_rate": 0.0002540075553157042, - "loss": 3.3211, + "grad_norm": 0.702332079410553, + "learning_rate": 0.0002539946033459255, + "loss": 3.3236, "step": 53550 }, { "epoch": 5.7789757412398925, - "grad_norm": 0.7390033602714539, - "learning_rate": 0.00025368375607123583, - "loss": 3.3414, + "grad_norm": 0.7578160166740417, + "learning_rate": 0.0002536708041014571, + "loss": 3.3433, "step": 53600 }, { "epoch": 5.784366576819407, - "grad_norm": 0.6785998940467834, - "learning_rate": 0.0002533599568267674, - "loss": 3.3245, + "grad_norm": 0.6509333252906799, + "learning_rate": 0.00025334700485698863, + "loss": 3.3264, "step": 53650 }, { "epoch": 5.789757412398922, - "grad_norm": 0.6617134213447571, - "learning_rate": 0.00025303615758229893, - "loss": 3.3451, + "grad_norm": 0.7034610509872437, + "learning_rate": 0.00025302320561252023, + "loss": 3.3471, "step": 53700 }, { "epoch": 5.795148247978437, - "grad_norm": 0.7111865282058716, - "learning_rate": 0.00025271235833783053, - "loss": 3.3338, + "grad_norm": 0.688968300819397, + "learning_rate": 0.0002526994063680518, + "loss": 3.334, "step": 53750 }, { "epoch": 5.800539083557951, - "grad_norm": 0.7098289728164673, - "learning_rate": 0.0002523885590933621, - "loss": 3.3222, + "grad_norm": 0.6626862287521362, + "learning_rate": 0.0002523756071235834, + "loss": 3.3253, "step": 53800 }, { "epoch": 5.8059299191374665, - "grad_norm": 0.7283722162246704, - "learning_rate": 0.00025206475984889364, - "loss": 3.3266, + "grad_norm": 0.715716540813446, + "learning_rate": 0.00025205180787911494, + "loss": 3.3286, "step": 53850 }, { "epoch": 5.811320754716981, - "grad_norm": 0.7388856410980225, - "learning_rate": 0.00025174096060442524, - "loss": 3.3276, + "grad_norm": 0.6780540347099304, + "learning_rate": 0.0002517280086346465, + "loss": 3.329, "step": 53900 }, { "epoch": 5.816711590296496, - "grad_norm": 0.7053267955780029, - "learning_rate": 0.0002514171613599568, - "loss": 3.3259, + "grad_norm": 0.7091286182403564, + "learning_rate": 0.00025140420939017804, + "loss": 3.3265, "step": 53950 }, { "epoch": 5.822102425876011, - "grad_norm": 0.6764585971832275, - "learning_rate": 0.0002510933621154884, - "loss": 3.3211, + "grad_norm": 0.7116252779960632, + "learning_rate": 0.00025108041014570965, + "loss": 3.3229, "step": 54000 }, { "epoch": 5.822102425876011, - "eval_accuracy": 0.3836551939720247, - "eval_loss": 3.3842101097106934, - "eval_runtime": 184.7838, - "eval_samples_per_second": 97.471, + "eval_accuracy": 0.3835450199437808, + "eval_loss": 3.384857416152954, + "eval_runtime": 184.7842, + "eval_samples_per_second": 97.47, "eval_steps_per_second": 6.094, "step": 54000 }, { "epoch": 5.827493261455525, - "grad_norm": 0.6842856407165527, - "learning_rate": 0.00025076956287101995, - "loss": 3.3321, + "grad_norm": 0.6713538765907288, + "learning_rate": 0.0002507566109012412, + "loss": 3.3324, "step": 54050 }, { "epoch": 5.83288409703504, - "grad_norm": 0.6792701482772827, - "learning_rate": 0.00025044576362655155, - "loss": 3.32, + "grad_norm": 0.7192128896713257, + "learning_rate": 0.0002504328116567728, + "loss": 3.3225, "step": 54100 }, { "epoch": 5.8382749326145555, - "grad_norm": 0.7160159349441528, - "learning_rate": 0.00025012196438208305, - "loss": 3.3357, + "grad_norm": 0.7344403862953186, + "learning_rate": 0.00025010901241230435, + "loss": 3.3354, "step": 54150 }, { "epoch": 5.84366576819407, - "grad_norm": 0.744286060333252, - "learning_rate": 0.00024979816513761465, - "loss": 3.3244, + "grad_norm": 0.7250232100486755, + "learning_rate": 0.0002497852131678359, + "loss": 3.3251, "step": 54200 }, { "epoch": 5.849056603773585, - "grad_norm": 0.6898226141929626, - "learning_rate": 0.0002494743658931462, - "loss": 3.3246, + "grad_norm": 0.6815427541732788, + "learning_rate": 0.0002494614139233675, + "loss": 3.3268, "step": 54250 }, { "epoch": 5.8544474393531, - "grad_norm": 0.7597428560256958, - "learning_rate": 0.0002491505666486778, - "loss": 3.325, + "grad_norm": 0.7029092311859131, + "learning_rate": 0.00024913761467889906, + "loss": 3.3261, "step": 54300 }, { "epoch": 5.859838274932614, - "grad_norm": 0.7275387048721313, - "learning_rate": 0.00024882676740420936, - "loss": 3.3379, + "grad_norm": 0.7195873856544495, + "learning_rate": 0.0002488138154344306, + "loss": 3.3378, "step": 54350 }, { "epoch": 5.8652291105121295, - "grad_norm": 0.6952611207962036, - "learning_rate": 0.00024850296815974096, - "loss": 3.3227, + "grad_norm": 0.7248128652572632, + "learning_rate": 0.0002484900161899622, + "loss": 3.3249, "step": 54400 }, { "epoch": 5.870619946091644, - "grad_norm": 0.7258508801460266, - "learning_rate": 0.0002481791689152725, - "loss": 3.3406, + "grad_norm": 0.6670764684677124, + "learning_rate": 0.00024816621694549376, + "loss": 3.3428, "step": 54450 }, { "epoch": 5.876010781671159, - "grad_norm": 0.6898373365402222, - "learning_rate": 0.0002478553696708041, - "loss": 3.3427, + "grad_norm": 0.6498870253562927, + "learning_rate": 0.00024784241770102537, + "loss": 3.341, "step": 54500 }, { "epoch": 5.881401617250674, - "grad_norm": 0.7068774104118347, - "learning_rate": 0.0002475315704263356, - "loss": 3.3143, + "grad_norm": 0.7022345066070557, + "learning_rate": 0.0002475186184565569, + "loss": 3.3149, "step": 54550 }, { "epoch": 5.886792452830189, - "grad_norm": 0.6842253804206848, - "learning_rate": 0.0002472077711818672, - "loss": 3.3409, + "grad_norm": 0.6908873915672302, + "learning_rate": 0.00024719481921208847, + "loss": 3.3422, "step": 54600 }, { "epoch": 5.892183288409703, - "grad_norm": 0.6846425533294678, - "learning_rate": 0.00024688397193739877, - "loss": 3.3212, + "grad_norm": 0.663985013961792, + "learning_rate": 0.0002468710199676201, + "loss": 3.3228, "step": 54650 }, { "epoch": 5.8975741239892185, - "grad_norm": 0.6987577676773071, - "learning_rate": 0.0002465601726929304, - "loss": 3.3301, + "grad_norm": 0.7020365595817566, + "learning_rate": 0.0002465472207231516, + "loss": 3.3317, "step": 54700 }, { "epoch": 5.902964959568733, - "grad_norm": 0.6544787287712097, - "learning_rate": 0.0002462363734484619, - "loss": 3.338, + "grad_norm": 0.6784001588821411, + "learning_rate": 0.0002462234214786832, + "loss": 3.3385, "step": 54750 }, { "epoch": 5.908355795148248, - "grad_norm": 0.7005526423454285, - "learning_rate": 0.00024591257420399353, - "loss": 3.3302, + "grad_norm": 0.7305879592895508, + "learning_rate": 0.0002458996222342148, + "loss": 3.3328, "step": 54800 }, { "epoch": 5.913746630727763, - "grad_norm": 0.7439953684806824, - "learning_rate": 0.0002455887749595251, - "loss": 3.3389, + "grad_norm": 0.7549262046813965, + "learning_rate": 0.00024557582298974633, + "loss": 3.3406, "step": 54850 }, { "epoch": 5.919137466307277, - "grad_norm": 0.714672327041626, - "learning_rate": 0.00024526497571505663, - "loss": 3.3383, + "grad_norm": 0.7167611122131348, + "learning_rate": 0.0002452520237452779, + "loss": 3.3398, "step": 54900 }, { "epoch": 5.9245283018867925, - "grad_norm": 0.7019497156143188, - "learning_rate": 0.00024494117647058824, - "loss": 3.3411, + "grad_norm": 0.7034414410591125, + "learning_rate": 0.0002449282245008095, + "loss": 3.344, "step": 54950 }, { "epoch": 5.929919137466308, - "grad_norm": 0.6823916435241699, - "learning_rate": 0.0002446173772261198, - "loss": 3.3249, + "grad_norm": 0.691204309463501, + "learning_rate": 0.00024460442525634104, + "loss": 3.3257, "step": 55000 }, { "epoch": 5.929919137466308, - "eval_accuracy": 0.3843319928102211, - "eval_loss": 3.379180908203125, - "eval_runtime": 185.106, - "eval_samples_per_second": 97.301, - "eval_steps_per_second": 6.083, + "eval_accuracy": 0.38400364378324575, + "eval_loss": 3.38031268119812, + "eval_runtime": 185.1848, + "eval_samples_per_second": 97.26, + "eval_steps_per_second": 6.08, "step": 55000 }, { "epoch": 5.935309973045822, - "grad_norm": 0.7124596834182739, - "learning_rate": 0.00024429357798165134, - "loss": 3.3525, + "grad_norm": 0.6970035433769226, + "learning_rate": 0.00024428062601187264, + "loss": 3.3526, "step": 55050 }, { "epoch": 5.940700808625337, - "grad_norm": 0.706619381904602, - "learning_rate": 0.00024396977873718291, - "loss": 3.3188, + "grad_norm": 0.7048007249832153, + "learning_rate": 0.0002439568267674042, + "loss": 3.3196, "step": 55100 }, { "epoch": 5.946091644204852, - "grad_norm": 0.720799446105957, - "learning_rate": 0.0002436459794927145, - "loss": 3.3249, + "grad_norm": 0.6989700198173523, + "learning_rate": 0.00024363302752293574, + "loss": 3.3271, "step": 55150 }, { "epoch": 5.951482479784366, - "grad_norm": 0.694983720779419, - "learning_rate": 0.00024332218024824607, - "loss": 3.3204, + "grad_norm": 0.714759349822998, + "learning_rate": 0.00024330922827846732, + "loss": 3.3236, "step": 55200 }, { "epoch": 5.9568733153638815, - "grad_norm": 0.6975095272064209, - "learning_rate": 0.00024299838100377765, - "loss": 3.328, + "grad_norm": 0.7349509000778198, + "learning_rate": 0.0002429854290339989, + "loss": 3.3284, "step": 55250 }, { "epoch": 5.962264150943396, - "grad_norm": 0.6743301153182983, - "learning_rate": 0.00024267458175930922, - "loss": 3.321, + "grad_norm": 0.671440839767456, + "learning_rate": 0.00024266162978953048, + "loss": 3.3217, "step": 55300 }, { "epoch": 5.967654986522911, - "grad_norm": 0.7407911419868469, - "learning_rate": 0.0002423507825148408, - "loss": 3.3238, + "grad_norm": 0.7063277363777161, + "learning_rate": 0.00024233783054506203, + "loss": 3.324, "step": 55350 }, { "epoch": 5.973045822102426, - "grad_norm": 0.6877210140228271, - "learning_rate": 0.00024202698327037233, - "loss": 3.3247, + "grad_norm": 0.6923580169677734, + "learning_rate": 0.0002420140313005936, + "loss": 3.3253, "step": 55400 }, { "epoch": 5.97843665768194, - "grad_norm": 0.696134090423584, - "learning_rate": 0.0002417031840259039, - "loss": 3.3433, + "grad_norm": 0.6707276105880737, + "learning_rate": 0.00024169023205612518, + "loss": 3.3456, "step": 55450 }, { "epoch": 5.9838274932614555, - "grad_norm": 0.7565768361091614, - "learning_rate": 0.00024137938478143548, - "loss": 3.3266, + "grad_norm": 0.7710720896720886, + "learning_rate": 0.00024136643281165676, + "loss": 3.3296, "step": 55500 }, { "epoch": 5.989218328840971, - "grad_norm": 0.7190529704093933, - "learning_rate": 0.00024105558553696706, - "loss": 3.33, + "grad_norm": 0.7132817506790161, + "learning_rate": 0.00024104263356718834, + "loss": 3.3307, "step": 55550 }, { "epoch": 5.994609164420485, - "grad_norm": 0.732549786567688, - "learning_rate": 0.00024073178629249864, - "loss": 3.3474, + "grad_norm": 0.7166342735290527, + "learning_rate": 0.0002407188343227199, + "loss": 3.3496, "step": 55600 }, { "epoch": 6.0, - "grad_norm": 1.5415840148925781, - "learning_rate": 0.00024040798704803021, - "loss": 3.3441, + "grad_norm": 1.5364723205566406, + "learning_rate": 0.00024039503507825147, + "loss": 3.3466, "step": 55650 }, { "epoch": 6.005390835579515, - "grad_norm": 0.7237451076507568, - "learning_rate": 0.00024008418780356176, - "loss": 3.24, + "grad_norm": 0.756853461265564, + "learning_rate": 0.00024007123583378302, + "loss": 3.2414, "step": 55700 }, { "epoch": 6.010781671159029, - "grad_norm": 0.7061052918434143, - "learning_rate": 0.00023976038855909334, - "loss": 3.2419, + "grad_norm": 0.6904761791229248, + "learning_rate": 0.0002397474365893146, + "loss": 3.2437, "step": 55750 }, { "epoch": 6.0161725067385445, - "grad_norm": 0.7535544037818909, - "learning_rate": 0.00023943658931462492, - "loss": 3.234, + "grad_norm": 0.7776129245758057, + "learning_rate": 0.00023942363734484617, + "loss": 3.2365, "step": 55800 }, { "epoch": 6.02156334231806, - "grad_norm": 0.7151229977607727, - "learning_rate": 0.00023911279007015647, - "loss": 3.242, + "grad_norm": 0.7028306722640991, + "learning_rate": 0.00023909983810037775, + "loss": 3.2434, "step": 55850 }, { "epoch": 6.026954177897574, - "grad_norm": 0.6773797273635864, - "learning_rate": 0.00023878899082568805, - "loss": 3.2422, + "grad_norm": 0.6557170152664185, + "learning_rate": 0.00023877603885590933, + "loss": 3.2437, "step": 55900 }, { "epoch": 6.032345013477089, - "grad_norm": 0.6992098093032837, - "learning_rate": 0.00023846519158121963, - "loss": 3.238, + "grad_norm": 0.696887731552124, + "learning_rate": 0.0002384522396114409, + "loss": 3.2402, "step": 55950 }, { "epoch": 6.037735849056604, - "grad_norm": 0.6867258548736572, - "learning_rate": 0.0002381413923367512, - "loss": 3.2353, + "grad_norm": 0.6949684619903564, + "learning_rate": 0.00023812844036697248, + "loss": 3.2365, "step": 56000 }, { "epoch": 6.037735849056604, - "eval_accuracy": 0.3839687662062573, - "eval_loss": 3.3829169273376465, - "eval_runtime": 185.0633, - "eval_samples_per_second": 97.323, - "eval_steps_per_second": 6.084, + "eval_accuracy": 0.3836666025252452, + "eval_loss": 3.385128974914551, + "eval_runtime": 184.7946, + "eval_samples_per_second": 97.465, + "eval_steps_per_second": 6.093, "step": 56000 }, { "epoch": 6.0431266846361185, - "grad_norm": 0.7631310820579529, - "learning_rate": 0.00023781759309228275, - "loss": 3.2433, + "grad_norm": 0.6826362609863281, + "learning_rate": 0.000237804641122504, + "loss": 3.246, "step": 56050 }, { "epoch": 6.048517520215634, - "grad_norm": 0.7571997046470642, - "learning_rate": 0.00023749379384781433, - "loss": 3.2398, + "grad_norm": 0.7227660417556763, + "learning_rate": 0.00023748084187803558, + "loss": 3.2431, "step": 56100 }, { "epoch": 6.053908355795148, - "grad_norm": 0.7700287699699402, - "learning_rate": 0.0002371699946033459, - "loss": 3.2567, + "grad_norm": 0.7226882576942444, + "learning_rate": 0.00023715704263356716, + "loss": 3.2582, "step": 56150 }, { "epoch": 6.059299191374663, - "grad_norm": 0.7289305329322815, - "learning_rate": 0.0002368461953588775, - "loss": 3.2385, + "grad_norm": 0.7452118396759033, + "learning_rate": 0.00023683324338909874, + "loss": 3.2401, "step": 56200 }, { "epoch": 6.064690026954178, - "grad_norm": 0.7690556049346924, - "learning_rate": 0.00023652239611440904, - "loss": 3.2467, + "grad_norm": 0.7395612001419067, + "learning_rate": 0.00023650944414463032, + "loss": 3.2499, "step": 56250 }, { "epoch": 6.070080862533692, - "grad_norm": 0.6697551608085632, - "learning_rate": 0.00023619859686994062, - "loss": 3.2544, + "grad_norm": 0.7218838334083557, + "learning_rate": 0.0002361856449001619, + "loss": 3.2576, "step": 56300 }, { "epoch": 6.0754716981132075, - "grad_norm": 0.7450080513954163, - "learning_rate": 0.00023587479762547217, - "loss": 3.2534, + "grad_norm": 0.761297345161438, + "learning_rate": 0.00023586184565569347, + "loss": 3.2537, "step": 56350 }, { "epoch": 6.080862533692723, - "grad_norm": 0.7233448624610901, - "learning_rate": 0.00023555099838100374, - "loss": 3.2729, + "grad_norm": 0.7278382778167725, + "learning_rate": 0.00023553804641122502, + "loss": 3.2738, "step": 56400 }, { "epoch": 6.086253369272237, - "grad_norm": 0.6958713531494141, - "learning_rate": 0.00023522719913653532, - "loss": 3.2581, + "grad_norm": 0.6822406053543091, + "learning_rate": 0.00023521424716675657, + "loss": 3.2623, "step": 56450 }, { "epoch": 6.091644204851752, - "grad_norm": 0.7158046364784241, - "learning_rate": 0.0002349033998920669, - "loss": 3.2399, + "grad_norm": 0.7750765681266785, + "learning_rate": 0.00023489044792228815, + "loss": 3.2439, "step": 56500 }, { "epoch": 6.097035040431267, - "grad_norm": 0.6767109036445618, - "learning_rate": 0.00023457960064759848, - "loss": 3.2716, + "grad_norm": 0.6662526726722717, + "learning_rate": 0.00023456664867781973, + "loss": 3.2745, "step": 56550 }, { "epoch": 6.1024258760107815, - "grad_norm": 0.7307221293449402, - "learning_rate": 0.00023425580140313005, - "loss": 3.2545, + "grad_norm": 0.7166032791137695, + "learning_rate": 0.0002342428494333513, + "loss": 3.256, "step": 56600 }, { "epoch": 6.107816711590297, - "grad_norm": 0.7141797542572021, - "learning_rate": 0.00023393200215866163, - "loss": 3.238, + "grad_norm": 0.7623759508132935, + "learning_rate": 0.00023391905018888288, + "loss": 3.2411, "step": 56650 }, { "epoch": 6.113207547169812, - "grad_norm": 0.7216366529464722, - "learning_rate": 0.00023360820291419316, - "loss": 3.2681, + "grad_norm": 0.7492631673812866, + "learning_rate": 0.00023359525094441443, + "loss": 3.2714, "step": 56700 }, { "epoch": 6.118598382749326, - "grad_norm": 0.7594369649887085, - "learning_rate": 0.00023328440366972473, - "loss": 3.277, + "grad_norm": 0.7426467537879944, + "learning_rate": 0.000233271451699946, + "loss": 3.2786, "step": 56750 }, { "epoch": 6.123989218328841, - "grad_norm": 0.7302905321121216, - "learning_rate": 0.0002329606044252563, - "loss": 3.2676, + "grad_norm": 0.6925146579742432, + "learning_rate": 0.0002329476524554776, + "loss": 3.2701, "step": 56800 }, { "epoch": 6.129380053908355, - "grad_norm": 0.734412670135498, - "learning_rate": 0.0002326368051807879, - "loss": 3.2495, + "grad_norm": 0.7421886920928955, + "learning_rate": 0.00023263032919589851, + "loss": 3.2516, "step": 56850 }, { "epoch": 6.1347708894878705, - "grad_norm": 0.7013684511184692, - "learning_rate": 0.00023231300593631947, - "loss": 3.2643, + "grad_norm": 0.6857749223709106, + "learning_rate": 0.0002323065299514301, + "loss": 3.2697, "step": 56900 }, { "epoch": 6.140161725067386, - "grad_norm": 0.7368125319480896, - "learning_rate": 0.00023198920669185104, - "loss": 3.249, + "grad_norm": 0.7283300161361694, + "learning_rate": 0.00023198273070696167, + "loss": 3.2511, "step": 56950 }, { "epoch": 6.1455525606469, - "grad_norm": 0.7295984625816345, - "learning_rate": 0.00023166540744738262, - "loss": 3.2655, + "grad_norm": 0.6971080899238586, + "learning_rate": 0.00023165893146249325, + "loss": 3.2675, "step": 57000 }, { "epoch": 6.1455525606469, - "eval_accuracy": 0.38440663734414965, - "eval_loss": 3.3809986114501953, - "eval_runtime": 185.0253, - "eval_samples_per_second": 97.343, + "eval_accuracy": 0.3843410109999097, + "eval_loss": 3.3825490474700928, + "eval_runtime": 185.0025, + "eval_samples_per_second": 97.355, "eval_steps_per_second": 6.086, "step": 57000 }, { "epoch": 6.150943396226415, - "grad_norm": 0.7095818519592285, - "learning_rate": 0.00023134808418780352, - "loss": 3.2574, + "grad_norm": 0.6910756230354309, + "learning_rate": 0.0002313351322180248, + "loss": 3.2588, "step": 57050 }, { "epoch": 6.15633423180593, - "grad_norm": 0.7285301685333252, - "learning_rate": 0.0002310242849433351, - "loss": 3.2646, + "grad_norm": 0.6956217288970947, + "learning_rate": 0.00023101133297355638, + "loss": 3.2673, "step": 57100 }, { "epoch": 6.1617250673854445, - "grad_norm": 0.6750980019569397, - "learning_rate": 0.00023070048569886667, - "loss": 3.2782, + "grad_norm": 0.6852027773857117, + "learning_rate": 0.00023068753372908795, + "loss": 3.2824, "step": 57150 }, { "epoch": 6.16711590296496, - "grad_norm": 0.7419140934944153, - "learning_rate": 0.00023037668645439825, - "loss": 3.2553, + "grad_norm": 0.7331057190895081, + "learning_rate": 0.0002303637344846195, + "loss": 3.2579, "step": 57200 }, { "epoch": 6.172506738544475, - "grad_norm": 0.6713706851005554, - "learning_rate": 0.00023005288720992983, - "loss": 3.2533, + "grad_norm": 0.6825144290924072, + "learning_rate": 0.00023003993524015108, + "loss": 3.2576, "step": 57250 }, { "epoch": 6.177897574123989, - "grad_norm": 0.7637577056884766, - "learning_rate": 0.0002297290879654614, - "loss": 3.2685, + "grad_norm": 0.746617317199707, + "learning_rate": 0.00022971613599568266, + "loss": 3.2711, "step": 57300 }, { "epoch": 6.183288409703504, - "grad_norm": 0.709025502204895, - "learning_rate": 0.00022940528872099299, - "loss": 3.256, + "grad_norm": 0.6929312944412231, + "learning_rate": 0.0002293923367512142, + "loss": 3.2592, "step": 57350 }, { "epoch": 6.188679245283019, - "grad_norm": 0.7199300527572632, - "learning_rate": 0.00022908148947652454, - "loss": 3.2631, + "grad_norm": 0.7209480404853821, + "learning_rate": 0.0002290685375067458, + "loss": 3.2663, "step": 57400 }, { "epoch": 6.1940700808625335, - "grad_norm": 0.7396861910820007, - "learning_rate": 0.0002287576902320561, - "loss": 3.2615, + "grad_norm": 0.7121549248695374, + "learning_rate": 0.00022874473826227736, + "loss": 3.2621, "step": 57450 }, { "epoch": 6.199460916442049, - "grad_norm": 0.7257159352302551, - "learning_rate": 0.00022843389098758766, - "loss": 3.2525, + "grad_norm": 0.7355096340179443, + "learning_rate": 0.00022842093901780894, + "loss": 3.256, "step": 57500 }, { "epoch": 6.204851752021563, - "grad_norm": 0.7163692712783813, - "learning_rate": 0.00022811009174311924, - "loss": 3.277, + "grad_norm": 0.7216746211051941, + "learning_rate": 0.00022809713977334052, + "loss": 3.2775, "step": 57550 }, { "epoch": 6.210242587601078, - "grad_norm": 0.7706537246704102, - "learning_rate": 0.00022778629249865082, - "loss": 3.2624, + "grad_norm": 0.7493441700935364, + "learning_rate": 0.0002277733405288721, + "loss": 3.2659, "step": 57600 }, { "epoch": 6.215633423180593, - "grad_norm": 0.7195106148719788, - "learning_rate": 0.0002274624932541824, - "loss": 3.2776, + "grad_norm": 0.7113534808158875, + "learning_rate": 0.00022744954128440365, + "loss": 3.2794, "step": 57650 }, { "epoch": 6.2210242587601075, - "grad_norm": 0.7585012912750244, - "learning_rate": 0.00022713869400971397, + "grad_norm": 0.7359064817428589, + "learning_rate": 0.0002271257420399352, "loss": 3.2706, "step": 57700 }, { "epoch": 6.226415094339623, - "grad_norm": 0.7925708293914795, - "learning_rate": 0.00022681489476524553, - "loss": 3.261, + "grad_norm": 0.7439775466918945, + "learning_rate": 0.00022680194279546678, + "loss": 3.2631, "step": 57750 }, { "epoch": 6.231805929919138, - "grad_norm": 0.7118070125579834, - "learning_rate": 0.0002264910955207771, - "loss": 3.2546, + "grad_norm": 0.7042476534843445, + "learning_rate": 0.00022647814355099835, + "loss": 3.2573, "step": 57800 }, { "epoch": 6.237196765498652, - "grad_norm": 0.7217944264411926, - "learning_rate": 0.00022616729627630868, - "loss": 3.2693, + "grad_norm": 0.7179429531097412, + "learning_rate": 0.00022615434430652993, + "loss": 3.2712, "step": 57850 }, { "epoch": 6.242587601078167, - "grad_norm": 0.7400355935096741, - "learning_rate": 0.00022584349703184023, - "loss": 3.2768, + "grad_norm": 0.7358320355415344, + "learning_rate": 0.0002258305450620615, + "loss": 3.2811, "step": 57900 }, { "epoch": 6.247978436657682, - "grad_norm": 0.7412822842597961, - "learning_rate": 0.0002255196977873718, - "loss": 3.269, + "grad_norm": 0.6951166391372681, + "learning_rate": 0.0002255067458175931, + "loss": 3.2729, "step": 57950 }, { "epoch": 6.2533692722371965, - "grad_norm": 0.694203794002533, - "learning_rate": 0.0002251958985429034, - "loss": 3.2709, + "grad_norm": 0.7272751331329346, + "learning_rate": 0.00022518294657312467, + "loss": 3.2744, "step": 58000 }, { "epoch": 6.2533692722371965, - "eval_accuracy": 0.3845205055705792, - "eval_loss": 3.3798089027404785, - "eval_runtime": 185.1754, - "eval_samples_per_second": 97.265, - "eval_steps_per_second": 6.081, + "eval_accuracy": 0.3845305016362582, + "eval_loss": 3.3816213607788086, + "eval_runtime": 185.286, + "eval_samples_per_second": 97.207, + "eval_steps_per_second": 6.077, "step": 58000 }, { "epoch": 6.258760107816712, - "grad_norm": 0.6906482577323914, - "learning_rate": 0.00022487209929843494, - "loss": 3.2579, + "grad_norm": 0.6936237812042236, + "learning_rate": 0.0002248591473286562, + "loss": 3.2592, "step": 58050 }, { "epoch": 6.264150943396227, - "grad_norm": 0.704700767993927, - "learning_rate": 0.00022454830005396652, - "loss": 3.2713, + "grad_norm": 0.7434632182121277, + "learning_rate": 0.00022453534808418777, + "loss": 3.274, "step": 58100 }, { "epoch": 6.269541778975741, - "grad_norm": 0.718530535697937, - "learning_rate": 0.0002242245008094981, - "loss": 3.2661, + "grad_norm": 0.697569727897644, + "learning_rate": 0.00022421154883971934, + "loss": 3.2678, "step": 58150 }, { "epoch": 6.274932614555256, - "grad_norm": 0.7455987334251404, - "learning_rate": 0.00022390070156502967, - "loss": 3.2877, + "grad_norm": 0.710084080696106, + "learning_rate": 0.00022388774959525092, + "loss": 3.2885, "step": 58200 }, { "epoch": 6.280323450134771, - "grad_norm": 0.7250301241874695, - "learning_rate": 0.00022357690232056125, - "loss": 3.2694, + "grad_norm": 0.7134957313537598, + "learning_rate": 0.0002235639503507825, + "loss": 3.2707, "step": 58250 }, { "epoch": 6.285714285714286, - "grad_norm": 0.71706622838974, - "learning_rate": 0.0002232531030760928, - "loss": 3.2583, + "grad_norm": 0.694118320941925, + "learning_rate": 0.00022324015110631408, + "loss": 3.2621, "step": 58300 }, { "epoch": 6.291105121293801, - "grad_norm": 0.7500212788581848, - "learning_rate": 0.00022292930383162435, - "loss": 3.2811, + "grad_norm": 0.7010383009910583, + "learning_rate": 0.00022291635186184565, + "loss": 3.2829, "step": 58350 }, { "epoch": 6.296495956873315, - "grad_norm": 0.7041599750518799, - "learning_rate": 0.00022260550458715593, - "loss": 3.2633, + "grad_norm": 0.6871818900108337, + "learning_rate": 0.0002225925526173772, + "loss": 3.2675, "step": 58400 }, { "epoch": 6.30188679245283, - "grad_norm": 0.6938748955726624, - "learning_rate": 0.0002222817053426875, - "loss": 3.2907, + "grad_norm": 0.7094181776046753, + "learning_rate": 0.00022226875337290878, + "loss": 3.2923, "step": 58450 }, { "epoch": 6.307277628032345, - "grad_norm": 0.7118483185768127, - "learning_rate": 0.00022195790609821908, - "loss": 3.2895, + "grad_norm": 0.697614848613739, + "learning_rate": 0.00022194495412844033, + "loss": 3.2906, "step": 58500 }, { "epoch": 6.3126684636118595, - "grad_norm": 0.7580938935279846, - "learning_rate": 0.00022163410685375066, - "loss": 3.2791, + "grad_norm": 0.7467103004455566, + "learning_rate": 0.0002216211548839719, + "loss": 3.2814, "step": 58550 }, { "epoch": 6.318059299191375, - "grad_norm": 0.7854766845703125, - "learning_rate": 0.00022131030760928224, - "loss": 3.2838, + "grad_norm": 0.7276577353477478, + "learning_rate": 0.0002212973556395035, + "loss": 3.2854, "step": 58600 }, { "epoch": 6.32345013477089, - "grad_norm": 0.7117508053779602, - "learning_rate": 0.00022098650836481382, - "loss": 3.2798, + "grad_norm": 0.7234843969345093, + "learning_rate": 0.00022097355639503507, + "loss": 3.2813, "step": 58650 }, { "epoch": 6.328840970350404, - "grad_norm": 0.8072823286056519, - "learning_rate": 0.0002206627091203454, - "loss": 3.2804, + "grad_norm": 0.7457574009895325, + "learning_rate": 0.00022064975715056664, + "loss": 3.2817, "step": 58700 }, { "epoch": 6.334231805929919, - "grad_norm": 0.7097095847129822, - "learning_rate": 0.00022033890987587692, - "loss": 3.2692, + "grad_norm": 0.7120930552482605, + "learning_rate": 0.0002203259579060982, + "loss": 3.2705, "step": 58750 }, { "epoch": 6.339622641509434, - "grad_norm": 0.7268771529197693, - "learning_rate": 0.0002200151106314085, - "loss": 3.2774, + "grad_norm": 0.7144607305526733, + "learning_rate": 0.00022000215866162977, + "loss": 3.2781, "step": 58800 }, { "epoch": 6.345013477088949, - "grad_norm": 0.7636404037475586, - "learning_rate": 0.00021969131138694007, - "loss": 3.2894, + "grad_norm": 0.6950954794883728, + "learning_rate": 0.00021967835941716135, + "loss": 3.2913, "step": 58850 }, { "epoch": 6.350404312668464, - "grad_norm": 0.7441515326499939, - "learning_rate": 0.00021936751214247165, + "grad_norm": 0.7282975912094116, + "learning_rate": 0.0002193545601726929, "loss": 3.2755, "step": 58900 }, { "epoch": 6.355795148247978, - "grad_norm": 0.7508131861686707, - "learning_rate": 0.00021904371289800323, - "loss": 3.2721, + "grad_norm": 0.7318394780158997, + "learning_rate": 0.00021903076092822448, + "loss": 3.276, "step": 58950 }, { "epoch": 6.361185983827493, - "grad_norm": 0.7223631739616394, - "learning_rate": 0.0002187199136535348, - "loss": 3.2884, + "grad_norm": 0.7199020385742188, + "learning_rate": 0.00021870696168375606, + "loss": 3.2922, "step": 59000 }, { "epoch": 6.361185983827493, - "eval_accuracy": 0.38532431963463076, - "eval_loss": 3.3748483657836914, - "eval_runtime": 184.7925, - "eval_samples_per_second": 97.466, - "eval_steps_per_second": 6.093, + "eval_accuracy": 0.3849554430805006, + "eval_loss": 3.3758089542388916, + "eval_runtime": 184.9696, + "eval_samples_per_second": 97.373, + "eval_steps_per_second": 6.087, "step": 59000 }, { "epoch": 6.366576819407008, - "grad_norm": 0.7511294484138489, - "learning_rate": 0.00021839611440906638, - "loss": 3.2777, + "grad_norm": 0.7290945053100586, + "learning_rate": 0.0002183831624392876, + "loss": 3.2789, "step": 59050 }, { "epoch": 6.3719676549865225, - "grad_norm": 0.7336013317108154, - "learning_rate": 0.00021807879114948728, - "loss": 3.275, + "grad_norm": 0.7788245677947998, + "learning_rate": 0.00021805936319481918, + "loss": 3.2769, "step": 59100 }, { "epoch": 6.377358490566038, - "grad_norm": 0.7369405627250671, - "learning_rate": 0.00021775499190501886, - "loss": 3.2704, + "grad_norm": 0.7099036574363708, + "learning_rate": 0.00021773556395035076, + "loss": 3.271, "step": 59150 }, { "epoch": 6.382749326145553, - "grad_norm": 0.7976130843162537, - "learning_rate": 0.00021743119266055044, - "loss": 3.2738, + "grad_norm": 0.7210767865180969, + "learning_rate": 0.00021741176470588234, + "loss": 3.2757, "step": 59200 }, { "epoch": 6.388140161725067, - "grad_norm": 0.7466988563537598, - "learning_rate": 0.000217107393416082, - "loss": 3.2758, + "grad_norm": 0.7274968028068542, + "learning_rate": 0.00021708796546141392, + "loss": 3.2776, "step": 59250 }, { "epoch": 6.393530997304582, - "grad_norm": 0.7116513848304749, - "learning_rate": 0.0002167835941716136, - "loss": 3.2786, + "grad_norm": 0.6926999688148499, + "learning_rate": 0.0002167641662169455, + "loss": 3.2813, "step": 59300 }, { "epoch": 6.398921832884097, - "grad_norm": 0.7164875864982605, - "learning_rate": 0.00021645979492714517, - "loss": 3.2747, + "grad_norm": 0.6983639001846313, + "learning_rate": 0.00021644036697247702, + "loss": 3.2765, "step": 59350 }, { "epoch": 6.404312668463612, - "grad_norm": 0.7558356523513794, - "learning_rate": 0.00021613599568267672, - "loss": 3.3011, + "grad_norm": 0.7861400246620178, + "learning_rate": 0.0002161165677280086, + "loss": 3.3026, "step": 59400 }, { "epoch": 6.409703504043127, - "grad_norm": 0.7918745875358582, - "learning_rate": 0.0002158121964382083, - "loss": 3.2843, + "grad_norm": 0.7995545864105225, + "learning_rate": 0.00021579276848354017, + "loss": 3.2864, "step": 59450 }, { "epoch": 6.415094339622642, - "grad_norm": 0.7076215744018555, - "learning_rate": 0.00021548839719373985, - "loss": 3.2718, + "grad_norm": 0.7238384485244751, + "learning_rate": 0.00021546896923907175, + "loss": 3.273, "step": 59500 }, { "epoch": 6.420485175202156, - "grad_norm": 0.7596773505210876, - "learning_rate": 0.00021516459794927142, - "loss": 3.2693, + "grad_norm": 0.7718705534934998, + "learning_rate": 0.00021514516999460333, + "loss": 3.2711, "step": 59550 }, { "epoch": 6.425876010781671, - "grad_norm": 0.7385455965995789, - "learning_rate": 0.000214840798704803, - "loss": 3.293, + "grad_norm": 0.7363432049751282, + "learning_rate": 0.0002148213707501349, + "loss": 3.2944, "step": 59600 }, { "epoch": 6.431266846361186, - "grad_norm": 0.7650585174560547, - "learning_rate": 0.00021451699946033458, - "loss": 3.2822, + "grad_norm": 0.7251701354980469, + "learning_rate": 0.00021449757150566648, + "loss": 3.2848, "step": 59650 }, { "epoch": 6.436657681940701, - "grad_norm": 0.7756490111351013, - "learning_rate": 0.00021419320021586616, - "loss": 3.2918, + "grad_norm": 0.754550576210022, + "learning_rate": 0.00021417377226119806, + "loss": 3.2915, "step": 59700 }, { "epoch": 6.442048517520216, - "grad_norm": 0.7733294367790222, - "learning_rate": 0.0002138694009713977, - "loss": 3.2897, + "grad_norm": 0.7444179654121399, + "learning_rate": 0.0002138499730167296, + "loss": 3.2922, "step": 59750 }, { "epoch": 6.44743935309973, - "grad_norm": 0.7549540400505066, - "learning_rate": 0.00021354560172692929, - "loss": 3.281, + "grad_norm": 0.7723394632339478, + "learning_rate": 0.00021352617377226116, + "loss": 3.2833, "step": 59800 }, { "epoch": 6.452830188679245, - "grad_norm": 0.6927609443664551, - "learning_rate": 0.00021322180248246086, - "loss": 3.2651, + "grad_norm": 0.6993033289909363, + "learning_rate": 0.00021320237452779274, + "loss": 3.2674, "step": 59850 }, { "epoch": 6.45822102425876, - "grad_norm": 0.7731172442436218, - "learning_rate": 0.00021289800323799241, - "loss": 3.2776, + "grad_norm": 0.7500173449516296, + "learning_rate": 0.00021287857528332432, + "loss": 3.2794, "step": 59900 }, { "epoch": 6.463611859838275, - "grad_norm": 0.7873961329460144, - "learning_rate": 0.000212574203993524, - "loss": 3.2673, + "grad_norm": 0.7128188610076904, + "learning_rate": 0.0002125547760388559, + "loss": 3.2688, "step": 59950 }, { "epoch": 6.46900269541779, - "grad_norm": 0.7329972982406616, - "learning_rate": 0.00021225040474905557, - "loss": 3.2774, + "grad_norm": 0.6932951211929321, + "learning_rate": 0.00021223097679438747, + "loss": 3.28, "step": 60000 }, { "epoch": 6.46900269541779, - "eval_accuracy": 0.38562365834055967, - "eval_loss": 3.3711838722229004, - "eval_runtime": 185.2128, - "eval_samples_per_second": 97.245, - "eval_steps_per_second": 6.079, + "eval_accuracy": 0.3855733520535016, + "eval_loss": 3.3714606761932373, + "eval_runtime": 185.4106, + "eval_samples_per_second": 97.141, + "eval_steps_per_second": 6.073, "step": 60000 }, { "epoch": 6.474393530997305, - "grad_norm": 0.7409731149673462, - "learning_rate": 0.00021192660550458712, - "loss": 3.2753, + "grad_norm": 0.7293769121170044, + "learning_rate": 0.00021190717754991905, + "loss": 3.2759, "step": 60050 }, { "epoch": 6.479784366576819, - "grad_norm": 0.7271058559417725, - "learning_rate": 0.0002116028062601187, - "loss": 3.2901, + "grad_norm": 0.7333296537399292, + "learning_rate": 0.0002115833783054506, + "loss": 3.2925, "step": 60100 }, { "epoch": 6.485175202156334, - "grad_norm": 0.7415794730186462, - "learning_rate": 0.00021127900701565028, - "loss": 3.2682, + "grad_norm": 0.73787522315979, + "learning_rate": 0.00021125957906098218, + "loss": 3.269, "step": 60150 }, { "epoch": 6.490566037735849, - "grad_norm": 0.7169997692108154, - "learning_rate": 0.00021095520777118185, - "loss": 3.2797, + "grad_norm": 0.7034549117088318, + "learning_rate": 0.00021093577981651373, + "loss": 3.2833, "step": 60200 }, { "epoch": 6.495956873315364, - "grad_norm": 0.741539478302002, - "learning_rate": 0.00021063140852671343, - "loss": 3.2738, + "grad_norm": 0.7363021969795227, + "learning_rate": 0.0002106119805720453, + "loss": 3.2787, "step": 60250 }, { "epoch": 6.501347708894879, - "grad_norm": 0.7155291438102722, - "learning_rate": 0.000210307609282245, - "loss": 3.2714, + "grad_norm": 0.7120100259780884, + "learning_rate": 0.00021028818132757689, + "loss": 3.2729, "step": 60300 }, { "epoch": 6.506738544474393, - "grad_norm": 0.7458370327949524, - "learning_rate": 0.00020998381003777653, - "loss": 3.2564, + "grad_norm": 0.7493690252304077, + "learning_rate": 0.00020996438208310846, + "loss": 3.2609, "step": 60350 }, { "epoch": 6.512129380053908, - "grad_norm": 0.7324507832527161, - "learning_rate": 0.0002096600107933081, - "loss": 3.2867, + "grad_norm": 0.7664533257484436, + "learning_rate": 0.00020964058283864001, + "loss": 3.2888, "step": 60400 }, { "epoch": 6.517520215633423, - "grad_norm": 0.7247518301010132, - "learning_rate": 0.0002093362115488397, - "loss": 3.2764, + "grad_norm": 0.7788054347038269, + "learning_rate": 0.0002093167835941716, + "loss": 3.279, "step": 60450 }, { "epoch": 6.5229110512129385, - "grad_norm": 0.7751719355583191, - "learning_rate": 0.00020901241230437127, - "loss": 3.2722, + "grad_norm": 0.7385787963867188, + "learning_rate": 0.00020899946033459254, + "loss": 3.2747, "step": 60500 }, { "epoch": 6.528301886792453, - "grad_norm": 0.7711856961250305, - "learning_rate": 0.00020868861305990284, - "loss": 3.2942, + "grad_norm": 0.7137572765350342, + "learning_rate": 0.0002086756610901241, + "loss": 3.297, "step": 60550 }, { "epoch": 6.533692722371968, - "grad_norm": 0.7641790509223938, - "learning_rate": 0.00020836481381543442, - "loss": 3.2694, + "grad_norm": 0.7212474942207336, + "learning_rate": 0.0002083712898003238, + "loss": 3.2788, "step": 60600 }, { "epoch": 6.539083557951482, - "grad_norm": 0.7420713901519775, - "learning_rate": 0.000208041014570966, - "loss": 3.2843, + "grad_norm": 0.7242739796638489, + "learning_rate": 0.00020804749055585537, + "loss": 3.2879, "step": 60650 }, { "epoch": 6.544474393530997, - "grad_norm": 0.6974769830703735, - "learning_rate": 0.00020771721532649758, - "loss": 3.2811, + "grad_norm": 0.7038812637329102, + "learning_rate": 0.0002077236913113869, + "loss": 3.2846, "step": 60700 }, { "epoch": 6.549865229110512, - "grad_norm": 0.7124184370040894, - "learning_rate": 0.00020739341608202915, - "loss": 3.2855, + "grad_norm": 0.7051358819007874, + "learning_rate": 0.00020739989206691847, + "loss": 3.2898, "step": 60750 }, { "epoch": 6.555256064690027, - "grad_norm": 0.7596436142921448, - "learning_rate": 0.00020706961683756068, - "loss": 3.2815, + "grad_norm": 0.7652502655982971, + "learning_rate": 0.00020707609282245005, + "loss": 3.2851, "step": 60800 }, { "epoch": 6.560646900269542, - "grad_norm": 0.7241122722625732, - "learning_rate": 0.00020674581759309225, - "loss": 3.2703, + "grad_norm": 0.788062334060669, + "learning_rate": 0.00020675229357798163, + "loss": 3.2743, "step": 60850 }, { "epoch": 6.566037735849057, - "grad_norm": 0.7312943935394287, - "learning_rate": 0.00020642201834862383, - "loss": 3.2659, + "grad_norm": 0.73066246509552, + "learning_rate": 0.0002064284943335132, + "loss": 3.2681, "step": 60900 }, { "epoch": 6.571428571428571, - "grad_norm": 0.8218500018119812, - "learning_rate": 0.0002060982191041554, - "loss": 3.285, + "grad_norm": 0.7862104773521423, + "learning_rate": 0.00020610469508904478, + "loss": 3.2857, "step": 60950 }, { "epoch": 6.576819407008086, - "grad_norm": 0.7621256113052368, - "learning_rate": 0.000205774419859687, - "loss": 3.2727, + "grad_norm": 0.7408245801925659, + "learning_rate": 0.00020578089584457636, + "loss": 3.2752, "step": 61000 }, { "epoch": 6.576819407008086, - "eval_accuracy": 0.3862397202144678, - "eval_loss": 3.3667685985565186, - "eval_runtime": 184.7684, - "eval_samples_per_second": 97.479, - "eval_steps_per_second": 6.094, + "eval_accuracy": 0.38576555901204534, + "eval_loss": 3.3688933849334717, + "eval_runtime": 184.7982, + "eval_samples_per_second": 97.463, + "eval_steps_per_second": 6.093, "step": 61000 }, { "epoch": 6.5822102425876015, - "grad_norm": 0.7284641861915588, - "learning_rate": 0.00020545062061521857, - "loss": 3.2699, + "grad_norm": 0.6936094164848328, + "learning_rate": 0.00020545709660010794, + "loss": 3.2732, "step": 61050 }, { "epoch": 6.587601078167116, - "grad_norm": 0.7356741428375244, + "grad_norm": 0.713800311088562, "learning_rate": 0.00020513329735563946, - "loss": 3.2767, + "loss": 3.2782, "step": 61100 }, { "epoch": 6.592991913746631, - "grad_norm": 0.7225967645645142, + "grad_norm": 0.7378830909729004, "learning_rate": 0.00020480949811117104, - "loss": 3.2927, + "loss": 3.2957, "step": 61150 }, { "epoch": 6.598382749326145, - "grad_norm": 0.7218978404998779, + "grad_norm": 0.6994268894195557, "learning_rate": 0.00020448569886670262, - "loss": 3.2904, + "loss": 3.2931, "step": 61200 }, { "epoch": 6.60377358490566, - "grad_norm": 0.7737917900085449, + "grad_norm": 0.7355708479881287, "learning_rate": 0.0002041618996222342, - "loss": 3.2809, + "loss": 3.2839, "step": 61250 }, { "epoch": 6.609164420485175, - "grad_norm": 0.7530986666679382, + "grad_norm": 0.7560614347457886, "learning_rate": 0.00020383810037776577, - "loss": 3.2741, + "loss": 3.2765, "step": 61300 }, { "epoch": 6.6145552560646905, - "grad_norm": 0.7638270854949951, + "grad_norm": 0.7597295045852661, "learning_rate": 0.00020351430113329735, - "loss": 3.2863, + "loss": 3.2885, "step": 61350 }, { "epoch": 6.619946091644205, - "grad_norm": 0.7740856409072876, + "grad_norm": 0.7206094264984131, "learning_rate": 0.00020319050188882893, - "loss": 3.2869, + "loss": 3.2886, "step": 61400 }, { "epoch": 6.62533692722372, - "grad_norm": 0.7981337308883667, + "grad_norm": 0.7349038124084473, "learning_rate": 0.00020286670264436048, - "loss": 3.2831, + "loss": 3.2849, "step": 61450 }, { "epoch": 6.630727762803234, - "grad_norm": 0.7509191632270813, + "grad_norm": 0.7283930778503418, "learning_rate": 0.00020254290339989206, - "loss": 3.2781, + "loss": 3.2816, "step": 61500 }, { "epoch": 6.636118598382749, - "grad_norm": 0.7379617691040039, + "grad_norm": 0.7200408577919006, "learning_rate": 0.0002022191041554236, - "loss": 3.2753, + "loss": 3.2766, "step": 61550 }, { "epoch": 6.6415094339622645, - "grad_norm": 0.7517578601837158, + "grad_norm": 0.7172255516052246, "learning_rate": 0.00020189530491095519, - "loss": 3.3039, + "loss": 3.3055, "step": 61600 }, { "epoch": 6.646900269541779, - "grad_norm": 0.7389500141143799, + "grad_norm": 0.7040255665779114, "learning_rate": 0.00020157150566648676, - "loss": 3.2594, + "loss": 3.261, "step": 61650 }, { "epoch": 6.652291105121294, - "grad_norm": 0.7633240222930908, + "grad_norm": 0.7185490131378174, "learning_rate": 0.00020124770642201834, - "loss": 3.2779, + "loss": 3.2823, "step": 61700 }, { "epoch": 6.657681940700809, - "grad_norm": 0.7727910280227661, + "grad_norm": 0.7609604597091675, "learning_rate": 0.0002009239071775499, - "loss": 3.296, + "loss": 3.2977, "step": 61750 }, { "epoch": 6.663072776280323, - "grad_norm": 0.7852575778961182, + "grad_norm": 0.7795951962471008, "learning_rate": 0.00020060010793308147, - "loss": 3.2801, + "loss": 3.2814, "step": 61800 }, { "epoch": 6.668463611859838, - "grad_norm": 0.7530646920204163, + "grad_norm": 0.7162206172943115, "learning_rate": 0.00020027630868861305, - "loss": 3.2923, + "loss": 3.2936, "step": 61850 }, { "epoch": 6.6738544474393535, - "grad_norm": 0.7305129170417786, + "grad_norm": 0.7411020398139954, "learning_rate": 0.00019995250944414462, - "loss": 3.2805, + "loss": 3.283, "step": 61900 }, { "epoch": 6.679245283018868, - "grad_norm": 0.7686682343482971, + "grad_norm": 0.7376359105110168, "learning_rate": 0.00019962871019967617, - "loss": 3.2771, + "loss": 3.2784, "step": 61950 }, { "epoch": 6.684636118598383, - "grad_norm": 0.7675058841705322, + "grad_norm": 0.7186854481697083, "learning_rate": 0.00019930491095520775, - "loss": 3.2761, + "loss": 3.2773, "step": 62000 }, { "epoch": 6.684636118598383, - "eval_accuracy": 0.3863945505796034, - "eval_loss": 3.3637261390686035, - "eval_runtime": 184.9692, - "eval_samples_per_second": 97.373, - "eval_steps_per_second": 6.088, + "eval_accuracy": 0.38614823448292795, + "eval_loss": 3.36369252204895, + "eval_runtime": 185.2042, + "eval_samples_per_second": 97.249, + "eval_steps_per_second": 6.08, "step": 62000 }, { "epoch": 6.690026954177897, - "grad_norm": 0.7356290221214294, + "grad_norm": 0.7841340899467468, "learning_rate": 0.0001989811117107393, - "loss": 3.2717, + "loss": 3.2752, "step": 62050 }, { "epoch": 6.695417789757412, - "grad_norm": 0.7363114953041077, + "grad_norm": 0.7519031167030334, "learning_rate": 0.00019865731246627088, - "loss": 3.2784, + "loss": 3.2814, "step": 62100 }, { "epoch": 6.7008086253369274, - "grad_norm": 0.7388237714767456, + "grad_norm": 0.7419308423995972, "learning_rate": 0.00019833351322180246, - "loss": 3.2943, + "loss": 3.2982, "step": 62150 }, { "epoch": 6.706199460916442, - "grad_norm": 0.7899906039237976, + "grad_norm": 0.7742794156074524, "learning_rate": 0.00019800971397733404, - "loss": 3.3021, + "loss": 3.3044, "step": 62200 }, { "epoch": 6.711590296495957, - "grad_norm": 0.7731553316116333, - "learning_rate": 0.000197692390717755, - "loss": 3.2766, + "grad_norm": 0.8003810048103333, + "learning_rate": 0.00019768591473286561, + "loss": 3.2773, "step": 62250 }, { "epoch": 6.716981132075472, - "grad_norm": 0.7288447618484497, - "learning_rate": 0.00019736859147328654, - "loss": 3.2821, + "grad_norm": 0.7672263383865356, + "learning_rate": 0.0001973621154883972, + "loss": 3.2837, "step": 62300 }, { "epoch": 6.722371967654986, - "grad_norm": 0.7623395323753357, - "learning_rate": 0.00019704479222881812, - "loss": 3.2845, + "grad_norm": 0.7650006413459778, + "learning_rate": 0.00019703831624392877, + "loss": 3.2861, "step": 62350 }, { "epoch": 6.727762803234501, - "grad_norm": 0.7321327328681946, - "learning_rate": 0.00019672099298434967, - "loss": 3.2676, + "grad_norm": 0.7366001009941101, + "learning_rate": 0.0001967145169994603, + "loss": 3.2697, "step": 62400 }, { "epoch": 6.7331536388140165, - "grad_norm": 0.7705351710319519, - "learning_rate": 0.00019639719373988124, - "loss": 3.273, + "grad_norm": 0.7485739588737488, + "learning_rate": 0.00019639071775499187, + "loss": 3.2756, "step": 62450 }, { "epoch": 6.738544474393531, - "grad_norm": 0.80072021484375, - "learning_rate": 0.00019607339449541282, - "loss": 3.2825, + "grad_norm": 0.9604521989822388, + "learning_rate": 0.00019606691851052345, + "loss": 3.2846, "step": 62500 }, { "epoch": 6.743935309973046, - "grad_norm": 0.7516555190086365, - "learning_rate": 0.0001957495952509444, - "loss": 3.2804, + "grad_norm": 0.8006502985954285, + "learning_rate": 0.00019574311926605503, + "loss": 3.2832, "step": 62550 }, { "epoch": 6.74932614555256, - "grad_norm": 0.7890444993972778, - "learning_rate": 0.00019542579600647598, - "loss": 3.2755, + "grad_norm": 0.7882602214813232, + "learning_rate": 0.0001954193200215866, + "loss": 3.2776, "step": 62600 }, { "epoch": 6.754716981132075, - "grad_norm": 0.8166410326957703, - "learning_rate": 0.00019510199676200756, - "loss": 3.2874, + "grad_norm": 0.7486486434936523, + "learning_rate": 0.00019509552077711818, + "loss": 3.2898, "step": 62650 }, { "epoch": 6.7601078167115904, - "grad_norm": 0.7606351971626282, - "learning_rate": 0.00019477819751753913, - "loss": 3.2774, + "grad_norm": 0.7505764961242676, + "learning_rate": 0.00019477172153264976, + "loss": 3.2781, "step": 62700 }, { "epoch": 6.765498652291106, - "grad_norm": 0.7485134601593018, - "learning_rate": 0.00019445439827307066, - "loss": 3.2952, + "grad_norm": 0.7206476330757141, + "learning_rate": 0.00019444792228818134, + "loss": 3.2958, "step": 62750 }, { "epoch": 6.77088948787062, - "grad_norm": 0.7248587012290955, - "learning_rate": 0.00019413059902860223, - "loss": 3.2912, + "grad_norm": 0.7169566750526428, + "learning_rate": 0.00019412412304371286, + "loss": 3.2926, "step": 62800 }, { "epoch": 6.776280323450135, - "grad_norm": 0.7899085879325867, - "learning_rate": 0.0001938067997841338, - "loss": 3.2829, + "grad_norm": 0.8056523203849792, + "learning_rate": 0.00019380032379924444, + "loss": 3.2864, "step": 62850 }, { "epoch": 6.781671159029649, - "grad_norm": 0.7482374310493469, - "learning_rate": 0.0001934830005396654, - "loss": 3.2766, + "grad_norm": 0.7459729313850403, + "learning_rate": 0.00019347652455477602, + "loss": 3.279, "step": 62900 }, { "epoch": 6.787061994609164, - "grad_norm": 0.757675290107727, - "learning_rate": 0.00019315920129519697, - "loss": 3.2787, + "grad_norm": 0.7194493412971497, + "learning_rate": 0.0001931527253103076, + "loss": 3.2804, "step": 62950 }, { "epoch": 6.7924528301886795, - "grad_norm": 0.7783774137496948, - "learning_rate": 0.00019283540205072854, - "loss": 3.2838, + "grad_norm": 0.7365685701370239, + "learning_rate": 0.00019282892606583917, + "loss": 3.2849, "step": 63000 }, { "epoch": 6.7924528301886795, - "eval_accuracy": 0.38692890548187503, - "eval_loss": 3.3590385913848877, - "eval_runtime": 185.3079, - "eval_samples_per_second": 97.195, - "eval_steps_per_second": 6.076, + "eval_accuracy": 0.38663358693279526, + "eval_loss": 3.360600709915161, + "eval_runtime": 185.3706, + "eval_samples_per_second": 97.162, + "eval_steps_per_second": 6.074, "step": 63000 }, { "epoch": 6.797843665768194, - "grad_norm": 0.7797094583511353, - "learning_rate": 0.00019251160280626012, - "loss": 3.2693, + "grad_norm": 0.7655932903289795, + "learning_rate": 0.00019250512682137075, + "loss": 3.2718, "step": 63050 }, { "epoch": 6.803234501347709, - "grad_norm": 0.8045250773429871, - "learning_rate": 0.0001921878035617917, - "loss": 3.2952, + "grad_norm": 0.7850986123085022, + "learning_rate": 0.0001921813275769023, + "loss": 3.2966, "step": 63100 }, { "epoch": 6.808625336927224, - "grad_norm": 0.7782381176948547, - "learning_rate": 0.00019186400431732322, - "loss": 3.2823, + "grad_norm": 0.761485755443573, + "learning_rate": 0.00019185752833243388, + "loss": 3.2845, "step": 63150 }, { "epoch": 6.814016172506738, - "grad_norm": 0.7246415019035339, - "learning_rate": 0.0001915402050728548, - "loss": 3.2807, + "grad_norm": 0.7269786596298218, + "learning_rate": 0.00019153372908796545, + "loss": 3.2811, "step": 63200 }, { "epoch": 6.819407008086253, - "grad_norm": 0.7466952204704285, - "learning_rate": 0.00019121640582838638, - "loss": 3.2889, + "grad_norm": 0.7788788080215454, + "learning_rate": 0.000191209929843497, + "loss": 3.2914, "step": 63250 }, { "epoch": 6.824797843665769, - "grad_norm": 0.7936302423477173, - "learning_rate": 0.00019089260658391796, - "loss": 3.2867, + "grad_norm": 0.7736331820487976, + "learning_rate": 0.00019088613059902858, + "loss": 3.2903, "step": 63300 }, { "epoch": 6.830188679245283, - "grad_norm": 0.7542206048965454, - "learning_rate": 0.00019056880733944953, - "loss": 3.2796, + "grad_norm": 0.7407358884811401, + "learning_rate": 0.00019056233135456016, + "loss": 3.281, "step": 63350 }, { "epoch": 6.835579514824798, - "grad_norm": 0.767737865447998, - "learning_rate": 0.0001902450080949811, - "loss": 3.2729, + "grad_norm": 0.7694665193557739, + "learning_rate": 0.0001902385321100917, + "loss": 3.2754, "step": 63400 }, { "epoch": 6.840970350404312, - "grad_norm": 0.7150385975837708, - "learning_rate": 0.00018992120885051266, - "loss": 3.2794, + "grad_norm": 0.746585488319397, + "learning_rate": 0.0001899147328656233, + "loss": 3.2803, "step": 63450 }, { "epoch": 6.846361185983827, - "grad_norm": 0.7381755113601685, - "learning_rate": 0.00018959740960604424, - "loss": 3.2799, + "grad_norm": 0.7027455568313599, + "learning_rate": 0.00018959093362115487, + "loss": 3.2809, "step": 63500 }, { "epoch": 6.8517520215633425, - "grad_norm": 0.7633397579193115, - "learning_rate": 0.00018927361036157582, - "loss": 3.2966, + "grad_norm": 0.723949134349823, + "learning_rate": 0.00018926713437668644, + "loss": 3.2992, "step": 63550 }, { "epoch": 6.857142857142857, - "grad_norm": 0.8028882741928101, - "learning_rate": 0.00018894981111710737, - "loss": 3.2818, + "grad_norm": 0.8229068517684937, + "learning_rate": 0.00018894333513221802, + "loss": 3.2837, "step": 63600 }, { "epoch": 6.862533692722372, - "grad_norm": 0.7695461511611938, - "learning_rate": 0.00018862601187263895, - "loss": 3.285, + "grad_norm": 0.7902956604957581, + "learning_rate": 0.00018861953588774957, + "loss": 3.2873, "step": 63650 }, { "epoch": 6.867924528301887, - "grad_norm": 0.7666114568710327, - "learning_rate": 0.00018830221262817052, - "loss": 3.2613, + "grad_norm": 0.800199031829834, + "learning_rate": 0.00018829573664328115, + "loss": 3.2631, "step": 63700 }, { "epoch": 6.873315363881401, - "grad_norm": 0.7522419691085815, - "learning_rate": 0.00018797841338370207, - "loss": 3.2706, + "grad_norm": 0.7298393249511719, + "learning_rate": 0.0001879719373988127, + "loss": 3.2751, "step": 63750 }, { "epoch": 6.878706199460916, - "grad_norm": 0.7789742946624756, - "learning_rate": 0.00018765461413923365, - "loss": 3.2844, + "grad_norm": 0.7546523213386536, + "learning_rate": 0.00018764813815434428, + "loss": 3.2867, "step": 63800 }, { "epoch": 6.884097035040432, - "grad_norm": 0.735979437828064, - "learning_rate": 0.00018733081489476523, - "loss": 3.2571, + "grad_norm": 0.7286291718482971, + "learning_rate": 0.00018732433890987586, + "loss": 3.2573, "step": 63850 }, { "epoch": 6.889487870619946, - "grad_norm": 0.7391952872276306, - "learning_rate": 0.0001870070156502968, - "loss": 3.2718, + "grad_norm": 0.7267189025878906, + "learning_rate": 0.00018700053966540743, + "loss": 3.2745, "step": 63900 }, { "epoch": 6.894878706199461, - "grad_norm": 0.784604012966156, - "learning_rate": 0.00018668321640582838, - "loss": 3.2887, + "grad_norm": 0.7725000977516174, + "learning_rate": 0.000186676740420939, + "loss": 3.2904, "step": 63950 }, { "epoch": 6.900269541778976, - "grad_norm": 0.7470083832740784, - "learning_rate": 0.00018635941716135994, - "loss": 3.2991, + "grad_norm": 0.7633524537086487, + "learning_rate": 0.0001863529411764706, + "loss": 3.3004, "step": 64000 }, { "epoch": 6.900269541778976, - "eval_accuracy": 0.3873211424068853, - "eval_loss": 3.353762626647949, - "eval_runtime": 184.9046, - "eval_samples_per_second": 97.407, - "eval_steps_per_second": 6.09, + "eval_accuracy": 0.3869971394954225, + "eval_loss": 3.354445219039917, + "eval_runtime": 184.9666, + "eval_samples_per_second": 97.374, + "eval_steps_per_second": 6.088, "step": 64000 }, { "epoch": 6.90566037735849, - "grad_norm": 0.7583284378051758, - "learning_rate": 0.00018603561791689149, - "loss": 3.2804, + "grad_norm": 0.7619683146476746, + "learning_rate": 0.00018602914193200217, + "loss": 3.2826, "step": 64050 }, { "epoch": 6.9110512129380055, - "grad_norm": 0.7391751408576965, - "learning_rate": 0.00018571181867242306, - "loss": 3.2795, + "grad_norm": 0.7622038125991821, + "learning_rate": 0.0001857053426875337, + "loss": 3.28, "step": 64100 }, { "epoch": 6.916442048517521, - "grad_norm": 0.7356597781181335, - "learning_rate": 0.00018538801942795464, - "loss": 3.28, + "grad_norm": 0.7104603052139282, + "learning_rate": 0.00018538154344306527, + "loss": 3.2808, "step": 64150 }, { "epoch": 6.921832884097035, - "grad_norm": 0.7817914485931396, - "learning_rate": 0.00018506422018348622, - "loss": 3.305, + "grad_norm": 0.7579168081283569, + "learning_rate": 0.00018505774419859684, + "loss": 3.3057, "step": 64200 }, { "epoch": 6.92722371967655, - "grad_norm": 0.7687154412269592, - "learning_rate": 0.0001847404209390178, - "loss": 3.2742, + "grad_norm": 0.8231496214866638, + "learning_rate": 0.00018473394495412842, + "loss": 3.2746, "step": 64250 }, { "epoch": 6.932614555256064, - "grad_norm": 0.779971182346344, - "learning_rate": 0.00018441662169454937, - "loss": 3.2807, + "grad_norm": 0.7839766144752502, + "learning_rate": 0.00018441014570966, + "loss": 3.2825, "step": 64300 }, { "epoch": 6.938005390835579, - "grad_norm": 0.7690040469169617, - "learning_rate": 0.00018409282245008095, - "loss": 3.2714, + "grad_norm": 0.7433233857154846, + "learning_rate": 0.00018408634646519158, + "loss": 3.2741, "step": 64350 }, { "epoch": 6.943396226415095, - "grad_norm": 0.7556252479553223, - "learning_rate": 0.00018376902320561253, - "loss": 3.2809, + "grad_norm": 0.7666583061218262, + "learning_rate": 0.00018376254722072316, + "loss": 3.2823, "step": 64400 }, { "epoch": 6.948787061994609, - "grad_norm": 0.7381365299224854, - "learning_rate": 0.00018344522396114405, - "loss": 3.2561, + "grad_norm": 0.7528730630874634, + "learning_rate": 0.0001834387479762547, + "loss": 3.2574, "step": 64450 }, { "epoch": 6.954177897574124, - "grad_norm": 0.7619420289993286, - "learning_rate": 0.00018312142471667563, - "loss": 3.2978, + "grad_norm": 0.7804055213928223, + "learning_rate": 0.00018311494873178628, + "loss": 3.3006, "step": 64500 }, { "epoch": 6.959568733153639, - "grad_norm": 0.7599936127662659, - "learning_rate": 0.0001827976254722072, - "loss": 3.2935, + "grad_norm": 0.7621782422065735, + "learning_rate": 0.00018279114948731783, + "loss": 3.2945, "step": 64550 }, { "epoch": 6.964959568733153, - "grad_norm": 0.7568638324737549, - "learning_rate": 0.00018247382622773879, - "loss": 3.2925, + "grad_norm": 0.7308817505836487, + "learning_rate": 0.0001824673502428494, + "loss": 3.2963, "step": 64600 }, { "epoch": 6.9703504043126685, - "grad_norm": 0.8307446241378784, - "learning_rate": 0.00018215002698327036, - "loss": 3.3057, + "grad_norm": 0.795536994934082, + "learning_rate": 0.000182143550998381, + "loss": 3.3076, "step": 64650 }, { "epoch": 6.975741239892184, - "grad_norm": 0.757452666759491, - "learning_rate": 0.00018182622773880194, - "loss": 3.2963, + "grad_norm": 0.7348906397819519, + "learning_rate": 0.00018181975175391257, + "loss": 3.2982, "step": 64700 }, { "epoch": 6.981132075471698, - "grad_norm": 0.7277268767356873, - "learning_rate": 0.00018150242849433352, - "loss": 3.2846, + "grad_norm": 0.7965940237045288, + "learning_rate": 0.00018149595250944414, + "loss": 3.2848, "step": 64750 }, { "epoch": 6.986522911051213, - "grad_norm": 0.7873193621635437, - "learning_rate": 0.00018117862924986507, - "loss": 3.2625, + "grad_norm": 0.752318799495697, + "learning_rate": 0.0001811721532649757, + "loss": 3.2633, "step": 64800 }, { "epoch": 6.991913746630727, - "grad_norm": 0.7263054847717285, - "learning_rate": 0.00018085483000539662, - "loss": 3.2735, + "grad_norm": 0.8075865507125854, + "learning_rate": 0.00018084835402050727, + "loss": 3.276, "step": 64850 }, { "epoch": 6.997304582210242, - "grad_norm": 0.7373244166374207, - "learning_rate": 0.0001805310307609282, - "loss": 3.2756, + "grad_norm": 0.7498311996459961, + "learning_rate": 0.00018052455477603885, + "loss": 3.278, "step": 64900 }, { "epoch": 7.002695417789758, - "grad_norm": 0.7862066030502319, - "learning_rate": 0.00018020723151645978, - "loss": 3.2283, + "grad_norm": 0.7839092016220093, + "learning_rate": 0.0001802007555315704, + "loss": 3.2299, "step": 64950 }, { "epoch": 7.008086253369272, - "grad_norm": 0.7903035879135132, - "learning_rate": 0.00017988343227199135, - "loss": 3.2075, + "grad_norm": 0.816551923751831, + "learning_rate": 0.00017987695628710198, + "loss": 3.2095, "step": 65000 }, { "epoch": 7.008086253369272, - "eval_accuracy": 0.3872658380869877, - "eval_loss": 3.3579013347625732, - "eval_runtime": 184.825, - "eval_samples_per_second": 97.449, - "eval_steps_per_second": 6.092, + "eval_accuracy": 0.3868770780543875, + "eval_loss": 3.35855770111084, + "eval_runtime": 185.2641, + "eval_samples_per_second": 97.218, + "eval_steps_per_second": 6.078, "step": 65000 }, { "epoch": 7.013477088948787, - "grad_norm": 0.7548803687095642, - "learning_rate": 0.00017955963302752293, - "loss": 3.1932, + "grad_norm": 0.7568230032920837, + "learning_rate": 0.00017955315704263356, + "loss": 3.1947, "step": 65050 }, { "epoch": 7.018867924528302, - "grad_norm": 0.8561325669288635, - "learning_rate": 0.00017923583378305448, - "loss": 3.2053, + "grad_norm": 0.8540091514587402, + "learning_rate": 0.0001792293577981651, + "loss": 3.2084, "step": 65100 }, { "epoch": 7.024258760107816, - "grad_norm": 0.7893680334091187, - "learning_rate": 0.00017891203453858606, - "loss": 3.2151, + "grad_norm": 0.7811715006828308, + "learning_rate": 0.00017890555855369668, + "loss": 3.2189, "step": 65150 }, { "epoch": 7.0296495956873315, - "grad_norm": 0.7915575504302979, - "learning_rate": 0.00017858823529411764, - "loss": 3.1957, + "grad_norm": 0.7806506752967834, + "learning_rate": 0.00017858175930922826, + "loss": 3.1967, "step": 65200 }, { "epoch": 7.035040431266847, - "grad_norm": 0.7586877346038818, - "learning_rate": 0.00017826443604964921, - "loss": 3.1878, + "grad_norm": 0.7606923580169678, + "learning_rate": 0.00017825796006475984, + "loss": 3.1901, "step": 65250 }, { "epoch": 7.040431266846361, - "grad_norm": 0.7624568939208984, - "learning_rate": 0.00017794063680518077, - "loss": 3.2072, + "grad_norm": 0.7610327005386353, + "learning_rate": 0.00017793416082029142, + "loss": 3.2096, "step": 65300 }, { "epoch": 7.045822102425876, - "grad_norm": 0.7240259647369385, - "learning_rate": 0.00017761683756071234, - "loss": 3.1874, + "grad_norm": 0.7111101746559143, + "learning_rate": 0.000177610361575823, + "loss": 3.1885, "step": 65350 }, { "epoch": 7.051212938005391, - "grad_norm": 0.7831783294677734, - "learning_rate": 0.00017729303831624392, - "loss": 3.2062, + "grad_norm": 0.8230857253074646, + "learning_rate": 0.00017728656233135452, + "loss": 3.2079, "step": 65400 }, { "epoch": 7.056603773584905, - "grad_norm": 0.7812415957450867, - "learning_rate": 0.00017696923907177547, - "loss": 3.2053, + "grad_norm": 0.758479654788971, + "learning_rate": 0.0001769627630868861, + "loss": 3.2078, "step": 65450 }, { "epoch": 7.061994609164421, - "grad_norm": 0.7808825373649597, - "learning_rate": 0.00017664543982730705, - "loss": 3.2024, + "grad_norm": 0.7583780288696289, + "learning_rate": 0.00017663896384241767, + "loss": 3.2052, "step": 65500 }, { "epoch": 7.067385444743936, - "grad_norm": 0.7911673784255981, - "learning_rate": 0.00017632164058283863, - "loss": 3.2138, + "grad_norm": 0.8004472851753235, + "learning_rate": 0.00017631516459794925, + "loss": 3.2172, "step": 65550 }, { "epoch": 7.07277628032345, - "grad_norm": 0.7514714598655701, - "learning_rate": 0.0001759978413383702, - "loss": 3.2121, + "grad_norm": 0.7332755923271179, + "learning_rate": 0.00017599136535348083, + "loss": 3.2155, "step": 65600 }, { "epoch": 7.078167115902965, - "grad_norm": 0.8336493968963623, - "learning_rate": 0.00017567404209390178, - "loss": 3.1991, + "grad_norm": 0.7847483158111572, + "learning_rate": 0.0001756675661090124, + "loss": 3.2015, "step": 65650 }, { "epoch": 7.083557951482479, - "grad_norm": 0.7754457592964172, - "learning_rate": 0.00017535024284943333, - "loss": 3.211, + "grad_norm": 0.7899592518806458, + "learning_rate": 0.00017534376686454398, + "loss": 3.2144, "step": 65700 }, { "epoch": 7.0889487870619945, - "grad_norm": 0.7741825580596924, - "learning_rate": 0.00017502644360496488, - "loss": 3.2039, + "grad_norm": 0.7482497096061707, + "learning_rate": 0.00017501996762007556, + "loss": 3.2072, "step": 65750 }, { "epoch": 7.09433962264151, - "grad_norm": 0.7761595249176025, - "learning_rate": 0.00017470264436049646, - "loss": 3.2095, + "grad_norm": 0.8108317852020264, + "learning_rate": 0.00017469616837560709, + "loss": 3.2129, "step": 65800 }, { "epoch": 7.099730458221024, - "grad_norm": 0.781187117099762, - "learning_rate": 0.00017437884511602804, - "loss": 3.1897, + "grad_norm": 0.7613179087638855, + "learning_rate": 0.00017437236913113866, + "loss": 3.1923, "step": 65850 }, { "epoch": 7.105121293800539, - "grad_norm": 0.7472749948501587, - "learning_rate": 0.00017405504587155962, - "loss": 3.2028, + "grad_norm": 0.7778836488723755, + "learning_rate": 0.00017404856988667024, + "loss": 3.2073, "step": 65900 }, { "epoch": 7.110512129380054, - "grad_norm": 0.7998760342597961, - "learning_rate": 0.0001737312466270912, - "loss": 3.1907, + "grad_norm": 0.8139280676841736, + "learning_rate": 0.00017372477064220182, + "loss": 3.1943, "step": 65950 }, { "epoch": 7.115902964959568, - "grad_norm": 0.7840926051139832, - "learning_rate": 0.00017340744738262277, - "loss": 3.2182, + "grad_norm": 0.8360422849655151, + "learning_rate": 0.0001734009713977334, + "loss": 3.22, "step": 66000 }, { "epoch": 7.115902964959568, - "eval_accuracy": 0.3876529683262707, - "eval_loss": 3.3591625690460205, - "eval_runtime": 185.3265, - "eval_samples_per_second": 97.185, - "eval_steps_per_second": 6.076, + "eval_accuracy": 0.38747086608629366, + "eval_loss": 3.3592610359191895, + "eval_runtime": 184.7874, + "eval_samples_per_second": 97.469, + "eval_steps_per_second": 6.093, "step": 66000 }, { "epoch": 7.121293800539084, - "grad_norm": 0.7974547147750854, - "learning_rate": 0.00017308364813815435, - "loss": 3.2075, + "grad_norm": 0.756746768951416, + "learning_rate": 0.00017307717215326497, + "loss": 3.2077, "step": 66050 }, { "epoch": 7.126684636118599, - "grad_norm": 0.7505779266357422, - "learning_rate": 0.00017275984889368593, - "loss": 3.2087, + "grad_norm": 0.7934142351150513, + "learning_rate": 0.00017275337290879655, + "loss": 3.2103, "step": 66100 }, { "epoch": 7.132075471698113, - "grad_norm": 0.7844457030296326, - "learning_rate": 0.00017243604964921745, - "loss": 3.2189, + "grad_norm": 0.8033551573753357, + "learning_rate": 0.0001724295736643281, + "loss": 3.2184, "step": 66150 }, { "epoch": 7.137466307277628, - "grad_norm": 0.7378308773040771, - "learning_rate": 0.00017211225040474903, - "loss": 3.2015, + "grad_norm": 0.7241845726966858, + "learning_rate": 0.00017210577441985968, + "loss": 3.2062, "step": 66200 }, { "epoch": 7.142857142857143, - "grad_norm": 0.7681424617767334, - "learning_rate": 0.00017179492714516998, - "loss": 3.2217, + "grad_norm": 0.7614899277687073, + "learning_rate": 0.00017178197517539123, + "loss": 3.2244, "step": 66250 }, { "epoch": 7.1482479784366575, - "grad_norm": 0.7983142137527466, - "learning_rate": 0.00017147112790070156, - "loss": 3.2042, + "grad_norm": 0.7865104675292969, + "learning_rate": 0.0001714581759309228, + "loss": 3.2087, "step": 66300 }, { "epoch": 7.153638814016173, - "grad_norm": 0.8198614716529846, - "learning_rate": 0.00017114732865623313, - "loss": 3.2091, + "grad_norm": 0.7773383855819702, + "learning_rate": 0.00017113437668645439, + "loss": 3.213, "step": 66350 }, { "epoch": 7.159029649595688, - "grad_norm": 0.794266402721405, - "learning_rate": 0.0001708235294117647, - "loss": 3.2117, + "grad_norm": 0.7900416254997253, + "learning_rate": 0.00017081057744198596, + "loss": 3.2149, "step": 66400 }, { "epoch": 7.164420485175202, - "grad_norm": 0.7538433074951172, - "learning_rate": 0.0001704997301672963, - "loss": 3.2196, + "grad_norm": 0.7785615921020508, + "learning_rate": 0.00017048677819751751, + "loss": 3.2208, "step": 66450 }, { "epoch": 7.169811320754717, - "grad_norm": 0.7691895365715027, - "learning_rate": 0.00017017593092282781, - "loss": 3.2085, + "grad_norm": 0.7936018705368042, + "learning_rate": 0.0001701629789530491, + "loss": 3.2102, "step": 66500 }, { "epoch": 7.175202156334231, - "grad_norm": 0.8115940093994141, - "learning_rate": 0.0001698521316783594, - "loss": 3.2232, + "grad_norm": 0.8073174357414246, + "learning_rate": 0.00016983917970858067, + "loss": 3.2258, "step": 66550 }, { "epoch": 7.180592991913747, - "grad_norm": 0.7686892151832581, - "learning_rate": 0.00016952833243389097, - "loss": 3.2104, + "grad_norm": 0.7533419132232666, + "learning_rate": 0.00016951538046411225, + "loss": 3.2115, "step": 66600 }, { "epoch": 7.185983827493262, - "grad_norm": 0.7379952669143677, - "learning_rate": 0.00016920453318942255, - "loss": 3.2245, + "grad_norm": 0.7639559507369995, + "learning_rate": 0.0001691915812196438, + "loss": 3.2269, "step": 66650 }, { "epoch": 7.191374663072776, - "grad_norm": 0.8049662709236145, - "learning_rate": 0.00016888073394495412, - "loss": 3.2166, + "grad_norm": 0.7794451117515564, + "learning_rate": 0.00016886778197517538, + "loss": 3.219, "step": 66700 }, { "epoch": 7.196765498652291, - "grad_norm": 0.7826871275901794, - "learning_rate": 0.0001685569347004857, - "loss": 3.2239, + "grad_norm": 0.7711024284362793, + "learning_rate": 0.00016854398273070693, + "loss": 3.2267, "step": 66750 }, { "epoch": 7.202156334231806, - "grad_norm": 0.7626354694366455, - "learning_rate": 0.00016823313545601725, - "loss": 3.1991, + "grad_norm": 0.7972269654273987, + "learning_rate": 0.0001682201834862385, + "loss": 3.2023, "step": 66800 }, { "epoch": 7.2075471698113205, - "grad_norm": 0.8057540059089661, - "learning_rate": 0.00016790933621154883, - "loss": 3.2137, + "grad_norm": 0.7715423703193665, + "learning_rate": 0.00016789638424177008, + "loss": 3.2181, "step": 66850 }, { "epoch": 7.212938005390836, - "grad_norm": 0.7812713980674744, - "learning_rate": 0.00016758553696708038, - "loss": 3.2337, + "grad_norm": 0.7480494976043701, + "learning_rate": 0.00016757258499730166, + "loss": 3.2358, "step": 66900 }, { "epoch": 7.218328840970351, - "grad_norm": 0.7958628535270691, - "learning_rate": 0.00016726173772261196, - "loss": 3.2238, + "grad_norm": 0.7672910690307617, + "learning_rate": 0.00016724878575283324, + "loss": 3.2271, "step": 66950 }, { "epoch": 7.223719676549865, - "grad_norm": 0.776133120059967, - "learning_rate": 0.00016693793847814354, - "loss": 3.2277, + "grad_norm": 0.7870936989784241, + "learning_rate": 0.00016692498650836481, + "loss": 3.23, "step": 67000 }, { "epoch": 7.223719676549865, - "eval_accuracy": 0.3875116109192241, - "eval_loss": 3.3559296131134033, - "eval_runtime": 184.9003, - "eval_samples_per_second": 97.409, - "eval_steps_per_second": 6.09, + "eval_accuracy": 0.38741023777489314, + "eval_loss": 3.3569626808166504, + "eval_runtime": 185.0967, + "eval_samples_per_second": 97.306, + "eval_steps_per_second": 6.083, "step": 67000 }, { "epoch": 7.22911051212938, - "grad_norm": 0.7707949876785278, - "learning_rate": 0.00016661413923367511, - "loss": 3.2224, + "grad_norm": 0.7603834271430969, + "learning_rate": 0.0001666011872638964, + "loss": 3.2234, "step": 67050 }, { "epoch": 7.234501347708895, - "grad_norm": 0.8489854335784912, - "learning_rate": 0.00016629033998920666, - "loss": 3.2264, + "grad_norm": 0.7955418825149536, + "learning_rate": 0.00016627738801942792, + "loss": 3.2286, "step": 67100 }, { "epoch": 7.2398921832884096, - "grad_norm": 0.7626428604125977, - "learning_rate": 0.00016596654074473824, - "loss": 3.2253, + "grad_norm": 0.7377505302429199, + "learning_rate": 0.0001659535887749595, + "loss": 3.2262, "step": 67150 }, { "epoch": 7.245283018867925, - "grad_norm": 0.7870113253593445, - "learning_rate": 0.00016564274150026982, - "loss": 3.2146, + "grad_norm": 0.7720267176628113, + "learning_rate": 0.00016562978953049107, + "loss": 3.2166, "step": 67200 }, { "epoch": 7.250673854447439, - "grad_norm": 1.6695157289505005, - "learning_rate": 0.0001653189422558014, - "loss": 3.2152, + "grad_norm": 0.8122389316558838, + "learning_rate": 0.00016530599028602265, + "loss": 3.2182, "step": 67250 }, { "epoch": 7.256064690026954, - "grad_norm": 0.7939939498901367, - "learning_rate": 0.00016499514301133298, - "loss": 3.2307, + "grad_norm": 0.8003917336463928, + "learning_rate": 0.00016498219104155423, + "loss": 3.2336, "step": 67300 }, { "epoch": 7.261455525606469, - "grad_norm": 0.79146808385849, - "learning_rate": 0.00016467134376686453, - "loss": 3.219, + "grad_norm": 0.7706378698348999, + "learning_rate": 0.0001646583917970858, + "loss": 3.2216, "step": 67350 }, { "epoch": 7.2668463611859835, - "grad_norm": 0.789994478225708, - "learning_rate": 0.0001643475445223961, - "loss": 3.2288, + "grad_norm": 0.7822121381759644, + "learning_rate": 0.00016433459255261738, + "loss": 3.2279, "step": 67400 }, { "epoch": 7.272237196765499, - "grad_norm": 0.7432640194892883, - "learning_rate": 0.00016402374527792765, - "loss": 3.2366, + "grad_norm": 0.7819550633430481, + "learning_rate": 0.00016401079330814896, + "loss": 3.2403, "step": 67450 }, { "epoch": 7.277628032345014, - "grad_norm": 0.7772762775421143, - "learning_rate": 0.00016369994603345923, - "loss": 3.232, + "grad_norm": 0.7725815176963806, + "learning_rate": 0.00016368699406368048, + "loss": 3.2323, "step": 67500 }, { "epoch": 7.283018867924528, - "grad_norm": 0.8248816132545471, - "learning_rate": 0.0001633761467889908, - "loss": 3.2216, + "grad_norm": 0.8235310316085815, + "learning_rate": 0.00016336319481921206, + "loss": 3.2266, "step": 67550 }, { "epoch": 7.288409703504043, - "grad_norm": 0.7746945023536682, - "learning_rate": 0.0001630523475445224, - "loss": 3.245, + "grad_norm": 0.77774578332901, + "learning_rate": 0.00016303939557474364, + "loss": 3.2468, "step": 67600 }, { "epoch": 7.293800539083558, - "grad_norm": 0.8768308758735657, - "learning_rate": 0.00016272854830005396, - "loss": 3.2276, + "grad_norm": 0.8502614498138428, + "learning_rate": 0.00016271559633027522, + "loss": 3.2285, "step": 67650 }, { "epoch": 7.2991913746630726, - "grad_norm": 0.7637453675270081, - "learning_rate": 0.00016240474905558554, - "loss": 3.2252, + "grad_norm": 0.7677310705184937, + "learning_rate": 0.0001623917970858068, + "loss": 3.2293, "step": 67700 }, { "epoch": 7.304582210242588, - "grad_norm": 0.8112920522689819, - "learning_rate": 0.00016208094981111707, - "loss": 3.2282, + "grad_norm": 0.7754825353622437, + "learning_rate": 0.00016206799784133837, + "loss": 3.2326, "step": 67750 }, { "epoch": 7.309973045822103, - "grad_norm": 0.7835080027580261, - "learning_rate": 0.00016175715056664864, - "loss": 3.2117, + "grad_norm": 0.7810065150260925, + "learning_rate": 0.00016174419859686992, + "loss": 3.2142, "step": 67800 }, { "epoch": 7.315363881401617, - "grad_norm": 0.7686965465545654, - "learning_rate": 0.00016143335132218022, - "loss": 3.2489, + "grad_norm": 0.8359429836273193, + "learning_rate": 0.0001614203993524015, + "loss": 3.2522, "step": 67850 }, { "epoch": 7.320754716981132, - "grad_norm": 0.7612333297729492, - "learning_rate": 0.0001611095520777118, - "loss": 3.2279, + "grad_norm": 0.7552247643470764, + "learning_rate": 0.00016109660010793308, + "loss": 3.2309, "step": 67900 }, { "epoch": 7.3261455525606465, - "grad_norm": 0.8523034453392029, - "learning_rate": 0.00016078575283324338, - "loss": 3.2315, + "grad_norm": 0.813887894153595, + "learning_rate": 0.00016077280086346463, + "loss": 3.2337, "step": 67950 }, { "epoch": 7.331536388140162, - "grad_norm": 0.798283576965332, - "learning_rate": 0.00016046195358877495, - "loss": 3.21, + "grad_norm": 0.8046641945838928, + "learning_rate": 0.0001604490016189962, + "loss": 3.2111, "step": 68000 }, { "epoch": 7.331536388140162, - "eval_accuracy": 0.3883520410064691, - "eval_loss": 3.3501393795013428, - "eval_runtime": 184.9896, - "eval_samples_per_second": 97.362, - "eval_steps_per_second": 6.087, + "eval_accuracy": 0.38813169294998134, + "eval_loss": 3.3522262573242188, + "eval_runtime": 185.0686, + "eval_samples_per_second": 97.321, + "eval_steps_per_second": 6.084, "step": 68000 }, { "epoch": 7.336927223719677, - "grad_norm": 0.7581954598426819, - "learning_rate": 0.00016013815434430653, - "loss": 3.2302, + "grad_norm": 0.7817372679710388, + "learning_rate": 0.00016012520237452778, + "loss": 3.2321, "step": 68050 }, { "epoch": 7.342318059299191, - "grad_norm": 0.777908205986023, - "learning_rate": 0.0001598143550998381, - "loss": 3.235, + "grad_norm": 0.7686135768890381, + "learning_rate": 0.00015980140313005933, + "loss": 3.24, "step": 68100 }, { "epoch": 7.347708894878706, - "grad_norm": 0.7884739637374878, - "learning_rate": 0.00015949055585536966, - "loss": 3.2196, + "grad_norm": 0.7931602001190186, + "learning_rate": 0.0001594776038855909, + "loss": 3.2219, "step": 68150 }, { "epoch": 7.353099730458221, - "grad_norm": 0.8096325993537903, - "learning_rate": 0.0001591667566109012, - "loss": 3.2419, + "grad_norm": 0.8089348077774048, + "learning_rate": 0.0001591538046411225, + "loss": 3.244, "step": 68200 }, { "epoch": 7.3584905660377355, - "grad_norm": 0.7861303091049194, - "learning_rate": 0.00015884943335132216, - "loss": 3.2444, + "grad_norm": 0.7867382168769836, + "learning_rate": 0.00015883000539665407, + "loss": 3.2467, "step": 68250 }, { "epoch": 7.363881401617251, - "grad_norm": 0.8135442733764648, - "learning_rate": 0.00015852563410685374, - "loss": 3.2322, + "grad_norm": 0.7838876247406006, + "learning_rate": 0.00015850620615218564, + "loss": 3.2329, "step": 68300 }, { "epoch": 7.369272237196766, - "grad_norm": 0.7923033833503723, - "learning_rate": 0.00015820183486238532, - "loss": 3.2313, + "grad_norm": 0.8173036575317383, + "learning_rate": 0.0001581824069077172, + "loss": 3.2348, "step": 68350 }, { "epoch": 7.37466307277628, - "grad_norm": 0.8019893169403076, - "learning_rate": 0.0001578780356179169, - "loss": 3.2225, + "grad_norm": 0.7778081893920898, + "learning_rate": 0.00015785860766324877, + "loss": 3.2254, "step": 68400 }, { "epoch": 7.380053908355795, - "grad_norm": 0.8459454774856567, - "learning_rate": 0.00015755423637344847, - "loss": 3.2065, + "grad_norm": 0.8123937845230103, + "learning_rate": 0.00015753480841878032, + "loss": 3.2109, "step": 68450 }, { "epoch": 7.38544474393531, - "grad_norm": 0.7834682464599609, - "learning_rate": 0.00015723043712898, - "loss": 3.2141, + "grad_norm": 0.7795190811157227, + "learning_rate": 0.0001572110091743119, + "loss": 3.214, "step": 68500 }, { "epoch": 7.390835579514825, - "grad_norm": 0.7821386456489563, - "learning_rate": 0.00015690663788451157, - "loss": 3.2233, + "grad_norm": 0.7896884083747864, + "learning_rate": 0.00015688720992984348, + "loss": 3.2268, "step": 68550 }, { "epoch": 7.39622641509434, - "grad_norm": 0.7885687351226807, - "learning_rate": 0.00015658283864004315, - "loss": 3.2407, + "grad_norm": 0.7429280877113342, + "learning_rate": 0.00015656341068537506, + "loss": 3.2429, "step": 68600 }, { "epoch": 7.401617250673855, - "grad_norm": 0.8140493035316467, - "learning_rate": 0.00015625903939557473, - "loss": 3.2106, + "grad_norm": 0.8340896368026733, + "learning_rate": 0.00015623961144090663, + "loss": 3.2127, "step": 68650 }, { "epoch": 7.407008086253369, - "grad_norm": 0.8063366413116455, - "learning_rate": 0.0001559352401511063, - "loss": 3.2162, + "grad_norm": 0.8026847839355469, + "learning_rate": 0.0001559158121964382, + "loss": 3.2176, "step": 68700 }, { "epoch": 7.412398921832884, - "grad_norm": 0.8016238212585449, - "learning_rate": 0.00015561144090663788, - "loss": 3.22, + "grad_norm": 0.757952094078064, + "learning_rate": 0.0001555920129519698, + "loss": 3.2232, "step": 68750 }, { "epoch": 7.4177897574123985, - "grad_norm": 0.7843878865242004, - "learning_rate": 0.00015528764166216944, - "loss": 3.2307, + "grad_norm": 0.8089631199836731, + "learning_rate": 0.0001552682137075013, + "loss": 3.2333, "step": 68800 }, { "epoch": 7.423180592991914, - "grad_norm": 0.768501341342926, - "learning_rate": 0.000154963842417701, - "loss": 3.2143, + "grad_norm": 0.7726781964302063, + "learning_rate": 0.0001549444144630329, + "loss": 3.2168, "step": 68850 }, { "epoch": 7.428571428571429, - "grad_norm": 0.7922945618629456, - "learning_rate": 0.0001546400431732326, - "loss": 3.2301, + "grad_norm": 0.7755560874938965, + "learning_rate": 0.00015462061521856447, + "loss": 3.234, "step": 68900 }, { "epoch": 7.433962264150943, - "grad_norm": 0.8232368230819702, - "learning_rate": 0.00015431624392876414, - "loss": 3.222, + "grad_norm": 0.8190703392028809, + "learning_rate": 0.00015429681597409605, + "loss": 3.2245, "step": 68950 }, { "epoch": 7.439353099730458, - "grad_norm": 0.7619295120239258, - "learning_rate": 0.00015399244468429572, - "loss": 3.2443, + "grad_norm": 0.7696461081504822, + "learning_rate": 0.00015397301672962762, + "loss": 3.2467, "step": 69000 }, { "epoch": 7.439353099730458, - "eval_accuracy": 0.3885381634032953, - "eval_loss": 3.347168445587158, - "eval_runtime": 184.9909, - "eval_samples_per_second": 97.362, - "eval_steps_per_second": 6.087, + "eval_accuracy": 0.38801489109558074, + "eval_loss": 3.3500401973724365, + "eval_runtime": 185.1521, + "eval_samples_per_second": 97.277, + "eval_steps_per_second": 6.081, "step": 69000 }, { "epoch": 7.444743935309973, - "grad_norm": 0.8314117193222046, - "learning_rate": 0.0001536686454398273, - "loss": 3.2238, + "grad_norm": 0.8240043520927429, + "learning_rate": 0.0001536492174851592, + "loss": 3.2245, "step": 69050 }, { "epoch": 7.450134770889488, - "grad_norm": 0.7766390442848206, - "learning_rate": 0.00015334484619535887, - "loss": 3.216, + "grad_norm": 0.798420786857605, + "learning_rate": 0.00015332541824069078, + "loss": 3.219, "step": 69100 }, { "epoch": 7.455525606469003, - "grad_norm": 0.8217452168464661, - "learning_rate": 0.00015302104695089042, - "loss": 3.2008, + "grad_norm": 0.7799826264381409, + "learning_rate": 0.00015300161899622233, + "loss": 3.203, "step": 69150 }, { "epoch": 7.460916442048518, - "grad_norm": 0.8428397178649902, - "learning_rate": 0.000152697247706422, - "loss": 3.2424, + "grad_norm": 0.8098189830780029, + "learning_rate": 0.00015267781975175388, + "loss": 3.2447, "step": 69200 }, { "epoch": 7.466307277628032, - "grad_norm": 0.7982749342918396, - "learning_rate": 0.00015237344846195358, - "loss": 3.2283, + "grad_norm": 0.7774802446365356, + "learning_rate": 0.00015235402050728546, + "loss": 3.2306, "step": 69250 }, { "epoch": 7.471698113207547, - "grad_norm": 0.7763500809669495, - "learning_rate": 0.00015204964921748516, - "loss": 3.2182, + "grad_norm": 0.8291066884994507, + "learning_rate": 0.00015203022126281703, + "loss": 3.2186, "step": 69300 }, { "epoch": 7.4770889487870615, - "grad_norm": 0.7948207855224609, - "learning_rate": 0.0001517258499730167, - "loss": 3.2375, + "grad_norm": 0.7912863492965698, + "learning_rate": 0.0001517064220183486, + "loss": 3.2399, "step": 69350 }, { "epoch": 7.482479784366577, - "grad_norm": 0.7881017923355103, - "learning_rate": 0.00015140205072854829, - "loss": 3.2398, + "grad_norm": 0.7798894643783569, + "learning_rate": 0.0001513826227738802, + "loss": 3.242, "step": 69400 }, { "epoch": 7.487870619946092, - "grad_norm": 0.7627866864204407, - "learning_rate": 0.00015107825148407984, - "loss": 3.2324, + "grad_norm": 0.7952271699905396, + "learning_rate": 0.00015105882352941177, + "loss": 3.235, "step": 69450 }, { "epoch": 7.493261455525606, - "grad_norm": 0.8260999917984009, - "learning_rate": 0.00015075445223961141, - "loss": 3.2223, + "grad_norm": 0.8175017833709717, + "learning_rate": 0.00015073502428494332, + "loss": 3.2245, "step": 69500 }, { "epoch": 7.498652291105121, - "grad_norm": 0.792458176612854, - "learning_rate": 0.000150430652995143, - "loss": 3.2263, + "grad_norm": 0.8068032264709473, + "learning_rate": 0.0001504112250404749, + "loss": 3.2297, "step": 69550 }, { "epoch": 7.504043126684636, - "grad_norm": 0.8109192252159119, - "learning_rate": 0.00015010685375067457, - "loss": 3.2224, + "grad_norm": 0.8073570132255554, + "learning_rate": 0.00015008742579600647, + "loss": 3.2234, "step": 69600 }, { "epoch": 7.509433962264151, - "grad_norm": 0.7631048560142517, - "learning_rate": 0.00014978305450620615, - "loss": 3.2264, + "grad_norm": 0.7827601432800293, + "learning_rate": 0.00014976362655153802, + "loss": 3.2288, "step": 69650 }, { "epoch": 7.514824797843666, - "grad_norm": 0.8018741607666016, - "learning_rate": 0.00014946573124662707, - "loss": 3.2373, + "grad_norm": 0.7789676189422607, + "learning_rate": 0.0001494398273070696, + "loss": 3.2391, "step": 69700 }, { "epoch": 7.520215633423181, - "grad_norm": 0.8057368397712708, - "learning_rate": 0.00014914193200215865, - "loss": 3.2268, + "grad_norm": 0.7658881545066833, + "learning_rate": 0.00014911602806260118, + "loss": 3.2284, "step": 69750 }, { "epoch": 7.525606469002695, - "grad_norm": 0.7862183451652527, - "learning_rate": 0.0001488181327576902, - "loss": 3.2417, + "grad_norm": 0.7733731269836426, + "learning_rate": 0.00014879222881813273, + "loss": 3.2451, "step": 69800 }, { "epoch": 7.53099730458221, - "grad_norm": 0.7762759923934937, - "learning_rate": 0.00014849433351322178, - "loss": 3.2157, + "grad_norm": 0.8035315275192261, + "learning_rate": 0.0001484684295736643, + "loss": 3.2174, "step": 69850 }, { "epoch": 7.536388140161725, - "grad_norm": 0.7878009676933289, - "learning_rate": 0.00014817053426875336, - "loss": 3.2251, + "grad_norm": 0.7829474210739136, + "learning_rate": 0.00014814463032919589, + "loss": 3.2271, "step": 69900 }, { "epoch": 7.54177897574124, - "grad_norm": 0.8166378140449524, - "learning_rate": 0.00014784673502428493, - "loss": 3.2327, + "grad_norm": 0.8561943173408508, + "learning_rate": 0.00014782083108472744, + "loss": 3.2351, "step": 69950 }, { "epoch": 7.547169811320755, - "grad_norm": 0.8306125402450562, - "learning_rate": 0.0001475229357798165, - "loss": 3.2315, + "grad_norm": 0.8462625741958618, + "learning_rate": 0.00014749703184025901, + "loss": 3.2316, "step": 70000 }, { "epoch": 7.547169811320755, - "eval_accuracy": 0.3888810719172378, - "eval_loss": 3.3454511165618896, - "eval_runtime": 185.1045, - "eval_samples_per_second": 97.302, - "eval_steps_per_second": 6.083, + "eval_accuracy": 0.38856282760882915, + "eval_loss": 3.34738826751709, + "eval_runtime": 185.0786, + "eval_samples_per_second": 97.315, + "eval_steps_per_second": 6.084, "step": 70000 }, { "epoch": 7.55256064690027, - "grad_norm": 0.8042418360710144, - "learning_rate": 0.00014719913653534806, - "loss": 3.2114, + "grad_norm": 0.788134753704071, + "learning_rate": 0.0001471732325957906, + "loss": 3.2129, "step": 70050 }, { "epoch": 7.557951482479784, - "grad_norm": 0.7494966983795166, - "learning_rate": 0.00014687533729087964, - "loss": 3.2326, + "grad_norm": 0.7644175887107849, + "learning_rate": 0.00014684943335132217, + "loss": 3.2347, "step": 70100 }, { "epoch": 7.563342318059299, - "grad_norm": 0.7927791476249695, - "learning_rate": 0.00014655153804641122, - "loss": 3.2304, + "grad_norm": 0.8131969571113586, + "learning_rate": 0.00014652563410685375, + "loss": 3.2308, "step": 70150 }, { "epoch": 7.568733153638814, - "grad_norm": 0.7527803182601929, - "learning_rate": 0.0001462277388019428, - "loss": 3.2309, + "grad_norm": 0.7657596468925476, + "learning_rate": 0.0001462018348623853, + "loss": 3.2316, "step": 70200 }, { "epoch": 7.574123989218329, - "grad_norm": 0.7601218819618225, - "learning_rate": 0.00014590393955747435, - "loss": 3.226, + "grad_norm": 0.7949525713920593, + "learning_rate": 0.00014587803561791687, + "loss": 3.2284, "step": 70250 }, { "epoch": 7.579514824797844, - "grad_norm": 0.7645830512046814, - "learning_rate": 0.00014558014031300592, - "loss": 3.2393, + "grad_norm": 0.7503776550292969, + "learning_rate": 0.00014555423637344845, + "loss": 3.2428, "step": 70300 }, { "epoch": 7.584905660377358, - "grad_norm": 0.8330966830253601, - "learning_rate": 0.0001452563410685375, - "loss": 3.2108, + "grad_norm": 0.7979116439819336, + "learning_rate": 0.00014523043712898003, + "loss": 3.2158, "step": 70350 }, { "epoch": 7.590296495956873, - "grad_norm": 0.7954941391944885, - "learning_rate": 0.00014493254182406908, - "loss": 3.2179, + "grad_norm": 0.7930637001991272, + "learning_rate": 0.00014490663788451158, + "loss": 3.2204, "step": 70400 }, { "epoch": 7.595687331536388, - "grad_norm": 0.7679847478866577, - "learning_rate": 0.00014460874257960063, - "loss": 3.2248, + "grad_norm": 0.7918315529823303, + "learning_rate": 0.00014458283864004316, + "loss": 3.2266, "step": 70450 }, { "epoch": 7.601078167115903, - "grad_norm": 0.7807341814041138, - "learning_rate": 0.0001442849433351322, - "loss": 3.2387, + "grad_norm": 0.746759831905365, + "learning_rate": 0.00014425903939557474, + "loss": 3.2417, "step": 70500 }, { "epoch": 7.606469002695418, - "grad_norm": 0.8021829724311829, - "learning_rate": 0.00014396114409066378, - "loss": 3.2244, + "grad_norm": 0.8025341033935547, + "learning_rate": 0.00014393524015110631, + "loss": 3.2262, "step": 70550 }, { "epoch": 7.611859838274933, - "grad_norm": 0.8153308033943176, - "learning_rate": 0.00014363734484619536, - "loss": 3.251, + "grad_norm": 0.7924248576164246, + "learning_rate": 0.00014361791689152724, + "loss": 3.2529, "step": 70600 }, { "epoch": 7.617250673854447, - "grad_norm": 0.8160524964332581, - "learning_rate": 0.0001433135456017269, - "loss": 3.2256, + "grad_norm": 0.810879647731781, + "learning_rate": 0.00014329411764705882, + "loss": 3.2276, "step": 70650 }, { "epoch": 7.622641509433962, - "grad_norm": 0.8022107481956482, - "learning_rate": 0.0001429897463572585, - "loss": 3.2283, + "grad_norm": 0.8111072182655334, + "learning_rate": 0.00014297679438747974, + "loss": 3.2287, "step": 70700 }, { "epoch": 7.628032345013477, - "grad_norm": 0.8894932866096497, - "learning_rate": 0.00014266594711279007, - "loss": 3.2202, + "grad_norm": 0.8351150155067444, + "learning_rate": 0.00014265299514301132, + "loss": 3.2233, "step": 70750 }, { "epoch": 7.633423180592992, - "grad_norm": 0.7843272686004639, - "learning_rate": 0.00014234214786832165, - "loss": 3.2222, + "grad_norm": 0.7544995546340942, + "learning_rate": 0.0001423291958985429, + "loss": 3.2255, "step": 70800 }, { "epoch": 7.638814016172507, - "grad_norm": 0.8247528076171875, - "learning_rate": 0.0001420183486238532, - "loss": 3.2281, + "grad_norm": 0.8261779546737671, + "learning_rate": 0.00014200539665407445, + "loss": 3.231, "step": 70850 }, { "epoch": 7.644204851752022, - "grad_norm": 0.7786036133766174, - "learning_rate": 0.00014169454937938477, - "loss": 3.2223, + "grad_norm": 0.7995532751083374, + "learning_rate": 0.00014168159740960602, + "loss": 3.2283, "step": 70900 }, { "epoch": 7.649595687331536, - "grad_norm": 0.8587459921836853, - "learning_rate": 0.00014137075013491635, - "loss": 3.2313, + "grad_norm": 0.828923225402832, + "learning_rate": 0.0001413577981651376, + "loss": 3.2328, "step": 70950 }, { "epoch": 7.654986522911051, - "grad_norm": 0.8270299434661865, - "learning_rate": 0.0001410469508904479, - "loss": 3.2304, + "grad_norm": 0.8156834244728088, + "learning_rate": 0.00014103399892066918, + "loss": 3.2337, "step": 71000 }, { "epoch": 7.654986522911051, - "eval_accuracy": 0.38917465202011253, - "eval_loss": 3.3405866622924805, - "eval_runtime": 184.8909, - "eval_samples_per_second": 97.414, - "eval_steps_per_second": 6.09, + "eval_accuracy": 0.389215940117482, + "eval_loss": 3.340672254562378, + "eval_runtime": 184.7558, + "eval_samples_per_second": 97.485, + "eval_steps_per_second": 6.095, "step": 71000 }, { "epoch": 7.660377358490566, - "grad_norm": 0.8775826692581177, - "learning_rate": 0.00014072315164597948, - "loss": 3.2339, + "grad_norm": 0.820132315158844, + "learning_rate": 0.00014071019967620073, + "loss": 3.2363, "step": 71050 }, { "epoch": 7.665768194070081, - "grad_norm": 0.7978357076644897, - "learning_rate": 0.00014039935240151106, - "loss": 3.2172, + "grad_norm": 0.7700738310813904, + "learning_rate": 0.0001403864004317323, + "loss": 3.2192, "step": 71100 }, { "epoch": 7.671159029649596, - "grad_norm": 0.7993744015693665, - "learning_rate": 0.0001400755531570426, - "loss": 3.2088, + "grad_norm": 0.8299509286880493, + "learning_rate": 0.00014006260118726389, + "loss": 3.213, "step": 71150 }, { "epoch": 7.67654986522911, - "grad_norm": 0.7915191054344177, - "learning_rate": 0.00013975175391257419, - "loss": 3.2163, + "grad_norm": 0.7950608134269714, + "learning_rate": 0.00013973880194279546, + "loss": 3.2193, "step": 71200 }, { "epoch": 7.681940700808625, - "grad_norm": 0.8002047538757324, - "learning_rate": 0.00013942795466810576, - "loss": 3.2241, + "grad_norm": 0.7542383670806885, + "learning_rate": 0.00013941500269832704, + "loss": 3.2268, "step": 71250 }, { "epoch": 7.6873315363881405, - "grad_norm": 0.7947512269020081, - "learning_rate": 0.00013910415542363731, - "loss": 3.2479, + "grad_norm": 0.7638273239135742, + "learning_rate": 0.0001390912034538586, + "loss": 3.2504, "step": 71300 }, { "epoch": 7.692722371967655, - "grad_norm": 0.7719897627830505, - "learning_rate": 0.0001387803561791689, - "loss": 3.2372, + "grad_norm": 0.7596293687820435, + "learning_rate": 0.00013876740420939017, + "loss": 3.2385, "step": 71350 }, { "epoch": 7.69811320754717, - "grad_norm": 0.784153938293457, - "learning_rate": 0.00013845655693470047, - "loss": 3.2322, + "grad_norm": 0.7992427945137024, + "learning_rate": 0.00013844360496492175, + "loss": 3.234, "step": 71400 }, { "epoch": 7.703504043126685, - "grad_norm": 0.8173136711120605, - "learning_rate": 0.00013813275769023205, - "loss": 3.2203, + "grad_norm": 0.8380318284034729, + "learning_rate": 0.00013811980572045333, + "loss": 3.2229, "step": 71450 }, { "epoch": 7.708894878706199, - "grad_norm": 0.830466628074646, - "learning_rate": 0.0001378089584457636, - "loss": 3.2244, + "grad_norm": 0.8700864911079407, + "learning_rate": 0.00013779600647598488, + "loss": 3.2266, "step": 71500 }, { "epoch": 7.714285714285714, - "grad_norm": 0.7629924416542053, - "learning_rate": 0.00013748515920129517, - "loss": 3.2486, + "grad_norm": 0.7592607140541077, + "learning_rate": 0.00013747220723151645, + "loss": 3.2511, "step": 71550 }, { "epoch": 7.719676549865229, - "grad_norm": 0.7888294458389282, - "learning_rate": 0.00013716135995682675, - "loss": 3.2378, + "grad_norm": 0.7742652893066406, + "learning_rate": 0.00013714840798704803, + "loss": 3.2406, "step": 71600 }, { "epoch": 7.725067385444744, - "grad_norm": 0.7922984957695007, - "learning_rate": 0.00013683756071235833, - "loss": 3.2447, + "grad_norm": 0.8119229078292847, + "learning_rate": 0.00013682460874257958, + "loss": 3.2462, "step": 71650 }, { "epoch": 7.730458221024259, - "grad_norm": 0.8440670967102051, - "learning_rate": 0.0001365137614678899, - "loss": 3.2428, + "grad_norm": 0.8372693657875061, + "learning_rate": 0.00013650080949811116, + "loss": 3.2452, "step": 71700 }, { "epoch": 7.735849056603773, - "grad_norm": 0.7881962656974792, - "learning_rate": 0.00013618996222342146, - "loss": 3.245, + "grad_norm": 0.810852587223053, + "learning_rate": 0.00013617701025364274, + "loss": 3.2478, "step": 71750 }, { "epoch": 7.741239892183288, - "grad_norm": 0.824066698551178, - "learning_rate": 0.00013586616297895304, - "loss": 3.2387, + "grad_norm": 0.8200960755348206, + "learning_rate": 0.0001358532110091743, + "loss": 3.2403, "step": 71800 }, { "epoch": 7.7466307277628035, - "grad_norm": 0.8074589967727661, - "learning_rate": 0.00013554236373448461, - "loss": 3.2506, + "grad_norm": 0.8275381922721863, + "learning_rate": 0.00013552941176470587, + "loss": 3.253, "step": 71850 }, { "epoch": 7.752021563342318, - "grad_norm": 0.8384812474250793, - "learning_rate": 0.0001352185644900162, - "loss": 3.2281, + "grad_norm": 0.7997806072235107, + "learning_rate": 0.00013520561252023744, + "loss": 3.2331, "step": 71900 }, { "epoch": 7.757412398921833, - "grad_norm": 0.8503368496894836, - "learning_rate": 0.00013489476524554774, - "loss": 3.2573, + "grad_norm": 0.8084086179733276, + "learning_rate": 0.00013488181327576902, + "loss": 3.2599, "step": 71950 }, { "epoch": 7.762803234501348, - "grad_norm": 0.7815788984298706, - "learning_rate": 0.00013457096600107932, - "loss": 3.227, + "grad_norm": 0.8054947853088379, + "learning_rate": 0.00013455801403130057, + "loss": 3.2284, "step": 72000 }, { "epoch": 7.762803234501348, - "eval_accuracy": 0.38971911164095085, - "eval_loss": 3.3369975090026855, - "eval_runtime": 185.05, - "eval_samples_per_second": 97.33, - "eval_steps_per_second": 6.085, + "eval_accuracy": 0.38960480880297005, + "eval_loss": 3.338857412338257, + "eval_runtime": 185.2016, + "eval_samples_per_second": 97.251, + "eval_steps_per_second": 6.08, "step": 72000 }, { "epoch": 7.768194070080862, - "grad_norm": 0.7871066331863403, - "learning_rate": 0.0001342471667566109, - "loss": 3.2261, + "grad_norm": 0.7348754405975342, + "learning_rate": 0.00013423421478683215, + "loss": 3.2291, "step": 72050 }, { "epoch": 7.773584905660377, - "grad_norm": 0.8432220816612244, - "learning_rate": 0.00013392336751214248, - "loss": 3.2151, + "grad_norm": 0.8173696994781494, + "learning_rate": 0.00013391041554236373, + "loss": 3.22, "step": 72100 }, { "epoch": 7.7789757412398925, - "grad_norm": 0.7723524570465088, - "learning_rate": 0.00013359956826767403, - "loss": 3.229, + "grad_norm": 0.7856990098953247, + "learning_rate": 0.00013358661629789528, + "loss": 3.2322, "step": 72150 }, { "epoch": 7.784366576819407, - "grad_norm": 0.7924665808677673, - "learning_rate": 0.0001332757690232056, - "loss": 3.2323, + "grad_norm": 0.8363749980926514, + "learning_rate": 0.00013326281705342685, + "loss": 3.2338, "step": 72200 }, { "epoch": 7.789757412398922, - "grad_norm": 0.8395488262176514, - "learning_rate": 0.00013295196977873718, - "loss": 3.2297, + "grad_norm": 0.8107422590255737, + "learning_rate": 0.00013293901780895843, + "loss": 3.2322, "step": 72250 }, { "epoch": 7.795148247978437, - "grad_norm": 0.8542680144309998, - "learning_rate": 0.00013262817053426876, - "loss": 3.2238, + "grad_norm": 0.8467124104499817, + "learning_rate": 0.00013261521856449, + "loss": 3.2286, "step": 72300 }, { "epoch": 7.800539083557951, - "grad_norm": 0.804538905620575, - "learning_rate": 0.0001323043712898003, - "loss": 3.2303, + "grad_norm": 0.8180131316184998, + "learning_rate": 0.00013229141932002156, + "loss": 3.2327, "step": 72350 }, { "epoch": 7.8059299191374665, - "grad_norm": 0.8471668362617493, - "learning_rate": 0.0001319805720453319, - "loss": 3.2319, + "grad_norm": 0.7756885290145874, + "learning_rate": 0.00013196762007555314, + "loss": 3.2329, "step": 72400 }, { "epoch": 7.811320754716981, - "grad_norm": 0.917702317237854, - "learning_rate": 0.00013165677280086346, - "loss": 3.2264, + "grad_norm": 0.8815840482711792, + "learning_rate": 0.00013164382083108472, + "loss": 3.2278, "step": 72450 }, { "epoch": 7.816711590296496, - "grad_norm": 0.8071184754371643, - "learning_rate": 0.00013133297355639502, - "loss": 3.2349, + "grad_norm": 0.8170946836471558, + "learning_rate": 0.0001313200215866163, + "loss": 3.2392, "step": 72500 }, { "epoch": 7.822102425876011, - "grad_norm": 0.8377707004547119, - "learning_rate": 0.0001310091743119266, - "loss": 3.2281, + "grad_norm": 0.8364844918251038, + "learning_rate": 0.00013099622234214784, + "loss": 3.2302, "step": 72550 }, { "epoch": 7.827493261455525, - "grad_norm": 0.8600809574127197, - "learning_rate": 0.00013068537506745817, - "loss": 3.2396, + "grad_norm": 0.834038496017456, + "learning_rate": 0.00013067242309767942, + "loss": 3.2414, "step": 72600 }, { "epoch": 7.83288409703504, - "grad_norm": 0.7609186768531799, - "learning_rate": 0.00013036157582298972, - "loss": 3.2438, + "grad_norm": 0.7629231810569763, + "learning_rate": 0.000130348623853211, + "loss": 3.2461, "step": 72650 }, { "epoch": 7.8382749326145555, - "grad_norm": 0.7975425720214844, - "learning_rate": 0.0001300377765785213, - "loss": 3.2397, + "grad_norm": 0.7637219429016113, + "learning_rate": 0.00013002482460874258, + "loss": 3.2435, "step": 72700 }, { "epoch": 7.84366576819407, - "grad_norm": 0.8560848236083984, - "learning_rate": 0.00012971397733405288, - "loss": 3.2357, + "grad_norm": 0.8876663446426392, + "learning_rate": 0.00012970102536427413, + "loss": 3.2362, "step": 72750 }, { "epoch": 7.849056603773585, - "grad_norm": 0.8832368850708008, - "learning_rate": 0.00012939017808958443, - "loss": 3.2582, + "grad_norm": 0.8259429931640625, + "learning_rate": 0.0001293772261198057, + "loss": 3.2611, "step": 72800 }, { "epoch": 7.8544474393531, - "grad_norm": 0.8199717402458191, - "learning_rate": 0.000129066378845116, - "loss": 3.2274, + "grad_norm": 0.8051719069480896, + "learning_rate": 0.00012905342687533728, + "loss": 3.2314, "step": 72850 }, { "epoch": 7.859838274932614, - "grad_norm": 0.8567049503326416, - "learning_rate": 0.00012874257960064758, - "loss": 3.2322, + "grad_norm": 0.7907696962356567, + "learning_rate": 0.00012872962763086886, + "loss": 3.2336, "step": 72900 }, { "epoch": 7.8652291105121295, - "grad_norm": 0.7869696021080017, - "learning_rate": 0.00012841878035617916, - "loss": 3.2524, + "grad_norm": 0.7773061990737915, + "learning_rate": 0.00012840582838640044, + "loss": 3.2552, "step": 72950 }, { "epoch": 7.870619946091644, - "grad_norm": 0.790312647819519, - "learning_rate": 0.0001280949811117107, - "loss": 3.2268, + "grad_norm": 0.8075227737426758, + "learning_rate": 0.000128082029141932, + "loss": 3.2294, "step": 73000 }, { "epoch": 7.870619946091644, - "eval_accuracy": 0.3900164945948991, - "eval_loss": 3.331883430480957, - "eval_runtime": 184.8094, - "eval_samples_per_second": 97.457, - "eval_steps_per_second": 6.093, + "eval_accuracy": 0.38974986040820236, + "eval_loss": 3.3344857692718506, + "eval_runtime": 185.203, + "eval_samples_per_second": 97.25, + "eval_steps_per_second": 6.08, "step": 73000 }, { "epoch": 7.876010781671159, - "grad_norm": 0.8404228687286377, - "learning_rate": 0.0001277711818672423, - "loss": 3.2261, + "grad_norm": 0.8024936318397522, + "learning_rate": 0.00012775822989746357, + "loss": 3.228, "step": 73050 }, { "epoch": 7.881401617250674, - "grad_norm": 0.8575640320777893, - "learning_rate": 0.00012744738262277387, - "loss": 3.2354, + "grad_norm": 0.8461624979972839, + "learning_rate": 0.00012743443065299514, + "loss": 3.2375, "step": 73100 }, { "epoch": 7.886792452830189, - "grad_norm": 0.8023410439491272, - "learning_rate": 0.00012712358337830544, - "loss": 3.2411, + "grad_norm": 0.8133740425109863, + "learning_rate": 0.00012711063140852672, + "loss": 3.243, "step": 73150 }, { "epoch": 7.892183288409703, - "grad_norm": 0.7935019731521606, - "learning_rate": 0.00012679978413383702, - "loss": 3.2188, + "grad_norm": 0.7898610234260559, + "learning_rate": 0.00012678683216405827, + "loss": 3.2223, "step": 73200 }, { "epoch": 7.8975741239892185, - "grad_norm": 0.8237470984458923, - "learning_rate": 0.00012647598488936857, - "loss": 3.2369, + "grad_norm": 0.8022027015686035, + "learning_rate": 0.00012646303291958985, + "loss": 3.2392, "step": 73250 }, { "epoch": 7.902964959568733, - "grad_norm": 0.8367178440093994, - "learning_rate": 0.00012615218564490015, - "loss": 3.2359, + "grad_norm": 0.8332421779632568, + "learning_rate": 0.00012613923367512143, + "loss": 3.239, "step": 73300 }, { "epoch": 7.908355795148248, - "grad_norm": 0.8428457975387573, - "learning_rate": 0.00012582838640043173, - "loss": 3.2394, + "grad_norm": 0.8167179226875305, + "learning_rate": 0.00012581543443065298, + "loss": 3.2418, "step": 73350 }, { "epoch": 7.913746630727763, - "grad_norm": 0.8914994597434998, - "learning_rate": 0.0001255045871559633, - "loss": 3.2437, + "grad_norm": 0.8627277612686157, + "learning_rate": 0.00012549163518618456, + "loss": 3.2455, "step": 73400 }, { "epoch": 7.919137466307277, - "grad_norm": 0.8220916986465454, - "learning_rate": 0.00012518078791149486, - "loss": 3.2222, + "grad_norm": 0.8191093802452087, + "learning_rate": 0.00012516783594171613, + "loss": 3.2264, "step": 73450 }, { "epoch": 7.9245283018867925, - "grad_norm": 0.8281764388084412, - "learning_rate": 0.00012485698866702643, - "loss": 3.2444, + "grad_norm": 0.8071380853652954, + "learning_rate": 0.00012484403669724768, + "loss": 3.2465, "step": 73500 }, { "epoch": 7.929919137466308, - "grad_norm": 0.8000036478042603, - "learning_rate": 0.000124533189422558, - "loss": 3.2307, + "grad_norm": 0.8399831056594849, + "learning_rate": 0.00012452023745277926, + "loss": 3.2325, "step": 73550 }, { "epoch": 7.935309973045822, - "grad_norm": 0.857599675655365, - "learning_rate": 0.0001242093901780896, - "loss": 3.2393, + "grad_norm": 0.8575718402862549, + "learning_rate": 0.00012419643820831084, + "loss": 3.2416, "step": 73600 }, { "epoch": 7.940700808625337, - "grad_norm": 0.8535449504852295, - "learning_rate": 0.00012388559093362114, - "loss": 3.2199, + "grad_norm": 0.9044980406761169, + "learning_rate": 0.0001238726389638424, + "loss": 3.222, "step": 73650 }, { "epoch": 7.946091644204852, - "grad_norm": 0.8354669213294983, - "learning_rate": 0.0001235682676740421, - "loss": 3.2238, + "grad_norm": 0.815284788608551, + "learning_rate": 0.00012354883971937397, + "loss": 3.2261, "step": 73700 }, { "epoch": 7.951482479784366, - "grad_norm": 0.8469333052635193, - "learning_rate": 0.00012324446842957367, - "loss": 3.2163, + "grad_norm": 0.7698843479156494, + "learning_rate": 0.00012322504047490555, + "loss": 3.2182, "step": 73750 }, { "epoch": 7.9568733153638815, - "grad_norm": 0.8277457356452942, - "learning_rate": 0.00012292066918510522, - "loss": 3.196, + "grad_norm": 0.804814875125885, + "learning_rate": 0.00012290124123043712, + "loss": 3.1994, "step": 73800 }, { "epoch": 7.962264150943396, - "grad_norm": 0.8391556739807129, - "learning_rate": 0.0001225968699406368, - "loss": 3.2214, + "grad_norm": 0.7834069132804871, + "learning_rate": 0.00012257744198596867, + "loss": 3.223, "step": 73850 }, { "epoch": 7.967654986522911, - "grad_norm": 0.8240298628807068, - "learning_rate": 0.00012227307069616837, - "loss": 3.2273, + "grad_norm": 0.8091722130775452, + "learning_rate": 0.00012225364274150025, + "loss": 3.231, "step": 73900 }, { "epoch": 7.973045822102426, - "grad_norm": 0.8295006155967712, - "learning_rate": 0.00012194927145169994, - "loss": 3.251, + "grad_norm": 0.8159931898117065, + "learning_rate": 0.00012192984349703183, + "loss": 3.2533, "step": 73950 }, { "epoch": 7.97843665768194, - "grad_norm": 0.8376235961914062, - "learning_rate": 0.0001216254722072315, - "loss": 3.2412, + "grad_norm": 0.8482950925827026, + "learning_rate": 0.0001216060442525634, + "loss": 3.2455, "step": 74000 }, { "epoch": 7.97843665768194, - "eval_accuracy": 0.39033039278779513, - "eval_loss": 3.3302652835845947, - "eval_runtime": 185.3426, - "eval_samples_per_second": 97.177, - "eval_steps_per_second": 6.075, + "eval_accuracy": 0.38994032892054115, + "eval_loss": 3.3322672843933105, + "eval_runtime": 185.1982, + "eval_samples_per_second": 97.253, + "eval_steps_per_second": 6.08, "step": 74000 }, { "epoch": 7.9838274932614555, - "grad_norm": 0.8279843330383301, - "learning_rate": 0.00012130167296276308, - "loss": 3.2328, + "grad_norm": 0.7893300652503967, + "learning_rate": 0.00012128224500809496, + "loss": 3.2372, "step": 74050 }, { "epoch": 7.989218328840971, - "grad_norm": 0.7964900135993958, - "learning_rate": 0.00012097787371829464, - "loss": 3.2259, + "grad_norm": 0.841288685798645, + "learning_rate": 0.00012095844576362653, + "loss": 3.2274, "step": 74100 }, { "epoch": 7.994609164420485, - "grad_norm": 0.8571212887763977, - "learning_rate": 0.00012065407447382622, - "loss": 3.2181, + "grad_norm": 0.9836897253990173, + "learning_rate": 0.00012063464651915811, + "loss": 3.22, "step": 74150 }, { "epoch": 8.0, - "grad_norm": 1.8256008625030518, - "learning_rate": 0.00012033027522935779, - "loss": 3.2142, + "grad_norm": 1.7988165616989136, + "learning_rate": 0.00012031084727468969, + "loss": 3.2174, "step": 74200 }, { "epoch": 8.005390835579515, - "grad_norm": 0.787792980670929, - "learning_rate": 0.00012000647598488935, - "loss": 3.157, + "grad_norm": 0.8070291876792908, + "learning_rate": 0.00011998704803022124, + "loss": 3.1629, "step": 74250 }, { "epoch": 8.01078167115903, - "grad_norm": 0.8355326652526855, - "learning_rate": 0.00011968267674042093, - "loss": 3.1554, + "grad_norm": 0.8832926750183105, + "learning_rate": 0.00011966324878575282, + "loss": 3.1593, "step": 74300 }, { "epoch": 8.016172506738544, - "grad_norm": 0.8311036229133606, - "learning_rate": 0.0001193588774959525, - "loss": 3.1441, + "grad_norm": 0.8364817500114441, + "learning_rate": 0.0001193394495412844, + "loss": 3.1457, "step": 74350 }, { "epoch": 8.021563342318059, - "grad_norm": 0.8239858746528625, - "learning_rate": 0.00011903507825148406, - "loss": 3.1528, + "grad_norm": 0.8132591843605042, + "learning_rate": 0.00011901565029681597, + "loss": 3.1558, "step": 74400 }, { "epoch": 8.026954177897574, - "grad_norm": 0.8341767191886902, - "learning_rate": 0.00011871127900701563, - "loss": 3.157, + "grad_norm": 0.8514834046363831, + "learning_rate": 0.00011869185105234754, + "loss": 3.1608, "step": 74450 }, { "epoch": 8.032345013477089, - "grad_norm": 0.8128954768180847, - "learning_rate": 0.00011838747976254721, - "loss": 3.1587, + "grad_norm": 0.8021072149276733, + "learning_rate": 0.0001183680518078791, + "loss": 3.1607, "step": 74500 }, { "epoch": 8.037735849056604, - "grad_norm": 0.8075172305107117, - "learning_rate": 0.00011806368051807879, - "loss": 3.1719, + "grad_norm": 0.8027331233024597, + "learning_rate": 0.00011804425256341068, + "loss": 3.1741, "step": 74550 }, { "epoch": 8.04312668463612, - "grad_norm": 0.7725398540496826, - "learning_rate": 0.00011773988127361035, - "loss": 3.1608, + "grad_norm": 0.7780774831771851, + "learning_rate": 0.00011772045331894224, + "loss": 3.1636, "step": 74600 }, { "epoch": 8.048517520215633, - "grad_norm": 0.8969375491142273, - "learning_rate": 0.00011741608202914192, - "loss": 3.1593, + "grad_norm": 0.8480037450790405, + "learning_rate": 0.00011739665407447382, + "loss": 3.164, "step": 74650 }, { "epoch": 8.053908355795148, - "grad_norm": 0.7979028224945068, - "learning_rate": 0.0001170922827846735, - "loss": 3.1489, + "grad_norm": 0.7819019556045532, + "learning_rate": 0.00011707933081489476, + "loss": 3.1516, "step": 74700 }, { "epoch": 8.059299191374663, - "grad_norm": 0.8518366813659668, - "learning_rate": 0.00011676848354020506, - "loss": 3.1652, + "grad_norm": 0.7888557314872742, + "learning_rate": 0.00011675553157042632, + "loss": 3.1707, "step": 74750 }, { "epoch": 8.064690026954178, - "grad_norm": 0.8133111000061035, - "learning_rate": 0.00011644468429573664, - "loss": 3.1594, + "grad_norm": 0.8173406720161438, + "learning_rate": 0.00011643173232595789, + "loss": 3.1638, "step": 74800 }, { "epoch": 8.070080862533693, - "grad_norm": 0.8464822769165039, - "learning_rate": 0.0001161208850512682, - "loss": 3.1603, + "grad_norm": 0.9043296575546265, + "learning_rate": 0.00011610793308148947, + "loss": 3.1623, "step": 74850 }, { "epoch": 8.075471698113208, - "grad_norm": 0.8304824829101562, - "learning_rate": 0.00011579708580679978, - "loss": 3.1717, + "grad_norm": 0.8537508249282837, + "learning_rate": 0.00011578413383702104, + "loss": 3.1742, "step": 74900 }, { "epoch": 8.080862533692722, - "grad_norm": 0.8468976616859436, - "learning_rate": 0.00011547328656233134, - "loss": 3.1532, + "grad_norm": 0.8383791446685791, + "learning_rate": 0.00011546033459255261, + "loss": 3.1563, "step": 74950 }, { "epoch": 8.086253369272237, - "grad_norm": 0.8231673240661621, - "learning_rate": 0.00011514948731786292, - "loss": 3.1538, + "grad_norm": 0.7972344756126404, + "learning_rate": 0.00011514301133297355, + "loss": 3.1569, "step": 75000 }, { "epoch": 8.086253369272237, - "eval_accuracy": 0.3899742286215393, - "eval_loss": 3.336721420288086, - "eval_runtime": 184.8716, - "eval_samples_per_second": 97.424, - "eval_steps_per_second": 6.091, + "eval_accuracy": 0.38994478368894153, + "eval_loss": 3.3383166790008545, + "eval_runtime": 184.7956, + "eval_samples_per_second": 97.464, + "eval_steps_per_second": 6.093, "step": 75000 }, { "epoch": 8.091644204851752, - "grad_norm": 0.8552824258804321, - "learning_rate": 0.00011482568807339448, - "loss": 3.1624, + "grad_norm": 0.8097219467163086, + "learning_rate": 0.00011481921208850512, + "loss": 3.1658, "step": 75050 }, { "epoch": 8.097035040431267, - "grad_norm": 0.8481261134147644, - "learning_rate": 0.00011450188882892605, - "loss": 3.153, + "grad_norm": 0.7911630272865295, + "learning_rate": 0.00011449541284403669, + "loss": 3.1553, "step": 75100 }, { "epoch": 8.102425876010782, - "grad_norm": 0.8056097626686096, - "learning_rate": 0.00011417808958445763, - "loss": 3.1682, + "grad_norm": 0.8461593985557556, + "learning_rate": 0.00011417161359956825, + "loss": 3.1711, "step": 75150 }, { "epoch": 8.107816711590296, - "grad_norm": 0.8345311284065247, - "learning_rate": 0.0001138542903399892, - "loss": 3.1789, + "grad_norm": 0.7964184284210205, + "learning_rate": 0.00011384781435509983, + "loss": 3.1809, "step": 75200 }, { "epoch": 8.11320754716981, - "grad_norm": 0.7978355884552002, - "learning_rate": 0.00011353049109552075, - "loss": 3.1634, + "grad_norm": 0.789570152759552, + "learning_rate": 0.0001135240151106314, + "loss": 3.1654, "step": 75250 }, { "epoch": 8.118598382749326, - "grad_norm": 0.810292661190033, - "learning_rate": 0.00011320669185105233, - "loss": 3.1758, + "grad_norm": 0.792686939239502, + "learning_rate": 0.00011320021586616297, + "loss": 3.1782, "step": 75300 }, { "epoch": 8.123989218328841, - "grad_norm": 0.8161046504974365, - "learning_rate": 0.00011288289260658391, - "loss": 3.1726, + "grad_norm": 0.835252583026886, + "learning_rate": 0.00011287641662169454, + "loss": 3.1779, "step": 75350 }, { "epoch": 8.129380053908356, - "grad_norm": 0.7999998331069946, - "learning_rate": 0.00011255909336211549, - "loss": 3.1797, + "grad_norm": 0.7725673317909241, + "learning_rate": 0.0001125526173772261, + "loss": 3.182, "step": 75400 }, { "epoch": 8.134770889487871, - "grad_norm": 0.8012352585792542, - "learning_rate": 0.00011224177010253641, - "loss": 3.1674, + "grad_norm": 0.8120366930961609, + "learning_rate": 0.00011222881813275768, + "loss": 3.1711, "step": 75450 }, { "epoch": 8.140161725067385, - "grad_norm": 0.8193981051445007, - "learning_rate": 0.00011191797085806799, - "loss": 3.1668, + "grad_norm": 0.8159014582633972, + "learning_rate": 0.00011190501888828925, + "loss": 3.1689, "step": 75500 }, { "epoch": 8.1455525606469, - "grad_norm": 0.8131717443466187, - "learning_rate": 0.00011159417161359957, - "loss": 3.1712, + "grad_norm": 0.8142615556716919, + "learning_rate": 0.00011158121964382083, + "loss": 3.1743, "step": 75550 }, { "epoch": 8.150943396226415, - "grad_norm": 0.8060713410377502, - "learning_rate": 0.00011127037236913112, - "loss": 3.1755, + "grad_norm": 0.8550556302070618, + "learning_rate": 0.00011125742039935238, + "loss": 3.1787, "step": 75600 }, { "epoch": 8.15633423180593, - "grad_norm": 0.8505592942237854, - "learning_rate": 0.0001109465731246627, - "loss": 3.16, + "grad_norm": 0.7932506799697876, + "learning_rate": 0.00011093362115488396, + "loss": 3.1626, "step": 75650 }, { "epoch": 8.161725067385445, - "grad_norm": 0.8405550122261047, - "learning_rate": 0.00011062277388019427, - "loss": 3.1734, + "grad_norm": 0.8494173884391785, + "learning_rate": 0.00011060982191041554, + "loss": 3.1756, "step": 75700 }, { "epoch": 8.167115902964959, - "grad_norm": 0.8327785134315491, - "learning_rate": 0.00011029897463572585, - "loss": 3.1702, + "grad_norm": 0.8864437341690063, + "learning_rate": 0.00011028602266594712, + "loss": 3.1729, "step": 75750 }, { "epoch": 8.172506738544474, - "grad_norm": 0.838642954826355, - "learning_rate": 0.0001099751753912574, - "loss": 3.1898, + "grad_norm": 0.806351900100708, + "learning_rate": 0.00010996222342147867, + "loss": 3.1927, "step": 75800 }, { "epoch": 8.177897574123989, - "grad_norm": 0.8472077250480652, - "learning_rate": 0.00010965137614678898, - "loss": 3.1944, + "grad_norm": 0.8257133960723877, + "learning_rate": 0.00010963842417701024, + "loss": 3.1989, "step": 75850 }, { "epoch": 8.183288409703504, - "grad_norm": 0.852802038192749, - "learning_rate": 0.00010932757690232056, - "loss": 3.1576, + "grad_norm": 0.8057741522789001, + "learning_rate": 0.00010931462493254182, + "loss": 3.159, "step": 75900 }, { "epoch": 8.18867924528302, - "grad_norm": 0.8112123608589172, - "learning_rate": 0.00010900377765785212, - "loss": 3.176, + "grad_norm": 0.7742595672607422, + "learning_rate": 0.00010899082568807339, + "loss": 3.1796, "step": 75950 }, { "epoch": 8.194070080862534, - "grad_norm": 0.8221772313117981, - "learning_rate": 0.0001086799784133837, - "loss": 3.1823, + "grad_norm": 0.8021875619888306, + "learning_rate": 0.00010866702644360495, + "loss": 3.185, "step": 76000 }, { "epoch": 8.194070080862534, - "eval_accuracy": 0.3900471347092628, - "eval_loss": 3.335939407348633, - "eval_runtime": 185.1744, - "eval_samples_per_second": 97.265, - "eval_steps_per_second": 6.081, + "eval_accuracy": 0.38998737562096486, + "eval_loss": 3.3373751640319824, + "eval_runtime": 185.1022, + "eval_samples_per_second": 97.303, + "eval_steps_per_second": 6.083, "step": 76000 }, { "epoch": 8.199460916442048, - "grad_norm": 0.8041417002677917, - "learning_rate": 0.00010835617916891526, - "loss": 3.1784, + "grad_norm": 0.7910264134407043, + "learning_rate": 0.00010834322719913653, + "loss": 3.1801, "step": 76050 }, { "epoch": 8.204851752021563, - "grad_norm": 0.8178136944770813, - "learning_rate": 0.00010803237992444683, - "loss": 3.1516, + "grad_norm": 0.7795304656028748, + "learning_rate": 0.00010801942795466809, + "loss": 3.155, "step": 76100 }, { "epoch": 8.210242587601078, - "grad_norm": 0.8160865902900696, - "learning_rate": 0.0001077085806799784, - "loss": 3.1834, + "grad_norm": 0.8299195170402527, + "learning_rate": 0.00010769562871019967, + "loss": 3.1855, "step": 76150 }, { "epoch": 8.215633423180593, - "grad_norm": 0.8639083504676819, - "learning_rate": 0.00010738478143550998, - "loss": 3.1641, + "grad_norm": 0.8401374220848083, + "learning_rate": 0.00010737182946573123, + "loss": 3.1657, "step": 76200 }, { "epoch": 8.221024258760108, - "grad_norm": 0.8077586889266968, - "learning_rate": 0.00010706098219104153, - "loss": 3.1617, + "grad_norm": 0.7858666777610779, + "learning_rate": 0.0001070480302212628, + "loss": 3.1654, "step": 76250 }, { "epoch": 8.226415094339623, - "grad_norm": 0.8162104487419128, - "learning_rate": 0.00010673718294657311, - "loss": 3.1768, + "grad_norm": 0.8238749504089355, + "learning_rate": 0.00010672423097679438, + "loss": 3.1798, "step": 76300 }, { "epoch": 8.231805929919137, - "grad_norm": 0.8558855056762695, - "learning_rate": 0.00010641338370210469, - "loss": 3.1665, + "grad_norm": 0.8600948452949524, + "learning_rate": 0.00010640043173232595, + "loss": 3.1702, "step": 76350 }, { "epoch": 8.237196765498652, - "grad_norm": 0.817091703414917, - "learning_rate": 0.00010608958445763627, - "loss": 3.1608, + "grad_norm": 0.7855995893478394, + "learning_rate": 0.00010607663248785753, + "loss": 3.164, "step": 76400 }, { "epoch": 8.242587601078167, - "grad_norm": 0.8458694815635681, - "learning_rate": 0.00010576578521316782, - "loss": 3.1833, + "grad_norm": 0.8233217000961304, + "learning_rate": 0.00010575283324338908, + "loss": 3.1892, "step": 76450 }, { "epoch": 8.247978436657682, - "grad_norm": 0.8939496278762817, - "learning_rate": 0.0001054419859686994, - "loss": 3.1853, + "grad_norm": 0.8577181100845337, + "learning_rate": 0.00010542903399892066, + "loss": 3.1883, "step": 76500 }, { "epoch": 8.253369272237197, - "grad_norm": 0.8105524778366089, - "learning_rate": 0.00010511818672423097, - "loss": 3.1898, + "grad_norm": 0.8304762840270996, + "learning_rate": 0.00010510523475445224, + "loss": 3.1926, "step": 76550 }, { "epoch": 8.25876010781671, - "grad_norm": 0.8321719169616699, - "learning_rate": 0.00010479438747976254, - "loss": 3.1769, + "grad_norm": 0.8044326305389404, + "learning_rate": 0.0001047814355099838, + "loss": 3.1809, "step": 76600 }, { "epoch": 8.264150943396226, - "grad_norm": 0.8863265514373779, - "learning_rate": 0.0001044705882352941, - "loss": 3.1695, + "grad_norm": 0.8440804481506348, + "learning_rate": 0.00010445763626551537, + "loss": 3.1729, "step": 76650 }, { "epoch": 8.269541778975741, - "grad_norm": 0.8106926679611206, - "learning_rate": 0.00010414678899082568, - "loss": 3.1919, + "grad_norm": 0.8171114921569824, + "learning_rate": 0.00010413383702104694, + "loss": 3.1942, "step": 76700 }, { "epoch": 8.274932614555256, - "grad_norm": 0.8519572019577026, - "learning_rate": 0.00010382298974635726, - "loss": 3.1769, + "grad_norm": 0.8302549123764038, + "learning_rate": 0.00010381003777657852, + "loss": 3.1802, "step": 76750 }, { "epoch": 8.280323450134771, - "grad_norm": 0.8171219229698181, - "learning_rate": 0.00010349919050188882, - "loss": 3.1732, + "grad_norm": 0.7697561979293823, + "learning_rate": 0.00010348623853211008, + "loss": 3.1741, "step": 76800 }, { "epoch": 8.285714285714286, - "grad_norm": 0.8138165473937988, - "learning_rate": 0.0001031753912574204, - "loss": 3.1747, + "grad_norm": 0.7852888703346252, + "learning_rate": 0.00010316243928764165, + "loss": 3.1784, "step": 76850 }, { "epoch": 8.2911051212938, - "grad_norm": 0.8623942732810974, - "learning_rate": 0.00010285159201295196, - "loss": 3.1877, + "grad_norm": 0.8156181573867798, + "learning_rate": 0.00010283864004317323, + "loss": 3.1924, "step": 76900 }, { "epoch": 8.296495956873315, - "grad_norm": 0.8288198709487915, - "learning_rate": 0.00010252779276848353, - "loss": 3.1723, + "grad_norm": 0.8077170848846436, + "learning_rate": 0.00010251484079870479, + "loss": 3.1766, "step": 76950 }, { "epoch": 8.30188679245283, - "grad_norm": 0.8703798651695251, - "learning_rate": 0.0001022039935240151, - "loss": 3.1717, + "grad_norm": 0.8286692500114441, + "learning_rate": 0.00010219104155423637, + "loss": 3.1749, "step": 77000 }, { "epoch": 8.30188679245283, - "eval_accuracy": 0.3907528352156179, - "eval_loss": 3.3312199115753174, - "eval_runtime": 185.2399, - "eval_samples_per_second": 97.231, - "eval_steps_per_second": 6.079, + "eval_accuracy": 0.390606914387283, + "eval_loss": 3.3321518898010254, + "eval_runtime": 185.3128, + "eval_samples_per_second": 97.192, + "eval_steps_per_second": 6.076, "step": 77000 }, { "epoch": 8.307277628032345, - "grad_norm": 0.8199688196182251, - "learning_rate": 0.00010188019427954668, - "loss": 3.1804, + "grad_norm": 0.7854719758033752, + "learning_rate": 0.0001018737182946573, + "loss": 3.1833, "step": 77050 }, { "epoch": 8.31266846361186, - "grad_norm": 0.8455313444137573, - "learning_rate": 0.00010155639503507823, - "loss": 3.1848, + "grad_norm": 0.8314083218574524, + "learning_rate": 0.00010154991905018887, + "loss": 3.1864, "step": 77100 }, { "epoch": 8.318059299191376, - "grad_norm": 0.8879739046096802, - "learning_rate": 0.00010123259579060981, - "loss": 3.1738, + "grad_norm": 0.8903999328613281, + "learning_rate": 0.00010122611980572045, + "loss": 3.1781, "step": 77150 }, { "epoch": 8.323450134770889, - "grad_norm": 0.849547266960144, - "learning_rate": 0.00010090879654614139, - "loss": 3.1672, + "grad_norm": 0.8407829999923706, + "learning_rate": 0.00010090232056125201, + "loss": 3.1695, "step": 77200 }, { "epoch": 8.328840970350404, - "grad_norm": 0.8199661374092102, - "learning_rate": 0.00010058499730167296, - "loss": 3.1712, + "grad_norm": 0.7816826105117798, + "learning_rate": 0.00010057852131678359, + "loss": 3.1748, "step": 77250 }, { "epoch": 8.33423180592992, - "grad_norm": 0.9070252180099487, - "learning_rate": 0.00010026119805720452, - "loss": 3.1767, + "grad_norm": 0.830484926700592, + "learning_rate": 0.00010025472207231515, + "loss": 3.1788, "step": 77300 }, { "epoch": 8.339622641509434, - "grad_norm": 0.8134100437164307, - "learning_rate": 9.993739881273609e-05, - "loss": 3.1671, + "grad_norm": 0.8447725772857666, + "learning_rate": 9.993092282784673e-05, + "loss": 3.168, "step": 77350 }, { "epoch": 8.34501347708895, - "grad_norm": 0.8291438221931458, - "learning_rate": 9.961359956826767e-05, - "loss": 3.2002, + "grad_norm": 0.8344404101371765, + "learning_rate": 9.96071235833783e-05, + "loss": 3.2012, "step": 77400 }, { "epoch": 8.350404312668463, - "grad_norm": 0.8317015767097473, - "learning_rate": 9.928980032379923e-05, - "loss": 3.1749, + "grad_norm": 0.863575279712677, + "learning_rate": 9.928332433890986e-05, + "loss": 3.1774, "step": 77450 }, { "epoch": 8.355795148247978, - "grad_norm": 0.8489733338356018, - "learning_rate": 9.896600107933081e-05, - "loss": 3.1672, + "grad_norm": 0.8597997426986694, + "learning_rate": 9.895952509444144e-05, + "loss": 3.1703, "step": 77500 }, { "epoch": 8.361185983827493, - "grad_norm": 0.8064641952514648, - "learning_rate": 9.864220183486238e-05, - "loss": 3.1891, + "grad_norm": 0.8186684846878052, + "learning_rate": 9.863572584997302e-05, + "loss": 3.192, "step": 77550 }, { "epoch": 8.366576819407008, - "grad_norm": 0.8400290012359619, - "learning_rate": 9.831840259039394e-05, - "loss": 3.1807, + "grad_norm": 0.8455076217651367, + "learning_rate": 9.831192660550457e-05, + "loss": 3.1844, "step": 77600 }, { "epoch": 8.371967654986523, - "grad_norm": 0.8481228351593018, - "learning_rate": 9.799460334592552e-05, - "loss": 3.1785, + "grad_norm": 0.8197668790817261, + "learning_rate": 9.798812736103614e-05, + "loss": 3.1818, "step": 77650 }, { "epoch": 8.377358490566039, - "grad_norm": 0.8531959652900696, - "learning_rate": 9.76708041014571e-05, - "loss": 3.1811, + "grad_norm": 0.8690318465232849, + "learning_rate": 9.766432811656772e-05, + "loss": 3.1833, "step": 77700 }, { "epoch": 8.382749326145552, - "grad_norm": 0.8155908584594727, - "learning_rate": 9.734700485698865e-05, - "loss": 3.1734, + "grad_norm": 0.8059806227684021, + "learning_rate": 9.73405288720993e-05, + "loss": 3.176, "step": 77750 }, { "epoch": 8.388140161725067, - "grad_norm": 0.8197899460792542, - "learning_rate": 9.702320561252022e-05, - "loss": 3.1783, + "grad_norm": 0.7999092936515808, + "learning_rate": 9.701672962763086e-05, + "loss": 3.1828, "step": 77800 }, { "epoch": 8.393530997304582, - "grad_norm": 0.785799503326416, - "learning_rate": 9.66994063680518e-05, - "loss": 3.1647, + "grad_norm": 0.7501765489578247, + "learning_rate": 9.669293038316243e-05, + "loss": 3.1667, "step": 77850 }, { "epoch": 8.398921832884097, - "grad_norm": 0.8561741709709167, - "learning_rate": 9.637560712358338e-05, - "loss": 3.172, + "grad_norm": 0.7897477149963379, + "learning_rate": 9.6369131138694e-05, + "loss": 3.1748, "step": 77900 }, { "epoch": 8.404312668463612, - "grad_norm": 0.8725782036781311, - "learning_rate": 9.605180787911493e-05, - "loss": 3.1793, + "grad_norm": 0.824909508228302, + "learning_rate": 9.604533189422557e-05, + "loss": 3.1822, "step": 77950 }, { "epoch": 8.409703504043126, - "grad_norm": 0.8261856436729431, - "learning_rate": 9.572800863464651e-05, - "loss": 3.1889, + "grad_norm": 0.8260047435760498, + "learning_rate": 9.572153264975715e-05, + "loss": 3.1912, "step": 78000 }, { "epoch": 8.409703504043126, - "eval_accuracy": 0.3913926920719573, - "eval_loss": 3.326793670654297, - "eval_runtime": 184.8117, - "eval_samples_per_second": 97.456, - "eval_steps_per_second": 6.093, + "eval_accuracy": 0.3911025888614927, + "eval_loss": 3.328890085220337, + "eval_runtime": 184.8301, + "eval_samples_per_second": 97.446, + "eval_steps_per_second": 6.092, "step": 78000 }, { "epoch": 8.415094339622641, - "grad_norm": 0.8366762399673462, - "learning_rate": 9.540420939017809e-05, - "loss": 3.1671, + "grad_norm": 0.8345364332199097, + "learning_rate": 9.539773340528871e-05, + "loss": 3.1706, "step": 78050 }, { "epoch": 8.420485175202156, - "grad_norm": 0.8470886945724487, - "learning_rate": 9.508041014570966e-05, - "loss": 3.1645, + "grad_norm": 0.8107349872589111, + "learning_rate": 9.507393416082027e-05, + "loss": 3.1668, "step": 78100 }, { "epoch": 8.425876010781671, - "grad_norm": 0.8738471865653992, - "learning_rate": 9.475661090124121e-05, - "loss": 3.1871, + "grad_norm": 0.873961865901947, + "learning_rate": 9.475013491635185e-05, + "loss": 3.1897, "step": 78150 }, { "epoch": 8.431266846361186, - "grad_norm": 0.8640100955963135, - "learning_rate": 9.443281165677279e-05, - "loss": 3.1946, + "grad_norm": 0.8412361741065979, + "learning_rate": 9.442633567188343e-05, + "loss": 3.1988, "step": 78200 }, { "epoch": 8.436657681940702, - "grad_norm": 0.8384734392166138, - "learning_rate": 9.410901241230437e-05, - "loss": 3.1726, + "grad_norm": 0.8533317446708679, + "learning_rate": 9.410253642741498e-05, + "loss": 3.1762, "step": 78250 }, { "epoch": 8.442048517520215, - "grad_norm": 0.8679486513137817, - "learning_rate": 9.378521316783593e-05, - "loss": 3.1655, + "grad_norm": 0.8340447545051575, + "learning_rate": 9.377873718294656e-05, + "loss": 3.1693, "step": 78300 }, { "epoch": 8.44743935309973, - "grad_norm": 0.8307101130485535, - "learning_rate": 9.346141392336751e-05, - "loss": 3.1704, + "grad_norm": 0.8339968323707581, + "learning_rate": 9.345493793847814e-05, + "loss": 3.173, "step": 78350 }, { "epoch": 8.452830188679245, - "grad_norm": 0.8091768622398376, - "learning_rate": 9.313761467889907e-05, - "loss": 3.1647, + "grad_norm": 0.800408124923706, + "learning_rate": 9.313113869400971e-05, + "loss": 3.168, "step": 78400 }, { "epoch": 8.45822102425876, - "grad_norm": 0.8790152668952942, - "learning_rate": 9.281381543443064e-05, - "loss": 3.1761, + "grad_norm": 0.9167557954788208, + "learning_rate": 9.280733944954126e-05, + "loss": 3.1787, "step": 78450 }, { "epoch": 8.463611859838275, - "grad_norm": 0.8548306822776794, - "learning_rate": 9.249001618996222e-05, - "loss": 3.1806, + "grad_norm": 0.8427416086196899, + "learning_rate": 9.248354020507284e-05, + "loss": 3.1839, "step": 78500 }, { "epoch": 8.46900269541779, - "grad_norm": 0.8710213303565979, - "learning_rate": 9.21662169454938e-05, - "loss": 3.1752, + "grad_norm": 0.8728715181350708, + "learning_rate": 9.215974096060442e-05, + "loss": 3.1779, "step": 78550 }, { "epoch": 8.474393530997304, - "grad_norm": 0.8840611577033997, - "learning_rate": 9.184241770102534e-05, - "loss": 3.1864, + "grad_norm": 0.8693714141845703, + "learning_rate": 9.1835941716136e-05, + "loss": 3.1893, "step": 78600 }, { "epoch": 8.479784366576819, - "grad_norm": 0.8206093907356262, - "learning_rate": 9.151861845655692e-05, - "loss": 3.1722, + "grad_norm": 0.8074076771736145, + "learning_rate": 9.151214247166756e-05, + "loss": 3.1732, "step": 78650 }, { "epoch": 8.485175202156334, - "grad_norm": 0.8537209033966064, - "learning_rate": 9.11948192120885e-05, - "loss": 3.1831, + "grad_norm": 0.8128515481948853, + "learning_rate": 9.118834322719913e-05, + "loss": 3.1889, "step": 78700 }, { "epoch": 8.49056603773585, - "grad_norm": 0.8518295884132385, - "learning_rate": 9.087101996762008e-05, - "loss": 3.1831, + "grad_norm": 0.8658214807510376, + "learning_rate": 9.08645439827307e-05, + "loss": 3.187, "step": 78750 }, { "epoch": 8.495956873315365, - "grad_norm": 0.8618102073669434, - "learning_rate": 9.054722072315163e-05, - "loss": 3.193, + "grad_norm": 0.922846257686615, + "learning_rate": 9.054074473826227e-05, + "loss": 3.1961, "step": 78800 }, { "epoch": 8.501347708894878, - "grad_norm": 0.8804048299789429, - "learning_rate": 9.02234214786832e-05, - "loss": 3.1665, + "grad_norm": 0.7982033491134644, + "learning_rate": 9.021694549379385e-05, + "loss": 3.1698, "step": 78850 }, { "epoch": 8.506738544474393, - "grad_norm": 0.807671844959259, - "learning_rate": 8.989962223421478e-05, - "loss": 3.1756, + "grad_norm": 0.8339309692382812, + "learning_rate": 8.989314624932541e-05, + "loss": 3.1777, "step": 78900 }, { "epoch": 8.512129380053908, - "grad_norm": 0.8216286301612854, - "learning_rate": 8.957582298974635e-05, - "loss": 3.176, + "grad_norm": 0.7865361571311951, + "learning_rate": 8.956934700485697e-05, + "loss": 3.1804, "step": 78950 }, { "epoch": 8.517520215633423, - "grad_norm": 0.9073687791824341, - "learning_rate": 8.925202374527791e-05, - "loss": 3.1733, + "grad_norm": 0.8305343389511108, + "learning_rate": 8.924554776038855e-05, + "loss": 3.1771, "step": 79000 }, { "epoch": 8.517520215633423, - "eval_accuracy": 0.3913608567758277, - "eval_loss": 3.324925184249878, - "eval_runtime": 185.0203, - "eval_samples_per_second": 97.346, - "eval_steps_per_second": 6.086, + "eval_accuracy": 0.3911895111717443, + "eval_loss": 3.326295852661133, + "eval_runtime": 185.0767, + "eval_samples_per_second": 97.316, + "eval_steps_per_second": 6.084, "step": 79000 }, { "epoch": 8.522911051212938, - "grad_norm": 0.8593799471855164, - "learning_rate": 8.892822450080949e-05, - "loss": 3.1864, + "grad_norm": 0.8321008682250977, + "learning_rate": 8.892174851592013e-05, + "loss": 3.1894, "step": 79050 }, { "epoch": 8.528301886792454, - "grad_norm": 0.8459751605987549, - "learning_rate": 8.860442525634107e-05, - "loss": 3.1834, + "grad_norm": 0.8203441500663757, + "learning_rate": 8.859794927145168e-05, + "loss": 3.1856, "step": 79100 }, { "epoch": 8.533692722371967, - "grad_norm": 0.9012542963027954, - "learning_rate": 8.828062601187263e-05, - "loss": 3.1701, + "grad_norm": 0.8526754379272461, + "learning_rate": 8.827415002698326e-05, + "loss": 3.1741, "step": 79150 }, { "epoch": 8.539083557951482, - "grad_norm": 0.8935059309005737, - "learning_rate": 8.795682676740421e-05, - "loss": 3.1855, + "grad_norm": 0.8762194514274597, + "learning_rate": 8.795035078251483e-05, + "loss": 3.1888, "step": 79200 }, { "epoch": 8.544474393530997, - "grad_norm": 0.8865764141082764, - "learning_rate": 8.763950350782515e-05, - "loss": 3.1828, + "grad_norm": 0.822303831577301, + "learning_rate": 8.762655153804641e-05, + "loss": 3.1832, "step": 79250 }, { "epoch": 8.549865229110512, - "grad_norm": 0.8719243407249451, - "learning_rate": 8.731570426335671e-05, - "loss": 3.1741, + "grad_norm": 0.8480991125106812, + "learning_rate": 8.730275229357798e-05, + "loss": 3.1773, "step": 79300 }, { "epoch": 8.555256064690028, - "grad_norm": 0.8688287734985352, - "learning_rate": 8.699190501888828e-05, - "loss": 3.1992, + "grad_norm": 0.8900675177574158, + "learning_rate": 8.697895304910954e-05, + "loss": 3.2036, "step": 79350 }, { "epoch": 8.560646900269543, - "grad_norm": 0.8754032254219055, - "learning_rate": 8.666810577441985e-05, - "loss": 3.1734, + "grad_norm": 0.8703182339668274, + "learning_rate": 8.665515380464112e-05, + "loss": 3.1759, "step": 79400 }, { "epoch": 8.566037735849056, - "grad_norm": 0.8592960834503174, - "learning_rate": 8.634430652995142e-05, - "loss": 3.1913, + "grad_norm": 0.845158040523529, + "learning_rate": 8.633135456017268e-05, + "loss": 3.1934, "step": 79450 }, { "epoch": 8.571428571428571, - "grad_norm": 0.8773514628410339, - "learning_rate": 8.6020507285483e-05, - "loss": 3.1695, + "grad_norm": 0.9047236442565918, + "learning_rate": 8.600755531570426e-05, + "loss": 3.1729, "step": 79500 }, { "epoch": 8.576819407008086, - "grad_norm": 1.158023715019226, - "learning_rate": 8.569670804101456e-05, - "loss": 3.1684, + "grad_norm": 0.8393561244010925, + "learning_rate": 8.568375607123582e-05, + "loss": 3.1729, "step": 79550 }, { "epoch": 8.582210242587601, - "grad_norm": 0.8309387564659119, - "learning_rate": 8.537290879654612e-05, - "loss": 3.1909, + "grad_norm": 0.811484158039093, + "learning_rate": 8.53599568267674e-05, + "loss": 3.192, "step": 79600 }, { "epoch": 8.587601078167117, - "grad_norm": 0.8641684055328369, - "learning_rate": 8.50491095520777e-05, - "loss": 3.1692, + "grad_norm": 0.8052199482917786, + "learning_rate": 8.503615758229897e-05, + "loss": 3.1714, "step": 79650 }, { "epoch": 8.59299191374663, - "grad_norm": 0.9096925854682922, - "learning_rate": 8.472531030760928e-05, - "loss": 3.1758, + "grad_norm": 0.9134714007377625, + "learning_rate": 8.471235833783054e-05, + "loss": 3.1793, "step": 79700 }, { "epoch": 8.598382749326145, - "grad_norm": 0.8475040793418884, - "learning_rate": 8.440151106314086e-05, - "loss": 3.176, + "grad_norm": 0.8519928455352783, + "learning_rate": 8.438855909336211e-05, + "loss": 3.1796, "step": 79750 }, { "epoch": 8.60377358490566, - "grad_norm": 0.8620229959487915, - "learning_rate": 8.407771181867241e-05, - "loss": 3.1735, + "grad_norm": 0.842822790145874, + "learning_rate": 8.406475984889367e-05, + "loss": 3.1762, "step": 79800 }, { "epoch": 8.609164420485175, - "grad_norm": 0.8247419595718384, - "learning_rate": 8.375391257420398e-05, - "loss": 3.172, + "grad_norm": 0.7788348197937012, + "learning_rate": 8.374096060442525e-05, + "loss": 3.1748, "step": 79850 }, { "epoch": 8.61455525606469, - "grad_norm": 0.8362515568733215, - "learning_rate": 8.343011332973556e-05, - "loss": 3.187, + "grad_norm": 0.8229261040687561, + "learning_rate": 8.341716135995683e-05, + "loss": 3.1902, "step": 79900 }, { "epoch": 8.619946091644206, - "grad_norm": 0.8225242495536804, - "learning_rate": 8.310631408526714e-05, - "loss": 3.1912, + "grad_norm": 0.8361340165138245, + "learning_rate": 8.309336211548838e-05, + "loss": 3.1947, "step": 79950 }, { "epoch": 8.625336927223719, - "grad_norm": 0.7935653328895569, - "learning_rate": 8.278251484079869e-05, - "loss": 3.1708, + "grad_norm": 0.7869669795036316, + "learning_rate": 8.276956287101996e-05, + "loss": 3.1761, "step": 80000 }, { "epoch": 8.625336927223719, - "eval_accuracy": 0.3918086153265112, - "eval_loss": 3.321773052215576, - "eval_runtime": 185.1286, - "eval_samples_per_second": 97.289, - "eval_steps_per_second": 6.082, + "eval_accuracy": 0.3917423370649443, + "eval_loss": 3.3230607509613037, + "eval_runtime": 185.1693, + "eval_samples_per_second": 97.268, + "eval_steps_per_second": 6.081, "step": 80000 }, { "epoch": 8.630727762803234, - "grad_norm": 0.8339737057685852, - "learning_rate": 8.245871559633027e-05, - "loss": 3.1691, + "grad_norm": 0.8103577494621277, + "learning_rate": 8.244576362655153e-05, + "loss": 3.1737, "step": 80050 }, { "epoch": 8.63611859838275, - "grad_norm": 0.8542758822441101, - "learning_rate": 8.213491635186185e-05, - "loss": 3.1696, + "grad_norm": 0.8197165131568909, + "learning_rate": 8.212196438208311e-05, + "loss": 3.1753, "step": 80100 }, { "epoch": 8.641509433962264, - "grad_norm": 0.8964957594871521, - "learning_rate": 8.181111710739341e-05, - "loss": 3.1944, + "grad_norm": 0.9167805910110474, + "learning_rate": 8.179816513761467e-05, + "loss": 3.1966, "step": 80150 }, { "epoch": 8.64690026954178, - "grad_norm": 0.8350257277488708, - "learning_rate": 8.148731786292497e-05, - "loss": 3.1732, + "grad_norm": 0.8052075505256653, + "learning_rate": 8.147436589314624e-05, + "loss": 3.178, "step": 80200 }, { "epoch": 8.652291105121293, - "grad_norm": 0.8604266047477722, - "learning_rate": 8.116351861845655e-05, - "loss": 3.1959, + "grad_norm": 0.8420871496200562, + "learning_rate": 8.115056664867782e-05, + "loss": 3.1983, "step": 80250 }, { "epoch": 8.657681940700808, - "grad_norm": 0.8317278027534485, - "learning_rate": 8.083971937398812e-05, - "loss": 3.1793, + "grad_norm": 0.8218672275543213, + "learning_rate": 8.082676740420938e-05, + "loss": 3.1813, "step": 80300 }, { "epoch": 8.663072776280323, - "grad_norm": 0.8608930706977844, - "learning_rate": 8.05159201295197e-05, - "loss": 3.1823, + "grad_norm": 0.8376367092132568, + "learning_rate": 8.050296815974096e-05, + "loss": 3.1852, "step": 80350 }, { "epoch": 8.668463611859838, - "grad_norm": 0.885753333568573, - "learning_rate": 8.019212088505126e-05, - "loss": 3.1764, + "grad_norm": 0.8482858538627625, + "learning_rate": 8.017916891527252e-05, + "loss": 3.1787, "step": 80400 }, { "epoch": 8.673854447439354, - "grad_norm": 0.8569355010986328, - "learning_rate": 7.986832164058282e-05, - "loss": 3.1739, + "grad_norm": 0.8449885249137878, + "learning_rate": 7.985536967080409e-05, + "loss": 3.1745, "step": 80450 }, { "epoch": 8.679245283018869, - "grad_norm": 0.8573786616325378, - "learning_rate": 7.95445223961144e-05, - "loss": 3.1716, + "grad_norm": 0.8417103290557861, + "learning_rate": 7.953157042633566e-05, + "loss": 3.1751, "step": 80500 }, { "epoch": 8.684636118598382, - "grad_norm": 0.948824405670166, - "learning_rate": 7.922072315164598e-05, - "loss": 3.1796, + "grad_norm": 0.9321690201759338, + "learning_rate": 7.920777118186724e-05, + "loss": 3.1817, "step": 80550 }, { "epoch": 8.690026954177897, - "grad_norm": 0.8927190899848938, - "learning_rate": 7.889692390717755e-05, - "loss": 3.1731, + "grad_norm": 0.8538167476654053, + "learning_rate": 7.888397193739879e-05, + "loss": 3.1763, "step": 80600 }, { "epoch": 8.695417789757412, - "grad_norm": 0.876378059387207, - "learning_rate": 7.85731246627091e-05, - "loss": 3.1849, + "grad_norm": 0.8338154554367065, + "learning_rate": 7.856017269293037e-05, + "loss": 3.1875, "step": 80650 }, { "epoch": 8.700808625336927, - "grad_norm": 0.8848509192466736, - "learning_rate": 7.824932541824068e-05, - "loss": 3.1881, + "grad_norm": 0.8730599880218506, + "learning_rate": 7.823637344846195e-05, + "loss": 3.1911, "step": 80700 }, { "epoch": 8.706199460916443, - "grad_norm": 0.8308282494544983, - "learning_rate": 7.792552617377226e-05, - "loss": 3.1794, + "grad_norm": 0.8520699739456177, + "learning_rate": 7.791257420399353e-05, + "loss": 3.1807, "step": 80750 }, { "epoch": 8.711590296495956, - "grad_norm": 0.8364676237106323, - "learning_rate": 7.760172692930382e-05, - "loss": 3.151, + "grad_norm": 0.8210703730583191, + "learning_rate": 7.758877495952508e-05, + "loss": 3.1525, "step": 80800 }, { "epoch": 8.716981132075471, - "grad_norm": 0.8687278032302856, - "learning_rate": 7.727792768483539e-05, - "loss": 3.1811, + "grad_norm": 0.8489146828651428, + "learning_rate": 7.726497571505665e-05, + "loss": 3.1826, "step": 80850 }, { "epoch": 8.722371967654986, - "grad_norm": 0.8405010104179382, - "learning_rate": 7.695412844036697e-05, - "loss": 3.1711, + "grad_norm": 0.816367506980896, + "learning_rate": 7.694117647058823e-05, + "loss": 3.1729, "step": 80900 }, { "epoch": 8.727762803234501, - "grad_norm": 0.8751876950263977, - "learning_rate": 7.663032919589854e-05, - "loss": 3.1721, + "grad_norm": 0.8610575199127197, + "learning_rate": 7.661737722611981e-05, + "loss": 3.1732, "step": 80950 }, { "epoch": 8.733153638814017, - "grad_norm": 0.8680465817451477, - "learning_rate": 7.630652995143011e-05, - "loss": 3.1854, + "grad_norm": 0.8475139141082764, + "learning_rate": 7.629357798165137e-05, + "loss": 3.1889, "step": 81000 }, { "epoch": 8.733153638814017, - "eval_accuracy": 0.3922179107549083, - "eval_loss": 3.3174245357513428, - "eval_runtime": 185.0813, - "eval_samples_per_second": 97.314, - "eval_steps_per_second": 6.084, + "eval_accuracy": 0.3920232047799448, + "eval_loss": 3.319646120071411, + "eval_runtime": 185.4331, + "eval_samples_per_second": 97.129, + "eval_steps_per_second": 6.072, "step": 81000 }, { "epoch": 8.738544474393532, - "grad_norm": 0.8618085384368896, - "learning_rate": 7.598273070696167e-05, - "loss": 3.194, + "grad_norm": 0.8521168231964111, + "learning_rate": 7.596977873718294e-05, + "loss": 3.1971, "step": 81050 }, { "epoch": 8.743935309973045, - "grad_norm": 0.8313191533088684, - "learning_rate": 7.565893146249325e-05, - "loss": 3.1953, + "grad_norm": 0.8344953060150146, + "learning_rate": 7.565245547760389e-05, + "loss": 3.1993, "step": 81100 }, { "epoch": 8.74932614555256, - "grad_norm": 0.875768780708313, - "learning_rate": 7.533513221802481e-05, - "loss": 3.1711, + "grad_norm": 0.8405035138130188, + "learning_rate": 7.532865623313544e-05, + "loss": 3.1744, "step": 81150 }, { "epoch": 8.754716981132075, - "grad_norm": 0.8554214239120483, + "grad_norm": 0.8584547638893127, "learning_rate": 7.501133297355639e-05, - "loss": 3.1901, + "loss": 3.1933, "step": 81200 }, { "epoch": 8.76010781671159, - "grad_norm": 0.8244317173957825, + "grad_norm": 0.8843427896499634, "learning_rate": 7.468753372908796e-05, - "loss": 3.1755, + "loss": 3.1785, "step": 81250 }, { "epoch": 8.765498652291106, - "grad_norm": 0.8972374200820923, + "grad_norm": 0.9259549975395203, "learning_rate": 7.436373448461953e-05, - "loss": 3.162, + "loss": 3.1642, "step": 81300 }, { "epoch": 8.77088948787062, - "grad_norm": 0.8698784112930298, + "grad_norm": 0.8856974244117737, "learning_rate": 7.40399352401511e-05, - "loss": 3.1873, + "loss": 3.1904, "step": 81350 }, { "epoch": 8.776280323450134, - "grad_norm": 0.8932819962501526, + "grad_norm": 0.8694860935211182, "learning_rate": 7.371613599568268e-05, - "loss": 3.2003, + "loss": 3.2025, "step": 81400 }, { "epoch": 8.78167115902965, - "grad_norm": 0.8630633354187012, + "grad_norm": 0.8153300285339355, "learning_rate": 7.339233675121424e-05, - "loss": 3.1777, + "loss": 3.1802, "step": 81450 }, { "epoch": 8.787061994609164, - "grad_norm": 0.8544013500213623, + "grad_norm": 0.8006735444068909, "learning_rate": 7.306853750674582e-05, - "loss": 3.1815, + "loss": 3.1858, "step": 81500 }, { "epoch": 8.79245283018868, - "grad_norm": 0.8761833906173706, + "grad_norm": 0.873319685459137, "learning_rate": 7.274473826227738e-05, - "loss": 3.182, + "loss": 3.1848, "step": 81550 }, { "epoch": 8.797843665768195, - "grad_norm": 0.830359935760498, + "grad_norm": 0.8179894089698792, "learning_rate": 7.242093901780896e-05, - "loss": 3.157, + "loss": 3.1606, "step": 81600 }, { "epoch": 8.80323450134771, - "grad_norm": 0.8646902441978455, + "grad_norm": 1.1829814910888672, "learning_rate": 7.209713977334052e-05, - "loss": 3.1892, + "loss": 3.1938, "step": 81650 }, { "epoch": 8.808625336927223, - "grad_norm": 0.8650271892547607, + "grad_norm": 0.8417500257492065, "learning_rate": 7.17733405288721e-05, - "loss": 3.1926, + "loss": 3.1965, "step": 81700 }, { "epoch": 8.814016172506738, - "grad_norm": 0.8778845071792603, + "grad_norm": 0.849403977394104, "learning_rate": 7.144954128440366e-05, - "loss": 3.182, + "loss": 3.1847, "step": 81750 }, { "epoch": 8.819407008086253, - "grad_norm": 0.852878749370575, + "grad_norm": 0.8199416399002075, "learning_rate": 7.112574203993523e-05, - "loss": 3.1683, + "loss": 3.1699, "step": 81800 }, { "epoch": 8.824797843665769, - "grad_norm": 0.8865038752555847, + "grad_norm": 0.8438077569007874, "learning_rate": 7.08019427954668e-05, - "loss": 3.1871, + "loss": 3.19, "step": 81850 }, { "epoch": 8.830188679245284, - "grad_norm": 0.9244481921195984, + "grad_norm": 0.8480911254882812, "learning_rate": 7.047814355099837e-05, - "loss": 3.186, + "loss": 3.1902, "step": 81900 }, { "epoch": 8.835579514824797, - "grad_norm": 0.9070996642112732, + "grad_norm": 0.8652701377868652, "learning_rate": 7.015434430652993e-05, - "loss": 3.174, + "loss": 3.1776, "step": 81950 }, { "epoch": 8.840970350404312, - "grad_norm": 0.8721534013748169, + "grad_norm": 0.8693273067474365, "learning_rate": 6.983054506206151e-05, - "loss": 3.1885, + "loss": 3.1946, "step": 82000 }, { "epoch": 8.840970350404312, - "eval_accuracy": 0.39263633302588186, - "eval_loss": 3.313530445098877, - "eval_runtime": 185.0624, - "eval_samples_per_second": 97.324, - "eval_steps_per_second": 6.084, + "eval_accuracy": 0.3924052283335005, + "eval_loss": 3.3151607513427734, + "eval_runtime": 184.9732, + "eval_samples_per_second": 97.371, + "eval_steps_per_second": 6.087, "step": 82000 }, { "epoch": 8.846361185983827, - "grad_norm": 0.872122049331665, + "grad_norm": 0.8377047777175903, "learning_rate": 6.950674581759309e-05, - "loss": 3.1817, + "loss": 3.1836, "step": 82050 }, { "epoch": 8.851752021563343, - "grad_norm": 0.8270566463470459, + "grad_norm": 0.8401516079902649, "learning_rate": 6.918294657312465e-05, - "loss": 3.1714, + "loss": 3.1738, "step": 82100 }, { "epoch": 8.857142857142858, - "grad_norm": 0.8941061496734619, + "grad_norm": 0.8585867285728455, "learning_rate": 6.885914732865623e-05, - "loss": 3.1884, + "loss": 3.1916, "step": 82150 }, { "epoch": 8.862533692722373, - "grad_norm": 0.8830512166023254, + "grad_norm": 0.8396210074424744, "learning_rate": 6.85353480841878e-05, - "loss": 3.1766, + "loss": 3.1801, "step": 82200 }, { "epoch": 8.867924528301886, - "grad_norm": 0.8618978261947632, + "grad_norm": 0.8280928730964661, "learning_rate": 6.821154883971937e-05, - "loss": 3.1935, + "loss": 3.1956, "step": 82250 }, { "epoch": 8.873315363881401, - "grad_norm": 0.8829900026321411, + "grad_norm": 0.906252920627594, "learning_rate": 6.788774959525094e-05, - "loss": 3.1805, + "loss": 3.1818, "step": 82300 }, { "epoch": 8.878706199460916, - "grad_norm": 0.8683678507804871, + "grad_norm": 0.8322795629501343, "learning_rate": 6.756395035078252e-05, - "loss": 3.1852, + "loss": 3.1908, "step": 82350 }, { "epoch": 8.884097035040432, - "grad_norm": 0.8829304575920105, + "grad_norm": 0.856133759021759, "learning_rate": 6.724015110631408e-05, - "loss": 3.1773, + "loss": 3.1786, "step": 82400 }, { "epoch": 8.889487870619947, - "grad_norm": 0.8580146431922913, + "grad_norm": 0.8618354201316833, "learning_rate": 6.691635186184566e-05, - "loss": 3.1882, + "loss": 3.1911, "step": 82450 }, { "epoch": 8.89487870619946, - "grad_norm": 0.8446676135063171, + "grad_norm": 0.8436751365661621, "learning_rate": 6.659255261737722e-05, - "loss": 3.1776, + "loss": 3.1788, "step": 82500 }, { "epoch": 8.900269541778975, - "grad_norm": 0.8247393369674683, + "grad_norm": 0.8414259552955627, "learning_rate": 6.626875337290879e-05, - "loss": 3.1661, + "loss": 3.1687, "step": 82550 }, { "epoch": 8.90566037735849, - "grad_norm": 0.8302431702613831, + "grad_norm": 0.8588948249816895, "learning_rate": 6.594495412844036e-05, - "loss": 3.1837, + "loss": 3.1871, "step": 82600 }, { "epoch": 8.911051212938006, - "grad_norm": 0.8455914258956909, + "grad_norm": 0.8397443890571594, "learning_rate": 6.562115488397193e-05, - "loss": 3.1756, + "loss": 3.181, "step": 82650 }, { "epoch": 8.91644204851752, - "grad_norm": 0.875441312789917, + "grad_norm": 0.8405500650405884, "learning_rate": 6.52973556395035e-05, - "loss": 3.1938, + "loss": 3.1951, "step": 82700 }, { "epoch": 8.921832884097036, - "grad_norm": 0.8480143547058105, + "grad_norm": 0.8125476241111755, "learning_rate": 6.497355639503507e-05, - "loss": 3.1679, + "loss": 3.1696, "step": 82750 }, { "epoch": 8.92722371967655, - "grad_norm": 0.8535221815109253, + "grad_norm": 0.8378741145133972, "learning_rate": 6.464975715056663e-05, - "loss": 3.1713, + "loss": 3.1755, "step": 82800 }, { "epoch": 8.932614555256064, - "grad_norm": 0.8422010540962219, + "grad_norm": 0.8491596579551697, "learning_rate": 6.432595790609821e-05, - "loss": 3.1738, + "loss": 3.1784, "step": 82850 }, { "epoch": 8.93800539083558, - "grad_norm": 0.8463007211685181, + "grad_norm": 0.832361102104187, "learning_rate": 6.400215866162979e-05, - "loss": 3.1733, + "loss": 3.175, "step": 82900 }, { "epoch": 8.943396226415095, - "grad_norm": 0.8810790777206421, + "grad_norm": 0.8865790963172913, "learning_rate": 6.367835941716135e-05, - "loss": 3.1733, + "loss": 3.1758, "step": 82950 }, { "epoch": 8.94878706199461, - "grad_norm": 0.8690757751464844, + "grad_norm": 0.8258707523345947, "learning_rate": 6.335456017269293e-05, - "loss": 3.1691, + "loss": 3.1729, "step": 83000 }, { "epoch": 8.94878706199461, - "eval_accuracy": 0.3927895335977003, - "eval_loss": 3.312455415725708, - "eval_runtime": 184.9722, - "eval_samples_per_second": 97.371, - "eval_steps_per_second": 6.087, + "eval_accuracy": 0.3925592981284214, + "eval_loss": 3.314058303833008, + "eval_runtime": 185.21, + "eval_samples_per_second": 97.246, + "eval_steps_per_second": 6.08, "step": 83000 }, { "epoch": 8.954177897574123, - "grad_norm": 0.8817049264907837, + "grad_norm": 0.8663061857223511, "learning_rate": 6.30307609282245e-05, - "loss": 3.1657, + "loss": 3.1686, "step": 83050 }, { "epoch": 8.959568733153638, - "grad_norm": 0.8801669478416443, + "grad_norm": 0.8376678824424744, "learning_rate": 6.270696168375607e-05, - "loss": 3.1741, + "loss": 3.1761, "step": 83100 }, { "epoch": 8.964959568733153, - "grad_norm": 0.888874351978302, + "grad_norm": 0.8633261322975159, "learning_rate": 6.238316243928764e-05, - "loss": 3.2002, + "loss": 3.2033, "step": 83150 }, { "epoch": 8.970350404312669, - "grad_norm": 0.8430107235908508, + "grad_norm": 0.9000486731529236, "learning_rate": 6.205936319481921e-05, - "loss": 3.1797, + "loss": 3.1812, "step": 83200 }, { "epoch": 8.975741239892184, - "grad_norm": 0.8379260301589966, + "grad_norm": 0.8094392418861389, "learning_rate": 6.173556395035078e-05, - "loss": 3.1659, + "loss": 3.1674, "step": 83250 }, { "epoch": 8.981132075471699, - "grad_norm": 0.8534399271011353, - "learning_rate": 6.141824069077172e-05, - "loss": 3.1762, + "grad_norm": 0.8410497903823853, + "learning_rate": 6.141176470588236e-05, + "loss": 3.1784, "step": 83300 }, { "epoch": 8.986522911051212, - "grad_norm": 0.8653827905654907, - "learning_rate": 6.109444144630328e-05, - "loss": 3.1804, + "grad_norm": 0.8640261292457581, + "learning_rate": 6.108796546141392e-05, + "loss": 3.1831, "step": 83350 }, { "epoch": 8.991913746630727, - "grad_norm": 0.8666049242019653, - "learning_rate": 6.077064220183486e-05, - "loss": 3.1691, + "grad_norm": 0.8512471318244934, + "learning_rate": 6.076416621694549e-05, + "loss": 3.1697, "step": 83400 }, { "epoch": 8.997304582210242, - "grad_norm": 0.9309250712394714, - "learning_rate": 6.044684295736643e-05, - "loss": 3.1926, + "grad_norm": 0.9322320818901062, + "learning_rate": 6.0440366972477055e-05, + "loss": 3.1964, "step": 83450 }, { "epoch": 9.002695417789758, - "grad_norm": 0.8734882473945618, - "learning_rate": 6.0123043712898e-05, - "loss": 3.1226, + "grad_norm": 0.8278214335441589, + "learning_rate": 6.011656772800863e-05, + "loss": 3.1252, "step": 83500 }, { "epoch": 9.008086253369273, - "grad_norm": 0.8495007157325745, - "learning_rate": 5.979924446842957e-05, - "loss": 3.1172, + "grad_norm": 0.791012167930603, + "learning_rate": 5.9792768483540197e-05, + "loss": 3.1193, "step": 83550 }, { "epoch": 9.013477088948788, - "grad_norm": 0.8585401177406311, - "learning_rate": 5.9475445223961135e-05, - "loss": 3.1339, + "grad_norm": 0.8340629935264587, + "learning_rate": 5.946896923907177e-05, + "loss": 3.1389, "step": 83600 }, { "epoch": 9.018867924528301, - "grad_norm": 0.8981791734695435, - "learning_rate": 5.915164597949271e-05, - "loss": 3.1283, + "grad_norm": 0.8602026104927063, + "learning_rate": 5.9145169994603345e-05, + "loss": 3.1343, "step": 83650 }, { "epoch": 9.024258760107816, - "grad_norm": 0.8400082588195801, - "learning_rate": 5.882784673502428e-05, - "loss": 3.113, + "grad_norm": 0.8203555345535278, + "learning_rate": 5.882137075013491e-05, + "loss": 3.116, "step": 83700 }, { "epoch": 9.029649595687331, - "grad_norm": 0.8486177921295166, - "learning_rate": 5.8504047490555855e-05, - "loss": 3.1105, + "grad_norm": 0.8314825296401978, + "learning_rate": 5.849757150566649e-05, + "loss": 3.1127, "step": 83750 }, { "epoch": 9.035040431266847, - "grad_norm": 0.8828721642494202, - "learning_rate": 5.818024824608742e-05, - "loss": 3.114, + "grad_norm": 0.8461922407150269, + "learning_rate": 5.817377226119805e-05, + "loss": 3.1168, "step": 83800 }, { "epoch": 9.040431266846362, - "grad_norm": 0.8872936964035034, - "learning_rate": 5.7856449001618996e-05, - "loss": 3.1002, + "grad_norm": 0.8548511266708374, + "learning_rate": 5.784997301672963e-05, + "loss": 3.1039, "step": 83850 }, { "epoch": 9.045822102425875, - "grad_norm": 0.8819625377655029, - "learning_rate": 5.753264975715056e-05, - "loss": 3.1178, + "grad_norm": 0.8729279041290283, + "learning_rate": 5.752617377226119e-05, + "loss": 3.1212, "step": 83900 }, { "epoch": 9.05121293800539, - "grad_norm": 0.8307718634605408, - "learning_rate": 5.720885051268213e-05, - "loss": 3.1203, + "grad_norm": 0.84293532371521, + "learning_rate": 5.7202374527792764e-05, + "loss": 3.123, "step": 83950 }, { "epoch": 9.056603773584905, - "grad_norm": 0.8690371513366699, - "learning_rate": 5.68850512682137e-05, - "loss": 3.104, + "grad_norm": 0.8201664686203003, + "learning_rate": 5.6878575283324335e-05, + "loss": 3.1054, "step": 84000 }, { "epoch": 9.056603773584905, - "eval_accuracy": 0.3928119160925901, - "eval_loss": 3.3145651817321777, - "eval_runtime": 185.1326, - "eval_samples_per_second": 97.287, - "eval_steps_per_second": 6.082, + "eval_accuracy": 0.3924269589110634, + "eval_loss": 3.3185200691223145, + "eval_runtime": 185.2088, + "eval_samples_per_second": 97.247, + "eval_steps_per_second": 6.08, "step": 84000 }, { "epoch": 9.06199460916442, - "grad_norm": 0.8724967241287231, - "learning_rate": 5.656125202374527e-05, - "loss": 3.1179, + "grad_norm": 0.8599904775619507, + "learning_rate": 5.6554776038855905e-05, + "loss": 3.1192, "step": 84050 }, { "epoch": 9.067385444743936, - "grad_norm": 0.8930646777153015, - "learning_rate": 5.623745277927684e-05, - "loss": 3.1429, + "grad_norm": 0.9081140756607056, + "learning_rate": 5.623097679438747e-05, + "loss": 3.1465, "step": 84100 }, { "epoch": 9.07277628032345, - "grad_norm": 0.8653122782707214, - "learning_rate": 5.5913653534808415e-05, - "loss": 3.1217, + "grad_norm": 0.8966930508613586, + "learning_rate": 5.590717754991905e-05, + "loss": 3.1251, "step": 84150 }, { "epoch": 9.078167115902964, - "grad_norm": 0.8568156957626343, - "learning_rate": 5.558985429033998e-05, - "loss": 3.1379, + "grad_norm": 0.8360949754714966, + "learning_rate": 5.558337830545061e-05, + "loss": 3.1401, "step": 84200 }, { "epoch": 9.08355795148248, - "grad_norm": 0.8342962265014648, - "learning_rate": 5.526605504587156e-05, - "loss": 3.1325, + "grad_norm": 0.8535497784614563, + "learning_rate": 5.525957906098219e-05, + "loss": 3.1357, "step": 84250 }, { "epoch": 9.088948787061994, - "grad_norm": 0.8761492371559143, - "learning_rate": 5.494225580140313e-05, - "loss": 3.124, + "grad_norm": 0.8542355298995972, + "learning_rate": 5.493577981651375e-05, + "loss": 3.1276, "step": 84300 }, { "epoch": 9.09433962264151, - "grad_norm": 0.8720303773880005, - "learning_rate": 5.461845655693469e-05, - "loss": 3.1166, + "grad_norm": 0.8455662727355957, + "learning_rate": 5.4611980572045324e-05, + "loss": 3.1184, "step": 84350 }, { "epoch": 9.099730458221025, - "grad_norm": 0.9339317083358765, - "learning_rate": 5.429465731246627e-05, - "loss": 3.1074, + "grad_norm": 0.9406881928443909, + "learning_rate": 5.4288181327576895e-05, + "loss": 3.109, "step": 84400 }, { "epoch": 9.10512129380054, - "grad_norm": 0.877099871635437, + "grad_norm": 0.8666425943374634, "learning_rate": 5.3970858067997833e-05, - "loss": 3.1322, + "loss": 3.1374, "step": 84450 }, { "epoch": 9.110512129380053, - "grad_norm": 0.8813628554344177, - "learning_rate": 5.364705882352941e-05, - "loss": 3.1202, + "grad_norm": 0.859734296798706, + "learning_rate": 5.365353480841878e-05, + "loss": 3.1224, "step": 84500 }, { "epoch": 9.115902964959568, - "grad_norm": 0.8760769963264465, - "learning_rate": 5.3323259579060975e-05, - "loss": 3.1273, + "grad_norm": 0.8375653624534607, + "learning_rate": 5.332973556395034e-05, + "loss": 3.1319, "step": 84550 }, { "epoch": 9.121293800539084, - "grad_norm": 0.8799816370010376, - "learning_rate": 5.299946033459255e-05, - "loss": 3.1158, + "grad_norm": 0.8139840960502625, + "learning_rate": 5.300593631948192e-05, + "loss": 3.119, "step": 84600 }, { "epoch": 9.126684636118599, - "grad_norm": 0.8845181465148926, - "learning_rate": 5.267566109012412e-05, - "loss": 3.1049, + "grad_norm": 0.8541390895843506, + "learning_rate": 5.2682137075013485e-05, + "loss": 3.1071, "step": 84650 }, { "epoch": 9.132075471698114, - "grad_norm": 0.8808998465538025, - "learning_rate": 5.235186184565569e-05, - "loss": 3.1181, + "grad_norm": 0.8539107441902161, + "learning_rate": 5.2358337830545056e-05, + "loss": 3.1217, "step": 84700 }, { "epoch": 9.137466307277627, - "grad_norm": 0.9242299795150757, - "learning_rate": 5.202806260118726e-05, - "loss": 3.121, + "grad_norm": 0.900639533996582, + "learning_rate": 5.2034538586076626e-05, + "loss": 3.123, "step": 84750 }, { "epoch": 9.142857142857142, - "grad_norm": 0.9068589210510254, - "learning_rate": 5.170426335671883e-05, - "loss": 3.1207, + "grad_norm": 0.8555846214294434, + "learning_rate": 5.17107393416082e-05, + "loss": 3.1225, "step": 84800 }, { "epoch": 9.148247978436657, - "grad_norm": 0.8964105248451233, - "learning_rate": 5.1380464112250394e-05, - "loss": 3.1314, + "grad_norm": 0.8566033840179443, + "learning_rate": 5.1386940097139775e-05, + "loss": 3.1336, "step": 84850 }, { "epoch": 9.153638814016173, - "grad_norm": 0.8806259036064148, - "learning_rate": 5.105666486778197e-05, - "loss": 3.1387, + "grad_norm": 0.8521566987037659, + "learning_rate": 5.106314085267134e-05, + "loss": 3.1411, "step": 84900 }, { "epoch": 9.159029649595688, - "grad_norm": 0.8335862755775452, - "learning_rate": 5.0732865623313536e-05, - "loss": 3.1257, + "grad_norm": 0.8052482604980469, + "learning_rate": 5.073934160820291e-05, + "loss": 3.1304, "step": 84950 }, { "epoch": 9.164420485175203, - "grad_norm": 0.9166033864021301, - "learning_rate": 5.040906637884511e-05, - "loss": 3.1398, + "grad_norm": 0.873296856880188, + "learning_rate": 5.041554236373448e-05, + "loss": 3.1447, "step": 85000 }, { "epoch": 9.164420485175203, - "eval_accuracy": 0.3929139411542479, - "eval_loss": 3.3140039443969727, - "eval_runtime": 185.025, - "eval_samples_per_second": 97.344, - "eval_steps_per_second": 6.086, + "eval_accuracy": 0.39260960441547954, + "eval_loss": 3.3156075477600098, + "eval_runtime": 185.3438, + "eval_samples_per_second": 97.176, + "eval_steps_per_second": 6.075, "step": 85000 }, { "epoch": 9.169811320754716, - "grad_norm": 0.8852344751358032, - "learning_rate": 5.0085267134376684e-05, - "loss": 3.1307, + "grad_norm": 0.8564801216125488, + "learning_rate": 5.009174311926605e-05, + "loss": 3.1333, "step": 85050 }, { "epoch": 9.175202156334231, - "grad_norm": 0.8456209301948547, + "grad_norm": 0.8553125858306885, "learning_rate": 4.976794387479762e-05, - "loss": 3.1277, + "loss": 3.133, "step": 85100 }, { "epoch": 9.180592991913747, - "grad_norm": 0.8467517495155334, + "grad_norm": 0.8583039045333862, "learning_rate": 4.9444144630329194e-05, - "loss": 3.137, + "loss": 3.1389, "step": 85150 }, { "epoch": 9.185983827493262, - "grad_norm": 0.905830442905426, + "grad_norm": 0.9124622941017151, "learning_rate": 4.912034538586076e-05, - "loss": 3.1382, + "loss": 3.1409, "step": 85200 }, { "epoch": 9.191374663072777, - "grad_norm": 0.9719045162200928, + "grad_norm": 0.8893402218818665, "learning_rate": 4.8796546141392335e-05, - "loss": 3.1128, + "loss": 3.1163, "step": 85250 }, { "epoch": 9.19676549865229, - "grad_norm": 0.9098581075668335, + "grad_norm": 0.8861328959465027, "learning_rate": 4.84727468969239e-05, - "loss": 3.1232, + "loss": 3.1273, "step": 85300 }, { "epoch": 9.202156334231805, - "grad_norm": 0.8981218338012695, + "grad_norm": 0.8711820244789124, "learning_rate": 4.814894765245548e-05, - "loss": 3.1183, + "loss": 3.1204, "step": 85350 }, { "epoch": 9.20754716981132, - "grad_norm": 0.8904754519462585, + "grad_norm": 0.880292534828186, "learning_rate": 4.782514840798704e-05, - "loss": 3.1292, + "loss": 3.1304, "step": 85400 }, { "epoch": 9.212938005390836, - "grad_norm": 0.88540118932724, + "grad_norm": 0.8324595093727112, "learning_rate": 4.750134916351861e-05, - "loss": 3.1313, + "loss": 3.1335, "step": 85450 }, { "epoch": 9.21832884097035, - "grad_norm": 0.8722923994064331, + "grad_norm": 0.8473580479621887, "learning_rate": 4.717754991905018e-05, - "loss": 3.1197, + "loss": 3.1238, "step": 85500 }, { "epoch": 9.223719676549866, - "grad_norm": 0.913180410861969, + "grad_norm": 0.9067453742027283, "learning_rate": 4.6853750674581754e-05, - "loss": 3.107, + "loss": 3.1113, "step": 85550 }, { "epoch": 9.22911051212938, - "grad_norm": 0.8536023497581482, + "grad_norm": 0.8589571118354797, "learning_rate": 4.652995143011333e-05, - "loss": 3.115, + "loss": 3.1175, "step": 85600 }, { "epoch": 9.234501347708894, - "grad_norm": 0.8618268966674805, + "grad_norm": 0.8338851928710938, "learning_rate": 4.6206152185644896e-05, - "loss": 3.1302, + "loss": 3.133, "step": 85650 }, { "epoch": 9.23989218328841, - "grad_norm": 0.8930416703224182, + "grad_norm": 0.8637981414794922, "learning_rate": 4.588235294117647e-05, - "loss": 3.1218, + "loss": 3.1245, "step": 85700 }, { "epoch": 9.245283018867925, - "grad_norm": 0.9080697894096375, + "grad_norm": 0.8614128828048706, "learning_rate": 4.555855369670804e-05, - "loss": 3.1122, + "loss": 3.1155, "step": 85750 }, { "epoch": 9.25067385444744, - "grad_norm": 0.8830873370170593, + "grad_norm": 0.8531436920166016, "learning_rate": 4.523475445223961e-05, - "loss": 3.122, + "loss": 3.1243, "step": 85800 }, { "epoch": 9.256064690026955, - "grad_norm": 0.8966952562332153, + "grad_norm": 0.8632187247276306, "learning_rate": 4.491095520777118e-05, - "loss": 3.1461, + "loss": 3.1491, "step": 85850 }, { "epoch": 9.261455525606468, - "grad_norm": 0.8461965918540955, + "grad_norm": 0.8344792723655701, "learning_rate": 4.458715596330275e-05, - "loss": 3.1312, + "loss": 3.1338, "step": 85900 }, { "epoch": 9.266846361185983, - "grad_norm": 0.9272150993347168, + "grad_norm": 0.8807640075683594, "learning_rate": 4.4263356718834314e-05, - "loss": 3.1333, + "loss": 3.1349, "step": 85950 }, { "epoch": 9.272237196765499, - "grad_norm": 0.8832882642745972, + "grad_norm": 0.8676424026489258, "learning_rate": 4.393955747436589e-05, - "loss": 3.1275, + "loss": 3.1303, "step": 86000 }, { "epoch": 9.272237196765499, - "eval_accuracy": 0.3932997675588771, - "eval_loss": 3.311936378479004, - "eval_runtime": 184.859, - "eval_samples_per_second": 97.431, - "eval_steps_per_second": 6.091, + "eval_accuracy": 0.393040956380103, + "eval_loss": 3.313966989517212, + "eval_runtime": 184.9242, + "eval_samples_per_second": 97.397, + "eval_steps_per_second": 6.089, "step": 86000 }, { "epoch": 9.277628032345014, - "grad_norm": 0.8568077683448792, + "grad_norm": 0.8324336409568787, "learning_rate": 4.3615758229897456e-05, - "loss": 3.1382, + "loss": 3.1421, "step": 86050 }, { "epoch": 9.283018867924529, - "grad_norm": 0.8889806270599365, + "grad_norm": 0.8373544216156006, "learning_rate": 4.3291958985429034e-05, - "loss": 3.1295, + "loss": 3.1336, "step": 86100 }, { "epoch": 9.288409703504042, - "grad_norm": 0.8781276345252991, + "grad_norm": 0.8576855063438416, "learning_rate": 4.29681597409606e-05, - "loss": 3.1426, + "loss": 3.1479, "step": 86150 }, { "epoch": 9.293800539083557, - "grad_norm": 0.9067704081535339, + "grad_norm": 0.863328754901886, "learning_rate": 4.264436049649217e-05, - "loss": 3.1376, + "loss": 3.1387, "step": 86200 }, { "epoch": 9.299191374663073, - "grad_norm": 0.8865089416503906, + "grad_norm": 0.8970577716827393, "learning_rate": 4.232056125202374e-05, - "loss": 3.1408, + "loss": 3.1417, "step": 86250 }, { "epoch": 9.304582210242588, - "grad_norm": 0.8952741622924805, + "grad_norm": 0.9056400060653687, "learning_rate": 4.199676200755531e-05, - "loss": 3.1384, + "loss": 3.1423, "step": 86300 }, { "epoch": 9.309973045822103, - "grad_norm": 0.870446503162384, + "grad_norm": 0.8215680718421936, "learning_rate": 4.167296276308688e-05, - "loss": 3.1127, + "loss": 3.1151, "step": 86350 }, { "epoch": 9.315363881401618, - "grad_norm": 0.8651818633079529, + "grad_norm": 0.8935737013816833, "learning_rate": 4.134916351861845e-05, - "loss": 3.1194, + "loss": 3.125, "step": 86400 }, { "epoch": 9.320754716981131, - "grad_norm": 0.8986634612083435, + "grad_norm": 0.8761557936668396, "learning_rate": 4.102536427415003e-05, - "loss": 3.1328, + "loss": 3.1351, "step": 86450 }, { "epoch": 9.326145552560646, - "grad_norm": 0.9104035496711731, + "grad_norm": 0.8842337131500244, "learning_rate": 4.0701565029681594e-05, - "loss": 3.1283, + "loss": 3.1319, "step": 86500 }, { "epoch": 9.331536388140162, - "grad_norm": 0.901136577129364, + "grad_norm": 0.8900733590126038, "learning_rate": 4.0377765785213165e-05, - "loss": 3.1363, + "loss": 3.1398, "step": 86550 }, { "epoch": 9.336927223719677, - "grad_norm": 0.9057474732398987, + "grad_norm": 0.8664814829826355, "learning_rate": 4.0053966540744736e-05, - "loss": 3.1485, + "loss": 3.1515, "step": 86600 }, { "epoch": 9.342318059299192, - "grad_norm": 0.8955982327461243, + "grad_norm": 0.888770580291748, "learning_rate": 3.973016729627631e-05, - "loss": 3.1168, + "loss": 3.1203, "step": 86650 }, { "epoch": 9.347708894878707, - "grad_norm": 0.8255138397216797, + "grad_norm": 0.82951819896698, "learning_rate": 3.940636805180787e-05, - "loss": 3.1294, + "loss": 3.1318, "step": 86700 }, { "epoch": 9.35309973045822, - "grad_norm": 0.8693887591362, + "grad_norm": 0.866402804851532, "learning_rate": 3.908256880733945e-05, - "loss": 3.1106, + "loss": 3.1143, "step": 86750 }, { "epoch": 9.358490566037736, - "grad_norm": 0.8875924944877625, + "grad_norm": 0.9193182587623596, "learning_rate": 3.875876956287101e-05, - "loss": 3.113, + "loss": 3.1167, "step": 86800 }, { "epoch": 9.36388140161725, - "grad_norm": 0.8646242618560791, + "grad_norm": 0.8478342890739441, "learning_rate": 3.843497031840259e-05, - "loss": 3.131, + "loss": 3.1339, "step": 86850 }, { "epoch": 9.369272237196766, - "grad_norm": 0.9060271978378296, + "grad_norm": 0.8515084981918335, "learning_rate": 3.8111171073934154e-05, - "loss": 3.1357, + "loss": 3.1382, "step": 86900 }, { "epoch": 9.374663072776281, - "grad_norm": 0.8785396814346313, + "grad_norm": 0.9116027355194092, "learning_rate": 3.778737182946573e-05, - "loss": 3.1382, + "loss": 3.1403, "step": 86950 }, { "epoch": 9.380053908355794, - "grad_norm": 0.8670048713684082, + "grad_norm": 0.8636597990989685, "learning_rate": 3.7463572584997296e-05, - "loss": 3.1152, + "loss": 3.1186, "step": 87000 }, { "epoch": 9.380053908355794, - "eval_accuracy": 0.39350446959951957, - "eval_loss": 3.3097434043884277, - "eval_runtime": 185.0264, - "eval_samples_per_second": 97.343, - "eval_steps_per_second": 6.086, + "eval_accuracy": 0.3931999155549756, + "eval_loss": 3.3115437030792236, + "eval_runtime": 185.4557, + "eval_samples_per_second": 97.118, + "eval_steps_per_second": 6.072, "step": 87000 }, { "epoch": 9.38544474393531, - "grad_norm": 0.8645483255386353, + "grad_norm": 0.8413622379302979, "learning_rate": 3.713977334052887e-05, - "loss": 3.1364, + "loss": 3.1368, "step": 87050 }, { "epoch": 9.390835579514825, - "grad_norm": 0.9023386836051941, + "grad_norm": 0.8973217010498047, "learning_rate": 3.681597409606044e-05, - "loss": 3.1231, + "loss": 3.1272, "step": 87100 }, { "epoch": 9.39622641509434, - "grad_norm": 0.8484783172607422, + "grad_norm": 0.8484401106834412, "learning_rate": 3.649217485159201e-05, - "loss": 3.1285, + "loss": 3.1329, "step": 87150 }, { "epoch": 9.401617250673855, - "grad_norm": 0.8609200119972229, + "grad_norm": 0.8557623624801636, "learning_rate": 3.616837560712358e-05, - "loss": 3.1254, + "loss": 3.1275, "step": 87200 }, { "epoch": 9.40700808625337, - "grad_norm": 0.8669583201408386, + "grad_norm": 0.8505693674087524, "learning_rate": 3.584457636265515e-05, - "loss": 3.1401, + "loss": 3.1427, "step": 87250 }, { "epoch": 9.412398921832883, - "grad_norm": 0.9098264575004578, + "grad_norm": 0.8859766125679016, "learning_rate": 3.552077711818672e-05, - "loss": 3.1465, + "loss": 3.1487, "step": 87300 }, { "epoch": 9.417789757412399, - "grad_norm": 0.8986435532569885, + "grad_norm": 0.8826354742050171, "learning_rate": 3.519697787371829e-05, - "loss": 3.1234, + "loss": 3.1261, "step": 87350 }, { "epoch": 9.423180592991914, - "grad_norm": 0.8918881416320801, + "grad_norm": 0.8677520751953125, "learning_rate": 3.487317862924986e-05, - "loss": 3.1235, + "loss": 3.1261, "step": 87400 }, { "epoch": 9.428571428571429, - "grad_norm": 0.9000228047370911, + "grad_norm": 0.8759662508964539, "learning_rate": 3.4549379384781434e-05, - "loss": 3.13, + "loss": 3.1328, "step": 87450 }, { "epoch": 9.433962264150944, - "grad_norm": 0.9257108569145203, + "grad_norm": 0.8896969556808472, "learning_rate": 3.4225580140313e-05, - "loss": 3.1329, + "loss": 3.1362, "step": 87500 }, { "epoch": 9.439353099730457, - "grad_norm": 0.8658718466758728, + "grad_norm": 0.8729029297828674, "learning_rate": 3.390178089584457e-05, - "loss": 3.1421, + "loss": 3.1455, "step": 87550 }, { "epoch": 9.444743935309972, - "grad_norm": 0.895495593547821, + "grad_norm": 0.8900150656700134, "learning_rate": 3.357798165137615e-05, - "loss": 3.128, + "loss": 3.1304, "step": 87600 }, { "epoch": 9.450134770889488, - "grad_norm": 0.882296085357666, + "grad_norm": 0.8727821707725525, "learning_rate": 3.325418240690772e-05, - "loss": 3.1428, + "loss": 3.1481, "step": 87650 }, { "epoch": 9.455525606469003, - "grad_norm": 0.8577868342399597, + "grad_norm": 0.8477314710617065, "learning_rate": 3.293038316243929e-05, - "loss": 3.1046, + "loss": 3.1063, "step": 87700 }, { "epoch": 9.460916442048518, - "grad_norm": 0.9138762950897217, + "grad_norm": 0.8587415218353271, "learning_rate": 3.260658391797086e-05, - "loss": 3.1224, + "loss": 3.1258, "step": 87750 }, { "epoch": 9.466307277628033, - "grad_norm": 0.8505160212516785, + "grad_norm": 0.8558546900749207, "learning_rate": 3.2282784673502424e-05, - "loss": 3.1177, + "loss": 3.1225, "step": 87800 }, { "epoch": 9.471698113207546, - "grad_norm": 0.8743757009506226, + "grad_norm": 0.8536050319671631, "learning_rate": 3.1958985429033995e-05, - "loss": 3.13, + "loss": 3.1322, "step": 87850 }, { "epoch": 9.477088948787062, - "grad_norm": 0.8973845839500427, + "grad_norm": 0.8786517977714539, "learning_rate": 3.1635186184565565e-05, - "loss": 3.1383, + "loss": 3.1458, "step": 87900 }, { "epoch": 9.482479784366577, - "grad_norm": 0.8859951496124268, + "grad_norm": 0.8906846642494202, "learning_rate": 3.1311386940097136e-05, - "loss": 3.1428, + "loss": 3.1442, "step": 87950 }, { "epoch": 9.487870619946092, - "grad_norm": 0.9165722131729126, + "grad_norm": 0.8581869006156921, "learning_rate": 3.098758769562871e-05, - "loss": 3.1329, + "loss": 3.1368, "step": 88000 }, { "epoch": 9.487870619946092, - "eval_accuracy": 0.39391898036653183, - "eval_loss": 3.3068549633026123, - "eval_runtime": 184.8627, - "eval_samples_per_second": 97.429, - "eval_steps_per_second": 6.091, + "eval_accuracy": 0.3937146042845528, + "eval_loss": 3.3079490661621094, + "eval_runtime": 184.9387, + "eval_samples_per_second": 97.389, + "eval_steps_per_second": 6.089, "step": 88000 }, { "epoch": 9.493261455525607, - "grad_norm": 0.8812258243560791, + "grad_norm": 0.8685297966003418, "learning_rate": 3.066378845116028e-05, - "loss": 3.1294, + "loss": 3.134, "step": 88050 }, { "epoch": 9.498652291105122, - "grad_norm": 0.8953335881233215, + "grad_norm": 0.8372690081596375, "learning_rate": 3.033998920669185e-05, - "loss": 3.1284, + "loss": 3.1328, "step": 88100 }, { "epoch": 9.504043126684635, - "grad_norm": 0.8637675642967224, + "grad_norm": 0.8183308839797974, "learning_rate": 3.0016189962223416e-05, - "loss": 3.1266, + "loss": 3.1306, "step": 88150 }, { "epoch": 9.50943396226415, - "grad_norm": 0.8599790930747986, + "grad_norm": 1.0200047492980957, "learning_rate": 2.9692390717754987e-05, - "loss": 3.1237, + "loss": 3.1259, "step": 88200 }, { "epoch": 9.514824797843666, - "grad_norm": 0.8580920100212097, + "grad_norm": 0.8445395827293396, "learning_rate": 2.9368591473286558e-05, - "loss": 3.1317, + "loss": 3.1334, "step": 88250 }, { "epoch": 9.520215633423181, - "grad_norm": 0.8671429753303528, + "grad_norm": 0.8794388175010681, "learning_rate": 2.904479222881813e-05, - "loss": 3.1385, + "loss": 3.1418, "step": 88300 }, { "epoch": 9.525606469002696, - "grad_norm": 0.8459764122962952, + "grad_norm": 0.7987924814224243, "learning_rate": 2.87209929843497e-05, - "loss": 3.1398, + "loss": 3.1419, "step": 88350 }, { "epoch": 9.530997304582211, - "grad_norm": 0.8779290914535522, + "grad_norm": 0.8669410943984985, "learning_rate": 2.8397193739881274e-05, - "loss": 3.138, + "loss": 3.1422, "step": 88400 }, { "epoch": 9.536388140161725, - "grad_norm": 0.8906304240226746, + "grad_norm": 0.8429611921310425, "learning_rate": 2.8073394495412842e-05, - "loss": 3.1318, + "loss": 3.1346, "step": 88450 }, { "epoch": 9.54177897574124, - "grad_norm": 0.91546231508255, + "grad_norm": 0.8398469686508179, "learning_rate": 2.7749595250944413e-05, - "loss": 3.1251, + "loss": 3.1279, "step": 88500 }, { "epoch": 9.547169811320755, - "grad_norm": 0.8772543668746948, + "grad_norm": 0.8982305526733398, "learning_rate": 2.7425796006475984e-05, "loss": 3.1203, "step": 88550 }, { "epoch": 9.55256064690027, - "grad_norm": 0.9106935262680054, + "grad_norm": 0.8920227289199829, "learning_rate": 2.7101996762007554e-05, - "loss": 3.1328, + "loss": 3.1362, "step": 88600 }, { "epoch": 9.557951482479785, - "grad_norm": 0.864047110080719, + "grad_norm": 0.8532522320747375, "learning_rate": 2.6778197517539125e-05, - "loss": 3.1012, + "loss": 3.1053, "step": 88650 }, { "epoch": 9.563342318059298, - "grad_norm": 0.8812419176101685, + "grad_norm": 0.8370513319969177, "learning_rate": 2.6454398273070693e-05, - "loss": 3.1276, + "loss": 3.1305, "step": 88700 }, { "epoch": 9.568733153638814, - "grad_norm": 0.8692460060119629, + "grad_norm": 0.8290605545043945, "learning_rate": 2.6130599028602264e-05, - "loss": 3.1308, + "loss": 3.1349, "step": 88750 }, { "epoch": 9.574123989218329, - "grad_norm": 0.9016609191894531, + "grad_norm": 0.8801090121269226, "learning_rate": 2.5806799784133835e-05, - "loss": 3.1092, + "loss": 3.113, "step": 88800 }, { "epoch": 9.579514824797844, - "grad_norm": 0.875944972038269, + "grad_norm": 0.8506044149398804, "learning_rate": 2.5483000539665406e-05, - "loss": 3.1279, + "loss": 3.13, "step": 88850 }, { "epoch": 9.584905660377359, - "grad_norm": 0.8492311835289001, + "grad_norm": 0.8414507508277893, "learning_rate": 2.5159201295196976e-05, - "loss": 3.1498, + "loss": 3.1535, "step": 88900 }, { "epoch": 9.590296495956874, - "grad_norm": 0.879350483417511, + "grad_norm": 0.8439317345619202, "learning_rate": 2.4835402050728544e-05, - "loss": 3.1354, + "loss": 3.1387, "step": 88950 }, { "epoch": 9.595687331536388, - "grad_norm": 0.8495625853538513, + "grad_norm": 0.8427550196647644, "learning_rate": 2.4511602806260115e-05, - "loss": 3.1273, + "loss": 3.13, "step": 89000 }, { "epoch": 9.595687331536388, - "eval_accuracy": 0.3940988008958648, - "eval_loss": 3.304088592529297, - "eval_runtime": 184.961, - "eval_samples_per_second": 97.377, - "eval_steps_per_second": 6.088, + "eval_accuracy": 0.3939106140941701, + "eval_loss": 3.3059844970703125, + "eval_runtime": 185.1018, + "eval_samples_per_second": 97.303, + "eval_steps_per_second": 6.083, "step": 89000 }, { "epoch": 9.601078167115903, - "grad_norm": 0.8095118403434753, + "grad_norm": 0.7962179780006409, "learning_rate": 2.4187803561791686e-05, - "loss": 3.1248, + "loss": 3.1265, "step": 89050 }, { "epoch": 9.606469002695418, - "grad_norm": 0.8577584028244019, - "learning_rate": 2.3870480302212628e-05, - "loss": 3.1295, + "grad_norm": 0.834709644317627, + "learning_rate": 2.3864004317323257e-05, + "loss": 3.1327, "step": 89100 }, { "epoch": 9.611859838274933, - "grad_norm": 0.8957887887954712, - "learning_rate": 2.35466810577442e-05, - "loss": 3.1508, + "grad_norm": 0.883066713809967, + "learning_rate": 2.354020507285483e-05, + "loss": 3.156, "step": 89150 }, { "epoch": 9.617250673854448, - "grad_norm": 0.9367175102233887, - "learning_rate": 2.3222881813275766e-05, - "loss": 3.1187, + "grad_norm": 0.932812511920929, + "learning_rate": 2.3216405828386402e-05, + "loss": 3.1206, "step": 89200 }, { "epoch": 9.622641509433961, - "grad_norm": 0.9250417351722717, - "learning_rate": 2.2899082568807337e-05, - "loss": 3.1394, + "grad_norm": 0.9128279089927673, + "learning_rate": 2.289260658391797e-05, + "loss": 3.1426, "step": 89250 }, { "epoch": 9.628032345013477, - "grad_norm": 0.8945813775062561, - "learning_rate": 2.2575283324338908e-05, - "loss": 3.1292, + "grad_norm": 0.8921266198158264, + "learning_rate": 2.256880733944954e-05, + "loss": 3.1294, "step": 89300 }, { "epoch": 9.633423180592992, - "grad_norm": 0.8463922142982483, - "learning_rate": 2.225148407987048e-05, - "loss": 3.1253, + "grad_norm": 0.8413317799568176, + "learning_rate": 2.224500809498111e-05, + "loss": 3.1273, "step": 89350 }, { "epoch": 9.638814016172507, - "grad_norm": 0.9221121072769165, - "learning_rate": 2.192768483540205e-05, - "loss": 3.1276, + "grad_norm": 0.8817403316497803, + "learning_rate": 2.1921208850512682e-05, + "loss": 3.1315, "step": 89400 }, { "epoch": 9.644204851752022, - "grad_norm": 0.882972240447998, - "learning_rate": 2.1603885590933617e-05, - "loss": 3.1281, + "grad_norm": 0.8911518454551697, + "learning_rate": 2.1597409606044253e-05, + "loss": 3.1327, "step": 89450 }, { "epoch": 9.649595687331537, - "grad_norm": 0.8939113616943359, - "learning_rate": 2.1280086346465188e-05, - "loss": 3.1432, + "grad_norm": 0.8857812285423279, + "learning_rate": 2.127361036157582e-05, + "loss": 3.1459, "step": 89500 }, { "epoch": 9.65498652291105, - "grad_norm": 0.896306037902832, - "learning_rate": 2.095628710199676e-05, - "loss": 3.1255, + "grad_norm": 0.8955504298210144, + "learning_rate": 2.094981111710739e-05, + "loss": 3.1307, "step": 89550 }, { "epoch": 9.660377358490566, - "grad_norm": 0.8766800165176392, - "learning_rate": 2.063248785752833e-05, - "loss": 3.1184, + "grad_norm": 0.86283940076828, + "learning_rate": 2.0626011872638962e-05, + "loss": 3.1206, "step": 89600 }, { "epoch": 9.66576819407008, - "grad_norm": 0.8930668830871582, - "learning_rate": 2.03086886130599e-05, - "loss": 3.1188, + "grad_norm": 0.8852249979972839, + "learning_rate": 2.0302212628170533e-05, + "loss": 3.1227, "step": 89650 }, { "epoch": 9.671159029649596, - "grad_norm": 0.9026961326599121, - "learning_rate": 1.9984889368591468e-05, - "loss": 3.1218, + "grad_norm": 0.878625214099884, + "learning_rate": 1.99784133837021e-05, + "loss": 3.1262, "step": 89700 }, { "epoch": 9.676549865229111, - "grad_norm": 0.9059770107269287, - "learning_rate": 1.9661090124123042e-05, - "loss": 3.1375, + "grad_norm": 0.850557267665863, + "learning_rate": 1.965461413923367e-05, + "loss": 3.1391, "step": 89750 }, { "epoch": 9.681940700808624, - "grad_norm": 0.8790046572685242, - "learning_rate": 1.9337290879654613e-05, - "loss": 3.1196, + "grad_norm": 0.8788577318191528, + "learning_rate": 1.9330814894765242e-05, + "loss": 3.1231, "step": 89800 }, { "epoch": 9.68733153638814, - "grad_norm": 0.8520891070365906, - "learning_rate": 1.9013491635186184e-05, - "loss": 3.1237, + "grad_norm": 0.8096227049827576, + "learning_rate": 1.9007015650296813e-05, + "loss": 3.1254, "step": 89850 }, { "epoch": 9.692722371967655, - "grad_norm": 0.8755563497543335, - "learning_rate": 1.868969239071775e-05, - "loss": 3.1437, + "grad_norm": 0.8480028510093689, + "learning_rate": 1.8683216405828384e-05, + "loss": 3.1454, "step": 89900 }, { "epoch": 9.69811320754717, - "grad_norm": 0.8892934322357178, - "learning_rate": 1.8365893146249326e-05, - "loss": 3.1458, + "grad_norm": 0.8541979193687439, + "learning_rate": 1.8359417161359955e-05, + "loss": 3.1467, "step": 89950 }, { "epoch": 9.703504043126685, - "grad_norm": 0.8864810466766357, - "learning_rate": 1.8042093901780893e-05, - "loss": 3.129, + "grad_norm": 0.8675863742828369, + "learning_rate": 1.8035617916891526e-05, + "loss": 3.1323, "step": 90000 }, { "epoch": 9.703504043126685, - "eval_accuracy": 0.39435913321506827, - "eval_loss": 3.302138566970825, - "eval_runtime": 184.8974, - "eval_samples_per_second": 97.411, - "eval_steps_per_second": 6.09, + "eval_accuracy": 0.39413606883638513, + "eval_loss": 3.304030179977417, + "eval_runtime": 185.3134, + "eval_samples_per_second": 97.192, + "eval_steps_per_second": 6.076, "step": 90000 } ],