{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 55.945200155460554, "eval_steps": 1000, "global_step": 72000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0194325689856199, "grad_norm": 65.91597747802734, "learning_rate": 4.9e-07, "loss": 8.2687, "step": 50 }, { "epoch": 0.0388651379712398, "grad_norm": 51.2559700012207, "learning_rate": 9.9e-07, "loss": 7.9936, "step": 100 }, { "epoch": 0.05829770695685969, "grad_norm": 23.14748764038086, "learning_rate": 1.4900000000000001e-06, "loss": 7.527, "step": 150 }, { "epoch": 0.0777302759424796, "grad_norm": 22.350494384765625, "learning_rate": 1.99e-06, "loss": 6.3514, "step": 200 }, { "epoch": 0.0971628449280995, "grad_norm": 22.213350296020508, "learning_rate": 2.49e-06, "loss": 5.6544, "step": 250 }, { "epoch": 0.11659541391371939, "grad_norm": 12.773467063903809, "learning_rate": 2.99e-06, "loss": 4.6271, "step": 300 }, { "epoch": 0.1360279828993393, "grad_norm": 9.955669403076172, "learning_rate": 3.49e-06, "loss": 3.8173, "step": 350 }, { "epoch": 0.1554605518849592, "grad_norm": 6.667578220367432, "learning_rate": 3.99e-06, "loss": 3.4296, "step": 400 }, { "epoch": 0.1748931208705791, "grad_norm": 3.926382064819336, "learning_rate": 4.49e-06, "loss": 3.2801, "step": 450 }, { "epoch": 0.194325689856199, "grad_norm": 5.839285850524902, "learning_rate": 4.9900000000000005e-06, "loss": 3.1784, "step": 500 }, { "epoch": 0.2137582588418189, "grad_norm": 5.314439296722412, "learning_rate": 5.49e-06, "loss": 3.15, "step": 550 }, { "epoch": 0.23319082782743877, "grad_norm": 3.1964633464813232, "learning_rate": 5.99e-06, "loss": 3.0832, "step": 600 }, { "epoch": 0.2526233968130587, "grad_norm": 5.283780097961426, "learning_rate": 6.4900000000000005e-06, "loss": 3.099, "step": 650 }, { "epoch": 0.2720559657986786, "grad_norm": 4.881435394287109, "learning_rate": 6.990000000000001e-06, "loss": 3.068, "step": 700 }, { "epoch": 0.2914885347842985, "grad_norm": 6.7736945152282715, "learning_rate": 7.4899999999999994e-06, "loss": 3.0591, "step": 750 }, { "epoch": 0.3109211037699184, "grad_norm": 15.492425918579102, "learning_rate": 7.99e-06, "loss": 3.0675, "step": 800 }, { "epoch": 0.3303536727555383, "grad_norm": 31.496707916259766, "learning_rate": 8.49e-06, "loss": 3.0561, "step": 850 }, { "epoch": 0.3497862417411582, "grad_norm": 3.549668073654175, "learning_rate": 8.99e-06, "loss": 3.0493, "step": 900 }, { "epoch": 0.3692188107267781, "grad_norm": 4.400085926055908, "learning_rate": 9.49e-06, "loss": 3.0294, "step": 950 }, { "epoch": 0.388651379712398, "grad_norm": 7.091118335723877, "learning_rate": 9.990000000000001e-06, "loss": 3.053, "step": 1000 }, { "epoch": 0.388651379712398, "eval_accuracy": 0.0002429189136666181, "eval_runtime": 907.7535, "eval_samples_per_second": 22.675, "eval_steps_per_second": 1.418, "step": 1000 }, { "epoch": 0.4080839486980179, "grad_norm": 2.439570903778076, "learning_rate": 1.049e-05, "loss": 3.0036, "step": 1050 }, { "epoch": 0.4275165176836378, "grad_norm": 2.5228285789489746, "learning_rate": 1.099e-05, "loss": 3.0156, "step": 1100 }, { "epoch": 0.4469490866692577, "grad_norm": 3.5071604251861572, "learning_rate": 1.149e-05, "loss": 3.0285, "step": 1150 }, { "epoch": 0.46638165565487755, "grad_norm": 2.174527168273926, "learning_rate": 1.199e-05, "loss": 3.024, "step": 1200 }, { "epoch": 0.48581422464049745, "grad_norm": 2.910525321960449, "learning_rate": 1.249e-05, "loss": 2.9969, "step": 1250 }, { "epoch": 0.5052467936261174, "grad_norm": 2.5316076278686523, "learning_rate": 1.299e-05, "loss": 3.0125, "step": 1300 }, { "epoch": 0.5246793626117373, "grad_norm": 2.175851821899414, "learning_rate": 1.349e-05, "loss": 2.993, "step": 1350 }, { "epoch": 0.5441119315973572, "grad_norm": 2.5506649017333984, "learning_rate": 1.399e-05, "loss": 3.0089, "step": 1400 }, { "epoch": 0.5635445005829771, "grad_norm": 3.2838923931121826, "learning_rate": 1.449e-05, "loss": 2.9501, "step": 1450 }, { "epoch": 0.582977069568597, "grad_norm": 4.193533420562744, "learning_rate": 1.499e-05, "loss": 2.9575, "step": 1500 }, { "epoch": 0.6024096385542169, "grad_norm": 5.010601997375488, "learning_rate": 1.5490000000000002e-05, "loss": 2.9187, "step": 1550 }, { "epoch": 0.6218422075398368, "grad_norm": 4.640109539031982, "learning_rate": 1.599e-05, "loss": 2.9261, "step": 1600 }, { "epoch": 0.6412747765254566, "grad_norm": 4.375769138336182, "learning_rate": 1.649e-05, "loss": 2.9343, "step": 1650 }, { "epoch": 0.6607073455110766, "grad_norm": 3.256545066833496, "learning_rate": 1.699e-05, "loss": 2.8974, "step": 1700 }, { "epoch": 0.6801399144966964, "grad_norm": 8.876006126403809, "learning_rate": 1.749e-05, "loss": 2.8581, "step": 1750 }, { "epoch": 0.6995724834823164, "grad_norm": 5.627965927124023, "learning_rate": 1.7990000000000002e-05, "loss": 2.8551, "step": 1800 }, { "epoch": 0.7190050524679362, "grad_norm": 5.085724830627441, "learning_rate": 1.849e-05, "loss": 2.8319, "step": 1850 }, { "epoch": 0.7384376214535562, "grad_norm": 5.020870685577393, "learning_rate": 1.8990000000000003e-05, "loss": 2.7724, "step": 1900 }, { "epoch": 0.757870190439176, "grad_norm": 7.748109340667725, "learning_rate": 1.949e-05, "loss": 2.7172, "step": 1950 }, { "epoch": 0.777302759424796, "grad_norm": 3.8322463035583496, "learning_rate": 1.999e-05, "loss": 2.6999, "step": 2000 }, { "epoch": 0.777302759424796, "eval_accuracy": 0.0023320215711995335, "eval_runtime": 907.4599, "eval_samples_per_second": 22.682, "eval_steps_per_second": 1.418, "step": 2000 }, { "epoch": 0.7967353284104158, "grad_norm": 4.006162643432617, "learning_rate": 2.0490000000000002e-05, "loss": 2.7107, "step": 2050 }, { "epoch": 0.8161678973960358, "grad_norm": 5.7521233558654785, "learning_rate": 2.099e-05, "loss": 2.6041, "step": 2100 }, { "epoch": 0.8356004663816556, "grad_norm": 8.727128028869629, "learning_rate": 2.1490000000000003e-05, "loss": 2.6262, "step": 2150 }, { "epoch": 0.8550330353672756, "grad_norm": 5.179250717163086, "learning_rate": 2.199e-05, "loss": 2.6203, "step": 2200 }, { "epoch": 0.8744656043528954, "grad_norm": 4.94296407699585, "learning_rate": 2.249e-05, "loss": 2.533, "step": 2250 }, { "epoch": 0.8938981733385154, "grad_norm": 5.3395094871521, "learning_rate": 2.2990000000000002e-05, "loss": 2.6065, "step": 2300 }, { "epoch": 0.9133307423241352, "grad_norm": 3.9141852855682373, "learning_rate": 2.349e-05, "loss": 2.5368, "step": 2350 }, { "epoch": 0.9327633113097551, "grad_norm": 4.978011608123779, "learning_rate": 2.3990000000000002e-05, "loss": 2.5219, "step": 2400 }, { "epoch": 0.952195880295375, "grad_norm": 4.247427940368652, "learning_rate": 2.449e-05, "loss": 2.5095, "step": 2450 }, { "epoch": 0.9716284492809949, "grad_norm": 8.596626281738281, "learning_rate": 2.4990000000000003e-05, "loss": 2.536, "step": 2500 }, { "epoch": 0.9910610182666149, "grad_norm": 3.68994402885437, "learning_rate": 2.549e-05, "loss": 2.5272, "step": 2550 }, { "epoch": 1.0104935872522347, "grad_norm": 7.549562454223633, "learning_rate": 2.5990000000000004e-05, "loss": 2.4804, "step": 2600 }, { "epoch": 1.0299261562378546, "grad_norm": 6.586672306060791, "learning_rate": 2.6490000000000002e-05, "loss": 2.4526, "step": 2650 }, { "epoch": 1.0493587252234746, "grad_norm": 6.57544469833374, "learning_rate": 2.6989999999999997e-05, "loss": 2.4115, "step": 2700 }, { "epoch": 1.0687912942090945, "grad_norm": 3.6275458335876465, "learning_rate": 2.749e-05, "loss": 2.3916, "step": 2750 }, { "epoch": 1.0882238631947143, "grad_norm": 5.756579399108887, "learning_rate": 2.7989999999999998e-05, "loss": 2.4652, "step": 2800 }, { "epoch": 1.1076564321803342, "grad_norm": 8.164681434631348, "learning_rate": 2.849e-05, "loss": 2.4645, "step": 2850 }, { "epoch": 1.1270890011659542, "grad_norm": 4.305910110473633, "learning_rate": 2.8990000000000002e-05, "loss": 2.41, "step": 2900 }, { "epoch": 1.146521570151574, "grad_norm": 5.3087286949157715, "learning_rate": 2.949e-05, "loss": 2.4348, "step": 2950 }, { "epoch": 1.165954139137194, "grad_norm": 5.00079345703125, "learning_rate": 2.9990000000000003e-05, "loss": 2.3594, "step": 3000 }, { "epoch": 1.165954139137194, "eval_accuracy": 0.004809794490599038, "eval_runtime": 907.572, "eval_samples_per_second": 22.679, "eval_steps_per_second": 1.418, "step": 3000 }, { "epoch": 1.1853867081228138, "grad_norm": 4.513027191162109, "learning_rate": 3.049e-05, "loss": 2.3504, "step": 3050 }, { "epoch": 1.2048192771084336, "grad_norm": 5.349442005157471, "learning_rate": 3.099e-05, "loss": 2.3989, "step": 3100 }, { "epoch": 1.2242518460940537, "grad_norm": 5.596327304840088, "learning_rate": 3.1490000000000005e-05, "loss": 2.3555, "step": 3150 }, { "epoch": 1.2436844150796735, "grad_norm": 3.9062111377716064, "learning_rate": 3.1990000000000004e-05, "loss": 2.409, "step": 3200 }, { "epoch": 1.2631169840652934, "grad_norm": 4.693165302276611, "learning_rate": 3.249e-05, "loss": 2.3674, "step": 3250 }, { "epoch": 1.2825495530509134, "grad_norm": 5.275837421417236, "learning_rate": 3.299e-05, "loss": 2.3401, "step": 3300 }, { "epoch": 1.3019821220365333, "grad_norm": 3.579828977584839, "learning_rate": 3.349e-05, "loss": 2.3985, "step": 3350 }, { "epoch": 1.3214146910221531, "grad_norm": 5.416715145111084, "learning_rate": 3.399e-05, "loss": 2.3403, "step": 3400 }, { "epoch": 1.340847260007773, "grad_norm": 7.789376735687256, "learning_rate": 3.449e-05, "loss": 2.4155, "step": 3450 }, { "epoch": 1.3602798289933928, "grad_norm": 4.4100470542907715, "learning_rate": 3.499e-05, "loss": 2.3535, "step": 3500 }, { "epoch": 1.379712397979013, "grad_norm": 3.998701572418213, "learning_rate": 3.549e-05, "loss": 2.3238, "step": 3550 }, { "epoch": 1.3991449669646328, "grad_norm": 6.838342666625977, "learning_rate": 3.599e-05, "loss": 2.3312, "step": 3600 }, { "epoch": 1.4185775359502526, "grad_norm": 3.762887477874756, "learning_rate": 3.6490000000000005e-05, "loss": 2.2789, "step": 3650 }, { "epoch": 1.4380101049358724, "grad_norm": 7.700969219207764, "learning_rate": 3.699e-05, "loss": 2.3047, "step": 3700 }, { "epoch": 1.4574426739214923, "grad_norm": 4.673271179199219, "learning_rate": 3.749e-05, "loss": 2.3218, "step": 3750 }, { "epoch": 1.4768752429071124, "grad_norm": 4.929595947265625, "learning_rate": 3.799e-05, "loss": 2.2811, "step": 3800 }, { "epoch": 1.4963078118927322, "grad_norm": 6.298782825469971, "learning_rate": 3.8490000000000006e-05, "loss": 2.2838, "step": 3850 }, { "epoch": 1.515740380878352, "grad_norm": 7.514346122741699, "learning_rate": 3.8990000000000004e-05, "loss": 2.2797, "step": 3900 }, { "epoch": 1.5351729498639721, "grad_norm": 4.082162857055664, "learning_rate": 3.9489999999999996e-05, "loss": 2.3473, "step": 3950 }, { "epoch": 1.5546055188495917, "grad_norm": 3.844625473022461, "learning_rate": 3.999e-05, "loss": 2.2859, "step": 4000 }, { "epoch": 1.5546055188495917, "eval_accuracy": 0.005198464752465627, "eval_runtime": 907.1135, "eval_samples_per_second": 22.691, "eval_steps_per_second": 1.419, "step": 4000 }, { "epoch": 1.5740380878352118, "grad_norm": 3.8445560932159424, "learning_rate": 4.049e-05, "loss": 2.3033, "step": 4050 }, { "epoch": 1.5934706568208317, "grad_norm": 4.017306327819824, "learning_rate": 4.099e-05, "loss": 2.2842, "step": 4100 }, { "epoch": 1.6129032258064515, "grad_norm": 3.259737968444824, "learning_rate": 4.1490000000000004e-05, "loss": 2.2398, "step": 4150 }, { "epoch": 1.6323357947920716, "grad_norm": 3.228858470916748, "learning_rate": 4.199e-05, "loss": 2.2501, "step": 4200 }, { "epoch": 1.6517683637776914, "grad_norm": 9.149596214294434, "learning_rate": 4.249e-05, "loss": 2.3207, "step": 4250 }, { "epoch": 1.6712009327633113, "grad_norm": 5.323018550872803, "learning_rate": 4.299e-05, "loss": 2.2831, "step": 4300 }, { "epoch": 1.6906335017489313, "grad_norm": 2.7300260066986084, "learning_rate": 4.3490000000000005e-05, "loss": 2.2799, "step": 4350 }, { "epoch": 1.710066070734551, "grad_norm": 2.534250259399414, "learning_rate": 4.3990000000000004e-05, "loss": 2.2916, "step": 4400 }, { "epoch": 1.729498639720171, "grad_norm": 4.524829387664795, "learning_rate": 4.449e-05, "loss": 2.2706, "step": 4450 }, { "epoch": 1.7489312087057909, "grad_norm": 5.60699987411499, "learning_rate": 4.499e-05, "loss": 2.2751, "step": 4500 }, { "epoch": 1.7683637776914107, "grad_norm": 4.741508960723877, "learning_rate": 4.549000000000001e-05, "loss": 2.2387, "step": 4550 }, { "epoch": 1.7877963466770308, "grad_norm": 4.468010902404785, "learning_rate": 4.599e-05, "loss": 2.2431, "step": 4600 }, { "epoch": 1.8072289156626506, "grad_norm": 3.4687862396240234, "learning_rate": 4.649e-05, "loss": 2.2534, "step": 4650 }, { "epoch": 1.8266614846482705, "grad_norm": 3.7992916107177734, "learning_rate": 4.699e-05, "loss": 2.2387, "step": 4700 }, { "epoch": 1.8460940536338906, "grad_norm": 4.4966936111450195, "learning_rate": 4.749e-05, "loss": 2.2469, "step": 4750 }, { "epoch": 1.8655266226195102, "grad_norm": 3.7579238414764404, "learning_rate": 4.799e-05, "loss": 2.217, "step": 4800 }, { "epoch": 1.8849591916051303, "grad_norm": 5.868619441986084, "learning_rate": 4.8490000000000005e-05, "loss": 2.281, "step": 4850 }, { "epoch": 1.90439176059075, "grad_norm": 3.1073763370513916, "learning_rate": 4.8990000000000004e-05, "loss": 2.2435, "step": 4900 }, { "epoch": 1.92382432957637, "grad_norm": 5.91089391708374, "learning_rate": 4.949e-05, "loss": 2.228, "step": 4950 }, { "epoch": 1.94325689856199, "grad_norm": 4.082432270050049, "learning_rate": 4.999e-05, "loss": 2.1793, "step": 5000 }, { "epoch": 1.94325689856199, "eval_accuracy": 0.007724821454598455, "eval_runtime": 907.4609, "eval_samples_per_second": 22.682, "eval_steps_per_second": 1.418, "step": 5000 }, { "epoch": 1.9626894675476096, "grad_norm": 3.8252387046813965, "learning_rate": 5.0490000000000006e-05, "loss": 2.2716, "step": 5050 }, { "epoch": 1.9821220365332297, "grad_norm": 5.004796504974365, "learning_rate": 5.0990000000000005e-05, "loss": 2.2269, "step": 5100 }, { "epoch": 2.00155460551885, "grad_norm": 3.568448066711426, "learning_rate": 5.149e-05, "loss": 2.2113, "step": 5150 }, { "epoch": 2.0209871745044694, "grad_norm": 3.9365673065185547, "learning_rate": 5.199000000000001e-05, "loss": 2.1322, "step": 5200 }, { "epoch": 2.0404197434900895, "grad_norm": 4.7153401374816895, "learning_rate": 5.249000000000001e-05, "loss": 2.1687, "step": 5250 }, { "epoch": 2.059852312475709, "grad_norm": 3.279733896255493, "learning_rate": 5.2990000000000006e-05, "loss": 2.1613, "step": 5300 }, { "epoch": 2.079284881461329, "grad_norm": 8.057929039001465, "learning_rate": 5.3490000000000005e-05, "loss": 2.152, "step": 5350 }, { "epoch": 2.0987174504469492, "grad_norm": 2.202526330947876, "learning_rate": 5.399000000000001e-05, "loss": 2.1066, "step": 5400 }, { "epoch": 2.118150019432569, "grad_norm": 5.828799247741699, "learning_rate": 5.449000000000001e-05, "loss": 2.152, "step": 5450 }, { "epoch": 2.137582588418189, "grad_norm": 4.265638828277588, "learning_rate": 5.499000000000001e-05, "loss": 2.1494, "step": 5500 }, { "epoch": 2.1570151574038086, "grad_norm": 3.351078748703003, "learning_rate": 5.549e-05, "loss": 2.1429, "step": 5550 }, { "epoch": 2.1764477263894286, "grad_norm": 3.50518798828125, "learning_rate": 5.599e-05, "loss": 2.0642, "step": 5600 }, { "epoch": 2.1958802953750487, "grad_norm": 3.6676487922668457, "learning_rate": 5.6489999999999996e-05, "loss": 2.1673, "step": 5650 }, { "epoch": 2.2153128643606683, "grad_norm": 13.605622291564941, "learning_rate": 5.699e-05, "loss": 2.187, "step": 5700 }, { "epoch": 2.2347454333462884, "grad_norm": 2.574394702911377, "learning_rate": 5.749e-05, "loss": 2.1414, "step": 5750 }, { "epoch": 2.2541780023319085, "grad_norm": 2.6154654026031494, "learning_rate": 5.799e-05, "loss": 2.1965, "step": 5800 }, { "epoch": 2.273610571317528, "grad_norm": 4.553177356719971, "learning_rate": 5.849e-05, "loss": 2.1489, "step": 5850 }, { "epoch": 2.293043140303148, "grad_norm": 5.114558696746826, "learning_rate": 5.899e-05, "loss": 2.1779, "step": 5900 }, { "epoch": 2.312475709288768, "grad_norm": 4.845452785491943, "learning_rate": 5.949e-05, "loss": 2.1091, "step": 5950 }, { "epoch": 2.331908278274388, "grad_norm": 5.508037090301514, "learning_rate": 5.999e-05, "loss": 2.1407, "step": 6000 }, { "epoch": 2.331908278274388, "eval_accuracy": 0.011805859204197638, "eval_runtime": 907.658, "eval_samples_per_second": 22.677, "eval_steps_per_second": 1.418, "step": 6000 }, { "epoch": 2.351340847260008, "grad_norm": 3.4910573959350586, "learning_rate": 6.0490000000000005e-05, "loss": 2.0972, "step": 6050 }, { "epoch": 2.3707734162456275, "grad_norm": 11.334199905395508, "learning_rate": 6.0990000000000004e-05, "loss": 2.1085, "step": 6100 }, { "epoch": 2.3902059852312476, "grad_norm": 4.658063888549805, "learning_rate": 6.149000000000001e-05, "loss": 2.161, "step": 6150 }, { "epoch": 2.4096385542168672, "grad_norm": 4.726357460021973, "learning_rate": 6.199000000000001e-05, "loss": 2.0757, "step": 6200 }, { "epoch": 2.4290711232024873, "grad_norm": 7.879475116729736, "learning_rate": 6.249e-05, "loss": 2.1643, "step": 6250 }, { "epoch": 2.4485036921881074, "grad_norm": 4.898128986358643, "learning_rate": 6.299e-05, "loss": 2.1573, "step": 6300 }, { "epoch": 2.467936261173727, "grad_norm": 4.784426212310791, "learning_rate": 6.349e-05, "loss": 2.1296, "step": 6350 }, { "epoch": 2.487368830159347, "grad_norm": 4.347411155700684, "learning_rate": 6.399e-05, "loss": 2.1221, "step": 6400 }, { "epoch": 2.506801399144967, "grad_norm": 9.684266090393066, "learning_rate": 6.449e-05, "loss": 2.0595, "step": 6450 }, { "epoch": 2.5262339681305868, "grad_norm": 3.320681095123291, "learning_rate": 6.499000000000001e-05, "loss": 2.1178, "step": 6500 }, { "epoch": 2.545666537116207, "grad_norm": 5.14285945892334, "learning_rate": 6.549000000000001e-05, "loss": 2.1096, "step": 6550 }, { "epoch": 2.565099106101827, "grad_norm": 4.604893684387207, "learning_rate": 6.599000000000001e-05, "loss": 2.0953, "step": 6600 }, { "epoch": 2.5845316750874465, "grad_norm": 3.0445802211761475, "learning_rate": 6.649000000000001e-05, "loss": 2.1309, "step": 6650 }, { "epoch": 2.6039642440730666, "grad_norm": 5.163757801055908, "learning_rate": 6.699000000000001e-05, "loss": 2.1532, "step": 6700 }, { "epoch": 2.623396813058686, "grad_norm": 3.930135488510132, "learning_rate": 6.749e-05, "loss": 2.0524, "step": 6750 }, { "epoch": 2.6428293820443063, "grad_norm": 2.7163515090942383, "learning_rate": 6.799e-05, "loss": 2.1163, "step": 6800 }, { "epoch": 2.662261951029926, "grad_norm": 10.727341651916504, "learning_rate": 6.849e-05, "loss": 2.159, "step": 6850 }, { "epoch": 2.681694520015546, "grad_norm": 3.4097917079925537, "learning_rate": 6.899e-05, "loss": 2.093, "step": 6900 }, { "epoch": 2.701127089001166, "grad_norm": 3.5827713012695312, "learning_rate": 6.949e-05, "loss": 2.1316, "step": 6950 }, { "epoch": 2.7205596579867857, "grad_norm": 2.8892476558685303, "learning_rate": 6.999e-05, "loss": 2.0978, "step": 7000 }, { "epoch": 2.7205596579867857, "eval_accuracy": 0.018413253655929652, "eval_runtime": 906.6344, "eval_samples_per_second": 22.703, "eval_steps_per_second": 1.42, "step": 7000 }, { "epoch": 2.7399922269724057, "grad_norm": 3.845646619796753, "learning_rate": 7.049e-05, "loss": 2.0751, "step": 7050 }, { "epoch": 2.759424795958026, "grad_norm": 2.9022414684295654, "learning_rate": 7.099e-05, "loss": 2.112, "step": 7100 }, { "epoch": 2.7788573649436454, "grad_norm": 2.3950488567352295, "learning_rate": 7.149e-05, "loss": 2.0933, "step": 7150 }, { "epoch": 2.7982899339292655, "grad_norm": 4.62535285949707, "learning_rate": 7.199000000000001e-05, "loss": 2.1066, "step": 7200 }, { "epoch": 2.8177225029148856, "grad_norm": 1.856604814529419, "learning_rate": 7.249e-05, "loss": 2.1142, "step": 7250 }, { "epoch": 2.837155071900505, "grad_norm": 5.750202655792236, "learning_rate": 7.299e-05, "loss": 2.1047, "step": 7300 }, { "epoch": 2.8565876408861253, "grad_norm": 3.7773256301879883, "learning_rate": 7.349e-05, "loss": 2.0695, "step": 7350 }, { "epoch": 2.876020209871745, "grad_norm": 3.556065082550049, "learning_rate": 7.399e-05, "loss": 2.1637, "step": 7400 }, { "epoch": 2.895452778857365, "grad_norm": 6.621519565582275, "learning_rate": 7.449e-05, "loss": 2.0937, "step": 7450 }, { "epoch": 2.9148853478429846, "grad_norm": 7.399990558624268, "learning_rate": 7.499e-05, "loss": 2.1091, "step": 7500 }, { "epoch": 2.9343179168286047, "grad_norm": 5.250476360321045, "learning_rate": 7.549000000000001e-05, "loss": 2.0978, "step": 7550 }, { "epoch": 2.9537504858142247, "grad_norm": 2.741161823272705, "learning_rate": 7.599000000000001e-05, "loss": 2.093, "step": 7600 }, { "epoch": 2.9731830547998443, "grad_norm": 2.7134132385253906, "learning_rate": 7.649000000000001e-05, "loss": 2.0529, "step": 7650 }, { "epoch": 2.9926156237854644, "grad_norm": 4.2654194831848145, "learning_rate": 7.699e-05, "loss": 2.0865, "step": 7700 }, { "epoch": 3.0120481927710845, "grad_norm": 4.368113994598389, "learning_rate": 7.749e-05, "loss": 2.0073, "step": 7750 }, { "epoch": 3.031480761756704, "grad_norm": 3.039257526397705, "learning_rate": 7.799e-05, "loss": 1.981, "step": 7800 }, { "epoch": 3.050913330742324, "grad_norm": 2.098691463470459, "learning_rate": 7.849e-05, "loss": 1.9836, "step": 7850 }, { "epoch": 3.0703458997279442, "grad_norm": 3.731783151626587, "learning_rate": 7.899000000000001e-05, "loss": 1.978, "step": 7900 }, { "epoch": 3.089778468713564, "grad_norm": 2.487757682800293, "learning_rate": 7.949000000000001e-05, "loss": 1.9576, "step": 7950 }, { "epoch": 3.109211037699184, "grad_norm": 7.170228481292725, "learning_rate": 7.999000000000001e-05, "loss": 1.9401, "step": 8000 }, { "epoch": 3.109211037699184, "eval_accuracy": 0.036243501919059416, "eval_runtime": 907.4435, "eval_samples_per_second": 22.682, "eval_steps_per_second": 1.418, "step": 8000 }, { "epoch": 3.1286436066848036, "grad_norm": 3.2367658615112305, "learning_rate": 8.049e-05, "loss": 1.9768, "step": 8050 }, { "epoch": 3.1480761756704236, "grad_norm": 11.398568153381348, "learning_rate": 8.099e-05, "loss": 1.9919, "step": 8100 }, { "epoch": 3.1675087446560437, "grad_norm": 4.276248931884766, "learning_rate": 8.149e-05, "loss": 1.9771, "step": 8150 }, { "epoch": 3.1869413136416633, "grad_norm": 4.85842752456665, "learning_rate": 8.199e-05, "loss": 1.9818, "step": 8200 }, { "epoch": 3.2063738826272834, "grad_norm": 2.933837652206421, "learning_rate": 8.249e-05, "loss": 1.9883, "step": 8250 }, { "epoch": 3.225806451612903, "grad_norm": 4.235050678253174, "learning_rate": 8.299e-05, "loss": 1.9446, "step": 8300 }, { "epoch": 3.245239020598523, "grad_norm": 2.48342227935791, "learning_rate": 8.349e-05, "loss": 1.9867, "step": 8350 }, { "epoch": 3.264671589584143, "grad_norm": 6.0085530281066895, "learning_rate": 8.399e-05, "loss": 1.9291, "step": 8400 }, { "epoch": 3.284104158569763, "grad_norm": 3.1313884258270264, "learning_rate": 8.449e-05, "loss": 1.9675, "step": 8450 }, { "epoch": 3.303536727555383, "grad_norm": 5.40889835357666, "learning_rate": 8.499e-05, "loss": 1.9775, "step": 8500 }, { "epoch": 3.322969296541003, "grad_norm": 2.2098782062530518, "learning_rate": 8.549000000000001e-05, "loss": 1.9164, "step": 8550 }, { "epoch": 3.3424018655266226, "grad_norm": 3.1617531776428223, "learning_rate": 8.599000000000001e-05, "loss": 1.9841, "step": 8600 }, { "epoch": 3.3618344345122426, "grad_norm": 5.392375946044922, "learning_rate": 8.649000000000001e-05, "loss": 1.9923, "step": 8650 }, { "epoch": 3.3812670034978622, "grad_norm": 2.839928150177002, "learning_rate": 8.699e-05, "loss": 2.0026, "step": 8700 }, { "epoch": 3.4006995724834823, "grad_norm": 4.040922164916992, "learning_rate": 8.749e-05, "loss": 1.9445, "step": 8750 }, { "epoch": 3.4201321414691024, "grad_norm": 8.354971885681152, "learning_rate": 8.799e-05, "loss": 1.9008, "step": 8800 }, { "epoch": 3.439564710454722, "grad_norm": 3.485241174697876, "learning_rate": 8.849e-05, "loss": 1.9589, "step": 8850 }, { "epoch": 3.458997279440342, "grad_norm": 31.03635597229004, "learning_rate": 8.899e-05, "loss": 1.9488, "step": 8900 }, { "epoch": 3.4784298484259617, "grad_norm": 2.6878533363342285, "learning_rate": 8.949000000000001e-05, "loss": 1.9127, "step": 8950 }, { "epoch": 3.4978624174115818, "grad_norm": 2.712730646133423, "learning_rate": 8.999000000000001e-05, "loss": 1.907, "step": 9000 }, { "epoch": 3.4978624174115818, "eval_accuracy": 0.04727202059952388, "eval_runtime": 908.1635, "eval_samples_per_second": 22.664, "eval_steps_per_second": 1.417, "step": 9000 }, { "epoch": 3.517294986397202, "grad_norm": 2.0906474590301514, "learning_rate": 9.049000000000001e-05, "loss": 1.9679, "step": 9050 }, { "epoch": 3.5367275553828215, "grad_norm": 4.546792507171631, "learning_rate": 9.099000000000001e-05, "loss": 1.9406, "step": 9100 }, { "epoch": 3.5561601243684415, "grad_norm": 5.863422870635986, "learning_rate": 9.149e-05, "loss": 1.9497, "step": 9150 }, { "epoch": 3.5755926933540616, "grad_norm": 3.8730051517486572, "learning_rate": 9.199e-05, "loss": 1.949, "step": 9200 }, { "epoch": 3.5950252623396812, "grad_norm": 8.266348838806152, "learning_rate": 9.249e-05, "loss": 1.9245, "step": 9250 }, { "epoch": 3.6144578313253013, "grad_norm": 3.6021506786346436, "learning_rate": 9.299e-05, "loss": 1.9254, "step": 9300 }, { "epoch": 3.6338904003109214, "grad_norm": 3.5761210918426514, "learning_rate": 9.349e-05, "loss": 1.9584, "step": 9350 }, { "epoch": 3.653322969296541, "grad_norm": 3.0531485080718994, "learning_rate": 9.399e-05, "loss": 1.9022, "step": 9400 }, { "epoch": 3.672755538282161, "grad_norm": 4.91207218170166, "learning_rate": 9.449e-05, "loss": 1.9076, "step": 9450 }, { "epoch": 3.6921881072677807, "grad_norm": 3.6810998916625977, "learning_rate": 9.499e-05, "loss": 1.9406, "step": 9500 }, { "epoch": 3.7116206762534008, "grad_norm": 4.888444423675537, "learning_rate": 9.549e-05, "loss": 1.9407, "step": 9550 }, { "epoch": 3.7310532452390204, "grad_norm": 3.0702097415924072, "learning_rate": 9.599000000000001e-05, "loss": 1.9525, "step": 9600 }, { "epoch": 3.7504858142246404, "grad_norm": 3.8583812713623047, "learning_rate": 9.649e-05, "loss": 1.9311, "step": 9650 }, { "epoch": 3.7699183832102605, "grad_norm": 6.406527042388916, "learning_rate": 9.699e-05, "loss": 1.911, "step": 9700 }, { "epoch": 3.78935095219588, "grad_norm": 3.266839027404785, "learning_rate": 9.749e-05, "loss": 1.922, "step": 9750 }, { "epoch": 3.8087835211815, "grad_norm": 3.8981516361236572, "learning_rate": 9.799e-05, "loss": 1.9372, "step": 9800 }, { "epoch": 3.8282160901671203, "grad_norm": 2.969425916671753, "learning_rate": 9.849e-05, "loss": 1.9577, "step": 9850 }, { "epoch": 3.84764865915274, "grad_norm": 3.436954975128174, "learning_rate": 9.899e-05, "loss": 1.957, "step": 9900 }, { "epoch": 3.86708122813836, "grad_norm": 7.24133825302124, "learning_rate": 9.949000000000001e-05, "loss": 1.8949, "step": 9950 }, { "epoch": 3.88651379712398, "grad_norm": 3.6599984169006348, "learning_rate": 9.999000000000001e-05, "loss": 1.9428, "step": 10000 }, { "epoch": 3.88651379712398, "eval_accuracy": 0.07073798765971918, "eval_runtime": 907.1728, "eval_samples_per_second": 22.689, "eval_steps_per_second": 1.419, "step": 10000 }, { "epoch": 3.9059463661095997, "grad_norm": 3.687016010284424, "learning_rate": 9.99742105263158e-05, "loss": 1.9132, "step": 10050 }, { "epoch": 3.9253789350952197, "grad_norm": 4.09041166305542, "learning_rate": 9.994789473684211e-05, "loss": 1.9465, "step": 10100 }, { "epoch": 3.9448115040808394, "grad_norm": 3.4802134037017822, "learning_rate": 9.992157894736842e-05, "loss": 1.9263, "step": 10150 }, { "epoch": 3.9642440730664594, "grad_norm": 2.9187519550323486, "learning_rate": 9.989526315789473e-05, "loss": 1.954, "step": 10200 }, { "epoch": 3.983676642052079, "grad_norm": 4.464130878448486, "learning_rate": 9.986894736842106e-05, "loss": 1.8683, "step": 10250 }, { "epoch": 4.0031092110377, "grad_norm": 57.50473403930664, "learning_rate": 9.984263157894738e-05, "loss": 1.9317, "step": 10300 }, { "epoch": 4.022541780023319, "grad_norm": 3.390007734298706, "learning_rate": 9.98163157894737e-05, "loss": 1.7154, "step": 10350 }, { "epoch": 4.041974349008939, "grad_norm": 4.305722236633301, "learning_rate": 9.979e-05, "loss": 1.762, "step": 10400 }, { "epoch": 4.061406917994558, "grad_norm": 3.997655153274536, "learning_rate": 9.976368421052632e-05, "loss": 1.7644, "step": 10450 }, { "epoch": 4.080839486980179, "grad_norm": 4.27736759185791, "learning_rate": 9.973736842105263e-05, "loss": 1.7189, "step": 10500 }, { "epoch": 4.100272055965799, "grad_norm": 16.003461837768555, "learning_rate": 9.971105263157895e-05, "loss": 1.7184, "step": 10550 }, { "epoch": 4.119704624951418, "grad_norm": 4.10724401473999, "learning_rate": 9.968473684210526e-05, "loss": 1.7408, "step": 10600 }, { "epoch": 4.139137193937039, "grad_norm": 4.501052379608154, "learning_rate": 9.965842105263158e-05, "loss": 1.7309, "step": 10650 }, { "epoch": 4.158569762922658, "grad_norm": 4.458967208862305, "learning_rate": 9.96321052631579e-05, "loss": 1.7234, "step": 10700 }, { "epoch": 4.178002331908278, "grad_norm": 7.051727771759033, "learning_rate": 9.960578947368421e-05, "loss": 1.711, "step": 10750 }, { "epoch": 4.1974349008938985, "grad_norm": 4.20425271987915, "learning_rate": 9.957947368421054e-05, "loss": 1.7405, "step": 10800 }, { "epoch": 4.216867469879518, "grad_norm": 4.120607852935791, "learning_rate": 9.955315789473685e-05, "loss": 1.7183, "step": 10850 }, { "epoch": 4.236300038865138, "grad_norm": 3.240799903869629, "learning_rate": 9.952684210526316e-05, "loss": 1.7711, "step": 10900 }, { "epoch": 4.255732607850758, "grad_norm": 5.256502628326416, "learning_rate": 9.950052631578947e-05, "loss": 1.7256, "step": 10950 }, { "epoch": 4.275165176836378, "grad_norm": 6.0444560050964355, "learning_rate": 9.94742105263158e-05, "loss": 1.6995, "step": 11000 }, { "epoch": 4.275165176836378, "eval_accuracy": 0.13307098090657338, "eval_runtime": 906.647, "eval_samples_per_second": 22.702, "eval_steps_per_second": 1.42, "step": 11000 }, { "epoch": 4.2945977458219975, "grad_norm": 3.814336061477661, "learning_rate": 9.944789473684211e-05, "loss": 1.7494, "step": 11050 }, { "epoch": 4.314030314807617, "grad_norm": 5.616982460021973, "learning_rate": 9.942157894736842e-05, "loss": 1.7516, "step": 11100 }, { "epoch": 4.333462883793238, "grad_norm": 4.849693775177002, "learning_rate": 9.939526315789475e-05, "loss": 1.726, "step": 11150 }, { "epoch": 4.352895452778857, "grad_norm": 4.43636417388916, "learning_rate": 9.936894736842106e-05, "loss": 1.7132, "step": 11200 }, { "epoch": 4.372328021764477, "grad_norm": 4.7518439292907715, "learning_rate": 9.934263157894737e-05, "loss": 1.7164, "step": 11250 }, { "epoch": 4.391760590750097, "grad_norm": 5.752714157104492, "learning_rate": 9.93163157894737e-05, "loss": 1.7357, "step": 11300 }, { "epoch": 4.411193159735717, "grad_norm": 5.446049690246582, "learning_rate": 9.929e-05, "loss": 1.7636, "step": 11350 }, { "epoch": 4.430625728721337, "grad_norm": 5.5599775314331055, "learning_rate": 9.926368421052632e-05, "loss": 1.7316, "step": 11400 }, { "epoch": 4.450058297706957, "grad_norm": 4.994411945343018, "learning_rate": 9.923736842105263e-05, "loss": 1.6766, "step": 11450 }, { "epoch": 4.469490866692577, "grad_norm": 6.706633567810059, "learning_rate": 9.921105263157895e-05, "loss": 1.6803, "step": 11500 }, { "epoch": 4.488923435678196, "grad_norm": 5.894743919372559, "learning_rate": 9.918473684210527e-05, "loss": 1.7184, "step": 11550 }, { "epoch": 4.508356004663817, "grad_norm": 3.859015941619873, "learning_rate": 9.915842105263158e-05, "loss": 1.7238, "step": 11600 }, { "epoch": 4.5277885736494365, "grad_norm": 5.355344772338867, "learning_rate": 9.91321052631579e-05, "loss": 1.7641, "step": 11650 }, { "epoch": 4.547221142635056, "grad_norm": 9.785111427307129, "learning_rate": 9.910578947368421e-05, "loss": 1.705, "step": 11700 }, { "epoch": 4.566653711620676, "grad_norm": 3.365846633911133, "learning_rate": 9.907947368421054e-05, "loss": 1.6986, "step": 11750 }, { "epoch": 4.586086280606296, "grad_norm": 3.3706541061401367, "learning_rate": 9.905315789473685e-05, "loss": 1.7428, "step": 11800 }, { "epoch": 4.605518849591916, "grad_norm": 4.395911693572998, "learning_rate": 9.902684210526316e-05, "loss": 1.7401, "step": 11850 }, { "epoch": 4.624951418577536, "grad_norm": 3.5814549922943115, "learning_rate": 9.900052631578947e-05, "loss": 1.7055, "step": 11900 }, { "epoch": 4.644383987563156, "grad_norm": 6.163331031799316, "learning_rate": 9.897421052631579e-05, "loss": 1.7195, "step": 11950 }, { "epoch": 4.663816556548776, "grad_norm": 7.837599277496338, "learning_rate": 9.894789473684211e-05, "loss": 1.6419, "step": 12000 }, { "epoch": 4.663816556548776, "eval_accuracy": 0.1958412281980275, "eval_runtime": 912.2477, "eval_samples_per_second": 22.563, "eval_steps_per_second": 1.411, "step": 12000 }, { "epoch": 4.683249125534395, "grad_norm": 4.789027690887451, "learning_rate": 9.892157894736842e-05, "loss": 1.7383, "step": 12050 }, { "epoch": 4.702681694520016, "grad_norm": 3.7628960609436035, "learning_rate": 9.889526315789475e-05, "loss": 1.6448, "step": 12100 }, { "epoch": 4.7221142635056355, "grad_norm": 4.155508041381836, "learning_rate": 9.886894736842106e-05, "loss": 1.6326, "step": 12150 }, { "epoch": 4.741546832491255, "grad_norm": 4.6860785484313965, "learning_rate": 9.884263157894737e-05, "loss": 1.669, "step": 12200 }, { "epoch": 4.760979401476876, "grad_norm": 9.118622779846191, "learning_rate": 9.88163157894737e-05, "loss": 1.7108, "step": 12250 }, { "epoch": 4.780411970462495, "grad_norm": 5.970766067504883, "learning_rate": 9.879000000000001e-05, "loss": 1.6809, "step": 12300 }, { "epoch": 4.799844539448115, "grad_norm": 6.902002811431885, "learning_rate": 9.876368421052632e-05, "loss": 1.7195, "step": 12350 }, { "epoch": 4.8192771084337345, "grad_norm": 8.516134262084961, "learning_rate": 9.873736842105263e-05, "loss": 1.6597, "step": 12400 }, { "epoch": 4.838709677419355, "grad_norm": 5.790510654449463, "learning_rate": 9.871105263157894e-05, "loss": 1.6695, "step": 12450 }, { "epoch": 4.858142246404975, "grad_norm": 7.634342193603516, "learning_rate": 9.868473684210527e-05, "loss": 1.6527, "step": 12500 }, { "epoch": 4.877574815390595, "grad_norm": 4.296464920043945, "learning_rate": 9.865842105263159e-05, "loss": 1.6022, "step": 12550 }, { "epoch": 4.897007384376215, "grad_norm": 7.289546489715576, "learning_rate": 9.86321052631579e-05, "loss": 1.7196, "step": 12600 }, { "epoch": 4.916439953361834, "grad_norm": 4.080571174621582, "learning_rate": 9.860578947368422e-05, "loss": 1.6623, "step": 12650 }, { "epoch": 4.935872522347454, "grad_norm": 6.348351001739502, "learning_rate": 9.857947368421053e-05, "loss": 1.638, "step": 12700 }, { "epoch": 4.9553050913330745, "grad_norm": 5.687911033630371, "learning_rate": 9.855315789473685e-05, "loss": 1.6331, "step": 12750 }, { "epoch": 4.974737660318694, "grad_norm": 4.747588634490967, "learning_rate": 9.852684210526316e-05, "loss": 1.6322, "step": 12800 }, { "epoch": 4.994170229304314, "grad_norm": 3.4772491455078125, "learning_rate": 9.850052631578948e-05, "loss": 1.6759, "step": 12850 }, { "epoch": 5.013602798289934, "grad_norm": 7.125967502593994, "learning_rate": 9.847421052631579e-05, "loss": 1.5166, "step": 12900 }, { "epoch": 5.033035367275554, "grad_norm": 7.8761305809021, "learning_rate": 9.84478947368421e-05, "loss": 1.4225, "step": 12950 }, { "epoch": 5.0524679362611735, "grad_norm": 4.948407173156738, "learning_rate": 9.842157894736842e-05, "loss": 1.4377, "step": 13000 }, { "epoch": 5.0524679362611735, "eval_accuracy": 0.28732449108487584, "eval_runtime": 908.7257, "eval_samples_per_second": 22.65, "eval_steps_per_second": 1.416, "step": 13000 }, { "epoch": 5.071900505246794, "grad_norm": 4.546520709991455, "learning_rate": 9.839526315789475e-05, "loss": 1.462, "step": 13050 }, { "epoch": 5.091333074232414, "grad_norm": 3.945157051086426, "learning_rate": 9.836894736842106e-05, "loss": 1.4214, "step": 13100 }, { "epoch": 5.110765643218033, "grad_norm": 5.798865795135498, "learning_rate": 9.834263157894737e-05, "loss": 1.3673, "step": 13150 }, { "epoch": 5.130198212203653, "grad_norm": 5.186479568481445, "learning_rate": 9.831631578947368e-05, "loss": 1.425, "step": 13200 }, { "epoch": 5.149630781189273, "grad_norm": 6.136054992675781, "learning_rate": 9.829000000000001e-05, "loss": 1.399, "step": 13250 }, { "epoch": 5.169063350174893, "grad_norm": 5.4267191886901855, "learning_rate": 9.826368421052632e-05, "loss": 1.3612, "step": 13300 }, { "epoch": 5.188495919160513, "grad_norm": 6.774726390838623, "learning_rate": 9.823736842105263e-05, "loss": 1.4772, "step": 13350 }, { "epoch": 5.207928488146133, "grad_norm": 3.9114222526550293, "learning_rate": 9.821105263157894e-05, "loss": 1.4336, "step": 13400 }, { "epoch": 5.227361057131753, "grad_norm": 5.662858963012695, "learning_rate": 9.818473684210527e-05, "loss": 1.4467, "step": 13450 }, { "epoch": 5.246793626117372, "grad_norm": 4.636590957641602, "learning_rate": 9.81584210526316e-05, "loss": 1.4401, "step": 13500 }, { "epoch": 5.266226195102993, "grad_norm": 3.994364023208618, "learning_rate": 9.81321052631579e-05, "loss": 1.4181, "step": 13550 }, { "epoch": 5.285658764088613, "grad_norm": 7.642608642578125, "learning_rate": 9.810578947368422e-05, "loss": 1.3862, "step": 13600 }, { "epoch": 5.305091333074232, "grad_norm": 6.14906120300293, "learning_rate": 9.807947368421053e-05, "loss": 1.3952, "step": 13650 }, { "epoch": 5.324523902059853, "grad_norm": 6.9564127922058105, "learning_rate": 9.805315789473684e-05, "loss": 1.3662, "step": 13700 }, { "epoch": 5.343956471045472, "grad_norm": 5.573160648345947, "learning_rate": 9.802684210526317e-05, "loss": 1.408, "step": 13750 }, { "epoch": 5.363389040031092, "grad_norm": 8.71044921875, "learning_rate": 9.800052631578948e-05, "loss": 1.4109, "step": 13800 }, { "epoch": 5.3828216090167125, "grad_norm": 7.305654525756836, "learning_rate": 9.797421052631579e-05, "loss": 1.3941, "step": 13850 }, { "epoch": 5.402254178002332, "grad_norm": 11.20594596862793, "learning_rate": 9.79478947368421e-05, "loss": 1.3478, "step": 13900 }, { "epoch": 5.421686746987952, "grad_norm": 7.136280059814453, "learning_rate": 9.792157894736843e-05, "loss": 1.4619, "step": 13950 }, { "epoch": 5.441119315973571, "grad_norm": 16.48175048828125, "learning_rate": 9.789526315789475e-05, "loss": 1.3761, "step": 14000 }, { "epoch": 5.441119315973571, "eval_accuracy": 0.38327746198319, "eval_runtime": 910.388, "eval_samples_per_second": 22.609, "eval_steps_per_second": 1.414, "step": 14000 }, { "epoch": 5.460551884959192, "grad_norm": 11.419659614562988, "learning_rate": 9.786894736842106e-05, "loss": 1.3115, "step": 14050 }, { "epoch": 5.4799844539448115, "grad_norm": 5.9120869636535645, "learning_rate": 9.784263157894737e-05, "loss": 1.4261, "step": 14100 }, { "epoch": 5.499417022930431, "grad_norm": 7.246486663818359, "learning_rate": 9.781631578947369e-05, "loss": 1.374, "step": 14150 }, { "epoch": 5.518849591916052, "grad_norm": 7.775355815887451, "learning_rate": 9.779e-05, "loss": 1.3663, "step": 14200 }, { "epoch": 5.538282160901671, "grad_norm": 7.888273239135742, "learning_rate": 9.776368421052632e-05, "loss": 1.363, "step": 14250 }, { "epoch": 5.557714729887291, "grad_norm": 5.733011722564697, "learning_rate": 9.773736842105263e-05, "loss": 1.4194, "step": 14300 }, { "epoch": 5.577147298872911, "grad_norm": 10.086505889892578, "learning_rate": 9.771105263157895e-05, "loss": 1.3748, "step": 14350 }, { "epoch": 5.596579867858531, "grad_norm": 4.082585334777832, "learning_rate": 9.768473684210527e-05, "loss": 1.4055, "step": 14400 }, { "epoch": 5.616012436844151, "grad_norm": 8.12739086151123, "learning_rate": 9.765842105263158e-05, "loss": 1.371, "step": 14450 }, { "epoch": 5.635445005829771, "grad_norm": 5.28172492980957, "learning_rate": 9.763210526315791e-05, "loss": 1.3121, "step": 14500 }, { "epoch": 5.654877574815391, "grad_norm": 6.072118759155273, "learning_rate": 9.760578947368422e-05, "loss": 1.3478, "step": 14550 }, { "epoch": 5.67431014380101, "grad_norm": 4.58156681060791, "learning_rate": 9.757947368421053e-05, "loss": 1.3545, "step": 14600 }, { "epoch": 5.69374271278663, "grad_norm": 8.339735984802246, "learning_rate": 9.755315789473684e-05, "loss": 1.3544, "step": 14650 }, { "epoch": 5.7131752817722505, "grad_norm": 6.120683670043945, "learning_rate": 9.752684210526317e-05, "loss": 1.3693, "step": 14700 }, { "epoch": 5.73260785075787, "grad_norm": 5.467242240905762, "learning_rate": 9.750052631578948e-05, "loss": 1.3501, "step": 14750 }, { "epoch": 5.75204041974349, "grad_norm": 6.579073429107666, "learning_rate": 9.747421052631579e-05, "loss": 1.3322, "step": 14800 }, { "epoch": 5.77147298872911, "grad_norm": 6.063732624053955, "learning_rate": 9.74478947368421e-05, "loss": 1.353, "step": 14850 }, { "epoch": 5.79090555771473, "grad_norm": 6.175238132476807, "learning_rate": 9.742157894736843e-05, "loss": 1.3488, "step": 14900 }, { "epoch": 5.8103381267003495, "grad_norm": 9.881817817687988, "learning_rate": 9.739526315789474e-05, "loss": 1.2767, "step": 14950 }, { "epoch": 5.82977069568597, "grad_norm": 6.4397759437561035, "learning_rate": 9.736894736842106e-05, "loss": 1.3274, "step": 15000 }, { "epoch": 5.82977069568597, "eval_accuracy": 0.4773356653549045, "eval_runtime": 906.2985, "eval_samples_per_second": 22.711, "eval_steps_per_second": 1.42, "step": 15000 }, { "epoch": 5.84920326467159, "grad_norm": 7.608861446380615, "learning_rate": 9.734263157894738e-05, "loss": 1.2751, "step": 15050 }, { "epoch": 5.868635833657209, "grad_norm": 5.615549087524414, "learning_rate": 9.731631578947369e-05, "loss": 1.3654, "step": 15100 }, { "epoch": 5.88806840264283, "grad_norm": 7.904622554779053, "learning_rate": 9.729e-05, "loss": 1.3844, "step": 15150 }, { "epoch": 5.9075009716284494, "grad_norm": 4.8503851890563965, "learning_rate": 9.726368421052632e-05, "loss": 1.3207, "step": 15200 }, { "epoch": 5.926933540614069, "grad_norm": 5.832537651062012, "learning_rate": 9.723736842105264e-05, "loss": 1.3454, "step": 15250 }, { "epoch": 5.946366109599689, "grad_norm": 7.324341773986816, "learning_rate": 9.721105263157895e-05, "loss": 1.3522, "step": 15300 }, { "epoch": 5.965798678585309, "grad_norm": 7.22886848449707, "learning_rate": 9.718473684210527e-05, "loss": 1.3406, "step": 15350 }, { "epoch": 5.985231247570929, "grad_norm": 10.675936698913574, "learning_rate": 9.715842105263158e-05, "loss": 1.3592, "step": 15400 }, { "epoch": 6.0046638165565485, "grad_norm": 4.880441188812256, "learning_rate": 9.713210526315791e-05, "loss": 1.2014, "step": 15450 }, { "epoch": 6.024096385542169, "grad_norm": 9.532160758972168, "learning_rate": 9.710578947368422e-05, "loss": 1.0553, "step": 15500 }, { "epoch": 6.043528954527789, "grad_norm": 6.296610355377197, "learning_rate": 9.707947368421053e-05, "loss": 1.0809, "step": 15550 }, { "epoch": 6.062961523513408, "grad_norm": 6.660314559936523, "learning_rate": 9.705315789473684e-05, "loss": 1.0264, "step": 15600 }, { "epoch": 6.082394092499029, "grad_norm": 7.901432514190674, "learning_rate": 9.702684210526316e-05, "loss": 1.153, "step": 15650 }, { "epoch": 6.101826661484648, "grad_norm": 73.9762954711914, "learning_rate": 9.700052631578948e-05, "loss": 1.1122, "step": 15700 }, { "epoch": 6.121259230470268, "grad_norm": 6.655057430267334, "learning_rate": 9.697421052631579e-05, "loss": 1.0927, "step": 15750 }, { "epoch": 6.1406917994558885, "grad_norm": 6.886069297790527, "learning_rate": 9.694789473684212e-05, "loss": 0.9707, "step": 15800 }, { "epoch": 6.160124368441508, "grad_norm": 6.370534420013428, "learning_rate": 9.692157894736843e-05, "loss": 1.0316, "step": 15850 }, { "epoch": 6.179556937427128, "grad_norm": 8.026642799377441, "learning_rate": 9.689526315789474e-05, "loss": 1.0855, "step": 15900 }, { "epoch": 6.198989506412747, "grad_norm": 4.1472859382629395, "learning_rate": 9.686894736842107e-05, "loss": 1.0527, "step": 15950 }, { "epoch": 6.218422075398368, "grad_norm": 13.967721939086914, "learning_rate": 9.684263157894738e-05, "loss": 1.0484, "step": 16000 }, { "epoch": 6.218422075398368, "eval_accuracy": 0.564494971578487, "eval_runtime": 907.0139, "eval_samples_per_second": 22.693, "eval_steps_per_second": 1.419, "step": 16000 }, { "epoch": 6.2378546443839875, "grad_norm": 8.06767463684082, "learning_rate": 9.681631578947369e-05, "loss": 1.0868, "step": 16050 }, { "epoch": 6.257287213369607, "grad_norm": 5.721848487854004, "learning_rate": 9.679e-05, "loss": 1.0125, "step": 16100 }, { "epoch": 6.276719782355228, "grad_norm": 7.221185207366943, "learning_rate": 9.676368421052631e-05, "loss": 1.0789, "step": 16150 }, { "epoch": 6.296152351340847, "grad_norm": 6.3180670738220215, "learning_rate": 9.673736842105264e-05, "loss": 1.08, "step": 16200 }, { "epoch": 6.315584920326467, "grad_norm": 5.97916841506958, "learning_rate": 9.671105263157895e-05, "loss": 1.045, "step": 16250 }, { "epoch": 6.335017489312087, "grad_norm": 5.4424967765808105, "learning_rate": 9.668473684210527e-05, "loss": 1.0835, "step": 16300 }, { "epoch": 6.354450058297707, "grad_norm": 5.190217018127441, "learning_rate": 9.665842105263158e-05, "loss": 1.0738, "step": 16350 }, { "epoch": 6.373882627283327, "grad_norm": 6.745716571807861, "learning_rate": 9.66321052631579e-05, "loss": 1.0482, "step": 16400 }, { "epoch": 6.393315196268947, "grad_norm": 5.413162708282471, "learning_rate": 9.660578947368422e-05, "loss": 1.073, "step": 16450 }, { "epoch": 6.412747765254567, "grad_norm": 7.720066547393799, "learning_rate": 9.657947368421053e-05, "loss": 1.0818, "step": 16500 }, { "epoch": 6.432180334240186, "grad_norm": 7.573994159698486, "learning_rate": 9.655315789473684e-05, "loss": 1.0531, "step": 16550 }, { "epoch": 6.451612903225806, "grad_norm": 4.831798076629639, "learning_rate": 9.652684210526316e-05, "loss": 1.037, "step": 16600 }, { "epoch": 6.471045472211427, "grad_norm": 4.910946846008301, "learning_rate": 9.650052631578947e-05, "loss": 0.9816, "step": 16650 }, { "epoch": 6.490478041197046, "grad_norm": 3.0695698261260986, "learning_rate": 9.647421052631579e-05, "loss": 1.13, "step": 16700 }, { "epoch": 6.509910610182666, "grad_norm": 7.250815391540527, "learning_rate": 9.644789473684212e-05, "loss": 1.0495, "step": 16750 }, { "epoch": 6.529343179168286, "grad_norm": 7.248625755310059, "learning_rate": 9.642157894736843e-05, "loss": 1.0516, "step": 16800 }, { "epoch": 6.548775748153906, "grad_norm": 7.137625694274902, "learning_rate": 9.639526315789474e-05, "loss": 0.9927, "step": 16850 }, { "epoch": 6.568208317139526, "grad_norm": 8.5234956741333, "learning_rate": 9.636894736842105e-05, "loss": 1.0574, "step": 16900 }, { "epoch": 6.587640886125146, "grad_norm": 11.187993049621582, "learning_rate": 9.634263157894738e-05, "loss": 1.0173, "step": 16950 }, { "epoch": 6.607073455110766, "grad_norm": 7.501619815826416, "learning_rate": 9.631631578947369e-05, "loss": 1.0407, "step": 17000 }, { "epoch": 6.607073455110766, "eval_accuracy": 0.644949715784871, "eval_runtime": 908.2934, "eval_samples_per_second": 22.661, "eval_steps_per_second": 1.417, "step": 17000 }, { "epoch": 6.626506024096385, "grad_norm": 8.143706321716309, "learning_rate": 9.629e-05, "loss": 1.0366, "step": 17050 }, { "epoch": 6.645938593082006, "grad_norm": 7.184727191925049, "learning_rate": 9.626368421052631e-05, "loss": 1.0755, "step": 17100 }, { "epoch": 6.6653711620676255, "grad_norm": 9.023209571838379, "learning_rate": 9.623736842105264e-05, "loss": 1.0336, "step": 17150 }, { "epoch": 6.684803731053245, "grad_norm": 6.733196258544922, "learning_rate": 9.621105263157896e-05, "loss": 0.9945, "step": 17200 }, { "epoch": 6.704236300038865, "grad_norm": 8.987995147705078, "learning_rate": 9.618473684210527e-05, "loss": 1.0559, "step": 17250 }, { "epoch": 6.723668869024485, "grad_norm": 5.8680853843688965, "learning_rate": 9.615842105263159e-05, "loss": 1.0421, "step": 17300 }, { "epoch": 6.743101438010105, "grad_norm": 9.718525886535645, "learning_rate": 9.61321052631579e-05, "loss": 1.0841, "step": 17350 }, { "epoch": 6.7625340069957245, "grad_norm": 5.664957046508789, "learning_rate": 9.610578947368421e-05, "loss": 1.0099, "step": 17400 }, { "epoch": 6.781966575981345, "grad_norm": 5.648830413818359, "learning_rate": 9.607947368421053e-05, "loss": 0.9931, "step": 17450 }, { "epoch": 6.801399144966965, "grad_norm": 16.464553833007812, "learning_rate": 9.605315789473685e-05, "loss": 0.9833, "step": 17500 }, { "epoch": 6.820831713952584, "grad_norm": 9.624323844909668, "learning_rate": 9.602684210526316e-05, "loss": 1.0671, "step": 17550 }, { "epoch": 6.840264282938205, "grad_norm": 5.411578178405762, "learning_rate": 9.600052631578947e-05, "loss": 1.0591, "step": 17600 }, { "epoch": 6.859696851923824, "grad_norm": 6.365889549255371, "learning_rate": 9.59742105263158e-05, "loss": 1.0182, "step": 17650 }, { "epoch": 6.879129420909444, "grad_norm": 9.195286750793457, "learning_rate": 9.594789473684212e-05, "loss": 1.0284, "step": 17700 }, { "epoch": 6.8985619898950645, "grad_norm": 8.23979377746582, "learning_rate": 9.592157894736843e-05, "loss": 1.0397, "step": 17750 }, { "epoch": 6.917994558880684, "grad_norm": 6.512070178985596, "learning_rate": 9.589526315789474e-05, "loss": 1.0816, "step": 17800 }, { "epoch": 6.937427127866304, "grad_norm": 8.572783470153809, "learning_rate": 9.586894736842105e-05, "loss": 1.0445, "step": 17850 }, { "epoch": 6.956859696851923, "grad_norm": 7.494849681854248, "learning_rate": 9.584263157894737e-05, "loss": 0.9656, "step": 17900 }, { "epoch": 6.976292265837544, "grad_norm": 7.2482123374938965, "learning_rate": 9.581631578947369e-05, "loss": 0.9848, "step": 17950 }, { "epoch": 6.9957248348231635, "grad_norm": 8.3186674118042, "learning_rate": 9.579e-05, "loss": 0.9241, "step": 18000 }, { "epoch": 6.9957248348231635, "eval_accuracy": 0.7144245250935238, "eval_runtime": 908.8783, "eval_samples_per_second": 22.647, "eval_steps_per_second": 1.416, "step": 18000 }, { "epoch": 14.025650991061019, "grad_norm": 3.8534250259399414, "learning_rate": 9.576368421052631e-05, "loss": 0.7464, "step": 18050 }, { "epoch": 14.064516129032258, "grad_norm": 5.60610294342041, "learning_rate": 9.573736842105263e-05, "loss": 0.7384, "step": 18100 }, { "epoch": 14.103381267003497, "grad_norm": 4.310544967651367, "learning_rate": 9.571105263157895e-05, "loss": 0.7456, "step": 18150 }, { "epoch": 14.142246404974738, "grad_norm": 4.901150226593018, "learning_rate": 9.568473684210528e-05, "loss": 0.7099, "step": 18200 }, { "epoch": 14.181111542945978, "grad_norm": 7.738977909088135, "learning_rate": 9.565842105263159e-05, "loss": 0.7437, "step": 18250 }, { "epoch": 14.219976680917217, "grad_norm": 4.552745819091797, "learning_rate": 9.56321052631579e-05, "loss": 0.7383, "step": 18300 }, { "epoch": 14.258841818888458, "grad_norm": 5.0580034255981445, "learning_rate": 9.560578947368421e-05, "loss": 0.7134, "step": 18350 }, { "epoch": 14.297706956859697, "grad_norm": 4.803134441375732, "learning_rate": 9.557947368421054e-05, "loss": 0.7098, "step": 18400 }, { "epoch": 14.336572094830936, "grad_norm": 4.600313186645508, "learning_rate": 9.555315789473685e-05, "loss": 0.708, "step": 18450 }, { "epoch": 14.375437232802177, "grad_norm": 4.624382019042969, "learning_rate": 9.552684210526316e-05, "loss": 0.6778, "step": 18500 }, { "epoch": 14.414302370773417, "grad_norm": 4.699090957641602, "learning_rate": 9.550052631578947e-05, "loss": 0.695, "step": 18550 }, { "epoch": 14.453167508744656, "grad_norm": 4.733650207519531, "learning_rate": 9.54742105263158e-05, "loss": 0.6811, "step": 18600 }, { "epoch": 14.492032646715895, "grad_norm": 6.0096282958984375, "learning_rate": 9.544789473684211e-05, "loss": 0.6831, "step": 18650 }, { "epoch": 14.530897784687136, "grad_norm": 3.8526785373687744, "learning_rate": 9.542157894736843e-05, "loss": 0.7374, "step": 18700 }, { "epoch": 14.569762922658375, "grad_norm": 4.106224060058594, "learning_rate": 9.539526315789474e-05, "loss": 0.6959, "step": 18750 }, { "epoch": 14.608628060629615, "grad_norm": 5.415355205535889, "learning_rate": 9.536894736842106e-05, "loss": 0.6764, "step": 18800 }, { "epoch": 14.647493198600856, "grad_norm": 5.404593467712402, "learning_rate": 9.534263157894737e-05, "loss": 0.7101, "step": 18850 }, { "epoch": 14.686358336572095, "grad_norm": 5.305964469909668, "learning_rate": 9.531631578947369e-05, "loss": 0.6862, "step": 18900 }, { "epoch": 14.725223474543334, "grad_norm": 4.21513557434082, "learning_rate": 9.529e-05, "loss": 0.6621, "step": 18950 }, { "epoch": 14.764088612514575, "grad_norm": 5.1977715492248535, "learning_rate": 9.526368421052632e-05, "loss": 0.6832, "step": 19000 }, { "epoch": 14.764088612514575, "eval_accuracy": 0.8375844143224992, "eval_runtime": 859.0271, "eval_samples_per_second": 23.961, "eval_steps_per_second": 2.995, "step": 19000 }, { "epoch": 14.802953750485814, "grad_norm": 4.032940864562988, "learning_rate": 9.523736842105263e-05, "loss": 0.7262, "step": 19050 }, { "epoch": 14.841818888457054, "grad_norm": 6.659789562225342, "learning_rate": 9.521105263157895e-05, "loss": 0.6872, "step": 19100 }, { "epoch": 14.880684026428295, "grad_norm": 4.311520576477051, "learning_rate": 9.518473684210528e-05, "loss": 0.715, "step": 19150 }, { "epoch": 14.919549164399534, "grad_norm": 4.256176471710205, "learning_rate": 9.515842105263159e-05, "loss": 0.7551, "step": 19200 }, { "epoch": 14.958414302370773, "grad_norm": 5.675657749176025, "learning_rate": 9.51321052631579e-05, "loss": 0.6938, "step": 19250 }, { "epoch": 14.997279440342012, "grad_norm": 5.095183372497559, "learning_rate": 9.510578947368421e-05, "loss": 0.6902, "step": 19300 }, { "epoch": 15.035755926933541, "grad_norm": 3.1958320140838623, "learning_rate": 9.507947368421052e-05, "loss": 0.4941, "step": 19350 }, { "epoch": 15.07462106490478, "grad_norm": 4.101893901824951, "learning_rate": 9.505315789473685e-05, "loss": 0.5105, "step": 19400 }, { "epoch": 15.11348620287602, "grad_norm": 54.76054763793945, "learning_rate": 9.502684210526316e-05, "loss": 0.5441, "step": 19450 }, { "epoch": 15.15235134084726, "grad_norm": 4.233603477478027, "learning_rate": 9.500052631578947e-05, "loss": 0.4974, "step": 19500 }, { "epoch": 15.1912164788185, "grad_norm": 4.762860298156738, "learning_rate": 9.49742105263158e-05, "loss": 0.5109, "step": 19550 }, { "epoch": 15.23008161678974, "grad_norm": 5.9275054931640625, "learning_rate": 9.494789473684211e-05, "loss": 0.4692, "step": 19600 }, { "epoch": 15.268946754760979, "grad_norm": 4.715384006500244, "learning_rate": 9.492157894736843e-05, "loss": 0.507, "step": 19650 }, { "epoch": 15.30781189273222, "grad_norm": 6.866525650024414, "learning_rate": 9.489526315789475e-05, "loss": 0.5331, "step": 19700 }, { "epoch": 15.346677030703459, "grad_norm": 7.685290336608887, "learning_rate": 9.486894736842106e-05, "loss": 0.4853, "step": 19750 }, { "epoch": 15.385542168674698, "grad_norm": 5.017430782318115, "learning_rate": 9.484263157894737e-05, "loss": 0.49, "step": 19800 }, { "epoch": 15.42440730664594, "grad_norm": 4.645963191986084, "learning_rate": 9.481631578947368e-05, "loss": 0.536, "step": 19850 }, { "epoch": 15.463272444617179, "grad_norm": 3.9454667568206787, "learning_rate": 9.479e-05, "loss": 0.4746, "step": 19900 }, { "epoch": 15.502137582588418, "grad_norm": 9.802485466003418, "learning_rate": 9.476368421052632e-05, "loss": 0.488, "step": 19950 }, { "epoch": 15.541002720559659, "grad_norm": 3.2630667686462402, "learning_rate": 9.473736842105264e-05, "loss": 0.5281, "step": 20000 }, { "epoch": 15.541002720559659, "eval_accuracy": 0.8903464023708886, "eval_runtime": 856.9951, "eval_samples_per_second": 24.018, "eval_steps_per_second": 3.002, "step": 20000 }, { "epoch": 15.579867858530898, "grad_norm": 5.689331531524658, "learning_rate": 9.471105263157895e-05, "loss": 0.5053, "step": 20050 }, { "epoch": 15.618732996502137, "grad_norm": 9.707596778869629, "learning_rate": 9.468473684210527e-05, "loss": 0.4869, "step": 20100 }, { "epoch": 15.657598134473377, "grad_norm": 4.015494346618652, "learning_rate": 9.465842105263159e-05, "loss": 0.5089, "step": 20150 }, { "epoch": 15.696463272444618, "grad_norm": 6.2136616706848145, "learning_rate": 9.46321052631579e-05, "loss": 0.5071, "step": 20200 }, { "epoch": 15.735328410415857, "grad_norm": 4.501992702484131, "learning_rate": 9.460578947368421e-05, "loss": 0.5169, "step": 20250 }, { "epoch": 15.774193548387096, "grad_norm": 3.086958408355713, "learning_rate": 9.457947368421053e-05, "loss": 0.5076, "step": 20300 }, { "epoch": 15.813058686358337, "grad_norm": 5.842269420623779, "learning_rate": 9.455315789473684e-05, "loss": 0.4818, "step": 20350 }, { "epoch": 15.851923824329576, "grad_norm": 5.230940341949463, "learning_rate": 9.452684210526316e-05, "loss": 0.5153, "step": 20400 }, { "epoch": 15.890788962300816, "grad_norm": 5.823616027832031, "learning_rate": 9.450052631578947e-05, "loss": 0.4984, "step": 20450 }, { "epoch": 15.929654100272057, "grad_norm": 4.166293144226074, "learning_rate": 9.44742105263158e-05, "loss": 0.5255, "step": 20500 }, { "epoch": 15.968519238243296, "grad_norm": 5.066617488861084, "learning_rate": 9.444789473684211e-05, "loss": 0.4885, "step": 20550 }, { "epoch": 16.006995724834823, "grad_norm": 3.68914532661438, "learning_rate": 9.442157894736842e-05, "loss": 0.4843, "step": 20600 }, { "epoch": 16.045860862806062, "grad_norm": 3.0051112174987793, "learning_rate": 9.439526315789475e-05, "loss": 0.3863, "step": 20650 }, { "epoch": 16.0847260007773, "grad_norm": 4.9845404624938965, "learning_rate": 9.436894736842106e-05, "loss": 0.3352, "step": 20700 }, { "epoch": 16.12359113874854, "grad_norm": 6.085782527923584, "learning_rate": 9.434263157894737e-05, "loss": 0.3561, "step": 20750 }, { "epoch": 16.162456276719784, "grad_norm": 9.10416030883789, "learning_rate": 9.431631578947368e-05, "loss": 0.3884, "step": 20800 }, { "epoch": 16.201321414691023, "grad_norm": 4.070003509521484, "learning_rate": 9.429000000000001e-05, "loss": 0.3566, "step": 20850 }, { "epoch": 16.240186552662262, "grad_norm": 6.551778793334961, "learning_rate": 9.426368421052632e-05, "loss": 0.3936, "step": 20900 }, { "epoch": 16.2790516906335, "grad_norm": 2.916369676589966, "learning_rate": 9.423736842105264e-05, "loss": 0.3535, "step": 20950 }, { "epoch": 16.31791682860474, "grad_norm": 4.9708123207092285, "learning_rate": 9.421105263157896e-05, "loss": 0.3538, "step": 21000 }, { "epoch": 16.31791682860474, "eval_accuracy": 0.9259583151144148, "eval_runtime": 857.5854, "eval_samples_per_second": 24.001, "eval_steps_per_second": 3.0, "step": 21000 }, { "epoch": 16.35678196657598, "grad_norm": 5.022733688354492, "learning_rate": 9.418473684210527e-05, "loss": 0.3687, "step": 21050 }, { "epoch": 16.395647104547223, "grad_norm": 6.180454254150391, "learning_rate": 9.415842105263158e-05, "loss": 0.3892, "step": 21100 }, { "epoch": 16.434512242518462, "grad_norm": 5.216296195983887, "learning_rate": 9.41321052631579e-05, "loss": 0.3633, "step": 21150 }, { "epoch": 16.4733773804897, "grad_norm": 4.853186130523682, "learning_rate": 9.410578947368422e-05, "loss": 0.3367, "step": 21200 }, { "epoch": 16.51224251846094, "grad_norm": 4.97639274597168, "learning_rate": 9.407947368421053e-05, "loss": 0.3945, "step": 21250 }, { "epoch": 16.55110765643218, "grad_norm": 6.093092441558838, "learning_rate": 9.405315789473684e-05, "loss": 0.3445, "step": 21300 }, { "epoch": 16.58997279440342, "grad_norm": 5.588438034057617, "learning_rate": 9.402684210526316e-05, "loss": 0.3521, "step": 21350 }, { "epoch": 16.62883793237466, "grad_norm": 4.828138828277588, "learning_rate": 9.400052631578949e-05, "loss": 0.3503, "step": 21400 }, { "epoch": 16.6677030703459, "grad_norm": 4.060586929321289, "learning_rate": 9.39742105263158e-05, "loss": 0.3969, "step": 21450 }, { "epoch": 16.70656820831714, "grad_norm": 3.49153995513916, "learning_rate": 9.394789473684211e-05, "loss": 0.3768, "step": 21500 }, { "epoch": 16.74543334628838, "grad_norm": 4.94192361831665, "learning_rate": 9.392157894736842e-05, "loss": 0.3908, "step": 21550 }, { "epoch": 16.78429848425962, "grad_norm": 5.816671848297119, "learning_rate": 9.389526315789475e-05, "loss": 0.3491, "step": 21600 }, { "epoch": 16.823163622230858, "grad_norm": 4.488963603973389, "learning_rate": 9.386894736842106e-05, "loss": 0.3465, "step": 21650 }, { "epoch": 16.862028760202097, "grad_norm": 5.019749641418457, "learning_rate": 9.384263157894737e-05, "loss": 0.3675, "step": 21700 }, { "epoch": 16.90089389817334, "grad_norm": 8.628226280212402, "learning_rate": 9.381631578947368e-05, "loss": 0.3765, "step": 21750 }, { "epoch": 16.93975903614458, "grad_norm": 5.863889694213867, "learning_rate": 9.379e-05, "loss": 0.3939, "step": 21800 }, { "epoch": 16.97862417411582, "grad_norm": 3.1837944984436035, "learning_rate": 9.376368421052632e-05, "loss": 0.404, "step": 21850 }, { "epoch": 17.017100660707346, "grad_norm": 2.9766252040863037, "learning_rate": 9.373736842105265e-05, "loss": 0.3439, "step": 21900 }, { "epoch": 17.055965798678585, "grad_norm": 3.5172083377838135, "learning_rate": 9.371105263157896e-05, "loss": 0.258, "step": 21950 }, { "epoch": 17.094830936649824, "grad_norm": 4.440106391906738, "learning_rate": 9.368473684210527e-05, "loss": 0.275, "step": 22000 }, { "epoch": 17.094830936649824, "eval_accuracy": 0.9561774279745421, "eval_runtime": 854.4439, "eval_samples_per_second": 24.089, "eval_steps_per_second": 3.011, "step": 22000 }, { "epoch": 17.133696074621064, "grad_norm": 19.169038772583008, "learning_rate": 9.365842105263158e-05, "loss": 0.2566, "step": 22050 }, { "epoch": 17.172561212592306, "grad_norm": 5.666447639465332, "learning_rate": 9.36321052631579e-05, "loss": 0.2736, "step": 22100 }, { "epoch": 17.211426350563546, "grad_norm": 2.622077465057373, "learning_rate": 9.360578947368422e-05, "loss": 0.2408, "step": 22150 }, { "epoch": 17.250291488534785, "grad_norm": 3.8020524978637695, "learning_rate": 9.357947368421053e-05, "loss": 0.2865, "step": 22200 }, { "epoch": 17.289156626506024, "grad_norm": 3.419302225112915, "learning_rate": 9.355315789473684e-05, "loss": 0.2249, "step": 22250 }, { "epoch": 17.328021764477263, "grad_norm": 4.451815128326416, "learning_rate": 9.352684210526315e-05, "loss": 0.296, "step": 22300 }, { "epoch": 17.366886902448503, "grad_norm": 4.277288913726807, "learning_rate": 9.350052631578948e-05, "loss": 0.2647, "step": 22350 }, { "epoch": 17.405752040419742, "grad_norm": 3.892031669616699, "learning_rate": 9.34742105263158e-05, "loss": 0.2746, "step": 22400 }, { "epoch": 17.444617178390985, "grad_norm": 3.7854325771331787, "learning_rate": 9.344789473684211e-05, "loss": 0.2615, "step": 22450 }, { "epoch": 17.483482316362224, "grad_norm": 5.1615471839904785, "learning_rate": 9.342157894736842e-05, "loss": 0.27, "step": 22500 }, { "epoch": 17.522347454333463, "grad_norm": 7.163486003875732, "learning_rate": 9.339526315789474e-05, "loss": 0.2791, "step": 22550 }, { "epoch": 17.561212592304702, "grad_norm": 3.133782386779785, "learning_rate": 9.336894736842106e-05, "loss": 0.289, "step": 22600 }, { "epoch": 17.60007773027594, "grad_norm": 8.343432426452637, "learning_rate": 9.334263157894737e-05, "loss": 0.2562, "step": 22650 }, { "epoch": 17.63894286824718, "grad_norm": 4.588578701019287, "learning_rate": 9.331631578947368e-05, "loss": 0.2722, "step": 22700 }, { "epoch": 17.677808006218424, "grad_norm": 5.381346225738525, "learning_rate": 9.329e-05, "loss": 0.2841, "step": 22750 }, { "epoch": 17.716673144189663, "grad_norm": 5.521486759185791, "learning_rate": 9.326368421052632e-05, "loss": 0.2821, "step": 22800 }, { "epoch": 17.755538282160902, "grad_norm": 7.152676105499268, "learning_rate": 9.323736842105265e-05, "loss": 0.2991, "step": 22850 }, { "epoch": 17.79440342013214, "grad_norm": 2.277432918548584, "learning_rate": 9.321105263157896e-05, "loss": 0.2974, "step": 22900 }, { "epoch": 17.83326855810338, "grad_norm": 2.4336891174316406, "learning_rate": 9.318473684210527e-05, "loss": 0.2944, "step": 22950 }, { "epoch": 17.87213369607462, "grad_norm": 3.757892608642578, "learning_rate": 9.315842105263158e-05, "loss": 0.2839, "step": 23000 }, { "epoch": 17.87213369607462, "eval_accuracy": 0.9704610600981393, "eval_runtime": 853.279, "eval_samples_per_second": 24.122, "eval_steps_per_second": 3.015, "step": 23000 }, { "epoch": 17.91099883404586, "grad_norm": 2.538939952850342, "learning_rate": 9.313210526315789e-05, "loss": 0.2912, "step": 23050 }, { "epoch": 17.949863972017102, "grad_norm": 2.826970338821411, "learning_rate": 9.310578947368422e-05, "loss": 0.3091, "step": 23100 }, { "epoch": 17.98872910998834, "grad_norm": 6.114530086517334, "learning_rate": 9.307947368421053e-05, "loss": 0.3039, "step": 23150 }, { "epoch": 18.02720559657987, "grad_norm": 3.5452566146850586, "learning_rate": 9.305315789473684e-05, "loss": 0.2166, "step": 23200 }, { "epoch": 18.066070734551108, "grad_norm": 3.9908535480499268, "learning_rate": 9.302684210526317e-05, "loss": 0.1961, "step": 23250 }, { "epoch": 18.104935872522347, "grad_norm": 3.3680648803710938, "learning_rate": 9.300052631578948e-05, "loss": 0.1853, "step": 23300 }, { "epoch": 18.143801010493586, "grad_norm": 3.556689977645874, "learning_rate": 9.29742105263158e-05, "loss": 0.2204, "step": 23350 }, { "epoch": 18.182666148464826, "grad_norm": 3.8766796588897705, "learning_rate": 9.294789473684211e-05, "loss": 0.212, "step": 23400 }, { "epoch": 18.22153128643607, "grad_norm": 3.0015463829040527, "learning_rate": 9.292157894736843e-05, "loss": 0.2259, "step": 23450 }, { "epoch": 18.260396424407308, "grad_norm": 6.374223709106445, "learning_rate": 9.289526315789474e-05, "loss": 0.228, "step": 23500 }, { "epoch": 18.299261562378547, "grad_norm": 4.002296447753906, "learning_rate": 9.286894736842105e-05, "loss": 0.1872, "step": 23550 }, { "epoch": 18.338126700349786, "grad_norm": 3.892784357070923, "learning_rate": 9.284263157894737e-05, "loss": 0.2024, "step": 23600 }, { "epoch": 18.376991838321025, "grad_norm": 2.9542012214660645, "learning_rate": 9.281631578947369e-05, "loss": 0.2348, "step": 23650 }, { "epoch": 18.415856976292265, "grad_norm": 4.670549392700195, "learning_rate": 9.279e-05, "loss": 0.2231, "step": 23700 }, { "epoch": 18.454722114263504, "grad_norm": 4.850724220275879, "learning_rate": 9.276368421052632e-05, "loss": 0.2145, "step": 23750 }, { "epoch": 18.493587252234747, "grad_norm": 4.08035135269165, "learning_rate": 9.273736842105263e-05, "loss": 0.1965, "step": 23800 }, { "epoch": 18.532452390205986, "grad_norm": 2.9276678562164307, "learning_rate": 9.271105263157896e-05, "loss": 0.2289, "step": 23850 }, { "epoch": 18.571317528177225, "grad_norm": 3.3067493438720703, "learning_rate": 9.268473684210527e-05, "loss": 0.2138, "step": 23900 }, { "epoch": 18.610182666148464, "grad_norm": 6.159919261932373, "learning_rate": 9.265842105263158e-05, "loss": 0.2366, "step": 23950 }, { "epoch": 18.649047804119704, "grad_norm": 2.813751459121704, "learning_rate": 9.26321052631579e-05, "loss": 0.2365, "step": 24000 }, { "epoch": 18.649047804119704, "eval_accuracy": 0.9767769518534714, "eval_runtime": 853.479, "eval_samples_per_second": 24.117, "eval_steps_per_second": 3.015, "step": 24000 }, { "epoch": 18.687912942090943, "grad_norm": 4.694916248321533, "learning_rate": 9.26057894736842e-05, "loss": 0.2256, "step": 24050 }, { "epoch": 18.726778080062186, "grad_norm": 6.502670764923096, "learning_rate": 9.257947368421053e-05, "loss": 0.1918, "step": 24100 }, { "epoch": 18.765643218033425, "grad_norm": 4.842265605926514, "learning_rate": 9.255315789473684e-05, "loss": 0.2503, "step": 24150 }, { "epoch": 18.804508356004664, "grad_norm": 3.503544330596924, "learning_rate": 9.252684210526317e-05, "loss": 0.21, "step": 24200 }, { "epoch": 18.843373493975903, "grad_norm": 2.136115312576294, "learning_rate": 9.250052631578948e-05, "loss": 0.2161, "step": 24250 }, { "epoch": 18.882238631947143, "grad_norm": 2.678394317626953, "learning_rate": 9.247421052631579e-05, "loss": 0.1887, "step": 24300 }, { "epoch": 18.921103769918382, "grad_norm": 3.438103675842285, "learning_rate": 9.244789473684212e-05, "loss": 0.254, "step": 24350 }, { "epoch": 18.959968907889625, "grad_norm": 3.543240785598755, "learning_rate": 9.242157894736843e-05, "loss": 0.2246, "step": 24400 }, { "epoch": 18.998834045860864, "grad_norm": 4.896575450897217, "learning_rate": 9.239526315789474e-05, "loss": 0.2251, "step": 24450 }, { "epoch": 19.03731053245239, "grad_norm": 4.922858715057373, "learning_rate": 9.236894736842105e-05, "loss": 0.1551, "step": 24500 }, { "epoch": 19.07617567042363, "grad_norm": 12.493631362915039, "learning_rate": 9.234263157894738e-05, "loss": 0.1661, "step": 24550 }, { "epoch": 19.11504080839487, "grad_norm": 4.771761894226074, "learning_rate": 9.231631578947369e-05, "loss": 0.154, "step": 24600 }, { "epoch": 19.15390594636611, "grad_norm": 3.863640785217285, "learning_rate": 9.229000000000001e-05, "loss": 0.14, "step": 24650 }, { "epoch": 19.19277108433735, "grad_norm": 5.57180118560791, "learning_rate": 9.226368421052632e-05, "loss": 0.1585, "step": 24700 }, { "epoch": 19.231636222308587, "grad_norm": 4.381727695465088, "learning_rate": 9.223736842105264e-05, "loss": 0.1764, "step": 24750 }, { "epoch": 19.27050136027983, "grad_norm": 3.324845552444458, "learning_rate": 9.221105263157895e-05, "loss": 0.1605, "step": 24800 }, { "epoch": 19.30936649825107, "grad_norm": 3.4294657707214355, "learning_rate": 9.218473684210527e-05, "loss": 0.168, "step": 24850 }, { "epoch": 19.34823163622231, "grad_norm": 2.7688677310943604, "learning_rate": 9.215842105263158e-05, "loss": 0.1594, "step": 24900 }, { "epoch": 19.387096774193548, "grad_norm": 4.8695807456970215, "learning_rate": 9.21321052631579e-05, "loss": 0.1659, "step": 24950 }, { "epoch": 19.425961912164787, "grad_norm": 3.9514307975769043, "learning_rate": 9.210578947368421e-05, "loss": 0.1872, "step": 25000 }, { "epoch": 19.425961912164787, "eval_accuracy": 0.9811009085167371, "eval_runtime": 857.0247, "eval_samples_per_second": 24.017, "eval_steps_per_second": 3.002, "step": 25000 }, { "epoch": 19.464827050136027, "grad_norm": 2.4025089740753174, "learning_rate": 9.207947368421053e-05, "loss": 0.1627, "step": 25050 }, { "epoch": 19.50369218810727, "grad_norm": 4.841952323913574, "learning_rate": 9.205315789473684e-05, "loss": 0.1714, "step": 25100 }, { "epoch": 19.54255732607851, "grad_norm": 3.034189224243164, "learning_rate": 9.202684210526317e-05, "loss": 0.1652, "step": 25150 }, { "epoch": 19.581422464049748, "grad_norm": 3.2492823600769043, "learning_rate": 9.200052631578948e-05, "loss": 0.2033, "step": 25200 }, { "epoch": 19.620287602020987, "grad_norm": 7.616010665893555, "learning_rate": 9.197421052631579e-05, "loss": 0.2131, "step": 25250 }, { "epoch": 19.659152739992226, "grad_norm": 3.7748496532440186, "learning_rate": 9.194789473684212e-05, "loss": 0.1808, "step": 25300 }, { "epoch": 19.698017877963466, "grad_norm": 4.933774471282959, "learning_rate": 9.192157894736843e-05, "loss": 0.1586, "step": 25350 }, { "epoch": 19.73688301593471, "grad_norm": 6.590142250061035, "learning_rate": 9.189526315789474e-05, "loss": 0.2023, "step": 25400 }, { "epoch": 19.775748153905948, "grad_norm": 3.3198065757751465, "learning_rate": 9.186894736842105e-05, "loss": 0.1881, "step": 25450 }, { "epoch": 19.814613291877187, "grad_norm": 5.870887279510498, "learning_rate": 9.184263157894736e-05, "loss": 0.1717, "step": 25500 }, { "epoch": 19.853478429848426, "grad_norm": 2.5725080966949463, "learning_rate": 9.181631578947369e-05, "loss": 0.1939, "step": 25550 }, { "epoch": 19.892343567819665, "grad_norm": 3.569075345993042, "learning_rate": 9.179000000000001e-05, "loss": 0.1915, "step": 25600 }, { "epoch": 19.931208705790905, "grad_norm": 3.5029423236846924, "learning_rate": 9.176368421052633e-05, "loss": 0.1679, "step": 25650 }, { "epoch": 19.970073843762144, "grad_norm": 2.4752578735351562, "learning_rate": 9.173736842105264e-05, "loss": 0.168, "step": 25700 }, { "epoch": 20.00855033035367, "grad_norm": 9.303007125854492, "learning_rate": 9.171105263157895e-05, "loss": 0.1754, "step": 25750 }, { "epoch": 20.047415468324914, "grad_norm": 4.825704574584961, "learning_rate": 9.168473684210527e-05, "loss": 0.1179, "step": 25800 }, { "epoch": 20.086280606296153, "grad_norm": 4.486258506774902, "learning_rate": 9.165842105263159e-05, "loss": 0.1358, "step": 25850 }, { "epoch": 20.125145744267392, "grad_norm": 1.9208111763000488, "learning_rate": 9.16321052631579e-05, "loss": 0.1231, "step": 25900 }, { "epoch": 20.16401088223863, "grad_norm": 5.461341857910156, "learning_rate": 9.160578947368421e-05, "loss": 0.1262, "step": 25950 }, { "epoch": 20.20287602020987, "grad_norm": 0.8620373010635376, "learning_rate": 9.157947368421052e-05, "loss": 0.1263, "step": 26000 }, { "epoch": 20.20287602020987, "eval_accuracy": 0.9867366273138026, "eval_runtime": 884.9066, "eval_samples_per_second": 23.26, "eval_steps_per_second": 2.908, "step": 26000 }, { "epoch": 20.24174115818111, "grad_norm": 3.118957281112671, "learning_rate": 9.155315789473685e-05, "loss": 0.1281, "step": 26050 }, { "epoch": 20.280606296152353, "grad_norm": 2.739800214767456, "learning_rate": 9.152684210526317e-05, "loss": 0.1213, "step": 26100 }, { "epoch": 20.319471434123592, "grad_norm": 6.1789445877075195, "learning_rate": 9.150052631578948e-05, "loss": 0.1474, "step": 26150 }, { "epoch": 20.35833657209483, "grad_norm": 2.280268907546997, "learning_rate": 9.14742105263158e-05, "loss": 0.1275, "step": 26200 }, { "epoch": 20.39720171006607, "grad_norm": 4.79387903213501, "learning_rate": 9.14478947368421e-05, "loss": 0.1337, "step": 26250 }, { "epoch": 20.43606684803731, "grad_norm": 2.4207911491394043, "learning_rate": 9.142157894736843e-05, "loss": 0.1317, "step": 26300 }, { "epoch": 20.47493198600855, "grad_norm": 0.7470328211784363, "learning_rate": 9.139526315789474e-05, "loss": 0.1197, "step": 26350 }, { "epoch": 20.51379712397979, "grad_norm": 3.8738152980804443, "learning_rate": 9.136894736842105e-05, "loss": 0.1336, "step": 26400 }, { "epoch": 20.55266226195103, "grad_norm": 3.2332074642181396, "learning_rate": 9.134263157894737e-05, "loss": 0.1508, "step": 26450 }, { "epoch": 20.59152739992227, "grad_norm": 4.310563087463379, "learning_rate": 9.131631578947368e-05, "loss": 0.1238, "step": 26500 }, { "epoch": 20.63039253789351, "grad_norm": 4.384401321411133, "learning_rate": 9.129000000000002e-05, "loss": 0.1583, "step": 26550 }, { "epoch": 20.66925767586475, "grad_norm": 4.6766486167907715, "learning_rate": 9.126368421052633e-05, "loss": 0.1224, "step": 26600 }, { "epoch": 20.70812281383599, "grad_norm": 2.9925906658172607, "learning_rate": 9.123736842105264e-05, "loss": 0.1504, "step": 26650 }, { "epoch": 20.746987951807228, "grad_norm": 6.686166763305664, "learning_rate": 9.121105263157895e-05, "loss": 0.1324, "step": 26700 }, { "epoch": 20.78585308977847, "grad_norm": 4.77677059173584, "learning_rate": 9.118473684210526e-05, "loss": 0.1291, "step": 26750 }, { "epoch": 20.82471822774971, "grad_norm": 4.284395217895508, "learning_rate": 9.115842105263159e-05, "loss": 0.1421, "step": 26800 }, { "epoch": 20.86358336572095, "grad_norm": 3.2048757076263428, "learning_rate": 9.11321052631579e-05, "loss": 0.141, "step": 26850 }, { "epoch": 20.902448503692188, "grad_norm": 3.526207447052002, "learning_rate": 9.110578947368421e-05, "loss": 0.1471, "step": 26900 }, { "epoch": 20.941313641663427, "grad_norm": 2.7321910858154297, "learning_rate": 9.107947368421052e-05, "loss": 0.1307, "step": 26950 }, { "epoch": 20.980178779634667, "grad_norm": 3.099595546722412, "learning_rate": 9.105315789473685e-05, "loss": 0.1459, "step": 27000 }, { "epoch": 20.980178779634667, "eval_accuracy": 0.989943156974202, "eval_runtime": 880.3227, "eval_samples_per_second": 23.381, "eval_steps_per_second": 2.923, "step": 27000 }, { "epoch": 21.018655266226194, "grad_norm": 5.128775119781494, "learning_rate": 9.102684210526317e-05, "loss": 0.1232, "step": 27050 }, { "epoch": 21.057520404197437, "grad_norm": 3.6633059978485107, "learning_rate": 9.100052631578948e-05, "loss": 0.1005, "step": 27100 }, { "epoch": 21.096385542168676, "grad_norm": 2.404827117919922, "learning_rate": 9.09742105263158e-05, "loss": 0.1007, "step": 27150 }, { "epoch": 21.135250680139915, "grad_norm": 1.061808466911316, "learning_rate": 9.09478947368421e-05, "loss": 0.1129, "step": 27200 }, { "epoch": 21.174115818111154, "grad_norm": 4.0880513191223145, "learning_rate": 9.092157894736842e-05, "loss": 0.1165, "step": 27250 }, { "epoch": 21.212980956082394, "grad_norm": 9.207396507263184, "learning_rate": 9.089526315789474e-05, "loss": 0.1177, "step": 27300 }, { "epoch": 21.251846094053633, "grad_norm": 1.657961130142212, "learning_rate": 9.086894736842105e-05, "loss": 0.1053, "step": 27350 }, { "epoch": 21.290711232024872, "grad_norm": 3.722139596939087, "learning_rate": 9.084263157894737e-05, "loss": 0.1, "step": 27400 }, { "epoch": 21.329576369996115, "grad_norm": 3.5209944248199463, "learning_rate": 9.081631578947369e-05, "loss": 0.1073, "step": 27450 }, { "epoch": 21.368441507967354, "grad_norm": 2.708458662033081, "learning_rate": 9.079e-05, "loss": 0.1089, "step": 27500 }, { "epoch": 21.407306645938593, "grad_norm": 5.232219696044922, "learning_rate": 9.076368421052633e-05, "loss": 0.1182, "step": 27550 }, { "epoch": 21.446171783909833, "grad_norm": 4.402341365814209, "learning_rate": 9.073736842105264e-05, "loss": 0.1194, "step": 27600 }, { "epoch": 21.485036921881072, "grad_norm": 1.9735609292984009, "learning_rate": 9.071105263157895e-05, "loss": 0.1254, "step": 27650 }, { "epoch": 21.52390205985231, "grad_norm": 2.739964485168457, "learning_rate": 9.068473684210526e-05, "loss": 0.1152, "step": 27700 }, { "epoch": 21.562767197823554, "grad_norm": 4.691028594970703, "learning_rate": 9.065842105263157e-05, "loss": 0.1004, "step": 27750 }, { "epoch": 21.601632335794793, "grad_norm": 2.4163448810577393, "learning_rate": 9.06321052631579e-05, "loss": 0.1202, "step": 27800 }, { "epoch": 21.640497473766032, "grad_norm": 3.471006155014038, "learning_rate": 9.060578947368421e-05, "loss": 0.123, "step": 27850 }, { "epoch": 21.67936261173727, "grad_norm": 3.148470640182495, "learning_rate": 9.057947368421052e-05, "loss": 0.1041, "step": 27900 }, { "epoch": 21.71822774970851, "grad_norm": 1.7018976211547852, "learning_rate": 9.055315789473685e-05, "loss": 0.1223, "step": 27950 }, { "epoch": 21.75709288767975, "grad_norm": 1.1783331632614136, "learning_rate": 9.052684210526316e-05, "loss": 0.125, "step": 28000 }, { "epoch": 21.75709288767975, "eval_accuracy": 0.9914978380216684, "eval_runtime": 873.9179, "eval_samples_per_second": 23.553, "eval_steps_per_second": 2.944, "step": 28000 }, { "epoch": 21.79595802565099, "grad_norm": 2.917894124984741, "learning_rate": 9.050052631578948e-05, "loss": 0.1228, "step": 28050 }, { "epoch": 21.834823163622232, "grad_norm": 2.4652888774871826, "learning_rate": 9.04742105263158e-05, "loss": 0.1249, "step": 28100 }, { "epoch": 21.87368830159347, "grad_norm": 0.5699157118797302, "learning_rate": 9.044789473684211e-05, "loss": 0.1147, "step": 28150 }, { "epoch": 21.91255343956471, "grad_norm": 0.5322246551513672, "learning_rate": 9.042157894736842e-05, "loss": 0.1075, "step": 28200 }, { "epoch": 21.95141857753595, "grad_norm": 6.01038122177124, "learning_rate": 9.039526315789474e-05, "loss": 0.1283, "step": 28250 }, { "epoch": 21.99028371550719, "grad_norm": 2.6229259967803955, "learning_rate": 9.036894736842106e-05, "loss": 0.1176, "step": 28300 }, { "epoch": 22.028760202098717, "grad_norm": 0.6570966839790344, "learning_rate": 9.034263157894737e-05, "loss": 0.0926, "step": 28350 }, { "epoch": 22.067625340069956, "grad_norm": 3.900761127471924, "learning_rate": 9.031631578947369e-05, "loss": 0.0874, "step": 28400 }, { "epoch": 22.1064904780412, "grad_norm": 1.106946349143982, "learning_rate": 9.029e-05, "loss": 0.0904, "step": 28450 }, { "epoch": 22.145355616012438, "grad_norm": 2.706346035003662, "learning_rate": 9.026368421052632e-05, "loss": 0.0832, "step": 28500 }, { "epoch": 22.184220753983677, "grad_norm": 2.462101459503174, "learning_rate": 9.023736842105264e-05, "loss": 0.0866, "step": 28550 }, { "epoch": 22.223085891954916, "grad_norm": 4.171143054962158, "learning_rate": 9.021105263157895e-05, "loss": 0.0879, "step": 28600 }, { "epoch": 22.261951029926156, "grad_norm": 2.033679962158203, "learning_rate": 9.018473684210526e-05, "loss": 0.0885, "step": 28650 }, { "epoch": 22.300816167897395, "grad_norm": 4.7418365478515625, "learning_rate": 9.015842105263158e-05, "loss": 0.1195, "step": 28700 }, { "epoch": 22.339681305868634, "grad_norm": 2.649731397628784, "learning_rate": 9.01321052631579e-05, "loss": 0.0883, "step": 28750 }, { "epoch": 22.378546443839877, "grad_norm": 0.8859423398971558, "learning_rate": 9.010578947368421e-05, "loss": 0.0934, "step": 28800 }, { "epoch": 22.417411581811116, "grad_norm": 4.102853775024414, "learning_rate": 9.007947368421054e-05, "loss": 0.0906, "step": 28850 }, { "epoch": 22.456276719782355, "grad_norm": 4.6845598220825195, "learning_rate": 9.005315789473685e-05, "loss": 0.1131, "step": 28900 }, { "epoch": 22.495141857753595, "grad_norm": 3.274709463119507, "learning_rate": 9.002684210526316e-05, "loss": 0.0977, "step": 28950 }, { "epoch": 22.534006995724834, "grad_norm": 2.538916826248169, "learning_rate": 9.000052631578949e-05, "loss": 0.1073, "step": 29000 }, { "epoch": 22.534006995724834, "eval_accuracy": 0.9921294271972015, "eval_runtime": 876.8455, "eval_samples_per_second": 23.474, "eval_steps_per_second": 2.934, "step": 29000 }, { "epoch": 22.572872133696073, "grad_norm": 2.3162245750427246, "learning_rate": 8.99742105263158e-05, "loss": 0.0725, "step": 29050 }, { "epoch": 22.611737271667316, "grad_norm": 3.547755241394043, "learning_rate": 8.994789473684211e-05, "loss": 0.0853, "step": 29100 }, { "epoch": 22.650602409638555, "grad_norm": 0.8838712573051453, "learning_rate": 8.992157894736842e-05, "loss": 0.1115, "step": 29150 }, { "epoch": 22.689467547609794, "grad_norm": 3.248056650161743, "learning_rate": 8.989526315789473e-05, "loss": 0.0898, "step": 29200 }, { "epoch": 22.728332685581034, "grad_norm": 3.2503175735473633, "learning_rate": 8.986894736842106e-05, "loss": 0.094, "step": 29250 }, { "epoch": 22.767197823552273, "grad_norm": 1.3047415018081665, "learning_rate": 8.984263157894738e-05, "loss": 0.0974, "step": 29300 }, { "epoch": 22.806062961523512, "grad_norm": 3.866671085357666, "learning_rate": 8.98163157894737e-05, "loss": 0.0923, "step": 29350 }, { "epoch": 22.84492809949475, "grad_norm": 4.593476295471191, "learning_rate": 8.979e-05, "loss": 0.1018, "step": 29400 }, { "epoch": 22.883793237465994, "grad_norm": 5.606900691986084, "learning_rate": 8.976368421052632e-05, "loss": 0.1081, "step": 29450 }, { "epoch": 22.922658375437234, "grad_norm": 1.0974065065383911, "learning_rate": 8.973736842105264e-05, "loss": 0.0936, "step": 29500 }, { "epoch": 22.961523513408473, "grad_norm": 3.682856321334839, "learning_rate": 8.971105263157895e-05, "loss": 0.1205, "step": 29550 }, { "epoch": 23.0, "grad_norm": 3.5092780590057373, "learning_rate": 8.968473684210527e-05, "loss": 0.1111, "step": 29600 }, { "epoch": 23.03886513797124, "grad_norm": 0.9673867225646973, "learning_rate": 8.965842105263158e-05, "loss": 0.067, "step": 29650 }, { "epoch": 23.07773027594248, "grad_norm": 0.83358234167099, "learning_rate": 8.963210526315789e-05, "loss": 0.067, "step": 29700 }, { "epoch": 23.116595413913718, "grad_norm": 4.988703727722168, "learning_rate": 8.960578947368421e-05, "loss": 0.0618, "step": 29750 }, { "epoch": 23.15546055188496, "grad_norm": 1.1745929718017578, "learning_rate": 8.957947368421054e-05, "loss": 0.0752, "step": 29800 }, { "epoch": 23.1943256898562, "grad_norm": 3.4321532249450684, "learning_rate": 8.955315789473685e-05, "loss": 0.0709, "step": 29850 }, { "epoch": 23.23319082782744, "grad_norm": 1.3007441759109497, "learning_rate": 8.952684210526316e-05, "loss": 0.0713, "step": 29900 }, { "epoch": 23.27205596579868, "grad_norm": 0.5880366563796997, "learning_rate": 8.950052631578947e-05, "loss": 0.0781, "step": 29950 }, { "epoch": 23.310921103769918, "grad_norm": 0.8208589553833008, "learning_rate": 8.94742105263158e-05, "loss": 0.0734, "step": 30000 }, { "epoch": 23.310921103769918, "eval_accuracy": 0.9923723461108682, "eval_runtime": 872.4649, "eval_samples_per_second": 23.592, "eval_steps_per_second": 2.949, "step": 30000 }, { "epoch": 23.349786241741157, "grad_norm": 2.487136125564575, "learning_rate": 8.944789473684211e-05, "loss": 0.0715, "step": 30050 }, { "epoch": 23.3886513797124, "grad_norm": 3.428359031677246, "learning_rate": 8.942157894736842e-05, "loss": 0.0782, "step": 30100 }, { "epoch": 23.42751651768364, "grad_norm": 3.85115122795105, "learning_rate": 8.939526315789473e-05, "loss": 0.0823, "step": 30150 }, { "epoch": 23.466381655654878, "grad_norm": 4.094728469848633, "learning_rate": 8.936894736842105e-05, "loss": 0.1054, "step": 30200 }, { "epoch": 23.505246793626117, "grad_norm": 1.2569011449813843, "learning_rate": 8.934263157894738e-05, "loss": 0.0792, "step": 30250 }, { "epoch": 23.544111931597357, "grad_norm": 1.1779286861419678, "learning_rate": 8.93163157894737e-05, "loss": 0.0655, "step": 30300 }, { "epoch": 23.582977069568596, "grad_norm": 1.402733564376831, "learning_rate": 8.929000000000001e-05, "loss": 0.0877, "step": 30350 }, { "epoch": 23.621842207539835, "grad_norm": 5.030694007873535, "learning_rate": 8.926368421052632e-05, "loss": 0.0817, "step": 30400 }, { "epoch": 23.660707345511078, "grad_norm": 4.230234146118164, "learning_rate": 8.923736842105263e-05, "loss": 0.0656, "step": 30450 }, { "epoch": 23.699572483482317, "grad_norm": 3.24495792388916, "learning_rate": 8.921105263157896e-05, "loss": 0.0966, "step": 30500 }, { "epoch": 23.738437621453556, "grad_norm": 0.8058502078056335, "learning_rate": 8.918473684210527e-05, "loss": 0.0663, "step": 30550 }, { "epoch": 23.777302759424796, "grad_norm": 4.629317760467529, "learning_rate": 8.915842105263158e-05, "loss": 0.0804, "step": 30600 }, { "epoch": 23.816167897396035, "grad_norm": 0.6016934514045715, "learning_rate": 8.913210526315789e-05, "loss": 0.088, "step": 30650 }, { "epoch": 23.855033035367274, "grad_norm": 1.444543719291687, "learning_rate": 8.910578947368422e-05, "loss": 0.0704, "step": 30700 }, { "epoch": 23.893898173338517, "grad_norm": 4.45253849029541, "learning_rate": 8.907947368421054e-05, "loss": 0.0777, "step": 30750 }, { "epoch": 23.932763311309756, "grad_norm": 2.3076658248901367, "learning_rate": 8.905315789473685e-05, "loss": 0.0978, "step": 30800 }, { "epoch": 23.971628449280995, "grad_norm": 1.820786952972412, "learning_rate": 8.902684210526316e-05, "loss": 0.1152, "step": 30850 }, { "epoch": 24.010104935872523, "grad_norm": 1.6674726009368896, "learning_rate": 8.900052631578948e-05, "loss": 0.0788, "step": 30900 }, { "epoch": 24.048970073843762, "grad_norm": 0.5249277949333191, "learning_rate": 8.897421052631579e-05, "loss": 0.0662, "step": 30950 }, { "epoch": 24.087835211815, "grad_norm": 2.1410939693450928, "learning_rate": 8.894789473684211e-05, "loss": 0.0467, "step": 31000 }, { "epoch": 24.087835211815, "eval_accuracy": 0.9940241947238012, "eval_runtime": 875.8174, "eval_samples_per_second": 23.501, "eval_steps_per_second": 2.938, "step": 31000 }, { "epoch": 24.12670034978624, "grad_norm": 0.8823385834693909, "learning_rate": 8.892157894736842e-05, "loss": 0.063, "step": 31050 }, { "epoch": 24.165565487757483, "grad_norm": 1.1366829872131348, "learning_rate": 8.889526315789474e-05, "loss": 0.0589, "step": 31100 }, { "epoch": 24.204430625728723, "grad_norm": 0.5657198429107666, "learning_rate": 8.886894736842105e-05, "loss": 0.0675, "step": 31150 }, { "epoch": 24.24329576369996, "grad_norm": 5.198428630828857, "learning_rate": 8.884263157894737e-05, "loss": 0.071, "step": 31200 }, { "epoch": 24.2821609016712, "grad_norm": 2.458716869354248, "learning_rate": 8.88163157894737e-05, "loss": 0.0699, "step": 31250 }, { "epoch": 24.32102603964244, "grad_norm": 1.9143106937408447, "learning_rate": 8.879000000000001e-05, "loss": 0.0656, "step": 31300 }, { "epoch": 24.35989117761368, "grad_norm": 2.3596277236938477, "learning_rate": 8.876368421052632e-05, "loss": 0.0841, "step": 31350 }, { "epoch": 24.39875631558492, "grad_norm": 2.6902995109558105, "learning_rate": 8.873736842105263e-05, "loss": 0.0592, "step": 31400 }, { "epoch": 24.43762145355616, "grad_norm": 0.7977442145347595, "learning_rate": 8.871105263157894e-05, "loss": 0.066, "step": 31450 }, { "epoch": 24.4764865915274, "grad_norm": 1.8103632926940918, "learning_rate": 8.868473684210527e-05, "loss": 0.0816, "step": 31500 }, { "epoch": 24.51535172949864, "grad_norm": 3.2258715629577637, "learning_rate": 8.865842105263158e-05, "loss": 0.0613, "step": 31550 }, { "epoch": 24.55421686746988, "grad_norm": 1.8390077352523804, "learning_rate": 8.863210526315789e-05, "loss": 0.0788, "step": 31600 }, { "epoch": 24.59308200544112, "grad_norm": 1.1279451847076416, "learning_rate": 8.860578947368422e-05, "loss": 0.0843, "step": 31650 }, { "epoch": 24.631947143412358, "grad_norm": 3.265662670135498, "learning_rate": 8.857947368421053e-05, "loss": 0.0589, "step": 31700 }, { "epoch": 24.6708122813836, "grad_norm": 3.05871844291687, "learning_rate": 8.855315789473685e-05, "loss": 0.0767, "step": 31750 }, { "epoch": 24.70967741935484, "grad_norm": 3.3928933143615723, "learning_rate": 8.852684210526317e-05, "loss": 0.064, "step": 31800 }, { "epoch": 24.74854255732608, "grad_norm": 2.3093581199645996, "learning_rate": 8.850052631578948e-05, "loss": 0.0756, "step": 31850 }, { "epoch": 24.78740769529732, "grad_norm": 1.7362569570541382, "learning_rate": 8.847421052631579e-05, "loss": 0.0682, "step": 31900 }, { "epoch": 24.826272833268558, "grad_norm": 2.4637110233306885, "learning_rate": 8.844789473684211e-05, "loss": 0.0677, "step": 31950 }, { "epoch": 24.865137971239797, "grad_norm": 2.16152024269104, "learning_rate": 8.842157894736843e-05, "loss": 0.0687, "step": 32000 }, { "epoch": 24.865137971239797, "eval_accuracy": 0.9946557838993344, "eval_runtime": 876.7622, "eval_samples_per_second": 23.476, "eval_steps_per_second": 2.935, "step": 32000 }, { "epoch": 24.904003109211036, "grad_norm": 1.6575545072555542, "learning_rate": 8.839526315789474e-05, "loss": 0.0714, "step": 32050 }, { "epoch": 24.94286824718228, "grad_norm": 3.0430097579956055, "learning_rate": 8.836894736842106e-05, "loss": 0.0942, "step": 32100 }, { "epoch": 24.981733385153518, "grad_norm": 1.7112277746200562, "learning_rate": 8.834263157894737e-05, "loss": 0.0727, "step": 32150 }, { "epoch": 25.020209871745045, "grad_norm": 1.13712477684021, "learning_rate": 8.831631578947369e-05, "loss": 0.0634, "step": 32200 }, { "epoch": 25.059075009716285, "grad_norm": 0.15798869729042053, "learning_rate": 8.829000000000001e-05, "loss": 0.06, "step": 32250 }, { "epoch": 25.097940147687524, "grad_norm": 3.0741162300109863, "learning_rate": 8.826368421052632e-05, "loss": 0.0749, "step": 32300 }, { "epoch": 25.136805285658763, "grad_norm": 0.9993603825569153, "learning_rate": 8.823736842105263e-05, "loss": 0.0492, "step": 32350 }, { "epoch": 25.175670423630002, "grad_norm": 0.929404616355896, "learning_rate": 8.821105263157894e-05, "loss": 0.0507, "step": 32400 }, { "epoch": 25.214535561601245, "grad_norm": 1.5012816190719604, "learning_rate": 8.818473684210527e-05, "loss": 0.0566, "step": 32450 }, { "epoch": 25.253400699572484, "grad_norm": 0.46388107538223267, "learning_rate": 8.815842105263158e-05, "loss": 0.0594, "step": 32500 }, { "epoch": 25.292265837543724, "grad_norm": 2.559751272201538, "learning_rate": 8.81321052631579e-05, "loss": 0.0619, "step": 32550 }, { "epoch": 25.331130975514963, "grad_norm": 0.8673915863037109, "learning_rate": 8.810578947368422e-05, "loss": 0.053, "step": 32600 }, { "epoch": 25.369996113486202, "grad_norm": 4.02969217300415, "learning_rate": 8.807947368421053e-05, "loss": 0.0521, "step": 32650 }, { "epoch": 25.40886125145744, "grad_norm": 0.8806849718093872, "learning_rate": 8.805315789473686e-05, "loss": 0.0557, "step": 32700 }, { "epoch": 25.44772638942868, "grad_norm": 1.1096513271331787, "learning_rate": 8.802684210526317e-05, "loss": 0.0641, "step": 32750 }, { "epoch": 25.486591527399924, "grad_norm": 0.36731696128845215, "learning_rate": 8.800052631578948e-05, "loss": 0.0534, "step": 32800 }, { "epoch": 25.525456665371163, "grad_norm": 0.8237600922584534, "learning_rate": 8.797421052631579e-05, "loss": 0.0593, "step": 32850 }, { "epoch": 25.564321803342402, "grad_norm": 2.0420188903808594, "learning_rate": 8.79478947368421e-05, "loss": 0.0576, "step": 32900 }, { "epoch": 25.60318694131364, "grad_norm": 3.4013640880584717, "learning_rate": 8.792157894736843e-05, "loss": 0.0657, "step": 32950 }, { "epoch": 25.64205207928488, "grad_norm": 2.2952609062194824, "learning_rate": 8.789526315789474e-05, "loss": 0.0565, "step": 33000 }, { "epoch": 25.64205207928488, "eval_accuracy": 0.9952873730748676, "eval_runtime": 878.0538, "eval_samples_per_second": 23.442, "eval_steps_per_second": 2.93, "step": 33000 }, { "epoch": 25.68091721725612, "grad_norm": 4.091900825500488, "learning_rate": 8.786894736842106e-05, "loss": 0.0707, "step": 33050 }, { "epoch": 25.719782355227363, "grad_norm": 2.2345447540283203, "learning_rate": 8.784263157894737e-05, "loss": 0.0661, "step": 33100 }, { "epoch": 25.758647493198602, "grad_norm": 5.394659996032715, "learning_rate": 8.781631578947369e-05, "loss": 0.0703, "step": 33150 }, { "epoch": 25.79751263116984, "grad_norm": 2.6766786575317383, "learning_rate": 8.779000000000001e-05, "loss": 0.0531, "step": 33200 }, { "epoch": 25.83637776914108, "grad_norm": 0.757504403591156, "learning_rate": 8.776368421052632e-05, "loss": 0.0657, "step": 33250 }, { "epoch": 25.87524290711232, "grad_norm": 9.220613479614258, "learning_rate": 8.773736842105263e-05, "loss": 0.0639, "step": 33300 }, { "epoch": 25.91410804508356, "grad_norm": 0.38048455119132996, "learning_rate": 8.771105263157895e-05, "loss": 0.0583, "step": 33350 }, { "epoch": 25.9529731830548, "grad_norm": 0.569552481174469, "learning_rate": 8.768473684210526e-05, "loss": 0.0629, "step": 33400 }, { "epoch": 25.99183832102604, "grad_norm": 0.9764150977134705, "learning_rate": 8.765842105263158e-05, "loss": 0.0684, "step": 33450 }, { "epoch": 26.030314807617568, "grad_norm": 0.6928421854972839, "learning_rate": 8.763210526315791e-05, "loss": 0.0578, "step": 33500 }, { "epoch": 26.069179945588807, "grad_norm": 2.852064371109009, "learning_rate": 8.760578947368422e-05, "loss": 0.046, "step": 33550 }, { "epoch": 26.108045083560047, "grad_norm": 2.2197632789611816, "learning_rate": 8.757947368421053e-05, "loss": 0.0379, "step": 33600 }, { "epoch": 26.146910221531286, "grad_norm": 0.2494724988937378, "learning_rate": 8.755315789473684e-05, "loss": 0.0444, "step": 33650 }, { "epoch": 26.185775359502525, "grad_norm": 0.8166674971580505, "learning_rate": 8.752684210526317e-05, "loss": 0.0531, "step": 33700 }, { "epoch": 26.224640497473764, "grad_norm": 2.2353274822235107, "learning_rate": 8.750052631578948e-05, "loss": 0.0627, "step": 33750 }, { "epoch": 26.263505635445007, "grad_norm": 1.0710805654525757, "learning_rate": 8.747421052631579e-05, "loss": 0.047, "step": 33800 }, { "epoch": 26.302370773416246, "grad_norm": 1.9708255529403687, "learning_rate": 8.74478947368421e-05, "loss": 0.051, "step": 33850 }, { "epoch": 26.341235911387486, "grad_norm": 1.2529079914093018, "learning_rate": 8.742157894736841e-05, "loss": 0.0492, "step": 33900 }, { "epoch": 26.380101049358725, "grad_norm": 0.9265830516815186, "learning_rate": 8.739526315789474e-05, "loss": 0.0555, "step": 33950 }, { "epoch": 26.418966187329964, "grad_norm": 3.131944179534912, "learning_rate": 8.736894736842106e-05, "loss": 0.0473, "step": 34000 }, { "epoch": 26.418966187329964, "eval_accuracy": 0.995044454161201, "eval_runtime": 876.168, "eval_samples_per_second": 23.492, "eval_steps_per_second": 2.937, "step": 34000 }, { "epoch": 26.457831325301203, "grad_norm": 0.6346553564071655, "learning_rate": 8.734263157894738e-05, "loss": 0.0498, "step": 34050 }, { "epoch": 26.496696463272446, "grad_norm": 0.4876219928264618, "learning_rate": 8.731631578947369e-05, "loss": 0.0483, "step": 34100 }, { "epoch": 26.535561601243685, "grad_norm": 1.6699252128601074, "learning_rate": 8.729e-05, "loss": 0.0525, "step": 34150 }, { "epoch": 26.574426739214925, "grad_norm": 2.112086772918701, "learning_rate": 8.726368421052632e-05, "loss": 0.053, "step": 34200 }, { "epoch": 26.613291877186164, "grad_norm": 1.6960245370864868, "learning_rate": 8.723736842105264e-05, "loss": 0.048, "step": 34250 }, { "epoch": 26.652157015157403, "grad_norm": 3.9964354038238525, "learning_rate": 8.721105263157895e-05, "loss": 0.0606, "step": 34300 }, { "epoch": 26.691022153128642, "grad_norm": 4.299483299255371, "learning_rate": 8.718473684210526e-05, "loss": 0.0387, "step": 34350 }, { "epoch": 26.72988729109988, "grad_norm": 0.5646001100540161, "learning_rate": 8.715842105263158e-05, "loss": 0.0459, "step": 34400 }, { "epoch": 26.768752429071125, "grad_norm": 3.601149320602417, "learning_rate": 8.713210526315791e-05, "loss": 0.0762, "step": 34450 }, { "epoch": 26.807617567042364, "grad_norm": 5.873041152954102, "learning_rate": 8.710578947368422e-05, "loss": 0.0664, "step": 34500 }, { "epoch": 26.846482705013603, "grad_norm": 0.9744375348091125, "learning_rate": 8.707947368421053e-05, "loss": 0.0552, "step": 34550 }, { "epoch": 26.885347842984842, "grad_norm": 1.381766676902771, "learning_rate": 8.705315789473684e-05, "loss": 0.0645, "step": 34600 }, { "epoch": 26.92421298095608, "grad_norm": 3.265528678894043, "learning_rate": 8.702684210526316e-05, "loss": 0.0649, "step": 34650 }, { "epoch": 26.96307811892732, "grad_norm": 2.2487645149230957, "learning_rate": 8.700052631578948e-05, "loss": 0.0714, "step": 34700 }, { "epoch": 27.001554605518848, "grad_norm": 0.22899124026298523, "learning_rate": 8.697421052631579e-05, "loss": 0.0585, "step": 34750 }, { "epoch": 27.04041974349009, "grad_norm": 2.3547112941741943, "learning_rate": 8.69478947368421e-05, "loss": 0.0357, "step": 34800 }, { "epoch": 27.07928488146133, "grad_norm": 0.5145918130874634, "learning_rate": 8.692157894736842e-05, "loss": 0.0493, "step": 34850 }, { "epoch": 27.11815001943257, "grad_norm": 3.8266923427581787, "learning_rate": 8.689526315789474e-05, "loss": 0.0313, "step": 34900 }, { "epoch": 27.15701515740381, "grad_norm": 0.9876055717468262, "learning_rate": 8.686894736842107e-05, "loss": 0.0416, "step": 34950 }, { "epoch": 27.195880295375048, "grad_norm": 0.4796826243400574, "learning_rate": 8.684263157894738e-05, "loss": 0.0386, "step": 35000 }, { "epoch": 27.195880295375048, "eval_accuracy": 0.9959189622504008, "eval_runtime": 876.7261, "eval_samples_per_second": 23.477, "eval_steps_per_second": 2.935, "step": 35000 }, { "epoch": 27.234745433346287, "grad_norm": 0.3122335970401764, "learning_rate": 8.681631578947369e-05, "loss": 0.05, "step": 35050 }, { "epoch": 27.27361057131753, "grad_norm": 1.78782320022583, "learning_rate": 8.679e-05, "loss": 0.039, "step": 35100 }, { "epoch": 27.31247570928877, "grad_norm": 2.0089144706726074, "learning_rate": 8.676368421052633e-05, "loss": 0.0407, "step": 35150 }, { "epoch": 27.35134084726001, "grad_norm": 0.6602781414985657, "learning_rate": 8.673736842105264e-05, "loss": 0.0439, "step": 35200 }, { "epoch": 27.390205985231248, "grad_norm": 1.503353476524353, "learning_rate": 8.671105263157895e-05, "loss": 0.0458, "step": 35250 }, { "epoch": 27.429071123202487, "grad_norm": 3.133200168609619, "learning_rate": 8.668473684210526e-05, "loss": 0.0455, "step": 35300 }, { "epoch": 27.467936261173726, "grad_norm": 3.797276496887207, "learning_rate": 8.665842105263157e-05, "loss": 0.0444, "step": 35350 }, { "epoch": 27.506801399144965, "grad_norm": 1.8748810291290283, "learning_rate": 8.66321052631579e-05, "loss": 0.0445, "step": 35400 }, { "epoch": 27.545666537116208, "grad_norm": 1.198071837425232, "learning_rate": 8.660578947368422e-05, "loss": 0.0544, "step": 35450 }, { "epoch": 27.584531675087447, "grad_norm": 0.8296390175819397, "learning_rate": 8.657947368421053e-05, "loss": 0.0334, "step": 35500 }, { "epoch": 27.623396813058687, "grad_norm": 0.7118576169013977, "learning_rate": 8.655315789473685e-05, "loss": 0.0408, "step": 35550 }, { "epoch": 27.662261951029926, "grad_norm": 2.786017656326294, "learning_rate": 8.652684210526316e-05, "loss": 0.0474, "step": 35600 }, { "epoch": 27.701127089001165, "grad_norm": 0.6719661951065063, "learning_rate": 8.650052631578948e-05, "loss": 0.0586, "step": 35650 }, { "epoch": 27.739992226972404, "grad_norm": 1.9458692073822021, "learning_rate": 8.64742105263158e-05, "loss": 0.0403, "step": 35700 }, { "epoch": 27.778857364943647, "grad_norm": 2.8628056049346924, "learning_rate": 8.64478947368421e-05, "loss": 0.0491, "step": 35750 }, { "epoch": 27.817722502914886, "grad_norm": 3.354652166366577, "learning_rate": 8.642157894736842e-05, "loss": 0.0623, "step": 35800 }, { "epoch": 27.856587640886126, "grad_norm": 3.452547788619995, "learning_rate": 8.639526315789474e-05, "loss": 0.0564, "step": 35850 }, { "epoch": 27.895452778857365, "grad_norm": 2.5311927795410156, "learning_rate": 8.636894736842105e-05, "loss": 0.0505, "step": 35900 }, { "epoch": 27.934317916828604, "grad_norm": 2.357215642929077, "learning_rate": 8.634263157894738e-05, "loss": 0.0387, "step": 35950 }, { "epoch": 27.973183054799843, "grad_norm": 2.599184036254883, "learning_rate": 8.631631578947369e-05, "loss": 0.0565, "step": 36000 }, { "epoch": 27.973183054799843, "eval_accuracy": 0.9960161298158675, "eval_runtime": 876.7552, "eval_samples_per_second": 23.476, "eval_steps_per_second": 2.935, "step": 36000 }, { "epoch": 28.01165954139137, "grad_norm": 0.7705150246620178, "learning_rate": 8.629e-05, "loss": 0.0467, "step": 36050 }, { "epoch": 28.05052467936261, "grad_norm": 3.2739124298095703, "learning_rate": 8.626368421052631e-05, "loss": 0.0295, "step": 36100 }, { "epoch": 28.089389817333853, "grad_norm": 0.19782505929470062, "learning_rate": 8.623736842105264e-05, "loss": 0.0394, "step": 36150 }, { "epoch": 28.128254955305092, "grad_norm": 3.2549073696136475, "learning_rate": 8.621105263157895e-05, "loss": 0.0284, "step": 36200 }, { "epoch": 28.16712009327633, "grad_norm": 0.6249772310256958, "learning_rate": 8.618473684210526e-05, "loss": 0.0537, "step": 36250 }, { "epoch": 28.20598523124757, "grad_norm": 2.322044610977173, "learning_rate": 8.615842105263159e-05, "loss": 0.0506, "step": 36300 }, { "epoch": 28.24485036921881, "grad_norm": 1.3696391582489014, "learning_rate": 8.61321052631579e-05, "loss": 0.0373, "step": 36350 }, { "epoch": 28.28371550719005, "grad_norm": 0.34534499049186707, "learning_rate": 8.610578947368422e-05, "loss": 0.0431, "step": 36400 }, { "epoch": 28.322580645161292, "grad_norm": 1.8817704916000366, "learning_rate": 8.607947368421054e-05, "loss": 0.0459, "step": 36450 }, { "epoch": 28.36144578313253, "grad_norm": 3.955824851989746, "learning_rate": 8.605315789473685e-05, "loss": 0.0485, "step": 36500 }, { "epoch": 28.40031092110377, "grad_norm": 0.8568444848060608, "learning_rate": 8.602684210526316e-05, "loss": 0.0366, "step": 36550 }, { "epoch": 28.43917605907501, "grad_norm": 2.062861442565918, "learning_rate": 8.600052631578947e-05, "loss": 0.0382, "step": 36600 }, { "epoch": 28.47804119704625, "grad_norm": 0.24858438968658447, "learning_rate": 8.59742105263158e-05, "loss": 0.0453, "step": 36650 }, { "epoch": 28.516906335017488, "grad_norm": 1.0825531482696533, "learning_rate": 8.594789473684211e-05, "loss": 0.0384, "step": 36700 }, { "epoch": 28.55577147298873, "grad_norm": 1.907128095626831, "learning_rate": 8.592157894736842e-05, "loss": 0.0483, "step": 36750 }, { "epoch": 28.59463661095997, "grad_norm": 1.5113343000411987, "learning_rate": 8.589526315789474e-05, "loss": 0.0512, "step": 36800 }, { "epoch": 28.63350174893121, "grad_norm": 3.101874828338623, "learning_rate": 8.586894736842106e-05, "loss": 0.0401, "step": 36850 }, { "epoch": 28.67236688690245, "grad_norm": 2.6152641773223877, "learning_rate": 8.584263157894738e-05, "loss": 0.0415, "step": 36900 }, { "epoch": 28.711232024873688, "grad_norm": 4.213351726531982, "learning_rate": 8.581631578947369e-05, "loss": 0.0266, "step": 36950 }, { "epoch": 28.750097162844927, "grad_norm": 3.643454074859619, "learning_rate": 8.579e-05, "loss": 0.0584, "step": 37000 }, { "epoch": 28.750097162844927, "eval_accuracy": 0.9965505514259341, "eval_runtime": 878.5114, "eval_samples_per_second": 23.429, "eval_steps_per_second": 2.929, "step": 37000 }, { "epoch": 28.788962300816166, "grad_norm": 2.3031961917877197, "learning_rate": 8.576368421052632e-05, "loss": 0.0592, "step": 37050 }, { "epoch": 28.82782743878741, "grad_norm": 0.538669764995575, "learning_rate": 8.573736842105263e-05, "loss": 0.0423, "step": 37100 }, { "epoch": 28.86669257675865, "grad_norm": 4.017199993133545, "learning_rate": 8.571105263157895e-05, "loss": 0.0499, "step": 37150 }, { "epoch": 28.905557714729888, "grad_norm": 2.1706478595733643, "learning_rate": 8.568473684210526e-05, "loss": 0.0395, "step": 37200 }, { "epoch": 28.944422852701127, "grad_norm": 4.465363025665283, "learning_rate": 8.565842105263159e-05, "loss": 0.0491, "step": 37250 }, { "epoch": 28.983287990672366, "grad_norm": 4.176932334899902, "learning_rate": 8.56321052631579e-05, "loss": 0.0595, "step": 37300 }, { "epoch": 29.021764477263893, "grad_norm": 5.255528926849365, "learning_rate": 8.560578947368421e-05, "loss": 0.0385, "step": 37350 }, { "epoch": 29.060629615235133, "grad_norm": 3.7985963821411133, "learning_rate": 8.557947368421054e-05, "loss": 0.0308, "step": 37400 }, { "epoch": 29.099494753206375, "grad_norm": 1.006502628326416, "learning_rate": 8.555315789473685e-05, "loss": 0.0369, "step": 37450 }, { "epoch": 29.138359891177615, "grad_norm": 0.36928635835647583, "learning_rate": 8.552684210526316e-05, "loss": 0.0335, "step": 37500 }, { "epoch": 29.177225029148854, "grad_norm": 1.71954345703125, "learning_rate": 8.550052631578947e-05, "loss": 0.0383, "step": 37550 }, { "epoch": 29.216090167120093, "grad_norm": 0.19994154572486877, "learning_rate": 8.547421052631578e-05, "loss": 0.0379, "step": 37600 }, { "epoch": 29.254955305091332, "grad_norm": 1.6422315835952759, "learning_rate": 8.544789473684211e-05, "loss": 0.0365, "step": 37650 }, { "epoch": 29.29382044306257, "grad_norm": 0.8327766060829163, "learning_rate": 8.542157894736843e-05, "loss": 0.0383, "step": 37700 }, { "epoch": 29.33268558103381, "grad_norm": 0.69755619764328, "learning_rate": 8.539526315789475e-05, "loss": 0.039, "step": 37750 }, { "epoch": 29.371550719005054, "grad_norm": 0.969310462474823, "learning_rate": 8.536894736842106e-05, "loss": 0.0351, "step": 37800 }, { "epoch": 29.410415856976293, "grad_norm": 3.9037134647369385, "learning_rate": 8.534263157894737e-05, "loss": 0.0336, "step": 37850 }, { "epoch": 29.449280994947532, "grad_norm": 6.574547290802002, "learning_rate": 8.53163157894737e-05, "loss": 0.0318, "step": 37900 }, { "epoch": 29.48814613291877, "grad_norm": 0.7531656622886658, "learning_rate": 8.529e-05, "loss": 0.0372, "step": 37950 }, { "epoch": 29.52701127089001, "grad_norm": 3.7106435298919678, "learning_rate": 8.526368421052632e-05, "loss": 0.0369, "step": 38000 }, { "epoch": 29.52701127089001, "eval_accuracy": 0.9972307243842006, "eval_runtime": 877.9929, "eval_samples_per_second": 23.443, "eval_steps_per_second": 2.931, "step": 38000 }, { "epoch": 29.56587640886125, "grad_norm": 3.104320526123047, "learning_rate": 8.523736842105263e-05, "loss": 0.0403, "step": 38050 }, { "epoch": 29.604741546832493, "grad_norm": 0.4907873272895813, "learning_rate": 8.521105263157895e-05, "loss": 0.0412, "step": 38100 }, { "epoch": 29.643606684803732, "grad_norm": 0.19150598347187042, "learning_rate": 8.518473684210528e-05, "loss": 0.0417, "step": 38150 }, { "epoch": 29.68247182277497, "grad_norm": 1.1941745281219482, "learning_rate": 8.515842105263159e-05, "loss": 0.0335, "step": 38200 }, { "epoch": 29.72133696074621, "grad_norm": 0.36566030979156494, "learning_rate": 8.51321052631579e-05, "loss": 0.0461, "step": 38250 }, { "epoch": 29.76020209871745, "grad_norm": 0.7068200707435608, "learning_rate": 8.510578947368421e-05, "loss": 0.0462, "step": 38300 }, { "epoch": 29.79906723668869, "grad_norm": 0.6579626202583313, "learning_rate": 8.507947368421052e-05, "loss": 0.0383, "step": 38350 }, { "epoch": 29.83793237465993, "grad_norm": 2.1728909015655518, "learning_rate": 8.505315789473685e-05, "loss": 0.0398, "step": 38400 }, { "epoch": 29.87679751263117, "grad_norm": 0.7398192286491394, "learning_rate": 8.502684210526316e-05, "loss": 0.0483, "step": 38450 }, { "epoch": 29.91566265060241, "grad_norm": 2.1343936920166016, "learning_rate": 8.500052631578947e-05, "loss": 0.0519, "step": 38500 }, { "epoch": 29.95452778857365, "grad_norm": 0.2378653883934021, "learning_rate": 8.497421052631578e-05, "loss": 0.0384, "step": 38550 }, { "epoch": 29.99339292654489, "grad_norm": 2.9573421478271484, "learning_rate": 8.494789473684211e-05, "loss": 0.0541, "step": 38600 }, { "epoch": 30.031869413136416, "grad_norm": 0.5036262273788452, "learning_rate": 8.492157894736843e-05, "loss": 0.0295, "step": 38650 }, { "epoch": 30.070734551107655, "grad_norm": 1.123215913772583, "learning_rate": 8.489526315789475e-05, "loss": 0.0347, "step": 38700 }, { "epoch": 30.109599689078895, "grad_norm": 3.358325958251953, "learning_rate": 8.486894736842106e-05, "loss": 0.043, "step": 38750 }, { "epoch": 30.148464827050137, "grad_norm": 1.2945706844329834, "learning_rate": 8.484263157894737e-05, "loss": 0.0274, "step": 38800 }, { "epoch": 30.187329965021377, "grad_norm": 0.08089222759008408, "learning_rate": 8.48163157894737e-05, "loss": 0.026, "step": 38850 }, { "epoch": 30.226195102992616, "grad_norm": 0.9981313347816467, "learning_rate": 8.479e-05, "loss": 0.0283, "step": 38900 }, { "epoch": 30.265060240963855, "grad_norm": 2.40240740776062, "learning_rate": 8.476368421052632e-05, "loss": 0.0409, "step": 38950 }, { "epoch": 30.303925378935094, "grad_norm": 2.257906913757324, "learning_rate": 8.473736842105263e-05, "loss": 0.0336, "step": 39000 }, { "epoch": 30.303925378935094, "eval_accuracy": 0.996696302774134, "eval_runtime": 876.703, "eval_samples_per_second": 23.478, "eval_steps_per_second": 2.935, "step": 39000 }, { "epoch": 30.342790516906334, "grad_norm": 3.1069672107696533, "learning_rate": 8.471105263157894e-05, "loss": 0.0354, "step": 39050 }, { "epoch": 30.381655654877576, "grad_norm": 0.984092652797699, "learning_rate": 8.468473684210527e-05, "loss": 0.0283, "step": 39100 }, { "epoch": 30.420520792848816, "grad_norm": 1.6073585748672485, "learning_rate": 8.465842105263159e-05, "loss": 0.0329, "step": 39150 }, { "epoch": 30.459385930820055, "grad_norm": 0.9413264989852905, "learning_rate": 8.46321052631579e-05, "loss": 0.0355, "step": 39200 }, { "epoch": 30.498251068791294, "grad_norm": 0.16653196513652802, "learning_rate": 8.460578947368421e-05, "loss": 0.0349, "step": 39250 }, { "epoch": 30.537116206762533, "grad_norm": 2.087178945541382, "learning_rate": 8.457947368421053e-05, "loss": 0.0516, "step": 39300 }, { "epoch": 30.575981344733773, "grad_norm": 0.6503152847290039, "learning_rate": 8.455315789473685e-05, "loss": 0.0403, "step": 39350 }, { "epoch": 30.614846482705012, "grad_norm": 0.18071837723255157, "learning_rate": 8.452684210526316e-05, "loss": 0.0291, "step": 39400 }, { "epoch": 30.653711620676255, "grad_norm": 0.3294931650161743, "learning_rate": 8.450052631578947e-05, "loss": 0.0266, "step": 39450 }, { "epoch": 30.692576758647494, "grad_norm": 2.715810537338257, "learning_rate": 8.447421052631579e-05, "loss": 0.0331, "step": 39500 }, { "epoch": 30.731441896618733, "grad_norm": 0.7051801085472107, "learning_rate": 8.444789473684211e-05, "loss": 0.0353, "step": 39550 }, { "epoch": 30.770307034589973, "grad_norm": 0.311470627784729, "learning_rate": 8.442157894736842e-05, "loss": 0.0304, "step": 39600 }, { "epoch": 30.80917217256121, "grad_norm": 0.666933536529541, "learning_rate": 8.439526315789475e-05, "loss": 0.0304, "step": 39650 }, { "epoch": 30.84803731053245, "grad_norm": 3.9159634113311768, "learning_rate": 8.436894736842106e-05, "loss": 0.0424, "step": 39700 }, { "epoch": 30.886902448503694, "grad_norm": 1.1517620086669922, "learning_rate": 8.434263157894737e-05, "loss": 0.0389, "step": 39750 }, { "epoch": 30.925767586474933, "grad_norm": 2.5502254962921143, "learning_rate": 8.431631578947368e-05, "loss": 0.0405, "step": 39800 }, { "epoch": 30.964632724446172, "grad_norm": 2.4017560482025146, "learning_rate": 8.429000000000001e-05, "loss": 0.0317, "step": 39850 }, { "epoch": 31.0031092110377, "grad_norm": 1.4030591249465942, "learning_rate": 8.426368421052632e-05, "loss": 0.0405, "step": 39900 }, { "epoch": 31.04197434900894, "grad_norm": 3.136474847793579, "learning_rate": 8.423736842105263e-05, "loss": 0.0235, "step": 39950 }, { "epoch": 31.080839486980178, "grad_norm": 0.37211865186691284, "learning_rate": 8.421105263157894e-05, "loss": 0.0168, "step": 40000 }, { "epoch": 31.080839486980178, "eval_accuracy": 0.9970363892532672, "eval_runtime": 873.7751, "eval_samples_per_second": 23.556, "eval_steps_per_second": 2.945, "step": 40000 }, { "epoch": 31.119704624951417, "grad_norm": 1.3298871517181396, "learning_rate": 8.418473684210527e-05, "loss": 0.0258, "step": 40050 }, { "epoch": 31.15856976292266, "grad_norm": 0.12341593205928802, "learning_rate": 8.415842105263159e-05, "loss": 0.0387, "step": 40100 }, { "epoch": 31.1974349008939, "grad_norm": 0.411594420671463, "learning_rate": 8.41321052631579e-05, "loss": 0.0306, "step": 40150 }, { "epoch": 31.23630003886514, "grad_norm": 1.527314305305481, "learning_rate": 8.410578947368422e-05, "loss": 0.0275, "step": 40200 }, { "epoch": 31.275165176836378, "grad_norm": 1.303266167640686, "learning_rate": 8.407947368421053e-05, "loss": 0.0225, "step": 40250 }, { "epoch": 31.314030314807617, "grad_norm": 4.040980815887451, "learning_rate": 8.405315789473684e-05, "loss": 0.0302, "step": 40300 }, { "epoch": 31.352895452778856, "grad_norm": 95.21772766113281, "learning_rate": 8.402684210526316e-05, "loss": 0.0341, "step": 40350 }, { "epoch": 31.391760590750096, "grad_norm": 2.755831241607666, "learning_rate": 8.400052631578948e-05, "loss": 0.0219, "step": 40400 }, { "epoch": 31.43062572872134, "grad_norm": 0.8875411748886108, "learning_rate": 8.397421052631579e-05, "loss": 0.0312, "step": 40450 }, { "epoch": 31.469490866692578, "grad_norm": 0.5669600963592529, "learning_rate": 8.394789473684211e-05, "loss": 0.0319, "step": 40500 }, { "epoch": 31.508356004663817, "grad_norm": 2.629176139831543, "learning_rate": 8.392157894736842e-05, "loss": 0.0415, "step": 40550 }, { "epoch": 31.547221142635056, "grad_norm": 0.5514523983001709, "learning_rate": 8.389526315789475e-05, "loss": 0.0363, "step": 40600 }, { "epoch": 31.586086280606295, "grad_norm": 2.319906711578369, "learning_rate": 8.386894736842106e-05, "loss": 0.0284, "step": 40650 }, { "epoch": 31.624951418577535, "grad_norm": 3.822218894958496, "learning_rate": 8.384263157894737e-05, "loss": 0.0332, "step": 40700 }, { "epoch": 31.663816556548774, "grad_norm": 0.9227602481842041, "learning_rate": 8.381631578947368e-05, "loss": 0.0542, "step": 40750 }, { "epoch": 31.702681694520017, "grad_norm": 1.3786684274673462, "learning_rate": 8.379e-05, "loss": 0.0468, "step": 40800 }, { "epoch": 31.741546832491256, "grad_norm": 1.7518463134765625, "learning_rate": 8.376368421052632e-05, "loss": 0.0376, "step": 40850 }, { "epoch": 31.780411970462495, "grad_norm": 3.1545917987823486, "learning_rate": 8.373736842105263e-05, "loss": 0.0344, "step": 40900 }, { "epoch": 31.819277108433734, "grad_norm": 0.835984468460083, "learning_rate": 8.371105263157896e-05, "loss": 0.033, "step": 40950 }, { "epoch": 31.858142246404974, "grad_norm": 1.3513562679290771, "learning_rate": 8.368473684210527e-05, "loss": 0.0357, "step": 41000 }, { "epoch": 31.858142246404974, "eval_accuracy": 0.9974250595151338, "eval_runtime": 876.8345, "eval_samples_per_second": 23.474, "eval_steps_per_second": 2.934, "step": 41000 }, { "epoch": 31.897007384376213, "grad_norm": 1.5770503282546997, "learning_rate": 8.365842105263158e-05, "loss": 0.0305, "step": 41050 }, { "epoch": 31.935872522347456, "grad_norm": 2.7758595943450928, "learning_rate": 8.36321052631579e-05, "loss": 0.0329, "step": 41100 }, { "epoch": 31.974737660318695, "grad_norm": 0.14476031064987183, "learning_rate": 8.360578947368422e-05, "loss": 0.0364, "step": 41150 }, { "epoch": 32.01321414691022, "grad_norm": 2.3019137382507324, "learning_rate": 8.357947368421053e-05, "loss": 0.0389, "step": 41200 }, { "epoch": 32.05207928488146, "grad_norm": 1.0233415365219116, "learning_rate": 8.355315789473684e-05, "loss": 0.0246, "step": 41250 }, { "epoch": 32.0909444228527, "grad_norm": 0.19109289348125458, "learning_rate": 8.352684210526315e-05, "loss": 0.0248, "step": 41300 }, { "epoch": 32.129809560823944, "grad_norm": 0.2724582850933075, "learning_rate": 8.350052631578948e-05, "loss": 0.0193, "step": 41350 }, { "epoch": 32.16867469879518, "grad_norm": 0.41294848918914795, "learning_rate": 8.347421052631579e-05, "loss": 0.0345, "step": 41400 }, { "epoch": 32.20753983676642, "grad_norm": 0.17901839315891266, "learning_rate": 8.344789473684211e-05, "loss": 0.0325, "step": 41450 }, { "epoch": 32.24640497473766, "grad_norm": 0.3015025854110718, "learning_rate": 8.342157894736843e-05, "loss": 0.0226, "step": 41500 }, { "epoch": 32.2852701127089, "grad_norm": 1.5335288047790527, "learning_rate": 8.339526315789474e-05, "loss": 0.0297, "step": 41550 }, { "epoch": 32.32413525068014, "grad_norm": 0.370561420917511, "learning_rate": 8.336894736842106e-05, "loss": 0.0206, "step": 41600 }, { "epoch": 32.36300038865138, "grad_norm": 2.148465156555176, "learning_rate": 8.334263157894737e-05, "loss": 0.0278, "step": 41650 }, { "epoch": 32.40186552662262, "grad_norm": 0.1226087436079979, "learning_rate": 8.331631578947369e-05, "loss": 0.042, "step": 41700 }, { "epoch": 32.44073066459386, "grad_norm": 0.08631648868322372, "learning_rate": 8.329e-05, "loss": 0.0245, "step": 41750 }, { "epoch": 32.4795958025651, "grad_norm": 0.32861921191215515, "learning_rate": 8.326368421052632e-05, "loss": 0.035, "step": 41800 }, { "epoch": 32.518460940536336, "grad_norm": 0.6523399353027344, "learning_rate": 8.323736842105263e-05, "loss": 0.0235, "step": 41850 }, { "epoch": 32.55732607850758, "grad_norm": 5.079765796661377, "learning_rate": 8.321105263157896e-05, "loss": 0.0356, "step": 41900 }, { "epoch": 32.59619121647882, "grad_norm": 0.8504376411437988, "learning_rate": 8.318473684210527e-05, "loss": 0.035, "step": 41950 }, { "epoch": 32.63505635445006, "grad_norm": 0.7626635432243347, "learning_rate": 8.315842105263158e-05, "loss": 0.0365, "step": 42000 }, { "epoch": 32.63505635445006, "eval_accuracy": 0.9966477189914007, "eval_runtime": 949.1706, "eval_samples_per_second": 21.685, "eval_steps_per_second": 2.711, "step": 42000 }, { "epoch": 32.6739214924213, "grad_norm": 0.28948360681533813, "learning_rate": 8.31321052631579e-05, "loss": 0.0349, "step": 42050 }, { "epoch": 32.712786630392536, "grad_norm": 1.969033122062683, "learning_rate": 8.310578947368422e-05, "loss": 0.0341, "step": 42100 }, { "epoch": 32.75165176836378, "grad_norm": 0.21110635995864868, "learning_rate": 8.307947368421053e-05, "loss": 0.0275, "step": 42150 }, { "epoch": 32.790516906335014, "grad_norm": 1.5202889442443848, "learning_rate": 8.305315789473684e-05, "loss": 0.0414, "step": 42200 }, { "epoch": 32.82938204430626, "grad_norm": 1.3945695161819458, "learning_rate": 8.302684210526315e-05, "loss": 0.036, "step": 42250 }, { "epoch": 32.8682471822775, "grad_norm": 1.1253477334976196, "learning_rate": 8.300052631578948e-05, "loss": 0.0302, "step": 42300 }, { "epoch": 32.907112320248736, "grad_norm": 0.15563784539699554, "learning_rate": 8.29742105263158e-05, "loss": 0.0343, "step": 42350 }, { "epoch": 32.94597745821998, "grad_norm": 2.101106643676758, "learning_rate": 8.294789473684212e-05, "loss": 0.0321, "step": 42400 }, { "epoch": 32.984842596191214, "grad_norm": 3.4534518718719482, "learning_rate": 8.292157894736843e-05, "loss": 0.0265, "step": 42450 }, { "epoch": 33.02331908278274, "grad_norm": 1.4187448024749756, "learning_rate": 8.289526315789474e-05, "loss": 0.0294, "step": 42500 }, { "epoch": 33.062184220753984, "grad_norm": 4.081812381744385, "learning_rate": 8.286894736842106e-05, "loss": 0.0302, "step": 42550 }, { "epoch": 33.10104935872522, "grad_norm": 3.367576837539673, "learning_rate": 8.284263157894738e-05, "loss": 0.0214, "step": 42600 }, { "epoch": 33.13991449669646, "grad_norm": 0.43796464800834656, "learning_rate": 8.281631578947369e-05, "loss": 0.021, "step": 42650 }, { "epoch": 33.178779634667706, "grad_norm": 0.7556443214416504, "learning_rate": 8.279e-05, "loss": 0.0307, "step": 42700 }, { "epoch": 33.21764477263894, "grad_norm": 0.5198609232902527, "learning_rate": 8.276368421052631e-05, "loss": 0.0255, "step": 42750 }, { "epoch": 33.256509910610184, "grad_norm": 0.27239367365837097, "learning_rate": 8.273736842105264e-05, "loss": 0.0215, "step": 42800 }, { "epoch": 33.29537504858142, "grad_norm": 2.588319778442383, "learning_rate": 8.271105263157896e-05, "loss": 0.0422, "step": 42850 }, { "epoch": 33.33424018655266, "grad_norm": 0.2941960394382477, "learning_rate": 8.268473684210527e-05, "loss": 0.0274, "step": 42900 }, { "epoch": 33.373105324523905, "grad_norm": 2.5152249336242676, "learning_rate": 8.265842105263158e-05, "loss": 0.0167, "step": 42950 }, { "epoch": 33.41197046249514, "grad_norm": 2.8935768604278564, "learning_rate": 8.26321052631579e-05, "loss": 0.0391, "step": 43000 }, { "epoch": 33.41197046249514, "eval_accuracy": 0.9973764757324005, "eval_runtime": 876.8781, "eval_samples_per_second": 23.473, "eval_steps_per_second": 2.934, "step": 43000 }, { "epoch": 33.450835600466384, "grad_norm": 0.657610297203064, "learning_rate": 8.260578947368422e-05, "loss": 0.0223, "step": 43050 }, { "epoch": 33.48970073843762, "grad_norm": 0.24365423619747162, "learning_rate": 8.257947368421053e-05, "loss": 0.0204, "step": 43100 }, { "epoch": 33.52856587640886, "grad_norm": 0.0590318888425827, "learning_rate": 8.255315789473684e-05, "loss": 0.0239, "step": 43150 }, { "epoch": 33.5674310143801, "grad_norm": 0.2320161759853363, "learning_rate": 8.252684210526315e-05, "loss": 0.0308, "step": 43200 }, { "epoch": 33.60629615235134, "grad_norm": 2.4144468307495117, "learning_rate": 8.250052631578947e-05, "loss": 0.0246, "step": 43250 }, { "epoch": 33.645161290322584, "grad_norm": 0.04267783835530281, "learning_rate": 8.247421052631579e-05, "loss": 0.0399, "step": 43300 }, { "epoch": 33.68402642829382, "grad_norm": 0.4845680594444275, "learning_rate": 8.244789473684212e-05, "loss": 0.0372, "step": 43350 }, { "epoch": 33.72289156626506, "grad_norm": 0.2614389657974243, "learning_rate": 8.242157894736843e-05, "loss": 0.0319, "step": 43400 }, { "epoch": 33.7617567042363, "grad_norm": 1.213369607925415, "learning_rate": 8.239526315789474e-05, "loss": 0.0387, "step": 43450 }, { "epoch": 33.80062184220754, "grad_norm": 0.19858339428901672, "learning_rate": 8.236894736842105e-05, "loss": 0.0276, "step": 43500 }, { "epoch": 33.839486980178776, "grad_norm": 0.058305270969867706, "learning_rate": 8.234263157894738e-05, "loss": 0.0421, "step": 43550 }, { "epoch": 33.87835211815002, "grad_norm": 0.3969908356666565, "learning_rate": 8.231631578947369e-05, "loss": 0.0432, "step": 43600 }, { "epoch": 33.91721725612126, "grad_norm": 3.350163459777832, "learning_rate": 8.229e-05, "loss": 0.0373, "step": 43650 }, { "epoch": 33.9560823940925, "grad_norm": 1.675500750541687, "learning_rate": 8.226368421052631e-05, "loss": 0.0283, "step": 43700 }, { "epoch": 33.99494753206374, "grad_norm": 1.9610657691955566, "learning_rate": 8.223736842105264e-05, "loss": 0.0383, "step": 43750 }, { "epoch": 34.03342401865527, "grad_norm": 0.2761792540550232, "learning_rate": 8.221105263157896e-05, "loss": 0.0158, "step": 43800 }, { "epoch": 34.0722891566265, "grad_norm": 4.586607456207275, "learning_rate": 8.218473684210527e-05, "loss": 0.0273, "step": 43850 }, { "epoch": 34.111154294597746, "grad_norm": 0.15766066312789917, "learning_rate": 8.215842105263158e-05, "loss": 0.0237, "step": 43900 }, { "epoch": 34.15001943256899, "grad_norm": 0.7128147482872009, "learning_rate": 8.21321052631579e-05, "loss": 0.0289, "step": 43950 }, { "epoch": 34.188884570540225, "grad_norm": 0.08594869077205658, "learning_rate": 8.210578947368421e-05, "loss": 0.024, "step": 44000 }, { "epoch": 34.188884570540225, "eval_accuracy": 0.9972307243842006, "eval_runtime": 876.3484, "eval_samples_per_second": 23.487, "eval_steps_per_second": 2.936, "step": 44000 }, { "epoch": 34.22774970851147, "grad_norm": 2.7693264484405518, "learning_rate": 8.207947368421053e-05, "loss": 0.0224, "step": 44050 }, { "epoch": 34.2666148464827, "grad_norm": 1.722844123840332, "learning_rate": 8.205315789473684e-05, "loss": 0.0398, "step": 44100 }, { "epoch": 34.305479984453946, "grad_norm": 4.099954605102539, "learning_rate": 8.202684210526316e-05, "loss": 0.0328, "step": 44150 }, { "epoch": 34.34434512242518, "grad_norm": 5.51669979095459, "learning_rate": 8.200052631578947e-05, "loss": 0.0368, "step": 44200 }, { "epoch": 34.383210260396424, "grad_norm": 0.23224566876888275, "learning_rate": 8.197421052631579e-05, "loss": 0.0232, "step": 44250 }, { "epoch": 34.42207539836767, "grad_norm": 0.1644495576620102, "learning_rate": 8.194789473684212e-05, "loss": 0.019, "step": 44300 }, { "epoch": 34.4609405363389, "grad_norm": 1.9373985528945923, "learning_rate": 8.192157894736843e-05, "loss": 0.0294, "step": 44350 }, { "epoch": 34.499805674310146, "grad_norm": 1.882383942604065, "learning_rate": 8.189526315789474e-05, "loss": 0.0259, "step": 44400 }, { "epoch": 34.53867081228138, "grad_norm": 3.6029393672943115, "learning_rate": 8.186894736842105e-05, "loss": 0.0256, "step": 44450 }, { "epoch": 34.577535950252624, "grad_norm": 0.5291832089424133, "learning_rate": 8.184263157894736e-05, "loss": 0.0195, "step": 44500 }, { "epoch": 34.61640108822386, "grad_norm": 0.31490087509155273, "learning_rate": 8.181631578947369e-05, "loss": 0.0193, "step": 44550 }, { "epoch": 34.6552662261951, "grad_norm": 3.0802714824676514, "learning_rate": 8.179e-05, "loss": 0.0245, "step": 44600 }, { "epoch": 34.694131364166346, "grad_norm": 0.4628906846046448, "learning_rate": 8.176368421052631e-05, "loss": 0.0295, "step": 44650 }, { "epoch": 34.73299650213758, "grad_norm": 0.29779794812202454, "learning_rate": 8.173736842105264e-05, "loss": 0.0219, "step": 44700 }, { "epoch": 34.771861640108824, "grad_norm": 2.227691650390625, "learning_rate": 8.171105263157895e-05, "loss": 0.0282, "step": 44750 }, { "epoch": 34.81072677808006, "grad_norm": 0.709452211856842, "learning_rate": 8.168473684210527e-05, "loss": 0.0331, "step": 44800 }, { "epoch": 34.8495919160513, "grad_norm": 0.4563519358634949, "learning_rate": 8.165842105263159e-05, "loss": 0.0263, "step": 44850 }, { "epoch": 34.88845705402254, "grad_norm": 0.22809520363807678, "learning_rate": 8.16321052631579e-05, "loss": 0.0285, "step": 44900 }, { "epoch": 34.92732219199378, "grad_norm": 3.2207274436950684, "learning_rate": 8.160578947368421e-05, "loss": 0.0288, "step": 44950 }, { "epoch": 34.966187329965024, "grad_norm": 3.6163206100463867, "learning_rate": 8.157947368421053e-05, "loss": 0.0318, "step": 45000 }, { "epoch": 34.966187329965024, "eval_accuracy": 0.9970849730360006, "eval_runtime": 876.4574, "eval_samples_per_second": 23.484, "eval_steps_per_second": 2.936, "step": 45000 }, { "epoch": 35.00466381655655, "grad_norm": 0.31362563371658325, "learning_rate": 8.155315789473685e-05, "loss": 0.0215, "step": 45050 }, { "epoch": 35.04352895452779, "grad_norm": 1.6593172550201416, "learning_rate": 8.152684210526316e-05, "loss": 0.0203, "step": 45100 }, { "epoch": 35.08239409249903, "grad_norm": 0.22460521757602692, "learning_rate": 8.150052631578948e-05, "loss": 0.0212, "step": 45150 }, { "epoch": 35.121259230470265, "grad_norm": 0.0782230868935585, "learning_rate": 8.14742105263158e-05, "loss": 0.025, "step": 45200 }, { "epoch": 35.16012436844151, "grad_norm": 0.7649909257888794, "learning_rate": 8.14478947368421e-05, "loss": 0.0192, "step": 45250 }, { "epoch": 35.19898950641275, "grad_norm": 0.5605923533439636, "learning_rate": 8.142157894736843e-05, "loss": 0.0276, "step": 45300 }, { "epoch": 35.23785464438399, "grad_norm": 0.656554102897644, "learning_rate": 8.139526315789474e-05, "loss": 0.0293, "step": 45350 }, { "epoch": 35.27671978235523, "grad_norm": 0.15811707079410553, "learning_rate": 8.136894736842105e-05, "loss": 0.0222, "step": 45400 }, { "epoch": 35.315584920326465, "grad_norm": 0.06599489599466324, "learning_rate": 8.134263157894737e-05, "loss": 0.0213, "step": 45450 }, { "epoch": 35.35445005829771, "grad_norm": 2.348271608352661, "learning_rate": 8.131631578947369e-05, "loss": 0.0211, "step": 45500 }, { "epoch": 35.393315196268944, "grad_norm": 0.1668296754360199, "learning_rate": 8.129e-05, "loss": 0.0202, "step": 45550 }, { "epoch": 35.432180334240186, "grad_norm": 0.6717625856399536, "learning_rate": 8.126368421052633e-05, "loss": 0.0147, "step": 45600 }, { "epoch": 35.47104547221143, "grad_norm": 0.5778363943099976, "learning_rate": 8.123736842105264e-05, "loss": 0.0246, "step": 45650 }, { "epoch": 35.509910610182665, "grad_norm": 1.1340782642364502, "learning_rate": 8.121105263157895e-05, "loss": 0.0201, "step": 45700 }, { "epoch": 35.54877574815391, "grad_norm": 0.16294288635253906, "learning_rate": 8.118473684210526e-05, "loss": 0.0276, "step": 45750 }, { "epoch": 35.58764088612514, "grad_norm": 0.07922358065843582, "learning_rate": 8.115842105263159e-05, "loss": 0.0224, "step": 45800 }, { "epoch": 35.626506024096386, "grad_norm": 0.037852223962545395, "learning_rate": 8.11321052631579e-05, "loss": 0.0209, "step": 45850 }, { "epoch": 35.66537116206762, "grad_norm": 0.5991801619529724, "learning_rate": 8.110578947368421e-05, "loss": 0.0223, "step": 45900 }, { "epoch": 35.704236300038865, "grad_norm": 1.8638439178466797, "learning_rate": 8.107947368421052e-05, "loss": 0.0294, "step": 45950 }, { "epoch": 35.74310143801011, "grad_norm": 1.1038992404937744, "learning_rate": 8.105315789473685e-05, "loss": 0.0162, "step": 46000 }, { "epoch": 35.74310143801011, "eval_accuracy": 0.9976679784288005, "eval_runtime": 878.5451, "eval_samples_per_second": 23.429, "eval_steps_per_second": 2.929, "step": 46000 }, { "epoch": 35.78196657598134, "grad_norm": 0.5733603239059448, "learning_rate": 8.102684210526316e-05, "loss": 0.0336, "step": 46050 }, { "epoch": 35.820831713952586, "grad_norm": 5.950699329376221, "learning_rate": 8.100052631578948e-05, "loss": 0.0273, "step": 46100 }, { "epoch": 35.85969685192382, "grad_norm": 0.018481293693184853, "learning_rate": 8.09742105263158e-05, "loss": 0.0169, "step": 46150 }, { "epoch": 35.898561989895065, "grad_norm": 0.4097042977809906, "learning_rate": 8.094789473684211e-05, "loss": 0.0285, "step": 46200 }, { "epoch": 35.93742712786631, "grad_norm": 0.5013408660888672, "learning_rate": 8.092157894736843e-05, "loss": 0.0241, "step": 46250 }, { "epoch": 35.97629226583754, "grad_norm": 0.3590555489063263, "learning_rate": 8.089526315789474e-05, "loss": 0.0261, "step": 46300 }, { "epoch": 36.01476875242907, "grad_norm": 3.033891201019287, "learning_rate": 8.086894736842106e-05, "loss": 0.0256, "step": 46350 }, { "epoch": 36.05363389040031, "grad_norm": 2.469217300415039, "learning_rate": 8.084263157894737e-05, "loss": 0.0325, "step": 46400 }, { "epoch": 36.09249902837155, "grad_norm": 1.083754301071167, "learning_rate": 8.081631578947368e-05, "loss": 0.0166, "step": 46450 }, { "epoch": 36.13136416634279, "grad_norm": 0.434164434671402, "learning_rate": 8.079e-05, "loss": 0.0146, "step": 46500 }, { "epoch": 36.17022930431403, "grad_norm": 1.0752344131469727, "learning_rate": 8.076368421052633e-05, "loss": 0.015, "step": 46550 }, { "epoch": 36.20909444228527, "grad_norm": 1.8302538394927979, "learning_rate": 8.073736842105264e-05, "loss": 0.0243, "step": 46600 }, { "epoch": 36.24795958025651, "grad_norm": 0.20908156037330627, "learning_rate": 8.071105263157895e-05, "loss": 0.0236, "step": 46650 }, { "epoch": 36.28682471822775, "grad_norm": 0.074724480509758, "learning_rate": 8.068473684210526e-05, "loss": 0.0148, "step": 46700 }, { "epoch": 36.32568985619899, "grad_norm": 2.220067024230957, "learning_rate": 8.065842105263159e-05, "loss": 0.0237, "step": 46750 }, { "epoch": 36.36455499417023, "grad_norm": 1.991211175918579, "learning_rate": 8.06321052631579e-05, "loss": 0.0263, "step": 46800 }, { "epoch": 36.40342013214147, "grad_norm": 0.17283689975738525, "learning_rate": 8.060578947368421e-05, "loss": 0.0227, "step": 46850 }, { "epoch": 36.442285270112706, "grad_norm": 4.825707912445068, "learning_rate": 8.057947368421052e-05, "loss": 0.0203, "step": 46900 }, { "epoch": 36.48115040808395, "grad_norm": 2.0094900131225586, "learning_rate": 8.055315789473684e-05, "loss": 0.0253, "step": 46950 }, { "epoch": 36.52001554605519, "grad_norm": 1.4706486463546753, "learning_rate": 8.052684210526316e-05, "loss": 0.0355, "step": 47000 }, { "epoch": 36.52001554605519, "eval_accuracy": 0.9974250595151338, "eval_runtime": 880.3189, "eval_samples_per_second": 23.381, "eval_steps_per_second": 2.923, "step": 47000 }, { "epoch": 36.55888068402643, "grad_norm": 1.3879079818725586, "learning_rate": 8.050052631578949e-05, "loss": 0.0203, "step": 47050 }, { "epoch": 36.59774582199767, "grad_norm": 0.15365901589393616, "learning_rate": 8.04742105263158e-05, "loss": 0.0224, "step": 47100 }, { "epoch": 36.636610959968905, "grad_norm": 3.338381052017212, "learning_rate": 8.044789473684211e-05, "loss": 0.0238, "step": 47150 }, { "epoch": 36.67547609794015, "grad_norm": 0.1772722750902176, "learning_rate": 8.042157894736842e-05, "loss": 0.0268, "step": 47200 }, { "epoch": 36.71434123591139, "grad_norm": 1.338255763053894, "learning_rate": 8.039526315789475e-05, "loss": 0.027, "step": 47250 }, { "epoch": 36.75320637388263, "grad_norm": 3.2425434589385986, "learning_rate": 8.036894736842106e-05, "loss": 0.0209, "step": 47300 }, { "epoch": 36.79207151185387, "grad_norm": 2.552506923675537, "learning_rate": 8.034263157894737e-05, "loss": 0.034, "step": 47350 }, { "epoch": 36.830936649825105, "grad_norm": 0.42373234033584595, "learning_rate": 8.031631578947368e-05, "loss": 0.0203, "step": 47400 }, { "epoch": 36.86980178779635, "grad_norm": 2.9511702060699463, "learning_rate": 8.028999999999999e-05, "loss": 0.0199, "step": 47450 }, { "epoch": 36.908666925767584, "grad_norm": 0.06529057770967484, "learning_rate": 8.026368421052633e-05, "loss": 0.0216, "step": 47500 }, { "epoch": 36.94753206373883, "grad_norm": 1.3030205965042114, "learning_rate": 8.023736842105264e-05, "loss": 0.0406, "step": 47550 }, { "epoch": 36.98639720171007, "grad_norm": 0.37334632873535156, "learning_rate": 8.021105263157895e-05, "loss": 0.0285, "step": 47600 }, { "epoch": 37.0248736883016, "grad_norm": 1.9018858671188354, "learning_rate": 8.018473684210527e-05, "loss": 0.0219, "step": 47650 }, { "epoch": 37.06373882627283, "grad_norm": 0.15374009311199188, "learning_rate": 8.015842105263158e-05, "loss": 0.0205, "step": 47700 }, { "epoch": 37.102603964244075, "grad_norm": 0.3583906590938568, "learning_rate": 8.01321052631579e-05, "loss": 0.018, "step": 47750 }, { "epoch": 37.14146910221531, "grad_norm": 2.795750856399536, "learning_rate": 8.010578947368421e-05, "loss": 0.0178, "step": 47800 }, { "epoch": 37.18033424018655, "grad_norm": 0.7013401389122009, "learning_rate": 8.007947368421053e-05, "loss": 0.0303, "step": 47850 }, { "epoch": 37.21919937815779, "grad_norm": 1.7055522203445435, "learning_rate": 8.005315789473684e-05, "loss": 0.0174, "step": 47900 }, { "epoch": 37.25806451612903, "grad_norm": 0.1926261931657791, "learning_rate": 8.002684210526316e-05, "loss": 0.0275, "step": 47950 }, { "epoch": 37.296929654100275, "grad_norm": 3.2588279247283936, "learning_rate": 8.000052631578949e-05, "loss": 0.0146, "step": 48000 }, { "epoch": 37.296929654100275, "eval_accuracy": 0.9987368216489336, "eval_runtime": 877.0856, "eval_samples_per_second": 23.467, "eval_steps_per_second": 2.934, "step": 48000 }, { "epoch": 37.33579479207151, "grad_norm": 1.3797889947891235, "learning_rate": 7.99742105263158e-05, "loss": 0.0228, "step": 48050 }, { "epoch": 37.37465993004275, "grad_norm": 0.4642263352870941, "learning_rate": 7.994789473684211e-05, "loss": 0.0276, "step": 48100 }, { "epoch": 37.41352506801399, "grad_norm": 0.7000479102134705, "learning_rate": 7.992157894736842e-05, "loss": 0.0095, "step": 48150 }, { "epoch": 37.45239020598523, "grad_norm": 0.5434962511062622, "learning_rate": 7.989526315789473e-05, "loss": 0.0167, "step": 48200 }, { "epoch": 37.49125534395647, "grad_norm": 0.2139407992362976, "learning_rate": 7.986894736842106e-05, "loss": 0.0156, "step": 48250 }, { "epoch": 37.53012048192771, "grad_norm": 0.040264032781124115, "learning_rate": 7.984263157894737e-05, "loss": 0.0212, "step": 48300 }, { "epoch": 37.56898561989895, "grad_norm": 0.15313409268856049, "learning_rate": 7.981631578947368e-05, "loss": 0.0182, "step": 48350 }, { "epoch": 37.60785075787019, "grad_norm": 0.43257951736450195, "learning_rate": 7.979000000000001e-05, "loss": 0.0226, "step": 48400 }, { "epoch": 37.64671589584143, "grad_norm": 2.317460775375366, "learning_rate": 7.976368421052632e-05, "loss": 0.0164, "step": 48450 }, { "epoch": 37.68558103381267, "grad_norm": 0.07254145294427872, "learning_rate": 7.973736842105264e-05, "loss": 0.0194, "step": 48500 }, { "epoch": 37.72444617178391, "grad_norm": 0.2547185719013214, "learning_rate": 7.971105263157896e-05, "loss": 0.0266, "step": 48550 }, { "epoch": 37.76331130975515, "grad_norm": 3.119913101196289, "learning_rate": 7.968473684210527e-05, "loss": 0.0145, "step": 48600 }, { "epoch": 37.80217644772639, "grad_norm": 1.1772373914718628, "learning_rate": 7.965842105263158e-05, "loss": 0.0265, "step": 48650 }, { "epoch": 37.84104158569763, "grad_norm": 0.11725064367055893, "learning_rate": 7.96321052631579e-05, "loss": 0.0116, "step": 48700 }, { "epoch": 37.87990672366887, "grad_norm": 0.8524043560028076, "learning_rate": 7.960578947368422e-05, "loss": 0.0178, "step": 48750 }, { "epoch": 37.91877186164011, "grad_norm": 0.40145233273506165, "learning_rate": 7.957947368421053e-05, "loss": 0.0319, "step": 48800 }, { "epoch": 37.957636999611346, "grad_norm": 0.2777029275894165, "learning_rate": 7.955315789473684e-05, "loss": 0.0221, "step": 48850 }, { "epoch": 37.99650213758259, "grad_norm": 0.24182066321372986, "learning_rate": 7.952684210526316e-05, "loss": 0.0168, "step": 48900 }, { "epoch": 38.034978624174116, "grad_norm": 2.333148241043091, "learning_rate": 7.950052631578947e-05, "loss": 0.024, "step": 48950 }, { "epoch": 38.07384376214536, "grad_norm": 3.2772161960601807, "learning_rate": 7.94742105263158e-05, "loss": 0.0187, "step": 49000 }, { "epoch": 38.07384376214536, "eval_accuracy": 0.9978137297770004, "eval_runtime": 886.8812, "eval_samples_per_second": 23.208, "eval_steps_per_second": 2.901, "step": 49000 }, { "epoch": 38.112708900116594, "grad_norm": 1.3939074277877808, "learning_rate": 7.944789473684211e-05, "loss": 0.0214, "step": 49050 }, { "epoch": 38.15157403808784, "grad_norm": 0.7525760531425476, "learning_rate": 7.942157894736842e-05, "loss": 0.0252, "step": 49100 }, { "epoch": 38.19043917605907, "grad_norm": 4.617874622344971, "learning_rate": 7.939526315789473e-05, "loss": 0.028, "step": 49150 }, { "epoch": 38.229304314030315, "grad_norm": 0.4796071946620941, "learning_rate": 7.936894736842106e-05, "loss": 0.0257, "step": 49200 }, { "epoch": 38.26816945200155, "grad_norm": 0.20932449400424957, "learning_rate": 7.934263157894737e-05, "loss": 0.0174, "step": 49250 }, { "epoch": 38.307034589972794, "grad_norm": 2.9770264625549316, "learning_rate": 7.931631578947368e-05, "loss": 0.0174, "step": 49300 }, { "epoch": 38.34589972794404, "grad_norm": 2.455333948135376, "learning_rate": 7.929000000000001e-05, "loss": 0.0335, "step": 49350 }, { "epoch": 38.38476486591527, "grad_norm": 2.985612630844116, "learning_rate": 7.926368421052632e-05, "loss": 0.0205, "step": 49400 }, { "epoch": 38.423630003886515, "grad_norm": 2.100100517272949, "learning_rate": 7.923736842105263e-05, "loss": 0.0249, "step": 49450 }, { "epoch": 38.46249514185775, "grad_norm": 3.7981529235839844, "learning_rate": 7.921105263157896e-05, "loss": 0.0154, "step": 49500 }, { "epoch": 38.501360279828994, "grad_norm": 1.1639838218688965, "learning_rate": 7.918473684210527e-05, "loss": 0.0188, "step": 49550 }, { "epoch": 38.54022541780024, "grad_norm": 0.5902754068374634, "learning_rate": 7.915842105263158e-05, "loss": 0.0253, "step": 49600 }, { "epoch": 38.57909055577147, "grad_norm": 3.1052238941192627, "learning_rate": 7.913210526315789e-05, "loss": 0.0143, "step": 49650 }, { "epoch": 38.617955693742715, "grad_norm": 0.11219343543052673, "learning_rate": 7.910578947368422e-05, "loss": 0.0167, "step": 49700 }, { "epoch": 38.65682083171395, "grad_norm": 0.9959619045257568, "learning_rate": 7.907947368421053e-05, "loss": 0.0223, "step": 49750 }, { "epoch": 38.695685969685194, "grad_norm": 0.7064417600631714, "learning_rate": 7.905315789473685e-05, "loss": 0.0204, "step": 49800 }, { "epoch": 38.73455110765643, "grad_norm": 0.32734501361846924, "learning_rate": 7.902684210526316e-05, "loss": 0.0235, "step": 49850 }, { "epoch": 38.77341624562767, "grad_norm": 1.8845106363296509, "learning_rate": 7.900052631578948e-05, "loss": 0.0231, "step": 49900 }, { "epoch": 38.812281383598915, "grad_norm": 1.0510152578353882, "learning_rate": 7.89742105263158e-05, "loss": 0.0196, "step": 49950 }, { "epoch": 38.85114652157015, "grad_norm": 2.2157108783721924, "learning_rate": 7.894789473684211e-05, "loss": 0.021, "step": 50000 }, { "epoch": 38.85114652157015, "eval_accuracy": 0.9982024000388671, "eval_runtime": 880.8313, "eval_samples_per_second": 23.368, "eval_steps_per_second": 2.921, "step": 50000 }, { "epoch": 38.89001165954139, "grad_norm": 2.7347636222839355, "learning_rate": 7.892157894736842e-05, "loss": 0.0219, "step": 50050 }, { "epoch": 38.92887679751263, "grad_norm": 0.05592096224427223, "learning_rate": 7.889526315789474e-05, "loss": 0.0234, "step": 50100 }, { "epoch": 38.96774193548387, "grad_norm": 0.25417473912239075, "learning_rate": 7.886894736842105e-05, "loss": 0.0273, "step": 50150 }, { "epoch": 39.0062184220754, "grad_norm": 0.08906584233045578, "learning_rate": 7.884263157894737e-05, "loss": 0.0234, "step": 50200 }, { "epoch": 39.045083560046635, "grad_norm": 0.2883197069168091, "learning_rate": 7.881631578947368e-05, "loss": 0.0121, "step": 50250 }, { "epoch": 39.08394869801788, "grad_norm": 0.18966904282569885, "learning_rate": 7.879000000000001e-05, "loss": 0.0161, "step": 50300 }, { "epoch": 39.12281383598912, "grad_norm": 0.5769274830818176, "learning_rate": 7.876368421052632e-05, "loss": 0.0152, "step": 50350 }, { "epoch": 39.161678973960356, "grad_norm": 2.145620107650757, "learning_rate": 7.873736842105263e-05, "loss": 0.0169, "step": 50400 }, { "epoch": 39.2005441119316, "grad_norm": 0.7396944761276245, "learning_rate": 7.871105263157896e-05, "loss": 0.0099, "step": 50450 }, { "epoch": 39.239409249902835, "grad_norm": 0.2696094810962677, "learning_rate": 7.868473684210527e-05, "loss": 0.0126, "step": 50500 }, { "epoch": 39.27827438787408, "grad_norm": 0.7367541790008545, "learning_rate": 7.865842105263158e-05, "loss": 0.0152, "step": 50550 }, { "epoch": 39.31713952584532, "grad_norm": 0.13489407300949097, "learning_rate": 7.863210526315789e-05, "loss": 0.0144, "step": 50600 }, { "epoch": 39.356004663816556, "grad_norm": 3.5517187118530273, "learning_rate": 7.86057894736842e-05, "loss": 0.0159, "step": 50650 }, { "epoch": 39.3948698017878, "grad_norm": 0.4950302839279175, "learning_rate": 7.857947368421053e-05, "loss": 0.0157, "step": 50700 }, { "epoch": 39.433734939759034, "grad_norm": 0.3861680328845978, "learning_rate": 7.855315789473685e-05, "loss": 0.0148, "step": 50750 }, { "epoch": 39.47260007773028, "grad_norm": 0.625752866268158, "learning_rate": 7.852684210526317e-05, "loss": 0.0209, "step": 50800 }, { "epoch": 39.51146521570151, "grad_norm": 1.8318170309066772, "learning_rate": 7.850052631578948e-05, "loss": 0.017, "step": 50850 }, { "epoch": 39.550330353672756, "grad_norm": 6.518121719360352, "learning_rate": 7.847421052631579e-05, "loss": 0.0191, "step": 50900 }, { "epoch": 39.589195491644, "grad_norm": 0.056834232062101364, "learning_rate": 7.844789473684211e-05, "loss": 0.0212, "step": 50950 }, { "epoch": 39.628060629615234, "grad_norm": 3.313100576400757, "learning_rate": 7.842157894736843e-05, "loss": 0.0217, "step": 51000 }, { "epoch": 39.628060629615234, "eval_accuracy": 0.9980080649079337, "eval_runtime": 878.9892, "eval_samples_per_second": 23.417, "eval_steps_per_second": 2.927, "step": 51000 }, { "epoch": 39.66692576758648, "grad_norm": 0.2621047794818878, "learning_rate": 7.839526315789474e-05, "loss": 0.0124, "step": 51050 }, { "epoch": 39.70579090555771, "grad_norm": 4.195189476013184, "learning_rate": 7.836894736842105e-05, "loss": 0.0231, "step": 51100 }, { "epoch": 39.744656043528956, "grad_norm": 0.27412697672843933, "learning_rate": 7.834263157894736e-05, "loss": 0.0265, "step": 51150 }, { "epoch": 39.78352118150019, "grad_norm": 0.005160004366189241, "learning_rate": 7.83163157894737e-05, "loss": 0.0183, "step": 51200 }, { "epoch": 39.822386319471434, "grad_norm": 3.9593896865844727, "learning_rate": 7.829000000000001e-05, "loss": 0.0235, "step": 51250 }, { "epoch": 39.86125145744268, "grad_norm": 2.3815083503723145, "learning_rate": 7.826368421052632e-05, "loss": 0.0204, "step": 51300 }, { "epoch": 39.90011659541391, "grad_norm": 0.07994846999645233, "learning_rate": 7.823736842105263e-05, "loss": 0.0221, "step": 51350 }, { "epoch": 39.938981733385155, "grad_norm": 2.044116258621216, "learning_rate": 7.821105263157895e-05, "loss": 0.0215, "step": 51400 }, { "epoch": 39.97784687135639, "grad_norm": 0.45163819193840027, "learning_rate": 7.818473684210527e-05, "loss": 0.0197, "step": 51450 }, { "epoch": 40.01632335794792, "grad_norm": 0.2218233197927475, "learning_rate": 7.815842105263158e-05, "loss": 0.0113, "step": 51500 }, { "epoch": 40.05518849591916, "grad_norm": 0.23619444668293, "learning_rate": 7.81321052631579e-05, "loss": 0.0167, "step": 51550 }, { "epoch": 40.0940536338904, "grad_norm": 0.0961291566491127, "learning_rate": 7.81057894736842e-05, "loss": 0.0155, "step": 51600 }, { "epoch": 40.13291877186164, "grad_norm": 16.20973014831543, "learning_rate": 7.807947368421053e-05, "loss": 0.0176, "step": 51650 }, { "epoch": 40.17178390983288, "grad_norm": 0.22694605588912964, "learning_rate": 7.805315789473686e-05, "loss": 0.0153, "step": 51700 }, { "epoch": 40.21064904780412, "grad_norm": 0.20650620758533478, "learning_rate": 7.802684210526317e-05, "loss": 0.0157, "step": 51750 }, { "epoch": 40.24951418577536, "grad_norm": 2.3340983390808105, "learning_rate": 7.800052631578948e-05, "loss": 0.0097, "step": 51800 }, { "epoch": 40.2883793237466, "grad_norm": 0.4349310100078583, "learning_rate": 7.797421052631579e-05, "loss": 0.0186, "step": 51850 }, { "epoch": 40.32724446171784, "grad_norm": 1.1148430109024048, "learning_rate": 7.79478947368421e-05, "loss": 0.0182, "step": 51900 }, { "epoch": 40.36610959968908, "grad_norm": 0.6705226898193359, "learning_rate": 7.792157894736843e-05, "loss": 0.0237, "step": 51950 }, { "epoch": 40.40497473766032, "grad_norm": 0.337016224861145, "learning_rate": 7.789526315789474e-05, "loss": 0.0193, "step": 52000 }, { "epoch": 40.40497473766032, "eval_accuracy": 0.9984453189525336, "eval_runtime": 879.6611, "eval_samples_per_second": 23.399, "eval_steps_per_second": 2.925, "step": 52000 }, { "epoch": 40.44383987563156, "grad_norm": 1.561038851737976, "learning_rate": 7.786894736842105e-05, "loss": 0.0148, "step": 52050 }, { "epoch": 40.482705013602796, "grad_norm": 1.7603131532669067, "learning_rate": 7.784263157894736e-05, "loss": 0.0192, "step": 52100 }, { "epoch": 40.52157015157404, "grad_norm": 2.7705743312835693, "learning_rate": 7.781631578947369e-05, "loss": 0.0303, "step": 52150 }, { "epoch": 40.560435289545275, "grad_norm": 3.2688467502593994, "learning_rate": 7.779000000000001e-05, "loss": 0.0236, "step": 52200 }, { "epoch": 40.59930042751652, "grad_norm": 0.5117989778518677, "learning_rate": 7.776368421052632e-05, "loss": 0.0195, "step": 52250 }, { "epoch": 40.63816556548776, "grad_norm": 6.964997291564941, "learning_rate": 7.773736842105264e-05, "loss": 0.0195, "step": 52300 }, { "epoch": 40.677030703458996, "grad_norm": 2.0262928009033203, "learning_rate": 7.771105263157895e-05, "loss": 0.0143, "step": 52350 }, { "epoch": 40.71589584143024, "grad_norm": 0.025713708251714706, "learning_rate": 7.768473684210527e-05, "loss": 0.0191, "step": 52400 }, { "epoch": 40.754760979401475, "grad_norm": 0.850270688533783, "learning_rate": 7.765842105263158e-05, "loss": 0.0219, "step": 52450 }, { "epoch": 40.79362611737272, "grad_norm": 2.509840250015259, "learning_rate": 7.76321052631579e-05, "loss": 0.0203, "step": 52500 }, { "epoch": 40.83249125534395, "grad_norm": 0.4855981469154358, "learning_rate": 7.760578947368421e-05, "loss": 0.023, "step": 52550 }, { "epoch": 40.871356393315196, "grad_norm": 0.11746831238269806, "learning_rate": 7.757947368421053e-05, "loss": 0.0167, "step": 52600 }, { "epoch": 40.91022153128644, "grad_norm": 0.8593801856040955, "learning_rate": 7.755315789473684e-05, "loss": 0.0214, "step": 52650 }, { "epoch": 40.949086669257674, "grad_norm": 2.0908868312835693, "learning_rate": 7.752684210526317e-05, "loss": 0.0135, "step": 52700 }, { "epoch": 40.98795180722892, "grad_norm": 0.017740854993462563, "learning_rate": 7.750052631578948e-05, "loss": 0.018, "step": 52750 }, { "epoch": 41.026428293820445, "grad_norm": 0.7258341312408447, "learning_rate": 7.747421052631579e-05, "loss": 0.0123, "step": 52800 }, { "epoch": 41.06529343179168, "grad_norm": 1.297969102859497, "learning_rate": 7.74478947368421e-05, "loss": 0.0211, "step": 52850 }, { "epoch": 41.10415856976292, "grad_norm": 0.03711100295186043, "learning_rate": 7.742157894736843e-05, "loss": 0.0189, "step": 52900 }, { "epoch": 41.143023707734166, "grad_norm": 0.9266149401664734, "learning_rate": 7.739526315789474e-05, "loss": 0.0133, "step": 52950 }, { "epoch": 41.1818888457054, "grad_norm": 0.04078902676701546, "learning_rate": 7.736894736842105e-05, "loss": 0.0139, "step": 53000 }, { "epoch": 41.1818888457054, "eval_accuracy": 0.998493902735267, "eval_runtime": 881.642, "eval_samples_per_second": 23.346, "eval_steps_per_second": 2.918, "step": 53000 }, { "epoch": 41.220753983676644, "grad_norm": 2.0239853858947754, "learning_rate": 7.734263157894738e-05, "loss": 0.0211, "step": 53050 }, { "epoch": 41.25961912164788, "grad_norm": 0.8523136377334595, "learning_rate": 7.731631578947369e-05, "loss": 0.0183, "step": 53100 }, { "epoch": 41.29848425961912, "grad_norm": 0.05443975329399109, "learning_rate": 7.729e-05, "loss": 0.0141, "step": 53150 }, { "epoch": 41.33734939759036, "grad_norm": 3.1442885398864746, "learning_rate": 7.726368421052633e-05, "loss": 0.0169, "step": 53200 }, { "epoch": 41.3762145355616, "grad_norm": 1.146552562713623, "learning_rate": 7.723736842105264e-05, "loss": 0.0158, "step": 53250 }, { "epoch": 41.415079673532844, "grad_norm": 1.5513288974761963, "learning_rate": 7.721105263157895e-05, "loss": 0.0119, "step": 53300 }, { "epoch": 41.45394481150408, "grad_norm": 1.2826228141784668, "learning_rate": 7.718473684210526e-05, "loss": 0.0115, "step": 53350 }, { "epoch": 41.49280994947532, "grad_norm": 1.0318225622177124, "learning_rate": 7.715842105263159e-05, "loss": 0.0143, "step": 53400 }, { "epoch": 41.53167508744656, "grad_norm": 0.11040183156728745, "learning_rate": 7.71321052631579e-05, "loss": 0.0289, "step": 53450 }, { "epoch": 41.5705402254178, "grad_norm": 0.10635088384151459, "learning_rate": 7.710578947368421e-05, "loss": 0.0159, "step": 53500 }, { "epoch": 41.60940536338904, "grad_norm": 0.13474248349666595, "learning_rate": 7.707947368421053e-05, "loss": 0.0156, "step": 53550 }, { "epoch": 41.64827050136028, "grad_norm": 2.0040714740753174, "learning_rate": 7.705315789473685e-05, "loss": 0.0078, "step": 53600 }, { "epoch": 41.68713563933152, "grad_norm": 1.4266414642333984, "learning_rate": 7.702684210526317e-05, "loss": 0.0136, "step": 53650 }, { "epoch": 41.72600077730276, "grad_norm": 0.7016364336013794, "learning_rate": 7.700052631578948e-05, "loss": 0.018, "step": 53700 }, { "epoch": 41.764865915274, "grad_norm": 0.6079683899879456, "learning_rate": 7.69742105263158e-05, "loss": 0.0219, "step": 53750 }, { "epoch": 41.80373105324524, "grad_norm": 0.13080672919750214, "learning_rate": 7.69478947368421e-05, "loss": 0.0116, "step": 53800 }, { "epoch": 41.84259619121648, "grad_norm": 1.093205451965332, "learning_rate": 7.692157894736842e-05, "loss": 0.018, "step": 53850 }, { "epoch": 41.881461329187715, "grad_norm": 0.07690184563398361, "learning_rate": 7.689526315789474e-05, "loss": 0.0166, "step": 53900 }, { "epoch": 41.92032646715896, "grad_norm": 0.6435837745666504, "learning_rate": 7.686894736842105e-05, "loss": 0.0159, "step": 53950 }, { "epoch": 41.9591916051302, "grad_norm": 0.10160457342863083, "learning_rate": 7.684263157894738e-05, "loss": 0.0205, "step": 54000 }, { "epoch": 41.9591916051302, "eval_accuracy": 0.9978623135597338, "eval_runtime": 1074.1404, "eval_samples_per_second": 19.162, "eval_steps_per_second": 2.395, "step": 54000 }, { "epoch": 41.998056743101436, "grad_norm": 1.1743571758270264, "learning_rate": 7.681631578947369e-05, "loss": 0.0175, "step": 54050 }, { "epoch": 42.036533229692964, "grad_norm": 2.6207754611968994, "learning_rate": 7.679e-05, "loss": 0.0094, "step": 54100 }, { "epoch": 42.07539836766421, "grad_norm": 0.26519230008125305, "learning_rate": 7.676368421052633e-05, "loss": 0.0169, "step": 54150 }, { "epoch": 42.11426350563544, "grad_norm": 0.45864981412887573, "learning_rate": 7.673736842105264e-05, "loss": 0.0104, "step": 54200 }, { "epoch": 42.153128643606685, "grad_norm": 0.0494319349527359, "learning_rate": 7.671105263157895e-05, "loss": 0.006, "step": 54250 }, { "epoch": 42.19199378157793, "grad_norm": 0.43440911173820496, "learning_rate": 7.668473684210526e-05, "loss": 0.0183, "step": 54300 }, { "epoch": 42.23085891954916, "grad_norm": 0.030721886083483696, "learning_rate": 7.665842105263157e-05, "loss": 0.0129, "step": 54350 }, { "epoch": 42.269724057520406, "grad_norm": 0.8205466270446777, "learning_rate": 7.66321052631579e-05, "loss": 0.0185, "step": 54400 }, { "epoch": 42.30858919549164, "grad_norm": 0.013601802289485931, "learning_rate": 7.660578947368422e-05, "loss": 0.0137, "step": 54450 }, { "epoch": 42.347454333462885, "grad_norm": 3.7669804096221924, "learning_rate": 7.657947368421054e-05, "loss": 0.0153, "step": 54500 }, { "epoch": 42.38631947143412, "grad_norm": 0.4318236708641052, "learning_rate": 7.655315789473685e-05, "loss": 0.0182, "step": 54550 }, { "epoch": 42.42518460940536, "grad_norm": 0.08533640205860138, "learning_rate": 7.652684210526316e-05, "loss": 0.0211, "step": 54600 }, { "epoch": 42.464049747376606, "grad_norm": 0.10274555534124374, "learning_rate": 7.650052631578948e-05, "loss": 0.0142, "step": 54650 }, { "epoch": 42.50291488534784, "grad_norm": 6.137352466583252, "learning_rate": 7.64742105263158e-05, "loss": 0.0163, "step": 54700 }, { "epoch": 42.541780023319085, "grad_norm": 0.6145594716072083, "learning_rate": 7.64478947368421e-05, "loss": 0.0087, "step": 54750 }, { "epoch": 42.58064516129032, "grad_norm": 1.7177934646606445, "learning_rate": 7.642157894736842e-05, "loss": 0.018, "step": 54800 }, { "epoch": 42.61951029926156, "grad_norm": 0.0985284298658371, "learning_rate": 7.639526315789473e-05, "loss": 0.0068, "step": 54850 }, { "epoch": 42.6583754372328, "grad_norm": 0.36002910137176514, "learning_rate": 7.636894736842105e-05, "loss": 0.0286, "step": 54900 }, { "epoch": 42.69724057520404, "grad_norm": 0.5385025143623352, "learning_rate": 7.634263157894738e-05, "loss": 0.0175, "step": 54950 }, { "epoch": 42.736105713175284, "grad_norm": 0.27823692560195923, "learning_rate": 7.631631578947369e-05, "loss": 0.0107, "step": 55000 }, { "epoch": 42.736105713175284, "eval_accuracy": 0.9982509838216004, "eval_runtime": 881.3376, "eval_samples_per_second": 23.354, "eval_steps_per_second": 2.919, "step": 55000 }, { "epoch": 42.77497085114652, "grad_norm": 0.012415263801813126, "learning_rate": 7.629e-05, "loss": 0.0152, "step": 55050 }, { "epoch": 42.81383598911776, "grad_norm": 0.06934750825166702, "learning_rate": 7.626368421052631e-05, "loss": 0.0106, "step": 55100 }, { "epoch": 42.852701127089, "grad_norm": 2.376636505126953, "learning_rate": 7.623736842105264e-05, "loss": 0.0137, "step": 55150 }, { "epoch": 42.89156626506024, "grad_norm": 0.2681966722011566, "learning_rate": 7.621105263157895e-05, "loss": 0.0177, "step": 55200 }, { "epoch": 42.930431403031484, "grad_norm": 0.0518764927983284, "learning_rate": 7.618473684210526e-05, "loss": 0.0249, "step": 55250 }, { "epoch": 42.96929654100272, "grad_norm": 0.05432099476456642, "learning_rate": 7.615842105263157e-05, "loss": 0.021, "step": 55300 }, { "epoch": 43.00777302759425, "grad_norm": 0.03369888663291931, "learning_rate": 7.61321052631579e-05, "loss": 0.0252, "step": 55350 }, { "epoch": 43.04663816556549, "grad_norm": 0.7964507341384888, "learning_rate": 7.610578947368422e-05, "loss": 0.018, "step": 55400 }, { "epoch": 43.085503303536726, "grad_norm": 1.1792151927947998, "learning_rate": 7.607947368421054e-05, "loss": 0.0143, "step": 55450 }, { "epoch": 43.12436844150797, "grad_norm": 0.8480147123336792, "learning_rate": 7.605315789473685e-05, "loss": 0.013, "step": 55500 }, { "epoch": 43.163233579479204, "grad_norm": 1.0581895112991333, "learning_rate": 7.602684210526316e-05, "loss": 0.0221, "step": 55550 }, { "epoch": 43.20209871745045, "grad_norm": 0.1439225971698761, "learning_rate": 7.600052631578947e-05, "loss": 0.017, "step": 55600 }, { "epoch": 43.24096385542169, "grad_norm": 0.47069478034973145, "learning_rate": 7.59742105263158e-05, "loss": 0.0186, "step": 55650 }, { "epoch": 43.279828993392925, "grad_norm": 0.09937036782503128, "learning_rate": 7.594789473684211e-05, "loss": 0.0111, "step": 55700 }, { "epoch": 43.31869413136417, "grad_norm": 0.12931126356124878, "learning_rate": 7.592157894736842e-05, "loss": 0.0109, "step": 55750 }, { "epoch": 43.357559269335404, "grad_norm": 2.7805063724517822, "learning_rate": 7.589526315789473e-05, "loss": 0.0173, "step": 55800 }, { "epoch": 43.39642440730665, "grad_norm": 0.9066233038902283, "learning_rate": 7.586894736842106e-05, "loss": 0.0243, "step": 55850 }, { "epoch": 43.43528954527788, "grad_norm": 0.13462518155574799, "learning_rate": 7.584263157894738e-05, "loss": 0.0219, "step": 55900 }, { "epoch": 43.474154683249125, "grad_norm": 0.2093678116798401, "learning_rate": 7.581631578947369e-05, "loss": 0.0077, "step": 55950 }, { "epoch": 43.51301982122037, "grad_norm": 0.5262479782104492, "learning_rate": 7.579e-05, "loss": 0.0157, "step": 56000 }, { "epoch": 43.51301982122037, "eval_accuracy": 0.9981052324734003, "eval_runtime": 879.9359, "eval_samples_per_second": 23.391, "eval_steps_per_second": 2.924, "step": 56000 }, { "epoch": 43.551884959191604, "grad_norm": 0.41396597027778625, "learning_rate": 7.576368421052632e-05, "loss": 0.0172, "step": 56050 }, { "epoch": 43.59075009716285, "grad_norm": 0.23529602587223053, "learning_rate": 7.573736842105264e-05, "loss": 0.0208, "step": 56100 }, { "epoch": 43.62961523513408, "grad_norm": 0.17375200986862183, "learning_rate": 7.571105263157895e-05, "loss": 0.0168, "step": 56150 }, { "epoch": 43.668480373105325, "grad_norm": 0.03255872428417206, "learning_rate": 7.568473684210526e-05, "loss": 0.0168, "step": 56200 }, { "epoch": 43.70734551107657, "grad_norm": 1.1647653579711914, "learning_rate": 7.565842105263158e-05, "loss": 0.0143, "step": 56250 }, { "epoch": 43.7462106490478, "grad_norm": 0.39687833189964294, "learning_rate": 7.563210526315789e-05, "loss": 0.0087, "step": 56300 }, { "epoch": 43.785075787019046, "grad_norm": 1.4236061573028564, "learning_rate": 7.560578947368421e-05, "loss": 0.0127, "step": 56350 }, { "epoch": 43.82394092499028, "grad_norm": 1.305365800857544, "learning_rate": 7.557947368421054e-05, "loss": 0.0211, "step": 56400 }, { "epoch": 43.862806062961525, "grad_norm": 1.352987289428711, "learning_rate": 7.555315789473685e-05, "loss": 0.0149, "step": 56450 }, { "epoch": 43.90167120093276, "grad_norm": 0.7978178858757019, "learning_rate": 7.552684210526316e-05, "loss": 0.0251, "step": 56500 }, { "epoch": 43.940536338904, "grad_norm": 0.01514784898608923, "learning_rate": 7.550052631578947e-05, "loss": 0.0173, "step": 56550 }, { "epoch": 43.979401476875246, "grad_norm": 2.1784021854400635, "learning_rate": 7.54742105263158e-05, "loss": 0.0139, "step": 56600 }, { "epoch": 44.01787796346677, "grad_norm": 0.22825995087623596, "learning_rate": 7.544789473684211e-05, "loss": 0.0168, "step": 56650 }, { "epoch": 44.05674310143801, "grad_norm": 0.3057117760181427, "learning_rate": 7.542157894736842e-05, "loss": 0.014, "step": 56700 }, { "epoch": 44.09560823940925, "grad_norm": 1.0708446502685547, "learning_rate": 7.539526315789473e-05, "loss": 0.0114, "step": 56750 }, { "epoch": 44.13447337738049, "grad_norm": 0.020148010924458504, "learning_rate": 7.536894736842106e-05, "loss": 0.0203, "step": 56800 }, { "epoch": 44.17333851535173, "grad_norm": 1.150698184967041, "learning_rate": 7.534263157894737e-05, "loss": 0.0151, "step": 56850 }, { "epoch": 44.212203653322966, "grad_norm": 0.1435837298631668, "learning_rate": 7.53163157894737e-05, "loss": 0.0121, "step": 56900 }, { "epoch": 44.25106879129421, "grad_norm": 0.8915746808052063, "learning_rate": 7.529e-05, "loss": 0.0147, "step": 56950 }, { "epoch": 44.28993392926545, "grad_norm": 0.151080921292305, "learning_rate": 7.526368421052632e-05, "loss": 0.0154, "step": 57000 }, { "epoch": 44.28993392926545, "eval_accuracy": 0.9985910703007336, "eval_runtime": 877.799, "eval_samples_per_second": 23.448, "eval_steps_per_second": 2.931, "step": 57000 }, { "epoch": 44.32879906723669, "grad_norm": 1.8628959655761719, "learning_rate": 7.523736842105263e-05, "loss": 0.0115, "step": 57050 }, { "epoch": 44.36766420520793, "grad_norm": 0.06097853183746338, "learning_rate": 7.521105263157895e-05, "loss": 0.0117, "step": 57100 }, { "epoch": 44.406529343179166, "grad_norm": 0.1226363331079483, "learning_rate": 7.518473684210527e-05, "loss": 0.0121, "step": 57150 }, { "epoch": 44.44539448115041, "grad_norm": 0.30275824666023254, "learning_rate": 7.515842105263158e-05, "loss": 0.0129, "step": 57200 }, { "epoch": 44.484259619121644, "grad_norm": 0.03589918836951256, "learning_rate": 7.51321052631579e-05, "loss": 0.0141, "step": 57250 }, { "epoch": 44.52312475709289, "grad_norm": 0.1084870994091034, "learning_rate": 7.510578947368421e-05, "loss": 0.0175, "step": 57300 }, { "epoch": 44.56198989506413, "grad_norm": 0.06478150933980942, "learning_rate": 7.507947368421054e-05, "loss": 0.0201, "step": 57350 }, { "epoch": 44.600855033035366, "grad_norm": 0.11772127449512482, "learning_rate": 7.505315789473685e-05, "loss": 0.018, "step": 57400 }, { "epoch": 44.63972017100661, "grad_norm": 3.1865267753601074, "learning_rate": 7.502684210526316e-05, "loss": 0.009, "step": 57450 }, { "epoch": 44.678585308977844, "grad_norm": 0.5008084774017334, "learning_rate": 7.500052631578947e-05, "loss": 0.0209, "step": 57500 }, { "epoch": 44.71745044694909, "grad_norm": 0.014076965861022472, "learning_rate": 7.497421052631579e-05, "loss": 0.0149, "step": 57550 }, { "epoch": 44.75631558492033, "grad_norm": 3.251145362854004, "learning_rate": 7.494789473684211e-05, "loss": 0.0166, "step": 57600 }, { "epoch": 44.795180722891565, "grad_norm": 0.050258051604032516, "learning_rate": 7.492157894736842e-05, "loss": 0.0171, "step": 57650 }, { "epoch": 44.83404586086281, "grad_norm": 2.849292039871216, "learning_rate": 7.489526315789473e-05, "loss": 0.0145, "step": 57700 }, { "epoch": 44.872910998834044, "grad_norm": 0.8286195993423462, "learning_rate": 7.486894736842106e-05, "loss": 0.0158, "step": 57750 }, { "epoch": 44.91177613680529, "grad_norm": 1.5031063556671143, "learning_rate": 7.484263157894737e-05, "loss": 0.0197, "step": 57800 }, { "epoch": 44.95064127477652, "grad_norm": 0.2209365963935852, "learning_rate": 7.48163157894737e-05, "loss": 0.0165, "step": 57850 }, { "epoch": 44.989506412747765, "grad_norm": 4.55959415435791, "learning_rate": 7.479000000000001e-05, "loss": 0.0187, "step": 57900 }, { "epoch": 45.02798289933929, "grad_norm": 0.03095565363764763, "learning_rate": 7.476368421052632e-05, "loss": 0.0128, "step": 57950 }, { "epoch": 45.066848037310535, "grad_norm": 0.09888409823179245, "learning_rate": 7.473736842105263e-05, "loss": 0.0097, "step": 58000 }, { "epoch": 45.066848037310535, "eval_accuracy": 0.9982024000388671, "eval_runtime": 873.7797, "eval_samples_per_second": 23.556, "eval_steps_per_second": 2.945, "step": 58000 }, { "epoch": 45.10571317528177, "grad_norm": 0.46758630871772766, "learning_rate": 7.471105263157894e-05, "loss": 0.0111, "step": 58050 }, { "epoch": 45.144578313253014, "grad_norm": 3.0760531425476074, "learning_rate": 7.468473684210527e-05, "loss": 0.0125, "step": 58100 }, { "epoch": 45.18344345122425, "grad_norm": 0.02062395215034485, "learning_rate": 7.465842105263158e-05, "loss": 0.0092, "step": 58150 }, { "epoch": 45.22230858919549, "grad_norm": 0.43851378560066223, "learning_rate": 7.46321052631579e-05, "loss": 0.0143, "step": 58200 }, { "epoch": 45.26117372716673, "grad_norm": 2.340397357940674, "learning_rate": 7.460578947368422e-05, "loss": 0.019, "step": 58250 }, { "epoch": 45.30003886513797, "grad_norm": 1.865161418914795, "learning_rate": 7.457947368421053e-05, "loss": 0.0123, "step": 58300 }, { "epoch": 45.338904003109214, "grad_norm": 0.6069706678390503, "learning_rate": 7.455315789473685e-05, "loss": 0.0139, "step": 58350 }, { "epoch": 45.37776914108045, "grad_norm": 0.08736221492290497, "learning_rate": 7.452684210526316e-05, "loss": 0.0134, "step": 58400 }, { "epoch": 45.41663427905169, "grad_norm": 2.229518175125122, "learning_rate": 7.450052631578948e-05, "loss": 0.0114, "step": 58450 }, { "epoch": 45.45549941702293, "grad_norm": 0.29379507899284363, "learning_rate": 7.447421052631579e-05, "loss": 0.0139, "step": 58500 }, { "epoch": 45.49436455499417, "grad_norm": 2.3412671089172363, "learning_rate": 7.444789473684211e-05, "loss": 0.0157, "step": 58550 }, { "epoch": 45.53322969296541, "grad_norm": 0.7046128511428833, "learning_rate": 7.442157894736842e-05, "loss": 0.0171, "step": 58600 }, { "epoch": 45.57209483093665, "grad_norm": 0.04889620468020439, "learning_rate": 7.439526315789475e-05, "loss": 0.0204, "step": 58650 }, { "epoch": 45.61095996890789, "grad_norm": 0.0866706445813179, "learning_rate": 7.436894736842106e-05, "loss": 0.0178, "step": 58700 }, { "epoch": 45.64982510687913, "grad_norm": 0.6070784330368042, "learning_rate": 7.434263157894737e-05, "loss": 0.0193, "step": 58750 }, { "epoch": 45.68869024485037, "grad_norm": 2.2081806659698486, "learning_rate": 7.431631578947368e-05, "loss": 0.0142, "step": 58800 }, { "epoch": 45.727555382821606, "grad_norm": 2.7623023986816406, "learning_rate": 7.429000000000001e-05, "loss": 0.0136, "step": 58850 }, { "epoch": 45.76642052079285, "grad_norm": 0.286423921585083, "learning_rate": 7.426368421052632e-05, "loss": 0.015, "step": 58900 }, { "epoch": 45.80528565876409, "grad_norm": 0.11053986847400665, "learning_rate": 7.423736842105263e-05, "loss": 0.0131, "step": 58950 }, { "epoch": 45.84415079673533, "grad_norm": 0.2008861005306244, "learning_rate": 7.421105263157894e-05, "loss": 0.0082, "step": 59000 }, { "epoch": 45.84415079673533, "eval_accuracy": 0.9987854054316669, "eval_runtime": 869.9812, "eval_samples_per_second": 23.659, "eval_steps_per_second": 2.958, "step": 59000 }, { "epoch": 45.88301593470657, "grad_norm": 0.1004604771733284, "learning_rate": 7.418473684210527e-05, "loss": 0.0158, "step": 59050 }, { "epoch": 45.921881072677806, "grad_norm": 1.3480688333511353, "learning_rate": 7.415842105263158e-05, "loss": 0.0103, "step": 59100 }, { "epoch": 45.96074621064905, "grad_norm": 0.016539020463824272, "learning_rate": 7.41321052631579e-05, "loss": 0.0115, "step": 59150 }, { "epoch": 45.999611348620284, "grad_norm": 0.07502561807632446, "learning_rate": 7.410578947368422e-05, "loss": 0.0161, "step": 59200 }, { "epoch": 46.03808783521181, "grad_norm": 0.060839392244815826, "learning_rate": 7.407947368421053e-05, "loss": 0.0125, "step": 59250 }, { "epoch": 46.076952973183054, "grad_norm": 1.0475828647613525, "learning_rate": 7.405315789473684e-05, "loss": 0.0087, "step": 59300 }, { "epoch": 46.1158181111543, "grad_norm": 0.024058906361460686, "learning_rate": 7.402684210526317e-05, "loss": 0.0106, "step": 59350 }, { "epoch": 46.15468324912553, "grad_norm": 0.049573808908462524, "learning_rate": 7.400052631578948e-05, "loss": 0.01, "step": 59400 }, { "epoch": 46.193548387096776, "grad_norm": 1.1914864778518677, "learning_rate": 7.397421052631579e-05, "loss": 0.0111, "step": 59450 }, { "epoch": 46.23241352506801, "grad_norm": 0.13890480995178223, "learning_rate": 7.39478947368421e-05, "loss": 0.012, "step": 59500 }, { "epoch": 46.271278663039254, "grad_norm": 0.18326811492443085, "learning_rate": 7.392157894736843e-05, "loss": 0.0093, "step": 59550 }, { "epoch": 46.3101438010105, "grad_norm": 0.5677682161331177, "learning_rate": 7.389526315789475e-05, "loss": 0.0112, "step": 59600 }, { "epoch": 46.34900893898173, "grad_norm": 0.02287537045776844, "learning_rate": 7.386894736842106e-05, "loss": 0.0109, "step": 59650 }, { "epoch": 46.387874076952976, "grad_norm": 0.07677704095840454, "learning_rate": 7.384263157894737e-05, "loss": 0.0061, "step": 59700 }, { "epoch": 46.42673921492421, "grad_norm": 2.4414875507354736, "learning_rate": 7.381631578947368e-05, "loss": 0.0163, "step": 59750 }, { "epoch": 46.465604352895454, "grad_norm": 1.8416739702224731, "learning_rate": 7.379000000000001e-05, "loss": 0.0136, "step": 59800 }, { "epoch": 46.50446949086669, "grad_norm": 3.0992000102996826, "learning_rate": 7.376368421052632e-05, "loss": 0.0233, "step": 59850 }, { "epoch": 46.54333462883793, "grad_norm": 0.06737317144870758, "learning_rate": 7.373736842105263e-05, "loss": 0.0135, "step": 59900 }, { "epoch": 46.582199766809175, "grad_norm": 0.14803974330425262, "learning_rate": 7.371105263157894e-05, "loss": 0.0133, "step": 59950 }, { "epoch": 46.62106490478041, "grad_norm": 0.18518660962581635, "learning_rate": 7.368473684210526e-05, "loss": 0.0163, "step": 60000 }, { "epoch": 46.62106490478041, "eval_accuracy": 0.9984453189525336, "eval_runtime": 877.3612, "eval_samples_per_second": 23.46, "eval_steps_per_second": 2.933, "step": 60000 }, { "epoch": 46.659930042751654, "grad_norm": 0.07843168079853058, "learning_rate": 7.365842105263158e-05, "loss": 0.0164, "step": 60050 }, { "epoch": 46.69879518072289, "grad_norm": 0.48835334181785583, "learning_rate": 7.36321052631579e-05, "loss": 0.0095, "step": 60100 }, { "epoch": 46.73766031869413, "grad_norm": 0.03689779341220856, "learning_rate": 7.360578947368422e-05, "loss": 0.0089, "step": 60150 }, { "epoch": 46.77652545666537, "grad_norm": 0.12304223328828812, "learning_rate": 7.357947368421053e-05, "loss": 0.013, "step": 60200 }, { "epoch": 46.81539059463661, "grad_norm": 0.028829611837863922, "learning_rate": 7.355315789473684e-05, "loss": 0.0095, "step": 60250 }, { "epoch": 46.854255732607854, "grad_norm": 0.1779215931892395, "learning_rate": 7.352684210526317e-05, "loss": 0.0203, "step": 60300 }, { "epoch": 46.89312087057909, "grad_norm": 0.027951283380389214, "learning_rate": 7.350052631578948e-05, "loss": 0.0242, "step": 60350 }, { "epoch": 46.93198600855033, "grad_norm": 0.24708083271980286, "learning_rate": 7.347421052631579e-05, "loss": 0.0179, "step": 60400 }, { "epoch": 46.97085114652157, "grad_norm": 3.7258784770965576, "learning_rate": 7.34478947368421e-05, "loss": 0.0094, "step": 60450 }, { "epoch": 47.009327633113095, "grad_norm": 3.078702688217163, "learning_rate": 7.342157894736841e-05, "loss": 0.0184, "step": 60500 }, { "epoch": 47.04819277108434, "grad_norm": 0.09043591469526291, "learning_rate": 7.339526315789474e-05, "loss": 0.0201, "step": 60550 }, { "epoch": 47.087057909055574, "grad_norm": 0.01854565367102623, "learning_rate": 7.336894736842106e-05, "loss": 0.0093, "step": 60600 }, { "epoch": 47.12592304702682, "grad_norm": 0.008243966847658157, "learning_rate": 7.334263157894737e-05, "loss": 0.0173, "step": 60650 }, { "epoch": 47.16478818499806, "grad_norm": 0.0831562727689743, "learning_rate": 7.331631578947369e-05, "loss": 0.0094, "step": 60700 }, { "epoch": 47.203653322969295, "grad_norm": 2.855015516281128, "learning_rate": 7.329e-05, "loss": 0.0073, "step": 60750 }, { "epoch": 47.24251846094054, "grad_norm": 1.0864886045455933, "learning_rate": 7.326368421052632e-05, "loss": 0.0096, "step": 60800 }, { "epoch": 47.28138359891177, "grad_norm": 0.05669767037034035, "learning_rate": 7.323736842105263e-05, "loss": 0.0122, "step": 60850 }, { "epoch": 47.320248736883016, "grad_norm": 0.26644644141197205, "learning_rate": 7.321105263157895e-05, "loss": 0.0058, "step": 60900 }, { "epoch": 47.35911387485426, "grad_norm": 0.01469517033547163, "learning_rate": 7.318473684210526e-05, "loss": 0.0127, "step": 60950 }, { "epoch": 47.397979012825495, "grad_norm": 0.21100173890590668, "learning_rate": 7.315842105263158e-05, "loss": 0.0125, "step": 61000 }, { "epoch": 47.397979012825495, "eval_accuracy": 0.998493902735267, "eval_runtime": 849.6945, "eval_samples_per_second": 24.224, "eval_steps_per_second": 3.028, "step": 61000 }, { "epoch": 47.43684415079674, "grad_norm": 0.06516958773136139, "learning_rate": 7.313210526315791e-05, "loss": 0.0136, "step": 61050 }, { "epoch": 47.47570928876797, "grad_norm": 0.05053483694791794, "learning_rate": 7.310578947368422e-05, "loss": 0.0093, "step": 61100 }, { "epoch": 47.514574426739216, "grad_norm": 0.5198332667350769, "learning_rate": 7.307947368421053e-05, "loss": 0.0112, "step": 61150 }, { "epoch": 47.55343956471045, "grad_norm": 0.052330415695905685, "learning_rate": 7.305315789473684e-05, "loss": 0.011, "step": 61200 }, { "epoch": 47.592304702681695, "grad_norm": 0.7372391223907471, "learning_rate": 7.302684210526315e-05, "loss": 0.0115, "step": 61250 }, { "epoch": 47.63116984065294, "grad_norm": 0.9434056878089905, "learning_rate": 7.300052631578948e-05, "loss": 0.0136, "step": 61300 }, { "epoch": 47.67003497862417, "grad_norm": 0.547739565372467, "learning_rate": 7.297421052631579e-05, "loss": 0.0185, "step": 61350 }, { "epoch": 47.708900116595416, "grad_norm": 0.1716149002313614, "learning_rate": 7.29478947368421e-05, "loss": 0.0135, "step": 61400 }, { "epoch": 47.74776525456665, "grad_norm": 0.1340044140815735, "learning_rate": 7.292157894736843e-05, "loss": 0.012, "step": 61450 }, { "epoch": 47.786630392537894, "grad_norm": 1.5391032695770264, "learning_rate": 7.289526315789474e-05, "loss": 0.0108, "step": 61500 }, { "epoch": 47.82549553050913, "grad_norm": 0.813843846321106, "learning_rate": 7.286894736842106e-05, "loss": 0.0121, "step": 61550 }, { "epoch": 47.86436066848037, "grad_norm": 3.4990172386169434, "learning_rate": 7.284263157894738e-05, "loss": 0.013, "step": 61600 }, { "epoch": 47.903225806451616, "grad_norm": 0.053542912006378174, "learning_rate": 7.281631578947369e-05, "loss": 0.0074, "step": 61650 }, { "epoch": 47.94209094442285, "grad_norm": 0.01410434115678072, "learning_rate": 7.279e-05, "loss": 0.0189, "step": 61700 }, { "epoch": 47.980956082394094, "grad_norm": 0.09987429529428482, "learning_rate": 7.276368421052631e-05, "loss": 0.0109, "step": 61750 }, { "epoch": 48.01943256898562, "grad_norm": 0.6762978434562683, "learning_rate": 7.273736842105264e-05, "loss": 0.0156, "step": 61800 }, { "epoch": 48.05829770695686, "grad_norm": 0.04120064899325371, "learning_rate": 7.271105263157895e-05, "loss": 0.0163, "step": 61850 }, { "epoch": 48.0971628449281, "grad_norm": 0.0444943942129612, "learning_rate": 7.268473684210527e-05, "loss": 0.012, "step": 61900 }, { "epoch": 48.13602798289934, "grad_norm": 1.1446350812911987, "learning_rate": 7.265842105263158e-05, "loss": 0.0153, "step": 61950 }, { "epoch": 48.17489312087058, "grad_norm": 0.03808463364839554, "learning_rate": 7.26321052631579e-05, "loss": 0.014, "step": 62000 }, { "epoch": 48.17489312087058, "eval_accuracy": 0.9986396540834669, "eval_runtime": 851.9896, "eval_samples_per_second": 24.159, "eval_steps_per_second": 3.02, "step": 62000 }, { "epoch": 48.21375825884182, "grad_norm": 0.23002126812934875, "learning_rate": 7.260578947368422e-05, "loss": 0.0105, "step": 62050 }, { "epoch": 48.25262339681306, "grad_norm": 0.052425310015678406, "learning_rate": 7.257947368421053e-05, "loss": 0.0149, "step": 62100 }, { "epoch": 48.2914885347843, "grad_norm": 1.247965693473816, "learning_rate": 7.255315789473684e-05, "loss": 0.01, "step": 62150 }, { "epoch": 48.330353672755535, "grad_norm": 0.17689017951488495, "learning_rate": 7.252684210526316e-05, "loss": 0.0146, "step": 62200 }, { "epoch": 48.36921881072678, "grad_norm": 0.10547871887683868, "learning_rate": 7.250052631578948e-05, "loss": 0.0108, "step": 62250 }, { "epoch": 48.40808394869802, "grad_norm": 0.049669452011585236, "learning_rate": 7.247421052631579e-05, "loss": 0.0083, "step": 62300 }, { "epoch": 48.44694908666926, "grad_norm": 0.9575231075286865, "learning_rate": 7.24478947368421e-05, "loss": 0.0114, "step": 62350 }, { "epoch": 48.4858142246405, "grad_norm": 1.0744680166244507, "learning_rate": 7.242157894736843e-05, "loss": 0.0119, "step": 62400 }, { "epoch": 48.524679362611735, "grad_norm": 0.05026625469326973, "learning_rate": 7.239526315789474e-05, "loss": 0.0107, "step": 62450 }, { "epoch": 48.56354450058298, "grad_norm": 0.013055695220828056, "learning_rate": 7.236894736842105e-05, "loss": 0.0136, "step": 62500 }, { "epoch": 48.602409638554214, "grad_norm": 0.5503189563751221, "learning_rate": 7.234263157894738e-05, "loss": 0.0107, "step": 62550 }, { "epoch": 48.64127477652546, "grad_norm": 0.5869662165641785, "learning_rate": 7.231631578947369e-05, "loss": 0.0165, "step": 62600 }, { "epoch": 48.6801399144967, "grad_norm": 0.45683395862579346, "learning_rate": 7.229e-05, "loss": 0.0086, "step": 62650 }, { "epoch": 48.719005052467935, "grad_norm": 0.046745553612709045, "learning_rate": 7.226368421052631e-05, "loss": 0.0119, "step": 62700 }, { "epoch": 48.75787019043918, "grad_norm": 0.08209559321403503, "learning_rate": 7.223736842105264e-05, "loss": 0.0087, "step": 62750 }, { "epoch": 48.79673532841041, "grad_norm": 0.7342914938926697, "learning_rate": 7.221105263157895e-05, "loss": 0.0086, "step": 62800 }, { "epoch": 48.835600466381656, "grad_norm": 0.2686988413333893, "learning_rate": 7.218473684210527e-05, "loss": 0.0249, "step": 62850 }, { "epoch": 48.87446560435289, "grad_norm": 0.045273009687662125, "learning_rate": 7.215842105263159e-05, "loss": 0.01, "step": 62900 }, { "epoch": 48.913330742324135, "grad_norm": 0.5616592168807983, "learning_rate": 7.21321052631579e-05, "loss": 0.0189, "step": 62950 }, { "epoch": 48.95219588029538, "grad_norm": 0.11938033998012543, "learning_rate": 7.210578947368421e-05, "loss": 0.0142, "step": 63000 }, { "epoch": 48.95219588029538, "eval_accuracy": 0.9986396540834669, "eval_runtime": 852.2966, "eval_samples_per_second": 24.15, "eval_steps_per_second": 3.019, "step": 63000 }, { "epoch": 48.99106101826661, "grad_norm": 1.7056082487106323, "learning_rate": 7.207947368421053e-05, "loss": 0.0142, "step": 63050 }, { "epoch": 49.02953750485814, "grad_norm": 0.026881588622927666, "learning_rate": 7.205315789473685e-05, "loss": 0.0164, "step": 63100 }, { "epoch": 49.06840264282938, "grad_norm": 0.5080930590629578, "learning_rate": 7.202684210526316e-05, "loss": 0.0087, "step": 63150 }, { "epoch": 49.10726778080062, "grad_norm": 0.6311091780662537, "learning_rate": 7.200052631578947e-05, "loss": 0.0119, "step": 63200 }, { "epoch": 49.14613291877186, "grad_norm": 0.007207009941339493, "learning_rate": 7.19742105263158e-05, "loss": 0.0101, "step": 63250 }, { "epoch": 49.184998056743105, "grad_norm": 0.6172986626625061, "learning_rate": 7.194789473684212e-05, "loss": 0.0084, "step": 63300 }, { "epoch": 49.22386319471434, "grad_norm": 0.12769931554794312, "learning_rate": 7.192157894736843e-05, "loss": 0.0074, "step": 63350 }, { "epoch": 49.26272833268558, "grad_norm": 0.22103777527809143, "learning_rate": 7.189526315789474e-05, "loss": 0.0246, "step": 63400 }, { "epoch": 49.30159347065682, "grad_norm": 1.9517908096313477, "learning_rate": 7.186894736842105e-05, "loss": 0.0095, "step": 63450 }, { "epoch": 49.34045860862806, "grad_norm": 0.9385688304901123, "learning_rate": 7.184263157894738e-05, "loss": 0.0133, "step": 63500 }, { "epoch": 49.3793237465993, "grad_norm": 0.32060927152633667, "learning_rate": 7.181631578947369e-05, "loss": 0.0052, "step": 63550 }, { "epoch": 49.41818888457054, "grad_norm": 3.535792350769043, "learning_rate": 7.179e-05, "loss": 0.0113, "step": 63600 }, { "epoch": 49.45705402254178, "grad_norm": 0.07511692494153976, "learning_rate": 7.176368421052631e-05, "loss": 0.0097, "step": 63650 }, { "epoch": 49.49591916051302, "grad_norm": 0.19153301417827606, "learning_rate": 7.173736842105263e-05, "loss": 0.0159, "step": 63700 }, { "epoch": 49.53478429848426, "grad_norm": 0.2613302171230316, "learning_rate": 7.171105263157895e-05, "loss": 0.012, "step": 63750 }, { "epoch": 49.5736494364555, "grad_norm": 0.9980623722076416, "learning_rate": 7.168473684210528e-05, "loss": 0.0119, "step": 63800 }, { "epoch": 49.61251457442674, "grad_norm": 0.07133900374174118, "learning_rate": 7.165842105263159e-05, "loss": 0.0126, "step": 63850 }, { "epoch": 49.651379712397976, "grad_norm": 1.0189536809921265, "learning_rate": 7.16321052631579e-05, "loss": 0.0091, "step": 63900 }, { "epoch": 49.69024485036922, "grad_norm": 0.18489180505275726, "learning_rate": 7.160578947368421e-05, "loss": 0.0076, "step": 63950 }, { "epoch": 49.72910998834046, "grad_norm": 0.25034400820732117, "learning_rate": 7.157947368421054e-05, "loss": 0.0092, "step": 64000 }, { "epoch": 49.72910998834046, "eval_accuracy": 0.9987854054316669, "eval_runtime": 854.0151, "eval_samples_per_second": 24.101, "eval_steps_per_second": 3.013, "step": 64000 }, { "epoch": 49.7679751263117, "grad_norm": 0.03618240728974342, "learning_rate": 7.155315789473685e-05, "loss": 0.0132, "step": 64050 }, { "epoch": 49.80684026428294, "grad_norm": 0.04059286043047905, "learning_rate": 7.152684210526316e-05, "loss": 0.0127, "step": 64100 }, { "epoch": 49.845705402254175, "grad_norm": 0.31507959961891174, "learning_rate": 7.150052631578947e-05, "loss": 0.0099, "step": 64150 }, { "epoch": 49.88457054022542, "grad_norm": 0.14851006865501404, "learning_rate": 7.147421052631578e-05, "loss": 0.0092, "step": 64200 }, { "epoch": 49.92343567819666, "grad_norm": 0.2910158336162567, "learning_rate": 7.144789473684211e-05, "loss": 0.0091, "step": 64250 }, { "epoch": 49.9623008161679, "grad_norm": 0.9559641480445862, "learning_rate": 7.142157894736843e-05, "loss": 0.0156, "step": 64300 }, { "epoch": 50.000777302759424, "grad_norm": 0.012824985198676586, "learning_rate": 7.139526315789474e-05, "loss": 0.0085, "step": 64350 }, { "epoch": 50.03964244073067, "grad_norm": 0.12682408094406128, "learning_rate": 7.136894736842106e-05, "loss": 0.0129, "step": 64400 }, { "epoch": 50.0785075787019, "grad_norm": 0.025887005031108856, "learning_rate": 7.134263157894737e-05, "loss": 0.0063, "step": 64450 }, { "epoch": 50.117372716673145, "grad_norm": 0.06498356908559799, "learning_rate": 7.131631578947369e-05, "loss": 0.01, "step": 64500 }, { "epoch": 50.15623785464438, "grad_norm": 2.5489485263824463, "learning_rate": 7.129e-05, "loss": 0.0118, "step": 64550 }, { "epoch": 50.195102992615624, "grad_norm": 0.08452356606721878, "learning_rate": 7.126368421052632e-05, "loss": 0.0137, "step": 64600 }, { "epoch": 50.23396813058687, "grad_norm": 3.793992757797241, "learning_rate": 7.123736842105263e-05, "loss": 0.01, "step": 64650 }, { "epoch": 50.2728332685581, "grad_norm": 0.2957603335380554, "learning_rate": 7.121105263157895e-05, "loss": 0.0157, "step": 64700 }, { "epoch": 50.311698406529345, "grad_norm": 0.11913411319255829, "learning_rate": 7.118473684210528e-05, "loss": 0.0144, "step": 64750 }, { "epoch": 50.35056354450058, "grad_norm": 0.07215093076229095, "learning_rate": 7.115842105263159e-05, "loss": 0.0199, "step": 64800 }, { "epoch": 50.389428682471824, "grad_norm": 0.04829329997301102, "learning_rate": 7.11321052631579e-05, "loss": 0.0165, "step": 64850 }, { "epoch": 50.42829382044306, "grad_norm": 0.13153165578842163, "learning_rate": 7.110578947368421e-05, "loss": 0.0151, "step": 64900 }, { "epoch": 50.4671589584143, "grad_norm": 0.029550815001130104, "learning_rate": 7.107947368421052e-05, "loss": 0.0101, "step": 64950 }, { "epoch": 50.506024096385545, "grad_norm": 0.010725216940045357, "learning_rate": 7.105315789473685e-05, "loss": 0.0069, "step": 65000 }, { "epoch": 50.506024096385545, "eval_accuracy": 0.9988825729971336, "eval_runtime": 858.6368, "eval_samples_per_second": 23.972, "eval_steps_per_second": 2.997, "step": 65000 }, { "epoch": 50.54488923435678, "grad_norm": 0.3737189471721649, "learning_rate": 7.102684210526316e-05, "loss": 0.0088, "step": 65050 }, { "epoch": 50.58375437232802, "grad_norm": 0.07762659341096878, "learning_rate": 7.100052631578947e-05, "loss": 0.0122, "step": 65100 }, { "epoch": 50.62261951029926, "grad_norm": 0.008183227851986885, "learning_rate": 7.097421052631578e-05, "loss": 0.0075, "step": 65150 }, { "epoch": 50.6614846482705, "grad_norm": 1.5395113229751587, "learning_rate": 7.094789473684211e-05, "loss": 0.0127, "step": 65200 }, { "epoch": 50.700349786241745, "grad_norm": 1.8640962839126587, "learning_rate": 7.092157894736843e-05, "loss": 0.0159, "step": 65250 }, { "epoch": 50.73921492421298, "grad_norm": 0.3395310938358307, "learning_rate": 7.089526315789475e-05, "loss": 0.0115, "step": 65300 }, { "epoch": 50.77808006218422, "grad_norm": 0.5718401074409485, "learning_rate": 7.086894736842106e-05, "loss": 0.0149, "step": 65350 }, { "epoch": 50.81694520015546, "grad_norm": 0.6449897885322571, "learning_rate": 7.084263157894737e-05, "loss": 0.014, "step": 65400 }, { "epoch": 50.8558103381267, "grad_norm": 0.24561935663223267, "learning_rate": 7.081631578947368e-05, "loss": 0.0101, "step": 65450 }, { "epoch": 50.89467547609794, "grad_norm": 0.222761869430542, "learning_rate": 7.079e-05, "loss": 0.0092, "step": 65500 }, { "epoch": 50.93354061406918, "grad_norm": 3.4988646507263184, "learning_rate": 7.076368421052632e-05, "loss": 0.0148, "step": 65550 }, { "epoch": 50.97240575204042, "grad_norm": 1.750221610069275, "learning_rate": 7.073736842105263e-05, "loss": 0.0127, "step": 65600 }, { "epoch": 51.01088223863195, "grad_norm": 0.1166532039642334, "learning_rate": 7.071105263157895e-05, "loss": 0.0045, "step": 65650 }, { "epoch": 51.049747376603186, "grad_norm": 2.3461062908172607, "learning_rate": 7.068473684210526e-05, "loss": 0.0045, "step": 65700 }, { "epoch": 51.08861251457443, "grad_norm": 0.01984625868499279, "learning_rate": 7.065842105263159e-05, "loss": 0.0098, "step": 65750 }, { "epoch": 51.127477652545664, "grad_norm": 1.5265790224075317, "learning_rate": 7.06321052631579e-05, "loss": 0.0076, "step": 65800 }, { "epoch": 51.16634279051691, "grad_norm": 0.029023034498095512, "learning_rate": 7.060578947368421e-05, "loss": 0.0093, "step": 65850 }, { "epoch": 51.20520792848814, "grad_norm": 0.08802133798599243, "learning_rate": 7.057947368421052e-05, "loss": 0.0037, "step": 65900 }, { "epoch": 51.244073066459386, "grad_norm": 0.2991699278354645, "learning_rate": 7.055315789473685e-05, "loss": 0.0093, "step": 65950 }, { "epoch": 51.28293820443063, "grad_norm": 2.19167160987854, "learning_rate": 7.052684210526316e-05, "loss": 0.0087, "step": 66000 }, { "epoch": 51.28293820443063, "eval_accuracy": 0.9989311567798669, "eval_runtime": 854.2184, "eval_samples_per_second": 24.096, "eval_steps_per_second": 3.012, "step": 66000 }, { "epoch": 51.321803342401864, "grad_norm": 0.21082434058189392, "learning_rate": 7.050052631578947e-05, "loss": 0.0098, "step": 66050 }, { "epoch": 51.36066848037311, "grad_norm": 0.013236195780336857, "learning_rate": 7.04742105263158e-05, "loss": 0.0136, "step": 66100 }, { "epoch": 51.39953361834434, "grad_norm": 1.2647536993026733, "learning_rate": 7.044789473684211e-05, "loss": 0.0087, "step": 66150 }, { "epoch": 51.438398756315586, "grad_norm": 0.042822957038879395, "learning_rate": 7.042157894736842e-05, "loss": 0.0161, "step": 66200 }, { "epoch": 51.47726389428682, "grad_norm": 0.0055616190657019615, "learning_rate": 7.039526315789475e-05, "loss": 0.0163, "step": 66250 }, { "epoch": 51.516129032258064, "grad_norm": 0.6984388828277588, "learning_rate": 7.036894736842106e-05, "loss": 0.0066, "step": 66300 }, { "epoch": 51.55499417022931, "grad_norm": 0.17050936818122864, "learning_rate": 7.034263157894737e-05, "loss": 0.008, "step": 66350 }, { "epoch": 51.59385930820054, "grad_norm": 0.04073571413755417, "learning_rate": 7.031631578947368e-05, "loss": 0.012, "step": 66400 }, { "epoch": 51.632724446171785, "grad_norm": 2.6651530265808105, "learning_rate": 7.029e-05, "loss": 0.013, "step": 66450 }, { "epoch": 51.67158958414302, "grad_norm": 0.2084534913301468, "learning_rate": 7.026368421052632e-05, "loss": 0.0107, "step": 66500 }, { "epoch": 51.710454722114264, "grad_norm": 0.3147223889827728, "learning_rate": 7.023736842105263e-05, "loss": 0.0145, "step": 66550 }, { "epoch": 51.74931986008551, "grad_norm": 0.026732545346021652, "learning_rate": 7.021105263157895e-05, "loss": 0.0134, "step": 66600 }, { "epoch": 51.78818499805674, "grad_norm": 1.0873422622680664, "learning_rate": 7.018473684210527e-05, "loss": 0.0127, "step": 66650 }, { "epoch": 51.827050136027985, "grad_norm": 0.040288642048835754, "learning_rate": 7.015842105263158e-05, "loss": 0.0122, "step": 66700 }, { "epoch": 51.86591527399922, "grad_norm": 0.10090038180351257, "learning_rate": 7.01321052631579e-05, "loss": 0.0154, "step": 66750 }, { "epoch": 51.904780411970464, "grad_norm": 0.007838579826056957, "learning_rate": 7.010578947368421e-05, "loss": 0.0228, "step": 66800 }, { "epoch": 51.9436455499417, "grad_norm": 0.0807645320892334, "learning_rate": 7.007947368421053e-05, "loss": 0.0087, "step": 66850 }, { "epoch": 51.98251068791294, "grad_norm": 0.5056226849555969, "learning_rate": 7.005315789473684e-05, "loss": 0.0115, "step": 66900 }, { "epoch": 52.02098717450447, "grad_norm": 0.03711940720677376, "learning_rate": 7.002684210526316e-05, "loss": 0.0075, "step": 66950 }, { "epoch": 52.05985231247571, "grad_norm": 0.18952326476573944, "learning_rate": 7.000052631578947e-05, "loss": 0.0079, "step": 67000 }, { "epoch": 52.05985231247571, "eval_accuracy": 0.9989311567798669, "eval_runtime": 851.5324, "eval_samples_per_second": 24.172, "eval_steps_per_second": 3.022, "step": 67000 }, { "epoch": 52.09871745044695, "grad_norm": 0.048657167702913284, "learning_rate": 6.99742105263158e-05, "loss": 0.0162, "step": 67050 }, { "epoch": 52.13758258841819, "grad_norm": 0.28034019470214844, "learning_rate": 6.994789473684211e-05, "loss": 0.0044, "step": 67100 }, { "epoch": 52.176447726389426, "grad_norm": 0.060565035790205, "learning_rate": 6.992157894736842e-05, "loss": 0.0097, "step": 67150 }, { "epoch": 52.21531286436067, "grad_norm": 0.13445107638835907, "learning_rate": 6.989526315789475e-05, "loss": 0.011, "step": 67200 }, { "epoch": 52.254178002331905, "grad_norm": 0.060040175914764404, "learning_rate": 6.986894736842106e-05, "loss": 0.0065, "step": 67250 }, { "epoch": 52.29304314030315, "grad_norm": 0.11060953140258789, "learning_rate": 6.984263157894737e-05, "loss": 0.0151, "step": 67300 }, { "epoch": 52.33190827827439, "grad_norm": 0.6473913788795471, "learning_rate": 6.981631578947368e-05, "loss": 0.0085, "step": 67350 }, { "epoch": 52.370773416245626, "grad_norm": 0.02483026310801506, "learning_rate": 6.979e-05, "loss": 0.0083, "step": 67400 }, { "epoch": 52.40963855421687, "grad_norm": 0.03834308311343193, "learning_rate": 6.976368421052632e-05, "loss": 0.0057, "step": 67450 }, { "epoch": 52.448503692188105, "grad_norm": 0.16416214406490326, "learning_rate": 6.973736842105264e-05, "loss": 0.011, "step": 67500 }, { "epoch": 52.48736883015935, "grad_norm": 0.012599085457623005, "learning_rate": 6.971105263157896e-05, "loss": 0.0065, "step": 67550 }, { "epoch": 52.52623396813059, "grad_norm": 0.04111713543534279, "learning_rate": 6.968473684210527e-05, "loss": 0.0112, "step": 67600 }, { "epoch": 52.565099106101826, "grad_norm": 0.003991789184510708, "learning_rate": 6.965842105263158e-05, "loss": 0.0131, "step": 67650 }, { "epoch": 52.60396424407307, "grad_norm": 0.05732455104589462, "learning_rate": 6.96321052631579e-05, "loss": 0.0107, "step": 67700 }, { "epoch": 52.642829382044305, "grad_norm": 0.1286925971508026, "learning_rate": 6.960578947368422e-05, "loss": 0.0152, "step": 67750 }, { "epoch": 52.68169452001555, "grad_norm": 0.13254734873771667, "learning_rate": 6.957947368421053e-05, "loss": 0.011, "step": 67800 }, { "epoch": 52.72055965798678, "grad_norm": 1.788832187652588, "learning_rate": 6.955315789473684e-05, "loss": 0.008, "step": 67850 }, { "epoch": 52.759424795958026, "grad_norm": 0.010066797025501728, "learning_rate": 6.952684210526315e-05, "loss": 0.0107, "step": 67900 }, { "epoch": 52.79828993392927, "grad_norm": 0.12101956456899643, "learning_rate": 6.950052631578948e-05, "loss": 0.0095, "step": 67950 }, { "epoch": 52.837155071900504, "grad_norm": 0.42573440074920654, "learning_rate": 6.94742105263158e-05, "loss": 0.0088, "step": 68000 }, { "epoch": 52.837155071900504, "eval_accuracy": 0.9985910703007336, "eval_runtime": 855.217, "eval_samples_per_second": 24.068, "eval_steps_per_second": 3.009, "step": 68000 }, { "epoch": 52.87602020987175, "grad_norm": 0.5126527547836304, "learning_rate": 6.944789473684211e-05, "loss": 0.0071, "step": 68050 }, { "epoch": 52.91488534784298, "grad_norm": 0.46387553215026855, "learning_rate": 6.942157894736842e-05, "loss": 0.0107, "step": 68100 }, { "epoch": 52.953750485814226, "grad_norm": 0.572813868522644, "learning_rate": 6.939526315789474e-05, "loss": 0.0089, "step": 68150 }, { "epoch": 52.99261562378546, "grad_norm": 0.013004188425838947, "learning_rate": 6.936894736842106e-05, "loss": 0.0147, "step": 68200 }, { "epoch": 53.03109211037699, "grad_norm": 0.2339731901884079, "learning_rate": 6.934263157894737e-05, "loss": 0.0118, "step": 68250 }, { "epoch": 53.06995724834823, "grad_norm": 0.0777270719408989, "learning_rate": 6.931631578947368e-05, "loss": 0.0159, "step": 68300 }, { "epoch": 53.108822386319474, "grad_norm": 0.6745654940605164, "learning_rate": 6.929e-05, "loss": 0.0032, "step": 68350 }, { "epoch": 53.14768752429071, "grad_norm": 0.10512412339448929, "learning_rate": 6.926368421052632e-05, "loss": 0.0162, "step": 68400 }, { "epoch": 53.18655266226195, "grad_norm": 0.20373064279556274, "learning_rate": 6.923736842105265e-05, "loss": 0.0112, "step": 68450 }, { "epoch": 53.22541780023319, "grad_norm": 0.12958596646785736, "learning_rate": 6.921105263157896e-05, "loss": 0.0071, "step": 68500 }, { "epoch": 53.26428293820443, "grad_norm": 0.0892186313867569, "learning_rate": 6.918473684210527e-05, "loss": 0.0152, "step": 68550 }, { "epoch": 53.303148076175674, "grad_norm": 4.232198238372803, "learning_rate": 6.915842105263158e-05, "loss": 0.0093, "step": 68600 }, { "epoch": 53.34201321414691, "grad_norm": 4.7500176429748535, "learning_rate": 6.913210526315789e-05, "loss": 0.0151, "step": 68650 }, { "epoch": 53.38087835211815, "grad_norm": 0.430583655834198, "learning_rate": 6.910578947368422e-05, "loss": 0.01, "step": 68700 }, { "epoch": 53.41974349008939, "grad_norm": 0.4367411434650421, "learning_rate": 6.907947368421053e-05, "loss": 0.0089, "step": 68750 }, { "epoch": 53.45860862806063, "grad_norm": 0.05042579025030136, "learning_rate": 6.905315789473684e-05, "loss": 0.006, "step": 68800 }, { "epoch": 53.49747376603187, "grad_norm": 0.14255771040916443, "learning_rate": 6.902684210526315e-05, "loss": 0.0101, "step": 68850 }, { "epoch": 53.53633890400311, "grad_norm": 0.374448299407959, "learning_rate": 6.900052631578948e-05, "loss": 0.0086, "step": 68900 }, { "epoch": 53.57520404197435, "grad_norm": 0.10216441005468369, "learning_rate": 6.89742105263158e-05, "loss": 0.0087, "step": 68950 }, { "epoch": 53.61406917994559, "grad_norm": 0.03489489480853081, "learning_rate": 6.894789473684211e-05, "loss": 0.0088, "step": 69000 }, { "epoch": 53.61406917994559, "eval_accuracy": 0.9985424865180003, "eval_runtime": 849.4139, "eval_samples_per_second": 24.232, "eval_steps_per_second": 3.029, "step": 69000 }, { "epoch": 53.65293431791683, "grad_norm": 0.0035805790685117245, "learning_rate": 6.892157894736843e-05, "loss": 0.0086, "step": 69050 }, { "epoch": 53.69179945588807, "grad_norm": 0.15195728838443756, "learning_rate": 6.889526315789474e-05, "loss": 0.0118, "step": 69100 }, { "epoch": 53.73066459385931, "grad_norm": 0.5311455726623535, "learning_rate": 6.886894736842105e-05, "loss": 0.011, "step": 69150 }, { "epoch": 53.769529731830545, "grad_norm": 0.30500563979148865, "learning_rate": 6.884263157894737e-05, "loss": 0.0097, "step": 69200 }, { "epoch": 53.80839486980179, "grad_norm": 0.05197153985500336, "learning_rate": 6.881631578947369e-05, "loss": 0.0103, "step": 69250 }, { "epoch": 53.84726000777303, "grad_norm": 0.369314581155777, "learning_rate": 6.879e-05, "loss": 0.0073, "step": 69300 }, { "epoch": 53.886125145744266, "grad_norm": 0.755934476852417, "learning_rate": 6.876368421052632e-05, "loss": 0.0071, "step": 69350 }, { "epoch": 53.92499028371551, "grad_norm": 2.4341537952423096, "learning_rate": 6.873736842105263e-05, "loss": 0.0191, "step": 69400 }, { "epoch": 53.963855421686745, "grad_norm": 6.1135993003845215, "learning_rate": 6.871105263157896e-05, "loss": 0.0125, "step": 69450 }, { "epoch": 54.00233190827827, "grad_norm": 0.027943432331085205, "learning_rate": 6.868473684210527e-05, "loss": 0.0124, "step": 69500 }, { "epoch": 54.041197046249515, "grad_norm": 3.6769490242004395, "learning_rate": 6.865842105263158e-05, "loss": 0.0121, "step": 69550 }, { "epoch": 54.08006218422075, "grad_norm": 2.0445716381073, "learning_rate": 6.86321052631579e-05, "loss": 0.0129, "step": 69600 }, { "epoch": 54.11892732219199, "grad_norm": 0.0530659481883049, "learning_rate": 6.860578947368422e-05, "loss": 0.0051, "step": 69650 }, { "epoch": 54.157792460163236, "grad_norm": 0.038605764508247375, "learning_rate": 6.857947368421053e-05, "loss": 0.0053, "step": 69700 }, { "epoch": 54.19665759813447, "grad_norm": 0.5865646004676819, "learning_rate": 6.855315789473684e-05, "loss": 0.01, "step": 69750 }, { "epoch": 54.235522736105715, "grad_norm": 1.5437185764312744, "learning_rate": 6.852684210526315e-05, "loss": 0.0087, "step": 69800 }, { "epoch": 54.27438787407695, "grad_norm": 0.030433477833867073, "learning_rate": 6.850052631578948e-05, "loss": 0.0044, "step": 69850 }, { "epoch": 54.31325301204819, "grad_norm": 0.1999322772026062, "learning_rate": 6.847421052631579e-05, "loss": 0.0098, "step": 69900 }, { "epoch": 54.352118150019436, "grad_norm": 2.545680046081543, "learning_rate": 6.844789473684212e-05, "loss": 0.0032, "step": 69950 }, { "epoch": 54.39098328799067, "grad_norm": 0.26193609833717346, "learning_rate": 6.842157894736843e-05, "loss": 0.012, "step": 70000 }, { "epoch": 54.39098328799067, "eval_accuracy": 0.9987854054316669, "eval_runtime": 852.7537, "eval_samples_per_second": 24.137, "eval_steps_per_second": 3.017, "step": 70000 }, { "epoch": 54.429848425961914, "grad_norm": 0.2691155970096588, "learning_rate": 6.839526315789474e-05, "loss": 0.0089, "step": 70050 }, { "epoch": 54.46871356393315, "grad_norm": 0.05729250609874725, "learning_rate": 6.836894736842105e-05, "loss": 0.0108, "step": 70100 }, { "epoch": 54.50757870190439, "grad_norm": 5.909592151641846, "learning_rate": 6.834263157894738e-05, "loss": 0.0159, "step": 70150 }, { "epoch": 54.54644383987563, "grad_norm": 0.48799797892570496, "learning_rate": 6.831631578947369e-05, "loss": 0.0125, "step": 70200 }, { "epoch": 54.58530897784687, "grad_norm": 1.2644726037979126, "learning_rate": 6.829e-05, "loss": 0.0113, "step": 70250 }, { "epoch": 54.624174115818114, "grad_norm": 0.2227022796869278, "learning_rate": 6.826368421052632e-05, "loss": 0.0047, "step": 70300 }, { "epoch": 54.66303925378935, "grad_norm": 0.06142238900065422, "learning_rate": 6.823736842105264e-05, "loss": 0.0089, "step": 70350 }, { "epoch": 54.70190439176059, "grad_norm": 2.738752603530884, "learning_rate": 6.821105263157895e-05, "loss": 0.006, "step": 70400 }, { "epoch": 54.74076952973183, "grad_norm": 0.5507923364639282, "learning_rate": 6.818473684210527e-05, "loss": 0.0095, "step": 70450 }, { "epoch": 54.77963466770307, "grad_norm": 0.024553149938583374, "learning_rate": 6.815842105263158e-05, "loss": 0.0066, "step": 70500 }, { "epoch": 54.81849980567431, "grad_norm": 0.1172686219215393, "learning_rate": 6.81321052631579e-05, "loss": 0.0113, "step": 70550 }, { "epoch": 54.85736494364555, "grad_norm": 0.08226852118968964, "learning_rate": 6.81057894736842e-05, "loss": 0.0083, "step": 70600 }, { "epoch": 54.89623008161679, "grad_norm": 5.019248008728027, "learning_rate": 6.807947368421053e-05, "loss": 0.0108, "step": 70650 }, { "epoch": 54.93509521958803, "grad_norm": 0.14996369183063507, "learning_rate": 6.805315789473684e-05, "loss": 0.0095, "step": 70700 }, { "epoch": 54.97396035755927, "grad_norm": 1.8933607339859009, "learning_rate": 6.802684210526317e-05, "loss": 0.0114, "step": 70750 }, { "epoch": 55.0124368441508, "grad_norm": 0.010230555199086666, "learning_rate": 6.800052631578948e-05, "loss": 0.0132, "step": 70800 }, { "epoch": 55.051301982122034, "grad_norm": 0.17247895896434784, "learning_rate": 6.797421052631579e-05, "loss": 0.0078, "step": 70850 }, { "epoch": 55.09016712009328, "grad_norm": 1.4789326190948486, "learning_rate": 6.794789473684212e-05, "loss": 0.0222, "step": 70900 }, { "epoch": 55.12903225806452, "grad_norm": 0.0069031622260808945, "learning_rate": 6.792157894736843e-05, "loss": 0.0061, "step": 70950 }, { "epoch": 55.167897396035755, "grad_norm": 0.054508525878190994, "learning_rate": 6.789526315789474e-05, "loss": 0.0049, "step": 71000 }, { "epoch": 55.167897396035755, "eval_accuracy": 0.9989797405626002, "eval_runtime": 862.2551, "eval_samples_per_second": 23.871, "eval_steps_per_second": 2.984, "step": 71000 }, { "epoch": 55.206762534007, "grad_norm": 0.043283432722091675, "learning_rate": 6.786894736842105e-05, "loss": 0.0066, "step": 71050 }, { "epoch": 55.245627671978234, "grad_norm": 0.06320410966873169, "learning_rate": 6.784263157894736e-05, "loss": 0.0055, "step": 71100 }, { "epoch": 55.28449280994948, "grad_norm": 0.04028366133570671, "learning_rate": 6.781631578947369e-05, "loss": 0.0049, "step": 71150 }, { "epoch": 55.32335794792071, "grad_norm": 2.822596311569214, "learning_rate": 6.779e-05, "loss": 0.0088, "step": 71200 }, { "epoch": 55.362223085891955, "grad_norm": 0.5370486378669739, "learning_rate": 6.776368421052632e-05, "loss": 0.008, "step": 71250 }, { "epoch": 55.4010882238632, "grad_norm": 1.14641535282135, "learning_rate": 6.773736842105264e-05, "loss": 0.0067, "step": 71300 }, { "epoch": 55.439953361834434, "grad_norm": 0.1291676163673401, "learning_rate": 6.771105263157895e-05, "loss": 0.0091, "step": 71350 }, { "epoch": 55.478818499805676, "grad_norm": 0.24983321130275726, "learning_rate": 6.768473684210527e-05, "loss": 0.014, "step": 71400 }, { "epoch": 55.51768363777691, "grad_norm": 0.03976181894540787, "learning_rate": 6.765842105263158e-05, "loss": 0.0126, "step": 71450 }, { "epoch": 55.556548775748155, "grad_norm": 0.37395182251930237, "learning_rate": 6.76321052631579e-05, "loss": 0.0071, "step": 71500 }, { "epoch": 55.59541391371939, "grad_norm": 0.05620402470231056, "learning_rate": 6.760578947368421e-05, "loss": 0.0068, "step": 71550 }, { "epoch": 55.63427905169063, "grad_norm": 0.27877360582351685, "learning_rate": 6.757947368421052e-05, "loss": 0.0107, "step": 71600 }, { "epoch": 55.673144189661876, "grad_norm": 0.04781031608581543, "learning_rate": 6.755315789473684e-05, "loss": 0.0144, "step": 71650 }, { "epoch": 55.71200932763311, "grad_norm": 0.015767989680171013, "learning_rate": 6.752684210526317e-05, "loss": 0.0173, "step": 71700 }, { "epoch": 55.750874465604355, "grad_norm": 0.03923904523253441, "learning_rate": 6.750052631578948e-05, "loss": 0.0067, "step": 71750 }, { "epoch": 55.78973960357559, "grad_norm": 3.509338140487671, "learning_rate": 6.747421052631579e-05, "loss": 0.0052, "step": 71800 }, { "epoch": 55.82860474154683, "grad_norm": 0.012454860843718052, "learning_rate": 6.74478947368421e-05, "loss": 0.0123, "step": 71850 }, { "epoch": 55.86746987951807, "grad_norm": 0.1035495400428772, "learning_rate": 6.742157894736843e-05, "loss": 0.0089, "step": 71900 }, { "epoch": 55.90633501748931, "grad_norm": 0.031217306852340698, "learning_rate": 6.739526315789474e-05, "loss": 0.0214, "step": 71950 }, { "epoch": 55.945200155460554, "grad_norm": 0.12328901886940002, "learning_rate": 6.736894736842105e-05, "loss": 0.008, "step": 72000 }, { "epoch": 55.945200155460554, "eval_accuracy": 0.9991254919108001, "eval_runtime": 855.5732, "eval_samples_per_second": 24.058, "eval_steps_per_second": 3.007, "step": 72000 } ], "logging_steps": 50, "max_steps": 200000, "num_input_tokens_seen": 0, "num_train_epochs": 156, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.39666699518976e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }