diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8266 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "global_step": 119547, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 0.00024811801548585953, + "loss": 8.8316, + "step": 100 + }, + { + "epoch": 0.0, + "learning_rate": 0.00029403430324938403, + "loss": 5.464, + "step": 200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002998014193905167, + "loss": 4.4569, + "step": 300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002995500515304113, + "loss": 4.0926, + "step": 400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002992986836703059, + "loss": 3.9091, + "step": 500 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029904731581020046, + "loss": 3.7998, + "step": 600 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002987959479500951, + "loss": 3.7103, + "step": 700 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002985445800899897, + "loss": 3.663, + "step": 800 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029829321222988426, + "loss": 3.6162, + "step": 900 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002980418443697789, + "loss": 3.5725, + "step": 1000 + }, + { + "epoch": 0.01, + "eval_accuracy": 0.37403400092106887, + "eval_loss": 3.5909957885742188, + "eval_runtime": 37.0746, + "eval_samples_per_second": 302.526, + "eval_steps_per_second": 2.535, + "step": 1000 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029779047650967344, + "loss": 3.5506, + "step": 1100 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029753910864956806, + "loss": 3.5285, + "step": 1200 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002972877407894626, + "loss": 3.5064, + "step": 1300 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029703637292935724, + "loss": 3.4907, + "step": 1400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002967875187478529, + "loss": 3.4708, + "step": 1500 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029653615088774747, + "loss": 3.456, + "step": 1600 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002962847830276421, + "loss": 3.4413, + "step": 1700 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029603341516753665, + "loss": 3.4224, + "step": 1800 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029578204730743127, + "loss": 3.4184, + "step": 1900 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029553067944732583, + "loss": 3.4011, + "step": 2000 + }, + { + "epoch": 0.02, + "eval_accuracy": 0.39113344827973534, + "eval_loss": 3.4203457832336426, + "eval_runtime": 36.9935, + "eval_samples_per_second": 303.189, + "eval_steps_per_second": 2.541, + "step": 2000 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029527931158722045, + "loss": 3.3915, + "step": 2100 + }, + { + "epoch": 0.02, + "learning_rate": 0.000295027943727115, + "loss": 3.3831, + "step": 2200 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029477657586700963, + "loss": 3.3814, + "step": 2300 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002945252080069042, + "loss": 3.3734, + "step": 2400 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029427384014679876, + "loss": 3.3624, + "step": 2500 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002940224722866934, + "loss": 3.3559, + "step": 2600 + }, + { + "epoch": 0.02, + "learning_rate": 0.000293771104426588, + "loss": 3.3459, + "step": 2700 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029351973656648256, + "loss": 3.3462, + "step": 2800 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002932683687063772, + "loss": 3.3306, + "step": 2900 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002930170008462718, + "loss": 3.335, + "step": 3000 + }, + { + "epoch": 0.03, + "eval_accuracy": 0.39839248205600547, + "eval_loss": 3.3489201068878174, + "eval_runtime": 36.3401, + "eval_samples_per_second": 308.64, + "eval_steps_per_second": 2.587, + "step": 3000 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029276563298616636, + "loss": 3.3239, + "step": 3100 + }, + { + "epoch": 0.03, + "learning_rate": 0.000292514265126061, + "loss": 3.3132, + "step": 3200 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029226289726595555, + "loss": 3.3157, + "step": 3300 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029201152940585017, + "loss": 3.3077, + "step": 3400 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029176016154574473, + "loss": 3.308, + "step": 3500 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029150879368563935, + "loss": 3.2971, + "step": 3600 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002912574258255339, + "loss": 3.2953, + "step": 3700 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029100605796542853, + "loss": 3.2915, + "step": 3800 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002907572037839242, + "loss": 3.289, + "step": 3900 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029050583592381876, + "loss": 3.2835, + "step": 4000 + }, + { + "epoch": 0.03, + "eval_accuracy": 0.4028412728722747, + "eval_loss": 3.306710958480835, + "eval_runtime": 37.7903, + "eval_samples_per_second": 296.796, + "eval_steps_per_second": 2.487, + "step": 4000 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002902544680637134, + "loss": 3.2759, + "step": 4100 + }, + { + "epoch": 0.04, + "learning_rate": 0.00029000310020360794, + "loss": 3.2803, + "step": 4200 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002897517323435025, + "loss": 3.2752, + "step": 4300 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002895003644833971, + "loss": 3.28, + "step": 4400 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002892489966232917, + "loss": 3.2788, + "step": 4500 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002889976287631863, + "loss": 3.2663, + "step": 4600 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002887462609030809, + "loss": 3.2647, + "step": 4700 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002884948930429755, + "loss": 3.2643, + "step": 4800 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002882435251828701, + "loss": 3.2686, + "step": 4900 + }, + { + "epoch": 0.04, + "learning_rate": 0.00028799215732276467, + "loss": 3.2477, + "step": 5000 + }, + { + "epoch": 0.04, + "eval_accuracy": 0.4059681332629427, + "eval_loss": 3.2766220569610596, + "eval_runtime": 36.7229, + "eval_samples_per_second": 305.423, + "eval_steps_per_second": 2.56, + "step": 5000 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002877407894626593, + "loss": 3.247, + "step": 5100 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002874894216025539, + "loss": 3.2555, + "step": 5200 + }, + { + "epoch": 0.04, + "learning_rate": 0.00028723805374244847, + "loss": 3.2528, + "step": 5300 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002869866858823431, + "loss": 3.2441, + "step": 5400 + }, + { + "epoch": 0.05, + "learning_rate": 0.00028673531802223765, + "loss": 3.2458, + "step": 5500 + }, + { + "epoch": 0.05, + "learning_rate": 0.00028648395016213227, + "loss": 3.2394, + "step": 5600 + }, + { + "epoch": 0.05, + "learning_rate": 0.00028623258230202683, + "loss": 3.2464, + "step": 5700 + }, + { + "epoch": 0.05, + "learning_rate": 0.00028598121444192145, + "loss": 3.2484, + "step": 5800 + }, + { + "epoch": 0.05, + "learning_rate": 0.000285729846581816, + "loss": 3.2372, + "step": 5900 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002854784787217106, + "loss": 3.2373, + "step": 6000 + }, + { + "epoch": 0.05, + "eval_accuracy": 0.40810372134296335, + "eval_loss": 3.256094455718994, + "eval_runtime": 36.3187, + "eval_samples_per_second": 308.822, + "eval_steps_per_second": 2.588, + "step": 6000 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002852271108616052, + "loss": 3.2314, + "step": 6100 + }, + { + "epoch": 0.05, + "learning_rate": 0.00028497825668010086, + "loss": 3.2393, + "step": 6200 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002847268888199954, + "loss": 3.23, + "step": 6300 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002844780346384911, + "loss": 3.2317, + "step": 6400 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002842266667783857, + "loss": 3.2188, + "step": 6500 + }, + { + "epoch": 0.06, + "learning_rate": 0.00028397529891828027, + "loss": 3.2251, + "step": 6600 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002837239310581749, + "loss": 3.2235, + "step": 6700 + }, + { + "epoch": 0.06, + "learning_rate": 0.00028347256319806945, + "loss": 3.2174, + "step": 6800 + }, + { + "epoch": 0.06, + "learning_rate": 0.00028322119533796407, + "loss": 3.2212, + "step": 6900 + }, + { + "epoch": 0.06, + "learning_rate": 0.00028296982747785863, + "loss": 3.2208, + "step": 7000 + }, + { + "epoch": 0.06, + "eval_accuracy": 0.4099135655475305, + "eval_loss": 3.2382774353027344, + "eval_runtime": 36.4193, + "eval_samples_per_second": 307.969, + "eval_steps_per_second": 2.581, + "step": 7000 + }, + { + "epoch": 0.06, + "learning_rate": 0.00028271845961775325, + "loss": 3.215, + "step": 7100 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002824670917576478, + "loss": 3.2124, + "step": 7200 + }, + { + "epoch": 0.06, + "learning_rate": 0.00028221572389754244, + "loss": 3.2214, + "step": 7300 + }, + { + "epoch": 0.06, + "learning_rate": 0.000281964356037437, + "loss": 3.2157, + "step": 7400 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002817129881773316, + "loss": 3.212, + "step": 7500 + }, + { + "epoch": 0.06, + "learning_rate": 0.00028146162031722624, + "loss": 3.2063, + "step": 7600 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002812102524571208, + "loss": 3.2089, + "step": 7700 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002809588845970154, + "loss": 3.2056, + "step": 7800 + }, + { + "epoch": 0.07, + "learning_rate": 0.00028070751673691, + "loss": 3.206, + "step": 7900 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002804561488768046, + "loss": 3.2021, + "step": 8000 + }, + { + "epoch": 0.07, + "eval_accuracy": 0.4112453244521325, + "eval_loss": 3.2249624729156494, + "eval_runtime": 37.3966, + "eval_samples_per_second": 299.92, + "eval_steps_per_second": 2.514, + "step": 8000 + }, + { + "epoch": 0.07, + "learning_rate": 0.00028020478101669917, + "loss": 3.2098, + "step": 8100 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002799534131565938, + "loss": 3.2099, + "step": 8200 + }, + { + "epoch": 0.07, + "learning_rate": 0.00027970204529648835, + "loss": 3.2075, + "step": 8300 + }, + { + "epoch": 0.07, + "learning_rate": 0.00027945067743638297, + "loss": 3.205, + "step": 8400 + }, + { + "epoch": 0.07, + "learning_rate": 0.00027920182325487863, + "loss": 3.1931, + "step": 8500 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002789504553947732, + "loss": 3.1969, + "step": 8600 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002786990875346678, + "loss": 3.1974, + "step": 8700 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002784502333531634, + "loss": 3.1958, + "step": 8800 + }, + { + "epoch": 0.07, + "learning_rate": 0.00027819886549305804, + "loss": 3.1925, + "step": 8900 + }, + { + "epoch": 0.08, + "learning_rate": 0.00027794749763295265, + "loss": 3.194, + "step": 9000 + }, + { + "epoch": 0.08, + "eval_accuracy": 0.41224642524178057, + "eval_loss": 3.2142982482910156, + "eval_runtime": 37.0575, + "eval_samples_per_second": 302.665, + "eval_steps_per_second": 2.537, + "step": 9000 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002776961297728472, + "loss": 3.1941, + "step": 9100 + }, + { + "epoch": 0.08, + "learning_rate": 0.00027744476191274184, + "loss": 3.1943, + "step": 9200 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002771933940526364, + "loss": 3.197, + "step": 9300 + }, + { + "epoch": 0.08, + "learning_rate": 0.000276942026192531, + "loss": 3.1912, + "step": 9400 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002766906583324256, + "loss": 3.1941, + "step": 9500 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002764392904723202, + "loss": 3.1904, + "step": 9600 + }, + { + "epoch": 0.08, + "learning_rate": 0.00027618792261221477, + "loss": 3.1807, + "step": 9700 + }, + { + "epoch": 0.08, + "learning_rate": 0.00027593906843071043, + "loss": 3.1854, + "step": 9800 + }, + { + "epoch": 0.08, + "learning_rate": 0.00027568770057060505, + "loss": 3.1859, + "step": 9900 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002754363327104996, + "loss": 3.1971, + "step": 10000 + }, + { + "epoch": 0.08, + "eval_accuracy": 0.413248228065643, + "eval_loss": 3.2038817405700684, + "eval_runtime": 36.9865, + "eval_samples_per_second": 303.246, + "eval_steps_per_second": 2.541, + "step": 10000 + }, + { + "epoch": 0.08, + "learning_rate": 0.00027518496485039423, + "loss": 3.1776, + "step": 10100 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002749335969902888, + "loss": 3.1872, + "step": 10200 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002746822291301834, + "loss": 3.1792, + "step": 10300 + }, + { + "epoch": 0.09, + "learning_rate": 0.000274430861270078, + "loss": 3.1858, + "step": 10400 + }, + { + "epoch": 0.09, + "learning_rate": 0.00027417949340997254, + "loss": 3.1825, + "step": 10500 + }, + { + "epoch": 0.09, + "learning_rate": 0.00027392812554986716, + "loss": 3.1798, + "step": 10600 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002736767576897618, + "loss": 3.1819, + "step": 10700 + }, + { + "epoch": 0.09, + "learning_rate": 0.00027342538982965634, + "loss": 3.1778, + "step": 10800 + }, + { + "epoch": 0.09, + "learning_rate": 0.00027317402196955096, + "loss": 3.185, + "step": 10900 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002729226541094455, + "loss": 3.1794, + "step": 11000 + }, + { + "epoch": 0.09, + "eval_accuracy": 0.41429074887393713, + "eval_loss": 3.1947903633117676, + "eval_runtime": 37.2827, + "eval_samples_per_second": 300.837, + "eval_steps_per_second": 2.521, + "step": 11000 + }, + { + "epoch": 0.09, + "learning_rate": 0.00027267128624934014, + "loss": 3.1782, + "step": 11100 + }, + { + "epoch": 0.09, + "learning_rate": 0.00027242243206783575, + "loss": 3.1752, + "step": 11200 + }, + { + "epoch": 0.09, + "learning_rate": 0.00027217106420773037, + "loss": 3.172, + "step": 11300 + }, + { + "epoch": 0.1, + "learning_rate": 0.000271919696347625, + "loss": 3.1794, + "step": 11400 + }, + { + "epoch": 0.1, + "learning_rate": 0.00027166832848751955, + "loss": 3.1773, + "step": 11500 + }, + { + "epoch": 0.1, + "learning_rate": 0.00027141696062741417, + "loss": 3.1776, + "step": 11600 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002711681064459098, + "loss": 3.1866, + "step": 11700 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002709167385858044, + "loss": 3.1707, + "step": 11800 + }, + { + "epoch": 0.1, + "learning_rate": 0.00027066537072569896, + "loss": 3.1705, + "step": 11900 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002704140028655936, + "loss": 3.1731, + "step": 12000 + }, + { + "epoch": 0.1, + "eval_accuracy": 0.41493381221427206, + "eval_loss": 3.1884472370147705, + "eval_runtime": 36.6321, + "eval_samples_per_second": 306.18, + "eval_steps_per_second": 2.566, + "step": 12000 + }, + { + "epoch": 0.1, + "learning_rate": 0.00027016514868408924, + "loss": 3.1688, + "step": 12100 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002699137808239838, + "loss": 3.1698, + "step": 12200 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002696624129638784, + "loss": 3.1661, + "step": 12300 + }, + { + "epoch": 0.1, + "learning_rate": 0.000269411045103773, + "loss": 3.163, + "step": 12400 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002691596772436676, + "loss": 3.166, + "step": 12500 + }, + { + "epoch": 0.11, + "learning_rate": 0.00026890830938356217, + "loss": 3.1684, + "step": 12600 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002686569415234568, + "loss": 3.1665, + "step": 12700 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002684055736633514, + "loss": 3.1623, + "step": 12800 + }, + { + "epoch": 0.11, + "learning_rate": 0.00026815420580324597, + "loss": 3.1674, + "step": 12900 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002679028379431406, + "loss": 3.1596, + "step": 13000 + }, + { + "epoch": 0.11, + "eval_accuracy": 0.41567656441304324, + "eval_loss": 3.181196928024292, + "eval_runtime": 38.8685, + "eval_samples_per_second": 288.563, + "eval_steps_per_second": 2.418, + "step": 13000 + }, + { + "epoch": 0.11, + "learning_rate": 0.00026765147008303515, + "loss": 3.1659, + "step": 13100 + }, + { + "epoch": 0.11, + "learning_rate": 0.00026740010222292977, + "loss": 3.1528, + "step": 13200 + }, + { + "epoch": 0.11, + "learning_rate": 0.00026714873436282433, + "loss": 3.1656, + "step": 13300 + }, + { + "epoch": 0.11, + "learning_rate": 0.00026689736650271895, + "loss": 3.1594, + "step": 13400 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002666459986426135, + "loss": 3.1593, + "step": 13500 + }, + { + "epoch": 0.11, + "learning_rate": 0.00026639463078250813, + "loss": 3.1579, + "step": 13600 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002661432629224027, + "loss": 3.1599, + "step": 13700 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002658918950622973, + "loss": 3.1529, + "step": 13800 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002656405272021919, + "loss": 3.1615, + "step": 13900 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002653891593420865, + "loss": 3.1628, + "step": 14000 + }, + { + "epoch": 0.12, + "eval_accuracy": 0.41594754961977826, + "eval_loss": 3.1771674156188965, + "eval_runtime": 37.2285, + "eval_samples_per_second": 301.275, + "eval_steps_per_second": 2.525, + "step": 14000 + }, + { + "epoch": 0.12, + "learning_rate": 0.00026513779148198106, + "loss": 3.1594, + "step": 14100 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002648864236218757, + "loss": 3.158, + "step": 14200 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002646400831189724, + "loss": 3.1588, + "step": 14300 + }, + { + "epoch": 0.12, + "learning_rate": 0.000264388715258867, + "loss": 3.1606, + "step": 14400 + }, + { + "epoch": 0.12, + "learning_rate": 0.00026413734739876157, + "loss": 3.1555, + "step": 14500 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002638859795386562, + "loss": 3.1574, + "step": 14600 + }, + { + "epoch": 0.12, + "learning_rate": 0.00026363461167855075, + "loss": 3.1526, + "step": 14700 + }, + { + "epoch": 0.12, + "learning_rate": 0.00026338324381844537, + "loss": 3.1457, + "step": 14800 + }, + { + "epoch": 0.12, + "learning_rate": 0.00026313187595833993, + "loss": 3.1655, + "step": 14900 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002628805080982345, + "loss": 3.1658, + "step": 15000 + }, + { + "epoch": 0.13, + "eval_accuracy": 0.416945842272569, + "eval_loss": 3.170196294784546, + "eval_runtime": 38.2091, + "eval_samples_per_second": 293.542, + "eval_steps_per_second": 2.46, + "step": 15000 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002626291402381291, + "loss": 3.1537, + "step": 15100 + }, + { + "epoch": 0.13, + "learning_rate": 0.00026237777237802373, + "loss": 3.1596, + "step": 15200 + }, + { + "epoch": 0.13, + "learning_rate": 0.00026212891819651934, + "loss": 3.1558, + "step": 15300 + }, + { + "epoch": 0.13, + "learning_rate": 0.00026187755033641396, + "loss": 3.1568, + "step": 15400 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002616261824763085, + "loss": 3.1488, + "step": 15500 + }, + { + "epoch": 0.13, + "learning_rate": 0.00026137481461620314, + "loss": 3.1452, + "step": 15600 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002611234467560977, + "loss": 3.1503, + "step": 15700 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002608720788959923, + "loss": 3.1456, + "step": 15800 + }, + { + "epoch": 0.13, + "learning_rate": 0.00026062071103588694, + "loss": 3.1469, + "step": 15900 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002603693431757815, + "loss": 3.1479, + "step": 16000 + }, + { + "epoch": 0.13, + "eval_accuracy": 0.41732213261145495, + "eval_loss": 3.1664865016937256, + "eval_runtime": 36.9736, + "eval_samples_per_second": 303.351, + "eval_steps_per_second": 2.542, + "step": 16000 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002601179753156761, + "loss": 3.152, + "step": 16100 + }, + { + "epoch": 0.14, + "learning_rate": 0.00025986660745557074, + "loss": 3.1515, + "step": 16200 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002596152395954653, + "loss": 3.1403, + "step": 16300 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002593638717353599, + "loss": 3.1482, + "step": 16400 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002591125038752545, + "loss": 3.1384, + "step": 16500 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002588611360151491, + "loss": 3.1423, + "step": 16600 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002586097681550437, + "loss": 3.1388, + "step": 16700 + }, + { + "epoch": 0.14, + "learning_rate": 0.00025835840029493824, + "loss": 3.1502, + "step": 16800 + }, + { + "epoch": 0.14, + "learning_rate": 0.00025810703243483286, + "loss": 3.1423, + "step": 16900 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002578556645747274, + "loss": 3.1401, + "step": 17000 + }, + { + "epoch": 0.14, + "eval_accuracy": 0.4181765082503061, + "eval_loss": 3.161729097366333, + "eval_runtime": 36.3895, + "eval_samples_per_second": 308.221, + "eval_steps_per_second": 2.583, + "step": 17000 + }, + { + "epoch": 0.14, + "learning_rate": 0.00025760429671462204, + "loss": 3.1444, + "step": 17100 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002573529288545166, + "loss": 3.1344, + "step": 17200 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002571015609944112, + "loss": 3.1362, + "step": 17300 + }, + { + "epoch": 0.15, + "learning_rate": 0.00025685019313430584, + "loss": 3.1449, + "step": 17400 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002565988252742004, + "loss": 3.1403, + "step": 17500 + }, + { + "epoch": 0.15, + "learning_rate": 0.000256347457414095, + "loss": 3.1485, + "step": 17600 + }, + { + "epoch": 0.15, + "learning_rate": 0.00025609608955398964, + "loss": 3.1465, + "step": 17700 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002558447216938842, + "loss": 3.1388, + "step": 17800 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002555933538337788, + "loss": 3.1412, + "step": 17900 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002553419859736734, + "loss": 3.1386, + "step": 18000 + }, + { + "epoch": 0.15, + "eval_accuracy": 0.4183225313668887, + "eval_loss": 3.1586148738861084, + "eval_runtime": 36.9298, + "eval_samples_per_second": 303.711, + "eval_steps_per_second": 2.545, + "step": 18000 + }, + { + "epoch": 0.15, + "learning_rate": 0.000255090618113568, + "loss": 3.1421, + "step": 18100 + }, + { + "epoch": 0.15, + "learning_rate": 0.00025483925025346257, + "loss": 3.1355, + "step": 18200 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002545878823933572, + "loss": 3.1399, + "step": 18300 + }, + { + "epoch": 0.15, + "learning_rate": 0.00025433651453325175, + "loss": 3.1349, + "step": 18400 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002540851466731463, + "loss": 3.1413, + "step": 18500 + }, + { + "epoch": 0.16, + "learning_rate": 0.00025383377881304093, + "loss": 3.1278, + "step": 18600 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002535824109529355, + "loss": 3.1405, + "step": 18700 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002533310430928301, + "loss": 3.1277, + "step": 18800 + }, + { + "epoch": 0.16, + "learning_rate": 0.00025307967523272474, + "loss": 3.1341, + "step": 18900 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002528283073726193, + "loss": 3.1396, + "step": 19000 + }, + { + "epoch": 0.16, + "eval_accuracy": 0.41871075628741844, + "eval_loss": 3.1532347202301025, + "eval_runtime": 36.5045, + "eval_samples_per_second": 307.25, + "eval_steps_per_second": 2.575, + "step": 19000 + }, + { + "epoch": 0.16, + "learning_rate": 0.00025257945319111496, + "loss": 3.1329, + "step": 19100 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002523280853310095, + "loss": 3.1358, + "step": 19200 + }, + { + "epoch": 0.16, + "learning_rate": 0.00025207671747090414, + "loss": 3.1377, + "step": 19300 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002518253496107987, + "loss": 3.1289, + "step": 19400 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002515739817506933, + "loss": 3.1348, + "step": 19500 + }, + { + "epoch": 0.16, + "learning_rate": 0.00025132261389058794, + "loss": 3.1324, + "step": 19600 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002510712460304825, + "loss": 3.136, + "step": 19700 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002508198781703771, + "loss": 3.1337, + "step": 19800 + }, + { + "epoch": 0.17, + "learning_rate": 0.00025056851031027175, + "loss": 3.132, + "step": 19900 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002503171424501663, + "loss": 3.1345, + "step": 20000 + }, + { + "epoch": 0.17, + "eval_accuracy": 0.41896770080986667, + "eval_loss": 3.150233268737793, + "eval_runtime": 36.0939, + "eval_samples_per_second": 310.745, + "eval_steps_per_second": 2.604, + "step": 20000 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002500682882686619, + "loss": 3.1301, + "step": 20100 + }, + { + "epoch": 0.17, + "learning_rate": 0.00024981692040855653, + "loss": 3.1261, + "step": 20200 + }, + { + "epoch": 0.17, + "learning_rate": 0.00024956555254845115, + "loss": 3.1279, + "step": 20300 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002493141846883457, + "loss": 3.1235, + "step": 20400 + }, + { + "epoch": 0.17, + "learning_rate": 0.00024906281682824034, + "loss": 3.1302, + "step": 20500 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002488114489681349, + "loss": 3.1287, + "step": 20600 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002485600811080295, + "loss": 3.1314, + "step": 20700 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002483087132479241, + "loss": 3.1226, + "step": 20800 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002480573453878187, + "loss": 3.1289, + "step": 20900 + }, + { + "epoch": 0.18, + "learning_rate": 0.00024780597752771327, + "loss": 3.1319, + "step": 21000 + }, + { + "epoch": 0.18, + "eval_accuracy": 0.41907721814730364, + "eval_loss": 3.1475839614868164, + "eval_runtime": 36.3645, + "eval_samples_per_second": 308.432, + "eval_steps_per_second": 2.585, + "step": 21000 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002475546096676079, + "loss": 3.1304, + "step": 21100 + }, + { + "epoch": 0.18, + "learning_rate": 0.00024730324180750245, + "loss": 3.1309, + "step": 21200 + }, + { + "epoch": 0.18, + "learning_rate": 0.00024705187394739707, + "loss": 3.1254, + "step": 21300 + }, + { + "epoch": 0.18, + "learning_rate": 0.00024680050608729163, + "loss": 3.1293, + "step": 21400 + }, + { + "epoch": 0.18, + "learning_rate": 0.00024654913822718625, + "loss": 3.1278, + "step": 21500 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002462977703670808, + "loss": 3.1216, + "step": 21600 + }, + { + "epoch": 0.18, + "learning_rate": 0.00024604640250697543, + "loss": 3.1281, + "step": 21700 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002457975483254711, + "loss": 3.1182, + "step": 21800 + }, + { + "epoch": 0.18, + "learning_rate": 0.00024554618046536566, + "loss": 3.1231, + "step": 21900 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002452948126052603, + "loss": 3.1238, + "step": 22000 + }, + { + "epoch": 0.18, + "eval_accuracy": 0.42022504408774863, + "eval_loss": 3.1434154510498047, + "eval_runtime": 36.9095, + "eval_samples_per_second": 303.878, + "eval_steps_per_second": 2.547, + "step": 22000 + }, + { + "epoch": 0.18, + "learning_rate": 0.00024504344474515484, + "loss": 3.1249, + "step": 22100 + }, + { + "epoch": 0.19, + "learning_rate": 0.00024479207688504946, + "loss": 3.1316, + "step": 22200 + }, + { + "epoch": 0.19, + "learning_rate": 0.000244540709024944, + "loss": 3.1152, + "step": 22300 + }, + { + "epoch": 0.19, + "learning_rate": 0.00024428934116483864, + "loss": 3.1204, + "step": 22400 + }, + { + "epoch": 0.19, + "learning_rate": 0.00024403797330473323, + "loss": 3.1237, + "step": 22500 + }, + { + "epoch": 0.19, + "learning_rate": 0.00024378660544462782, + "loss": 3.1256, + "step": 22600 + }, + { + "epoch": 0.19, + "learning_rate": 0.00024353523758452241, + "loss": 3.1272, + "step": 22700 + }, + { + "epoch": 0.19, + "learning_rate": 0.000243283869724417, + "loss": 3.12, + "step": 22800 + }, + { + "epoch": 0.19, + "learning_rate": 0.00024303250186431162, + "loss": 3.1182, + "step": 22900 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002427811340042062, + "loss": 3.1224, + "step": 23000 + }, + { + "epoch": 0.19, + "eval_accuracy": 0.42017309355588756, + "eval_loss": 3.1407454013824463, + "eval_runtime": 36.6142, + "eval_samples_per_second": 306.329, + "eval_steps_per_second": 2.567, + "step": 23000 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002425297661441008, + "loss": 3.1174, + "step": 23100 + }, + { + "epoch": 0.19, + "learning_rate": 0.00024227839828399537, + "loss": 3.1199, + "step": 23200 + }, + { + "epoch": 0.19, + "learning_rate": 0.00024202703042389, + "loss": 3.1147, + "step": 23300 + }, + { + "epoch": 0.2, + "learning_rate": 0.00024177566256378458, + "loss": 3.1201, + "step": 23400 + }, + { + "epoch": 0.2, + "learning_rate": 0.00024152429470367917, + "loss": 3.1231, + "step": 23500 + }, + { + "epoch": 0.2, + "learning_rate": 0.00024127544052217483, + "loss": 3.1172, + "step": 23600 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002410240726620694, + "loss": 3.1176, + "step": 23700 + }, + { + "epoch": 0.2, + "learning_rate": 0.00024077521848056506, + "loss": 3.1119, + "step": 23800 + }, + { + "epoch": 0.2, + "learning_rate": 0.00024052385062045965, + "loss": 3.1212, + "step": 23900 + }, + { + "epoch": 0.2, + "learning_rate": 0.00024027248276035424, + "loss": 3.1183, + "step": 24000 + }, + { + "epoch": 0.2, + "eval_accuracy": 0.4208589809832972, + "eval_loss": 3.137460947036743, + "eval_runtime": 37.115, + "eval_samples_per_second": 302.196, + "eval_steps_per_second": 2.533, + "step": 24000 + }, + { + "epoch": 0.2, + "learning_rate": 0.00024002111490024883, + "loss": 3.1287, + "step": 24100 + }, + { + "epoch": 0.2, + "learning_rate": 0.00023976974704014342, + "loss": 3.1157, + "step": 24200 + }, + { + "epoch": 0.2, + "learning_rate": 0.00023951837918003804, + "loss": 3.1199, + "step": 24300 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002392670113199326, + "loss": 3.1162, + "step": 24400 + }, + { + "epoch": 0.2, + "learning_rate": 0.00023901564345982722, + "loss": 3.1179, + "step": 24500 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002387642755997218, + "loss": 3.1214, + "step": 24600 + }, + { + "epoch": 0.21, + "learning_rate": 0.00023851290773961638, + "loss": 3.1138, + "step": 24700 + }, + { + "epoch": 0.21, + "learning_rate": 0.000238261539879511, + "loss": 3.1117, + "step": 24800 + }, + { + "epoch": 0.21, + "learning_rate": 0.00023801017201940556, + "loss": 3.1117, + "step": 24900 + }, + { + "epoch": 0.21, + "learning_rate": 0.00023775880415930018, + "loss": 3.1131, + "step": 25000 + }, + { + "epoch": 0.21, + "eval_accuracy": 0.4210050040998798, + "eval_loss": 3.1347129344940186, + "eval_runtime": 36.8178, + "eval_samples_per_second": 304.635, + "eval_steps_per_second": 2.553, + "step": 25000 + }, + { + "epoch": 0.21, + "learning_rate": 0.00023750743629919474, + "loss": 3.118, + "step": 25100 + }, + { + "epoch": 0.21, + "learning_rate": 0.00023725606843908936, + "loss": 3.1158, + "step": 25200 + }, + { + "epoch": 0.21, + "learning_rate": 0.00023700470057898395, + "loss": 3.1178, + "step": 25300 + }, + { + "epoch": 0.21, + "learning_rate": 0.00023675333271887855, + "loss": 3.1019, + "step": 25400 + }, + { + "epoch": 0.21, + "learning_rate": 0.00023650196485877314, + "loss": 3.1105, + "step": 25500 + }, + { + "epoch": 0.21, + "learning_rate": 0.00023625059699866773, + "loss": 3.1158, + "step": 25600 + }, + { + "epoch": 0.21, + "learning_rate": 0.00023599922913856232, + "loss": 3.1166, + "step": 25700 + }, + { + "epoch": 0.22, + "learning_rate": 0.00023574786127845694, + "loss": 3.1172, + "step": 25800 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002354964934183515, + "loss": 3.1233, + "step": 25900 + }, + { + "epoch": 0.22, + "learning_rate": 0.00023524763923684716, + "loss": 3.1106, + "step": 26000 + }, + { + "epoch": 0.22, + "eval_accuracy": 0.42156382333449405, + "eval_loss": 3.131035566329956, + "eval_runtime": 36.1307, + "eval_samples_per_second": 310.428, + "eval_steps_per_second": 2.602, + "step": 26000 + }, + { + "epoch": 0.22, + "learning_rate": 0.00023499627137674175, + "loss": 3.1186, + "step": 26100 + }, + { + "epoch": 0.22, + "learning_rate": 0.00023474490351663635, + "loss": 3.1069, + "step": 26200 + }, + { + "epoch": 0.22, + "learning_rate": 0.00023449353565653094, + "loss": 3.114, + "step": 26300 + }, + { + "epoch": 0.22, + "learning_rate": 0.00023424216779642553, + "loss": 3.114, + "step": 26400 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002339907999363201, + "loss": 3.1072, + "step": 26500 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002337394320762147, + "loss": 3.1141, + "step": 26600 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002334880642161093, + "loss": 3.1125, + "step": 26700 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002332366963560039, + "loss": 3.1202, + "step": 26800 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023298532849589849, + "loss": 3.1177, + "step": 26900 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002327339606357931, + "loss": 3.114, + "step": 27000 + }, + { + "epoch": 0.23, + "eval_accuracy": 0.42156241926606536, + "eval_loss": 3.129709482192993, + "eval_runtime": 36.5135, + "eval_samples_per_second": 307.174, + "eval_steps_per_second": 2.574, + "step": 27000 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023248259277568767, + "loss": 3.1107, + "step": 27100 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023223122491558229, + "loss": 3.1111, + "step": 27200 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023197985705547685, + "loss": 3.106, + "step": 27300 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023172848919537147, + "loss": 3.1081, + "step": 27400 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023147712133526606, + "loss": 3.1077, + "step": 27500 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023122575347516065, + "loss": 3.116, + "step": 27600 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023097438561505524, + "loss": 3.1168, + "step": 27700 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023072301775494983, + "loss": 3.1137, + "step": 27800 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023047164989484442, + "loss": 3.1065, + "step": 27900 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023022028203473904, + "loss": 3.1083, + "step": 28000 + }, + { + "epoch": 0.23, + "eval_accuracy": 0.42211211205589316, + "eval_loss": 3.1262805461883545, + "eval_runtime": 36.106, + "eval_samples_per_second": 310.641, + "eval_steps_per_second": 2.603, + "step": 28000 + }, + { + "epoch": 0.24, + "learning_rate": 0.0002299689141746336, + "loss": 3.1193, + "step": 28100 + }, + { + "epoch": 0.24, + "learning_rate": 0.00022972257367173034, + "loss": 3.0997, + "step": 28200 + }, + { + "epoch": 0.24, + "learning_rate": 0.0002294712058116249, + "loss": 3.1013, + "step": 28300 + }, + { + "epoch": 0.24, + "learning_rate": 0.00022921983795151952, + "loss": 3.1049, + "step": 28400 + }, + { + "epoch": 0.24, + "learning_rate": 0.00022896847009141409, + "loss": 3.1152, + "step": 28500 + }, + { + "epoch": 0.24, + "learning_rate": 0.00022871710223130868, + "loss": 3.1077, + "step": 28600 + }, + { + "epoch": 0.24, + "learning_rate": 0.00022846573437120327, + "loss": 3.1146, + "step": 28700 + }, + { + "epoch": 0.24, + "learning_rate": 0.00022821436651109786, + "loss": 3.1054, + "step": 28800 + }, + { + "epoch": 0.24, + "learning_rate": 0.00022796299865099248, + "loss": 3.1087, + "step": 28900 + }, + { + "epoch": 0.24, + "learning_rate": 0.00022771163079088704, + "loss": 3.1045, + "step": 29000 + }, + { + "epoch": 0.24, + "eval_accuracy": 0.4221415974928954, + "eval_loss": 3.124873161315918, + "eval_runtime": 37.1734, + "eval_samples_per_second": 301.721, + "eval_steps_per_second": 2.529, + "step": 29000 + }, + { + "epoch": 0.24, + "learning_rate": 0.0002274627766093827, + "loss": 3.1024, + "step": 29100 + }, + { + "epoch": 0.24, + "learning_rate": 0.0002272114087492773, + "loss": 3.0938, + "step": 29200 + }, + { + "epoch": 0.25, + "learning_rate": 0.00022696004088917189, + "loss": 3.1049, + "step": 29300 + }, + { + "epoch": 0.25, + "learning_rate": 0.00022670867302906648, + "loss": 3.109, + "step": 29400 + }, + { + "epoch": 0.25, + "learning_rate": 0.00022645730516896107, + "loss": 3.1033, + "step": 29500 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002262059373088557, + "loss": 3.1066, + "step": 29600 + }, + { + "epoch": 0.25, + "learning_rate": 0.00022595456944875025, + "loss": 3.1087, + "step": 29700 + }, + { + "epoch": 0.25, + "learning_rate": 0.00022570320158864487, + "loss": 3.101, + "step": 29800 + }, + { + "epoch": 0.25, + "learning_rate": 0.00022545183372853943, + "loss": 3.1137, + "step": 29900 + }, + { + "epoch": 0.25, + "learning_rate": 0.00022520046586843405, + "loss": 3.1084, + "step": 30000 + }, + { + "epoch": 0.25, + "eval_accuracy": 0.4223374650386961, + "eval_loss": 3.1216838359832764, + "eval_runtime": 39.4599, + "eval_samples_per_second": 284.238, + "eval_steps_per_second": 2.382, + "step": 30000 + }, + { + "epoch": 0.25, + "learning_rate": 0.00022494909800832864, + "loss": 3.1006, + "step": 30100 + }, + { + "epoch": 0.25, + "learning_rate": 0.00022469773014822323, + "loss": 3.1045, + "step": 30200 + }, + { + "epoch": 0.25, + "learning_rate": 0.00022444636228811783, + "loss": 3.1001, + "step": 30300 + }, + { + "epoch": 0.25, + "learning_rate": 0.00022419499442801244, + "loss": 3.0988, + "step": 30400 + }, + { + "epoch": 0.26, + "learning_rate": 0.000223943626567907, + "loss": 3.0981, + "step": 30500 + }, + { + "epoch": 0.26, + "learning_rate": 0.00022369477238640264, + "loss": 3.1027, + "step": 30600 + }, + { + "epoch": 0.26, + "learning_rate": 0.00022344340452629723, + "loss": 3.1046, + "step": 30700 + }, + { + "epoch": 0.26, + "learning_rate": 0.00022319203666619185, + "loss": 3.1025, + "step": 30800 + }, + { + "epoch": 0.26, + "learning_rate": 0.00022294066880608642, + "loss": 3.1025, + "step": 30900 + }, + { + "epoch": 0.26, + "learning_rate": 0.00022268930094598103, + "loss": 3.097, + "step": 31000 + }, + { + "epoch": 0.26, + "eval_accuracy": 0.42269550248800924, + "eval_loss": 3.1202731132507324, + "eval_runtime": 36.6594, + "eval_samples_per_second": 305.952, + "eval_steps_per_second": 2.564, + "step": 31000 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002224379330858756, + "loss": 3.104, + "step": 31100 + }, + { + "epoch": 0.26, + "learning_rate": 0.00022218656522577022, + "loss": 3.0977, + "step": 31200 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002219351973656648, + "loss": 3.1121, + "step": 31300 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002216838295055594, + "loss": 3.1011, + "step": 31400 + }, + { + "epoch": 0.26, + "learning_rate": 0.000221432461645454, + "loss": 3.0963, + "step": 31500 + }, + { + "epoch": 0.26, + "learning_rate": 0.00022118109378534858, + "loss": 3.1082, + "step": 31600 + }, + { + "epoch": 0.27, + "learning_rate": 0.00022092972592524317, + "loss": 3.0994, + "step": 31700 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002206783580651378, + "loss": 3.0957, + "step": 31800 + }, + { + "epoch": 0.27, + "learning_rate": 0.00022042699020503236, + "loss": 3.0947, + "step": 31900 + }, + { + "epoch": 0.27, + "learning_rate": 0.00022017562234492697, + "loss": 3.0926, + "step": 32000 + }, + { + "epoch": 0.27, + "eval_accuracy": 0.42268707807743716, + "eval_loss": 3.119593381881714, + "eval_runtime": 37.8215, + "eval_samples_per_second": 296.551, + "eval_steps_per_second": 2.485, + "step": 32000 + }, + { + "epoch": 0.27, + "learning_rate": 0.00021992425448482154, + "loss": 3.0955, + "step": 32100 + }, + { + "epoch": 0.27, + "learning_rate": 0.00021967288662471616, + "loss": 3.0973, + "step": 32200 + }, + { + "epoch": 0.27, + "learning_rate": 0.00021942151876461075, + "loss": 3.1098, + "step": 32300 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002191701509045053, + "loss": 3.1007, + "step": 32400 + }, + { + "epoch": 0.27, + "learning_rate": 0.00021891878304439993, + "loss": 3.0992, + "step": 32500 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002186674151842945, + "loss": 3.1029, + "step": 32600 + }, + { + "epoch": 0.27, + "learning_rate": 0.00021841604732418911, + "loss": 3.0947, + "step": 32700 + }, + { + "epoch": 0.27, + "learning_rate": 0.00021816719314268475, + "loss": 3.0941, + "step": 32800 + }, + { + "epoch": 0.28, + "learning_rate": 0.00021791582528257934, + "loss": 3.1004, + "step": 32900 + }, + { + "epoch": 0.28, + "learning_rate": 0.00021766445742247396, + "loss": 3.1003, + "step": 33000 + }, + { + "epoch": 0.28, + "eval_accuracy": 0.4228331011940198, + "eval_loss": 3.1163218021392822, + "eval_runtime": 37.158, + "eval_samples_per_second": 301.846, + "eval_steps_per_second": 2.53, + "step": 33000 + }, + { + "epoch": 0.28, + "learning_rate": 0.00021741308956236852, + "loss": 3.0986, + "step": 33100 + }, + { + "epoch": 0.28, + "learning_rate": 0.00021716172170226314, + "loss": 3.0999, + "step": 33200 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002169103538421577, + "loss": 3.0994, + "step": 33300 + }, + { + "epoch": 0.28, + "learning_rate": 0.00021665898598205232, + "loss": 3.0976, + "step": 33400 + }, + { + "epoch": 0.28, + "learning_rate": 0.00021640761812194691, + "loss": 3.0949, + "step": 33500 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002161562502618415, + "loss": 3.0923, + "step": 33600 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002159048824017361, + "loss": 3.0909, + "step": 33700 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002156535145416307, + "loss": 3.0944, + "step": 33800 + }, + { + "epoch": 0.28, + "learning_rate": 0.00021540214668152528, + "loss": 3.0997, + "step": 33900 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002151507788214199, + "loss": 3.097, + "step": 34000 + }, + { + "epoch": 0.28, + "eval_accuracy": 0.4235625147427185, + "eval_loss": 3.1130168437957764, + "eval_runtime": 36.3501, + "eval_samples_per_second": 308.555, + "eval_steps_per_second": 2.586, + "step": 34000 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021489941096131446, + "loss": 3.0878, + "step": 34100 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021464804310120905, + "loss": 3.094, + "step": 34200 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021439667524110364, + "loss": 3.0976, + "step": 34300 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021414530738099824, + "loss": 3.0959, + "step": 34400 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021389393952089285, + "loss": 3.098, + "step": 34500 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021364257166078742, + "loss": 3.0891, + "step": 34600 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021339120380068204, + "loss": 3.0881, + "step": 34700 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002131398359405766, + "loss": 3.0934, + "step": 34800 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021288846808047122, + "loss": 3.0997, + "step": 34900 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021263961389896685, + "loss": 3.0934, + "step": 35000 + }, + { + "epoch": 0.29, + "eval_accuracy": 0.4233083783571276, + "eval_loss": 3.112696886062622, + "eval_runtime": 36.2826, + "eval_samples_per_second": 309.129, + "eval_steps_per_second": 2.591, + "step": 35000 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021238824603886144, + "loss": 3.0886, + "step": 35100 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021213687817875606, + "loss": 3.0891, + "step": 35200 + }, + { + "epoch": 0.3, + "learning_rate": 0.00021188551031865063, + "loss": 3.0952, + "step": 35300 + }, + { + "epoch": 0.3, + "learning_rate": 0.00021163414245854525, + "loss": 3.0869, + "step": 35400 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002113827745984398, + "loss": 3.0905, + "step": 35500 + }, + { + "epoch": 0.3, + "learning_rate": 0.00021113140673833443, + "loss": 3.0939, + "step": 35600 + }, + { + "epoch": 0.3, + "learning_rate": 0.00021088255255683006, + "loss": 3.0958, + "step": 35700 + }, + { + "epoch": 0.3, + "learning_rate": 0.00021063118469672465, + "loss": 3.0882, + "step": 35800 + }, + { + "epoch": 0.3, + "learning_rate": 0.00021037981683661927, + "loss": 3.0852, + "step": 35900 + }, + { + "epoch": 0.3, + "learning_rate": 0.00021012844897651384, + "loss": 3.0957, + "step": 36000 + }, + { + "epoch": 0.3, + "eval_accuracy": 0.4237239826120166, + "eval_loss": 3.110541820526123, + "eval_runtime": 37.0216, + "eval_samples_per_second": 302.958, + "eval_steps_per_second": 2.539, + "step": 36000 + }, + { + "epoch": 0.3, + "learning_rate": 0.00020987708111640845, + "loss": 3.0968, + "step": 36100 + }, + { + "epoch": 0.3, + "learning_rate": 0.00020962571325630302, + "loss": 3.0909, + "step": 36200 + }, + { + "epoch": 0.3, + "learning_rate": 0.00020937434539619764, + "loss": 3.0826, + "step": 36300 + }, + { + "epoch": 0.3, + "learning_rate": 0.00020912297753609223, + "loss": 3.086, + "step": 36400 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002088716096759868, + "loss": 3.091, + "step": 36500 + }, + { + "epoch": 0.31, + "learning_rate": 0.00020862275549448245, + "loss": 3.0865, + "step": 36600 + }, + { + "epoch": 0.31, + "learning_rate": 0.00020837138763437704, + "loss": 3.092, + "step": 36700 + }, + { + "epoch": 0.31, + "learning_rate": 0.00020812001977427164, + "loss": 3.0916, + "step": 36800 + }, + { + "epoch": 0.31, + "learning_rate": 0.00020786865191416623, + "loss": 3.0924, + "step": 36900 + }, + { + "epoch": 0.31, + "learning_rate": 0.00020761728405406082, + "loss": 3.0915, + "step": 37000 + }, + { + "epoch": 0.31, + "eval_accuracy": 0.42398513933975085, + "eval_loss": 3.10992169380188, + "eval_runtime": 36.5153, + "eval_samples_per_second": 307.159, + "eval_steps_per_second": 2.574, + "step": 37000 + }, + { + "epoch": 0.31, + "learning_rate": 0.00020736591619395544, + "loss": 3.0841, + "step": 37100 + }, + { + "epoch": 0.31, + "learning_rate": 0.00020711454833385, + "loss": 3.088, + "step": 37200 + }, + { + "epoch": 0.31, + "learning_rate": 0.00020686318047374462, + "loss": 3.0941, + "step": 37300 + }, + { + "epoch": 0.31, + "learning_rate": 0.00020661181261363918, + "loss": 3.0898, + "step": 37400 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002063604447535338, + "loss": 3.0885, + "step": 37500 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002061090768934284, + "loss": 3.0918, + "step": 37600 + }, + { + "epoch": 0.32, + "learning_rate": 0.00020585770903332298, + "loss": 3.0962, + "step": 37700 + }, + { + "epoch": 0.32, + "learning_rate": 0.00020560634117321758, + "loss": 3.096, + "step": 37800 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002053549733131122, + "loss": 3.0846, + "step": 37900 + }, + { + "epoch": 0.32, + "learning_rate": 0.00020510611913160783, + "loss": 3.0908, + "step": 38000 + }, + { + "epoch": 0.32, + "eval_accuracy": 0.4245425545059364, + "eval_loss": 3.1069419384002686, + "eval_runtime": 37.2669, + "eval_samples_per_second": 300.964, + "eval_steps_per_second": 2.522, + "step": 38000 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002048547512715024, + "loss": 3.0851, + "step": 38100 + }, + { + "epoch": 0.32, + "learning_rate": 0.000204603383411397, + "loss": 3.0859, + "step": 38200 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002043520155512916, + "loss": 3.0877, + "step": 38300 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002041006476911862, + "loss": 3.08, + "step": 38400 + }, + { + "epoch": 0.32, + "learning_rate": 0.00020384927983108079, + "loss": 3.0872, + "step": 38500 + }, + { + "epoch": 0.32, + "learning_rate": 0.00020359791197097535, + "loss": 3.0934, + "step": 38600 + }, + { + "epoch": 0.32, + "learning_rate": 0.00020334654411086997, + "loss": 3.0898, + "step": 38700 + }, + { + "epoch": 0.32, + "learning_rate": 0.00020309517625076456, + "loss": 3.091, + "step": 38800 + }, + { + "epoch": 0.33, + "learning_rate": 0.00020284380839065915, + "loss": 3.0903, + "step": 38900 + }, + { + "epoch": 0.33, + "learning_rate": 0.00020259244053055374, + "loss": 3.0764, + "step": 39000 + }, + { + "epoch": 0.33, + "eval_accuracy": 0.42455589315600883, + "eval_loss": 3.104147434234619, + "eval_runtime": 36.3216, + "eval_samples_per_second": 308.797, + "eval_steps_per_second": 2.588, + "step": 39000 + }, + { + "epoch": 0.33, + "learning_rate": 0.00020234107267044833, + "loss": 3.0781, + "step": 39100 + }, + { + "epoch": 0.33, + "learning_rate": 0.00020208970481034292, + "loss": 3.0805, + "step": 39200 + }, + { + "epoch": 0.33, + "learning_rate": 0.00020183833695023754, + "loss": 3.0861, + "step": 39300 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002015869690901321, + "loss": 3.0906, + "step": 39400 + }, + { + "epoch": 0.33, + "learning_rate": 0.00020133560123002672, + "loss": 3.0837, + "step": 39500 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002010842333699213, + "loss": 3.0827, + "step": 39600 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002008328655098159, + "loss": 3.082, + "step": 39700 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002005814976497105, + "loss": 3.0838, + "step": 39800 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002003301297896051, + "loss": 3.0834, + "step": 39900 + }, + { + "epoch": 0.33, + "learning_rate": 0.00020007876192949968, + "loss": 3.0855, + "step": 40000 + }, + { + "epoch": 0.33, + "eval_accuracy": 0.42506837813247667, + "eval_loss": 3.1023147106170654, + "eval_runtime": 36.3302, + "eval_samples_per_second": 308.724, + "eval_steps_per_second": 2.587, + "step": 40000 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019982990774799532, + "loss": 3.0823, + "step": 40100 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019957853988788993, + "loss": 3.0877, + "step": 40200 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001993271720277845, + "loss": 3.0891, + "step": 40300 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001990758041676791, + "loss": 3.0847, + "step": 40400 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001988244363075737, + "loss": 3.0769, + "step": 40500 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019857306844746827, + "loss": 3.0842, + "step": 40600 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001983217005873629, + "loss": 3.0771, + "step": 40700 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019807033272725745, + "loss": 3.0878, + "step": 40800 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019781896486715207, + "loss": 3.0876, + "step": 40900 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019756759700704666, + "loss": 3.0782, + "step": 41000 + }, + { + "epoch": 0.34, + "eval_accuracy": 0.42481002954159974, + "eval_loss": 3.100797414779663, + "eval_runtime": 37.0564, + "eval_samples_per_second": 302.674, + "eval_steps_per_second": 2.537, + "step": 41000 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019731622914694126, + "loss": 3.0788, + "step": 41100 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019706486128683585, + "loss": 3.0811, + "step": 41200 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019681349342673044, + "loss": 3.0799, + "step": 41300 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019656212556662503, + "loss": 3.0737, + "step": 41400 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019631075770651965, + "loss": 3.0815, + "step": 41500 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001960593898464142, + "loss": 3.0885, + "step": 41600 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019580802198630883, + "loss": 3.0785, + "step": 41700 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001955566541262034, + "loss": 3.0738, + "step": 41800 + }, + { + "epoch": 0.35, + "learning_rate": 0.000195305286266098, + "loss": 3.0826, + "step": 41900 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001950539184059926, + "loss": 3.0821, + "step": 42000 + }, + { + "epoch": 0.35, + "eval_accuracy": 0.4254973210374381, + "eval_loss": 3.0979230403900146, + "eval_runtime": 36.8694, + "eval_samples_per_second": 304.209, + "eval_steps_per_second": 2.55, + "step": 42000 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019480255054588717, + "loss": 3.0689, + "step": 42100 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019455369636438286, + "loss": 3.0767, + "step": 42200 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001943048421828785, + "loss": 3.0768, + "step": 42300 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019405598800137415, + "loss": 3.0746, + "step": 42400 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019380462014126872, + "loss": 3.0812, + "step": 42500 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019355325228116333, + "loss": 3.0721, + "step": 42600 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001933018844210579, + "loss": 3.0701, + "step": 42700 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001930505165609525, + "loss": 3.0769, + "step": 42800 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019279914870084708, + "loss": 3.0827, + "step": 42900 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019254778084074167, + "loss": 3.075, + "step": 43000 + }, + { + "epoch": 0.36, + "eval_accuracy": 0.425449582710863, + "eval_loss": 3.0971269607543945, + "eval_runtime": 36.1836, + "eval_samples_per_second": 309.975, + "eval_steps_per_second": 2.598, + "step": 43000 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001922964129806363, + "loss": 3.0742, + "step": 43100 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019204504512053086, + "loss": 3.0804, + "step": 43200 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019179367726042547, + "loss": 3.0788, + "step": 43300 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019154230940032004, + "loss": 3.078, + "step": 43400 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019129094154021466, + "loss": 3.0729, + "step": 43500 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019103957368010925, + "loss": 3.0704, + "step": 43600 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019078820582000384, + "loss": 3.0793, + "step": 43700 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019053683795989843, + "loss": 3.0789, + "step": 43800 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019028547009979305, + "loss": 3.0835, + "step": 43900 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019003661591828868, + "loss": 3.0794, + "step": 44000 + }, + { + "epoch": 0.37, + "eval_accuracy": 0.4256580868725218, + "eval_loss": 3.0950751304626465, + "eval_runtime": 36.1829, + "eval_samples_per_second": 309.98, + "eval_steps_per_second": 2.598, + "step": 44000 + }, + { + "epoch": 0.37, + "learning_rate": 0.00018978524805818325, + "loss": 3.0746, + "step": 44100 + }, + { + "epoch": 0.37, + "learning_rate": 0.00018953388019807787, + "loss": 3.0778, + "step": 44200 + }, + { + "epoch": 0.37, + "learning_rate": 0.00018928251233797246, + "loss": 3.0743, + "step": 44300 + }, + { + "epoch": 0.37, + "learning_rate": 0.00018903114447786705, + "loss": 3.0822, + "step": 44400 + }, + { + "epoch": 0.37, + "learning_rate": 0.00018877977661776164, + "loss": 3.0782, + "step": 44500 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001885284087576562, + "loss": 3.0705, + "step": 44600 + }, + { + "epoch": 0.37, + "learning_rate": 0.00018827704089755082, + "loss": 3.0737, + "step": 44700 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001880256730374454, + "loss": 3.0736, + "step": 44800 + }, + { + "epoch": 0.38, + "learning_rate": 0.00018777681885594105, + "loss": 3.0712, + "step": 44900 + }, + { + "epoch": 0.38, + "learning_rate": 0.00018752545099583567, + "loss": 3.0836, + "step": 45000 + }, + { + "epoch": 0.38, + "eval_accuracy": 0.42573460860188483, + "eval_loss": 3.0936806201934814, + "eval_runtime": 36.1343, + "eval_samples_per_second": 310.398, + "eval_steps_per_second": 2.601, + "step": 45000 + }, + { + "epoch": 0.38, + "learning_rate": 0.00018727408313573023, + "loss": 3.0763, + "step": 45100 + }, + { + "epoch": 0.38, + "learning_rate": 0.00018702271527562485, + "loss": 3.0831, + "step": 45200 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001867713474155194, + "loss": 3.0768, + "step": 45300 + }, + { + "epoch": 0.38, + "learning_rate": 0.00018651997955541403, + "loss": 3.0686, + "step": 45400 + }, + { + "epoch": 0.38, + "learning_rate": 0.00018626861169530862, + "loss": 3.0766, + "step": 45500 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001860172438352032, + "loss": 3.0721, + "step": 45600 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001857658759750978, + "loss": 3.0812, + "step": 45700 + }, + { + "epoch": 0.38, + "learning_rate": 0.00018551450811499242, + "loss": 3.0853, + "step": 45800 + }, + { + "epoch": 0.38, + "learning_rate": 0.000185263140254887, + "loss": 3.0753, + "step": 45900 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001850117723947816, + "loss": 3.0744, + "step": 46000 + }, + { + "epoch": 0.38, + "eval_accuracy": 0.42582517101553463, + "eval_loss": 3.092123508453369, + "eval_runtime": 36.2715, + "eval_samples_per_second": 309.224, + "eval_steps_per_second": 2.592, + "step": 46000 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018476040453467617, + "loss": 3.077, + "step": 46100 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001845090366745708, + "loss": 3.0822, + "step": 46200 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018425766881446535, + "loss": 3.0791, + "step": 46300 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018400630095435997, + "loss": 3.0776, + "step": 46400 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018375493309425456, + "loss": 3.0781, + "step": 46500 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018350356523414913, + "loss": 3.0756, + "step": 46600 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018325219737404374, + "loss": 3.0739, + "step": 46700 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001830008295139383, + "loss": 3.0697, + "step": 46800 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018274946165383293, + "loss": 3.0747, + "step": 46900 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018249809379372752, + "loss": 3.0692, + "step": 47000 + }, + { + "epoch": 0.39, + "eval_accuracy": 0.42626464443371115, + "eval_loss": 3.090735912322998, + "eval_runtime": 36.0323, + "eval_samples_per_second": 311.276, + "eval_steps_per_second": 2.609, + "step": 47000 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001822467259336221, + "loss": 3.0701, + "step": 47100 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001819953580735167, + "loss": 3.0706, + "step": 47200 + }, + { + "epoch": 0.4, + "learning_rate": 0.00018174399021341132, + "loss": 3.0734, + "step": 47300 + }, + { + "epoch": 0.4, + "learning_rate": 0.00018149262235330588, + "loss": 3.0719, + "step": 47400 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001812412544932005, + "loss": 3.07, + "step": 47500 + }, + { + "epoch": 0.4, + "learning_rate": 0.00018098988663309507, + "loss": 3.0743, + "step": 47600 + }, + { + "epoch": 0.4, + "learning_rate": 0.00018074103245159073, + "loss": 3.0768, + "step": 47700 + }, + { + "epoch": 0.4, + "learning_rate": 0.00018048966459148532, + "loss": 3.0598, + "step": 47800 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001802382967313799, + "loss": 3.0653, + "step": 47900 + }, + { + "epoch": 0.4, + "learning_rate": 0.00017998692887127453, + "loss": 3.0717, + "step": 48000 + }, + { + "epoch": 0.4, + "eval_accuracy": 0.42618812270434814, + "eval_loss": 3.0900797843933105, + "eval_runtime": 36.3, + "eval_samples_per_second": 308.981, + "eval_steps_per_second": 2.59, + "step": 48000 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001797355610111691, + "loss": 3.0752, + "step": 48100 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001794841931510637, + "loss": 3.0656, + "step": 48200 + }, + { + "epoch": 0.4, + "learning_rate": 0.00017923282529095827, + "loss": 3.0758, + "step": 48300 + }, + { + "epoch": 0.4, + "learning_rate": 0.00017898145743085287, + "loss": 3.0827, + "step": 48400 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017873008957074746, + "loss": 3.068, + "step": 48500 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017847872171064205, + "loss": 3.0645, + "step": 48600 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017822735385053667, + "loss": 3.0752, + "step": 48700 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017797598599043123, + "loss": 3.0726, + "step": 48800 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017772461813032585, + "loss": 3.0736, + "step": 48900 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017747325027022041, + "loss": 3.0697, + "step": 49000 + }, + { + "epoch": 0.41, + "eval_accuracy": 0.42656862524852013, + "eval_loss": 3.0877325534820557, + "eval_runtime": 37.2501, + "eval_samples_per_second": 301.1, + "eval_steps_per_second": 2.523, + "step": 49000 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017722188241011503, + "loss": 3.0779, + "step": 49100 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017697051455000962, + "loss": 3.0736, + "step": 49200 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017671914668990421, + "loss": 3.0657, + "step": 49300 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001764677788297988, + "loss": 3.065, + "step": 49400 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017621641096969342, + "loss": 3.0683, + "step": 49500 + }, + { + "epoch": 0.41, + "learning_rate": 0.000175965043109588, + "loss": 3.0656, + "step": 49600 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001757136752494826, + "loss": 3.0714, + "step": 49700 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017546482106797824, + "loss": 3.0804, + "step": 49800 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017521345320787283, + "loss": 3.0636, + "step": 49900 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017496208534776742, + "loss": 3.0689, + "step": 50000 + }, + { + "epoch": 0.42, + "eval_accuracy": 0.426702713783459, + "eval_loss": 3.0857808589935303, + "eval_runtime": 36.1585, + "eval_samples_per_second": 310.189, + "eval_steps_per_second": 2.6, + "step": 50000 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017471071748766202, + "loss": 3.0627, + "step": 50100 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017445934962755663, + "loss": 3.0655, + "step": 50200 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001742079817674512, + "loss": 3.0711, + "step": 50300 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001739566139073458, + "loss": 3.0684, + "step": 50400 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017370775972584142, + "loss": 3.066, + "step": 50500 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017345639186573604, + "loss": 3.0587, + "step": 50600 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001732050240056306, + "loss": 3.0705, + "step": 50700 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017295365614552522, + "loss": 3.0652, + "step": 50800 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001727022882854198, + "loss": 3.0718, + "step": 50900 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001724509204253144, + "loss": 3.067, + "step": 51000 + }, + { + "epoch": 0.43, + "eval_accuracy": 0.42674553787053365, + "eval_loss": 3.08451247215271, + "eval_runtime": 37.357, + "eval_samples_per_second": 300.238, + "eval_steps_per_second": 2.516, + "step": 51000 + }, + { + "epoch": 0.43, + "learning_rate": 0.000172199552565209, + "loss": 3.0652, + "step": 51100 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001719481847051036, + "loss": 3.0697, + "step": 51200 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017169681684499818, + "loss": 3.0699, + "step": 51300 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001714454489848928, + "loss": 3.0656, + "step": 51400 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017119408112478736, + "loss": 3.0579, + "step": 51500 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017094271326468198, + "loss": 3.0586, + "step": 51600 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017069134540457655, + "loss": 3.0725, + "step": 51700 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017043997754447116, + "loss": 3.0713, + "step": 51800 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017018860968436573, + "loss": 3.0674, + "step": 51900 + }, + { + "epoch": 0.43, + "learning_rate": 0.00016993724182426035, + "loss": 3.0635, + "step": 52000 + }, + { + "epoch": 0.43, + "eval_accuracy": 0.4271583339885653, + "eval_loss": 3.082775115966797, + "eval_runtime": 36.4468, + "eval_samples_per_second": 307.736, + "eval_steps_per_second": 2.579, + "step": 52000 + }, + { + "epoch": 0.44, + "learning_rate": 0.00016968587396415494, + "loss": 3.0589, + "step": 52100 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001694345061040495, + "loss": 3.0656, + "step": 52200 + }, + { + "epoch": 0.44, + "learning_rate": 0.00016918313824394412, + "loss": 3.0657, + "step": 52300 + }, + { + "epoch": 0.44, + "learning_rate": 0.00016893177038383868, + "loss": 3.0622, + "step": 52400 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001686804025237333, + "loss": 3.0627, + "step": 52500 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001684290346636279, + "loss": 3.063, + "step": 52600 + }, + { + "epoch": 0.44, + "learning_rate": 0.00016817766680352249, + "loss": 3.0637, + "step": 52700 + }, + { + "epoch": 0.44, + "learning_rate": 0.00016792629894341708, + "loss": 3.0639, + "step": 52800 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001676774447619127, + "loss": 3.0639, + "step": 52900 + }, + { + "epoch": 0.44, + "learning_rate": 0.00016742607690180733, + "loss": 3.0678, + "step": 53000 + }, + { + "epoch": 0.44, + "eval_accuracy": 0.4273408628842935, + "eval_loss": 3.0823299884796143, + "eval_runtime": 36.1917, + "eval_samples_per_second": 309.906, + "eval_steps_per_second": 2.597, + "step": 53000 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001671747090417019, + "loss": 3.0582, + "step": 53100 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001669233411815965, + "loss": 3.0708, + "step": 53200 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001666719733214911, + "loss": 3.0692, + "step": 53300 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001664206054613857, + "loss": 3.0671, + "step": 53400 + }, + { + "epoch": 0.45, + "learning_rate": 0.00016616923760128029, + "loss": 3.0662, + "step": 53500 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001659178697411749, + "loss": 3.0653, + "step": 53600 + }, + { + "epoch": 0.45, + "learning_rate": 0.00016566650188106947, + "loss": 3.0669, + "step": 53700 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001654151340209641, + "loss": 3.0552, + "step": 53800 + }, + { + "epoch": 0.45, + "learning_rate": 0.00016516376616085865, + "loss": 3.0569, + "step": 53900 + }, + { + "epoch": 0.45, + "learning_rate": 0.00016491239830075327, + "loss": 3.067, + "step": 54000 + }, + { + "epoch": 0.45, + "eval_accuracy": 0.4276448436991025, + "eval_loss": 3.0794825553894043, + "eval_runtime": 36.2802, + "eval_samples_per_second": 309.15, + "eval_steps_per_second": 2.591, + "step": 54000 + }, + { + "epoch": 0.45, + "learning_rate": 0.00016466103044064783, + "loss": 3.0623, + "step": 54100 + }, + { + "epoch": 0.45, + "learning_rate": 0.00016440966258054242, + "loss": 3.0612, + "step": 54200 + }, + { + "epoch": 0.45, + "learning_rate": 0.00016415829472043704, + "loss": 3.0588, + "step": 54300 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001639069268603316, + "loss": 3.064, + "step": 54400 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016365555900022623, + "loss": 3.0564, + "step": 54500 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001634041911401208, + "loss": 3.0605, + "step": 54600 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016315533695861645, + "loss": 3.0591, + "step": 54700 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016290396909851104, + "loss": 3.0639, + "step": 54800 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016265260123840563, + "loss": 3.0612, + "step": 54900 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016240123337830025, + "loss": 3.0597, + "step": 55000 + }, + { + "epoch": 0.46, + "eval_accuracy": 0.4277283857706089, + "eval_loss": 3.078927516937256, + "eval_runtime": 36.9604, + "eval_samples_per_second": 303.46, + "eval_steps_per_second": 2.543, + "step": 55000 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016214986551819482, + "loss": 3.0607, + "step": 55100 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016190101133669048, + "loss": 3.0505, + "step": 55200 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016164964347658507, + "loss": 3.0628, + "step": 55300 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016139827561647966, + "loss": 3.0592, + "step": 55400 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016114690775637428, + "loss": 3.0488, + "step": 55500 + }, + { + "epoch": 0.47, + "learning_rate": 0.00016089553989626884, + "loss": 3.0533, + "step": 55600 + }, + { + "epoch": 0.47, + "learning_rate": 0.00016064417203616346, + "loss": 3.0666, + "step": 55700 + }, + { + "epoch": 0.47, + "learning_rate": 0.00016039280417605803, + "loss": 3.0596, + "step": 55800 + }, + { + "epoch": 0.47, + "learning_rate": 0.00016014143631595264, + "loss": 3.0604, + "step": 55900 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001598900684558472, + "loss": 3.0648, + "step": 56000 + }, + { + "epoch": 0.47, + "eval_accuracy": 0.4278596661686904, + "eval_loss": 3.0768725872039795, + "eval_runtime": 37.0258, + "eval_samples_per_second": 302.924, + "eval_steps_per_second": 2.539, + "step": 56000 + }, + { + "epoch": 0.47, + "learning_rate": 0.00015963870059574183, + "loss": 3.0614, + "step": 56100 + }, + { + "epoch": 0.47, + "learning_rate": 0.00015938733273563642, + "loss": 3.0541, + "step": 56200 + }, + { + "epoch": 0.47, + "learning_rate": 0.00015913596487553098, + "loss": 3.0595, + "step": 56300 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001588845970154256, + "loss": 3.0624, + "step": 56400 + }, + { + "epoch": 0.47, + "learning_rate": 0.00015863322915532016, + "loss": 3.055, + "step": 56500 + }, + { + "epoch": 0.47, + "learning_rate": 0.00015838186129521478, + "loss": 3.0585, + "step": 56600 + }, + { + "epoch": 0.47, + "learning_rate": 0.00015813049343510937, + "loss": 3.0555, + "step": 56700 + }, + { + "epoch": 0.48, + "learning_rate": 0.00015787912557500397, + "loss": 3.0501, + "step": 56800 + }, + { + "epoch": 0.48, + "learning_rate": 0.00015762775771489856, + "loss": 3.0667, + "step": 56900 + }, + { + "epoch": 0.48, + "learning_rate": 0.00015737638985479315, + "loss": 3.0681, + "step": 57000 + }, + { + "epoch": 0.48, + "eval_accuracy": 0.42812924730699675, + "eval_loss": 3.075896739959717, + "eval_runtime": 36.4669, + "eval_samples_per_second": 307.567, + "eval_steps_per_second": 2.578, + "step": 57000 + }, + { + "epoch": 0.48, + "learning_rate": 0.00015712502199468774, + "loss": 3.0554, + "step": 57100 + }, + { + "epoch": 0.48, + "learning_rate": 0.00015687365413458236, + "loss": 3.063, + "step": 57200 + }, + { + "epoch": 0.48, + "learning_rate": 0.00015662228627447692, + "loss": 3.0611, + "step": 57300 + }, + { + "epoch": 0.48, + "learning_rate": 0.00015637091841437154, + "loss": 3.0647, + "step": 57400 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001561195505542661, + "loss": 3.0552, + "step": 57500 + }, + { + "epoch": 0.48, + "learning_rate": 0.00015586818269416072, + "loss": 3.0629, + "step": 57600 + }, + { + "epoch": 0.48, + "learning_rate": 0.00015561932851265638, + "loss": 3.0619, + "step": 57700 + }, + { + "epoch": 0.48, + "learning_rate": 0.00015536796065255095, + "loss": 3.0531, + "step": 57800 + }, + { + "epoch": 0.48, + "learning_rate": 0.00015511659279244557, + "loss": 3.063, + "step": 57900 + }, + { + "epoch": 0.49, + "learning_rate": 0.00015486522493234013, + "loss": 3.0513, + "step": 58000 + }, + { + "epoch": 0.49, + "eval_accuracy": 0.42832300875015444, + "eval_loss": 3.0737130641937256, + "eval_runtime": 36.8692, + "eval_samples_per_second": 304.211, + "eval_steps_per_second": 2.55, + "step": 58000 + }, + { + "epoch": 0.49, + "learning_rate": 0.00015461385707223472, + "loss": 3.0546, + "step": 58100 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001543624892121293, + "loss": 3.0545, + "step": 58200 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001541111213520239, + "loss": 3.0543, + "step": 58300 + }, + { + "epoch": 0.49, + "learning_rate": 0.00015385975349191852, + "loss": 3.0525, + "step": 58400 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001536083856318131, + "loss": 3.0533, + "step": 58500 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001533570177717077, + "loss": 3.0616, + "step": 58600 + }, + { + "epoch": 0.49, + "learning_rate": 0.00015310564991160227, + "loss": 3.0542, + "step": 58700 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001528542820514969, + "loss": 3.0543, + "step": 58800 + }, + { + "epoch": 0.49, + "learning_rate": 0.00015260291419139148, + "loss": 3.0603, + "step": 58900 + }, + { + "epoch": 0.49, + "learning_rate": 0.00015235154633128607, + "loss": 3.0566, + "step": 59000 + }, + { + "epoch": 0.49, + "eval_accuracy": 0.42880530625540564, + "eval_loss": 3.0726654529571533, + "eval_runtime": 36.7531, + "eval_samples_per_second": 305.171, + "eval_steps_per_second": 2.558, + "step": 59000 + }, + { + "epoch": 0.49, + "learning_rate": 0.00015210017847118066, + "loss": 3.0475, + "step": 59100 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015184881061107528, + "loss": 3.0545, + "step": 59200 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015159744275096984, + "loss": 3.0616, + "step": 59300 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015134607489086446, + "loss": 3.0503, + "step": 59400 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015109470703075903, + "loss": 3.0462, + "step": 59500 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015084333917065365, + "loss": 3.0586, + "step": 59600 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015059448498914928, + "loss": 3.0502, + "step": 59700 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015034563080764494, + "loss": 3.0535, + "step": 59800 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001500942629475395, + "loss": 3.0608, + "step": 59900 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001498428950874341, + "loss": 3.0552, + "step": 60000 + }, + { + "epoch": 0.5, + "eval_accuracy": 0.42880671032383433, + "eval_loss": 3.071218967437744, + "eval_runtime": 36.2431, + "eval_samples_per_second": 309.466, + "eval_steps_per_second": 2.594, + "step": 60000 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001495915272273287, + "loss": 3.0546, + "step": 60100 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001493401593672233, + "loss": 3.053, + "step": 60200 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001490887915071179, + "loss": 3.0506, + "step": 60300 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001488374236470125, + "loss": 3.0562, + "step": 60400 + }, + { + "epoch": 0.51, + "learning_rate": 0.00014858605578690708, + "loss": 3.0572, + "step": 60500 + }, + { + "epoch": 0.51, + "learning_rate": 0.00014833468792680167, + "loss": 3.0568, + "step": 60600 + }, + { + "epoch": 0.51, + "learning_rate": 0.00014808332006669626, + "loss": 3.0571, + "step": 60700 + }, + { + "epoch": 0.51, + "learning_rate": 0.00014783195220659085, + "loss": 3.0483, + "step": 60800 + }, + { + "epoch": 0.51, + "learning_rate": 0.00014758058434648544, + "loss": 3.0486, + "step": 60900 + }, + { + "epoch": 0.51, + "learning_rate": 0.00014732921648638004, + "loss": 3.0457, + "step": 61000 + }, + { + "epoch": 0.51, + "eval_accuracy": 0.42915000505464634, + "eval_loss": 3.0692341327667236, + "eval_runtime": 36.4055, + "eval_samples_per_second": 308.086, + "eval_steps_per_second": 2.582, + "step": 61000 + }, + { + "epoch": 0.51, + "learning_rate": 0.00014707784862627463, + "loss": 3.0448, + "step": 61100 + }, + { + "epoch": 0.51, + "learning_rate": 0.00014682648076616922, + "loss": 3.0498, + "step": 61200 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001465751129060638, + "loss": 3.0505, + "step": 61300 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001463237450459584, + "loss": 3.0531, + "step": 61400 + }, + { + "epoch": 0.51, + "learning_rate": 0.000146072377185853, + "loss": 3.0526, + "step": 61500 + }, + { + "epoch": 0.52, + "learning_rate": 0.00014582100932574758, + "loss": 3.0585, + "step": 61600 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001455696414656422, + "loss": 3.0519, + "step": 61700 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001453182736055368, + "loss": 3.0545, + "step": 61800 + }, + { + "epoch": 0.52, + "learning_rate": 0.00014506690574543138, + "loss": 3.0521, + "step": 61900 + }, + { + "epoch": 0.52, + "learning_rate": 0.00014481553788532598, + "loss": 3.0425, + "step": 62000 + }, + { + "epoch": 0.52, + "eval_accuracy": 0.4291008626596426, + "eval_loss": 3.0679004192352295, + "eval_runtime": 36.1636, + "eval_samples_per_second": 310.146, + "eval_steps_per_second": 2.599, + "step": 62000 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001445666837038216, + "loss": 3.0616, + "step": 62100 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001443153158437162, + "loss": 3.057, + "step": 62200 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001440639479836108, + "loss": 3.052, + "step": 62300 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001438125801235054, + "loss": 3.0501, + "step": 62400 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001435612122634, + "loss": 3.0457, + "step": 62500 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001433098444032946, + "loss": 3.0506, + "step": 62600 + }, + { + "epoch": 0.52, + "learning_rate": 0.00014305847654318918, + "loss": 3.0478, + "step": 62700 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014280710868308378, + "loss": 3.0545, + "step": 62800 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014255574082297834, + "loss": 3.0554, + "step": 62900 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014230437296287296, + "loss": 3.0573, + "step": 63000 + }, + { + "epoch": 0.53, + "eval_accuracy": 0.42917527828636254, + "eval_loss": 3.0663866996765137, + "eval_runtime": 36.3952, + "eval_samples_per_second": 308.172, + "eval_steps_per_second": 2.583, + "step": 63000 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014205300510276755, + "loss": 3.0485, + "step": 63100 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014180163724266214, + "loss": 3.0476, + "step": 63200 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014155026938255673, + "loss": 3.0442, + "step": 63300 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014129890152245132, + "loss": 3.0486, + "step": 63400 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014104753366234592, + "loss": 3.0384, + "step": 63500 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001407961658022405, + "loss": 3.0539, + "step": 63600 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001405447979421351, + "loss": 3.0429, + "step": 63700 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001402934300820297, + "loss": 3.0444, + "step": 63800 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014004457590052535, + "loss": 3.0489, + "step": 63900 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013979320804041994, + "loss": 3.0555, + "step": 64000 + }, + { + "epoch": 0.54, + "eval_accuracy": 0.42978183584755186, + "eval_loss": 3.0650320053100586, + "eval_runtime": 37.0145, + "eval_samples_per_second": 303.016, + "eval_steps_per_second": 2.54, + "step": 64000 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013954184018031453, + "loss": 3.0507, + "step": 64100 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013929047232020912, + "loss": 3.0453, + "step": 64200 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013903910446010372, + "loss": 3.0495, + "step": 64300 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001387877365999983, + "loss": 3.0446, + "step": 64400 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001385363687398929, + "loss": 3.0488, + "step": 64500 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013828500087978752, + "loss": 3.0498, + "step": 64600 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001380336330196821, + "loss": 3.0441, + "step": 64700 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013778226515957667, + "loss": 3.0435, + "step": 64800 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013753089729947126, + "loss": 3.0517, + "step": 64900 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013727952943936585, + "loss": 3.0421, + "step": 65000 + }, + { + "epoch": 0.54, + "eval_accuracy": 0.4294195861929527, + "eval_loss": 3.0636541843414307, + "eval_runtime": 36.752, + "eval_samples_per_second": 305.181, + "eval_steps_per_second": 2.558, + "step": 65000 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013702816157926045, + "loss": 3.0412, + "step": 65100 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013677679371915506, + "loss": 3.0548, + "step": 65200 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013652542585904966, + "loss": 3.0409, + "step": 65300 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013627405799894425, + "loss": 3.0377, + "step": 65400 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013602269013883884, + "loss": 3.0429, + "step": 65500 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013577383595733447, + "loss": 3.0467, + "step": 65600 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013552498177583013, + "loss": 3.0496, + "step": 65700 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013527361391572472, + "loss": 3.0424, + "step": 65800 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013502224605561932, + "loss": 3.043, + "step": 65900 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001347708781955139, + "loss": 3.0496, + "step": 66000 + }, + { + "epoch": 0.55, + "eval_accuracy": 0.42957333168589307, + "eval_loss": 3.062688112258911, + "eval_runtime": 36.3303, + "eval_samples_per_second": 308.723, + "eval_steps_per_second": 2.587, + "step": 66000 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001345195103354085, + "loss": 3.0434, + "step": 66100 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001342681424753031, + "loss": 3.0392, + "step": 66200 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013401677461519768, + "loss": 3.041, + "step": 66300 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013376540675509227, + "loss": 3.0526, + "step": 66400 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001335140388949869, + "loss": 3.046, + "step": 66500 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013326267103488148, + "loss": 3.0398, + "step": 66600 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013301130317477607, + "loss": 3.0473, + "step": 66700 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013275993531467066, + "loss": 3.0368, + "step": 66800 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013250856745456523, + "loss": 3.0427, + "step": 66900 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013225719959445985, + "loss": 3.0415, + "step": 67000 + }, + { + "epoch": 0.56, + "eval_accuracy": 0.4300071888303548, + "eval_loss": 3.060805320739746, + "eval_runtime": 37.0174, + "eval_samples_per_second": 302.993, + "eval_steps_per_second": 2.539, + "step": 67000 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013200583173435444, + "loss": 3.0429, + "step": 67100 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013175446387424903, + "loss": 3.0494, + "step": 67200 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013150309601414362, + "loss": 3.0384, + "step": 67300 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001312517281540382, + "loss": 3.0438, + "step": 67400 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001310003602939328, + "loss": 3.0427, + "step": 67500 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001307489924338274, + "loss": 3.0447, + "step": 67600 + }, + { + "epoch": 0.57, + "learning_rate": 0.00013050013825232306, + "loss": 3.0438, + "step": 67700 + }, + { + "epoch": 0.57, + "learning_rate": 0.00013024877039221765, + "loss": 3.0403, + "step": 67800 + }, + { + "epoch": 0.57, + "learning_rate": 0.00012999740253211224, + "loss": 3.0478, + "step": 67900 + }, + { + "epoch": 0.57, + "learning_rate": 0.00012974603467200683, + "loss": 3.0412, + "step": 68000 + }, + { + "epoch": 0.57, + "eval_accuracy": 0.4298436148584137, + "eval_loss": 3.0598626136779785, + "eval_runtime": 36.2351, + "eval_samples_per_second": 309.534, + "eval_steps_per_second": 2.594, + "step": 68000 + }, + { + "epoch": 0.57, + "learning_rate": 0.00012949466681190142, + "loss": 3.0411, + "step": 68100 + }, + { + "epoch": 0.57, + "learning_rate": 0.000129243298951796, + "loss": 3.035, + "step": 68200 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001289919310916906, + "loss": 3.0464, + "step": 68300 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001287405632315852, + "loss": 3.0369, + "step": 68400 + }, + { + "epoch": 0.57, + "learning_rate": 0.00012848919537147979, + "loss": 3.0428, + "step": 68500 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001282378275113744, + "loss": 3.0436, + "step": 68600 + }, + { + "epoch": 0.57, + "learning_rate": 0.000127986459651269, + "loss": 3.0454, + "step": 68700 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012773509179116356, + "loss": 3.0361, + "step": 68800 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012748372393105815, + "loss": 3.0437, + "step": 68900 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012723235607095274, + "loss": 3.0373, + "step": 69000 + }, + { + "epoch": 0.58, + "eval_accuracy": 0.4302465824974446, + "eval_loss": 3.057598829269409, + "eval_runtime": 36.2031, + "eval_samples_per_second": 309.808, + "eval_steps_per_second": 2.596, + "step": 69000 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001269835018894484, + "loss": 3.0426, + "step": 69100 + }, + { + "epoch": 0.58, + "learning_rate": 0.000126732134029343, + "loss": 3.041, + "step": 69200 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001264807661692376, + "loss": 3.036, + "step": 69300 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012622939830913218, + "loss": 3.0396, + "step": 69400 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012597803044902677, + "loss": 3.0418, + "step": 69500 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012572666258892136, + "loss": 3.0335, + "step": 69600 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012547529472881595, + "loss": 3.0334, + "step": 69700 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012522392686871054, + "loss": 3.0381, + "step": 69800 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012497255900860516, + "loss": 3.0393, + "step": 69900 + }, + { + "epoch": 0.59, + "learning_rate": 0.00012472119114849975, + "loss": 3.0393, + "step": 70000 + }, + { + "epoch": 0.59, + "eval_accuracy": 0.43052950228582343, + "eval_loss": 3.05704665184021, + "eval_runtime": 36.208, + "eval_samples_per_second": 309.765, + "eval_steps_per_second": 2.596, + "step": 70000 + }, + { + "epoch": 0.59, + "learning_rate": 0.00012446982328839434, + "loss": 3.0383, + "step": 70100 + }, + { + "epoch": 0.59, + "learning_rate": 0.00012421845542828894, + "loss": 3.0441, + "step": 70200 + }, + { + "epoch": 0.59, + "learning_rate": 0.00012396708756818353, + "loss": 3.0388, + "step": 70300 + }, + { + "epoch": 0.59, + "learning_rate": 0.00012371823338667916, + "loss": 3.0403, + "step": 70400 + }, + { + "epoch": 0.59, + "learning_rate": 0.00012346686552657378, + "loss": 3.0368, + "step": 70500 + }, + { + "epoch": 0.59, + "learning_rate": 0.00012321549766646837, + "loss": 3.0405, + "step": 70600 + }, + { + "epoch": 0.59, + "learning_rate": 0.00012296412980636296, + "loss": 3.0351, + "step": 70700 + }, + { + "epoch": 0.59, + "learning_rate": 0.0001227152756248586, + "loss": 3.0355, + "step": 70800 + }, + { + "epoch": 0.59, + "learning_rate": 0.0001224639077647532, + "loss": 3.038, + "step": 70900 + }, + { + "epoch": 0.59, + "learning_rate": 0.00012221253990464778, + "loss": 3.0312, + "step": 71000 + }, + { + "epoch": 0.59, + "eval_accuracy": 0.43072256169476675, + "eval_loss": 3.056051254272461, + "eval_runtime": 35.9605, + "eval_samples_per_second": 311.897, + "eval_steps_per_second": 2.614, + "step": 71000 + }, + { + "epoch": 0.59, + "learning_rate": 0.00012196117204454238, + "loss": 3.0336, + "step": 71100 + }, + { + "epoch": 0.6, + "learning_rate": 0.00012170980418443696, + "loss": 3.0371, + "step": 71200 + }, + { + "epoch": 0.6, + "learning_rate": 0.00012145843632433155, + "loss": 3.0415, + "step": 71300 + }, + { + "epoch": 0.6, + "learning_rate": 0.00012120706846422614, + "loss": 3.033, + "step": 71400 + }, + { + "epoch": 0.6, + "learning_rate": 0.00012095570060412075, + "loss": 3.0401, + "step": 71500 + }, + { + "epoch": 0.6, + "learning_rate": 0.00012070433274401534, + "loss": 3.0407, + "step": 71600 + }, + { + "epoch": 0.6, + "learning_rate": 0.00012045296488390993, + "loss": 3.0389, + "step": 71700 + }, + { + "epoch": 0.6, + "learning_rate": 0.00012020159702380452, + "loss": 3.0326, + "step": 71800 + }, + { + "epoch": 0.6, + "learning_rate": 0.00011995022916369911, + "loss": 3.0343, + "step": 71900 + }, + { + "epoch": 0.6, + "learning_rate": 0.00011969886130359372, + "loss": 3.0397, + "step": 72000 + }, + { + "epoch": 0.6, + "eval_accuracy": 0.43072607186583844, + "eval_loss": 3.0532803535461426, + "eval_runtime": 36.5519, + "eval_samples_per_second": 306.851, + "eval_steps_per_second": 2.572, + "step": 72000 + }, + { + "epoch": 0.6, + "learning_rate": 0.00011945000712208935, + "loss": 3.041, + "step": 72100 + }, + { + "epoch": 0.6, + "learning_rate": 0.00011919863926198396, + "loss": 3.0375, + "step": 72200 + }, + { + "epoch": 0.6, + "learning_rate": 0.00011894727140187855, + "loss": 3.03, + "step": 72300 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011869590354177314, + "loss": 3.0314, + "step": 72400 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011844453568166773, + "loss": 3.0399, + "step": 72500 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011819316782156232, + "loss": 3.043, + "step": 72600 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011794179996145693, + "loss": 3.0324, + "step": 72700 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011769043210135152, + "loss": 3.037, + "step": 72800 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011743906424124611, + "loss": 3.0391, + "step": 72900 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011718769638114069, + "loss": 3.0303, + "step": 73000 + }, + { + "epoch": 0.61, + "eval_accuracy": 0.43111219068372514, + "eval_loss": 3.0526981353759766, + "eval_runtime": 36.4926, + "eval_samples_per_second": 307.35, + "eval_steps_per_second": 2.576, + "step": 73000 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011693632852103528, + "loss": 3.0329, + "step": 73100 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011668496066092987, + "loss": 3.0346, + "step": 73200 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011643359280082448, + "loss": 3.0405, + "step": 73300 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011618222494071907, + "loss": 3.0344, + "step": 73400 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011593085708061366, + "loss": 3.0389, + "step": 73500 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011567948922050825, + "loss": 3.0361, + "step": 73600 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011542812136040285, + "loss": 3.0329, + "step": 73700 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011517675350029745, + "loss": 3.0304, + "step": 73800 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011492538564019204, + "loss": 3.0316, + "step": 73900 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011467401778008663, + "loss": 3.0403, + "step": 74000 + }, + { + "epoch": 0.62, + "eval_accuracy": 0.43146250575668055, + "eval_loss": 3.0502421855926514, + "eval_runtime": 36.2647, + "eval_samples_per_second": 309.281, + "eval_steps_per_second": 2.592, + "step": 74000 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011442516359858228, + "loss": 3.0443, + "step": 74100 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011417379573847687, + "loss": 3.0376, + "step": 74200 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011392242787837146, + "loss": 3.0313, + "step": 74300 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011367106001826606, + "loss": 3.0429, + "step": 74400 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011341969215816065, + "loss": 3.0342, + "step": 74500 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011316832429805525, + "loss": 3.0335, + "step": 74600 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011291695643794984, + "loss": 3.0375, + "step": 74700 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011266558857784443, + "loss": 3.0247, + "step": 74800 + }, + { + "epoch": 0.63, + "learning_rate": 0.000112414220717739, + "loss": 3.0309, + "step": 74900 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011216285285763361, + "loss": 3.0326, + "step": 75000 + }, + { + "epoch": 0.63, + "eval_accuracy": 0.43156359868354544, + "eval_loss": 3.049257278442383, + "eval_runtime": 36.2389, + "eval_samples_per_second": 309.501, + "eval_steps_per_second": 2.594, + "step": 75000 + }, + { + "epoch": 0.63, + "learning_rate": 0.0001119114849975282, + "loss": 3.0389, + "step": 75100 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011166011713742279, + "loss": 3.0309, + "step": 75200 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011141126295591844, + "loss": 3.0375, + "step": 75300 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011115989509581303, + "loss": 3.0351, + "step": 75400 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011090852723570762, + "loss": 3.0324, + "step": 75500 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011065715937560223, + "loss": 3.0369, + "step": 75600 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011040579151549682, + "loss": 3.0289, + "step": 75700 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011015442365539141, + "loss": 3.0346, + "step": 75800 + }, + { + "epoch": 0.63, + "learning_rate": 0.000109903055795286, + "loss": 3.0234, + "step": 75900 + }, + { + "epoch": 0.64, + "learning_rate": 0.0001096516879351806, + "loss": 3.0322, + "step": 76000 + }, + { + "epoch": 0.64, + "eval_accuracy": 0.4314962033989688, + "eval_loss": 3.0480940341949463, + "eval_runtime": 35.8603, + "eval_samples_per_second": 312.77, + "eval_steps_per_second": 2.621, + "step": 76000 + }, + { + "epoch": 0.64, + "learning_rate": 0.0001094003200750752, + "loss": 3.027, + "step": 76100 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010914895221496979, + "loss": 3.03, + "step": 76200 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010890009803346544, + "loss": 3.0236, + "step": 76300 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010864873017336003, + "loss": 3.0343, + "step": 76400 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010839736231325462, + "loss": 3.0335, + "step": 76500 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010814599445314921, + "loss": 3.0263, + "step": 76600 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010789462659304382, + "loss": 3.0269, + "step": 76700 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010764325873293841, + "loss": 3.0391, + "step": 76800 + }, + { + "epoch": 0.64, + "learning_rate": 0.000107391890872833, + "loss": 3.0361, + "step": 76900 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010714052301272759, + "loss": 3.0265, + "step": 77000 + }, + { + "epoch": 0.64, + "eval_accuracy": 0.4318640693272827, + "eval_loss": 3.0469460487365723, + "eval_runtime": 37.1071, + "eval_samples_per_second": 302.26, + "eval_steps_per_second": 2.533, + "step": 77000 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010688915515262217, + "loss": 3.0313, + "step": 77100 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010663778729251676, + "loss": 3.0319, + "step": 77200 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010638641943241136, + "loss": 3.0247, + "step": 77300 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010613505157230595, + "loss": 3.0264, + "step": 77400 + }, + { + "epoch": 0.65, + "learning_rate": 0.0001058861973908016, + "loss": 3.0262, + "step": 77500 + }, + { + "epoch": 0.65, + "learning_rate": 0.0001056348295306962, + "loss": 3.0327, + "step": 77600 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010538346167059079, + "loss": 3.0318, + "step": 77700 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010513209381048538, + "loss": 3.0356, + "step": 77800 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010488072595037997, + "loss": 3.0374, + "step": 77900 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010462935809027457, + "loss": 3.0231, + "step": 78000 + }, + { + "epoch": 0.65, + "eval_accuracy": 0.43201430464915136, + "eval_loss": 3.045305013656616, + "eval_runtime": 37.1474, + "eval_samples_per_second": 301.933, + "eval_steps_per_second": 2.53, + "step": 78000 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010437799023016916, + "loss": 3.0296, + "step": 78100 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010412662237006376, + "loss": 3.025, + "step": 78200 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010387525450995835, + "loss": 3.0329, + "step": 78300 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010362388664985294, + "loss": 3.0268, + "step": 78400 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010337251878974754, + "loss": 3.0259, + "step": 78500 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010312115092964213, + "loss": 3.0298, + "step": 78600 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010286978306953673, + "loss": 3.0296, + "step": 78700 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010261841520943132, + "loss": 3.0291, + "step": 78800 + }, + { + "epoch": 0.66, + "learning_rate": 0.0001023670473493259, + "loss": 3.0371, + "step": 78900 + }, + { + "epoch": 0.66, + "learning_rate": 0.0001021156794892205, + "loss": 3.0259, + "step": 79000 + }, + { + "epoch": 0.66, + "eval_accuracy": 0.43211188740494455, + "eval_loss": 3.044191837310791, + "eval_runtime": 37.3457, + "eval_samples_per_second": 300.329, + "eval_steps_per_second": 2.517, + "step": 79000 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010186431162911509, + "loss": 3.0266, + "step": 79100 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010161294376900968, + "loss": 3.0272, + "step": 79200 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010136157590890427, + "loss": 3.0191, + "step": 79300 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010111020804879886, + "loss": 3.0178, + "step": 79400 + }, + { + "epoch": 0.67, + "learning_rate": 0.00010085884018869347, + "loss": 3.0178, + "step": 79500 + }, + { + "epoch": 0.67, + "learning_rate": 0.0001006099860071891, + "loss": 3.0264, + "step": 79600 + }, + { + "epoch": 0.67, + "learning_rate": 0.00010036113182568475, + "loss": 3.0172, + "step": 79700 + }, + { + "epoch": 0.67, + "learning_rate": 0.00010010976396557934, + "loss": 3.0276, + "step": 79800 + }, + { + "epoch": 0.67, + "learning_rate": 9.985839610547395e-05, + "loss": 3.0254, + "step": 79900 + }, + { + "epoch": 0.67, + "learning_rate": 9.960702824536854e-05, + "loss": 3.0219, + "step": 80000 + }, + { + "epoch": 0.67, + "eval_accuracy": 0.43250292046233163, + "eval_loss": 3.0422935485839844, + "eval_runtime": 37.0202, + "eval_samples_per_second": 302.97, + "eval_steps_per_second": 2.539, + "step": 80000 + }, + { + "epoch": 0.67, + "learning_rate": 9.935566038526313e-05, + "loss": 3.0265, + "step": 80100 + }, + { + "epoch": 0.67, + "learning_rate": 9.910429252515772e-05, + "loss": 3.025, + "step": 80200 + }, + { + "epoch": 0.67, + "learning_rate": 9.885292466505231e-05, + "loss": 3.0164, + "step": 80300 + }, + { + "epoch": 0.67, + "learning_rate": 9.860155680494692e-05, + "loss": 3.0307, + "step": 80400 + }, + { + "epoch": 0.67, + "learning_rate": 9.835018894484151e-05, + "loss": 3.0268, + "step": 80500 + }, + { + "epoch": 0.67, + "learning_rate": 9.80988210847361e-05, + "loss": 3.0261, + "step": 80600 + }, + { + "epoch": 0.68, + "learning_rate": 9.784745322463069e-05, + "loss": 3.0213, + "step": 80700 + }, + { + "epoch": 0.68, + "learning_rate": 9.75960853645253e-05, + "loss": 3.0222, + "step": 80800 + }, + { + "epoch": 0.68, + "learning_rate": 9.734471750441989e-05, + "loss": 3.0249, + "step": 80900 + }, + { + "epoch": 0.68, + "learning_rate": 9.709334964431448e-05, + "loss": 3.0233, + "step": 81000 + }, + { + "epoch": 0.68, + "eval_accuracy": 0.4324165702539679, + "eval_loss": 3.0414962768554688, + "eval_runtime": 37.0887, + "eval_samples_per_second": 302.41, + "eval_steps_per_second": 2.534, + "step": 81000 + }, + { + "epoch": 0.68, + "learning_rate": 9.684198178420906e-05, + "loss": 3.0177, + "step": 81100 + }, + { + "epoch": 0.68, + "learning_rate": 9.659061392410365e-05, + "loss": 3.0309, + "step": 81200 + }, + { + "epoch": 0.68, + "learning_rate": 9.633924606399824e-05, + "loss": 3.0245, + "step": 81300 + }, + { + "epoch": 0.68, + "learning_rate": 9.608787820389284e-05, + "loss": 3.0287, + "step": 81400 + }, + { + "epoch": 0.68, + "learning_rate": 9.583651034378743e-05, + "loss": 3.0152, + "step": 81500 + }, + { + "epoch": 0.68, + "learning_rate": 9.558514248368203e-05, + "loss": 3.0204, + "step": 81600 + }, + { + "epoch": 0.68, + "learning_rate": 9.533377462357662e-05, + "loss": 3.0258, + "step": 81700 + }, + { + "epoch": 0.68, + "learning_rate": 9.508240676347121e-05, + "loss": 3.0255, + "step": 81800 + }, + { + "epoch": 0.69, + "learning_rate": 9.483103890336581e-05, + "loss": 3.0245, + "step": 81900 + }, + { + "epoch": 0.69, + "learning_rate": 9.45796710432604e-05, + "loss": 3.0261, + "step": 82000 + }, + { + "epoch": 0.69, + "eval_accuracy": 0.43273810192413537, + "eval_loss": 3.040773868560791, + "eval_runtime": 36.3004, + "eval_samples_per_second": 308.977, + "eval_steps_per_second": 2.59, + "step": 82000 + }, + { + "epoch": 0.69, + "learning_rate": 9.433081686175605e-05, + "loss": 3.0236, + "step": 82100 + }, + { + "epoch": 0.69, + "learning_rate": 9.407944900165064e-05, + "loss": 3.0339, + "step": 82200 + }, + { + "epoch": 0.69, + "learning_rate": 9.382808114154523e-05, + "loss": 3.021, + "step": 82300 + }, + { + "epoch": 0.69, + "learning_rate": 9.357671328143983e-05, + "loss": 3.0208, + "step": 82400 + }, + { + "epoch": 0.69, + "learning_rate": 9.332534542133443e-05, + "loss": 3.0175, + "step": 82500 + }, + { + "epoch": 0.69, + "learning_rate": 9.307397756122902e-05, + "loss": 3.0294, + "step": 82600 + }, + { + "epoch": 0.69, + "learning_rate": 9.282260970112361e-05, + "loss": 3.0258, + "step": 82700 + }, + { + "epoch": 0.69, + "learning_rate": 9.25712418410182e-05, + "loss": 3.0144, + "step": 82800 + }, + { + "epoch": 0.69, + "learning_rate": 9.231987398091278e-05, + "loss": 3.016, + "step": 82900 + }, + { + "epoch": 0.69, + "learning_rate": 9.206850612080737e-05, + "loss": 3.0221, + "step": 83000 + }, + { + "epoch": 0.69, + "eval_accuracy": 0.43296696507801, + "eval_loss": 3.038726806640625, + "eval_runtime": 36.1807, + "eval_samples_per_second": 309.999, + "eval_steps_per_second": 2.598, + "step": 83000 + }, + { + "epoch": 0.7, + "learning_rate": 9.181965193930304e-05, + "loss": 3.0217, + "step": 83100 + }, + { + "epoch": 0.7, + "learning_rate": 9.156828407919761e-05, + "loss": 3.0149, + "step": 83200 + }, + { + "epoch": 0.7, + "learning_rate": 9.131691621909222e-05, + "loss": 3.0247, + "step": 83300 + }, + { + "epoch": 0.7, + "learning_rate": 9.106554835898681e-05, + "loss": 3.021, + "step": 83400 + }, + { + "epoch": 0.7, + "learning_rate": 9.081669417748246e-05, + "loss": 3.0239, + "step": 83500 + }, + { + "epoch": 0.7, + "learning_rate": 9.056532631737705e-05, + "loss": 3.0349, + "step": 83600 + }, + { + "epoch": 0.7, + "learning_rate": 9.031395845727164e-05, + "loss": 3.026, + "step": 83700 + }, + { + "epoch": 0.7, + "learning_rate": 9.006259059716623e-05, + "loss": 3.0178, + "step": 83800 + }, + { + "epoch": 0.7, + "learning_rate": 8.981122273706082e-05, + "loss": 3.0249, + "step": 83900 + }, + { + "epoch": 0.7, + "learning_rate": 8.955985487695543e-05, + "loss": 3.0296, + "step": 84000 + }, + { + "epoch": 0.7, + "eval_accuracy": 0.43312211463937905, + "eval_loss": 3.0376861095428467, + "eval_runtime": 38.9475, + "eval_samples_per_second": 287.978, + "eval_steps_per_second": 2.414, + "step": 84000 + }, + { + "epoch": 0.7, + "learning_rate": 8.930848701685002e-05, + "loss": 3.0205, + "step": 84100 + }, + { + "epoch": 0.7, + "learning_rate": 8.905711915674461e-05, + "loss": 3.0214, + "step": 84200 + }, + { + "epoch": 0.71, + "learning_rate": 8.88057512966392e-05, + "loss": 3.0283, + "step": 84300 + }, + { + "epoch": 0.71, + "learning_rate": 8.85543834365338e-05, + "loss": 3.0163, + "step": 84400 + }, + { + "epoch": 0.71, + "learning_rate": 8.83030155764284e-05, + "loss": 3.02, + "step": 84500 + }, + { + "epoch": 0.71, + "learning_rate": 8.805164771632299e-05, + "loss": 3.0189, + "step": 84600 + }, + { + "epoch": 0.71, + "learning_rate": 8.780027985621758e-05, + "loss": 3.0167, + "step": 84700 + }, + { + "epoch": 0.71, + "learning_rate": 8.754891199611217e-05, + "loss": 3.0177, + "step": 84800 + }, + { + "epoch": 0.71, + "learning_rate": 8.729754413600678e-05, + "loss": 3.0226, + "step": 84900 + }, + { + "epoch": 0.71, + "learning_rate": 8.704617627590137e-05, + "loss": 3.0186, + "step": 85000 + }, + { + "epoch": 0.71, + "eval_accuracy": 0.4335391229626967, + "eval_loss": 3.03602933883667, + "eval_runtime": 36.1657, + "eval_samples_per_second": 310.128, + "eval_steps_per_second": 2.599, + "step": 85000 + }, + { + "epoch": 0.71, + "learning_rate": 8.679480841579594e-05, + "loss": 3.0144, + "step": 85100 + }, + { + "epoch": 0.71, + "learning_rate": 8.65459542342916e-05, + "loss": 3.0128, + "step": 85200 + }, + { + "epoch": 0.71, + "learning_rate": 8.629458637418618e-05, + "loss": 3.0189, + "step": 85300 + }, + { + "epoch": 0.71, + "learning_rate": 8.604321851408077e-05, + "loss": 3.0231, + "step": 85400 + }, + { + "epoch": 0.72, + "learning_rate": 8.579185065397537e-05, + "loss": 3.0161, + "step": 85500 + }, + { + "epoch": 0.72, + "learning_rate": 8.554048279386996e-05, + "loss": 3.0188, + "step": 85600 + }, + { + "epoch": 0.72, + "learning_rate": 8.528911493376456e-05, + "loss": 3.027, + "step": 85700 + }, + { + "epoch": 0.72, + "learning_rate": 8.503774707365915e-05, + "loss": 3.017, + "step": 85800 + }, + { + "epoch": 0.72, + "learning_rate": 8.478637921355374e-05, + "loss": 3.0173, + "step": 85900 + }, + { + "epoch": 0.72, + "learning_rate": 8.453501135344834e-05, + "loss": 3.0151, + "step": 86000 + }, + { + "epoch": 0.72, + "eval_accuracy": 0.43330745167196466, + "eval_loss": 3.034996747970581, + "eval_runtime": 36.1826, + "eval_samples_per_second": 309.983, + "eval_steps_per_second": 2.598, + "step": 86000 + }, + { + "epoch": 0.72, + "learning_rate": 8.428364349334294e-05, + "loss": 3.0227, + "step": 86100 + }, + { + "epoch": 0.72, + "learning_rate": 8.403227563323753e-05, + "loss": 3.0163, + "step": 86200 + }, + { + "epoch": 0.72, + "learning_rate": 8.378090777313212e-05, + "loss": 3.0096, + "step": 86300 + }, + { + "epoch": 0.72, + "learning_rate": 8.352953991302671e-05, + "loss": 3.0147, + "step": 86400 + }, + { + "epoch": 0.72, + "learning_rate": 8.32781720529213e-05, + "loss": 3.0051, + "step": 86500 + }, + { + "epoch": 0.72, + "learning_rate": 8.302680419281591e-05, + "loss": 3.0201, + "step": 86600 + }, + { + "epoch": 0.73, + "learning_rate": 8.277795001131154e-05, + "loss": 3.0169, + "step": 86700 + }, + { + "epoch": 0.73, + "learning_rate": 8.252658215120615e-05, + "loss": 3.008, + "step": 86800 + }, + { + "epoch": 0.73, + "learning_rate": 8.227521429110074e-05, + "loss": 3.0117, + "step": 86900 + }, + { + "epoch": 0.73, + "learning_rate": 8.202384643099533e-05, + "loss": 3.0121, + "step": 87000 + }, + { + "epoch": 0.73, + "eval_accuracy": 0.43354333516798277, + "eval_loss": 3.033334493637085, + "eval_runtime": 37.3178, + "eval_samples_per_second": 300.553, + "eval_steps_per_second": 2.519, + "step": 87000 + }, + { + "epoch": 0.73, + "learning_rate": 8.177247857088992e-05, + "loss": 3.014, + "step": 87100 + }, + { + "epoch": 0.73, + "learning_rate": 8.15211107107845e-05, + "loss": 3.0168, + "step": 87200 + }, + { + "epoch": 0.73, + "learning_rate": 8.126974285067909e-05, + "loss": 3.0092, + "step": 87300 + }, + { + "epoch": 0.73, + "learning_rate": 8.10183749905737e-05, + "loss": 3.0231, + "step": 87400 + }, + { + "epoch": 0.73, + "learning_rate": 8.076700713046829e-05, + "loss": 3.0133, + "step": 87500 + }, + { + "epoch": 0.73, + "learning_rate": 8.051563927036288e-05, + "loss": 3.0135, + "step": 87600 + }, + { + "epoch": 0.73, + "learning_rate": 8.026427141025747e-05, + "loss": 3.0188, + "step": 87700 + }, + { + "epoch": 0.73, + "learning_rate": 8.001290355015206e-05, + "loss": 3.0151, + "step": 87800 + }, + { + "epoch": 0.74, + "learning_rate": 7.976153569004667e-05, + "loss": 3.0211, + "step": 87900 + }, + { + "epoch": 0.74, + "learning_rate": 7.951016782994126e-05, + "loss": 3.0142, + "step": 88000 + }, + { + "epoch": 0.74, + "eval_accuracy": 0.4337988756220023, + "eval_loss": 3.032519817352295, + "eval_runtime": 37.5602, + "eval_samples_per_second": 298.614, + "eval_steps_per_second": 2.503, + "step": 88000 + }, + { + "epoch": 0.74, + "learning_rate": 7.925879996983585e-05, + "loss": 3.0117, + "step": 88100 + }, + { + "epoch": 0.74, + "learning_rate": 7.900743210973044e-05, + "loss": 3.0092, + "step": 88200 + }, + { + "epoch": 0.74, + "learning_rate": 7.875606424962505e-05, + "loss": 3.0124, + "step": 88300 + }, + { + "epoch": 0.74, + "learning_rate": 7.850469638951964e-05, + "loss": 3.0104, + "step": 88400 + }, + { + "epoch": 0.74, + "learning_rate": 7.825584220801528e-05, + "loss": 3.0136, + "step": 88500 + }, + { + "epoch": 0.74, + "learning_rate": 7.800447434790988e-05, + "loss": 3.0186, + "step": 88600 + }, + { + "epoch": 0.74, + "learning_rate": 7.775310648780447e-05, + "loss": 3.0107, + "step": 88700 + }, + { + "epoch": 0.74, + "learning_rate": 7.750173862769906e-05, + "loss": 3.0129, + "step": 88800 + }, + { + "epoch": 0.74, + "learning_rate": 7.725037076759365e-05, + "loss": 3.0117, + "step": 88900 + }, + { + "epoch": 0.74, + "learning_rate": 7.699900290748825e-05, + "loss": 3.0088, + "step": 89000 + }, + { + "epoch": 0.74, + "eval_accuracy": 0.4338164264773608, + "eval_loss": 3.031200647354126, + "eval_runtime": 36.6187, + "eval_samples_per_second": 306.292, + "eval_steps_per_second": 2.567, + "step": 89000 + }, + { + "epoch": 0.75, + "learning_rate": 7.674763504738283e-05, + "loss": 3.0107, + "step": 89100 + }, + { + "epoch": 0.75, + "learning_rate": 7.649626718727742e-05, + "loss": 3.0104, + "step": 89200 + }, + { + "epoch": 0.75, + "learning_rate": 7.624489932717202e-05, + "loss": 3.0141, + "step": 89300 + }, + { + "epoch": 0.75, + "learning_rate": 7.59935314670666e-05, + "loss": 3.0263, + "step": 89400 + }, + { + "epoch": 0.75, + "learning_rate": 7.57421636069612e-05, + "loss": 3.0093, + "step": 89500 + }, + { + "epoch": 0.75, + "learning_rate": 7.54907957468558e-05, + "loss": 3.0057, + "step": 89600 + }, + { + "epoch": 0.75, + "learning_rate": 7.52394278867504e-05, + "loss": 3.0104, + "step": 89700 + }, + { + "epoch": 0.75, + "learning_rate": 7.498806002664499e-05, + "loss": 3.0202, + "step": 89800 + }, + { + "epoch": 0.75, + "learning_rate": 7.473669216653958e-05, + "loss": 3.0118, + "step": 89900 + }, + { + "epoch": 0.75, + "learning_rate": 7.448532430643417e-05, + "loss": 3.0087, + "step": 90000 + }, + { + "epoch": 0.75, + "eval_accuracy": 0.43394138856751324, + "eval_loss": 3.0297725200653076, + "eval_runtime": 36.7079, + "eval_samples_per_second": 305.547, + "eval_steps_per_second": 2.561, + "step": 90000 + }, + { + "epoch": 0.75, + "learning_rate": 7.423395644632877e-05, + "loss": 3.0163, + "step": 90100 + }, + { + "epoch": 0.75, + "learning_rate": 7.398258858622336e-05, + "loss": 3.0168, + "step": 90200 + }, + { + "epoch": 0.76, + "learning_rate": 7.373122072611796e-05, + "loss": 3.0145, + "step": 90300 + }, + { + "epoch": 0.76, + "learning_rate": 7.347985286601255e-05, + "loss": 3.0112, + "step": 90400 + }, + { + "epoch": 0.76, + "learning_rate": 7.322848500590714e-05, + "loss": 3.0094, + "step": 90500 + }, + { + "epoch": 0.76, + "learning_rate": 7.297711714580173e-05, + "loss": 3.0129, + "step": 90600 + }, + { + "epoch": 0.76, + "learning_rate": 7.272574928569632e-05, + "loss": 3.0033, + "step": 90700 + }, + { + "epoch": 0.76, + "learning_rate": 7.247438142559092e-05, + "loss": 3.0115, + "step": 90800 + }, + { + "epoch": 0.76, + "learning_rate": 7.222301356548552e-05, + "loss": 3.0075, + "step": 90900 + }, + { + "epoch": 0.76, + "learning_rate": 7.197164570538011e-05, + "loss": 3.0134, + "step": 91000 + }, + { + "epoch": 0.76, + "eval_accuracy": 0.43423554090332145, + "eval_loss": 3.0285885334014893, + "eval_runtime": 36.591, + "eval_samples_per_second": 306.523, + "eval_steps_per_second": 2.569, + "step": 91000 + }, + { + "epoch": 0.76, + "learning_rate": 7.172279152387576e-05, + "loss": 3.019, + "step": 91100 + }, + { + "epoch": 0.76, + "learning_rate": 7.147142366377035e-05, + "loss": 3.0166, + "step": 91200 + }, + { + "epoch": 0.76, + "learning_rate": 7.122005580366494e-05, + "loss": 3.0114, + "step": 91300 + }, + { + "epoch": 0.76, + "learning_rate": 7.097120162216059e-05, + "loss": 3.015, + "step": 91400 + }, + { + "epoch": 0.77, + "learning_rate": 7.071983376205518e-05, + "loss": 3.0123, + "step": 91500 + }, + { + "epoch": 0.77, + "learning_rate": 7.046846590194977e-05, + "loss": 3.007, + "step": 91600 + }, + { + "epoch": 0.77, + "learning_rate": 7.021709804184436e-05, + "loss": 3.005, + "step": 91700 + }, + { + "epoch": 0.77, + "learning_rate": 6.996573018173895e-05, + "loss": 3.0122, + "step": 91800 + }, + { + "epoch": 0.77, + "learning_rate": 6.971436232163356e-05, + "loss": 3.0069, + "step": 91900 + }, + { + "epoch": 0.77, + "learning_rate": 6.946299446152815e-05, + "loss": 3.0136, + "step": 92000 + }, + { + "epoch": 0.77, + "eval_accuracy": 0.43437735181461806, + "eval_loss": 3.0268590450286865, + "eval_runtime": 36.7262, + "eval_samples_per_second": 305.395, + "eval_steps_per_second": 2.559, + "step": 92000 + }, + { + "epoch": 0.77, + "learning_rate": 6.921162660142274e-05, + "loss": 3.0063, + "step": 92100 + }, + { + "epoch": 0.77, + "learning_rate": 6.896025874131733e-05, + "loss": 3.007, + "step": 92200 + }, + { + "epoch": 0.77, + "learning_rate": 6.870889088121192e-05, + "loss": 3.0132, + "step": 92300 + }, + { + "epoch": 0.77, + "learning_rate": 6.845752302110651e-05, + "loss": 3.0145, + "step": 92400 + }, + { + "epoch": 0.77, + "learning_rate": 6.82061551610011e-05, + "loss": 3.0116, + "step": 92500 + }, + { + "epoch": 0.77, + "learning_rate": 6.79547873008957e-05, + "loss": 3.0138, + "step": 92600 + }, + { + "epoch": 0.78, + "learning_rate": 6.77034194407903e-05, + "loss": 3.0075, + "step": 92700 + }, + { + "epoch": 0.78, + "learning_rate": 6.745205158068489e-05, + "loss": 3.0098, + "step": 92800 + }, + { + "epoch": 0.78, + "learning_rate": 6.720068372057948e-05, + "loss": 3.0058, + "step": 92900 + }, + { + "epoch": 0.78, + "learning_rate": 6.694931586047407e-05, + "loss": 3.0043, + "step": 93000 + }, + { + "epoch": 0.78, + "eval_accuracy": 0.43468133262942704, + "eval_loss": 3.0255324840545654, + "eval_runtime": 36.1761, + "eval_samples_per_second": 310.039, + "eval_steps_per_second": 2.598, + "step": 93000 + }, + { + "epoch": 0.78, + "learning_rate": 6.669794800036866e-05, + "loss": 3.0167, + "step": 93100 + }, + { + "epoch": 0.78, + "learning_rate": 6.644658014026327e-05, + "loss": 3.0077, + "step": 93200 + }, + { + "epoch": 0.78, + "learning_rate": 6.619521228015785e-05, + "loss": 3.0087, + "step": 93300 + }, + { + "epoch": 0.78, + "learning_rate": 6.594384442005244e-05, + "loss": 3.0137, + "step": 93400 + }, + { + "epoch": 0.78, + "learning_rate": 6.569499023854809e-05, + "loss": 3.015, + "step": 93500 + }, + { + "epoch": 0.78, + "learning_rate": 6.544362237844268e-05, + "loss": 3.0046, + "step": 93600 + }, + { + "epoch": 0.78, + "learning_rate": 6.519225451833728e-05, + "loss": 3.0015, + "step": 93700 + }, + { + "epoch": 0.78, + "learning_rate": 6.494088665823187e-05, + "loss": 3.0074, + "step": 93800 + }, + { + "epoch": 0.79, + "learning_rate": 6.468951879812646e-05, + "loss": 3.0082, + "step": 93900 + }, + { + "epoch": 0.79, + "learning_rate": 6.443815093802106e-05, + "loss": 2.9995, + "step": 94000 + }, + { + "epoch": 0.79, + "eval_accuracy": 0.43484701270401116, + "eval_loss": 3.023953914642334, + "eval_runtime": 36.328, + "eval_samples_per_second": 308.742, + "eval_steps_per_second": 2.588, + "step": 94000 + }, + { + "epoch": 0.79, + "learning_rate": 6.418678307791566e-05, + "loss": 3.0039, + "step": 94100 + }, + { + "epoch": 0.79, + "learning_rate": 6.393541521781025e-05, + "loss": 3.0095, + "step": 94200 + }, + { + "epoch": 0.79, + "learning_rate": 6.368404735770483e-05, + "loss": 3.0028, + "step": 94300 + }, + { + "epoch": 0.79, + "learning_rate": 6.343267949759943e-05, + "loss": 3.0082, + "step": 94400 + }, + { + "epoch": 0.79, + "learning_rate": 6.318131163749403e-05, + "loss": 3.0069, + "step": 94500 + }, + { + "epoch": 0.79, + "learning_rate": 6.292994377738862e-05, + "loss": 3.0004, + "step": 94600 + }, + { + "epoch": 0.79, + "learning_rate": 6.267857591728321e-05, + "loss": 3.0087, + "step": 94700 + }, + { + "epoch": 0.79, + "learning_rate": 6.24272080571778e-05, + "loss": 3.0062, + "step": 94800 + }, + { + "epoch": 0.79, + "learning_rate": 6.21758401970724e-05, + "loss": 3.0113, + "step": 94900 + }, + { + "epoch": 0.79, + "learning_rate": 6.1924472336967e-05, + "loss": 3.001, + "step": 95000 + }, + { + "epoch": 0.79, + "eval_accuracy": 0.434945999528233, + "eval_loss": 3.0230536460876465, + "eval_runtime": 36.4523, + "eval_samples_per_second": 307.69, + "eval_steps_per_second": 2.579, + "step": 95000 + }, + { + "epoch": 0.8, + "learning_rate": 6.167310447686157e-05, + "loss": 3.0026, + "step": 95100 + }, + { + "epoch": 0.8, + "learning_rate": 6.142173661675618e-05, + "loss": 3.01, + "step": 95200 + }, + { + "epoch": 0.8, + "learning_rate": 6.117036875665077e-05, + "loss": 3.0073, + "step": 95300 + }, + { + "epoch": 0.8, + "learning_rate": 6.091900089654536e-05, + "loss": 3.0101, + "step": 95400 + }, + { + "epoch": 0.8, + "learning_rate": 6.067014671504101e-05, + "loss": 3.0013, + "step": 95500 + }, + { + "epoch": 0.8, + "learning_rate": 6.04187788549356e-05, + "loss": 3.004, + "step": 95600 + }, + { + "epoch": 0.8, + "learning_rate": 6.01674109948302e-05, + "loss": 3.0042, + "step": 95700 + }, + { + "epoch": 0.8, + "learning_rate": 5.991604313472479e-05, + "loss": 3.0081, + "step": 95800 + }, + { + "epoch": 0.8, + "learning_rate": 5.966467527461938e-05, + "loss": 3.0048, + "step": 95900 + }, + { + "epoch": 0.8, + "learning_rate": 5.941582109311503e-05, + "loss": 3.007, + "step": 96000 + }, + { + "epoch": 0.8, + "eval_accuracy": 0.4351959237085379, + "eval_loss": 3.02174973487854, + "eval_runtime": 36.367, + "eval_samples_per_second": 308.412, + "eval_steps_per_second": 2.585, + "step": 96000 + }, + { + "epoch": 0.8, + "learning_rate": 5.9164453233009627e-05, + "loss": 3.006, + "step": 96100 + }, + { + "epoch": 0.8, + "learning_rate": 5.891308537290422e-05, + "loss": 3.0025, + "step": 96200 + }, + { + "epoch": 0.81, + "learning_rate": 5.866171751279881e-05, + "loss": 3.006, + "step": 96300 + }, + { + "epoch": 0.81, + "learning_rate": 5.841034965269341e-05, + "loss": 2.9956, + "step": 96400 + }, + { + "epoch": 0.81, + "learning_rate": 5.815898179258799e-05, + "loss": 2.9968, + "step": 96500 + }, + { + "epoch": 0.81, + "learning_rate": 5.790761393248258e-05, + "loss": 3.0023, + "step": 96600 + }, + { + "epoch": 0.81, + "learning_rate": 5.765624607237718e-05, + "loss": 3.0014, + "step": 96700 + }, + { + "epoch": 0.81, + "learning_rate": 5.740487821227177e-05, + "loss": 2.9961, + "step": 96800 + }, + { + "epoch": 0.81, + "learning_rate": 5.715351035216637e-05, + "loss": 3.0024, + "step": 96900 + }, + { + "epoch": 0.81, + "learning_rate": 5.690214249206096e-05, + "loss": 3.0035, + "step": 97000 + }, + { + "epoch": 0.81, + "eval_accuracy": 0.43532720410661935, + "eval_loss": 3.02020263671875, + "eval_runtime": 37.453, + "eval_samples_per_second": 299.469, + "eval_steps_per_second": 2.51, + "step": 97000 + }, + { + "epoch": 0.81, + "learning_rate": 5.665328831055661e-05, + "loss": 3.0032, + "step": 97100 + }, + { + "epoch": 0.81, + "learning_rate": 5.64019204504512e-05, + "loss": 2.9961, + "step": 97200 + }, + { + "epoch": 0.81, + "learning_rate": 5.61505525903458e-05, + "loss": 3.0048, + "step": 97300 + }, + { + "epoch": 0.81, + "learning_rate": 5.589918473024039e-05, + "loss": 2.9939, + "step": 97400 + }, + { + "epoch": 0.82, + "learning_rate": 5.565033054873604e-05, + "loss": 2.9995, + "step": 97500 + }, + { + "epoch": 0.82, + "learning_rate": 5.539896268863063e-05, + "loss": 3.0067, + "step": 97600 + }, + { + "epoch": 0.82, + "learning_rate": 5.514759482852523e-05, + "loss": 2.9924, + "step": 97700 + }, + { + "epoch": 0.82, + "learning_rate": 5.489622696841981e-05, + "loss": 2.9997, + "step": 97800 + }, + { + "epoch": 0.82, + "learning_rate": 5.46448591083144e-05, + "loss": 3.0077, + "step": 97900 + }, + { + "epoch": 0.82, + "learning_rate": 5.4393491248209e-05, + "loss": 2.9966, + "step": 98000 + }, + { + "epoch": 0.82, + "eval_accuracy": 0.43553711233670683, + "eval_loss": 3.019421100616455, + "eval_runtime": 36.419, + "eval_samples_per_second": 307.971, + "eval_steps_per_second": 2.581, + "step": 98000 + }, + { + "epoch": 0.82, + "learning_rate": 5.414212338810359e-05, + "loss": 3.0027, + "step": 98100 + }, + { + "epoch": 0.82, + "learning_rate": 5.3890755527998184e-05, + "loss": 3.0008, + "step": 98200 + }, + { + "epoch": 0.82, + "learning_rate": 5.363938766789278e-05, + "loss": 3.0019, + "step": 98300 + }, + { + "epoch": 0.82, + "learning_rate": 5.338801980778737e-05, + "loss": 2.9993, + "step": 98400 + }, + { + "epoch": 0.82, + "learning_rate": 5.313665194768197e-05, + "loss": 3.0025, + "step": 98500 + }, + { + "epoch": 0.82, + "learning_rate": 5.2885284087576555e-05, + "loss": 2.9987, + "step": 98600 + }, + { + "epoch": 0.83, + "learning_rate": 5.263391622747115e-05, + "loss": 3.0054, + "step": 98700 + }, + { + "epoch": 0.83, + "learning_rate": 5.2382548367365745e-05, + "loss": 3.0064, + "step": 98800 + }, + { + "epoch": 0.83, + "learning_rate": 5.2131180507260336e-05, + "loss": 3.0096, + "step": 98900 + }, + { + "epoch": 0.83, + "learning_rate": 5.1879812647154934e-05, + "loss": 2.9881, + "step": 99000 + }, + { + "epoch": 0.83, + "eval_accuracy": 0.4356613723926449, + "eval_loss": 3.0177648067474365, + "eval_runtime": 37.6095, + "eval_samples_per_second": 298.223, + "eval_steps_per_second": 2.499, + "step": 99000 + }, + { + "epoch": 0.83, + "learning_rate": 5.1628444787049525e-05, + "loss": 3.0002, + "step": 99100 + }, + { + "epoch": 0.83, + "learning_rate": 5.1377076926944117e-05, + "loss": 2.9966, + "step": 99200 + }, + { + "epoch": 0.83, + "learning_rate": 5.1128222745439764e-05, + "loss": 3.0012, + "step": 99300 + }, + { + "epoch": 0.83, + "learning_rate": 5.0876854885334356e-05, + "loss": 2.9964, + "step": 99400 + }, + { + "epoch": 0.83, + "learning_rate": 5.0625487025228954e-05, + "loss": 2.9981, + "step": 99500 + }, + { + "epoch": 0.83, + "learning_rate": 5.037411916512354e-05, + "loss": 2.9986, + "step": 99600 + }, + { + "epoch": 0.83, + "learning_rate": 5.012275130501813e-05, + "loss": 2.9981, + "step": 99700 + }, + { + "epoch": 0.83, + "learning_rate": 4.987138344491273e-05, + "loss": 3.0057, + "step": 99800 + }, + { + "epoch": 0.84, + "learning_rate": 4.962001558480732e-05, + "loss": 2.994, + "step": 99900 + }, + { + "epoch": 0.84, + "learning_rate": 4.936864772470192e-05, + "loss": 3.0028, + "step": 100000 + }, + { + "epoch": 0.84, + "eval_accuracy": 0.43574631853258, + "eval_loss": 3.0173962116241455, + "eval_runtime": 36.2768, + "eval_samples_per_second": 309.179, + "eval_steps_per_second": 2.591, + "step": 100000 + }, + { + "epoch": 0.84, + "learning_rate": 4.911727986459651e-05, + "loss": 3.0028, + "step": 100100 + }, + { + "epoch": 0.84, + "learning_rate": 4.8865912004491106e-05, + "loss": 2.9969, + "step": 100200 + }, + { + "epoch": 0.84, + "learning_rate": 4.86145441443857e-05, + "loss": 3.0029, + "step": 100300 + }, + { + "epoch": 0.84, + "learning_rate": 4.836317628428029e-05, + "loss": 3.0033, + "step": 100400 + }, + { + "epoch": 0.84, + "learning_rate": 4.811180842417488e-05, + "loss": 2.9945, + "step": 100500 + }, + { + "epoch": 0.84, + "learning_rate": 4.786044056406947e-05, + "loss": 2.9985, + "step": 100600 + }, + { + "epoch": 0.84, + "learning_rate": 4.760907270396406e-05, + "loss": 2.9952, + "step": 100700 + }, + { + "epoch": 0.84, + "learning_rate": 4.735770484385866e-05, + "loss": 2.9859, + "step": 100800 + }, + { + "epoch": 0.84, + "learning_rate": 4.710633698375325e-05, + "loss": 2.9951, + "step": 100900 + }, + { + "epoch": 0.84, + "learning_rate": 4.685496912364785e-05, + "loss": 2.9933, + "step": 101000 + }, + { + "epoch": 0.84, + "eval_accuracy": 0.4362117672166871, + "eval_loss": 3.01594614982605, + "eval_runtime": 36.0518, + "eval_samples_per_second": 311.108, + "eval_steps_per_second": 2.607, + "step": 101000 + }, + { + "epoch": 0.85, + "learning_rate": 4.660360126354244e-05, + "loss": 2.9979, + "step": 101100 + }, + { + "epoch": 0.85, + "learning_rate": 4.635223340343704e-05, + "loss": 2.9961, + "step": 101200 + }, + { + "epoch": 0.85, + "learning_rate": 4.6100865543331624e-05, + "loss": 3.0076, + "step": 101300 + }, + { + "epoch": 0.85, + "learning_rate": 4.5849497683226215e-05, + "loss": 3.0, + "step": 101400 + }, + { + "epoch": 0.85, + "learning_rate": 4.559812982312081e-05, + "loss": 2.9964, + "step": 101500 + }, + { + "epoch": 0.85, + "learning_rate": 4.5346761963015404e-05, + "loss": 2.9951, + "step": 101600 + }, + { + "epoch": 0.85, + "learning_rate": 4.5095394102909996e-05, + "loss": 2.9964, + "step": 101700 + }, + { + "epoch": 0.85, + "learning_rate": 4.4844026242804594e-05, + "loss": 3.0034, + "step": 101800 + }, + { + "epoch": 0.85, + "learning_rate": 4.4592658382699185e-05, + "loss": 2.994, + "step": 101900 + }, + { + "epoch": 0.85, + "learning_rate": 4.434129052259378e-05, + "loss": 3.0002, + "step": 102000 + }, + { + "epoch": 0.85, + "eval_accuracy": 0.43605310748424636, + "eval_loss": 3.01462721824646, + "eval_runtime": 36.4761, + "eval_samples_per_second": 307.489, + "eval_steps_per_second": 2.577, + "step": 102000 + }, + { + "epoch": 0.85, + "learning_rate": 4.408992266248837e-05, + "loss": 2.9951, + "step": 102100 + }, + { + "epoch": 0.85, + "learning_rate": 4.383855480238296e-05, + "loss": 2.9959, + "step": 102200 + }, + { + "epoch": 0.86, + "learning_rate": 4.358718694227756e-05, + "loss": 2.9922, + "step": 102300 + }, + { + "epoch": 0.86, + "learning_rate": 4.333581908217215e-05, + "loss": 2.9888, + "step": 102400 + }, + { + "epoch": 0.86, + "learning_rate": 4.3084451222066746e-05, + "loss": 2.9951, + "step": 102500 + }, + { + "epoch": 0.86, + "learning_rate": 4.283559704056239e-05, + "loss": 2.994, + "step": 102600 + }, + { + "epoch": 0.86, + "learning_rate": 4.2586742859058035e-05, + "loss": 2.9969, + "step": 102700 + }, + { + "epoch": 0.86, + "learning_rate": 4.2335374998952626e-05, + "loss": 2.9959, + "step": 102800 + }, + { + "epoch": 0.86, + "learning_rate": 4.2084007138847224e-05, + "loss": 3.0063, + "step": 102900 + }, + { + "epoch": 0.86, + "learning_rate": 4.1832639278741816e-05, + "loss": 2.9901, + "step": 103000 + }, + { + "epoch": 0.86, + "eval_accuracy": 0.4364076347624878, + "eval_loss": 3.0131843090057373, + "eval_runtime": 36.6938, + "eval_samples_per_second": 305.665, + "eval_steps_per_second": 2.562, + "step": 103000 + }, + { + "epoch": 0.86, + "learning_rate": 4.1581271418636414e-05, + "loss": 2.996, + "step": 103100 + }, + { + "epoch": 0.86, + "learning_rate": 4.1329903558531005e-05, + "loss": 2.9928, + "step": 103200 + }, + { + "epoch": 0.86, + "learning_rate": 4.1078535698425596e-05, + "loss": 2.9981, + "step": 103300 + }, + { + "epoch": 0.86, + "learning_rate": 4.082716783832019e-05, + "loss": 2.9999, + "step": 103400 + }, + { + "epoch": 0.87, + "learning_rate": 4.057579997821478e-05, + "loss": 2.9879, + "step": 103500 + }, + { + "epoch": 0.87, + "learning_rate": 4.032443211810937e-05, + "loss": 2.9927, + "step": 103600 + }, + { + "epoch": 0.87, + "learning_rate": 4.007306425800397e-05, + "loss": 2.997, + "step": 103700 + }, + { + "epoch": 0.87, + "learning_rate": 3.982169639789856e-05, + "loss": 2.9899, + "step": 103800 + }, + { + "epoch": 0.87, + "learning_rate": 3.957032853779316e-05, + "loss": 3.0014, + "step": 103900 + }, + { + "epoch": 0.87, + "learning_rate": 3.931896067768775e-05, + "loss": 2.9895, + "step": 104000 + }, + { + "epoch": 0.87, + "eval_accuracy": 0.4363837655992002, + "eval_loss": 3.012049674987793, + "eval_runtime": 36.7749, + "eval_samples_per_second": 304.99, + "eval_steps_per_second": 2.556, + "step": 104000 + }, + { + "epoch": 0.87, + "learning_rate": 3.906759281758235e-05, + "loss": 3.0, + "step": 104100 + }, + { + "epoch": 0.87, + "learning_rate": 3.881622495747693e-05, + "loss": 2.9981, + "step": 104200 + }, + { + "epoch": 0.87, + "learning_rate": 3.856485709737152e-05, + "loss": 2.9975, + "step": 104300 + }, + { + "epoch": 0.87, + "learning_rate": 3.831600291586718e-05, + "loss": 3.0007, + "step": 104400 + }, + { + "epoch": 0.87, + "learning_rate": 3.806463505576176e-05, + "loss": 2.9958, + "step": 104500 + }, + { + "epoch": 0.87, + "learning_rate": 3.781326719565636e-05, + "loss": 2.9939, + "step": 104600 + }, + { + "epoch": 0.88, + "learning_rate": 3.756189933555095e-05, + "loss": 2.9936, + "step": 104700 + }, + { + "epoch": 0.88, + "learning_rate": 3.731053147544555e-05, + "loss": 3.0004, + "step": 104800 + }, + { + "epoch": 0.88, + "learning_rate": 3.705916361534014e-05, + "loss": 2.9945, + "step": 104900 + }, + { + "epoch": 0.88, + "learning_rate": 3.680779575523473e-05, + "loss": 2.9882, + "step": 105000 + }, + { + "epoch": 0.88, + "eval_accuracy": 0.4366730036955081, + "eval_loss": 3.0106048583984375, + "eval_runtime": 36.7107, + "eval_samples_per_second": 305.524, + "eval_steps_per_second": 2.561, + "step": 105000 + }, + { + "epoch": 0.88, + "learning_rate": 3.655642789512932e-05, + "loss": 2.9919, + "step": 105100 + }, + { + "epoch": 0.88, + "learning_rate": 3.630506003502392e-05, + "loss": 2.9894, + "step": 105200 + }, + { + "epoch": 0.88, + "learning_rate": 3.605369217491851e-05, + "loss": 3.0005, + "step": 105300 + }, + { + "epoch": 0.88, + "learning_rate": 3.580483799341416e-05, + "loss": 2.9884, + "step": 105400 + }, + { + "epoch": 0.88, + "learning_rate": 3.555347013330875e-05, + "loss": 2.9865, + "step": 105500 + }, + { + "epoch": 0.88, + "learning_rate": 3.530210227320335e-05, + "loss": 2.9909, + "step": 105600 + }, + { + "epoch": 0.88, + "learning_rate": 3.5050734413097934e-05, + "loss": 2.9961, + "step": 105700 + }, + { + "epoch": 0.89, + "learning_rate": 3.479936655299253e-05, + "loss": 2.9905, + "step": 105800 + }, + { + "epoch": 0.89, + "learning_rate": 3.454799869288712e-05, + "loss": 2.9913, + "step": 105900 + }, + { + "epoch": 0.89, + "learning_rate": 3.429914451138277e-05, + "loss": 2.9866, + "step": 106000 + }, + { + "epoch": 0.89, + "eval_accuracy": 0.4369524133128152, + "eval_loss": 3.008857250213623, + "eval_runtime": 36.004, + "eval_samples_per_second": 311.521, + "eval_steps_per_second": 2.611, + "step": 106000 + }, + { + "epoch": 0.89, + "learning_rate": 3.404777665127736e-05, + "loss": 2.9893, + "step": 106100 + }, + { + "epoch": 0.89, + "learning_rate": 3.379640879117196e-05, + "loss": 2.989, + "step": 106200 + }, + { + "epoch": 0.89, + "learning_rate": 3.354504093106655e-05, + "loss": 2.9886, + "step": 106300 + }, + { + "epoch": 0.89, + "learning_rate": 3.329367307096114e-05, + "loss": 2.9835, + "step": 106400 + }, + { + "epoch": 0.89, + "learning_rate": 3.3042305210855734e-05, + "loss": 2.9918, + "step": 106500 + }, + { + "epoch": 0.89, + "learning_rate": 3.279093735075033e-05, + "loss": 2.9855, + "step": 106600 + }, + { + "epoch": 0.89, + "learning_rate": 3.2539569490644923e-05, + "loss": 2.9895, + "step": 106700 + }, + { + "epoch": 0.89, + "learning_rate": 3.229071530914057e-05, + "loss": 2.9791, + "step": 106800 + }, + { + "epoch": 0.89, + "learning_rate": 3.203934744903516e-05, + "loss": 2.9955, + "step": 106900 + }, + { + "epoch": 0.9, + "learning_rate": 3.178797958892976e-05, + "loss": 2.9961, + "step": 107000 + }, + { + "epoch": 0.9, + "eval_accuracy": 0.43725920226448156, + "eval_loss": 3.007978677749634, + "eval_runtime": 36.4349, + "eval_samples_per_second": 307.837, + "eval_steps_per_second": 2.58, + "step": 107000 + }, + { + "epoch": 0.9, + "learning_rate": 3.153661172882435e-05, + "loss": 2.9921, + "step": 107100 + }, + { + "epoch": 0.9, + "learning_rate": 3.128524386871894e-05, + "loss": 2.9937, + "step": 107200 + }, + { + "epoch": 0.9, + "learning_rate": 3.1033876008613534e-05, + "loss": 2.9894, + "step": 107300 + }, + { + "epoch": 0.9, + "learning_rate": 3.078250814850813e-05, + "loss": 2.9919, + "step": 107400 + }, + { + "epoch": 0.9, + "learning_rate": 3.0531140288402724e-05, + "loss": 2.9906, + "step": 107500 + }, + { + "epoch": 0.9, + "learning_rate": 3.0279772428297315e-05, + "loss": 2.9839, + "step": 107600 + }, + { + "epoch": 0.9, + "learning_rate": 3.002840456819191e-05, + "loss": 2.9871, + "step": 107700 + }, + { + "epoch": 0.9, + "learning_rate": 2.9777036708086504e-05, + "loss": 2.9891, + "step": 107800 + }, + { + "epoch": 0.9, + "learning_rate": 2.9525668847981092e-05, + "loss": 2.9898, + "step": 107900 + }, + { + "epoch": 0.9, + "learning_rate": 2.9274300987875687e-05, + "loss": 2.9876, + "step": 108000 + }, + { + "epoch": 0.9, + "eval_accuracy": 0.4373946948678491, + "eval_loss": 3.0067296028137207, + "eval_runtime": 36.3734, + "eval_samples_per_second": 308.357, + "eval_steps_per_second": 2.584, + "step": 108000 + }, + { + "epoch": 0.9, + "learning_rate": 2.902293312777028e-05, + "loss": 2.9917, + "step": 108100 + }, + { + "epoch": 0.91, + "learning_rate": 2.8771565267664876e-05, + "loss": 2.9916, + "step": 108200 + }, + { + "epoch": 0.91, + "learning_rate": 2.8520197407559464e-05, + "loss": 2.9981, + "step": 108300 + }, + { + "epoch": 0.91, + "learning_rate": 2.826882954745406e-05, + "loss": 2.9817, + "step": 108400 + }, + { + "epoch": 0.91, + "learning_rate": 2.8017461687348653e-05, + "loss": 2.9875, + "step": 108500 + }, + { + "epoch": 0.91, + "learning_rate": 2.7766093827243248e-05, + "loss": 2.9904, + "step": 108600 + }, + { + "epoch": 0.91, + "learning_rate": 2.751472596713784e-05, + "loss": 2.9882, + "step": 108700 + }, + { + "epoch": 0.91, + "learning_rate": 2.7265871785633487e-05, + "loss": 2.9885, + "step": 108800 + }, + { + "epoch": 0.91, + "learning_rate": 2.701450392552808e-05, + "loss": 2.9898, + "step": 108900 + }, + { + "epoch": 0.91, + "learning_rate": 2.6763136065422673e-05, + "loss": 2.9873, + "step": 109000 + }, + { + "epoch": 0.91, + "eval_accuracy": 0.43755826883979015, + "eval_loss": 3.0054852962493896, + "eval_runtime": 36.8956, + "eval_samples_per_second": 303.993, + "eval_steps_per_second": 2.548, + "step": 109000 + }, + { + "epoch": 0.91, + "learning_rate": 2.6511768205317264e-05, + "loss": 2.9921, + "step": 109100 + }, + { + "epoch": 0.91, + "learning_rate": 2.626040034521186e-05, + "loss": 2.9863, + "step": 109200 + }, + { + "epoch": 0.91, + "learning_rate": 2.600903248510645e-05, + "loss": 2.9902, + "step": 109300 + }, + { + "epoch": 0.92, + "learning_rate": 2.5757664625001045e-05, + "loss": 2.9823, + "step": 109400 + }, + { + "epoch": 0.92, + "learning_rate": 2.5508810443496693e-05, + "loss": 2.9978, + "step": 109500 + }, + { + "epoch": 0.92, + "learning_rate": 2.5257442583391284e-05, + "loss": 2.9859, + "step": 109600 + }, + { + "epoch": 0.92, + "learning_rate": 2.500607472328588e-05, + "loss": 2.9821, + "step": 109700 + }, + { + "epoch": 0.92, + "learning_rate": 2.4754706863180473e-05, + "loss": 2.9932, + "step": 109800 + }, + { + "epoch": 0.92, + "learning_rate": 2.4503339003075065e-05, + "loss": 2.9906, + "step": 109900 + }, + { + "epoch": 0.92, + "learning_rate": 2.4251971142969656e-05, + "loss": 2.9891, + "step": 110000 + }, + { + "epoch": 0.92, + "eval_accuracy": 0.4375182528895728, + "eval_loss": 3.004079580307007, + "eval_runtime": 36.2219, + "eval_samples_per_second": 309.647, + "eval_steps_per_second": 2.595, + "step": 110000 + }, + { + "epoch": 0.92, + "learning_rate": 2.400060328286425e-05, + "loss": 2.9875, + "step": 110100 + }, + { + "epoch": 0.92, + "learning_rate": 2.3749235422758845e-05, + "loss": 2.9859, + "step": 110200 + }, + { + "epoch": 0.92, + "learning_rate": 2.349786756265344e-05, + "loss": 2.9865, + "step": 110300 + }, + { + "epoch": 0.92, + "learning_rate": 2.3246499702548028e-05, + "loss": 2.994, + "step": 110400 + }, + { + "epoch": 0.92, + "learning_rate": 2.2995131842442623e-05, + "loss": 2.9817, + "step": 110500 + }, + { + "epoch": 0.93, + "learning_rate": 2.2743763982337217e-05, + "loss": 2.9915, + "step": 110600 + }, + { + "epoch": 0.93, + "learning_rate": 2.2492396122231812e-05, + "loss": 2.9927, + "step": 110700 + }, + { + "epoch": 0.93, + "learning_rate": 2.22410282621264e-05, + "loss": 2.9908, + "step": 110800 + }, + { + "epoch": 0.93, + "learning_rate": 2.1989660402020994e-05, + "loss": 2.9897, + "step": 110900 + }, + { + "epoch": 0.93, + "learning_rate": 2.173829254191559e-05, + "loss": 2.9835, + "step": 111000 + }, + { + "epoch": 0.93, + "eval_accuracy": 0.4377632628303773, + "eval_loss": 3.0032153129577637, + "eval_runtime": 36.5662, + "eval_samples_per_second": 306.731, + "eval_steps_per_second": 2.571, + "step": 111000 + }, + { + "epoch": 0.93, + "learning_rate": 2.1486924681810184e-05, + "loss": 2.9787, + "step": 111100 + }, + { + "epoch": 0.93, + "learning_rate": 2.123555682170477e-05, + "loss": 2.9831, + "step": 111200 + }, + { + "epoch": 0.93, + "learning_rate": 2.0984188961599366e-05, + "loss": 2.9913, + "step": 111300 + }, + { + "epoch": 0.93, + "learning_rate": 2.073282110149396e-05, + "loss": 2.9904, + "step": 111400 + }, + { + "epoch": 0.93, + "learning_rate": 2.0481453241388556e-05, + "loss": 2.9842, + "step": 111500 + }, + { + "epoch": 0.93, + "learning_rate": 2.0230085381283147e-05, + "loss": 2.987, + "step": 111600 + }, + { + "epoch": 0.93, + "learning_rate": 1.9978717521177738e-05, + "loss": 2.9868, + "step": 111700 + }, + { + "epoch": 0.94, + "learning_rate": 1.9727349661072333e-05, + "loss": 2.9887, + "step": 111800 + }, + { + "epoch": 0.94, + "learning_rate": 1.9475981800966928e-05, + "loss": 2.9844, + "step": 111900 + }, + { + "epoch": 0.94, + "learning_rate": 1.922461394086152e-05, + "loss": 2.9887, + "step": 112000 + }, + { + "epoch": 0.94, + "eval_accuracy": 0.4380391622766127, + "eval_loss": 3.0022435188293457, + "eval_runtime": 36.4456, + "eval_samples_per_second": 307.746, + "eval_steps_per_second": 2.579, + "step": 112000 + }, + { + "epoch": 0.94, + "learning_rate": 1.8973246080756113e-05, + "loss": 2.9792, + "step": 112100 + }, + { + "epoch": 0.94, + "learning_rate": 1.8721878220650705e-05, + "loss": 2.9813, + "step": 112200 + }, + { + "epoch": 0.94, + "learning_rate": 1.84705103605453e-05, + "loss": 2.9852, + "step": 112300 + }, + { + "epoch": 0.94, + "learning_rate": 1.821914250043989e-05, + "loss": 2.9927, + "step": 112400 + }, + { + "epoch": 0.94, + "learning_rate": 1.797028831893554e-05, + "loss": 2.9869, + "step": 112500 + }, + { + "epoch": 0.94, + "learning_rate": 1.7718920458830133e-05, + "loss": 2.9798, + "step": 112600 + }, + { + "epoch": 0.94, + "learning_rate": 1.7467552598724724e-05, + "loss": 2.982, + "step": 112700 + }, + { + "epoch": 0.94, + "learning_rate": 1.721618473861932e-05, + "loss": 2.9787, + "step": 112800 + }, + { + "epoch": 0.94, + "learning_rate": 1.6964816878513914e-05, + "loss": 2.9891, + "step": 112900 + }, + { + "epoch": 0.95, + "learning_rate": 1.671596269700956e-05, + "loss": 2.9876, + "step": 113000 + }, + { + "epoch": 0.95, + "eval_accuracy": 0.43829610679906095, + "eval_loss": 3.0009684562683105, + "eval_runtime": 37.5779, + "eval_samples_per_second": 298.473, + "eval_steps_per_second": 2.501, + "step": 113000 + }, + { + "epoch": 0.95, + "learning_rate": 1.6464594836904153e-05, + "loss": 2.9809, + "step": 113100 + }, + { + "epoch": 0.95, + "learning_rate": 1.6213226976798747e-05, + "loss": 2.9933, + "step": 113200 + }, + { + "epoch": 0.95, + "learning_rate": 1.596185911669334e-05, + "loss": 2.9868, + "step": 113300 + }, + { + "epoch": 0.95, + "learning_rate": 1.5710491256587933e-05, + "loss": 2.9867, + "step": 113400 + }, + { + "epoch": 0.95, + "learning_rate": 1.5459123396482525e-05, + "loss": 2.9831, + "step": 113500 + }, + { + "epoch": 0.95, + "learning_rate": 1.5207755536377118e-05, + "loss": 2.9857, + "step": 113600 + }, + { + "epoch": 0.95, + "learning_rate": 1.495638767627171e-05, + "loss": 2.9861, + "step": 113700 + }, + { + "epoch": 0.95, + "learning_rate": 1.4705019816166305e-05, + "loss": 2.9797, + "step": 113800 + }, + { + "epoch": 0.95, + "learning_rate": 1.4453651956060897e-05, + "loss": 2.9819, + "step": 113900 + }, + { + "epoch": 0.95, + "learning_rate": 1.4202284095955491e-05, + "loss": 2.9818, + "step": 114000 + }, + { + "epoch": 0.95, + "eval_accuracy": 0.4384379177103575, + "eval_loss": 2.9998745918273926, + "eval_runtime": 36.4806, + "eval_samples_per_second": 307.451, + "eval_steps_per_second": 2.577, + "step": 114000 + }, + { + "epoch": 0.95, + "learning_rate": 1.3950916235850083e-05, + "loss": 2.9861, + "step": 114100 + }, + { + "epoch": 0.96, + "learning_rate": 1.3699548375744677e-05, + "loss": 2.9859, + "step": 114200 + }, + { + "epoch": 0.96, + "learning_rate": 1.3448180515639268e-05, + "loss": 2.9864, + "step": 114300 + }, + { + "epoch": 0.96, + "learning_rate": 1.3196812655533863e-05, + "loss": 2.9818, + "step": 114400 + }, + { + "epoch": 0.96, + "learning_rate": 1.2945444795428454e-05, + "loss": 2.9732, + "step": 114500 + }, + { + "epoch": 0.96, + "learning_rate": 1.2694076935323049e-05, + "loss": 2.9859, + "step": 114600 + }, + { + "epoch": 0.96, + "learning_rate": 1.244270907521764e-05, + "loss": 2.9828, + "step": 114700 + }, + { + "epoch": 0.96, + "learning_rate": 1.2191341215112235e-05, + "loss": 2.9837, + "step": 114800 + }, + { + "epoch": 0.96, + "learning_rate": 1.1939973355006828e-05, + "loss": 2.9748, + "step": 114900 + }, + { + "epoch": 0.96, + "learning_rate": 1.1688605494901421e-05, + "loss": 2.9797, + "step": 115000 + }, + { + "epoch": 0.96, + "eval_accuracy": 0.43843651364192887, + "eval_loss": 2.999021291732788, + "eval_runtime": 36.1681, + "eval_samples_per_second": 310.108, + "eval_steps_per_second": 2.599, + "step": 115000 + }, + { + "epoch": 0.96, + "learning_rate": 1.1437237634796014e-05, + "loss": 2.9813, + "step": 115100 + }, + { + "epoch": 0.96, + "learning_rate": 1.1185869774690607e-05, + "loss": 2.978, + "step": 115200 + }, + { + "epoch": 0.96, + "learning_rate": 1.09345019145852e-05, + "loss": 2.9886, + "step": 115300 + }, + { + "epoch": 0.97, + "learning_rate": 1.0683134054479795e-05, + "loss": 2.9744, + "step": 115400 + }, + { + "epoch": 0.97, + "learning_rate": 1.0431766194374386e-05, + "loss": 2.9804, + "step": 115500 + }, + { + "epoch": 0.97, + "learning_rate": 1.0182912012870034e-05, + "loss": 2.984, + "step": 115600 + }, + { + "epoch": 0.97, + "learning_rate": 9.931544152764628e-06, + "loss": 2.985, + "step": 115700 + }, + { + "epoch": 0.97, + "learning_rate": 9.68017629265922e-06, + "loss": 2.9843, + "step": 115800 + }, + { + "epoch": 0.97, + "learning_rate": 9.428808432553814e-06, + "loss": 2.9809, + "step": 115900 + }, + { + "epoch": 0.97, + "learning_rate": 9.177440572448405e-06, + "loss": 2.9842, + "step": 116000 + }, + { + "epoch": 0.97, + "eval_accuracy": 0.43876225751738235, + "eval_loss": 2.9980885982513428, + "eval_runtime": 36.1964, + "eval_samples_per_second": 309.865, + "eval_steps_per_second": 2.597, + "step": 116000 + }, + { + "epoch": 0.97, + "learning_rate": 8.926072712342998e-06, + "loss": 2.9702, + "step": 116100 + }, + { + "epoch": 0.97, + "learning_rate": 8.674704852237591e-06, + "loss": 2.9799, + "step": 116200 + }, + { + "epoch": 0.97, + "learning_rate": 8.423336992132184e-06, + "loss": 2.9825, + "step": 116300 + }, + { + "epoch": 0.97, + "learning_rate": 8.171969132026777e-06, + "loss": 2.9726, + "step": 116400 + }, + { + "epoch": 0.97, + "learning_rate": 7.920601271921372e-06, + "loss": 2.9788, + "step": 116500 + }, + { + "epoch": 0.98, + "learning_rate": 7.669233411815965e-06, + "loss": 2.988, + "step": 116600 + }, + { + "epoch": 0.98, + "learning_rate": 7.417865551710557e-06, + "loss": 2.9795, + "step": 116700 + }, + { + "epoch": 0.98, + "learning_rate": 7.16649769160515e-06, + "loss": 2.9797, + "step": 116800 + }, + { + "epoch": 0.98, + "learning_rate": 6.915129831499744e-06, + "loss": 2.9735, + "step": 116900 + }, + { + "epoch": 0.98, + "learning_rate": 6.663761971394337e-06, + "loss": 2.9739, + "step": 117000 + }, + { + "epoch": 0.98, + "eval_accuracy": 0.43866397272737484, + "eval_loss": 2.9972493648529053, + "eval_runtime": 36.7078, + "eval_samples_per_second": 305.548, + "eval_steps_per_second": 2.561, + "step": 117000 + }, + { + "epoch": 0.98, + "learning_rate": 6.41239411128893e-06, + "loss": 2.9765, + "step": 117100 + }, + { + "epoch": 0.98, + "learning_rate": 6.161026251183523e-06, + "loss": 2.9859, + "step": 117200 + }, + { + "epoch": 0.98, + "learning_rate": 5.909658391078116e-06, + "loss": 2.9897, + "step": 117300 + }, + { + "epoch": 0.98, + "learning_rate": 5.658290530972709e-06, + "loss": 2.9855, + "step": 117400 + }, + { + "epoch": 0.98, + "learning_rate": 5.406922670867302e-06, + "loss": 2.9747, + "step": 117500 + }, + { + "epoch": 0.98, + "learning_rate": 5.155554810761895e-06, + "loss": 2.9732, + "step": 117600 + }, + { + "epoch": 0.98, + "learning_rate": 4.9041869506564885e-06, + "loss": 2.9796, + "step": 117700 + }, + { + "epoch": 0.99, + "learning_rate": 4.6528190905510815e-06, + "loss": 2.9782, + "step": 117800 + }, + { + "epoch": 0.99, + "learning_rate": 4.4014512304456745e-06, + "loss": 2.9841, + "step": 117900 + }, + { + "epoch": 0.99, + "learning_rate": 4.150083370340268e-06, + "loss": 2.9804, + "step": 118000 + }, + { + "epoch": 0.99, + "eval_accuracy": 0.43883737517831667, + "eval_loss": 2.9965155124664307, + "eval_runtime": 36.1347, + "eval_samples_per_second": 310.394, + "eval_steps_per_second": 2.601, + "step": 118000 + }, + { + "epoch": 0.99, + "learning_rate": 3.898715510234861e-06, + "loss": 2.9836, + "step": 118100 + }, + { + "epoch": 0.99, + "learning_rate": 3.6473476501294542e-06, + "loss": 2.9815, + "step": 118200 + }, + { + "epoch": 0.99, + "learning_rate": 3.398493468625101e-06, + "loss": 2.9744, + "step": 118300 + }, + { + "epoch": 0.99, + "learning_rate": 3.147125608519694e-06, + "loss": 2.9847, + "step": 118400 + }, + { + "epoch": 0.99, + "learning_rate": 2.8957577484142875e-06, + "loss": 2.9733, + "step": 118500 + }, + { + "epoch": 0.99, + "learning_rate": 2.6469035669099345e-06, + "loss": 2.9766, + "step": 118600 + }, + { + "epoch": 0.99, + "learning_rate": 2.395535706804528e-06, + "loss": 2.9802, + "step": 118700 + }, + { + "epoch": 0.99, + "learning_rate": 2.144167846699121e-06, + "loss": 2.9757, + "step": 118800 + }, + { + "epoch": 0.99, + "learning_rate": 1.8927999865937138e-06, + "loss": 2.9775, + "step": 118900 + }, + { + "epoch": 1.0, + "learning_rate": 1.641432126488307e-06, + "loss": 2.9828, + "step": 119000 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.43901218169768724, + "eval_loss": 2.995953321456909, + "eval_runtime": 36.3994, + "eval_samples_per_second": 308.137, + "eval_steps_per_second": 2.582, + "step": 119000 + }, + { + "epoch": 1.0, + "learning_rate": 1.3900642663829e-06, + "loss": 2.9783, + "step": 119100 + }, + { + "epoch": 1.0, + "learning_rate": 1.1386964062774932e-06, + "loss": 2.9723, + "step": 119200 + }, + { + "epoch": 1.0, + "learning_rate": 8.873285461720863e-07, + "loss": 2.9817, + "step": 119300 + }, + { + "epoch": 1.0, + "learning_rate": 6.359606860666794e-07, + "loss": 2.9792, + "step": 119400 + }, + { + "epoch": 1.0, + "learning_rate": 3.8459282596127255e-07, + "loss": 2.982, + "step": 119500 + }, + { + "epoch": 1.0, + "step": 119547, + "total_flos": 1.455921831670228e+20, + "train_loss": 3.081914688561298, + "train_runtime": 169290.0352, + "train_samples_per_second": 169.48, + "train_steps_per_second": 0.706 + } + ], + "max_steps": 119547, + "num_train_epochs": 1, + "total_flos": 1.455921831670228e+20, + "trial_name": null, + "trial_params": null +}