{ "best_metric": 1.101412057876587, "best_model_checkpoint": "models/agriQA-assistant\\checkpoint-6250", "epoch": 1.0, "eval_steps": 250, "global_step": 6250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016, "grad_norm": NaN, "learning_rate": 5e-05, "loss": 5.5315, "step": 10 }, { "epoch": 0.0032, "grad_norm": 2.2085492610931396, "learning_rate": 0.0001, "loss": 5.3337, "step": 20 }, { "epoch": 0.0048, "grad_norm": 3.2219746112823486, "learning_rate": 0.00015, "loss": 5.1164, "step": 30 }, { "epoch": 0.0064, "grad_norm": 4.628455638885498, "learning_rate": 0.0002, "loss": 3.6573, "step": 40 }, { "epoch": 0.008, "grad_norm": 12.310958862304688, "learning_rate": 0.00025, "loss": 3.9022, "step": 50 }, { "epoch": 0.0096, "grad_norm": 9.530706405639648, "learning_rate": 0.0003, "loss": 3.309, "step": 60 }, { "epoch": 0.0112, "grad_norm": 4.938753128051758, "learning_rate": 0.00035, "loss": 2.356, "step": 70 }, { "epoch": 0.0128, "grad_norm": 3.4446096420288086, "learning_rate": 0.0004, "loss": 1.8554, "step": 80 }, { "epoch": 0.0144, "grad_norm": 2.971761465072632, "learning_rate": 0.00045000000000000004, "loss": 1.7541, "step": 90 }, { "epoch": 0.016, "grad_norm": 6.156565189361572, "learning_rate": 0.0005, "loss": 1.8158, "step": 100 }, { "epoch": 0.0176, "grad_norm": 4.076125144958496, "learning_rate": 0.0004991869918699188, "loss": 1.7109, "step": 110 }, { "epoch": 0.0192, "grad_norm": 3.3770179748535156, "learning_rate": 0.0004983739837398374, "loss": 1.6453, "step": 120 }, { "epoch": 0.0208, "grad_norm": 4.189070224761963, "learning_rate": 0.0004975609756097561, "loss": 1.8939, "step": 130 }, { "epoch": 0.0224, "grad_norm": 4.818182468414307, "learning_rate": 0.0004967479674796748, "loss": 1.4016, "step": 140 }, { "epoch": 0.024, "grad_norm": 1.7811826467514038, "learning_rate": 0.0004959349593495934, "loss": 1.4795, "step": 150 }, { "epoch": 0.0256, "grad_norm": 3.0742604732513428, "learning_rate": 0.0004951219512195122, "loss": 1.2655, "step": 160 }, { "epoch": 0.0272, "grad_norm": 2.1021037101745605, "learning_rate": 0.0004943089430894309, "loss": 1.4392, "step": 170 }, { "epoch": 0.0288, "grad_norm": 2.147939920425415, "learning_rate": 0.0004934959349593496, "loss": 1.6663, "step": 180 }, { "epoch": 0.0304, "grad_norm": 4.158925533294678, "learning_rate": 0.0004926829268292683, "loss": 1.6488, "step": 190 }, { "epoch": 0.032, "grad_norm": 1.7264795303344727, "learning_rate": 0.0004918699186991871, "loss": 1.677, "step": 200 }, { "epoch": 0.0336, "grad_norm": 0.8538139462471008, "learning_rate": 0.0004910569105691057, "loss": 1.4913, "step": 210 }, { "epoch": 0.0352, "grad_norm": 5.219719409942627, "learning_rate": 0.0004902439024390243, "loss": 1.1787, "step": 220 }, { "epoch": 0.0368, "grad_norm": 2.3855836391448975, "learning_rate": 0.0004894308943089431, "loss": 1.907, "step": 230 }, { "epoch": 0.0384, "grad_norm": 6.361728191375732, "learning_rate": 0.0004886178861788618, "loss": 1.4054, "step": 240 }, { "epoch": 0.04, "grad_norm": 2.0494649410247803, "learning_rate": 0.0004878048780487805, "loss": 1.5189, "step": 250 }, { "epoch": 0.04, "eval_loss": 1.650099277496338, "eval_runtime": 1034.8057, "eval_samples_per_second": 4.832, "eval_steps_per_second": 4.832, "step": 250 }, { "epoch": 0.0416, "grad_norm": 1.2376840114593506, "learning_rate": 0.0004869918699186992, "loss": 1.4281, "step": 260 }, { "epoch": 0.0432, "grad_norm": 1.7537360191345215, "learning_rate": 0.00048617886178861793, "loss": 1.1609, "step": 270 }, { "epoch": 0.0448, "grad_norm": 2.7885940074920654, "learning_rate": 0.00048536585365853657, "loss": 1.4809, "step": 280 }, { "epoch": 0.0464, "grad_norm": 2.5449018478393555, "learning_rate": 0.0004845528455284553, "loss": 1.5099, "step": 290 }, { "epoch": 0.048, "grad_norm": 4.444058418273926, "learning_rate": 0.000483739837398374, "loss": 1.2722, "step": 300 }, { "epoch": 0.0496, "grad_norm": 0.870796263217926, "learning_rate": 0.00048292682926829266, "loss": 1.0486, "step": 310 }, { "epoch": 0.0512, "grad_norm": 1.8581308126449585, "learning_rate": 0.0004821138211382114, "loss": 1.4882, "step": 320 }, { "epoch": 0.0528, "grad_norm": 0.9003315567970276, "learning_rate": 0.0004813008130081301, "loss": 1.177, "step": 330 }, { "epoch": 0.0544, "grad_norm": 2.3489129543304443, "learning_rate": 0.0004804878048780488, "loss": 1.0059, "step": 340 }, { "epoch": 0.056, "grad_norm": 2.488342046737671, "learning_rate": 0.0004796747967479675, "loss": 1.6738, "step": 350 }, { "epoch": 0.0576, "grad_norm": 2.3013713359832764, "learning_rate": 0.0004788617886178862, "loss": 1.362, "step": 360 }, { "epoch": 0.0592, "grad_norm": 1.2207469940185547, "learning_rate": 0.0004780487804878049, "loss": 1.3391, "step": 370 }, { "epoch": 0.0608, "grad_norm": 0.778947651386261, "learning_rate": 0.0004772357723577236, "loss": 1.4315, "step": 380 }, { "epoch": 0.0624, "grad_norm": 0.9309093952178955, "learning_rate": 0.0004764227642276423, "loss": 1.052, "step": 390 }, { "epoch": 0.064, "grad_norm": 6.077177047729492, "learning_rate": 0.00047560975609756096, "loss": 1.2999, "step": 400 }, { "epoch": 0.0656, "grad_norm": 1.5627727508544922, "learning_rate": 0.00047479674796747966, "loss": 0.9933, "step": 410 }, { "epoch": 0.0672, "grad_norm": 2.4397456645965576, "learning_rate": 0.0004739837398373984, "loss": 1.083, "step": 420 }, { "epoch": 0.0688, "grad_norm": 1.7931780815124512, "learning_rate": 0.00047317073170731705, "loss": 1.0095, "step": 430 }, { "epoch": 0.0704, "grad_norm": 1.6210583448410034, "learning_rate": 0.0004723577235772358, "loss": 1.1547, "step": 440 }, { "epoch": 0.072, "grad_norm": 2.350104808807373, "learning_rate": 0.0004715447154471545, "loss": 1.0687, "step": 450 }, { "epoch": 0.0736, "grad_norm": 3.4625842571258545, "learning_rate": 0.00047073170731707313, "loss": 1.0758, "step": 460 }, { "epoch": 0.0752, "grad_norm": 1.5620652437210083, "learning_rate": 0.0004699186991869919, "loss": 0.8588, "step": 470 }, { "epoch": 0.0768, "grad_norm": 2.5379786491394043, "learning_rate": 0.0004691056910569106, "loss": 1.3742, "step": 480 }, { "epoch": 0.0784, "grad_norm": 1.6187106370925903, "learning_rate": 0.0004682926829268293, "loss": 0.9447, "step": 490 }, { "epoch": 0.08, "grad_norm": 2.7396605014801025, "learning_rate": 0.00046747967479674797, "loss": 1.0118, "step": 500 }, { "epoch": 0.08, "eval_loss": 1.4677984714508057, "eval_runtime": 1024.8065, "eval_samples_per_second": 4.879, "eval_steps_per_second": 4.879, "step": 500 }, { "epoch": 0.0816, "grad_norm": 0.862608015537262, "learning_rate": 0.00046666666666666666, "loss": 1.3823, "step": 510 }, { "epoch": 0.0832, "grad_norm": 1.3692424297332764, "learning_rate": 0.0004658536585365854, "loss": 0.8877, "step": 520 }, { "epoch": 0.0848, "grad_norm": 1.674166202545166, "learning_rate": 0.00046504065040650405, "loss": 1.3644, "step": 530 }, { "epoch": 0.0864, "grad_norm": 3.7798876762390137, "learning_rate": 0.0004642276422764228, "loss": 1.2296, "step": 540 }, { "epoch": 0.088, "grad_norm": 2.530099630355835, "learning_rate": 0.0004634146341463415, "loss": 1.4154, "step": 550 }, { "epoch": 0.0896, "grad_norm": 2.375380039215088, "learning_rate": 0.00046260162601626014, "loss": 0.9654, "step": 560 }, { "epoch": 0.0912, "grad_norm": 4.751425266265869, "learning_rate": 0.0004617886178861789, "loss": 1.4011, "step": 570 }, { "epoch": 0.0928, "grad_norm": 1.316773772239685, "learning_rate": 0.0004609756097560976, "loss": 0.9764, "step": 580 }, { "epoch": 0.0944, "grad_norm": 3.132272243499756, "learning_rate": 0.0004601626016260163, "loss": 1.3579, "step": 590 }, { "epoch": 0.096, "grad_norm": 2.0117361545562744, "learning_rate": 0.00045934959349593497, "loss": 1.2378, "step": 600 }, { "epoch": 0.0976, "grad_norm": 3.49707293510437, "learning_rate": 0.00045853658536585366, "loss": 0.9963, "step": 610 }, { "epoch": 0.0992, "grad_norm": 3.4453463554382324, "learning_rate": 0.00045772357723577236, "loss": 1.5574, "step": 620 }, { "epoch": 0.1008, "grad_norm": NaN, "learning_rate": 0.00045691056910569105, "loss": 0.981, "step": 630 }, { "epoch": 0.1024, "grad_norm": 1.6297922134399414, "learning_rate": 0.0004560975609756098, "loss": 1.3719, "step": 640 }, { "epoch": 0.104, "grad_norm": 1.4996604919433594, "learning_rate": 0.00045528455284552844, "loss": 0.8291, "step": 650 }, { "epoch": 0.1056, "grad_norm": 4.539699077606201, "learning_rate": 0.00045447154471544714, "loss": 1.3347, "step": 660 }, { "epoch": 0.1072, "grad_norm": 1.6817468404769897, "learning_rate": 0.0004536585365853659, "loss": 1.1633, "step": 670 }, { "epoch": 0.1088, "grad_norm": 2.0384576320648193, "learning_rate": 0.00045284552845528453, "loss": 0.7568, "step": 680 }, { "epoch": 0.1104, "grad_norm": 0.6239755749702454, "learning_rate": 0.0004520325203252033, "loss": 0.9898, "step": 690 }, { "epoch": 0.112, "grad_norm": 3.0464870929718018, "learning_rate": 0.00045121951219512197, "loss": 1.1183, "step": 700 }, { "epoch": 0.1136, "grad_norm": 7.100531101226807, "learning_rate": 0.0004504065040650406, "loss": 1.1491, "step": 710 }, { "epoch": 0.1152, "grad_norm": 2.297905683517456, "learning_rate": 0.00044959349593495936, "loss": 1.1005, "step": 720 }, { "epoch": 0.1168, "grad_norm": 0.7417575120925903, "learning_rate": 0.00044878048780487806, "loss": 0.6954, "step": 730 }, { "epoch": 0.1184, "grad_norm": 4.498805999755859, "learning_rate": 0.00044796747967479675, "loss": 1.3516, "step": 740 }, { "epoch": 0.12, "grad_norm": 2.0761196613311768, "learning_rate": 0.00044715447154471545, "loss": 1.3995, "step": 750 }, { "epoch": 0.12, "eval_loss": 1.4611822366714478, "eval_runtime": 992.1537, "eval_samples_per_second": 5.04, "eval_steps_per_second": 5.04, "step": 750 }, { "epoch": 0.1216, "grad_norm": 0.4251354932785034, "learning_rate": 0.00044634146341463414, "loss": 0.89, "step": 760 }, { "epoch": 0.1232, "grad_norm": 1.715605616569519, "learning_rate": 0.0004455284552845529, "loss": 0.7241, "step": 770 }, { "epoch": 0.1248, "grad_norm": 3.6502270698547363, "learning_rate": 0.00044471544715447153, "loss": 0.8324, "step": 780 }, { "epoch": 0.1264, "grad_norm": 2.7257204055786133, "learning_rate": 0.0004439024390243903, "loss": 1.1949, "step": 790 }, { "epoch": 0.128, "grad_norm": 3.6415014266967773, "learning_rate": 0.000443089430894309, "loss": 1.1487, "step": 800 }, { "epoch": 0.1296, "grad_norm": 1.9723396301269531, "learning_rate": 0.0004422764227642276, "loss": 0.8628, "step": 810 }, { "epoch": 0.1312, "grad_norm": 3.361844778060913, "learning_rate": 0.00044146341463414636, "loss": 1.1734, "step": 820 }, { "epoch": 0.1328, "grad_norm": 2.197422981262207, "learning_rate": 0.00044065040650406506, "loss": 1.4427, "step": 830 }, { "epoch": 0.1344, "grad_norm": 1.0647861957550049, "learning_rate": 0.00043983739837398375, "loss": 1.0792, "step": 840 }, { "epoch": 0.136, "grad_norm": 2.6153125762939453, "learning_rate": 0.00043902439024390245, "loss": 1.0544, "step": 850 }, { "epoch": 0.1376, "grad_norm": 2.4081950187683105, "learning_rate": 0.00043821138211382114, "loss": 1.187, "step": 860 }, { "epoch": 0.1392, "grad_norm": 1.5457711219787598, "learning_rate": 0.00043739837398373984, "loss": 1.0687, "step": 870 }, { "epoch": 0.1408, "grad_norm": 1.0338834524154663, "learning_rate": 0.00043658536585365853, "loss": 0.9691, "step": 880 }, { "epoch": 0.1424, "grad_norm": 2.2360999584198, "learning_rate": 0.0004357723577235773, "loss": 1.0545, "step": 890 }, { "epoch": 0.144, "grad_norm": 2.766735076904297, "learning_rate": 0.0004349593495934959, "loss": 0.8978, "step": 900 }, { "epoch": 0.1456, "grad_norm": 2.768793821334839, "learning_rate": 0.0004341463414634146, "loss": 1.3507, "step": 910 }, { "epoch": 0.1472, "grad_norm": 2.9616165161132812, "learning_rate": 0.00043333333333333337, "loss": 1.2602, "step": 920 }, { "epoch": 0.1488, "grad_norm": 2.001955986022949, "learning_rate": 0.000432520325203252, "loss": 1.1164, "step": 930 }, { "epoch": 0.1504, "grad_norm": 3.6507134437561035, "learning_rate": 0.00043170731707317076, "loss": 1.0245, "step": 940 }, { "epoch": 0.152, "grad_norm": 2.580472230911255, "learning_rate": 0.00043089430894308945, "loss": 1.1918, "step": 950 }, { "epoch": 0.1536, "grad_norm": 1.4432860612869263, "learning_rate": 0.0004300813008130081, "loss": 1.0045, "step": 960 }, { "epoch": 0.1552, "grad_norm": 0.4128560423851013, "learning_rate": 0.00042926829268292684, "loss": 1.1314, "step": 970 }, { "epoch": 0.1568, "grad_norm": 1.6866891384124756, "learning_rate": 0.00042845528455284554, "loss": 0.8903, "step": 980 }, { "epoch": 0.1584, "grad_norm": 2.277833938598633, "learning_rate": 0.00042764227642276423, "loss": 1.1665, "step": 990 }, { "epoch": 0.16, "grad_norm": 2.491178512573242, "learning_rate": 0.0004268292682926829, "loss": 0.8519, "step": 1000 }, { "epoch": 0.16, "eval_loss": 1.285598874092102, "eval_runtime": 985.2873, "eval_samples_per_second": 5.075, "eval_steps_per_second": 5.075, "step": 1000 }, { "epoch": 0.1616, "grad_norm": 2.373556137084961, "learning_rate": 0.0004260162601626016, "loss": 0.8051, "step": 1010 }, { "epoch": 0.1632, "grad_norm": 2.0977611541748047, "learning_rate": 0.00042520325203252037, "loss": 1.091, "step": 1020 }, { "epoch": 0.1648, "grad_norm": 1.58762526512146, "learning_rate": 0.000424390243902439, "loss": 1.0129, "step": 1030 }, { "epoch": 0.1664, "grad_norm": 2.275988817214966, "learning_rate": 0.00042357723577235776, "loss": 1.2149, "step": 1040 }, { "epoch": 0.168, "grad_norm": 2.571326494216919, "learning_rate": 0.00042276422764227645, "loss": 0.6916, "step": 1050 }, { "epoch": 0.1696, "grad_norm": 1.5755791664123535, "learning_rate": 0.0004219512195121951, "loss": 0.8999, "step": 1060 }, { "epoch": 0.1712, "grad_norm": 2.5511739253997803, "learning_rate": 0.00042113821138211384, "loss": 1.3716, "step": 1070 }, { "epoch": 0.1728, "grad_norm": 1.7605398893356323, "learning_rate": 0.00042032520325203254, "loss": 0.6896, "step": 1080 }, { "epoch": 0.1744, "grad_norm": 2.7377278804779053, "learning_rate": 0.00041951219512195123, "loss": 0.8409, "step": 1090 }, { "epoch": 0.176, "grad_norm": 0.7547760009765625, "learning_rate": 0.00041869918699186993, "loss": 0.5473, "step": 1100 }, { "epoch": 0.1776, "grad_norm": 0.608930766582489, "learning_rate": 0.0004178861788617886, "loss": 0.9329, "step": 1110 }, { "epoch": 0.1792, "grad_norm": 1.1628577709197998, "learning_rate": 0.0004170731707317073, "loss": 1.1194, "step": 1120 }, { "epoch": 0.1808, "grad_norm": 1.2819926738739014, "learning_rate": 0.000416260162601626, "loss": 0.8936, "step": 1130 }, { "epoch": 0.1824, "grad_norm": 4.105648040771484, "learning_rate": 0.00041544715447154476, "loss": 1.0135, "step": 1140 }, { "epoch": 0.184, "grad_norm": 1.276444435119629, "learning_rate": 0.0004146341463414634, "loss": 0.9495, "step": 1150 }, { "epoch": 0.1856, "grad_norm": 4.56814432144165, "learning_rate": 0.0004138211382113821, "loss": 0.981, "step": 1160 }, { "epoch": 0.1872, "grad_norm": 2.2705514430999756, "learning_rate": 0.00041300813008130085, "loss": 1.213, "step": 1170 }, { "epoch": 0.1888, "grad_norm": 0.3999709188938141, "learning_rate": 0.0004121951219512195, "loss": 0.7223, "step": 1180 }, { "epoch": 0.1904, "grad_norm": 1.3647452592849731, "learning_rate": 0.00041138211382113824, "loss": 0.8783, "step": 1190 }, { "epoch": 0.192, "grad_norm": 1.4808121919631958, "learning_rate": 0.00041056910569105693, "loss": 1.2453, "step": 1200 }, { "epoch": 0.1936, "grad_norm": 1.598960041999817, "learning_rate": 0.00040975609756097557, "loss": 0.9894, "step": 1210 }, { "epoch": 0.1952, "grad_norm": 2.9480648040771484, "learning_rate": 0.0004089430894308943, "loss": 1.3508, "step": 1220 }, { "epoch": 0.1968, "grad_norm": 1.4865163564682007, "learning_rate": 0.000408130081300813, "loss": 1.1306, "step": 1230 }, { "epoch": 0.1984, "grad_norm": 1.445144772529602, "learning_rate": 0.0004073170731707317, "loss": 1.0977, "step": 1240 }, { "epoch": 0.2, "grad_norm": 1.741799235343933, "learning_rate": 0.0004065040650406504, "loss": 0.6742, "step": 1250 }, { "epoch": 0.2, "eval_loss": 1.3160793781280518, "eval_runtime": 979.8483, "eval_samples_per_second": 5.103, "eval_steps_per_second": 5.103, "step": 1250 }, { "epoch": 0.2016, "grad_norm": 2.603684425354004, "learning_rate": 0.0004056910569105691, "loss": 1.0061, "step": 1260 }, { "epoch": 0.2032, "grad_norm": 1.8652371168136597, "learning_rate": 0.00040487804878048785, "loss": 1.4929, "step": 1270 }, { "epoch": 0.2048, "grad_norm": 0.33945292234420776, "learning_rate": 0.0004040650406504065, "loss": 0.8642, "step": 1280 }, { "epoch": 0.2064, "grad_norm": 4.188396453857422, "learning_rate": 0.00040325203252032524, "loss": 0.8442, "step": 1290 }, { "epoch": 0.208, "grad_norm": 0.17668074369430542, "learning_rate": 0.00040243902439024393, "loss": 0.8546, "step": 1300 }, { "epoch": 0.2096, "grad_norm": 2.030315637588501, "learning_rate": 0.0004016260162601626, "loss": 0.8966, "step": 1310 }, { "epoch": 0.2112, "grad_norm": 2.019416332244873, "learning_rate": 0.0004008130081300813, "loss": 0.8694, "step": 1320 }, { "epoch": 0.2128, "grad_norm": 2.457531690597534, "learning_rate": 0.0004, "loss": 1.1978, "step": 1330 }, { "epoch": 0.2144, "grad_norm": 1.8144443035125732, "learning_rate": 0.0003991869918699187, "loss": 0.5631, "step": 1340 }, { "epoch": 0.216, "grad_norm": 1.4982506036758423, "learning_rate": 0.0003983739837398374, "loss": 1.5233, "step": 1350 }, { "epoch": 0.2176, "grad_norm": 2.3264763355255127, "learning_rate": 0.0003975609756097561, "loss": 0.9226, "step": 1360 }, { "epoch": 0.2192, "grad_norm": 3.8008902072906494, "learning_rate": 0.0003967479674796748, "loss": 1.1128, "step": 1370 }, { "epoch": 0.2208, "grad_norm": 1.7756544351577759, "learning_rate": 0.0003959349593495935, "loss": 1.0248, "step": 1380 }, { "epoch": 0.2224, "grad_norm": 1.0137070417404175, "learning_rate": 0.00039512195121951224, "loss": 0.9582, "step": 1390 }, { "epoch": 0.224, "grad_norm": 2.0322794914245605, "learning_rate": 0.0003943089430894309, "loss": 1.0282, "step": 1400 }, { "epoch": 0.2256, "grad_norm": 3.740598678588867, "learning_rate": 0.0003934959349593496, "loss": 0.8924, "step": 1410 }, { "epoch": 0.2272, "grad_norm": 1.8170685768127441, "learning_rate": 0.0003926829268292683, "loss": 1.1218, "step": 1420 }, { "epoch": 0.2288, "grad_norm": 3.1609694957733154, "learning_rate": 0.00039186991869918697, "loss": 1.2141, "step": 1430 }, { "epoch": 0.2304, "grad_norm": 3.2545180320739746, "learning_rate": 0.0003910569105691057, "loss": 1.3735, "step": 1440 }, { "epoch": 0.232, "grad_norm": 3.983858585357666, "learning_rate": 0.0003902439024390244, "loss": 1.3722, "step": 1450 }, { "epoch": 0.2336, "grad_norm": 1.1452367305755615, "learning_rate": 0.00038943089430894305, "loss": 0.9909, "step": 1460 }, { "epoch": 0.2352, "grad_norm": 1.7607592344284058, "learning_rate": 0.0003886178861788618, "loss": 0.8425, "step": 1470 }, { "epoch": 0.2368, "grad_norm": 4.85504150390625, "learning_rate": 0.0003878048780487805, "loss": 1.3845, "step": 1480 }, { "epoch": 0.2384, "grad_norm": 1.5017164945602417, "learning_rate": 0.0003869918699186992, "loss": 1.2752, "step": 1490 }, { "epoch": 0.24, "grad_norm": 2.2073278427124023, "learning_rate": 0.0003861788617886179, "loss": 1.2848, "step": 1500 }, { "epoch": 0.24, "eval_loss": 1.2479583024978638, "eval_runtime": 983.0597, "eval_samples_per_second": 5.086, "eval_steps_per_second": 5.086, "step": 1500 }, { "epoch": 0.2416, "grad_norm": 1.8838679790496826, "learning_rate": 0.0003853658536585366, "loss": 0.8091, "step": 1510 }, { "epoch": 0.2432, "grad_norm": 1.3011258840560913, "learning_rate": 0.00038455284552845533, "loss": 0.9439, "step": 1520 }, { "epoch": 0.2448, "grad_norm": 2.353581666946411, "learning_rate": 0.00038373983739837397, "loss": 0.7322, "step": 1530 }, { "epoch": 0.2464, "grad_norm": 3.3765604496002197, "learning_rate": 0.0003829268292682927, "loss": 1.275, "step": 1540 }, { "epoch": 0.248, "grad_norm": 0.1768433153629303, "learning_rate": 0.0003821138211382114, "loss": 0.6793, "step": 1550 }, { "epoch": 0.2496, "grad_norm": 2.3571674823760986, "learning_rate": 0.00038130081300813005, "loss": 1.0786, "step": 1560 }, { "epoch": 0.2512, "grad_norm": 2.615161180496216, "learning_rate": 0.0003804878048780488, "loss": 1.0066, "step": 1570 }, { "epoch": 0.2528, "grad_norm": 4.079470634460449, "learning_rate": 0.0003796747967479675, "loss": 1.1569, "step": 1580 }, { "epoch": 0.2544, "grad_norm": 4.194530010223389, "learning_rate": 0.0003788617886178862, "loss": 0.8787, "step": 1590 }, { "epoch": 0.256, "grad_norm": 1.096019983291626, "learning_rate": 0.0003780487804878049, "loss": 1.2383, "step": 1600 }, { "epoch": 0.2576, "grad_norm": 1.352885365486145, "learning_rate": 0.0003772357723577236, "loss": 0.9154, "step": 1610 }, { "epoch": 0.2592, "grad_norm": 0.7181969285011292, "learning_rate": 0.0003764227642276423, "loss": 0.9014, "step": 1620 }, { "epoch": 0.2608, "grad_norm": 2.9218623638153076, "learning_rate": 0.000375609756097561, "loss": 0.5235, "step": 1630 }, { "epoch": 0.2624, "grad_norm": 1.3558822870254517, "learning_rate": 0.0003747967479674797, "loss": 0.9481, "step": 1640 }, { "epoch": 0.264, "grad_norm": 1.477393627166748, "learning_rate": 0.00037398373983739836, "loss": 0.952, "step": 1650 }, { "epoch": 0.2656, "grad_norm": 1.007265329360962, "learning_rate": 0.00037317073170731706, "loss": 0.9081, "step": 1660 }, { "epoch": 0.2672, "grad_norm": 0.8309774994850159, "learning_rate": 0.0003723577235772358, "loss": 0.8679, "step": 1670 }, { "epoch": 0.2688, "grad_norm": 0.13249029219150543, "learning_rate": 0.00037154471544715445, "loss": 0.9459, "step": 1680 }, { "epoch": 0.2704, "grad_norm": 0.34113630652427673, "learning_rate": 0.0003707317073170732, "loss": 0.91, "step": 1690 }, { "epoch": 0.272, "grad_norm": 1.434639811515808, "learning_rate": 0.0003699186991869919, "loss": 1.0692, "step": 1700 }, { "epoch": 0.2736, "grad_norm": 0.6987395286560059, "learning_rate": 0.00036910569105691053, "loss": 0.7542, "step": 1710 }, { "epoch": 0.2752, "grad_norm": 1.9249191284179688, "learning_rate": 0.0003682926829268293, "loss": 0.8768, "step": 1720 }, { "epoch": 0.2768, "grad_norm": 1.9825557470321655, "learning_rate": 0.000367479674796748, "loss": 0.8561, "step": 1730 }, { "epoch": 0.2784, "grad_norm": 1.9180725812911987, "learning_rate": 0.00036666666666666667, "loss": 0.9844, "step": 1740 }, { "epoch": 0.28, "grad_norm": 2.271852970123291, "learning_rate": 0.00036585365853658537, "loss": 0.8497, "step": 1750 }, { "epoch": 0.28, "eval_loss": 1.2472246885299683, "eval_runtime": 985.0188, "eval_samples_per_second": 5.076, "eval_steps_per_second": 5.076, "step": 1750 }, { "epoch": 0.2816, "grad_norm": 1.7116152048110962, "learning_rate": 0.00036504065040650406, "loss": 0.7791, "step": 1760 }, { "epoch": 0.2832, "grad_norm": 0.9404670596122742, "learning_rate": 0.0003642276422764228, "loss": 1.182, "step": 1770 }, { "epoch": 0.2848, "grad_norm": 1.3179750442504883, "learning_rate": 0.00036341463414634145, "loss": 0.969, "step": 1780 }, { "epoch": 0.2864, "grad_norm": 2.95133113861084, "learning_rate": 0.0003626016260162602, "loss": 1.1718, "step": 1790 }, { "epoch": 0.288, "grad_norm": 1.6571485996246338, "learning_rate": 0.0003617886178861789, "loss": 0.8182, "step": 1800 }, { "epoch": 0.2896, "grad_norm": 3.7702455520629883, "learning_rate": 0.00036097560975609753, "loss": 0.6961, "step": 1810 }, { "epoch": 0.2912, "grad_norm": 2.043027639389038, "learning_rate": 0.0003601626016260163, "loss": 1.2462, "step": 1820 }, { "epoch": 0.2928, "grad_norm": 3.061659336090088, "learning_rate": 0.000359349593495935, "loss": 0.9774, "step": 1830 }, { "epoch": 0.2944, "grad_norm": 2.4722511768341064, "learning_rate": 0.0003585365853658537, "loss": 1.0065, "step": 1840 }, { "epoch": 0.296, "grad_norm": 3.479311227798462, "learning_rate": 0.00035772357723577237, "loss": 0.939, "step": 1850 }, { "epoch": 0.2976, "grad_norm": 1.6811845302581787, "learning_rate": 0.00035691056910569106, "loss": 1.1362, "step": 1860 }, { "epoch": 0.2992, "grad_norm": 2.2085628509521484, "learning_rate": 0.00035609756097560976, "loss": 1.0811, "step": 1870 }, { "epoch": 0.3008, "grad_norm": 1.638785481452942, "learning_rate": 0.00035528455284552845, "loss": 0.9224, "step": 1880 }, { "epoch": 0.3024, "grad_norm": 2.308749198913574, "learning_rate": 0.0003544715447154472, "loss": 0.7213, "step": 1890 }, { "epoch": 0.304, "grad_norm": 3.1847198009490967, "learning_rate": 0.00035365853658536584, "loss": 1.0388, "step": 1900 }, { "epoch": 0.3056, "grad_norm": 0.1243385598063469, "learning_rate": 0.00035284552845528454, "loss": 0.9196, "step": 1910 }, { "epoch": 0.3072, "grad_norm": 3.720393657684326, "learning_rate": 0.0003520325203252033, "loss": 0.9104, "step": 1920 }, { "epoch": 0.3088, "grad_norm": 1.8433055877685547, "learning_rate": 0.0003512195121951219, "loss": 1.0138, "step": 1930 }, { "epoch": 0.3104, "grad_norm": 2.8602616786956787, "learning_rate": 0.0003504065040650407, "loss": 1.2236, "step": 1940 }, { "epoch": 0.312, "grad_norm": 1.5425291061401367, "learning_rate": 0.00034959349593495937, "loss": 1.1536, "step": 1950 }, { "epoch": 0.3136, "grad_norm": 3.1375532150268555, "learning_rate": 0.000348780487804878, "loss": 1.0996, "step": 1960 }, { "epoch": 0.3152, "grad_norm": 2.8346571922302246, "learning_rate": 0.00034796747967479676, "loss": 1.2285, "step": 1970 }, { "epoch": 0.3168, "grad_norm": 2.045203924179077, "learning_rate": 0.00034715447154471546, "loss": 1.1145, "step": 1980 }, { "epoch": 0.3184, "grad_norm": 2.8319339752197266, "learning_rate": 0.00034634146341463415, "loss": 0.9861, "step": 1990 }, { "epoch": 0.32, "grad_norm": 1.499254584312439, "learning_rate": 0.00034552845528455285, "loss": 1.1386, "step": 2000 }, { "epoch": 0.32, "eval_loss": 1.256957769393921, "eval_runtime": 991.1325, "eval_samples_per_second": 5.045, "eval_steps_per_second": 5.045, "step": 2000 }, { "epoch": 0.3216, "grad_norm": 0.11786320060491562, "learning_rate": 0.00034471544715447154, "loss": 0.5642, "step": 2010 }, { "epoch": 0.3232, "grad_norm": 3.7964823246002197, "learning_rate": 0.00034390243902439023, "loss": 0.7509, "step": 2020 }, { "epoch": 0.3248, "grad_norm": 0.842405378818512, "learning_rate": 0.00034308943089430893, "loss": 1.0208, "step": 2030 }, { "epoch": 0.3264, "grad_norm": 1.725205421447754, "learning_rate": 0.0003422764227642277, "loss": 0.8658, "step": 2040 }, { "epoch": 0.328, "grad_norm": 2.685744047164917, "learning_rate": 0.0003414634146341464, "loss": 0.9458, "step": 2050 }, { "epoch": 0.3296, "grad_norm": 1.153769850730896, "learning_rate": 0.000340650406504065, "loss": 0.711, "step": 2060 }, { "epoch": 0.3312, "grad_norm": 1.1558091640472412, "learning_rate": 0.00033983739837398376, "loss": 0.8342, "step": 2070 }, { "epoch": 0.3328, "grad_norm": 1.4191641807556152, "learning_rate": 0.00033902439024390246, "loss": 0.7802, "step": 2080 }, { "epoch": 0.3344, "grad_norm": 1.611901044845581, "learning_rate": 0.00033821138211382115, "loss": 0.6892, "step": 2090 }, { "epoch": 0.336, "grad_norm": 2.4627749919891357, "learning_rate": 0.00033739837398373985, "loss": 1.02, "step": 2100 }, { "epoch": 0.3376, "grad_norm": 1.315026044845581, "learning_rate": 0.00033658536585365854, "loss": 1.0683, "step": 2110 }, { "epoch": 0.3392, "grad_norm": 1.301009178161621, "learning_rate": 0.00033577235772357724, "loss": 0.877, "step": 2120 }, { "epoch": 0.3408, "grad_norm": 1.3801865577697754, "learning_rate": 0.00033495934959349593, "loss": 0.7509, "step": 2130 }, { "epoch": 0.3424, "grad_norm": 2.0285582542419434, "learning_rate": 0.0003341463414634147, "loss": 0.898, "step": 2140 }, { "epoch": 0.344, "grad_norm": 1.8573026657104492, "learning_rate": 0.0003333333333333333, "loss": 0.8242, "step": 2150 }, { "epoch": 0.3456, "grad_norm": 3.389634609222412, "learning_rate": 0.000332520325203252, "loss": 1.0648, "step": 2160 }, { "epoch": 0.3472, "grad_norm": 1.5863244533538818, "learning_rate": 0.00033170731707317077, "loss": 0.7867, "step": 2170 }, { "epoch": 0.3488, "grad_norm": 1.4909151792526245, "learning_rate": 0.0003308943089430894, "loss": 0.6263, "step": 2180 }, { "epoch": 0.3504, "grad_norm": 1.706332802772522, "learning_rate": 0.00033008130081300816, "loss": 1.2284, "step": 2190 }, { "epoch": 0.352, "grad_norm": 1.5109984874725342, "learning_rate": 0.00032926829268292685, "loss": 0.8387, "step": 2200 }, { "epoch": 0.3536, "grad_norm": 2.0420236587524414, "learning_rate": 0.0003284552845528455, "loss": 0.9217, "step": 2210 }, { "epoch": 0.3552, "grad_norm": 3.405055284500122, "learning_rate": 0.00032764227642276424, "loss": 1.2896, "step": 2220 }, { "epoch": 0.3568, "grad_norm": 1.0633502006530762, "learning_rate": 0.00032682926829268294, "loss": 0.9075, "step": 2230 }, { "epoch": 0.3584, "grad_norm": 8.972102165222168, "learning_rate": 0.00032601626016260163, "loss": 0.9067, "step": 2240 }, { "epoch": 0.36, "grad_norm": 2.066863775253296, "learning_rate": 0.0003252032520325203, "loss": 1.0274, "step": 2250 }, { "epoch": 0.36, "eval_loss": 1.2403863668441772, "eval_runtime": 988.1441, "eval_samples_per_second": 5.06, "eval_steps_per_second": 5.06, "step": 2250 }, { "epoch": 0.3616, "grad_norm": 2.7696852684020996, "learning_rate": 0.000324390243902439, "loss": 0.9223, "step": 2260 }, { "epoch": 0.3632, "grad_norm": 0.5893625020980835, "learning_rate": 0.0003235772357723577, "loss": 0.477, "step": 2270 }, { "epoch": 0.3648, "grad_norm": 4.938126564025879, "learning_rate": 0.0003227642276422764, "loss": 0.9821, "step": 2280 }, { "epoch": 0.3664, "grad_norm": 3.392449378967285, "learning_rate": 0.00032195121951219516, "loss": 0.7618, "step": 2290 }, { "epoch": 0.368, "grad_norm": 0.7704805135726929, "learning_rate": 0.00032113821138211385, "loss": 0.8412, "step": 2300 }, { "epoch": 0.3696, "grad_norm": 2.1122782230377197, "learning_rate": 0.0003203252032520325, "loss": 1.0207, "step": 2310 }, { "epoch": 0.3712, "grad_norm": 0.8763427734375, "learning_rate": 0.00031951219512195124, "loss": 0.9647, "step": 2320 }, { "epoch": 0.3728, "grad_norm": 3.7990410327911377, "learning_rate": 0.00031869918699186994, "loss": 0.9023, "step": 2330 }, { "epoch": 0.3744, "grad_norm": 1.1437709331512451, "learning_rate": 0.00031788617886178863, "loss": 0.6665, "step": 2340 }, { "epoch": 0.376, "grad_norm": 1.7292026281356812, "learning_rate": 0.00031707317073170733, "loss": 0.9873, "step": 2350 }, { "epoch": 0.3776, "grad_norm": 2.6598150730133057, "learning_rate": 0.000316260162601626, "loss": 1.0194, "step": 2360 }, { "epoch": 0.3792, "grad_norm": 1.1113004684448242, "learning_rate": 0.0003154471544715447, "loss": 0.9213, "step": 2370 }, { "epoch": 0.3808, "grad_norm": 2.316197395324707, "learning_rate": 0.0003146341463414634, "loss": 0.8524, "step": 2380 }, { "epoch": 0.3824, "grad_norm": 3.070237636566162, "learning_rate": 0.00031382113821138216, "loss": 1.0224, "step": 2390 }, { "epoch": 0.384, "grad_norm": 1.2676721811294556, "learning_rate": 0.0003130081300813008, "loss": 0.6098, "step": 2400 }, { "epoch": 0.3856, "grad_norm": 0.6013288497924805, "learning_rate": 0.0003121951219512195, "loss": 0.7596, "step": 2410 }, { "epoch": 0.3872, "grad_norm": 1.8486628532409668, "learning_rate": 0.00031138211382113825, "loss": 0.8179, "step": 2420 }, { "epoch": 0.3888, "grad_norm": 1.7957018613815308, "learning_rate": 0.0003105691056910569, "loss": 0.612, "step": 2430 }, { "epoch": 0.3904, "grad_norm": 0.8443304896354675, "learning_rate": 0.00030975609756097564, "loss": 1.1852, "step": 2440 }, { "epoch": 0.392, "grad_norm": 2.9211597442626953, "learning_rate": 0.00030894308943089433, "loss": 0.648, "step": 2450 }, { "epoch": 0.3936, "grad_norm": 1.8100574016571045, "learning_rate": 0.00030813008130081297, "loss": 0.9621, "step": 2460 }, { "epoch": 0.3952, "grad_norm": 0.5521060824394226, "learning_rate": 0.0003073170731707317, "loss": 0.7497, "step": 2470 }, { "epoch": 0.3968, "grad_norm": 3.2785725593566895, "learning_rate": 0.0003065040650406504, "loss": 1.1055, "step": 2480 }, { "epoch": 0.3984, "grad_norm": 0.59607994556427, "learning_rate": 0.0003056910569105691, "loss": 1.0316, "step": 2490 }, { "epoch": 0.4, "grad_norm": 2.6700527667999268, "learning_rate": 0.0003048780487804878, "loss": 1.1739, "step": 2500 }, { "epoch": 0.4, "eval_loss": 1.1851228475570679, "eval_runtime": 981.6822, "eval_samples_per_second": 5.093, "eval_steps_per_second": 5.093, "step": 2500 }, { "epoch": 0.4016, "grad_norm": 4.136201858520508, "learning_rate": 0.0003040650406504065, "loss": 1.065, "step": 2510 }, { "epoch": 0.4032, "grad_norm": 1.3209648132324219, "learning_rate": 0.0003032520325203252, "loss": 0.9474, "step": 2520 }, { "epoch": 0.4048, "grad_norm": 1.0437417030334473, "learning_rate": 0.0003024390243902439, "loss": 0.7807, "step": 2530 }, { "epoch": 0.4064, "grad_norm": 1.5563472509384155, "learning_rate": 0.00030162601626016264, "loss": 0.9598, "step": 2540 }, { "epoch": 0.408, "grad_norm": 2.1927716732025146, "learning_rate": 0.00030081300813008133, "loss": 0.9768, "step": 2550 }, { "epoch": 0.4096, "grad_norm": 2.606297016143799, "learning_rate": 0.0003, "loss": 0.9263, "step": 2560 }, { "epoch": 0.4112, "grad_norm": 2.935955762863159, "learning_rate": 0.0002991869918699187, "loss": 1.1745, "step": 2570 }, { "epoch": 0.4128, "grad_norm": 1.4000895023345947, "learning_rate": 0.0002983739837398374, "loss": 0.4629, "step": 2580 }, { "epoch": 0.4144, "grad_norm": 1.867844820022583, "learning_rate": 0.0002975609756097561, "loss": 0.5586, "step": 2590 }, { "epoch": 0.416, "grad_norm": 2.5813417434692383, "learning_rate": 0.0002967479674796748, "loss": 1.2132, "step": 2600 }, { "epoch": 0.4176, "grad_norm": 2.0057499408721924, "learning_rate": 0.0002959349593495935, "loss": 0.9353, "step": 2610 }, { "epoch": 0.4192, "grad_norm": 1.3958829641342163, "learning_rate": 0.0002951219512195122, "loss": 0.9026, "step": 2620 }, { "epoch": 0.4208, "grad_norm": 0.9305471777915955, "learning_rate": 0.0002943089430894309, "loss": 0.637, "step": 2630 }, { "epoch": 0.4224, "grad_norm": 1.509443998336792, "learning_rate": 0.00029349593495934964, "loss": 0.9101, "step": 2640 }, { "epoch": 0.424, "grad_norm": 1.1322437524795532, "learning_rate": 0.0002926829268292683, "loss": 0.8228, "step": 2650 }, { "epoch": 0.4256, "grad_norm": 1.4510544538497925, "learning_rate": 0.000291869918699187, "loss": 0.8965, "step": 2660 }, { "epoch": 0.4272, "grad_norm": 3.9876041412353516, "learning_rate": 0.0002910569105691057, "loss": 0.6826, "step": 2670 }, { "epoch": 0.4288, "grad_norm": 3.5676708221435547, "learning_rate": 0.00029024390243902437, "loss": 0.848, "step": 2680 }, { "epoch": 0.4304, "grad_norm": 2.8735740184783936, "learning_rate": 0.0002894308943089431, "loss": 0.8885, "step": 2690 }, { "epoch": 0.432, "grad_norm": 3.243591547012329, "learning_rate": 0.0002886178861788618, "loss": 0.7272, "step": 2700 }, { "epoch": 0.4336, "grad_norm": 1.0646330118179321, "learning_rate": 0.00028780487804878045, "loss": 0.9243, "step": 2710 }, { "epoch": 0.4352, "grad_norm": 2.145279884338379, "learning_rate": 0.0002869918699186992, "loss": 0.8605, "step": 2720 }, { "epoch": 0.4368, "grad_norm": 0.12611623108386993, "learning_rate": 0.0002861788617886179, "loss": 0.8593, "step": 2730 }, { "epoch": 0.4384, "grad_norm": 1.6009489297866821, "learning_rate": 0.0002853658536585366, "loss": 1.2734, "step": 2740 }, { "epoch": 0.44, "grad_norm": 1.202634572982788, "learning_rate": 0.0002845528455284553, "loss": 0.6056, "step": 2750 }, { "epoch": 0.44, "eval_loss": 1.1804109811782837, "eval_runtime": 980.959, "eval_samples_per_second": 5.097, "eval_steps_per_second": 5.097, "step": 2750 }, { "epoch": 0.4416, "grad_norm": 2.083847999572754, "learning_rate": 0.000283739837398374, "loss": 0.9537, "step": 2760 }, { "epoch": 0.4432, "grad_norm": 1.4282503128051758, "learning_rate": 0.0002829268292682927, "loss": 1.1229, "step": 2770 }, { "epoch": 0.4448, "grad_norm": 2.896178722381592, "learning_rate": 0.00028211382113821137, "loss": 1.0021, "step": 2780 }, { "epoch": 0.4464, "grad_norm": 2.1293370723724365, "learning_rate": 0.0002813008130081301, "loss": 0.7991, "step": 2790 }, { "epoch": 0.448, "grad_norm": 2.050287961959839, "learning_rate": 0.0002804878048780488, "loss": 0.7226, "step": 2800 }, { "epoch": 0.4496, "grad_norm": 2.431018590927124, "learning_rate": 0.00027967479674796745, "loss": 0.7713, "step": 2810 }, { "epoch": 0.4512, "grad_norm": 0.8670142292976379, "learning_rate": 0.0002788617886178862, "loss": 0.8047, "step": 2820 }, { "epoch": 0.4528, "grad_norm": 2.0386359691619873, "learning_rate": 0.0002780487804878049, "loss": 1.1617, "step": 2830 }, { "epoch": 0.4544, "grad_norm": 2.0506820678710938, "learning_rate": 0.0002772357723577236, "loss": 0.8922, "step": 2840 }, { "epoch": 0.456, "grad_norm": 0.9461020827293396, "learning_rate": 0.0002764227642276423, "loss": 0.4643, "step": 2850 }, { "epoch": 0.4576, "grad_norm": 2.101771116256714, "learning_rate": 0.000275609756097561, "loss": 0.6462, "step": 2860 }, { "epoch": 0.4592, "grad_norm": 2.1208040714263916, "learning_rate": 0.0002747967479674797, "loss": 0.9001, "step": 2870 }, { "epoch": 0.4608, "grad_norm": 2.1006226539611816, "learning_rate": 0.00027398373983739837, "loss": 1.0824, "step": 2880 }, { "epoch": 0.4624, "grad_norm": 1.7304776906967163, "learning_rate": 0.0002731707317073171, "loss": 0.8835, "step": 2890 }, { "epoch": 0.464, "grad_norm": 1.453547477722168, "learning_rate": 0.00027235772357723576, "loss": 0.8462, "step": 2900 }, { "epoch": 0.4656, "grad_norm": 2.442309856414795, "learning_rate": 0.00027154471544715446, "loss": 1.2086, "step": 2910 }, { "epoch": 0.4672, "grad_norm": 1.3511006832122803, "learning_rate": 0.0002707317073170732, "loss": 0.7028, "step": 2920 }, { "epoch": 0.4688, "grad_norm": 0.836704671382904, "learning_rate": 0.00026991869918699185, "loss": 0.5175, "step": 2930 }, { "epoch": 0.4704, "grad_norm": 1.680782675743103, "learning_rate": 0.0002691056910569106, "loss": 0.906, "step": 2940 }, { "epoch": 0.472, "grad_norm": 1.6119508743286133, "learning_rate": 0.0002682926829268293, "loss": 0.8772, "step": 2950 }, { "epoch": 0.4736, "grad_norm": 0.7434157729148865, "learning_rate": 0.00026747967479674793, "loss": 0.7295, "step": 2960 }, { "epoch": 0.4752, "grad_norm": 2.9454092979431152, "learning_rate": 0.0002666666666666667, "loss": 0.6404, "step": 2970 }, { "epoch": 0.4768, "grad_norm": 2.6205222606658936, "learning_rate": 0.0002658536585365854, "loss": 0.7348, "step": 2980 }, { "epoch": 0.4784, "grad_norm": 0.9789513945579529, "learning_rate": 0.00026504065040650407, "loss": 0.9531, "step": 2990 }, { "epoch": 0.48, "grad_norm": 2.718863010406494, "learning_rate": 0.00026422764227642276, "loss": 0.7694, "step": 3000 }, { "epoch": 0.48, "eval_loss": 1.234536051750183, "eval_runtime": 970.4874, "eval_samples_per_second": 5.152, "eval_steps_per_second": 5.152, "step": 3000 }, { "epoch": 0.4816, "grad_norm": 3.2361960411071777, "learning_rate": 0.00026341463414634146, "loss": 1.0666, "step": 3010 }, { "epoch": 0.4832, "grad_norm": 2.865548610687256, "learning_rate": 0.00026260162601626015, "loss": 0.9337, "step": 3020 }, { "epoch": 0.4848, "grad_norm": 2.602585554122925, "learning_rate": 0.00026178861788617885, "loss": 0.6121, "step": 3030 }, { "epoch": 0.4864, "grad_norm": 3.5708200931549072, "learning_rate": 0.0002609756097560976, "loss": 0.9949, "step": 3040 }, { "epoch": 0.488, "grad_norm": 1.3991180658340454, "learning_rate": 0.0002601626016260163, "loss": 0.6146, "step": 3050 }, { "epoch": 0.4896, "grad_norm": 1.6713191270828247, "learning_rate": 0.00025934959349593493, "loss": 0.903, "step": 3060 }, { "epoch": 0.4912, "grad_norm": 2.2837250232696533, "learning_rate": 0.0002585365853658537, "loss": 0.6912, "step": 3070 }, { "epoch": 0.4928, "grad_norm": 1.2613333463668823, "learning_rate": 0.0002577235772357724, "loss": 0.7742, "step": 3080 }, { "epoch": 0.4944, "grad_norm": 0.7632296085357666, "learning_rate": 0.00025691056910569107, "loss": 0.6821, "step": 3090 }, { "epoch": 0.496, "grad_norm": 2.6601462364196777, "learning_rate": 0.00025609756097560977, "loss": 1.1208, "step": 3100 }, { "epoch": 0.4976, "grad_norm": 1.0819050073623657, "learning_rate": 0.00025528455284552846, "loss": 0.596, "step": 3110 }, { "epoch": 0.4992, "grad_norm": 2.7370996475219727, "learning_rate": 0.00025447154471544716, "loss": 0.7677, "step": 3120 }, { "epoch": 0.5008, "grad_norm": 1.8494040966033936, "learning_rate": 0.00025365853658536585, "loss": 1.0403, "step": 3130 }, { "epoch": 0.5024, "grad_norm": 1.1479870080947876, "learning_rate": 0.0002528455284552846, "loss": 0.7458, "step": 3140 }, { "epoch": 0.504, "grad_norm": 2.6968982219696045, "learning_rate": 0.00025203252032520324, "loss": 1.0832, "step": 3150 }, { "epoch": 0.5056, "grad_norm": 2.046722173690796, "learning_rate": 0.00025121951219512194, "loss": 0.8306, "step": 3160 }, { "epoch": 0.5072, "grad_norm": 1.8968234062194824, "learning_rate": 0.0002504065040650407, "loss": 0.6512, "step": 3170 }, { "epoch": 0.5088, "grad_norm": 2.967087984085083, "learning_rate": 0.0002495934959349594, "loss": 0.7747, "step": 3180 }, { "epoch": 0.5104, "grad_norm": 1.0478880405426025, "learning_rate": 0.0002487804878048781, "loss": 0.8271, "step": 3190 }, { "epoch": 0.512, "grad_norm": 0.8186447620391846, "learning_rate": 0.0002479674796747967, "loss": 0.891, "step": 3200 }, { "epoch": 0.5136, "grad_norm": 4.616454601287842, "learning_rate": 0.00024715447154471546, "loss": 1.0238, "step": 3210 }, { "epoch": 0.5152, "grad_norm": 0.8574868440628052, "learning_rate": 0.00024634146341463416, "loss": 1.1656, "step": 3220 }, { "epoch": 0.5168, "grad_norm": 0.13767553865909576, "learning_rate": 0.00024552845528455285, "loss": 0.7881, "step": 3230 }, { "epoch": 0.5184, "grad_norm": 3.214853048324585, "learning_rate": 0.00024471544715447155, "loss": 1.1101, "step": 3240 }, { "epoch": 0.52, "grad_norm": 1.6308200359344482, "learning_rate": 0.00024390243902439024, "loss": 0.816, "step": 3250 }, { "epoch": 0.52, "eval_loss": 1.1804865598678589, "eval_runtime": 967.604, "eval_samples_per_second": 5.167, "eval_steps_per_second": 5.167, "step": 3250 }, { "epoch": 0.5216, "grad_norm": 2.277055501937866, "learning_rate": 0.00024308943089430897, "loss": 0.6252, "step": 3260 }, { "epoch": 0.5232, "grad_norm": 1.4975641965866089, "learning_rate": 0.00024227642276422766, "loss": 0.9561, "step": 3270 }, { "epoch": 0.5248, "grad_norm": 0.9692897796630859, "learning_rate": 0.00024146341463414633, "loss": 0.8452, "step": 3280 }, { "epoch": 0.5264, "grad_norm": 3.146620512008667, "learning_rate": 0.00024065040650406505, "loss": 0.7964, "step": 3290 }, { "epoch": 0.528, "grad_norm": 1.6603403091430664, "learning_rate": 0.00023983739837398375, "loss": 0.8526, "step": 3300 }, { "epoch": 0.5296, "grad_norm": 2.813284158706665, "learning_rate": 0.00023902439024390244, "loss": 0.7897, "step": 3310 }, { "epoch": 0.5312, "grad_norm": 2.0508971214294434, "learning_rate": 0.00023821138211382116, "loss": 0.7886, "step": 3320 }, { "epoch": 0.5328, "grad_norm": 2.0528271198272705, "learning_rate": 0.00023739837398373983, "loss": 0.9615, "step": 3330 }, { "epoch": 0.5344, "grad_norm": 0.16403831541538239, "learning_rate": 0.00023658536585365852, "loss": 0.646, "step": 3340 }, { "epoch": 0.536, "grad_norm": 0.7658578753471375, "learning_rate": 0.00023577235772357725, "loss": 0.952, "step": 3350 }, { "epoch": 0.5376, "grad_norm": 1.5900917053222656, "learning_rate": 0.00023495934959349594, "loss": 0.6692, "step": 3360 }, { "epoch": 0.5392, "grad_norm": 2.9648303985595703, "learning_rate": 0.00023414634146341466, "loss": 1.1142, "step": 3370 }, { "epoch": 0.5408, "grad_norm": 1.9145853519439697, "learning_rate": 0.00023333333333333333, "loss": 0.946, "step": 3380 }, { "epoch": 0.5424, "grad_norm": 1.9604383707046509, "learning_rate": 0.00023252032520325203, "loss": 1.0952, "step": 3390 }, { "epoch": 0.544, "grad_norm": 1.4773716926574707, "learning_rate": 0.00023170731707317075, "loss": 0.798, "step": 3400 }, { "epoch": 0.5456, "grad_norm": 1.0287730693817139, "learning_rate": 0.00023089430894308944, "loss": 0.6784, "step": 3410 }, { "epoch": 0.5472, "grad_norm": 1.171778678894043, "learning_rate": 0.00023008130081300814, "loss": 0.8399, "step": 3420 }, { "epoch": 0.5488, "grad_norm": 3.042232036590576, "learning_rate": 0.00022926829268292683, "loss": 0.9466, "step": 3430 }, { "epoch": 0.5504, "grad_norm": 2.661311388015747, "learning_rate": 0.00022845528455284553, "loss": 0.8741, "step": 3440 }, { "epoch": 0.552, "grad_norm": 0.824052631855011, "learning_rate": 0.00022764227642276422, "loss": 0.6971, "step": 3450 }, { "epoch": 0.5536, "grad_norm": 2.2068676948547363, "learning_rate": 0.00022682926829268294, "loss": 1.0792, "step": 3460 }, { "epoch": 0.5552, "grad_norm": 2.607996702194214, "learning_rate": 0.00022601626016260164, "loss": 0.7798, "step": 3470 }, { "epoch": 0.5568, "grad_norm": 3.1297590732574463, "learning_rate": 0.0002252032520325203, "loss": 0.7335, "step": 3480 }, { "epoch": 0.5584, "grad_norm": 1.5206081867218018, "learning_rate": 0.00022439024390243903, "loss": 0.5827, "step": 3490 }, { "epoch": 0.56, "grad_norm": 0.8850612044334412, "learning_rate": 0.00022357723577235772, "loss": 0.3555, "step": 3500 }, { "epoch": 0.56, "eval_loss": 1.173862338066101, "eval_runtime": 972.1905, "eval_samples_per_second": 5.143, "eval_steps_per_second": 5.143, "step": 3500 }, { "epoch": 0.5616, "grad_norm": 2.206645965576172, "learning_rate": 0.00022276422764227645, "loss": 1.0334, "step": 3510 }, { "epoch": 0.5632, "grad_norm": 2.9328489303588867, "learning_rate": 0.00022195121951219514, "loss": 0.9996, "step": 3520 }, { "epoch": 0.5648, "grad_norm": 1.533145785331726, "learning_rate": 0.0002211382113821138, "loss": 0.6801, "step": 3530 }, { "epoch": 0.5664, "grad_norm": 2.793165922164917, "learning_rate": 0.00022032520325203253, "loss": 0.5243, "step": 3540 }, { "epoch": 0.568, "grad_norm": 0.8398239612579346, "learning_rate": 0.00021951219512195122, "loss": 0.7804, "step": 3550 }, { "epoch": 0.5696, "grad_norm": 0.8570539951324463, "learning_rate": 0.00021869918699186992, "loss": 0.8727, "step": 3560 }, { "epoch": 0.5712, "grad_norm": 3.4033238887786865, "learning_rate": 0.00021788617886178864, "loss": 0.7058, "step": 3570 }, { "epoch": 0.5728, "grad_norm": 2.2475531101226807, "learning_rate": 0.0002170731707317073, "loss": 0.6313, "step": 3580 }, { "epoch": 0.5744, "grad_norm": 2.115605354309082, "learning_rate": 0.000216260162601626, "loss": 0.724, "step": 3590 }, { "epoch": 0.576, "grad_norm": 3.056288003921509, "learning_rate": 0.00021544715447154473, "loss": 0.8915, "step": 3600 }, { "epoch": 0.5776, "grad_norm": 2.0451629161834717, "learning_rate": 0.00021463414634146342, "loss": 0.9079, "step": 3610 }, { "epoch": 0.5792, "grad_norm": 2.807318925857544, "learning_rate": 0.00021382113821138212, "loss": 0.8133, "step": 3620 }, { "epoch": 0.5808, "grad_norm": 2.908517837524414, "learning_rate": 0.0002130081300813008, "loss": 1.1001, "step": 3630 }, { "epoch": 0.5824, "grad_norm": 3.8471007347106934, "learning_rate": 0.0002121951219512195, "loss": 1.2104, "step": 3640 }, { "epoch": 0.584, "grad_norm": 0.697363018989563, "learning_rate": 0.00021138211382113823, "loss": 0.7169, "step": 3650 }, { "epoch": 0.5856, "grad_norm": 3.8001105785369873, "learning_rate": 0.00021056910569105692, "loss": 0.7753, "step": 3660 }, { "epoch": 0.5872, "grad_norm": 1.2338181734085083, "learning_rate": 0.00020975609756097562, "loss": 0.8409, "step": 3670 }, { "epoch": 0.5888, "grad_norm": 2.482490062713623, "learning_rate": 0.0002089430894308943, "loss": 0.6949, "step": 3680 }, { "epoch": 0.5904, "grad_norm": 0.9223986864089966, "learning_rate": 0.000208130081300813, "loss": 0.5945, "step": 3690 }, { "epoch": 0.592, "grad_norm": 4.109449863433838, "learning_rate": 0.0002073170731707317, "loss": 0.819, "step": 3700 }, { "epoch": 0.5936, "grad_norm": 2.1545259952545166, "learning_rate": 0.00020650406504065042, "loss": 0.6917, "step": 3710 }, { "epoch": 0.5952, "grad_norm": 2.568392038345337, "learning_rate": 0.00020569105691056912, "loss": 0.948, "step": 3720 }, { "epoch": 0.5968, "grad_norm": 1.5172126293182373, "learning_rate": 0.00020487804878048779, "loss": 0.7395, "step": 3730 }, { "epoch": 0.5984, "grad_norm": 2.408418893814087, "learning_rate": 0.0002040650406504065, "loss": 0.6265, "step": 3740 }, { "epoch": 0.6, "grad_norm": 1.076709508895874, "learning_rate": 0.0002032520325203252, "loss": 0.8063, "step": 3750 }, { "epoch": 0.6, "eval_loss": 1.1569427251815796, "eval_runtime": 959.9775, "eval_samples_per_second": 5.208, "eval_steps_per_second": 5.208, "step": 3750 }, { "epoch": 0.6016, "grad_norm": 1.5189223289489746, "learning_rate": 0.00020243902439024393, "loss": 0.7415, "step": 3760 }, { "epoch": 0.6032, "grad_norm": 1.6611405611038208, "learning_rate": 0.00020162601626016262, "loss": 0.7716, "step": 3770 }, { "epoch": 0.6048, "grad_norm": 2.1746842861175537, "learning_rate": 0.0002008130081300813, "loss": 0.9592, "step": 3780 }, { "epoch": 0.6064, "grad_norm": 1.7051889896392822, "learning_rate": 0.0002, "loss": 0.7309, "step": 3790 }, { "epoch": 0.608, "grad_norm": 1.4838804006576538, "learning_rate": 0.0001991869918699187, "loss": 0.9909, "step": 3800 }, { "epoch": 0.6096, "grad_norm": 1.0016885995864868, "learning_rate": 0.0001983739837398374, "loss": 0.7118, "step": 3810 }, { "epoch": 0.6112, "grad_norm": 2.9000656604766846, "learning_rate": 0.00019756097560975612, "loss": 0.9148, "step": 3820 }, { "epoch": 0.6128, "grad_norm": 4.018497943878174, "learning_rate": 0.0001967479674796748, "loss": 0.7638, "step": 3830 }, { "epoch": 0.6144, "grad_norm": 2.2254245281219482, "learning_rate": 0.00019593495934959348, "loss": 0.6787, "step": 3840 }, { "epoch": 0.616, "grad_norm": 2.057569980621338, "learning_rate": 0.0001951219512195122, "loss": 1.046, "step": 3850 }, { "epoch": 0.6176, "grad_norm": 1.673245906829834, "learning_rate": 0.0001943089430894309, "loss": 0.7483, "step": 3860 }, { "epoch": 0.6192, "grad_norm": 1.6375939846038818, "learning_rate": 0.0001934959349593496, "loss": 0.7224, "step": 3870 }, { "epoch": 0.6208, "grad_norm": 1.7894866466522217, "learning_rate": 0.0001926829268292683, "loss": 1.1423, "step": 3880 }, { "epoch": 0.6224, "grad_norm": 3.3884127140045166, "learning_rate": 0.00019186991869918699, "loss": 1.2515, "step": 3890 }, { "epoch": 0.624, "grad_norm": 0.9296390414237976, "learning_rate": 0.0001910569105691057, "loss": 0.6837, "step": 3900 }, { "epoch": 0.6256, "grad_norm": 2.7435858249664307, "learning_rate": 0.0001902439024390244, "loss": 0.5283, "step": 3910 }, { "epoch": 0.6272, "grad_norm": 3.0927188396453857, "learning_rate": 0.0001894308943089431, "loss": 1.0301, "step": 3920 }, { "epoch": 0.6288, "grad_norm": 1.5934438705444336, "learning_rate": 0.0001886178861788618, "loss": 0.9681, "step": 3930 }, { "epoch": 0.6304, "grad_norm": 0.7750480771064758, "learning_rate": 0.0001878048780487805, "loss": 0.5935, "step": 3940 }, { "epoch": 0.632, "grad_norm": 2.40685772895813, "learning_rate": 0.00018699186991869918, "loss": 0.9389, "step": 3950 }, { "epoch": 0.6336, "grad_norm": 0.20400309562683105, "learning_rate": 0.0001861788617886179, "loss": 0.9855, "step": 3960 }, { "epoch": 0.6352, "grad_norm": 0.8683297634124756, "learning_rate": 0.0001853658536585366, "loss": 1.0224, "step": 3970 }, { "epoch": 0.6368, "grad_norm": 1.6435086727142334, "learning_rate": 0.00018455284552845527, "loss": 0.6436, "step": 3980 }, { "epoch": 0.6384, "grad_norm": 1.5825189352035522, "learning_rate": 0.000183739837398374, "loss": 0.6848, "step": 3990 }, { "epoch": 0.64, "grad_norm": 2.164686918258667, "learning_rate": 0.00018292682926829268, "loss": 0.962, "step": 4000 }, { "epoch": 0.64, "eval_loss": 1.164829134941101, "eval_runtime": 957.7122, "eval_samples_per_second": 5.221, "eval_steps_per_second": 5.221, "step": 4000 }, { "epoch": 0.6416, "grad_norm": 2.6798882484436035, "learning_rate": 0.0001821138211382114, "loss": 0.7698, "step": 4010 }, { "epoch": 0.6432, "grad_norm": 2.3492815494537354, "learning_rate": 0.0001813008130081301, "loss": 0.665, "step": 4020 }, { "epoch": 0.6448, "grad_norm": 0.8926377296447754, "learning_rate": 0.00018048780487804877, "loss": 0.6753, "step": 4030 }, { "epoch": 0.6464, "grad_norm": 3.1364357471466064, "learning_rate": 0.0001796747967479675, "loss": 0.8779, "step": 4040 }, { "epoch": 0.648, "grad_norm": 3.5229640007019043, "learning_rate": 0.00017886178861788618, "loss": 0.9561, "step": 4050 }, { "epoch": 0.6496, "grad_norm": 1.6057953834533691, "learning_rate": 0.00017804878048780488, "loss": 0.8222, "step": 4060 }, { "epoch": 0.6512, "grad_norm": 4.898631572723389, "learning_rate": 0.0001772357723577236, "loss": 1.0256, "step": 4070 }, { "epoch": 0.6528, "grad_norm": 2.4350786209106445, "learning_rate": 0.00017642276422764227, "loss": 0.6602, "step": 4080 }, { "epoch": 0.6544, "grad_norm": 2.0918514728546143, "learning_rate": 0.00017560975609756096, "loss": 1.1968, "step": 4090 }, { "epoch": 0.656, "grad_norm": 0.8441520929336548, "learning_rate": 0.00017479674796747969, "loss": 0.7727, "step": 4100 }, { "epoch": 0.6576, "grad_norm": 2.8887555599212646, "learning_rate": 0.00017398373983739838, "loss": 1.2474, "step": 4110 }, { "epoch": 0.6592, "grad_norm": 0.07653144001960754, "learning_rate": 0.00017317073170731708, "loss": 0.4434, "step": 4120 }, { "epoch": 0.6608, "grad_norm": 2.0132455825805664, "learning_rate": 0.00017235772357723577, "loss": 0.9298, "step": 4130 }, { "epoch": 0.6624, "grad_norm": 1.6578240394592285, "learning_rate": 0.00017154471544715446, "loss": 0.944, "step": 4140 }, { "epoch": 0.664, "grad_norm": 2.6716582775115967, "learning_rate": 0.0001707317073170732, "loss": 0.7706, "step": 4150 }, { "epoch": 0.6656, "grad_norm": 3.211911916732788, "learning_rate": 0.00016991869918699188, "loss": 0.8844, "step": 4160 }, { "epoch": 0.6672, "grad_norm": 2.383862257003784, "learning_rate": 0.00016910569105691058, "loss": 0.9795, "step": 4170 }, { "epoch": 0.6688, "grad_norm": 1.9137873649597168, "learning_rate": 0.00016829268292682927, "loss": 0.6962, "step": 4180 }, { "epoch": 0.6704, "grad_norm": 1.3267086744308472, "learning_rate": 0.00016747967479674797, "loss": 0.9447, "step": 4190 }, { "epoch": 0.672, "grad_norm": 2.085939645767212, "learning_rate": 0.00016666666666666666, "loss": 1.0395, "step": 4200 }, { "epoch": 0.6736, "grad_norm": 1.9438047409057617, "learning_rate": 0.00016585365853658538, "loss": 0.8711, "step": 4210 }, { "epoch": 0.6752, "grad_norm": 1.7191540002822876, "learning_rate": 0.00016504065040650408, "loss": 0.8736, "step": 4220 }, { "epoch": 0.6768, "grad_norm": 2.784453868865967, "learning_rate": 0.00016422764227642275, "loss": 0.7039, "step": 4230 }, { "epoch": 0.6784, "grad_norm": 2.904277801513672, "learning_rate": 0.00016341463414634147, "loss": 0.8601, "step": 4240 }, { "epoch": 0.68, "grad_norm": 1.8441609144210815, "learning_rate": 0.00016260162601626016, "loss": 0.8507, "step": 4250 }, { "epoch": 0.68, "eval_loss": 1.1460280418395996, "eval_runtime": 955.4574, "eval_samples_per_second": 5.233, "eval_steps_per_second": 5.233, "step": 4250 }, { "epoch": 0.6816, "grad_norm": 2.44185471534729, "learning_rate": 0.00016178861788617886, "loss": 1.2223, "step": 4260 }, { "epoch": 0.6832, "grad_norm": 2.5785441398620605, "learning_rate": 0.00016097560975609758, "loss": 0.6929, "step": 4270 }, { "epoch": 0.6848, "grad_norm": 0.8098218441009521, "learning_rate": 0.00016016260162601625, "loss": 0.6971, "step": 4280 }, { "epoch": 0.6864, "grad_norm": 2.408022880554199, "learning_rate": 0.00015934959349593497, "loss": 0.8393, "step": 4290 }, { "epoch": 0.688, "grad_norm": 1.8108437061309814, "learning_rate": 0.00015853658536585366, "loss": 0.9469, "step": 4300 }, { "epoch": 0.6896, "grad_norm": 1.3393510580062866, "learning_rate": 0.00015772357723577236, "loss": 0.8046, "step": 4310 }, { "epoch": 0.6912, "grad_norm": 0.27787142992019653, "learning_rate": 0.00015691056910569108, "loss": 0.7817, "step": 4320 }, { "epoch": 0.6928, "grad_norm": 1.5397439002990723, "learning_rate": 0.00015609756097560975, "loss": 0.8529, "step": 4330 }, { "epoch": 0.6944, "grad_norm": 4.285569667816162, "learning_rate": 0.00015528455284552844, "loss": 0.797, "step": 4340 }, { "epoch": 0.696, "grad_norm": 1.4360575675964355, "learning_rate": 0.00015447154471544717, "loss": 0.6302, "step": 4350 }, { "epoch": 0.6976, "grad_norm": 3.1087186336517334, "learning_rate": 0.00015365853658536586, "loss": 0.8003, "step": 4360 }, { "epoch": 0.6992, "grad_norm": 3.1041085720062256, "learning_rate": 0.00015284552845528455, "loss": 0.6171, "step": 4370 }, { "epoch": 0.7008, "grad_norm": 1.951988935470581, "learning_rate": 0.00015203252032520325, "loss": 0.7859, "step": 4380 }, { "epoch": 0.7024, "grad_norm": 2.5091583728790283, "learning_rate": 0.00015121951219512194, "loss": 0.7038, "step": 4390 }, { "epoch": 0.704, "grad_norm": 1.7080724239349365, "learning_rate": 0.00015040650406504067, "loss": 0.6291, "step": 4400 }, { "epoch": 0.7056, "grad_norm": 1.1388205289840698, "learning_rate": 0.00014959349593495936, "loss": 0.8191, "step": 4410 }, { "epoch": 0.7072, "grad_norm": 2.7150771617889404, "learning_rate": 0.00014878048780487806, "loss": 1.1169, "step": 4420 }, { "epoch": 0.7088, "grad_norm": 1.7084001302719116, "learning_rate": 0.00014796747967479675, "loss": 0.8503, "step": 4430 }, { "epoch": 0.7104, "grad_norm": 0.9705413579940796, "learning_rate": 0.00014715447154471545, "loss": 0.7798, "step": 4440 }, { "epoch": 0.712, "grad_norm": 0.6582396030426025, "learning_rate": 0.00014634146341463414, "loss": 0.8122, "step": 4450 }, { "epoch": 0.7136, "grad_norm": 3.819809913635254, "learning_rate": 0.00014552845528455286, "loss": 0.9902, "step": 4460 }, { "epoch": 0.7152, "grad_norm": 1.9070576429367065, "learning_rate": 0.00014471544715447156, "loss": 0.7999, "step": 4470 }, { "epoch": 0.7168, "grad_norm": 2.159898042678833, "learning_rate": 0.00014390243902439023, "loss": 0.6514, "step": 4480 }, { "epoch": 0.7184, "grad_norm": 3.2352945804595947, "learning_rate": 0.00014308943089430895, "loss": 0.8179, "step": 4490 }, { "epoch": 0.72, "grad_norm": 0.7301401495933533, "learning_rate": 0.00014227642276422764, "loss": 0.9165, "step": 4500 }, { "epoch": 0.72, "eval_loss": 1.130717396736145, "eval_runtime": 960.6233, "eval_samples_per_second": 5.205, "eval_steps_per_second": 5.205, "step": 4500 }, { "epoch": 0.7216, "grad_norm": 2.6877050399780273, "learning_rate": 0.00014146341463414634, "loss": 0.7501, "step": 4510 }, { "epoch": 0.7232, "grad_norm": 1.9907829761505127, "learning_rate": 0.00014065040650406506, "loss": 0.8375, "step": 4520 }, { "epoch": 0.7248, "grad_norm": 1.6620970964431763, "learning_rate": 0.00013983739837398373, "loss": 0.8965, "step": 4530 }, { "epoch": 0.7264, "grad_norm": 1.9919949769973755, "learning_rate": 0.00013902439024390245, "loss": 1.0359, "step": 4540 }, { "epoch": 0.728, "grad_norm": 1.9886322021484375, "learning_rate": 0.00013821138211382114, "loss": 0.7646, "step": 4550 }, { "epoch": 0.7296, "grad_norm": 3.00618577003479, "learning_rate": 0.00013739837398373984, "loss": 0.7386, "step": 4560 }, { "epoch": 0.7312, "grad_norm": 1.2489125728607178, "learning_rate": 0.00013658536585365856, "loss": 0.9153, "step": 4570 }, { "epoch": 0.7328, "grad_norm": 1.6659530401229858, "learning_rate": 0.00013577235772357723, "loss": 1.1601, "step": 4580 }, { "epoch": 0.7344, "grad_norm": 3.0665249824523926, "learning_rate": 0.00013495934959349592, "loss": 0.9097, "step": 4590 }, { "epoch": 0.736, "grad_norm": 1.2477220296859741, "learning_rate": 0.00013414634146341464, "loss": 0.5977, "step": 4600 }, { "epoch": 0.7376, "grad_norm": 2.523712158203125, "learning_rate": 0.00013333333333333334, "loss": 0.5476, "step": 4610 }, { "epoch": 0.7392, "grad_norm": 1.5238116979599, "learning_rate": 0.00013252032520325203, "loss": 0.6774, "step": 4620 }, { "epoch": 0.7408, "grad_norm": 4.065662860870361, "learning_rate": 0.00013170731707317073, "loss": 0.9686, "step": 4630 }, { "epoch": 0.7424, "grad_norm": 2.1243975162506104, "learning_rate": 0.00013089430894308942, "loss": 0.6776, "step": 4640 }, { "epoch": 0.744, "grad_norm": 1.4048924446105957, "learning_rate": 0.00013008130081300815, "loss": 0.6849, "step": 4650 }, { "epoch": 0.7456, "grad_norm": 0.9425554275512695, "learning_rate": 0.00012926829268292684, "loss": 0.5431, "step": 4660 }, { "epoch": 0.7472, "grad_norm": 1.4793920516967773, "learning_rate": 0.00012845528455284554, "loss": 0.611, "step": 4670 }, { "epoch": 0.7488, "grad_norm": 1.1930813789367676, "learning_rate": 0.00012764227642276423, "loss": 0.8103, "step": 4680 }, { "epoch": 0.7504, "grad_norm": 2.5445172786712646, "learning_rate": 0.00012682926829268293, "loss": 0.8219, "step": 4690 }, { "epoch": 0.752, "grad_norm": 1.7753766775131226, "learning_rate": 0.00012601626016260162, "loss": 0.8935, "step": 4700 }, { "epoch": 0.7536, "grad_norm": 1.7598661184310913, "learning_rate": 0.00012520325203252034, "loss": 0.7692, "step": 4710 }, { "epoch": 0.7552, "grad_norm": 0.9807813167572021, "learning_rate": 0.00012439024390243904, "loss": 0.8573, "step": 4720 }, { "epoch": 0.7568, "grad_norm": 1.0543572902679443, "learning_rate": 0.00012357723577235773, "loss": 0.362, "step": 4730 }, { "epoch": 0.7584, "grad_norm": 1.2011300325393677, "learning_rate": 0.00012276422764227643, "loss": 0.7364, "step": 4740 }, { "epoch": 0.76, "grad_norm": 1.013681411743164, "learning_rate": 0.00012195121951219512, "loss": 0.8223, "step": 4750 }, { "epoch": 0.76, "eval_loss": 1.1317497491836548, "eval_runtime": 1047.5369, "eval_samples_per_second": 4.773, "eval_steps_per_second": 4.773, "step": 4750 }, { "epoch": 0.7616, "grad_norm": 3.048567295074463, "learning_rate": 0.00012113821138211383, "loss": 0.5417, "step": 4760 }, { "epoch": 0.7632, "grad_norm": 1.684781551361084, "learning_rate": 0.00012032520325203253, "loss": 0.8972, "step": 4770 }, { "epoch": 0.7648, "grad_norm": 1.7157773971557617, "learning_rate": 0.00011951219512195122, "loss": 0.8071, "step": 4780 }, { "epoch": 0.7664, "grad_norm": 2.1893560886383057, "learning_rate": 0.00011869918699186991, "loss": 0.7259, "step": 4790 }, { "epoch": 0.768, "grad_norm": 0.559054434299469, "learning_rate": 0.00011788617886178862, "loss": 0.6906, "step": 4800 }, { "epoch": 0.7696, "grad_norm": 1.9966106414794922, "learning_rate": 0.00011707317073170733, "loss": 0.7862, "step": 4810 }, { "epoch": 0.7712, "grad_norm": 2.0883913040161133, "learning_rate": 0.00011626016260162601, "loss": 0.9832, "step": 4820 }, { "epoch": 0.7728, "grad_norm": 0.9213092923164368, "learning_rate": 0.00011544715447154472, "loss": 0.5425, "step": 4830 }, { "epoch": 0.7744, "grad_norm": 3.5715088844299316, "learning_rate": 0.00011463414634146342, "loss": 0.908, "step": 4840 }, { "epoch": 0.776, "grad_norm": 1.5071601867675781, "learning_rate": 0.00011382113821138211, "loss": 0.9961, "step": 4850 }, { "epoch": 0.7776, "grad_norm": 2.8544921875, "learning_rate": 0.00011300813008130082, "loss": 0.8617, "step": 4860 }, { "epoch": 0.7792, "grad_norm": 1.5049967765808105, "learning_rate": 0.00011219512195121951, "loss": 0.6921, "step": 4870 }, { "epoch": 0.7808, "grad_norm": 1.413341999053955, "learning_rate": 0.00011138211382113822, "loss": 0.87, "step": 4880 }, { "epoch": 0.7824, "grad_norm": 3.063117027282715, "learning_rate": 0.0001105691056910569, "loss": 0.8527, "step": 4890 }, { "epoch": 0.784, "grad_norm": 1.7717232704162598, "learning_rate": 0.00010975609756097561, "loss": 0.9359, "step": 4900 }, { "epoch": 0.7856, "grad_norm": 2.220553398132324, "learning_rate": 0.00010894308943089432, "loss": 0.8893, "step": 4910 }, { "epoch": 0.7872, "grad_norm": 1.1765658855438232, "learning_rate": 0.000108130081300813, "loss": 0.5423, "step": 4920 }, { "epoch": 0.7888, "grad_norm": 3.124976873397827, "learning_rate": 0.00010731707317073171, "loss": 0.8261, "step": 4930 }, { "epoch": 0.7904, "grad_norm": 1.6760934591293335, "learning_rate": 0.0001065040650406504, "loss": 0.8228, "step": 4940 }, { "epoch": 0.792, "grad_norm": 2.275233268737793, "learning_rate": 0.00010569105691056911, "loss": 1.1372, "step": 4950 }, { "epoch": 0.7936, "grad_norm": 1.0748039484024048, "learning_rate": 0.00010487804878048781, "loss": 0.9031, "step": 4960 }, { "epoch": 0.7952, "grad_norm": 3.2387609481811523, "learning_rate": 0.0001040650406504065, "loss": 1.041, "step": 4970 }, { "epoch": 0.7968, "grad_norm": 1.7894922494888306, "learning_rate": 0.00010325203252032521, "loss": 0.6705, "step": 4980 }, { "epoch": 0.7984, "grad_norm": 3.0146548748016357, "learning_rate": 0.00010243902439024389, "loss": 0.9184, "step": 4990 }, { "epoch": 0.8, "grad_norm": 1.578597903251648, "learning_rate": 0.0001016260162601626, "loss": 0.9872, "step": 5000 }, { "epoch": 0.8, "eval_loss": 1.1235560178756714, "eval_runtime": 1107.329, "eval_samples_per_second": 4.515, "eval_steps_per_second": 4.515, "step": 5000 }, { "epoch": 0.8016, "grad_norm": 2.0832505226135254, "learning_rate": 0.00010081300813008131, "loss": 0.6787, "step": 5010 }, { "epoch": 0.8032, "grad_norm": 4.393614292144775, "learning_rate": 0.0001, "loss": 0.9292, "step": 5020 }, { "epoch": 0.8048, "grad_norm": 3.805360794067383, "learning_rate": 9.91869918699187e-05, "loss": 0.7421, "step": 5030 }, { "epoch": 0.8064, "grad_norm": 2.305285930633545, "learning_rate": 9.83739837398374e-05, "loss": 0.6064, "step": 5040 }, { "epoch": 0.808, "grad_norm": 4.427598476409912, "learning_rate": 9.75609756097561e-05, "loss": 0.8991, "step": 5050 }, { "epoch": 0.8096, "grad_norm": 0.9261614680290222, "learning_rate": 9.67479674796748e-05, "loss": 0.9912, "step": 5060 }, { "epoch": 0.8112, "grad_norm": 0.9871659874916077, "learning_rate": 9.593495934959349e-05, "loss": 0.575, "step": 5070 }, { "epoch": 0.8128, "grad_norm": 0.12044885754585266, "learning_rate": 9.51219512195122e-05, "loss": 0.4722, "step": 5080 }, { "epoch": 0.8144, "grad_norm": 1.28267240524292, "learning_rate": 9.43089430894309e-05, "loss": 0.6495, "step": 5090 }, { "epoch": 0.816, "grad_norm": 2.445477247238159, "learning_rate": 9.349593495934959e-05, "loss": 0.8108, "step": 5100 }, { "epoch": 0.8176, "grad_norm": 1.838616132736206, "learning_rate": 9.26829268292683e-05, "loss": 0.698, "step": 5110 }, { "epoch": 0.8192, "grad_norm": 1.8858362436294556, "learning_rate": 9.1869918699187e-05, "loss": 0.7465, "step": 5120 }, { "epoch": 0.8208, "grad_norm": 2.1843161582946777, "learning_rate": 9.10569105691057e-05, "loss": 0.7991, "step": 5130 }, { "epoch": 0.8224, "grad_norm": 2.6587400436401367, "learning_rate": 9.024390243902438e-05, "loss": 0.7246, "step": 5140 }, { "epoch": 0.824, "grad_norm": 0.21658068895339966, "learning_rate": 8.943089430894309e-05, "loss": 0.4838, "step": 5150 }, { "epoch": 0.8256, "grad_norm": 2.1727850437164307, "learning_rate": 8.86178861788618e-05, "loss": 0.9581, "step": 5160 }, { "epoch": 0.8272, "grad_norm": 0.5486516952514648, "learning_rate": 8.780487804878048e-05, "loss": 0.7398, "step": 5170 }, { "epoch": 0.8288, "grad_norm": 1.6711406707763672, "learning_rate": 8.699186991869919e-05, "loss": 0.8343, "step": 5180 }, { "epoch": 0.8304, "grad_norm": 0.38640347123146057, "learning_rate": 8.617886178861789e-05, "loss": 0.5283, "step": 5190 }, { "epoch": 0.832, "grad_norm": 0.7390656471252441, "learning_rate": 8.53658536585366e-05, "loss": 0.5333, "step": 5200 }, { "epoch": 0.8336, "grad_norm": 1.4642919301986694, "learning_rate": 8.455284552845529e-05, "loss": 0.9112, "step": 5210 }, { "epoch": 0.8352, "grad_norm": 2.632808208465576, "learning_rate": 8.373983739837398e-05, "loss": 0.7809, "step": 5220 }, { "epoch": 0.8368, "grad_norm": 1.728801965713501, "learning_rate": 8.292682926829269e-05, "loss": 1.0487, "step": 5230 }, { "epoch": 0.8384, "grad_norm": 2.3986566066741943, "learning_rate": 8.211382113821137e-05, "loss": 0.7501, "step": 5240 }, { "epoch": 0.84, "grad_norm": 1.8456178903579712, "learning_rate": 8.130081300813008e-05, "loss": 0.8797, "step": 5250 }, { "epoch": 0.84, "eval_loss": 1.1170645952224731, "eval_runtime": 1154.3085, "eval_samples_per_second": 4.332, "eval_steps_per_second": 4.332, "step": 5250 }, { "epoch": 0.8416, "grad_norm": 1.5469543933868408, "learning_rate": 8.048780487804879e-05, "loss": 0.7024, "step": 5260 }, { "epoch": 0.8432, "grad_norm": 3.0582432746887207, "learning_rate": 7.967479674796748e-05, "loss": 0.8939, "step": 5270 }, { "epoch": 0.8448, "grad_norm": 2.4114296436309814, "learning_rate": 7.886178861788618e-05, "loss": 0.783, "step": 5280 }, { "epoch": 0.8464, "grad_norm": 1.299592137336731, "learning_rate": 7.804878048780487e-05, "loss": 0.7606, "step": 5290 }, { "epoch": 0.848, "grad_norm": 1.3195971250534058, "learning_rate": 7.723577235772358e-05, "loss": 0.7542, "step": 5300 }, { "epoch": 0.8496, "grad_norm": 2.526697874069214, "learning_rate": 7.642276422764228e-05, "loss": 0.9406, "step": 5310 }, { "epoch": 0.8512, "grad_norm": 1.22294020652771, "learning_rate": 7.560975609756097e-05, "loss": 1.3387, "step": 5320 }, { "epoch": 0.8528, "grad_norm": 0.27195674180984497, "learning_rate": 7.479674796747968e-05, "loss": 0.9164, "step": 5330 }, { "epoch": 0.8544, "grad_norm": 2.390148162841797, "learning_rate": 7.398373983739838e-05, "loss": 0.8772, "step": 5340 }, { "epoch": 0.856, "grad_norm": 2.907269239425659, "learning_rate": 7.317073170731707e-05, "loss": 0.7364, "step": 5350 }, { "epoch": 0.8576, "grad_norm": 1.9308322668075562, "learning_rate": 7.235772357723578e-05, "loss": 0.6116, "step": 5360 }, { "epoch": 0.8592, "grad_norm": 2.004450798034668, "learning_rate": 7.154471544715447e-05, "loss": 0.839, "step": 5370 }, { "epoch": 0.8608, "grad_norm": 2.5253965854644775, "learning_rate": 7.073170731707317e-05, "loss": 0.7411, "step": 5380 }, { "epoch": 0.8624, "grad_norm": 1.223568081855774, "learning_rate": 6.991869918699186e-05, "loss": 0.823, "step": 5390 }, { "epoch": 0.864, "grad_norm": 1.9021104574203491, "learning_rate": 6.910569105691057e-05, "loss": 0.8155, "step": 5400 }, { "epoch": 0.8656, "grad_norm": 1.9883354902267456, "learning_rate": 6.829268292682928e-05, "loss": 0.6916, "step": 5410 }, { "epoch": 0.8672, "grad_norm": 1.8869421482086182, "learning_rate": 6.747967479674796e-05, "loss": 0.7202, "step": 5420 }, { "epoch": 0.8688, "grad_norm": 1.137399673461914, "learning_rate": 6.666666666666667e-05, "loss": 0.8288, "step": 5430 }, { "epoch": 0.8704, "grad_norm": 1.6170374155044556, "learning_rate": 6.585365853658536e-05, "loss": 0.8518, "step": 5440 }, { "epoch": 0.872, "grad_norm": 1.8285601139068604, "learning_rate": 6.504065040650407e-05, "loss": 0.613, "step": 5450 }, { "epoch": 0.8736, "grad_norm": 2.910038948059082, "learning_rate": 6.422764227642277e-05, "loss": 0.8952, "step": 5460 }, { "epoch": 0.8752, "grad_norm": 1.7812882661819458, "learning_rate": 6.341463414634146e-05, "loss": 0.8391, "step": 5470 }, { "epoch": 0.8768, "grad_norm": 3.008392095565796, "learning_rate": 6.260162601626017e-05, "loss": 0.9243, "step": 5480 }, { "epoch": 0.8784, "grad_norm": 2.041684865951538, "learning_rate": 6.178861788617887e-05, "loss": 0.9482, "step": 5490 }, { "epoch": 0.88, "grad_norm": 2.1885712146759033, "learning_rate": 6.097560975609756e-05, "loss": 0.737, "step": 5500 }, { "epoch": 0.88, "eval_loss": 1.1130776405334473, "eval_runtime": 1175.4432, "eval_samples_per_second": 4.254, "eval_steps_per_second": 4.254, "step": 5500 }, { "epoch": 0.8816, "grad_norm": 2.233348846435547, "learning_rate": 6.016260162601626e-05, "loss": 0.6539, "step": 5510 }, { "epoch": 0.8832, "grad_norm": 3.41200590133667, "learning_rate": 5.934959349593496e-05, "loss": 0.777, "step": 5520 }, { "epoch": 0.8848, "grad_norm": 1.883143663406372, "learning_rate": 5.8536585365853666e-05, "loss": 1.083, "step": 5530 }, { "epoch": 0.8864, "grad_norm": 1.179457664489746, "learning_rate": 5.772357723577236e-05, "loss": 0.5159, "step": 5540 }, { "epoch": 0.888, "grad_norm": 4.108737945556641, "learning_rate": 5.6910569105691056e-05, "loss": 1.2811, "step": 5550 }, { "epoch": 0.8896, "grad_norm": 2.099215507507324, "learning_rate": 5.609756097560976e-05, "loss": 0.7685, "step": 5560 }, { "epoch": 0.8912, "grad_norm": 5.275564193725586, "learning_rate": 5.528455284552845e-05, "loss": 1.1286, "step": 5570 }, { "epoch": 0.8928, "grad_norm": 1.1515026092529297, "learning_rate": 5.447154471544716e-05, "loss": 0.6888, "step": 5580 }, { "epoch": 0.8944, "grad_norm": 2.397169828414917, "learning_rate": 5.3658536585365855e-05, "loss": 0.8067, "step": 5590 }, { "epoch": 0.896, "grad_norm": 1.0115450620651245, "learning_rate": 5.284552845528456e-05, "loss": 0.4546, "step": 5600 }, { "epoch": 0.8976, "grad_norm": 1.9359952211380005, "learning_rate": 5.203252032520325e-05, "loss": 1.2266, "step": 5610 }, { "epoch": 0.8992, "grad_norm": 1.6608821153640747, "learning_rate": 5.1219512195121947e-05, "loss": 1.0713, "step": 5620 }, { "epoch": 0.9008, "grad_norm": 3.0019173622131348, "learning_rate": 5.0406504065040655e-05, "loss": 1.1104, "step": 5630 }, { "epoch": 0.9024, "grad_norm": 0.2102556973695755, "learning_rate": 4.959349593495935e-05, "loss": 0.9822, "step": 5640 }, { "epoch": 0.904, "grad_norm": 2.968538761138916, "learning_rate": 4.878048780487805e-05, "loss": 1.1946, "step": 5650 }, { "epoch": 0.9056, "grad_norm": 1.3731448650360107, "learning_rate": 4.7967479674796746e-05, "loss": 0.7431, "step": 5660 }, { "epoch": 0.9072, "grad_norm": 1.5175280570983887, "learning_rate": 4.715447154471545e-05, "loss": 0.6726, "step": 5670 }, { "epoch": 0.9088, "grad_norm": 3.332031011581421, "learning_rate": 4.634146341463415e-05, "loss": 0.6165, "step": 5680 }, { "epoch": 0.9104, "grad_norm": 2.3446781635284424, "learning_rate": 4.552845528455285e-05, "loss": 0.5643, "step": 5690 }, { "epoch": 0.912, "grad_norm": 1.1185941696166992, "learning_rate": 4.4715447154471546e-05, "loss": 1.1405, "step": 5700 }, { "epoch": 0.9136, "grad_norm": 3.511198043823242, "learning_rate": 4.390243902439024e-05, "loss": 1.0318, "step": 5710 }, { "epoch": 0.9152, "grad_norm": 1.842178463935852, "learning_rate": 4.308943089430894e-05, "loss": 0.7284, "step": 5720 }, { "epoch": 0.9168, "grad_norm": 2.8651535511016846, "learning_rate": 4.2276422764227644e-05, "loss": 0.8321, "step": 5730 }, { "epoch": 0.9184, "grad_norm": 2.988203763961792, "learning_rate": 4.1463414634146346e-05, "loss": 0.889, "step": 5740 }, { "epoch": 0.92, "grad_norm": 0.8294357061386108, "learning_rate": 4.065040650406504e-05, "loss": 0.9193, "step": 5750 }, { "epoch": 0.92, "eval_loss": 1.1053773164749146, "eval_runtime": 1280.7262, "eval_samples_per_second": 3.904, "eval_steps_per_second": 3.904, "step": 5750 }, { "epoch": 0.9216, "grad_norm": 2.330263137817383, "learning_rate": 3.983739837398374e-05, "loss": 0.9316, "step": 5760 }, { "epoch": 0.9232, "grad_norm": 3.490957260131836, "learning_rate": 3.902439024390244e-05, "loss": 0.8297, "step": 5770 }, { "epoch": 0.9248, "grad_norm": 0.16155463457107544, "learning_rate": 3.821138211382114e-05, "loss": 0.5396, "step": 5780 }, { "epoch": 0.9264, "grad_norm": 3.8618061542510986, "learning_rate": 3.739837398373984e-05, "loss": 1.1497, "step": 5790 }, { "epoch": 0.928, "grad_norm": 2.8210208415985107, "learning_rate": 3.6585365853658535e-05, "loss": 1.3421, "step": 5800 }, { "epoch": 0.9296, "grad_norm": 3.1744306087493896, "learning_rate": 3.577235772357724e-05, "loss": 1.0488, "step": 5810 }, { "epoch": 0.9312, "grad_norm": 1.5382752418518066, "learning_rate": 3.495934959349593e-05, "loss": 0.6032, "step": 5820 }, { "epoch": 0.9328, "grad_norm": 1.915822148323059, "learning_rate": 3.414634146341464e-05, "loss": 0.7809, "step": 5830 }, { "epoch": 0.9344, "grad_norm": 0.8540381193161011, "learning_rate": 3.3333333333333335e-05, "loss": 0.6, "step": 5840 }, { "epoch": 0.936, "grad_norm": 1.4163843393325806, "learning_rate": 3.2520325203252037e-05, "loss": 0.6286, "step": 5850 }, { "epoch": 0.9376, "grad_norm": 0.13695108890533447, "learning_rate": 3.170731707317073e-05, "loss": 1.2472, "step": 5860 }, { "epoch": 0.9392, "grad_norm": 1.5286403894424438, "learning_rate": 3.089430894308943e-05, "loss": 1.0632, "step": 5870 }, { "epoch": 0.9408, "grad_norm": 1.7806613445281982, "learning_rate": 3.008130081300813e-05, "loss": 0.7279, "step": 5880 }, { "epoch": 0.9424, "grad_norm": 1.5880378484725952, "learning_rate": 2.9268292682926833e-05, "loss": 0.8116, "step": 5890 }, { "epoch": 0.944, "grad_norm": 0.9348939061164856, "learning_rate": 2.8455284552845528e-05, "loss": 0.6383, "step": 5900 }, { "epoch": 0.9456, "grad_norm": 2.188812732696533, "learning_rate": 2.7642276422764226e-05, "loss": 0.8308, "step": 5910 }, { "epoch": 0.9472, "grad_norm": 1.4115760326385498, "learning_rate": 2.6829268292682928e-05, "loss": 0.8653, "step": 5920 }, { "epoch": 0.9488, "grad_norm": 3.2353076934814453, "learning_rate": 2.6016260162601626e-05, "loss": 0.8548, "step": 5930 }, { "epoch": 0.9504, "grad_norm": 1.7031910419464111, "learning_rate": 2.5203252032520327e-05, "loss": 0.7358, "step": 5940 }, { "epoch": 0.952, "grad_norm": 0.33889999985694885, "learning_rate": 2.4390243902439026e-05, "loss": 0.6915, "step": 5950 }, { "epoch": 0.9536, "grad_norm": 1.296474575996399, "learning_rate": 2.3577235772357724e-05, "loss": 0.7619, "step": 5960 }, { "epoch": 0.9552, "grad_norm": 2.5524513721466064, "learning_rate": 2.2764227642276426e-05, "loss": 0.8742, "step": 5970 }, { "epoch": 0.9568, "grad_norm": 1.5270932912826538, "learning_rate": 2.195121951219512e-05, "loss": 0.628, "step": 5980 }, { "epoch": 0.9584, "grad_norm": 1.3427207469940186, "learning_rate": 2.1138211382113822e-05, "loss": 0.8086, "step": 5990 }, { "epoch": 0.96, "grad_norm": 2.8533713817596436, "learning_rate": 2.032520325203252e-05, "loss": 0.8316, "step": 6000 }, { "epoch": 0.96, "eval_loss": 1.1065137386322021, "eval_runtime": 1285.8072, "eval_samples_per_second": 3.889, "eval_steps_per_second": 3.889, "step": 6000 }, { "epoch": 0.9616, "grad_norm": 2.6748170852661133, "learning_rate": 1.951219512195122e-05, "loss": 1.2344, "step": 6010 }, { "epoch": 0.9632, "grad_norm": 2.6799983978271484, "learning_rate": 1.869918699186992e-05, "loss": 0.7761, "step": 6020 }, { "epoch": 0.9648, "grad_norm": 1.6079151630401611, "learning_rate": 1.788617886178862e-05, "loss": 0.4993, "step": 6030 }, { "epoch": 0.9664, "grad_norm": 4.892763137817383, "learning_rate": 1.707317073170732e-05, "loss": 0.8844, "step": 6040 }, { "epoch": 0.968, "grad_norm": 3.5101561546325684, "learning_rate": 1.6260162601626018e-05, "loss": 0.821, "step": 6050 }, { "epoch": 0.9696, "grad_norm": 1.485048532485962, "learning_rate": 1.5447154471544717e-05, "loss": 1.1131, "step": 6060 }, { "epoch": 0.9712, "grad_norm": 2.223806858062744, "learning_rate": 1.4634146341463416e-05, "loss": 0.6271, "step": 6070 }, { "epoch": 0.9728, "grad_norm": 1.194604516029358, "learning_rate": 1.3821138211382113e-05, "loss": 0.8082, "step": 6080 }, { "epoch": 0.9744, "grad_norm": 1.4954756498336792, "learning_rate": 1.3008130081300813e-05, "loss": 0.6707, "step": 6090 }, { "epoch": 0.976, "grad_norm": 4.3971028327941895, "learning_rate": 1.2195121951219513e-05, "loss": 0.9261, "step": 6100 }, { "epoch": 0.9776, "grad_norm": 3.624100923538208, "learning_rate": 1.1382113821138213e-05, "loss": 1.2992, "step": 6110 }, { "epoch": 0.9792, "grad_norm": 1.0655690431594849, "learning_rate": 1.0569105691056911e-05, "loss": 0.7596, "step": 6120 }, { "epoch": 0.9808, "grad_norm": 2.3329694271087646, "learning_rate": 9.75609756097561e-06, "loss": 0.7903, "step": 6130 }, { "epoch": 0.9824, "grad_norm": 2.0594286918640137, "learning_rate": 8.94308943089431e-06, "loss": 0.6795, "step": 6140 }, { "epoch": 0.984, "grad_norm": 2.90356707572937, "learning_rate": 8.130081300813009e-06, "loss": 0.8201, "step": 6150 }, { "epoch": 0.9856, "grad_norm": 1.485250473022461, "learning_rate": 7.317073170731708e-06, "loss": 0.6455, "step": 6160 }, { "epoch": 0.9872, "grad_norm": 1.0751053094863892, "learning_rate": 6.5040650406504065e-06, "loss": 0.6323, "step": 6170 }, { "epoch": 0.9888, "grad_norm": 0.8699440956115723, "learning_rate": 5.691056910569106e-06, "loss": 0.7094, "step": 6180 }, { "epoch": 0.9904, "grad_norm": 1.9519120454788208, "learning_rate": 4.878048780487805e-06, "loss": 1.0605, "step": 6190 }, { "epoch": 0.992, "grad_norm": 1.3058487176895142, "learning_rate": 4.0650406504065046e-06, "loss": 0.8777, "step": 6200 }, { "epoch": 0.9936, "grad_norm": 4.6405229568481445, "learning_rate": 3.2520325203252032e-06, "loss": 0.8997, "step": 6210 }, { "epoch": 0.9952, "grad_norm": 1.5533032417297363, "learning_rate": 2.4390243902439023e-06, "loss": 0.635, "step": 6220 }, { "epoch": 0.9968, "grad_norm": 1.528320074081421, "learning_rate": 1.6260162601626016e-06, "loss": 0.9057, "step": 6230 }, { "epoch": 0.9984, "grad_norm": 0.9600651860237122, "learning_rate": 8.130081300813008e-07, "loss": 0.9465, "step": 6240 }, { "epoch": 1.0, "grad_norm": 2.0253632068634033, "learning_rate": 0.0, "loss": 0.7621, "step": 6250 }, { "epoch": 1.0, "eval_loss": 1.101412057876587, "eval_runtime": 1362.455, "eval_samples_per_second": 3.67, "eval_steps_per_second": 3.67, "step": 6250 }, { "epoch": 1.0, "step": 6250, "total_flos": 2.345137078272e+17, "train_loss": 0.958240804862976, "train_runtime": 54923.8414, "train_samples_per_second": 0.91, "train_steps_per_second": 0.114 } ], "logging_steps": 10, "max_steps": 6250, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.345137078272e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }