| { | |
| "best_metric": 1.101412057876587, | |
| "best_model_checkpoint": "models/agriQA-assistant\\checkpoint-6250", | |
| "epoch": 1.0, | |
| "eval_steps": 250, | |
| "global_step": 6250, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0016, | |
| "grad_norm": null, | |
| "learning_rate": 5e-05, | |
| "loss": 5.5315, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0032, | |
| "grad_norm": 2.2085492610931396, | |
| "learning_rate": 0.0001, | |
| "loss": 5.3337, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0048, | |
| "grad_norm": 3.2219746112823486, | |
| "learning_rate": 0.00015, | |
| "loss": 5.1164, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.0064, | |
| "grad_norm": 4.628455638885498, | |
| "learning_rate": 0.0002, | |
| "loss": 3.6573, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.008, | |
| "grad_norm": 12.310958862304688, | |
| "learning_rate": 0.00025, | |
| "loss": 3.9022, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.0096, | |
| "grad_norm": 9.530706405639648, | |
| "learning_rate": 0.0003, | |
| "loss": 3.309, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.0112, | |
| "grad_norm": 4.938753128051758, | |
| "learning_rate": 0.00035, | |
| "loss": 2.356, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.0128, | |
| "grad_norm": 3.4446096420288086, | |
| "learning_rate": 0.0004, | |
| "loss": 1.8554, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.0144, | |
| "grad_norm": 2.971761465072632, | |
| "learning_rate": 0.00045000000000000004, | |
| "loss": 1.7541, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.016, | |
| "grad_norm": 6.156565189361572, | |
| "learning_rate": 0.0005, | |
| "loss": 1.8158, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.0176, | |
| "grad_norm": 4.076125144958496, | |
| "learning_rate": 0.0004991869918699188, | |
| "loss": 1.7109, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.0192, | |
| "grad_norm": 3.3770179748535156, | |
| "learning_rate": 0.0004983739837398374, | |
| "loss": 1.6453, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.0208, | |
| "grad_norm": 4.189070224761963, | |
| "learning_rate": 0.0004975609756097561, | |
| "loss": 1.8939, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.0224, | |
| "grad_norm": 4.818182468414307, | |
| "learning_rate": 0.0004967479674796748, | |
| "loss": 1.4016, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.024, | |
| "grad_norm": 1.7811826467514038, | |
| "learning_rate": 0.0004959349593495934, | |
| "loss": 1.4795, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.0256, | |
| "grad_norm": 3.0742604732513428, | |
| "learning_rate": 0.0004951219512195122, | |
| "loss": 1.2655, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.0272, | |
| "grad_norm": 2.1021037101745605, | |
| "learning_rate": 0.0004943089430894309, | |
| "loss": 1.4392, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.0288, | |
| "grad_norm": 2.147939920425415, | |
| "learning_rate": 0.0004934959349593496, | |
| "loss": 1.6663, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.0304, | |
| "grad_norm": 4.158925533294678, | |
| "learning_rate": 0.0004926829268292683, | |
| "loss": 1.6488, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": 1.7264795303344727, | |
| "learning_rate": 0.0004918699186991871, | |
| "loss": 1.677, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.0336, | |
| "grad_norm": 0.8538139462471008, | |
| "learning_rate": 0.0004910569105691057, | |
| "loss": 1.4913, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.0352, | |
| "grad_norm": 5.219719409942627, | |
| "learning_rate": 0.0004902439024390243, | |
| "loss": 1.1787, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.0368, | |
| "grad_norm": 2.3855836391448975, | |
| "learning_rate": 0.0004894308943089431, | |
| "loss": 1.907, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.0384, | |
| "grad_norm": 6.361728191375732, | |
| "learning_rate": 0.0004886178861788618, | |
| "loss": 1.4054, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 2.0494649410247803, | |
| "learning_rate": 0.0004878048780487805, | |
| "loss": 1.5189, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "eval_loss": 1.650099277496338, | |
| "eval_runtime": 1034.8057, | |
| "eval_samples_per_second": 4.832, | |
| "eval_steps_per_second": 4.832, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.0416, | |
| "grad_norm": 1.2376840114593506, | |
| "learning_rate": 0.0004869918699186992, | |
| "loss": 1.4281, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.0432, | |
| "grad_norm": 1.7537360191345215, | |
| "learning_rate": 0.00048617886178861793, | |
| "loss": 1.1609, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.0448, | |
| "grad_norm": 2.7885940074920654, | |
| "learning_rate": 0.00048536585365853657, | |
| "loss": 1.4809, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.0464, | |
| "grad_norm": 2.5449018478393555, | |
| "learning_rate": 0.0004845528455284553, | |
| "loss": 1.5099, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "grad_norm": 4.444058418273926, | |
| "learning_rate": 0.000483739837398374, | |
| "loss": 1.2722, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.0496, | |
| "grad_norm": 0.870796263217926, | |
| "learning_rate": 0.00048292682926829266, | |
| "loss": 1.0486, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.0512, | |
| "grad_norm": 1.8581308126449585, | |
| "learning_rate": 0.0004821138211382114, | |
| "loss": 1.4882, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.0528, | |
| "grad_norm": 0.9003315567970276, | |
| "learning_rate": 0.0004813008130081301, | |
| "loss": 1.177, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.0544, | |
| "grad_norm": 2.3489129543304443, | |
| "learning_rate": 0.0004804878048780488, | |
| "loss": 1.0059, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.056, | |
| "grad_norm": 2.488342046737671, | |
| "learning_rate": 0.0004796747967479675, | |
| "loss": 1.6738, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.0576, | |
| "grad_norm": 2.3013713359832764, | |
| "learning_rate": 0.0004788617886178862, | |
| "loss": 1.362, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.0592, | |
| "grad_norm": 1.2207469940185547, | |
| "learning_rate": 0.0004780487804878049, | |
| "loss": 1.3391, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.0608, | |
| "grad_norm": 0.778947651386261, | |
| "learning_rate": 0.0004772357723577236, | |
| "loss": 1.4315, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.0624, | |
| "grad_norm": 0.9309093952178955, | |
| "learning_rate": 0.0004764227642276423, | |
| "loss": 1.052, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": 6.077177047729492, | |
| "learning_rate": 0.00047560975609756096, | |
| "loss": 1.2999, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.0656, | |
| "grad_norm": 1.5627727508544922, | |
| "learning_rate": 0.00047479674796747966, | |
| "loss": 0.9933, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.0672, | |
| "grad_norm": 2.4397456645965576, | |
| "learning_rate": 0.0004739837398373984, | |
| "loss": 1.083, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.0688, | |
| "grad_norm": 1.7931780815124512, | |
| "learning_rate": 0.00047317073170731705, | |
| "loss": 1.0095, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.0704, | |
| "grad_norm": 1.6210583448410034, | |
| "learning_rate": 0.0004723577235772358, | |
| "loss": 1.1547, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.072, | |
| "grad_norm": 2.350104808807373, | |
| "learning_rate": 0.0004715447154471545, | |
| "loss": 1.0687, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.0736, | |
| "grad_norm": 3.4625842571258545, | |
| "learning_rate": 0.00047073170731707313, | |
| "loss": 1.0758, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.0752, | |
| "grad_norm": 1.5620652437210083, | |
| "learning_rate": 0.0004699186991869919, | |
| "loss": 0.8588, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.0768, | |
| "grad_norm": 2.5379786491394043, | |
| "learning_rate": 0.0004691056910569106, | |
| "loss": 1.3742, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.0784, | |
| "grad_norm": 1.6187106370925903, | |
| "learning_rate": 0.0004682926829268293, | |
| "loss": 0.9447, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 2.7396605014801025, | |
| "learning_rate": 0.00046747967479674797, | |
| "loss": 1.0118, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "eval_loss": 1.4677984714508057, | |
| "eval_runtime": 1024.8065, | |
| "eval_samples_per_second": 4.879, | |
| "eval_steps_per_second": 4.879, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.0816, | |
| "grad_norm": 0.862608015537262, | |
| "learning_rate": 0.00046666666666666666, | |
| "loss": 1.3823, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.0832, | |
| "grad_norm": 1.3692424297332764, | |
| "learning_rate": 0.0004658536585365854, | |
| "loss": 0.8877, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.0848, | |
| "grad_norm": 1.674166202545166, | |
| "learning_rate": 0.00046504065040650405, | |
| "loss": 1.3644, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.0864, | |
| "grad_norm": 3.7798876762390137, | |
| "learning_rate": 0.0004642276422764228, | |
| "loss": 1.2296, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.088, | |
| "grad_norm": 2.530099630355835, | |
| "learning_rate": 0.0004634146341463415, | |
| "loss": 1.4154, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.0896, | |
| "grad_norm": 2.375380039215088, | |
| "learning_rate": 0.00046260162601626014, | |
| "loss": 0.9654, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.0912, | |
| "grad_norm": 4.751425266265869, | |
| "learning_rate": 0.0004617886178861789, | |
| "loss": 1.4011, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.0928, | |
| "grad_norm": 1.316773772239685, | |
| "learning_rate": 0.0004609756097560976, | |
| "loss": 0.9764, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.0944, | |
| "grad_norm": 3.132272243499756, | |
| "learning_rate": 0.0004601626016260163, | |
| "loss": 1.3579, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "grad_norm": 2.0117361545562744, | |
| "learning_rate": 0.00045934959349593497, | |
| "loss": 1.2378, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.0976, | |
| "grad_norm": 3.49707293510437, | |
| "learning_rate": 0.00045853658536585366, | |
| "loss": 0.9963, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.0992, | |
| "grad_norm": 3.4453463554382324, | |
| "learning_rate": 0.00045772357723577236, | |
| "loss": 1.5574, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.1008, | |
| "grad_norm": null, | |
| "learning_rate": 0.00045691056910569105, | |
| "loss": 0.981, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.1024, | |
| "grad_norm": 1.6297922134399414, | |
| "learning_rate": 0.0004560975609756098, | |
| "loss": 1.3719, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.104, | |
| "grad_norm": 1.4996604919433594, | |
| "learning_rate": 0.00045528455284552844, | |
| "loss": 0.8291, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.1056, | |
| "grad_norm": 4.539699077606201, | |
| "learning_rate": 0.00045447154471544714, | |
| "loss": 1.3347, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.1072, | |
| "grad_norm": 1.6817468404769897, | |
| "learning_rate": 0.0004536585365853659, | |
| "loss": 1.1633, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.1088, | |
| "grad_norm": 2.0384576320648193, | |
| "learning_rate": 0.00045284552845528453, | |
| "loss": 0.7568, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.1104, | |
| "grad_norm": 0.6239755749702454, | |
| "learning_rate": 0.0004520325203252033, | |
| "loss": 0.9898, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.112, | |
| "grad_norm": 3.0464870929718018, | |
| "learning_rate": 0.00045121951219512197, | |
| "loss": 1.1183, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.1136, | |
| "grad_norm": 7.100531101226807, | |
| "learning_rate": 0.0004504065040650406, | |
| "loss": 1.1491, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.1152, | |
| "grad_norm": 2.297905683517456, | |
| "learning_rate": 0.00044959349593495936, | |
| "loss": 1.1005, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.1168, | |
| "grad_norm": 0.7417575120925903, | |
| "learning_rate": 0.00044878048780487806, | |
| "loss": 0.6954, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.1184, | |
| "grad_norm": 4.498805999755859, | |
| "learning_rate": 0.00044796747967479675, | |
| "loss": 1.3516, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 2.0761196613311768, | |
| "learning_rate": 0.00044715447154471545, | |
| "loss": 1.3995, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "eval_loss": 1.4611822366714478, | |
| "eval_runtime": 992.1537, | |
| "eval_samples_per_second": 5.04, | |
| "eval_steps_per_second": 5.04, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.1216, | |
| "grad_norm": 0.4251354932785034, | |
| "learning_rate": 0.00044634146341463414, | |
| "loss": 0.89, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.1232, | |
| "grad_norm": 1.715605616569519, | |
| "learning_rate": 0.0004455284552845529, | |
| "loss": 0.7241, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.1248, | |
| "grad_norm": 3.6502270698547363, | |
| "learning_rate": 0.00044471544715447153, | |
| "loss": 0.8324, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.1264, | |
| "grad_norm": 2.7257204055786133, | |
| "learning_rate": 0.0004439024390243903, | |
| "loss": 1.1949, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "grad_norm": 3.6415014266967773, | |
| "learning_rate": 0.000443089430894309, | |
| "loss": 1.1487, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.1296, | |
| "grad_norm": 1.9723396301269531, | |
| "learning_rate": 0.0004422764227642276, | |
| "loss": 0.8628, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.1312, | |
| "grad_norm": 3.361844778060913, | |
| "learning_rate": 0.00044146341463414636, | |
| "loss": 1.1734, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.1328, | |
| "grad_norm": 2.197422981262207, | |
| "learning_rate": 0.00044065040650406506, | |
| "loss": 1.4427, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.1344, | |
| "grad_norm": 1.0647861957550049, | |
| "learning_rate": 0.00043983739837398375, | |
| "loss": 1.0792, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.136, | |
| "grad_norm": 2.6153125762939453, | |
| "learning_rate": 0.00043902439024390245, | |
| "loss": 1.0544, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.1376, | |
| "grad_norm": 2.4081950187683105, | |
| "learning_rate": 0.00043821138211382114, | |
| "loss": 1.187, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.1392, | |
| "grad_norm": 1.5457711219787598, | |
| "learning_rate": 0.00043739837398373984, | |
| "loss": 1.0687, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.1408, | |
| "grad_norm": 1.0338834524154663, | |
| "learning_rate": 0.00043658536585365853, | |
| "loss": 0.9691, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.1424, | |
| "grad_norm": 2.2360999584198, | |
| "learning_rate": 0.0004357723577235773, | |
| "loss": 1.0545, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.144, | |
| "grad_norm": 2.766735076904297, | |
| "learning_rate": 0.0004349593495934959, | |
| "loss": 0.8978, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.1456, | |
| "grad_norm": 2.768793821334839, | |
| "learning_rate": 0.0004341463414634146, | |
| "loss": 1.3507, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.1472, | |
| "grad_norm": 2.9616165161132812, | |
| "learning_rate": 0.00043333333333333337, | |
| "loss": 1.2602, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.1488, | |
| "grad_norm": 2.001955986022949, | |
| "learning_rate": 0.000432520325203252, | |
| "loss": 1.1164, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.1504, | |
| "grad_norm": 3.6507134437561035, | |
| "learning_rate": 0.00043170731707317076, | |
| "loss": 1.0245, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.152, | |
| "grad_norm": 2.580472230911255, | |
| "learning_rate": 0.00043089430894308945, | |
| "loss": 1.1918, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.1536, | |
| "grad_norm": 1.4432860612869263, | |
| "learning_rate": 0.0004300813008130081, | |
| "loss": 1.0045, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.1552, | |
| "grad_norm": 0.4128560423851013, | |
| "learning_rate": 0.00042926829268292684, | |
| "loss": 1.1314, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.1568, | |
| "grad_norm": 1.6866891384124756, | |
| "learning_rate": 0.00042845528455284554, | |
| "loss": 0.8903, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.1584, | |
| "grad_norm": 2.277833938598633, | |
| "learning_rate": 0.00042764227642276423, | |
| "loss": 1.1665, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 2.491178512573242, | |
| "learning_rate": 0.0004268292682926829, | |
| "loss": 0.8519, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_loss": 1.285598874092102, | |
| "eval_runtime": 985.2873, | |
| "eval_samples_per_second": 5.075, | |
| "eval_steps_per_second": 5.075, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.1616, | |
| "grad_norm": 2.373556137084961, | |
| "learning_rate": 0.0004260162601626016, | |
| "loss": 0.8051, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.1632, | |
| "grad_norm": 2.0977611541748047, | |
| "learning_rate": 0.00042520325203252037, | |
| "loss": 1.091, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.1648, | |
| "grad_norm": 1.58762526512146, | |
| "learning_rate": 0.000424390243902439, | |
| "loss": 1.0129, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.1664, | |
| "grad_norm": 2.275988817214966, | |
| "learning_rate": 0.00042357723577235776, | |
| "loss": 1.2149, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.168, | |
| "grad_norm": 2.571326494216919, | |
| "learning_rate": 0.00042276422764227645, | |
| "loss": 0.6916, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.1696, | |
| "grad_norm": 1.5755791664123535, | |
| "learning_rate": 0.0004219512195121951, | |
| "loss": 0.8999, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.1712, | |
| "grad_norm": 2.5511739253997803, | |
| "learning_rate": 0.00042113821138211384, | |
| "loss": 1.3716, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.1728, | |
| "grad_norm": 1.7605398893356323, | |
| "learning_rate": 0.00042032520325203254, | |
| "loss": 0.6896, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.1744, | |
| "grad_norm": 2.7377278804779053, | |
| "learning_rate": 0.00041951219512195123, | |
| "loss": 0.8409, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.176, | |
| "grad_norm": 0.7547760009765625, | |
| "learning_rate": 0.00041869918699186993, | |
| "loss": 0.5473, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.1776, | |
| "grad_norm": 0.608930766582489, | |
| "learning_rate": 0.0004178861788617886, | |
| "loss": 0.9329, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.1792, | |
| "grad_norm": 1.1628577709197998, | |
| "learning_rate": 0.0004170731707317073, | |
| "loss": 1.1194, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.1808, | |
| "grad_norm": 1.2819926738739014, | |
| "learning_rate": 0.000416260162601626, | |
| "loss": 0.8936, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.1824, | |
| "grad_norm": 4.105648040771484, | |
| "learning_rate": 0.00041544715447154476, | |
| "loss": 1.0135, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.184, | |
| "grad_norm": 1.276444435119629, | |
| "learning_rate": 0.0004146341463414634, | |
| "loss": 0.9495, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.1856, | |
| "grad_norm": 4.56814432144165, | |
| "learning_rate": 0.0004138211382113821, | |
| "loss": 0.981, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.1872, | |
| "grad_norm": 2.2705514430999756, | |
| "learning_rate": 0.00041300813008130085, | |
| "loss": 1.213, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.1888, | |
| "grad_norm": 0.3999709188938141, | |
| "learning_rate": 0.0004121951219512195, | |
| "loss": 0.7223, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.1904, | |
| "grad_norm": 1.3647452592849731, | |
| "learning_rate": 0.00041138211382113824, | |
| "loss": 0.8783, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.192, | |
| "grad_norm": 1.4808121919631958, | |
| "learning_rate": 0.00041056910569105693, | |
| "loss": 1.2453, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.1936, | |
| "grad_norm": 1.598960041999817, | |
| "learning_rate": 0.00040975609756097557, | |
| "loss": 0.9894, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.1952, | |
| "grad_norm": 2.9480648040771484, | |
| "learning_rate": 0.0004089430894308943, | |
| "loss": 1.3508, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.1968, | |
| "grad_norm": 1.4865163564682007, | |
| "learning_rate": 0.000408130081300813, | |
| "loss": 1.1306, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.1984, | |
| "grad_norm": 1.445144772529602, | |
| "learning_rate": 0.0004073170731707317, | |
| "loss": 1.0977, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 1.741799235343933, | |
| "learning_rate": 0.0004065040650406504, | |
| "loss": 0.6742, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "eval_loss": 1.3160793781280518, | |
| "eval_runtime": 979.8483, | |
| "eval_samples_per_second": 5.103, | |
| "eval_steps_per_second": 5.103, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.2016, | |
| "grad_norm": 2.603684425354004, | |
| "learning_rate": 0.0004056910569105691, | |
| "loss": 1.0061, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.2032, | |
| "grad_norm": 1.8652371168136597, | |
| "learning_rate": 0.00040487804878048785, | |
| "loss": 1.4929, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.2048, | |
| "grad_norm": 0.33945292234420776, | |
| "learning_rate": 0.0004040650406504065, | |
| "loss": 0.8642, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.2064, | |
| "grad_norm": 4.188396453857422, | |
| "learning_rate": 0.00040325203252032524, | |
| "loss": 0.8442, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.208, | |
| "grad_norm": 0.17668074369430542, | |
| "learning_rate": 0.00040243902439024393, | |
| "loss": 0.8546, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.2096, | |
| "grad_norm": 2.030315637588501, | |
| "learning_rate": 0.0004016260162601626, | |
| "loss": 0.8966, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.2112, | |
| "grad_norm": 2.019416332244873, | |
| "learning_rate": 0.0004008130081300813, | |
| "loss": 0.8694, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.2128, | |
| "grad_norm": 2.457531690597534, | |
| "learning_rate": 0.0004, | |
| "loss": 1.1978, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.2144, | |
| "grad_norm": 1.8144443035125732, | |
| "learning_rate": 0.0003991869918699187, | |
| "loss": 0.5631, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.216, | |
| "grad_norm": 1.4982506036758423, | |
| "learning_rate": 0.0003983739837398374, | |
| "loss": 1.5233, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.2176, | |
| "grad_norm": 2.3264763355255127, | |
| "learning_rate": 0.0003975609756097561, | |
| "loss": 0.9226, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.2192, | |
| "grad_norm": 3.8008902072906494, | |
| "learning_rate": 0.0003967479674796748, | |
| "loss": 1.1128, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.2208, | |
| "grad_norm": 1.7756544351577759, | |
| "learning_rate": 0.0003959349593495935, | |
| "loss": 1.0248, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.2224, | |
| "grad_norm": 1.0137070417404175, | |
| "learning_rate": 0.00039512195121951224, | |
| "loss": 0.9582, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.224, | |
| "grad_norm": 2.0322794914245605, | |
| "learning_rate": 0.0003943089430894309, | |
| "loss": 1.0282, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.2256, | |
| "grad_norm": 3.740598678588867, | |
| "learning_rate": 0.0003934959349593496, | |
| "loss": 0.8924, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.2272, | |
| "grad_norm": 1.8170685768127441, | |
| "learning_rate": 0.0003926829268292683, | |
| "loss": 1.1218, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.2288, | |
| "grad_norm": 3.1609694957733154, | |
| "learning_rate": 0.00039186991869918697, | |
| "loss": 1.2141, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.2304, | |
| "grad_norm": 3.2545180320739746, | |
| "learning_rate": 0.0003910569105691057, | |
| "loss": 1.3735, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.232, | |
| "grad_norm": 3.983858585357666, | |
| "learning_rate": 0.0003902439024390244, | |
| "loss": 1.3722, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.2336, | |
| "grad_norm": 1.1452367305755615, | |
| "learning_rate": 0.00038943089430894305, | |
| "loss": 0.9909, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.2352, | |
| "grad_norm": 1.7607592344284058, | |
| "learning_rate": 0.0003886178861788618, | |
| "loss": 0.8425, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.2368, | |
| "grad_norm": 4.85504150390625, | |
| "learning_rate": 0.0003878048780487805, | |
| "loss": 1.3845, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.2384, | |
| "grad_norm": 1.5017164945602417, | |
| "learning_rate": 0.0003869918699186992, | |
| "loss": 1.2752, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 2.2073278427124023, | |
| "learning_rate": 0.0003861788617886179, | |
| "loss": 1.2848, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "eval_loss": 1.2479583024978638, | |
| "eval_runtime": 983.0597, | |
| "eval_samples_per_second": 5.086, | |
| "eval_steps_per_second": 5.086, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.2416, | |
| "grad_norm": 1.8838679790496826, | |
| "learning_rate": 0.0003853658536585366, | |
| "loss": 0.8091, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.2432, | |
| "grad_norm": 1.3011258840560913, | |
| "learning_rate": 0.00038455284552845533, | |
| "loss": 0.9439, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.2448, | |
| "grad_norm": 2.353581666946411, | |
| "learning_rate": 0.00038373983739837397, | |
| "loss": 0.7322, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.2464, | |
| "grad_norm": 3.3765604496002197, | |
| "learning_rate": 0.0003829268292682927, | |
| "loss": 1.275, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.248, | |
| "grad_norm": 0.1768433153629303, | |
| "learning_rate": 0.0003821138211382114, | |
| "loss": 0.6793, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.2496, | |
| "grad_norm": 2.3571674823760986, | |
| "learning_rate": 0.00038130081300813005, | |
| "loss": 1.0786, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.2512, | |
| "grad_norm": 2.615161180496216, | |
| "learning_rate": 0.0003804878048780488, | |
| "loss": 1.0066, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.2528, | |
| "grad_norm": 4.079470634460449, | |
| "learning_rate": 0.0003796747967479675, | |
| "loss": 1.1569, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.2544, | |
| "grad_norm": 4.194530010223389, | |
| "learning_rate": 0.0003788617886178862, | |
| "loss": 0.8787, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "grad_norm": 1.096019983291626, | |
| "learning_rate": 0.0003780487804878049, | |
| "loss": 1.2383, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.2576, | |
| "grad_norm": 1.352885365486145, | |
| "learning_rate": 0.0003772357723577236, | |
| "loss": 0.9154, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.2592, | |
| "grad_norm": 0.7181969285011292, | |
| "learning_rate": 0.0003764227642276423, | |
| "loss": 0.9014, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.2608, | |
| "grad_norm": 2.9218623638153076, | |
| "learning_rate": 0.000375609756097561, | |
| "loss": 0.5235, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.2624, | |
| "grad_norm": 1.3558822870254517, | |
| "learning_rate": 0.0003747967479674797, | |
| "loss": 0.9481, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.264, | |
| "grad_norm": 1.477393627166748, | |
| "learning_rate": 0.00037398373983739836, | |
| "loss": 0.952, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.2656, | |
| "grad_norm": 1.007265329360962, | |
| "learning_rate": 0.00037317073170731706, | |
| "loss": 0.9081, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.2672, | |
| "grad_norm": 0.8309774994850159, | |
| "learning_rate": 0.0003723577235772358, | |
| "loss": 0.8679, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.2688, | |
| "grad_norm": 0.13249029219150543, | |
| "learning_rate": 0.00037154471544715445, | |
| "loss": 0.9459, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.2704, | |
| "grad_norm": 0.34113630652427673, | |
| "learning_rate": 0.0003707317073170732, | |
| "loss": 0.91, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.272, | |
| "grad_norm": 1.434639811515808, | |
| "learning_rate": 0.0003699186991869919, | |
| "loss": 1.0692, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.2736, | |
| "grad_norm": 0.6987395286560059, | |
| "learning_rate": 0.00036910569105691053, | |
| "loss": 0.7542, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.2752, | |
| "grad_norm": 1.9249191284179688, | |
| "learning_rate": 0.0003682926829268293, | |
| "loss": 0.8768, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.2768, | |
| "grad_norm": 1.9825557470321655, | |
| "learning_rate": 0.000367479674796748, | |
| "loss": 0.8561, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.2784, | |
| "grad_norm": 1.9180725812911987, | |
| "learning_rate": 0.00036666666666666667, | |
| "loss": 0.9844, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 2.271852970123291, | |
| "learning_rate": 0.00036585365853658537, | |
| "loss": 0.8497, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "eval_loss": 1.2472246885299683, | |
| "eval_runtime": 985.0188, | |
| "eval_samples_per_second": 5.076, | |
| "eval_steps_per_second": 5.076, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.2816, | |
| "grad_norm": 1.7116152048110962, | |
| "learning_rate": 0.00036504065040650406, | |
| "loss": 0.7791, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.2832, | |
| "grad_norm": 0.9404670596122742, | |
| "learning_rate": 0.0003642276422764228, | |
| "loss": 1.182, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.2848, | |
| "grad_norm": 1.3179750442504883, | |
| "learning_rate": 0.00036341463414634145, | |
| "loss": 0.969, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.2864, | |
| "grad_norm": 2.95133113861084, | |
| "learning_rate": 0.0003626016260162602, | |
| "loss": 1.1718, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.288, | |
| "grad_norm": 1.6571485996246338, | |
| "learning_rate": 0.0003617886178861789, | |
| "loss": 0.8182, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.2896, | |
| "grad_norm": 3.7702455520629883, | |
| "learning_rate": 0.00036097560975609753, | |
| "loss": 0.6961, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.2912, | |
| "grad_norm": 2.043027639389038, | |
| "learning_rate": 0.0003601626016260163, | |
| "loss": 1.2462, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.2928, | |
| "grad_norm": 3.061659336090088, | |
| "learning_rate": 0.000359349593495935, | |
| "loss": 0.9774, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.2944, | |
| "grad_norm": 2.4722511768341064, | |
| "learning_rate": 0.0003585365853658537, | |
| "loss": 1.0065, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.296, | |
| "grad_norm": 3.479311227798462, | |
| "learning_rate": 0.00035772357723577237, | |
| "loss": 0.939, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.2976, | |
| "grad_norm": 1.6811845302581787, | |
| "learning_rate": 0.00035691056910569106, | |
| "loss": 1.1362, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.2992, | |
| "grad_norm": 2.2085628509521484, | |
| "learning_rate": 0.00035609756097560976, | |
| "loss": 1.0811, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.3008, | |
| "grad_norm": 1.638785481452942, | |
| "learning_rate": 0.00035528455284552845, | |
| "loss": 0.9224, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.3024, | |
| "grad_norm": 2.308749198913574, | |
| "learning_rate": 0.0003544715447154472, | |
| "loss": 0.7213, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.304, | |
| "grad_norm": 3.1847198009490967, | |
| "learning_rate": 0.00035365853658536584, | |
| "loss": 1.0388, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.3056, | |
| "grad_norm": 0.1243385598063469, | |
| "learning_rate": 0.00035284552845528454, | |
| "loss": 0.9196, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.3072, | |
| "grad_norm": 3.720393657684326, | |
| "learning_rate": 0.0003520325203252033, | |
| "loss": 0.9104, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.3088, | |
| "grad_norm": 1.8433055877685547, | |
| "learning_rate": 0.0003512195121951219, | |
| "loss": 1.0138, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.3104, | |
| "grad_norm": 2.8602616786956787, | |
| "learning_rate": 0.0003504065040650407, | |
| "loss": 1.2236, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.312, | |
| "grad_norm": 1.5425291061401367, | |
| "learning_rate": 0.00034959349593495937, | |
| "loss": 1.1536, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.3136, | |
| "grad_norm": 3.1375532150268555, | |
| "learning_rate": 0.000348780487804878, | |
| "loss": 1.0996, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.3152, | |
| "grad_norm": 2.8346571922302246, | |
| "learning_rate": 0.00034796747967479676, | |
| "loss": 1.2285, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.3168, | |
| "grad_norm": 2.045203924179077, | |
| "learning_rate": 0.00034715447154471546, | |
| "loss": 1.1145, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.3184, | |
| "grad_norm": 2.8319339752197266, | |
| "learning_rate": 0.00034634146341463415, | |
| "loss": 0.9861, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 1.499254584312439, | |
| "learning_rate": 0.00034552845528455285, | |
| "loss": 1.1386, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_loss": 1.256957769393921, | |
| "eval_runtime": 991.1325, | |
| "eval_samples_per_second": 5.045, | |
| "eval_steps_per_second": 5.045, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.3216, | |
| "grad_norm": 0.11786320060491562, | |
| "learning_rate": 0.00034471544715447154, | |
| "loss": 0.5642, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.3232, | |
| "grad_norm": 3.7964823246002197, | |
| "learning_rate": 0.00034390243902439023, | |
| "loss": 0.7509, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.3248, | |
| "grad_norm": 0.842405378818512, | |
| "learning_rate": 0.00034308943089430893, | |
| "loss": 1.0208, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.3264, | |
| "grad_norm": 1.725205421447754, | |
| "learning_rate": 0.0003422764227642277, | |
| "loss": 0.8658, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.328, | |
| "grad_norm": 2.685744047164917, | |
| "learning_rate": 0.0003414634146341464, | |
| "loss": 0.9458, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.3296, | |
| "grad_norm": 1.153769850730896, | |
| "learning_rate": 0.000340650406504065, | |
| "loss": 0.711, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.3312, | |
| "grad_norm": 1.1558091640472412, | |
| "learning_rate": 0.00033983739837398376, | |
| "loss": 0.8342, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.3328, | |
| "grad_norm": 1.4191641807556152, | |
| "learning_rate": 0.00033902439024390246, | |
| "loss": 0.7802, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.3344, | |
| "grad_norm": 1.611901044845581, | |
| "learning_rate": 0.00033821138211382115, | |
| "loss": 0.6892, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.336, | |
| "grad_norm": 2.4627749919891357, | |
| "learning_rate": 0.00033739837398373985, | |
| "loss": 1.02, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.3376, | |
| "grad_norm": 1.315026044845581, | |
| "learning_rate": 0.00033658536585365854, | |
| "loss": 1.0683, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.3392, | |
| "grad_norm": 1.301009178161621, | |
| "learning_rate": 0.00033577235772357724, | |
| "loss": 0.877, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.3408, | |
| "grad_norm": 1.3801865577697754, | |
| "learning_rate": 0.00033495934959349593, | |
| "loss": 0.7509, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.3424, | |
| "grad_norm": 2.0285582542419434, | |
| "learning_rate": 0.0003341463414634147, | |
| "loss": 0.898, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.344, | |
| "grad_norm": 1.8573026657104492, | |
| "learning_rate": 0.0003333333333333333, | |
| "loss": 0.8242, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.3456, | |
| "grad_norm": 3.389634609222412, | |
| "learning_rate": 0.000332520325203252, | |
| "loss": 1.0648, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.3472, | |
| "grad_norm": 1.5863244533538818, | |
| "learning_rate": 0.00033170731707317077, | |
| "loss": 0.7867, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.3488, | |
| "grad_norm": 1.4909151792526245, | |
| "learning_rate": 0.0003308943089430894, | |
| "loss": 0.6263, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.3504, | |
| "grad_norm": 1.706332802772522, | |
| "learning_rate": 0.00033008130081300816, | |
| "loss": 1.2284, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.352, | |
| "grad_norm": 1.5109984874725342, | |
| "learning_rate": 0.00032926829268292685, | |
| "loss": 0.8387, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.3536, | |
| "grad_norm": 2.0420236587524414, | |
| "learning_rate": 0.0003284552845528455, | |
| "loss": 0.9217, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.3552, | |
| "grad_norm": 3.405055284500122, | |
| "learning_rate": 0.00032764227642276424, | |
| "loss": 1.2896, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.3568, | |
| "grad_norm": 1.0633502006530762, | |
| "learning_rate": 0.00032682926829268294, | |
| "loss": 0.9075, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.3584, | |
| "grad_norm": 8.972102165222168, | |
| "learning_rate": 0.00032601626016260163, | |
| "loss": 0.9067, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 2.066863775253296, | |
| "learning_rate": 0.0003252032520325203, | |
| "loss": 1.0274, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "eval_loss": 1.2403863668441772, | |
| "eval_runtime": 988.1441, | |
| "eval_samples_per_second": 5.06, | |
| "eval_steps_per_second": 5.06, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.3616, | |
| "grad_norm": 2.7696852684020996, | |
| "learning_rate": 0.000324390243902439, | |
| "loss": 0.9223, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.3632, | |
| "grad_norm": 0.5893625020980835, | |
| "learning_rate": 0.0003235772357723577, | |
| "loss": 0.477, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.3648, | |
| "grad_norm": 4.938126564025879, | |
| "learning_rate": 0.0003227642276422764, | |
| "loss": 0.9821, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.3664, | |
| "grad_norm": 3.392449378967285, | |
| "learning_rate": 0.00032195121951219516, | |
| "loss": 0.7618, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.368, | |
| "grad_norm": 0.7704805135726929, | |
| "learning_rate": 0.00032113821138211385, | |
| "loss": 0.8412, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.3696, | |
| "grad_norm": 2.1122782230377197, | |
| "learning_rate": 0.0003203252032520325, | |
| "loss": 1.0207, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.3712, | |
| "grad_norm": 0.8763427734375, | |
| "learning_rate": 0.00031951219512195124, | |
| "loss": 0.9647, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.3728, | |
| "grad_norm": 3.7990410327911377, | |
| "learning_rate": 0.00031869918699186994, | |
| "loss": 0.9023, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.3744, | |
| "grad_norm": 1.1437709331512451, | |
| "learning_rate": 0.00031788617886178863, | |
| "loss": 0.6665, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.376, | |
| "grad_norm": 1.7292026281356812, | |
| "learning_rate": 0.00031707317073170733, | |
| "loss": 0.9873, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.3776, | |
| "grad_norm": 2.6598150730133057, | |
| "learning_rate": 0.000316260162601626, | |
| "loss": 1.0194, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.3792, | |
| "grad_norm": 1.1113004684448242, | |
| "learning_rate": 0.0003154471544715447, | |
| "loss": 0.9213, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.3808, | |
| "grad_norm": 2.316197395324707, | |
| "learning_rate": 0.0003146341463414634, | |
| "loss": 0.8524, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.3824, | |
| "grad_norm": 3.070237636566162, | |
| "learning_rate": 0.00031382113821138216, | |
| "loss": 1.0224, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.384, | |
| "grad_norm": 1.2676721811294556, | |
| "learning_rate": 0.0003130081300813008, | |
| "loss": 0.6098, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.3856, | |
| "grad_norm": 0.6013288497924805, | |
| "learning_rate": 0.0003121951219512195, | |
| "loss": 0.7596, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.3872, | |
| "grad_norm": 1.8486628532409668, | |
| "learning_rate": 0.00031138211382113825, | |
| "loss": 0.8179, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.3888, | |
| "grad_norm": 1.7957018613815308, | |
| "learning_rate": 0.0003105691056910569, | |
| "loss": 0.612, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.3904, | |
| "grad_norm": 0.8443304896354675, | |
| "learning_rate": 0.00030975609756097564, | |
| "loss": 1.1852, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.392, | |
| "grad_norm": 2.9211597442626953, | |
| "learning_rate": 0.00030894308943089433, | |
| "loss": 0.648, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.3936, | |
| "grad_norm": 1.8100574016571045, | |
| "learning_rate": 0.00030813008130081297, | |
| "loss": 0.9621, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.3952, | |
| "grad_norm": 0.5521060824394226, | |
| "learning_rate": 0.0003073170731707317, | |
| "loss": 0.7497, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.3968, | |
| "grad_norm": 3.2785725593566895, | |
| "learning_rate": 0.0003065040650406504, | |
| "loss": 1.1055, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.3984, | |
| "grad_norm": 0.59607994556427, | |
| "learning_rate": 0.0003056910569105691, | |
| "loss": 1.0316, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 2.6700527667999268, | |
| "learning_rate": 0.0003048780487804878, | |
| "loss": 1.1739, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "eval_loss": 1.1851228475570679, | |
| "eval_runtime": 981.6822, | |
| "eval_samples_per_second": 5.093, | |
| "eval_steps_per_second": 5.093, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.4016, | |
| "grad_norm": 4.136201858520508, | |
| "learning_rate": 0.0003040650406504065, | |
| "loss": 1.065, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.4032, | |
| "grad_norm": 1.3209648132324219, | |
| "learning_rate": 0.0003032520325203252, | |
| "loss": 0.9474, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.4048, | |
| "grad_norm": 1.0437417030334473, | |
| "learning_rate": 0.0003024390243902439, | |
| "loss": 0.7807, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.4064, | |
| "grad_norm": 1.5563472509384155, | |
| "learning_rate": 0.00030162601626016264, | |
| "loss": 0.9598, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.408, | |
| "grad_norm": 2.1927716732025146, | |
| "learning_rate": 0.00030081300813008133, | |
| "loss": 0.9768, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.4096, | |
| "grad_norm": 2.606297016143799, | |
| "learning_rate": 0.0003, | |
| "loss": 0.9263, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.4112, | |
| "grad_norm": 2.935955762863159, | |
| "learning_rate": 0.0002991869918699187, | |
| "loss": 1.1745, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.4128, | |
| "grad_norm": 1.4000895023345947, | |
| "learning_rate": 0.0002983739837398374, | |
| "loss": 0.4629, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.4144, | |
| "grad_norm": 1.867844820022583, | |
| "learning_rate": 0.0002975609756097561, | |
| "loss": 0.5586, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.416, | |
| "grad_norm": 2.5813417434692383, | |
| "learning_rate": 0.0002967479674796748, | |
| "loss": 1.2132, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.4176, | |
| "grad_norm": 2.0057499408721924, | |
| "learning_rate": 0.0002959349593495935, | |
| "loss": 0.9353, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.4192, | |
| "grad_norm": 1.3958829641342163, | |
| "learning_rate": 0.0002951219512195122, | |
| "loss": 0.9026, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.4208, | |
| "grad_norm": 0.9305471777915955, | |
| "learning_rate": 0.0002943089430894309, | |
| "loss": 0.637, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.4224, | |
| "grad_norm": 1.509443998336792, | |
| "learning_rate": 0.00029349593495934964, | |
| "loss": 0.9101, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.424, | |
| "grad_norm": 1.1322437524795532, | |
| "learning_rate": 0.0002926829268292683, | |
| "loss": 0.8228, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.4256, | |
| "grad_norm": 1.4510544538497925, | |
| "learning_rate": 0.000291869918699187, | |
| "loss": 0.8965, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.4272, | |
| "grad_norm": 3.9876041412353516, | |
| "learning_rate": 0.0002910569105691057, | |
| "loss": 0.6826, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.4288, | |
| "grad_norm": 3.5676708221435547, | |
| "learning_rate": 0.00029024390243902437, | |
| "loss": 0.848, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.4304, | |
| "grad_norm": 2.8735740184783936, | |
| "learning_rate": 0.0002894308943089431, | |
| "loss": 0.8885, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.432, | |
| "grad_norm": 3.243591547012329, | |
| "learning_rate": 0.0002886178861788618, | |
| "loss": 0.7272, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.4336, | |
| "grad_norm": 1.0646330118179321, | |
| "learning_rate": 0.00028780487804878045, | |
| "loss": 0.9243, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.4352, | |
| "grad_norm": 2.145279884338379, | |
| "learning_rate": 0.0002869918699186992, | |
| "loss": 0.8605, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.4368, | |
| "grad_norm": 0.12611623108386993, | |
| "learning_rate": 0.0002861788617886179, | |
| "loss": 0.8593, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.4384, | |
| "grad_norm": 1.6009489297866821, | |
| "learning_rate": 0.0002853658536585366, | |
| "loss": 1.2734, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 1.202634572982788, | |
| "learning_rate": 0.0002845528455284553, | |
| "loss": 0.6056, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "eval_loss": 1.1804109811782837, | |
| "eval_runtime": 980.959, | |
| "eval_samples_per_second": 5.097, | |
| "eval_steps_per_second": 5.097, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.4416, | |
| "grad_norm": 2.083847999572754, | |
| "learning_rate": 0.000283739837398374, | |
| "loss": 0.9537, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.4432, | |
| "grad_norm": 1.4282503128051758, | |
| "learning_rate": 0.0002829268292682927, | |
| "loss": 1.1229, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.4448, | |
| "grad_norm": 2.896178722381592, | |
| "learning_rate": 0.00028211382113821137, | |
| "loss": 1.0021, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.4464, | |
| "grad_norm": 2.1293370723724365, | |
| "learning_rate": 0.0002813008130081301, | |
| "loss": 0.7991, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.448, | |
| "grad_norm": 2.050287961959839, | |
| "learning_rate": 0.0002804878048780488, | |
| "loss": 0.7226, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.4496, | |
| "grad_norm": 2.431018590927124, | |
| "learning_rate": 0.00027967479674796745, | |
| "loss": 0.7713, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.4512, | |
| "grad_norm": 0.8670142292976379, | |
| "learning_rate": 0.0002788617886178862, | |
| "loss": 0.8047, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.4528, | |
| "grad_norm": 2.0386359691619873, | |
| "learning_rate": 0.0002780487804878049, | |
| "loss": 1.1617, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.4544, | |
| "grad_norm": 2.0506820678710938, | |
| "learning_rate": 0.0002772357723577236, | |
| "loss": 0.8922, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.456, | |
| "grad_norm": 0.9461020827293396, | |
| "learning_rate": 0.0002764227642276423, | |
| "loss": 0.4643, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.4576, | |
| "grad_norm": 2.101771116256714, | |
| "learning_rate": 0.000275609756097561, | |
| "loss": 0.6462, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.4592, | |
| "grad_norm": 2.1208040714263916, | |
| "learning_rate": 0.0002747967479674797, | |
| "loss": 0.9001, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.4608, | |
| "grad_norm": 2.1006226539611816, | |
| "learning_rate": 0.00027398373983739837, | |
| "loss": 1.0824, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.4624, | |
| "grad_norm": 1.7304776906967163, | |
| "learning_rate": 0.0002731707317073171, | |
| "loss": 0.8835, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.464, | |
| "grad_norm": 1.453547477722168, | |
| "learning_rate": 0.00027235772357723576, | |
| "loss": 0.8462, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.4656, | |
| "grad_norm": 2.442309856414795, | |
| "learning_rate": 0.00027154471544715446, | |
| "loss": 1.2086, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.4672, | |
| "grad_norm": 1.3511006832122803, | |
| "learning_rate": 0.0002707317073170732, | |
| "loss": 0.7028, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.4688, | |
| "grad_norm": 0.836704671382904, | |
| "learning_rate": 0.00026991869918699185, | |
| "loss": 0.5175, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.4704, | |
| "grad_norm": 1.680782675743103, | |
| "learning_rate": 0.0002691056910569106, | |
| "loss": 0.906, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.472, | |
| "grad_norm": 1.6119508743286133, | |
| "learning_rate": 0.0002682926829268293, | |
| "loss": 0.8772, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.4736, | |
| "grad_norm": 0.7434157729148865, | |
| "learning_rate": 0.00026747967479674793, | |
| "loss": 0.7295, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.4752, | |
| "grad_norm": 2.9454092979431152, | |
| "learning_rate": 0.0002666666666666667, | |
| "loss": 0.6404, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.4768, | |
| "grad_norm": 2.6205222606658936, | |
| "learning_rate": 0.0002658536585365854, | |
| "loss": 0.7348, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.4784, | |
| "grad_norm": 0.9789513945579529, | |
| "learning_rate": 0.00026504065040650407, | |
| "loss": 0.9531, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 2.718863010406494, | |
| "learning_rate": 0.00026422764227642276, | |
| "loss": 0.7694, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_loss": 1.234536051750183, | |
| "eval_runtime": 970.4874, | |
| "eval_samples_per_second": 5.152, | |
| "eval_steps_per_second": 5.152, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.4816, | |
| "grad_norm": 3.2361960411071777, | |
| "learning_rate": 0.00026341463414634146, | |
| "loss": 1.0666, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 0.4832, | |
| "grad_norm": 2.865548610687256, | |
| "learning_rate": 0.00026260162601626015, | |
| "loss": 0.9337, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.4848, | |
| "grad_norm": 2.602585554122925, | |
| "learning_rate": 0.00026178861788617885, | |
| "loss": 0.6121, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 0.4864, | |
| "grad_norm": 3.5708200931549072, | |
| "learning_rate": 0.0002609756097560976, | |
| "loss": 0.9949, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.488, | |
| "grad_norm": 1.3991180658340454, | |
| "learning_rate": 0.0002601626016260163, | |
| "loss": 0.6146, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.4896, | |
| "grad_norm": 1.6713191270828247, | |
| "learning_rate": 0.00025934959349593493, | |
| "loss": 0.903, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.4912, | |
| "grad_norm": 2.2837250232696533, | |
| "learning_rate": 0.0002585365853658537, | |
| "loss": 0.6912, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 0.4928, | |
| "grad_norm": 1.2613333463668823, | |
| "learning_rate": 0.0002577235772357724, | |
| "loss": 0.7742, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.4944, | |
| "grad_norm": 0.7632296085357666, | |
| "learning_rate": 0.00025691056910569107, | |
| "loss": 0.6821, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 0.496, | |
| "grad_norm": 2.6601462364196777, | |
| "learning_rate": 0.00025609756097560977, | |
| "loss": 1.1208, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.4976, | |
| "grad_norm": 1.0819050073623657, | |
| "learning_rate": 0.00025528455284552846, | |
| "loss": 0.596, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 0.4992, | |
| "grad_norm": 2.7370996475219727, | |
| "learning_rate": 0.00025447154471544716, | |
| "loss": 0.7677, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 0.5008, | |
| "grad_norm": 1.8494040966033936, | |
| "learning_rate": 0.00025365853658536585, | |
| "loss": 1.0403, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 0.5024, | |
| "grad_norm": 1.1479870080947876, | |
| "learning_rate": 0.0002528455284552846, | |
| "loss": 0.7458, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 0.504, | |
| "grad_norm": 2.6968982219696045, | |
| "learning_rate": 0.00025203252032520324, | |
| "loss": 1.0832, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.5056, | |
| "grad_norm": 2.046722173690796, | |
| "learning_rate": 0.00025121951219512194, | |
| "loss": 0.8306, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 0.5072, | |
| "grad_norm": 1.8968234062194824, | |
| "learning_rate": 0.0002504065040650407, | |
| "loss": 0.6512, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 0.5088, | |
| "grad_norm": 2.967087984085083, | |
| "learning_rate": 0.0002495934959349594, | |
| "loss": 0.7747, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 0.5104, | |
| "grad_norm": 1.0478880405426025, | |
| "learning_rate": 0.0002487804878048781, | |
| "loss": 0.8271, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 0.512, | |
| "grad_norm": 0.8186447620391846, | |
| "learning_rate": 0.0002479674796747967, | |
| "loss": 0.891, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.5136, | |
| "grad_norm": 4.616454601287842, | |
| "learning_rate": 0.00024715447154471546, | |
| "loss": 1.0238, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 0.5152, | |
| "grad_norm": 0.8574868440628052, | |
| "learning_rate": 0.00024634146341463416, | |
| "loss": 1.1656, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 0.5168, | |
| "grad_norm": 0.13767553865909576, | |
| "learning_rate": 0.00024552845528455285, | |
| "loss": 0.7881, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 0.5184, | |
| "grad_norm": 3.214853048324585, | |
| "learning_rate": 0.00024471544715447155, | |
| "loss": 1.1101, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 1.6308200359344482, | |
| "learning_rate": 0.00024390243902439024, | |
| "loss": 0.816, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "eval_loss": 1.1804865598678589, | |
| "eval_runtime": 967.604, | |
| "eval_samples_per_second": 5.167, | |
| "eval_steps_per_second": 5.167, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.5216, | |
| "grad_norm": 2.277055501937866, | |
| "learning_rate": 0.00024308943089430897, | |
| "loss": 0.6252, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 0.5232, | |
| "grad_norm": 1.4975641965866089, | |
| "learning_rate": 0.00024227642276422766, | |
| "loss": 0.9561, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 0.5248, | |
| "grad_norm": 0.9692897796630859, | |
| "learning_rate": 0.00024146341463414633, | |
| "loss": 0.8452, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 0.5264, | |
| "grad_norm": 3.146620512008667, | |
| "learning_rate": 0.00024065040650406505, | |
| "loss": 0.7964, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 0.528, | |
| "grad_norm": 1.6603403091430664, | |
| "learning_rate": 0.00023983739837398375, | |
| "loss": 0.8526, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.5296, | |
| "grad_norm": 2.813284158706665, | |
| "learning_rate": 0.00023902439024390244, | |
| "loss": 0.7897, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 0.5312, | |
| "grad_norm": 2.0508971214294434, | |
| "learning_rate": 0.00023821138211382116, | |
| "loss": 0.7886, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.5328, | |
| "grad_norm": 2.0528271198272705, | |
| "learning_rate": 0.00023739837398373983, | |
| "loss": 0.9615, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 0.5344, | |
| "grad_norm": 0.16403831541538239, | |
| "learning_rate": 0.00023658536585365852, | |
| "loss": 0.646, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 0.536, | |
| "grad_norm": 0.7658578753471375, | |
| "learning_rate": 0.00023577235772357725, | |
| "loss": 0.952, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.5376, | |
| "grad_norm": 1.5900917053222656, | |
| "learning_rate": 0.00023495934959349594, | |
| "loss": 0.6692, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.5392, | |
| "grad_norm": 2.9648303985595703, | |
| "learning_rate": 0.00023414634146341466, | |
| "loss": 1.1142, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 0.5408, | |
| "grad_norm": 1.9145853519439697, | |
| "learning_rate": 0.00023333333333333333, | |
| "loss": 0.946, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 0.5424, | |
| "grad_norm": 1.9604383707046509, | |
| "learning_rate": 0.00023252032520325203, | |
| "loss": 1.0952, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 0.544, | |
| "grad_norm": 1.4773716926574707, | |
| "learning_rate": 0.00023170731707317075, | |
| "loss": 0.798, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.5456, | |
| "grad_norm": 1.0287730693817139, | |
| "learning_rate": 0.00023089430894308944, | |
| "loss": 0.6784, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 0.5472, | |
| "grad_norm": 1.171778678894043, | |
| "learning_rate": 0.00023008130081300814, | |
| "loss": 0.8399, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 0.5488, | |
| "grad_norm": 3.042232036590576, | |
| "learning_rate": 0.00022926829268292683, | |
| "loss": 0.9466, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 0.5504, | |
| "grad_norm": 2.661311388015747, | |
| "learning_rate": 0.00022845528455284553, | |
| "loss": 0.8741, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.552, | |
| "grad_norm": 0.824052631855011, | |
| "learning_rate": 0.00022764227642276422, | |
| "loss": 0.6971, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.5536, | |
| "grad_norm": 2.2068676948547363, | |
| "learning_rate": 0.00022682926829268294, | |
| "loss": 1.0792, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 0.5552, | |
| "grad_norm": 2.607996702194214, | |
| "learning_rate": 0.00022601626016260164, | |
| "loss": 0.7798, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 0.5568, | |
| "grad_norm": 3.1297590732574463, | |
| "learning_rate": 0.0002252032520325203, | |
| "loss": 0.7335, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.5584, | |
| "grad_norm": 1.5206081867218018, | |
| "learning_rate": 0.00022439024390243903, | |
| "loss": 0.5827, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.8850612044334412, | |
| "learning_rate": 0.00022357723577235772, | |
| "loss": 0.3555, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "eval_loss": 1.173862338066101, | |
| "eval_runtime": 972.1905, | |
| "eval_samples_per_second": 5.143, | |
| "eval_steps_per_second": 5.143, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.5616, | |
| "grad_norm": 2.206645965576172, | |
| "learning_rate": 0.00022276422764227645, | |
| "loss": 1.0334, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 0.5632, | |
| "grad_norm": 2.9328489303588867, | |
| "learning_rate": 0.00022195121951219514, | |
| "loss": 0.9996, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.5648, | |
| "grad_norm": 1.533145785331726, | |
| "learning_rate": 0.0002211382113821138, | |
| "loss": 0.6801, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 0.5664, | |
| "grad_norm": 2.793165922164917, | |
| "learning_rate": 0.00022032520325203253, | |
| "loss": 0.5243, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 0.568, | |
| "grad_norm": 0.8398239612579346, | |
| "learning_rate": 0.00021951219512195122, | |
| "loss": 0.7804, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.5696, | |
| "grad_norm": 0.8570539951324463, | |
| "learning_rate": 0.00021869918699186992, | |
| "loss": 0.8727, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 0.5712, | |
| "grad_norm": 3.4033238887786865, | |
| "learning_rate": 0.00021788617886178864, | |
| "loss": 0.7058, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 0.5728, | |
| "grad_norm": 2.2475531101226807, | |
| "learning_rate": 0.0002170731707317073, | |
| "loss": 0.6313, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 0.5744, | |
| "grad_norm": 2.115605354309082, | |
| "learning_rate": 0.000216260162601626, | |
| "loss": 0.724, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 0.576, | |
| "grad_norm": 3.056288003921509, | |
| "learning_rate": 0.00021544715447154473, | |
| "loss": 0.8915, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.5776, | |
| "grad_norm": 2.0451629161834717, | |
| "learning_rate": 0.00021463414634146342, | |
| "loss": 0.9079, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 0.5792, | |
| "grad_norm": 2.807318925857544, | |
| "learning_rate": 0.00021382113821138212, | |
| "loss": 0.8133, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 0.5808, | |
| "grad_norm": 2.908517837524414, | |
| "learning_rate": 0.0002130081300813008, | |
| "loss": 1.1001, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 0.5824, | |
| "grad_norm": 3.8471007347106934, | |
| "learning_rate": 0.0002121951219512195, | |
| "loss": 1.2104, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 0.584, | |
| "grad_norm": 0.697363018989563, | |
| "learning_rate": 0.00021138211382113823, | |
| "loss": 0.7169, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.5856, | |
| "grad_norm": 3.8001105785369873, | |
| "learning_rate": 0.00021056910569105692, | |
| "loss": 0.7753, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 0.5872, | |
| "grad_norm": 1.2338181734085083, | |
| "learning_rate": 0.00020975609756097562, | |
| "loss": 0.8409, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 0.5888, | |
| "grad_norm": 2.482490062713623, | |
| "learning_rate": 0.0002089430894308943, | |
| "loss": 0.6949, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 0.5904, | |
| "grad_norm": 0.9223986864089966, | |
| "learning_rate": 0.000208130081300813, | |
| "loss": 0.5945, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 0.592, | |
| "grad_norm": 4.109449863433838, | |
| "learning_rate": 0.0002073170731707317, | |
| "loss": 0.819, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.5936, | |
| "grad_norm": 2.1545259952545166, | |
| "learning_rate": 0.00020650406504065042, | |
| "loss": 0.6917, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 0.5952, | |
| "grad_norm": 2.568392038345337, | |
| "learning_rate": 0.00020569105691056912, | |
| "loss": 0.948, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 0.5968, | |
| "grad_norm": 1.5172126293182373, | |
| "learning_rate": 0.00020487804878048779, | |
| "loss": 0.7395, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 0.5984, | |
| "grad_norm": 2.408418893814087, | |
| "learning_rate": 0.0002040650406504065, | |
| "loss": 0.6265, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 1.076709508895874, | |
| "learning_rate": 0.0002032520325203252, | |
| "loss": 0.8063, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "eval_loss": 1.1569427251815796, | |
| "eval_runtime": 959.9775, | |
| "eval_samples_per_second": 5.208, | |
| "eval_steps_per_second": 5.208, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.6016, | |
| "grad_norm": 1.5189223289489746, | |
| "learning_rate": 0.00020243902439024393, | |
| "loss": 0.7415, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 0.6032, | |
| "grad_norm": 1.6611405611038208, | |
| "learning_rate": 0.00020162601626016262, | |
| "loss": 0.7716, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 0.6048, | |
| "grad_norm": 2.1746842861175537, | |
| "learning_rate": 0.0002008130081300813, | |
| "loss": 0.9592, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 0.6064, | |
| "grad_norm": 1.7051889896392822, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7309, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 0.608, | |
| "grad_norm": 1.4838804006576538, | |
| "learning_rate": 0.0001991869918699187, | |
| "loss": 0.9909, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.6096, | |
| "grad_norm": 1.0016885995864868, | |
| "learning_rate": 0.0001983739837398374, | |
| "loss": 0.7118, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 0.6112, | |
| "grad_norm": 2.9000656604766846, | |
| "learning_rate": 0.00019756097560975612, | |
| "loss": 0.9148, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 0.6128, | |
| "grad_norm": 4.018497943878174, | |
| "learning_rate": 0.0001967479674796748, | |
| "loss": 0.7638, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 0.6144, | |
| "grad_norm": 2.2254245281219482, | |
| "learning_rate": 0.00019593495934959348, | |
| "loss": 0.6787, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 0.616, | |
| "grad_norm": 2.057569980621338, | |
| "learning_rate": 0.0001951219512195122, | |
| "loss": 1.046, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.6176, | |
| "grad_norm": 1.673245906829834, | |
| "learning_rate": 0.0001943089430894309, | |
| "loss": 0.7483, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 0.6192, | |
| "grad_norm": 1.6375939846038818, | |
| "learning_rate": 0.0001934959349593496, | |
| "loss": 0.7224, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 0.6208, | |
| "grad_norm": 1.7894866466522217, | |
| "learning_rate": 0.0001926829268292683, | |
| "loss": 1.1423, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 0.6224, | |
| "grad_norm": 3.3884127140045166, | |
| "learning_rate": 0.00019186991869918699, | |
| "loss": 1.2515, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 0.624, | |
| "grad_norm": 0.9296390414237976, | |
| "learning_rate": 0.0001910569105691057, | |
| "loss": 0.6837, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.6256, | |
| "grad_norm": 2.7435858249664307, | |
| "learning_rate": 0.0001902439024390244, | |
| "loss": 0.5283, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 0.6272, | |
| "grad_norm": 3.0927188396453857, | |
| "learning_rate": 0.0001894308943089431, | |
| "loss": 1.0301, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 0.6288, | |
| "grad_norm": 1.5934438705444336, | |
| "learning_rate": 0.0001886178861788618, | |
| "loss": 0.9681, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 0.6304, | |
| "grad_norm": 0.7750480771064758, | |
| "learning_rate": 0.0001878048780487805, | |
| "loss": 0.5935, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 0.632, | |
| "grad_norm": 2.40685772895813, | |
| "learning_rate": 0.00018699186991869918, | |
| "loss": 0.9389, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.6336, | |
| "grad_norm": 0.20400309562683105, | |
| "learning_rate": 0.0001861788617886179, | |
| "loss": 0.9855, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 0.6352, | |
| "grad_norm": 0.8683297634124756, | |
| "learning_rate": 0.0001853658536585366, | |
| "loss": 1.0224, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 0.6368, | |
| "grad_norm": 1.6435086727142334, | |
| "learning_rate": 0.00018455284552845527, | |
| "loss": 0.6436, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 0.6384, | |
| "grad_norm": 1.5825189352035522, | |
| "learning_rate": 0.000183739837398374, | |
| "loss": 0.6848, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 2.164686918258667, | |
| "learning_rate": 0.00018292682926829268, | |
| "loss": 0.962, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_loss": 1.164829134941101, | |
| "eval_runtime": 957.7122, | |
| "eval_samples_per_second": 5.221, | |
| "eval_steps_per_second": 5.221, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.6416, | |
| "grad_norm": 2.6798882484436035, | |
| "learning_rate": 0.0001821138211382114, | |
| "loss": 0.7698, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 0.6432, | |
| "grad_norm": 2.3492815494537354, | |
| "learning_rate": 0.0001813008130081301, | |
| "loss": 0.665, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 0.6448, | |
| "grad_norm": 0.8926377296447754, | |
| "learning_rate": 0.00018048780487804877, | |
| "loss": 0.6753, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 0.6464, | |
| "grad_norm": 3.1364357471466064, | |
| "learning_rate": 0.0001796747967479675, | |
| "loss": 0.8779, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 0.648, | |
| "grad_norm": 3.5229640007019043, | |
| "learning_rate": 0.00017886178861788618, | |
| "loss": 0.9561, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.6496, | |
| "grad_norm": 1.6057953834533691, | |
| "learning_rate": 0.00017804878048780488, | |
| "loss": 0.8222, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 0.6512, | |
| "grad_norm": 4.898631572723389, | |
| "learning_rate": 0.0001772357723577236, | |
| "loss": 1.0256, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 0.6528, | |
| "grad_norm": 2.4350786209106445, | |
| "learning_rate": 0.00017642276422764227, | |
| "loss": 0.6602, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 0.6544, | |
| "grad_norm": 2.0918514728546143, | |
| "learning_rate": 0.00017560975609756096, | |
| "loss": 1.1968, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 0.656, | |
| "grad_norm": 0.8441520929336548, | |
| "learning_rate": 0.00017479674796747969, | |
| "loss": 0.7727, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.6576, | |
| "grad_norm": 2.8887555599212646, | |
| "learning_rate": 0.00017398373983739838, | |
| "loss": 1.2474, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 0.6592, | |
| "grad_norm": 0.07653144001960754, | |
| "learning_rate": 0.00017317073170731708, | |
| "loss": 0.4434, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 0.6608, | |
| "grad_norm": 2.0132455825805664, | |
| "learning_rate": 0.00017235772357723577, | |
| "loss": 0.9298, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 0.6624, | |
| "grad_norm": 1.6578240394592285, | |
| "learning_rate": 0.00017154471544715446, | |
| "loss": 0.944, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 0.664, | |
| "grad_norm": 2.6716582775115967, | |
| "learning_rate": 0.0001707317073170732, | |
| "loss": 0.7706, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.6656, | |
| "grad_norm": 3.211911916732788, | |
| "learning_rate": 0.00016991869918699188, | |
| "loss": 0.8844, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 0.6672, | |
| "grad_norm": 2.383862257003784, | |
| "learning_rate": 0.00016910569105691058, | |
| "loss": 0.9795, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 0.6688, | |
| "grad_norm": 1.9137873649597168, | |
| "learning_rate": 0.00016829268292682927, | |
| "loss": 0.6962, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 0.6704, | |
| "grad_norm": 1.3267086744308472, | |
| "learning_rate": 0.00016747967479674797, | |
| "loss": 0.9447, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 0.672, | |
| "grad_norm": 2.085939645767212, | |
| "learning_rate": 0.00016666666666666666, | |
| "loss": 1.0395, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.6736, | |
| "grad_norm": 1.9438047409057617, | |
| "learning_rate": 0.00016585365853658538, | |
| "loss": 0.8711, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 0.6752, | |
| "grad_norm": 1.7191540002822876, | |
| "learning_rate": 0.00016504065040650408, | |
| "loss": 0.8736, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 0.6768, | |
| "grad_norm": 2.784453868865967, | |
| "learning_rate": 0.00016422764227642275, | |
| "loss": 0.7039, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 0.6784, | |
| "grad_norm": 2.904277801513672, | |
| "learning_rate": 0.00016341463414634147, | |
| "loss": 0.8601, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 1.8441609144210815, | |
| "learning_rate": 0.00016260162601626016, | |
| "loss": 0.8507, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "eval_loss": 1.1460280418395996, | |
| "eval_runtime": 955.4574, | |
| "eval_samples_per_second": 5.233, | |
| "eval_steps_per_second": 5.233, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.6816, | |
| "grad_norm": 2.44185471534729, | |
| "learning_rate": 0.00016178861788617886, | |
| "loss": 1.2223, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 0.6832, | |
| "grad_norm": 2.5785441398620605, | |
| "learning_rate": 0.00016097560975609758, | |
| "loss": 0.6929, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 0.6848, | |
| "grad_norm": 0.8098218441009521, | |
| "learning_rate": 0.00016016260162601625, | |
| "loss": 0.6971, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 0.6864, | |
| "grad_norm": 2.408022880554199, | |
| "learning_rate": 0.00015934959349593497, | |
| "loss": 0.8393, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 0.688, | |
| "grad_norm": 1.8108437061309814, | |
| "learning_rate": 0.00015853658536585366, | |
| "loss": 0.9469, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.6896, | |
| "grad_norm": 1.3393510580062866, | |
| "learning_rate": 0.00015772357723577236, | |
| "loss": 0.8046, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 0.6912, | |
| "grad_norm": 0.27787142992019653, | |
| "learning_rate": 0.00015691056910569108, | |
| "loss": 0.7817, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 0.6928, | |
| "grad_norm": 1.5397439002990723, | |
| "learning_rate": 0.00015609756097560975, | |
| "loss": 0.8529, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 0.6944, | |
| "grad_norm": 4.285569667816162, | |
| "learning_rate": 0.00015528455284552844, | |
| "loss": 0.797, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 0.696, | |
| "grad_norm": 1.4360575675964355, | |
| "learning_rate": 0.00015447154471544717, | |
| "loss": 0.6302, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.6976, | |
| "grad_norm": 3.1087186336517334, | |
| "learning_rate": 0.00015365853658536586, | |
| "loss": 0.8003, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 0.6992, | |
| "grad_norm": 3.1041085720062256, | |
| "learning_rate": 0.00015284552845528455, | |
| "loss": 0.6171, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 0.7008, | |
| "grad_norm": 1.951988935470581, | |
| "learning_rate": 0.00015203252032520325, | |
| "loss": 0.7859, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 0.7024, | |
| "grad_norm": 2.5091583728790283, | |
| "learning_rate": 0.00015121951219512194, | |
| "loss": 0.7038, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 0.704, | |
| "grad_norm": 1.7080724239349365, | |
| "learning_rate": 0.00015040650406504067, | |
| "loss": 0.6291, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.7056, | |
| "grad_norm": 1.1388205289840698, | |
| "learning_rate": 0.00014959349593495936, | |
| "loss": 0.8191, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 0.7072, | |
| "grad_norm": 2.7150771617889404, | |
| "learning_rate": 0.00014878048780487806, | |
| "loss": 1.1169, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 0.7088, | |
| "grad_norm": 1.7084001302719116, | |
| "learning_rate": 0.00014796747967479675, | |
| "loss": 0.8503, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 0.7104, | |
| "grad_norm": 0.9705413579940796, | |
| "learning_rate": 0.00014715447154471545, | |
| "loss": 0.7798, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 0.712, | |
| "grad_norm": 0.6582396030426025, | |
| "learning_rate": 0.00014634146341463414, | |
| "loss": 0.8122, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.7136, | |
| "grad_norm": 3.819809913635254, | |
| "learning_rate": 0.00014552845528455286, | |
| "loss": 0.9902, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 0.7152, | |
| "grad_norm": 1.9070576429367065, | |
| "learning_rate": 0.00014471544715447156, | |
| "loss": 0.7999, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 0.7168, | |
| "grad_norm": 2.159898042678833, | |
| "learning_rate": 0.00014390243902439023, | |
| "loss": 0.6514, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 0.7184, | |
| "grad_norm": 3.2352945804595947, | |
| "learning_rate": 0.00014308943089430895, | |
| "loss": 0.8179, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.7301401495933533, | |
| "learning_rate": 0.00014227642276422764, | |
| "loss": 0.9165, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "eval_loss": 1.130717396736145, | |
| "eval_runtime": 960.6233, | |
| "eval_samples_per_second": 5.205, | |
| "eval_steps_per_second": 5.205, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.7216, | |
| "grad_norm": 2.6877050399780273, | |
| "learning_rate": 0.00014146341463414634, | |
| "loss": 0.7501, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 0.7232, | |
| "grad_norm": 1.9907829761505127, | |
| "learning_rate": 0.00014065040650406506, | |
| "loss": 0.8375, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 0.7248, | |
| "grad_norm": 1.6620970964431763, | |
| "learning_rate": 0.00013983739837398373, | |
| "loss": 0.8965, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 0.7264, | |
| "grad_norm": 1.9919949769973755, | |
| "learning_rate": 0.00013902439024390245, | |
| "loss": 1.0359, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 0.728, | |
| "grad_norm": 1.9886322021484375, | |
| "learning_rate": 0.00013821138211382114, | |
| "loss": 0.7646, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.7296, | |
| "grad_norm": 3.00618577003479, | |
| "learning_rate": 0.00013739837398373984, | |
| "loss": 0.7386, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 0.7312, | |
| "grad_norm": 1.2489125728607178, | |
| "learning_rate": 0.00013658536585365856, | |
| "loss": 0.9153, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 0.7328, | |
| "grad_norm": 1.6659530401229858, | |
| "learning_rate": 0.00013577235772357723, | |
| "loss": 1.1601, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 0.7344, | |
| "grad_norm": 3.0665249824523926, | |
| "learning_rate": 0.00013495934959349592, | |
| "loss": 0.9097, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 0.736, | |
| "grad_norm": 1.2477220296859741, | |
| "learning_rate": 0.00013414634146341464, | |
| "loss": 0.5977, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.7376, | |
| "grad_norm": 2.523712158203125, | |
| "learning_rate": 0.00013333333333333334, | |
| "loss": 0.5476, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 0.7392, | |
| "grad_norm": 1.5238116979599, | |
| "learning_rate": 0.00013252032520325203, | |
| "loss": 0.6774, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 0.7408, | |
| "grad_norm": 4.065662860870361, | |
| "learning_rate": 0.00013170731707317073, | |
| "loss": 0.9686, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 0.7424, | |
| "grad_norm": 2.1243975162506104, | |
| "learning_rate": 0.00013089430894308942, | |
| "loss": 0.6776, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 0.744, | |
| "grad_norm": 1.4048924446105957, | |
| "learning_rate": 0.00013008130081300815, | |
| "loss": 0.6849, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.7456, | |
| "grad_norm": 0.9425554275512695, | |
| "learning_rate": 0.00012926829268292684, | |
| "loss": 0.5431, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 0.7472, | |
| "grad_norm": 1.4793920516967773, | |
| "learning_rate": 0.00012845528455284554, | |
| "loss": 0.611, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 0.7488, | |
| "grad_norm": 1.1930813789367676, | |
| "learning_rate": 0.00012764227642276423, | |
| "loss": 0.8103, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 0.7504, | |
| "grad_norm": 2.5445172786712646, | |
| "learning_rate": 0.00012682926829268293, | |
| "loss": 0.8219, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 0.752, | |
| "grad_norm": 1.7753766775131226, | |
| "learning_rate": 0.00012601626016260162, | |
| "loss": 0.8935, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.7536, | |
| "grad_norm": 1.7598661184310913, | |
| "learning_rate": 0.00012520325203252034, | |
| "loss": 0.7692, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 0.7552, | |
| "grad_norm": 0.9807813167572021, | |
| "learning_rate": 0.00012439024390243904, | |
| "loss": 0.8573, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 0.7568, | |
| "grad_norm": 1.0543572902679443, | |
| "learning_rate": 0.00012357723577235773, | |
| "loss": 0.362, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 0.7584, | |
| "grad_norm": 1.2011300325393677, | |
| "learning_rate": 0.00012276422764227643, | |
| "loss": 0.7364, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 1.013681411743164, | |
| "learning_rate": 0.00012195121951219512, | |
| "loss": 0.8223, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "eval_loss": 1.1317497491836548, | |
| "eval_runtime": 1047.5369, | |
| "eval_samples_per_second": 4.773, | |
| "eval_steps_per_second": 4.773, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.7616, | |
| "grad_norm": 3.048567295074463, | |
| "learning_rate": 0.00012113821138211383, | |
| "loss": 0.5417, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 0.7632, | |
| "grad_norm": 1.684781551361084, | |
| "learning_rate": 0.00012032520325203253, | |
| "loss": 0.8972, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 0.7648, | |
| "grad_norm": 1.7157773971557617, | |
| "learning_rate": 0.00011951219512195122, | |
| "loss": 0.8071, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 0.7664, | |
| "grad_norm": 2.1893560886383057, | |
| "learning_rate": 0.00011869918699186991, | |
| "loss": 0.7259, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 0.768, | |
| "grad_norm": 0.559054434299469, | |
| "learning_rate": 0.00011788617886178862, | |
| "loss": 0.6906, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.7696, | |
| "grad_norm": 1.9966106414794922, | |
| "learning_rate": 0.00011707317073170733, | |
| "loss": 0.7862, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 0.7712, | |
| "grad_norm": 2.0883913040161133, | |
| "learning_rate": 0.00011626016260162601, | |
| "loss": 0.9832, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 0.7728, | |
| "grad_norm": 0.9213092923164368, | |
| "learning_rate": 0.00011544715447154472, | |
| "loss": 0.5425, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 0.7744, | |
| "grad_norm": 3.5715088844299316, | |
| "learning_rate": 0.00011463414634146342, | |
| "loss": 0.908, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 0.776, | |
| "grad_norm": 1.5071601867675781, | |
| "learning_rate": 0.00011382113821138211, | |
| "loss": 0.9961, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.7776, | |
| "grad_norm": 2.8544921875, | |
| "learning_rate": 0.00011300813008130082, | |
| "loss": 0.8617, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 0.7792, | |
| "grad_norm": 1.5049967765808105, | |
| "learning_rate": 0.00011219512195121951, | |
| "loss": 0.6921, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 0.7808, | |
| "grad_norm": 1.413341999053955, | |
| "learning_rate": 0.00011138211382113822, | |
| "loss": 0.87, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 0.7824, | |
| "grad_norm": 3.063117027282715, | |
| "learning_rate": 0.0001105691056910569, | |
| "loss": 0.8527, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 0.784, | |
| "grad_norm": 1.7717232704162598, | |
| "learning_rate": 0.00010975609756097561, | |
| "loss": 0.9359, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.7856, | |
| "grad_norm": 2.220553398132324, | |
| "learning_rate": 0.00010894308943089432, | |
| "loss": 0.8893, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 0.7872, | |
| "grad_norm": 1.1765658855438232, | |
| "learning_rate": 0.000108130081300813, | |
| "loss": 0.5423, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 0.7888, | |
| "grad_norm": 3.124976873397827, | |
| "learning_rate": 0.00010731707317073171, | |
| "loss": 0.8261, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 0.7904, | |
| "grad_norm": 1.6760934591293335, | |
| "learning_rate": 0.0001065040650406504, | |
| "loss": 0.8228, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 0.792, | |
| "grad_norm": 2.275233268737793, | |
| "learning_rate": 0.00010569105691056911, | |
| "loss": 1.1372, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.7936, | |
| "grad_norm": 1.0748039484024048, | |
| "learning_rate": 0.00010487804878048781, | |
| "loss": 0.9031, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 0.7952, | |
| "grad_norm": 3.2387609481811523, | |
| "learning_rate": 0.0001040650406504065, | |
| "loss": 1.041, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 0.7968, | |
| "grad_norm": 1.7894922494888306, | |
| "learning_rate": 0.00010325203252032521, | |
| "loss": 0.6705, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 0.7984, | |
| "grad_norm": 3.0146548748016357, | |
| "learning_rate": 0.00010243902439024389, | |
| "loss": 0.9184, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 1.578597903251648, | |
| "learning_rate": 0.0001016260162601626, | |
| "loss": 0.9872, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_loss": 1.1235560178756714, | |
| "eval_runtime": 1107.329, | |
| "eval_samples_per_second": 4.515, | |
| "eval_steps_per_second": 4.515, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.8016, | |
| "grad_norm": 2.0832505226135254, | |
| "learning_rate": 0.00010081300813008131, | |
| "loss": 0.6787, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 0.8032, | |
| "grad_norm": 4.393614292144775, | |
| "learning_rate": 0.0001, | |
| "loss": 0.9292, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 0.8048, | |
| "grad_norm": 3.805360794067383, | |
| "learning_rate": 9.91869918699187e-05, | |
| "loss": 0.7421, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 0.8064, | |
| "grad_norm": 2.305285930633545, | |
| "learning_rate": 9.83739837398374e-05, | |
| "loss": 0.6064, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 0.808, | |
| "grad_norm": 4.427598476409912, | |
| "learning_rate": 9.75609756097561e-05, | |
| "loss": 0.8991, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.8096, | |
| "grad_norm": 0.9261614680290222, | |
| "learning_rate": 9.67479674796748e-05, | |
| "loss": 0.9912, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 0.8112, | |
| "grad_norm": 0.9871659874916077, | |
| "learning_rate": 9.593495934959349e-05, | |
| "loss": 0.575, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 0.8128, | |
| "grad_norm": 0.12044885754585266, | |
| "learning_rate": 9.51219512195122e-05, | |
| "loss": 0.4722, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 0.8144, | |
| "grad_norm": 1.28267240524292, | |
| "learning_rate": 9.43089430894309e-05, | |
| "loss": 0.6495, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 0.816, | |
| "grad_norm": 2.445477247238159, | |
| "learning_rate": 9.349593495934959e-05, | |
| "loss": 0.8108, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.8176, | |
| "grad_norm": 1.838616132736206, | |
| "learning_rate": 9.26829268292683e-05, | |
| "loss": 0.698, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 0.8192, | |
| "grad_norm": 1.8858362436294556, | |
| "learning_rate": 9.1869918699187e-05, | |
| "loss": 0.7465, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 0.8208, | |
| "grad_norm": 2.1843161582946777, | |
| "learning_rate": 9.10569105691057e-05, | |
| "loss": 0.7991, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 0.8224, | |
| "grad_norm": 2.6587400436401367, | |
| "learning_rate": 9.024390243902438e-05, | |
| "loss": 0.7246, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 0.824, | |
| "grad_norm": 0.21658068895339966, | |
| "learning_rate": 8.943089430894309e-05, | |
| "loss": 0.4838, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.8256, | |
| "grad_norm": 2.1727850437164307, | |
| "learning_rate": 8.86178861788618e-05, | |
| "loss": 0.9581, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 0.8272, | |
| "grad_norm": 0.5486516952514648, | |
| "learning_rate": 8.780487804878048e-05, | |
| "loss": 0.7398, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 0.8288, | |
| "grad_norm": 1.6711406707763672, | |
| "learning_rate": 8.699186991869919e-05, | |
| "loss": 0.8343, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 0.8304, | |
| "grad_norm": 0.38640347123146057, | |
| "learning_rate": 8.617886178861789e-05, | |
| "loss": 0.5283, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 0.832, | |
| "grad_norm": 0.7390656471252441, | |
| "learning_rate": 8.53658536585366e-05, | |
| "loss": 0.5333, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.8336, | |
| "grad_norm": 1.4642919301986694, | |
| "learning_rate": 8.455284552845529e-05, | |
| "loss": 0.9112, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 0.8352, | |
| "grad_norm": 2.632808208465576, | |
| "learning_rate": 8.373983739837398e-05, | |
| "loss": 0.7809, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 0.8368, | |
| "grad_norm": 1.728801965713501, | |
| "learning_rate": 8.292682926829269e-05, | |
| "loss": 1.0487, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 0.8384, | |
| "grad_norm": 2.3986566066741943, | |
| "learning_rate": 8.211382113821137e-05, | |
| "loss": 0.7501, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 1.8456178903579712, | |
| "learning_rate": 8.130081300813008e-05, | |
| "loss": 0.8797, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "eval_loss": 1.1170645952224731, | |
| "eval_runtime": 1154.3085, | |
| "eval_samples_per_second": 4.332, | |
| "eval_steps_per_second": 4.332, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.8416, | |
| "grad_norm": 1.5469543933868408, | |
| "learning_rate": 8.048780487804879e-05, | |
| "loss": 0.7024, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 0.8432, | |
| "grad_norm": 3.0582432746887207, | |
| "learning_rate": 7.967479674796748e-05, | |
| "loss": 0.8939, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 0.8448, | |
| "grad_norm": 2.4114296436309814, | |
| "learning_rate": 7.886178861788618e-05, | |
| "loss": 0.783, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 0.8464, | |
| "grad_norm": 1.299592137336731, | |
| "learning_rate": 7.804878048780487e-05, | |
| "loss": 0.7606, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 0.848, | |
| "grad_norm": 1.3195971250534058, | |
| "learning_rate": 7.723577235772358e-05, | |
| "loss": 0.7542, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.8496, | |
| "grad_norm": 2.526697874069214, | |
| "learning_rate": 7.642276422764228e-05, | |
| "loss": 0.9406, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 0.8512, | |
| "grad_norm": 1.22294020652771, | |
| "learning_rate": 7.560975609756097e-05, | |
| "loss": 1.3387, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 0.8528, | |
| "grad_norm": 0.27195674180984497, | |
| "learning_rate": 7.479674796747968e-05, | |
| "loss": 0.9164, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 0.8544, | |
| "grad_norm": 2.390148162841797, | |
| "learning_rate": 7.398373983739838e-05, | |
| "loss": 0.8772, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 0.856, | |
| "grad_norm": 2.907269239425659, | |
| "learning_rate": 7.317073170731707e-05, | |
| "loss": 0.7364, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.8576, | |
| "grad_norm": 1.9308322668075562, | |
| "learning_rate": 7.235772357723578e-05, | |
| "loss": 0.6116, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 0.8592, | |
| "grad_norm": 2.004450798034668, | |
| "learning_rate": 7.154471544715447e-05, | |
| "loss": 0.839, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 0.8608, | |
| "grad_norm": 2.5253965854644775, | |
| "learning_rate": 7.073170731707317e-05, | |
| "loss": 0.7411, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 0.8624, | |
| "grad_norm": 1.223568081855774, | |
| "learning_rate": 6.991869918699186e-05, | |
| "loss": 0.823, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 0.864, | |
| "grad_norm": 1.9021104574203491, | |
| "learning_rate": 6.910569105691057e-05, | |
| "loss": 0.8155, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.8656, | |
| "grad_norm": 1.9883354902267456, | |
| "learning_rate": 6.829268292682928e-05, | |
| "loss": 0.6916, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 0.8672, | |
| "grad_norm": 1.8869421482086182, | |
| "learning_rate": 6.747967479674796e-05, | |
| "loss": 0.7202, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 0.8688, | |
| "grad_norm": 1.137399673461914, | |
| "learning_rate": 6.666666666666667e-05, | |
| "loss": 0.8288, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 0.8704, | |
| "grad_norm": 1.6170374155044556, | |
| "learning_rate": 6.585365853658536e-05, | |
| "loss": 0.8518, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 0.872, | |
| "grad_norm": 1.8285601139068604, | |
| "learning_rate": 6.504065040650407e-05, | |
| "loss": 0.613, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.8736, | |
| "grad_norm": 2.910038948059082, | |
| "learning_rate": 6.422764227642277e-05, | |
| "loss": 0.8952, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 0.8752, | |
| "grad_norm": 1.7812882661819458, | |
| "learning_rate": 6.341463414634146e-05, | |
| "loss": 0.8391, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 0.8768, | |
| "grad_norm": 3.008392095565796, | |
| "learning_rate": 6.260162601626017e-05, | |
| "loss": 0.9243, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 0.8784, | |
| "grad_norm": 2.041684865951538, | |
| "learning_rate": 6.178861788617887e-05, | |
| "loss": 0.9482, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 2.1885712146759033, | |
| "learning_rate": 6.097560975609756e-05, | |
| "loss": 0.737, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "eval_loss": 1.1130776405334473, | |
| "eval_runtime": 1175.4432, | |
| "eval_samples_per_second": 4.254, | |
| "eval_steps_per_second": 4.254, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.8816, | |
| "grad_norm": 2.233348846435547, | |
| "learning_rate": 6.016260162601626e-05, | |
| "loss": 0.6539, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 0.8832, | |
| "grad_norm": 3.41200590133667, | |
| "learning_rate": 5.934959349593496e-05, | |
| "loss": 0.777, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 0.8848, | |
| "grad_norm": 1.883143663406372, | |
| "learning_rate": 5.8536585365853666e-05, | |
| "loss": 1.083, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 0.8864, | |
| "grad_norm": 1.179457664489746, | |
| "learning_rate": 5.772357723577236e-05, | |
| "loss": 0.5159, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 0.888, | |
| "grad_norm": 4.108737945556641, | |
| "learning_rate": 5.6910569105691056e-05, | |
| "loss": 1.2811, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 0.8896, | |
| "grad_norm": 2.099215507507324, | |
| "learning_rate": 5.609756097560976e-05, | |
| "loss": 0.7685, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 0.8912, | |
| "grad_norm": 5.275564193725586, | |
| "learning_rate": 5.528455284552845e-05, | |
| "loss": 1.1286, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 0.8928, | |
| "grad_norm": 1.1515026092529297, | |
| "learning_rate": 5.447154471544716e-05, | |
| "loss": 0.6888, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 0.8944, | |
| "grad_norm": 2.397169828414917, | |
| "learning_rate": 5.3658536585365855e-05, | |
| "loss": 0.8067, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 0.896, | |
| "grad_norm": 1.0115450620651245, | |
| "learning_rate": 5.284552845528456e-05, | |
| "loss": 0.4546, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.8976, | |
| "grad_norm": 1.9359952211380005, | |
| "learning_rate": 5.203252032520325e-05, | |
| "loss": 1.2266, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 0.8992, | |
| "grad_norm": 1.6608821153640747, | |
| "learning_rate": 5.1219512195121947e-05, | |
| "loss": 1.0713, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 0.9008, | |
| "grad_norm": 3.0019173622131348, | |
| "learning_rate": 5.0406504065040655e-05, | |
| "loss": 1.1104, | |
| "step": 5630 | |
| }, | |
| { | |
| "epoch": 0.9024, | |
| "grad_norm": 0.2102556973695755, | |
| "learning_rate": 4.959349593495935e-05, | |
| "loss": 0.9822, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 0.904, | |
| "grad_norm": 2.968538761138916, | |
| "learning_rate": 4.878048780487805e-05, | |
| "loss": 1.1946, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 0.9056, | |
| "grad_norm": 1.3731448650360107, | |
| "learning_rate": 4.7967479674796746e-05, | |
| "loss": 0.7431, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 0.9072, | |
| "grad_norm": 1.5175280570983887, | |
| "learning_rate": 4.715447154471545e-05, | |
| "loss": 0.6726, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 0.9088, | |
| "grad_norm": 3.332031011581421, | |
| "learning_rate": 4.634146341463415e-05, | |
| "loss": 0.6165, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 0.9104, | |
| "grad_norm": 2.3446781635284424, | |
| "learning_rate": 4.552845528455285e-05, | |
| "loss": 0.5643, | |
| "step": 5690 | |
| }, | |
| { | |
| "epoch": 0.912, | |
| "grad_norm": 1.1185941696166992, | |
| "learning_rate": 4.4715447154471546e-05, | |
| "loss": 1.1405, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.9136, | |
| "grad_norm": 3.511198043823242, | |
| "learning_rate": 4.390243902439024e-05, | |
| "loss": 1.0318, | |
| "step": 5710 | |
| }, | |
| { | |
| "epoch": 0.9152, | |
| "grad_norm": 1.842178463935852, | |
| "learning_rate": 4.308943089430894e-05, | |
| "loss": 0.7284, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 0.9168, | |
| "grad_norm": 2.8651535511016846, | |
| "learning_rate": 4.2276422764227644e-05, | |
| "loss": 0.8321, | |
| "step": 5730 | |
| }, | |
| { | |
| "epoch": 0.9184, | |
| "grad_norm": 2.988203763961792, | |
| "learning_rate": 4.1463414634146346e-05, | |
| "loss": 0.889, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.8294357061386108, | |
| "learning_rate": 4.065040650406504e-05, | |
| "loss": 0.9193, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "eval_loss": 1.1053773164749146, | |
| "eval_runtime": 1280.7262, | |
| "eval_samples_per_second": 3.904, | |
| "eval_steps_per_second": 3.904, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.9216, | |
| "grad_norm": 2.330263137817383, | |
| "learning_rate": 3.983739837398374e-05, | |
| "loss": 0.9316, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 0.9232, | |
| "grad_norm": 3.490957260131836, | |
| "learning_rate": 3.902439024390244e-05, | |
| "loss": 0.8297, | |
| "step": 5770 | |
| }, | |
| { | |
| "epoch": 0.9248, | |
| "grad_norm": 0.16155463457107544, | |
| "learning_rate": 3.821138211382114e-05, | |
| "loss": 0.5396, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 0.9264, | |
| "grad_norm": 3.8618061542510986, | |
| "learning_rate": 3.739837398373984e-05, | |
| "loss": 1.1497, | |
| "step": 5790 | |
| }, | |
| { | |
| "epoch": 0.928, | |
| "grad_norm": 2.8210208415985107, | |
| "learning_rate": 3.6585365853658535e-05, | |
| "loss": 1.3421, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.9296, | |
| "grad_norm": 3.1744306087493896, | |
| "learning_rate": 3.577235772357724e-05, | |
| "loss": 1.0488, | |
| "step": 5810 | |
| }, | |
| { | |
| "epoch": 0.9312, | |
| "grad_norm": 1.5382752418518066, | |
| "learning_rate": 3.495934959349593e-05, | |
| "loss": 0.6032, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 0.9328, | |
| "grad_norm": 1.915822148323059, | |
| "learning_rate": 3.414634146341464e-05, | |
| "loss": 0.7809, | |
| "step": 5830 | |
| }, | |
| { | |
| "epoch": 0.9344, | |
| "grad_norm": 0.8540381193161011, | |
| "learning_rate": 3.3333333333333335e-05, | |
| "loss": 0.6, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 0.936, | |
| "grad_norm": 1.4163843393325806, | |
| "learning_rate": 3.2520325203252037e-05, | |
| "loss": 0.6286, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 0.9376, | |
| "grad_norm": 0.13695108890533447, | |
| "learning_rate": 3.170731707317073e-05, | |
| "loss": 1.2472, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 0.9392, | |
| "grad_norm": 1.5286403894424438, | |
| "learning_rate": 3.089430894308943e-05, | |
| "loss": 1.0632, | |
| "step": 5870 | |
| }, | |
| { | |
| "epoch": 0.9408, | |
| "grad_norm": 1.7806613445281982, | |
| "learning_rate": 3.008130081300813e-05, | |
| "loss": 0.7279, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 0.9424, | |
| "grad_norm": 1.5880378484725952, | |
| "learning_rate": 2.9268292682926833e-05, | |
| "loss": 0.8116, | |
| "step": 5890 | |
| }, | |
| { | |
| "epoch": 0.944, | |
| "grad_norm": 0.9348939061164856, | |
| "learning_rate": 2.8455284552845528e-05, | |
| "loss": 0.6383, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.9456, | |
| "grad_norm": 2.188812732696533, | |
| "learning_rate": 2.7642276422764226e-05, | |
| "loss": 0.8308, | |
| "step": 5910 | |
| }, | |
| { | |
| "epoch": 0.9472, | |
| "grad_norm": 1.4115760326385498, | |
| "learning_rate": 2.6829268292682928e-05, | |
| "loss": 0.8653, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 0.9488, | |
| "grad_norm": 3.2353076934814453, | |
| "learning_rate": 2.6016260162601626e-05, | |
| "loss": 0.8548, | |
| "step": 5930 | |
| }, | |
| { | |
| "epoch": 0.9504, | |
| "grad_norm": 1.7031910419464111, | |
| "learning_rate": 2.5203252032520327e-05, | |
| "loss": 0.7358, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 0.952, | |
| "grad_norm": 0.33889999985694885, | |
| "learning_rate": 2.4390243902439026e-05, | |
| "loss": 0.6915, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 0.9536, | |
| "grad_norm": 1.296474575996399, | |
| "learning_rate": 2.3577235772357724e-05, | |
| "loss": 0.7619, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 0.9552, | |
| "grad_norm": 2.5524513721466064, | |
| "learning_rate": 2.2764227642276426e-05, | |
| "loss": 0.8742, | |
| "step": 5970 | |
| }, | |
| { | |
| "epoch": 0.9568, | |
| "grad_norm": 1.5270932912826538, | |
| "learning_rate": 2.195121951219512e-05, | |
| "loss": 0.628, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 0.9584, | |
| "grad_norm": 1.3427207469940186, | |
| "learning_rate": 2.1138211382113822e-05, | |
| "loss": 0.8086, | |
| "step": 5990 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 2.8533713817596436, | |
| "learning_rate": 2.032520325203252e-05, | |
| "loss": 0.8316, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "eval_loss": 1.1065137386322021, | |
| "eval_runtime": 1285.8072, | |
| "eval_samples_per_second": 3.889, | |
| "eval_steps_per_second": 3.889, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.9616, | |
| "grad_norm": 2.6748170852661133, | |
| "learning_rate": 1.951219512195122e-05, | |
| "loss": 1.2344, | |
| "step": 6010 | |
| }, | |
| { | |
| "epoch": 0.9632, | |
| "grad_norm": 2.6799983978271484, | |
| "learning_rate": 1.869918699186992e-05, | |
| "loss": 0.7761, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 0.9648, | |
| "grad_norm": 1.6079151630401611, | |
| "learning_rate": 1.788617886178862e-05, | |
| "loss": 0.4993, | |
| "step": 6030 | |
| }, | |
| { | |
| "epoch": 0.9664, | |
| "grad_norm": 4.892763137817383, | |
| "learning_rate": 1.707317073170732e-05, | |
| "loss": 0.8844, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 0.968, | |
| "grad_norm": 3.5101561546325684, | |
| "learning_rate": 1.6260162601626018e-05, | |
| "loss": 0.821, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 0.9696, | |
| "grad_norm": 1.485048532485962, | |
| "learning_rate": 1.5447154471544717e-05, | |
| "loss": 1.1131, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 0.9712, | |
| "grad_norm": 2.223806858062744, | |
| "learning_rate": 1.4634146341463416e-05, | |
| "loss": 0.6271, | |
| "step": 6070 | |
| }, | |
| { | |
| "epoch": 0.9728, | |
| "grad_norm": 1.194604516029358, | |
| "learning_rate": 1.3821138211382113e-05, | |
| "loss": 0.8082, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 0.9744, | |
| "grad_norm": 1.4954756498336792, | |
| "learning_rate": 1.3008130081300813e-05, | |
| "loss": 0.6707, | |
| "step": 6090 | |
| }, | |
| { | |
| "epoch": 0.976, | |
| "grad_norm": 4.3971028327941895, | |
| "learning_rate": 1.2195121951219513e-05, | |
| "loss": 0.9261, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.9776, | |
| "grad_norm": 3.624100923538208, | |
| "learning_rate": 1.1382113821138213e-05, | |
| "loss": 1.2992, | |
| "step": 6110 | |
| }, | |
| { | |
| "epoch": 0.9792, | |
| "grad_norm": 1.0655690431594849, | |
| "learning_rate": 1.0569105691056911e-05, | |
| "loss": 0.7596, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 0.9808, | |
| "grad_norm": 2.3329694271087646, | |
| "learning_rate": 9.75609756097561e-06, | |
| "loss": 0.7903, | |
| "step": 6130 | |
| }, | |
| { | |
| "epoch": 0.9824, | |
| "grad_norm": 2.0594286918640137, | |
| "learning_rate": 8.94308943089431e-06, | |
| "loss": 0.6795, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 0.984, | |
| "grad_norm": 2.90356707572937, | |
| "learning_rate": 8.130081300813009e-06, | |
| "loss": 0.8201, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 0.9856, | |
| "grad_norm": 1.485250473022461, | |
| "learning_rate": 7.317073170731708e-06, | |
| "loss": 0.6455, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 0.9872, | |
| "grad_norm": 1.0751053094863892, | |
| "learning_rate": 6.5040650406504065e-06, | |
| "loss": 0.6323, | |
| "step": 6170 | |
| }, | |
| { | |
| "epoch": 0.9888, | |
| "grad_norm": 0.8699440956115723, | |
| "learning_rate": 5.691056910569106e-06, | |
| "loss": 0.7094, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 0.9904, | |
| "grad_norm": 1.9519120454788208, | |
| "learning_rate": 4.878048780487805e-06, | |
| "loss": 1.0605, | |
| "step": 6190 | |
| }, | |
| { | |
| "epoch": 0.992, | |
| "grad_norm": 1.3058487176895142, | |
| "learning_rate": 4.0650406504065046e-06, | |
| "loss": 0.8777, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.9936, | |
| "grad_norm": 4.6405229568481445, | |
| "learning_rate": 3.2520325203252032e-06, | |
| "loss": 0.8997, | |
| "step": 6210 | |
| }, | |
| { | |
| "epoch": 0.9952, | |
| "grad_norm": 1.5533032417297363, | |
| "learning_rate": 2.4390243902439023e-06, | |
| "loss": 0.635, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 0.9968, | |
| "grad_norm": 1.528320074081421, | |
| "learning_rate": 1.6260162601626016e-06, | |
| "loss": 0.9057, | |
| "step": 6230 | |
| }, | |
| { | |
| "epoch": 0.9984, | |
| "grad_norm": 0.9600651860237122, | |
| "learning_rate": 8.130081300813008e-07, | |
| "loss": 0.9465, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 2.0253632068634033, | |
| "learning_rate": 0.0, | |
| "loss": 0.7621, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 1.101412057876587, | |
| "eval_runtime": 1362.455, | |
| "eval_samples_per_second": 3.67, | |
| "eval_steps_per_second": 3.67, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "step": 6250, | |
| "total_flos": 2.345137078272e+17, | |
| "train_loss": 0.958240804862976, | |
| "train_runtime": 54923.8414, | |
| "train_samples_per_second": 0.91, | |
| "train_steps_per_second": 0.114 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 6250, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 3, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 0 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.345137078272e+17, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |