| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.9974293059125965, | |
| "eval_steps": 500, | |
| "global_step": 1749, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.01713796058269066, | |
| "grad_norm": 0.4733077883720398, | |
| "learning_rate": 4.971412235563179e-05, | |
| "loss": 2.5254, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.03427592116538132, | |
| "grad_norm": 0.44018083810806274, | |
| "learning_rate": 4.9428244711263584e-05, | |
| "loss": 1.8284, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.05141388174807198, | |
| "grad_norm": 0.7213996648788452, | |
| "learning_rate": 4.914236706689537e-05, | |
| "loss": 1.2128, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.06855184233076264, | |
| "grad_norm": 0.6227633953094482, | |
| "learning_rate": 4.8856489422527165e-05, | |
| "loss": 0.6727, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.0856898029134533, | |
| "grad_norm": 0.2804420590400696, | |
| "learning_rate": 4.8570611778158946e-05, | |
| "loss": 0.3089, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.10282776349614396, | |
| "grad_norm": 0.10756277292966843, | |
| "learning_rate": 4.828473413379074e-05, | |
| "loss": 0.2448, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.11996572407883462, | |
| "grad_norm": 0.11257284879684448, | |
| "learning_rate": 4.799885648942253e-05, | |
| "loss": 0.2268, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.13710368466152528, | |
| "grad_norm": 0.09793874621391296, | |
| "learning_rate": 4.771297884505432e-05, | |
| "loss": 0.2328, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.15424164524421594, | |
| "grad_norm": 0.10177452117204666, | |
| "learning_rate": 4.742710120068611e-05, | |
| "loss": 0.2183, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.1713796058269066, | |
| "grad_norm": 0.09770502895116806, | |
| "learning_rate": 4.7141223556317895e-05, | |
| "loss": 0.1971, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.18851756640959727, | |
| "grad_norm": 0.11037880927324295, | |
| "learning_rate": 4.685534591194969e-05, | |
| "loss": 0.1996, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.20565552699228792, | |
| "grad_norm": 0.11869871616363525, | |
| "learning_rate": 4.656946826758148e-05, | |
| "loss": 0.2044, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.22279348757497858, | |
| "grad_norm": 0.10279687494039536, | |
| "learning_rate": 4.628359062321327e-05, | |
| "loss": 0.1782, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.23993144815766923, | |
| "grad_norm": 0.10952438414096832, | |
| "learning_rate": 4.599771297884506e-05, | |
| "loss": 0.1837, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.2570694087403599, | |
| "grad_norm": 0.13122445344924927, | |
| "learning_rate": 4.5711835334476845e-05, | |
| "loss": 0.1907, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.27420736932305056, | |
| "grad_norm": 0.11156395077705383, | |
| "learning_rate": 4.542595769010863e-05, | |
| "loss": 0.1748, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.2913453299057412, | |
| "grad_norm": 0.1657029241323471, | |
| "learning_rate": 4.5140080045740427e-05, | |
| "loss": 0.1704, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.30848329048843187, | |
| "grad_norm": 0.11676887422800064, | |
| "learning_rate": 4.4854202401372214e-05, | |
| "loss": 0.16, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.32562125107112255, | |
| "grad_norm": 0.17466452717781067, | |
| "learning_rate": 4.4568324757004e-05, | |
| "loss": 0.1705, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.3427592116538132, | |
| "grad_norm": 0.1373518407344818, | |
| "learning_rate": 4.4282447112635795e-05, | |
| "loss": 0.1663, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.35989717223650386, | |
| "grad_norm": 0.11815926432609558, | |
| "learning_rate": 4.399656946826758e-05, | |
| "loss": 0.1562, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.37703513281919454, | |
| "grad_norm": 0.12364204227924347, | |
| "learning_rate": 4.3710691823899376e-05, | |
| "loss": 0.1655, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.39417309340188517, | |
| "grad_norm": 0.11595863103866577, | |
| "learning_rate": 4.3424814179531164e-05, | |
| "loss": 0.1621, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.41131105398457585, | |
| "grad_norm": 0.11320952326059341, | |
| "learning_rate": 4.313893653516296e-05, | |
| "loss": 0.1523, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.4284490145672665, | |
| "grad_norm": 0.12280760705471039, | |
| "learning_rate": 4.285305889079474e-05, | |
| "loss": 0.1711, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.44558697514995715, | |
| "grad_norm": 0.12085650116205215, | |
| "learning_rate": 4.256718124642653e-05, | |
| "loss": 0.1548, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.46272493573264784, | |
| "grad_norm": 0.1491166651248932, | |
| "learning_rate": 4.228130360205832e-05, | |
| "loss": 0.1631, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.47986289631533846, | |
| "grad_norm": 0.16156260669231415, | |
| "learning_rate": 4.199542595769011e-05, | |
| "loss": 0.1624, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.49700085689802914, | |
| "grad_norm": 0.12156664580106735, | |
| "learning_rate": 4.17095483133219e-05, | |
| "loss": 0.1383, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.5141388174807198, | |
| "grad_norm": 0.11533980071544647, | |
| "learning_rate": 4.142367066895369e-05, | |
| "loss": 0.1539, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.5312767780634104, | |
| "grad_norm": 0.1386200189590454, | |
| "learning_rate": 4.113779302458548e-05, | |
| "loss": 0.1525, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.5484147386461011, | |
| "grad_norm": 0.16111765801906586, | |
| "learning_rate": 4.085191538021727e-05, | |
| "loss": 0.14, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.5655526992287918, | |
| "grad_norm": 0.151380717754364, | |
| "learning_rate": 4.0566037735849064e-05, | |
| "loss": 0.1492, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.5826906598114824, | |
| "grad_norm": 0.15228472650051117, | |
| "learning_rate": 4.028016009148085e-05, | |
| "loss": 0.157, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.5998286203941731, | |
| "grad_norm": 0.11199972033500671, | |
| "learning_rate": 3.999428244711264e-05, | |
| "loss": 0.1492, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.6169665809768637, | |
| "grad_norm": 0.17204181849956512, | |
| "learning_rate": 3.9708404802744425e-05, | |
| "loss": 0.1504, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.6341045415595544, | |
| "grad_norm": 0.13271720707416534, | |
| "learning_rate": 3.942252715837621e-05, | |
| "loss": 0.1604, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.6512425021422451, | |
| "grad_norm": 0.15998151898384094, | |
| "learning_rate": 3.913664951400801e-05, | |
| "loss": 0.148, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.6683804627249358, | |
| "grad_norm": 0.12898313999176025, | |
| "learning_rate": 3.8850771869639794e-05, | |
| "loss": 0.1492, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.6855184233076264, | |
| "grad_norm": 0.13998836278915405, | |
| "learning_rate": 3.856489422527159e-05, | |
| "loss": 0.1548, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.702656383890317, | |
| "grad_norm": 0.14940115809440613, | |
| "learning_rate": 3.8279016580903375e-05, | |
| "loss": 0.1432, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.7197943444730077, | |
| "grad_norm": 0.13358236849308014, | |
| "learning_rate": 3.799313893653517e-05, | |
| "loss": 0.1459, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.7369323050556984, | |
| "grad_norm": 0.1597578376531601, | |
| "learning_rate": 3.7707261292166957e-05, | |
| "loss": 0.1351, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.7540702656383891, | |
| "grad_norm": 0.12782897055149078, | |
| "learning_rate": 3.7421383647798744e-05, | |
| "loss": 0.1392, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.7712082262210797, | |
| "grad_norm": 0.13537828624248505, | |
| "learning_rate": 3.713550600343053e-05, | |
| "loss": 0.1402, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.7883461868037703, | |
| "grad_norm": 0.17046277225017548, | |
| "learning_rate": 3.684962835906232e-05, | |
| "loss": 0.1227, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.805484147386461, | |
| "grad_norm": 0.16829894483089447, | |
| "learning_rate": 3.656375071469411e-05, | |
| "loss": 0.133, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.8226221079691517, | |
| "grad_norm": 0.17760640382766724, | |
| "learning_rate": 3.62778730703259e-05, | |
| "loss": 0.1361, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.8397600685518424, | |
| "grad_norm": 0.1783646047115326, | |
| "learning_rate": 3.5991995425957694e-05, | |
| "loss": 0.1272, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.856898029134533, | |
| "grad_norm": 0.1848060041666031, | |
| "learning_rate": 3.570611778158948e-05, | |
| "loss": 0.1281, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.8740359897172236, | |
| "grad_norm": 0.1244303435087204, | |
| "learning_rate": 3.5420240137221275e-05, | |
| "loss": 0.1276, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.8911739502999143, | |
| "grad_norm": 0.1454310566186905, | |
| "learning_rate": 3.513436249285306e-05, | |
| "loss": 0.1334, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.908311910882605, | |
| "grad_norm": 0.17194361984729767, | |
| "learning_rate": 3.484848484848485e-05, | |
| "loss": 0.1271, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.9254498714652957, | |
| "grad_norm": 0.15851227939128876, | |
| "learning_rate": 3.456260720411664e-05, | |
| "loss": 0.1304, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.9425878320479862, | |
| "grad_norm": 0.1614447981119156, | |
| "learning_rate": 3.4276729559748424e-05, | |
| "loss": 0.1256, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.9597257926306769, | |
| "grad_norm": 0.19016973674297333, | |
| "learning_rate": 3.399085191538022e-05, | |
| "loss": 0.1329, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.9768637532133676, | |
| "grad_norm": 0.15122413635253906, | |
| "learning_rate": 3.3704974271012005e-05, | |
| "loss": 0.1228, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.9940017137960583, | |
| "grad_norm": 0.15631020069122314, | |
| "learning_rate": 3.34190966266438e-05, | |
| "loss": 0.1235, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.9991431019708654, | |
| "eval_loss": 0.15606513619422913, | |
| "eval_runtime": 439.0741, | |
| "eval_samples_per_second": 10.932, | |
| "eval_steps_per_second": 1.367, | |
| "step": 583 | |
| }, | |
| { | |
| "epoch": 1.0111396743787489, | |
| "grad_norm": 0.20745624601840973, | |
| "learning_rate": 3.313321898227559e-05, | |
| "loss": 0.1245, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.0282776349614395, | |
| "grad_norm": 0.182315856218338, | |
| "learning_rate": 3.284734133790738e-05, | |
| "loss": 0.1112, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.0454155955441302, | |
| "grad_norm": 0.1425042450428009, | |
| "learning_rate": 3.256146369353917e-05, | |
| "loss": 0.1186, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.062553556126821, | |
| "grad_norm": 0.1898319274187088, | |
| "learning_rate": 3.2275586049170955e-05, | |
| "loss": 0.1203, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.0796915167095116, | |
| "grad_norm": 0.17110544443130493, | |
| "learning_rate": 3.198970840480275e-05, | |
| "loss": 0.1223, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.0968294772922023, | |
| "grad_norm": 0.17418451607227325, | |
| "learning_rate": 3.170383076043453e-05, | |
| "loss": 0.111, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.113967437874893, | |
| "grad_norm": 0.19765284657478333, | |
| "learning_rate": 3.1417953116066324e-05, | |
| "loss": 0.1173, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.1311053984575836, | |
| "grad_norm": 0.17013542354106903, | |
| "learning_rate": 3.113207547169811e-05, | |
| "loss": 0.1128, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.1482433590402743, | |
| "grad_norm": 0.21173644065856934, | |
| "learning_rate": 3.0846197827329905e-05, | |
| "loss": 0.109, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.165381319622965, | |
| "grad_norm": 0.13383643329143524, | |
| "learning_rate": 3.056032018296169e-05, | |
| "loss": 0.1092, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.1825192802056554, | |
| "grad_norm": 0.22101104259490967, | |
| "learning_rate": 3.0274442538593483e-05, | |
| "loss": 0.1175, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.1996572407883461, | |
| "grad_norm": 0.1745050698518753, | |
| "learning_rate": 2.9988564894225274e-05, | |
| "loss": 0.1096, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.2167952013710368, | |
| "grad_norm": 0.2413550466299057, | |
| "learning_rate": 2.9702687249857064e-05, | |
| "loss": 0.1044, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.2339331619537275, | |
| "grad_norm": 0.15909789502620697, | |
| "learning_rate": 2.9416809605488855e-05, | |
| "loss": 0.1076, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.2510711225364182, | |
| "grad_norm": 0.21382008492946625, | |
| "learning_rate": 2.9130931961120646e-05, | |
| "loss": 0.1077, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.2682090831191088, | |
| "grad_norm": 0.1845552772283554, | |
| "learning_rate": 2.884505431675243e-05, | |
| "loss": 0.102, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.2853470437017995, | |
| "grad_norm": 0.19705334305763245, | |
| "learning_rate": 2.855917667238422e-05, | |
| "loss": 0.1048, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.3024850042844902, | |
| "grad_norm": 0.16979779303073883, | |
| "learning_rate": 2.827329902801601e-05, | |
| "loss": 0.1012, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.3196229648671807, | |
| "grad_norm": 0.18766264617443085, | |
| "learning_rate": 2.7987421383647798e-05, | |
| "loss": 0.1043, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.3367609254498714, | |
| "grad_norm": 0.1593862771987915, | |
| "learning_rate": 2.770154373927959e-05, | |
| "loss": 0.0953, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.353898886032562, | |
| "grad_norm": 0.19427362084388733, | |
| "learning_rate": 2.741566609491138e-05, | |
| "loss": 0.1008, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.3710368466152527, | |
| "grad_norm": 0.21518777310848236, | |
| "learning_rate": 2.712978845054317e-05, | |
| "loss": 0.0973, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.3881748071979434, | |
| "grad_norm": 0.2065572291612625, | |
| "learning_rate": 2.684391080617496e-05, | |
| "loss": 0.1035, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.405312767780634, | |
| "grad_norm": 0.26617249846458435, | |
| "learning_rate": 2.655803316180675e-05, | |
| "loss": 0.0969, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.4224507283633248, | |
| "grad_norm": 0.17817485332489014, | |
| "learning_rate": 2.627215551743854e-05, | |
| "loss": 0.0944, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.4395886889460154, | |
| "grad_norm": 0.17743416130542755, | |
| "learning_rate": 2.5986277873070326e-05, | |
| "loss": 0.0984, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.4567266495287061, | |
| "grad_norm": 0.23810917139053345, | |
| "learning_rate": 2.5700400228702117e-05, | |
| "loss": 0.099, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.4738646101113968, | |
| "grad_norm": 0.18336652219295502, | |
| "learning_rate": 2.5414522584333904e-05, | |
| "loss": 0.0954, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.4910025706940875, | |
| "grad_norm": 0.22074759006500244, | |
| "learning_rate": 2.5128644939965695e-05, | |
| "loss": 0.0915, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.5081405312767782, | |
| "grad_norm": 0.2292771190404892, | |
| "learning_rate": 2.4842767295597485e-05, | |
| "loss": 0.0931, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.5252784918594688, | |
| "grad_norm": 0.27021974325180054, | |
| "learning_rate": 2.4556889651229276e-05, | |
| "loss": 0.0933, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.5424164524421595, | |
| "grad_norm": 0.28702589869499207, | |
| "learning_rate": 2.4271012006861067e-05, | |
| "loss": 0.0978, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.5595544130248502, | |
| "grad_norm": 0.18202678859233856, | |
| "learning_rate": 2.3985134362492854e-05, | |
| "loss": 0.0908, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.5766923736075407, | |
| "grad_norm": 0.15822364389896393, | |
| "learning_rate": 2.3699256718124644e-05, | |
| "loss": 0.0878, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.5938303341902313, | |
| "grad_norm": 0.21551179885864258, | |
| "learning_rate": 2.3413379073756435e-05, | |
| "loss": 0.0883, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.610968294772922, | |
| "grad_norm": 0.20409446954727173, | |
| "learning_rate": 2.3127501429388222e-05, | |
| "loss": 0.094, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.6281062553556127, | |
| "grad_norm": 0.16509220004081726, | |
| "learning_rate": 2.2841623785020013e-05, | |
| "loss": 0.0909, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.6452442159383034, | |
| "grad_norm": 0.1946859210729599, | |
| "learning_rate": 2.25557461406518e-05, | |
| "loss": 0.0898, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.6623821765209938, | |
| "grad_norm": 0.16525417566299438, | |
| "learning_rate": 2.226986849628359e-05, | |
| "loss": 0.0842, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.6795201371036845, | |
| "grad_norm": 0.26819315552711487, | |
| "learning_rate": 2.198399085191538e-05, | |
| "loss": 0.0897, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.6966580976863752, | |
| "grad_norm": 0.273593008518219, | |
| "learning_rate": 2.1698113207547172e-05, | |
| "loss": 0.0895, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.713796058269066, | |
| "grad_norm": 0.20378242433071136, | |
| "learning_rate": 2.1412235563178963e-05, | |
| "loss": 0.0912, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.7309340188517566, | |
| "grad_norm": 0.24366410076618195, | |
| "learning_rate": 2.112635791881075e-05, | |
| "loss": 0.0843, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 1.7480719794344473, | |
| "grad_norm": 0.2034538835287094, | |
| "learning_rate": 2.084048027444254e-05, | |
| "loss": 0.0874, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 1.765209940017138, | |
| "grad_norm": 0.2260463535785675, | |
| "learning_rate": 2.0554602630074328e-05, | |
| "loss": 0.0898, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 1.7823479005998286, | |
| "grad_norm": 0.165971040725708, | |
| "learning_rate": 2.026872498570612e-05, | |
| "loss": 0.0867, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.7994858611825193, | |
| "grad_norm": 0.20090453326702118, | |
| "learning_rate": 1.998284734133791e-05, | |
| "loss": 0.0891, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.81662382176521, | |
| "grad_norm": 0.24718008935451508, | |
| "learning_rate": 1.9696969696969697e-05, | |
| "loss": 0.0872, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 1.8337617823479007, | |
| "grad_norm": 0.2808043360710144, | |
| "learning_rate": 1.9411092052601487e-05, | |
| "loss": 0.0889, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 1.8508997429305913, | |
| "grad_norm": 0.19833321869373322, | |
| "learning_rate": 1.9125214408233278e-05, | |
| "loss": 0.0859, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.868037703513282, | |
| "grad_norm": 0.21585367619991302, | |
| "learning_rate": 1.883933676386507e-05, | |
| "loss": 0.0872, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 1.8851756640959727, | |
| "grad_norm": 0.16208423674106598, | |
| "learning_rate": 1.8553459119496856e-05, | |
| "loss": 0.0835, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.9023136246786634, | |
| "grad_norm": 0.195535346865654, | |
| "learning_rate": 1.8267581475128647e-05, | |
| "loss": 0.0785, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.919451585261354, | |
| "grad_norm": 0.22447216510772705, | |
| "learning_rate": 1.7981703830760434e-05, | |
| "loss": 0.08, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 1.9365895458440445, | |
| "grad_norm": 0.22212448716163635, | |
| "learning_rate": 1.7695826186392224e-05, | |
| "loss": 0.0826, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 1.9537275064267352, | |
| "grad_norm": 0.2077985554933548, | |
| "learning_rate": 1.7409948542024015e-05, | |
| "loss": 0.0822, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 1.9708654670094259, | |
| "grad_norm": 0.20502477884292603, | |
| "learning_rate": 1.7124070897655802e-05, | |
| "loss": 0.0817, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.9880034275921166, | |
| "grad_norm": 0.22330917418003082, | |
| "learning_rate": 1.6838193253287593e-05, | |
| "loss": 0.0837, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.19116699695587158, | |
| "eval_runtime": 439.6787, | |
| "eval_samples_per_second": 10.917, | |
| "eval_steps_per_second": 1.365, | |
| "step": 1167 | |
| }, | |
| { | |
| "epoch": 2.005141388174807, | |
| "grad_norm": 0.1825590431690216, | |
| "learning_rate": 1.6552315608919384e-05, | |
| "loss": 0.0764, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 2.0222793487574977, | |
| "grad_norm": 0.245217964053154, | |
| "learning_rate": 1.6266437964551174e-05, | |
| "loss": 0.0776, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 2.0394173093401884, | |
| "grad_norm": 0.185066357254982, | |
| "learning_rate": 1.5980560320182965e-05, | |
| "loss": 0.0736, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 2.056555269922879, | |
| "grad_norm": 0.1703159511089325, | |
| "learning_rate": 1.5694682675814752e-05, | |
| "loss": 0.075, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 2.0736932305055698, | |
| "grad_norm": 0.19329093396663666, | |
| "learning_rate": 1.540880503144654e-05, | |
| "loss": 0.0789, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 2.0908311910882604, | |
| "grad_norm": 0.22116591036319733, | |
| "learning_rate": 1.5122927387078332e-05, | |
| "loss": 0.0761, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 2.107969151670951, | |
| "grad_norm": 0.17945989966392517, | |
| "learning_rate": 1.4837049742710121e-05, | |
| "loss": 0.0766, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 2.125107112253642, | |
| "grad_norm": 0.21690496802330017, | |
| "learning_rate": 1.4551172098341912e-05, | |
| "loss": 0.0748, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 2.1422450728363325, | |
| "grad_norm": 0.17853769659996033, | |
| "learning_rate": 1.4265294453973699e-05, | |
| "loss": 0.0755, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 2.159383033419023, | |
| "grad_norm": 0.18670514225959778, | |
| "learning_rate": 1.397941680960549e-05, | |
| "loss": 0.0789, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 2.176520994001714, | |
| "grad_norm": 0.2004610300064087, | |
| "learning_rate": 1.3693539165237278e-05, | |
| "loss": 0.0771, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 2.1936589545844045, | |
| "grad_norm": 0.17286266386508942, | |
| "learning_rate": 1.3407661520869069e-05, | |
| "loss": 0.0757, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 2.210796915167095, | |
| "grad_norm": 0.21835772693157196, | |
| "learning_rate": 1.312178387650086e-05, | |
| "loss": 0.0764, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 2.227934875749786, | |
| "grad_norm": 0.2717360854148865, | |
| "learning_rate": 1.2835906232132647e-05, | |
| "loss": 0.0761, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.2450728363324766, | |
| "grad_norm": 0.17896392941474915, | |
| "learning_rate": 1.2550028587764438e-05, | |
| "loss": 0.0748, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 2.2622107969151672, | |
| "grad_norm": 0.20612064003944397, | |
| "learning_rate": 1.2264150943396227e-05, | |
| "loss": 0.0745, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 2.279348757497858, | |
| "grad_norm": 0.25621137022972107, | |
| "learning_rate": 1.1978273299028017e-05, | |
| "loss": 0.0754, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 2.2964867180805486, | |
| "grad_norm": 0.1826545149087906, | |
| "learning_rate": 1.1692395654659806e-05, | |
| "loss": 0.0738, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 2.3136246786632393, | |
| "grad_norm": 0.22315889596939087, | |
| "learning_rate": 1.1406518010291597e-05, | |
| "loss": 0.0738, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 2.33076263924593, | |
| "grad_norm": 0.2433796525001526, | |
| "learning_rate": 1.1120640365923384e-05, | |
| "loss": 0.0746, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 2.34790059982862, | |
| "grad_norm": 0.19135648012161255, | |
| "learning_rate": 1.0834762721555175e-05, | |
| "loss": 0.0745, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 2.365038560411311, | |
| "grad_norm": 0.21853765845298767, | |
| "learning_rate": 1.0548885077186965e-05, | |
| "loss": 0.0751, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 2.3821765209940016, | |
| "grad_norm": 0.23422804474830627, | |
| "learning_rate": 1.0263007432818754e-05, | |
| "loss": 0.0736, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 2.3993144815766922, | |
| "grad_norm": 0.22039854526519775, | |
| "learning_rate": 9.977129788450543e-06, | |
| "loss": 0.0722, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.416452442159383, | |
| "grad_norm": 0.20668627321720123, | |
| "learning_rate": 9.691252144082332e-06, | |
| "loss": 0.074, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 2.4335904027420736, | |
| "grad_norm": 0.3503040671348572, | |
| "learning_rate": 9.405374499714123e-06, | |
| "loss": 0.0786, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 2.4507283633247643, | |
| "grad_norm": 0.2581626772880554, | |
| "learning_rate": 9.119496855345912e-06, | |
| "loss": 0.0721, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 2.467866323907455, | |
| "grad_norm": 0.2439981997013092, | |
| "learning_rate": 8.833619210977703e-06, | |
| "loss": 0.0733, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 2.4850042844901457, | |
| "grad_norm": 0.16303370893001556, | |
| "learning_rate": 8.547741566609492e-06, | |
| "loss": 0.0727, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 2.5021422450728363, | |
| "grad_norm": 0.17133218050003052, | |
| "learning_rate": 8.26186392224128e-06, | |
| "loss": 0.0722, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 2.519280205655527, | |
| "grad_norm": 0.2298302799463272, | |
| "learning_rate": 7.975986277873071e-06, | |
| "loss": 0.0717, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 2.5364181662382177, | |
| "grad_norm": 0.22220905125141144, | |
| "learning_rate": 7.69010863350486e-06, | |
| "loss": 0.0763, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 2.5535561268209084, | |
| "grad_norm": 0.2196768969297409, | |
| "learning_rate": 7.40423098913665e-06, | |
| "loss": 0.0694, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 2.570694087403599, | |
| "grad_norm": 0.23876038193702698, | |
| "learning_rate": 7.11835334476844e-06, | |
| "loss": 0.0702, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.5878320479862897, | |
| "grad_norm": 0.2197275310754776, | |
| "learning_rate": 6.832475700400229e-06, | |
| "loss": 0.0731, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 2.6049700085689804, | |
| "grad_norm": 0.1521027684211731, | |
| "learning_rate": 6.5465980560320186e-06, | |
| "loss": 0.0706, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 2.622107969151671, | |
| "grad_norm": 0.2315615713596344, | |
| "learning_rate": 6.2607204116638075e-06, | |
| "loss": 0.0699, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 2.6392459297343613, | |
| "grad_norm": 0.22933197021484375, | |
| "learning_rate": 5.974842767295598e-06, | |
| "loss": 0.0726, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 2.656383890317052, | |
| "grad_norm": 0.23843450844287872, | |
| "learning_rate": 5.688965122927387e-06, | |
| "loss": 0.0696, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 2.6735218508997427, | |
| "grad_norm": 0.2547082304954529, | |
| "learning_rate": 5.403087478559177e-06, | |
| "loss": 0.0697, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 2.6906598114824334, | |
| "grad_norm": 0.19213077425956726, | |
| "learning_rate": 5.117209834190966e-06, | |
| "loss": 0.0725, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 2.707797772065124, | |
| "grad_norm": 0.19462022185325623, | |
| "learning_rate": 4.8313321898227566e-06, | |
| "loss": 0.0707, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 2.7249357326478147, | |
| "grad_norm": 0.22448916733264923, | |
| "learning_rate": 4.5454545454545455e-06, | |
| "loss": 0.0708, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 2.7420736932305054, | |
| "grad_norm": 0.22550085186958313, | |
| "learning_rate": 4.259576901086335e-06, | |
| "loss": 0.0705, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.759211653813196, | |
| "grad_norm": 0.24738526344299316, | |
| "learning_rate": 3.973699256718124e-06, | |
| "loss": 0.0716, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 2.776349614395887, | |
| "grad_norm": 0.20870988070964813, | |
| "learning_rate": 3.687821612349914e-06, | |
| "loss": 0.0674, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 2.7934875749785775, | |
| "grad_norm": 0.2011324018239975, | |
| "learning_rate": 3.4019439679817043e-06, | |
| "loss": 0.0692, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 2.810625535561268, | |
| "grad_norm": 0.25708264112472534, | |
| "learning_rate": 3.1160663236134933e-06, | |
| "loss": 0.0699, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 2.827763496143959, | |
| "grad_norm": 0.20108073949813843, | |
| "learning_rate": 2.830188679245283e-06, | |
| "loss": 0.0702, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 2.8449014567266495, | |
| "grad_norm": 0.21398206055164337, | |
| "learning_rate": 2.5443110348770725e-06, | |
| "loss": 0.0727, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 2.86203941730934, | |
| "grad_norm": 0.1945817768573761, | |
| "learning_rate": 2.2584333905088623e-06, | |
| "loss": 0.0739, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 2.879177377892031, | |
| "grad_norm": 0.20674385130405426, | |
| "learning_rate": 1.972555746140652e-06, | |
| "loss": 0.0738, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 2.8963153384747216, | |
| "grad_norm": 0.2226879894733429, | |
| "learning_rate": 1.6866781017724415e-06, | |
| "loss": 0.0698, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 2.9134532990574122, | |
| "grad_norm": 0.1832134872674942, | |
| "learning_rate": 1.400800457404231e-06, | |
| "loss": 0.0701, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.930591259640103, | |
| "grad_norm": 0.2704426348209381, | |
| "learning_rate": 1.1149228130360207e-06, | |
| "loss": 0.0681, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 2.9477292202227936, | |
| "grad_norm": 0.2618798315525055, | |
| "learning_rate": 8.290451686678103e-07, | |
| "loss": 0.0696, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 2.9648671808054843, | |
| "grad_norm": 0.16076813638210297, | |
| "learning_rate": 5.431675242995998e-07, | |
| "loss": 0.0729, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 2.982005141388175, | |
| "grad_norm": 0.20760373771190643, | |
| "learning_rate": 2.572898799313894e-07, | |
| "loss": 0.0693, | |
| "step": 1740 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1749, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.3059345020564275e+18, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |