| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 5.0, |
| "eval_steps": 500, |
| "global_step": 1160, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.004326663061114116, |
| "grad_norm": 12.063708720251004, |
| "learning_rate": 0.0, |
| "loss": 1.3101, |
| "num_tokens": 8052727.0, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.008653326122228232, |
| "grad_norm": 11.727718712189134, |
| "learning_rate": 5.714285714285715e-07, |
| "loss": 1.3318, |
| "num_tokens": 16316249.0, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.012979989183342347, |
| "grad_norm": 11.537944660412041, |
| "learning_rate": 1.142857142857143e-06, |
| "loss": 1.3489, |
| "num_tokens": 24414956.0, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.017306652244456464, |
| "grad_norm": 11.513834135754882, |
| "learning_rate": 1.7142857142857145e-06, |
| "loss": 1.3106, |
| "num_tokens": 32432061.0, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.02163331530557058, |
| "grad_norm": 10.709616265045359, |
| "learning_rate": 2.285714285714286e-06, |
| "loss": 1.3397, |
| "num_tokens": 40560264.0, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.025959978366684695, |
| "grad_norm": 6.3776460128645125, |
| "learning_rate": 2.8571428571428573e-06, |
| "loss": 1.3048, |
| "num_tokens": 48746761.0, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.03028664142779881, |
| "grad_norm": 4.803209403173688, |
| "learning_rate": 3.428571428571429e-06, |
| "loss": 1.2789, |
| "num_tokens": 56923208.0, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.03461330448891293, |
| "grad_norm": 3.3389242242644204, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 1.2859, |
| "num_tokens": 65037147.0, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.038939967550027044, |
| "grad_norm": 2.67959921100429, |
| "learning_rate": 4.571428571428572e-06, |
| "loss": 1.2151, |
| "num_tokens": 73157157.0, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.04326663061114116, |
| "grad_norm": 2.672716941514162, |
| "learning_rate": 5.142857142857142e-06, |
| "loss": 1.27, |
| "num_tokens": 81330425.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.047593293672255274, |
| "grad_norm": 2.3564616974865276, |
| "learning_rate": 5.7142857142857145e-06, |
| "loss": 1.2467, |
| "num_tokens": 89515900.0, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.05191995673336939, |
| "grad_norm": 1.4041532628718385, |
| "learning_rate": 6.285714285714286e-06, |
| "loss": 1.2437, |
| "num_tokens": 97822905.0, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.056246619794483504, |
| "grad_norm": 3.4995096434595614, |
| "learning_rate": 6.857142857142858e-06, |
| "loss": 1.228, |
| "num_tokens": 105891211.0, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.06057328285559762, |
| "grad_norm": 1.7414005281229072, |
| "learning_rate": 7.428571428571429e-06, |
| "loss": 1.2271, |
| "num_tokens": 114159655.0, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.06489994591671173, |
| "grad_norm": 3.301008162368563, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 1.2085, |
| "num_tokens": 122302268.0, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.06922660897782586, |
| "grad_norm": 1.515296843459891, |
| "learning_rate": 8.571428571428571e-06, |
| "loss": 1.2182, |
| "num_tokens": 130363765.0, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.07355327203893997, |
| "grad_norm": 0.8966168784111427, |
| "learning_rate": 9.142857142857144e-06, |
| "loss": 1.1771, |
| "num_tokens": 138559833.0, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.07787993510005409, |
| "grad_norm": 1.0197011395640607, |
| "learning_rate": 9.714285714285715e-06, |
| "loss": 1.1908, |
| "num_tokens": 146796904.0, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.0822065981611682, |
| "grad_norm": 0.8539259040543176, |
| "learning_rate": 1.0285714285714285e-05, |
| "loss": 1.1752, |
| "num_tokens": 155019734.0, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.08653326122228232, |
| "grad_norm": 0.9490693214323738, |
| "learning_rate": 1.0857142857142858e-05, |
| "loss": 1.1664, |
| "num_tokens": 163012614.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.09085992428339643, |
| "grad_norm": 0.7637306204373832, |
| "learning_rate": 1.1428571428571429e-05, |
| "loss": 1.1527, |
| "num_tokens": 171242469.0, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.09518658734451055, |
| "grad_norm": 0.684327460495295, |
| "learning_rate": 1.2e-05, |
| "loss": 1.169, |
| "num_tokens": 179418820.0, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.09951325040562466, |
| "grad_norm": 0.8747628249391602, |
| "learning_rate": 1.2571428571428572e-05, |
| "loss": 1.1426, |
| "num_tokens": 187617438.0, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.10383991346673878, |
| "grad_norm": 0.7383283690375211, |
| "learning_rate": 1.3142857142857145e-05, |
| "loss": 1.166, |
| "num_tokens": 195684109.0, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.10816657652785289, |
| "grad_norm": 0.7774312103590332, |
| "learning_rate": 1.3714285714285716e-05, |
| "loss": 1.1729, |
| "num_tokens": 8143851.0, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.11249323958896701, |
| "grad_norm": 0.742987701591983, |
| "learning_rate": 1.4285714285714287e-05, |
| "loss": 1.1418, |
| "num_tokens": 16231014.0, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.11681990265008113, |
| "grad_norm": 0.7597974072212311, |
| "learning_rate": 1.4857142857142858e-05, |
| "loss": 1.1519, |
| "num_tokens": 24491951.0, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.12114656571119524, |
| "grad_norm": 0.5863008064226061, |
| "learning_rate": 1.542857142857143e-05, |
| "loss": 1.1519, |
| "num_tokens": 32878252.0, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.12547322877230935, |
| "grad_norm": 0.8462710417949215, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 1.1511, |
| "num_tokens": 41054015.0, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.12979989183342347, |
| "grad_norm": 0.7552796906152683, |
| "learning_rate": 1.6571428571428574e-05, |
| "loss": 1.1372, |
| "num_tokens": 49241122.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.1341265548945376, |
| "grad_norm": 0.7260275701543588, |
| "learning_rate": 1.7142857142857142e-05, |
| "loss": 1.1449, |
| "num_tokens": 57442255.0, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.13845321795565171, |
| "grad_norm": 0.956217372168425, |
| "learning_rate": 1.7714285714285717e-05, |
| "loss": 1.139, |
| "num_tokens": 65747927.0, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.1427798810167658, |
| "grad_norm": 0.8902649541620489, |
| "learning_rate": 1.8285714285714288e-05, |
| "loss": 1.123, |
| "num_tokens": 73728881.0, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.14710654407787993, |
| "grad_norm": 0.940638327725262, |
| "learning_rate": 1.885714285714286e-05, |
| "loss": 1.1332, |
| "num_tokens": 81823147.0, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.15143320713899405, |
| "grad_norm": 1.05820771639681, |
| "learning_rate": 1.942857142857143e-05, |
| "loss": 1.1233, |
| "num_tokens": 90140961.0, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.15575987020010817, |
| "grad_norm": 0.6786564535689352, |
| "learning_rate": 2e-05, |
| "loss": 1.1134, |
| "num_tokens": 98314830.0, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.16008653326122227, |
| "grad_norm": 1.3940550517981376, |
| "learning_rate": 1.9999964908096047e-05, |
| "loss": 1.1349, |
| "num_tokens": 106342502.0, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.1644131963223364, |
| "grad_norm": 0.6715963909078057, |
| "learning_rate": 1.9999859632657835e-05, |
| "loss": 1.0989, |
| "num_tokens": 114627503.0, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.1687398593834505, |
| "grad_norm": 0.9985206811473272, |
| "learning_rate": 1.9999684174506328e-05, |
| "loss": 1.1383, |
| "num_tokens": 122857334.0, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.17306652244456464, |
| "grad_norm": 0.9483815912954356, |
| "learning_rate": 1.999943853500978e-05, |
| "loss": 1.1224, |
| "num_tokens": 131055706.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.17739318550567876, |
| "grad_norm": 0.8683918297064362, |
| "learning_rate": 1.9999122716083737e-05, |
| "loss": 1.1573, |
| "num_tokens": 139253842.0, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.18171984856679285, |
| "grad_norm": 0.6429259475965016, |
| "learning_rate": 1.9998736720191024e-05, |
| "loss": 1.1007, |
| "num_tokens": 147222819.0, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.18604651162790697, |
| "grad_norm": 0.9456343421520338, |
| "learning_rate": 1.999828055034171e-05, |
| "loss": 1.1222, |
| "num_tokens": 155421830.0, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.1903731746890211, |
| "grad_norm": 0.8535995280206158, |
| "learning_rate": 1.99977542100931e-05, |
| "loss": 1.0983, |
| "num_tokens": 163580725.0, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.19469983775013522, |
| "grad_norm": 0.7453534127868048, |
| "learning_rate": 1.99971577035497e-05, |
| "loss": 1.1045, |
| "num_tokens": 171682606.0, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.1990265008112493, |
| "grad_norm": 0.7952563274763261, |
| "learning_rate": 1.999649103536319e-05, |
| "loss": 1.091, |
| "num_tokens": 179937509.0, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.20335316387236343, |
| "grad_norm": 0.6958121333057308, |
| "learning_rate": 1.9995754210732382e-05, |
| "loss": 1.0797, |
| "num_tokens": 188304288.0, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.20767982693347756, |
| "grad_norm": 0.8009419384487555, |
| "learning_rate": 1.999494723540318e-05, |
| "loss": 1.1117, |
| "num_tokens": 196584086.0, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.21200648999459168, |
| "grad_norm": 1.1770686099470742, |
| "learning_rate": 1.9994070115668543e-05, |
| "loss": 1.1127, |
| "num_tokens": 204821676.0, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.21633315305570577, |
| "grad_norm": 0.5760382647360603, |
| "learning_rate": 1.9993122858368424e-05, |
| "loss": 1.0736, |
| "num_tokens": 213042491.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.2206598161168199, |
| "grad_norm": 0.9211760426362613, |
| "learning_rate": 1.9992105470889727e-05, |
| "loss": 1.1007, |
| "num_tokens": 221162073.0, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.22498647917793402, |
| "grad_norm": 0.728065400596324, |
| "learning_rate": 1.9991017961166245e-05, |
| "loss": 1.0852, |
| "num_tokens": 229361548.0, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.22931314223904814, |
| "grad_norm": 0.9499983156478771, |
| "learning_rate": 1.9989860337678596e-05, |
| "loss": 1.0711, |
| "num_tokens": 237483363.0, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.23363980530016226, |
| "grad_norm": 0.6425952198321203, |
| "learning_rate": 1.998863260945416e-05, |
| "loss": 1.0803, |
| "num_tokens": 245616670.0, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.23796646836127636, |
| "grad_norm": 0.6756995511647822, |
| "learning_rate": 1.998733478606701e-05, |
| "loss": 1.118, |
| "num_tokens": 253758578.0, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.24229313142239048, |
| "grad_norm": 0.7048601099063915, |
| "learning_rate": 1.998596687763783e-05, |
| "loss": 1.0927, |
| "num_tokens": 261949951.0, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.2466197944835046, |
| "grad_norm": 0.7411902717404214, |
| "learning_rate": 1.998452889483385e-05, |
| "loss": 1.0838, |
| "num_tokens": 270112359.0, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.2509464575446187, |
| "grad_norm": 0.7583633449162208, |
| "learning_rate": 1.9983020848868745e-05, |
| "loss": 1.0751, |
| "num_tokens": 278249438.0, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.2552731206057328, |
| "grad_norm": 0.5895229075183283, |
| "learning_rate": 1.9981442751502562e-05, |
| "loss": 1.1097, |
| "num_tokens": 286515364.0, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.25959978366684694, |
| "grad_norm": 0.8689168674816765, |
| "learning_rate": 1.9979794615041623e-05, |
| "loss": 1.0633, |
| "num_tokens": 294730511.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.26392644672796106, |
| "grad_norm": 0.8437342550894297, |
| "learning_rate": 1.997807645233842e-05, |
| "loss": 1.0772, |
| "num_tokens": 302885915.0, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.2682531097890752, |
| "grad_norm": 0.7886516822119517, |
| "learning_rate": 1.9976288276791537e-05, |
| "loss": 1.0848, |
| "num_tokens": 310939993.0, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.2725797728501893, |
| "grad_norm": 0.89230967002305, |
| "learning_rate": 1.9974430102345526e-05, |
| "loss": 1.1142, |
| "num_tokens": 319044771.0, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.27690643591130343, |
| "grad_norm": 0.8141528756159552, |
| "learning_rate": 1.9972501943490805e-05, |
| "loss": 1.116, |
| "num_tokens": 327163024.0, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.28123309897241755, |
| "grad_norm": 0.8792199270720278, |
| "learning_rate": 1.9970503815263543e-05, |
| "loss": 1.0768, |
| "num_tokens": 335493759.0, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.2855597620335316, |
| "grad_norm": 0.5782878344170364, |
| "learning_rate": 1.9968435733245542e-05, |
| "loss": 1.0713, |
| "num_tokens": 343741024.0, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.28988642509464574, |
| "grad_norm": 1.0783400413776696, |
| "learning_rate": 1.9966297713564123e-05, |
| "loss": 1.0777, |
| "num_tokens": 351897173.0, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.29421308815575986, |
| "grad_norm": 0.5912367198455412, |
| "learning_rate": 1.9964089772891998e-05, |
| "loss": 1.0587, |
| "num_tokens": 360108643.0, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.298539751216874, |
| "grad_norm": 0.9560230781906641, |
| "learning_rate": 1.9961811928447124e-05, |
| "loss": 1.0513, |
| "num_tokens": 368404831.0, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.3028664142779881, |
| "grad_norm": 0.6694334113583972, |
| "learning_rate": 1.9959464197992592e-05, |
| "loss": 1.0941, |
| "num_tokens": 376701186.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.3071930773391022, |
| "grad_norm": 0.9953837845607135, |
| "learning_rate": 1.995704659983648e-05, |
| "loss": 1.1126, |
| "num_tokens": 384948998.0, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.31151974040021635, |
| "grad_norm": 0.801064734619278, |
| "learning_rate": 1.9954559152831705e-05, |
| "loss": 1.0698, |
| "num_tokens": 393114257.0, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.31584640346133047, |
| "grad_norm": 0.7206220961948427, |
| "learning_rate": 1.995200187637587e-05, |
| "loss": 1.043, |
| "num_tokens": 401208167.0, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.32017306652244454, |
| "grad_norm": 0.6357689365088348, |
| "learning_rate": 1.9949374790411134e-05, |
| "loss": 1.0434, |
| "num_tokens": 409465133.0, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.32449972958355866, |
| "grad_norm": 0.8975335464913397, |
| "learning_rate": 1.9946677915424045e-05, |
| "loss": 1.0802, |
| "num_tokens": 417678210.0, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.3288263926446728, |
| "grad_norm": 0.6684128266308609, |
| "learning_rate": 1.994391127244537e-05, |
| "loss": 1.0638, |
| "num_tokens": 425836991.0, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.3331530557057869, |
| "grad_norm": 0.8335803466238701, |
| "learning_rate": 1.994107488304995e-05, |
| "loss": 1.0404, |
| "num_tokens": 434061883.0, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.337479718766901, |
| "grad_norm": 0.701262475559687, |
| "learning_rate": 1.993816876935652e-05, |
| "loss": 1.072, |
| "num_tokens": 442148727.0, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.34180638182801515, |
| "grad_norm": 5.389426853375867, |
| "learning_rate": 1.9935192954027537e-05, |
| "loss": 1.0877, |
| "num_tokens": 450223300.0, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.34613304488912927, |
| "grad_norm": 1.2247442284969425, |
| "learning_rate": 1.9932147460269007e-05, |
| "loss": 1.0742, |
| "num_tokens": 458483353.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.3504597079502434, |
| "grad_norm": 0.6785079644135494, |
| "learning_rate": 1.9929032311830303e-05, |
| "loss": 1.0609, |
| "num_tokens": 466519955.0, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.3547863710113575, |
| "grad_norm": 0.9818785578102666, |
| "learning_rate": 1.9925847533003976e-05, |
| "loss": 1.0626, |
| "num_tokens": 474749256.0, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.3591130340724716, |
| "grad_norm": 0.9226344028092789, |
| "learning_rate": 1.9922593148625573e-05, |
| "loss": 1.0946, |
| "num_tokens": 483036885.0, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.3634396971335857, |
| "grad_norm": 0.6996572072248013, |
| "learning_rate": 1.9919269184073435e-05, |
| "loss": 1.0366, |
| "num_tokens": 491030270.0, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.3677663601946998, |
| "grad_norm": 0.817594387577693, |
| "learning_rate": 1.9915875665268508e-05, |
| "loss": 1.0879, |
| "num_tokens": 499011152.0, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.37209302325581395, |
| "grad_norm": 0.7201730564502099, |
| "learning_rate": 1.9912412618674134e-05, |
| "loss": 1.0486, |
| "num_tokens": 507139514.0, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.37641968631692807, |
| "grad_norm": 0.7992318950396967, |
| "learning_rate": 1.9908880071295844e-05, |
| "loss": 1.0639, |
| "num_tokens": 515327001.0, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.3807463493780422, |
| "grad_norm": 0.6151247030430017, |
| "learning_rate": 1.990527805068115e-05, |
| "loss": 1.0425, |
| "num_tokens": 523524018.0, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.3850730124391563, |
| "grad_norm": 0.6686898707632741, |
| "learning_rate": 1.9901606584919336e-05, |
| "loss": 1.0562, |
| "num_tokens": 531727355.0, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.38939967550027044, |
| "grad_norm": 0.6423157646116131, |
| "learning_rate": 1.989786570264123e-05, |
| "loss": 1.0581, |
| "num_tokens": 539809499.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.39372633856138456, |
| "grad_norm": 0.7387999657802858, |
| "learning_rate": 1.9894055433018977e-05, |
| "loss": 1.0578, |
| "num_tokens": 548048650.0, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.3980530016224986, |
| "grad_norm": 0.8411899425159689, |
| "learning_rate": 1.9890175805765834e-05, |
| "loss": 1.0808, |
| "num_tokens": 556181435.0, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.40237966468361275, |
| "grad_norm": 0.775911076614371, |
| "learning_rate": 1.9886226851135904e-05, |
| "loss": 1.0798, |
| "num_tokens": 564455785.0, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.40670632774472687, |
| "grad_norm": 0.700418884054443, |
| "learning_rate": 1.988220859992394e-05, |
| "loss": 1.0864, |
| "num_tokens": 572734893.0, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.411032990805841, |
| "grad_norm": 0.6926462904609825, |
| "learning_rate": 1.987812108346506e-05, |
| "loss": 1.0779, |
| "num_tokens": 580907237.0, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.4153596538669551, |
| "grad_norm": 0.5780746275245598, |
| "learning_rate": 1.9873964333634546e-05, |
| "loss": 1.0606, |
| "num_tokens": 588950468.0, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.41968631692806924, |
| "grad_norm": 0.8269363889294378, |
| "learning_rate": 1.9869738382847567e-05, |
| "loss": 1.0515, |
| "num_tokens": 597058764.0, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.42401297998918336, |
| "grad_norm": 0.7858627732714016, |
| "learning_rate": 1.9865443264058936e-05, |
| "loss": 1.0289, |
| "num_tokens": 605330511.0, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.4283396430502975, |
| "grad_norm": 0.5098978815659675, |
| "learning_rate": 1.9861079010762852e-05, |
| "loss": 1.0614, |
| "num_tokens": 613468042.0, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.43266630611141155, |
| "grad_norm": 0.9837835335786151, |
| "learning_rate": 1.9856645656992637e-05, |
| "loss": 1.0528, |
| "num_tokens": 621617034.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.43699296917252567, |
| "grad_norm": 0.6960932513964322, |
| "learning_rate": 1.9852143237320475e-05, |
| "loss": 1.0773, |
| "num_tokens": 629963486.0, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.4413196322336398, |
| "grad_norm": 0.8282041321720025, |
| "learning_rate": 1.9847571786857142e-05, |
| "loss": 1.055, |
| "num_tokens": 638074685.0, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.4456462952947539, |
| "grad_norm": 0.8123048467055004, |
| "learning_rate": 1.984293134125172e-05, |
| "loss": 1.0691, |
| "num_tokens": 646360851.0, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.44997295835586804, |
| "grad_norm": 0.6160311721079573, |
| "learning_rate": 1.9838221936691347e-05, |
| "loss": 1.0863, |
| "num_tokens": 654661863.0, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.45429962141698216, |
| "grad_norm": 0.8232116092132997, |
| "learning_rate": 1.9833443609900896e-05, |
| "loss": 1.0177, |
| "num_tokens": 662981026.0, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.4586262844780963, |
| "grad_norm": 0.543372168006543, |
| "learning_rate": 1.9828596398142725e-05, |
| "loss": 1.044, |
| "num_tokens": 671216096.0, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.4629529475392104, |
| "grad_norm": 0.9423564400007317, |
| "learning_rate": 1.9823680339216363e-05, |
| "loss": 1.0448, |
| "num_tokens": 679484445.0, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.4672796106003245, |
| "grad_norm": 0.6916636602583178, |
| "learning_rate": 1.9818695471458224e-05, |
| "loss": 1.0487, |
| "num_tokens": 687812578.0, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.4716062736614386, |
| "grad_norm": 1.0709477226072333, |
| "learning_rate": 1.9813641833741308e-05, |
| "loss": 1.0705, |
| "num_tokens": 696049253.0, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.4759329367225527, |
| "grad_norm": 0.7763980799420581, |
| "learning_rate": 1.9808519465474898e-05, |
| "loss": 1.0923, |
| "num_tokens": 704261185.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.48025959978366684, |
| "grad_norm": 1.05928288241113, |
| "learning_rate": 1.9803328406604252e-05, |
| "loss": 1.0701, |
| "num_tokens": 712339843.0, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.48458626284478096, |
| "grad_norm": 0.8164014894407516, |
| "learning_rate": 1.979806869761029e-05, |
| "loss": 1.0841, |
| "num_tokens": 720578736.0, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.4889129259058951, |
| "grad_norm": 0.7814622580999907, |
| "learning_rate": 1.9792740379509274e-05, |
| "loss": 1.038, |
| "num_tokens": 728873337.0, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.4932395889670092, |
| "grad_norm": 0.9950430098048184, |
| "learning_rate": 1.9787343493852508e-05, |
| "loss": 1.0695, |
| "num_tokens": 737002170.0, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.4975662520281233, |
| "grad_norm": 0.7337952839442135, |
| "learning_rate": 1.9781878082725982e-05, |
| "loss": 1.0662, |
| "num_tokens": 745278394.0, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.5018929150892374, |
| "grad_norm": 0.811331088998138, |
| "learning_rate": 1.977634418875007e-05, |
| "loss": 1.0453, |
| "num_tokens": 753501022.0, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.5062195781503516, |
| "grad_norm": 2.392844149917475, |
| "learning_rate": 1.9770741855079197e-05, |
| "loss": 1.0585, |
| "num_tokens": 761674657.0, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.5105462412114656, |
| "grad_norm": 0.9750359184413522, |
| "learning_rate": 1.976507112540148e-05, |
| "loss": 1.0204, |
| "num_tokens": 769925505.0, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.5148729042725798, |
| "grad_norm": 0.5808964572584242, |
| "learning_rate": 1.9759332043938408e-05, |
| "loss": 1.0629, |
| "num_tokens": 778048637.0, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.5191995673336939, |
| "grad_norm": 2.0190518414090257, |
| "learning_rate": 1.9753524655444495e-05, |
| "loss": 1.055, |
| "num_tokens": 786210971.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.5235262303948081, |
| "grad_norm": 1.300368784946342, |
| "learning_rate": 1.974764900520692e-05, |
| "loss": 1.0442, |
| "num_tokens": 794450890.0, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.5278528934559221, |
| "grad_norm": 0.7478247329402357, |
| "learning_rate": 1.9741705139045183e-05, |
| "loss": 1.0273, |
| "num_tokens": 802668849.0, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.5321795565170362, |
| "grad_norm": 0.7925626643246553, |
| "learning_rate": 1.9735693103310747e-05, |
| "loss": 1.0436, |
| "num_tokens": 810826072.0, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.5365062195781504, |
| "grad_norm": 0.8408791086220021, |
| "learning_rate": 1.9729612944886677e-05, |
| "loss": 1.0386, |
| "num_tokens": 819012511.0, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.5408328826392644, |
| "grad_norm": 0.705243046024607, |
| "learning_rate": 1.9723464711187267e-05, |
| "loss": 1.0584, |
| "num_tokens": 827153219.0, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.5451595457003786, |
| "grad_norm": 0.6815712426490166, |
| "learning_rate": 1.9717248450157682e-05, |
| "loss": 1.0573, |
| "num_tokens": 835379532.0, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.5494862087614927, |
| "grad_norm": 0.73642201821198, |
| "learning_rate": 1.9710964210273567e-05, |
| "loss": 1.043, |
| "num_tokens": 843463437.0, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.5538128718226069, |
| "grad_norm": 0.7038070626897667, |
| "learning_rate": 1.9704612040540698e-05, |
| "loss": 1.0404, |
| "num_tokens": 851604009.0, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.5581395348837209, |
| "grad_norm": 0.6972978549881204, |
| "learning_rate": 1.969819199049456e-05, |
| "loss": 1.0488, |
| "num_tokens": 859466113.0, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.5624661979448351, |
| "grad_norm": 0.7164998231426472, |
| "learning_rate": 1.9691704110199997e-05, |
| "loss": 1.0768, |
| "num_tokens": 867843649.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.5667928610059492, |
| "grad_norm": 0.6453947425095201, |
| "learning_rate": 1.9685148450250802e-05, |
| "loss": 1.0588, |
| "num_tokens": 876181521.0, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.5711195240670632, |
| "grad_norm": 0.6809744904176902, |
| "learning_rate": 1.9678525061769332e-05, |
| "loss": 1.0546, |
| "num_tokens": 884409671.0, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.5754461871281774, |
| "grad_norm": 0.6874574416882646, |
| "learning_rate": 1.96718339964061e-05, |
| "loss": 1.0038, |
| "num_tokens": 892626806.0, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.5797728501892915, |
| "grad_norm": 0.7460633332766357, |
| "learning_rate": 1.9665075306339373e-05, |
| "loss": 1.0521, |
| "num_tokens": 900828840.0, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.5840995132504057, |
| "grad_norm": 0.725769240642372, |
| "learning_rate": 1.9658249044274773e-05, |
| "loss": 1.0552, |
| "num_tokens": 908969468.0, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.5884261763115197, |
| "grad_norm": 1.3895008041252126, |
| "learning_rate": 1.965135526344487e-05, |
| "loss": 1.0346, |
| "num_tokens": 917020568.0, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.5927528393726339, |
| "grad_norm": 0.7981436115786992, |
| "learning_rate": 1.964439401760875e-05, |
| "loss": 1.0473, |
| "num_tokens": 925187380.0, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.597079502433748, |
| "grad_norm": 1.3753330128722794, |
| "learning_rate": 1.9637365361051602e-05, |
| "loss": 1.0606, |
| "num_tokens": 933387237.0, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.601406165494862, |
| "grad_norm": 0.8330566649762117, |
| "learning_rate": 1.9630269348584303e-05, |
| "loss": 1.0731, |
| "num_tokens": 941428016.0, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.6057328285559762, |
| "grad_norm": 1.436449192271606, |
| "learning_rate": 1.9623106035542988e-05, |
| "loss": 1.0497, |
| "num_tokens": 949470688.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.6100594916170903, |
| "grad_norm": 1.246511400682441, |
| "learning_rate": 1.9615875477788607e-05, |
| "loss": 1.039, |
| "num_tokens": 957603328.0, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.6143861546782045, |
| "grad_norm": 1.009120574140708, |
| "learning_rate": 1.9608577731706502e-05, |
| "loss": 1.0123, |
| "num_tokens": 965709162.0, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.6187128177393185, |
| "grad_norm": 0.9947417068391968, |
| "learning_rate": 1.9601212854205965e-05, |
| "loss": 1.0227, |
| "num_tokens": 973985569.0, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.6230394808004327, |
| "grad_norm": 0.6903270192877234, |
| "learning_rate": 1.959378090271979e-05, |
| "loss": 1.0408, |
| "num_tokens": 982165724.0, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.6273661438615468, |
| "grad_norm": 1.051406929345143, |
| "learning_rate": 1.9586281935203823e-05, |
| "loss": 1.0328, |
| "num_tokens": 990346736.0, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.6316928069226609, |
| "grad_norm": 0.70540346117896, |
| "learning_rate": 1.9578716010136524e-05, |
| "loss": 1.0354, |
| "num_tokens": 998496426.0, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.636019469983775, |
| "grad_norm": 0.9469873445550671, |
| "learning_rate": 1.9571083186518495e-05, |
| "loss": 1.06, |
| "num_tokens": 1006665416.0, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.6403461330448891, |
| "grad_norm": 0.8052442525964558, |
| "learning_rate": 1.956338352387203e-05, |
| "loss": 1.0473, |
| "num_tokens": 1014768764.0, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.6446727961060033, |
| "grad_norm": 0.6789355824383974, |
| "learning_rate": 1.955561708224064e-05, |
| "loss": 1.0253, |
| "num_tokens": 1023020143.0, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.6489994591671173, |
| "grad_norm": 0.8715361890721862, |
| "learning_rate": 1.9547783922188605e-05, |
| "loss": 1.0315, |
| "num_tokens": 1031209660.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.6533261222282315, |
| "grad_norm": 0.645674014648836, |
| "learning_rate": 1.953988410480047e-05, |
| "loss": 1.0419, |
| "num_tokens": 1039525232.0, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.6576527852893456, |
| "grad_norm": 5.190388510459407, |
| "learning_rate": 1.9531917691680605e-05, |
| "loss": 1.0205, |
| "num_tokens": 1047879166.0, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.6619794483504597, |
| "grad_norm": 1.423350204561511, |
| "learning_rate": 1.95238847449527e-05, |
| "loss": 1.0683, |
| "num_tokens": 1056096106.0, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.6663061114115738, |
| "grad_norm": 0.7873289191766745, |
| "learning_rate": 1.9515785327259283e-05, |
| "loss": 1.0276, |
| "num_tokens": 1064133140.0, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.670632774472688, |
| "grad_norm": 1.2079248544903594, |
| "learning_rate": 1.950761950176125e-05, |
| "loss": 1.0292, |
| "num_tokens": 1072439522.0, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.674959437533802, |
| "grad_norm": 1.2768662541750604, |
| "learning_rate": 1.949938733213733e-05, |
| "loss": 1.0655, |
| "num_tokens": 1080614495.0, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.6792861005949161, |
| "grad_norm": 1.0319413112299183, |
| "learning_rate": 1.9491088882583653e-05, |
| "loss": 1.0346, |
| "num_tokens": 1088737339.0, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.6836127636560303, |
| "grad_norm": 0.9551004654341626, |
| "learning_rate": 1.948272421781319e-05, |
| "loss": 1.0464, |
| "num_tokens": 1096942894.0, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.6879394267171444, |
| "grad_norm": 1.0191544176749112, |
| "learning_rate": 1.9474293403055273e-05, |
| "loss": 1.0614, |
| "num_tokens": 1105139333.0, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.6922660897782585, |
| "grad_norm": 0.9291601915995058, |
| "learning_rate": 1.9465796504055095e-05, |
| "loss": 1.0583, |
| "num_tokens": 1113209692.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.6965927528393726, |
| "grad_norm": 0.7716222635405472, |
| "learning_rate": 1.9457233587073177e-05, |
| "loss": 1.0223, |
| "num_tokens": 1121410540.0, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.7009194159004868, |
| "grad_norm": 5.407882430974452, |
| "learning_rate": 1.9448604718884868e-05, |
| "loss": 1.0364, |
| "num_tokens": 1129595028.0, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.7052460789616009, |
| "grad_norm": 1.6307777217274049, |
| "learning_rate": 1.9439909966779816e-05, |
| "loss": 1.0252, |
| "num_tokens": 1137932356.0, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.709572742022715, |
| "grad_norm": 0.7568287231544429, |
| "learning_rate": 1.943114939856144e-05, |
| "loss": 1.0071, |
| "num_tokens": 1146191607.0, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.7138994050838291, |
| "grad_norm": 1.2144114452361634, |
| "learning_rate": 1.942232308254642e-05, |
| "loss": 1.0426, |
| "num_tokens": 1154479388.0, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.7182260681449432, |
| "grad_norm": 1.0485800704463941, |
| "learning_rate": 1.941343108756413e-05, |
| "loss": 1.023, |
| "num_tokens": 1162787341.0, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.7225527312060573, |
| "grad_norm": 1.1656800361671036, |
| "learning_rate": 1.9404473482956143e-05, |
| "loss": 1.0349, |
| "num_tokens": 1171083191.0, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.7268793942671714, |
| "grad_norm": 0.7559925278495352, |
| "learning_rate": 1.9395450338575655e-05, |
| "loss": 1.0143, |
| "num_tokens": 1179175970.0, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.7312060573282856, |
| "grad_norm": 1.0467326732986577, |
| "learning_rate": 1.938636172478695e-05, |
| "loss": 1.016, |
| "num_tokens": 1187347926.0, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.7355327203893997, |
| "grad_norm": 0.8195812665726188, |
| "learning_rate": 1.937720771246488e-05, |
| "loss": 1.0222, |
| "num_tokens": 1195697998.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.7398593834505138, |
| "grad_norm": 0.8533808464409776, |
| "learning_rate": 1.9367988372994264e-05, |
| "loss": 1.0552, |
| "num_tokens": 1203836889.0, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.7441860465116279, |
| "grad_norm": 0.6988527076226115, |
| "learning_rate": 1.9358703778269362e-05, |
| "loss": 1.0207, |
| "num_tokens": 1212045311.0, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.7485127095727421, |
| "grad_norm": 0.7157144584664211, |
| "learning_rate": 1.934935400069331e-05, |
| "loss": 1.0419, |
| "num_tokens": 1220252456.0, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.7528393726338561, |
| "grad_norm": 0.6380442216825093, |
| "learning_rate": 1.933993911317755e-05, |
| "loss": 1.0043, |
| "num_tokens": 1228367124.0, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.7571660356949702, |
| "grad_norm": 0.5945370228493201, |
| "learning_rate": 1.933045918914127e-05, |
| "loss": 1.0519, |
| "num_tokens": 1236533919.0, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.7614926987560844, |
| "grad_norm": 0.8284989924164381, |
| "learning_rate": 1.932091430251082e-05, |
| "loss": 1.0669, |
| "num_tokens": 1244845033.0, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.7658193618171985, |
| "grad_norm": 4.649372515055804, |
| "learning_rate": 1.931130452771914e-05, |
| "loss": 1.0512, |
| "num_tokens": 1252967541.0, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.7701460248783126, |
| "grad_norm": 0.9360299522999799, |
| "learning_rate": 1.930162993970519e-05, |
| "loss": 1.022, |
| "num_tokens": 1261225085.0, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.7744726879394267, |
| "grad_norm": 0.5167446323720225, |
| "learning_rate": 1.9291890613913353e-05, |
| "loss": 1.0204, |
| "num_tokens": 1269356299.0, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.7787993510005409, |
| "grad_norm": 0.6071868007581671, |
| "learning_rate": 1.9282086626292835e-05, |
| "loss": 1.0114, |
| "num_tokens": 1277586117.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.7831260140616549, |
| "grad_norm": 0.6571171117401933, |
| "learning_rate": 1.9272218053297113e-05, |
| "loss": 1.0318, |
| "num_tokens": 1285775878.0, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.7874526771227691, |
| "grad_norm": 0.5942174702410566, |
| "learning_rate": 1.9262284971883293e-05, |
| "loss": 1.0147, |
| "num_tokens": 1293811971.0, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.7917793401838832, |
| "grad_norm": 0.5624732411675234, |
| "learning_rate": 1.925228745951155e-05, |
| "loss": 1.0539, |
| "num_tokens": 1302092313.0, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.7961060032449973, |
| "grad_norm": 0.8397395351542903, |
| "learning_rate": 1.9242225594144487e-05, |
| "loss": 1.0255, |
| "num_tokens": 1310198939.0, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.8004326663061114, |
| "grad_norm": 0.5175398862515561, |
| "learning_rate": 1.9232099454246547e-05, |
| "loss": 1.0268, |
| "num_tokens": 1318331118.0, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.8047593293672255, |
| "grad_norm": 1.0546926210587257, |
| "learning_rate": 1.922190911878341e-05, |
| "loss": 1.0241, |
| "num_tokens": 1326291524.0, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.8090859924283397, |
| "grad_norm": 0.7638236700049821, |
| "learning_rate": 1.9211654667221356e-05, |
| "loss": 1.0222, |
| "num_tokens": 1334457499.0, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.8134126554894537, |
| "grad_norm": 0.8595408443222351, |
| "learning_rate": 1.9201336179526662e-05, |
| "loss": 1.0428, |
| "num_tokens": 1342599951.0, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.8177393185505679, |
| "grad_norm": 0.7876334403831634, |
| "learning_rate": 1.9190953736164962e-05, |
| "loss": 1.0451, |
| "num_tokens": 1350886069.0, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.822065981611682, |
| "grad_norm": 0.6045137327621173, |
| "learning_rate": 1.918050741810064e-05, |
| "loss": 1.007, |
| "num_tokens": 1359135384.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.826392644672796, |
| "grad_norm": 0.7005328713623081, |
| "learning_rate": 1.916999730679618e-05, |
| "loss": 1.0071, |
| "num_tokens": 1367261648.0, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.8307193077339102, |
| "grad_norm": 0.5292196634413274, |
| "learning_rate": 1.9159423484211542e-05, |
| "loss": 1.0382, |
| "num_tokens": 1375344605.0, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.8350459707950243, |
| "grad_norm": 0.7266484905494246, |
| "learning_rate": 1.9148786032803516e-05, |
| "loss": 1.0424, |
| "num_tokens": 1383221004.0, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.8393726338561385, |
| "grad_norm": 0.6760097186229455, |
| "learning_rate": 1.9138085035525088e-05, |
| "loss": 1.0329, |
| "num_tokens": 1391421045.0, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.8436992969172525, |
| "grad_norm": 0.5857244218105304, |
| "learning_rate": 1.912732057582479e-05, |
| "loss": 1.0063, |
| "num_tokens": 1399440266.0, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.8480259599783667, |
| "grad_norm": 0.8290327240335202, |
| "learning_rate": 1.9116492737646025e-05, |
| "loss": 1.0044, |
| "num_tokens": 1407440904.0, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.8523526230394808, |
| "grad_norm": 0.5055803198102621, |
| "learning_rate": 1.9105601605426464e-05, |
| "loss": 1.0017, |
| "num_tokens": 1415547487.0, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.856679286100595, |
| "grad_norm": 0.842860045337866, |
| "learning_rate": 1.909464726409734e-05, |
| "loss": 1.012, |
| "num_tokens": 1423784486.0, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.861005949161709, |
| "grad_norm": 0.654168133223008, |
| "learning_rate": 1.9083629799082806e-05, |
| "loss": 1.0282, |
| "num_tokens": 1432002900.0, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.8653326122228231, |
| "grad_norm": 0.6784775913175939, |
| "learning_rate": 1.9072549296299272e-05, |
| "loss": 1.0185, |
| "num_tokens": 1440057714.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.8696592752839373, |
| "grad_norm": 0.7106471735455494, |
| "learning_rate": 1.9061405842154716e-05, |
| "loss": 1.0492, |
| "num_tokens": 1448138846.0, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.8739859383450513, |
| "grad_norm": 0.6800123277515188, |
| "learning_rate": 1.9050199523548042e-05, |
| "loss": 1.0141, |
| "num_tokens": 1456485502.0, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.8783126014061655, |
| "grad_norm": 0.617362481570814, |
| "learning_rate": 1.9038930427868367e-05, |
| "loss": 1.0682, |
| "num_tokens": 1464589584.0, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.8826392644672796, |
| "grad_norm": 0.7859831645553701, |
| "learning_rate": 1.9027598642994357e-05, |
| "loss": 1.0467, |
| "num_tokens": 1472644084.0, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.8869659275283938, |
| "grad_norm": 1.247567866410136, |
| "learning_rate": 1.901620425729356e-05, |
| "loss": 1.0194, |
| "num_tokens": 1480866465.0, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.8912925905895078, |
| "grad_norm": 0.7038700676463598, |
| "learning_rate": 1.900474735962168e-05, |
| "loss": 1.0148, |
| "num_tokens": 1489135207.0, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.895619253650622, |
| "grad_norm": 1.0886191351908234, |
| "learning_rate": 1.89932280393219e-05, |
| "loss": 1.0368, |
| "num_tokens": 1497458419.0, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.8999459167117361, |
| "grad_norm": 0.7162950117168408, |
| "learning_rate": 1.8981646386224205e-05, |
| "loss": 1.014, |
| "num_tokens": 1505698225.0, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.9042725797728501, |
| "grad_norm": 1.343065307722291, |
| "learning_rate": 1.8970002490644643e-05, |
| "loss": 1.0351, |
| "num_tokens": 1513767920.0, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.9085992428339643, |
| "grad_norm": 1.0074287474494303, |
| "learning_rate": 1.8958296443384655e-05, |
| "loss": 1.0342, |
| "num_tokens": 1521949857.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.9129259058950784, |
| "grad_norm": 1.1963801705182988, |
| "learning_rate": 1.8946528335730344e-05, |
| "loss": 1.0122, |
| "num_tokens": 1530125689.0, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.9172525689561926, |
| "grad_norm": 0.9762540750082646, |
| "learning_rate": 1.8934698259451784e-05, |
| "loss": 0.9763, |
| "num_tokens": 1538076331.0, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.9215792320173066, |
| "grad_norm": 1.2050370011602503, |
| "learning_rate": 1.8922806306802283e-05, |
| "loss": 1.0114, |
| "num_tokens": 1546085841.0, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.9259058950784208, |
| "grad_norm": 1.019244393618009, |
| "learning_rate": 1.891085257051768e-05, |
| "loss": 1.0102, |
| "num_tokens": 1554250179.0, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.9302325581395349, |
| "grad_norm": 1.2220847331235862, |
| "learning_rate": 1.8898837143815604e-05, |
| "loss": 1.0606, |
| "num_tokens": 1562575332.0, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.934559221200649, |
| "grad_norm": 1.0804822558122213, |
| "learning_rate": 1.8886760120394774e-05, |
| "loss": 1.0086, |
| "num_tokens": 1570784477.0, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.9388858842617631, |
| "grad_norm": 1.0198301768654703, |
| "learning_rate": 1.8874621594434242e-05, |
| "loss": 1.0374, |
| "num_tokens": 1578962689.0, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.9432125473228772, |
| "grad_norm": 0.9362158917261221, |
| "learning_rate": 1.8862421660592673e-05, |
| "loss": 1.0284, |
| "num_tokens": 1587242415.0, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.9475392103839914, |
| "grad_norm": 0.8668622386683569, |
| "learning_rate": 1.8850160414007595e-05, |
| "loss": 1.0378, |
| "num_tokens": 1595320714.0, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.9518658734451054, |
| "grad_norm": 0.822217400307939, |
| "learning_rate": 1.883783795029468e-05, |
| "loss": 1.0299, |
| "num_tokens": 1603469822.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.9561925365062196, |
| "grad_norm": 0.753849423474381, |
| "learning_rate": 1.8825454365546974e-05, |
| "loss": 1.0207, |
| "num_tokens": 1611630522.0, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.9605191995673337, |
| "grad_norm": 0.8186514232729086, |
| "learning_rate": 1.8813009756334156e-05, |
| "loss": 1.0247, |
| "num_tokens": 1619824501.0, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.9648458626284478, |
| "grad_norm": 0.6595218775089154, |
| "learning_rate": 1.8800504219701788e-05, |
| "loss": 0.9922, |
| "num_tokens": 1628163466.0, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.9691725256895619, |
| "grad_norm": 0.7728060286552597, |
| "learning_rate": 1.8787937853170563e-05, |
| "loss": 1.0026, |
| "num_tokens": 1636389426.0, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.9734991887506761, |
| "grad_norm": 0.595600898984859, |
| "learning_rate": 1.8775310754735518e-05, |
| "loss": 1.0253, |
| "num_tokens": 1644683544.0, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.9778258518117902, |
| "grad_norm": 0.7038506695713876, |
| "learning_rate": 1.8762623022865317e-05, |
| "loss": 1.0104, |
| "num_tokens": 1652751148.0, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.9821525148729042, |
| "grad_norm": 0.5773805207875454, |
| "learning_rate": 1.874987475650144e-05, |
| "loss": 1.008, |
| "num_tokens": 1660831749.0, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.9864791779340184, |
| "grad_norm": 0.7144988810186003, |
| "learning_rate": 1.873706605505742e-05, |
| "loss": 1.0089, |
| "num_tokens": 1669068203.0, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.9908058409951325, |
| "grad_norm": 0.47273218493032554, |
| "learning_rate": 1.8724197018418092e-05, |
| "loss": 1.0212, |
| "num_tokens": 1677321958.0, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.9951325040562466, |
| "grad_norm": 0.8706008306058246, |
| "learning_rate": 1.8711267746938787e-05, |
| "loss": 1.0347, |
| "num_tokens": 1685530964.0, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.9994591671173607, |
| "grad_norm": 0.5393694224243536, |
| "learning_rate": 1.869827834144456e-05, |
| "loss": 1.0167, |
| "num_tokens": 1693877164.0, |
| "step": 231 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 1.0116059536968622, |
| "learning_rate": 1.8685228903229408e-05, |
| "loss": 0.9419, |
| "num_tokens": 1694884683.0, |
| "step": 232 |
| }, |
| { |
| "epoch": 1.004326663061114, |
| "grad_norm": 0.8523845557819635, |
| "learning_rate": 1.8672119534055465e-05, |
| "loss": 1.0281, |
| "num_tokens": 1703177073.0, |
| "step": 233 |
| }, |
| { |
| "epoch": 1.0086533261222281, |
| "grad_norm": 0.7971322678050575, |
| "learning_rate": 1.8658950336152227e-05, |
| "loss": 1.0409, |
| "num_tokens": 1711430466.0, |
| "step": 234 |
| }, |
| { |
| "epoch": 1.0129799891833424, |
| "grad_norm": 0.7054801324762776, |
| "learning_rate": 1.864572141221575e-05, |
| "loss": 1.0053, |
| "num_tokens": 1719439190.0, |
| "step": 235 |
| }, |
| { |
| "epoch": 1.0173066522444565, |
| "grad_norm": 0.6283763715716751, |
| "learning_rate": 1.8632432865407835e-05, |
| "loss": 0.9948, |
| "num_tokens": 1727609354.0, |
| "step": 236 |
| }, |
| { |
| "epoch": 1.0216333153055706, |
| "grad_norm": 0.8687854722145413, |
| "learning_rate": 1.861908479935524e-05, |
| "loss": 1.019, |
| "num_tokens": 1735661093.0, |
| "step": 237 |
| }, |
| { |
| "epoch": 1.0259599783666846, |
| "grad_norm": 0.5500318403146999, |
| "learning_rate": 1.8605677318148872e-05, |
| "loss": 1.0003, |
| "num_tokens": 1743819625.0, |
| "step": 238 |
| }, |
| { |
| "epoch": 1.030286641427799, |
| "grad_norm": 0.9311777237720954, |
| "learning_rate": 1.859221052634295e-05, |
| "loss": 1.0116, |
| "num_tokens": 1751877899.0, |
| "step": 239 |
| }, |
| { |
| "epoch": 1.034613304488913, |
| "grad_norm": 0.7175578463961261, |
| "learning_rate": 1.8578684528954232e-05, |
| "loss": 1.0081, |
| "num_tokens": 1760146337.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.038939967550027, |
| "grad_norm": 3.3107808035192896, |
| "learning_rate": 1.8565099431461158e-05, |
| "loss": 1.0055, |
| "num_tokens": 1768338684.0, |
| "step": 241 |
| }, |
| { |
| "epoch": 1.043266630611141, |
| "grad_norm": 1.5202129013625794, |
| "learning_rate": 1.8551455339803053e-05, |
| "loss": 1.0269, |
| "num_tokens": 1776491986.0, |
| "step": 242 |
| }, |
| { |
| "epoch": 1.0475932936722552, |
| "grad_norm": 3.0266554495559577, |
| "learning_rate": 1.8537752360379277e-05, |
| "loss": 1.0205, |
| "num_tokens": 1784795461.0, |
| "step": 243 |
| }, |
| { |
| "epoch": 1.0519199567333695, |
| "grad_norm": 1.0080337963121409, |
| "learning_rate": 1.852399060004842e-05, |
| "loss": 1.0164, |
| "num_tokens": 1792906059.0, |
| "step": 244 |
| }, |
| { |
| "epoch": 1.0562466197944835, |
| "grad_norm": 0.6874961714524145, |
| "learning_rate": 1.8510170166127453e-05, |
| "loss": 0.9905, |
| "num_tokens": 1801216289.0, |
| "step": 245 |
| }, |
| { |
| "epoch": 1.0605732828555976, |
| "grad_norm": 0.9732598709336382, |
| "learning_rate": 1.8496291166390898e-05, |
| "loss": 1.0108, |
| "num_tokens": 1809486577.0, |
| "step": 246 |
| }, |
| { |
| "epoch": 1.0648999459167117, |
| "grad_norm": 0.5494563319646898, |
| "learning_rate": 1.848235370906998e-05, |
| "loss": 0.9772, |
| "num_tokens": 1817790020.0, |
| "step": 247 |
| }, |
| { |
| "epoch": 1.069226608977826, |
| "grad_norm": 1.006676590805476, |
| "learning_rate": 1.8468357902851788e-05, |
| "loss": 1.0049, |
| "num_tokens": 1825857770.0, |
| "step": 248 |
| }, |
| { |
| "epoch": 1.07355327203894, |
| "grad_norm": 0.8087330733140408, |
| "learning_rate": 1.845430385687844e-05, |
| "loss": 0.9978, |
| "num_tokens": 1834035227.0, |
| "step": 249 |
| }, |
| { |
| "epoch": 1.077879935100054, |
| "grad_norm": 0.9239546334768601, |
| "learning_rate": 1.84401916807462e-05, |
| "loss": 1.0173, |
| "num_tokens": 1842218920.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.0822065981611682, |
| "grad_norm": 0.7441844314059644, |
| "learning_rate": 1.8426021484504655e-05, |
| "loss": 1.0218, |
| "num_tokens": 1850389037.0, |
| "step": 251 |
| }, |
| { |
| "epoch": 1.0865332612222822, |
| "grad_norm": 0.8917787308415308, |
| "learning_rate": 1.8411793378655847e-05, |
| "loss": 1.0076, |
| "num_tokens": 1858664333.0, |
| "step": 252 |
| }, |
| { |
| "epoch": 1.0908599242833965, |
| "grad_norm": 0.6628470852468289, |
| "learning_rate": 1.83975074741534e-05, |
| "loss": 1.0061, |
| "num_tokens": 1866687616.0, |
| "step": 253 |
| }, |
| { |
| "epoch": 1.0951865873445106, |
| "grad_norm": 0.9425059819695242, |
| "learning_rate": 1.8383163882401664e-05, |
| "loss": 1.0126, |
| "num_tokens": 1874927392.0, |
| "step": 254 |
| }, |
| { |
| "epoch": 1.0995132504056246, |
| "grad_norm": 0.8014508302359454, |
| "learning_rate": 1.836876271525485e-05, |
| "loss": 1.0231, |
| "num_tokens": 1883000858.0, |
| "step": 255 |
| }, |
| { |
| "epoch": 1.1038399134667387, |
| "grad_norm": 0.7398590141389113, |
| "learning_rate": 1.8354304085016157e-05, |
| "loss": 0.9832, |
| "num_tokens": 1891276371.0, |
| "step": 256 |
| }, |
| { |
| "epoch": 1.1081665765278528, |
| "grad_norm": 0.7402302271445659, |
| "learning_rate": 1.8339788104436886e-05, |
| "loss": 1.0052, |
| "num_tokens": 1899358130.0, |
| "step": 257 |
| }, |
| { |
| "epoch": 1.112493239588967, |
| "grad_norm": 0.658496295512364, |
| "learning_rate": 1.8325214886715567e-05, |
| "loss": 1.0036, |
| "num_tokens": 1907341426.0, |
| "step": 258 |
| }, |
| { |
| "epoch": 1.1168199026500811, |
| "grad_norm": 0.5716390513296978, |
| "learning_rate": 1.8310584545497075e-05, |
| "loss": 1.0275, |
| "num_tokens": 1915436031.0, |
| "step": 259 |
| }, |
| { |
| "epoch": 1.1211465657111952, |
| "grad_norm": 0.7032730355198555, |
| "learning_rate": 1.829589719487176e-05, |
| "loss": 0.9917, |
| "num_tokens": 1923730470.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.1254732287723093, |
| "grad_norm": 0.5511876551401437, |
| "learning_rate": 1.8281152949374527e-05, |
| "loss": 0.9666, |
| "num_tokens": 1931795232.0, |
| "step": 261 |
| }, |
| { |
| "epoch": 1.1297998918334236, |
| "grad_norm": 0.7898567891349134, |
| "learning_rate": 1.8266351923983967e-05, |
| "loss": 0.9967, |
| "num_tokens": 1939773367.0, |
| "step": 262 |
| }, |
| { |
| "epoch": 1.1341265548945376, |
| "grad_norm": 0.599615186489735, |
| "learning_rate": 1.8251494234121445e-05, |
| "loss": 0.9802, |
| "num_tokens": 1948032241.0, |
| "step": 263 |
| }, |
| { |
| "epoch": 1.1384532179556517, |
| "grad_norm": 0.8668923604915937, |
| "learning_rate": 1.823657999565021e-05, |
| "loss": 1.027, |
| "num_tokens": 1956343781.0, |
| "step": 264 |
| }, |
| { |
| "epoch": 1.1427798810167658, |
| "grad_norm": 0.6738644701502109, |
| "learning_rate": 1.8221609324874503e-05, |
| "loss": 0.9755, |
| "num_tokens": 1964501930.0, |
| "step": 265 |
| }, |
| { |
| "epoch": 1.14710654407788, |
| "grad_norm": 0.8489600222509309, |
| "learning_rate": 1.82065823385386e-05, |
| "loss": 1.0475, |
| "num_tokens": 1972762164.0, |
| "step": 266 |
| }, |
| { |
| "epoch": 1.151433207138994, |
| "grad_norm": 0.6976538570690681, |
| "learning_rate": 1.819149915382598e-05, |
| "loss": 1.0346, |
| "num_tokens": 1980766973.0, |
| "step": 267 |
| }, |
| { |
| "epoch": 1.1557598702001082, |
| "grad_norm": 0.6334291327020631, |
| "learning_rate": 1.8176359888358332e-05, |
| "loss": 0.9949, |
| "num_tokens": 1988984376.0, |
| "step": 268 |
| }, |
| { |
| "epoch": 1.1600865332612222, |
| "grad_norm": 0.7758411292456114, |
| "learning_rate": 1.8161164660194697e-05, |
| "loss": 1.0063, |
| "num_tokens": 1997243756.0, |
| "step": 269 |
| }, |
| { |
| "epoch": 1.1644131963223363, |
| "grad_norm": 0.53766380879933, |
| "learning_rate": 1.814591358783052e-05, |
| "loss": 0.9687, |
| "num_tokens": 2005281688.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.1687398593834506, |
| "grad_norm": 0.7298357532804184, |
| "learning_rate": 1.813060679019672e-05, |
| "loss": 1.0034, |
| "num_tokens": 2013514643.0, |
| "step": 271 |
| }, |
| { |
| "epoch": 1.1730665224445647, |
| "grad_norm": 0.5869951640518326, |
| "learning_rate": 1.811524438665878e-05, |
| "loss": 0.9717, |
| "num_tokens": 2021706426.0, |
| "step": 272 |
| }, |
| { |
| "epoch": 1.1773931855056787, |
| "grad_norm": 0.7409836437587911, |
| "learning_rate": 1.809982649701581e-05, |
| "loss": 1.0074, |
| "num_tokens": 2029841931.0, |
| "step": 273 |
| }, |
| { |
| "epoch": 1.1817198485667928, |
| "grad_norm": 0.5601651834188004, |
| "learning_rate": 1.808435324149961e-05, |
| "loss": 1.0175, |
| "num_tokens": 2038051382.0, |
| "step": 274 |
| }, |
| { |
| "epoch": 1.1860465116279069, |
| "grad_norm": 0.7059057420034878, |
| "learning_rate": 1.806882474077374e-05, |
| "loss": 0.9781, |
| "num_tokens": 2046194678.0, |
| "step": 275 |
| }, |
| { |
| "epoch": 1.1903731746890212, |
| "grad_norm": 0.5285941165253452, |
| "learning_rate": 1.805324111593256e-05, |
| "loss": 0.9934, |
| "num_tokens": 2054410246.0, |
| "step": 276 |
| }, |
| { |
| "epoch": 1.1946998377501352, |
| "grad_norm": 0.8160359988210635, |
| "learning_rate": 1.8037602488500313e-05, |
| "loss": 1.0225, |
| "num_tokens": 2062614982.0, |
| "step": 277 |
| }, |
| { |
| "epoch": 1.1990265008112493, |
| "grad_norm": 0.7592201740439128, |
| "learning_rate": 1.8021908980430153e-05, |
| "loss": 1.0009, |
| "num_tokens": 2070765170.0, |
| "step": 278 |
| }, |
| { |
| "epoch": 1.2033531638723634, |
| "grad_norm": 0.6963096899729393, |
| "learning_rate": 1.8006160714103213e-05, |
| "loss": 1.0061, |
| "num_tokens": 2078798709.0, |
| "step": 279 |
| }, |
| { |
| "epoch": 1.2076798269334776, |
| "grad_norm": 0.7659996635522403, |
| "learning_rate": 1.7990357812327634e-05, |
| "loss": 1.0477, |
| "num_tokens": 2087010586.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.2120064899945917, |
| "grad_norm": 0.6004295360408359, |
| "learning_rate": 1.797450039833762e-05, |
| "loss": 1.0005, |
| "num_tokens": 2095235843.0, |
| "step": 281 |
| }, |
| { |
| "epoch": 1.2163331530557058, |
| "grad_norm": 0.7399161134131866, |
| "learning_rate": 1.7958588595792467e-05, |
| "loss": 1.0077, |
| "num_tokens": 2103386607.0, |
| "step": 282 |
| }, |
| { |
| "epoch": 1.2206598161168198, |
| "grad_norm": 0.6508563400956765, |
| "learning_rate": 1.794262252877561e-05, |
| "loss": 0.9971, |
| "num_tokens": 2111487429.0, |
| "step": 283 |
| }, |
| { |
| "epoch": 1.2249864791779341, |
| "grad_norm": 0.6883886284438999, |
| "learning_rate": 1.7926602321793652e-05, |
| "loss": 0.9877, |
| "num_tokens": 2119602589.0, |
| "step": 284 |
| }, |
| { |
| "epoch": 1.2293131422390482, |
| "grad_norm": 0.7341240468720364, |
| "learning_rate": 1.791052809977538e-05, |
| "loss": 1.0029, |
| "num_tokens": 2127670452.0, |
| "step": 285 |
| }, |
| { |
| "epoch": 1.2336398053001623, |
| "grad_norm": 0.6043417685905373, |
| "learning_rate": 1.7894399988070804e-05, |
| "loss": 0.9953, |
| "num_tokens": 2135883917.0, |
| "step": 286 |
| }, |
| { |
| "epoch": 1.2379664683612763, |
| "grad_norm": 0.9246150340955609, |
| "learning_rate": 1.787821811245018e-05, |
| "loss": 0.9928, |
| "num_tokens": 2144047925.0, |
| "step": 287 |
| }, |
| { |
| "epoch": 1.2422931314223904, |
| "grad_norm": 0.5523779600689154, |
| "learning_rate": 1.7861982599103033e-05, |
| "loss": 1.0071, |
| "num_tokens": 2152384051.0, |
| "step": 288 |
| }, |
| { |
| "epoch": 1.2466197944835047, |
| "grad_norm": 0.8571329459319135, |
| "learning_rate": 1.7845693574637145e-05, |
| "loss": 1.0071, |
| "num_tokens": 2160699270.0, |
| "step": 289 |
| }, |
| { |
| "epoch": 1.2509464575446188, |
| "grad_norm": 0.7056191532296572, |
| "learning_rate": 1.7829351166077613e-05, |
| "loss": 1.0144, |
| "num_tokens": 2168902172.0, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.2552731206057328, |
| "grad_norm": 0.6503760866034937, |
| "learning_rate": 1.781295550086581e-05, |
| "loss": 1.0066, |
| "num_tokens": 2177239381.0, |
| "step": 291 |
| }, |
| { |
| "epoch": 1.2595997836668469, |
| "grad_norm": 0.7990458653208856, |
| "learning_rate": 1.779650670685843e-05, |
| "loss": 0.997, |
| "num_tokens": 2185441306.0, |
| "step": 292 |
| }, |
| { |
| "epoch": 1.263926446727961, |
| "grad_norm": 0.6149308243930688, |
| "learning_rate": 1.7780004912326482e-05, |
| "loss": 0.9778, |
| "num_tokens": 2193573128.0, |
| "step": 293 |
| }, |
| { |
| "epoch": 1.2682531097890752, |
| "grad_norm": 0.6225445687843602, |
| "learning_rate": 1.7763450245954265e-05, |
| "loss": 0.9604, |
| "num_tokens": 2201777204.0, |
| "step": 294 |
| }, |
| { |
| "epoch": 1.2725797728501893, |
| "grad_norm": 0.6292492547633257, |
| "learning_rate": 1.7746842836838397e-05, |
| "loss": 0.9954, |
| "num_tokens": 2210018927.0, |
| "step": 295 |
| }, |
| { |
| "epoch": 1.2769064359113034, |
| "grad_norm": 0.6516142728390708, |
| "learning_rate": 1.773018281448679e-05, |
| "loss": 0.9806, |
| "num_tokens": 2218100500.0, |
| "step": 296 |
| }, |
| { |
| "epoch": 1.2812330989724177, |
| "grad_norm": 0.5320881419249361, |
| "learning_rate": 1.7713470308817642e-05, |
| "loss": 1.0039, |
| "num_tokens": 2226355010.0, |
| "step": 297 |
| }, |
| { |
| "epoch": 1.2855597620335315, |
| "grad_norm": 0.5584190002591476, |
| "learning_rate": 1.769670545015843e-05, |
| "loss": 1.0101, |
| "num_tokens": 2234658122.0, |
| "step": 298 |
| }, |
| { |
| "epoch": 1.2898864250946458, |
| "grad_norm": 0.6678876958868184, |
| "learning_rate": 1.7679888369244895e-05, |
| "loss": 0.9922, |
| "num_tokens": 2242975637.0, |
| "step": 299 |
| }, |
| { |
| "epoch": 1.2942130881557599, |
| "grad_norm": 0.4558349860883781, |
| "learning_rate": 1.7663019197220003e-05, |
| "loss": 0.9782, |
| "num_tokens": 2251107863.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.298539751216874, |
| "grad_norm": 0.5253621106010484, |
| "learning_rate": 1.7646098065632956e-05, |
| "loss": 0.9693, |
| "num_tokens": 2259286425.0, |
| "step": 301 |
| }, |
| { |
| "epoch": 1.3028664142779882, |
| "grad_norm": 0.5270006972835457, |
| "learning_rate": 1.7629125106438132e-05, |
| "loss": 0.9963, |
| "num_tokens": 2267601260.0, |
| "step": 302 |
| }, |
| { |
| "epoch": 1.3071930773391023, |
| "grad_norm": 0.5164120144566925, |
| "learning_rate": 1.7612100451994077e-05, |
| "loss": 0.9976, |
| "num_tokens": 2275904161.0, |
| "step": 303 |
| }, |
| { |
| "epoch": 1.3115197404002163, |
| "grad_norm": 0.6543412137671887, |
| "learning_rate": 1.759502423506246e-05, |
| "loss": 0.9804, |
| "num_tokens": 2284244187.0, |
| "step": 304 |
| }, |
| { |
| "epoch": 1.3158464034613304, |
| "grad_norm": 0.5832832348747982, |
| "learning_rate": 1.7577896588807065e-05, |
| "loss": 1.0123, |
| "num_tokens": 2292398951.0, |
| "step": 305 |
| }, |
| { |
| "epoch": 1.3201730665224445, |
| "grad_norm": 0.5621179876214708, |
| "learning_rate": 1.7560717646792704e-05, |
| "loss": 1.003, |
| "num_tokens": 2300523004.0, |
| "step": 306 |
| }, |
| { |
| "epoch": 1.3244997295835588, |
| "grad_norm": 0.4864049233927397, |
| "learning_rate": 1.7543487542984227e-05, |
| "loss": 0.9805, |
| "num_tokens": 2308678897.0, |
| "step": 307 |
| }, |
| { |
| "epoch": 1.3288263926446728, |
| "grad_norm": 0.6631344821261796, |
| "learning_rate": 1.752620641174544e-05, |
| "loss": 1.0121, |
| "num_tokens": 2316917049.0, |
| "step": 308 |
| }, |
| { |
| "epoch": 1.333153055705787, |
| "grad_norm": 0.4724599093603475, |
| "learning_rate": 1.750887438783808e-05, |
| "loss": 1.0085, |
| "num_tokens": 2325186123.0, |
| "step": 309 |
| }, |
| { |
| "epoch": 1.337479718766901, |
| "grad_norm": 0.5689296032553274, |
| "learning_rate": 1.749149160642075e-05, |
| "loss": 0.978, |
| "num_tokens": 2333446937.0, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.341806381828015, |
| "grad_norm": 0.6132116514129226, |
| "learning_rate": 1.7474058203047863e-05, |
| "loss": 1.0103, |
| "num_tokens": 2341787109.0, |
| "step": 311 |
| }, |
| { |
| "epoch": 1.3461330448891293, |
| "grad_norm": 0.48324225857414066, |
| "learning_rate": 1.745657431366861e-05, |
| "loss": 1.0389, |
| "num_tokens": 2349822363.0, |
| "step": 312 |
| }, |
| { |
| "epoch": 1.3504597079502434, |
| "grad_norm": 0.6027402450479121, |
| "learning_rate": 1.743904007462587e-05, |
| "loss": 0.992, |
| "num_tokens": 2358047057.0, |
| "step": 313 |
| }, |
| { |
| "epoch": 1.3547863710113575, |
| "grad_norm": 0.5459463271112598, |
| "learning_rate": 1.742145562265516e-05, |
| "loss": 0.9789, |
| "num_tokens": 2366212866.0, |
| "step": 314 |
| }, |
| { |
| "epoch": 1.3591130340724715, |
| "grad_norm": 0.6005164633273204, |
| "learning_rate": 1.7403821094883572e-05, |
| "loss": 1.0129, |
| "num_tokens": 2374409556.0, |
| "step": 315 |
| }, |
| { |
| "epoch": 1.3634396971335856, |
| "grad_norm": 0.44348087107159345, |
| "learning_rate": 1.738613662882869e-05, |
| "loss": 0.9625, |
| "num_tokens": 2382456975.0, |
| "step": 316 |
| }, |
| { |
| "epoch": 1.3677663601946999, |
| "grad_norm": 0.5583508504708583, |
| "learning_rate": 1.7368402362397537e-05, |
| "loss": 0.9857, |
| "num_tokens": 2390530712.0, |
| "step": 317 |
| }, |
| { |
| "epoch": 1.372093023255814, |
| "grad_norm": 0.5361240200931625, |
| "learning_rate": 1.7350618433885487e-05, |
| "loss": 0.996, |
| "num_tokens": 2398702919.0, |
| "step": 318 |
| }, |
| { |
| "epoch": 1.376419686316928, |
| "grad_norm": 0.6419494735313418, |
| "learning_rate": 1.7332784981975183e-05, |
| "loss": 0.9783, |
| "num_tokens": 2406786796.0, |
| "step": 319 |
| }, |
| { |
| "epoch": 1.3807463493780423, |
| "grad_norm": 0.46442325392330475, |
| "learning_rate": 1.731490214573547e-05, |
| "loss": 0.9767, |
| "num_tokens": 2414957373.0, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.3850730124391564, |
| "grad_norm": 0.5504177189685487, |
| "learning_rate": 1.729697006462029e-05, |
| "loss": 0.9564, |
| "num_tokens": 2423238537.0, |
| "step": 321 |
| }, |
| { |
| "epoch": 1.3893996755002704, |
| "grad_norm": 0.5568430696730932, |
| "learning_rate": 1.7278988878467616e-05, |
| "loss": 0.9875, |
| "num_tokens": 2431242288.0, |
| "step": 322 |
| }, |
| { |
| "epoch": 1.3937263385613845, |
| "grad_norm": 0.6944929843782134, |
| "learning_rate": 1.7260958727498358e-05, |
| "loss": 0.9879, |
| "num_tokens": 2439410045.0, |
| "step": 323 |
| }, |
| { |
| "epoch": 1.3980530016224986, |
| "grad_norm": 0.43050663385999666, |
| "learning_rate": 1.7242879752315246e-05, |
| "loss": 1.0351, |
| "num_tokens": 2447761767.0, |
| "step": 324 |
| }, |
| { |
| "epoch": 1.4023796646836129, |
| "grad_norm": 0.5645224462691939, |
| "learning_rate": 1.722475209390176e-05, |
| "loss": 0.9672, |
| "num_tokens": 2455907686.0, |
| "step": 325 |
| }, |
| { |
| "epoch": 1.406706327744727, |
| "grad_norm": 0.5505917448888162, |
| "learning_rate": 1.720657589362103e-05, |
| "loss": 0.9999, |
| "num_tokens": 2464177214.0, |
| "step": 326 |
| }, |
| { |
| "epoch": 1.411032990805841, |
| "grad_norm": 0.6503856248771197, |
| "learning_rate": 1.7188351293214707e-05, |
| "loss": 0.9932, |
| "num_tokens": 2472423451.0, |
| "step": 327 |
| }, |
| { |
| "epoch": 1.415359653866955, |
| "grad_norm": 0.5177449838157887, |
| "learning_rate": 1.7170078434801893e-05, |
| "loss": 0.9814, |
| "num_tokens": 2480548139.0, |
| "step": 328 |
| }, |
| { |
| "epoch": 1.4196863169280691, |
| "grad_norm": 0.5428671196864796, |
| "learning_rate": 1.7151757460878006e-05, |
| "loss": 0.9839, |
| "num_tokens": 2488700875.0, |
| "step": 329 |
| }, |
| { |
| "epoch": 1.4240129799891834, |
| "grad_norm": 0.6795948596741072, |
| "learning_rate": 1.713338851431368e-05, |
| "loss": 0.9831, |
| "num_tokens": 2496833920.0, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.4283396430502975, |
| "grad_norm": 0.4956381502852352, |
| "learning_rate": 1.7114971738353652e-05, |
| "loss": 0.99, |
| "num_tokens": 2504957672.0, |
| "step": 331 |
| }, |
| { |
| "epoch": 1.4326663061114115, |
| "grad_norm": 0.5599271824993283, |
| "learning_rate": 1.7096507276615638e-05, |
| "loss": 0.9968, |
| "num_tokens": 2513212255.0, |
| "step": 332 |
| }, |
| { |
| "epoch": 1.4369929691725256, |
| "grad_norm": 0.5272083444046696, |
| "learning_rate": 1.707799527308922e-05, |
| "loss": 0.9881, |
| "num_tokens": 2521365508.0, |
| "step": 333 |
| }, |
| { |
| "epoch": 1.4413196322336397, |
| "grad_norm": 0.6114656981188148, |
| "learning_rate": 1.7059435872134725e-05, |
| "loss": 0.9658, |
| "num_tokens": 2529600061.0, |
| "step": 334 |
| }, |
| { |
| "epoch": 1.445646295294754, |
| "grad_norm": 0.5311777583844968, |
| "learning_rate": 1.7040829218482083e-05, |
| "loss": 1.0014, |
| "num_tokens": 2537565588.0, |
| "step": 335 |
| }, |
| { |
| "epoch": 1.449972958355868, |
| "grad_norm": 0.594295616323625, |
| "learning_rate": 1.7022175457229726e-05, |
| "loss": 0.9961, |
| "num_tokens": 2545669630.0, |
| "step": 336 |
| }, |
| { |
| "epoch": 1.454299621416982, |
| "grad_norm": 7.315914279837485, |
| "learning_rate": 1.7003474733843423e-05, |
| "loss": 1.0044, |
| "num_tokens": 2553827025.0, |
| "step": 337 |
| }, |
| { |
| "epoch": 1.4586262844780964, |
| "grad_norm": 0.9226883902311878, |
| "learning_rate": 1.6984727194155172e-05, |
| "loss": 0.9889, |
| "num_tokens": 2562035770.0, |
| "step": 338 |
| }, |
| { |
| "epoch": 1.4629529475392105, |
| "grad_norm": 0.4628985508669373, |
| "learning_rate": 1.696593298436206e-05, |
| "loss": 1.0136, |
| "num_tokens": 2570209243.0, |
| "step": 339 |
| }, |
| { |
| "epoch": 1.4672796106003245, |
| "grad_norm": 0.6513152028153515, |
| "learning_rate": 1.6947092251025103e-05, |
| "loss": 0.9955, |
| "num_tokens": 2578291736.0, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.4716062736614386, |
| "grad_norm": 0.5546903898287193, |
| "learning_rate": 1.6928205141068125e-05, |
| "loss": 1.0059, |
| "num_tokens": 2586425114.0, |
| "step": 341 |
| }, |
| { |
| "epoch": 1.4759329367225527, |
| "grad_norm": 0.6184393569733373, |
| "learning_rate": 1.690927180177661e-05, |
| "loss": 0.9803, |
| "num_tokens": 2594721102.0, |
| "step": 342 |
| }, |
| { |
| "epoch": 1.480259599783667, |
| "grad_norm": 0.7432953081745481, |
| "learning_rate": 1.6890292380796534e-05, |
| "loss": 0.9996, |
| "num_tokens": 2602895947.0, |
| "step": 343 |
| }, |
| { |
| "epoch": 1.484586262844781, |
| "grad_norm": 0.5083465411774949, |
| "learning_rate": 1.687126702613324e-05, |
| "loss": 0.9853, |
| "num_tokens": 2611136812.0, |
| "step": 344 |
| }, |
| { |
| "epoch": 1.488912925905895, |
| "grad_norm": 0.6819386036131501, |
| "learning_rate": 1.685219588615026e-05, |
| "loss": 1.012, |
| "num_tokens": 2619141613.0, |
| "step": 345 |
| }, |
| { |
| "epoch": 1.4932395889670091, |
| "grad_norm": 0.5767934097291644, |
| "learning_rate": 1.683307910956818e-05, |
| "loss": 0.977, |
| "num_tokens": 2627283279.0, |
| "step": 346 |
| }, |
| { |
| "epoch": 1.4975662520281232, |
| "grad_norm": 0.6396276200966129, |
| "learning_rate": 1.6813916845463462e-05, |
| "loss": 0.9982, |
| "num_tokens": 2635365385.0, |
| "step": 347 |
| }, |
| { |
| "epoch": 1.5018929150892375, |
| "grad_norm": 0.5492232920660722, |
| "learning_rate": 1.6794709243267288e-05, |
| "loss": 0.9719, |
| "num_tokens": 2643617216.0, |
| "step": 348 |
| }, |
| { |
| "epoch": 1.5062195781503516, |
| "grad_norm": 0.6330063608519101, |
| "learning_rate": 1.6775456452764398e-05, |
| "loss": 1.01, |
| "num_tokens": 2651803411.0, |
| "step": 349 |
| }, |
| { |
| "epoch": 1.5105462412114656, |
| "grad_norm": 0.45914817800081437, |
| "learning_rate": 1.6756158624091923e-05, |
| "loss": 0.9947, |
| "num_tokens": 2660138375.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.51487290427258, |
| "grad_norm": 0.6030719337908035, |
| "learning_rate": 1.673681590773821e-05, |
| "loss": 1.0027, |
| "num_tokens": 2668455705.0, |
| "step": 351 |
| }, |
| { |
| "epoch": 1.5191995673336938, |
| "grad_norm": 0.4539111994752465, |
| "learning_rate": 1.671742845454164e-05, |
| "loss": 0.9764, |
| "num_tokens": 2676612397.0, |
| "step": 352 |
| }, |
| { |
| "epoch": 1.523526230394808, |
| "grad_norm": 0.6279947858419637, |
| "learning_rate": 1.6697996415689473e-05, |
| "loss": 0.9975, |
| "num_tokens": 2684750468.0, |
| "step": 353 |
| }, |
| { |
| "epoch": 1.5278528934559221, |
| "grad_norm": 0.5588793031254649, |
| "learning_rate": 1.667851994271665e-05, |
| "loss": 0.9724, |
| "num_tokens": 2692847293.0, |
| "step": 354 |
| }, |
| { |
| "epoch": 1.5321795565170362, |
| "grad_norm": 0.6174071639009994, |
| "learning_rate": 1.6658999187504615e-05, |
| "loss": 0.9763, |
| "num_tokens": 2700888345.0, |
| "step": 355 |
| }, |
| { |
| "epoch": 1.5365062195781505, |
| "grad_norm": 0.44484731949205647, |
| "learning_rate": 1.6639434302280145e-05, |
| "loss": 0.962, |
| "num_tokens": 2709053608.0, |
| "step": 356 |
| }, |
| { |
| "epoch": 1.5408328826392643, |
| "grad_norm": 0.5714553401258878, |
| "learning_rate": 1.6619825439614143e-05, |
| "loss": 0.9919, |
| "num_tokens": 2717170050.0, |
| "step": 357 |
| }, |
| { |
| "epoch": 1.5451595457003786, |
| "grad_norm": 0.48460960422536814, |
| "learning_rate": 1.660017275242046e-05, |
| "loss": 0.9892, |
| "num_tokens": 2725229510.0, |
| "step": 358 |
| }, |
| { |
| "epoch": 1.5494862087614927, |
| "grad_norm": 0.5112248791874374, |
| "learning_rate": 1.6580476393954694e-05, |
| "loss": 0.9662, |
| "num_tokens": 2733428514.0, |
| "step": 359 |
| }, |
| { |
| "epoch": 1.5538128718226067, |
| "grad_norm": 0.4309041720695094, |
| "learning_rate": 1.6560736517813013e-05, |
| "loss": 0.9885, |
| "num_tokens": 2741589691.0, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.558139534883721, |
| "grad_norm": 0.553317003415503, |
| "learning_rate": 1.6540953277930925e-05, |
| "loss": 1.0059, |
| "num_tokens": 2749928247.0, |
| "step": 361 |
| }, |
| { |
| "epoch": 1.562466197944835, |
| "grad_norm": 0.5129455597122908, |
| "learning_rate": 1.6521126828582118e-05, |
| "loss": 0.9823, |
| "num_tokens": 2758121545.0, |
| "step": 362 |
| }, |
| { |
| "epoch": 1.5667928610059492, |
| "grad_norm": 0.5998715255176499, |
| "learning_rate": 1.6501257324377227e-05, |
| "loss": 1.0031, |
| "num_tokens": 2766372133.0, |
| "step": 363 |
| }, |
| { |
| "epoch": 1.5711195240670632, |
| "grad_norm": 0.38349746741335283, |
| "learning_rate": 1.648134492026263e-05, |
| "loss": 1.0038, |
| "num_tokens": 2774564035.0, |
| "step": 364 |
| }, |
| { |
| "epoch": 1.5754461871281773, |
| "grad_norm": 0.5607648982620999, |
| "learning_rate": 1.6461389771519263e-05, |
| "loss": 0.9557, |
| "num_tokens": 2782723296.0, |
| "step": 365 |
| }, |
| { |
| "epoch": 1.5797728501892916, |
| "grad_norm": 0.5745161983634366, |
| "learning_rate": 1.6441392033761378e-05, |
| "loss": 0.9981, |
| "num_tokens": 2790897244.0, |
| "step": 366 |
| }, |
| { |
| "epoch": 1.5840995132504057, |
| "grad_norm": 0.4868016408983241, |
| "learning_rate": 1.6421351862935348e-05, |
| "loss": 0.995, |
| "num_tokens": 2799161194.0, |
| "step": 367 |
| }, |
| { |
| "epoch": 1.5884261763115197, |
| "grad_norm": 0.6663144418274797, |
| "learning_rate": 1.6401269415318462e-05, |
| "loss": 0.9716, |
| "num_tokens": 2807099848.0, |
| "step": 368 |
| }, |
| { |
| "epoch": 1.592752839372634, |
| "grad_norm": 0.3999865533654534, |
| "learning_rate": 1.6381144847517672e-05, |
| "loss": 0.9727, |
| "num_tokens": 2815387290.0, |
| "step": 369 |
| }, |
| { |
| "epoch": 1.5970795024337479, |
| "grad_norm": 0.5902182552201716, |
| "learning_rate": 1.6360978316468404e-05, |
| "loss": 0.9756, |
| "num_tokens": 2823576095.0, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.6014061654948621, |
| "grad_norm": 0.4725880021279996, |
| "learning_rate": 1.6340769979433314e-05, |
| "loss": 0.9894, |
| "num_tokens": 2831697837.0, |
| "step": 371 |
| }, |
| { |
| "epoch": 1.6057328285559762, |
| "grad_norm": 0.5618830540568662, |
| "learning_rate": 1.632051999400108e-05, |
| "loss": 0.9769, |
| "num_tokens": 2839950847.0, |
| "step": 372 |
| }, |
| { |
| "epoch": 1.6100594916170903, |
| "grad_norm": 0.49481337447529106, |
| "learning_rate": 1.6300228518085148e-05, |
| "loss": 0.9892, |
| "num_tokens": 2848152749.0, |
| "step": 373 |
| }, |
| { |
| "epoch": 1.6143861546782046, |
| "grad_norm": 0.5629300935165885, |
| "learning_rate": 1.6279895709922534e-05, |
| "loss": 0.9968, |
| "num_tokens": 2856267040.0, |
| "step": 374 |
| }, |
| { |
| "epoch": 1.6187128177393184, |
| "grad_norm": 0.5321521440430792, |
| "learning_rate": 1.625952172807255e-05, |
| "loss": 0.9772, |
| "num_tokens": 2864487220.0, |
| "step": 375 |
| }, |
| { |
| "epoch": 1.6230394808004327, |
| "grad_norm": 0.4621380145986486, |
| "learning_rate": 1.6239106731415604e-05, |
| "loss": 0.9616, |
| "num_tokens": 2872722633.0, |
| "step": 376 |
| }, |
| { |
| "epoch": 1.6273661438615468, |
| "grad_norm": 0.5566861055001202, |
| "learning_rate": 1.6218650879151946e-05, |
| "loss": 0.992, |
| "num_tokens": 2880964813.0, |
| "step": 377 |
| }, |
| { |
| "epoch": 1.6316928069226608, |
| "grad_norm": 0.5691044633846349, |
| "learning_rate": 1.6198154330800408e-05, |
| "loss": 0.9852, |
| "num_tokens": 2889076459.0, |
| "step": 378 |
| }, |
| { |
| "epoch": 1.6360194699837751, |
| "grad_norm": 0.4585342412787512, |
| "learning_rate": 1.6177617246197206e-05, |
| "loss": 1.0049, |
| "num_tokens": 2897211346.0, |
| "step": 379 |
| }, |
| { |
| "epoch": 1.640346133044889, |
| "grad_norm": 0.5392035073552278, |
| "learning_rate": 1.615703978549464e-05, |
| "loss": 0.956, |
| "num_tokens": 2905280542.0, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.6446727961060033, |
| "grad_norm": 0.5178806174754045, |
| "learning_rate": 1.6136422109159887e-05, |
| "loss": 0.9749, |
| "num_tokens": 2913458269.0, |
| "step": 381 |
| }, |
| { |
| "epoch": 1.6489994591671173, |
| "grad_norm": 0.46877312760108464, |
| "learning_rate": 1.611576437797373e-05, |
| "loss": 0.9773, |
| "num_tokens": 2921615325.0, |
| "step": 382 |
| }, |
| { |
| "epoch": 1.6533261222282314, |
| "grad_norm": 0.5155483807676432, |
| "learning_rate": 1.60950667530293e-05, |
| "loss": 0.9955, |
| "num_tokens": 2929887058.0, |
| "step": 383 |
| }, |
| { |
| "epoch": 1.6576527852893457, |
| "grad_norm": 0.5072802808973179, |
| "learning_rate": 1.607432939573084e-05, |
| "loss": 0.962, |
| "num_tokens": 2938013416.0, |
| "step": 384 |
| }, |
| { |
| "epoch": 1.6619794483504597, |
| "grad_norm": 0.5129210712159445, |
| "learning_rate": 1.605355246779243e-05, |
| "loss": 1.0206, |
| "num_tokens": 2946138869.0, |
| "step": 385 |
| }, |
| { |
| "epoch": 1.6663061114115738, |
| "grad_norm": 0.5839938970559614, |
| "learning_rate": 1.6032736131236722e-05, |
| "loss": 0.9575, |
| "num_tokens": 2954330909.0, |
| "step": 386 |
| }, |
| { |
| "epoch": 1.670632774472688, |
| "grad_norm": 0.5339695370345615, |
| "learning_rate": 1.6011880548393694e-05, |
| "loss": 0.9475, |
| "num_tokens": 2962652473.0, |
| "step": 387 |
| }, |
| { |
| "epoch": 1.674959437533802, |
| "grad_norm": 0.6181152846819087, |
| "learning_rate": 1.5990985881899367e-05, |
| "loss": 0.9765, |
| "num_tokens": 2970856021.0, |
| "step": 388 |
| }, |
| { |
| "epoch": 1.6792861005949162, |
| "grad_norm": 0.4441388775819552, |
| "learning_rate": 1.597005229469455e-05, |
| "loss": 0.9635, |
| "num_tokens": 2978996470.0, |
| "step": 389 |
| }, |
| { |
| "epoch": 1.6836127636560303, |
| "grad_norm": 0.7034052637679556, |
| "learning_rate": 1.594907995002356e-05, |
| "loss": 0.9745, |
| "num_tokens": 2987205522.0, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.6879394267171444, |
| "grad_norm": 0.4820716961447039, |
| "learning_rate": 1.5928069011432955e-05, |
| "loss": 0.9565, |
| "num_tokens": 2995251335.0, |
| "step": 391 |
| }, |
| { |
| "epoch": 1.6922660897782587, |
| "grad_norm": 0.6096635073457451, |
| "learning_rate": 1.590701964277025e-05, |
| "loss": 0.989, |
| "num_tokens": 3003455824.0, |
| "step": 392 |
| }, |
| { |
| "epoch": 1.6965927528393725, |
| "grad_norm": 0.6098082087418374, |
| "learning_rate": 1.588593200818266e-05, |
| "loss": 0.9852, |
| "num_tokens": 3011809584.0, |
| "step": 393 |
| }, |
| { |
| "epoch": 1.7009194159004868, |
| "grad_norm": 0.706589588823252, |
| "learning_rate": 1.5864806272115786e-05, |
| "loss": 0.9742, |
| "num_tokens": 3020038193.0, |
| "step": 394 |
| }, |
| { |
| "epoch": 1.7052460789616009, |
| "grad_norm": 0.5332508357896025, |
| "learning_rate": 1.5843642599312365e-05, |
| "loss": 0.9658, |
| "num_tokens": 3028247851.0, |
| "step": 395 |
| }, |
| { |
| "epoch": 1.709572742022715, |
| "grad_norm": 0.5967126870935362, |
| "learning_rate": 1.582244115481097e-05, |
| "loss": 0.9787, |
| "num_tokens": 3036393672.0, |
| "step": 396 |
| }, |
| { |
| "epoch": 1.7138994050838292, |
| "grad_norm": 0.5618945234242413, |
| "learning_rate": 1.5801202103944725e-05, |
| "loss": 0.9716, |
| "num_tokens": 3044494253.0, |
| "step": 397 |
| }, |
| { |
| "epoch": 1.718226068144943, |
| "grad_norm": 0.6427240201436156, |
| "learning_rate": 1.577992561234001e-05, |
| "loss": 0.9905, |
| "num_tokens": 3052731889.0, |
| "step": 398 |
| }, |
| { |
| "epoch": 1.7225527312060573, |
| "grad_norm": 0.45664767030415754, |
| "learning_rate": 1.5758611845915188e-05, |
| "loss": 0.9914, |
| "num_tokens": 3060830320.0, |
| "step": 399 |
| }, |
| { |
| "epoch": 1.7268793942671714, |
| "grad_norm": 0.5680532472735643, |
| "learning_rate": 1.573726097087928e-05, |
| "loss": 0.9912, |
| "num_tokens": 3068973700.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.7312060573282855, |
| "grad_norm": 0.5644498364761277, |
| "learning_rate": 1.5715873153730713e-05, |
| "loss": 0.9665, |
| "num_tokens": 3077224259.0, |
| "step": 401 |
| }, |
| { |
| "epoch": 1.7355327203893998, |
| "grad_norm": 0.5014411330180275, |
| "learning_rate": 1.5694448561255972e-05, |
| "loss": 0.9652, |
| "num_tokens": 3085594237.0, |
| "step": 402 |
| }, |
| { |
| "epoch": 1.7398593834505138, |
| "grad_norm": 0.41568492870972895, |
| "learning_rate": 1.5672987360528334e-05, |
| "loss": 0.9553, |
| "num_tokens": 3093879360.0, |
| "step": 403 |
| }, |
| { |
| "epoch": 1.744186046511628, |
| "grad_norm": 0.5967844492183373, |
| "learning_rate": 1.5651489718906553e-05, |
| "loss": 0.9832, |
| "num_tokens": 3102164647.0, |
| "step": 404 |
| }, |
| { |
| "epoch": 1.7485127095727422, |
| "grad_norm": 0.5543655378915199, |
| "learning_rate": 1.5629955804033558e-05, |
| "loss": 0.9806, |
| "num_tokens": 3110263161.0, |
| "step": 405 |
| }, |
| { |
| "epoch": 1.752839372633856, |
| "grad_norm": 0.4435918962836691, |
| "learning_rate": 1.5608385783835145e-05, |
| "loss": 0.9604, |
| "num_tokens": 3118470439.0, |
| "step": 406 |
| }, |
| { |
| "epoch": 1.7571660356949703, |
| "grad_norm": 0.5005651315002034, |
| "learning_rate": 1.558677982651866e-05, |
| "loss": 0.9688, |
| "num_tokens": 3126716914.0, |
| "step": 407 |
| }, |
| { |
| "epoch": 1.7614926987560844, |
| "grad_norm": 4.391443015936278, |
| "learning_rate": 1.5565138100571703e-05, |
| "loss": 0.9629, |
| "num_tokens": 3134906980.0, |
| "step": 408 |
| }, |
| { |
| "epoch": 1.7658193618171985, |
| "grad_norm": 0.9040714523990061, |
| "learning_rate": 1.5543460774760798e-05, |
| "loss": 0.981, |
| "num_tokens": 3143124349.0, |
| "step": 409 |
| }, |
| { |
| "epoch": 1.7701460248783127, |
| "grad_norm": 0.43080204624761487, |
| "learning_rate": 1.5521748018130082e-05, |
| "loss": 1.0155, |
| "num_tokens": 3151490545.0, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.7744726879394266, |
| "grad_norm": 0.763386547386101, |
| "learning_rate": 1.55e-05, |
| "loss": 1.0146, |
| "num_tokens": 3159636609.0, |
| "step": 411 |
| }, |
| { |
| "epoch": 1.7787993510005409, |
| "grad_norm": 0.6119279005971348, |
| "learning_rate": 1.5478216889965965e-05, |
| "loss": 0.9982, |
| "num_tokens": 3167775862.0, |
| "step": 412 |
| }, |
| { |
| "epoch": 1.783126014061655, |
| "grad_norm": 0.5651533282088765, |
| "learning_rate": 1.545639885789704e-05, |
| "loss": 0.9818, |
| "num_tokens": 3176055047.0, |
| "step": 413 |
| }, |
| { |
| "epoch": 1.787452677122769, |
| "grad_norm": 0.7405859257267926, |
| "learning_rate": 1.5434546073934625e-05, |
| "loss": 0.9995, |
| "num_tokens": 3184104527.0, |
| "step": 414 |
| }, |
| { |
| "epoch": 1.7917793401838833, |
| "grad_norm": 0.5233140162738117, |
| "learning_rate": 1.541265870849112e-05, |
| "loss": 0.9852, |
| "num_tokens": 3192240853.0, |
| "step": 415 |
| }, |
| { |
| "epoch": 1.7961060032449971, |
| "grad_norm": 0.7809493822032231, |
| "learning_rate": 1.5390736932248595e-05, |
| "loss": 0.9343, |
| "num_tokens": 3200252152.0, |
| "step": 416 |
| }, |
| { |
| "epoch": 1.8004326663061114, |
| "grad_norm": 0.5756707174156201, |
| "learning_rate": 1.5368780916157466e-05, |
| "loss": 0.9817, |
| "num_tokens": 3208314962.0, |
| "step": 417 |
| }, |
| { |
| "epoch": 1.8047593293672255, |
| "grad_norm": 0.8443572669557455, |
| "learning_rate": 1.5346790831435157e-05, |
| "loss": 1.0031, |
| "num_tokens": 3216354701.0, |
| "step": 418 |
| }, |
| { |
| "epoch": 1.8090859924283396, |
| "grad_norm": 0.7330225783691384, |
| "learning_rate": 1.5324766849564766e-05, |
| "loss": 0.9701, |
| "num_tokens": 3224541175.0, |
| "step": 419 |
| }, |
| { |
| "epoch": 1.8134126554894539, |
| "grad_norm": 0.8597981617306653, |
| "learning_rate": 1.5302709142293732e-05, |
| "loss": 1.003, |
| "num_tokens": 3232690124.0, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.817739318550568, |
| "grad_norm": 0.7597629364718675, |
| "learning_rate": 1.528061788163248e-05, |
| "loss": 0.9418, |
| "num_tokens": 3240925095.0, |
| "step": 421 |
| }, |
| { |
| "epoch": 1.822065981611682, |
| "grad_norm": 0.7515345846987643, |
| "learning_rate": 1.52584932398531e-05, |
| "loss": 0.9852, |
| "num_tokens": 3249017852.0, |
| "step": 422 |
| }, |
| { |
| "epoch": 1.826392644672796, |
| "grad_norm": 0.6736337890470313, |
| "learning_rate": 1.5236335389487997e-05, |
| "loss": 0.937, |
| "num_tokens": 3257064643.0, |
| "step": 423 |
| }, |
| { |
| "epoch": 1.8307193077339101, |
| "grad_norm": 0.6249241728233079, |
| "learning_rate": 1.5214144503328532e-05, |
| "loss": 0.9513, |
| "num_tokens": 3265289767.0, |
| "step": 424 |
| }, |
| { |
| "epoch": 1.8350459707950244, |
| "grad_norm": 0.6298196557489565, |
| "learning_rate": 1.5191920754423698e-05, |
| "loss": 0.9642, |
| "num_tokens": 3273615215.0, |
| "step": 425 |
| }, |
| { |
| "epoch": 1.8393726338561385, |
| "grad_norm": 0.5745220636985442, |
| "learning_rate": 1.5169664316078758e-05, |
| "loss": 0.9586, |
| "num_tokens": 3281703406.0, |
| "step": 426 |
| }, |
| { |
| "epoch": 1.8436992969172525, |
| "grad_norm": 0.5956954636450505, |
| "learning_rate": 1.514737536185388e-05, |
| "loss": 1.0039, |
| "num_tokens": 3289933861.0, |
| "step": 427 |
| }, |
| { |
| "epoch": 1.8480259599783668, |
| "grad_norm": 0.5109096269242125, |
| "learning_rate": 1.512505406556281e-05, |
| "loss": 0.9607, |
| "num_tokens": 3298102935.0, |
| "step": 428 |
| }, |
| { |
| "epoch": 1.8523526230394807, |
| "grad_norm": 0.6631068486693279, |
| "learning_rate": 1.5102700601271503e-05, |
| "loss": 0.9675, |
| "num_tokens": 3306272221.0, |
| "step": 429 |
| }, |
| { |
| "epoch": 1.856679286100595, |
| "grad_norm": 0.4409566460742665, |
| "learning_rate": 1.5080315143296758e-05, |
| "loss": 0.9777, |
| "num_tokens": 3314431095.0, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.861005949161709, |
| "grad_norm": 0.8262060971600915, |
| "learning_rate": 1.5057897866204878e-05, |
| "loss": 0.9601, |
| "num_tokens": 3322663426.0, |
| "step": 431 |
| }, |
| { |
| "epoch": 1.865332612222823, |
| "grad_norm": 0.5726708983593121, |
| "learning_rate": 1.5035448944810293e-05, |
| "loss": 0.9925, |
| "num_tokens": 3330865224.0, |
| "step": 432 |
| }, |
| { |
| "epoch": 1.8696592752839374, |
| "grad_norm": 0.8747512008129081, |
| "learning_rate": 1.5012968554174198e-05, |
| "loss": 0.9698, |
| "num_tokens": 3339107321.0, |
| "step": 433 |
| }, |
| { |
| "epoch": 1.8739859383450512, |
| "grad_norm": 0.7598171132270467, |
| "learning_rate": 1.4990456869603193e-05, |
| "loss": 0.9853, |
| "num_tokens": 3347289915.0, |
| "step": 434 |
| }, |
| { |
| "epoch": 1.8783126014061655, |
| "grad_norm": 0.7035073136630965, |
| "learning_rate": 1.4967914066647928e-05, |
| "loss": 0.9693, |
| "num_tokens": 3355448024.0, |
| "step": 435 |
| }, |
| { |
| "epoch": 1.8826392644672796, |
| "grad_norm": 0.734295536740874, |
| "learning_rate": 1.4945340321101698e-05, |
| "loss": 0.9864, |
| "num_tokens": 3363675135.0, |
| "step": 436 |
| }, |
| { |
| "epoch": 1.8869659275283936, |
| "grad_norm": 0.5988812777750793, |
| "learning_rate": 1.4922735808999107e-05, |
| "loss": 0.9649, |
| "num_tokens": 3371726378.0, |
| "step": 437 |
| }, |
| { |
| "epoch": 1.891292590589508, |
| "grad_norm": 0.8750638771397959, |
| "learning_rate": 1.4900100706614686e-05, |
| "loss": 0.9901, |
| "num_tokens": 3379870297.0, |
| "step": 438 |
| }, |
| { |
| "epoch": 1.895619253650622, |
| "grad_norm": 0.5808382556577097, |
| "learning_rate": 1.4877435190461506e-05, |
| "loss": 0.9744, |
| "num_tokens": 3388200725.0, |
| "step": 439 |
| }, |
| { |
| "epoch": 1.899945916711736, |
| "grad_norm": 0.9363462985119361, |
| "learning_rate": 1.4854739437289814e-05, |
| "loss": 0.9861, |
| "num_tokens": 3396309659.0, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.9042725797728501, |
| "grad_norm": 0.7948490424737458, |
| "learning_rate": 1.4832013624085654e-05, |
| "loss": 0.9837, |
| "num_tokens": 3404510694.0, |
| "step": 441 |
| }, |
| { |
| "epoch": 1.9085992428339642, |
| "grad_norm": 0.8980882451384175, |
| "learning_rate": 1.4809257928069487e-05, |
| "loss": 1.008, |
| "num_tokens": 3412645391.0, |
| "step": 442 |
| }, |
| { |
| "epoch": 1.9129259058950785, |
| "grad_norm": 0.7040381990057594, |
| "learning_rate": 1.4786472526694795e-05, |
| "loss": 0.9592, |
| "num_tokens": 3420985745.0, |
| "step": 443 |
| }, |
| { |
| "epoch": 1.9172525689561926, |
| "grad_norm": 0.8991768615059288, |
| "learning_rate": 1.4763657597646713e-05, |
| "loss": 0.9968, |
| "num_tokens": 3429248541.0, |
| "step": 444 |
| }, |
| { |
| "epoch": 1.9215792320173066, |
| "grad_norm": 0.6633916419975918, |
| "learning_rate": 1.4740813318840652e-05, |
| "loss": 0.9784, |
| "num_tokens": 3437348661.0, |
| "step": 445 |
| }, |
| { |
| "epoch": 1.925905895078421, |
| "grad_norm": 0.8965515298346237, |
| "learning_rate": 1.4717939868420878e-05, |
| "loss": 0.9531, |
| "num_tokens": 3445445221.0, |
| "step": 446 |
| }, |
| { |
| "epoch": 1.9302325581395348, |
| "grad_norm": 0.6871072191147697, |
| "learning_rate": 1.4695037424759153e-05, |
| "loss": 0.9747, |
| "num_tokens": 3453746865.0, |
| "step": 447 |
| }, |
| { |
| "epoch": 1.934559221200649, |
| "grad_norm": 0.9238825067701328, |
| "learning_rate": 1.4672106166453337e-05, |
| "loss": 0.977, |
| "num_tokens": 3461909596.0, |
| "step": 448 |
| }, |
| { |
| "epoch": 1.9388858842617631, |
| "grad_norm": 0.7490553984675871, |
| "learning_rate": 1.4649146272325984e-05, |
| "loss": 0.9329, |
| "num_tokens": 3470039928.0, |
| "step": 449 |
| }, |
| { |
| "epoch": 1.9432125473228772, |
| "grad_norm": 0.8854672895396621, |
| "learning_rate": 1.4626157921422965e-05, |
| "loss": 0.9569, |
| "num_tokens": 3478190617.0, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.9475392103839915, |
| "grad_norm": 0.6923182092043201, |
| "learning_rate": 1.4603141293012057e-05, |
| "loss": 0.9862, |
| "num_tokens": 3486403151.0, |
| "step": 451 |
| }, |
| { |
| "epoch": 1.9518658734451053, |
| "grad_norm": 0.8231453812124571, |
| "learning_rate": 1.458009656658155e-05, |
| "loss": 0.9803, |
| "num_tokens": 3494717203.0, |
| "step": 452 |
| }, |
| { |
| "epoch": 1.9561925365062196, |
| "grad_norm": 0.6255900621201269, |
| "learning_rate": 1.4557023921838851e-05, |
| "loss": 0.9653, |
| "num_tokens": 3502823483.0, |
| "step": 453 |
| }, |
| { |
| "epoch": 1.9605191995673337, |
| "grad_norm": 0.8440131825774589, |
| "learning_rate": 1.4533923538709076e-05, |
| "loss": 1.0024, |
| "num_tokens": 3510882681.0, |
| "step": 454 |
| }, |
| { |
| "epoch": 1.9648458626284477, |
| "grad_norm": 0.7692850246575329, |
| "learning_rate": 1.4510795597333658e-05, |
| "loss": 0.9875, |
| "num_tokens": 3519029644.0, |
| "step": 455 |
| }, |
| { |
| "epoch": 1.969172525689562, |
| "grad_norm": 0.7640584934065241, |
| "learning_rate": 1.4487640278068929e-05, |
| "loss": 0.9599, |
| "num_tokens": 3527262793.0, |
| "step": 456 |
| }, |
| { |
| "epoch": 1.973499188750676, |
| "grad_norm": 0.6469547825051354, |
| "learning_rate": 1.4464457761484716e-05, |
| "loss": 0.9485, |
| "num_tokens": 3535520575.0, |
| "step": 457 |
| }, |
| { |
| "epoch": 1.9778258518117902, |
| "grad_norm": 0.7154622216746662, |
| "learning_rate": 1.4441248228362943e-05, |
| "loss": 0.9432, |
| "num_tokens": 3543566968.0, |
| "step": 458 |
| }, |
| { |
| "epoch": 1.9821525148729042, |
| "grad_norm": 0.5753194167751761, |
| "learning_rate": 1.4418011859696213e-05, |
| "loss": 0.9916, |
| "num_tokens": 3551798760.0, |
| "step": 459 |
| }, |
| { |
| "epoch": 1.9864791779340183, |
| "grad_norm": 0.820941286931545, |
| "learning_rate": 1.4394748836686392e-05, |
| "loss": 0.981, |
| "num_tokens": 3559961473.0, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.9908058409951326, |
| "grad_norm": 0.6759926818924319, |
| "learning_rate": 1.437145934074321e-05, |
| "loss": 0.9613, |
| "num_tokens": 3568259851.0, |
| "step": 461 |
| }, |
| { |
| "epoch": 1.9951325040562466, |
| "grad_norm": 0.7636814498744505, |
| "learning_rate": 1.4348143553482834e-05, |
| "loss": 0.9542, |
| "num_tokens": 3576256828.0, |
| "step": 462 |
| }, |
| { |
| "epoch": 1.9994591671173607, |
| "grad_norm": 0.6253415385490834, |
| "learning_rate": 1.4324801656726457e-05, |
| "loss": 1.0001, |
| "num_tokens": 3584445935.0, |
| "step": 463 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.8654452684362129, |
| "learning_rate": 1.4301433832498879e-05, |
| "loss": 1.0153, |
| "num_tokens": 3585494269.0, |
| "step": 464 |
| }, |
| { |
| "epoch": 2.0043266630611143, |
| "grad_norm": 0.7804102878180353, |
| "learning_rate": 1.4278040263027087e-05, |
| "loss": 0.9711, |
| "num_tokens": 3593559452.0, |
| "step": 465 |
| }, |
| { |
| "epoch": 2.008653326122228, |
| "grad_norm": 0.616918692608059, |
| "learning_rate": 1.425462113073883e-05, |
| "loss": 0.9421, |
| "num_tokens": 3601663035.0, |
| "step": 466 |
| }, |
| { |
| "epoch": 2.0129799891833424, |
| "grad_norm": 0.6370584091299962, |
| "learning_rate": 1.4231176618261218e-05, |
| "loss": 0.9627, |
| "num_tokens": 3609794842.0, |
| "step": 467 |
| }, |
| { |
| "epoch": 2.0173066522444563, |
| "grad_norm": 0.5439959676935536, |
| "learning_rate": 1.4207706908419257e-05, |
| "loss": 0.9312, |
| "num_tokens": 3618030710.0, |
| "step": 468 |
| }, |
| { |
| "epoch": 2.0216333153055706, |
| "grad_norm": 0.8536853956647672, |
| "learning_rate": 1.4184212184234465e-05, |
| "loss": 0.9631, |
| "num_tokens": 3626167487.0, |
| "step": 469 |
| }, |
| { |
| "epoch": 2.025959978366685, |
| "grad_norm": 0.5640997140033263, |
| "learning_rate": 1.416069262892342e-05, |
| "loss": 0.978, |
| "num_tokens": 3634141707.0, |
| "step": 470 |
| }, |
| { |
| "epoch": 2.0302866414277987, |
| "grad_norm": 0.7089110329625673, |
| "learning_rate": 1.4137148425896338e-05, |
| "loss": 0.9813, |
| "num_tokens": 3642347731.0, |
| "step": 471 |
| }, |
| { |
| "epoch": 2.034613304488913, |
| "grad_norm": 0.5076945792366204, |
| "learning_rate": 1.4113579758755645e-05, |
| "loss": 0.9475, |
| "num_tokens": 3650536256.0, |
| "step": 472 |
| }, |
| { |
| "epoch": 2.038939967550027, |
| "grad_norm": 0.7641036202893745, |
| "learning_rate": 1.4089986811294537e-05, |
| "loss": 0.9695, |
| "num_tokens": 3658660767.0, |
| "step": 473 |
| }, |
| { |
| "epoch": 2.043266630611141, |
| "grad_norm": 0.6084043720101941, |
| "learning_rate": 1.4066369767495567e-05, |
| "loss": 0.953, |
| "num_tokens": 3666730262.0, |
| "step": 474 |
| }, |
| { |
| "epoch": 2.0475932936722554, |
| "grad_norm": 0.6900255247384617, |
| "learning_rate": 1.4042728811529175e-05, |
| "loss": 0.9338, |
| "num_tokens": 3674874052.0, |
| "step": 475 |
| }, |
| { |
| "epoch": 2.0519199567333692, |
| "grad_norm": 0.6673406272001278, |
| "learning_rate": 1.4019064127752298e-05, |
| "loss": 0.9583, |
| "num_tokens": 3682866698.0, |
| "step": 476 |
| }, |
| { |
| "epoch": 2.0562466197944835, |
| "grad_norm": 0.5619696739721406, |
| "learning_rate": 1.399537590070688e-05, |
| "loss": 0.9444, |
| "num_tokens": 3691093555.0, |
| "step": 477 |
| }, |
| { |
| "epoch": 2.060573282855598, |
| "grad_norm": 0.5401104501685228, |
| "learning_rate": 1.3971664315118483e-05, |
| "loss": 0.9158, |
| "num_tokens": 3699284128.0, |
| "step": 478 |
| }, |
| { |
| "epoch": 2.0648999459167117, |
| "grad_norm": 0.6082865072416918, |
| "learning_rate": 1.3947929555894813e-05, |
| "loss": 0.9866, |
| "num_tokens": 3707572868.0, |
| "step": 479 |
| }, |
| { |
| "epoch": 2.069226608977826, |
| "grad_norm": 0.5219700556096115, |
| "learning_rate": 1.392417180812429e-05, |
| "loss": 0.925, |
| "num_tokens": 3715837638.0, |
| "step": 480 |
| }, |
| { |
| "epoch": 2.07355327203894, |
| "grad_norm": 0.5818687691162563, |
| "learning_rate": 1.3900391257074601e-05, |
| "loss": 0.9313, |
| "num_tokens": 3723956355.0, |
| "step": 481 |
| }, |
| { |
| "epoch": 2.077879935100054, |
| "grad_norm": 0.6243227814666679, |
| "learning_rate": 1.3876588088191264e-05, |
| "loss": 0.9702, |
| "num_tokens": 3732169409.0, |
| "step": 482 |
| }, |
| { |
| "epoch": 2.0822065981611684, |
| "grad_norm": 0.4599940187012167, |
| "learning_rate": 1.3852762487096168e-05, |
| "loss": 0.9626, |
| "num_tokens": 3740297617.0, |
| "step": 483 |
| }, |
| { |
| "epoch": 2.086533261222282, |
| "grad_norm": 0.7390777191418446, |
| "learning_rate": 1.3828914639586138e-05, |
| "loss": 0.988, |
| "num_tokens": 3748584821.0, |
| "step": 484 |
| }, |
| { |
| "epoch": 2.0908599242833965, |
| "grad_norm": 1.288123858274811, |
| "learning_rate": 1.3805044731631475e-05, |
| "loss": 0.9624, |
| "num_tokens": 3756811621.0, |
| "step": 485 |
| }, |
| { |
| "epoch": 2.0951865873445104, |
| "grad_norm": 0.6711188741890706, |
| "learning_rate": 1.3781152949374527e-05, |
| "loss": 0.9752, |
| "num_tokens": 3764852654.0, |
| "step": 486 |
| }, |
| { |
| "epoch": 2.0995132504056246, |
| "grad_norm": 0.4902348109671292, |
| "learning_rate": 1.3757239479128204e-05, |
| "loss": 0.9257, |
| "num_tokens": 3773125548.0, |
| "step": 487 |
| }, |
| { |
| "epoch": 2.103839913466739, |
| "grad_norm": 0.5653073702582646, |
| "learning_rate": 1.373330450737455e-05, |
| "loss": 0.9357, |
| "num_tokens": 3781308940.0, |
| "step": 488 |
| }, |
| { |
| "epoch": 2.1081665765278528, |
| "grad_norm": 0.4760945896815548, |
| "learning_rate": 1.3709348220763287e-05, |
| "loss": 0.956, |
| "num_tokens": 3789493303.0, |
| "step": 489 |
| }, |
| { |
| "epoch": 2.112493239588967, |
| "grad_norm": 2.455426675675134, |
| "learning_rate": 1.3685370806110343e-05, |
| "loss": 0.9677, |
| "num_tokens": 3797741283.0, |
| "step": 490 |
| }, |
| { |
| "epoch": 2.1168199026500814, |
| "grad_norm": 2.0574524671779053, |
| "learning_rate": 1.3661372450396422e-05, |
| "loss": 0.9932, |
| "num_tokens": 3805823170.0, |
| "step": 491 |
| }, |
| { |
| "epoch": 2.121146565711195, |
| "grad_norm": 0.6481290112552625, |
| "learning_rate": 1.3637353340765518e-05, |
| "loss": 0.9522, |
| "num_tokens": 3813836268.0, |
| "step": 492 |
| }, |
| { |
| "epoch": 2.1254732287723095, |
| "grad_norm": 0.6162414056852944, |
| "learning_rate": 1.3613313664523476e-05, |
| "loss": 0.9827, |
| "num_tokens": 3822125722.0, |
| "step": 493 |
| }, |
| { |
| "epoch": 2.1297998918334233, |
| "grad_norm": 0.590791719137404, |
| "learning_rate": 1.3589253609136517e-05, |
| "loss": 0.9612, |
| "num_tokens": 3830333623.0, |
| "step": 494 |
| }, |
| { |
| "epoch": 2.1341265548945376, |
| "grad_norm": 0.5684707116662864, |
| "learning_rate": 1.3565173362229787e-05, |
| "loss": 0.984, |
| "num_tokens": 3838471268.0, |
| "step": 495 |
| }, |
| { |
| "epoch": 2.138453217955652, |
| "grad_norm": 0.6053160814163213, |
| "learning_rate": 1.354107311158589e-05, |
| "loss": 0.9667, |
| "num_tokens": 3846752488.0, |
| "step": 496 |
| }, |
| { |
| "epoch": 2.1427798810167658, |
| "grad_norm": 0.47119322042756134, |
| "learning_rate": 1.3516953045143421e-05, |
| "loss": 1.0044, |
| "num_tokens": 3855015535.0, |
| "step": 497 |
| }, |
| { |
| "epoch": 2.14710654407788, |
| "grad_norm": 0.6890877937333092, |
| "learning_rate": 1.3492813350995501e-05, |
| "loss": 0.9558, |
| "num_tokens": 3863332871.0, |
| "step": 498 |
| }, |
| { |
| "epoch": 2.151433207138994, |
| "grad_norm": 0.4720532087871396, |
| "learning_rate": 1.3468654217388322e-05, |
| "loss": 0.9438, |
| "num_tokens": 3871417760.0, |
| "step": 499 |
| }, |
| { |
| "epoch": 2.155759870200108, |
| "grad_norm": 0.6530842825033443, |
| "learning_rate": 1.344447583271965e-05, |
| "loss": 0.9335, |
| "num_tokens": 3879494222.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 2.1600865332612225, |
| "grad_norm": 0.48813659380264507, |
| "learning_rate": 1.342027838553739e-05, |
| "loss": 0.9807, |
| "num_tokens": 3887572575.0, |
| "step": 501 |
| }, |
| { |
| "epoch": 2.1644131963223363, |
| "grad_norm": 0.5432512528502169, |
| "learning_rate": 1.3396062064538103e-05, |
| "loss": 0.9508, |
| "num_tokens": 3895875651.0, |
| "step": 502 |
| }, |
| { |
| "epoch": 2.1687398593834506, |
| "grad_norm": 0.5246641442927332, |
| "learning_rate": 1.3371827058565517e-05, |
| "loss": 0.9335, |
| "num_tokens": 3903948482.0, |
| "step": 503 |
| }, |
| { |
| "epoch": 2.1730665224445644, |
| "grad_norm": 0.37219239494720535, |
| "learning_rate": 1.3347573556609075e-05, |
| "loss": 0.9336, |
| "num_tokens": 3912132117.0, |
| "step": 504 |
| }, |
| { |
| "epoch": 2.1773931855056787, |
| "grad_norm": 0.5512319309802416, |
| "learning_rate": 1.332330174780246e-05, |
| "loss": 0.9435, |
| "num_tokens": 3920270330.0, |
| "step": 505 |
| }, |
| { |
| "epoch": 2.181719848566793, |
| "grad_norm": 0.4318054234173368, |
| "learning_rate": 1.3299011821422116e-05, |
| "loss": 0.9372, |
| "num_tokens": 3928514380.0, |
| "step": 506 |
| }, |
| { |
| "epoch": 2.186046511627907, |
| "grad_norm": 0.48811090489725933, |
| "learning_rate": 1.3274703966885765e-05, |
| "loss": 0.9624, |
| "num_tokens": 3936610571.0, |
| "step": 507 |
| }, |
| { |
| "epoch": 2.190373174689021, |
| "grad_norm": 0.48047513202626985, |
| "learning_rate": 1.3250378373750941e-05, |
| "loss": 0.9579, |
| "num_tokens": 3944799565.0, |
| "step": 508 |
| }, |
| { |
| "epoch": 2.194699837750135, |
| "grad_norm": 0.4381062126465775, |
| "learning_rate": 1.3226035231713504e-05, |
| "loss": 0.9609, |
| "num_tokens": 3952965563.0, |
| "step": 509 |
| }, |
| { |
| "epoch": 2.1990265008112493, |
| "grad_norm": 0.3842482894518455, |
| "learning_rate": 1.3201674730606166e-05, |
| "loss": 0.9454, |
| "num_tokens": 3960955314.0, |
| "step": 510 |
| }, |
| { |
| "epoch": 2.2033531638723636, |
| "grad_norm": 0.4028195527619942, |
| "learning_rate": 1.317729706039701e-05, |
| "loss": 0.9569, |
| "num_tokens": 3969007594.0, |
| "step": 511 |
| }, |
| { |
| "epoch": 2.2076798269334774, |
| "grad_norm": 0.5267165379312942, |
| "learning_rate": 1.3152902411188007e-05, |
| "loss": 0.9383, |
| "num_tokens": 3977305210.0, |
| "step": 512 |
| }, |
| { |
| "epoch": 2.2120064899945917, |
| "grad_norm": 0.408161580284809, |
| "learning_rate": 1.3128490973213523e-05, |
| "loss": 0.9651, |
| "num_tokens": 3985475847.0, |
| "step": 513 |
| }, |
| { |
| "epoch": 2.2163331530557056, |
| "grad_norm": 0.5824772012067178, |
| "learning_rate": 1.3104062936838863e-05, |
| "loss": 0.9904, |
| "num_tokens": 3993521666.0, |
| "step": 514 |
| }, |
| { |
| "epoch": 2.22065981611682, |
| "grad_norm": 0.4592171198699058, |
| "learning_rate": 1.3079618492558763e-05, |
| "loss": 0.9509, |
| "num_tokens": 4001881055.0, |
| "step": 515 |
| }, |
| { |
| "epoch": 2.224986479177934, |
| "grad_norm": 0.4288405177420417, |
| "learning_rate": 1.3055157830995904e-05, |
| "loss": 0.9267, |
| "num_tokens": 4010098333.0, |
| "step": 516 |
| }, |
| { |
| "epoch": 2.229313142239048, |
| "grad_norm": 0.62245881444073, |
| "learning_rate": 1.3030681142899437e-05, |
| "loss": 0.924, |
| "num_tokens": 4018409777.0, |
| "step": 517 |
| }, |
| { |
| "epoch": 2.2336398053001623, |
| "grad_norm": 0.40777593363684145, |
| "learning_rate": 1.3006188619143505e-05, |
| "loss": 0.9726, |
| "num_tokens": 4026647609.0, |
| "step": 518 |
| }, |
| { |
| "epoch": 2.2379664683612766, |
| "grad_norm": 0.522692181425156, |
| "learning_rate": 1.2981680450725715e-05, |
| "loss": 0.9621, |
| "num_tokens": 4034775036.0, |
| "step": 519 |
| }, |
| { |
| "epoch": 2.2422931314223904, |
| "grad_norm": 0.47960290009387085, |
| "learning_rate": 1.2957156828765694e-05, |
| "loss": 0.9189, |
| "num_tokens": 4043062480.0, |
| "step": 520 |
| }, |
| { |
| "epoch": 2.2466197944835047, |
| "grad_norm": 0.4678126732331299, |
| "learning_rate": 1.2932617944503572e-05, |
| "loss": 0.9546, |
| "num_tokens": 4051272045.0, |
| "step": 521 |
| }, |
| { |
| "epoch": 2.2509464575446185, |
| "grad_norm": 0.398362030110093, |
| "learning_rate": 1.2908063989298493e-05, |
| "loss": 0.9287, |
| "num_tokens": 4059530591.0, |
| "step": 522 |
| }, |
| { |
| "epoch": 2.255273120605733, |
| "grad_norm": 0.45585060066098454, |
| "learning_rate": 1.2883495154627138e-05, |
| "loss": 0.9418, |
| "num_tokens": 4067674515.0, |
| "step": 523 |
| }, |
| { |
| "epoch": 2.259599783666847, |
| "grad_norm": 0.4335281616940614, |
| "learning_rate": 1.2858911632082211e-05, |
| "loss": 0.9743, |
| "num_tokens": 4075850527.0, |
| "step": 524 |
| }, |
| { |
| "epoch": 2.263926446727961, |
| "grad_norm": 0.45201891721310733, |
| "learning_rate": 1.2834313613370966e-05, |
| "loss": 0.9391, |
| "num_tokens": 4083981564.0, |
| "step": 525 |
| }, |
| { |
| "epoch": 2.2682531097890752, |
| "grad_norm": 0.44150737521062516, |
| "learning_rate": 1.2809701290313683e-05, |
| "loss": 0.9467, |
| "num_tokens": 4092119819.0, |
| "step": 526 |
| }, |
| { |
| "epoch": 2.2725797728501895, |
| "grad_norm": 0.45433913386085706, |
| "learning_rate": 1.278507485484221e-05, |
| "loss": 0.9393, |
| "num_tokens": 4100346271.0, |
| "step": 527 |
| }, |
| { |
| "epoch": 2.2769064359113034, |
| "grad_norm": 0.5507166636281073, |
| "learning_rate": 1.2760434498998434e-05, |
| "loss": 0.9391, |
| "num_tokens": 4108661518.0, |
| "step": 528 |
| }, |
| { |
| "epoch": 2.2812330989724177, |
| "grad_norm": 0.37701447868392113, |
| "learning_rate": 1.27357804149328e-05, |
| "loss": 0.9524, |
| "num_tokens": 4116748380.0, |
| "step": 529 |
| }, |
| { |
| "epoch": 2.2855597620335315, |
| "grad_norm": 0.5746317779731845, |
| "learning_rate": 1.2711112794902813e-05, |
| "loss": 0.9421, |
| "num_tokens": 4124999062.0, |
| "step": 530 |
| }, |
| { |
| "epoch": 2.289886425094646, |
| "grad_norm": 0.4463101017830629, |
| "learning_rate": 1.2686431831271523e-05, |
| "loss": 0.9625, |
| "num_tokens": 4133186868.0, |
| "step": 531 |
| }, |
| { |
| "epoch": 2.29421308815576, |
| "grad_norm": 0.5370113028092661, |
| "learning_rate": 1.2661737716506043e-05, |
| "loss": 0.9885, |
| "num_tokens": 4141449821.0, |
| "step": 532 |
| }, |
| { |
| "epoch": 2.298539751216874, |
| "grad_norm": 0.4423097943357427, |
| "learning_rate": 1.2637030643176042e-05, |
| "loss": 0.9542, |
| "num_tokens": 4149652432.0, |
| "step": 533 |
| }, |
| { |
| "epoch": 2.302866414277988, |
| "grad_norm": 0.5511789232845956, |
| "learning_rate": 1.2612310803952244e-05, |
| "loss": 0.9465, |
| "num_tokens": 4157808329.0, |
| "step": 534 |
| }, |
| { |
| "epoch": 2.307193077339102, |
| "grad_norm": 0.3876488127130369, |
| "learning_rate": 1.2587578391604913e-05, |
| "loss": 0.9499, |
| "num_tokens": 4165809160.0, |
| "step": 535 |
| }, |
| { |
| "epoch": 2.3115197404002163, |
| "grad_norm": 0.6177035631999804, |
| "learning_rate": 1.2562833599002376e-05, |
| "loss": 0.9677, |
| "num_tokens": 4174000454.0, |
| "step": 536 |
| }, |
| { |
| "epoch": 2.3158464034613306, |
| "grad_norm": 0.42991099207796024, |
| "learning_rate": 1.2538076619109492e-05, |
| "loss": 0.9427, |
| "num_tokens": 4182078542.0, |
| "step": 537 |
| }, |
| { |
| "epoch": 2.3201730665224445, |
| "grad_norm": 0.641966921774766, |
| "learning_rate": 1.2513307644986165e-05, |
| "loss": 0.9263, |
| "num_tokens": 4190219796.0, |
| "step": 538 |
| }, |
| { |
| "epoch": 2.3244997295835588, |
| "grad_norm": 0.5474359287041366, |
| "learning_rate": 1.2488526869785831e-05, |
| "loss": 0.9759, |
| "num_tokens": 4198442007.0, |
| "step": 539 |
| }, |
| { |
| "epoch": 2.3288263926446726, |
| "grad_norm": 0.6031392642669225, |
| "learning_rate": 1.2463734486753953e-05, |
| "loss": 0.9768, |
| "num_tokens": 4206693320.0, |
| "step": 540 |
| }, |
| { |
| "epoch": 2.333153055705787, |
| "grad_norm": 0.6408527604910021, |
| "learning_rate": 1.2438930689226516e-05, |
| "loss": 0.9794, |
| "num_tokens": 4214990068.0, |
| "step": 541 |
| }, |
| { |
| "epoch": 2.337479718766901, |
| "grad_norm": 0.5038473523210545, |
| "learning_rate": 1.241411567062851e-05, |
| "loss": 0.971, |
| "num_tokens": 4223038722.0, |
| "step": 542 |
| }, |
| { |
| "epoch": 2.341806381828015, |
| "grad_norm": 0.6388179575100953, |
| "learning_rate": 1.2389289624472443e-05, |
| "loss": 0.9506, |
| "num_tokens": 4231285793.0, |
| "step": 543 |
| }, |
| { |
| "epoch": 2.3461330448891293, |
| "grad_norm": 0.4013689490010426, |
| "learning_rate": 1.2364452744356803e-05, |
| "loss": 0.9847, |
| "num_tokens": 4239600068.0, |
| "step": 544 |
| }, |
| { |
| "epoch": 2.350459707950243, |
| "grad_norm": 0.770106065718662, |
| "learning_rate": 1.2339605223964571e-05, |
| "loss": 0.949, |
| "num_tokens": 4247861801.0, |
| "step": 545 |
| }, |
| { |
| "epoch": 2.3547863710113575, |
| "grad_norm": 0.577361696338379, |
| "learning_rate": 1.2314747257061705e-05, |
| "loss": 0.9571, |
| "num_tokens": 4256038814.0, |
| "step": 546 |
| }, |
| { |
| "epoch": 2.3591130340724717, |
| "grad_norm": 0.6764182668606968, |
| "learning_rate": 1.2289879037495626e-05, |
| "loss": 0.9526, |
| "num_tokens": 4264362015.0, |
| "step": 547 |
| }, |
| { |
| "epoch": 2.3634396971335856, |
| "grad_norm": 0.6826859929166379, |
| "learning_rate": 1.22650007591937e-05, |
| "loss": 0.9867, |
| "num_tokens": 4272552088.0, |
| "step": 548 |
| }, |
| { |
| "epoch": 2.3677663601947, |
| "grad_norm": 1.1712639273168959, |
| "learning_rate": 1.2240112616161743e-05, |
| "loss": 0.9291, |
| "num_tokens": 4280783431.0, |
| "step": 549 |
| }, |
| { |
| "epoch": 2.3720930232558137, |
| "grad_norm": 0.9029578260961084, |
| "learning_rate": 1.2215214802482493e-05, |
| "loss": 0.9471, |
| "num_tokens": 4288988956.0, |
| "step": 550 |
| }, |
| { |
| "epoch": 2.376419686316928, |
| "grad_norm": 0.5373017530946762, |
| "learning_rate": 1.2190307512314104e-05, |
| "loss": 0.948, |
| "num_tokens": 4297187620.0, |
| "step": 551 |
| }, |
| { |
| "epoch": 2.3807463493780423, |
| "grad_norm": 0.7957685972548623, |
| "learning_rate": 1.2165390939888622e-05, |
| "loss": 0.9799, |
| "num_tokens": 4305382112.0, |
| "step": 552 |
| }, |
| { |
| "epoch": 2.385073012439156, |
| "grad_norm": 0.6252518036830393, |
| "learning_rate": 1.2140465279510494e-05, |
| "loss": 1.0004, |
| "num_tokens": 4313476472.0, |
| "step": 553 |
| }, |
| { |
| "epoch": 2.3893996755002704, |
| "grad_norm": 0.6705980809309716, |
| "learning_rate": 1.2115530725555016e-05, |
| "loss": 0.9856, |
| "num_tokens": 4321660199.0, |
| "step": 554 |
| }, |
| { |
| "epoch": 2.3937263385613847, |
| "grad_norm": 0.5375506977246428, |
| "learning_rate": 1.2090587472466857e-05, |
| "loss": 0.9443, |
| "num_tokens": 4329933962.0, |
| "step": 555 |
| }, |
| { |
| "epoch": 2.3980530016224986, |
| "grad_norm": 0.6972561334867685, |
| "learning_rate": 1.2065635714758513e-05, |
| "loss": 0.9612, |
| "num_tokens": 4338257005.0, |
| "step": 556 |
| }, |
| { |
| "epoch": 2.402379664683613, |
| "grad_norm": 0.5367223757844836, |
| "learning_rate": 1.2040675647008796e-05, |
| "loss": 0.9479, |
| "num_tokens": 4346287588.0, |
| "step": 557 |
| }, |
| { |
| "epoch": 2.4067063277447267, |
| "grad_norm": 0.6651025311923312, |
| "learning_rate": 1.2015707463861334e-05, |
| "loss": 0.9591, |
| "num_tokens": 4354313732.0, |
| "step": 558 |
| }, |
| { |
| "epoch": 2.411032990805841, |
| "grad_norm": 0.5230642046498125, |
| "learning_rate": 1.199073136002304e-05, |
| "loss": 0.9895, |
| "num_tokens": 4362453217.0, |
| "step": 559 |
| }, |
| { |
| "epoch": 2.4153596538669553, |
| "grad_norm": 0.6436791423085466, |
| "learning_rate": 1.1965747530262581e-05, |
| "loss": 0.9888, |
| "num_tokens": 4370690238.0, |
| "step": 560 |
| }, |
| { |
| "epoch": 2.419686316928069, |
| "grad_norm": 0.5418843658820947, |
| "learning_rate": 1.1940756169408882e-05, |
| "loss": 0.9744, |
| "num_tokens": 4378964382.0, |
| "step": 561 |
| }, |
| { |
| "epoch": 2.4240129799891834, |
| "grad_norm": 0.6124584539065394, |
| "learning_rate": 1.1915757472349598e-05, |
| "loss": 0.9637, |
| "num_tokens": 4386973954.0, |
| "step": 562 |
| }, |
| { |
| "epoch": 2.4283396430502977, |
| "grad_norm": 0.5060160404773808, |
| "learning_rate": 1.1890751634029586e-05, |
| "loss": 0.9693, |
| "num_tokens": 4395090749.0, |
| "step": 563 |
| }, |
| { |
| "epoch": 2.4326663061114115, |
| "grad_norm": 0.6267585754010649, |
| "learning_rate": 1.18657388494494e-05, |
| "loss": 0.9309, |
| "num_tokens": 4403333165.0, |
| "step": 564 |
| }, |
| { |
| "epoch": 2.436992969172526, |
| "grad_norm": 0.554981345863305, |
| "learning_rate": 1.1840719313663758e-05, |
| "loss": 0.9704, |
| "num_tokens": 4411451209.0, |
| "step": 565 |
| }, |
| { |
| "epoch": 2.4413196322336397, |
| "grad_norm": 0.6155646667621201, |
| "learning_rate": 1.1815693221780024e-05, |
| "loss": 0.9698, |
| "num_tokens": 4419790504.0, |
| "step": 566 |
| }, |
| { |
| "epoch": 2.445646295294754, |
| "grad_norm": 0.5336569352551696, |
| "learning_rate": 1.1790660768956692e-05, |
| "loss": 0.9662, |
| "num_tokens": 4427867123.0, |
| "step": 567 |
| }, |
| { |
| "epoch": 2.4499729583558683, |
| "grad_norm": 0.6203324832845126, |
| "learning_rate": 1.1765622150401855e-05, |
| "loss": 0.9797, |
| "num_tokens": 4436107636.0, |
| "step": 568 |
| }, |
| { |
| "epoch": 2.454299621416982, |
| "grad_norm": 0.5526647902620689, |
| "learning_rate": 1.1740577561371692e-05, |
| "loss": 0.931, |
| "num_tokens": 4444241142.0, |
| "step": 569 |
| }, |
| { |
| "epoch": 2.4586262844780964, |
| "grad_norm": 0.6147177036547514, |
| "learning_rate": 1.1715527197168938e-05, |
| "loss": 0.9559, |
| "num_tokens": 4452511750.0, |
| "step": 570 |
| }, |
| { |
| "epoch": 2.4629529475392102, |
| "grad_norm": 0.5300983630062421, |
| "learning_rate": 1.1690471253141368e-05, |
| "loss": 0.9497, |
| "num_tokens": 4460721683.0, |
| "step": 571 |
| }, |
| { |
| "epoch": 2.4672796106003245, |
| "grad_norm": 0.6023194661956321, |
| "learning_rate": 1.1665409924680266e-05, |
| "loss": 0.9793, |
| "num_tokens": 4468873356.0, |
| "step": 572 |
| }, |
| { |
| "epoch": 2.471606273661439, |
| "grad_norm": 0.5295856874230167, |
| "learning_rate": 1.1640343407218904e-05, |
| "loss": 0.9503, |
| "num_tokens": 4476919366.0, |
| "step": 573 |
| }, |
| { |
| "epoch": 2.4759329367225527, |
| "grad_norm": 0.5665385218247417, |
| "learning_rate": 1.1615271896231019e-05, |
| "loss": 0.9662, |
| "num_tokens": 4485019312.0, |
| "step": 574 |
| }, |
| { |
| "epoch": 2.480259599783667, |
| "grad_norm": 0.49532958355297396, |
| "learning_rate": 1.1590195587229297e-05, |
| "loss": 0.9853, |
| "num_tokens": 4493371908.0, |
| "step": 575 |
| }, |
| { |
| "epoch": 2.484586262844781, |
| "grad_norm": 0.5916742957768434, |
| "learning_rate": 1.1565114675763823e-05, |
| "loss": 0.9583, |
| "num_tokens": 4501379320.0, |
| "step": 576 |
| }, |
| { |
| "epoch": 2.488912925905895, |
| "grad_norm": 0.5199993454816884, |
| "learning_rate": 1.1540029357420588e-05, |
| "loss": 0.9358, |
| "num_tokens": 4509554917.0, |
| "step": 577 |
| }, |
| { |
| "epoch": 2.4932395889670094, |
| "grad_norm": 0.6131431462830838, |
| "learning_rate": 1.1514939827819945e-05, |
| "loss": 0.9703, |
| "num_tokens": 4517700650.0, |
| "step": 578 |
| }, |
| { |
| "epoch": 2.497566252028123, |
| "grad_norm": 0.5276906368093617, |
| "learning_rate": 1.1489846282615083e-05, |
| "loss": 0.945, |
| "num_tokens": 4525942676.0, |
| "step": 579 |
| }, |
| { |
| "epoch": 2.5018929150892375, |
| "grad_norm": 0.6310140050988171, |
| "learning_rate": 1.1464748917490512e-05, |
| "loss": 0.9712, |
| "num_tokens": 4534150166.0, |
| "step": 580 |
| }, |
| { |
| "epoch": 2.5062195781503513, |
| "grad_norm": 0.5659729405686769, |
| "learning_rate": 1.1439647928160523e-05, |
| "loss": 0.9645, |
| "num_tokens": 4542343152.0, |
| "step": 581 |
| }, |
| { |
| "epoch": 2.5105462412114656, |
| "grad_norm": 0.5200842069223084, |
| "learning_rate": 1.1414543510367673e-05, |
| "loss": 0.9576, |
| "num_tokens": 4550593514.0, |
| "step": 582 |
| }, |
| { |
| "epoch": 2.51487290427258, |
| "grad_norm": 0.48643885067096815, |
| "learning_rate": 1.1389435859881255e-05, |
| "loss": 0.9514, |
| "num_tokens": 4558736992.0, |
| "step": 583 |
| }, |
| { |
| "epoch": 2.5191995673336938, |
| "grad_norm": 0.5621639097985508, |
| "learning_rate": 1.1364325172495773e-05, |
| "loss": 0.9563, |
| "num_tokens": 4566863792.0, |
| "step": 584 |
| }, |
| { |
| "epoch": 2.523526230394808, |
| "grad_norm": 0.46925979555832403, |
| "learning_rate": 1.1339211644029412e-05, |
| "loss": 0.919, |
| "num_tokens": 4575114098.0, |
| "step": 585 |
| }, |
| { |
| "epoch": 2.527852893455922, |
| "grad_norm": 0.6021494127734743, |
| "learning_rate": 1.1314095470322512e-05, |
| "loss": 0.9139, |
| "num_tokens": 4583292940.0, |
| "step": 586 |
| }, |
| { |
| "epoch": 2.532179556517036, |
| "grad_norm": 0.5826582566216992, |
| "learning_rate": 1.1288976847236034e-05, |
| "loss": 0.9488, |
| "num_tokens": 4591542916.0, |
| "step": 587 |
| }, |
| { |
| "epoch": 2.5365062195781505, |
| "grad_norm": 0.5045409853325099, |
| "learning_rate": 1.1263855970650058e-05, |
| "loss": 0.9779, |
| "num_tokens": 4599599139.0, |
| "step": 588 |
| }, |
| { |
| "epoch": 2.5408328826392643, |
| "grad_norm": 0.5657692884716502, |
| "learning_rate": 1.1238733036462215e-05, |
| "loss": 0.9818, |
| "num_tokens": 4607817199.0, |
| "step": 589 |
| }, |
| { |
| "epoch": 2.5451595457003786, |
| "grad_norm": 0.46862372757938014, |
| "learning_rate": 1.1213608240586201e-05, |
| "loss": 0.9498, |
| "num_tokens": 4616026307.0, |
| "step": 590 |
| }, |
| { |
| "epoch": 2.5494862087614925, |
| "grad_norm": 0.4860095951265856, |
| "learning_rate": 1.1188481778950214e-05, |
| "loss": 0.9385, |
| "num_tokens": 4624047379.0, |
| "step": 591 |
| }, |
| { |
| "epoch": 2.5538128718226067, |
| "grad_norm": 0.45733906332352875, |
| "learning_rate": 1.1163353847495455e-05, |
| "loss": 0.9218, |
| "num_tokens": 4632297344.0, |
| "step": 592 |
| }, |
| { |
| "epoch": 2.558139534883721, |
| "grad_norm": 0.4447225224407655, |
| "learning_rate": 1.1138224642174578e-05, |
| "loss": 0.9314, |
| "num_tokens": 4640502412.0, |
| "step": 593 |
| }, |
| { |
| "epoch": 2.5624661979448353, |
| "grad_norm": 0.4769586032532646, |
| "learning_rate": 1.1113094358950177e-05, |
| "loss": 0.962, |
| "num_tokens": 4648602406.0, |
| "step": 594 |
| }, |
| { |
| "epoch": 2.566792861005949, |
| "grad_norm": 0.42700328299134344, |
| "learning_rate": 1.1087963193793242e-05, |
| "loss": 0.9326, |
| "num_tokens": 4656901673.0, |
| "step": 595 |
| }, |
| { |
| "epoch": 2.571119524067063, |
| "grad_norm": 0.4504153309965811, |
| "learning_rate": 1.1062831342681655e-05, |
| "loss": 0.929, |
| "num_tokens": 4665058500.0, |
| "step": 596 |
| }, |
| { |
| "epoch": 2.5754461871281773, |
| "grad_norm": 0.4964922520196292, |
| "learning_rate": 1.1037699001598636e-05, |
| "loss": 0.935, |
| "num_tokens": 4673282001.0, |
| "step": 597 |
| }, |
| { |
| "epoch": 2.5797728501892916, |
| "grad_norm": 0.40215258351767974, |
| "learning_rate": 1.1012566366531232e-05, |
| "loss": 0.9728, |
| "num_tokens": 4681534289.0, |
| "step": 598 |
| }, |
| { |
| "epoch": 2.584099513250406, |
| "grad_norm": 0.4191004702486287, |
| "learning_rate": 1.0987433633468771e-05, |
| "loss": 0.9745, |
| "num_tokens": 4689651407.0, |
| "step": 599 |
| }, |
| { |
| "epoch": 2.5884261763115197, |
| "grad_norm": 0.37353628363917635, |
| "learning_rate": 1.0962300998401367e-05, |
| "loss": 0.9079, |
| "num_tokens": 4697947274.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 2.592752839372634, |
| "grad_norm": 0.3812708767310222, |
| "learning_rate": 1.0937168657318347e-05, |
| "loss": 0.9597, |
| "num_tokens": 4705880972.0, |
| "step": 601 |
| }, |
| { |
| "epoch": 2.597079502433748, |
| "grad_norm": 0.43583790957520196, |
| "learning_rate": 1.091203680620676e-05, |
| "loss": 0.9573, |
| "num_tokens": 4714073064.0, |
| "step": 602 |
| }, |
| { |
| "epoch": 2.601406165494862, |
| "grad_norm": 0.35271036617589024, |
| "learning_rate": 1.0886905641049828e-05, |
| "loss": 0.9197, |
| "num_tokens": 4722198446.0, |
| "step": 603 |
| }, |
| { |
| "epoch": 2.6057328285559764, |
| "grad_norm": 0.3781948396692369, |
| "learning_rate": 1.0861775357825424e-05, |
| "loss": 0.9524, |
| "num_tokens": 4730250511.0, |
| "step": 604 |
| }, |
| { |
| "epoch": 2.6100594916170903, |
| "grad_norm": 0.4533417243971015, |
| "learning_rate": 1.0836646152504548e-05, |
| "loss": 0.9757, |
| "num_tokens": 4738501206.0, |
| "step": 605 |
| }, |
| { |
| "epoch": 2.6143861546782046, |
| "grad_norm": 0.45748276165170915, |
| "learning_rate": 1.0811518221049787e-05, |
| "loss": 0.9298, |
| "num_tokens": 4746530561.0, |
| "step": 606 |
| }, |
| { |
| "epoch": 2.6187128177393184, |
| "grad_norm": 0.41575087815553385, |
| "learning_rate": 1.0786391759413805e-05, |
| "loss": 0.9356, |
| "num_tokens": 4754522348.0, |
| "step": 607 |
| }, |
| { |
| "epoch": 2.6230394808004327, |
| "grad_norm": 0.3970192273773917, |
| "learning_rate": 1.0761266963537786e-05, |
| "loss": 0.9725, |
| "num_tokens": 4762779442.0, |
| "step": 608 |
| }, |
| { |
| "epoch": 2.627366143861547, |
| "grad_norm": 0.37169659490918194, |
| "learning_rate": 1.0736144029349947e-05, |
| "loss": 0.9404, |
| "num_tokens": 4771069539.0, |
| "step": 609 |
| }, |
| { |
| "epoch": 2.631692806922661, |
| "grad_norm": 0.37442651321213727, |
| "learning_rate": 1.0711023152763967e-05, |
| "loss": 0.931, |
| "num_tokens": 4779346220.0, |
| "step": 610 |
| }, |
| { |
| "epoch": 2.636019469983775, |
| "grad_norm": 0.44735405291732344, |
| "learning_rate": 1.0685904529677496e-05, |
| "loss": 0.9761, |
| "num_tokens": 4787563265.0, |
| "step": 611 |
| }, |
| { |
| "epoch": 2.640346133044889, |
| "grad_norm": 0.35959081797177606, |
| "learning_rate": 1.066078835597059e-05, |
| "loss": 0.9422, |
| "num_tokens": 4795789059.0, |
| "step": 612 |
| }, |
| { |
| "epoch": 2.6446727961060033, |
| "grad_norm": 0.34980726494078923, |
| "learning_rate": 1.063567482750423e-05, |
| "loss": 0.977, |
| "num_tokens": 4804010832.0, |
| "step": 613 |
| }, |
| { |
| "epoch": 2.6489994591671175, |
| "grad_norm": 0.3907706043819872, |
| "learning_rate": 1.061056414011875e-05, |
| "loss": 0.933, |
| "num_tokens": 4812294976.0, |
| "step": 614 |
| }, |
| { |
| "epoch": 2.6533261222282314, |
| "grad_norm": 0.4119640894365054, |
| "learning_rate": 1.0585456489632328e-05, |
| "loss": 0.9311, |
| "num_tokens": 4820531707.0, |
| "step": 615 |
| }, |
| { |
| "epoch": 2.6576527852893457, |
| "grad_norm": 0.354677452859528, |
| "learning_rate": 1.0560352071839482e-05, |
| "loss": 0.9395, |
| "num_tokens": 4828837649.0, |
| "step": 616 |
| }, |
| { |
| "epoch": 2.6619794483504595, |
| "grad_norm": 0.3239073861757467, |
| "learning_rate": 1.0535251082509493e-05, |
| "loss": 0.9754, |
| "num_tokens": 4836950312.0, |
| "step": 617 |
| }, |
| { |
| "epoch": 2.666306111411574, |
| "grad_norm": 0.4230298637290067, |
| "learning_rate": 1.0510153717384922e-05, |
| "loss": 0.9206, |
| "num_tokens": 4845196377.0, |
| "step": 618 |
| }, |
| { |
| "epoch": 2.670632774472688, |
| "grad_norm": 0.4854466095116207, |
| "learning_rate": 1.0485060172180058e-05, |
| "loss": 0.9677, |
| "num_tokens": 4853465214.0, |
| "step": 619 |
| }, |
| { |
| "epoch": 2.674959437533802, |
| "grad_norm": 0.3941871664505198, |
| "learning_rate": 1.0459970642579419e-05, |
| "loss": 0.9754, |
| "num_tokens": 4861677592.0, |
| "step": 620 |
| }, |
| { |
| "epoch": 2.6792861005949162, |
| "grad_norm": 0.33437251990630384, |
| "learning_rate": 1.0434885324236182e-05, |
| "loss": 0.932, |
| "num_tokens": 4869954573.0, |
| "step": 621 |
| }, |
| { |
| "epoch": 2.68361276365603, |
| "grad_norm": 0.4097442193963148, |
| "learning_rate": 1.040980441277071e-05, |
| "loss": 0.9558, |
| "num_tokens": 4878037940.0, |
| "step": 622 |
| }, |
| { |
| "epoch": 2.6879394267171444, |
| "grad_norm": 0.3468569242173382, |
| "learning_rate": 1.0384728103768984e-05, |
| "loss": 0.9744, |
| "num_tokens": 4886162108.0, |
| "step": 623 |
| }, |
| { |
| "epoch": 2.6922660897782587, |
| "grad_norm": 0.35424420890206254, |
| "learning_rate": 1.0359656592781102e-05, |
| "loss": 0.9671, |
| "num_tokens": 4893976133.0, |
| "step": 624 |
| }, |
| { |
| "epoch": 2.6965927528393725, |
| "grad_norm": 0.3616987336239977, |
| "learning_rate": 1.0334590075319737e-05, |
| "loss": 0.9529, |
| "num_tokens": 4902247512.0, |
| "step": 625 |
| }, |
| { |
| "epoch": 2.700919415900487, |
| "grad_norm": 1.5398266966086605, |
| "learning_rate": 1.0309528746858633e-05, |
| "loss": 0.9405, |
| "num_tokens": 4910413738.0, |
| "step": 626 |
| }, |
| { |
| "epoch": 2.7052460789616006, |
| "grad_norm": 0.63781417279165, |
| "learning_rate": 1.0284472802831064e-05, |
| "loss": 0.994, |
| "num_tokens": 4918714880.0, |
| "step": 627 |
| }, |
| { |
| "epoch": 2.709572742022715, |
| "grad_norm": 0.3631737849921556, |
| "learning_rate": 1.0259422438628311e-05, |
| "loss": 0.9463, |
| "num_tokens": 4926916759.0, |
| "step": 628 |
| }, |
| { |
| "epoch": 2.713899405083829, |
| "grad_norm": 0.40687223170267717, |
| "learning_rate": 1.023437784959815e-05, |
| "loss": 0.8911, |
| "num_tokens": 4935201255.0, |
| "step": 629 |
| }, |
| { |
| "epoch": 2.718226068144943, |
| "grad_norm": 0.48334368577273396, |
| "learning_rate": 1.0209339231043314e-05, |
| "loss": 0.979, |
| "num_tokens": 4943203704.0, |
| "step": 630 |
| }, |
| { |
| "epoch": 2.7225527312060573, |
| "grad_norm": 0.38429152843144504, |
| "learning_rate": 1.0184306778219982e-05, |
| "loss": 0.9614, |
| "num_tokens": 4951361362.0, |
| "step": 631 |
| }, |
| { |
| "epoch": 2.726879394267171, |
| "grad_norm": 0.3375014960801925, |
| "learning_rate": 1.0159280686336247e-05, |
| "loss": 0.9464, |
| "num_tokens": 4959475675.0, |
| "step": 632 |
| }, |
| { |
| "epoch": 2.7312060573282855, |
| "grad_norm": 0.5278716176582829, |
| "learning_rate": 1.0134261150550607e-05, |
| "loss": 0.9578, |
| "num_tokens": 4967495353.0, |
| "step": 633 |
| }, |
| { |
| "epoch": 2.7355327203893998, |
| "grad_norm": 0.38762155738765125, |
| "learning_rate": 1.0109248365970417e-05, |
| "loss": 0.9471, |
| "num_tokens": 4975635351.0, |
| "step": 634 |
| }, |
| { |
| "epoch": 2.739859383450514, |
| "grad_norm": 0.42752590625715914, |
| "learning_rate": 1.0084242527650405e-05, |
| "loss": 0.9337, |
| "num_tokens": 4983586441.0, |
| "step": 635 |
| }, |
| { |
| "epoch": 2.744186046511628, |
| "grad_norm": 0.3486963485067961, |
| "learning_rate": 1.005924383059112e-05, |
| "loss": 0.9427, |
| "num_tokens": 4991751016.0, |
| "step": 636 |
| }, |
| { |
| "epoch": 2.748512709572742, |
| "grad_norm": 0.34837213958685875, |
| "learning_rate": 1.003425246973742e-05, |
| "loss": 0.9311, |
| "num_tokens": 4999725885.0, |
| "step": 637 |
| }, |
| { |
| "epoch": 2.752839372633856, |
| "grad_norm": 0.4023922893858053, |
| "learning_rate": 1.0009268639976963e-05, |
| "loss": 0.9673, |
| "num_tokens": 5008028648.0, |
| "step": 638 |
| }, |
| { |
| "epoch": 2.7571660356949703, |
| "grad_norm": 0.4231190144779909, |
| "learning_rate": 9.984292536138667e-06, |
| "loss": 0.937, |
| "num_tokens": 5016272386.0, |
| "step": 639 |
| }, |
| { |
| "epoch": 2.7614926987560846, |
| "grad_norm": 3.0302353492150087, |
| "learning_rate": 9.959324352991208e-06, |
| "loss": 0.9676, |
| "num_tokens": 5024339120.0, |
| "step": 640 |
| }, |
| { |
| "epoch": 2.7658193618171985, |
| "grad_norm": 0.5891540777938301, |
| "learning_rate": 9.934364285241492e-06, |
| "loss": 0.9493, |
| "num_tokens": 5032678352.0, |
| "step": 641 |
| }, |
| { |
| "epoch": 2.7701460248783127, |
| "grad_norm": 0.3713633204848213, |
| "learning_rate": 9.90941252753315e-06, |
| "loss": 0.9253, |
| "num_tokens": 5040939116.0, |
| "step": 642 |
| }, |
| { |
| "epoch": 2.7744726879394266, |
| "grad_norm": 0.5678305665625419, |
| "learning_rate": 9.884469274444985e-06, |
| "loss": 0.9397, |
| "num_tokens": 5049128904.0, |
| "step": 643 |
| }, |
| { |
| "epoch": 2.778799351000541, |
| "grad_norm": 0.5210583704451124, |
| "learning_rate": 9.859534720489512e-06, |
| "loss": 0.9197, |
| "num_tokens": 5057221429.0, |
| "step": 644 |
| }, |
| { |
| "epoch": 2.783126014061655, |
| "grad_norm": 0.5058583173880178, |
| "learning_rate": 9.834609060111379e-06, |
| "loss": 0.9345, |
| "num_tokens": 5065603969.0, |
| "step": 645 |
| }, |
| { |
| "epoch": 2.787452677122769, |
| "grad_norm": 1.065747933692024, |
| "learning_rate": 9.809692487685897e-06, |
| "loss": 0.9641, |
| "num_tokens": 5073922039.0, |
| "step": 646 |
| }, |
| { |
| "epoch": 2.7917793401838833, |
| "grad_norm": 0.6392727901163089, |
| "learning_rate": 9.784785197517508e-06, |
| "loss": 0.978, |
| "num_tokens": 5082066062.0, |
| "step": 647 |
| }, |
| { |
| "epoch": 2.796106003244997, |
| "grad_norm": 0.47665360710824717, |
| "learning_rate": 9.75988738383826e-06, |
| "loss": 0.9659, |
| "num_tokens": 5090277224.0, |
| "step": 648 |
| }, |
| { |
| "epoch": 2.8004326663061114, |
| "grad_norm": 0.4296714120604327, |
| "learning_rate": 9.734999240806305e-06, |
| "loss": 0.9419, |
| "num_tokens": 5098498265.0, |
| "step": 649 |
| }, |
| { |
| "epoch": 2.8047593293672257, |
| "grad_norm": 0.46022607243851776, |
| "learning_rate": 9.710120962504377e-06, |
| "loss": 0.9299, |
| "num_tokens": 5106762065.0, |
| "step": 650 |
| }, |
| { |
| "epoch": 2.8090859924283396, |
| "grad_norm": 0.38037491268678614, |
| "learning_rate": 9.685252742938298e-06, |
| "loss": 0.9604, |
| "num_tokens": 5114866977.0, |
| "step": 651 |
| }, |
| { |
| "epoch": 2.813412655489454, |
| "grad_norm": 0.4450482562419481, |
| "learning_rate": 9.660394776035432e-06, |
| "loss": 0.9645, |
| "num_tokens": 5123156278.0, |
| "step": 652 |
| }, |
| { |
| "epoch": 2.8177393185505677, |
| "grad_norm": 0.39576092998234047, |
| "learning_rate": 9.635547255643203e-06, |
| "loss": 0.9446, |
| "num_tokens": 5131331083.0, |
| "step": 653 |
| }, |
| { |
| "epoch": 2.822065981611682, |
| "grad_norm": 0.41154398069393333, |
| "learning_rate": 9.610710375527561e-06, |
| "loss": 0.9246, |
| "num_tokens": 5139347026.0, |
| "step": 654 |
| }, |
| { |
| "epoch": 2.8263926446727963, |
| "grad_norm": 0.3421006708746006, |
| "learning_rate": 9.585884329371496e-06, |
| "loss": 0.9861, |
| "num_tokens": 5147540492.0, |
| "step": 655 |
| }, |
| { |
| "epoch": 2.83071930773391, |
| "grad_norm": 0.4747580560478711, |
| "learning_rate": 9.561069310773487e-06, |
| "loss": 0.9542, |
| "num_tokens": 5155870243.0, |
| "step": 656 |
| }, |
| { |
| "epoch": 2.8350459707950244, |
| "grad_norm": 0.4224754159893733, |
| "learning_rate": 9.536265513246048e-06, |
| "loss": 0.935, |
| "num_tokens": 5163926726.0, |
| "step": 657 |
| }, |
| { |
| "epoch": 2.8393726338561383, |
| "grad_norm": 0.46762870636929993, |
| "learning_rate": 9.511473130214173e-06, |
| "loss": 0.9705, |
| "num_tokens": 5172107458.0, |
| "step": 658 |
| }, |
| { |
| "epoch": 2.8436992969172525, |
| "grad_norm": 0.4965576656745177, |
| "learning_rate": 9.486692355013838e-06, |
| "loss": 0.9183, |
| "num_tokens": 5180290484.0, |
| "step": 659 |
| }, |
| { |
| "epoch": 2.848025959978367, |
| "grad_norm": 0.3970648154104729, |
| "learning_rate": 9.461923380890513e-06, |
| "loss": 0.9421, |
| "num_tokens": 5188522043.0, |
| "step": 660 |
| }, |
| { |
| "epoch": 2.8523526230394807, |
| "grad_norm": 0.4619137564242492, |
| "learning_rate": 9.437166400997629e-06, |
| "loss": 0.9724, |
| "num_tokens": 5196656199.0, |
| "step": 661 |
| }, |
| { |
| "epoch": 2.856679286100595, |
| "grad_norm": 0.38039081459659835, |
| "learning_rate": 9.41242160839509e-06, |
| "loss": 0.9309, |
| "num_tokens": 5204830052.0, |
| "step": 662 |
| }, |
| { |
| "epoch": 2.861005949161709, |
| "grad_norm": 0.3969836134364504, |
| "learning_rate": 9.387689196047761e-06, |
| "loss": 0.9708, |
| "num_tokens": 5212973545.0, |
| "step": 663 |
| }, |
| { |
| "epoch": 2.865332612222823, |
| "grad_norm": 0.367110555698261, |
| "learning_rate": 9.36296935682396e-06, |
| "loss": 0.9443, |
| "num_tokens": 5221092259.0, |
| "step": 664 |
| }, |
| { |
| "epoch": 2.8696592752839374, |
| "grad_norm": 0.34785384103882466, |
| "learning_rate": 9.33826228349396e-06, |
| "loss": 0.9454, |
| "num_tokens": 5229402791.0, |
| "step": 665 |
| }, |
| { |
| "epoch": 2.8739859383450512, |
| "grad_norm": 0.40112917048533714, |
| "learning_rate": 9.313568168728478e-06, |
| "loss": 0.9502, |
| "num_tokens": 5237456315.0, |
| "step": 666 |
| }, |
| { |
| "epoch": 2.8783126014061655, |
| "grad_norm": 0.3426746194857651, |
| "learning_rate": 9.28888720509719e-06, |
| "loss": 0.9071, |
| "num_tokens": 5245758513.0, |
| "step": 667 |
| }, |
| { |
| "epoch": 2.8826392644672794, |
| "grad_norm": 0.41050977689909746, |
| "learning_rate": 9.264219585067197e-06, |
| "loss": 0.9598, |
| "num_tokens": 5254017826.0, |
| "step": 668 |
| }, |
| { |
| "epoch": 2.8869659275283936, |
| "grad_norm": 0.34041904977731363, |
| "learning_rate": 9.239565501001568e-06, |
| "loss": 0.9455, |
| "num_tokens": 5262031305.0, |
| "step": 669 |
| }, |
| { |
| "epoch": 2.891292590589508, |
| "grad_norm": 0.31648195039745797, |
| "learning_rate": 9.214925145157793e-06, |
| "loss": 0.952, |
| "num_tokens": 5270250950.0, |
| "step": 670 |
| }, |
| { |
| "epoch": 2.8956192536506222, |
| "grad_norm": 0.3463504968110696, |
| "learning_rate": 9.190298709686321e-06, |
| "loss": 0.9338, |
| "num_tokens": 5278464277.0, |
| "step": 671 |
| }, |
| { |
| "epoch": 2.899945916711736, |
| "grad_norm": 0.28534318933782366, |
| "learning_rate": 9.165686386629039e-06, |
| "loss": 0.9476, |
| "num_tokens": 5286697113.0, |
| "step": 672 |
| }, |
| { |
| "epoch": 2.90427257977285, |
| "grad_norm": 0.3397081347153792, |
| "learning_rate": 9.141088367917792e-06, |
| "loss": 0.9433, |
| "num_tokens": 5294847314.0, |
| "step": 673 |
| }, |
| { |
| "epoch": 2.908599242833964, |
| "grad_norm": 0.3230949625825632, |
| "learning_rate": 9.116504845372865e-06, |
| "loss": 0.9141, |
| "num_tokens": 5302924987.0, |
| "step": 674 |
| }, |
| { |
| "epoch": 2.9129259058950785, |
| "grad_norm": 0.34833673066450366, |
| "learning_rate": 9.091936010701513e-06, |
| "loss": 0.9698, |
| "num_tokens": 5311177487.0, |
| "step": 675 |
| }, |
| { |
| "epoch": 2.917252568956193, |
| "grad_norm": 0.3619472901098368, |
| "learning_rate": 9.067382055496431e-06, |
| "loss": 0.9719, |
| "num_tokens": 5319219559.0, |
| "step": 676 |
| }, |
| { |
| "epoch": 2.9215792320173066, |
| "grad_norm": 0.350338044609913, |
| "learning_rate": 9.042843171234307e-06, |
| "loss": 0.9576, |
| "num_tokens": 5327511456.0, |
| "step": 677 |
| }, |
| { |
| "epoch": 2.925905895078421, |
| "grad_norm": 0.3088705516097823, |
| "learning_rate": 9.018319549274288e-06, |
| "loss": 0.9562, |
| "num_tokens": 5335791238.0, |
| "step": 678 |
| }, |
| { |
| "epoch": 2.9302325581395348, |
| "grad_norm": 0.33745754789789145, |
| "learning_rate": 8.993811380856496e-06, |
| "loss": 0.9341, |
| "num_tokens": 5343893681.0, |
| "step": 679 |
| }, |
| { |
| "epoch": 2.934559221200649, |
| "grad_norm": 0.34258505632081965, |
| "learning_rate": 8.969318857100564e-06, |
| "loss": 0.9429, |
| "num_tokens": 5352079321.0, |
| "step": 680 |
| }, |
| { |
| "epoch": 2.9388858842617633, |
| "grad_norm": 0.2995800360605643, |
| "learning_rate": 8.944842169004099e-06, |
| "loss": 0.9589, |
| "num_tokens": 5360263368.0, |
| "step": 681 |
| }, |
| { |
| "epoch": 2.943212547322877, |
| "grad_norm": 0.3773143251992146, |
| "learning_rate": 8.920381507441243e-06, |
| "loss": 0.9362, |
| "num_tokens": 5368504895.0, |
| "step": 682 |
| }, |
| { |
| "epoch": 2.9475392103839915, |
| "grad_norm": 0.34689165094951335, |
| "learning_rate": 8.89593706316114e-06, |
| "loss": 0.9264, |
| "num_tokens": 5376846780.0, |
| "step": 683 |
| }, |
| { |
| "epoch": 2.9518658734451053, |
| "grad_norm": 0.41412476158382777, |
| "learning_rate": 8.87150902678648e-06, |
| "loss": 0.9594, |
| "num_tokens": 5384933248.0, |
| "step": 684 |
| }, |
| { |
| "epoch": 2.9561925365062196, |
| "grad_norm": 0.36680127928889933, |
| "learning_rate": 8.847097588811998e-06, |
| "loss": 0.9589, |
| "num_tokens": 5393187704.0, |
| "step": 685 |
| }, |
| { |
| "epoch": 2.960519199567334, |
| "grad_norm": 0.34424202273538157, |
| "learning_rate": 8.822702939602991e-06, |
| "loss": 0.9542, |
| "num_tokens": 5401348320.0, |
| "step": 686 |
| }, |
| { |
| "epoch": 2.9648458626284477, |
| "grad_norm": 0.3173741339592272, |
| "learning_rate": 8.798325269393837e-06, |
| "loss": 0.9313, |
| "num_tokens": 5409586273.0, |
| "step": 687 |
| }, |
| { |
| "epoch": 2.969172525689562, |
| "grad_norm": 0.3244123403034035, |
| "learning_rate": 8.773964768286496e-06, |
| "loss": 0.9612, |
| "num_tokens": 5417804753.0, |
| "step": 688 |
| }, |
| { |
| "epoch": 2.973499188750676, |
| "grad_norm": 0.32961010771746146, |
| "learning_rate": 8.749621626249064e-06, |
| "loss": 0.9268, |
| "num_tokens": 5426038605.0, |
| "step": 689 |
| }, |
| { |
| "epoch": 2.97782585181179, |
| "grad_norm": 0.3463195021104508, |
| "learning_rate": 8.725296033114236e-06, |
| "loss": 0.9314, |
| "num_tokens": 5434234877.0, |
| "step": 690 |
| }, |
| { |
| "epoch": 2.9821525148729044, |
| "grad_norm": 0.3050357082850999, |
| "learning_rate": 8.700988178577887e-06, |
| "loss": 0.9222, |
| "num_tokens": 5442351769.0, |
| "step": 691 |
| }, |
| { |
| "epoch": 2.9864791779340183, |
| "grad_norm": 0.3170878559410219, |
| "learning_rate": 8.676698252197542e-06, |
| "loss": 0.9628, |
| "num_tokens": 5450580632.0, |
| "step": 692 |
| }, |
| { |
| "epoch": 2.9908058409951326, |
| "grad_norm": 0.3366257932732851, |
| "learning_rate": 8.652426443390931e-06, |
| "loss": 0.9407, |
| "num_tokens": 5458798186.0, |
| "step": 693 |
| }, |
| { |
| "epoch": 2.9951325040562464, |
| "grad_norm": 0.4018191360229351, |
| "learning_rate": 8.628172941434488e-06, |
| "loss": 0.9606, |
| "num_tokens": 5466877716.0, |
| "step": 694 |
| }, |
| { |
| "epoch": 2.9994591671173607, |
| "grad_norm": 0.3274075286425149, |
| "learning_rate": 8.603937935461901e-06, |
| "loss": 0.9164, |
| "num_tokens": 5475055549.0, |
| "step": 695 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 0.5614038305725747, |
| "learning_rate": 8.57972161446261e-06, |
| "loss": 0.9008, |
| "num_tokens": 5476104008.0, |
| "step": 696 |
| }, |
| { |
| "epoch": 3.0043266630611143, |
| "grad_norm": 0.4105885304776379, |
| "learning_rate": 8.55552416728035e-06, |
| "loss": 0.913, |
| "num_tokens": 5484223267.0, |
| "step": 697 |
| }, |
| { |
| "epoch": 3.008653326122228, |
| "grad_norm": 0.3773905264475136, |
| "learning_rate": 8.531345782611683e-06, |
| "loss": 0.9167, |
| "num_tokens": 5492414464.0, |
| "step": 698 |
| }, |
| { |
| "epoch": 3.0129799891833424, |
| "grad_norm": 0.3566408355562363, |
| "learning_rate": 8.5071866490045e-06, |
| "loss": 0.9194, |
| "num_tokens": 5500623429.0, |
| "step": 699 |
| }, |
| { |
| "epoch": 3.0173066522444563, |
| "grad_norm": 0.4193673926911609, |
| "learning_rate": 8.483046954856585e-06, |
| "loss": 0.9491, |
| "num_tokens": 5508774636.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 3.0216333153055706, |
| "grad_norm": 0.35556886195846743, |
| "learning_rate": 8.458926888414112e-06, |
| "loss": 0.9589, |
| "num_tokens": 5517065103.0, |
| "step": 701 |
| }, |
| { |
| "epoch": 3.025959978366685, |
| "grad_norm": 0.41653240041710277, |
| "learning_rate": 8.434826637770217e-06, |
| "loss": 0.9173, |
| "num_tokens": 5525187881.0, |
| "step": 702 |
| }, |
| { |
| "epoch": 3.0302866414277987, |
| "grad_norm": 0.3146167010937598, |
| "learning_rate": 8.410746390863487e-06, |
| "loss": 0.9006, |
| "num_tokens": 5533313938.0, |
| "step": 703 |
| }, |
| { |
| "epoch": 3.034613304488913, |
| "grad_norm": 0.4044270250363667, |
| "learning_rate": 8.386686335476529e-06, |
| "loss": 0.923, |
| "num_tokens": 5541377157.0, |
| "step": 704 |
| }, |
| { |
| "epoch": 3.038939967550027, |
| "grad_norm": 0.3395022183255716, |
| "learning_rate": 8.362646659234485e-06, |
| "loss": 0.8981, |
| "num_tokens": 5549611519.0, |
| "step": 705 |
| }, |
| { |
| "epoch": 3.043266630611141, |
| "grad_norm": 0.4471600294802068, |
| "learning_rate": 8.338627549603585e-06, |
| "loss": 0.9222, |
| "num_tokens": 5557919196.0, |
| "step": 706 |
| }, |
| { |
| "epoch": 3.0475932936722554, |
| "grad_norm": 0.5305326751152935, |
| "learning_rate": 8.314629193889658e-06, |
| "loss": 0.9397, |
| "num_tokens": 5566104499.0, |
| "step": 707 |
| }, |
| { |
| "epoch": 3.0519199567333692, |
| "grad_norm": 0.35027612291144045, |
| "learning_rate": 8.290651779236718e-06, |
| "loss": 0.9131, |
| "num_tokens": 5574322504.0, |
| "step": 708 |
| }, |
| { |
| "epoch": 3.0562466197944835, |
| "grad_norm": 0.33827746335457937, |
| "learning_rate": 8.266695492625454e-06, |
| "loss": 0.9456, |
| "num_tokens": 5582427975.0, |
| "step": 709 |
| }, |
| { |
| "epoch": 3.060573282855598, |
| "grad_norm": 0.40908561698414975, |
| "learning_rate": 8.242760520871797e-06, |
| "loss": 0.9471, |
| "num_tokens": 5590629219.0, |
| "step": 710 |
| }, |
| { |
| "epoch": 3.0648999459167117, |
| "grad_norm": 0.3186114149151915, |
| "learning_rate": 8.218847050625476e-06, |
| "loss": 0.9024, |
| "num_tokens": 5598721452.0, |
| "step": 711 |
| }, |
| { |
| "epoch": 3.069226608977826, |
| "grad_norm": 0.3081259118888582, |
| "learning_rate": 8.194955268368526e-06, |
| "loss": 0.9243, |
| "num_tokens": 5606863531.0, |
| "step": 712 |
| }, |
| { |
| "epoch": 3.07355327203894, |
| "grad_norm": 0.35787737365668276, |
| "learning_rate": 8.171085360413867e-06, |
| "loss": 0.9459, |
| "num_tokens": 5614997329.0, |
| "step": 713 |
| }, |
| { |
| "epoch": 3.077879935100054, |
| "grad_norm": 0.32537016650973943, |
| "learning_rate": 8.147237512903834e-06, |
| "loss": 0.9094, |
| "num_tokens": 5622970021.0, |
| "step": 714 |
| }, |
| { |
| "epoch": 3.0822065981611684, |
| "grad_norm": 0.3105881415673291, |
| "learning_rate": 8.12341191180874e-06, |
| "loss": 0.927, |
| "num_tokens": 5631238351.0, |
| "step": 715 |
| }, |
| { |
| "epoch": 3.086533261222282, |
| "grad_norm": 0.3008636618396721, |
| "learning_rate": 8.099608742925403e-06, |
| "loss": 0.9215, |
| "num_tokens": 5639467774.0, |
| "step": 716 |
| }, |
| { |
| "epoch": 3.0908599242833965, |
| "grad_norm": 0.326147375459268, |
| "learning_rate": 8.075828191875714e-06, |
| "loss": 0.931, |
| "num_tokens": 5647579841.0, |
| "step": 717 |
| }, |
| { |
| "epoch": 3.0951865873445104, |
| "grad_norm": 0.3522548972988388, |
| "learning_rate": 8.052070444105188e-06, |
| "loss": 0.9095, |
| "num_tokens": 5655755448.0, |
| "step": 718 |
| }, |
| { |
| "epoch": 3.0995132504056246, |
| "grad_norm": 3.89200308036797, |
| "learning_rate": 8.028335684881517e-06, |
| "loss": 0.9562, |
| "num_tokens": 5663831787.0, |
| "step": 719 |
| }, |
| { |
| "epoch": 3.103839913466739, |
| "grad_norm": 0.5290111236398576, |
| "learning_rate": 8.00462409929312e-06, |
| "loss": 0.9091, |
| "num_tokens": 5672081053.0, |
| "step": 720 |
| }, |
| { |
| "epoch": 3.1081665765278528, |
| "grad_norm": 0.31017972774713937, |
| "learning_rate": 7.980935872247706e-06, |
| "loss": 0.8981, |
| "num_tokens": 5680229315.0, |
| "step": 721 |
| }, |
| { |
| "epoch": 3.112493239588967, |
| "grad_norm": 0.44238544431516175, |
| "learning_rate": 7.957271188470828e-06, |
| "loss": 0.9383, |
| "num_tokens": 5688587865.0, |
| "step": 722 |
| }, |
| { |
| "epoch": 3.1168199026500814, |
| "grad_norm": 0.36162049774260335, |
| "learning_rate": 7.933630232504437e-06, |
| "loss": 0.9364, |
| "num_tokens": 5696702957.0, |
| "step": 723 |
| }, |
| { |
| "epoch": 3.121146565711195, |
| "grad_norm": 0.3305075065326217, |
| "learning_rate": 7.910013188705464e-06, |
| "loss": 0.9394, |
| "num_tokens": 5704940872.0, |
| "step": 724 |
| }, |
| { |
| "epoch": 3.1254732287723095, |
| "grad_norm": 0.3612306221470166, |
| "learning_rate": 7.88642024124436e-06, |
| "loss": 0.9533, |
| "num_tokens": 5713220714.0, |
| "step": 725 |
| }, |
| { |
| "epoch": 3.1297998918334233, |
| "grad_norm": 0.35107569970160807, |
| "learning_rate": 7.862851574103665e-06, |
| "loss": 0.9291, |
| "num_tokens": 5721074137.0, |
| "step": 726 |
| }, |
| { |
| "epoch": 3.1341265548945376, |
| "grad_norm": 0.368464722379813, |
| "learning_rate": 7.839307371076581e-06, |
| "loss": 0.9288, |
| "num_tokens": 5729151610.0, |
| "step": 727 |
| }, |
| { |
| "epoch": 3.138453217955652, |
| "grad_norm": 0.34514054184426424, |
| "learning_rate": 7.815787815765536e-06, |
| "loss": 0.9613, |
| "num_tokens": 5737530913.0, |
| "step": 728 |
| }, |
| { |
| "epoch": 3.1427798810167658, |
| "grad_norm": 0.35482210860728847, |
| "learning_rate": 7.792293091580746e-06, |
| "loss": 0.955, |
| "num_tokens": 5745744229.0, |
| "step": 729 |
| }, |
| { |
| "epoch": 3.14710654407788, |
| "grad_norm": 0.32475020191847437, |
| "learning_rate": 7.768823381738786e-06, |
| "loss": 0.9209, |
| "num_tokens": 5753821386.0, |
| "step": 730 |
| }, |
| { |
| "epoch": 3.151433207138994, |
| "grad_norm": 0.34317842748446037, |
| "learning_rate": 7.74537886926117e-06, |
| "loss": 0.9305, |
| "num_tokens": 5762051038.0, |
| "step": 731 |
| }, |
| { |
| "epoch": 3.155759870200108, |
| "grad_norm": 0.28148438370865, |
| "learning_rate": 7.721959736972918e-06, |
| "loss": 0.9517, |
| "num_tokens": 5770277494.0, |
| "step": 732 |
| }, |
| { |
| "epoch": 3.1600865332612225, |
| "grad_norm": 0.3857808901571952, |
| "learning_rate": 7.698566167501124e-06, |
| "loss": 0.9133, |
| "num_tokens": 5778496571.0, |
| "step": 733 |
| }, |
| { |
| "epoch": 3.1644131963223363, |
| "grad_norm": 0.2789904847537803, |
| "learning_rate": 7.675198343273546e-06, |
| "loss": 0.9086, |
| "num_tokens": 5786685495.0, |
| "step": 734 |
| }, |
| { |
| "epoch": 3.1687398593834506, |
| "grad_norm": 0.37671200165653507, |
| "learning_rate": 7.651856446517172e-06, |
| "loss": 0.9147, |
| "num_tokens": 5794832046.0, |
| "step": 735 |
| }, |
| { |
| "epoch": 3.1730665224445644, |
| "grad_norm": 0.30040878970925106, |
| "learning_rate": 7.628540659256791e-06, |
| "loss": 0.9403, |
| "num_tokens": 5803097014.0, |
| "step": 736 |
| }, |
| { |
| "epoch": 3.1773931855056787, |
| "grad_norm": 0.3219509548910601, |
| "learning_rate": 7.605251163313614e-06, |
| "loss": 0.9388, |
| "num_tokens": 5811278088.0, |
| "step": 737 |
| }, |
| { |
| "epoch": 3.181719848566793, |
| "grad_norm": 0.36758751240035475, |
| "learning_rate": 7.581988140303791e-06, |
| "loss": 0.9206, |
| "num_tokens": 5819328655.0, |
| "step": 738 |
| }, |
| { |
| "epoch": 3.186046511627907, |
| "grad_norm": 0.29512444869432597, |
| "learning_rate": 7.558751771637059e-06, |
| "loss": 0.9167, |
| "num_tokens": 5827380764.0, |
| "step": 739 |
| }, |
| { |
| "epoch": 3.190373174689021, |
| "grad_norm": 2.0124288170095754, |
| "learning_rate": 7.535542238515285e-06, |
| "loss": 0.9128, |
| "num_tokens": 5835351467.0, |
| "step": 740 |
| }, |
| { |
| "epoch": 3.194699837750135, |
| "grad_norm": 0.4989076582507637, |
| "learning_rate": 7.512359721931075e-06, |
| "loss": 0.967, |
| "num_tokens": 5843312078.0, |
| "step": 741 |
| }, |
| { |
| "epoch": 3.1990265008112493, |
| "grad_norm": 0.3140029576082317, |
| "learning_rate": 7.489204402666344e-06, |
| "loss": 0.9486, |
| "num_tokens": 5851393198.0, |
| "step": 742 |
| }, |
| { |
| "epoch": 3.2033531638723636, |
| "grad_norm": 0.4125226908546305, |
| "learning_rate": 7.466076461290925e-06, |
| "loss": 0.9468, |
| "num_tokens": 5859740781.0, |
| "step": 743 |
| }, |
| { |
| "epoch": 3.2076798269334774, |
| "grad_norm": 0.39438107603387274, |
| "learning_rate": 7.442976078161155e-06, |
| "loss": 0.9247, |
| "num_tokens": 5868046866.0, |
| "step": 744 |
| }, |
| { |
| "epoch": 3.2120064899945917, |
| "grad_norm": 0.35730355992230906, |
| "learning_rate": 7.419903433418454e-06, |
| "loss": 0.9599, |
| "num_tokens": 5876294797.0, |
| "step": 745 |
| }, |
| { |
| "epoch": 3.2163331530557056, |
| "grad_norm": 0.3483111629570251, |
| "learning_rate": 7.396858706987948e-06, |
| "loss": 0.939, |
| "num_tokens": 5884474850.0, |
| "step": 746 |
| }, |
| { |
| "epoch": 3.22065981611682, |
| "grad_norm": 0.36151168683199636, |
| "learning_rate": 7.373842078577038e-06, |
| "loss": 0.9431, |
| "num_tokens": 5892667501.0, |
| "step": 747 |
| }, |
| { |
| "epoch": 3.224986479177934, |
| "grad_norm": 0.3099927796740935, |
| "learning_rate": 7.350853727674019e-06, |
| "loss": 0.9233, |
| "num_tokens": 5900881285.0, |
| "step": 748 |
| }, |
| { |
| "epoch": 3.229313142239048, |
| "grad_norm": 0.33503892378294126, |
| "learning_rate": 7.327893833546666e-06, |
| "loss": 0.9496, |
| "num_tokens": 5909032821.0, |
| "step": 749 |
| }, |
| { |
| "epoch": 3.2336398053001623, |
| "grad_norm": 0.30206423338613264, |
| "learning_rate": 7.3049625752408485e-06, |
| "loss": 0.9302, |
| "num_tokens": 5917206619.0, |
| "step": 750 |
| }, |
| { |
| "epoch": 3.2379664683612766, |
| "grad_norm": 0.3494142163645988, |
| "learning_rate": 7.282060131579125e-06, |
| "loss": 0.9062, |
| "num_tokens": 5925492396.0, |
| "step": 751 |
| }, |
| { |
| "epoch": 3.2422931314223904, |
| "grad_norm": 0.35685720307280216, |
| "learning_rate": 7.259186681159349e-06, |
| "loss": 0.908, |
| "num_tokens": 5933754893.0, |
| "step": 752 |
| }, |
| { |
| "epoch": 3.2466197944835047, |
| "grad_norm": 0.3282045615510198, |
| "learning_rate": 7.236342402353289e-06, |
| "loss": 0.9339, |
| "num_tokens": 5941936261.0, |
| "step": 753 |
| }, |
| { |
| "epoch": 3.2509464575446185, |
| "grad_norm": 0.33739095848560247, |
| "learning_rate": 7.213527473305211e-06, |
| "loss": 0.9322, |
| "num_tokens": 5950172687.0, |
| "step": 754 |
| }, |
| { |
| "epoch": 3.255273120605733, |
| "grad_norm": 0.3740851819217219, |
| "learning_rate": 7.1907420719305185e-06, |
| "loss": 0.917, |
| "num_tokens": 5958489000.0, |
| "step": 755 |
| }, |
| { |
| "epoch": 3.259599783666847, |
| "grad_norm": 0.298804748301486, |
| "learning_rate": 7.167986375914347e-06, |
| "loss": 0.9376, |
| "num_tokens": 5966823523.0, |
| "step": 756 |
| }, |
| { |
| "epoch": 3.263926446727961, |
| "grad_norm": 0.36616478845543216, |
| "learning_rate": 7.145260562710188e-06, |
| "loss": 0.9445, |
| "num_tokens": 5975019052.0, |
| "step": 757 |
| }, |
| { |
| "epoch": 3.2682531097890752, |
| "grad_norm": 0.35620532611163425, |
| "learning_rate": 7.1225648095384994e-06, |
| "loss": 0.951, |
| "num_tokens": 5983270411.0, |
| "step": 758 |
| }, |
| { |
| "epoch": 3.2725797728501895, |
| "grad_norm": 0.2988864186906242, |
| "learning_rate": 7.099899293385317e-06, |
| "loss": 0.9334, |
| "num_tokens": 5991383255.0, |
| "step": 759 |
| }, |
| { |
| "epoch": 3.2769064359113034, |
| "grad_norm": 0.3325133244022023, |
| "learning_rate": 7.077264191000895e-06, |
| "loss": 0.9277, |
| "num_tokens": 5999661742.0, |
| "step": 760 |
| }, |
| { |
| "epoch": 3.2812330989724177, |
| "grad_norm": 0.35243717348743153, |
| "learning_rate": 7.054659678898304e-06, |
| "loss": 0.9398, |
| "num_tokens": 6007956049.0, |
| "step": 761 |
| }, |
| { |
| "epoch": 3.2855597620335315, |
| "grad_norm": 0.28359237790060177, |
| "learning_rate": 7.032085933352075e-06, |
| "loss": 0.9716, |
| "num_tokens": 6016079855.0, |
| "step": 762 |
| }, |
| { |
| "epoch": 3.289886425094646, |
| "grad_norm": 0.2917404250644013, |
| "learning_rate": 7.0095431303968035e-06, |
| "loss": 0.9161, |
| "num_tokens": 6024330663.0, |
| "step": 763 |
| }, |
| { |
| "epoch": 3.29421308815576, |
| "grad_norm": 0.27888385505990193, |
| "learning_rate": 6.987031445825805e-06, |
| "loss": 0.9304, |
| "num_tokens": 6032564116.0, |
| "step": 764 |
| }, |
| { |
| "epoch": 3.298539751216874, |
| "grad_norm": 0.3196527440559267, |
| "learning_rate": 6.964551055189712e-06, |
| "loss": 0.9095, |
| "num_tokens": 6040658380.0, |
| "step": 765 |
| }, |
| { |
| "epoch": 3.302866414277988, |
| "grad_norm": 0.7134391554970761, |
| "learning_rate": 6.942102133795126e-06, |
| "loss": 0.9254, |
| "num_tokens": 6048731691.0, |
| "step": 766 |
| }, |
| { |
| "epoch": 3.307193077339102, |
| "grad_norm": 0.4023470106512136, |
| "learning_rate": 6.919684856703244e-06, |
| "loss": 0.9311, |
| "num_tokens": 6056818812.0, |
| "step": 767 |
| }, |
| { |
| "epoch": 3.3115197404002163, |
| "grad_norm": 0.3052498648114602, |
| "learning_rate": 6.897299398728503e-06, |
| "loss": 0.941, |
| "num_tokens": 6064982756.0, |
| "step": 768 |
| }, |
| { |
| "epoch": 3.3158464034613306, |
| "grad_norm": 0.2785893249101237, |
| "learning_rate": 6.874945934437192e-06, |
| "loss": 0.8953, |
| "num_tokens": 6073271792.0, |
| "step": 769 |
| }, |
| { |
| "epoch": 3.3201730665224445, |
| "grad_norm": 0.3294134296277513, |
| "learning_rate": 6.852624638146123e-06, |
| "loss": 0.9185, |
| "num_tokens": 6081302170.0, |
| "step": 770 |
| }, |
| { |
| "epoch": 3.3244997295835588, |
| "grad_norm": 0.27742083830120734, |
| "learning_rate": 6.830335683921248e-06, |
| "loss": 0.9409, |
| "num_tokens": 6089584899.0, |
| "step": 771 |
| }, |
| { |
| "epoch": 3.3288263926446726, |
| "grad_norm": 0.29411318866548547, |
| "learning_rate": 6.808079245576303e-06, |
| "loss": 0.9518, |
| "num_tokens": 6097617556.0, |
| "step": 772 |
| }, |
| { |
| "epoch": 3.333153055705787, |
| "grad_norm": 0.26948329262472964, |
| "learning_rate": 6.785855496671471e-06, |
| "loss": 0.946, |
| "num_tokens": 6105889261.0, |
| "step": 773 |
| }, |
| { |
| "epoch": 3.337479718766901, |
| "grad_norm": 0.293679847613122, |
| "learning_rate": 6.763664610512007e-06, |
| "loss": 0.8957, |
| "num_tokens": 6114035928.0, |
| "step": 774 |
| }, |
| { |
| "epoch": 3.341806381828015, |
| "grad_norm": 0.32098790507703917, |
| "learning_rate": 6.741506760146903e-06, |
| "loss": 0.9222, |
| "num_tokens": 6122198138.0, |
| "step": 775 |
| }, |
| { |
| "epoch": 3.3461330448891293, |
| "grad_norm": 0.3050861139648036, |
| "learning_rate": 6.719382118367523e-06, |
| "loss": 0.9565, |
| "num_tokens": 6130496867.0, |
| "step": 776 |
| }, |
| { |
| "epoch": 3.350459707950243, |
| "grad_norm": 0.2953351225139781, |
| "learning_rate": 6.697290857706271e-06, |
| "loss": 0.943, |
| "num_tokens": 6138729262.0, |
| "step": 777 |
| }, |
| { |
| "epoch": 3.3547863710113575, |
| "grad_norm": 0.3084437070089497, |
| "learning_rate": 6.675233150435234e-06, |
| "loss": 0.9066, |
| "num_tokens": 6147052020.0, |
| "step": 778 |
| }, |
| { |
| "epoch": 3.3591130340724717, |
| "grad_norm": 0.30274593086650614, |
| "learning_rate": 6.653209168564847e-06, |
| "loss": 0.9322, |
| "num_tokens": 6155200347.0, |
| "step": 779 |
| }, |
| { |
| "epoch": 3.3634396971335856, |
| "grad_norm": 0.29878082533309175, |
| "learning_rate": 6.631219083842535e-06, |
| "loss": 0.9203, |
| "num_tokens": 6163279432.0, |
| "step": 780 |
| }, |
| { |
| "epoch": 3.3677663601947, |
| "grad_norm": 0.29441364527784664, |
| "learning_rate": 6.609263067751406e-06, |
| "loss": 0.9471, |
| "num_tokens": 6171453047.0, |
| "step": 781 |
| }, |
| { |
| "epoch": 3.3720930232558137, |
| "grad_norm": 0.33375117226467504, |
| "learning_rate": 6.587341291508882e-06, |
| "loss": 0.9152, |
| "num_tokens": 6179561278.0, |
| "step": 782 |
| }, |
| { |
| "epoch": 3.376419686316928, |
| "grad_norm": 0.2801957802924117, |
| "learning_rate": 6.565453926065377e-06, |
| "loss": 0.9172, |
| "num_tokens": 6187682586.0, |
| "step": 783 |
| }, |
| { |
| "epoch": 3.3807463493780423, |
| "grad_norm": 0.3024317409157831, |
| "learning_rate": 6.543601142102964e-06, |
| "loss": 0.9159, |
| "num_tokens": 6195851861.0, |
| "step": 784 |
| }, |
| { |
| "epoch": 3.385073012439156, |
| "grad_norm": 0.32720306934594906, |
| "learning_rate": 6.521783110034038e-06, |
| "loss": 0.925, |
| "num_tokens": 6203979372.0, |
| "step": 785 |
| }, |
| { |
| "epoch": 3.3893996755002704, |
| "grad_norm": 0.3038756358264362, |
| "learning_rate": 6.500000000000003e-06, |
| "loss": 0.9009, |
| "num_tokens": 6212201742.0, |
| "step": 786 |
| }, |
| { |
| "epoch": 3.3937263385613847, |
| "grad_norm": 0.27865244384411847, |
| "learning_rate": 6.478251981869919e-06, |
| "loss": 0.93, |
| "num_tokens": 6220292265.0, |
| "step": 787 |
| }, |
| { |
| "epoch": 3.3980530016224986, |
| "grad_norm": 0.27563984049065876, |
| "learning_rate": 6.4565392252392066e-06, |
| "loss": 0.9167, |
| "num_tokens": 6228496839.0, |
| "step": 788 |
| }, |
| { |
| "epoch": 3.402379664683613, |
| "grad_norm": 0.3179833299526435, |
| "learning_rate": 6.434861899428299e-06, |
| "loss": 0.9263, |
| "num_tokens": 6236728852.0, |
| "step": 789 |
| }, |
| { |
| "epoch": 3.4067063277447267, |
| "grad_norm": 0.2769787878341213, |
| "learning_rate": 6.41322017348134e-06, |
| "loss": 0.9465, |
| "num_tokens": 6244700023.0, |
| "step": 790 |
| }, |
| { |
| "epoch": 3.411032990805841, |
| "grad_norm": 0.3394752547637937, |
| "learning_rate": 6.391614216164859e-06, |
| "loss": 0.9564, |
| "num_tokens": 6252905728.0, |
| "step": 791 |
| }, |
| { |
| "epoch": 3.4153596538669553, |
| "grad_norm": 0.29740964104918016, |
| "learning_rate": 6.370044195966443e-06, |
| "loss": 0.9377, |
| "num_tokens": 6260996249.0, |
| "step": 792 |
| }, |
| { |
| "epoch": 3.419686316928069, |
| "grad_norm": 0.29478500973407357, |
| "learning_rate": 6.3485102810934495e-06, |
| "loss": 0.9603, |
| "num_tokens": 6269249711.0, |
| "step": 793 |
| }, |
| { |
| "epoch": 3.4240129799891834, |
| "grad_norm": 0.3291226660785669, |
| "learning_rate": 6.327012639471668e-06, |
| "loss": 0.9345, |
| "num_tokens": 6277425640.0, |
| "step": 794 |
| }, |
| { |
| "epoch": 3.4283396430502977, |
| "grad_norm": 0.29781461848718044, |
| "learning_rate": 6.305551438744031e-06, |
| "loss": 0.9278, |
| "num_tokens": 6285662840.0, |
| "step": 795 |
| }, |
| { |
| "epoch": 3.4326663061114115, |
| "grad_norm": 0.2869526690619102, |
| "learning_rate": 6.2841268462692894e-06, |
| "loss": 0.9404, |
| "num_tokens": 6293843869.0, |
| "step": 796 |
| }, |
| { |
| "epoch": 3.436992969172526, |
| "grad_norm": 0.3520399233785471, |
| "learning_rate": 6.262739029120721e-06, |
| "loss": 0.9328, |
| "num_tokens": 6302081427.0, |
| "step": 797 |
| }, |
| { |
| "epoch": 3.4413196322336397, |
| "grad_norm": 0.277183264753593, |
| "learning_rate": 6.241388154084815e-06, |
| "loss": 0.8792, |
| "num_tokens": 6310350558.0, |
| "step": 798 |
| }, |
| { |
| "epoch": 3.445646295294754, |
| "grad_norm": 0.3559461779938261, |
| "learning_rate": 6.220074387659993e-06, |
| "loss": 0.9365, |
| "num_tokens": 6318527498.0, |
| "step": 799 |
| }, |
| { |
| "epoch": 3.4499729583558683, |
| "grad_norm": 0.3120065081521411, |
| "learning_rate": 6.198797896055277e-06, |
| "loss": 0.9108, |
| "num_tokens": 6326734548.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 3.454299621416982, |
| "grad_norm": 0.2948366567102916, |
| "learning_rate": 6.177558845189029e-06, |
| "loss": 0.9409, |
| "num_tokens": 6334913287.0, |
| "step": 801 |
| }, |
| { |
| "epoch": 3.4586262844780964, |
| "grad_norm": 0.3130208095797164, |
| "learning_rate": 6.156357400687636e-06, |
| "loss": 0.9381, |
| "num_tokens": 6342974418.0, |
| "step": 802 |
| }, |
| { |
| "epoch": 3.4629529475392102, |
| "grad_norm": 0.2780789055689837, |
| "learning_rate": 6.135193727884217e-06, |
| "loss": 0.9134, |
| "num_tokens": 6351115042.0, |
| "step": 803 |
| }, |
| { |
| "epoch": 3.4672796106003245, |
| "grad_norm": 0.33413747564114543, |
| "learning_rate": 6.114067991817345e-06, |
| "loss": 0.9162, |
| "num_tokens": 6359218103.0, |
| "step": 804 |
| }, |
| { |
| "epoch": 3.471606273661439, |
| "grad_norm": 0.2856448915194415, |
| "learning_rate": 6.09298035722975e-06, |
| "loss": 0.9469, |
| "num_tokens": 6367267149.0, |
| "step": 805 |
| }, |
| { |
| "epoch": 3.4759329367225527, |
| "grad_norm": 0.3010782239547219, |
| "learning_rate": 6.07193098856705e-06, |
| "loss": 0.9281, |
| "num_tokens": 6375383838.0, |
| "step": 806 |
| }, |
| { |
| "epoch": 3.480259599783667, |
| "grad_norm": 0.3598745028912282, |
| "learning_rate": 6.050920049976443e-06, |
| "loss": 0.9297, |
| "num_tokens": 6383520730.0, |
| "step": 807 |
| }, |
| { |
| "epoch": 3.484586262844781, |
| "grad_norm": 0.2655948499376127, |
| "learning_rate": 6.029947705305453e-06, |
| "loss": 0.9118, |
| "num_tokens": 6391710224.0, |
| "step": 808 |
| }, |
| { |
| "epoch": 3.488912925905895, |
| "grad_norm": 0.374075695585731, |
| "learning_rate": 6.009014118100638e-06, |
| "loss": 0.9175, |
| "num_tokens": 6399847271.0, |
| "step": 809 |
| }, |
| { |
| "epoch": 3.4932395889670094, |
| "grad_norm": 0.3031326750096311, |
| "learning_rate": 5.988119451606312e-06, |
| "loss": 0.936, |
| "num_tokens": 6407983914.0, |
| "step": 810 |
| }, |
| { |
| "epoch": 3.497566252028123, |
| "grad_norm": 0.36971692188767286, |
| "learning_rate": 5.96726386876328e-06, |
| "loss": 0.9025, |
| "num_tokens": 6416189403.0, |
| "step": 811 |
| }, |
| { |
| "epoch": 3.5018929150892375, |
| "grad_norm": 0.3146277238724712, |
| "learning_rate": 5.946447532207571e-06, |
| "loss": 0.8989, |
| "num_tokens": 6424489803.0, |
| "step": 812 |
| }, |
| { |
| "epoch": 3.5062195781503513, |
| "grad_norm": 0.2982112963563615, |
| "learning_rate": 5.92567060426916e-06, |
| "loss": 0.9487, |
| "num_tokens": 6432747661.0, |
| "step": 813 |
| }, |
| { |
| "epoch": 3.5105462412114656, |
| "grad_norm": 0.37960863086345786, |
| "learning_rate": 5.904933246970699e-06, |
| "loss": 0.9156, |
| "num_tokens": 6440886009.0, |
| "step": 814 |
| }, |
| { |
| "epoch": 3.51487290427258, |
| "grad_norm": 0.296061478455817, |
| "learning_rate": 5.884235622026278e-06, |
| "loss": 0.9234, |
| "num_tokens": 6448957359.0, |
| "step": 815 |
| }, |
| { |
| "epoch": 3.5191995673336938, |
| "grad_norm": 0.3382022290934504, |
| "learning_rate": 5.863577890840116e-06, |
| "loss": 0.9509, |
| "num_tokens": 6457102027.0, |
| "step": 816 |
| }, |
| { |
| "epoch": 3.523526230394808, |
| "grad_norm": 0.290191146124253, |
| "learning_rate": 5.842960214505366e-06, |
| "loss": 0.942, |
| "num_tokens": 6465359395.0, |
| "step": 817 |
| }, |
| { |
| "epoch": 3.527852893455922, |
| "grad_norm": 0.36396926595892626, |
| "learning_rate": 5.8223827538027974e-06, |
| "loss": 0.9527, |
| "num_tokens": 6473562045.0, |
| "step": 818 |
| }, |
| { |
| "epoch": 3.532179556517036, |
| "grad_norm": 0.26519025564463417, |
| "learning_rate": 5.801845669199594e-06, |
| "loss": 0.9286, |
| "num_tokens": 6481741265.0, |
| "step": 819 |
| }, |
| { |
| "epoch": 3.5365062195781505, |
| "grad_norm": 0.3007276045464739, |
| "learning_rate": 5.781349120848057e-06, |
| "loss": 0.9206, |
| "num_tokens": 6489853496.0, |
| "step": 820 |
| }, |
| { |
| "epoch": 3.5408328826392643, |
| "grad_norm": 0.3152801349177596, |
| "learning_rate": 5.760893268584398e-06, |
| "loss": 0.9349, |
| "num_tokens": 6498038082.0, |
| "step": 821 |
| }, |
| { |
| "epoch": 3.5451595457003786, |
| "grad_norm": 0.31400051598257284, |
| "learning_rate": 5.740478271927452e-06, |
| "loss": 0.9645, |
| "num_tokens": 6506234847.0, |
| "step": 822 |
| }, |
| { |
| "epoch": 3.5494862087614925, |
| "grad_norm": 0.2831939350731783, |
| "learning_rate": 5.720104290077469e-06, |
| "loss": 0.9127, |
| "num_tokens": 6514345357.0, |
| "step": 823 |
| }, |
| { |
| "epoch": 3.5538128718226067, |
| "grad_norm": 0.31626659365486787, |
| "learning_rate": 5.6997714819148534e-06, |
| "loss": 0.9327, |
| "num_tokens": 6522489578.0, |
| "step": 824 |
| }, |
| { |
| "epoch": 3.558139534883721, |
| "grad_norm": 0.3035258072630229, |
| "learning_rate": 5.679480005998923e-06, |
| "loss": 0.9202, |
| "num_tokens": 6530750993.0, |
| "step": 825 |
| }, |
| { |
| "epoch": 3.5624661979448353, |
| "grad_norm": 0.3255604103790486, |
| "learning_rate": 5.659230020566689e-06, |
| "loss": 0.952, |
| "num_tokens": 6539008159.0, |
| "step": 826 |
| }, |
| { |
| "epoch": 3.566792861005949, |
| "grad_norm": 0.28719166832906956, |
| "learning_rate": 5.639021683531598e-06, |
| "loss": 0.9219, |
| "num_tokens": 6547267835.0, |
| "step": 827 |
| }, |
| { |
| "epoch": 3.571119524067063, |
| "grad_norm": 0.279274924202361, |
| "learning_rate": 5.618855152482334e-06, |
| "loss": 0.9245, |
| "num_tokens": 6555511093.0, |
| "step": 828 |
| }, |
| { |
| "epoch": 3.5754461871281773, |
| "grad_norm": 0.31504033599617814, |
| "learning_rate": 5.5987305846815425e-06, |
| "loss": 0.948, |
| "num_tokens": 6563644940.0, |
| "step": 829 |
| }, |
| { |
| "epoch": 3.5797728501892916, |
| "grad_norm": 0.2885737141200188, |
| "learning_rate": 5.578648137064655e-06, |
| "loss": 0.9383, |
| "num_tokens": 6571675207.0, |
| "step": 830 |
| }, |
| { |
| "epoch": 3.584099513250406, |
| "grad_norm": 0.287602171748228, |
| "learning_rate": 5.558607966238627e-06, |
| "loss": 0.9322, |
| "num_tokens": 6579800832.0, |
| "step": 831 |
| }, |
| { |
| "epoch": 3.5884261763115197, |
| "grad_norm": 0.323812546545481, |
| "learning_rate": 5.5386102284807395e-06, |
| "loss": 0.9254, |
| "num_tokens": 6587945088.0, |
| "step": 832 |
| }, |
| { |
| "epoch": 3.592752839372634, |
| "grad_norm": 0.28501015265860297, |
| "learning_rate": 5.518655079737371e-06, |
| "loss": 0.9275, |
| "num_tokens": 6596138413.0, |
| "step": 833 |
| }, |
| { |
| "epoch": 3.597079502433748, |
| "grad_norm": 0.2844443740370572, |
| "learning_rate": 5.498742675622777e-06, |
| "loss": 0.9795, |
| "num_tokens": 6604206973.0, |
| "step": 834 |
| }, |
| { |
| "epoch": 3.601406165494862, |
| "grad_norm": 0.2716290558748706, |
| "learning_rate": 5.478873171417884e-06, |
| "loss": 0.9244, |
| "num_tokens": 6612245627.0, |
| "step": 835 |
| }, |
| { |
| "epoch": 3.6057328285559764, |
| "grad_norm": 0.2660089227467234, |
| "learning_rate": 5.459046722069077e-06, |
| "loss": 0.943, |
| "num_tokens": 6620394090.0, |
| "step": 836 |
| }, |
| { |
| "epoch": 3.6100594916170903, |
| "grad_norm": 0.27963142700479227, |
| "learning_rate": 5.439263482186993e-06, |
| "loss": 0.9504, |
| "num_tokens": 6628667370.0, |
| "step": 837 |
| }, |
| { |
| "epoch": 3.6143861546782046, |
| "grad_norm": 0.30095431039169446, |
| "learning_rate": 5.419523606045307e-06, |
| "loss": 0.9128, |
| "num_tokens": 6636964851.0, |
| "step": 838 |
| }, |
| { |
| "epoch": 3.6187128177393184, |
| "grad_norm": 0.29117050215907364, |
| "learning_rate": 5.399827247579543e-06, |
| "loss": 0.9421, |
| "num_tokens": 6645115078.0, |
| "step": 839 |
| }, |
| { |
| "epoch": 3.6230394808004327, |
| "grad_norm": 0.27927887812944785, |
| "learning_rate": 5.3801745603858606e-06, |
| "loss": 0.9252, |
| "num_tokens": 6653355252.0, |
| "step": 840 |
| }, |
| { |
| "epoch": 3.627366143861547, |
| "grad_norm": 0.28681655737112005, |
| "learning_rate": 5.36056569771986e-06, |
| "loss": 0.9202, |
| "num_tokens": 6661518176.0, |
| "step": 841 |
| }, |
| { |
| "epoch": 3.631692806922661, |
| "grad_norm": 0.2994382383584901, |
| "learning_rate": 5.341000812495387e-06, |
| "loss": 0.9006, |
| "num_tokens": 6669758163.0, |
| "step": 842 |
| }, |
| { |
| "epoch": 3.636019469983775, |
| "grad_norm": 0.2859783006285974, |
| "learning_rate": 5.3214800572833535e-06, |
| "loss": 0.9562, |
| "num_tokens": 6678002025.0, |
| "step": 843 |
| }, |
| { |
| "epoch": 3.640346133044889, |
| "grad_norm": 0.3007253324040871, |
| "learning_rate": 5.302003584310531e-06, |
| "loss": 0.9235, |
| "num_tokens": 6686250664.0, |
| "step": 844 |
| }, |
| { |
| "epoch": 3.6446727961060033, |
| "grad_norm": 0.29129203810321896, |
| "learning_rate": 5.282571545458361e-06, |
| "loss": 0.9554, |
| "num_tokens": 6694483330.0, |
| "step": 845 |
| }, |
| { |
| "epoch": 3.6489994591671175, |
| "grad_norm": 0.29382448761915053, |
| "learning_rate": 5.263184092261793e-06, |
| "loss": 0.9242, |
| "num_tokens": 6702400177.0, |
| "step": 846 |
| }, |
| { |
| "epoch": 3.6533261222282314, |
| "grad_norm": 0.27678930676739016, |
| "learning_rate": 5.243841375908079e-06, |
| "loss": 0.9637, |
| "num_tokens": 6710452713.0, |
| "step": 847 |
| }, |
| { |
| "epoch": 3.6576527852893457, |
| "grad_norm": 0.30873496314841525, |
| "learning_rate": 5.2245435472356075e-06, |
| "loss": 0.9235, |
| "num_tokens": 6718658666.0, |
| "step": 848 |
| }, |
| { |
| "epoch": 3.6619794483504595, |
| "grad_norm": 0.2702325244264808, |
| "learning_rate": 5.205290756732717e-06, |
| "loss": 0.9308, |
| "num_tokens": 6726891839.0, |
| "step": 849 |
| }, |
| { |
| "epoch": 3.666306111411574, |
| "grad_norm": 0.3630867653307208, |
| "learning_rate": 5.186083154536545e-06, |
| "loss": 0.946, |
| "num_tokens": 6735027364.0, |
| "step": 850 |
| }, |
| { |
| "epoch": 3.670632774472688, |
| "grad_norm": 0.2969734274544958, |
| "learning_rate": 5.166920890431822e-06, |
| "loss": 0.917, |
| "num_tokens": 6743405824.0, |
| "step": 851 |
| }, |
| { |
| "epoch": 3.674959437533802, |
| "grad_norm": 0.3437347470803757, |
| "learning_rate": 5.147804113849739e-06, |
| "loss": 0.9249, |
| "num_tokens": 6751720174.0, |
| "step": 852 |
| }, |
| { |
| "epoch": 3.6792861005949162, |
| "grad_norm": 0.3299578442270291, |
| "learning_rate": 5.128732973866764e-06, |
| "loss": 0.9397, |
| "num_tokens": 6759962969.0, |
| "step": 853 |
| }, |
| { |
| "epoch": 3.68361276365603, |
| "grad_norm": 0.2881740077030876, |
| "learning_rate": 5.109707619203468e-06, |
| "loss": 0.9229, |
| "num_tokens": 6768214898.0, |
| "step": 854 |
| }, |
| { |
| "epoch": 3.6879394267171444, |
| "grad_norm": 0.31574582431617765, |
| "learning_rate": 5.090728198223393e-06, |
| "loss": 0.9161, |
| "num_tokens": 6776411515.0, |
| "step": 855 |
| }, |
| { |
| "epoch": 3.6922660897782587, |
| "grad_norm": 0.27411621043292045, |
| "learning_rate": 5.071794858931875e-06, |
| "loss": 0.9408, |
| "num_tokens": 6784748226.0, |
| "step": 856 |
| }, |
| { |
| "epoch": 3.6965927528393725, |
| "grad_norm": 0.30409723396559724, |
| "learning_rate": 5.052907748974902e-06, |
| "loss": 0.8973, |
| "num_tokens": 6792880288.0, |
| "step": 857 |
| }, |
| { |
| "epoch": 3.700919415900487, |
| "grad_norm": 0.26352582652848877, |
| "learning_rate": 5.034067015637945e-06, |
| "loss": 0.9587, |
| "num_tokens": 6801081860.0, |
| "step": 858 |
| }, |
| { |
| "epoch": 3.7052460789616006, |
| "grad_norm": 0.2948163224706812, |
| "learning_rate": 5.015272805844829e-06, |
| "loss": 0.959, |
| "num_tokens": 6809396324.0, |
| "step": 859 |
| }, |
| { |
| "epoch": 3.709572742022715, |
| "grad_norm": 0.26728114208333664, |
| "learning_rate": 4.996525266156582e-06, |
| "loss": 0.9194, |
| "num_tokens": 6817556880.0, |
| "step": 860 |
| }, |
| { |
| "epoch": 3.713899405083829, |
| "grad_norm": 0.24917090858865323, |
| "learning_rate": 4.977824542770279e-06, |
| "loss": 0.8967, |
| "num_tokens": 6825826880.0, |
| "step": 861 |
| }, |
| { |
| "epoch": 3.718226068144943, |
| "grad_norm": 0.26223408456264524, |
| "learning_rate": 4.959170781517917e-06, |
| "loss": 0.8977, |
| "num_tokens": 6834120960.0, |
| "step": 862 |
| }, |
| { |
| "epoch": 3.7225527312060573, |
| "grad_norm": 0.24834835176052955, |
| "learning_rate": 4.940564127865276e-06, |
| "loss": 0.9078, |
| "num_tokens": 6842342130.0, |
| "step": 863 |
| }, |
| { |
| "epoch": 3.726879394267171, |
| "grad_norm": 0.28897403903709534, |
| "learning_rate": 4.92200472691078e-06, |
| "loss": 0.9533, |
| "num_tokens": 6850501851.0, |
| "step": 864 |
| }, |
| { |
| "epoch": 3.7312060573282855, |
| "grad_norm": 0.24176033099925953, |
| "learning_rate": 4.903492723384366e-06, |
| "loss": 0.9427, |
| "num_tokens": 6858679110.0, |
| "step": 865 |
| }, |
| { |
| "epoch": 3.7355327203893998, |
| "grad_norm": 1.2200892765164042, |
| "learning_rate": 4.885028261646354e-06, |
| "loss": 0.9313, |
| "num_tokens": 6866722996.0, |
| "step": 866 |
| }, |
| { |
| "epoch": 3.739859383450514, |
| "grad_norm": 0.4354124277947738, |
| "learning_rate": 4.866611485686323e-06, |
| "loss": 0.9647, |
| "num_tokens": 6874881319.0, |
| "step": 867 |
| }, |
| { |
| "epoch": 3.744186046511628, |
| "grad_norm": 0.3097484640168433, |
| "learning_rate": 4.848242539121998e-06, |
| "loss": 0.9158, |
| "num_tokens": 6882948710.0, |
| "step": 868 |
| }, |
| { |
| "epoch": 3.748512709572742, |
| "grad_norm": 0.3408052167056257, |
| "learning_rate": 4.8299215651981095e-06, |
| "loss": 0.9496, |
| "num_tokens": 6891090626.0, |
| "step": 869 |
| }, |
| { |
| "epoch": 3.752839372633856, |
| "grad_norm": 0.35810397924998577, |
| "learning_rate": 4.8116487067852945e-06, |
| "loss": 0.901, |
| "num_tokens": 6899242627.0, |
| "step": 870 |
| }, |
| { |
| "epoch": 3.7571660356949703, |
| "grad_norm": 0.34291620033093606, |
| "learning_rate": 4.793424106378972e-06, |
| "loss": 0.9192, |
| "num_tokens": 6907463249.0, |
| "step": 871 |
| }, |
| { |
| "epoch": 3.7614926987560846, |
| "grad_norm": 0.33243753034162843, |
| "learning_rate": 4.77524790609824e-06, |
| "loss": 0.9461, |
| "num_tokens": 6915673495.0, |
| "step": 872 |
| }, |
| { |
| "epoch": 3.7658193618171985, |
| "grad_norm": 0.32596987523166643, |
| "learning_rate": 4.7571202476847575e-06, |
| "loss": 0.8984, |
| "num_tokens": 6923786563.0, |
| "step": 873 |
| }, |
| { |
| "epoch": 3.7701460248783127, |
| "grad_norm": 0.33239664166194055, |
| "learning_rate": 4.739041272501643e-06, |
| "loss": 0.9011, |
| "num_tokens": 6931994777.0, |
| "step": 874 |
| }, |
| { |
| "epoch": 3.7744726879394266, |
| "grad_norm": 0.3224636410025114, |
| "learning_rate": 4.721011121532384e-06, |
| "loss": 0.9437, |
| "num_tokens": 6940150249.0, |
| "step": 875 |
| }, |
| { |
| "epoch": 3.778799351000541, |
| "grad_norm": 0.29121879157929753, |
| "learning_rate": 4.703029935379711e-06, |
| "loss": 0.9497, |
| "num_tokens": 6948396232.0, |
| "step": 876 |
| }, |
| { |
| "epoch": 3.783126014061655, |
| "grad_norm": 0.2916266453802922, |
| "learning_rate": 4.685097854264535e-06, |
| "loss": 0.9295, |
| "num_tokens": 6956364532.0, |
| "step": 877 |
| }, |
| { |
| "epoch": 3.787452677122769, |
| "grad_norm": 0.28830831706624727, |
| "learning_rate": 4.66721501802482e-06, |
| "loss": 0.9037, |
| "num_tokens": 6964475231.0, |
| "step": 878 |
| }, |
| { |
| "epoch": 3.7917793401838833, |
| "grad_norm": 0.25385471758857564, |
| "learning_rate": 4.649381566114517e-06, |
| "loss": 0.9157, |
| "num_tokens": 6972702655.0, |
| "step": 879 |
| }, |
| { |
| "epoch": 3.796106003244997, |
| "grad_norm": 0.2783858674776144, |
| "learning_rate": 4.631597637602465e-06, |
| "loss": 0.8843, |
| "num_tokens": 6980708146.0, |
| "step": 880 |
| }, |
| { |
| "epoch": 3.8004326663061114, |
| "grad_norm": 0.25631955554373037, |
| "learning_rate": 4.613863371171314e-06, |
| "loss": 0.9488, |
| "num_tokens": 6988956749.0, |
| "step": 881 |
| }, |
| { |
| "epoch": 3.8047593293672257, |
| "grad_norm": 0.2370426912422331, |
| "learning_rate": 4.5961789051164325e-06, |
| "loss": 0.9294, |
| "num_tokens": 6997191073.0, |
| "step": 882 |
| }, |
| { |
| "epoch": 3.8090859924283396, |
| "grad_norm": 0.25552438858767745, |
| "learning_rate": 4.578544377344841e-06, |
| "loss": 0.9112, |
| "num_tokens": 7005392482.0, |
| "step": 883 |
| }, |
| { |
| "epoch": 3.813412655489454, |
| "grad_norm": 0.2399956009221735, |
| "learning_rate": 4.560959925374133e-06, |
| "loss": 0.9182, |
| "num_tokens": 7013476301.0, |
| "step": 884 |
| }, |
| { |
| "epoch": 3.8177393185505677, |
| "grad_norm": 0.26400729271667883, |
| "learning_rate": 4.543425686331394e-06, |
| "loss": 0.9093, |
| "num_tokens": 7021567003.0, |
| "step": 885 |
| }, |
| { |
| "epoch": 3.822065981611682, |
| "grad_norm": 0.25155595917456375, |
| "learning_rate": 4.525941796952142e-06, |
| "loss": 0.9759, |
| "num_tokens": 7029905460.0, |
| "step": 886 |
| }, |
| { |
| "epoch": 3.8263926446727963, |
| "grad_norm": 0.2748577842617266, |
| "learning_rate": 4.5085083935792566e-06, |
| "loss": 0.9306, |
| "num_tokens": 7038155146.0, |
| "step": 887 |
| }, |
| { |
| "epoch": 3.83071930773391, |
| "grad_norm": 0.23872591495204412, |
| "learning_rate": 4.491125612161924e-06, |
| "loss": 0.9194, |
| "num_tokens": 7046355670.0, |
| "step": 888 |
| }, |
| { |
| "epoch": 3.8350459707950244, |
| "grad_norm": 0.2746878100367909, |
| "learning_rate": 4.47379358825456e-06, |
| "loss": 0.9471, |
| "num_tokens": 7054615035.0, |
| "step": 889 |
| }, |
| { |
| "epoch": 3.8393726338561383, |
| "grad_norm": 0.257786053865835, |
| "learning_rate": 4.456512457015775e-06, |
| "loss": 0.9375, |
| "num_tokens": 7062921452.0, |
| "step": 890 |
| }, |
| { |
| "epoch": 3.8436992969172525, |
| "grad_norm": 0.2546439958259274, |
| "learning_rate": 4.4392823532072984e-06, |
| "loss": 0.9681, |
| "num_tokens": 7071058240.0, |
| "step": 891 |
| }, |
| { |
| "epoch": 3.848025959978367, |
| "grad_norm": 0.25787071531179456, |
| "learning_rate": 4.422103411192941e-06, |
| "loss": 0.9433, |
| "num_tokens": 7079255328.0, |
| "step": 892 |
| }, |
| { |
| "epoch": 3.8523526230394807, |
| "grad_norm": 0.26573100227653684, |
| "learning_rate": 4.404975764937541e-06, |
| "loss": 0.9419, |
| "num_tokens": 7087533637.0, |
| "step": 893 |
| }, |
| { |
| "epoch": 3.856679286100595, |
| "grad_norm": 0.26310774628285044, |
| "learning_rate": 4.387899548005927e-06, |
| "loss": 0.9418, |
| "num_tokens": 7095658511.0, |
| "step": 894 |
| }, |
| { |
| "epoch": 3.861005949161709, |
| "grad_norm": 0.282182857985642, |
| "learning_rate": 4.370874893561872e-06, |
| "loss": 0.9541, |
| "num_tokens": 7103829697.0, |
| "step": 895 |
| }, |
| { |
| "epoch": 3.865332612222823, |
| "grad_norm": 0.2619523701917039, |
| "learning_rate": 4.353901934367045e-06, |
| "loss": 0.926, |
| "num_tokens": 7111825762.0, |
| "step": 896 |
| }, |
| { |
| "epoch": 3.8696592752839374, |
| "grad_norm": 0.264850106481059, |
| "learning_rate": 4.336980802779998e-06, |
| "loss": 0.9393, |
| "num_tokens": 7119948260.0, |
| "step": 897 |
| }, |
| { |
| "epoch": 3.8739859383450512, |
| "grad_norm": 0.2704427422262429, |
| "learning_rate": 4.320111630755109e-06, |
| "loss": 0.9488, |
| "num_tokens": 7128064886.0, |
| "step": 898 |
| }, |
| { |
| "epoch": 3.8783126014061655, |
| "grad_norm": 0.24688426282829962, |
| "learning_rate": 4.303294549841573e-06, |
| "loss": 0.9644, |
| "num_tokens": 7136276452.0, |
| "step": 899 |
| }, |
| { |
| "epoch": 3.8826392644672794, |
| "grad_norm": 0.24921212539281543, |
| "learning_rate": 4.286529691182362e-06, |
| "loss": 0.9675, |
| "num_tokens": 7144530885.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 3.8869659275283936, |
| "grad_norm": 0.2491156891345615, |
| "learning_rate": 4.269817185513215e-06, |
| "loss": 0.9272, |
| "num_tokens": 7152678990.0, |
| "step": 901 |
| }, |
| { |
| "epoch": 3.891292590589508, |
| "grad_norm": 0.23412745486849854, |
| "learning_rate": 4.253157163161605e-06, |
| "loss": 0.8895, |
| "num_tokens": 7160980893.0, |
| "step": 902 |
| }, |
| { |
| "epoch": 3.8956192536506222, |
| "grad_norm": 0.2655564126452194, |
| "learning_rate": 4.236549754045737e-06, |
| "loss": 0.9073, |
| "num_tokens": 7169195089.0, |
| "step": 903 |
| }, |
| { |
| "epoch": 3.899945916711736, |
| "grad_norm": 0.2482515063514521, |
| "learning_rate": 4.2199950876735215e-06, |
| "loss": 0.9411, |
| "num_tokens": 7177386797.0, |
| "step": 904 |
| }, |
| { |
| "epoch": 3.90427257977285, |
| "grad_norm": 0.28405164933859445, |
| "learning_rate": 4.203493293141569e-06, |
| "loss": 0.9504, |
| "num_tokens": 7185698847.0, |
| "step": 905 |
| }, |
| { |
| "epoch": 3.908599242833964, |
| "grad_norm": 0.30931512212107404, |
| "learning_rate": 4.187044499134194e-06, |
| "loss": 0.9361, |
| "num_tokens": 7193881926.0, |
| "step": 906 |
| }, |
| { |
| "epoch": 3.9129259058950785, |
| "grad_norm": 0.2547147190501147, |
| "learning_rate": 4.170648833922391e-06, |
| "loss": 0.9185, |
| "num_tokens": 7202143917.0, |
| "step": 907 |
| }, |
| { |
| "epoch": 3.917252568956193, |
| "grad_norm": 0.2883629979901297, |
| "learning_rate": 4.154306425362856e-06, |
| "loss": 0.9031, |
| "num_tokens": 7210349690.0, |
| "step": 908 |
| }, |
| { |
| "epoch": 3.9215792320173066, |
| "grad_norm": 0.248809007780818, |
| "learning_rate": 4.1380174008969685e-06, |
| "loss": 0.9212, |
| "num_tokens": 7218471788.0, |
| "step": 909 |
| }, |
| { |
| "epoch": 3.925905895078421, |
| "grad_norm": 0.29335764844457096, |
| "learning_rate": 4.121781887549819e-06, |
| "loss": 0.9547, |
| "num_tokens": 7226737841.0, |
| "step": 910 |
| }, |
| { |
| "epoch": 3.9302325581395348, |
| "grad_norm": 0.23301711632951558, |
| "learning_rate": 4.1056000119292e-06, |
| "loss": 0.9091, |
| "num_tokens": 7234929843.0, |
| "step": 911 |
| }, |
| { |
| "epoch": 3.934559221200649, |
| "grad_norm": 0.2681121586933482, |
| "learning_rate": 4.089471900224625e-06, |
| "loss": 0.9547, |
| "num_tokens": 7243103469.0, |
| "step": 912 |
| }, |
| { |
| "epoch": 3.9388858842617633, |
| "grad_norm": 0.24013450588798013, |
| "learning_rate": 4.07339767820635e-06, |
| "loss": 0.9566, |
| "num_tokens": 7251346629.0, |
| "step": 913 |
| }, |
| { |
| "epoch": 3.943212547322877, |
| "grad_norm": 0.2774200992515359, |
| "learning_rate": 4.057377471224389e-06, |
| "loss": 0.9225, |
| "num_tokens": 7259431732.0, |
| "step": 914 |
| }, |
| { |
| "epoch": 3.9475392103839915, |
| "grad_norm": 0.2455641564191236, |
| "learning_rate": 4.041411404207534e-06, |
| "loss": 0.8953, |
| "num_tokens": 7267640813.0, |
| "step": 915 |
| }, |
| { |
| "epoch": 3.9518658734451053, |
| "grad_norm": 0.2966275021843376, |
| "learning_rate": 4.025499601662385e-06, |
| "loss": 0.922, |
| "num_tokens": 7275749532.0, |
| "step": 916 |
| }, |
| { |
| "epoch": 3.9561925365062196, |
| "grad_norm": 0.2640929917423771, |
| "learning_rate": 4.009642187672371e-06, |
| "loss": 0.9231, |
| "num_tokens": 7283963651.0, |
| "step": 917 |
| }, |
| { |
| "epoch": 3.960519199567334, |
| "grad_norm": 0.3246691150518756, |
| "learning_rate": 3.99383928589679e-06, |
| "loss": 0.901, |
| "num_tokens": 7292247142.0, |
| "step": 918 |
| }, |
| { |
| "epoch": 3.9648458626284477, |
| "grad_norm": 0.26463531126722656, |
| "learning_rate": 3.9780910195698505e-06, |
| "loss": 0.8923, |
| "num_tokens": 7300417116.0, |
| "step": 919 |
| }, |
| { |
| "epoch": 3.969172525689562, |
| "grad_norm": 0.35122691809047024, |
| "learning_rate": 3.9623975114996905e-06, |
| "loss": 0.932, |
| "num_tokens": 7308379545.0, |
| "step": 920 |
| }, |
| { |
| "epoch": 3.973499188750676, |
| "grad_norm": 0.2894341921143494, |
| "learning_rate": 3.946758884067443e-06, |
| "loss": 0.9398, |
| "num_tokens": 7316480967.0, |
| "step": 921 |
| }, |
| { |
| "epoch": 3.97782585181179, |
| "grad_norm": 0.30340316959991614, |
| "learning_rate": 3.9311752592262636e-06, |
| "loss": 0.9314, |
| "num_tokens": 7324483988.0, |
| "step": 922 |
| }, |
| { |
| "epoch": 3.9821525148729044, |
| "grad_norm": 0.2592529824891951, |
| "learning_rate": 3.915646758500391e-06, |
| "loss": 0.9429, |
| "num_tokens": 7332700887.0, |
| "step": 923 |
| }, |
| { |
| "epoch": 3.9864791779340183, |
| "grad_norm": 0.2684948193663238, |
| "learning_rate": 3.900173502984195e-06, |
| "loss": 0.9254, |
| "num_tokens": 7340983465.0, |
| "step": 924 |
| }, |
| { |
| "epoch": 3.9908058409951326, |
| "grad_norm": 0.24383973325622357, |
| "learning_rate": 3.884755613341223e-06, |
| "loss": 0.9226, |
| "num_tokens": 7349197836.0, |
| "step": 925 |
| }, |
| { |
| "epoch": 3.9951325040562464, |
| "grad_norm": 0.267690631400667, |
| "learning_rate": 3.8693932098032845e-06, |
| "loss": 0.9218, |
| "num_tokens": 7357431144.0, |
| "step": 926 |
| }, |
| { |
| "epoch": 3.9994591671173607, |
| "grad_norm": 0.2588911360407846, |
| "learning_rate": 3.854086412169482e-06, |
| "loss": 0.9086, |
| "num_tokens": 7365719207.0, |
| "step": 927 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 0.5005024065190141, |
| "learning_rate": 3.838835339805301e-06, |
| "loss": 0.9425, |
| "num_tokens": 7366713790.0, |
| "step": 928 |
| }, |
| { |
| "epoch": 4.004326663061114, |
| "grad_norm": 0.3225210487220433, |
| "learning_rate": 3.8236401116416686e-06, |
| "loss": 0.9273, |
| "num_tokens": 7375010232.0, |
| "step": 929 |
| }, |
| { |
| "epoch": 4.008653326122229, |
| "grad_norm": 0.2579915323141952, |
| "learning_rate": 3.8085008461740245e-06, |
| "loss": 0.9168, |
| "num_tokens": 7383228901.0, |
| "step": 930 |
| }, |
| { |
| "epoch": 4.012979989183342, |
| "grad_norm": 0.2831742045415681, |
| "learning_rate": 3.7934176614614004e-06, |
| "loss": 0.9198, |
| "num_tokens": 7391491934.0, |
| "step": 931 |
| }, |
| { |
| "epoch": 4.017306652244456, |
| "grad_norm": 0.2737796783844379, |
| "learning_rate": 3.778390675125503e-06, |
| "loss": 0.9441, |
| "num_tokens": 7399677041.0, |
| "step": 932 |
| }, |
| { |
| "epoch": 4.0216333153055706, |
| "grad_norm": 0.255136002812028, |
| "learning_rate": 3.7634200043497886e-06, |
| "loss": 0.9298, |
| "num_tokens": 7407919210.0, |
| "step": 933 |
| }, |
| { |
| "epoch": 4.025959978366685, |
| "grad_norm": 0.26259596411462577, |
| "learning_rate": 3.7485057658785564e-06, |
| "loss": 0.92, |
| "num_tokens": 7415945637.0, |
| "step": 934 |
| }, |
| { |
| "epoch": 4.030286641427799, |
| "grad_norm": 0.24948824793026556, |
| "learning_rate": 3.733648076016035e-06, |
| "loss": 0.911, |
| "num_tokens": 7423795891.0, |
| "step": 935 |
| }, |
| { |
| "epoch": 4.0346133044889125, |
| "grad_norm": 0.29219230996482654, |
| "learning_rate": 3.7188470506254746e-06, |
| "loss": 0.9126, |
| "num_tokens": 7431984437.0, |
| "step": 936 |
| }, |
| { |
| "epoch": 4.038939967550027, |
| "grad_norm": 0.26890911344982393, |
| "learning_rate": 3.704102805128242e-06, |
| "loss": 0.9187, |
| "num_tokens": 7440332143.0, |
| "step": 937 |
| }, |
| { |
| "epoch": 4.043266630611141, |
| "grad_norm": 0.2798260115969542, |
| "learning_rate": 3.6894154545029255e-06, |
| "loss": 0.9472, |
| "num_tokens": 7448558223.0, |
| "step": 938 |
| }, |
| { |
| "epoch": 4.047593293672255, |
| "grad_norm": 0.2849548143296411, |
| "learning_rate": 3.6747851132844392e-06, |
| "loss": 0.9024, |
| "num_tokens": 7456775767.0, |
| "step": 939 |
| }, |
| { |
| "epoch": 4.05191995673337, |
| "grad_norm": 0.2604631125958381, |
| "learning_rate": 3.660211895563117e-06, |
| "loss": 0.9086, |
| "num_tokens": 7464832642.0, |
| "step": 940 |
| }, |
| { |
| "epoch": 4.056246619794483, |
| "grad_norm": 0.27136813930111225, |
| "learning_rate": 3.6456959149838443e-06, |
| "loss": 0.8951, |
| "num_tokens": 7472913404.0, |
| "step": 941 |
| }, |
| { |
| "epoch": 4.060573282855597, |
| "grad_norm": 0.2321722818439474, |
| "learning_rate": 3.6312372847451503e-06, |
| "loss": 0.9225, |
| "num_tokens": 7481058731.0, |
| "step": 942 |
| }, |
| { |
| "epoch": 4.064899945916712, |
| "grad_norm": 0.24363607914962288, |
| "learning_rate": 3.61683611759834e-06, |
| "loss": 0.8979, |
| "num_tokens": 7489381773.0, |
| "step": 943 |
| }, |
| { |
| "epoch": 4.069226608977826, |
| "grad_norm": 0.2512840779101482, |
| "learning_rate": 3.6024925258466028e-06, |
| "loss": 0.9016, |
| "num_tokens": 7497474453.0, |
| "step": 944 |
| }, |
| { |
| "epoch": 4.07355327203894, |
| "grad_norm": 0.2650459708817925, |
| "learning_rate": 3.5882066213441537e-06, |
| "loss": 0.9292, |
| "num_tokens": 7505713748.0, |
| "step": 945 |
| }, |
| { |
| "epoch": 4.077879935100054, |
| "grad_norm": 0.2628128175408556, |
| "learning_rate": 3.573978515495345e-06, |
| "loss": 0.9476, |
| "num_tokens": 7513871362.0, |
| "step": 946 |
| }, |
| { |
| "epoch": 4.082206598161168, |
| "grad_norm": 0.25397010876410847, |
| "learning_rate": 3.559808319253801e-06, |
| "loss": 0.9151, |
| "num_tokens": 7522106850.0, |
| "step": 947 |
| }, |
| { |
| "epoch": 4.086533261222282, |
| "grad_norm": 0.263659923128617, |
| "learning_rate": 3.545696143121563e-06, |
| "loss": 0.9468, |
| "num_tokens": 7530473282.0, |
| "step": 948 |
| }, |
| { |
| "epoch": 4.0908599242833965, |
| "grad_norm": 0.23015134804471984, |
| "learning_rate": 3.5316420971482133e-06, |
| "loss": 0.9174, |
| "num_tokens": 7538547933.0, |
| "step": 949 |
| }, |
| { |
| "epoch": 4.095186587344511, |
| "grad_norm": 0.26513242851859053, |
| "learning_rate": 3.5176462909300257e-06, |
| "loss": 0.9153, |
| "num_tokens": 7546860011.0, |
| "step": 950 |
| }, |
| { |
| "epoch": 4.099513250405625, |
| "grad_norm": 0.24911847035989185, |
| "learning_rate": 3.5037088336091054e-06, |
| "loss": 0.9399, |
| "num_tokens": 7555038327.0, |
| "step": 951 |
| }, |
| { |
| "epoch": 4.1038399134667385, |
| "grad_norm": 0.2528255498389864, |
| "learning_rate": 3.48982983387255e-06, |
| "loss": 0.9401, |
| "num_tokens": 7563202397.0, |
| "step": 952 |
| }, |
| { |
| "epoch": 4.108166576527853, |
| "grad_norm": 0.2784976002129969, |
| "learning_rate": 3.476009399951582e-06, |
| "loss": 0.9086, |
| "num_tokens": 7571320131.0, |
| "step": 953 |
| }, |
| { |
| "epoch": 4.112493239588967, |
| "grad_norm": 0.23426691089213464, |
| "learning_rate": 3.4622476396207254e-06, |
| "loss": 0.9384, |
| "num_tokens": 7579479607.0, |
| "step": 954 |
| }, |
| { |
| "epoch": 4.116819902650081, |
| "grad_norm": 0.24357078878404217, |
| "learning_rate": 3.4485446601969507e-06, |
| "loss": 0.8955, |
| "num_tokens": 7587468144.0, |
| "step": 955 |
| }, |
| { |
| "epoch": 4.121146565711196, |
| "grad_norm": 0.22163907535402036, |
| "learning_rate": 3.4349005685388424e-06, |
| "loss": 0.8934, |
| "num_tokens": 7595579912.0, |
| "step": 956 |
| }, |
| { |
| "epoch": 4.125473228772309, |
| "grad_norm": 0.23264492656762667, |
| "learning_rate": 3.4213154710457708e-06, |
| "loss": 0.9391, |
| "num_tokens": 7603924326.0, |
| "step": 957 |
| }, |
| { |
| "epoch": 4.129799891833423, |
| "grad_norm": 0.2433223472459945, |
| "learning_rate": 3.4077894736570515e-06, |
| "loss": 0.8828, |
| "num_tokens": 7611982955.0, |
| "step": 958 |
| }, |
| { |
| "epoch": 4.134126554894538, |
| "grad_norm": 0.23069127266197514, |
| "learning_rate": 3.3943226818511333e-06, |
| "loss": 0.8969, |
| "num_tokens": 7620306000.0, |
| "step": 959 |
| }, |
| { |
| "epoch": 4.138453217955652, |
| "grad_norm": 0.22614376605829753, |
| "learning_rate": 3.3809152006447587e-06, |
| "loss": 0.9137, |
| "num_tokens": 7628430275.0, |
| "step": 960 |
| }, |
| { |
| "epoch": 4.142779881016766, |
| "grad_norm": 0.24947724722258544, |
| "learning_rate": 3.367567134592167e-06, |
| "loss": 0.8879, |
| "num_tokens": 7636687515.0, |
| "step": 961 |
| }, |
| { |
| "epoch": 4.14710654407788, |
| "grad_norm": 0.21810329623060942, |
| "learning_rate": 3.354278587784253e-06, |
| "loss": 0.9061, |
| "num_tokens": 7644885409.0, |
| "step": 962 |
| }, |
| { |
| "epoch": 4.151433207138994, |
| "grad_norm": 0.23579980625448524, |
| "learning_rate": 3.341049663847775e-06, |
| "loss": 0.9298, |
| "num_tokens": 7653226663.0, |
| "step": 963 |
| }, |
| { |
| "epoch": 4.155759870200108, |
| "grad_norm": 0.2210900639251755, |
| "learning_rate": 3.3278804659445384e-06, |
| "loss": 0.9015, |
| "num_tokens": 7661515588.0, |
| "step": 964 |
| }, |
| { |
| "epoch": 4.1600865332612225, |
| "grad_norm": 0.25256619991347495, |
| "learning_rate": 3.3147710967705948e-06, |
| "loss": 0.9, |
| "num_tokens": 7669557188.0, |
| "step": 965 |
| }, |
| { |
| "epoch": 4.164413196322337, |
| "grad_norm": 0.2344894675908079, |
| "learning_rate": 3.301721658555441e-06, |
| "loss": 0.8805, |
| "num_tokens": 7677831073.0, |
| "step": 966 |
| }, |
| { |
| "epoch": 4.16873985938345, |
| "grad_norm": 0.252979642536075, |
| "learning_rate": 3.288732253061214e-06, |
| "loss": 0.9336, |
| "num_tokens": 7686006093.0, |
| "step": 967 |
| }, |
| { |
| "epoch": 4.173066522444564, |
| "grad_norm": 0.24604867580703016, |
| "learning_rate": 3.2758029815819105e-06, |
| "loss": 0.9023, |
| "num_tokens": 7694213953.0, |
| "step": 968 |
| }, |
| { |
| "epoch": 4.177393185505679, |
| "grad_norm": 0.2228523611045584, |
| "learning_rate": 3.2629339449425813e-06, |
| "loss": 0.9303, |
| "num_tokens": 7702384350.0, |
| "step": 969 |
| }, |
| { |
| "epoch": 4.181719848566793, |
| "grad_norm": 0.22509827369260485, |
| "learning_rate": 3.2501252434985642e-06, |
| "loss": 0.9264, |
| "num_tokens": 7710606640.0, |
| "step": 970 |
| }, |
| { |
| "epoch": 4.186046511627907, |
| "grad_norm": 0.2667456008002394, |
| "learning_rate": 3.237376977134683e-06, |
| "loss": 0.9378, |
| "num_tokens": 7718777881.0, |
| "step": 971 |
| }, |
| { |
| "epoch": 4.190373174689021, |
| "grad_norm": 0.22092392495232688, |
| "learning_rate": 3.2246892452644827e-06, |
| "loss": 0.9203, |
| "num_tokens": 7726989645.0, |
| "step": 972 |
| }, |
| { |
| "epoch": 4.194699837750135, |
| "grad_norm": 0.23367876249626124, |
| "learning_rate": 3.212062146829442e-06, |
| "loss": 0.8967, |
| "num_tokens": 7735155541.0, |
| "step": 973 |
| }, |
| { |
| "epoch": 4.199026500811249, |
| "grad_norm": 0.24919500016307755, |
| "learning_rate": 3.1994957802982153e-06, |
| "loss": 0.9472, |
| "num_tokens": 7743378650.0, |
| "step": 974 |
| }, |
| { |
| "epoch": 4.203353163872364, |
| "grad_norm": 0.2362455094117848, |
| "learning_rate": 3.1869902436658484e-06, |
| "loss": 0.9017, |
| "num_tokens": 7751497549.0, |
| "step": 975 |
| }, |
| { |
| "epoch": 4.207679826933478, |
| "grad_norm": 0.2344310291886097, |
| "learning_rate": 3.1745456344530296e-06, |
| "loss": 0.8933, |
| "num_tokens": 7759791744.0, |
| "step": 976 |
| }, |
| { |
| "epoch": 4.212006489994591, |
| "grad_norm": 0.25066147270392997, |
| "learning_rate": 3.1621620497053225e-06, |
| "loss": 0.9159, |
| "num_tokens": 7767921426.0, |
| "step": 977 |
| }, |
| { |
| "epoch": 4.2163331530557056, |
| "grad_norm": 0.23902995246327793, |
| "learning_rate": 3.149839585992407e-06, |
| "loss": 0.9059, |
| "num_tokens": 7776086986.0, |
| "step": 978 |
| }, |
| { |
| "epoch": 4.22065981611682, |
| "grad_norm": 0.25258997276156464, |
| "learning_rate": 3.1375783394073323e-06, |
| "loss": 0.9102, |
| "num_tokens": 7784151941.0, |
| "step": 979 |
| }, |
| { |
| "epoch": 4.224986479177934, |
| "grad_norm": 0.2643483528443113, |
| "learning_rate": 3.125378405565762e-06, |
| "loss": 0.9229, |
| "num_tokens": 7792420160.0, |
| "step": 980 |
| }, |
| { |
| "epoch": 4.229313142239048, |
| "grad_norm": 0.24031178404950676, |
| "learning_rate": 3.11323987960523e-06, |
| "loss": 0.9035, |
| "num_tokens": 7800617397.0, |
| "step": 981 |
| }, |
| { |
| "epoch": 4.233639805300163, |
| "grad_norm": 0.23933163316740128, |
| "learning_rate": 3.1011628561843982e-06, |
| "loss": 0.8876, |
| "num_tokens": 7808759242.0, |
| "step": 982 |
| }, |
| { |
| "epoch": 4.237966468361276, |
| "grad_norm": 0.25169308565149046, |
| "learning_rate": 3.0891474294823253e-06, |
| "loss": 0.9071, |
| "num_tokens": 7816911001.0, |
| "step": 983 |
| }, |
| { |
| "epoch": 4.24229313142239, |
| "grad_norm": 0.237314725332469, |
| "learning_rate": 3.0771936931977185e-06, |
| "loss": 0.8977, |
| "num_tokens": 7825012529.0, |
| "step": 984 |
| }, |
| { |
| "epoch": 4.246619794483505, |
| "grad_norm": 0.2173955562906754, |
| "learning_rate": 3.065301740548219e-06, |
| "loss": 0.9154, |
| "num_tokens": 7833313011.0, |
| "step": 985 |
| }, |
| { |
| "epoch": 4.250946457544619, |
| "grad_norm": 0.22898129847081045, |
| "learning_rate": 3.053471664269658e-06, |
| "loss": 0.8967, |
| "num_tokens": 7841562459.0, |
| "step": 986 |
| }, |
| { |
| "epoch": 4.255273120605732, |
| "grad_norm": 0.24221863163289162, |
| "learning_rate": 3.0417035566153498e-06, |
| "loss": 0.9092, |
| "num_tokens": 7849697964.0, |
| "step": 987 |
| }, |
| { |
| "epoch": 4.259599783666847, |
| "grad_norm": 0.2462244118669248, |
| "learning_rate": 3.029997509355361e-06, |
| "loss": 0.9092, |
| "num_tokens": 7857816562.0, |
| "step": 988 |
| }, |
| { |
| "epoch": 4.263926446727961, |
| "grad_norm": 0.23906292297331386, |
| "learning_rate": 3.018353613775798e-06, |
| "loss": 0.9227, |
| "num_tokens": 7866172435.0, |
| "step": 989 |
| }, |
| { |
| "epoch": 4.268253109789075, |
| "grad_norm": 0.26238496146659623, |
| "learning_rate": 3.0067719606781e-06, |
| "loss": 0.8959, |
| "num_tokens": 7874266917.0, |
| "step": 990 |
| }, |
| { |
| "epoch": 4.2725797728501895, |
| "grad_norm": 0.2406443914678014, |
| "learning_rate": 2.9952526403783227e-06, |
| "loss": 0.9318, |
| "num_tokens": 7882511531.0, |
| "step": 991 |
| }, |
| { |
| "epoch": 4.276906435911304, |
| "grad_norm": 0.2453279920036015, |
| "learning_rate": 2.9837957427064413e-06, |
| "loss": 0.9088, |
| "num_tokens": 7890688249.0, |
| "step": 992 |
| }, |
| { |
| "epoch": 4.281233098972417, |
| "grad_norm": 0.2513178827661643, |
| "learning_rate": 2.9724013570056436e-06, |
| "loss": 0.9286, |
| "num_tokens": 7898728597.0, |
| "step": 993 |
| }, |
| { |
| "epoch": 4.2855597620335315, |
| "grad_norm": 0.22432182690828315, |
| "learning_rate": 2.9610695721316396e-06, |
| "loss": 0.8988, |
| "num_tokens": 7906976032.0, |
| "step": 994 |
| }, |
| { |
| "epoch": 4.289886425094646, |
| "grad_norm": 0.2524437800844289, |
| "learning_rate": 2.949800476451963e-06, |
| "loss": 0.8918, |
| "num_tokens": 7915251932.0, |
| "step": 995 |
| }, |
| { |
| "epoch": 4.29421308815576, |
| "grad_norm": 0.23484108974910797, |
| "learning_rate": 2.9385941578452842e-06, |
| "loss": 0.9183, |
| "num_tokens": 7923323128.0, |
| "step": 996 |
| }, |
| { |
| "epoch": 4.298539751216874, |
| "grad_norm": 0.25577170783084596, |
| "learning_rate": 2.9274507037007312e-06, |
| "loss": 0.9555, |
| "num_tokens": 7931421185.0, |
| "step": 997 |
| }, |
| { |
| "epoch": 4.302866414277988, |
| "grad_norm": 0.2760603667425678, |
| "learning_rate": 2.9163702009171936e-06, |
| "loss": 0.9093, |
| "num_tokens": 7939532377.0, |
| "step": 998 |
| }, |
| { |
| "epoch": 4.307193077339102, |
| "grad_norm": 0.22875893145403134, |
| "learning_rate": 2.9053527359026605e-06, |
| "loss": 0.8907, |
| "num_tokens": 7947766698.0, |
| "step": 999 |
| }, |
| { |
| "epoch": 4.311519740400216, |
| "grad_norm": 0.2583150353816036, |
| "learning_rate": 2.8943983945735375e-06, |
| "loss": 0.9276, |
| "num_tokens": 7955887724.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 4.315846403461331, |
| "grad_norm": 0.24320219179355962, |
| "learning_rate": 2.883507262353977e-06, |
| "loss": 0.9101, |
| "num_tokens": 7963982677.0, |
| "step": 1001 |
| }, |
| { |
| "epoch": 4.320173066522445, |
| "grad_norm": 0.2833332765864319, |
| "learning_rate": 2.8726794241752163e-06, |
| "loss": 0.9044, |
| "num_tokens": 7972210481.0, |
| "step": 1002 |
| }, |
| { |
| "epoch": 4.324499729583558, |
| "grad_norm": 0.24214965298560892, |
| "learning_rate": 2.861914964474913e-06, |
| "loss": 0.9446, |
| "num_tokens": 7980398111.0, |
| "step": 1003 |
| }, |
| { |
| "epoch": 4.328826392644673, |
| "grad_norm": 0.24570245569395735, |
| "learning_rate": 2.8512139671964844e-06, |
| "loss": 0.9193, |
| "num_tokens": 7988615985.0, |
| "step": 1004 |
| }, |
| { |
| "epoch": 4.333153055705787, |
| "grad_norm": 0.2677223593998129, |
| "learning_rate": 2.8405765157884615e-06, |
| "loss": 0.9208, |
| "num_tokens": 7996771846.0, |
| "step": 1005 |
| }, |
| { |
| "epoch": 4.337479718766901, |
| "grad_norm": 0.22595326917303651, |
| "learning_rate": 2.830002693203823e-06, |
| "loss": 0.9052, |
| "num_tokens": 8004877666.0, |
| "step": 1006 |
| }, |
| { |
| "epoch": 4.3418063818280155, |
| "grad_norm": 0.2776838569441126, |
| "learning_rate": 2.8194925818993617e-06, |
| "loss": 0.8786, |
| "num_tokens": 8012805147.0, |
| "step": 1007 |
| }, |
| { |
| "epoch": 4.346133044889129, |
| "grad_norm": 0.23838814276876832, |
| "learning_rate": 2.8090462638350397e-06, |
| "loss": 0.9403, |
| "num_tokens": 8020848497.0, |
| "step": 1008 |
| }, |
| { |
| "epoch": 4.350459707950243, |
| "grad_norm": 0.23575983788350954, |
| "learning_rate": 2.7986638204733407e-06, |
| "loss": 0.9112, |
| "num_tokens": 8029064640.0, |
| "step": 1009 |
| }, |
| { |
| "epoch": 4.3547863710113575, |
| "grad_norm": 0.2451032311621356, |
| "learning_rate": 2.788345332778646e-06, |
| "loss": 0.9097, |
| "num_tokens": 8037282567.0, |
| "step": 1010 |
| }, |
| { |
| "epoch": 4.359113034072472, |
| "grad_norm": 1.1599064195215374, |
| "learning_rate": 2.778090881216592e-06, |
| "loss": 0.9209, |
| "num_tokens": 8045431558.0, |
| "step": 1011 |
| }, |
| { |
| "epoch": 4.363439697133586, |
| "grad_norm": 0.30873535879406216, |
| "learning_rate": 2.7679005457534557e-06, |
| "loss": 0.9147, |
| "num_tokens": 8053635644.0, |
| "step": 1012 |
| }, |
| { |
| "epoch": 4.367766360194699, |
| "grad_norm": 0.24702430360632943, |
| "learning_rate": 2.757774405855519e-06, |
| "loss": 0.8908, |
| "num_tokens": 8061854534.0, |
| "step": 1013 |
| }, |
| { |
| "epoch": 4.372093023255814, |
| "grad_norm": 0.2710714812381768, |
| "learning_rate": 2.747712540488454e-06, |
| "loss": 0.8975, |
| "num_tokens": 8070195917.0, |
| "step": 1014 |
| }, |
| { |
| "epoch": 4.376419686316928, |
| "grad_norm": 0.26762222390109924, |
| "learning_rate": 2.737715028116707e-06, |
| "loss": 0.9346, |
| "num_tokens": 8078364915.0, |
| "step": 1015 |
| }, |
| { |
| "epoch": 4.380746349378042, |
| "grad_norm": 0.24113215046758327, |
| "learning_rate": 2.727781946702891e-06, |
| "loss": 0.9001, |
| "num_tokens": 8086275582.0, |
| "step": 1016 |
| }, |
| { |
| "epoch": 4.385073012439157, |
| "grad_norm": 0.29603018439057366, |
| "learning_rate": 2.717913373707167e-06, |
| "loss": 0.9007, |
| "num_tokens": 8094238589.0, |
| "step": 1017 |
| }, |
| { |
| "epoch": 4.38939967550027, |
| "grad_norm": 0.2440085316018602, |
| "learning_rate": 2.708109386086653e-06, |
| "loss": 0.8845, |
| "num_tokens": 8102346728.0, |
| "step": 1018 |
| }, |
| { |
| "epoch": 4.393726338561384, |
| "grad_norm": 0.26675724694934916, |
| "learning_rate": 2.6983700602948116e-06, |
| "loss": 0.9249, |
| "num_tokens": 8110578253.0, |
| "step": 1019 |
| }, |
| { |
| "epoch": 4.398053001622499, |
| "grad_norm": 0.2681087433723698, |
| "learning_rate": 2.68869547228086e-06, |
| "loss": 0.8965, |
| "num_tokens": 8118504559.0, |
| "step": 1020 |
| }, |
| { |
| "epoch": 4.402379664683613, |
| "grad_norm": 0.21927043163651067, |
| "learning_rate": 2.679085697489183e-06, |
| "loss": 0.8941, |
| "num_tokens": 8126743920.0, |
| "step": 1021 |
| }, |
| { |
| "epoch": 4.406706327744727, |
| "grad_norm": 0.27323929746962095, |
| "learning_rate": 2.6695408108587314e-06, |
| "loss": 0.9277, |
| "num_tokens": 8135026189.0, |
| "step": 1022 |
| }, |
| { |
| "epoch": 4.411032990805841, |
| "grad_norm": 0.22857677582074945, |
| "learning_rate": 2.6600608868224516e-06, |
| "loss": 0.9125, |
| "num_tokens": 8143175762.0, |
| "step": 1023 |
| }, |
| { |
| "epoch": 4.415359653866955, |
| "grad_norm": 0.25364165355402224, |
| "learning_rate": 2.6506459993066918e-06, |
| "loss": 0.9513, |
| "num_tokens": 8151388760.0, |
| "step": 1024 |
| }, |
| { |
| "epoch": 4.419686316928069, |
| "grad_norm": 0.22639641169387734, |
| "learning_rate": 2.6412962217306415e-06, |
| "loss": 0.9668, |
| "num_tokens": 8159573555.0, |
| "step": 1025 |
| }, |
| { |
| "epoch": 4.424012979989183, |
| "grad_norm": 0.22430407380048098, |
| "learning_rate": 2.632011627005738e-06, |
| "loss": 0.8929, |
| "num_tokens": 8167719180.0, |
| "step": 1026 |
| }, |
| { |
| "epoch": 4.428339643050298, |
| "grad_norm": 0.23948485782246384, |
| "learning_rate": 2.6227922875351196e-06, |
| "loss": 0.8916, |
| "num_tokens": 8175831208.0, |
| "step": 1027 |
| }, |
| { |
| "epoch": 4.432666306111411, |
| "grad_norm": 0.24643284137803, |
| "learning_rate": 2.6136382752130486e-06, |
| "loss": 0.938, |
| "num_tokens": 8184024547.0, |
| "step": 1028 |
| }, |
| { |
| "epoch": 4.436992969172525, |
| "grad_norm": 0.24582088544418418, |
| "learning_rate": 2.6045496614243485e-06, |
| "loss": 0.8963, |
| "num_tokens": 8192200295.0, |
| "step": 1029 |
| }, |
| { |
| "epoch": 4.44131963223364, |
| "grad_norm": 0.2440944566577538, |
| "learning_rate": 2.5955265170438593e-06, |
| "loss": 0.8883, |
| "num_tokens": 8200316579.0, |
| "step": 1030 |
| }, |
| { |
| "epoch": 4.445646295294754, |
| "grad_norm": 0.24904469869515644, |
| "learning_rate": 2.5865689124358707e-06, |
| "loss": 0.9436, |
| "num_tokens": 8208558171.0, |
| "step": 1031 |
| }, |
| { |
| "epoch": 4.449972958355868, |
| "grad_norm": 0.24911259576594413, |
| "learning_rate": 2.5776769174535835e-06, |
| "loss": 0.9147, |
| "num_tokens": 8216669746.0, |
| "step": 1032 |
| }, |
| { |
| "epoch": 4.4542996214169825, |
| "grad_norm": 0.2234007333497611, |
| "learning_rate": 2.56885060143856e-06, |
| "loss": 0.9154, |
| "num_tokens": 8224995031.0, |
| "step": 1033 |
| }, |
| { |
| "epoch": 4.458626284478096, |
| "grad_norm": 0.24266587214310398, |
| "learning_rate": 2.560090033220187e-06, |
| "loss": 0.8957, |
| "num_tokens": 8233114355.0, |
| "step": 1034 |
| }, |
| { |
| "epoch": 4.46295294753921, |
| "grad_norm": 0.24147325182416088, |
| "learning_rate": 2.5513952811151338e-06, |
| "loss": 0.9117, |
| "num_tokens": 8241441116.0, |
| "step": 1035 |
| }, |
| { |
| "epoch": 4.4672796106003245, |
| "grad_norm": 0.2370461402183908, |
| "learning_rate": 2.5427664129268253e-06, |
| "loss": 0.8981, |
| "num_tokens": 8249541761.0, |
| "step": 1036 |
| }, |
| { |
| "epoch": 4.471606273661439, |
| "grad_norm": 0.23708579704850133, |
| "learning_rate": 2.5342034959449075e-06, |
| "loss": 0.9417, |
| "num_tokens": 8257714724.0, |
| "step": 1037 |
| }, |
| { |
| "epoch": 4.475932936722553, |
| "grad_norm": 0.23455593822165277, |
| "learning_rate": 2.5257065969447297e-06, |
| "loss": 0.9377, |
| "num_tokens": 8265816974.0, |
| "step": 1038 |
| }, |
| { |
| "epoch": 4.4802595997836665, |
| "grad_norm": 0.21487013012395628, |
| "learning_rate": 2.5172757821868144e-06, |
| "loss": 0.9327, |
| "num_tokens": 8273974273.0, |
| "step": 1039 |
| }, |
| { |
| "epoch": 4.484586262844781, |
| "grad_norm": 0.2254553909706869, |
| "learning_rate": 2.5089111174163483e-06, |
| "loss": 0.934, |
| "num_tokens": 8282132554.0, |
| "step": 1040 |
| }, |
| { |
| "epoch": 4.488912925905895, |
| "grad_norm": 0.2131771890264051, |
| "learning_rate": 2.5006126678626714e-06, |
| "loss": 0.8974, |
| "num_tokens": 8290234332.0, |
| "step": 1041 |
| }, |
| { |
| "epoch": 4.493239588967009, |
| "grad_norm": 0.22236682158923896, |
| "learning_rate": 2.492380498238756e-06, |
| "loss": 0.927, |
| "num_tokens": 8298413944.0, |
| "step": 1042 |
| }, |
| { |
| "epoch": 4.497566252028124, |
| "grad_norm": 0.2121975495704575, |
| "learning_rate": 2.4842146727407173e-06, |
| "loss": 0.9284, |
| "num_tokens": 8306583885.0, |
| "step": 1043 |
| }, |
| { |
| "epoch": 4.501892915089237, |
| "grad_norm": 0.2169856691232569, |
| "learning_rate": 2.4761152550473024e-06, |
| "loss": 0.9212, |
| "num_tokens": 8314795632.0, |
| "step": 1044 |
| }, |
| { |
| "epoch": 4.506219578150351, |
| "grad_norm": 0.22019170298331212, |
| "learning_rate": 2.468082308319397e-06, |
| "loss": 0.906, |
| "num_tokens": 8323018451.0, |
| "step": 1045 |
| }, |
| { |
| "epoch": 4.510546241211466, |
| "grad_norm": 0.2198386279437345, |
| "learning_rate": 2.4601158951995314e-06, |
| "loss": 0.9064, |
| "num_tokens": 8330954402.0, |
| "step": 1046 |
| }, |
| { |
| "epoch": 4.51487290427258, |
| "grad_norm": 0.21923668460774162, |
| "learning_rate": 2.4522160778113993e-06, |
| "loss": 0.9015, |
| "num_tokens": 8339229368.0, |
| "step": 1047 |
| }, |
| { |
| "epoch": 4.519199567333694, |
| "grad_norm": 0.21892201525761043, |
| "learning_rate": 2.44438291775936e-06, |
| "loss": 0.9273, |
| "num_tokens": 8347465643.0, |
| "step": 1048 |
| }, |
| { |
| "epoch": 4.523526230394808, |
| "grad_norm": 0.21551779191440174, |
| "learning_rate": 2.4366164761279707e-06, |
| "loss": 0.9187, |
| "num_tokens": 8355626064.0, |
| "step": 1049 |
| }, |
| { |
| "epoch": 4.527852893455922, |
| "grad_norm": 0.22282930039690238, |
| "learning_rate": 2.4289168134815065e-06, |
| "loss": 0.9576, |
| "num_tokens": 8363857534.0, |
| "step": 1050 |
| }, |
| { |
| "epoch": 4.532179556517036, |
| "grad_norm": 0.20982941027778246, |
| "learning_rate": 2.421283989863478e-06, |
| "loss": 0.8772, |
| "num_tokens": 8372105463.0, |
| "step": 1051 |
| }, |
| { |
| "epoch": 4.5365062195781505, |
| "grad_norm": 0.22455423628788954, |
| "learning_rate": 2.41371806479618e-06, |
| "loss": 0.9402, |
| "num_tokens": 8380368203.0, |
| "step": 1052 |
| }, |
| { |
| "epoch": 4.540832882639265, |
| "grad_norm": 0.23091228772734626, |
| "learning_rate": 2.406219097280214e-06, |
| "loss": 0.9479, |
| "num_tokens": 8388521209.0, |
| "step": 1053 |
| }, |
| { |
| "epoch": 4.545159545700379, |
| "grad_norm": 0.2398615660642649, |
| "learning_rate": 2.398787145794037e-06, |
| "loss": 0.9416, |
| "num_tokens": 8396627451.0, |
| "step": 1054 |
| }, |
| { |
| "epoch": 4.5494862087614925, |
| "grad_norm": 0.23939806557885646, |
| "learning_rate": 2.3914222682934986e-06, |
| "loss": 0.9161, |
| "num_tokens": 8404837650.0, |
| "step": 1055 |
| }, |
| { |
| "epoch": 4.553812871822607, |
| "grad_norm": 0.24446417367897916, |
| "learning_rate": 2.3841245222113953e-06, |
| "loss": 0.9306, |
| "num_tokens": 8412812351.0, |
| "step": 1056 |
| }, |
| { |
| "epoch": 4.558139534883721, |
| "grad_norm": 0.22572000773778622, |
| "learning_rate": 2.3768939644570143e-06, |
| "loss": 0.8796, |
| "num_tokens": 8420971902.0, |
| "step": 1057 |
| }, |
| { |
| "epoch": 4.562466197944835, |
| "grad_norm": 0.24474536426880072, |
| "learning_rate": 2.3697306514156978e-06, |
| "loss": 0.9244, |
| "num_tokens": 8429225388.0, |
| "step": 1058 |
| }, |
| { |
| "epoch": 4.566792861005949, |
| "grad_norm": 0.21928892468156305, |
| "learning_rate": 2.3626346389484005e-06, |
| "loss": 0.9096, |
| "num_tokens": 8437382172.0, |
| "step": 1059 |
| }, |
| { |
| "epoch": 4.571119524067063, |
| "grad_norm": 0.22842290654280803, |
| "learning_rate": 2.3556059823912524e-06, |
| "loss": 0.8858, |
| "num_tokens": 8445391658.0, |
| "step": 1060 |
| }, |
| { |
| "epoch": 4.575446187128177, |
| "grad_norm": 0.23251949213565723, |
| "learning_rate": 2.34864473655513e-06, |
| "loss": 0.9307, |
| "num_tokens": 8453520637.0, |
| "step": 1061 |
| }, |
| { |
| "epoch": 4.579772850189292, |
| "grad_norm": 0.21342174591976593, |
| "learning_rate": 2.341750955725227e-06, |
| "loss": 0.9321, |
| "num_tokens": 8461655842.0, |
| "step": 1062 |
| }, |
| { |
| "epoch": 4.584099513250406, |
| "grad_norm": 0.2542248578765479, |
| "learning_rate": 2.334924693660631e-06, |
| "loss": 0.9073, |
| "num_tokens": 8469896692.0, |
| "step": 1063 |
| }, |
| { |
| "epoch": 4.58842617631152, |
| "grad_norm": 0.24425754732071317, |
| "learning_rate": 2.328166003593904e-06, |
| "loss": 0.9106, |
| "num_tokens": 8478037823.0, |
| "step": 1064 |
| }, |
| { |
| "epoch": 4.592752839372634, |
| "grad_norm": 0.23248234618828792, |
| "learning_rate": 2.3214749382306696e-06, |
| "loss": 0.9215, |
| "num_tokens": 8486264180.0, |
| "step": 1065 |
| }, |
| { |
| "epoch": 4.597079502433748, |
| "grad_norm": 0.23757606431595135, |
| "learning_rate": 2.3148515497491976e-06, |
| "loss": 0.9219, |
| "num_tokens": 8494182868.0, |
| "step": 1066 |
| }, |
| { |
| "epoch": 4.601406165494862, |
| "grad_norm": 0.2561573985410888, |
| "learning_rate": 2.308295889800004e-06, |
| "loss": 0.9215, |
| "num_tokens": 8502445035.0, |
| "step": 1067 |
| }, |
| { |
| "epoch": 4.605732828555976, |
| "grad_norm": 0.23162663966204003, |
| "learning_rate": 2.3018080095054418e-06, |
| "loss": 0.9111, |
| "num_tokens": 8510724601.0, |
| "step": 1068 |
| }, |
| { |
| "epoch": 4.61005949161709, |
| "grad_norm": 0.26717575358601775, |
| "learning_rate": 2.2953879594593046e-06, |
| "loss": 0.9489, |
| "num_tokens": 8518808391.0, |
| "step": 1069 |
| }, |
| { |
| "epoch": 4.614386154678204, |
| "grad_norm": 0.2529372514931457, |
| "learning_rate": 2.2890357897264325e-06, |
| "loss": 0.928, |
| "num_tokens": 8526913901.0, |
| "step": 1070 |
| }, |
| { |
| "epoch": 4.618712817739318, |
| "grad_norm": 0.21355116829146173, |
| "learning_rate": 2.2827515498423204e-06, |
| "loss": 0.9649, |
| "num_tokens": 8534983780.0, |
| "step": 1071 |
| }, |
| { |
| "epoch": 4.623039480800433, |
| "grad_norm": 0.2456451804025508, |
| "learning_rate": 2.276535288812734e-06, |
| "loss": 0.9306, |
| "num_tokens": 8543253072.0, |
| "step": 1072 |
| }, |
| { |
| "epoch": 4.627366143861547, |
| "grad_norm": 0.230007736093212, |
| "learning_rate": 2.2703870551133246e-06, |
| "loss": 0.9239, |
| "num_tokens": 8551528178.0, |
| "step": 1073 |
| }, |
| { |
| "epoch": 4.631692806922661, |
| "grad_norm": 0.22045377494054158, |
| "learning_rate": 2.264306896689255e-06, |
| "loss": 0.9196, |
| "num_tokens": 8559817119.0, |
| "step": 1074 |
| }, |
| { |
| "epoch": 4.636019469983775, |
| "grad_norm": 0.2556158783264956, |
| "learning_rate": 2.2582948609548205e-06, |
| "loss": 0.9377, |
| "num_tokens": 8567832779.0, |
| "step": 1075 |
| }, |
| { |
| "epoch": 4.640346133044889, |
| "grad_norm": 0.2648793592168965, |
| "learning_rate": 2.2523509947930847e-06, |
| "loss": 0.9121, |
| "num_tokens": 8576038992.0, |
| "step": 1076 |
| }, |
| { |
| "epoch": 4.644672796106003, |
| "grad_norm": 0.2425175080351857, |
| "learning_rate": 2.2464753445555083e-06, |
| "loss": 0.9136, |
| "num_tokens": 8584231271.0, |
| "step": 1077 |
| }, |
| { |
| "epoch": 4.6489994591671175, |
| "grad_norm": 0.24879039753969864, |
| "learning_rate": 2.2406679560615948e-06, |
| "loss": 0.9131, |
| "num_tokens": 8592443058.0, |
| "step": 1078 |
| }, |
| { |
| "epoch": 4.653326122228232, |
| "grad_norm": 0.22681872861351485, |
| "learning_rate": 2.2349288745985235e-06, |
| "loss": 0.8941, |
| "num_tokens": 8600701259.0, |
| "step": 1079 |
| }, |
| { |
| "epoch": 4.657652785289345, |
| "grad_norm": 0.2513200301630142, |
| "learning_rate": 2.229258144920805e-06, |
| "loss": 0.8976, |
| "num_tokens": 8609010693.0, |
| "step": 1080 |
| }, |
| { |
| "epoch": 4.6619794483504595, |
| "grad_norm": 0.22219846006857522, |
| "learning_rate": 2.223655811249931e-06, |
| "loss": 0.952, |
| "num_tokens": 8617103908.0, |
| "step": 1081 |
| }, |
| { |
| "epoch": 4.666306111411574, |
| "grad_norm": 0.22978741078825024, |
| "learning_rate": 2.218121917274023e-06, |
| "loss": 0.9114, |
| "num_tokens": 8625308325.0, |
| "step": 1082 |
| }, |
| { |
| "epoch": 4.670632774472688, |
| "grad_norm": 0.23640956812538738, |
| "learning_rate": 2.2126565061474972e-06, |
| "loss": 0.9322, |
| "num_tokens": 8633613797.0, |
| "step": 1083 |
| }, |
| { |
| "epoch": 4.674959437533802, |
| "grad_norm": 0.23087974818902274, |
| "learning_rate": 2.207259620490727e-06, |
| "loss": 0.8983, |
| "num_tokens": 8641914517.0, |
| "step": 1084 |
| }, |
| { |
| "epoch": 4.679286100594916, |
| "grad_norm": 0.32959187451424254, |
| "learning_rate": 2.2019313023897142e-06, |
| "loss": 0.8854, |
| "num_tokens": 8649959913.0, |
| "step": 1085 |
| }, |
| { |
| "epoch": 4.68361276365603, |
| "grad_norm": 0.24230388409008827, |
| "learning_rate": 2.1966715933957493e-06, |
| "loss": 0.9461, |
| "num_tokens": 8657909497.0, |
| "step": 1086 |
| }, |
| { |
| "epoch": 4.687939426717144, |
| "grad_norm": 0.22642078906501323, |
| "learning_rate": 2.191480534525103e-06, |
| "loss": 0.886, |
| "num_tokens": 8666053678.0, |
| "step": 1087 |
| }, |
| { |
| "epoch": 4.692266089778259, |
| "grad_norm": 0.22694813074036918, |
| "learning_rate": 2.1863581662586945e-06, |
| "loss": 0.9011, |
| "num_tokens": 8674210962.0, |
| "step": 1088 |
| }, |
| { |
| "epoch": 4.696592752839373, |
| "grad_norm": 0.2298824888061183, |
| "learning_rate": 2.1813045285417785e-06, |
| "loss": 0.9267, |
| "num_tokens": 8682409942.0, |
| "step": 1089 |
| }, |
| { |
| "epoch": 4.700919415900486, |
| "grad_norm": 0.23919688355935076, |
| "learning_rate": 2.1763196607836393e-06, |
| "loss": 0.9098, |
| "num_tokens": 8690511024.0, |
| "step": 1090 |
| }, |
| { |
| "epoch": 4.705246078961601, |
| "grad_norm": 0.21956784819647854, |
| "learning_rate": 2.1714036018572764e-06, |
| "loss": 0.9384, |
| "num_tokens": 8698559389.0, |
| "step": 1091 |
| }, |
| { |
| "epoch": 4.709572742022715, |
| "grad_norm": 0.24610989261964578, |
| "learning_rate": 2.1665563900991043e-06, |
| "loss": 0.9225, |
| "num_tokens": 8706586074.0, |
| "step": 1092 |
| }, |
| { |
| "epoch": 4.713899405083829, |
| "grad_norm": 0.21977533775137006, |
| "learning_rate": 2.1617780633086545e-06, |
| "loss": 0.9376, |
| "num_tokens": 8714666218.0, |
| "step": 1093 |
| }, |
| { |
| "epoch": 4.7182260681449435, |
| "grad_norm": 0.213849406897767, |
| "learning_rate": 2.1570686587482796e-06, |
| "loss": 0.9224, |
| "num_tokens": 8722796038.0, |
| "step": 1094 |
| }, |
| { |
| "epoch": 4.722552731206058, |
| "grad_norm": 0.22286159150427315, |
| "learning_rate": 2.1524282131428615e-06, |
| "loss": 0.939, |
| "num_tokens": 8730944398.0, |
| "step": 1095 |
| }, |
| { |
| "epoch": 4.726879394267171, |
| "grad_norm": 0.21529351177056805, |
| "learning_rate": 2.147856762679528e-06, |
| "loss": 0.938, |
| "num_tokens": 8739279574.0, |
| "step": 1096 |
| }, |
| { |
| "epoch": 4.7312060573282855, |
| "grad_norm": 0.20965774958349334, |
| "learning_rate": 2.143354343007367e-06, |
| "loss": 0.9062, |
| "num_tokens": 8747329404.0, |
| "step": 1097 |
| }, |
| { |
| "epoch": 4.7355327203894, |
| "grad_norm": 0.20617803221216705, |
| "learning_rate": 2.1389209892371525e-06, |
| "loss": 0.923, |
| "num_tokens": 8755606713.0, |
| "step": 1098 |
| }, |
| { |
| "epoch": 4.739859383450514, |
| "grad_norm": 0.21016031766306656, |
| "learning_rate": 2.1345567359410665e-06, |
| "loss": 0.8719, |
| "num_tokens": 8763860984.0, |
| "step": 1099 |
| }, |
| { |
| "epoch": 4.7441860465116275, |
| "grad_norm": 0.20384128159589965, |
| "learning_rate": 2.1302616171524356e-06, |
| "loss": 0.9181, |
| "num_tokens": 8772171202.0, |
| "step": 1100 |
| }, |
| { |
| "epoch": 4.748512709572742, |
| "grad_norm": 0.20893323187166496, |
| "learning_rate": 2.1260356663654562e-06, |
| "loss": 0.9057, |
| "num_tokens": 8780360379.0, |
| "step": 1101 |
| }, |
| { |
| "epoch": 4.752839372633856, |
| "grad_norm": 0.20612438954780368, |
| "learning_rate": 2.1218789165349425e-06, |
| "loss": 0.9152, |
| "num_tokens": 8788673205.0, |
| "step": 1102 |
| }, |
| { |
| "epoch": 4.75716603569497, |
| "grad_norm": 0.2125809442142549, |
| "learning_rate": 2.117791400076065e-06, |
| "loss": 0.9329, |
| "num_tokens": 8796892652.0, |
| "step": 1103 |
| }, |
| { |
| "epoch": 4.761492698756085, |
| "grad_norm": 0.20743329950388847, |
| "learning_rate": 2.113773148864097e-06, |
| "loss": 0.9149, |
| "num_tokens": 8805045905.0, |
| "step": 1104 |
| }, |
| { |
| "epoch": 4.765819361817199, |
| "grad_norm": 0.20929881513400472, |
| "learning_rate": 2.1098241942341703e-06, |
| "loss": 0.905, |
| "num_tokens": 8813300823.0, |
| "step": 1105 |
| }, |
| { |
| "epoch": 4.770146024878312, |
| "grad_norm": 0.21793262700048058, |
| "learning_rate": 2.105944566981025e-06, |
| "loss": 0.919, |
| "num_tokens": 8821469096.0, |
| "step": 1106 |
| }, |
| { |
| "epoch": 4.774472687939427, |
| "grad_norm": 0.21402361258647987, |
| "learning_rate": 2.1021342973587747e-06, |
| "loss": 0.9063, |
| "num_tokens": 8829574419.0, |
| "step": 1107 |
| }, |
| { |
| "epoch": 4.778799351000541, |
| "grad_norm": 0.20268959562105737, |
| "learning_rate": 2.098393415080667e-06, |
| "loss": 0.9449, |
| "num_tokens": 8837909937.0, |
| "step": 1108 |
| }, |
| { |
| "epoch": 4.783126014061655, |
| "grad_norm": 0.21873853332943569, |
| "learning_rate": 2.0947219493188515e-06, |
| "loss": 0.8878, |
| "num_tokens": 8845922366.0, |
| "step": 1109 |
| }, |
| { |
| "epoch": 4.7874526771227695, |
| "grad_norm": 0.22427043627143187, |
| "learning_rate": 2.0911199287041585e-06, |
| "loss": 0.9175, |
| "num_tokens": 8854195314.0, |
| "step": 1110 |
| }, |
| { |
| "epoch": 4.791779340183883, |
| "grad_norm": 0.22242942729652762, |
| "learning_rate": 2.087587381325867e-06, |
| "loss": 0.9302, |
| "num_tokens": 8862237356.0, |
| "step": 1111 |
| }, |
| { |
| "epoch": 4.796106003244997, |
| "grad_norm": 0.22691459369076583, |
| "learning_rate": 2.0841243347314926e-06, |
| "loss": 0.9207, |
| "num_tokens": 8870498074.0, |
| "step": 1112 |
| }, |
| { |
| "epoch": 4.800432666306111, |
| "grad_norm": 0.21029802017820462, |
| "learning_rate": 2.080730815926566e-06, |
| "loss": 0.908, |
| "num_tokens": 8878725976.0, |
| "step": 1113 |
| }, |
| { |
| "epoch": 4.804759329367226, |
| "grad_norm": 0.2238375831744874, |
| "learning_rate": 2.0774068513744294e-06, |
| "loss": 0.9154, |
| "num_tokens": 8886863134.0, |
| "step": 1114 |
| }, |
| { |
| "epoch": 4.80908599242834, |
| "grad_norm": 0.23640277975937646, |
| "learning_rate": 2.0741524669960258e-06, |
| "loss": 0.9058, |
| "num_tokens": 8894933619.0, |
| "step": 1115 |
| }, |
| { |
| "epoch": 4.813412655489453, |
| "grad_norm": 0.20220296661913104, |
| "learning_rate": 2.0709676881697004e-06, |
| "loss": 0.8861, |
| "num_tokens": 8903264869.0, |
| "step": 1116 |
| }, |
| { |
| "epoch": 4.817739318550568, |
| "grad_norm": 0.23727722007354193, |
| "learning_rate": 2.0678525397309945e-06, |
| "loss": 0.9271, |
| "num_tokens": 8911421588.0, |
| "step": 1117 |
| }, |
| { |
| "epoch": 4.822065981611682, |
| "grad_norm": 0.23235343963895627, |
| "learning_rate": 2.0648070459724656e-06, |
| "loss": 0.924, |
| "num_tokens": 8919591119.0, |
| "step": 1118 |
| }, |
| { |
| "epoch": 4.826392644672796, |
| "grad_norm": 0.2199249951981579, |
| "learning_rate": 2.061831230643482e-06, |
| "loss": 0.9049, |
| "num_tokens": 8927950607.0, |
| "step": 1119 |
| }, |
| { |
| "epoch": 4.830719307733911, |
| "grad_norm": 0.2194764604982523, |
| "learning_rate": 2.0589251169500524e-06, |
| "loss": 0.921, |
| "num_tokens": 8936155001.0, |
| "step": 1120 |
| }, |
| { |
| "epoch": 4.835045970795024, |
| "grad_norm": 0.23153319677802373, |
| "learning_rate": 2.056088727554633e-06, |
| "loss": 0.8995, |
| "num_tokens": 8944442434.0, |
| "step": 1121 |
| }, |
| { |
| "epoch": 4.839372633856138, |
| "grad_norm": 0.22249165496560577, |
| "learning_rate": 2.0533220845759586e-06, |
| "loss": 0.8921, |
| "num_tokens": 8952767111.0, |
| "step": 1122 |
| }, |
| { |
| "epoch": 4.8436992969172525, |
| "grad_norm": 0.19670623434078507, |
| "learning_rate": 2.0506252095888685e-06, |
| "loss": 0.8599, |
| "num_tokens": 8961063920.0, |
| "step": 1123 |
| }, |
| { |
| "epoch": 4.848025959978367, |
| "grad_norm": 0.20818885095654868, |
| "learning_rate": 2.0479981236241335e-06, |
| "loss": 0.9202, |
| "num_tokens": 8969286529.0, |
| "step": 1124 |
| }, |
| { |
| "epoch": 4.852352623039481, |
| "grad_norm": 0.2512842270408484, |
| "learning_rate": 2.0454408471682986e-06, |
| "loss": 0.9138, |
| "num_tokens": 8977525391.0, |
| "step": 1125 |
| }, |
| { |
| "epoch": 4.856679286100595, |
| "grad_norm": 0.20124864644138032, |
| "learning_rate": 2.0429534001635194e-06, |
| "loss": 0.9072, |
| "num_tokens": 8985751690.0, |
| "step": 1126 |
| }, |
| { |
| "epoch": 4.861005949161709, |
| "grad_norm": 0.22703475290609854, |
| "learning_rate": 2.0405358020074076e-06, |
| "loss": 0.9098, |
| "num_tokens": 8993976990.0, |
| "step": 1127 |
| }, |
| { |
| "epoch": 4.865332612222823, |
| "grad_norm": 0.23749997007783907, |
| "learning_rate": 2.0381880715528786e-06, |
| "loss": 0.8907, |
| "num_tokens": 9002137261.0, |
| "step": 1128 |
| }, |
| { |
| "epoch": 4.869659275283937, |
| "grad_norm": 0.21976926195972712, |
| "learning_rate": 2.0359102271080062e-06, |
| "loss": 0.9438, |
| "num_tokens": 9010389288.0, |
| "step": 1129 |
| }, |
| { |
| "epoch": 4.873985938345052, |
| "grad_norm": 0.24792727403538053, |
| "learning_rate": 2.0337022864358786e-06, |
| "loss": 0.906, |
| "num_tokens": 9018658763.0, |
| "step": 1130 |
| }, |
| { |
| "epoch": 4.878312601406165, |
| "grad_norm": 0.2614860295951354, |
| "learning_rate": 2.031564266754461e-06, |
| "loss": 0.914, |
| "num_tokens": 9026699766.0, |
| "step": 1131 |
| }, |
| { |
| "epoch": 4.882639264467279, |
| "grad_norm": 0.2450275316189634, |
| "learning_rate": 2.0294961847364616e-06, |
| "loss": 0.9317, |
| "num_tokens": 9034918076.0, |
| "step": 1132 |
| }, |
| { |
| "epoch": 4.886965927528394, |
| "grad_norm": 0.24200253390740495, |
| "learning_rate": 2.0274980565091975e-06, |
| "loss": 0.8878, |
| "num_tokens": 9043115999.0, |
| "step": 1133 |
| }, |
| { |
| "epoch": 4.891292590589508, |
| "grad_norm": 0.2320164712307313, |
| "learning_rate": 2.025569897654475e-06, |
| "loss": 0.9276, |
| "num_tokens": 9051203798.0, |
| "step": 1134 |
| }, |
| { |
| "epoch": 4.895619253650622, |
| "grad_norm": 0.243473369606954, |
| "learning_rate": 2.0237117232084633e-06, |
| "loss": 0.9349, |
| "num_tokens": 9059325975.0, |
| "step": 1135 |
| }, |
| { |
| "epoch": 4.8999459167117365, |
| "grad_norm": 0.23928329522353953, |
| "learning_rate": 2.0219235476615828e-06, |
| "loss": 0.9132, |
| "num_tokens": 9067540449.0, |
| "step": 1136 |
| }, |
| { |
| "epoch": 4.90427257977285, |
| "grad_norm": 0.22430652521916916, |
| "learning_rate": 2.0202053849583807e-06, |
| "loss": 0.9032, |
| "num_tokens": 9075738214.0, |
| "step": 1137 |
| }, |
| { |
| "epoch": 4.908599242833964, |
| "grad_norm": 0.2096685655932549, |
| "learning_rate": 2.0185572484974404e-06, |
| "loss": 0.9093, |
| "num_tokens": 9083846921.0, |
| "step": 1138 |
| }, |
| { |
| "epoch": 4.9129259058950785, |
| "grad_norm": 0.21341345052548796, |
| "learning_rate": 2.0169791511312564e-06, |
| "loss": 0.9164, |
| "num_tokens": 9091982051.0, |
| "step": 1139 |
| }, |
| { |
| "epoch": 4.917252568956193, |
| "grad_norm": 0.24338222425544925, |
| "learning_rate": 2.0154711051661524e-06, |
| "loss": 0.9296, |
| "num_tokens": 9100243162.0, |
| "step": 1140 |
| }, |
| { |
| "epoch": 4.921579232017306, |
| "grad_norm": 0.22192657024672127, |
| "learning_rate": 2.014033122362171e-06, |
| "loss": 0.8941, |
| "num_tokens": 9108550126.0, |
| "step": 1141 |
| }, |
| { |
| "epoch": 4.9259058950784205, |
| "grad_norm": 0.22250281726737248, |
| "learning_rate": 2.0126652139329934e-06, |
| "loss": 0.9376, |
| "num_tokens": 9116730576.0, |
| "step": 1142 |
| }, |
| { |
| "epoch": 4.930232558139535, |
| "grad_norm": 0.21835212746251859, |
| "learning_rate": 2.0113673905458433e-06, |
| "loss": 0.9288, |
| "num_tokens": 9125114079.0, |
| "step": 1143 |
| }, |
| { |
| "epoch": 4.934559221200649, |
| "grad_norm": 0.2359765746718839, |
| "learning_rate": 2.0101396623214068e-06, |
| "loss": 0.937, |
| "num_tokens": 9133266569.0, |
| "step": 1144 |
| }, |
| { |
| "epoch": 4.938885884261763, |
| "grad_norm": 0.22227350570305626, |
| "learning_rate": 2.008982038833758e-06, |
| "loss": 0.9288, |
| "num_tokens": 9141497993.0, |
| "step": 1145 |
| }, |
| { |
| "epoch": 4.943212547322878, |
| "grad_norm": 0.2312417625381218, |
| "learning_rate": 2.0078945291102746e-06, |
| "loss": 0.8898, |
| "num_tokens": 9149640610.0, |
| "step": 1146 |
| }, |
| { |
| "epoch": 4.947539210383991, |
| "grad_norm": 0.22458979646702876, |
| "learning_rate": 2.0068771416315785e-06, |
| "loss": 0.9012, |
| "num_tokens": 9157856842.0, |
| "step": 1147 |
| }, |
| { |
| "epoch": 4.951865873445105, |
| "grad_norm": 0.22778028587745122, |
| "learning_rate": 2.0059298843314594e-06, |
| "loss": 0.9213, |
| "num_tokens": 9166201687.0, |
| "step": 1148 |
| }, |
| { |
| "epoch": 4.95619253650622, |
| "grad_norm": 0.22936925584883466, |
| "learning_rate": 2.005052764596822e-06, |
| "loss": 0.945, |
| "num_tokens": 9174460788.0, |
| "step": 1149 |
| }, |
| { |
| "epoch": 4.960519199567334, |
| "grad_norm": 0.21176895797925974, |
| "learning_rate": 2.0042457892676203e-06, |
| "loss": 0.905, |
| "num_tokens": 9182715086.0, |
| "step": 1150 |
| }, |
| { |
| "epoch": 4.964845862628448, |
| "grad_norm": 0.2267907836662072, |
| "learning_rate": 2.003508964636811e-06, |
| "loss": 0.9195, |
| "num_tokens": 9190927902.0, |
| "step": 1151 |
| }, |
| { |
| "epoch": 4.969172525689562, |
| "grad_norm": 0.20904877816094458, |
| "learning_rate": 2.0028422964503007e-06, |
| "loss": 0.9175, |
| "num_tokens": 9199252989.0, |
| "step": 1152 |
| }, |
| { |
| "epoch": 4.973499188750676, |
| "grad_norm": 0.21745305262154316, |
| "learning_rate": 2.002245789906901e-06, |
| "loss": 0.9088, |
| "num_tokens": 9207627705.0, |
| "step": 1153 |
| }, |
| { |
| "epoch": 4.97782585181179, |
| "grad_norm": 0.21658469544242928, |
| "learning_rate": 2.0017194496582903e-06, |
| "loss": 0.9116, |
| "num_tokens": 9215752545.0, |
| "step": 1154 |
| }, |
| { |
| "epoch": 4.9821525148729044, |
| "grad_norm": 0.228091048385714, |
| "learning_rate": 2.001263279808977e-06, |
| "loss": 0.9311, |
| "num_tokens": 9223826485.0, |
| "step": 1155 |
| }, |
| { |
| "epoch": 4.986479177934019, |
| "grad_norm": 0.22068068132470603, |
| "learning_rate": 2.0008772839162623e-06, |
| "loss": 0.9244, |
| "num_tokens": 9232130653.0, |
| "step": 1156 |
| }, |
| { |
| "epoch": 4.990805840995132, |
| "grad_norm": 0.21601781584175028, |
| "learning_rate": 2.000561464990222e-06, |
| "loss": 0.9161, |
| "num_tokens": 9240121021.0, |
| "step": 1157 |
| }, |
| { |
| "epoch": 4.995132504056246, |
| "grad_norm": 0.21668791628808873, |
| "learning_rate": 2.0003158254936748e-06, |
| "loss": 0.8818, |
| "num_tokens": 9248287497.0, |
| "step": 1158 |
| }, |
| { |
| "epoch": 4.999459167117361, |
| "grad_norm": 0.2256692786070936, |
| "learning_rate": 2.000140367342166e-06, |
| "loss": 0.9174, |
| "num_tokens": 9256321099.0, |
| "step": 1159 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 0.5189925985904169, |
| "learning_rate": 2.000035091903955e-06, |
| "loss": 0.8958, |
| "num_tokens": 9257323549.0, |
| "step": 1160 |
| }, |
| { |
| "epoch": 5.0, |
| "step": 1160, |
| "total_flos": 3832398801043456.0, |
| "train_loss": 0.9466197261522556, |
| "train_runtime": 88419.3231, |
| "train_samples_per_second": 1.673, |
| "train_steps_per_second": 0.013 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 1160, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 24, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3832398801043456.0, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
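
The state above closes the run at step 1160 (epoch 5.0, `train_loss` 0.9466). As a quick sanity check on a log like this, here is a minimal sketch, assuming the file is saved under its standard Hugging Face Trainer name `trainer_state.json` (the path is an assumption), that loads `log_history` and summarizes the loss and learning-rate trajectory using only the standard library:

```python
# Minimal sketch (path "trainer_state.json" is an assumption): parse the
# Trainer state dumped above and summarize its per-step loss curve.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only per-step entries that carry a "loss"; the final summary entry
# reports train_loss/train_runtime instead and is excluded by this filter.
logs = [e for e in state["log_history"] if "loss" in e]

steps = [e["step"] for e in logs]
losses = [e["loss"] for e in logs]
lrs = [e["learning_rate"] for e in logs]

print(f"steps logged       : {len(logs)} (max_steps={state['max_steps']})")
print(f"first/last loss    : {losses[0]:.4f} -> {losses[-1]:.4f}")
print(f"final learning rate: {lrs[-1]:.3e}")
```

Under those assumptions, the script should report 1160 logged steps, a final per-step loss of 0.8958, and a final learning rate of about 2.0e-06, consistent with the closing entries above.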