| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.016333938294010888, | |
| "eval_steps": 100, | |
| "global_step": 300, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 5.4446460980036295e-05, | |
| "grad_norm": 2.606283664703369, | |
| "learning_rate": 2.9999999999999997e-05, | |
| "loss": 3.3597, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.00010889292196007259, | |
| "grad_norm": 2.6963095664978027, | |
| "learning_rate": 5.9999999999999995e-05, | |
| "loss": 3.3809, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.0001633393829401089, | |
| "grad_norm": 3.531561851501465, | |
| "learning_rate": 8.999999999999999e-05, | |
| "loss": 3.353, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.00021778584392014518, | |
| "grad_norm": 2.844101667404175, | |
| "learning_rate": 0.00011999999999999999, | |
| "loss": 3.3358, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.0002722323049001815, | |
| "grad_norm": 2.73467755317688, | |
| "learning_rate": 0.00015, | |
| "loss": 3.2899, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.0003266787658802178, | |
| "grad_norm": 2.266099452972412, | |
| "learning_rate": 0.00017999999999999998, | |
| "loss": 3.2718, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.00038112522686025406, | |
| "grad_norm": 1.782454490661621, | |
| "learning_rate": 0.00020999999999999998, | |
| "loss": 3.25, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.00043557168784029036, | |
| "grad_norm": 1.6878714561462402, | |
| "learning_rate": 0.00023999999999999998, | |
| "loss": 3.2508, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.0004900181488203267, | |
| "grad_norm": 1.656855821609497, | |
| "learning_rate": 0.00027, | |
| "loss": 3.2482, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.000544464609800363, | |
| "grad_norm": 1.196555256843567, | |
| "learning_rate": 0.0003, | |
| "loss": 3.2249, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0005989110707803993, | |
| "grad_norm": 0.7616098523139954, | |
| "learning_rate": 0.00033, | |
| "loss": 3.2238, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.0006533575317604356, | |
| "grad_norm": 0.8151504993438721, | |
| "learning_rate": 0.00035999999999999997, | |
| "loss": 3.1822, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.0007078039927404719, | |
| "grad_norm": 0.7040852308273315, | |
| "learning_rate": 0.00039, | |
| "loss": 3.1587, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.0007622504537205081, | |
| "grad_norm": 0.5113953351974487, | |
| "learning_rate": 0.00041999999999999996, | |
| "loss": 3.2203, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.0008166969147005445, | |
| "grad_norm": 0.6095477342605591, | |
| "learning_rate": 0.00045, | |
| "loss": 3.1912, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.0008711433756805807, | |
| "grad_norm": 0.7497212290763855, | |
| "learning_rate": 0.00047999999999999996, | |
| "loss": 3.1985, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.0009255898366606171, | |
| "grad_norm": 0.9816092848777771, | |
| "learning_rate": 0.0005099999999999999, | |
| "loss": 3.1859, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.0009800362976406533, | |
| "grad_norm": 0.9469059109687805, | |
| "learning_rate": 0.00054, | |
| "loss": 3.2167, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.0010344827586206897, | |
| "grad_norm": 0.6933241486549377, | |
| "learning_rate": 0.00057, | |
| "loss": 3.1399, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.001088929219600726, | |
| "grad_norm": 0.9282199144363403, | |
| "learning_rate": 0.0006, | |
| "loss": 3.1684, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0011433756805807622, | |
| "grad_norm": 0.48486050963401794, | |
| "learning_rate": 0.000599990558513205, | |
| "loss": 3.2108, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.0011978221415607985, | |
| "grad_norm": 0.5165353417396545, | |
| "learning_rate": 0.000599962235241376, | |
| "loss": 3.2203, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.001252268602540835, | |
| "grad_norm": 0.40520766377449036, | |
| "learning_rate": 0.0005999150337500299, | |
| "loss": 3.1709, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.0013067150635208712, | |
| "grad_norm": 0.4057173430919647, | |
| "learning_rate": 0.0005998489599811971, | |
| "loss": 3.1699, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.0013611615245009074, | |
| "grad_norm": 0.3643300533294678, | |
| "learning_rate": 0.0005997640222526724, | |
| "loss": 3.1722, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.0014156079854809437, | |
| "grad_norm": 0.4470251798629761, | |
| "learning_rate": 0.0005996602312569683, | |
| "loss": 3.1321, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.00147005444646098, | |
| "grad_norm": 0.3419328033924103, | |
| "learning_rate": 0.0005995376000599692, | |
| "loss": 3.1753, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.0015245009074410162, | |
| "grad_norm": 0.4055408537387848, | |
| "learning_rate": 0.0005993961440992859, | |
| "loss": 3.1604, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.0015789473684210526, | |
| "grad_norm": 0.3212841749191284, | |
| "learning_rate": 0.0005992358811823128, | |
| "loss": 3.1666, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.001633393829401089, | |
| "grad_norm": 0.30549293756484985, | |
| "learning_rate": 0.0005990568314839863, | |
| "loss": 3.1292, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.0016878402903811253, | |
| "grad_norm": 0.3610576391220093, | |
| "learning_rate": 0.0005988590175442446, | |
| "loss": 3.1435, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.0017422867513611614, | |
| "grad_norm": 0.2855391502380371, | |
| "learning_rate": 0.0005986424642651901, | |
| "loss": 3.1432, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.0017967332123411978, | |
| "grad_norm": 0.30511122941970825, | |
| "learning_rate": 0.0005984071989079554, | |
| "loss": 3.1463, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.0018511796733212342, | |
| "grad_norm": 0.29414448142051697, | |
| "learning_rate": 0.0005981532510892706, | |
| "loss": 3.1336, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.0019056261343012705, | |
| "grad_norm": 0.3027634620666504, | |
| "learning_rate": 0.0005978806527777354, | |
| "loss": 3.1277, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.0019600725952813067, | |
| "grad_norm": 0.3479376435279846, | |
| "learning_rate": 0.0005975894382897944, | |
| "loss": 3.1298, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.002014519056261343, | |
| "grad_norm": 0.31864309310913086, | |
| "learning_rate": 0.0005972796442854177, | |
| "loss": 3.1501, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.0020689655172413794, | |
| "grad_norm": 0.33572497963905334, | |
| "learning_rate": 0.0005969513097634852, | |
| "loss": 3.1505, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.0021234119782214157, | |
| "grad_norm": 0.23914705216884613, | |
| "learning_rate": 0.0005966044760568779, | |
| "loss": 3.1518, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.002177858439201452, | |
| "grad_norm": 0.34076428413391113, | |
| "learning_rate": 0.0005962391868272735, | |
| "loss": 3.1378, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.002232304900181488, | |
| "grad_norm": 0.25502562522888184, | |
| "learning_rate": 0.0005958554880596515, | |
| "loss": 3.1532, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.0022867513611615244, | |
| "grad_norm": 0.2586739659309387, | |
| "learning_rate": 0.000595453428056503, | |
| "loss": 3.1261, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.0023411978221415607, | |
| "grad_norm": 0.3028804659843445, | |
| "learning_rate": 0.0005950330574317509, | |
| "loss": 3.1535, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.002395644283121597, | |
| "grad_norm": 0.2421780228614807, | |
| "learning_rate": 0.0005945944291043779, | |
| "loss": 3.1712, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.0024500907441016334, | |
| "grad_norm": 0.2662357687950134, | |
| "learning_rate": 0.0005941375982917649, | |
| "loss": 3.1486, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.00250453720508167, | |
| "grad_norm": 0.23532505333423615, | |
| "learning_rate": 0.0005936626225027395, | |
| "loss": 3.1478, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.002558983666061706, | |
| "grad_norm": 0.33855125308036804, | |
| "learning_rate": 0.000593169561530337, | |
| "loss": 3.1025, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.0026134301270417425, | |
| "grad_norm": 0.23862820863723755, | |
| "learning_rate": 0.000592658477444273, | |
| "loss": 3.1272, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.0026678765880217784, | |
| "grad_norm": 0.2811816930770874, | |
| "learning_rate": 0.0005921294345831293, | |
| "loss": 3.1428, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.0027223230490018148, | |
| "grad_norm": 0.2487388700246811, | |
| "learning_rate": 0.0005915824995462552, | |
| "loss": 3.1305, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.002776769509981851, | |
| "grad_norm": 0.2535880506038666, | |
| "learning_rate": 0.0005910177411853828, | |
| "loss": 3.1648, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.0028312159709618875, | |
| "grad_norm": 0.3038806617259979, | |
| "learning_rate": 0.0005904352305959605, | |
| "loss": 3.1285, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.002885662431941924, | |
| "grad_norm": 0.23585520684719086, | |
| "learning_rate": 0.000589835041108202, | |
| "loss": 3.1428, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.00294010889292196, | |
| "grad_norm": 0.2504676580429077, | |
| "learning_rate": 0.0005892172482778558, | |
| "loss": 3.0991, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.0029945553539019966, | |
| "grad_norm": 0.2501893937587738, | |
| "learning_rate": 0.000588581929876693, | |
| "loss": 3.1339, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.0030490018148820325, | |
| "grad_norm": 0.2685960531234741, | |
| "learning_rate": 0.0005879291658827176, | |
| "loss": 3.1412, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.003103448275862069, | |
| "grad_norm": 0.23503637313842773, | |
| "learning_rate": 0.0005872590384700979, | |
| "loss": 3.1625, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.003157894736842105, | |
| "grad_norm": 0.24152350425720215, | |
| "learning_rate": 0.0005865716319988223, | |
| "loss": 3.1059, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.0032123411978221415, | |
| "grad_norm": 0.20929178595542908, | |
| "learning_rate": 0.000585867033004079, | |
| "loss": 3.1307, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.003266787658802178, | |
| "grad_norm": 0.2642001807689667, | |
| "learning_rate": 0.0005851453301853628, | |
| "loss": 3.1134, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.0033212341197822143, | |
| "grad_norm": 0.24187405407428741, | |
| "learning_rate": 0.0005844066143953087, | |
| "loss": 3.1523, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.0033756805807622506, | |
| "grad_norm": 0.2009871006011963, | |
| "learning_rate": 0.0005836509786282552, | |
| "loss": 3.1294, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.0034301270417422865, | |
| "grad_norm": 0.25516337156295776, | |
| "learning_rate": 0.000582878518008537, | |
| "loss": 3.1309, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.003484573502722323, | |
| "grad_norm": 0.22489990293979645, | |
| "learning_rate": 0.0005820893297785106, | |
| "loss": 3.1414, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.0035390199637023593, | |
| "grad_norm": 0.22305169701576233, | |
| "learning_rate": 0.000581283513286313, | |
| "loss": 3.1573, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.0035934664246823956, | |
| "grad_norm": 0.21161314845085144, | |
| "learning_rate": 0.0005804611699733543, | |
| "loss": 3.1103, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.003647912885662432, | |
| "grad_norm": 0.2384558469057083, | |
| "learning_rate": 0.0005796224033615482, | |
| "loss": 3.1364, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.0037023593466424683, | |
| "grad_norm": 0.2701220214366913, | |
| "learning_rate": 0.0005787673190402799, | |
| "loss": 3.1369, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.0037568058076225047, | |
| "grad_norm": 0.19185633957386017, | |
| "learning_rate": 0.0005778960246531138, | |
| "loss": 3.1247, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.003811252268602541, | |
| "grad_norm": 0.3245127499103546, | |
| "learning_rate": 0.0005770086298842426, | |
| "loss": 3.126, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.003865698729582577, | |
| "grad_norm": 0.281764417886734, | |
| "learning_rate": 0.0005761052464446795, | |
| "loss": 3.1295, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.003920145190562613, | |
| "grad_norm": 0.24820677936077118, | |
| "learning_rate": 0.0005751859880581954, | |
| "loss": 3.1337, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.00397459165154265, | |
| "grad_norm": 0.24963483214378357, | |
| "learning_rate": 0.0005742509704470024, | |
| "loss": 3.119, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.004029038112522686, | |
| "grad_norm": 0.264715313911438, | |
| "learning_rate": 0.0005733003113171864, | |
| "loss": 3.1203, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.004083484573502722, | |
| "grad_norm": 0.27801015973091125, | |
| "learning_rate": 0.000572334130343889, | |
| "loss": 3.1531, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.004137931034482759, | |
| "grad_norm": 0.21041367948055267, | |
| "learning_rate": 0.0005713525491562421, | |
| "loss": 3.1208, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.004192377495462795, | |
| "grad_norm": 0.22113123536109924, | |
| "learning_rate": 0.0005703556913220566, | |
| "loss": 3.1209, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.0042468239564428314, | |
| "grad_norm": 0.22394251823425293, | |
| "learning_rate": 0.0005693436823322671, | |
| "loss": 3.1101, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.004301270417422867, | |
| "grad_norm": 0.26046207547187805, | |
| "learning_rate": 0.0005683166495851336, | |
| "loss": 3.0935, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.004355716878402904, | |
| "grad_norm": 0.19576989114284515, | |
| "learning_rate": 0.0005672747223702044, | |
| "loss": 3.1381, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.00441016333938294, | |
| "grad_norm": 0.22762754559516907, | |
| "learning_rate": 0.0005662180318520402, | |
| "loss": 3.1117, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.004464609800362976, | |
| "grad_norm": 0.218556210398674, | |
| "learning_rate": 0.0005651467110537016, | |
| "loss": 3.1164, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.004519056261343013, | |
| "grad_norm": 0.23464246094226837, | |
| "learning_rate": 0.0005640608948400046, | |
| "loss": 3.1533, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.004573502722323049, | |
| "grad_norm": 0.23230504989624023, | |
| "learning_rate": 0.0005629607199005416, | |
| "loss": 3.128, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.0046279491833030855, | |
| "grad_norm": 0.2193627804517746, | |
| "learning_rate": 0.0005618463247324748, | |
| "loss": 3.1162, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.004682395644283121, | |
| "grad_norm": 0.22908492386341095, | |
| "learning_rate": 0.0005607178496231011, | |
| "loss": 3.1198, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.004736842105263158, | |
| "grad_norm": 0.2191692590713501, | |
| "learning_rate": 0.0005595754366321915, | |
| "loss": 3.1135, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.004791288566243194, | |
| "grad_norm": 0.21760721504688263, | |
| "learning_rate": 0.0005584192295741086, | |
| "loss": 3.112, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.00484573502722323, | |
| "grad_norm": 0.27237263321876526, | |
| "learning_rate": 0.0005572493739997012, | |
| "loss": 3.1167, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.004900181488203267, | |
| "grad_norm": 0.2498655915260315, | |
| "learning_rate": 0.000556066017177982, | |
| "loss": 3.119, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.004954627949183303, | |
| "grad_norm": 0.23274663090705872, | |
| "learning_rate": 0.0005548693080775885, | |
| "loss": 3.168, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.00500907441016334, | |
| "grad_norm": 0.22452446818351746, | |
| "learning_rate": 0.0005536593973480297, | |
| "loss": 3.1238, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.0050635208711433755, | |
| "grad_norm": 0.2021019011735916, | |
| "learning_rate": 0.000552436437300721, | |
| "loss": 3.1363, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.005117967332123412, | |
| "grad_norm": 0.20027242600917816, | |
| "learning_rate": 0.0005512005818898111, | |
| "loss": 3.1075, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.005172413793103448, | |
| "grad_norm": 0.20116809010505676, | |
| "learning_rate": 0.0005499519866928005, | |
| "loss": 3.1137, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.005226860254083485, | |
| "grad_norm": 0.29089683294296265, | |
| "learning_rate": 0.0005486908088909568, | |
| "loss": 3.1435, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.005281306715063521, | |
| "grad_norm": 0.26022160053253174, | |
| "learning_rate": 0.0005474172072495275, | |
| "loss": 3.1348, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.005335753176043557, | |
| "grad_norm": 0.25501441955566406, | |
| "learning_rate": 0.0005461313420977536, | |
| "loss": 3.1243, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.005390199637023594, | |
| "grad_norm": 0.2464355230331421, | |
| "learning_rate": 0.0005448333753086864, | |
| "loss": 3.1329, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.0054446460980036296, | |
| "grad_norm": 0.25668638944625854, | |
| "learning_rate": 0.00054352347027881, | |
| "loss": 3.0924, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.0054446460980036296, | |
| "eval_loss": 3.0220067501068115, | |
| "eval_runtime": 69.1667, | |
| "eval_samples_per_second": 62.385, | |
| "eval_steps_per_second": 15.6, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.005499092558983666, | |
| "grad_norm": 0.25321197509765625, | |
| "learning_rate": 0.0005422017919074715, | |
| "loss": 3.1289, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.005553539019963702, | |
| "grad_norm": 0.21445196866989136, | |
| "learning_rate": 0.0005408685065761229, | |
| "loss": 3.123, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.005607985480943739, | |
| "grad_norm": 0.227112278342247, | |
| "learning_rate": 0.0005395237821273755, | |
| "loss": 3.1302, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.005662431941923775, | |
| "grad_norm": 0.22493597865104675, | |
| "learning_rate": 0.000538167787843871, | |
| "loss": 3.1471, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.005716878402903811, | |
| "grad_norm": 0.24361659586429596, | |
| "learning_rate": 0.0005368006944269708, | |
| "loss": 3.1271, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.005771324863883848, | |
| "grad_norm": 0.2216707319021225, | |
| "learning_rate": 0.0005354226739752678, | |
| "loss": 3.1156, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.005825771324863884, | |
| "grad_norm": 0.2140239179134369, | |
| "learning_rate": 0.0005340338999629203, | |
| "loss": 3.1071, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.00588021778584392, | |
| "grad_norm": 0.22374412417411804, | |
| "learning_rate": 0.0005326345472178154, | |
| "loss": 3.1244, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.005934664246823956, | |
| "grad_norm": 0.21664555370807648, | |
| "learning_rate": 0.0005312247918995588, | |
| "loss": 3.1254, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.005989110707803993, | |
| "grad_norm": 0.21399718523025513, | |
| "learning_rate": 0.0005298048114773004, | |
| "loss": 3.119, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.006043557168784029, | |
| "grad_norm": 0.18971006572246552, | |
| "learning_rate": 0.0005283747847073922, | |
| "loss": 3.1254, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.006098003629764065, | |
| "grad_norm": 0.1922215223312378, | |
| "learning_rate": 0.0005269348916108859, | |
| "loss": 3.1016, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.006152450090744102, | |
| "grad_norm": 0.21366891264915466, | |
| "learning_rate": 0.00052548531345087, | |
| "loss": 3.1192, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.006206896551724138, | |
| "grad_norm": 0.1934356689453125, | |
| "learning_rate": 0.000524026232709652, | |
| "loss": 3.1254, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.0062613430127041745, | |
| "grad_norm": 0.23197585344314575, | |
| "learning_rate": 0.0005225578330657859, | |
| "loss": 3.1503, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.00631578947368421, | |
| "grad_norm": 0.227361798286438, | |
| "learning_rate": 0.0005210802993709497, | |
| "loss": 3.112, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.006370235934664247, | |
| "grad_norm": 0.25826945900917053, | |
| "learning_rate": 0.0005195938176266751, | |
| "loss": 3.1405, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.006424682395644283, | |
| "grad_norm": 0.2799200117588043, | |
| "learning_rate": 0.000518098574960932, | |
| "loss": 3.1425, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.006479128856624319, | |
| "grad_norm": 0.24563385546207428, | |
| "learning_rate": 0.0005165947596045723, | |
| "loss": 3.1573, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.006533575317604356, | |
| "grad_norm": 0.2642490267753601, | |
| "learning_rate": 0.0005150825608676336, | |
| "loss": 3.1267, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.006588021778584392, | |
| "grad_norm": 0.2301069051027298, | |
| "learning_rate": 0.0005135621691155083, | |
| "loss": 3.1151, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.0066424682395644285, | |
| "grad_norm": 0.22567883133888245, | |
| "learning_rate": 0.0005120337757449781, | |
| "loss": 3.1465, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.0066969147005444644, | |
| "grad_norm": 0.23506537079811096, | |
| "learning_rate": 0.0005104975731601208, | |
| "loss": 3.1196, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.006751361161524501, | |
| "grad_norm": 0.20675747096538544, | |
| "learning_rate": 0.0005089537547480885, | |
| "loss": 3.1063, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.006805807622504537, | |
| "grad_norm": 0.20474228262901306, | |
| "learning_rate": 0.0005074025148547634, | |
| "loss": 3.1211, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.006860254083484573, | |
| "grad_norm": 0.21194572746753693, | |
| "learning_rate": 0.0005058440487602918, | |
| "loss": 3.1456, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.00691470054446461, | |
| "grad_norm": 0.20788611471652985, | |
| "learning_rate": 0.0005042785526545008, | |
| "loss": 3.1353, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.006969147005444646, | |
| "grad_norm": 0.19835753738880157, | |
| "learning_rate": 0.0005027062236122014, | |
| "loss": 3.0889, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.007023593466424683, | |
| "grad_norm": 0.24613966047763824, | |
| "learning_rate": 0.0005011272595683787, | |
| "loss": 3.1023, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.0070780399274047185, | |
| "grad_norm": 0.18778769671916962, | |
| "learning_rate": 0.000499541859293275, | |
| "loss": 3.1091, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.007132486388384755, | |
| "grad_norm": 0.22894078493118286, | |
| "learning_rate": 0.0004979502223673672, | |
| "loss": 3.0836, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.007186932849364791, | |
| "grad_norm": 0.2452639937400818, | |
| "learning_rate": 0.0004963525491562421, | |
| "loss": 3.1285, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.007241379310344828, | |
| "grad_norm": 0.26903706789016724, | |
| "learning_rate": 0.0004947490407853734, | |
| "loss": 3.135, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.007295825771324864, | |
| "grad_norm": 0.29524174332618713, | |
| "learning_rate": 0.0004931398991148025, | |
| "loss": 3.1204, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.0073502722323049, | |
| "grad_norm": 0.1946076601743698, | |
| "learning_rate": 0.0004915253267137274, | |
| "loss": 3.1482, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.007404718693284937, | |
| "grad_norm": 0.3097498416900635, | |
| "learning_rate": 0.0004899055268350012, | |
| "loss": 3.1188, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.0074591651542649726, | |
| "grad_norm": 0.23407086730003357, | |
| "learning_rate": 0.0004882807033895463, | |
| "loss": 3.1448, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.007513611615245009, | |
| "grad_norm": 0.22323715686798096, | |
| "learning_rate": 0.0004866510609206841, | |
| "loss": 3.096, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.007568058076225045, | |
| "grad_norm": 0.23807968199253082, | |
| "learning_rate": 0.0004850168045783858, | |
| "loss": 3.1348, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.007622504537205082, | |
| "grad_norm": 0.2817918658256531, | |
| "learning_rate": 0.0004833781400934471, | |
| "loss": 3.1215, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.007676950998185118, | |
| "grad_norm": 0.25021976232528687, | |
| "learning_rate": 0.00048173527375158944, | |
| "loss": 3.1019, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.007731397459165154, | |
| "grad_norm": 0.25827258825302124, | |
| "learning_rate": 0.00048008841236749084, | |
| "loss": 3.1155, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.007785843920145191, | |
| "grad_norm": 0.2191617339849472, | |
| "learning_rate": 0.00047843776325875173, | |
| "loss": 3.1183, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.007840290381125227, | |
| "grad_norm": 0.2649674415588379, | |
| "learning_rate": 0.0004767835342197954, | |
| "loss": 3.1098, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.007894736842105263, | |
| "grad_norm": 0.2287340611219406, | |
| "learning_rate": 0.00047512593349571043, | |
| "loss": 3.1004, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.0079491833030853, | |
| "grad_norm": 0.21950604021549225, | |
| "learning_rate": 0.00047346516975603465, | |
| "loss": 3.0733, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.008003629764065335, | |
| "grad_norm": 0.2002648264169693, | |
| "learning_rate": 0.00047180145206848686, | |
| "loss": 3.0934, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.008058076225045372, | |
| "grad_norm": 0.20886124670505524, | |
| "learning_rate": 0.0004701349898726483, | |
| "loss": 3.1007, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.008112522686025409, | |
| "grad_norm": 0.19583792984485626, | |
| "learning_rate": 0.00046846599295359635, | |
| "loss": 3.1249, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.008166969147005444, | |
| "grad_norm": 0.18620361387729645, | |
| "learning_rate": 0.00046679467141549615, | |
| "loss": 3.1514, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.00822141560798548, | |
| "grad_norm": 0.2266155332326889, | |
| "learning_rate": 0.00046512123565515065, | |
| "loss": 3.1583, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.008275862068965517, | |
| "grad_norm": 0.18331050872802734, | |
| "learning_rate": 0.00046344589633551497, | |
| "loss": 3.1015, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.008330308529945554, | |
| "grad_norm": 0.23021604120731354, | |
| "learning_rate": 0.00046176886435917667, | |
| "loss": 3.0984, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.00838475499092559, | |
| "grad_norm": 0.23210260272026062, | |
| "learning_rate": 0.00046009035084180593, | |
| "loss": 3.1239, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.008439201451905626, | |
| "grad_norm": 0.2083989381790161, | |
| "learning_rate": 0.0004584105670855787, | |
| "loss": 3.0929, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.008493647912885663, | |
| "grad_norm": 0.22709086537361145, | |
| "learning_rate": 0.00045672972455257723, | |
| "loss": 3.1376, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.008548094373865698, | |
| "grad_norm": 0.24739298224449158, | |
| "learning_rate": 0.0004550480348381691, | |
| "loss": 3.1135, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.008602540834845735, | |
| "grad_norm": 0.2940455973148346, | |
| "learning_rate": 0.0004533657096443708, | |
| "loss": 3.1164, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.008656987295825772, | |
| "grad_norm": 0.2183440625667572, | |
| "learning_rate": 0.00045168296075319685, | |
| "loss": 3.1496, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.008711433756805808, | |
| "grad_norm": 0.254537969827652, | |
| "learning_rate": 0.00045, | |
| "loss": 3.1164, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.008765880217785843, | |
| "grad_norm": 0.3475794792175293, | |
| "learning_rate": 0.00044831703924680307, | |
| "loss": 3.1241, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.00882032667876588, | |
| "grad_norm": 0.25378546118736267, | |
| "learning_rate": 0.00044663429035562925, | |
| "loss": 3.0785, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.008874773139745917, | |
| "grad_norm": 0.3237019181251526, | |
| "learning_rate": 0.0004449519651618309, | |
| "loss": 3.1247, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.008929219600725952, | |
| "grad_norm": 0.2086883783340454, | |
| "learning_rate": 0.0004432702754474228, | |
| "loss": 3.1397, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.008983666061705989, | |
| "grad_norm": 0.2567574083805084, | |
| "learning_rate": 0.0004415894329144212, | |
| "loss": 3.1443, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.009038112522686026, | |
| "grad_norm": 0.1955418884754181, | |
| "learning_rate": 0.000439909649158194, | |
| "loss": 3.1298, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.009092558983666062, | |
| "grad_norm": 0.23903286457061768, | |
| "learning_rate": 0.00043823113564082325, | |
| "loss": 3.1061, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.009147005444646097, | |
| "grad_norm": 0.18639686703681946, | |
| "learning_rate": 0.00043655410366448495, | |
| "loss": 3.1364, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.009201451905626134, | |
| "grad_norm": 0.28311777114868164, | |
| "learning_rate": 0.0004348787643448493, | |
| "loss": 3.1165, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.009255898366606171, | |
| "grad_norm": 0.21376436948776245, | |
| "learning_rate": 0.0004332053285845038, | |
| "loss": 3.1332, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.009310344827586206, | |
| "grad_norm": 0.25171706080436707, | |
| "learning_rate": 0.0004315340070464036, | |
| "loss": 3.0928, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.009364791288566243, | |
| "grad_norm": 0.2478170096874237, | |
| "learning_rate": 0.0004298650101273517, | |
| "loss": 3.1146, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.00941923774954628, | |
| "grad_norm": 0.2905559837818146, | |
| "learning_rate": 0.0004281985479315131, | |
| "loss": 3.1248, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.009473684210526316, | |
| "grad_norm": 0.23062513768672943, | |
| "learning_rate": 0.00042653483024396527, | |
| "loss": 3.0908, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.009528130671506351, | |
| "grad_norm": 0.21542707085609436, | |
| "learning_rate": 0.0004248740665042895, | |
| "loss": 3.116, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.009582577132486388, | |
| "grad_norm": 0.2628575563430786, | |
| "learning_rate": 0.0004232164657802045, | |
| "loss": 3.0974, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.009637023593466425, | |
| "grad_norm": 0.2287788689136505, | |
| "learning_rate": 0.0004215622367412482, | |
| "loss": 3.1229, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.00969147005444646, | |
| "grad_norm": 0.23420408368110657, | |
| "learning_rate": 0.0004199115876325091, | |
| "loss": 3.1408, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.009745916515426497, | |
| "grad_norm": 0.19503605365753174, | |
| "learning_rate": 0.0004182647262484106, | |
| "loss": 3.104, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.009800362976406534, | |
| "grad_norm": 0.2603705823421478, | |
| "learning_rate": 0.0004166218599065528, | |
| "loss": 3.1185, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.00985480943738657, | |
| "grad_norm": 0.20003163814544678, | |
| "learning_rate": 0.0004149831954216142, | |
| "loss": 3.1248, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.009909255898366606, | |
| "grad_norm": 0.2520509660243988, | |
| "learning_rate": 0.00041334893907931584, | |
| "loss": 3.1053, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.009963702359346642, | |
| "grad_norm": 0.2309730499982834, | |
| "learning_rate": 0.0004117192966104536, | |
| "loss": 3.0824, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.01001814882032668, | |
| "grad_norm": 0.24436035752296448, | |
| "learning_rate": 0.0004100944731649987, | |
| "loss": 3.1365, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.010072595281306716, | |
| "grad_norm": 0.1982448399066925, | |
| "learning_rate": 0.0004084746732862726, | |
| "loss": 3.1232, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.010127041742286751, | |
| "grad_norm": 0.28294622898101807, | |
| "learning_rate": 0.0004068601008851974, | |
| "loss": 3.1373, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.010181488203266788, | |
| "grad_norm": 0.19457624852657318, | |
| "learning_rate": 0.0004052509592146266, | |
| "loss": 3.1108, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.010235934664246825, | |
| "grad_norm": 0.20812362432479858, | |
| "learning_rate": 0.00040364745084375787, | |
| "loss": 3.1269, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.01029038112522686, | |
| "grad_norm": 0.26139721274375916, | |
| "learning_rate": 0.0004020497776326328, | |
| "loss": 3.1027, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.010344827586206896, | |
| "grad_norm": 0.24709898233413696, | |
| "learning_rate": 0.00040045814070672494, | |
| "loss": 3.1102, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.010399274047186933, | |
| "grad_norm": 0.20621921122074127, | |
| "learning_rate": 0.0003988727404316212, | |
| "loss": 3.1248, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.01045372050816697, | |
| "grad_norm": 0.21720059216022491, | |
| "learning_rate": 0.00039729377638779857, | |
| "loss": 3.1187, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.010508166969147005, | |
| "grad_norm": 0.1787269115447998, | |
| "learning_rate": 0.0003957214473454991, | |
| "loss": 3.1032, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.010562613430127042, | |
| "grad_norm": 0.22008821368217468, | |
| "learning_rate": 0.00039415595123970813, | |
| "loss": 3.1438, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.010617059891107079, | |
| "grad_norm": 0.2438061535358429, | |
| "learning_rate": 0.00039259748514523655, | |
| "loss": 3.1444, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.010671506352087114, | |
| "grad_norm": 0.21327753365039825, | |
| "learning_rate": 0.0003910462452519114, | |
| "loss": 3.1147, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.01072595281306715, | |
| "grad_norm": 0.22642219066619873, | |
| "learning_rate": 0.0003895024268398792, | |
| "loss": 3.1221, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.010780399274047187, | |
| "grad_norm": 0.1771782636642456, | |
| "learning_rate": 0.00038796622425502195, | |
| "loss": 3.1153, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.010834845735027224, | |
| "grad_norm": 0.25223565101623535, | |
| "learning_rate": 0.00038643783088449163, | |
| "loss": 3.1249, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.010889292196007259, | |
| "grad_norm": 0.15892428159713745, | |
| "learning_rate": 0.00038491743913236624, | |
| "loss": 3.1273, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.010889292196007259, | |
| "eval_loss": 3.0130114555358887, | |
| "eval_runtime": 69.209, | |
| "eval_samples_per_second": 62.347, | |
| "eval_steps_per_second": 15.59, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.010943738656987296, | |
| "grad_norm": 0.2665696144104004, | |
| "learning_rate": 0.0003834052403954277, | |
| "loss": 3.1162, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.010998185117967333, | |
| "grad_norm": 0.2027243971824646, | |
| "learning_rate": 0.00038190142503906794, | |
| "loss": 3.0788, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.011052631578947368, | |
| "grad_norm": 0.2640676498413086, | |
| "learning_rate": 0.00038040618237332485, | |
| "loss": 3.0978, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.011107078039927405, | |
| "grad_norm": 0.21408383548259735, | |
| "learning_rate": 0.0003789197006290502, | |
| "loss": 3.1155, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.011161524500907441, | |
| "grad_norm": 0.21028189361095428, | |
| "learning_rate": 0.00037744216693421403, | |
| "loss": 3.0896, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.011215970961887478, | |
| "grad_norm": 0.183872789144516, | |
| "learning_rate": 0.00037597376729034794, | |
| "loss": 3.1179, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.011270417422867513, | |
| "grad_norm": 0.24802212417125702, | |
| "learning_rate": 0.00037451468654912994, | |
| "loss": 3.1283, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.01132486388384755, | |
| "grad_norm": 0.20964276790618896, | |
| "learning_rate": 0.00037306510838911404, | |
| "loss": 3.0941, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.011379310344827587, | |
| "grad_norm": 0.2240600436925888, | |
| "learning_rate": 0.00037162521529260763, | |
| "loss": 3.1226, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.011433756805807622, | |
| "grad_norm": 0.23930035531520844, | |
| "learning_rate": 0.00037019518852269954, | |
| "loss": 3.111, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.011488203266787659, | |
| "grad_norm": 0.21928225457668304, | |
| "learning_rate": 0.0003687752081004411, | |
| "loss": 3.1014, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.011542649727767695, | |
| "grad_norm": 0.24399928748607635, | |
| "learning_rate": 0.0003673654527821846, | |
| "loss": 3.1345, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.011597096188747732, | |
| "grad_norm": 0.23558929562568665, | |
| "learning_rate": 0.00036596610003707954, | |
| "loss": 3.0852, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.011651542649727767, | |
| "grad_norm": 0.19627036154270172, | |
| "learning_rate": 0.00036457732602473216, | |
| "loss": 3.1098, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.011705989110707804, | |
| "grad_norm": 0.26581332087516785, | |
| "learning_rate": 0.0003631993055730291, | |
| "loss": 3.1467, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.01176043557168784, | |
| "grad_norm": 0.2310531884431839, | |
| "learning_rate": 0.000361832212156129, | |
| "loss": 3.1242, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.011814882032667876, | |
| "grad_norm": 0.2427210807800293, | |
| "learning_rate": 0.00036047621787262444, | |
| "loss": 3.1144, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.011869328493647913, | |
| "grad_norm": 0.2132464200258255, | |
| "learning_rate": 0.000359131493423877, | |
| "loss": 3.1184, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.01192377495462795, | |
| "grad_norm": 0.2877713441848755, | |
| "learning_rate": 0.0003577982080925284, | |
| "loss": 3.0916, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.011978221415607986, | |
| "grad_norm": 0.20918139815330505, | |
| "learning_rate": 0.00035647652972119, | |
| "loss": 3.1299, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.012032667876588021, | |
| "grad_norm": 0.23674660921096802, | |
| "learning_rate": 0.00035516662469131356, | |
| "loss": 3.1292, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.012087114337568058, | |
| "grad_norm": 0.1943867802619934, | |
| "learning_rate": 0.0003538686579022464, | |
| "loss": 3.1258, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.012141560798548095, | |
| "grad_norm": 0.34807291626930237, | |
| "learning_rate": 0.00035258279275047246, | |
| "loss": 3.1328, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.01219600725952813, | |
| "grad_norm": 0.20535360276699066, | |
| "learning_rate": 0.0003513091911090431, | |
| "loss": 3.1184, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.012250453720508167, | |
| "grad_norm": 0.26084408164024353, | |
| "learning_rate": 0.00035004801330719936, | |
| "loss": 3.1014, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.012304900181488203, | |
| "grad_norm": 0.2201356142759323, | |
| "learning_rate": 0.0003487994181101888, | |
| "loss": 3.0941, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.01235934664246824, | |
| "grad_norm": 0.20090270042419434, | |
| "learning_rate": 0.00034756356269927894, | |
| "loss": 3.093, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.012413793103448275, | |
| "grad_norm": 0.22596506774425507, | |
| "learning_rate": 0.00034634060265197026, | |
| "loss": 3.1011, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.012468239564428312, | |
| "grad_norm": 0.2359432578086853, | |
| "learning_rate": 0.00034513069192241137, | |
| "loss": 3.1118, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.012522686025408349, | |
| "grad_norm": 0.2078094184398651, | |
| "learning_rate": 0.0003439339828220179, | |
| "loss": 3.0898, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.012577132486388384, | |
| "grad_norm": 0.19436654448509216, | |
| "learning_rate": 0.00034275062600029865, | |
| "loss": 3.0997, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.01263157894736842, | |
| "grad_norm": 0.1749400496482849, | |
| "learning_rate": 0.0003415807704258913, | |
| "loss": 3.1013, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.012686025408348458, | |
| "grad_norm": 0.31390950083732605, | |
| "learning_rate": 0.00034042456336780833, | |
| "loss": 3.1088, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.012740471869328494, | |
| "grad_norm": 0.21334995329380035, | |
| "learning_rate": 0.00033928215037689886, | |
| "loss": 3.1367, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.01279491833030853, | |
| "grad_norm": 0.2765377461910248, | |
| "learning_rate": 0.00033815367526752516, | |
| "loss": 3.0837, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.012849364791288566, | |
| "grad_norm": 0.21292102336883545, | |
| "learning_rate": 0.0003370392800994583, | |
| "loss": 3.116, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.012903811252268603, | |
| "grad_norm": 0.22039784491062164, | |
| "learning_rate": 0.0003359391051599953, | |
| "loss": 3.1287, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.012958257713248638, | |
| "grad_norm": 0.31274768710136414, | |
| "learning_rate": 0.0003348532889462983, | |
| "loss": 3.103, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.013012704174228675, | |
| "grad_norm": 0.2530403435230255, | |
| "learning_rate": 0.00033378196814795987, | |
| "loss": 3.1088, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.013067150635208712, | |
| "grad_norm": 0.25343602895736694, | |
| "learning_rate": 0.0003327252776297955, | |
| "loss": 3.0815, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.013121597096188748, | |
| "grad_norm": 0.2652718722820282, | |
| "learning_rate": 0.0003316833504148663, | |
| "loss": 3.1188, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.013176043557168783, | |
| "grad_norm": 0.2119700163602829, | |
| "learning_rate": 0.0003306563176677328, | |
| "loss": 3.1262, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.01323049001814882, | |
| "grad_norm": 0.2630724310874939, | |
| "learning_rate": 0.00032964430867794326, | |
| "loss": 3.1183, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.013284936479128857, | |
| "grad_norm": 0.2809687554836273, | |
| "learning_rate": 0.00032864745084375783, | |
| "loss": 3.1087, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.013339382940108892, | |
| "grad_norm": 0.24847863614559174, | |
| "learning_rate": 0.00032766586965611095, | |
| "loss": 3.1053, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.013393829401088929, | |
| "grad_norm": 0.2698868215084076, | |
| "learning_rate": 0.00032669968868281353, | |
| "loss": 3.0977, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.013448275862068966, | |
| "grad_norm": 0.33301204442977905, | |
| "learning_rate": 0.0003257490295529975, | |
| "loss": 3.098, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.013502722323049002, | |
| "grad_norm": 0.2521134912967682, | |
| "learning_rate": 0.0003248140119418046, | |
| "loss": 3.0924, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.013557168784029038, | |
| "grad_norm": 0.22425313293933868, | |
| "learning_rate": 0.00032389475355532044, | |
| "loss": 3.0911, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.013611615245009074, | |
| "grad_norm": 0.2841155230998993, | |
| "learning_rate": 0.00032299137011575734, | |
| "loss": 3.1019, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.013666061705989111, | |
| "grad_norm": 0.19375889003276825, | |
| "learning_rate": 0.00032210397534688617, | |
| "loss": 3.1183, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.013720508166969146, | |
| "grad_norm": 0.26036033034324646, | |
| "learning_rate": 0.00032123268095972005, | |
| "loss": 3.1296, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.013774954627949183, | |
| "grad_norm": 0.24063162505626678, | |
| "learning_rate": 0.0003203775966384518, | |
| "loss": 3.142, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.01382940108892922, | |
| "grad_norm": 0.30820685625076294, | |
| "learning_rate": 0.0003195388300266457, | |
| "loss": 3.0998, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.013883847549909257, | |
| "grad_norm": 0.18717393279075623, | |
| "learning_rate": 0.0003187164867136869, | |
| "loss": 3.1363, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.013938294010889292, | |
| "grad_norm": 0.22616641223430634, | |
| "learning_rate": 0.0003179106702214893, | |
| "loss": 3.1284, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.013992740471869328, | |
| "grad_norm": 0.19695289433002472, | |
| "learning_rate": 0.000317121481991463, | |
| "loss": 3.0859, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.014047186932849365, | |
| "grad_norm": 0.24756500124931335, | |
| "learning_rate": 0.0003163490213717448, | |
| "loss": 3.1187, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.014101633393829402, | |
| "grad_norm": 0.2162931114435196, | |
| "learning_rate": 0.00031559338560469116, | |
| "loss": 3.1066, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.014156079854809437, | |
| "grad_norm": 0.2389383465051651, | |
| "learning_rate": 0.0003148546698146371, | |
| "loss": 3.1096, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.014210526315789474, | |
| "grad_norm": 0.1939517855644226, | |
| "learning_rate": 0.0003141329669959209, | |
| "loss": 3.115, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.01426497277676951, | |
| "grad_norm": 0.24441573023796082, | |
| "learning_rate": 0.00031342836800117763, | |
| "loss": 3.0954, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.014319419237749546, | |
| "grad_norm": 0.31514814496040344, | |
| "learning_rate": 0.000312740961529902, | |
| "loss": 3.1259, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.014373865698729582, | |
| "grad_norm": 0.28324607014656067, | |
| "learning_rate": 0.00031207083411728236, | |
| "loss": 3.1088, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.01442831215970962, | |
| "grad_norm": 0.21894101798534393, | |
| "learning_rate": 0.00031141807012330695, | |
| "loss": 3.1071, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.014482758620689656, | |
| "grad_norm": 0.32468879222869873, | |
| "learning_rate": 0.0003107827517221441, | |
| "loss": 3.1159, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.014537205081669691, | |
| "grad_norm": 0.2448977530002594, | |
| "learning_rate": 0.00031016495889179787, | |
| "loss": 3.1091, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.014591651542649728, | |
| "grad_norm": 0.3004207909107208, | |
| "learning_rate": 0.0003095647694040394, | |
| "loss": 3.1091, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.014646098003629765, | |
| "grad_norm": 0.2677028179168701, | |
| "learning_rate": 0.0003089822588146171, | |
| "loss": 3.1069, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.0147005444646098, | |
| "grad_norm": 0.26184970140457153, | |
| "learning_rate": 0.0003084175004537448, | |
| "loss": 3.1272, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.014754990925589836, | |
| "grad_norm": 0.2534137964248657, | |
| "learning_rate": 0.0003078705654168706, | |
| "loss": 3.0968, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.014809437386569873, | |
| "grad_norm": 0.34196603298187256, | |
| "learning_rate": 0.0003073415225557269, | |
| "loss": 3.0834, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.01486388384754991, | |
| "grad_norm": 0.20533894002437592, | |
| "learning_rate": 0.0003068304384696629, | |
| "loss": 3.11, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.014918330308529945, | |
| "grad_norm": 0.2631680369377136, | |
| "learning_rate": 0.00030633737749726045, | |
| "loss": 3.0984, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.014972776769509982, | |
| "grad_norm": 0.24283327162265778, | |
| "learning_rate": 0.0003058624017082351, | |
| "loss": 3.1158, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.015027223230490019, | |
| "grad_norm": 0.23213128745555878, | |
| "learning_rate": 0.000305405570895622, | |
| "loss": 3.1186, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.015081669691470054, | |
| "grad_norm": 0.2027149498462677, | |
| "learning_rate": 0.00030496694256824903, | |
| "loss": 3.0893, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.01513611615245009, | |
| "grad_norm": 0.3051553964614868, | |
| "learning_rate": 0.00030454657194349695, | |
| "loss": 3.1062, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.015190562613430127, | |
| "grad_norm": 0.28350913524627686, | |
| "learning_rate": 0.00030414451194034846, | |
| "loss": 3.1115, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.015245009074410164, | |
| "grad_norm": 0.28886678814888, | |
| "learning_rate": 0.00030376081317272645, | |
| "loss": 3.1113, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.0152994555353902, | |
| "grad_norm": 0.24702845513820648, | |
| "learning_rate": 0.0003033955239431221, | |
| "loss": 3.092, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.015353901996370236, | |
| "grad_norm": 0.20456074178218842, | |
| "learning_rate": 0.00030304869023651464, | |
| "loss": 3.1087, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.015408348457350273, | |
| "grad_norm": 0.19598916172981262, | |
| "learning_rate": 0.0003027203557145822, | |
| "loss": 3.1357, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 0.015462794918330308, | |
| "grad_norm": 0.33903974294662476, | |
| "learning_rate": 0.0003024105617102055, | |
| "loss": 3.1097, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.015517241379310345, | |
| "grad_norm": 0.22449573874473572, | |
| "learning_rate": 0.0003021193472222646, | |
| "loss": 3.1204, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.015571687840290381, | |
| "grad_norm": 0.28471046686172485, | |
| "learning_rate": 0.0003018467489107293, | |
| "loss": 3.115, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.015626134301270418, | |
| "grad_norm": 0.2602684497833252, | |
| "learning_rate": 0.0003015928010920444, | |
| "loss": 3.1306, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.015680580762250453, | |
| "grad_norm": 0.23596693575382233, | |
| "learning_rate": 0.0003013575357348098, | |
| "loss": 3.1111, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.01573502722323049, | |
| "grad_norm": 0.3110821843147278, | |
| "learning_rate": 0.0003011409824557554, | |
| "loss": 3.0869, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.015789473684210527, | |
| "grad_norm": 0.22817878425121307, | |
| "learning_rate": 0.00030094316851601356, | |
| "loss": 3.1123, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.015843920145190562, | |
| "grad_norm": 0.26526954770088196, | |
| "learning_rate": 0.00030076411881768716, | |
| "loss": 3.1038, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.0158983666061706, | |
| "grad_norm": 0.24120832979679108, | |
| "learning_rate": 0.0003006038559007141, | |
| "loss": 3.0763, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.015952813067150635, | |
| "grad_norm": 0.2750589847564697, | |
| "learning_rate": 0.0003004623999400308, | |
| "loss": 3.1197, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 0.01600725952813067, | |
| "grad_norm": 0.26484569907188416, | |
| "learning_rate": 0.0003003397687430316, | |
| "loss": 3.0945, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.01606170598911071, | |
| "grad_norm": 0.22410178184509277, | |
| "learning_rate": 0.0003002359777473275, | |
| "loss": 3.1024, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.016116152450090744, | |
| "grad_norm": 0.23759888112545013, | |
| "learning_rate": 0.00030015104001880274, | |
| "loss": 3.1167, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.01617059891107078, | |
| "grad_norm": 0.21586690843105316, | |
| "learning_rate": 0.00030008496624996995, | |
| "loss": 3.0945, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.016225045372050818, | |
| "grad_norm": 0.23178769648075104, | |
| "learning_rate": 0.00030003776475862396, | |
| "loss": 3.095, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.016279491833030853, | |
| "grad_norm": 0.24513190984725952, | |
| "learning_rate": 0.0003000094414867949, | |
| "loss": 3.0728, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.016333938294010888, | |
| "grad_norm": 0.20249220728874207, | |
| "learning_rate": 0.0003, | |
| "loss": 3.0881, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.016333938294010888, | |
| "eval_loss": 3.00935697555542, | |
| "eval_runtime": 69.1435, | |
| "eval_samples_per_second": 62.406, | |
| "eval_steps_per_second": 15.605, | |
| "step": 300 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 300, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 7.5252105216e+16, | |
| "train_batch_size": 40, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |