{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.016333938294010888, "eval_steps": 100, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 5.4446460980036295e-05, "grad_norm": 2.606283664703369, "learning_rate": 2.9999999999999997e-05, "loss": 3.3597, "step": 1 }, { "epoch": 0.00010889292196007259, "grad_norm": 2.6963095664978027, "learning_rate": 5.9999999999999995e-05, "loss": 3.3809, "step": 2 }, { "epoch": 0.0001633393829401089, "grad_norm": 3.531561851501465, "learning_rate": 8.999999999999999e-05, "loss": 3.353, "step": 3 }, { "epoch": 0.00021778584392014518, "grad_norm": 2.844101667404175, "learning_rate": 0.00011999999999999999, "loss": 3.3358, "step": 4 }, { "epoch": 0.0002722323049001815, "grad_norm": 2.73467755317688, "learning_rate": 0.00015, "loss": 3.2899, "step": 5 }, { "epoch": 0.0003266787658802178, "grad_norm": 2.266099452972412, "learning_rate": 0.00017999999999999998, "loss": 3.2718, "step": 6 }, { "epoch": 0.00038112522686025406, "grad_norm": 1.782454490661621, "learning_rate": 0.00020999999999999998, "loss": 3.25, "step": 7 }, { "epoch": 0.00043557168784029036, "grad_norm": 1.6878714561462402, "learning_rate": 0.00023999999999999998, "loss": 3.2508, "step": 8 }, { "epoch": 0.0004900181488203267, "grad_norm": 1.656855821609497, "learning_rate": 0.00027, "loss": 3.2482, "step": 9 }, { "epoch": 0.000544464609800363, "grad_norm": 1.196555256843567, "learning_rate": 0.0003, "loss": 3.2249, "step": 10 }, { "epoch": 0.0005989110707803993, "grad_norm": 0.7616098523139954, "learning_rate": 0.00033, "loss": 3.2238, "step": 11 }, { "epoch": 0.0006533575317604356, "grad_norm": 0.8151504993438721, "learning_rate": 0.00035999999999999997, "loss": 3.1822, "step": 12 }, { "epoch": 0.0007078039927404719, "grad_norm": 0.7040852308273315, "learning_rate": 0.00039, "loss": 3.1587, "step": 13 }, { "epoch": 0.0007622504537205081, "grad_norm": 0.5113953351974487, "learning_rate": 0.00041999999999999996, "loss": 3.2203, "step": 14 }, { "epoch": 0.0008166969147005445, "grad_norm": 0.6095477342605591, "learning_rate": 0.00045, "loss": 3.1912, "step": 15 }, { "epoch": 0.0008711433756805807, "grad_norm": 0.7497212290763855, "learning_rate": 0.00047999999999999996, "loss": 3.1985, "step": 16 }, { "epoch": 0.0009255898366606171, "grad_norm": 0.9816092848777771, "learning_rate": 0.0005099999999999999, "loss": 3.1859, "step": 17 }, { "epoch": 0.0009800362976406533, "grad_norm": 0.9469059109687805, "learning_rate": 0.00054, "loss": 3.2167, "step": 18 }, { "epoch": 0.0010344827586206897, "grad_norm": 0.6933241486549377, "learning_rate": 0.00057, "loss": 3.1399, "step": 19 }, { "epoch": 0.001088929219600726, "grad_norm": 0.9282199144363403, "learning_rate": 0.0006, "loss": 3.1684, "step": 20 }, { "epoch": 0.0011433756805807622, "grad_norm": 0.48486050963401794, "learning_rate": 0.000599990558513205, "loss": 3.2108, "step": 21 }, { "epoch": 0.0011978221415607985, "grad_norm": 0.5165353417396545, "learning_rate": 0.000599962235241376, "loss": 3.2203, "step": 22 }, { "epoch": 0.001252268602540835, "grad_norm": 0.40520766377449036, "learning_rate": 0.0005999150337500299, "loss": 3.1709, "step": 23 }, { "epoch": 0.0013067150635208712, "grad_norm": 0.4057173430919647, "learning_rate": 0.0005998489599811971, "loss": 3.1699, "step": 24 }, { "epoch": 0.0013611615245009074, "grad_norm": 0.3643300533294678, "learning_rate": 0.0005997640222526724, "loss": 3.1722, "step": 25 }, { "epoch": 0.0014156079854809437, "grad_norm": 0.4470251798629761, "learning_rate": 0.0005996602312569683, "loss": 3.1321, "step": 26 }, { "epoch": 0.00147005444646098, "grad_norm": 0.3419328033924103, "learning_rate": 0.0005995376000599692, "loss": 3.1753, "step": 27 }, { "epoch": 0.0015245009074410162, "grad_norm": 0.4055408537387848, "learning_rate": 0.0005993961440992859, "loss": 3.1604, "step": 28 }, { "epoch": 0.0015789473684210526, "grad_norm": 0.3212841749191284, "learning_rate": 0.0005992358811823128, "loss": 3.1666, "step": 29 }, { "epoch": 0.001633393829401089, "grad_norm": 0.30549293756484985, "learning_rate": 0.0005990568314839863, "loss": 3.1292, "step": 30 }, { "epoch": 0.0016878402903811253, "grad_norm": 0.3610576391220093, "learning_rate": 0.0005988590175442446, "loss": 3.1435, "step": 31 }, { "epoch": 0.0017422867513611614, "grad_norm": 0.2855391502380371, "learning_rate": 0.0005986424642651901, "loss": 3.1432, "step": 32 }, { "epoch": 0.0017967332123411978, "grad_norm": 0.30511122941970825, "learning_rate": 0.0005984071989079554, "loss": 3.1463, "step": 33 }, { "epoch": 0.0018511796733212342, "grad_norm": 0.29414448142051697, "learning_rate": 0.0005981532510892706, "loss": 3.1336, "step": 34 }, { "epoch": 0.0019056261343012705, "grad_norm": 0.3027634620666504, "learning_rate": 0.0005978806527777354, "loss": 3.1277, "step": 35 }, { "epoch": 0.0019600725952813067, "grad_norm": 0.3479376435279846, "learning_rate": 0.0005975894382897944, "loss": 3.1298, "step": 36 }, { "epoch": 0.002014519056261343, "grad_norm": 0.31864309310913086, "learning_rate": 0.0005972796442854177, "loss": 3.1501, "step": 37 }, { "epoch": 0.0020689655172413794, "grad_norm": 0.33572497963905334, "learning_rate": 0.0005969513097634852, "loss": 3.1505, "step": 38 }, { "epoch": 0.0021234119782214157, "grad_norm": 0.23914705216884613, "learning_rate": 0.0005966044760568779, "loss": 3.1518, "step": 39 }, { "epoch": 0.002177858439201452, "grad_norm": 0.34076428413391113, "learning_rate": 0.0005962391868272735, "loss": 3.1378, "step": 40 }, { "epoch": 0.002232304900181488, "grad_norm": 0.25502562522888184, "learning_rate": 0.0005958554880596515, "loss": 3.1532, "step": 41 }, { "epoch": 0.0022867513611615244, "grad_norm": 0.2586739659309387, "learning_rate": 0.000595453428056503, "loss": 3.1261, "step": 42 }, { "epoch": 0.0023411978221415607, "grad_norm": 0.3028804659843445, "learning_rate": 0.0005950330574317509, "loss": 3.1535, "step": 43 }, { "epoch": 0.002395644283121597, "grad_norm": 0.2421780228614807, "learning_rate": 0.0005945944291043779, "loss": 3.1712, "step": 44 }, { "epoch": 0.0024500907441016334, "grad_norm": 0.2662357687950134, "learning_rate": 0.0005941375982917649, "loss": 3.1486, "step": 45 }, { "epoch": 0.00250453720508167, "grad_norm": 0.23532505333423615, "learning_rate": 0.0005936626225027395, "loss": 3.1478, "step": 46 }, { "epoch": 0.002558983666061706, "grad_norm": 0.33855125308036804, "learning_rate": 0.000593169561530337, "loss": 3.1025, "step": 47 }, { "epoch": 0.0026134301270417425, "grad_norm": 0.23862820863723755, "learning_rate": 0.000592658477444273, "loss": 3.1272, "step": 48 }, { "epoch": 0.0026678765880217784, "grad_norm": 0.2811816930770874, "learning_rate": 0.0005921294345831293, "loss": 3.1428, "step": 49 }, { "epoch": 0.0027223230490018148, "grad_norm": 0.2487388700246811, "learning_rate": 0.0005915824995462552, "loss": 3.1305, "step": 50 }, { "epoch": 0.002776769509981851, "grad_norm": 0.2535880506038666, "learning_rate": 0.0005910177411853828, "loss": 3.1648, "step": 51 }, { "epoch": 0.0028312159709618875, "grad_norm": 0.3038806617259979, "learning_rate": 0.0005904352305959605, "loss": 3.1285, "step": 52 }, { "epoch": 0.002885662431941924, "grad_norm": 0.23585520684719086, "learning_rate": 0.000589835041108202, "loss": 3.1428, "step": 53 }, { "epoch": 0.00294010889292196, "grad_norm": 0.2504676580429077, "learning_rate": 0.0005892172482778558, "loss": 3.0991, "step": 54 }, { "epoch": 0.0029945553539019966, "grad_norm": 0.2501893937587738, "learning_rate": 0.000588581929876693, "loss": 3.1339, "step": 55 }, { "epoch": 0.0030490018148820325, "grad_norm": 0.2685960531234741, "learning_rate": 0.0005879291658827176, "loss": 3.1412, "step": 56 }, { "epoch": 0.003103448275862069, "grad_norm": 0.23503637313842773, "learning_rate": 0.0005872590384700979, "loss": 3.1625, "step": 57 }, { "epoch": 0.003157894736842105, "grad_norm": 0.24152350425720215, "learning_rate": 0.0005865716319988223, "loss": 3.1059, "step": 58 }, { "epoch": 0.0032123411978221415, "grad_norm": 0.20929178595542908, "learning_rate": 0.000585867033004079, "loss": 3.1307, "step": 59 }, { "epoch": 0.003266787658802178, "grad_norm": 0.2642001807689667, "learning_rate": 0.0005851453301853628, "loss": 3.1134, "step": 60 }, { "epoch": 0.0033212341197822143, "grad_norm": 0.24187405407428741, "learning_rate": 0.0005844066143953087, "loss": 3.1523, "step": 61 }, { "epoch": 0.0033756805807622506, "grad_norm": 0.2009871006011963, "learning_rate": 0.0005836509786282552, "loss": 3.1294, "step": 62 }, { "epoch": 0.0034301270417422865, "grad_norm": 0.25516337156295776, "learning_rate": 0.000582878518008537, "loss": 3.1309, "step": 63 }, { "epoch": 0.003484573502722323, "grad_norm": 0.22489990293979645, "learning_rate": 0.0005820893297785106, "loss": 3.1414, "step": 64 }, { "epoch": 0.0035390199637023593, "grad_norm": 0.22305169701576233, "learning_rate": 0.000581283513286313, "loss": 3.1573, "step": 65 }, { "epoch": 0.0035934664246823956, "grad_norm": 0.21161314845085144, "learning_rate": 0.0005804611699733543, "loss": 3.1103, "step": 66 }, { "epoch": 0.003647912885662432, "grad_norm": 0.2384558469057083, "learning_rate": 0.0005796224033615482, "loss": 3.1364, "step": 67 }, { "epoch": 0.0037023593466424683, "grad_norm": 0.2701220214366913, "learning_rate": 0.0005787673190402799, "loss": 3.1369, "step": 68 }, { "epoch": 0.0037568058076225047, "grad_norm": 0.19185633957386017, "learning_rate": 0.0005778960246531138, "loss": 3.1247, "step": 69 }, { "epoch": 0.003811252268602541, "grad_norm": 0.3245127499103546, "learning_rate": 0.0005770086298842426, "loss": 3.126, "step": 70 }, { "epoch": 0.003865698729582577, "grad_norm": 0.281764417886734, "learning_rate": 0.0005761052464446795, "loss": 3.1295, "step": 71 }, { "epoch": 0.003920145190562613, "grad_norm": 0.24820677936077118, "learning_rate": 0.0005751859880581954, "loss": 3.1337, "step": 72 }, { "epoch": 0.00397459165154265, "grad_norm": 0.24963483214378357, "learning_rate": 0.0005742509704470024, "loss": 3.119, "step": 73 }, { "epoch": 0.004029038112522686, "grad_norm": 0.264715313911438, "learning_rate": 0.0005733003113171864, "loss": 3.1203, "step": 74 }, { "epoch": 0.004083484573502722, "grad_norm": 0.27801015973091125, "learning_rate": 0.000572334130343889, "loss": 3.1531, "step": 75 }, { "epoch": 0.004137931034482759, "grad_norm": 0.21041367948055267, "learning_rate": 0.0005713525491562421, "loss": 3.1208, "step": 76 }, { "epoch": 0.004192377495462795, "grad_norm": 0.22113123536109924, "learning_rate": 0.0005703556913220566, "loss": 3.1209, "step": 77 }, { "epoch": 0.0042468239564428314, "grad_norm": 0.22394251823425293, "learning_rate": 0.0005693436823322671, "loss": 3.1101, "step": 78 }, { "epoch": 0.004301270417422867, "grad_norm": 0.26046207547187805, "learning_rate": 0.0005683166495851336, "loss": 3.0935, "step": 79 }, { "epoch": 0.004355716878402904, "grad_norm": 0.19576989114284515, "learning_rate": 0.0005672747223702044, "loss": 3.1381, "step": 80 }, { "epoch": 0.00441016333938294, "grad_norm": 0.22762754559516907, "learning_rate": 0.0005662180318520402, "loss": 3.1117, "step": 81 }, { "epoch": 0.004464609800362976, "grad_norm": 0.218556210398674, "learning_rate": 0.0005651467110537016, "loss": 3.1164, "step": 82 }, { "epoch": 0.004519056261343013, "grad_norm": 0.23464246094226837, "learning_rate": 0.0005640608948400046, "loss": 3.1533, "step": 83 }, { "epoch": 0.004573502722323049, "grad_norm": 0.23230504989624023, "learning_rate": 0.0005629607199005416, "loss": 3.128, "step": 84 }, { "epoch": 0.0046279491833030855, "grad_norm": 0.2193627804517746, "learning_rate": 0.0005618463247324748, "loss": 3.1162, "step": 85 }, { "epoch": 0.004682395644283121, "grad_norm": 0.22908492386341095, "learning_rate": 0.0005607178496231011, "loss": 3.1198, "step": 86 }, { "epoch": 0.004736842105263158, "grad_norm": 0.2191692590713501, "learning_rate": 0.0005595754366321915, "loss": 3.1135, "step": 87 }, { "epoch": 0.004791288566243194, "grad_norm": 0.21760721504688263, "learning_rate": 0.0005584192295741086, "loss": 3.112, "step": 88 }, { "epoch": 0.00484573502722323, "grad_norm": 0.27237263321876526, "learning_rate": 0.0005572493739997012, "loss": 3.1167, "step": 89 }, { "epoch": 0.004900181488203267, "grad_norm": 0.2498655915260315, "learning_rate": 0.000556066017177982, "loss": 3.119, "step": 90 }, { "epoch": 0.004954627949183303, "grad_norm": 0.23274663090705872, "learning_rate": 0.0005548693080775885, "loss": 3.168, "step": 91 }, { "epoch": 0.00500907441016334, "grad_norm": 0.22452446818351746, "learning_rate": 0.0005536593973480297, "loss": 3.1238, "step": 92 }, { "epoch": 0.0050635208711433755, "grad_norm": 0.2021019011735916, "learning_rate": 0.000552436437300721, "loss": 3.1363, "step": 93 }, { "epoch": 0.005117967332123412, "grad_norm": 0.20027242600917816, "learning_rate": 0.0005512005818898111, "loss": 3.1075, "step": 94 }, { "epoch": 0.005172413793103448, "grad_norm": 0.20116809010505676, "learning_rate": 0.0005499519866928005, "loss": 3.1137, "step": 95 }, { "epoch": 0.005226860254083485, "grad_norm": 0.29089683294296265, "learning_rate": 0.0005486908088909568, "loss": 3.1435, "step": 96 }, { "epoch": 0.005281306715063521, "grad_norm": 0.26022160053253174, "learning_rate": 0.0005474172072495275, "loss": 3.1348, "step": 97 }, { "epoch": 0.005335753176043557, "grad_norm": 0.25501441955566406, "learning_rate": 0.0005461313420977536, "loss": 3.1243, "step": 98 }, { "epoch": 0.005390199637023594, "grad_norm": 0.2464355230331421, "learning_rate": 0.0005448333753086864, "loss": 3.1329, "step": 99 }, { "epoch": 0.0054446460980036296, "grad_norm": 0.25668638944625854, "learning_rate": 0.00054352347027881, "loss": 3.0924, "step": 100 }, { "epoch": 0.0054446460980036296, "eval_loss": 3.0220067501068115, "eval_runtime": 69.1667, "eval_samples_per_second": 62.385, "eval_steps_per_second": 15.6, "step": 100 }, { "epoch": 0.005499092558983666, "grad_norm": 0.25321197509765625, "learning_rate": 0.0005422017919074715, "loss": 3.1289, "step": 101 }, { "epoch": 0.005553539019963702, "grad_norm": 0.21445196866989136, "learning_rate": 0.0005408685065761229, "loss": 3.123, "step": 102 }, { "epoch": 0.005607985480943739, "grad_norm": 0.227112278342247, "learning_rate": 0.0005395237821273755, "loss": 3.1302, "step": 103 }, { "epoch": 0.005662431941923775, "grad_norm": 0.22493597865104675, "learning_rate": 0.000538167787843871, "loss": 3.1471, "step": 104 }, { "epoch": 0.005716878402903811, "grad_norm": 0.24361659586429596, "learning_rate": 0.0005368006944269708, "loss": 3.1271, "step": 105 }, { "epoch": 0.005771324863883848, "grad_norm": 0.2216707319021225, "learning_rate": 0.0005354226739752678, "loss": 3.1156, "step": 106 }, { "epoch": 0.005825771324863884, "grad_norm": 0.2140239179134369, "learning_rate": 0.0005340338999629203, "loss": 3.1071, "step": 107 }, { "epoch": 0.00588021778584392, "grad_norm": 0.22374412417411804, "learning_rate": 0.0005326345472178154, "loss": 3.1244, "step": 108 }, { "epoch": 0.005934664246823956, "grad_norm": 0.21664555370807648, "learning_rate": 0.0005312247918995588, "loss": 3.1254, "step": 109 }, { "epoch": 0.005989110707803993, "grad_norm": 0.21399718523025513, "learning_rate": 0.0005298048114773004, "loss": 3.119, "step": 110 }, { "epoch": 0.006043557168784029, "grad_norm": 0.18971006572246552, "learning_rate": 0.0005283747847073922, "loss": 3.1254, "step": 111 }, { "epoch": 0.006098003629764065, "grad_norm": 0.1922215223312378, "learning_rate": 0.0005269348916108859, "loss": 3.1016, "step": 112 }, { "epoch": 0.006152450090744102, "grad_norm": 0.21366891264915466, "learning_rate": 0.00052548531345087, "loss": 3.1192, "step": 113 }, { "epoch": 0.006206896551724138, "grad_norm": 0.1934356689453125, "learning_rate": 0.000524026232709652, "loss": 3.1254, "step": 114 }, { "epoch": 0.0062613430127041745, "grad_norm": 0.23197585344314575, "learning_rate": 0.0005225578330657859, "loss": 3.1503, "step": 115 }, { "epoch": 0.00631578947368421, "grad_norm": 0.227361798286438, "learning_rate": 0.0005210802993709497, "loss": 3.112, "step": 116 }, { "epoch": 0.006370235934664247, "grad_norm": 0.25826945900917053, "learning_rate": 0.0005195938176266751, "loss": 3.1405, "step": 117 }, { "epoch": 0.006424682395644283, "grad_norm": 0.2799200117588043, "learning_rate": 0.000518098574960932, "loss": 3.1425, "step": 118 }, { "epoch": 0.006479128856624319, "grad_norm": 0.24563385546207428, "learning_rate": 0.0005165947596045723, "loss": 3.1573, "step": 119 }, { "epoch": 0.006533575317604356, "grad_norm": 0.2642490267753601, "learning_rate": 0.0005150825608676336, "loss": 3.1267, "step": 120 }, { "epoch": 0.006588021778584392, "grad_norm": 0.2301069051027298, "learning_rate": 0.0005135621691155083, "loss": 3.1151, "step": 121 }, { "epoch": 0.0066424682395644285, "grad_norm": 0.22567883133888245, "learning_rate": 0.0005120337757449781, "loss": 3.1465, "step": 122 }, { "epoch": 0.0066969147005444644, "grad_norm": 0.23506537079811096, "learning_rate": 0.0005104975731601208, "loss": 3.1196, "step": 123 }, { "epoch": 0.006751361161524501, "grad_norm": 0.20675747096538544, "learning_rate": 0.0005089537547480885, "loss": 3.1063, "step": 124 }, { "epoch": 0.006805807622504537, "grad_norm": 0.20474228262901306, "learning_rate": 0.0005074025148547634, "loss": 3.1211, "step": 125 }, { "epoch": 0.006860254083484573, "grad_norm": 0.21194572746753693, "learning_rate": 0.0005058440487602918, "loss": 3.1456, "step": 126 }, { "epoch": 0.00691470054446461, "grad_norm": 0.20788611471652985, "learning_rate": 0.0005042785526545008, "loss": 3.1353, "step": 127 }, { "epoch": 0.006969147005444646, "grad_norm": 0.19835753738880157, "learning_rate": 0.0005027062236122014, "loss": 3.0889, "step": 128 }, { "epoch": 0.007023593466424683, "grad_norm": 0.24613966047763824, "learning_rate": 0.0005011272595683787, "loss": 3.1023, "step": 129 }, { "epoch": 0.0070780399274047185, "grad_norm": 0.18778769671916962, "learning_rate": 0.000499541859293275, "loss": 3.1091, "step": 130 }, { "epoch": 0.007132486388384755, "grad_norm": 0.22894078493118286, "learning_rate": 0.0004979502223673672, "loss": 3.0836, "step": 131 }, { "epoch": 0.007186932849364791, "grad_norm": 0.2452639937400818, "learning_rate": 0.0004963525491562421, "loss": 3.1285, "step": 132 }, { "epoch": 0.007241379310344828, "grad_norm": 0.26903706789016724, "learning_rate": 0.0004947490407853734, "loss": 3.135, "step": 133 }, { "epoch": 0.007295825771324864, "grad_norm": 0.29524174332618713, "learning_rate": 0.0004931398991148025, "loss": 3.1204, "step": 134 }, { "epoch": 0.0073502722323049, "grad_norm": 0.1946076601743698, "learning_rate": 0.0004915253267137274, "loss": 3.1482, "step": 135 }, { "epoch": 0.007404718693284937, "grad_norm": 0.3097498416900635, "learning_rate": 0.0004899055268350012, "loss": 3.1188, "step": 136 }, { "epoch": 0.0074591651542649726, "grad_norm": 0.23407086730003357, "learning_rate": 0.0004882807033895463, "loss": 3.1448, "step": 137 }, { "epoch": 0.007513611615245009, "grad_norm": 0.22323715686798096, "learning_rate": 0.0004866510609206841, "loss": 3.096, "step": 138 }, { "epoch": 0.007568058076225045, "grad_norm": 0.23807968199253082, "learning_rate": 0.0004850168045783858, "loss": 3.1348, "step": 139 }, { "epoch": 0.007622504537205082, "grad_norm": 0.2817918658256531, "learning_rate": 0.0004833781400934471, "loss": 3.1215, "step": 140 }, { "epoch": 0.007676950998185118, "grad_norm": 0.25021976232528687, "learning_rate": 0.00048173527375158944, "loss": 3.1019, "step": 141 }, { "epoch": 0.007731397459165154, "grad_norm": 0.25827258825302124, "learning_rate": 0.00048008841236749084, "loss": 3.1155, "step": 142 }, { "epoch": 0.007785843920145191, "grad_norm": 0.2191617339849472, "learning_rate": 0.00047843776325875173, "loss": 3.1183, "step": 143 }, { "epoch": 0.007840290381125227, "grad_norm": 0.2649674415588379, "learning_rate": 0.0004767835342197954, "loss": 3.1098, "step": 144 }, { "epoch": 0.007894736842105263, "grad_norm": 0.2287340611219406, "learning_rate": 0.00047512593349571043, "loss": 3.1004, "step": 145 }, { "epoch": 0.0079491833030853, "grad_norm": 0.21950604021549225, "learning_rate": 0.00047346516975603465, "loss": 3.0733, "step": 146 }, { "epoch": 0.008003629764065335, "grad_norm": 0.2002648264169693, "learning_rate": 0.00047180145206848686, "loss": 3.0934, "step": 147 }, { "epoch": 0.008058076225045372, "grad_norm": 0.20886124670505524, "learning_rate": 0.0004701349898726483, "loss": 3.1007, "step": 148 }, { "epoch": 0.008112522686025409, "grad_norm": 0.19583792984485626, "learning_rate": 0.00046846599295359635, "loss": 3.1249, "step": 149 }, { "epoch": 0.008166969147005444, "grad_norm": 0.18620361387729645, "learning_rate": 0.00046679467141549615, "loss": 3.1514, "step": 150 }, { "epoch": 0.00822141560798548, "grad_norm": 0.2266155332326889, "learning_rate": 0.00046512123565515065, "loss": 3.1583, "step": 151 }, { "epoch": 0.008275862068965517, "grad_norm": 0.18331050872802734, "learning_rate": 0.00046344589633551497, "loss": 3.1015, "step": 152 }, { "epoch": 0.008330308529945554, "grad_norm": 0.23021604120731354, "learning_rate": 0.00046176886435917667, "loss": 3.0984, "step": 153 }, { "epoch": 0.00838475499092559, "grad_norm": 0.23210260272026062, "learning_rate": 0.00046009035084180593, "loss": 3.1239, "step": 154 }, { "epoch": 0.008439201451905626, "grad_norm": 0.2083989381790161, "learning_rate": 0.0004584105670855787, "loss": 3.0929, "step": 155 }, { "epoch": 0.008493647912885663, "grad_norm": 0.22709086537361145, "learning_rate": 0.00045672972455257723, "loss": 3.1376, "step": 156 }, { "epoch": 0.008548094373865698, "grad_norm": 0.24739298224449158, "learning_rate": 0.0004550480348381691, "loss": 3.1135, "step": 157 }, { "epoch": 0.008602540834845735, "grad_norm": 0.2940455973148346, "learning_rate": 0.0004533657096443708, "loss": 3.1164, "step": 158 }, { "epoch": 0.008656987295825772, "grad_norm": 0.2183440625667572, "learning_rate": 0.00045168296075319685, "loss": 3.1496, "step": 159 }, { "epoch": 0.008711433756805808, "grad_norm": 0.254537969827652, "learning_rate": 0.00045, "loss": 3.1164, "step": 160 }, { "epoch": 0.008765880217785843, "grad_norm": 0.3475794792175293, "learning_rate": 0.00044831703924680307, "loss": 3.1241, "step": 161 }, { "epoch": 0.00882032667876588, "grad_norm": 0.25378546118736267, "learning_rate": 0.00044663429035562925, "loss": 3.0785, "step": 162 }, { "epoch": 0.008874773139745917, "grad_norm": 0.3237019181251526, "learning_rate": 0.0004449519651618309, "loss": 3.1247, "step": 163 }, { "epoch": 0.008929219600725952, "grad_norm": 0.2086883783340454, "learning_rate": 0.0004432702754474228, "loss": 3.1397, "step": 164 }, { "epoch": 0.008983666061705989, "grad_norm": 0.2567574083805084, "learning_rate": 0.0004415894329144212, "loss": 3.1443, "step": 165 }, { "epoch": 0.009038112522686026, "grad_norm": 0.1955418884754181, "learning_rate": 0.000439909649158194, "loss": 3.1298, "step": 166 }, { "epoch": 0.009092558983666062, "grad_norm": 0.23903286457061768, "learning_rate": 0.00043823113564082325, "loss": 3.1061, "step": 167 }, { "epoch": 0.009147005444646097, "grad_norm": 0.18639686703681946, "learning_rate": 0.00043655410366448495, "loss": 3.1364, "step": 168 }, { "epoch": 0.009201451905626134, "grad_norm": 0.28311777114868164, "learning_rate": 0.0004348787643448493, "loss": 3.1165, "step": 169 }, { "epoch": 0.009255898366606171, "grad_norm": 0.21376436948776245, "learning_rate": 0.0004332053285845038, "loss": 3.1332, "step": 170 }, { "epoch": 0.009310344827586206, "grad_norm": 0.25171706080436707, "learning_rate": 0.0004315340070464036, "loss": 3.0928, "step": 171 }, { "epoch": 0.009364791288566243, "grad_norm": 0.2478170096874237, "learning_rate": 0.0004298650101273517, "loss": 3.1146, "step": 172 }, { "epoch": 0.00941923774954628, "grad_norm": 0.2905559837818146, "learning_rate": 0.0004281985479315131, "loss": 3.1248, "step": 173 }, { "epoch": 0.009473684210526316, "grad_norm": 0.23062513768672943, "learning_rate": 0.00042653483024396527, "loss": 3.0908, "step": 174 }, { "epoch": 0.009528130671506351, "grad_norm": 0.21542707085609436, "learning_rate": 0.0004248740665042895, "loss": 3.116, "step": 175 }, { "epoch": 0.009582577132486388, "grad_norm": 0.2628575563430786, "learning_rate": 0.0004232164657802045, "loss": 3.0974, "step": 176 }, { "epoch": 0.009637023593466425, "grad_norm": 0.2287788689136505, "learning_rate": 0.0004215622367412482, "loss": 3.1229, "step": 177 }, { "epoch": 0.00969147005444646, "grad_norm": 0.23420408368110657, "learning_rate": 0.0004199115876325091, "loss": 3.1408, "step": 178 }, { "epoch": 0.009745916515426497, "grad_norm": 0.19503605365753174, "learning_rate": 0.0004182647262484106, "loss": 3.104, "step": 179 }, { "epoch": 0.009800362976406534, "grad_norm": 0.2603705823421478, "learning_rate": 0.0004166218599065528, "loss": 3.1185, "step": 180 }, { "epoch": 0.00985480943738657, "grad_norm": 0.20003163814544678, "learning_rate": 0.0004149831954216142, "loss": 3.1248, "step": 181 }, { "epoch": 0.009909255898366606, "grad_norm": 0.2520509660243988, "learning_rate": 0.00041334893907931584, "loss": 3.1053, "step": 182 }, { "epoch": 0.009963702359346642, "grad_norm": 0.2309730499982834, "learning_rate": 0.0004117192966104536, "loss": 3.0824, "step": 183 }, { "epoch": 0.01001814882032668, "grad_norm": 0.24436035752296448, "learning_rate": 0.0004100944731649987, "loss": 3.1365, "step": 184 }, { "epoch": 0.010072595281306716, "grad_norm": 0.1982448399066925, "learning_rate": 0.0004084746732862726, "loss": 3.1232, "step": 185 }, { "epoch": 0.010127041742286751, "grad_norm": 0.28294622898101807, "learning_rate": 0.0004068601008851974, "loss": 3.1373, "step": 186 }, { "epoch": 0.010181488203266788, "grad_norm": 0.19457624852657318, "learning_rate": 0.0004052509592146266, "loss": 3.1108, "step": 187 }, { "epoch": 0.010235934664246825, "grad_norm": 0.20812362432479858, "learning_rate": 0.00040364745084375787, "loss": 3.1269, "step": 188 }, { "epoch": 0.01029038112522686, "grad_norm": 0.26139721274375916, "learning_rate": 0.0004020497776326328, "loss": 3.1027, "step": 189 }, { "epoch": 0.010344827586206896, "grad_norm": 0.24709898233413696, "learning_rate": 0.00040045814070672494, "loss": 3.1102, "step": 190 }, { "epoch": 0.010399274047186933, "grad_norm": 0.20621921122074127, "learning_rate": 0.0003988727404316212, "loss": 3.1248, "step": 191 }, { "epoch": 0.01045372050816697, "grad_norm": 0.21720059216022491, "learning_rate": 0.00039729377638779857, "loss": 3.1187, "step": 192 }, { "epoch": 0.010508166969147005, "grad_norm": 0.1787269115447998, "learning_rate": 0.0003957214473454991, "loss": 3.1032, "step": 193 }, { "epoch": 0.010562613430127042, "grad_norm": 0.22008821368217468, "learning_rate": 0.00039415595123970813, "loss": 3.1438, "step": 194 }, { "epoch": 0.010617059891107079, "grad_norm": 0.2438061535358429, "learning_rate": 0.00039259748514523655, "loss": 3.1444, "step": 195 }, { "epoch": 0.010671506352087114, "grad_norm": 0.21327753365039825, "learning_rate": 0.0003910462452519114, "loss": 3.1147, "step": 196 }, { "epoch": 0.01072595281306715, "grad_norm": 0.22642219066619873, "learning_rate": 0.0003895024268398792, "loss": 3.1221, "step": 197 }, { "epoch": 0.010780399274047187, "grad_norm": 0.1771782636642456, "learning_rate": 0.00038796622425502195, "loss": 3.1153, "step": 198 }, { "epoch": 0.010834845735027224, "grad_norm": 0.25223565101623535, "learning_rate": 0.00038643783088449163, "loss": 3.1249, "step": 199 }, { "epoch": 0.010889292196007259, "grad_norm": 0.15892428159713745, "learning_rate": 0.00038491743913236624, "loss": 3.1273, "step": 200 }, { "epoch": 0.010889292196007259, "eval_loss": 3.0130114555358887, "eval_runtime": 69.209, "eval_samples_per_second": 62.347, "eval_steps_per_second": 15.59, "step": 200 }, { "epoch": 0.010943738656987296, "grad_norm": 0.2665696144104004, "learning_rate": 0.0003834052403954277, "loss": 3.1162, "step": 201 }, { "epoch": 0.010998185117967333, "grad_norm": 0.2027243971824646, "learning_rate": 0.00038190142503906794, "loss": 3.0788, "step": 202 }, { "epoch": 0.011052631578947368, "grad_norm": 0.2640676498413086, "learning_rate": 0.00038040618237332485, "loss": 3.0978, "step": 203 }, { "epoch": 0.011107078039927405, "grad_norm": 0.21408383548259735, "learning_rate": 0.0003789197006290502, "loss": 3.1155, "step": 204 }, { "epoch": 0.011161524500907441, "grad_norm": 0.21028189361095428, "learning_rate": 0.00037744216693421403, "loss": 3.0896, "step": 205 }, { "epoch": 0.011215970961887478, "grad_norm": 0.183872789144516, "learning_rate": 0.00037597376729034794, "loss": 3.1179, "step": 206 }, { "epoch": 0.011270417422867513, "grad_norm": 0.24802212417125702, "learning_rate": 0.00037451468654912994, "loss": 3.1283, "step": 207 }, { "epoch": 0.01132486388384755, "grad_norm": 0.20964276790618896, "learning_rate": 0.00037306510838911404, "loss": 3.0941, "step": 208 }, { "epoch": 0.011379310344827587, "grad_norm": 0.2240600436925888, "learning_rate": 0.00037162521529260763, "loss": 3.1226, "step": 209 }, { "epoch": 0.011433756805807622, "grad_norm": 0.23930035531520844, "learning_rate": 0.00037019518852269954, "loss": 3.111, "step": 210 }, { "epoch": 0.011488203266787659, "grad_norm": 0.21928225457668304, "learning_rate": 0.0003687752081004411, "loss": 3.1014, "step": 211 }, { "epoch": 0.011542649727767695, "grad_norm": 0.24399928748607635, "learning_rate": 0.0003673654527821846, "loss": 3.1345, "step": 212 }, { "epoch": 0.011597096188747732, "grad_norm": 0.23558929562568665, "learning_rate": 0.00036596610003707954, "loss": 3.0852, "step": 213 }, { "epoch": 0.011651542649727767, "grad_norm": 0.19627036154270172, "learning_rate": 0.00036457732602473216, "loss": 3.1098, "step": 214 }, { "epoch": 0.011705989110707804, "grad_norm": 0.26581332087516785, "learning_rate": 0.0003631993055730291, "loss": 3.1467, "step": 215 }, { "epoch": 0.01176043557168784, "grad_norm": 0.2310531884431839, "learning_rate": 0.000361832212156129, "loss": 3.1242, "step": 216 }, { "epoch": 0.011814882032667876, "grad_norm": 0.2427210807800293, "learning_rate": 0.00036047621787262444, "loss": 3.1144, "step": 217 }, { "epoch": 0.011869328493647913, "grad_norm": 0.2132464200258255, "learning_rate": 0.000359131493423877, "loss": 3.1184, "step": 218 }, { "epoch": 0.01192377495462795, "grad_norm": 0.2877713441848755, "learning_rate": 0.0003577982080925284, "loss": 3.0916, "step": 219 }, { "epoch": 0.011978221415607986, "grad_norm": 0.20918139815330505, "learning_rate": 0.00035647652972119, "loss": 3.1299, "step": 220 }, { "epoch": 0.012032667876588021, "grad_norm": 0.23674660921096802, "learning_rate": 0.00035516662469131356, "loss": 3.1292, "step": 221 }, { "epoch": 0.012087114337568058, "grad_norm": 0.1943867802619934, "learning_rate": 0.0003538686579022464, "loss": 3.1258, "step": 222 }, { "epoch": 0.012141560798548095, "grad_norm": 0.34807291626930237, "learning_rate": 0.00035258279275047246, "loss": 3.1328, "step": 223 }, { "epoch": 0.01219600725952813, "grad_norm": 0.20535360276699066, "learning_rate": 0.0003513091911090431, "loss": 3.1184, "step": 224 }, { "epoch": 0.012250453720508167, "grad_norm": 0.26084408164024353, "learning_rate": 0.00035004801330719936, "loss": 3.1014, "step": 225 }, { "epoch": 0.012304900181488203, "grad_norm": 0.2201356142759323, "learning_rate": 0.0003487994181101888, "loss": 3.0941, "step": 226 }, { "epoch": 0.01235934664246824, "grad_norm": 0.20090270042419434, "learning_rate": 0.00034756356269927894, "loss": 3.093, "step": 227 }, { "epoch": 0.012413793103448275, "grad_norm": 0.22596506774425507, "learning_rate": 0.00034634060265197026, "loss": 3.1011, "step": 228 }, { "epoch": 0.012468239564428312, "grad_norm": 0.2359432578086853, "learning_rate": 0.00034513069192241137, "loss": 3.1118, "step": 229 }, { "epoch": 0.012522686025408349, "grad_norm": 0.2078094184398651, "learning_rate": 0.0003439339828220179, "loss": 3.0898, "step": 230 }, { "epoch": 0.012577132486388384, "grad_norm": 0.19436654448509216, "learning_rate": 0.00034275062600029865, "loss": 3.0997, "step": 231 }, { "epoch": 0.01263157894736842, "grad_norm": 0.1749400496482849, "learning_rate": 0.0003415807704258913, "loss": 3.1013, "step": 232 }, { "epoch": 0.012686025408348458, "grad_norm": 0.31390950083732605, "learning_rate": 0.00034042456336780833, "loss": 3.1088, "step": 233 }, { "epoch": 0.012740471869328494, "grad_norm": 0.21334995329380035, "learning_rate": 0.00033928215037689886, "loss": 3.1367, "step": 234 }, { "epoch": 0.01279491833030853, "grad_norm": 0.2765377461910248, "learning_rate": 0.00033815367526752516, "loss": 3.0837, "step": 235 }, { "epoch": 0.012849364791288566, "grad_norm": 0.21292102336883545, "learning_rate": 0.0003370392800994583, "loss": 3.116, "step": 236 }, { "epoch": 0.012903811252268603, "grad_norm": 0.22039784491062164, "learning_rate": 0.0003359391051599953, "loss": 3.1287, "step": 237 }, { "epoch": 0.012958257713248638, "grad_norm": 0.31274768710136414, "learning_rate": 0.0003348532889462983, "loss": 3.103, "step": 238 }, { "epoch": 0.013012704174228675, "grad_norm": 0.2530403435230255, "learning_rate": 0.00033378196814795987, "loss": 3.1088, "step": 239 }, { "epoch": 0.013067150635208712, "grad_norm": 0.25343602895736694, "learning_rate": 0.0003327252776297955, "loss": 3.0815, "step": 240 }, { "epoch": 0.013121597096188748, "grad_norm": 0.2652718722820282, "learning_rate": 0.0003316833504148663, "loss": 3.1188, "step": 241 }, { "epoch": 0.013176043557168783, "grad_norm": 0.2119700163602829, "learning_rate": 0.0003306563176677328, "loss": 3.1262, "step": 242 }, { "epoch": 0.01323049001814882, "grad_norm": 0.2630724310874939, "learning_rate": 0.00032964430867794326, "loss": 3.1183, "step": 243 }, { "epoch": 0.013284936479128857, "grad_norm": 0.2809687554836273, "learning_rate": 0.00032864745084375783, "loss": 3.1087, "step": 244 }, { "epoch": 0.013339382940108892, "grad_norm": 0.24847863614559174, "learning_rate": 0.00032766586965611095, "loss": 3.1053, "step": 245 }, { "epoch": 0.013393829401088929, "grad_norm": 0.2698868215084076, "learning_rate": 0.00032669968868281353, "loss": 3.0977, "step": 246 }, { "epoch": 0.013448275862068966, "grad_norm": 0.33301204442977905, "learning_rate": 0.0003257490295529975, "loss": 3.098, "step": 247 }, { "epoch": 0.013502722323049002, "grad_norm": 0.2521134912967682, "learning_rate": 0.0003248140119418046, "loss": 3.0924, "step": 248 }, { "epoch": 0.013557168784029038, "grad_norm": 0.22425313293933868, "learning_rate": 0.00032389475355532044, "loss": 3.0911, "step": 249 }, { "epoch": 0.013611615245009074, "grad_norm": 0.2841155230998993, "learning_rate": 0.00032299137011575734, "loss": 3.1019, "step": 250 }, { "epoch": 0.013666061705989111, "grad_norm": 0.19375889003276825, "learning_rate": 0.00032210397534688617, "loss": 3.1183, "step": 251 }, { "epoch": 0.013720508166969146, "grad_norm": 0.26036033034324646, "learning_rate": 0.00032123268095972005, "loss": 3.1296, "step": 252 }, { "epoch": 0.013774954627949183, "grad_norm": 0.24063162505626678, "learning_rate": 0.0003203775966384518, "loss": 3.142, "step": 253 }, { "epoch": 0.01382940108892922, "grad_norm": 0.30820685625076294, "learning_rate": 0.0003195388300266457, "loss": 3.0998, "step": 254 }, { "epoch": 0.013883847549909257, "grad_norm": 0.18717393279075623, "learning_rate": 0.0003187164867136869, "loss": 3.1363, "step": 255 }, { "epoch": 0.013938294010889292, "grad_norm": 0.22616641223430634, "learning_rate": 0.0003179106702214893, "loss": 3.1284, "step": 256 }, { "epoch": 0.013992740471869328, "grad_norm": 0.19695289433002472, "learning_rate": 0.000317121481991463, "loss": 3.0859, "step": 257 }, { "epoch": 0.014047186932849365, "grad_norm": 0.24756500124931335, "learning_rate": 0.0003163490213717448, "loss": 3.1187, "step": 258 }, { "epoch": 0.014101633393829402, "grad_norm": 0.2162931114435196, "learning_rate": 0.00031559338560469116, "loss": 3.1066, "step": 259 }, { "epoch": 0.014156079854809437, "grad_norm": 0.2389383465051651, "learning_rate": 0.0003148546698146371, "loss": 3.1096, "step": 260 }, { "epoch": 0.014210526315789474, "grad_norm": 0.1939517855644226, "learning_rate": 0.0003141329669959209, "loss": 3.115, "step": 261 }, { "epoch": 0.01426497277676951, "grad_norm": 0.24441573023796082, "learning_rate": 0.00031342836800117763, "loss": 3.0954, "step": 262 }, { "epoch": 0.014319419237749546, "grad_norm": 0.31514814496040344, "learning_rate": 0.000312740961529902, "loss": 3.1259, "step": 263 }, { "epoch": 0.014373865698729582, "grad_norm": 0.28324607014656067, "learning_rate": 0.00031207083411728236, "loss": 3.1088, "step": 264 }, { "epoch": 0.01442831215970962, "grad_norm": 0.21894101798534393, "learning_rate": 0.00031141807012330695, "loss": 3.1071, "step": 265 }, { "epoch": 0.014482758620689656, "grad_norm": 0.32468879222869873, "learning_rate": 0.0003107827517221441, "loss": 3.1159, "step": 266 }, { "epoch": 0.014537205081669691, "grad_norm": 0.2448977530002594, "learning_rate": 0.00031016495889179787, "loss": 3.1091, "step": 267 }, { "epoch": 0.014591651542649728, "grad_norm": 0.3004207909107208, "learning_rate": 0.0003095647694040394, "loss": 3.1091, "step": 268 }, { "epoch": 0.014646098003629765, "grad_norm": 0.2677028179168701, "learning_rate": 0.0003089822588146171, "loss": 3.1069, "step": 269 }, { "epoch": 0.0147005444646098, "grad_norm": 0.26184970140457153, "learning_rate": 0.0003084175004537448, "loss": 3.1272, "step": 270 }, { "epoch": 0.014754990925589836, "grad_norm": 0.2534137964248657, "learning_rate": 0.0003078705654168706, "loss": 3.0968, "step": 271 }, { "epoch": 0.014809437386569873, "grad_norm": 0.34196603298187256, "learning_rate": 0.0003073415225557269, "loss": 3.0834, "step": 272 }, { "epoch": 0.01486388384754991, "grad_norm": 0.20533894002437592, "learning_rate": 0.0003068304384696629, "loss": 3.11, "step": 273 }, { "epoch": 0.014918330308529945, "grad_norm": 0.2631680369377136, "learning_rate": 0.00030633737749726045, "loss": 3.0984, "step": 274 }, { "epoch": 0.014972776769509982, "grad_norm": 0.24283327162265778, "learning_rate": 0.0003058624017082351, "loss": 3.1158, "step": 275 }, { "epoch": 0.015027223230490019, "grad_norm": 0.23213128745555878, "learning_rate": 0.000305405570895622, "loss": 3.1186, "step": 276 }, { "epoch": 0.015081669691470054, "grad_norm": 0.2027149498462677, "learning_rate": 0.00030496694256824903, "loss": 3.0893, "step": 277 }, { "epoch": 0.01513611615245009, "grad_norm": 0.3051553964614868, "learning_rate": 0.00030454657194349695, "loss": 3.1062, "step": 278 }, { "epoch": 0.015190562613430127, "grad_norm": 0.28350913524627686, "learning_rate": 0.00030414451194034846, "loss": 3.1115, "step": 279 }, { "epoch": 0.015245009074410164, "grad_norm": 0.28886678814888, "learning_rate": 0.00030376081317272645, "loss": 3.1113, "step": 280 }, { "epoch": 0.0152994555353902, "grad_norm": 0.24702845513820648, "learning_rate": 0.0003033955239431221, "loss": 3.092, "step": 281 }, { "epoch": 0.015353901996370236, "grad_norm": 0.20456074178218842, "learning_rate": 0.00030304869023651464, "loss": 3.1087, "step": 282 }, { "epoch": 0.015408348457350273, "grad_norm": 0.19598916172981262, "learning_rate": 0.0003027203557145822, "loss": 3.1357, "step": 283 }, { "epoch": 0.015462794918330308, "grad_norm": 0.33903974294662476, "learning_rate": 0.0003024105617102055, "loss": 3.1097, "step": 284 }, { "epoch": 0.015517241379310345, "grad_norm": 0.22449573874473572, "learning_rate": 0.0003021193472222646, "loss": 3.1204, "step": 285 }, { "epoch": 0.015571687840290381, "grad_norm": 0.28471046686172485, "learning_rate": 0.0003018467489107293, "loss": 3.115, "step": 286 }, { "epoch": 0.015626134301270418, "grad_norm": 0.2602684497833252, "learning_rate": 0.0003015928010920444, "loss": 3.1306, "step": 287 }, { "epoch": 0.015680580762250453, "grad_norm": 0.23596693575382233, "learning_rate": 0.0003013575357348098, "loss": 3.1111, "step": 288 }, { "epoch": 0.01573502722323049, "grad_norm": 0.3110821843147278, "learning_rate": 0.0003011409824557554, "loss": 3.0869, "step": 289 }, { "epoch": 0.015789473684210527, "grad_norm": 0.22817878425121307, "learning_rate": 0.00030094316851601356, "loss": 3.1123, "step": 290 }, { "epoch": 0.015843920145190562, "grad_norm": 0.26526954770088196, "learning_rate": 0.00030076411881768716, "loss": 3.1038, "step": 291 }, { "epoch": 0.0158983666061706, "grad_norm": 0.24120832979679108, "learning_rate": 0.0003006038559007141, "loss": 3.0763, "step": 292 }, { "epoch": 0.015952813067150635, "grad_norm": 0.2750589847564697, "learning_rate": 0.0003004623999400308, "loss": 3.1197, "step": 293 }, { "epoch": 0.01600725952813067, "grad_norm": 0.26484569907188416, "learning_rate": 0.0003003397687430316, "loss": 3.0945, "step": 294 }, { "epoch": 0.01606170598911071, "grad_norm": 0.22410178184509277, "learning_rate": 0.0003002359777473275, "loss": 3.1024, "step": 295 }, { "epoch": 0.016116152450090744, "grad_norm": 0.23759888112545013, "learning_rate": 0.00030015104001880274, "loss": 3.1167, "step": 296 }, { "epoch": 0.01617059891107078, "grad_norm": 0.21586690843105316, "learning_rate": 0.00030008496624996995, "loss": 3.0945, "step": 297 }, { "epoch": 0.016225045372050818, "grad_norm": 0.23178769648075104, "learning_rate": 0.00030003776475862396, "loss": 3.095, "step": 298 }, { "epoch": 0.016279491833030853, "grad_norm": 0.24513190984725952, "learning_rate": 0.0003000094414867949, "loss": 3.0728, "step": 299 }, { "epoch": 0.016333938294010888, "grad_norm": 0.20249220728874207, "learning_rate": 0.0003, "loss": 3.0881, "step": 300 }, { "epoch": 0.016333938294010888, "eval_loss": 3.00935697555542, "eval_runtime": 69.1435, "eval_samples_per_second": 62.406, "eval_steps_per_second": 15.605, "step": 300 } ], "logging_steps": 1, "max_steps": 300, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.5252105216e+16, "train_batch_size": 40, "trial_name": null, "trial_params": null }