| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 200, |
| "global_step": 482, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.004149377593360996, |
| "grad_norm": 1.7676318772055255, |
| "learning_rate": 9.999893795201304e-06, |
| "loss": 0.3951, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.008298755186721992, |
| "grad_norm": 2.153938510867046, |
| "learning_rate": 9.999575185316994e-06, |
| "loss": 0.3506, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.012448132780082987, |
| "grad_norm": 1.0866083282976962, |
| "learning_rate": 9.999044183882234e-06, |
| "loss": 0.3212, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.016597510373443983, |
| "grad_norm": 0.931761317507556, |
| "learning_rate": 9.998300813454981e-06, |
| "loss": 0.3017, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.02074688796680498, |
| "grad_norm": 1.1753382039425297, |
| "learning_rate": 9.997345105615042e-06, |
| "loss": 0.2737, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.024896265560165973, |
| "grad_norm": 1.0040359387178692, |
| "learning_rate": 9.996177100962714e-06, |
| "loss": 0.2452, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.029045643153526972, |
| "grad_norm": 0.8719743071167806, |
| "learning_rate": 9.994796849117082e-06, |
| "loss": 0.2552, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.03319502074688797, |
| "grad_norm": 0.8365485517467004, |
| "learning_rate": 9.99320440871389e-06, |
| "loss": 0.265, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.03734439834024896, |
| "grad_norm": 0.8217986811130152, |
| "learning_rate": 9.991399847403066e-06, |
| "loss": 0.2344, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.04149377593360996, |
| "grad_norm": 0.8569379622633062, |
| "learning_rate": 9.98938324184584e-06, |
| "loss": 0.2672, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.04564315352697095, |
| "grad_norm": 0.8207803423650186, |
| "learning_rate": 9.987154677711482e-06, |
| "loss": 0.2361, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.04979253112033195, |
| "grad_norm": 0.7412707898464974, |
| "learning_rate": 9.984714249673676e-06, |
| "loss": 0.2029, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.05394190871369295, |
| "grad_norm": 0.8150005467454026, |
| "learning_rate": 9.982062061406489e-06, |
| "loss": 0.2436, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.058091286307053944, |
| "grad_norm": 0.7451018300789697, |
| "learning_rate": 9.979198225579968e-06, |
| "loss": 0.2176, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.06224066390041494, |
| "grad_norm": 0.739345283205148, |
| "learning_rate": 9.976122863855362e-06, |
| "loss": 0.1967, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.06639004149377593, |
| "grad_norm": 0.7551340094752071, |
| "learning_rate": 9.972836106879936e-06, |
| "loss": 0.2224, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.07053941908713693, |
| "grad_norm": 0.7750691720438508, |
| "learning_rate": 9.969338094281432e-06, |
| "loss": 0.2258, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.07468879668049792, |
| "grad_norm": 0.8345827422112104, |
| "learning_rate": 9.965628974662145e-06, |
| "loss": 0.2352, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.07883817427385892, |
| "grad_norm": 0.8525577544213736, |
| "learning_rate": 9.961708905592594e-06, |
| "loss": 0.2671, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.08298755186721991, |
| "grad_norm": 0.8017727688186173, |
| "learning_rate": 9.957578053604837e-06, |
| "loss": 0.2564, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.08713692946058091, |
| "grad_norm": 0.7485772111626616, |
| "learning_rate": 9.953236594185396e-06, |
| "loss": 0.2095, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.0912863070539419, |
| "grad_norm": 0.7997431251870294, |
| "learning_rate": 9.9486847117678e-06, |
| "loss": 0.2425, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.0954356846473029, |
| "grad_norm": 0.7838444722321428, |
| "learning_rate": 9.943922599724753e-06, |
| "loss": 0.2413, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.0995850622406639, |
| "grad_norm": 0.7870701867938219, |
| "learning_rate": 9.938950460359912e-06, |
| "loss": 0.2038, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.1037344398340249, |
| "grad_norm": 0.7775437643337811, |
| "learning_rate": 9.933768504899305e-06, |
| "loss": 0.1907, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.1078838174273859, |
| "grad_norm": 0.7550159366634545, |
| "learning_rate": 9.928376953482343e-06, |
| "loss": 0.2451, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.11203319502074689, |
| "grad_norm": 0.7099704073140934, |
| "learning_rate": 9.922776035152484e-06, |
| "loss": 0.2072, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.11618257261410789, |
| "grad_norm": 0.7652497801188408, |
| "learning_rate": 9.916965987847485e-06, |
| "loss": 0.1993, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.12033195020746888, |
| "grad_norm": 0.73622015581224, |
| "learning_rate": 9.910947058389309e-06, |
| "loss": 0.2322, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.12448132780082988, |
| "grad_norm": 0.7332583019411074, |
| "learning_rate": 9.904719502473635e-06, |
| "loss": 0.2009, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.12863070539419086, |
| "grad_norm": 0.7930266814325916, |
| "learning_rate": 9.898283584658988e-06, |
| "loss": 0.2256, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.13278008298755187, |
| "grad_norm": 0.808737075078291, |
| "learning_rate": 9.891639578355511e-06, |
| "loss": 0.2382, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.13692946058091288, |
| "grad_norm": 0.7295161025634769, |
| "learning_rate": 9.884787765813348e-06, |
| "loss": 0.1877, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.14107883817427386, |
| "grad_norm": 0.6960480953194169, |
| "learning_rate": 9.877728438110645e-06, |
| "loss": 0.177, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.14522821576763487, |
| "grad_norm": 0.7762118501829516, |
| "learning_rate": 9.870461895141195e-06, |
| "loss": 0.2099, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.14937759336099585, |
| "grad_norm": 0.727269753206247, |
| "learning_rate": 9.86298844560169e-06, |
| "loss": 0.2127, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.15352697095435686, |
| "grad_norm": 0.7138427929408657, |
| "learning_rate": 9.85530840697861e-06, |
| "loss": 0.1848, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.15767634854771784, |
| "grad_norm": 0.7259620276040538, |
| "learning_rate": 9.847422105534739e-06, |
| "loss": 0.213, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.16182572614107885, |
| "grad_norm": 0.8324644671562974, |
| "learning_rate": 9.8393298762953e-06, |
| "loss": 0.2415, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.16597510373443983, |
| "grad_norm": 0.7064960332949763, |
| "learning_rate": 9.831032063033726e-06, |
| "loss": 0.1851, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.17012448132780084, |
| "grad_norm": 0.8110898374330895, |
| "learning_rate": 9.822529018257049e-06, |
| "loss": 0.2138, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.17427385892116182, |
| "grad_norm": 0.8714749214007722, |
| "learning_rate": 9.813821103190932e-06, |
| "loss": 0.2582, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.17842323651452283, |
| "grad_norm": 0.7394441240060803, |
| "learning_rate": 9.804908687764326e-06, |
| "loss": 0.2118, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.1825726141078838, |
| "grad_norm": 0.7761112789120154, |
| "learning_rate": 9.795792150593739e-06, |
| "loss": 0.2066, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.18672199170124482, |
| "grad_norm": 0.7971762487982779, |
| "learning_rate": 9.786471878967174e-06, |
| "loss": 0.1924, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.1908713692946058, |
| "grad_norm": 0.7962572100593608, |
| "learning_rate": 9.776948268827658e-06, |
| "loss": 0.2404, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.1950207468879668, |
| "grad_norm": 0.7355219800994598, |
| "learning_rate": 9.76722172475643e-06, |
| "loss": 0.1956, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.1991701244813278, |
| "grad_norm": 0.6799008638651439, |
| "learning_rate": 9.757292659955755e-06, |
| "loss": 0.1722, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.2033195020746888, |
| "grad_norm": 0.7571593948984352, |
| "learning_rate": 9.747161496231359e-06, |
| "loss": 0.1859, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.2074688796680498, |
| "grad_norm": 0.7486776807975455, |
| "learning_rate": 9.736828663974527e-06, |
| "loss": 0.1998, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.21161825726141079, |
| "grad_norm": 0.7496759785941373, |
| "learning_rate": 9.726294602143807e-06, |
| "loss": 0.1884, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.2157676348547718, |
| "grad_norm": 0.7016230512589497, |
| "learning_rate": 9.715559758246363e-06, |
| "loss": 0.1727, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.21991701244813278, |
| "grad_norm": 0.7940013817707481, |
| "learning_rate": 9.704624588318972e-06, |
| "loss": 0.2035, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.22406639004149378, |
| "grad_norm": 0.7477198569806254, |
| "learning_rate": 9.693489556908641e-06, |
| "loss": 0.2101, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.22821576763485477, |
| "grad_norm": 0.7770709165987761, |
| "learning_rate": 9.682155137052879e-06, |
| "loss": 0.1875, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.23236514522821577, |
| "grad_norm": 0.80078178622984, |
| "learning_rate": 9.670621810259596e-06, |
| "loss": 0.1913, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.23651452282157676, |
| "grad_norm": 0.7271062637038916, |
| "learning_rate": 9.658890066486651e-06, |
| "loss": 0.1825, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.24066390041493776, |
| "grad_norm": 0.7219480380769352, |
| "learning_rate": 9.646960404121042e-06, |
| "loss": 0.1582, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.24481327800829875, |
| "grad_norm": 0.7262914870173502, |
| "learning_rate": 9.634833329957722e-06, |
| "loss": 0.1807, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.24896265560165975, |
| "grad_norm": 0.7622053149811464, |
| "learning_rate": 9.62250935917808e-06, |
| "loss": 0.2081, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.25311203319502074, |
| "grad_norm": 0.7628500677504657, |
| "learning_rate": 9.609989015328052e-06, |
| "loss": 0.2054, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.2572614107883817, |
| "grad_norm": 0.7617360221525938, |
| "learning_rate": 9.597272830295877e-06, |
| "loss": 0.2029, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.26141078838174275, |
| "grad_norm": 0.7214154973719157, |
| "learning_rate": 9.584361344289499e-06, |
| "loss": 0.1841, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.26556016597510373, |
| "grad_norm": 0.6955736876069925, |
| "learning_rate": 9.571255105813632e-06, |
| "loss": 0.1805, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.2697095435684647, |
| "grad_norm": 0.7663861454386831, |
| "learning_rate": 9.55795467164644e-06, |
| "loss": 0.2095, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.27385892116182575, |
| "grad_norm": 0.6939193958655497, |
| "learning_rate": 9.544460606815901e-06, |
| "loss": 0.1609, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.27800829875518673, |
| "grad_norm": 0.7629821736909411, |
| "learning_rate": 9.530773484575785e-06, |
| "loss": 0.1889, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.2821576763485477, |
| "grad_norm": 0.7512315651425395, |
| "learning_rate": 9.516893886381324e-06, |
| "loss": 0.2023, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.2863070539419087, |
| "grad_norm": 0.8079916042578716, |
| "learning_rate": 9.502822401864484e-06, |
| "loss": 0.2102, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.29045643153526973, |
| "grad_norm": 0.723931061320125, |
| "learning_rate": 9.488559628808939e-06, |
| "loss": 0.1768, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.2946058091286307, |
| "grad_norm": 0.7220408990609748, |
| "learning_rate": 9.474106173124667e-06, |
| "loss": 0.1765, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.2987551867219917, |
| "grad_norm": 0.7774704324723622, |
| "learning_rate": 9.459462648822209e-06, |
| "loss": 0.217, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.3029045643153527, |
| "grad_norm": 0.7443159710540501, |
| "learning_rate": 9.444629677986583e-06, |
| "loss": 0.1833, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.3070539419087137, |
| "grad_norm": 0.7696478423628316, |
| "learning_rate": 9.429607890750863e-06, |
| "loss": 0.1859, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.3112033195020747, |
| "grad_norm": 0.7735940287455797, |
| "learning_rate": 9.414397925269402e-06, |
| "loss": 0.1849, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.3153526970954357, |
| "grad_norm": 0.7205860014852195, |
| "learning_rate": 9.399000427690736e-06, |
| "loss": 0.1644, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.31950207468879666, |
| "grad_norm": 0.7433776756498811, |
| "learning_rate": 9.38341605213011e-06, |
| "loss": 0.1764, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.3236514522821577, |
| "grad_norm": 0.7653711443173772, |
| "learning_rate": 9.367645460641716e-06, |
| "loss": 0.1642, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.3278008298755187, |
| "grad_norm": 0.7603265566217704, |
| "learning_rate": 9.35168932319055e-06, |
| "loss": 0.1851, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.33195020746887965, |
| "grad_norm": 0.780586497908624, |
| "learning_rate": 9.335548317623957e-06, |
| "loss": 0.1659, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.3360995850622407, |
| "grad_norm": 0.759565238100644, |
| "learning_rate": 9.31922312964284e-06, |
| "loss": 0.1829, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.34024896265560167, |
| "grad_norm": 0.8911678916994763, |
| "learning_rate": 9.302714452772515e-06, |
| "loss": 0.2233, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.34439834024896265, |
| "grad_norm": 0.7873276205486429, |
| "learning_rate": 9.286022988333268e-06, |
| "loss": 0.1826, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.34854771784232363, |
| "grad_norm": 0.7873887244154294, |
| "learning_rate": 9.269149445410545e-06, |
| "loss": 0.1912, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.35269709543568467, |
| "grad_norm": 0.7572742935281274, |
| "learning_rate": 9.252094540824839e-06, |
| "loss": 0.1884, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.35684647302904565, |
| "grad_norm": 0.7729250135877875, |
| "learning_rate": 9.234858999101232e-06, |
| "loss": 0.2045, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.36099585062240663, |
| "grad_norm": 0.7492339910177302, |
| "learning_rate": 9.21744355243862e-06, |
| "loss": 0.1713, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.3651452282157676, |
| "grad_norm": 0.7279318208551071, |
| "learning_rate": 9.199848940678607e-06, |
| "loss": 0.1526, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.36929460580912865, |
| "grad_norm": 0.7701397202545115, |
| "learning_rate": 9.18207591127407e-06, |
| "loss": 0.1839, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.37344398340248963, |
| "grad_norm": 0.7734955369159264, |
| "learning_rate": 9.164125219257419e-06, |
| "loss": 0.1936, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.3775933609958506, |
| "grad_norm": 0.6984959494367776, |
| "learning_rate": 9.1459976272085e-06, |
| "loss": 0.1535, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.3817427385892116, |
| "grad_norm": 0.7846836912164711, |
| "learning_rate": 9.127693905222223e-06, |
| "loss": 0.1869, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.38589211618257263, |
| "grad_norm": 0.760736222391232, |
| "learning_rate": 9.10921483087583e-06, |
| "loss": 0.1834, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.3900414937759336, |
| "grad_norm": 0.7700341024971602, |
| "learning_rate": 9.09056118919587e-06, |
| "loss": 0.191, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.3941908713692946, |
| "grad_norm": 0.7098525396416538, |
| "learning_rate": 9.071733772624847e-06, |
| "loss": 0.1615, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.3983402489626556, |
| "grad_norm": 0.836504808011679, |
| "learning_rate": 9.052733380987555e-06, |
| "loss": 0.1996, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.4024896265560166, |
| "grad_norm": 0.7809457430201482, |
| "learning_rate": 9.033560821457102e-06, |
| "loss": 0.2037, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.4066390041493776, |
| "grad_norm": 0.7131874504599527, |
| "learning_rate": 9.014216908520619e-06, |
| "loss": 0.1567, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.4107883817427386, |
| "grad_norm": 0.7241086621508711, |
| "learning_rate": 8.994702463944657e-06, |
| "loss": 0.1762, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.4149377593360996, |
| "grad_norm": 0.7470270273910699, |
| "learning_rate": 8.975018316740278e-06, |
| "loss": 0.189, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.4190871369294606, |
| "grad_norm": 0.7956250924167121, |
| "learning_rate": 8.955165303127841e-06, |
| "loss": 0.2145, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.42323651452282157, |
| "grad_norm": 0.7610007103879098, |
| "learning_rate": 8.93514426650147e-06, |
| "loss": 0.1784, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.42738589211618255, |
| "grad_norm": 0.842316502916225, |
| "learning_rate": 8.914956057393231e-06, |
| "loss": 0.2031, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.4315352697095436, |
| "grad_norm": 0.7936898870281893, |
| "learning_rate": 8.894601533437e-06, |
| "loss": 0.2046, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.43568464730290457, |
| "grad_norm": 0.7265768890674549, |
| "learning_rate": 8.87408155933202e-06, |
| "loss": 0.182, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.43983402489626555, |
| "grad_norm": 0.6526179900015049, |
| "learning_rate": 8.853397006806183e-06, |
| "loss": 0.1373, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.44398340248962653, |
| "grad_norm": 0.7081605756028013, |
| "learning_rate": 8.832548754578981e-06, |
| "loss": 0.1619, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.44813278008298757, |
| "grad_norm": 0.7795529520759518, |
| "learning_rate": 8.811537688324187e-06, |
| "loss": 0.1699, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.45228215767634855, |
| "grad_norm": 0.6863306957159958, |
| "learning_rate": 8.79036470063223e-06, |
| "loss": 0.1599, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.45643153526970953, |
| "grad_norm": 0.7392254415434374, |
| "learning_rate": 8.769030690972262e-06, |
| "loss": 0.1749, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.4605809128630705, |
| "grad_norm": 0.7406860180908847, |
| "learning_rate": 8.747536565653966e-06, |
| "loss": 0.1625, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.46473029045643155, |
| "grad_norm": 0.7256893779396758, |
| "learning_rate": 8.725883237789046e-06, |
| "loss": 0.156, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.46887966804979253, |
| "grad_norm": 0.7936161492458803, |
| "learning_rate": 8.704071627252428e-06, |
| "loss": 0.1927, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.4730290456431535, |
| "grad_norm": 0.7921344874492351, |
| "learning_rate": 8.682102660643196e-06, |
| "loss": 0.1902, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.47717842323651455, |
| "grad_norm": 0.7661741406638606, |
| "learning_rate": 8.659977271245224e-06, |
| "loss": 0.2017, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.48132780082987553, |
| "grad_norm": 0.717807546570814, |
| "learning_rate": 8.637696398987517e-06, |
| "loss": 0.179, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.4854771784232365, |
| "grad_norm": 0.7450855678577503, |
| "learning_rate": 8.615260990404301e-06, |
| "loss": 0.1812, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.4896265560165975, |
| "grad_norm": 0.7841540320532201, |
| "learning_rate": 8.592671998594794e-06, |
| "loss": 0.2024, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.49377593360995853, |
| "grad_norm": 0.8347965727378909, |
| "learning_rate": 8.56993038318273e-06, |
| "loss": 0.1847, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.4979253112033195, |
| "grad_norm": 0.7582917342684473, |
| "learning_rate": 8.54703711027558e-06, |
| "loss": 0.175, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.5020746887966805, |
| "grad_norm": 0.7290057002386009, |
| "learning_rate": 8.523993152423522e-06, |
| "loss": 0.1647, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.5062240663900415, |
| "grad_norm": 0.6809327899218293, |
| "learning_rate": 8.50079948857812e-06, |
| "loss": 0.1427, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.5103734439834025, |
| "grad_norm": 0.6957572538516275, |
| "learning_rate": 8.477457104050732e-06, |
| "loss": 0.1536, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.5145228215767634, |
| "grad_norm": 0.7497563490013779, |
| "learning_rate": 8.453966990470656e-06, |
| "loss": 0.1845, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.5186721991701245, |
| "grad_norm": 0.7131343540595416, |
| "learning_rate": 8.430330145743011e-06, |
| "loss": 0.1656, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.5228215767634855, |
| "grad_norm": 0.7355273724695276, |
| "learning_rate": 8.406547574006326e-06, |
| "loss": 0.1757, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.5269709543568465, |
| "grad_norm": 0.707388890545072, |
| "learning_rate": 8.3826202855899e-06, |
| "loss": 0.1645, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.5311203319502075, |
| "grad_norm": 0.7699859449989024, |
| "learning_rate": 8.358549296970877e-06, |
| "loss": 0.1908, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.5352697095435685, |
| "grad_norm": 0.7766679325618092, |
| "learning_rate": 8.334335630731051e-06, |
| "loss": 0.1697, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.5394190871369294, |
| "grad_norm": 0.8240433568844907, |
| "learning_rate": 8.309980315513444e-06, |
| "loss": 0.2141, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.5435684647302904, |
| "grad_norm": 0.7228340520917644, |
| "learning_rate": 8.285484385978598e-06, |
| "loss": 0.1642, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.5477178423236515, |
| "grad_norm": 0.7554222129553129, |
| "learning_rate": 8.260848882760616e-06, |
| "loss": 0.1785, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.5518672199170125, |
| "grad_norm": 0.7166236882688062, |
| "learning_rate": 8.236074852422965e-06, |
| "loss": 0.1687, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.5560165975103735, |
| "grad_norm": 0.7548715208775477, |
| "learning_rate": 8.211163347414005e-06, |
| "loss": 0.1818, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.5601659751037344, |
| "grad_norm": 0.7609580580621476, |
| "learning_rate": 8.186115426022286e-06, |
| "loss": 0.1726, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.5643153526970954, |
| "grad_norm": 0.7081999400916217, |
| "learning_rate": 8.160932152331587e-06, |
| "loss": 0.1569, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.5684647302904564, |
| "grad_norm": 0.7380667317946897, |
| "learning_rate": 8.135614596175714e-06, |
| "loss": 0.1512, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.5726141078838174, |
| "grad_norm": 0.7462628971624624, |
| "learning_rate": 8.11016383309305e-06, |
| "loss": 0.1408, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.5767634854771784, |
| "grad_norm": 0.7686213864033702, |
| "learning_rate": 8.084580944280862e-06, |
| "loss": 0.1816, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.5809128630705395, |
| "grad_norm": 0.7420483653450449, |
| "learning_rate": 8.058867016549372e-06, |
| "loss": 0.1791, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.5850622406639004, |
| "grad_norm": 0.71264089436099, |
| "learning_rate": 8.03302314227559e-06, |
| "loss": 0.1439, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.5892116182572614, |
| "grad_norm": 0.7586489110810907, |
| "learning_rate": 8.007050419356898e-06, |
| "loss": 0.1909, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.5933609958506224, |
| "grad_norm": 0.8567775411352899, |
| "learning_rate": 7.980949951164422e-06, |
| "loss": 0.2122, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.5975103734439834, |
| "grad_norm": 0.7346565956077903, |
| "learning_rate": 7.95472284649615e-06, |
| "loss": 0.1516, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.6016597510373444, |
| "grad_norm": 0.7432495374421877, |
| "learning_rate": 7.92837021952983e-06, |
| "loss": 0.1554, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.6058091286307054, |
| "grad_norm": 0.7799968417933343, |
| "learning_rate": 7.90189318977564e-06, |
| "loss": 0.187, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.6099585062240664, |
| "grad_norm": 0.7494236481968739, |
| "learning_rate": 7.875292882028624e-06, |
| "loss": 0.1837, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.6141078838174274, |
| "grad_norm": 0.7474431038051979, |
| "learning_rate": 7.848570426320918e-06, |
| "loss": 0.1643, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.6182572614107884, |
| "grad_norm": 0.8170730043512447, |
| "learning_rate": 7.821726957873728e-06, |
| "loss": 0.1624, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.6224066390041494, |
| "grad_norm": 0.7695549295122858, |
| "learning_rate": 7.794763617049124e-06, |
| "loss": 0.1728, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.6265560165975104, |
| "grad_norm": 0.786153361885955, |
| "learning_rate": 7.767681549301576e-06, |
| "loss": 0.198, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.6307053941908713, |
| "grad_norm": 0.7293084463135483, |
| "learning_rate": 7.740481905129307e-06, |
| "loss": 0.158, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.6348547717842323, |
| "grad_norm": 0.7976980758945141, |
| "learning_rate": 7.713165840025412e-06, |
| "loss": 0.1972, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.6390041493775933, |
| "grad_norm": 0.6910969018398649, |
| "learning_rate": 7.685734514428767e-06, |
| "loss": 0.1562, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.6431535269709544, |
| "grad_norm": 0.7012920246533267, |
| "learning_rate": 7.658189093674738e-06, |
| "loss": 0.1578, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.6473029045643154, |
| "grad_norm": 0.7301136191105242, |
| "learning_rate": 7.630530747945672e-06, |
| "loss": 0.1862, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.6514522821576764, |
| "grad_norm": 0.7861556907392464, |
| "learning_rate": 7.6027606522211835e-06, |
| "loss": 0.1733, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.6556016597510373, |
| "grad_norm": 0.7157404027922296, |
| "learning_rate": 7.574879986228245e-06, |
| "loss": 0.1554, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.6597510373443983, |
| "grad_norm": 0.7821110107372908, |
| "learning_rate": 7.546889934391065e-06, |
| "loss": 0.2143, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.6639004149377593, |
| "grad_norm": 0.7268702392383997, |
| "learning_rate": 7.518791685780769e-06, |
| "loss": 0.1551, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.6680497925311203, |
| "grad_norm": 0.7543523627446657, |
| "learning_rate": 7.490586434064893e-06, |
| "loss": 0.1659, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.6721991701244814, |
| "grad_norm": 0.8320460591998305, |
| "learning_rate": 7.462275377456671e-06, |
| "loss": 0.2111, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.6763485477178424, |
| "grad_norm": 0.6854667430244615, |
| "learning_rate": 7.433859718664127e-06, |
| "loss": 0.1477, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.6804979253112033, |
| "grad_norm": 0.7395821155536143, |
| "learning_rate": 7.405340664838994e-06, |
| "loss": 0.1868, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.6846473029045643, |
| "grad_norm": 0.8119883130258085, |
| "learning_rate": 7.376719427525415e-06, |
| "loss": 0.1955, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.6887966804979253, |
| "grad_norm": 0.748708444154493, |
| "learning_rate": 7.3479972226084925e-06, |
| "loss": 0.1574, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.6929460580912863, |
| "grad_norm": 0.7541707172275884, |
| "learning_rate": 7.319175270262624e-06, |
| "loss": 0.1571, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.6970954356846473, |
| "grad_norm": 0.7901522781346901, |
| "learning_rate": 7.290254794899665e-06, |
| "loss": 0.1557, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.7012448132780082, |
| "grad_norm": 0.7618677553737644, |
| "learning_rate": 7.261237025116923e-06, |
| "loss": 0.1682, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.7053941908713693, |
| "grad_norm": 0.7642886029206197, |
| "learning_rate": 7.232123193644957e-06, |
| "loss": 0.1498, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.7095435684647303, |
| "grad_norm": 0.7791230848104441, |
| "learning_rate": 7.202914537295211e-06, |
| "loss": 0.1617, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.7136929460580913, |
| "grad_norm": 0.7393488540214589, |
| "learning_rate": 7.173612296907473e-06, |
| "loss": 0.1613, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.7178423236514523, |
| "grad_norm": 0.7038006522246097, |
| "learning_rate": 7.1442177172971586e-06, |
| "loss": 0.1401, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.7219917012448133, |
| "grad_norm": 0.7437385440346528, |
| "learning_rate": 7.114732047202433e-06, |
| "loss": 0.1822, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.7261410788381742, |
| "grad_norm": 0.7217780558130102, |
| "learning_rate": 7.085156539231159e-06, |
| "loss": 0.1639, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.7302904564315352, |
| "grad_norm": 0.7780272872916683, |
| "learning_rate": 7.055492449807684e-06, |
| "loss": 0.1909, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.7344398340248963, |
| "grad_norm": 0.7011610009172605, |
| "learning_rate": 7.025741039119466e-06, |
| "loss": 0.1504, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.7385892116182573, |
| "grad_norm": 0.6952695444986241, |
| "learning_rate": 6.995903571063541e-06, |
| "loss": 0.1367, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.7427385892116183, |
| "grad_norm": 0.7599801857515818, |
| "learning_rate": 6.96598131319282e-06, |
| "loss": 0.1677, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.7468879668049793, |
| "grad_norm": 0.7522191275629795, |
| "learning_rate": 6.935975536662254e-06, |
| "loss": 0.1851, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.7510373443983402, |
| "grad_norm": 0.7755271260892246, |
| "learning_rate": 6.905887516174827e-06, |
| "loss": 0.1673, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.7551867219917012, |
| "grad_norm": 0.7029538367008582, |
| "learning_rate": 6.875718529927404e-06, |
| "loss": 0.1573, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.7593360995850622, |
| "grad_norm": 0.7427208404607889, |
| "learning_rate": 6.845469859556426e-06, |
| "loss": 0.1629, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.7634854771784232, |
| "grad_norm": 0.7413786757065406, |
| "learning_rate": 6.815142790083473e-06, |
| "loss": 0.1608, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.7676348547717843, |
| "grad_norm": 0.7331757595224525, |
| "learning_rate": 6.784738609860668e-06, |
| "loss": 0.1611, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.7717842323651453, |
| "grad_norm": 0.7585384853604383, |
| "learning_rate": 6.754258610515949e-06, |
| "loss": 0.1908, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.7759336099585062, |
| "grad_norm": 0.7407145482448325, |
| "learning_rate": 6.723704086898193e-06, |
| "loss": 0.1836, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.7800829875518672, |
| "grad_norm": 0.7045533572567579, |
| "learning_rate": 6.6930763370222104e-06, |
| "loss": 0.1713, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.7842323651452282, |
| "grad_norm": 0.7796381186959462, |
| "learning_rate": 6.662376662013609e-06, |
| "loss": 0.1878, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.7883817427385892, |
| "grad_norm": 0.7294399552388171, |
| "learning_rate": 6.631606366053507e-06, |
| "loss": 0.1706, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.7925311203319502, |
| "grad_norm": 0.7242566299823047, |
| "learning_rate": 6.60076675632314e-06, |
| "loss": 0.1656, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.7966804979253111, |
| "grad_norm": 0.734716432035308, |
| "learning_rate": 6.5698591429483286e-06, |
| "loss": 0.1813, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.8008298755186722, |
| "grad_norm": 0.7243228793796995, |
| "learning_rate": 6.5388848389438095e-06, |
| "loss": 0.167, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.8049792531120332, |
| "grad_norm": 0.7146769575254076, |
| "learning_rate": 6.507845160157476e-06, |
| "loss": 0.1786, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.8091286307053942, |
| "grad_norm": 0.7000888246853346, |
| "learning_rate": 6.476741425214464e-06, |
| "loss": 0.1598, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.8132780082987552, |
| "grad_norm": 0.7091228748093505, |
| "learning_rate": 6.445574955461134e-06, |
| "loss": 0.1555, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.8174273858921162, |
| "grad_norm": 0.7475590867202404, |
| "learning_rate": 6.414347074908944e-06, |
| "loss": 0.1816, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.8215767634854771, |
| "grad_norm": 0.6648744157507708, |
| "learning_rate": 6.383059110178205e-06, |
| "loss": 0.125, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.8257261410788381, |
| "grad_norm": 0.7046497121459463, |
| "learning_rate": 6.35171239044171e-06, |
| "loss": 0.1671, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.8298755186721992, |
| "grad_norm": 0.7338668870220733, |
| "learning_rate": 6.320308247368285e-06, |
| "loss": 0.1792, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.8298755186721992, |
| "eval_loss": 0.18507465720176697, |
| "eval_runtime": 1.4909, |
| "eval_samples_per_second": 13.415, |
| "eval_steps_per_second": 3.354, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.8340248962655602, |
| "grad_norm": 0.8256796055544355, |
| "learning_rate": 6.288848015066211e-06, |
| "loss": 0.1791, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.8381742738589212, |
| "grad_norm": 0.7814594735009075, |
| "learning_rate": 6.2573330300265375e-06, |
| "loss": 0.2016, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.8423236514522822, |
| "grad_norm": 0.7582397861722057, |
| "learning_rate": 6.225764631066326e-06, |
| "loss": 0.1595, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.8464730290456431, |
| "grad_norm": 0.7329889520402166, |
| "learning_rate": 6.1941441592717564e-06, |
| "loss": 0.1805, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.8506224066390041, |
| "grad_norm": 0.7159204670536288, |
| "learning_rate": 6.162472957941167e-06, |
| "loss": 0.1628, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.8547717842323651, |
| "grad_norm": 0.7450560042081226, |
| "learning_rate": 6.130752372527981e-06, |
| "loss": 0.1625, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.8589211618257261, |
| "grad_norm": 0.7276052754485396, |
| "learning_rate": 6.098983750583556e-06, |
| "loss": 0.1699, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.8630705394190872, |
| "grad_norm": 0.7080982220713725, |
| "learning_rate": 6.067168441699927e-06, |
| "loss": 0.1662, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.8672199170124482, |
| "grad_norm": 0.7202598720219618, |
| "learning_rate": 6.035307797452489e-06, |
| "loss": 0.1405, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.8713692946058091, |
| "grad_norm": 0.7491056971873838, |
| "learning_rate": 6.0034031713425636e-06, |
| "loss": 0.1754, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.8755186721991701, |
| "grad_norm": 0.7441136532460226, |
| "learning_rate": 5.9714559187399094e-06, |
| "loss": 0.162, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.8796680497925311, |
| "grad_norm": 0.8207260119939723, |
| "learning_rate": 5.939467396825137e-06, |
| "loss": 0.1698, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.8838174273858921, |
| "grad_norm": 0.6868519061874842, |
| "learning_rate": 5.907438964532059e-06, |
| "loss": 0.1379, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.8879668049792531, |
| "grad_norm": 0.7956217383121814, |
| "learning_rate": 5.875371982489959e-06, |
| "loss": 0.163, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.8921161825726142, |
| "grad_norm": 0.7394417902907329, |
| "learning_rate": 5.843267812965783e-06, |
| "loss": 0.1655, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.8962655601659751, |
| "grad_norm": 0.7319044411180587, |
| "learning_rate": 5.811127819806277e-06, |
| "loss": 0.1684, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.9004149377593361, |
| "grad_norm": 0.6996596721883562, |
| "learning_rate": 5.7789533683800445e-06, |
| "loss": 0.1467, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.9045643153526971, |
| "grad_norm": 0.7320178544302903, |
| "learning_rate": 5.746745825519539e-06, |
| "loss": 0.1552, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.9087136929460581, |
| "grad_norm": 0.724383373673059, |
| "learning_rate": 5.714506559463001e-06, |
| "loss": 0.1405, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.9128630705394191, |
| "grad_norm": 0.6677748535395426, |
| "learning_rate": 5.682236939796337e-06, |
| "loss": 0.1512, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.91701244813278, |
| "grad_norm": 0.7845934322337835, |
| "learning_rate": 5.649938337394932e-06, |
| "loss": 0.1859, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.921161825726141, |
| "grad_norm": 0.7529205324420154, |
| "learning_rate": 5.617612124365411e-06, |
| "loss": 0.1686, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.9253112033195021, |
| "grad_norm": 0.7034404844878496, |
| "learning_rate": 5.585259673987352e-06, |
| "loss": 0.1473, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.9294605809128631, |
| "grad_norm": 0.716275540783631, |
| "learning_rate": 5.55288236065495e-06, |
| "loss": 0.1532, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.9336099585062241, |
| "grad_norm": 0.7270308402949697, |
| "learning_rate": 5.52048155981862e-06, |
| "loss": 0.1597, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.9377593360995851, |
| "grad_norm": 0.7001090415882342, |
| "learning_rate": 5.4880586479265774e-06, |
| "loss": 0.1754, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.941908713692946, |
| "grad_norm": 0.7711476173055917, |
| "learning_rate": 5.455615002366351e-06, |
| "loss": 0.1625, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.946058091286307, |
| "grad_norm": 0.7545759247777888, |
| "learning_rate": 5.423152001406282e-06, |
| "loss": 0.1687, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.950207468879668, |
| "grad_norm": 0.7283465494781498, |
| "learning_rate": 5.390671024136961e-06, |
| "loss": 0.139, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.9543568464730291, |
| "grad_norm": 0.7108090972367541, |
| "learning_rate": 5.358173450412649e-06, |
| "loss": 0.1476, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.9585062240663901, |
| "grad_norm": 0.758784013083764, |
| "learning_rate": 5.325660660792657e-06, |
| "loss": 0.1384, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.9626556016597511, |
| "grad_norm": 0.75965590058487, |
| "learning_rate": 5.293134036482697e-06, |
| "loss": 0.1705, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.966804979253112, |
| "grad_norm": 0.7382105422676851, |
| "learning_rate": 5.260594959276203e-06, |
| "loss": 0.1674, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.970954356846473, |
| "grad_norm": 0.7769590928701867, |
| "learning_rate": 5.228044811495632e-06, |
| "loss": 0.1846, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.975103734439834, |
| "grad_norm": 0.6779927231647213, |
| "learning_rate": 5.195484975933741e-06, |
| "loss": 0.1286, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.979253112033195, |
| "grad_norm": 0.7308532662700662, |
| "learning_rate": 5.162916835794843e-06, |
| "loss": 0.1555, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.983402489626556, |
| "grad_norm": 0.7775184861306178, |
| "learning_rate": 5.1303417746360455e-06, |
| "loss": 0.1417, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.9875518672199171, |
| "grad_norm": 0.8164386408608373, |
| "learning_rate": 5.097761176308471e-06, |
| "loss": 0.228, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.991701244813278, |
| "grad_norm": 0.6949846700263419, |
| "learning_rate": 5.0651764248984794e-06, |
| "loss": 0.1549, |
| "step": 239 |
| }, |
| { |
| "epoch": 0.995850622406639, |
| "grad_norm": 0.7417924630808657, |
| "learning_rate": 5.032588904668851e-06, |
| "loss": 0.1693, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.7359282721788815, |
| "learning_rate": 5e-06, |
| "loss": 0.1539, |
| "step": 241 |
| }, |
| { |
| "epoch": 1.004149377593361, |
| "grad_norm": 0.7832307294102421, |
| "learning_rate": 4.967411095331149e-06, |
| "loss": 0.0961, |
| "step": 242 |
| }, |
| { |
| "epoch": 1.008298755186722, |
| "grad_norm": 0.8163715493753602, |
| "learning_rate": 4.934823575101523e-06, |
| "loss": 0.1047, |
| "step": 243 |
| }, |
| { |
| "epoch": 1.012448132780083, |
| "grad_norm": 0.667448725807847, |
| "learning_rate": 4.9022388236915306e-06, |
| "loss": 0.0859, |
| "step": 244 |
| }, |
| { |
| "epoch": 1.016597510373444, |
| "grad_norm": 0.6569623280552557, |
| "learning_rate": 4.869658225363957e-06, |
| "loss": 0.0848, |
| "step": 245 |
| }, |
| { |
| "epoch": 1.020746887966805, |
| "grad_norm": 0.6630688357796415, |
| "learning_rate": 4.837083164205159e-06, |
| "loss": 0.0776, |
| "step": 246 |
| }, |
| { |
| "epoch": 1.0248962655601659, |
| "grad_norm": 0.6783077300452073, |
| "learning_rate": 4.8045150240662615e-06, |
| "loss": 0.0884, |
| "step": 247 |
| }, |
| { |
| "epoch": 1.0290456431535269, |
| "grad_norm": 0.7240958065398064, |
| "learning_rate": 4.771955188504371e-06, |
| "loss": 0.0998, |
| "step": 248 |
| }, |
| { |
| "epoch": 1.033195020746888, |
| "grad_norm": 0.7601470439290279, |
| "learning_rate": 4.739405040723798e-06, |
| "loss": 0.0781, |
| "step": 249 |
| }, |
| { |
| "epoch": 1.037344398340249, |
| "grad_norm": 0.8020958279653632, |
| "learning_rate": 4.7068659635173034e-06, |
| "loss": 0.099, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.04149377593361, |
| "grad_norm": 0.8193127008593971, |
| "learning_rate": 4.6743393392073435e-06, |
| "loss": 0.0905, |
| "step": 251 |
| }, |
| { |
| "epoch": 1.045643153526971, |
| "grad_norm": 0.8674642457830525, |
| "learning_rate": 4.641826549587352e-06, |
| "loss": 0.0847, |
| "step": 252 |
| }, |
| { |
| "epoch": 1.049792531120332, |
| "grad_norm": 0.7063356435232804, |
| "learning_rate": 4.60932897586304e-06, |
| "loss": 0.0638, |
| "step": 253 |
| }, |
| { |
| "epoch": 1.053941908713693, |
| "grad_norm": 0.7935013912587565, |
| "learning_rate": 4.57684799859372e-06, |
| "loss": 0.0868, |
| "step": 254 |
| }, |
| { |
| "epoch": 1.058091286307054, |
| "grad_norm": 0.8078033123113716, |
| "learning_rate": 4.54438499763365e-06, |
| "loss": 0.0849, |
| "step": 255 |
| }, |
| { |
| "epoch": 1.062240663900415, |
| "grad_norm": 0.7491115280286844, |
| "learning_rate": 4.511941352073424e-06, |
| "loss": 0.0739, |
| "step": 256 |
| }, |
| { |
| "epoch": 1.066390041493776, |
| "grad_norm": 0.7910641602613229, |
| "learning_rate": 4.479518440181381e-06, |
| "loss": 0.1018, |
| "step": 257 |
| }, |
| { |
| "epoch": 1.070539419087137, |
| "grad_norm": 0.7384613928856985, |
| "learning_rate": 4.447117639345052e-06, |
| "loss": 0.0814, |
| "step": 258 |
| }, |
| { |
| "epoch": 1.0746887966804979, |
| "grad_norm": 0.7576400301646911, |
| "learning_rate": 4.414740326012649e-06, |
| "loss": 0.0767, |
| "step": 259 |
| }, |
| { |
| "epoch": 1.0788381742738589, |
| "grad_norm": 0.7353105802879545, |
| "learning_rate": 4.382387875634592e-06, |
| "loss": 0.0824, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.0829875518672198, |
| "grad_norm": 0.7551849354862943, |
| "learning_rate": 4.3500616626050705e-06, |
| "loss": 0.1021, |
| "step": 261 |
| }, |
| { |
| "epoch": 1.0871369294605808, |
| "grad_norm": 0.753933529100596, |
| "learning_rate": 4.317763060203665e-06, |
| "loss": 0.0906, |
| "step": 262 |
| }, |
| { |
| "epoch": 1.0912863070539418, |
| "grad_norm": 0.6657270829997127, |
| "learning_rate": 4.285493440537002e-06, |
| "loss": 0.0699, |
| "step": 263 |
| }, |
| { |
| "epoch": 1.095435684647303, |
| "grad_norm": 0.7356269847744574, |
| "learning_rate": 4.253254174480462e-06, |
| "loss": 0.0917, |
| "step": 264 |
| }, |
| { |
| "epoch": 1.099585062240664, |
| "grad_norm": 0.7056456539678235, |
| "learning_rate": 4.221046631619956e-06, |
| "loss": 0.0762, |
| "step": 265 |
| }, |
| { |
| "epoch": 1.103734439834025, |
| "grad_norm": 0.7447577063439521, |
| "learning_rate": 4.188872180193723e-06, |
| "loss": 0.0773, |
| "step": 266 |
| }, |
| { |
| "epoch": 1.107883817427386, |
| "grad_norm": 0.7285210666049802, |
| "learning_rate": 4.156732187034219e-06, |
| "loss": 0.0859, |
| "step": 267 |
| }, |
| { |
| "epoch": 1.112033195020747, |
| "grad_norm": 0.7515360500123252, |
| "learning_rate": 4.124628017510043e-06, |
| "loss": 0.081, |
| "step": 268 |
| }, |
| { |
| "epoch": 1.116182572614108, |
| "grad_norm": 0.754107635042173, |
| "learning_rate": 4.092561035467942e-06, |
| "loss": 0.0954, |
| "step": 269 |
| }, |
| { |
| "epoch": 1.120331950207469, |
| "grad_norm": 0.7658986514175681, |
| "learning_rate": 4.060532603174865e-06, |
| "loss": 0.0803, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.1244813278008299, |
| "grad_norm": 0.7164719929051085, |
| "learning_rate": 4.028544081260093e-06, |
| "loss": 0.0875, |
| "step": 271 |
| }, |
| { |
| "epoch": 1.1286307053941909, |
| "grad_norm": 0.7811385800972357, |
| "learning_rate": 3.996596828657437e-06, |
| "loss": 0.0957, |
| "step": 272 |
| }, |
| { |
| "epoch": 1.1327800829875518, |
| "grad_norm": 0.8095703190806857, |
| "learning_rate": 3.9646922025475126e-06, |
| "loss": 0.0888, |
| "step": 273 |
| }, |
| { |
| "epoch": 1.1369294605809128, |
| "grad_norm": 0.8577161530627458, |
| "learning_rate": 3.932831558300074e-06, |
| "loss": 0.1304, |
| "step": 274 |
| }, |
| { |
| "epoch": 1.1410788381742738, |
| "grad_norm": 0.7551634325885881, |
| "learning_rate": 3.9010162494164475e-06, |
| "loss": 0.0694, |
| "step": 275 |
| }, |
| { |
| "epoch": 1.1452282157676348, |
| "grad_norm": 0.7259339364603347, |
| "learning_rate": 3.869247627472021e-06, |
| "loss": 0.092, |
| "step": 276 |
| }, |
| { |
| "epoch": 1.1493775933609958, |
| "grad_norm": 0.7797326733766821, |
| "learning_rate": 3.837527042058836e-06, |
| "loss": 0.0884, |
| "step": 277 |
| }, |
| { |
| "epoch": 1.1535269709543567, |
| "grad_norm": 0.7885805459732599, |
| "learning_rate": 3.8058558407282465e-06, |
| "loss": 0.0787, |
| "step": 278 |
| }, |
| { |
| "epoch": 1.1576763485477177, |
| "grad_norm": 0.8196108480937061, |
| "learning_rate": 3.7742353689336753e-06, |
| "loss": 0.0809, |
| "step": 279 |
| }, |
| { |
| "epoch": 1.161825726141079, |
| "grad_norm": 0.775286344200391, |
| "learning_rate": 3.742666969973463e-06, |
| "loss": 0.0885, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.16597510373444, |
| "grad_norm": 0.7244030638509119, |
| "learning_rate": 3.7111519849337908e-06, |
| "loss": 0.08, |
| "step": 281 |
| }, |
| { |
| "epoch": 1.170124481327801, |
| "grad_norm": 0.8052024240159372, |
| "learning_rate": 3.6796917526317153e-06, |
| "loss": 0.1052, |
| "step": 282 |
| }, |
| { |
| "epoch": 1.1742738589211619, |
| "grad_norm": 0.7481457604441057, |
| "learning_rate": 3.648287609558291e-06, |
| "loss": 0.0828, |
| "step": 283 |
| }, |
| { |
| "epoch": 1.1784232365145229, |
| "grad_norm": 0.8236128177922952, |
| "learning_rate": 3.6169408898217973e-06, |
| "loss": 0.126, |
| "step": 284 |
| }, |
| { |
| "epoch": 1.1825726141078838, |
| "grad_norm": 0.8014821382259918, |
| "learning_rate": 3.5856529250910565e-06, |
| "loss": 0.1017, |
| "step": 285 |
| }, |
| { |
| "epoch": 1.1867219917012448, |
| "grad_norm": 0.7657225404471665, |
| "learning_rate": 3.554425044538868e-06, |
| "loss": 0.0817, |
| "step": 286 |
| }, |
| { |
| "epoch": 1.1908713692946058, |
| "grad_norm": 0.7292541020615424, |
| "learning_rate": 3.5232585747855376e-06, |
| "loss": 0.0884, |
| "step": 287 |
| }, |
| { |
| "epoch": 1.1950207468879668, |
| "grad_norm": 0.7104999733621218, |
| "learning_rate": 3.4921548398425246e-06, |
| "loss": 0.0705, |
| "step": 288 |
| }, |
| { |
| "epoch": 1.1991701244813278, |
| "grad_norm": 0.7340653325120157, |
| "learning_rate": 3.461115161056191e-06, |
| "loss": 0.0876, |
| "step": 289 |
| }, |
| { |
| "epoch": 1.2033195020746887, |
| "grad_norm": 0.8218266725660978, |
| "learning_rate": 3.430140857051675e-06, |
| "loss": 0.104, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.2074688796680497, |
| "grad_norm": 0.7539001339920732, |
| "learning_rate": 3.3992332436768615e-06, |
| "loss": 0.0663, |
| "step": 291 |
| }, |
| { |
| "epoch": 1.2116182572614107, |
| "grad_norm": 0.8019682759784094, |
| "learning_rate": 3.3683936339464957e-06, |
| "loss": 0.0907, |
| "step": 292 |
| }, |
| { |
| "epoch": 1.215767634854772, |
| "grad_norm": 0.7682093069335607, |
| "learning_rate": 3.3376233379863943e-06, |
| "loss": 0.0766, |
| "step": 293 |
| }, |
| { |
| "epoch": 1.2199170124481329, |
| "grad_norm": 0.7825481915497294, |
| "learning_rate": 3.306923662977789e-06, |
| "loss": 0.1012, |
| "step": 294 |
| }, |
| { |
| "epoch": 1.2240663900414939, |
| "grad_norm": 0.7162145084205342, |
| "learning_rate": 3.276295913101808e-06, |
| "loss": 0.0741, |
| "step": 295 |
| }, |
| { |
| "epoch": 1.2282157676348548, |
| "grad_norm": 0.7318286777705275, |
| "learning_rate": 3.2457413894840516e-06, |
| "loss": 0.0787, |
| "step": 296 |
| }, |
| { |
| "epoch": 1.2323651452282158, |
| "grad_norm": 0.7641874664846489, |
| "learning_rate": 3.215261390139332e-06, |
| "loss": 0.0932, |
| "step": 297 |
| }, |
| { |
| "epoch": 1.2365145228215768, |
| "grad_norm": 0.7445085106059115, |
| "learning_rate": 3.184857209916528e-06, |
| "loss": 0.0759, |
| "step": 298 |
| }, |
| { |
| "epoch": 1.2406639004149378, |
| "grad_norm": 0.7754817714088118, |
| "learning_rate": 3.1545301404435756e-06, |
| "loss": 0.093, |
| "step": 299 |
| }, |
| { |
| "epoch": 1.2448132780082988, |
| "grad_norm": 0.7577954657403386, |
| "learning_rate": 3.1242814700725977e-06, |
| "loss": 0.079, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.2489626556016598, |
| "grad_norm": 0.7421284956952211, |
| "learning_rate": 3.0941124838251734e-06, |
| "loss": 0.0816, |
| "step": 301 |
| }, |
| { |
| "epoch": 1.2531120331950207, |
| "grad_norm": 0.8041903952253537, |
| "learning_rate": 3.064024463337747e-06, |
| "loss": 0.1059, |
| "step": 302 |
| }, |
| { |
| "epoch": 1.2572614107883817, |
| "grad_norm": 0.8046374578629936, |
| "learning_rate": 3.034018686807182e-06, |
| "loss": 0.108, |
| "step": 303 |
| }, |
| { |
| "epoch": 1.2614107883817427, |
| "grad_norm": 0.758163930794237, |
| "learning_rate": 3.0040964289364618e-06, |
| "loss": 0.0765, |
| "step": 304 |
| }, |
| { |
| "epoch": 1.2655601659751037, |
| "grad_norm": 0.7344291933199262, |
| "learning_rate": 2.974258960880535e-06, |
| "loss": 0.077, |
| "step": 305 |
| }, |
| { |
| "epoch": 1.2697095435684647, |
| "grad_norm": 0.8054215463295404, |
| "learning_rate": 2.944507550192318e-06, |
| "loss": 0.0889, |
| "step": 306 |
| }, |
| { |
| "epoch": 1.2738589211618256, |
| "grad_norm": 0.717467590964347, |
| "learning_rate": 2.9148434607688426e-06, |
| "loss": 0.0726, |
| "step": 307 |
| }, |
| { |
| "epoch": 1.2780082987551866, |
| "grad_norm": 0.7186905433521467, |
| "learning_rate": 2.885267952797569e-06, |
| "loss": 0.0757, |
| "step": 308 |
| }, |
| { |
| "epoch": 1.2821576763485476, |
| "grad_norm": 0.7398347859605869, |
| "learning_rate": 2.855782282702841e-06, |
| "loss": 0.0726, |
| "step": 309 |
| }, |
| { |
| "epoch": 1.2863070539419086, |
| "grad_norm": 0.6866464266246485, |
| "learning_rate": 2.826387703092528e-06, |
| "loss": 0.0645, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.2904564315352698, |
| "grad_norm": 0.7476154799430902, |
| "learning_rate": 2.7970854627047893e-06, |
| "loss": 0.0768, |
| "step": 311 |
| }, |
| { |
| "epoch": 1.2946058091286308, |
| "grad_norm": 0.7783465796646424, |
| "learning_rate": 2.7678768063550454e-06, |
| "loss": 0.0952, |
| "step": 312 |
| }, |
| { |
| "epoch": 1.2987551867219918, |
| "grad_norm": 0.7018719402660284, |
| "learning_rate": 2.738762974883078e-06, |
| "loss": 0.074, |
| "step": 313 |
| }, |
| { |
| "epoch": 1.3029045643153527, |
| "grad_norm": 0.7006326758865006, |
| "learning_rate": 2.7097452051003375e-06, |
| "loss": 0.0752, |
| "step": 314 |
| }, |
| { |
| "epoch": 1.3070539419087137, |
| "grad_norm": 0.7664880666119144, |
| "learning_rate": 2.680824729737378e-06, |
| "loss": 0.0823, |
| "step": 315 |
| }, |
| { |
| "epoch": 1.3112033195020747, |
| "grad_norm": 0.7752147371943559, |
| "learning_rate": 2.6520027773915075e-06, |
| "loss": 0.0823, |
| "step": 316 |
| }, |
| { |
| "epoch": 1.3153526970954357, |
| "grad_norm": 0.7273629350561008, |
| "learning_rate": 2.623280572474587e-06, |
| "loss": 0.0793, |
| "step": 317 |
| }, |
| { |
| "epoch": 1.3195020746887967, |
| "grad_norm": 0.7622833114877324, |
| "learning_rate": 2.594659335161008e-06, |
| "loss": 0.0858, |
| "step": 318 |
| }, |
| { |
| "epoch": 1.3236514522821576, |
| "grad_norm": 0.6911144920395204, |
| "learning_rate": 2.566140281335875e-06, |
| "loss": 0.0773, |
| "step": 319 |
| }, |
| { |
| "epoch": 1.3278008298755186, |
| "grad_norm": 0.672521374023277, |
| "learning_rate": 2.5377246225433306e-06, |
| "loss": 0.0637, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.3319502074688796, |
| "grad_norm": 0.712016056392291, |
| "learning_rate": 2.509413565935107e-06, |
| "loss": 0.0672, |
| "step": 321 |
| }, |
| { |
| "epoch": 1.3360995850622408, |
| "grad_norm": 0.7257502701410873, |
| "learning_rate": 2.481208314219233e-06, |
| "loss": 0.0729, |
| "step": 322 |
| }, |
| { |
| "epoch": 1.3402489626556018, |
| "grad_norm": 0.757287282113775, |
| "learning_rate": 2.4531100656089365e-06, |
| "loss": 0.0748, |
| "step": 323 |
| }, |
| { |
| "epoch": 1.3443983402489628, |
| "grad_norm": 0.7916678661865387, |
| "learning_rate": 2.4251200137717545e-06, |
| "loss": 0.0851, |
| "step": 324 |
| }, |
| { |
| "epoch": 1.3485477178423237, |
| "grad_norm": 0.6867276776453386, |
| "learning_rate": 2.3972393477788157e-06, |
| "loss": 0.0662, |
| "step": 325 |
| }, |
| { |
| "epoch": 1.3526970954356847, |
| "grad_norm": 0.824772150211262, |
| "learning_rate": 2.3694692520543293e-06, |
| "loss": 0.0874, |
| "step": 326 |
| }, |
| { |
| "epoch": 1.3568464730290457, |
| "grad_norm": 0.7732735468909508, |
| "learning_rate": 2.3418109063252625e-06, |
| "loss": 0.0823, |
| "step": 327 |
| }, |
| { |
| "epoch": 1.3609958506224067, |
| "grad_norm": 0.8837779304062807, |
| "learning_rate": 2.3142654855712353e-06, |
| "loss": 0.0862, |
| "step": 328 |
| }, |
| { |
| "epoch": 1.3651452282157677, |
| "grad_norm": 0.6859223843936688, |
| "learning_rate": 2.2868341599745895e-06, |
| "loss": 0.072, |
| "step": 329 |
| }, |
| { |
| "epoch": 1.3692946058091287, |
| "grad_norm": 0.822431545792467, |
| "learning_rate": 2.259518094870693e-06, |
| "loss": 0.0943, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.3734439834024896, |
| "grad_norm": 0.873667652862396, |
| "learning_rate": 2.2323184506984257e-06, |
| "loss": 0.0836, |
| "step": 331 |
| }, |
| { |
| "epoch": 1.3775933609958506, |
| "grad_norm": 0.7763715356974263, |
| "learning_rate": 2.2052363829508776e-06, |
| "loss": 0.0861, |
| "step": 332 |
| }, |
| { |
| "epoch": 1.3817427385892116, |
| "grad_norm": 0.8203628811889314, |
| "learning_rate": 2.1782730421262738e-06, |
| "loss": 0.0854, |
| "step": 333 |
| }, |
| { |
| "epoch": 1.3858921161825726, |
| "grad_norm": 0.7403334503658842, |
| "learning_rate": 2.151429573679084e-06, |
| "loss": 0.0726, |
| "step": 334 |
| }, |
| { |
| "epoch": 1.3900414937759336, |
| "grad_norm": 0.7925473877054873, |
| "learning_rate": 2.1247071179713774e-06, |
| "loss": 0.0848, |
| "step": 335 |
| }, |
| { |
| "epoch": 1.3941908713692945, |
| "grad_norm": 0.7318960603550514, |
| "learning_rate": 2.098106810224362e-06, |
| "loss": 0.0717, |
| "step": 336 |
| }, |
| { |
| "epoch": 1.3983402489626555, |
| "grad_norm": 0.7393994006621016, |
| "learning_rate": 2.071629780470171e-06, |
| "loss": 0.0711, |
| "step": 337 |
| }, |
| { |
| "epoch": 1.4024896265560165, |
| "grad_norm": 0.7866727572307913, |
| "learning_rate": 2.0452771535038518e-06, |
| "loss": 0.0903, |
| "step": 338 |
| }, |
| { |
| "epoch": 1.4066390041493775, |
| "grad_norm": 0.8011405845759542, |
| "learning_rate": 2.0190500488355776e-06, |
| "loss": 0.0965, |
| "step": 339 |
| }, |
| { |
| "epoch": 1.4107883817427385, |
| "grad_norm": 0.7024967230772534, |
| "learning_rate": 1.9929495806431024e-06, |
| "loss": 0.0713, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.4149377593360997, |
| "grad_norm": 0.7447848584179991, |
| "learning_rate": 1.9669768577244107e-06, |
| "loss": 0.0727, |
| "step": 341 |
| }, |
| { |
| "epoch": 1.4190871369294606, |
| "grad_norm": 0.7397444847607574, |
| "learning_rate": 1.9411329834506286e-06, |
| "loss": 0.0701, |
| "step": 342 |
| }, |
| { |
| "epoch": 1.4232365145228216, |
| "grad_norm": 0.8596586029285681, |
| "learning_rate": 1.9154190557191387e-06, |
| "loss": 0.0935, |
| "step": 343 |
| }, |
| { |
| "epoch": 1.4273858921161826, |
| "grad_norm": 0.7468897071548876, |
| "learning_rate": 1.8898361669069497e-06, |
| "loss": 0.0772, |
| "step": 344 |
| }, |
| { |
| "epoch": 1.4315352697095436, |
| "grad_norm": 0.7567860586556581, |
| "learning_rate": 1.864385403824287e-06, |
| "loss": 0.0858, |
| "step": 345 |
| }, |
| { |
| "epoch": 1.4356846473029046, |
| "grad_norm": 0.74547343386958, |
| "learning_rate": 1.8390678476684143e-06, |
| "loss": 0.0698, |
| "step": 346 |
| }, |
| { |
| "epoch": 1.4398340248962656, |
| "grad_norm": 0.8287041007092206, |
| "learning_rate": 1.8138845739777167e-06, |
| "loss": 0.101, |
| "step": 347 |
| }, |
| { |
| "epoch": 1.4439834024896265, |
| "grad_norm": 0.6949015586020539, |
| "learning_rate": 1.7888366525859968e-06, |
| "loss": 0.0682, |
| "step": 348 |
| }, |
| { |
| "epoch": 1.4481327800829875, |
| "grad_norm": 0.8019337139478665, |
| "learning_rate": 1.7639251475770374e-06, |
| "loss": 0.1039, |
| "step": 349 |
| }, |
| { |
| "epoch": 1.4522821576763485, |
| "grad_norm": 0.7205296459401602, |
| "learning_rate": 1.7391511172393849e-06, |
| "loss": 0.0719, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.4564315352697095, |
| "grad_norm": 0.6811493853455287, |
| "learning_rate": 1.7145156140214032e-06, |
| "loss": 0.0749, |
| "step": 351 |
| }, |
| { |
| "epoch": 1.4605809128630705, |
| "grad_norm": 0.7172097545397581, |
| "learning_rate": 1.6900196844865575e-06, |
| "loss": 0.0738, |
| "step": 352 |
| }, |
| { |
| "epoch": 1.4647302904564317, |
| "grad_norm": 0.7893967258035767, |
| "learning_rate": 1.6656643692689512e-06, |
| "loss": 0.0887, |
| "step": 353 |
| }, |
| { |
| "epoch": 1.4688796680497926, |
| "grad_norm": 0.7589832042178465, |
| "learning_rate": 1.6414507030291249e-06, |
| "loss": 0.0922, |
| "step": 354 |
| }, |
| { |
| "epoch": 1.4730290456431536, |
| "grad_norm": 0.7925586053286797, |
| "learning_rate": 1.617379714410099e-06, |
| "loss": 0.0873, |
| "step": 355 |
| }, |
| { |
| "epoch": 1.4771784232365146, |
| "grad_norm": 0.7675642422062359, |
| "learning_rate": 1.5934524259936757e-06, |
| "loss": 0.083, |
| "step": 356 |
| }, |
| { |
| "epoch": 1.4813278008298756, |
| "grad_norm": 0.6337455156122633, |
| "learning_rate": 1.5696698542569905e-06, |
| "loss": 0.0673, |
| "step": 357 |
| }, |
| { |
| "epoch": 1.4854771784232366, |
| "grad_norm": 0.7199821450528227, |
| "learning_rate": 1.5460330095293447e-06, |
| "loss": 0.0722, |
| "step": 358 |
| }, |
| { |
| "epoch": 1.4896265560165975, |
| "grad_norm": 0.8024411048948309, |
| "learning_rate": 1.5225428959492695e-06, |
| "loss": 0.0806, |
| "step": 359 |
| }, |
| { |
| "epoch": 1.4937759336099585, |
| "grad_norm": 0.6978748125879287, |
| "learning_rate": 1.4992005114218805e-06, |
| "loss": 0.0705, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.4979253112033195, |
| "grad_norm": 0.7368562452138163, |
| "learning_rate": 1.4760068475764789e-06, |
| "loss": 0.0792, |
| "step": 361 |
| }, |
| { |
| "epoch": 1.5020746887966805, |
| "grad_norm": 0.727972065311514, |
| "learning_rate": 1.4529628897244214e-06, |
| "loss": 0.0667, |
| "step": 362 |
| }, |
| { |
| "epoch": 1.5062240663900415, |
| "grad_norm": 0.7055803502397686, |
| "learning_rate": 1.4300696168172735e-06, |
| "loss": 0.0636, |
| "step": 363 |
| }, |
| { |
| "epoch": 1.5103734439834025, |
| "grad_norm": 0.7192990056922836, |
| "learning_rate": 1.4073280014052077e-06, |
| "loss": 0.0812, |
| "step": 364 |
| }, |
| { |
| "epoch": 1.5145228215767634, |
| "grad_norm": 0.7730145132377668, |
| "learning_rate": 1.3847390095957003e-06, |
| "loss": 0.079, |
| "step": 365 |
| }, |
| { |
| "epoch": 1.5186721991701244, |
| "grad_norm": 0.6999079487941452, |
| "learning_rate": 1.3623036010124845e-06, |
| "loss": 0.0666, |
| "step": 366 |
| }, |
| { |
| "epoch": 1.5228215767634854, |
| "grad_norm": 0.8084988090864743, |
| "learning_rate": 1.3400227287547785e-06, |
| "loss": 0.1028, |
| "step": 367 |
| }, |
| { |
| "epoch": 1.5269709543568464, |
| "grad_norm": 0.739877983711314, |
| "learning_rate": 1.3178973393568055e-06, |
| "loss": 0.0788, |
| "step": 368 |
| }, |
| { |
| "epoch": 1.5311203319502074, |
| "grad_norm": 0.7277670152792125, |
| "learning_rate": 1.295928372747574e-06, |
| "loss": 0.0782, |
| "step": 369 |
| }, |
| { |
| "epoch": 1.5352697095435683, |
| "grad_norm": 0.8270870127969331, |
| "learning_rate": 1.2741167622109557e-06, |
| "loss": 0.1103, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.5394190871369293, |
| "grad_norm": 0.7464319854627752, |
| "learning_rate": 1.2524634343460335e-06, |
| "loss": 0.0668, |
| "step": 371 |
| }, |
| { |
| "epoch": 1.5435684647302903, |
| "grad_norm": 0.7496260324397296, |
| "learning_rate": 1.2309693090277392e-06, |
| "loss": 0.0836, |
| "step": 372 |
| }, |
| { |
| "epoch": 1.5477178423236515, |
| "grad_norm": 0.7514590668417973, |
| "learning_rate": 1.2096352993677712e-06, |
| "loss": 0.0722, |
| "step": 373 |
| }, |
| { |
| "epoch": 1.5518672199170125, |
| "grad_norm": 0.7074968757753791, |
| "learning_rate": 1.1884623116758121e-06, |
| "loss": 0.0679, |
| "step": 374 |
| }, |
| { |
| "epoch": 1.5560165975103735, |
| "grad_norm": 0.7836392476257869, |
| "learning_rate": 1.1674512454210202e-06, |
| "loss": 0.0956, |
| "step": 375 |
| }, |
| { |
| "epoch": 1.5601659751037344, |
| "grad_norm": 0.694186080406532, |
| "learning_rate": 1.1466029931938182e-06, |
| "loss": 0.0738, |
| "step": 376 |
| }, |
| { |
| "epoch": 1.5643153526970954, |
| "grad_norm": 0.7353783509200603, |
| "learning_rate": 1.125918440667982e-06, |
| "loss": 0.0844, |
| "step": 377 |
| }, |
| { |
| "epoch": 1.5684647302904564, |
| "grad_norm": 0.6929484125543315, |
| "learning_rate": 1.1053984665630025e-06, |
| "loss": 0.0645, |
| "step": 378 |
| }, |
| { |
| "epoch": 1.5726141078838174, |
| "grad_norm": 0.7605210746246555, |
| "learning_rate": 1.0850439426067705e-06, |
| "loss": 0.0701, |
| "step": 379 |
| }, |
| { |
| "epoch": 1.5767634854771784, |
| "grad_norm": 0.7885536633604376, |
| "learning_rate": 1.064855733498531e-06, |
| "loss": 0.1002, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.5809128630705396, |
| "grad_norm": 0.7739423491814291, |
| "learning_rate": 1.0448346968721596e-06, |
| "loss": 0.0822, |
| "step": 381 |
| }, |
| { |
| "epoch": 1.5850622406639006, |
| "grad_norm": 0.7000383997686694, |
| "learning_rate": 1.024981683259723e-06, |
| "loss": 0.0724, |
| "step": 382 |
| }, |
| { |
| "epoch": 1.5892116182572615, |
| "grad_norm": 0.7612207692729869, |
| "learning_rate": 1.0052975360553446e-06, |
| "loss": 0.0789, |
| "step": 383 |
| }, |
| { |
| "epoch": 1.5933609958506225, |
| "grad_norm": 0.7376770921542023, |
| "learning_rate": 9.857830914793827e-07, |
| "loss": 0.0787, |
| "step": 384 |
| }, |
| { |
| "epoch": 1.5975103734439835, |
| "grad_norm": 0.7003600585245541, |
| "learning_rate": 9.664391785428977e-07, |
| "loss": 0.0669, |
| "step": 385 |
| }, |
| { |
| "epoch": 1.6016597510373445, |
| "grad_norm": 0.7493354808214869, |
| "learning_rate": 9.472666190124457e-07, |
| "loss": 0.0939, |
| "step": 386 |
| }, |
| { |
| "epoch": 1.6058091286307055, |
| "grad_norm": 0.6906782939093074, |
| "learning_rate": 9.282662273751536e-07, |
| "loss": 0.0654, |
| "step": 387 |
| }, |
| { |
| "epoch": 1.6099585062240664, |
| "grad_norm": 0.7217843451690958, |
| "learning_rate": 9.094388108041302e-07, |
| "loss": 0.0728, |
| "step": 388 |
| }, |
| { |
| "epoch": 1.6141078838174274, |
| "grad_norm": 0.7503564894362919, |
| "learning_rate": 8.907851691241709e-07, |
| "loss": 0.0786, |
| "step": 389 |
| }, |
| { |
| "epoch": 1.6182572614107884, |
| "grad_norm": 0.6989476339850978, |
| "learning_rate": 8.723060947777778e-07, |
| "loss": 0.0693, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.6224066390041494, |
| "grad_norm": 0.7214414839493923, |
| "learning_rate": 8.540023727915015e-07, |
| "loss": 0.0693, |
| "step": 391 |
| }, |
| { |
| "epoch": 1.6265560165975104, |
| "grad_norm": 0.7436689425663727, |
| "learning_rate": 8.358747807425827e-07, |
| "loss": 0.0817, |
| "step": 392 |
| }, |
| { |
| "epoch": 1.6307053941908713, |
| "grad_norm": 0.6925551456815252, |
| "learning_rate": 8.179240887259304e-07, |
| "loss": 0.0722, |
| "step": 393 |
| }, |
| { |
| "epoch": 1.6348547717842323, |
| "grad_norm": 0.7594694713557898, |
| "learning_rate": 8.001510593213946e-07, |
| "loss": 0.0961, |
| "step": 394 |
| }, |
| { |
| "epoch": 1.6390041493775933, |
| "grad_norm": 0.7342750817062677, |
| "learning_rate": 7.825564475613806e-07, |
| "loss": 0.0768, |
| "step": 395 |
| }, |
| { |
| "epoch": 1.6431535269709543, |
| "grad_norm": 0.688266435838653, |
| "learning_rate": 7.651410008987698e-07, |
| "loss": 0.0732, |
| "step": 396 |
| }, |
| { |
| "epoch": 1.6473029045643153, |
| "grad_norm": 0.7322550264869961, |
| "learning_rate": 7.479054591751623e-07, |
| "loss": 0.0747, |
| "step": 397 |
| }, |
| { |
| "epoch": 1.6514522821576763, |
| "grad_norm": 0.7206648764802494, |
| "learning_rate": 7.308505545894567e-07, |
| "loss": 0.0755, |
| "step": 398 |
| }, |
| { |
| "epoch": 1.6556016597510372, |
| "grad_norm": 0.7661344327610974, |
| "learning_rate": 7.139770116667333e-07, |
| "loss": 0.0777, |
| "step": 399 |
| }, |
| { |
| "epoch": 1.6597510373443982, |
| "grad_norm": 0.6668670661993277, |
| "learning_rate": 6.972855472274853e-07, |
| "loss": 0.065, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.6597510373443982, |
| "eval_loss": 0.18336524069309235, |
| "eval_runtime": 1.4913, |
| "eval_samples_per_second": 13.411, |
| "eval_steps_per_second": 3.353, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.6639004149377592, |
| "grad_norm": 0.7407493531818214, |
| "learning_rate": 6.807768703571616e-07, |
| "loss": 0.0753, |
| "step": 401 |
| }, |
| { |
| "epoch": 1.6680497925311202, |
| "grad_norm": 0.7021176179745514, |
| "learning_rate": 6.644516823760439e-07, |
| "loss": 0.0687, |
| "step": 402 |
| }, |
| { |
| "epoch": 1.6721991701244814, |
| "grad_norm": 0.7544780378754766, |
| "learning_rate": 6.483106768094516e-07, |
| "loss": 0.0826, |
| "step": 403 |
| }, |
| { |
| "epoch": 1.6763485477178424, |
| "grad_norm": 0.7181204694523459, |
| "learning_rate": 6.323545393582847e-07, |
| "loss": 0.0646, |
| "step": 404 |
| }, |
| { |
| "epoch": 1.6804979253112033, |
| "grad_norm": 0.728528171136692, |
| "learning_rate": 6.165839478698909e-07, |
| "loss": 0.0735, |
| "step": 405 |
| }, |
| { |
| "epoch": 1.6846473029045643, |
| "grad_norm": 0.7584024865505997, |
| "learning_rate": 6.009995723092655e-07, |
| "loss": 0.0701, |
| "step": 406 |
| }, |
| { |
| "epoch": 1.6887966804979253, |
| "grad_norm": 0.7362510646650599, |
| "learning_rate": 5.85602074730598e-07, |
| "loss": 0.0702, |
| "step": 407 |
| }, |
| { |
| "epoch": 1.6929460580912863, |
| "grad_norm": 0.8094365167771168, |
| "learning_rate": 5.703921092491393e-07, |
| "loss": 0.086, |
| "step": 408 |
| }, |
| { |
| "epoch": 1.6970954356846473, |
| "grad_norm": 0.7692786267568079, |
| "learning_rate": 5.553703220134188e-07, |
| "loss": 0.0847, |
| "step": 409 |
| }, |
| { |
| "epoch": 1.7012448132780082, |
| "grad_norm": 0.7898257972112215, |
| "learning_rate": 5.405373511777939e-07, |
| "loss": 0.0935, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.7053941908713695, |
| "grad_norm": 0.8534045006213508, |
| "learning_rate": 5.258938268753344e-07, |
| "loss": 0.1036, |
| "step": 411 |
| }, |
| { |
| "epoch": 1.7095435684647304, |
| "grad_norm": 0.8122284703848481, |
| "learning_rate": 5.114403711910631e-07, |
| "loss": 0.0897, |
| "step": 412 |
| }, |
| { |
| "epoch": 1.7136929460580914, |
| "grad_norm": 0.6752641341390524, |
| "learning_rate": 4.971775981355181e-07, |
| "loss": 0.0618, |
| "step": 413 |
| }, |
| { |
| "epoch": 1.7178423236514524, |
| "grad_norm": 0.7247070440800305, |
| "learning_rate": 4.831061136186787e-07, |
| "loss": 0.0652, |
| "step": 414 |
| }, |
| { |
| "epoch": 1.7219917012448134, |
| "grad_norm": 0.7935319653399917, |
| "learning_rate": 4.692265154242137e-07, |
| "loss": 0.0802, |
| "step": 415 |
| }, |
| { |
| "epoch": 1.7261410788381744, |
| "grad_norm": 0.7337802366702694, |
| "learning_rate": 4.555393931841001e-07, |
| "loss": 0.067, |
| "step": 416 |
| }, |
| { |
| "epoch": 1.7302904564315353, |
| "grad_norm": 0.7798660236301157, |
| "learning_rate": 4.420453283535597e-07, |
| "loss": 0.0851, |
| "step": 417 |
| }, |
| { |
| "epoch": 1.7344398340248963, |
| "grad_norm": 0.7189681188334966, |
| "learning_rate": 4.287448941863692e-07, |
| "loss": 0.0714, |
| "step": 418 |
| }, |
| { |
| "epoch": 1.7385892116182573, |
| "grad_norm": 0.6926391079205091, |
| "learning_rate": 4.1563865571050243e-07, |
| "loss": 0.0597, |
| "step": 419 |
| }, |
| { |
| "epoch": 1.7427385892116183, |
| "grad_norm": 0.7673967245329002, |
| "learning_rate": 4.0272716970412516e-07, |
| "loss": 0.0846, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.7468879668049793, |
| "grad_norm": 0.7922534453437592, |
| "learning_rate": 3.9001098467194907e-07, |
| "loss": 0.0849, |
| "step": 421 |
| }, |
| { |
| "epoch": 1.7510373443983402, |
| "grad_norm": 0.7766499519398827, |
| "learning_rate": 3.7749064082191976e-07, |
| "loss": 0.0792, |
| "step": 422 |
| }, |
| { |
| "epoch": 1.7551867219917012, |
| "grad_norm": 0.7624109782606755, |
| "learning_rate": 3.6516667004227904e-07, |
| "loss": 0.0748, |
| "step": 423 |
| }, |
| { |
| "epoch": 1.7593360995850622, |
| "grad_norm": 0.8028939023287873, |
| "learning_rate": 3.53039595878959e-07, |
| "loss": 0.0858, |
| "step": 424 |
| }, |
| { |
| "epoch": 1.7634854771784232, |
| "grad_norm": 0.7305663120522499, |
| "learning_rate": 3.4110993351334944e-07, |
| "loss": 0.0693, |
| "step": 425 |
| }, |
| { |
| "epoch": 1.7676348547717842, |
| "grad_norm": 0.7928883317130192, |
| "learning_rate": 3.2937818974040637e-07, |
| "loss": 0.0824, |
| "step": 426 |
| }, |
| { |
| "epoch": 1.7717842323651452, |
| "grad_norm": 0.7302477597530965, |
| "learning_rate": 3.178448629471226e-07, |
| "loss": 0.0745, |
| "step": 427 |
| }, |
| { |
| "epoch": 1.7759336099585061, |
| "grad_norm": 0.84724113691283, |
| "learning_rate": 3.0651044309136016e-07, |
| "loss": 0.0801, |
| "step": 428 |
| }, |
| { |
| "epoch": 1.7800829875518671, |
| "grad_norm": 0.8237508221533169, |
| "learning_rate": 2.953754116810287e-07, |
| "loss": 0.1089, |
| "step": 429 |
| }, |
| { |
| "epoch": 1.784232365145228, |
| "grad_norm": 0.7556669232924229, |
| "learning_rate": 2.844402417536374e-07, |
| "loss": 0.0667, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.788381742738589, |
| "grad_norm": 0.772080989183171, |
| "learning_rate": 2.737053978561943e-07, |
| "loss": 0.0838, |
| "step": 431 |
| }, |
| { |
| "epoch": 1.79253112033195, |
| "grad_norm": 0.7102656535327171, |
| "learning_rate": 2.631713360254734e-07, |
| "loss": 0.0737, |
| "step": 432 |
| }, |
| { |
| "epoch": 1.796680497925311, |
| "grad_norm": 0.788052093799855, |
| "learning_rate": 2.5283850376864206e-07, |
| "loss": 0.0838, |
| "step": 433 |
| }, |
| { |
| "epoch": 1.8008298755186722, |
| "grad_norm": 0.7658472127903286, |
| "learning_rate": 2.4270734004424643e-07, |
| "loss": 0.0895, |
| "step": 434 |
| }, |
| { |
| "epoch": 1.8049792531120332, |
| "grad_norm": 0.67209631879037, |
| "learning_rate": 2.3277827524356976e-07, |
| "loss": 0.0606, |
| "step": 435 |
| }, |
| { |
| "epoch": 1.8091286307053942, |
| "grad_norm": 0.7565388407521139, |
| "learning_rate": 2.2305173117234236e-07, |
| "loss": 0.084, |
| "step": 436 |
| }, |
| { |
| "epoch": 1.8132780082987552, |
| "grad_norm": 0.7492152065844847, |
| "learning_rate": 2.1352812103282715e-07, |
| "loss": 0.0791, |
| "step": 437 |
| }, |
| { |
| "epoch": 1.8174273858921162, |
| "grad_norm": 0.7805607963350516, |
| "learning_rate": 2.042078494062616e-07, |
| "loss": 0.0771, |
| "step": 438 |
| }, |
| { |
| "epoch": 1.8215767634854771, |
| "grad_norm": 0.7172485874763758, |
| "learning_rate": 1.9509131223567623e-07, |
| "loss": 0.0719, |
| "step": 439 |
| }, |
| { |
| "epoch": 1.8257261410788381, |
| "grad_norm": 0.8259753492038978, |
| "learning_rate": 1.861788968090683e-07, |
| "loss": 0.0855, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.8298755186721993, |
| "grad_norm": 0.731434220061269, |
| "learning_rate": 1.7747098174295208e-07, |
| "loss": 0.072, |
| "step": 441 |
| }, |
| { |
| "epoch": 1.8340248962655603, |
| "grad_norm": 0.8070175019781584, |
| "learning_rate": 1.68967936966275e-07, |
| "loss": 0.0881, |
| "step": 442 |
| }, |
| { |
| "epoch": 1.8381742738589213, |
| "grad_norm": 0.7425563064212982, |
| "learning_rate": 1.606701237046998e-07, |
| "loss": 0.0679, |
| "step": 443 |
| }, |
| { |
| "epoch": 1.8423236514522823, |
| "grad_norm": 0.7479820412933463, |
| "learning_rate": 1.5257789446526172e-07, |
| "loss": 0.0764, |
| "step": 444 |
| }, |
| { |
| "epoch": 1.8464730290456433, |
| "grad_norm": 0.736134548408696, |
| "learning_rate": 1.4469159302139157e-07, |
| "loss": 0.0735, |
| "step": 445 |
| }, |
| { |
| "epoch": 1.8506224066390042, |
| "grad_norm": 0.7885288652580238, |
| "learning_rate": 1.3701155439831249e-07, |
| "loss": 0.0891, |
| "step": 446 |
| }, |
| { |
| "epoch": 1.8547717842323652, |
| "grad_norm": 0.7169793781256871, |
| "learning_rate": 1.295381048588068e-07, |
| "loss": 0.0735, |
| "step": 447 |
| }, |
| { |
| "epoch": 1.8589211618257262, |
| "grad_norm": 0.7419465114699466, |
| "learning_rate": 1.2227156188935552e-07, |
| "loss": 0.0767, |
| "step": 448 |
| }, |
| { |
| "epoch": 1.8630705394190872, |
| "grad_norm": 0.8529136766099726, |
| "learning_rate": 1.1521223418665295e-07, |
| "loss": 0.1095, |
| "step": 449 |
| }, |
| { |
| "epoch": 1.8672199170124482, |
| "grad_norm": 0.6673541325500725, |
| "learning_rate": 1.0836042164448945e-07, |
| "loss": 0.0669, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.8713692946058091, |
| "grad_norm": 0.7662725160062882, |
| "learning_rate": 1.017164153410144e-07, |
| "loss": 0.1, |
| "step": 451 |
| }, |
| { |
| "epoch": 1.8755186721991701, |
| "grad_norm": 0.7503369715965427, |
| "learning_rate": 9.528049752636714e-08, |
| "loss": 0.072, |
| "step": 452 |
| }, |
| { |
| "epoch": 1.879668049792531, |
| "grad_norm": 0.8105258662491354, |
| "learning_rate": 8.905294161069111e-08, |
| "loss": 0.1024, |
| "step": 453 |
| }, |
| { |
| "epoch": 1.883817427385892, |
| "grad_norm": 0.7200030691605961, |
| "learning_rate": 8.303401215251583e-08, |
| "loss": 0.0709, |
| "step": 454 |
| }, |
| { |
| "epoch": 1.887966804979253, |
| "grad_norm": 0.7279970331056289, |
| "learning_rate": 7.722396484751705e-08, |
| "loss": 0.0691, |
| "step": 455 |
| }, |
| { |
| "epoch": 1.892116182572614, |
| "grad_norm": 0.8032992136998793, |
| "learning_rate": 7.16230465176565e-08, |
| "loss": 0.0866, |
| "step": 456 |
| }, |
| { |
| "epoch": 1.896265560165975, |
| "grad_norm": 0.7754511619306107, |
| "learning_rate": 6.623149510069593e-08, |
| "loss": 0.0774, |
| "step": 457 |
| }, |
| { |
| "epoch": 1.900414937759336, |
| "grad_norm": 0.8395976573342571, |
| "learning_rate": 6.104953964008897e-08, |
| "loss": 0.0725, |
| "step": 458 |
| }, |
| { |
| "epoch": 1.904564315352697, |
| "grad_norm": 0.732494745176438, |
| "learning_rate": 5.6077400275248996e-08, |
| "loss": 0.0777, |
| "step": 459 |
| }, |
| { |
| "epoch": 1.908713692946058, |
| "grad_norm": 0.7307232915987477, |
| "learning_rate": 5.1315288232201e-08, |
| "loss": 0.073, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.912863070539419, |
| "grad_norm": 0.7289477709176224, |
| "learning_rate": 4.6763405814604926e-08, |
| "loss": 0.0668, |
| "step": 461 |
| }, |
| { |
| "epoch": 1.91701244813278, |
| "grad_norm": 0.6542938168620738, |
| "learning_rate": 4.2421946395164174e-08, |
| "loss": 0.0593, |
| "step": 462 |
| }, |
| { |
| "epoch": 1.921161825726141, |
| "grad_norm": 0.7217186844568827, |
| "learning_rate": 3.829109440740719e-08, |
| "loss": 0.0741, |
| "step": 463 |
| }, |
| { |
| "epoch": 1.9253112033195021, |
| "grad_norm": 0.7521600164181688, |
| "learning_rate": 3.437102533785541e-08, |
| "loss": 0.0815, |
| "step": 464 |
| }, |
| { |
| "epoch": 1.929460580912863, |
| "grad_norm": 0.7159778531199582, |
| "learning_rate": 3.066190571856864e-08, |
| "loss": 0.0721, |
| "step": 465 |
| }, |
| { |
| "epoch": 1.933609958506224, |
| "grad_norm": 0.7659356632299632, |
| "learning_rate": 2.7163893120066288e-08, |
| "loss": 0.0923, |
| "step": 466 |
| }, |
| { |
| "epoch": 1.937759336099585, |
| "grad_norm": 0.7560583950564549, |
| "learning_rate": 2.3877136144638823e-08, |
| "loss": 0.0981, |
| "step": 467 |
| }, |
| { |
| "epoch": 1.941908713692946, |
| "grad_norm": 0.7329659724004767, |
| "learning_rate": 2.0801774420031172e-08, |
| "loss": 0.0821, |
| "step": 468 |
| }, |
| { |
| "epoch": 1.946058091286307, |
| "grad_norm": 0.785189771179115, |
| "learning_rate": 1.793793859351245e-08, |
| "loss": 0.0907, |
| "step": 469 |
| }, |
| { |
| "epoch": 1.950207468879668, |
| "grad_norm": 0.7073552301072518, |
| "learning_rate": 1.5285750326325953e-08, |
| "loss": 0.0702, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.9543568464730292, |
| "grad_norm": 0.7118012286048271, |
| "learning_rate": 1.284532228851998e-08, |
| "loss": 0.0846, |
| "step": 471 |
| }, |
| { |
| "epoch": 1.9585062240663902, |
| "grad_norm": 0.7399820417525123, |
| "learning_rate": 1.0616758154161633e-08, |
| "loss": 0.067, |
| "step": 472 |
| }, |
| { |
| "epoch": 1.9626556016597512, |
| "grad_norm": 0.727348534517167, |
| "learning_rate": 8.600152596933142e-09, |
| "loss": 0.0691, |
| "step": 473 |
| }, |
| { |
| "epoch": 1.9668049792531122, |
| "grad_norm": 0.7454470988451566, |
| "learning_rate": 6.7955912861095155e-09, |
| "loss": 0.0885, |
| "step": 474 |
| }, |
| { |
| "epoch": 1.9709543568464731, |
| "grad_norm": 0.7525549770710052, |
| "learning_rate": 5.203150882918673e-09, |
| "loss": 0.0892, |
| "step": 475 |
| }, |
| { |
| "epoch": 1.9751037344398341, |
| "grad_norm": 0.8218857035464914, |
| "learning_rate": 3.822899037286276e-09, |
| "loss": 0.0975, |
| "step": 476 |
| }, |
| { |
| "epoch": 1.979253112033195, |
| "grad_norm": 0.7338075673837552, |
| "learning_rate": 2.654894384959694e-09, |
| "loss": 0.0709, |
| "step": 477 |
| }, |
| { |
| "epoch": 1.983402489626556, |
| "grad_norm": 0.7806915747584687, |
| "learning_rate": 1.6991865450188827e-09, |
| "loss": 0.091, |
| "step": 478 |
| }, |
| { |
| "epoch": 1.987551867219917, |
| "grad_norm": 0.700019407551831, |
| "learning_rate": 9.558161177669612e-10, |
| "loss": 0.0738, |
| "step": 479 |
| }, |
| { |
| "epoch": 1.991701244813278, |
| "grad_norm": 0.7540815180352235, |
| "learning_rate": 4.2481468300603625e-10, |
| "loss": 0.0811, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.995850622406639, |
| "grad_norm": 0.7372484737179315, |
| "learning_rate": 1.0620479869771772e-10, |
| "loss": 0.0744, |
| "step": 481 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.7894507333442309, |
| "learning_rate": 0.0, |
| "loss": 0.0925, |
| "step": 482 |
| }, |
| { |
| "epoch": 2.0, |
| "step": 482, |
| "total_flos": 16593740365824.0, |
| "train_loss": 0.13330045866459236, |
| "train_runtime": 872.8294, |
| "train_samples_per_second": 4.418, |
| "train_steps_per_second": 0.552 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 482, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 5000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 16593740365824.0, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |