| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.9724310776942353, | |
| "eval_steps": 500, | |
| "global_step": 147, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.020050125313283207, | |
| "grad_norm": 6.100090649437154, | |
| "learning_rate": 5.333333333333334e-06, | |
| "loss": 1.0088, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.040100250626566414, | |
| "grad_norm": 6.165069480911681, | |
| "learning_rate": 1.0666666666666667e-05, | |
| "loss": 1.0202, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.06015037593984962, | |
| "grad_norm": 4.472412116195948, | |
| "learning_rate": 1.6000000000000003e-05, | |
| "loss": 0.9496, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.08020050125313283, | |
| "grad_norm": 4.963164130948997, | |
| "learning_rate": 2.1333333333333335e-05, | |
| "loss": 0.9406, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.10025062656641603, | |
| "grad_norm": 4.041819143127636, | |
| "learning_rate": 2.6666666666666667e-05, | |
| "loss": 0.8794, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.12030075187969924, | |
| "grad_norm": 3.719380540370943, | |
| "learning_rate": 3.2000000000000005e-05, | |
| "loss": 0.9034, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.14035087719298245, | |
| "grad_norm": 2.3309985890149765, | |
| "learning_rate": 3.733333333333334e-05, | |
| "loss": 0.8102, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.16040100250626566, | |
| "grad_norm": 2.2467101719324454, | |
| "learning_rate": 4.266666666666667e-05, | |
| "loss": 0.786, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.18045112781954886, | |
| "grad_norm": 2.8539093369835227, | |
| "learning_rate": 4.8e-05, | |
| "loss": 0.7531, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.20050125313283207, | |
| "grad_norm": 2.0083048527276404, | |
| "learning_rate": 5.333333333333333e-05, | |
| "loss": 0.7468, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.22055137844611528, | |
| "grad_norm": 2.165125600420533, | |
| "learning_rate": 5.8666666666666665e-05, | |
| "loss": 0.7391, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.24060150375939848, | |
| "grad_norm": 2.122858303676939, | |
| "learning_rate": 6.400000000000001e-05, | |
| "loss": 0.7233, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.2606516290726817, | |
| "grad_norm": 2.2789276233273843, | |
| "learning_rate": 6.933333333333334e-05, | |
| "loss": 0.7286, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.2807017543859649, | |
| "grad_norm": 2.1616002933600393, | |
| "learning_rate": 7.466666666666667e-05, | |
| "loss": 0.7199, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.3007518796992481, | |
| "grad_norm": 3.2259560404881986, | |
| "learning_rate": 8e-05, | |
| "loss": 0.7223, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.3208020050125313, | |
| "grad_norm": 1.5369895434247212, | |
| "learning_rate": 7.998867178772517e-05, | |
| "loss": 0.7063, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.3408521303258145, | |
| "grad_norm": 3.28272311228189, | |
| "learning_rate": 7.995469356732033e-05, | |
| "loss": 0.7174, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.3609022556390977, | |
| "grad_norm": 2.1611599200246974, | |
| "learning_rate": 7.989808458441014e-05, | |
| "loss": 0.6981, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.38095238095238093, | |
| "grad_norm": 1.9324451852274291, | |
| "learning_rate": 7.981887690292339e-05, | |
| "loss": 0.6943, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.40100250626566414, | |
| "grad_norm": 18.248869277300834, | |
| "learning_rate": 7.971711538693153e-05, | |
| "loss": 0.6998, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.42105263157894735, | |
| "grad_norm": 2.181882821404686, | |
| "learning_rate": 7.959285767523732e-05, | |
| "loss": 0.7193, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.44110275689223055, | |
| "grad_norm": 1.2480300021039636, | |
| "learning_rate": 7.944617414872747e-05, | |
| "loss": 0.6843, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.46115288220551376, | |
| "grad_norm": 2.3412336103636693, | |
| "learning_rate": 7.927714789050826e-05, | |
| "loss": 0.7089, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.48120300751879697, | |
| "grad_norm": 1.9642689883848843, | |
| "learning_rate": 7.908587463884638e-05, | |
| "loss": 0.6787, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.5012531328320802, | |
| "grad_norm": 1.3434579333848315, | |
| "learning_rate": 7.887246273294167e-05, | |
| "loss": 0.6773, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.5213032581453634, | |
| "grad_norm": 1.440602627293297, | |
| "learning_rate": 7.863703305156273e-05, | |
| "loss": 0.6792, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.5413533834586466, | |
| "grad_norm": 1.3175913095028617, | |
| "learning_rate": 7.837971894457991e-05, | |
| "loss": 0.6654, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.5614035087719298, | |
| "grad_norm": 1.3198703652450279, | |
| "learning_rate": 7.810066615743443e-05, | |
| "loss": 0.6524, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.581453634085213, | |
| "grad_norm": 0.781557000061513, | |
| "learning_rate": 7.780003274858674e-05, | |
| "loss": 0.6573, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.6015037593984962, | |
| "grad_norm": 1.1674794127969026, | |
| "learning_rate": 7.747798899999048e-05, | |
| "loss": 0.6664, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.6215538847117794, | |
| "grad_norm": 1.623871556077205, | |
| "learning_rate": 7.71347173206429e-05, | |
| "loss": 0.6722, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.6416040100250626, | |
| "grad_norm": 1.1713900589893373, | |
| "learning_rate": 7.677041214326663e-05, | |
| "loss": 0.6403, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.6616541353383458, | |
| "grad_norm": 1.1207108511375592, | |
| "learning_rate": 7.638527981418075e-05, | |
| "loss": 0.6427, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.681704260651629, | |
| "grad_norm": 1.6390101502728163, | |
| "learning_rate": 7.597953847642413e-05, | |
| "loss": 0.6451, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.7017543859649122, | |
| "grad_norm": 0.9703477640239104, | |
| "learning_rate": 7.555341794619695e-05, | |
| "loss": 0.6371, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.7218045112781954, | |
| "grad_norm": 1.888557130236824, | |
| "learning_rate": 7.510715958269023e-05, | |
| "loss": 0.6385, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.7418546365914787, | |
| "grad_norm": 1.4709265925596289, | |
| "learning_rate": 7.464101615137756e-05, | |
| "loss": 0.6468, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.7619047619047619, | |
| "grad_norm": 1.3340842858015245, | |
| "learning_rate": 7.415525168084593e-05, | |
| "loss": 0.636, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.7819548872180451, | |
| "grad_norm": 1.2331491459930695, | |
| "learning_rate": 7.365014131324725e-05, | |
| "loss": 0.6423, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.8020050125313283, | |
| "grad_norm": 1.0459942770103978, | |
| "learning_rate": 7.312597114845483e-05, | |
| "loss": 0.6405, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.8220551378446115, | |
| "grad_norm": 0.9742677496779505, | |
| "learning_rate": 7.258303808201343e-05, | |
| "loss": 0.619, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.8421052631578947, | |
| "grad_norm": 0.9126761857775636, | |
| "learning_rate": 7.202164963697442e-05, | |
| "loss": 0.6237, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.8621553884711779, | |
| "grad_norm": 1.1034224988906396, | |
| "learning_rate": 7.144212378971151e-05, | |
| "loss": 0.6126, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.8822055137844611, | |
| "grad_norm": 0.9050299139188852, | |
| "learning_rate": 7.084478878981552e-05, | |
| "loss": 0.6199, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.9022556390977443, | |
| "grad_norm": 0.9484346415934313, | |
| "learning_rate": 7.022998297417034e-05, | |
| "loss": 0.6242, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.9223057644110275, | |
| "grad_norm": 0.6761984153684619, | |
| "learning_rate": 6.959805457531536e-05, | |
| "loss": 0.6271, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.9423558897243107, | |
| "grad_norm": 0.7734163089859195, | |
| "learning_rate": 6.89493615242028e-05, | |
| "loss": 0.6057, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.9624060150375939, | |
| "grad_norm": 0.7126127374462361, | |
| "learning_rate": 6.828427124746191e-05, | |
| "loss": 0.6143, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.9824561403508771, | |
| "grad_norm": 0.7040138292374798, | |
| "learning_rate": 6.760316045928449e-05, | |
| "loss": 0.5971, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 1.0150375939849625, | |
| "grad_norm": 1.0358340840041869, | |
| "learning_rate": 6.690641494805011e-05, | |
| "loss": 1.0623, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 1.0350877192982457, | |
| "grad_norm": 0.975942175036343, | |
| "learning_rate": 6.619442935781141e-05, | |
| "loss": 0.5949, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 1.055137844611529, | |
| "grad_norm": 1.3817333278949258, | |
| "learning_rate": 6.546760696476354e-05, | |
| "loss": 0.5965, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 1.0751879699248121, | |
| "grad_norm": 0.5594198623207998, | |
| "learning_rate": 6.472635944882421e-05, | |
| "loss": 0.5817, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 1.0952380952380953, | |
| "grad_norm": 1.3502604906380835, | |
| "learning_rate": 6.397110666045388e-05, | |
| "loss": 0.5936, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 1.1152882205513786, | |
| "grad_norm": 0.662382371248644, | |
| "learning_rate": 6.320227638284793e-05, | |
| "loss": 0.597, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 1.1353383458646618, | |
| "grad_norm": 1.0092828128239282, | |
| "learning_rate": 6.242030408963576e-05, | |
| "loss": 0.5895, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 1.155388471177945, | |
| "grad_norm": 0.8148251028605993, | |
| "learning_rate": 6.162563269822391e-05, | |
| "loss": 0.5796, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 1.1754385964912282, | |
| "grad_norm": 0.7504641683568708, | |
| "learning_rate": 6.0818712318922894e-05, | |
| "loss": 0.5756, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 1.1954887218045114, | |
| "grad_norm": 0.7075215261142646, | |
| "learning_rate": 6.000000000000001e-05, | |
| "loss": 0.5899, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 1.2155388471177946, | |
| "grad_norm": 0.616627011160271, | |
| "learning_rate": 5.916995946880228e-05, | |
| "loss": 0.5756, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 1.2355889724310778, | |
| "grad_norm": 0.6224665456566449, | |
| "learning_rate": 5.832906086909642e-05, | |
| "loss": 0.5717, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 1.255639097744361, | |
| "grad_norm": 0.5410026994188678, | |
| "learning_rate": 5.747778049477438e-05, | |
| "loss": 0.5719, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 1.2756892230576442, | |
| "grad_norm": 0.5138816389771951, | |
| "learning_rate": 5.661660052007547e-05, | |
| "loss": 0.5767, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 1.2957393483709274, | |
| "grad_norm": 0.538651466379072, | |
| "learning_rate": 5.574600872647766e-05, | |
| "loss": 0.5754, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 1.3157894736842106, | |
| "grad_norm": 0.39832101327162095, | |
| "learning_rate": 5.48664982264131e-05, | |
| "loss": 0.5806, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 1.3358395989974938, | |
| "grad_norm": 0.46665463498566045, | |
| "learning_rate": 5.397856718396394e-05, | |
| "loss": 0.5622, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 1.355889724310777, | |
| "grad_norm": 0.33596753910393834, | |
| "learning_rate": 5.3082718532696874e-05, | |
| "loss": 0.5635, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 1.3759398496240602, | |
| "grad_norm": 0.36330721064185245, | |
| "learning_rate": 5.217945969079629e-05, | |
| "loss": 0.5728, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 1.3959899749373434, | |
| "grad_norm": 0.31668248946183414, | |
| "learning_rate": 5.1269302273657195e-05, | |
| "loss": 0.5829, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 1.4160401002506267, | |
| "grad_norm": 0.3108858835369522, | |
| "learning_rate": 5.0352761804100835e-05, | |
| "loss": 0.5797, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 1.4360902255639099, | |
| "grad_norm": 0.37076038809913947, | |
| "learning_rate": 4.94303574203771e-05, | |
| "loss": 0.5678, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 1.456140350877193, | |
| "grad_norm": 0.27669921803024705, | |
| "learning_rate": 4.8502611582119065e-05, | |
| "loss": 0.5644, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 1.4761904761904763, | |
| "grad_norm": 0.3206960394723747, | |
| "learning_rate": 4.7570049774416414e-05, | |
| "loss": 0.5696, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 1.4962406015037595, | |
| "grad_norm": 0.35836009606291436, | |
| "learning_rate": 4.663320021017497e-05, | |
| "loss": 0.5574, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 1.5162907268170427, | |
| "grad_norm": 0.26025491223433994, | |
| "learning_rate": 4.5692593530931416e-05, | |
| "loss": 0.5683, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 1.536340852130326, | |
| "grad_norm": 0.25457026883587025, | |
| "learning_rate": 4.474876250629221e-05, | |
| "loss": 0.565, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 1.556390977443609, | |
| "grad_norm": 0.32769122481932667, | |
| "learning_rate": 4.38022417321673e-05, | |
| "loss": 0.5641, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 1.5764411027568923, | |
| "grad_norm": 0.22460593199796938, | |
| "learning_rate": 4.2853567327969296e-05, | |
| "loss": 0.557, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 1.5964912280701755, | |
| "grad_norm": 0.21753199007833202, | |
| "learning_rate": 4.19032766329497e-05, | |
| "loss": 0.5578, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 1.6165413533834587, | |
| "grad_norm": 0.23083331300054297, | |
| "learning_rate": 4.0951907901844296e-05, | |
| "loss": 0.5622, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 1.636591478696742, | |
| "grad_norm": 0.2564157643733938, | |
| "learning_rate": 4e-05, | |
| "loss": 0.5657, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 1.6566416040100251, | |
| "grad_norm": 0.27813133557097336, | |
| "learning_rate": 3.904809209815571e-05, | |
| "loss": 0.5603, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 1.6766917293233083, | |
| "grad_norm": 0.2038258932695245, | |
| "learning_rate": 3.809672336705031e-05, | |
| "loss": 0.5572, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 1.6967418546365916, | |
| "grad_norm": 0.29155489774909854, | |
| "learning_rate": 3.714643267203071e-05, | |
| "loss": 0.5544, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 1.7167919799498748, | |
| "grad_norm": 0.16516627488221486, | |
| "learning_rate": 3.6197758267832705e-05, | |
| "loss": 0.5584, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 1.736842105263158, | |
| "grad_norm": 0.29249607068570543, | |
| "learning_rate": 3.5251237493707804e-05, | |
| "loss": 0.5677, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 1.7568922305764412, | |
| "grad_norm": 0.19265408478071472, | |
| "learning_rate": 3.4307406469068604e-05, | |
| "loss": 0.5632, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 1.7769423558897244, | |
| "grad_norm": 0.21254433897953356, | |
| "learning_rate": 3.3366799789825044e-05, | |
| "loss": 0.5512, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 1.7969924812030076, | |
| "grad_norm": 0.2170294921675785, | |
| "learning_rate": 3.2429950225583606e-05, | |
| "loss": 0.5493, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 1.8170426065162908, | |
| "grad_norm": 0.17517457106296594, | |
| "learning_rate": 3.1497388417880935e-05, | |
| "loss": 0.5522, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.837092731829574, | |
| "grad_norm": 0.21184436677480925, | |
| "learning_rate": 3.0569642579622905e-05, | |
| "loss": 0.5533, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 1.8571428571428572, | |
| "grad_norm": 0.1566054507779003, | |
| "learning_rate": 2.9647238195899168e-05, | |
| "loss": 0.538, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 1.8771929824561404, | |
| "grad_norm": 0.19291518917014827, | |
| "learning_rate": 2.873069772634281e-05, | |
| "loss": 0.5613, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 1.8972431077694236, | |
| "grad_norm": 0.14403155629811726, | |
| "learning_rate": 2.7820540309203728e-05, | |
| "loss": 0.5561, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 1.9172932330827068, | |
| "grad_norm": 0.18577775134211605, | |
| "learning_rate": 2.691728146730314e-05, | |
| "loss": 0.5619, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 1.93734335839599, | |
| "grad_norm": 0.17655395819783837, | |
| "learning_rate": 2.6021432816036073e-05, | |
| "loss": 0.557, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 1.9573934837092732, | |
| "grad_norm": 0.15678742149650277, | |
| "learning_rate": 2.5133501773586905e-05, | |
| "loss": 0.55, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 1.9774436090225564, | |
| "grad_norm": 0.147011080540491, | |
| "learning_rate": 2.425399127352235e-05, | |
| "loss": 0.5615, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 2.0100250626566414, | |
| "grad_norm": 0.2896327588272941, | |
| "learning_rate": 2.338339947992455e-05, | |
| "loss": 0.984, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 2.030075187969925, | |
| "grad_norm": 0.17749955956373084, | |
| "learning_rate": 2.2522219505225627e-05, | |
| "loss": 0.5472, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 2.050125313283208, | |
| "grad_norm": 0.18960883016778624, | |
| "learning_rate": 2.1670939130903585e-05, | |
| "loss": 0.5246, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 2.0701754385964914, | |
| "grad_norm": 0.19580310466277323, | |
| "learning_rate": 2.0830040531197744e-05, | |
| "loss": 0.5333, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 2.090225563909774, | |
| "grad_norm": 0.17002438913970971, | |
| "learning_rate": 2.0000000000000012e-05, | |
| "loss": 0.5232, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 2.110275689223058, | |
| "grad_norm": 0.23635746246402964, | |
| "learning_rate": 1.9181287681077116e-05, | |
| "loss": 0.5299, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 2.1303258145363406, | |
| "grad_norm": 0.15533319903966208, | |
| "learning_rate": 1.8374367301776112e-05, | |
| "loss": 0.5193, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 2.1503759398496243, | |
| "grad_norm": 0.2082307249817901, | |
| "learning_rate": 1.7579695910364235e-05, | |
| "loss": 0.5342, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 2.170426065162907, | |
| "grad_norm": 0.14435259557657618, | |
| "learning_rate": 1.679772361715208e-05, | |
| "loss": 0.5361, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 2.1904761904761907, | |
| "grad_norm": 0.20712238757176837, | |
| "learning_rate": 1.6028893339546122e-05, | |
| "loss": 0.5331, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 2.2105263157894735, | |
| "grad_norm": 0.13522587328433971, | |
| "learning_rate": 1.527364055117579e-05, | |
| "loss": 0.5329, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 2.230576441102757, | |
| "grad_norm": 0.1593070945975799, | |
| "learning_rate": 1.4532393035236477e-05, | |
| "loss": 0.5323, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 2.25062656641604, | |
| "grad_norm": 0.1461967123983933, | |
| "learning_rate": 1.3805570642188602e-05, | |
| "loss": 0.5162, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 2.2706766917293235, | |
| "grad_norm": 0.11139864548024211, | |
| "learning_rate": 1.30935850519499e-05, | |
| "loss": 0.5258, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 2.2907268170426063, | |
| "grad_norm": 0.1459949648258527, | |
| "learning_rate": 1.2396839540715528e-05, | |
| "loss": 0.5249, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 2.31077694235589, | |
| "grad_norm": 0.1134072731915256, | |
| "learning_rate": 1.1715728752538103e-05, | |
| "loss": 0.5283, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 2.3308270676691727, | |
| "grad_norm": 0.10061797227153219, | |
| "learning_rate": 1.1050638475797193e-05, | |
| "loss": 0.5264, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 2.3508771929824563, | |
| "grad_norm": 0.11967704006720019, | |
| "learning_rate": 1.0401945424684653e-05, | |
| "loss": 0.5258, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 2.370927318295739, | |
| "grad_norm": 0.10081258865949869, | |
| "learning_rate": 9.770017025829675e-06, | |
| "loss": 0.5125, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 2.3909774436090228, | |
| "grad_norm": 0.0910736424605034, | |
| "learning_rate": 9.155211210184495e-06, | |
| "loss": 0.5215, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 2.4110275689223055, | |
| "grad_norm": 0.10320752546874508, | |
| "learning_rate": 8.55787621028851e-06, | |
| "loss": 0.5162, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 2.431077694235589, | |
| "grad_norm": 0.09217800293365436, | |
| "learning_rate": 7.978350363025588e-06, | |
| "loss": 0.5343, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 2.451127819548872, | |
| "grad_norm": 0.08490382451555073, | |
| "learning_rate": 7.416961917986572e-06, | |
| "loss": 0.5219, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 2.4711779448621556, | |
| "grad_norm": 0.08656662969137564, | |
| "learning_rate": 6.874028851545174e-06, | |
| "loss": 0.5212, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 2.4912280701754383, | |
| "grad_norm": 0.09050314726798651, | |
| "learning_rate": 6.349858686752748e-06, | |
| "loss": 0.5328, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 2.511278195488722, | |
| "grad_norm": 0.0824254065764698, | |
| "learning_rate": 5.8447483191540784e-06, | |
| "loss": 0.5282, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 2.5313283208020048, | |
| "grad_norm": 0.09004753737634522, | |
| "learning_rate": 5.358983848622452e-06, | |
| "loss": 0.5291, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 2.5513784461152884, | |
| "grad_norm": 0.08491851685766005, | |
| "learning_rate": 4.892840417309775e-06, | |
| "loss": 0.5174, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 2.571428571428571, | |
| "grad_norm": 0.08667374099676277, | |
| "learning_rate": 4.446582053803066e-06, | |
| "loss": 0.5269, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 2.591478696741855, | |
| "grad_norm": 0.08634185611814611, | |
| "learning_rate": 4.020461523575873e-06, | |
| "loss": 0.5404, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 2.6115288220551376, | |
| "grad_norm": 0.08142061930820212, | |
| "learning_rate": 3.6147201858192627e-06, | |
| "loss": 0.5297, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 2.6315789473684212, | |
| "grad_norm": 0.08234802409772482, | |
| "learning_rate": 3.2295878567333784e-06, | |
| "loss": 0.5347, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 2.651629072681704, | |
| "grad_norm": 0.0850700303547024, | |
| "learning_rate": 2.8652826793570975e-06, | |
| "loss": 0.5309, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 2.6716791979949877, | |
| "grad_norm": 0.08576766617100663, | |
| "learning_rate": 2.5220110000095366e-06, | |
| "loss": 0.529, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 2.6917293233082704, | |
| "grad_norm": 0.08621611962612956, | |
| "learning_rate": 2.199967251413262e-06, | |
| "loss": 0.526, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 2.711779448621554, | |
| "grad_norm": 0.07973648331165978, | |
| "learning_rate": 1.8993338425655805e-06, | |
| "loss": 0.5291, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 2.731829573934837, | |
| "grad_norm": 0.07607751613838545, | |
| "learning_rate": 1.6202810554201099e-06, | |
| "loss": 0.5287, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 2.7518796992481205, | |
| "grad_norm": 0.0764140719381241, | |
| "learning_rate": 1.3629669484372722e-06, | |
| "loss": 0.519, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 2.7719298245614032, | |
| "grad_norm": 0.07473144353146483, | |
| "learning_rate": 1.127537267058334e-06, | |
| "loss": 0.5299, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 2.791979949874687, | |
| "grad_norm": 0.07476164790510335, | |
| "learning_rate": 9.141253611536238e-07, | |
| "loss": 0.5335, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 2.8120300751879697, | |
| "grad_norm": 0.07472989997236351, | |
| "learning_rate": 7.228521094917318e-07, | |
| "loss": 0.5156, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 2.8320802005012533, | |
| "grad_norm": 0.07763611917758945, | |
| "learning_rate": 5.538258512725403e-07, | |
| "loss": 0.528, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 2.852130325814536, | |
| "grad_norm": 0.07864532159225761, | |
| "learning_rate": 4.0714232476269265e-07, | |
| "loss": 0.5205, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 2.8721804511278197, | |
| "grad_norm": 0.07157416039875991, | |
| "learning_rate": 2.8288461306846817e-07, | |
| "loss": 0.5251, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 2.8922305764411025, | |
| "grad_norm": 0.07874362826484563, | |
| "learning_rate": 1.8112309707661647e-07, | |
| "loss": 0.5326, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 2.912280701754386, | |
| "grad_norm": 0.07678251046065998, | |
| "learning_rate": 1.019154155898594e-07, | |
| "loss": 0.5325, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 2.932330827067669, | |
| "grad_norm": 0.07928853233638587, | |
| "learning_rate": 4.530643267968149e-08, | |
| "loss": 0.5283, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 2.9523809523809526, | |
| "grad_norm": 0.07049261679058248, | |
| "learning_rate": 1.1328212274839267e-08, | |
| "loss": 0.5307, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 2.9724310776942353, | |
| "grad_norm": 0.07518275904349948, | |
| "learning_rate": 0.0, | |
| "loss": 0.5333, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 2.9724310776942353, | |
| "step": 147, | |
| "total_flos": 3.782746824809382e+18, | |
| "train_loss": 0.6079183743924511, | |
| "train_runtime": 22677.177, | |
| "train_samples_per_second": 3.374, | |
| "train_steps_per_second": 0.006 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 147, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.782746824809382e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |