| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.6099885627144491, |
| "eval_steps": 500, |
| "global_step": 200, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": false, |
| "is_world_process_zero": false, |
| "log_history": [ |
| { |
| "epoch": 0.0, |
| "grad_norm": 26.258708272339597, |
| "learning_rate": 0.0001, |
| "loss": 6.0469, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 20.031528084144362, |
| "learning_rate": 0.0002, |
| "loss": 6.0524, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 9.323112879274246, |
| "learning_rate": 0.0003, |
| "loss": 6.0052, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 4.147754138762137, |
| "learning_rate": 0.0004, |
| "loss": 5.7581, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 1.4117904450662409, |
| "learning_rate": 0.0005, |
| "loss": 5.5959, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 1.450165632053372, |
| "learning_rate": 0.0006, |
| "loss": 5.3926, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 1.3379034522061206, |
| "learning_rate": 0.0007, |
| "loss": 5.2432, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 1.3149419664848485, |
| "learning_rate": 0.0008, |
| "loss": 4.9048, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 1.5265629850853388, |
| "learning_rate": 0.0009000000000000001, |
| "loss": 4.8456, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 1.2248009557702129, |
| "learning_rate": 0.001, |
| "loss": 4.4037, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 1.5753342093655753, |
| "learning_rate": 0.0009999754462587395, |
| "loss": 3.7762, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 1.0959671581261972, |
| "learning_rate": 0.0009999017874465026, |
| "loss": 3.5411, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 0.8383646382276924, |
| "learning_rate": 0.0009997790307976872, |
| "loss": 3.4374, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 0.6879126623626883, |
| "learning_rate": 0.0009996071883688333, |
| "loss": 3.3495, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 0.48057277440632934, |
| "learning_rate": 0.000999386277037439, |
| "loss": 3.3236, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 0.5193770648047042, |
| "learning_rate": 0.0009991163185003027, |
| "loss": 3.194, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 0.34046016814570407, |
| "learning_rate": 0.0009987973392713932, |
| "loss": 3.008, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 0.2969743503479429, |
| "learning_rate": 0.0009984293706792437, |
| "loss": 3.0682, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 0.22754131045377113, |
| "learning_rate": 0.0009980124488638772, |
| "loss": 2.92, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 0.15880784203685044, |
| "learning_rate": 0.000997546614773255, |
| "loss": 2.881, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 0.10804346892890891, |
| "learning_rate": 0.0009970319141592559, |
| "loss": 2.8386, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 0.0789962600177391, |
| "learning_rate": 0.0009964683975731828, |
| "loss": 2.8704, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 0.0678766718481922, |
| "learning_rate": 0.0009958561203607973, |
| "loss": 2.8326, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 0.15129601777209895, |
| "learning_rate": 0.000995195142656885, |
| "loss": 2.8444, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 0.23876681297628152, |
| "learning_rate": 0.0009944855293793475, |
| "loss": 2.8809, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 0.2458680275413572, |
| "learning_rate": 0.0009937273502228282, |
| "loss": 2.8308, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 0.2646415320413548, |
| "learning_rate": 0.0009929206796518663, |
| "loss": 2.8296, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 0.1855177204894958, |
| "learning_rate": 0.0009920655968935837, |
| "loss": 2.8302, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 0.1692458314027983, |
| "learning_rate": 0.0009911621859299041, |
| "loss": 2.7798, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 0.0887107703024984, |
| "learning_rate": 0.000990210535489303, |
| "loss": 2.8246, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 0.05971709937694414, |
| "learning_rate": 0.0009892107390380958, |
| "loss": 2.777, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 0.04203098841999937, |
| "learning_rate": 0.0009881628947712555, |
| "loss": 2.7673, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 0.03708904853213128, |
| "learning_rate": 0.0009870671056027706, |
| "loss": 2.7486, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 0.03528212659332326, |
| "learning_rate": 0.0009859234791555354, |
| "loss": 2.7886, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 0.03018922862288316, |
| "learning_rate": 0.000984732127750782, |
| "loss": 2.7327, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 0.03686815342528168, |
| "learning_rate": 0.0009834931683970467, |
| "loss": 2.7474, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 0.033172131047386, |
| "learning_rate": 0.0009822067227786794, |
| "loss": 2.7345, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 0.0373016703958135, |
| "learning_rate": 0.000980872917243891, |
| "loss": 2.7655, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 0.03688389598963858, |
| "learning_rate": 0.0009794918827923458, |
| "loss": 2.7393, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 0.038870114526809094, |
| "learning_rate": 0.000978063755062294, |
| "loss": 2.7611, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 0.04038306990198406, |
| "learning_rate": 0.0009765886743172511, |
| "loss": 2.7386, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 0.036670767168349196, |
| "learning_rate": 0.0009750667854322206, |
| "loss": 2.6912, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 0.03291379266422108, |
| "learning_rate": 0.0009734982378794661, |
| "loss": 2.7719, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 0.03542095069122837, |
| "learning_rate": 0.0009718831857138308, |
| "loss": 2.7095, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 0.03556844983258325, |
| "learning_rate": 0.0009702217875576068, |
| "loss": 2.7435, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 0.03705512589107, |
| "learning_rate": 0.0009685142065849555, |
| "loss": 2.713, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 0.027028911527830556, |
| "learning_rate": 0.0009667606105058828, |
| "loss": 2.6472, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 0.028600789964649966, |
| "learning_rate": 0.0009649611715497661, |
| "loss": 2.6931, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 0.029116613327395607, |
| "learning_rate": 0.0009631160664484398, |
| "loss": 2.7082, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 0.02557746875550266, |
| "learning_rate": 0.0009612254764188368, |
| "loss": 2.6707, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 0.0256723041758138, |
| "learning_rate": 0.0009592895871451908, |
| "loss": 2.6608, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 0.02295300029667766, |
| "learning_rate": 0.000957308588760799, |
| "loss": 2.6942, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 0.02203986352323152, |
| "learning_rate": 0.0009552826758293487, |
| "loss": 2.6441, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 0.02299784546740539, |
| "learning_rate": 0.0009532120473258075, |
| "loss": 2.6728, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 0.023913877734993542, |
| "learning_rate": 0.0009510969066168813, |
| "loss": 2.6924, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 0.020131800174103105, |
| "learning_rate": 0.0009489374614410414, |
| "loss": 2.6151, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 0.02106884604663375, |
| "learning_rate": 0.0009467339238881198, |
| "loss": 2.6413, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 0.02440845171243592, |
| "learning_rate": 0.0009444865103784803, |
| "loss": 2.6663, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 0.019024075041881073, |
| "learning_rate": 0.0009421954416417624, |
| "loss": 2.6063, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 0.019667612528268406, |
| "learning_rate": 0.0009398609426952018, |
| "loss": 2.6697, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 0.020324139148677475, |
| "learning_rate": 0.0009374832428215309, |
| "loss": 2.6739, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 0.02049609873784361, |
| "learning_rate": 0.00093506257554646, |
| "loss": 2.6115, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 0.022068793706639576, |
| "learning_rate": 0.0009325991786157404, |
| "loss": 2.6537, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 0.02119215165943166, |
| "learning_rate": 0.0009300932939718159, |
| "loss": 2.6668, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 0.020470851870879974, |
| "learning_rate": 0.000927545167730059, |
| "loss": 2.6495, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 0.020239015102427586, |
| "learning_rate": 0.0009249550501545996, |
| "loss": 2.6229, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 0.02006561036060577, |
| "learning_rate": 0.000922323195633745, |
| "loss": 2.6541, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.21, |
| "grad_norm": 0.020517458730459407, |
| "learning_rate": 0.0009196498626549943, |
| "loss": 2.6263, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.21, |
| "grad_norm": 0.020546751672942867, |
| "learning_rate": 0.0009169353137796533, |
| "loss": 2.6868, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.21, |
| "grad_norm": 0.020795862822058586, |
| "learning_rate": 0.0009141798156170446, |
| "loss": 2.6376, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 0.023603442107788076, |
| "learning_rate": 0.0009113836387983239, |
| "loss": 2.6679, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 0.020593236730693215, |
| "learning_rate": 0.0009085470579498995, |
| "loss": 2.628, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 0.0185506077207348, |
| "learning_rate": 0.0009056703516664606, |
| "loss": 2.6315, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.23, |
| "grad_norm": 0.02046637689680032, |
| "learning_rate": 0.0009027538024836141, |
| "loss": 2.6227, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.23, |
| "grad_norm": 0.018026412977487194, |
| "learning_rate": 0.0008997976968501361, |
| "loss": 2.6062, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.23, |
| "grad_norm": 0.017993838807838707, |
| "learning_rate": 0.000896802325099838, |
| "loss": 2.5869, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.23, |
| "grad_norm": 0.01897876413125471, |
| "learning_rate": 0.0008937679814230517, |
| "loss": 2.6046, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 0.019221228785349705, |
| "learning_rate": 0.000890694963837735, |
| "loss": 2.6163, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 0.01999062056670978, |
| "learning_rate": 0.0008875835741602029, |
| "loss": 2.6624, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 0.01688098771768354, |
| "learning_rate": 0.0008844341179754839, |
| "loss": 2.597, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 0.021679649830905732, |
| "learning_rate": 0.0008812469046073068, |
| "loss": 2.6252, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 0.01976703829526433, |
| "learning_rate": 0.0008780222470877213, |
| "loss": 2.6547, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 0.01721216059701167, |
| "learning_rate": 0.000874760462126353, |
| "loss": 2.5894, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.26, |
| "grad_norm": 0.017796757505650348, |
| "learning_rate": 0.0008714618700792976, |
| "loss": 2.6428, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.26, |
| "grad_norm": 0.017516085193709605, |
| "learning_rate": 0.0008681267949176579, |
| "loss": 2.6267, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.26, |
| "grad_norm": 0.019752354059471497, |
| "learning_rate": 0.0008647555641957244, |
| "loss": 2.6172, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.27, |
| "grad_norm": 0.01942107114850748, |
| "learning_rate": 0.0008613485090188043, |
| "loss": 2.6469, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.27, |
| "grad_norm": 0.021049275004733702, |
| "learning_rate": 0.000857905964010703, |
| "loss": 2.6337, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.27, |
| "grad_norm": 0.017375996512513512, |
| "learning_rate": 0.0008544282672808579, |
| "loss": 2.6066, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.27, |
| "grad_norm": 0.020054221841685173, |
| "learning_rate": 0.0008509157603911319, |
| "loss": 2.6293, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.28, |
| "grad_norm": 0.016754816692027007, |
| "learning_rate": 0.0008473687883222664, |
| "loss": 2.553, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.28, |
| "grad_norm": 0.018264517242335677, |
| "learning_rate": 0.0008437876994399991, |
| "loss": 2.582, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.28, |
| "grad_norm": 0.01738231752273472, |
| "learning_rate": 0.0008401728454608494, |
| "loss": 2.6246, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.29, |
| "grad_norm": 0.017582673986843284, |
| "learning_rate": 0.0008365245814175744, |
| "loss": 2.5377, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.29, |
| "grad_norm": 0.01841568278108112, |
| "learning_rate": 0.0008328432656242997, |
| "loss": 2.6107, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.29, |
| "grad_norm": 0.01806489585852763, |
| "learning_rate": 0.0008291292596413272, |
| "loss": 2.5388, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 0.01910768409990766, |
| "learning_rate": 0.0008253829282396245, |
| "loss": 2.6252, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 0.019034711943532324, |
| "learning_rate": 0.0008216046393649996, |
| "loss": 2.6193, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 0.020284737834854932, |
| "learning_rate": 0.0008177947641019621, |
| "loss": 2.6149, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 0.05158744985589759, |
| "learning_rate": 0.0008139536766372775, |
| "loss": 2.6072, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.31, |
| "grad_norm": 0.018922120101005344, |
| "learning_rate": 0.0008100817542232173, |
| "loss": 2.6032, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.31, |
| "grad_norm": 0.019044369144979888, |
| "learning_rate": 0.000806179377140506, |
| "loss": 2.5879, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.31, |
| "grad_norm": 0.017582902915047, |
| "learning_rate": 0.000802246928660972, |
| "loss": 2.6069, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 0.019578808618249007, |
| "learning_rate": 0.0007982847950099055, |
| "loss": 2.5965, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 0.018236739907291074, |
| "learning_rate": 0.0007942933653281245, |
| "loss": 2.5707, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 0.018258178535900983, |
| "learning_rate": 0.0007902730316337556, |
| "loss": 2.5382, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.33, |
| "grad_norm": 0.022391092471974905, |
| "learning_rate": 0.0007862241887837322, |
| "loss": 2.6118, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.33, |
| "grad_norm": 0.016821289155497087, |
| "learning_rate": 0.0007821472344350131, |
| "loss": 2.5829, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.33, |
| "grad_norm": 0.019386550472160386, |
| "learning_rate": 0.0007780425690055274, |
| "loss": 2.6255, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.34, |
| "grad_norm": 0.1772374509222317, |
| "learning_rate": 0.0007739105956348464, |
| "loss": 2.5617, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.34, |
| "grad_norm": 0.020123188408530047, |
| "learning_rate": 0.0007697517201445905, |
| "loss": 2.6127, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.34, |
| "grad_norm": 0.0217268258079, |
| "learning_rate": 0.0007655663509985707, |
| "loss": 2.5991, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.34, |
| "grad_norm": 0.018981113976254002, |
| "learning_rate": 0.0007613548992626711, |
| "loss": 2.6047, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.35, |
| "grad_norm": 0.01878803361275613, |
| "learning_rate": 0.0007571177785644766, |
| "loss": 2.5482, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.35, |
| "grad_norm": 0.018908774218863146, |
| "learning_rate": 0.0007528554050526488, |
| "loss": 2.6141, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.35, |
| "grad_norm": 0.019547294966543106, |
| "learning_rate": 0.0007485681973560532, |
| "loss": 2.5407, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.36, |
| "grad_norm": 0.018739519276120584, |
| "learning_rate": 0.0007442565765426436, |
| "loss": 2.6212, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.36, |
| "grad_norm": 0.018979109952080055, |
| "learning_rate": 0.0007399209660781074, |
| "loss": 2.5721, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.36, |
| "grad_norm": 0.0175358024051344, |
| "learning_rate": 0.0007355617917842751, |
| "loss": 2.577, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.37, |
| "grad_norm": 0.018692342003597935, |
| "learning_rate": 0.0007311794817972975, |
| "loss": 2.5944, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.37, |
| "grad_norm": 0.019650294357122313, |
| "learning_rate": 0.0007267744665255965, |
| "loss": 2.608, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.37, |
| "grad_norm": 0.01787007786983172, |
| "learning_rate": 0.0007223471786075934, |
| "loss": 2.5898, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.38, |
| "grad_norm": 0.018327854729029175, |
| "learning_rate": 0.0007178980528692161, |
| "loss": 2.5641, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.38, |
| "grad_norm": 0.01819571331500734, |
| "learning_rate": 0.0007134275262811934, |
| "loss": 2.5724, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.38, |
| "grad_norm": 0.016879527517399177, |
| "learning_rate": 0.0007089360379161381, |
| "loss": 2.6016, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.38, |
| "grad_norm": 0.018893461615821387, |
| "learning_rate": 0.0007044240289054227, |
| "loss": 2.5726, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.39, |
| "grad_norm": 0.018652185039036564, |
| "learning_rate": 0.0006998919423958547, |
| "loss": 2.5559, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.39, |
| "grad_norm": 0.020673958617829544, |
| "learning_rate": 0.0006953402235061519, |
| "loss": 2.634, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.39, |
| "grad_norm": 0.01939109964021446, |
| "learning_rate": 0.0006907693192832263, |
| "loss": 2.568, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 0.018062900613076246, |
| "learning_rate": 0.0006861796786582761, |
| "loss": 2.554, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 0.018609687828424634, |
| "learning_rate": 0.0006815717524026949, |
| "loss": 2.5842, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 0.019554131820221306, |
| "learning_rate": 0.0006769459930837989, |
| "loss": 2.632, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.41, |
| "grad_norm": 0.017713236616086883, |
| "learning_rate": 0.0006723028550203778, |
| "loss": 2.5547, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.41, |
| "grad_norm": 0.019353124709199913, |
| "learning_rate": 0.000667642794238074, |
| "loss": 2.6321, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.41, |
| "grad_norm": 0.01984763650797756, |
| "learning_rate": 0.0006629662684245948, |
| "loss": 2.5521, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.41, |
| "grad_norm": 0.0194768410660429, |
| "learning_rate": 0.0006582737368847592, |
| "loss": 2.564, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.42, |
| "grad_norm": 0.02008541514106773, |
| "learning_rate": 0.0006535656604953884, |
| "loss": 2.6406, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.42, |
| "grad_norm": 0.019023120632534838, |
| "learning_rate": 0.0006488425016600402, |
| "loss": 2.5492, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.42, |
| "grad_norm": 0.019894786782134775, |
| "learning_rate": 0.0006441047242635947, |
| "loss": 2.598, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.43, |
| "grad_norm": 0.02011873025218978, |
| "learning_rate": 0.0006393527936266933, |
| "loss": 2.5729, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.43, |
| "grad_norm": 0.019172173916513022, |
| "learning_rate": 0.0006345871764600374, |
| "loss": 2.5594, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.43, |
| "grad_norm": 0.019721366832981004, |
| "learning_rate": 0.0006298083408185502, |
| "loss": 2.5915, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.44, |
| "grad_norm": 0.020228287976845075, |
| "learning_rate": 0.0006250167560554076, |
| "loss": 2.5898, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.44, |
| "grad_norm": 0.020882340174208855, |
| "learning_rate": 0.0006202128927759391, |
| "loss": 2.6273, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.44, |
| "grad_norm": 0.02081451600660678, |
| "learning_rate": 0.0006153972227914089, |
| "loss": 2.595, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.45, |
| "grad_norm": 0.01941546346347613, |
| "learning_rate": 0.0006105702190726764, |
| "loss": 2.5732, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.45, |
| "grad_norm": 0.02166634263027907, |
| "learning_rate": 0.000605732355703743, |
| "loss": 2.5453, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.45, |
| "grad_norm": 0.019573148109730217, |
| "learning_rate": 0.0006008841078351903, |
| "loss": 2.5429, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.45, |
| "grad_norm": 0.01903540723775177, |
| "learning_rate": 0.0005960259516375134, |
| "loss": 2.5388, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.46, |
| "grad_norm": 0.02051218709860799, |
| "learning_rate": 0.0005911583642543531, |
| "loss": 2.5793, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.46, |
| "grad_norm": 0.02101151521755854, |
| "learning_rate": 0.0005862818237556344, |
| "loss": 2.5677, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.46, |
| "grad_norm": 0.02162618990843192, |
| "learning_rate": 0.0005813968090906116, |
| "loss": 2.5635, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.47, |
| "grad_norm": 0.018935934203135756, |
| "learning_rate": 0.0005765038000408295, |
| "loss": 2.5174, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.47, |
| "grad_norm": 0.021177497387846266, |
| "learning_rate": 0.0005716032771730008, |
| "loss": 2.5266, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.47, |
| "grad_norm": 0.019633074877257486, |
| "learning_rate": 0.0005666957217918076, |
| "loss": 2.5909, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 0.020321617227441156, |
| "learning_rate": 0.0005617816158926302, |
| "loss": 2.5727, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 0.02171314000863455, |
| "learning_rate": 0.0005568614421142077, |
| "loss": 2.5728, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 0.02036197464612551, |
| "learning_rate": 0.0005519356836912357, |
| "loss": 2.5546, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 0.021214591076630044, |
| "learning_rate": 0.0005470048244069055, |
| "loss": 2.6089, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.49, |
| "grad_norm": 0.021380356809754895, |
| "learning_rate": 0.0005420693485453892, |
| "loss": 2.5701, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.49, |
| "grad_norm": 0.020301144411914027, |
| "learning_rate": 0.0005371297408442765, |
| "loss": 2.5783, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.49, |
| "grad_norm": 0.02317064813036681, |
| "learning_rate": 0.0005321864864469646, |
| "loss": 2.5787, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 0.020981576388978553, |
| "learning_rate": 0.0005272400708550113, |
| "loss": 2.5657, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 0.01972062092753295, |
| "learning_rate": 0.0005222909798804515, |
| "loss": 2.5941, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 0.022818553102453083, |
| "learning_rate": 0.0005173396995980818, |
| "loss": 2.5567, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.51, |
| "grad_norm": 0.021333416992185616, |
| "learning_rate": 0.0005123867162977224, |
| "loss": 2.5552, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.51, |
| "grad_norm": 0.020142143404330078, |
| "learning_rate": 0.0005074325164364548, |
| "loss": 2.5305, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.51, |
| "grad_norm": 0.022025222387284903, |
| "learning_rate": 0.0005024775865908451, |
| "loss": 2.59, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.52, |
| "grad_norm": 0.020907939242901075, |
| "learning_rate": 0.000497522413409155, |
| "loss": 2.5452, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.52, |
| "grad_norm": 0.022927802322477854, |
| "learning_rate": 0.0004925674835635454, |
| "loss": 2.5792, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.52, |
| "grad_norm": 0.020657968617921074, |
| "learning_rate": 0.00048761328370227773, |
| "loss": 2.5099, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.52, |
| "grad_norm": 0.020110997895501886, |
| "learning_rate": 0.0004826603004019182, |
| "loss": 2.5163, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.53, |
| "grad_norm": 0.021049174068166514, |
| "learning_rate": 0.0004777090201195486, |
| "loss": 2.5372, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.53, |
| "grad_norm": 0.02117182813897076, |
| "learning_rate": 0.00047275992914498865, |
| "loss": 2.5022, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.53, |
| "grad_norm": 0.020475434729428973, |
| "learning_rate": 0.0004678135135530355, |
| "loss": 2.5626, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.54, |
| "grad_norm": 0.01948270633034859, |
| "learning_rate": 0.0004628702591557237, |
| "loss": 2.5286, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.54, |
| "grad_norm": 0.020212144683261414, |
| "learning_rate": 0.00045793065145461064, |
| "loss": 2.5305, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.54, |
| "grad_norm": 0.020337213413087113, |
| "learning_rate": 0.00045299517559309457, |
| "loss": 2.5624, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.55, |
| "grad_norm": 0.02101629597235714, |
| "learning_rate": 0.00044806431630876436, |
| "loss": 2.5109, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.55, |
| "grad_norm": 0.02099914834706713, |
| "learning_rate": 0.00044313855788579234, |
| "loss": 2.5702, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.55, |
| "grad_norm": 0.020635070288327804, |
| "learning_rate": 0.0004382183841073698, |
| "loss": 2.5483, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.56, |
| "grad_norm": 0.019941103414359743, |
| "learning_rate": 0.00043330427820819256, |
| "loss": 2.5722, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.56, |
| "grad_norm": 0.032674916215766084, |
| "learning_rate": 0.0004283967228269992, |
| "loss": 2.5775, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.56, |
| "grad_norm": 0.021496384704403824, |
| "learning_rate": 0.00042349619995917057, |
| "loss": 2.5856, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.56, |
| "grad_norm": 0.030321207463772364, |
| "learning_rate": 0.0004186031909093884, |
| "loss": 2.5618, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.57, |
| "grad_norm": 0.02175980169134122, |
| "learning_rate": 0.00041371817624436577, |
| "loss": 2.5437, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.57, |
| "grad_norm": 0.020391275857096873, |
| "learning_rate": 0.0004088416357456471, |
| "loss": 2.5338, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.57, |
| "grad_norm": 0.022437463329837284, |
| "learning_rate": 0.00040397404836248684, |
| "loss": 2.5812, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.58, |
| "grad_norm": 0.02129701651167051, |
| "learning_rate": 0.0003991158921648096, |
| "loss": 2.5453, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.58, |
| "grad_norm": 0.0238633278616985, |
| "learning_rate": 0.00039426764429625693, |
| "loss": 2.5233, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.58, |
| "grad_norm": 0.02998116183042863, |
| "learning_rate": 0.0003894297809273237, |
| "loss": 2.5646, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.59, |
| "grad_norm": 0.022238039030238917, |
| "learning_rate": 0.00038460277720859116, |
| "loss": 2.5242, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.59, |
| "grad_norm": 0.0231977730809198, |
| "learning_rate": 0.00037978710722406115, |
| "loss": 2.5681, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.59, |
| "grad_norm": 1.3831873669660701, |
| "learning_rate": 0.0003749832439445925, |
| "loss": 2.6152, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.59, |
| "grad_norm": 0.024837482288635006, |
| "learning_rate": 0.0003701916591814497, |
| "loss": 2.5305, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 0.02460819770267967, |
| "learning_rate": 0.00036541282353996275, |
| "loss": 2.5512, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 0.021382499301954485, |
| "learning_rate": 0.0003606472063733067, |
| "loss": 2.5726, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 0.022843480204416112, |
| "learning_rate": 0.00035589527573640534, |
| "loss": 2.5911, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.61, |
| "grad_norm": 0.0217433912155617, |
| "learning_rate": 0.0003511574983399599, |
| "loss": 2.5437, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.61, |
| "grad_norm": 0.022341313083280037, |
| "learning_rate": 0.0003464343395046117, |
| "loss": 2.5565, |
| "step": 200 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 327, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 25, |
| "total_flos": 3.540319303825883e+18, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|