{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 319, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003134796238244514, "grad_norm": 43.949100494384766, "learning_rate": 5.000000000000001e-07, "loss": 1.8127, "step": 1 }, { "epoch": 0.006269592476489028, "grad_norm": 46.380184173583984, "learning_rate": 1.0000000000000002e-06, "loss": 1.782, "step": 2 }, { "epoch": 0.009404388714733543, "grad_norm": 42.30755615234375, "learning_rate": 1.5e-06, "loss": 1.7996, "step": 3 }, { "epoch": 0.012539184952978056, "grad_norm": 42.93296813964844, "learning_rate": 2.0000000000000003e-06, "loss": 1.789, "step": 4 }, { "epoch": 0.01567398119122257, "grad_norm": 38.31450653076172, "learning_rate": 2.5e-06, "loss": 1.6763, "step": 5 }, { "epoch": 0.018808777429467086, "grad_norm": 30.79650115966797, "learning_rate": 3e-06, "loss": 1.4802, "step": 6 }, { "epoch": 0.0219435736677116, "grad_norm": 37.57395935058594, "learning_rate": 3.5e-06, "loss": 1.4183, "step": 7 }, { "epoch": 0.025078369905956112, "grad_norm": 19.238168716430664, "learning_rate": 4.000000000000001e-06, "loss": 1.1746, "step": 8 }, { "epoch": 0.02821316614420063, "grad_norm": 20.116256713867188, "learning_rate": 4.5e-06, "loss": 1.0266, "step": 9 }, { "epoch": 0.03134796238244514, "grad_norm": 8.39165210723877, "learning_rate": 5e-06, "loss": 0.9115, "step": 10 }, { "epoch": 0.034482758620689655, "grad_norm": 9.131606101989746, "learning_rate": 4.9998707921028104e-06, "loss": 0.8921, "step": 11 }, { "epoch": 0.03761755485893417, "grad_norm": 6.591880798339844, "learning_rate": 4.999483181766986e-06, "loss": 0.8571, "step": 12 }, { "epoch": 0.04075235109717868, "grad_norm": 6.148767948150635, "learning_rate": 4.998837209058379e-06, "loss": 0.8377, "step": 13 }, { "epoch": 0.0438871473354232, "grad_norm": 5.670365333557129, "learning_rate": 4.997932940748811e-06, "loss": 0.8791, "step": 14 }, { "epoch": 0.047021943573667714, "grad_norm": 4.403618335723877, "learning_rate": 4.996770470309167e-06, "loss": 0.892, "step": 15 }, { "epoch": 0.050156739811912224, "grad_norm": 3.835578441619873, "learning_rate": 4.995349917899735e-06, "loss": 0.7415, "step": 16 }, { "epoch": 0.05329153605015674, "grad_norm": 4.489407062530518, "learning_rate": 4.993671430357788e-06, "loss": 0.7822, "step": 17 }, { "epoch": 0.05642633228840126, "grad_norm": 5.612992763519287, "learning_rate": 4.991735181182401e-06, "loss": 0.7184, "step": 18 }, { "epoch": 0.05956112852664577, "grad_norm": 13.66352367401123, "learning_rate": 4.989541370516523e-06, "loss": 0.7222, "step": 19 }, { "epoch": 0.06269592476489028, "grad_norm": 3.666353940963745, "learning_rate": 4.987090225126285e-06, "loss": 0.7231, "step": 20 }, { "epoch": 0.06583072100313479, "grad_norm": 4.0020294189453125, "learning_rate": 4.9843819983775575e-06, "loss": 0.7674, "step": 21 }, { "epoch": 0.06896551724137931, "grad_norm": 3.6200690269470215, "learning_rate": 4.98141697020977e-06, "loss": 0.7478, "step": 22 }, { "epoch": 0.07210031347962383, "grad_norm": 3.7657089233398438, "learning_rate": 4.978195447106965e-06, "loss": 0.7388, "step": 23 }, { "epoch": 0.07523510971786834, "grad_norm": 3.262274742126465, "learning_rate": 4.974717762066123e-06, "loss": 0.7176, "step": 24 }, { "epoch": 0.07836990595611286, "grad_norm": 3.8943138122558594, "learning_rate": 4.970984274562741e-06, "loss": 0.6745, "step": 25 }, { "epoch": 0.08150470219435736, "grad_norm": 3.2295124530792236, "learning_rate": 4.966995370513675e-06, "loss": 0.7079, "step": 26 }, { "epoch": 0.08463949843260188, "grad_norm": 4.9130730628967285, "learning_rate": 4.962751462237248e-06, "loss": 0.7824, "step": 27 }, { "epoch": 0.0877742946708464, "grad_norm": 3.4682295322418213, "learning_rate": 4.958252988410631e-06, "loss": 0.7122, "step": 28 }, { "epoch": 0.09090909090909091, "grad_norm": 3.1807892322540283, "learning_rate": 4.9535004140245005e-06, "loss": 0.752, "step": 29 }, { "epoch": 0.09404388714733543, "grad_norm": 6.412112712860107, "learning_rate": 4.94849423033497e-06, "loss": 0.7503, "step": 30 }, { "epoch": 0.09717868338557993, "grad_norm": 3.3517048358917236, "learning_rate": 4.943234954812812e-06, "loss": 0.6783, "step": 31 }, { "epoch": 0.10031347962382445, "grad_norm": 3.2728397846221924, "learning_rate": 4.937723131089974e-06, "loss": 0.713, "step": 32 }, { "epoch": 0.10344827586206896, "grad_norm": 3.6628053188323975, "learning_rate": 4.931959328903376e-06, "loss": 0.6842, "step": 33 }, { "epoch": 0.10658307210031348, "grad_norm": 4.155019283294678, "learning_rate": 4.925944144036027e-06, "loss": 0.6627, "step": 34 }, { "epoch": 0.109717868338558, "grad_norm": 2.7333738803863525, "learning_rate": 4.919678198255438e-06, "loss": 0.6807, "step": 35 }, { "epoch": 0.11285266457680251, "grad_norm": 3.560575246810913, "learning_rate": 4.91316213924935e-06, "loss": 0.6802, "step": 36 }, { "epoch": 0.11598746081504702, "grad_norm": 2.562624454498291, "learning_rate": 4.90639664055879e-06, "loss": 0.6264, "step": 37 }, { "epoch": 0.11912225705329153, "grad_norm": 5.172059059143066, "learning_rate": 4.899382401508446e-06, "loss": 0.6783, "step": 38 }, { "epoch": 0.12225705329153605, "grad_norm": 3.2025134563446045, "learning_rate": 4.892120147134378e-06, "loss": 0.7013, "step": 39 }, { "epoch": 0.12539184952978055, "grad_norm": 2.37896466255188, "learning_rate": 4.884610628109082e-06, "loss": 0.6241, "step": 40 }, { "epoch": 0.12852664576802508, "grad_norm": 2.381072759628296, "learning_rate": 4.876854620663887e-06, "loss": 0.6328, "step": 41 }, { "epoch": 0.13166144200626959, "grad_norm": 2.695284605026245, "learning_rate": 4.868852926508721e-06, "loss": 0.6626, "step": 42 }, { "epoch": 0.13479623824451412, "grad_norm": 6.515556335449219, "learning_rate": 4.860606372749247e-06, "loss": 0.7056, "step": 43 }, { "epoch": 0.13793103448275862, "grad_norm": 3.1236801147460938, "learning_rate": 4.8521158118013605e-06, "loss": 0.678, "step": 44 }, { "epoch": 0.14106583072100312, "grad_norm": 3.5594000816345215, "learning_rate": 4.843382121303082e-06, "loss": 0.6649, "step": 45 }, { "epoch": 0.14420062695924765, "grad_norm": 2.737841844558716, "learning_rate": 4.83440620402384e-06, "loss": 0.5953, "step": 46 }, { "epoch": 0.14733542319749215, "grad_norm": 2.5807104110717773, "learning_rate": 4.825188987771149e-06, "loss": 0.6678, "step": 47 }, { "epoch": 0.15047021943573669, "grad_norm": 2.5918169021606445, "learning_rate": 4.815731425294716e-06, "loss": 0.7223, "step": 48 }, { "epoch": 0.1536050156739812, "grad_norm": 2.677417278289795, "learning_rate": 4.806034494187949e-06, "loss": 0.6688, "step": 49 }, { "epoch": 0.15673981191222572, "grad_norm": 2.6603641510009766, "learning_rate": 4.796099196786908e-06, "loss": 0.6652, "step": 50 }, { "epoch": 0.15987460815047022, "grad_norm": 2.502426862716675, "learning_rate": 4.785926560066703e-06, "loss": 0.6092, "step": 51 }, { "epoch": 0.16300940438871472, "grad_norm": 6.598116397857666, "learning_rate": 4.775517635535332e-06, "loss": 0.6496, "step": 52 }, { "epoch": 0.16614420062695925, "grad_norm": 3.122972249984741, "learning_rate": 4.764873499124997e-06, "loss": 0.6687, "step": 53 }, { "epoch": 0.16927899686520376, "grad_norm": 3.360384225845337, "learning_rate": 4.753995251080884e-06, "loss": 0.6911, "step": 54 }, { "epoch": 0.1724137931034483, "grad_norm": 2.609579563140869, "learning_rate": 4.742884015847436e-06, "loss": 0.6397, "step": 55 }, { "epoch": 0.1755485893416928, "grad_norm": 2.5354106426239014, "learning_rate": 4.731540941952126e-06, "loss": 0.6009, "step": 56 }, { "epoch": 0.1786833855799373, "grad_norm": 2.5063207149505615, "learning_rate": 4.719967201886734e-06, "loss": 0.5991, "step": 57 }, { "epoch": 0.18181818181818182, "grad_norm": 2.7579185962677, "learning_rate": 4.708163991986152e-06, "loss": 0.6064, "step": 58 }, { "epoch": 0.18495297805642633, "grad_norm": 2.891486883163452, "learning_rate": 4.696132532304727e-06, "loss": 0.6391, "step": 59 }, { "epoch": 0.18808777429467086, "grad_norm": 2.916727304458618, "learning_rate": 4.683874066490143e-06, "loss": 0.6727, "step": 60 }, { "epoch": 0.19122257053291536, "grad_norm": 2.708406686782837, "learning_rate": 4.671389861654873e-06, "loss": 0.6499, "step": 61 }, { "epoch": 0.19435736677115986, "grad_norm": 4.942819118499756, "learning_rate": 4.658681208245198e-06, "loss": 0.6356, "step": 62 }, { "epoch": 0.1974921630094044, "grad_norm": 3.262690544128418, "learning_rate": 4.645749419907829e-06, "loss": 0.6732, "step": 63 }, { "epoch": 0.2006269592476489, "grad_norm": 2.6621508598327637, "learning_rate": 4.632595833354105e-06, "loss": 0.6615, "step": 64 }, { "epoch": 0.20376175548589343, "grad_norm": 2.9821105003356934, "learning_rate": 4.619221808221833e-06, "loss": 0.6667, "step": 65 }, { "epoch": 0.20689655172413793, "grad_norm": 5.076481342315674, "learning_rate": 4.605628726934747e-06, "loss": 0.6505, "step": 66 }, { "epoch": 0.21003134796238246, "grad_norm": 4.245087623596191, "learning_rate": 4.5918179945596055e-06, "loss": 0.6129, "step": 67 }, { "epoch": 0.21316614420062696, "grad_norm": 3.060851812362671, "learning_rate": 4.577791038660959e-06, "loss": 0.6436, "step": 68 }, { "epoch": 0.21630094043887146, "grad_norm": 3.0752642154693604, "learning_rate": 4.563549309153589e-06, "loss": 0.6957, "step": 69 }, { "epoch": 0.219435736677116, "grad_norm": 2.996229887008667, "learning_rate": 4.549094278152631e-06, "loss": 0.6877, "step": 70 }, { "epoch": 0.2225705329153605, "grad_norm": 4.217442035675049, "learning_rate": 4.534427439821416e-06, "loss": 0.6231, "step": 71 }, { "epoch": 0.22570532915360503, "grad_norm": 3.009230136871338, "learning_rate": 4.519550310217013e-06, "loss": 0.6574, "step": 72 }, { "epoch": 0.22884012539184953, "grad_norm": 3.037616491317749, "learning_rate": 4.504464427133527e-06, "loss": 0.6238, "step": 73 }, { "epoch": 0.23197492163009403, "grad_norm": 3.02176833152771, "learning_rate": 4.489171349943144e-06, "loss": 0.6985, "step": 74 }, { "epoch": 0.23510971786833856, "grad_norm": 3.0625197887420654, "learning_rate": 4.473672659434941e-06, "loss": 0.7531, "step": 75 }, { "epoch": 0.23824451410658307, "grad_norm": 3.099853277206421, "learning_rate": 4.457969957651485e-06, "loss": 0.6188, "step": 76 }, { "epoch": 0.2413793103448276, "grad_norm": 3.041682004928589, "learning_rate": 4.442064867723236e-06, "loss": 0.6754, "step": 77 }, { "epoch": 0.2445141065830721, "grad_norm": 2.7343809604644775, "learning_rate": 4.425959033700776e-06, "loss": 0.615, "step": 78 }, { "epoch": 0.2476489028213166, "grad_norm": 2.6281471252441406, "learning_rate": 4.409654120384863e-06, "loss": 0.6247, "step": 79 }, { "epoch": 0.2507836990595611, "grad_norm": 2.4146194458007812, "learning_rate": 4.393151813154345e-06, "loss": 0.6237, "step": 80 }, { "epoch": 0.25391849529780564, "grad_norm": 2.641150712966919, "learning_rate": 4.3764538177919555e-06, "loss": 0.5891, "step": 81 }, { "epoch": 0.25705329153605017, "grad_norm": 2.4911434650421143, "learning_rate": 4.35956186030799e-06, "loss": 0.6456, "step": 82 }, { "epoch": 0.2601880877742947, "grad_norm": 3.6674325466156006, "learning_rate": 4.3424776867618935e-06, "loss": 0.5946, "step": 83 }, { "epoch": 0.26332288401253917, "grad_norm": 8.596332550048828, "learning_rate": 4.325203063081776e-06, "loss": 0.6133, "step": 84 }, { "epoch": 0.2664576802507837, "grad_norm": 2.6561825275421143, "learning_rate": 4.307739774881878e-06, "loss": 0.6654, "step": 85 }, { "epoch": 0.26959247648902823, "grad_norm": 3.8426294326782227, "learning_rate": 4.290089627277998e-06, "loss": 0.6124, "step": 86 }, { "epoch": 0.2727272727272727, "grad_norm": 3.1375346183776855, "learning_rate": 4.2722544447008995e-06, "loss": 0.6183, "step": 87 }, { "epoch": 0.27586206896551724, "grad_norm": 2.7622690200805664, "learning_rate": 4.254236070707734e-06, "loss": 0.6555, "step": 88 }, { "epoch": 0.27899686520376177, "grad_norm": 2.8743462562561035, "learning_rate": 4.236036367791471e-06, "loss": 0.5867, "step": 89 }, { "epoch": 0.28213166144200624, "grad_norm": 2.7058961391448975, "learning_rate": 4.2176572171883865e-06, "loss": 0.6346, "step": 90 }, { "epoch": 0.2852664576802508, "grad_norm": 2.520024538040161, "learning_rate": 4.199100518683601e-06, "loss": 0.646, "step": 91 }, { "epoch": 0.2884012539184953, "grad_norm": 2.6446547508239746, "learning_rate": 4.18036819041471e-06, "loss": 0.6502, "step": 92 }, { "epoch": 0.29153605015673983, "grad_norm": 2.559896945953369, "learning_rate": 4.161462168673508e-06, "loss": 0.6166, "step": 93 }, { "epoch": 0.2946708463949843, "grad_norm": 2.6093804836273193, "learning_rate": 4.142384407705846e-06, "loss": 0.607, "step": 94 }, { "epoch": 0.29780564263322884, "grad_norm": 2.8598270416259766, "learning_rate": 4.123136879509626e-06, "loss": 0.6094, "step": 95 }, { "epoch": 0.30094043887147337, "grad_norm": 2.6045126914978027, "learning_rate": 4.103721573630965e-06, "loss": 0.6639, "step": 96 }, { "epoch": 0.30407523510971785, "grad_norm": 2.7823448181152344, "learning_rate": 4.084140496958539e-06, "loss": 0.6341, "step": 97 }, { "epoch": 0.3072100313479624, "grad_norm": 2.320493459701538, "learning_rate": 4.06439567351614e-06, "loss": 0.5939, "step": 98 }, { "epoch": 0.3103448275862069, "grad_norm": 2.2848961353302, "learning_rate": 4.0444891442534615e-06, "loss": 0.5916, "step": 99 }, { "epoch": 0.31347962382445144, "grad_norm": 2.4216668605804443, "learning_rate": 4.024422966835137e-06, "loss": 0.6116, "step": 100 }, { "epoch": 0.3166144200626959, "grad_norm": 2.654069185256958, "learning_rate": 4.004199215428032e-06, "loss": 0.654, "step": 101 }, { "epoch": 0.31974921630094044, "grad_norm": 2.4687626361846924, "learning_rate": 3.9838199804868635e-06, "loss": 0.6136, "step": 102 }, { "epoch": 0.322884012539185, "grad_norm": 3.22712779045105, "learning_rate": 3.963287368538105e-06, "loss": 0.6208, "step": 103 }, { "epoch": 0.32601880877742945, "grad_norm": 2.4066531658172607, "learning_rate": 3.942603501962249e-06, "loss": 0.6191, "step": 104 }, { "epoch": 0.329153605015674, "grad_norm": 2.9688427448272705, "learning_rate": 3.92177051877442e-06, "loss": 0.6132, "step": 105 }, { "epoch": 0.3322884012539185, "grad_norm": 2.6026861667633057, "learning_rate": 3.900790572403376e-06, "loss": 0.5957, "step": 106 }, { "epoch": 0.335423197492163, "grad_norm": 3.1237998008728027, "learning_rate": 3.8796658314689205e-06, "loss": 0.6267, "step": 107 }, { "epoch": 0.3385579937304075, "grad_norm": 2.7337708473205566, "learning_rate": 3.858398479557739e-06, "loss": 0.6635, "step": 108 }, { "epoch": 0.34169278996865204, "grad_norm": 2.707108497619629, "learning_rate": 3.836990714997686e-06, "loss": 0.6217, "step": 109 }, { "epoch": 0.3448275862068966, "grad_norm": 2.4689037799835205, "learning_rate": 3.815444750630555e-06, "loss": 0.6364, "step": 110 }, { "epoch": 0.34796238244514105, "grad_norm": 2.9303574562072754, "learning_rate": 3.7937628135833453e-06, "loss": 0.6117, "step": 111 }, { "epoch": 0.3510971786833856, "grad_norm": 2.5677459239959717, "learning_rate": 3.7719471450380518e-06, "loss": 0.6007, "step": 112 }, { "epoch": 0.3542319749216301, "grad_norm": 2.688203811645508, "learning_rate": 3.7500000000000005e-06, "loss": 0.5778, "step": 113 }, { "epoch": 0.3573667711598746, "grad_norm": 2.5018489360809326, "learning_rate": 3.7279236470647593e-06, "loss": 0.5826, "step": 114 }, { "epoch": 0.3605015673981191, "grad_norm": 2.8333818912506104, "learning_rate": 3.7057203681836407e-06, "loss": 0.59, "step": 115 }, { "epoch": 0.36363636363636365, "grad_norm": 2.9619221687316895, "learning_rate": 3.683392458427825e-06, "loss": 0.6616, "step": 116 }, { "epoch": 0.3667711598746082, "grad_norm": 2.860719919204712, "learning_rate": 3.660942225751126e-06, "loss": 0.5618, "step": 117 }, { "epoch": 0.36990595611285265, "grad_norm": 2.507951021194458, "learning_rate": 3.638371990751428e-06, "loss": 0.6305, "step": 118 }, { "epoch": 0.3730407523510972, "grad_norm": 2.5088613033294678, "learning_rate": 3.615684086430815e-06, "loss": 0.5831, "step": 119 }, { "epoch": 0.3761755485893417, "grad_norm": 2.534837245941162, "learning_rate": 3.592880857954413e-06, "loss": 0.5818, "step": 120 }, { "epoch": 0.3793103448275862, "grad_norm": 2.4578607082366943, "learning_rate": 3.5699646624079824e-06, "loss": 0.6267, "step": 121 }, { "epoch": 0.3824451410658307, "grad_norm": 2.6599507331848145, "learning_rate": 3.5469378685542742e-06, "loss": 0.5954, "step": 122 }, { "epoch": 0.38557993730407525, "grad_norm": 3.10658597946167, "learning_rate": 3.52380285658818e-06, "loss": 0.5963, "step": 123 }, { "epoch": 0.3887147335423197, "grad_norm": 2.4190332889556885, "learning_rate": 3.500562017890695e-06, "loss": 0.6224, "step": 124 }, { "epoch": 0.39184952978056425, "grad_norm": 3.2278661727905273, "learning_rate": 3.4772177547817387e-06, "loss": 0.6182, "step": 125 }, { "epoch": 0.3949843260188088, "grad_norm": 2.7765655517578125, "learning_rate": 3.4537724802718294e-06, "loss": 0.6399, "step": 126 }, { "epoch": 0.3981191222570533, "grad_norm": 2.6910951137542725, "learning_rate": 3.430228617812661e-06, "loss": 0.5898, "step": 127 }, { "epoch": 0.4012539184952978, "grad_norm": 4.030726909637451, "learning_rate": 3.4065886010466014e-06, "loss": 0.6093, "step": 128 }, { "epoch": 0.4043887147335423, "grad_norm": 2.7327334880828857, "learning_rate": 3.382854873555137e-06, "loss": 0.5574, "step": 129 }, { "epoch": 0.40752351097178685, "grad_norm": 3.4407501220703125, "learning_rate": 3.3590298886062833e-06, "loss": 0.6339, "step": 130 }, { "epoch": 0.4106583072100313, "grad_norm": 2.798560380935669, "learning_rate": 3.3351161089010055e-06, "loss": 0.6152, "step": 131 }, { "epoch": 0.41379310344827586, "grad_norm": 2.4109294414520264, "learning_rate": 3.3111160063186553e-06, "loss": 0.5964, "step": 132 }, { "epoch": 0.4169278996865204, "grad_norm": 6.183897018432617, "learning_rate": 3.2870320616614626e-06, "loss": 0.5906, "step": 133 }, { "epoch": 0.4200626959247649, "grad_norm": 2.960341691970825, "learning_rate": 3.2628667643981036e-06, "loss": 0.663, "step": 134 }, { "epoch": 0.4231974921630094, "grad_norm": 2.809236764907837, "learning_rate": 3.238622612406373e-06, "loss": 0.6198, "step": 135 }, { "epoch": 0.4263322884012539, "grad_norm": 3.4371628761291504, "learning_rate": 3.21430211171499e-06, "loss": 0.6275, "step": 136 }, { "epoch": 0.42946708463949845, "grad_norm": 3.0913808345794678, "learning_rate": 3.189907776244556e-06, "loss": 0.6232, "step": 137 }, { "epoch": 0.43260188087774293, "grad_norm": 2.930027961730957, "learning_rate": 3.1654421275477045e-06, "loss": 0.5638, "step": 138 }, { "epoch": 0.43573667711598746, "grad_norm": 2.9050416946411133, "learning_rate": 3.1409076945484513e-06, "loss": 0.621, "step": 139 }, { "epoch": 0.438871473354232, "grad_norm": 2.39300274848938, "learning_rate": 3.116307013280793e-06, "loss": 0.5852, "step": 140 }, { "epoch": 0.44200626959247646, "grad_norm": 2.8745641708374023, "learning_rate": 3.0916426266265676e-06, "loss": 0.6119, "step": 141 }, { "epoch": 0.445141065830721, "grad_norm": 3.0072708129882812, "learning_rate": 3.066917084052603e-06, "loss": 0.5851, "step": 142 }, { "epoch": 0.4482758620689655, "grad_norm": 2.8062074184417725, "learning_rate": 3.042132941347189e-06, "loss": 0.586, "step": 143 }, { "epoch": 0.45141065830721006, "grad_norm": 3.202028274536133, "learning_rate": 3.017292760355896e-06, "loss": 0.6312, "step": 144 }, { "epoch": 0.45454545454545453, "grad_norm": 2.7520525455474854, "learning_rate": 2.9923991087167657e-06, "loss": 0.5769, "step": 145 }, { "epoch": 0.45768025078369906, "grad_norm": 2.7981884479522705, "learning_rate": 2.967454559594903e-06, "loss": 0.6349, "step": 146 }, { "epoch": 0.4608150470219436, "grad_norm": 2.9364802837371826, "learning_rate": 2.9424616914164982e-06, "loss": 0.5936, "step": 147 }, { "epoch": 0.46394984326018807, "grad_norm": 2.985931873321533, "learning_rate": 2.917423087602306e-06, "loss": 0.5731, "step": 148 }, { "epoch": 0.4670846394984326, "grad_norm": 2.4261667728424072, "learning_rate": 2.8923413363006038e-06, "loss": 0.602, "step": 149 }, { "epoch": 0.4702194357366771, "grad_norm": 2.6361424922943115, "learning_rate": 2.8672190301196655e-06, "loss": 0.5851, "step": 150 }, { "epoch": 0.47335423197492166, "grad_norm": 2.6938276290893555, "learning_rate": 2.842058765859776e-06, "loss": 0.6026, "step": 151 }, { "epoch": 0.47648902821316613, "grad_norm": 2.6547839641571045, "learning_rate": 2.8168631442448046e-06, "loss": 0.5863, "step": 152 }, { "epoch": 0.47962382445141066, "grad_norm": 2.8255505561828613, "learning_rate": 2.791634769653381e-06, "loss": 0.6096, "step": 153 }, { "epoch": 0.4827586206896552, "grad_norm": 2.461580514907837, "learning_rate": 2.7663762498496905e-06, "loss": 0.5744, "step": 154 }, { "epoch": 0.48589341692789967, "grad_norm": 2.9616644382476807, "learning_rate": 2.741090195713917e-06, "loss": 0.5849, "step": 155 }, { "epoch": 0.4890282131661442, "grad_norm": 2.7509751319885254, "learning_rate": 2.7157792209723654e-06, "loss": 0.5711, "step": 156 }, { "epoch": 0.49216300940438873, "grad_norm": 3.163322687149048, "learning_rate": 2.6904459419272955e-06, "loss": 0.6491, "step": 157 }, { "epoch": 0.4952978056426332, "grad_norm": 2.5019166469573975, "learning_rate": 2.6650929771864776e-06, "loss": 0.5608, "step": 158 }, { "epoch": 0.49843260188087773, "grad_norm": 2.8161232471466064, "learning_rate": 2.639722947392521e-06, "loss": 0.6116, "step": 159 }, { "epoch": 0.5015673981191222, "grad_norm": 2.9896793365478516, "learning_rate": 2.614338474951987e-06, "loss": 0.5859, "step": 160 }, { "epoch": 0.5047021943573667, "grad_norm": 2.9633073806762695, "learning_rate": 2.5889421837643186e-06, "loss": 0.5757, "step": 161 }, { "epoch": 0.5078369905956113, "grad_norm": 7.553648471832275, "learning_rate": 2.563536698950624e-06, "loss": 0.5985, "step": 162 }, { "epoch": 0.5109717868338558, "grad_norm": 2.4814646244049072, "learning_rate": 2.538124646582315e-06, "loss": 0.5918, "step": 163 }, { "epoch": 0.5141065830721003, "grad_norm": 2.601158380508423, "learning_rate": 2.512708653409674e-06, "loss": 0.5768, "step": 164 }, { "epoch": 0.5172413793103449, "grad_norm": 2.3953468799591064, "learning_rate": 2.487291346590326e-06, "loss": 0.5801, "step": 165 }, { "epoch": 0.5203761755485894, "grad_norm": 2.782278060913086, "learning_rate": 2.4618753534176854e-06, "loss": 0.5953, "step": 166 }, { "epoch": 0.5235109717868338, "grad_norm": 2.6170544624328613, "learning_rate": 2.436463301049378e-06, "loss": 0.584, "step": 167 }, { "epoch": 0.5266457680250783, "grad_norm": 2.2067277431488037, "learning_rate": 2.4110578162356814e-06, "loss": 0.5709, "step": 168 }, { "epoch": 0.5297805642633229, "grad_norm": 2.650946617126465, "learning_rate": 2.385661525048014e-06, "loss": 0.6157, "step": 169 }, { "epoch": 0.5329153605015674, "grad_norm": 2.4111557006835938, "learning_rate": 2.3602770526074804e-06, "loss": 0.587, "step": 170 }, { "epoch": 0.5360501567398119, "grad_norm": 2.574047088623047, "learning_rate": 2.334907022813523e-06, "loss": 0.5945, "step": 171 }, { "epoch": 0.5391849529780565, "grad_norm": 5.312939167022705, "learning_rate": 2.3095540580727054e-06, "loss": 0.5584, "step": 172 }, { "epoch": 0.542319749216301, "grad_norm": 2.999335289001465, "learning_rate": 2.2842207790276355e-06, "loss": 0.5588, "step": 173 }, { "epoch": 0.5454545454545454, "grad_norm": 2.8886678218841553, "learning_rate": 2.2589098042860838e-06, "loss": 0.5834, "step": 174 }, { "epoch": 0.54858934169279, "grad_norm": 2.7539076805114746, "learning_rate": 2.2336237501503103e-06, "loss": 0.529, "step": 175 }, { "epoch": 0.5517241379310345, "grad_norm": 2.600677967071533, "learning_rate": 2.2083652303466196e-06, "loss": 0.5694, "step": 176 }, { "epoch": 0.554858934169279, "grad_norm": 2.550015449523926, "learning_rate": 2.1831368557551962e-06, "loss": 0.5734, "step": 177 }, { "epoch": 0.5579937304075235, "grad_norm": 2.4202213287353516, "learning_rate": 2.157941234140225e-06, "loss": 0.5664, "step": 178 }, { "epoch": 0.5611285266457681, "grad_norm": 2.6022562980651855, "learning_rate": 2.1327809698803354e-06, "loss": 0.5516, "step": 179 }, { "epoch": 0.5642633228840125, "grad_norm": 2.3936119079589844, "learning_rate": 2.1076586636993975e-06, "loss": 0.5697, "step": 180 }, { "epoch": 0.567398119122257, "grad_norm": 2.6119375228881836, "learning_rate": 2.0825769123976954e-06, "loss": 0.5524, "step": 181 }, { "epoch": 0.5705329153605015, "grad_norm": 2.9607300758361816, "learning_rate": 2.057538308583502e-06, "loss": 0.5539, "step": 182 }, { "epoch": 0.5736677115987461, "grad_norm": 2.5394551753997803, "learning_rate": 2.0325454404050983e-06, "loss": 0.5902, "step": 183 }, { "epoch": 0.5768025078369906, "grad_norm": 2.248412609100342, "learning_rate": 2.0076008912832355e-06, "loss": 0.5684, "step": 184 }, { "epoch": 0.5799373040752351, "grad_norm": 2.369102954864502, "learning_rate": 1.9827072396441044e-06, "loss": 0.5473, "step": 185 }, { "epoch": 0.5830721003134797, "grad_norm": 2.8046441078186035, "learning_rate": 1.957867058652812e-06, "loss": 0.5125, "step": 186 }, { "epoch": 0.5862068965517241, "grad_norm": 2.5482161045074463, "learning_rate": 1.933082915947398e-06, "loss": 0.5586, "step": 187 }, { "epoch": 0.5893416927899686, "grad_norm": 2.53220534324646, "learning_rate": 1.9083573733734328e-06, "loss": 0.598, "step": 188 }, { "epoch": 0.5924764890282131, "grad_norm": 2.742966413497925, "learning_rate": 1.8836929867192077e-06, "loss": 0.5432, "step": 189 }, { "epoch": 0.5956112852664577, "grad_norm": 4.084339141845703, "learning_rate": 1.8590923054515504e-06, "loss": 0.5232, "step": 190 }, { "epoch": 0.5987460815047022, "grad_norm": 2.8514344692230225, "learning_rate": 1.8345578724522957e-06, "loss": 0.5748, "step": 191 }, { "epoch": 0.6018808777429467, "grad_norm": 2.736140012741089, "learning_rate": 1.8100922237554442e-06, "loss": 0.5315, "step": 192 }, { "epoch": 0.6050156739811913, "grad_norm": 2.4470133781433105, "learning_rate": 1.7856978882850112e-06, "loss": 0.5722, "step": 193 }, { "epoch": 0.6081504702194357, "grad_norm": 2.587562084197998, "learning_rate": 1.7613773875936274e-06, "loss": 0.5697, "step": 194 }, { "epoch": 0.6112852664576802, "grad_norm": 2.340610980987549, "learning_rate": 1.7371332356018972e-06, "loss": 0.5292, "step": 195 }, { "epoch": 0.6144200626959248, "grad_norm": 4.569915771484375, "learning_rate": 1.7129679383385384e-06, "loss": 0.582, "step": 196 }, { "epoch": 0.6175548589341693, "grad_norm": 2.7119388580322266, "learning_rate": 1.688883993681345e-06, "loss": 0.6219, "step": 197 }, { "epoch": 0.6206896551724138, "grad_norm": 2.8180112838745117, "learning_rate": 1.6648838910989955e-06, "loss": 0.5649, "step": 198 }, { "epoch": 0.6238244514106583, "grad_norm": 3.1212546825408936, "learning_rate": 1.6409701113937182e-06, "loss": 0.5269, "step": 199 }, { "epoch": 0.6269592476489029, "grad_norm": 3.6080574989318848, "learning_rate": 1.617145126444864e-06, "loss": 0.5903, "step": 200 }, { "epoch": 0.6300940438871473, "grad_norm": 2.5957369804382324, "learning_rate": 1.5934113989533992e-06, "loss": 0.6123, "step": 201 }, { "epoch": 0.6332288401253918, "grad_norm": 2.664415121078491, "learning_rate": 1.5697713821873401e-06, "loss": 0.6159, "step": 202 }, { "epoch": 0.6363636363636364, "grad_norm": 2.5966672897338867, "learning_rate": 1.5462275197281717e-06, "loss": 0.5255, "step": 203 }, { "epoch": 0.6394984326018809, "grad_norm": 2.552795886993408, "learning_rate": 1.5227822452182617e-06, "loss": 0.5485, "step": 204 }, { "epoch": 0.6426332288401254, "grad_norm": 2.581660032272339, "learning_rate": 1.499437982109305e-06, "loss": 0.5727, "step": 205 }, { "epoch": 0.64576802507837, "grad_norm": 2.6740126609802246, "learning_rate": 1.4761971434118207e-06, "loss": 0.568, "step": 206 }, { "epoch": 0.6489028213166145, "grad_norm": 2.314016342163086, "learning_rate": 1.4530621314457255e-06, "loss": 0.5335, "step": 207 }, { "epoch": 0.6520376175548589, "grad_norm": 2.5612449645996094, "learning_rate": 1.430035337592018e-06, "loss": 0.5422, "step": 208 }, { "epoch": 0.6551724137931034, "grad_norm": 6.558284759521484, "learning_rate": 1.4071191420455873e-06, "loss": 0.5938, "step": 209 }, { "epoch": 0.658307210031348, "grad_norm": 2.5115890502929688, "learning_rate": 1.3843159135691859e-06, "loss": 0.5194, "step": 210 }, { "epoch": 0.6614420062695925, "grad_norm": 3.0726919174194336, "learning_rate": 1.3616280092485719e-06, "loss": 0.554, "step": 211 }, { "epoch": 0.664576802507837, "grad_norm": 2.4502389430999756, "learning_rate": 1.3390577742488747e-06, "loss": 0.6057, "step": 212 }, { "epoch": 0.6677115987460815, "grad_norm": 2.853550434112549, "learning_rate": 1.3166075415721762e-06, "loss": 0.5049, "step": 213 }, { "epoch": 0.670846394984326, "grad_norm": 2.496123790740967, "learning_rate": 1.2942796318163595e-06, "loss": 0.5625, "step": 214 }, { "epoch": 0.6739811912225705, "grad_norm": 2.3185224533081055, "learning_rate": 1.2720763529352415e-06, "loss": 0.5336, "step": 215 }, { "epoch": 0.677115987460815, "grad_norm": 2.621919631958008, "learning_rate": 1.2500000000000007e-06, "loss": 0.539, "step": 216 }, { "epoch": 0.6802507836990596, "grad_norm": 2.744100570678711, "learning_rate": 1.2280528549619487e-06, "loss": 0.5213, "step": 217 }, { "epoch": 0.6833855799373041, "grad_norm": 2.494028329849243, "learning_rate": 1.2062371864166553e-06, "loss": 0.5419, "step": 218 }, { "epoch": 0.6865203761755486, "grad_norm": 2.599900245666504, "learning_rate": 1.1845552493694462e-06, "loss": 0.5456, "step": 219 }, { "epoch": 0.6896551724137931, "grad_norm": 2.5224337577819824, "learning_rate": 1.1630092850023148e-06, "loss": 0.566, "step": 220 }, { "epoch": 0.6927899686520376, "grad_norm": 2.492403030395508, "learning_rate": 1.141601520442262e-06, "loss": 0.5415, "step": 221 }, { "epoch": 0.6959247648902821, "grad_norm": 2.5345394611358643, "learning_rate": 1.120334168531081e-06, "loss": 0.5301, "step": 222 }, { "epoch": 0.6990595611285266, "grad_norm": 2.418922185897827, "learning_rate": 1.0992094275966256e-06, "loss": 0.5764, "step": 223 }, { "epoch": 0.7021943573667712, "grad_norm": 3.3536760807037354, "learning_rate": 1.078229481225582e-06, "loss": 0.5596, "step": 224 }, { "epoch": 0.7053291536050157, "grad_norm": 2.531526803970337, "learning_rate": 1.0573964980377517e-06, "loss": 0.549, "step": 225 }, { "epoch": 0.7084639498432602, "grad_norm": 2.7442548274993896, "learning_rate": 1.0367126314618946e-06, "loss": 0.5025, "step": 226 }, { "epoch": 0.7115987460815048, "grad_norm": 2.368351459503174, "learning_rate": 1.0161800195131372e-06, "loss": 0.5311, "step": 227 }, { "epoch": 0.7147335423197492, "grad_norm": 2.8416459560394287, "learning_rate": 9.95800784571969e-07, "loss": 0.5243, "step": 228 }, { "epoch": 0.7178683385579937, "grad_norm": 2.772183656692505, "learning_rate": 9.755770331648642e-07, "loss": 0.5677, "step": 229 }, { "epoch": 0.7210031347962382, "grad_norm": 2.4133315086364746, "learning_rate": 9.555108557465383e-07, "loss": 0.5507, "step": 230 }, { "epoch": 0.7241379310344828, "grad_norm": 2.677746295928955, "learning_rate": 9.356043264838607e-07, "loss": 0.5553, "step": 231 }, { "epoch": 0.7272727272727273, "grad_norm": 2.8451437950134277, "learning_rate": 9.158595030414621e-07, "loss": 0.5135, "step": 232 }, { "epoch": 0.7304075235109718, "grad_norm": 2.838019609451294, "learning_rate": 8.962784263690358e-07, "loss": 0.59, "step": 233 }, { "epoch": 0.7335423197492164, "grad_norm": 2.865750312805176, "learning_rate": 8.768631204903738e-07, "loss": 0.5164, "step": 234 }, { "epoch": 0.7366771159874608, "grad_norm": 2.9796836376190186, "learning_rate": 8.576155922941548e-07, "loss": 0.5242, "step": 235 }, { "epoch": 0.7398119122257053, "grad_norm": 3.789559841156006, "learning_rate": 8.385378313264933e-07, "loss": 0.5419, "step": 236 }, { "epoch": 0.7429467084639498, "grad_norm": 2.552150249481201, "learning_rate": 8.196318095852909e-07, "loss": 0.5426, "step": 237 }, { "epoch": 0.7460815047021944, "grad_norm": 3.243431568145752, "learning_rate": 8.008994813163995e-07, "loss": 0.5121, "step": 238 }, { "epoch": 0.7492163009404389, "grad_norm": 3.1874783039093018, "learning_rate": 7.823427828116148e-07, "loss": 0.5512, "step": 239 }, { "epoch": 0.7523510971786834, "grad_norm": 2.545905828475952, "learning_rate": 7.6396363220853e-07, "loss": 0.5483, "step": 240 }, { "epoch": 0.7554858934169278, "grad_norm": 2.719782829284668, "learning_rate": 7.457639292922675e-07, "loss": 0.5683, "step": 241 }, { "epoch": 0.7586206896551724, "grad_norm": 2.9073195457458496, "learning_rate": 7.277455552991011e-07, "loss": 0.5711, "step": 242 }, { "epoch": 0.7617554858934169, "grad_norm": 2.301893949508667, "learning_rate": 7.099103727220024e-07, "loss": 0.533, "step": 243 }, { "epoch": 0.7648902821316614, "grad_norm": 2.9436652660369873, "learning_rate": 6.922602251181221e-07, "loss": 0.5447, "step": 244 }, { "epoch": 0.768025078369906, "grad_norm": 3.2471468448638916, "learning_rate": 6.747969369182248e-07, "loss": 0.5551, "step": 245 }, { "epoch": 0.7711598746081505, "grad_norm": 2.480755567550659, "learning_rate": 6.575223132381067e-07, "loss": 0.5143, "step": 246 }, { "epoch": 0.774294670846395, "grad_norm": 2.8075945377349854, "learning_rate": 6.4043813969201e-07, "loss": 0.5099, "step": 247 }, { "epoch": 0.7774294670846394, "grad_norm": 3.024644136428833, "learning_rate": 6.235461822080449e-07, "loss": 0.5393, "step": 248 }, { "epoch": 0.780564263322884, "grad_norm": 2.6839873790740967, "learning_rate": 6.068481868456558e-07, "loss": 0.5509, "step": 249 }, { "epoch": 0.7836990595611285, "grad_norm": 3.0200679302215576, "learning_rate": 5.903458796151382e-07, "loss": 0.5647, "step": 250 }, { "epoch": 0.786833855799373, "grad_norm": 2.65813946723938, "learning_rate": 5.740409662992244e-07, "loss": 0.5202, "step": 251 }, { "epoch": 0.7899686520376176, "grad_norm": 2.6419460773468018, "learning_rate": 5.579351322767643e-07, "loss": 0.5412, "step": 252 }, { "epoch": 0.7931034482758621, "grad_norm": 2.5918684005737305, "learning_rate": 5.420300423485167e-07, "loss": 0.5671, "step": 253 }, { "epoch": 0.7962382445141066, "grad_norm": 2.6645092964172363, "learning_rate": 5.263273405650601e-07, "loss": 0.5971, "step": 254 }, { "epoch": 0.799373040752351, "grad_norm": 2.6975386142730713, "learning_rate": 5.108286500568562e-07, "loss": 0.5569, "step": 255 }, { "epoch": 0.8025078369905956, "grad_norm": 2.585435628890991, "learning_rate": 4.95535572866474e-07, "loss": 0.5394, "step": 256 }, { "epoch": 0.8056426332288401, "grad_norm": 2.4776594638824463, "learning_rate": 4.804496897829883e-07, "loss": 0.5231, "step": 257 }, { "epoch": 0.8087774294670846, "grad_norm": 2.783409833908081, "learning_rate": 4.6557256017858485e-07, "loss": 0.5114, "step": 258 }, { "epoch": 0.8119122257053292, "grad_norm": 2.355269193649292, "learning_rate": 4.5090572184736863e-07, "loss": 0.5202, "step": 259 }, { "epoch": 0.8150470219435737, "grad_norm": 2.541964292526245, "learning_rate": 4.3645069084641195e-07, "loss": 0.5414, "step": 260 }, { "epoch": 0.8181818181818182, "grad_norm": 2.6445441246032715, "learning_rate": 4.222089613390412e-07, "loss": 0.5289, "step": 261 }, { "epoch": 0.8213166144200627, "grad_norm": 2.8741798400878906, "learning_rate": 4.0818200544039484e-07, "loss": 0.5541, "step": 262 }, { "epoch": 0.8244514106583072, "grad_norm": 3.0294582843780518, "learning_rate": 3.9437127306525295e-07, "loss": 0.5234, "step": 263 }, { "epoch": 0.8275862068965517, "grad_norm": 2.6354918479919434, "learning_rate": 3.8077819177816695e-07, "loss": 0.5061, "step": 264 }, { "epoch": 0.8307210031347962, "grad_norm": 2.5358927249908447, "learning_rate": 3.6740416664589634e-07, "loss": 0.5108, "step": 265 }, { "epoch": 0.8338557993730408, "grad_norm": 3.637833595275879, "learning_rate": 3.5425058009217193e-07, "loss": 0.5398, "step": 266 }, { "epoch": 0.8369905956112853, "grad_norm": 2.77534556388855, "learning_rate": 3.413187917548019e-07, "loss": 0.5727, "step": 267 }, { "epoch": 0.8401253918495298, "grad_norm": 2.57776141166687, "learning_rate": 3.2861013834512844e-07, "loss": 0.5309, "step": 268 }, { "epoch": 0.8432601880877743, "grad_norm": 2.4252471923828125, "learning_rate": 3.161259335098571e-07, "loss": 0.4912, "step": 269 }, { "epoch": 0.8463949843260188, "grad_norm": 4.025521278381348, "learning_rate": 3.0386746769527323e-07, "loss": 0.5448, "step": 270 }, { "epoch": 0.8495297805642633, "grad_norm": 2.6746034622192383, "learning_rate": 2.9183600801384853e-07, "loss": 0.5454, "step": 271 }, { "epoch": 0.8526645768025078, "grad_norm": 2.7830092906951904, "learning_rate": 2.8003279811326724e-07, "loss": 0.539, "step": 272 }, { "epoch": 0.8557993730407524, "grad_norm": 2.5394887924194336, "learning_rate": 2.684590580478749e-07, "loss": 0.5234, "step": 273 }, { "epoch": 0.8589341692789969, "grad_norm": 2.765644073486328, "learning_rate": 2.57115984152565e-07, "loss": 0.5105, "step": 274 }, { "epoch": 0.8620689655172413, "grad_norm": 2.5733540058135986, "learning_rate": 2.4600474891911696e-07, "loss": 0.5381, "step": 275 }, { "epoch": 0.8652037617554859, "grad_norm": 2.2739200592041016, "learning_rate": 2.3512650087500338e-07, "loss": 0.5344, "step": 276 }, { "epoch": 0.8683385579937304, "grad_norm": 2.446244478225708, "learning_rate": 2.2448236446466847e-07, "loss": 0.5271, "step": 277 }, { "epoch": 0.8714733542319749, "grad_norm": 2.833040237426758, "learning_rate": 2.140734399332975e-07, "loss": 0.5841, "step": 278 }, { "epoch": 0.8746081504702194, "grad_norm": 2.487649440765381, "learning_rate": 2.0390080321309236e-07, "loss": 0.5353, "step": 279 }, { "epoch": 0.877742946708464, "grad_norm": 3.27183198928833, "learning_rate": 1.9396550581205208e-07, "loss": 0.5181, "step": 280 }, { "epoch": 0.8808777429467085, "grad_norm": 2.5979673862457275, "learning_rate": 1.8426857470528414e-07, "loss": 0.521, "step": 281 }, { "epoch": 0.8840125391849529, "grad_norm": 2.452927589416504, "learning_rate": 1.7481101222885126e-07, "loss": 0.5394, "step": 282 }, { "epoch": 0.8871473354231975, "grad_norm": 3.278170585632324, "learning_rate": 1.6559379597616136e-07, "loss": 0.5098, "step": 283 }, { "epoch": 0.890282131661442, "grad_norm": 2.9773340225219727, "learning_rate": 1.5661787869691858e-07, "loss": 0.5046, "step": 284 }, { "epoch": 0.8934169278996865, "grad_norm": 2.6932992935180664, "learning_rate": 1.4788418819864037e-07, "loss": 0.5517, "step": 285 }, { "epoch": 0.896551724137931, "grad_norm": 2.7016708850860596, "learning_rate": 1.3939362725075344e-07, "loss": 0.5386, "step": 286 }, { "epoch": 0.8996865203761756, "grad_norm": 3.2767891883850098, "learning_rate": 1.3114707349127954e-07, "loss": 0.53, "step": 287 }, { "epoch": 0.9028213166144201, "grad_norm": 2.865053653717041, "learning_rate": 1.2314537933611425e-07, "loss": 0.5306, "step": 288 }, { "epoch": 0.9059561128526645, "grad_norm": 2.703111410140991, "learning_rate": 1.1538937189091825e-07, "loss": 0.5677, "step": 289 }, { "epoch": 0.9090909090909091, "grad_norm": 2.6944828033447266, "learning_rate": 1.0787985286562219e-07, "loss": 0.5488, "step": 290 }, { "epoch": 0.9122257053291536, "grad_norm": 2.602788209915161, "learning_rate": 1.00617598491555e-07, "loss": 0.5406, "step": 291 }, { "epoch": 0.9153605015673981, "grad_norm": 2.350580930709839, "learning_rate": 9.360335944121029e-08, "loss": 0.5027, "step": 292 }, { "epoch": 0.9184952978056427, "grad_norm": 2.7530674934387207, "learning_rate": 8.683786075065065e-08, "loss": 0.5458, "step": 293 }, { "epoch": 0.9216300940438872, "grad_norm": 2.564846992492676, "learning_rate": 8.032180174456283e-08, "loss": 0.5267, "step": 294 }, { "epoch": 0.9247648902821317, "grad_norm": 3.4500110149383545, "learning_rate": 7.405585596397314e-08, "loss": 0.5129, "step": 295 }, { "epoch": 0.9278996865203761, "grad_norm": 2.9540138244628906, "learning_rate": 6.804067109662443e-08, "loss": 0.5221, "step": 296 }, { "epoch": 0.9310344827586207, "grad_norm": 2.5607857704162598, "learning_rate": 6.227686891002671e-08, "loss": 0.4751, "step": 297 }, { "epoch": 0.9341692789968652, "grad_norm": 2.6755456924438477, "learning_rate": 5.6765045187187614e-08, "loss": 0.5014, "step": 298 }, { "epoch": 0.9373040752351097, "grad_norm": 3.1413800716400146, "learning_rate": 5.150576966503063e-08, "loss": 0.5191, "step": 299 }, { "epoch": 0.9404388714733543, "grad_norm": 2.7286934852600098, "learning_rate": 4.649958597549964e-08, "loss": 0.5229, "step": 300 }, { "epoch": 0.9435736677115988, "grad_norm": 4.638148784637451, "learning_rate": 4.174701158936895e-08, "loss": 0.5181, "step": 301 }, { "epoch": 0.9467084639498433, "grad_norm": 2.8824093341827393, "learning_rate": 3.7248537762752666e-08, "loss": 0.5899, "step": 302 }, { "epoch": 0.9498432601880877, "grad_norm": 3.2580602169036865, "learning_rate": 3.300462948632593e-08, "loss": 0.5234, "step": 303 }, { "epoch": 0.9529780564263323, "grad_norm": 2.773378610610962, "learning_rate": 2.9015725437259724e-08, "loss": 0.5406, "step": 304 }, { "epoch": 0.9561128526645768, "grad_norm": 2.5910532474517822, "learning_rate": 2.5282237933877962e-08, "loss": 0.5322, "step": 305 }, { "epoch": 0.9592476489028213, "grad_norm": 4.50349760055542, "learning_rate": 2.180455289303579e-08, "loss": 0.6053, "step": 306 }, { "epoch": 0.9623824451410659, "grad_norm": 2.9559998512268066, "learning_rate": 1.8583029790230356e-08, "loss": 0.4884, "step": 307 }, { "epoch": 0.9655172413793104, "grad_norm": 3.511509418487549, "learning_rate": 1.561800162244248e-08, "loss": 0.5252, "step": 308 }, { "epoch": 0.9686520376175548, "grad_norm": 2.7472832202911377, "learning_rate": 1.2909774873715585e-08, "loss": 0.5427, "step": 309 }, { "epoch": 0.9717868338557993, "grad_norm": 3.0372958183288574, "learning_rate": 1.0458629483476868e-08, "loss": 0.5447, "step": 310 }, { "epoch": 0.9749216300940439, "grad_norm": 2.645599126815796, "learning_rate": 8.264818817599052e-09, "loss": 0.5911, "step": 311 }, { "epoch": 0.9780564263322884, "grad_norm": 3.0988388061523438, "learning_rate": 6.328569642212734e-09, "loss": 0.522, "step": 312 }, { "epoch": 0.9811912225705329, "grad_norm": 4.376644134521484, "learning_rate": 4.6500821002654075e-09, "loss": 0.551, "step": 313 }, { "epoch": 0.9843260188087775, "grad_norm": 2.846813917160034, "learning_rate": 3.2295296908338437e-09, "loss": 0.5204, "step": 314 }, { "epoch": 0.987460815047022, "grad_norm": 4.044336318969727, "learning_rate": 2.067059251189274e-09, "loss": 0.5254, "step": 315 }, { "epoch": 0.9905956112852664, "grad_norm": 2.489978790283203, "learning_rate": 1.1627909416211947e-09, "loss": 0.5397, "step": 316 }, { "epoch": 0.9937304075235109, "grad_norm": 2.610431671142578, "learning_rate": 5.168182330145266e-10, "loss": 0.555, "step": 317 }, { "epoch": 0.9968652037617555, "grad_norm": 3.202317953109741, "learning_rate": 1.292078971898425e-10, "loss": 0.5094, "step": 318 }, { "epoch": 1.0, "grad_norm": 2.7502589225769043, "learning_rate": 0.0, "loss": 0.5585, "step": 319 }, { "epoch": 1.0, "step": 319, "total_flos": 834160017014784.0, "train_loss": 0.6235808798325099, "train_runtime": 6415.1151, "train_samples_per_second": 3.182, "train_steps_per_second": 0.05 } ], "logging_steps": 1.0, "max_steps": 319, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 400, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 834160017014784.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }