| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 500, |
| "global_step": 319, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.003134796238244514, |
| "grad_norm": 43.949100494384766, |
| "learning_rate": 5.000000000000001e-07, |
| "loss": 1.8127, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.006269592476489028, |
| "grad_norm": 46.380184173583984, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 1.782, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.009404388714733543, |
| "grad_norm": 42.30755615234375, |
| "learning_rate": 1.5e-06, |
| "loss": 1.7996, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.012539184952978056, |
| "grad_norm": 42.93296813964844, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 1.789, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.01567398119122257, |
| "grad_norm": 38.31450653076172, |
| "learning_rate": 2.5e-06, |
| "loss": 1.6763, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.018808777429467086, |
| "grad_norm": 30.79650115966797, |
| "learning_rate": 3e-06, |
| "loss": 1.4802, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.0219435736677116, |
| "grad_norm": 37.57395935058594, |
| "learning_rate": 3.5e-06, |
| "loss": 1.4183, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.025078369905956112, |
| "grad_norm": 19.238168716430664, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 1.1746, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.02821316614420063, |
| "grad_norm": 20.116256713867188, |
| "learning_rate": 4.5e-06, |
| "loss": 1.0266, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.03134796238244514, |
| "grad_norm": 8.39165210723877, |
| "learning_rate": 5e-06, |
| "loss": 0.9115, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.034482758620689655, |
| "grad_norm": 9.131606101989746, |
| "learning_rate": 4.9998707921028104e-06, |
| "loss": 0.8921, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.03761755485893417, |
| "grad_norm": 6.591880798339844, |
| "learning_rate": 4.999483181766986e-06, |
| "loss": 0.8571, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.04075235109717868, |
| "grad_norm": 6.148767948150635, |
| "learning_rate": 4.998837209058379e-06, |
| "loss": 0.8377, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.0438871473354232, |
| "grad_norm": 5.670365333557129, |
| "learning_rate": 4.997932940748811e-06, |
| "loss": 0.8791, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.047021943573667714, |
| "grad_norm": 4.403618335723877, |
| "learning_rate": 4.996770470309167e-06, |
| "loss": 0.892, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.050156739811912224, |
| "grad_norm": 3.835578441619873, |
| "learning_rate": 4.995349917899735e-06, |
| "loss": 0.7415, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.05329153605015674, |
| "grad_norm": 4.489407062530518, |
| "learning_rate": 4.993671430357788e-06, |
| "loss": 0.7822, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.05642633228840126, |
| "grad_norm": 5.612992763519287, |
| "learning_rate": 4.991735181182401e-06, |
| "loss": 0.7184, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.05956112852664577, |
| "grad_norm": 13.66352367401123, |
| "learning_rate": 4.989541370516523e-06, |
| "loss": 0.7222, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.06269592476489028, |
| "grad_norm": 3.666353940963745, |
| "learning_rate": 4.987090225126285e-06, |
| "loss": 0.7231, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.06583072100313479, |
| "grad_norm": 4.0020294189453125, |
| "learning_rate": 4.9843819983775575e-06, |
| "loss": 0.7674, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.06896551724137931, |
| "grad_norm": 3.6200690269470215, |
| "learning_rate": 4.98141697020977e-06, |
| "loss": 0.7478, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.07210031347962383, |
| "grad_norm": 3.7657089233398438, |
| "learning_rate": 4.978195447106965e-06, |
| "loss": 0.7388, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.07523510971786834, |
| "grad_norm": 3.262274742126465, |
| "learning_rate": 4.974717762066123e-06, |
| "loss": 0.7176, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.07836990595611286, |
| "grad_norm": 3.8943138122558594, |
| "learning_rate": 4.970984274562741e-06, |
| "loss": 0.6745, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.08150470219435736, |
| "grad_norm": 3.2295124530792236, |
| "learning_rate": 4.966995370513675e-06, |
| "loss": 0.7079, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.08463949843260188, |
| "grad_norm": 4.9130730628967285, |
| "learning_rate": 4.962751462237248e-06, |
| "loss": 0.7824, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.0877742946708464, |
| "grad_norm": 3.4682295322418213, |
| "learning_rate": 4.958252988410631e-06, |
| "loss": 0.7122, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.09090909090909091, |
| "grad_norm": 3.1807892322540283, |
| "learning_rate": 4.9535004140245005e-06, |
| "loss": 0.752, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.09404388714733543, |
| "grad_norm": 6.412112712860107, |
| "learning_rate": 4.94849423033497e-06, |
| "loss": 0.7503, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.09717868338557993, |
| "grad_norm": 3.3517048358917236, |
| "learning_rate": 4.943234954812812e-06, |
| "loss": 0.6783, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.10031347962382445, |
| "grad_norm": 3.2728397846221924, |
| "learning_rate": 4.937723131089974e-06, |
| "loss": 0.713, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.10344827586206896, |
| "grad_norm": 3.6628053188323975, |
| "learning_rate": 4.931959328903376e-06, |
| "loss": 0.6842, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.10658307210031348, |
| "grad_norm": 4.155019283294678, |
| "learning_rate": 4.925944144036027e-06, |
| "loss": 0.6627, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.109717868338558, |
| "grad_norm": 2.7333738803863525, |
| "learning_rate": 4.919678198255438e-06, |
| "loss": 0.6807, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.11285266457680251, |
| "grad_norm": 3.560575246810913, |
| "learning_rate": 4.91316213924935e-06, |
| "loss": 0.6802, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.11598746081504702, |
| "grad_norm": 2.562624454498291, |
| "learning_rate": 4.90639664055879e-06, |
| "loss": 0.6264, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.11912225705329153, |
| "grad_norm": 5.172059059143066, |
| "learning_rate": 4.899382401508446e-06, |
| "loss": 0.6783, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.12225705329153605, |
| "grad_norm": 3.2025134563446045, |
| "learning_rate": 4.892120147134378e-06, |
| "loss": 0.7013, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.12539184952978055, |
| "grad_norm": 2.37896466255188, |
| "learning_rate": 4.884610628109082e-06, |
| "loss": 0.6241, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.12852664576802508, |
| "grad_norm": 2.381072759628296, |
| "learning_rate": 4.876854620663887e-06, |
| "loss": 0.6328, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.13166144200626959, |
| "grad_norm": 2.695284605026245, |
| "learning_rate": 4.868852926508721e-06, |
| "loss": 0.6626, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.13479623824451412, |
| "grad_norm": 6.515556335449219, |
| "learning_rate": 4.860606372749247e-06, |
| "loss": 0.7056, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.13793103448275862, |
| "grad_norm": 3.1236801147460938, |
| "learning_rate": 4.8521158118013605e-06, |
| "loss": 0.678, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.14106583072100312, |
| "grad_norm": 3.5594000816345215, |
| "learning_rate": 4.843382121303082e-06, |
| "loss": 0.6649, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.14420062695924765, |
| "grad_norm": 2.737841844558716, |
| "learning_rate": 4.83440620402384e-06, |
| "loss": 0.5953, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.14733542319749215, |
| "grad_norm": 2.5807104110717773, |
| "learning_rate": 4.825188987771149e-06, |
| "loss": 0.6678, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.15047021943573669, |
| "grad_norm": 2.5918169021606445, |
| "learning_rate": 4.815731425294716e-06, |
| "loss": 0.7223, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.1536050156739812, |
| "grad_norm": 2.677417278289795, |
| "learning_rate": 4.806034494187949e-06, |
| "loss": 0.6688, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.15673981191222572, |
| "grad_norm": 2.6603641510009766, |
| "learning_rate": 4.796099196786908e-06, |
| "loss": 0.6652, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.15987460815047022, |
| "grad_norm": 2.502426862716675, |
| "learning_rate": 4.785926560066703e-06, |
| "loss": 0.6092, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.16300940438871472, |
| "grad_norm": 6.598116397857666, |
| "learning_rate": 4.775517635535332e-06, |
| "loss": 0.6496, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.16614420062695925, |
| "grad_norm": 3.122972249984741, |
| "learning_rate": 4.764873499124997e-06, |
| "loss": 0.6687, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.16927899686520376, |
| "grad_norm": 3.360384225845337, |
| "learning_rate": 4.753995251080884e-06, |
| "loss": 0.6911, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.1724137931034483, |
| "grad_norm": 2.609579563140869, |
| "learning_rate": 4.742884015847436e-06, |
| "loss": 0.6397, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.1755485893416928, |
| "grad_norm": 2.5354106426239014, |
| "learning_rate": 4.731540941952126e-06, |
| "loss": 0.6009, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.1786833855799373, |
| "grad_norm": 2.5063207149505615, |
| "learning_rate": 4.719967201886734e-06, |
| "loss": 0.5991, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.18181818181818182, |
| "grad_norm": 2.7579185962677, |
| "learning_rate": 4.708163991986152e-06, |
| "loss": 0.6064, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.18495297805642633, |
| "grad_norm": 2.891486883163452, |
| "learning_rate": 4.696132532304727e-06, |
| "loss": 0.6391, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.18808777429467086, |
| "grad_norm": 2.916727304458618, |
| "learning_rate": 4.683874066490143e-06, |
| "loss": 0.6727, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.19122257053291536, |
| "grad_norm": 2.708406686782837, |
| "learning_rate": 4.671389861654873e-06, |
| "loss": 0.6499, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.19435736677115986, |
| "grad_norm": 4.942819118499756, |
| "learning_rate": 4.658681208245198e-06, |
| "loss": 0.6356, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.1974921630094044, |
| "grad_norm": 3.262690544128418, |
| "learning_rate": 4.645749419907829e-06, |
| "loss": 0.6732, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.2006269592476489, |
| "grad_norm": 2.6621508598327637, |
| "learning_rate": 4.632595833354105e-06, |
| "loss": 0.6615, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.20376175548589343, |
| "grad_norm": 2.9821105003356934, |
| "learning_rate": 4.619221808221833e-06, |
| "loss": 0.6667, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.20689655172413793, |
| "grad_norm": 5.076481342315674, |
| "learning_rate": 4.605628726934747e-06, |
| "loss": 0.6505, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.21003134796238246, |
| "grad_norm": 4.245087623596191, |
| "learning_rate": 4.5918179945596055e-06, |
| "loss": 0.6129, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.21316614420062696, |
| "grad_norm": 3.060851812362671, |
| "learning_rate": 4.577791038660959e-06, |
| "loss": 0.6436, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.21630094043887146, |
| "grad_norm": 3.0752642154693604, |
| "learning_rate": 4.563549309153589e-06, |
| "loss": 0.6957, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.219435736677116, |
| "grad_norm": 2.996229887008667, |
| "learning_rate": 4.549094278152631e-06, |
| "loss": 0.6877, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.2225705329153605, |
| "grad_norm": 4.217442035675049, |
| "learning_rate": 4.534427439821416e-06, |
| "loss": 0.6231, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.22570532915360503, |
| "grad_norm": 3.009230136871338, |
| "learning_rate": 4.519550310217013e-06, |
| "loss": 0.6574, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.22884012539184953, |
| "grad_norm": 3.037616491317749, |
| "learning_rate": 4.504464427133527e-06, |
| "loss": 0.6238, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.23197492163009403, |
| "grad_norm": 3.02176833152771, |
| "learning_rate": 4.489171349943144e-06, |
| "loss": 0.6985, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.23510971786833856, |
| "grad_norm": 3.0625197887420654, |
| "learning_rate": 4.473672659434941e-06, |
| "loss": 0.7531, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.23824451410658307, |
| "grad_norm": 3.099853277206421, |
| "learning_rate": 4.457969957651485e-06, |
| "loss": 0.6188, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.2413793103448276, |
| "grad_norm": 3.041682004928589, |
| "learning_rate": 4.442064867723236e-06, |
| "loss": 0.6754, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.2445141065830721, |
| "grad_norm": 2.7343809604644775, |
| "learning_rate": 4.425959033700776e-06, |
| "loss": 0.615, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.2476489028213166, |
| "grad_norm": 2.6281471252441406, |
| "learning_rate": 4.409654120384863e-06, |
| "loss": 0.6247, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.2507836990595611, |
| "grad_norm": 2.4146194458007812, |
| "learning_rate": 4.393151813154345e-06, |
| "loss": 0.6237, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.25391849529780564, |
| "grad_norm": 2.641150712966919, |
| "learning_rate": 4.3764538177919555e-06, |
| "loss": 0.5891, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.25705329153605017, |
| "grad_norm": 2.4911434650421143, |
| "learning_rate": 4.35956186030799e-06, |
| "loss": 0.6456, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.2601880877742947, |
| "grad_norm": 3.6674325466156006, |
| "learning_rate": 4.3424776867618935e-06, |
| "loss": 0.5946, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.26332288401253917, |
| "grad_norm": 8.596332550048828, |
| "learning_rate": 4.325203063081776e-06, |
| "loss": 0.6133, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.2664576802507837, |
| "grad_norm": 2.6561825275421143, |
| "learning_rate": 4.307739774881878e-06, |
| "loss": 0.6654, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.26959247648902823, |
| "grad_norm": 3.8426294326782227, |
| "learning_rate": 4.290089627277998e-06, |
| "loss": 0.6124, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.2727272727272727, |
| "grad_norm": 3.1375346183776855, |
| "learning_rate": 4.2722544447008995e-06, |
| "loss": 0.6183, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.27586206896551724, |
| "grad_norm": 2.7622690200805664, |
| "learning_rate": 4.254236070707734e-06, |
| "loss": 0.6555, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.27899686520376177, |
| "grad_norm": 2.8743462562561035, |
| "learning_rate": 4.236036367791471e-06, |
| "loss": 0.5867, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.28213166144200624, |
| "grad_norm": 2.7058961391448975, |
| "learning_rate": 4.2176572171883865e-06, |
| "loss": 0.6346, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.2852664576802508, |
| "grad_norm": 2.520024538040161, |
| "learning_rate": 4.199100518683601e-06, |
| "loss": 0.646, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.2884012539184953, |
| "grad_norm": 2.6446547508239746, |
| "learning_rate": 4.18036819041471e-06, |
| "loss": 0.6502, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.29153605015673983, |
| "grad_norm": 2.559896945953369, |
| "learning_rate": 4.161462168673508e-06, |
| "loss": 0.6166, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.2946708463949843, |
| "grad_norm": 2.6093804836273193, |
| "learning_rate": 4.142384407705846e-06, |
| "loss": 0.607, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.29780564263322884, |
| "grad_norm": 2.8598270416259766, |
| "learning_rate": 4.123136879509626e-06, |
| "loss": 0.6094, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.30094043887147337, |
| "grad_norm": 2.6045126914978027, |
| "learning_rate": 4.103721573630965e-06, |
| "loss": 0.6639, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.30407523510971785, |
| "grad_norm": 2.7823448181152344, |
| "learning_rate": 4.084140496958539e-06, |
| "loss": 0.6341, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.3072100313479624, |
| "grad_norm": 2.320493459701538, |
| "learning_rate": 4.06439567351614e-06, |
| "loss": 0.5939, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.3103448275862069, |
| "grad_norm": 2.2848961353302, |
| "learning_rate": 4.0444891442534615e-06, |
| "loss": 0.5916, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.31347962382445144, |
| "grad_norm": 2.4216668605804443, |
| "learning_rate": 4.024422966835137e-06, |
| "loss": 0.6116, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.3166144200626959, |
| "grad_norm": 2.654069185256958, |
| "learning_rate": 4.004199215428032e-06, |
| "loss": 0.654, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.31974921630094044, |
| "grad_norm": 2.4687626361846924, |
| "learning_rate": 3.9838199804868635e-06, |
| "loss": 0.6136, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.322884012539185, |
| "grad_norm": 3.22712779045105, |
| "learning_rate": 3.963287368538105e-06, |
| "loss": 0.6208, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.32601880877742945, |
| "grad_norm": 2.4066531658172607, |
| "learning_rate": 3.942603501962249e-06, |
| "loss": 0.6191, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.329153605015674, |
| "grad_norm": 2.9688427448272705, |
| "learning_rate": 3.92177051877442e-06, |
| "loss": 0.6132, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.3322884012539185, |
| "grad_norm": 2.6026861667633057, |
| "learning_rate": 3.900790572403376e-06, |
| "loss": 0.5957, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.335423197492163, |
| "grad_norm": 3.1237998008728027, |
| "learning_rate": 3.8796658314689205e-06, |
| "loss": 0.6267, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.3385579937304075, |
| "grad_norm": 2.7337708473205566, |
| "learning_rate": 3.858398479557739e-06, |
| "loss": 0.6635, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.34169278996865204, |
| "grad_norm": 2.707108497619629, |
| "learning_rate": 3.836990714997686e-06, |
| "loss": 0.6217, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.3448275862068966, |
| "grad_norm": 2.4689037799835205, |
| "learning_rate": 3.815444750630555e-06, |
| "loss": 0.6364, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.34796238244514105, |
| "grad_norm": 2.9303574562072754, |
| "learning_rate": 3.7937628135833453e-06, |
| "loss": 0.6117, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.3510971786833856, |
| "grad_norm": 2.5677459239959717, |
| "learning_rate": 3.7719471450380518e-06, |
| "loss": 0.6007, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.3542319749216301, |
| "grad_norm": 2.688203811645508, |
| "learning_rate": 3.7500000000000005e-06, |
| "loss": 0.5778, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.3573667711598746, |
| "grad_norm": 2.5018489360809326, |
| "learning_rate": 3.7279236470647593e-06, |
| "loss": 0.5826, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.3605015673981191, |
| "grad_norm": 2.8333818912506104, |
| "learning_rate": 3.7057203681836407e-06, |
| "loss": 0.59, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.36363636363636365, |
| "grad_norm": 2.9619221687316895, |
| "learning_rate": 3.683392458427825e-06, |
| "loss": 0.6616, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.3667711598746082, |
| "grad_norm": 2.860719919204712, |
| "learning_rate": 3.660942225751126e-06, |
| "loss": 0.5618, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.36990595611285265, |
| "grad_norm": 2.507951021194458, |
| "learning_rate": 3.638371990751428e-06, |
| "loss": 0.6305, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.3730407523510972, |
| "grad_norm": 2.5088613033294678, |
| "learning_rate": 3.615684086430815e-06, |
| "loss": 0.5831, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.3761755485893417, |
| "grad_norm": 2.534837245941162, |
| "learning_rate": 3.592880857954413e-06, |
| "loss": 0.5818, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.3793103448275862, |
| "grad_norm": 2.4578607082366943, |
| "learning_rate": 3.5699646624079824e-06, |
| "loss": 0.6267, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.3824451410658307, |
| "grad_norm": 2.6599507331848145, |
| "learning_rate": 3.5469378685542742e-06, |
| "loss": 0.5954, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.38557993730407525, |
| "grad_norm": 3.10658597946167, |
| "learning_rate": 3.52380285658818e-06, |
| "loss": 0.5963, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.3887147335423197, |
| "grad_norm": 2.4190332889556885, |
| "learning_rate": 3.500562017890695e-06, |
| "loss": 0.6224, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.39184952978056425, |
| "grad_norm": 3.2278661727905273, |
| "learning_rate": 3.4772177547817387e-06, |
| "loss": 0.6182, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.3949843260188088, |
| "grad_norm": 2.7765655517578125, |
| "learning_rate": 3.4537724802718294e-06, |
| "loss": 0.6399, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.3981191222570533, |
| "grad_norm": 2.6910951137542725, |
| "learning_rate": 3.430228617812661e-06, |
| "loss": 0.5898, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.4012539184952978, |
| "grad_norm": 4.030726909637451, |
| "learning_rate": 3.4065886010466014e-06, |
| "loss": 0.6093, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.4043887147335423, |
| "grad_norm": 2.7327334880828857, |
| "learning_rate": 3.382854873555137e-06, |
| "loss": 0.5574, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.40752351097178685, |
| "grad_norm": 3.4407501220703125, |
| "learning_rate": 3.3590298886062833e-06, |
| "loss": 0.6339, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.4106583072100313, |
| "grad_norm": 2.798560380935669, |
| "learning_rate": 3.3351161089010055e-06, |
| "loss": 0.6152, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.41379310344827586, |
| "grad_norm": 2.4109294414520264, |
| "learning_rate": 3.3111160063186553e-06, |
| "loss": 0.5964, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.4169278996865204, |
| "grad_norm": 6.183897018432617, |
| "learning_rate": 3.2870320616614626e-06, |
| "loss": 0.5906, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.4200626959247649, |
| "grad_norm": 2.960341691970825, |
| "learning_rate": 3.2628667643981036e-06, |
| "loss": 0.663, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.4231974921630094, |
| "grad_norm": 2.809236764907837, |
| "learning_rate": 3.238622612406373e-06, |
| "loss": 0.6198, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.4263322884012539, |
| "grad_norm": 3.4371628761291504, |
| "learning_rate": 3.21430211171499e-06, |
| "loss": 0.6275, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.42946708463949845, |
| "grad_norm": 3.0913808345794678, |
| "learning_rate": 3.189907776244556e-06, |
| "loss": 0.6232, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.43260188087774293, |
| "grad_norm": 2.930027961730957, |
| "learning_rate": 3.1654421275477045e-06, |
| "loss": 0.5638, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.43573667711598746, |
| "grad_norm": 2.9050416946411133, |
| "learning_rate": 3.1409076945484513e-06, |
| "loss": 0.621, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.438871473354232, |
| "grad_norm": 2.39300274848938, |
| "learning_rate": 3.116307013280793e-06, |
| "loss": 0.5852, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.44200626959247646, |
| "grad_norm": 2.8745641708374023, |
| "learning_rate": 3.0916426266265676e-06, |
| "loss": 0.6119, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.445141065830721, |
| "grad_norm": 3.0072708129882812, |
| "learning_rate": 3.066917084052603e-06, |
| "loss": 0.5851, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.4482758620689655, |
| "grad_norm": 2.8062074184417725, |
| "learning_rate": 3.042132941347189e-06, |
| "loss": 0.586, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.45141065830721006, |
| "grad_norm": 3.202028274536133, |
| "learning_rate": 3.017292760355896e-06, |
| "loss": 0.6312, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.45454545454545453, |
| "grad_norm": 2.7520525455474854, |
| "learning_rate": 2.9923991087167657e-06, |
| "loss": 0.5769, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.45768025078369906, |
| "grad_norm": 2.7981884479522705, |
| "learning_rate": 2.967454559594903e-06, |
| "loss": 0.6349, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.4608150470219436, |
| "grad_norm": 2.9364802837371826, |
| "learning_rate": 2.9424616914164982e-06, |
| "loss": 0.5936, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.46394984326018807, |
| "grad_norm": 2.985931873321533, |
| "learning_rate": 2.917423087602306e-06, |
| "loss": 0.5731, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.4670846394984326, |
| "grad_norm": 2.4261667728424072, |
| "learning_rate": 2.8923413363006038e-06, |
| "loss": 0.602, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.4702194357366771, |
| "grad_norm": 2.6361424922943115, |
| "learning_rate": 2.8672190301196655e-06, |
| "loss": 0.5851, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.47335423197492166, |
| "grad_norm": 2.6938276290893555, |
| "learning_rate": 2.842058765859776e-06, |
| "loss": 0.6026, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.47648902821316613, |
| "grad_norm": 2.6547839641571045, |
| "learning_rate": 2.8168631442448046e-06, |
| "loss": 0.5863, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.47962382445141066, |
| "grad_norm": 2.8255505561828613, |
| "learning_rate": 2.791634769653381e-06, |
| "loss": 0.6096, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.4827586206896552, |
| "grad_norm": 2.461580514907837, |
| "learning_rate": 2.7663762498496905e-06, |
| "loss": 0.5744, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.48589341692789967, |
| "grad_norm": 2.9616644382476807, |
| "learning_rate": 2.741090195713917e-06, |
| "loss": 0.5849, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.4890282131661442, |
| "grad_norm": 2.7509751319885254, |
| "learning_rate": 2.7157792209723654e-06, |
| "loss": 0.5711, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.49216300940438873, |
| "grad_norm": 3.163322687149048, |
| "learning_rate": 2.6904459419272955e-06, |
| "loss": 0.6491, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.4952978056426332, |
| "grad_norm": 2.5019166469573975, |
| "learning_rate": 2.6650929771864776e-06, |
| "loss": 0.5608, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.49843260188087773, |
| "grad_norm": 2.8161232471466064, |
| "learning_rate": 2.639722947392521e-06, |
| "loss": 0.6116, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.5015673981191222, |
| "grad_norm": 2.9896793365478516, |
| "learning_rate": 2.614338474951987e-06, |
| "loss": 0.5859, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.5047021943573667, |
| "grad_norm": 2.9633073806762695, |
| "learning_rate": 2.5889421837643186e-06, |
| "loss": 0.5757, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.5078369905956113, |
| "grad_norm": 7.553648471832275, |
| "learning_rate": 2.563536698950624e-06, |
| "loss": 0.5985, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.5109717868338558, |
| "grad_norm": 2.4814646244049072, |
| "learning_rate": 2.538124646582315e-06, |
| "loss": 0.5918, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.5141065830721003, |
| "grad_norm": 2.601158380508423, |
| "learning_rate": 2.512708653409674e-06, |
| "loss": 0.5768, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.5172413793103449, |
| "grad_norm": 2.3953468799591064, |
| "learning_rate": 2.487291346590326e-06, |
| "loss": 0.5801, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.5203761755485894, |
| "grad_norm": 2.782278060913086, |
| "learning_rate": 2.4618753534176854e-06, |
| "loss": 0.5953, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.5235109717868338, |
| "grad_norm": 2.6170544624328613, |
| "learning_rate": 2.436463301049378e-06, |
| "loss": 0.584, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.5266457680250783, |
| "grad_norm": 2.2067277431488037, |
| "learning_rate": 2.4110578162356814e-06, |
| "loss": 0.5709, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.5297805642633229, |
| "grad_norm": 2.650946617126465, |
| "learning_rate": 2.385661525048014e-06, |
| "loss": 0.6157, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.5329153605015674, |
| "grad_norm": 2.4111557006835938, |
| "learning_rate": 2.3602770526074804e-06, |
| "loss": 0.587, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.5360501567398119, |
| "grad_norm": 2.574047088623047, |
| "learning_rate": 2.334907022813523e-06, |
| "loss": 0.5945, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.5391849529780565, |
| "grad_norm": 5.312939167022705, |
| "learning_rate": 2.3095540580727054e-06, |
| "loss": 0.5584, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.542319749216301, |
| "grad_norm": 2.999335289001465, |
| "learning_rate": 2.2842207790276355e-06, |
| "loss": 0.5588, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.5454545454545454, |
| "grad_norm": 2.8886678218841553, |
| "learning_rate": 2.2589098042860838e-06, |
| "loss": 0.5834, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.54858934169279, |
| "grad_norm": 2.7539076805114746, |
| "learning_rate": 2.2336237501503103e-06, |
| "loss": 0.529, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.5517241379310345, |
| "grad_norm": 2.600677967071533, |
| "learning_rate": 2.2083652303466196e-06, |
| "loss": 0.5694, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.554858934169279, |
| "grad_norm": 2.550015449523926, |
| "learning_rate": 2.1831368557551962e-06, |
| "loss": 0.5734, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.5579937304075235, |
| "grad_norm": 2.4202213287353516, |
| "learning_rate": 2.157941234140225e-06, |
| "loss": 0.5664, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.5611285266457681, |
| "grad_norm": 2.6022562980651855, |
| "learning_rate": 2.1327809698803354e-06, |
| "loss": 0.5516, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.5642633228840125, |
| "grad_norm": 2.3936119079589844, |
| "learning_rate": 2.1076586636993975e-06, |
| "loss": 0.5697, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.567398119122257, |
| "grad_norm": 2.6119375228881836, |
| "learning_rate": 2.0825769123976954e-06, |
| "loss": 0.5524, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.5705329153605015, |
| "grad_norm": 2.9607300758361816, |
| "learning_rate": 2.057538308583502e-06, |
| "loss": 0.5539, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.5736677115987461, |
| "grad_norm": 2.5394551753997803, |
| "learning_rate": 2.0325454404050983e-06, |
| "loss": 0.5902, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.5768025078369906, |
| "grad_norm": 2.248412609100342, |
| "learning_rate": 2.0076008912832355e-06, |
| "loss": 0.5684, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.5799373040752351, |
| "grad_norm": 2.369102954864502, |
| "learning_rate": 1.9827072396441044e-06, |
| "loss": 0.5473, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.5830721003134797, |
| "grad_norm": 2.8046441078186035, |
| "learning_rate": 1.957867058652812e-06, |
| "loss": 0.5125, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.5862068965517241, |
| "grad_norm": 2.5482161045074463, |
| "learning_rate": 1.933082915947398e-06, |
| "loss": 0.5586, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.5893416927899686, |
| "grad_norm": 2.53220534324646, |
| "learning_rate": 1.9083573733734328e-06, |
| "loss": 0.598, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.5924764890282131, |
| "grad_norm": 2.742966413497925, |
| "learning_rate": 1.8836929867192077e-06, |
| "loss": 0.5432, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.5956112852664577, |
| "grad_norm": 4.084339141845703, |
| "learning_rate": 1.8590923054515504e-06, |
| "loss": 0.5232, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.5987460815047022, |
| "grad_norm": 2.8514344692230225, |
| "learning_rate": 1.8345578724522957e-06, |
| "loss": 0.5748, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.6018808777429467, |
| "grad_norm": 2.736140012741089, |
| "learning_rate": 1.8100922237554442e-06, |
| "loss": 0.5315, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.6050156739811913, |
| "grad_norm": 2.4470133781433105, |
| "learning_rate": 1.7856978882850112e-06, |
| "loss": 0.5722, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.6081504702194357, |
| "grad_norm": 2.587562084197998, |
| "learning_rate": 1.7613773875936274e-06, |
| "loss": 0.5697, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.6112852664576802, |
| "grad_norm": 2.340610980987549, |
| "learning_rate": 1.7371332356018972e-06, |
| "loss": 0.5292, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.6144200626959248, |
| "grad_norm": 4.569915771484375, |
| "learning_rate": 1.7129679383385384e-06, |
| "loss": 0.582, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.6175548589341693, |
| "grad_norm": 2.7119388580322266, |
| "learning_rate": 1.688883993681345e-06, |
| "loss": 0.6219, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.6206896551724138, |
| "grad_norm": 2.8180112838745117, |
| "learning_rate": 1.6648838910989955e-06, |
| "loss": 0.5649, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.6238244514106583, |
| "grad_norm": 3.1212546825408936, |
| "learning_rate": 1.6409701113937182e-06, |
| "loss": 0.5269, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.6269592476489029, |
| "grad_norm": 3.6080574989318848, |
| "learning_rate": 1.617145126444864e-06, |
| "loss": 0.5903, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.6300940438871473, |
| "grad_norm": 2.5957369804382324, |
| "learning_rate": 1.5934113989533992e-06, |
| "loss": 0.6123, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.6332288401253918, |
| "grad_norm": 2.664415121078491, |
| "learning_rate": 1.5697713821873401e-06, |
| "loss": 0.6159, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.6363636363636364, |
| "grad_norm": 2.5966672897338867, |
| "learning_rate": 1.5462275197281717e-06, |
| "loss": 0.5255, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.6394984326018809, |
| "grad_norm": 2.552795886993408, |
| "learning_rate": 1.5227822452182617e-06, |
| "loss": 0.5485, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.6426332288401254, |
| "grad_norm": 2.581660032272339, |
| "learning_rate": 1.499437982109305e-06, |
| "loss": 0.5727, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.64576802507837, |
| "grad_norm": 2.6740126609802246, |
| "learning_rate": 1.4761971434118207e-06, |
| "loss": 0.568, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.6489028213166145, |
| "grad_norm": 2.314016342163086, |
| "learning_rate": 1.4530621314457255e-06, |
| "loss": 0.5335, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.6520376175548589, |
| "grad_norm": 2.5612449645996094, |
| "learning_rate": 1.430035337592018e-06, |
| "loss": 0.5422, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.6551724137931034, |
| "grad_norm": 6.558284759521484, |
| "learning_rate": 1.4071191420455873e-06, |
| "loss": 0.5938, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.658307210031348, |
| "grad_norm": 2.5115890502929688, |
| "learning_rate": 1.3843159135691859e-06, |
| "loss": 0.5194, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.6614420062695925, |
| "grad_norm": 3.0726919174194336, |
| "learning_rate": 1.3616280092485719e-06, |
| "loss": 0.554, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.664576802507837, |
| "grad_norm": 2.4502389430999756, |
| "learning_rate": 1.3390577742488747e-06, |
| "loss": 0.6057, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.6677115987460815, |
| "grad_norm": 2.853550434112549, |
| "learning_rate": 1.3166075415721762e-06, |
| "loss": 0.5049, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.670846394984326, |
| "grad_norm": 2.496123790740967, |
| "learning_rate": 1.2942796318163595e-06, |
| "loss": 0.5625, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.6739811912225705, |
| "grad_norm": 2.3185224533081055, |
| "learning_rate": 1.2720763529352415e-06, |
| "loss": 0.5336, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.677115987460815, |
| "grad_norm": 2.621919631958008, |
| "learning_rate": 1.2500000000000007e-06, |
| "loss": 0.539, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.6802507836990596, |
| "grad_norm": 2.744100570678711, |
| "learning_rate": 1.2280528549619487e-06, |
| "loss": 0.5213, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.6833855799373041, |
| "grad_norm": 2.494028329849243, |
| "learning_rate": 1.2062371864166553e-06, |
| "loss": 0.5419, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.6865203761755486, |
| "grad_norm": 2.599900245666504, |
| "learning_rate": 1.1845552493694462e-06, |
| "loss": 0.5456, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.6896551724137931, |
| "grad_norm": 2.5224337577819824, |
| "learning_rate": 1.1630092850023148e-06, |
| "loss": 0.566, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.6927899686520376, |
| "grad_norm": 2.492403030395508, |
| "learning_rate": 1.141601520442262e-06, |
| "loss": 0.5415, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.6959247648902821, |
| "grad_norm": 2.5345394611358643, |
| "learning_rate": 1.120334168531081e-06, |
| "loss": 0.5301, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.6990595611285266, |
| "grad_norm": 2.418922185897827, |
| "learning_rate": 1.0992094275966256e-06, |
| "loss": 0.5764, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.7021943573667712, |
| "grad_norm": 3.3536760807037354, |
| "learning_rate": 1.078229481225582e-06, |
| "loss": 0.5596, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.7053291536050157, |
| "grad_norm": 2.531526803970337, |
| "learning_rate": 1.0573964980377517e-06, |
| "loss": 0.549, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.7084639498432602, |
| "grad_norm": 2.7442548274993896, |
| "learning_rate": 1.0367126314618946e-06, |
| "loss": 0.5025, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.7115987460815048, |
| "grad_norm": 2.368351459503174, |
| "learning_rate": 1.0161800195131372e-06, |
| "loss": 0.5311, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.7147335423197492, |
| "grad_norm": 2.8416459560394287, |
| "learning_rate": 9.95800784571969e-07, |
| "loss": 0.5243, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.7178683385579937, |
| "grad_norm": 2.772183656692505, |
| "learning_rate": 9.755770331648642e-07, |
| "loss": 0.5677, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.7210031347962382, |
| "grad_norm": 2.4133315086364746, |
| "learning_rate": 9.555108557465383e-07, |
| "loss": 0.5507, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.7241379310344828, |
| "grad_norm": 2.677746295928955, |
| "learning_rate": 9.356043264838607e-07, |
| "loss": 0.5553, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.7272727272727273, |
| "grad_norm": 2.8451437950134277, |
| "learning_rate": 9.158595030414621e-07, |
| "loss": 0.5135, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.7304075235109718, |
| "grad_norm": 2.838019609451294, |
| "learning_rate": 8.962784263690358e-07, |
| "loss": 0.59, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.7335423197492164, |
| "grad_norm": 2.865750312805176, |
| "learning_rate": 8.768631204903738e-07, |
| "loss": 0.5164, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.7366771159874608, |
| "grad_norm": 2.9796836376190186, |
| "learning_rate": 8.576155922941548e-07, |
| "loss": 0.5242, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.7398119122257053, |
| "grad_norm": 3.789559841156006, |
| "learning_rate": 8.385378313264933e-07, |
| "loss": 0.5419, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.7429467084639498, |
| "grad_norm": 2.552150249481201, |
| "learning_rate": 8.196318095852909e-07, |
| "loss": 0.5426, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.7460815047021944, |
| "grad_norm": 3.243431568145752, |
| "learning_rate": 8.008994813163995e-07, |
| "loss": 0.5121, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.7492163009404389, |
| "grad_norm": 3.1874783039093018, |
| "learning_rate": 7.823427828116148e-07, |
| "loss": 0.5512, |
| "step": 239 |
| }, |
| { |
| "epoch": 0.7523510971786834, |
| "grad_norm": 2.545905828475952, |
| "learning_rate": 7.6396363220853e-07, |
| "loss": 0.5483, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.7554858934169278, |
| "grad_norm": 2.719782829284668, |
| "learning_rate": 7.457639292922675e-07, |
| "loss": 0.5683, |
| "step": 241 |
| }, |
| { |
| "epoch": 0.7586206896551724, |
| "grad_norm": 2.9073195457458496, |
| "learning_rate": 7.277455552991011e-07, |
| "loss": 0.5711, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.7617554858934169, |
| "grad_norm": 2.301893949508667, |
| "learning_rate": 7.099103727220024e-07, |
| "loss": 0.533, |
| "step": 243 |
| }, |
| { |
| "epoch": 0.7648902821316614, |
| "grad_norm": 2.9436652660369873, |
| "learning_rate": 6.922602251181221e-07, |
| "loss": 0.5447, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.768025078369906, |
| "grad_norm": 3.2471468448638916, |
| "learning_rate": 6.747969369182248e-07, |
| "loss": 0.5551, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.7711598746081505, |
| "grad_norm": 2.480755567550659, |
| "learning_rate": 6.575223132381067e-07, |
| "loss": 0.5143, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.774294670846395, |
| "grad_norm": 2.8075945377349854, |
| "learning_rate": 6.4043813969201e-07, |
| "loss": 0.5099, |
| "step": 247 |
| }, |
| { |
| "epoch": 0.7774294670846394, |
| "grad_norm": 3.024644136428833, |
| "learning_rate": 6.235461822080449e-07, |
| "loss": 0.5393, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.780564263322884, |
| "grad_norm": 2.6839873790740967, |
| "learning_rate": 6.068481868456558e-07, |
| "loss": 0.5509, |
| "step": 249 |
| }, |
| { |
| "epoch": 0.7836990595611285, |
| "grad_norm": 3.0200679302215576, |
| "learning_rate": 5.903458796151382e-07, |
| "loss": 0.5647, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.786833855799373, |
| "grad_norm": 2.65813946723938, |
| "learning_rate": 5.740409662992244e-07, |
| "loss": 0.5202, |
| "step": 251 |
| }, |
| { |
| "epoch": 0.7899686520376176, |
| "grad_norm": 2.6419460773468018, |
| "learning_rate": 5.579351322767643e-07, |
| "loss": 0.5412, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.7931034482758621, |
| "grad_norm": 2.5918684005737305, |
| "learning_rate": 5.420300423485167e-07, |
| "loss": 0.5671, |
| "step": 253 |
| }, |
| { |
| "epoch": 0.7962382445141066, |
| "grad_norm": 2.6645092964172363, |
| "learning_rate": 5.263273405650601e-07, |
| "loss": 0.5971, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.799373040752351, |
| "grad_norm": 2.6975386142730713, |
| "learning_rate": 5.108286500568562e-07, |
| "loss": 0.5569, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.8025078369905956, |
| "grad_norm": 2.585435628890991, |
| "learning_rate": 4.95535572866474e-07, |
| "loss": 0.5394, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.8056426332288401, |
| "grad_norm": 2.4776594638824463, |
| "learning_rate": 4.804496897829883e-07, |
| "loss": 0.5231, |
| "step": 257 |
| }, |
| { |
| "epoch": 0.8087774294670846, |
| "grad_norm": 2.783409833908081, |
| "learning_rate": 4.6557256017858485e-07, |
| "loss": 0.5114, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.8119122257053292, |
| "grad_norm": 2.355269193649292, |
| "learning_rate": 4.5090572184736863e-07, |
| "loss": 0.5202, |
| "step": 259 |
| }, |
| { |
| "epoch": 0.8150470219435737, |
| "grad_norm": 2.541964292526245, |
| "learning_rate": 4.3645069084641195e-07, |
| "loss": 0.5414, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.8181818181818182, |
| "grad_norm": 2.6445441246032715, |
| "learning_rate": 4.222089613390412e-07, |
| "loss": 0.5289, |
| "step": 261 |
| }, |
| { |
| "epoch": 0.8213166144200627, |
| "grad_norm": 2.8741798400878906, |
| "learning_rate": 4.0818200544039484e-07, |
| "loss": 0.5541, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.8244514106583072, |
| "grad_norm": 3.0294582843780518, |
| "learning_rate": 3.9437127306525295e-07, |
| "loss": 0.5234, |
| "step": 263 |
| }, |
| { |
| "epoch": 0.8275862068965517, |
| "grad_norm": 2.6354918479919434, |
| "learning_rate": 3.8077819177816695e-07, |
| "loss": 0.5061, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.8307210031347962, |
| "grad_norm": 2.5358927249908447, |
| "learning_rate": 3.6740416664589634e-07, |
| "loss": 0.5108, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.8338557993730408, |
| "grad_norm": 3.637833595275879, |
| "learning_rate": 3.5425058009217193e-07, |
| "loss": 0.5398, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.8369905956112853, |
| "grad_norm": 2.77534556388855, |
| "learning_rate": 3.413187917548019e-07, |
| "loss": 0.5727, |
| "step": 267 |
| }, |
| { |
| "epoch": 0.8401253918495298, |
| "grad_norm": 2.57776141166687, |
| "learning_rate": 3.2861013834512844e-07, |
| "loss": 0.5309, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.8432601880877743, |
| "grad_norm": 2.4252471923828125, |
| "learning_rate": 3.161259335098571e-07, |
| "loss": 0.4912, |
| "step": 269 |
| }, |
| { |
| "epoch": 0.8463949843260188, |
| "grad_norm": 4.025521278381348, |
| "learning_rate": 3.0386746769527323e-07, |
| "loss": 0.5448, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.8495297805642633, |
| "grad_norm": 2.6746034622192383, |
| "learning_rate": 2.9183600801384853e-07, |
| "loss": 0.5454, |
| "step": 271 |
| }, |
| { |
| "epoch": 0.8526645768025078, |
| "grad_norm": 2.7830092906951904, |
| "learning_rate": 2.8003279811326724e-07, |
| "loss": 0.539, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.8557993730407524, |
| "grad_norm": 2.5394887924194336, |
| "learning_rate": 2.684590580478749e-07, |
| "loss": 0.5234, |
| "step": 273 |
| }, |
| { |
| "epoch": 0.8589341692789969, |
| "grad_norm": 2.765644073486328, |
| "learning_rate": 2.57115984152565e-07, |
| "loss": 0.5105, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.8620689655172413, |
| "grad_norm": 2.5733540058135986, |
| "learning_rate": 2.4600474891911696e-07, |
| "loss": 0.5381, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.8652037617554859, |
| "grad_norm": 2.2739200592041016, |
| "learning_rate": 2.3512650087500338e-07, |
| "loss": 0.5344, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.8683385579937304, |
| "grad_norm": 2.446244478225708, |
| "learning_rate": 2.2448236446466847e-07, |
| "loss": 0.5271, |
| "step": 277 |
| }, |
| { |
| "epoch": 0.8714733542319749, |
| "grad_norm": 2.833040237426758, |
| "learning_rate": 2.140734399332975e-07, |
| "loss": 0.5841, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.8746081504702194, |
| "grad_norm": 2.487649440765381, |
| "learning_rate": 2.0390080321309236e-07, |
| "loss": 0.5353, |
| "step": 279 |
| }, |
| { |
| "epoch": 0.877742946708464, |
| "grad_norm": 3.27183198928833, |
| "learning_rate": 1.9396550581205208e-07, |
| "loss": 0.5181, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.8808777429467085, |
| "grad_norm": 2.5979673862457275, |
| "learning_rate": 1.8426857470528414e-07, |
| "loss": 0.521, |
| "step": 281 |
| }, |
| { |
| "epoch": 0.8840125391849529, |
| "grad_norm": 2.452927589416504, |
| "learning_rate": 1.7481101222885126e-07, |
| "loss": 0.5394, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.8871473354231975, |
| "grad_norm": 3.278170585632324, |
| "learning_rate": 1.6559379597616136e-07, |
| "loss": 0.5098, |
| "step": 283 |
| }, |
| { |
| "epoch": 0.890282131661442, |
| "grad_norm": 2.9773340225219727, |
| "learning_rate": 1.5661787869691858e-07, |
| "loss": 0.5046, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.8934169278996865, |
| "grad_norm": 2.6932992935180664, |
| "learning_rate": 1.4788418819864037e-07, |
| "loss": 0.5517, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.896551724137931, |
| "grad_norm": 2.7016708850860596, |
| "learning_rate": 1.3939362725075344e-07, |
| "loss": 0.5386, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.8996865203761756, |
| "grad_norm": 3.2767891883850098, |
| "learning_rate": 1.3114707349127954e-07, |
| "loss": 0.53, |
| "step": 287 |
| }, |
| { |
| "epoch": 0.9028213166144201, |
| "grad_norm": 2.865053653717041, |
| "learning_rate": 1.2314537933611425e-07, |
| "loss": 0.5306, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.9059561128526645, |
| "grad_norm": 2.703111410140991, |
| "learning_rate": 1.1538937189091825e-07, |
| "loss": 0.5677, |
| "step": 289 |
| }, |
| { |
| "epoch": 0.9090909090909091, |
| "grad_norm": 2.6944828033447266, |
| "learning_rate": 1.0787985286562219e-07, |
| "loss": 0.5488, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.9122257053291536, |
| "grad_norm": 2.602788209915161, |
| "learning_rate": 1.00617598491555e-07, |
| "loss": 0.5406, |
| "step": 291 |
| }, |
| { |
| "epoch": 0.9153605015673981, |
| "grad_norm": 2.350580930709839, |
| "learning_rate": 9.360335944121029e-08, |
| "loss": 0.5027, |
| "step": 292 |
| }, |
| { |
| "epoch": 0.9184952978056427, |
| "grad_norm": 2.7530674934387207, |
| "learning_rate": 8.683786075065065e-08, |
| "loss": 0.5458, |
| "step": 293 |
| }, |
| { |
| "epoch": 0.9216300940438872, |
| "grad_norm": 2.564846992492676, |
| "learning_rate": 8.032180174456283e-08, |
| "loss": 0.5267, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.9247648902821317, |
| "grad_norm": 3.4500110149383545, |
| "learning_rate": 7.405585596397314e-08, |
| "loss": 0.5129, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.9278996865203761, |
| "grad_norm": 2.9540138244628906, |
| "learning_rate": 6.804067109662443e-08, |
| "loss": 0.5221, |
| "step": 296 |
| }, |
| { |
| "epoch": 0.9310344827586207, |
| "grad_norm": 2.5607857704162598, |
| "learning_rate": 6.227686891002671e-08, |
| "loss": 0.4751, |
| "step": 297 |
| }, |
| { |
| "epoch": 0.9341692789968652, |
| "grad_norm": 2.6755456924438477, |
| "learning_rate": 5.6765045187187614e-08, |
| "loss": 0.5014, |
| "step": 298 |
| }, |
| { |
| "epoch": 0.9373040752351097, |
| "grad_norm": 3.1413800716400146, |
| "learning_rate": 5.150576966503063e-08, |
| "loss": 0.5191, |
| "step": 299 |
| }, |
| { |
| "epoch": 0.9404388714733543, |
| "grad_norm": 2.7286934852600098, |
| "learning_rate": 4.649958597549964e-08, |
| "loss": 0.5229, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.9435736677115988, |
| "grad_norm": 4.638148784637451, |
| "learning_rate": 4.174701158936895e-08, |
| "loss": 0.5181, |
| "step": 301 |
| }, |
| { |
| "epoch": 0.9467084639498433, |
| "grad_norm": 2.8824093341827393, |
| "learning_rate": 3.7248537762752666e-08, |
| "loss": 0.5899, |
| "step": 302 |
| }, |
| { |
| "epoch": 0.9498432601880877, |
| "grad_norm": 3.2580602169036865, |
| "learning_rate": 3.300462948632593e-08, |
| "loss": 0.5234, |
| "step": 303 |
| }, |
| { |
| "epoch": 0.9529780564263323, |
| "grad_norm": 2.773378610610962, |
| "learning_rate": 2.9015725437259724e-08, |
| "loss": 0.5406, |
| "step": 304 |
| }, |
| { |
| "epoch": 0.9561128526645768, |
| "grad_norm": 2.5910532474517822, |
| "learning_rate": 2.5282237933877962e-08, |
| "loss": 0.5322, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.9592476489028213, |
| "grad_norm": 4.50349760055542, |
| "learning_rate": 2.180455289303579e-08, |
| "loss": 0.6053, |
| "step": 306 |
| }, |
| { |
| "epoch": 0.9623824451410659, |
| "grad_norm": 2.9559998512268066, |
| "learning_rate": 1.8583029790230356e-08, |
| "loss": 0.4884, |
| "step": 307 |
| }, |
| { |
| "epoch": 0.9655172413793104, |
| "grad_norm": 3.511509418487549, |
| "learning_rate": 1.561800162244248e-08, |
| "loss": 0.5252, |
| "step": 308 |
| }, |
| { |
| "epoch": 0.9686520376175548, |
| "grad_norm": 2.7472832202911377, |
| "learning_rate": 1.2909774873715585e-08, |
| "loss": 0.5427, |
| "step": 309 |
| }, |
| { |
| "epoch": 0.9717868338557993, |
| "grad_norm": 3.0372958183288574, |
| "learning_rate": 1.0458629483476868e-08, |
| "loss": 0.5447, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.9749216300940439, |
| "grad_norm": 2.645599126815796, |
| "learning_rate": 8.264818817599052e-09, |
| "loss": 0.5911, |
| "step": 311 |
| }, |
| { |
| "epoch": 0.9780564263322884, |
| "grad_norm": 3.0988388061523438, |
| "learning_rate": 6.328569642212734e-09, |
| "loss": 0.522, |
| "step": 312 |
| }, |
| { |
| "epoch": 0.9811912225705329, |
| "grad_norm": 4.376644134521484, |
| "learning_rate": 4.6500821002654075e-09, |
| "loss": 0.551, |
| "step": 313 |
| }, |
| { |
| "epoch": 0.9843260188087775, |
| "grad_norm": 2.846813917160034, |
| "learning_rate": 3.2295296908338437e-09, |
| "loss": 0.5204, |
| "step": 314 |
| }, |
| { |
| "epoch": 0.987460815047022, |
| "grad_norm": 4.044336318969727, |
| "learning_rate": 2.067059251189274e-09, |
| "loss": 0.5254, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.9905956112852664, |
| "grad_norm": 2.489978790283203, |
| "learning_rate": 1.1627909416211947e-09, |
| "loss": 0.5397, |
| "step": 316 |
| }, |
| { |
| "epoch": 0.9937304075235109, |
| "grad_norm": 2.610431671142578, |
| "learning_rate": 5.168182330145266e-10, |
| "loss": 0.555, |
| "step": 317 |
| }, |
| { |
| "epoch": 0.9968652037617555, |
| "grad_norm": 3.202317953109741, |
| "learning_rate": 1.292078971898425e-10, |
| "loss": 0.5094, |
| "step": 318 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 2.7502589225769043, |
| "learning_rate": 0.0, |
| "loss": 0.5585, |
| "step": 319 |
| }, |
| { |
| "epoch": 1.0, |
| "step": 319, |
| "total_flos": 834160017014784.0, |
| "train_loss": 0.6235808798325099, |
| "train_runtime": 6415.1151, |
| "train_samples_per_second": 3.182, |
| "train_steps_per_second": 0.05 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 319, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 400, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 834160017014784.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|