| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.01953125, | |
| "eval_steps": 500, | |
| "global_step": 256, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.00390625, | |
| "grad_norm": 988.0331420898438, | |
| "learning_rate": 3.846153846153847e-06, | |
| "loss": 23.0443, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.0078125, | |
| "grad_norm": 1018.0032958984375, | |
| "learning_rate": 7.692307692307694e-06, | |
| "loss": 22.2415, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.01171875, | |
| "grad_norm": 905.46435546875, | |
| "learning_rate": 1.153846153846154e-05, | |
| "loss": 23.9407, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.015625, | |
| "grad_norm": 772.3015747070312, | |
| "learning_rate": 1.5384615384615387e-05, | |
| "loss": 20.3585, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.01953125, | |
| "grad_norm": 1083.967529296875, | |
| "learning_rate": 1.923076923076923e-05, | |
| "loss": 17.7004, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.0234375, | |
| "grad_norm": 518.4392700195312, | |
| "learning_rate": 2.307692307692308e-05, | |
| "loss": 14.8905, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.02734375, | |
| "grad_norm": 588.9624633789062, | |
| "learning_rate": 2.6923076923076923e-05, | |
| "loss": 15.46, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.03125, | |
| "grad_norm": 291.337646484375, | |
| "learning_rate": 3.0769230769230774e-05, | |
| "loss": 11.7233, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.03515625, | |
| "grad_norm": 243.8762969970703, | |
| "learning_rate": 3.461538461538462e-05, | |
| "loss": 9.8939, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.0390625, | |
| "grad_norm": 306.4288635253906, | |
| "learning_rate": 3.846153846153846e-05, | |
| "loss": 10.5548, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.04296875, | |
| "grad_norm": 138.4855194091797, | |
| "learning_rate": 4.230769230769231e-05, | |
| "loss": 7.1429, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.046875, | |
| "grad_norm": 79.80815887451172, | |
| "learning_rate": 4.615384615384616e-05, | |
| "loss": 7.0453, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.05078125, | |
| "grad_norm": 95.14498901367188, | |
| "learning_rate": 5e-05, | |
| "loss": 6.2978, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.0546875, | |
| "grad_norm": 76.86641693115234, | |
| "learning_rate": 4.9794238683127575e-05, | |
| "loss": 6.9177, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.05859375, | |
| "grad_norm": 90.26302337646484, | |
| "learning_rate": 4.958847736625515e-05, | |
| "loss": 5.7372, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.0625, | |
| "grad_norm": 51.85117721557617, | |
| "learning_rate": 4.938271604938271e-05, | |
| "loss": 5.7694, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.06640625, | |
| "grad_norm": 28.343177795410156, | |
| "learning_rate": 4.9176954732510286e-05, | |
| "loss": 5.8857, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.0703125, | |
| "grad_norm": 28.591285705566406, | |
| "learning_rate": 4.8971193415637865e-05, | |
| "loss": 5.2027, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.07421875, | |
| "grad_norm": 23.616016387939453, | |
| "learning_rate": 4.876543209876544e-05, | |
| "loss": 4.8946, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.078125, | |
| "grad_norm": 20.2352294921875, | |
| "learning_rate": 4.855967078189301e-05, | |
| "loss": 5.0189, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.08203125, | |
| "grad_norm": 16.59236717224121, | |
| "learning_rate": 4.835390946502058e-05, | |
| "loss": 4.9116, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.0859375, | |
| "grad_norm": 15.008003234863281, | |
| "learning_rate": 4.814814814814815e-05, | |
| "loss": 4.8674, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.08984375, | |
| "grad_norm": 11.656904220581055, | |
| "learning_rate": 4.794238683127572e-05, | |
| "loss": 4.96, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.09375, | |
| "grad_norm": 17.49643898010254, | |
| "learning_rate": 4.773662551440329e-05, | |
| "loss": 4.7911, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.09765625, | |
| "grad_norm": 12.967889785766602, | |
| "learning_rate": 4.7530864197530866e-05, | |
| "loss": 4.7574, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.1015625, | |
| "grad_norm": 13.78781795501709, | |
| "learning_rate": 4.732510288065844e-05, | |
| "loss": 4.5257, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.10546875, | |
| "grad_norm": 14.40069580078125, | |
| "learning_rate": 4.711934156378601e-05, | |
| "loss": 4.6842, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.109375, | |
| "grad_norm": 8.558459281921387, | |
| "learning_rate": 4.691358024691358e-05, | |
| "loss": 4.4912, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.11328125, | |
| "grad_norm": 9.563591003417969, | |
| "learning_rate": 4.6707818930041156e-05, | |
| "loss": 4.3927, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.1171875, | |
| "grad_norm": 13.471641540527344, | |
| "learning_rate": 4.650205761316873e-05, | |
| "loss": 4.446, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.12109375, | |
| "grad_norm": 8.875003814697266, | |
| "learning_rate": 4.62962962962963e-05, | |
| "loss": 4.4367, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.125, | |
| "grad_norm": 9.281575202941895, | |
| "learning_rate": 4.609053497942387e-05, | |
| "loss": 4.4643, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.12890625, | |
| "grad_norm": 9.402353286743164, | |
| "learning_rate": 4.5884773662551446e-05, | |
| "loss": 4.4036, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.1328125, | |
| "grad_norm": 8.509848594665527, | |
| "learning_rate": 4.567901234567901e-05, | |
| "loss": 4.3953, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.13671875, | |
| "grad_norm": 9.05785846710205, | |
| "learning_rate": 4.5473251028806584e-05, | |
| "loss": 4.4259, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.140625, | |
| "grad_norm": 7.090909481048584, | |
| "learning_rate": 4.5267489711934157e-05, | |
| "loss": 4.2375, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.14453125, | |
| "grad_norm": 10.807791709899902, | |
| "learning_rate": 4.506172839506173e-05, | |
| "loss": 4.3719, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.1484375, | |
| "grad_norm": 13.562170028686523, | |
| "learning_rate": 4.48559670781893e-05, | |
| "loss": 4.6153, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.15234375, | |
| "grad_norm": 11.16930103302002, | |
| "learning_rate": 4.4650205761316874e-05, | |
| "loss": 4.4007, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.15625, | |
| "grad_norm": 12.779727935791016, | |
| "learning_rate": 4.4444444444444447e-05, | |
| "loss": 4.5876, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.16015625, | |
| "grad_norm": 14.613138198852539, | |
| "learning_rate": 4.423868312757202e-05, | |
| "loss": 4.3842, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.1640625, | |
| "grad_norm": 16.920902252197266, | |
| "learning_rate": 4.403292181069959e-05, | |
| "loss": 4.2888, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.16796875, | |
| "grad_norm": 23.178911209106445, | |
| "learning_rate": 4.3827160493827164e-05, | |
| "loss": 4.4708, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.171875, | |
| "grad_norm": 28.53046989440918, | |
| "learning_rate": 4.3621399176954737e-05, | |
| "loss": 4.5108, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.17578125, | |
| "grad_norm": 19.8005428314209, | |
| "learning_rate": 4.341563786008231e-05, | |
| "loss": 4.4592, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.1796875, | |
| "grad_norm": 12.320777893066406, | |
| "learning_rate": 4.3209876543209875e-05, | |
| "loss": 4.5041, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.18359375, | |
| "grad_norm": 8.350976943969727, | |
| "learning_rate": 4.300411522633745e-05, | |
| "loss": 4.3769, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.1875, | |
| "grad_norm": 14.553681373596191, | |
| "learning_rate": 4.279835390946502e-05, | |
| "loss": 4.683, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.19140625, | |
| "grad_norm": 13.202040672302246, | |
| "learning_rate": 4.259259259259259e-05, | |
| "loss": 4.4271, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.1953125, | |
| "grad_norm": 12.252931594848633, | |
| "learning_rate": 4.2386831275720165e-05, | |
| "loss": 4.202, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.19921875, | |
| "grad_norm": 9.770834922790527, | |
| "learning_rate": 4.2181069958847744e-05, | |
| "loss": 4.4271, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.203125, | |
| "grad_norm": 10.324782371520996, | |
| "learning_rate": 4.197530864197531e-05, | |
| "loss": 4.3385, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.20703125, | |
| "grad_norm": 8.983942031860352, | |
| "learning_rate": 4.176954732510288e-05, | |
| "loss": 4.2885, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.2109375, | |
| "grad_norm": 8.984896659851074, | |
| "learning_rate": 4.1563786008230455e-05, | |
| "loss": 4.446, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.21484375, | |
| "grad_norm": 10.578001022338867, | |
| "learning_rate": 4.135802469135803e-05, | |
| "loss": 4.3787, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.21875, | |
| "grad_norm": 12.245546340942383, | |
| "learning_rate": 4.11522633744856e-05, | |
| "loss": 4.1896, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.22265625, | |
| "grad_norm": 12.414233207702637, | |
| "learning_rate": 4.094650205761317e-05, | |
| "loss": 4.4919, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.2265625, | |
| "grad_norm": 10.887884140014648, | |
| "learning_rate": 4.074074074074074e-05, | |
| "loss": 4.2051, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.23046875, | |
| "grad_norm": 11.619287490844727, | |
| "learning_rate": 4.053497942386831e-05, | |
| "loss": 4.015, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.234375, | |
| "grad_norm": 16.54395294189453, | |
| "learning_rate": 4.032921810699588e-05, | |
| "loss": 3.9459, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.23828125, | |
| "grad_norm": 22.401296615600586, | |
| "learning_rate": 4.012345679012346e-05, | |
| "loss": 3.8599, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.2421875, | |
| "grad_norm": 30.748231887817383, | |
| "learning_rate": 3.9917695473251035e-05, | |
| "loss": 3.8411, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.24609375, | |
| "grad_norm": 48.33995056152344, | |
| "learning_rate": 3.971193415637861e-05, | |
| "loss": 3.8338, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 40.25141906738281, | |
| "learning_rate": 3.950617283950617e-05, | |
| "loss": 3.7575, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.25390625, | |
| "grad_norm": 70.64269256591797, | |
| "learning_rate": 3.9300411522633746e-05, | |
| "loss": 3.7886, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.2578125, | |
| "grad_norm": 46.51031494140625, | |
| "learning_rate": 3.909465020576132e-05, | |
| "loss": 3.7727, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.26171875, | |
| "grad_norm": 44.5860481262207, | |
| "learning_rate": 3.888888888888889e-05, | |
| "loss": 3.7284, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.265625, | |
| "grad_norm": 37.13584899902344, | |
| "learning_rate": 3.868312757201646e-05, | |
| "loss": 3.7974, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.26953125, | |
| "grad_norm": 17.04970932006836, | |
| "learning_rate": 3.8477366255144036e-05, | |
| "loss": 3.7399, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.2734375, | |
| "grad_norm": 25.48470687866211, | |
| "learning_rate": 3.82716049382716e-05, | |
| "loss": 3.6582, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.27734375, | |
| "grad_norm": 15.296791076660156, | |
| "learning_rate": 3.806584362139918e-05, | |
| "loss": 3.759, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.28125, | |
| "grad_norm": 18.959625244140625, | |
| "learning_rate": 3.786008230452675e-05, | |
| "loss": 3.6962, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.28515625, | |
| "grad_norm": 16.249454498291016, | |
| "learning_rate": 3.7654320987654326e-05, | |
| "loss": 3.5659, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.2890625, | |
| "grad_norm": 17.6076602935791, | |
| "learning_rate": 3.74485596707819e-05, | |
| "loss": 3.6214, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.29296875, | |
| "grad_norm": 15.325469017028809, | |
| "learning_rate": 3.724279835390947e-05, | |
| "loss": 3.5469, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.296875, | |
| "grad_norm": 14.967309951782227, | |
| "learning_rate": 3.7037037037037037e-05, | |
| "loss": 3.5844, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.30078125, | |
| "grad_norm": 16.454774856567383, | |
| "learning_rate": 3.683127572016461e-05, | |
| "loss": 3.4725, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.3046875, | |
| "grad_norm": 15.80962085723877, | |
| "learning_rate": 3.662551440329218e-05, | |
| "loss": 3.4781, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.30859375, | |
| "grad_norm": 12.834327697753906, | |
| "learning_rate": 3.6419753086419754e-05, | |
| "loss": 3.6498, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.3125, | |
| "grad_norm": 12.81863784790039, | |
| "learning_rate": 3.6213991769547327e-05, | |
| "loss": 3.4208, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.31640625, | |
| "grad_norm": 16.791345596313477, | |
| "learning_rate": 3.60082304526749e-05, | |
| "loss": 3.4432, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.3203125, | |
| "grad_norm": 20.827123641967773, | |
| "learning_rate": 3.580246913580247e-05, | |
| "loss": 3.4386, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.32421875, | |
| "grad_norm": 14.668889999389648, | |
| "learning_rate": 3.5596707818930044e-05, | |
| "loss": 3.3887, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.328125, | |
| "grad_norm": 17.22427749633789, | |
| "learning_rate": 3.539094650205762e-05, | |
| "loss": 3.6215, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.33203125, | |
| "grad_norm": 16.612550735473633, | |
| "learning_rate": 3.518518518518519e-05, | |
| "loss": 3.4215, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.3359375, | |
| "grad_norm": 15.173816680908203, | |
| "learning_rate": 3.497942386831276e-05, | |
| "loss": 3.3305, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.33984375, | |
| "grad_norm": 18.891874313354492, | |
| "learning_rate": 3.4773662551440334e-05, | |
| "loss": 3.4129, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.34375, | |
| "grad_norm": 19.012630462646484, | |
| "learning_rate": 3.45679012345679e-05, | |
| "loss": 3.4039, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.34765625, | |
| "grad_norm": 25.285091400146484, | |
| "learning_rate": 3.436213991769547e-05, | |
| "loss": 3.7148, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.3515625, | |
| "grad_norm": 23.138742446899414, | |
| "learning_rate": 3.4156378600823045e-05, | |
| "loss": 3.3594, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.35546875, | |
| "grad_norm": 24.770191192626953, | |
| "learning_rate": 3.395061728395062e-05, | |
| "loss": 3.3106, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.359375, | |
| "grad_norm": 17.597614288330078, | |
| "learning_rate": 3.374485596707819e-05, | |
| "loss": 3.2024, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.36328125, | |
| "grad_norm": 13.329920768737793, | |
| "learning_rate": 3.353909465020576e-05, | |
| "loss": 3.196, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.3671875, | |
| "grad_norm": 20.01732635498047, | |
| "learning_rate": 3.3333333333333335e-05, | |
| "loss": 3.3282, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.37109375, | |
| "grad_norm": 21.501489639282227, | |
| "learning_rate": 3.312757201646091e-05, | |
| "loss": 3.2842, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.375, | |
| "grad_norm": 15.159469604492188, | |
| "learning_rate": 3.292181069958848e-05, | |
| "loss": 3.149, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.37890625, | |
| "grad_norm": 15.139326095581055, | |
| "learning_rate": 3.271604938271605e-05, | |
| "loss": 3.2335, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.3828125, | |
| "grad_norm": 14.6196870803833, | |
| "learning_rate": 3.2510288065843625e-05, | |
| "loss": 3.104, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.38671875, | |
| "grad_norm": 12.317699432373047, | |
| "learning_rate": 3.230452674897119e-05, | |
| "loss": 3.1088, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.390625, | |
| "grad_norm": 17.009883880615234, | |
| "learning_rate": 3.209876543209876e-05, | |
| "loss": 3.1713, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.39453125, | |
| "grad_norm": 22.170177459716797, | |
| "learning_rate": 3.1893004115226336e-05, | |
| "loss": 3.1348, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.3984375, | |
| "grad_norm": 18.459367752075195, | |
| "learning_rate": 3.168724279835391e-05, | |
| "loss": 3.1625, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.40234375, | |
| "grad_norm": 11.8716402053833, | |
| "learning_rate": 3.148148148148148e-05, | |
| "loss": 3.0979, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.40625, | |
| "grad_norm": 16.9968204498291, | |
| "learning_rate": 3.127572016460906e-05, | |
| "loss": 3.1705, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.41015625, | |
| "grad_norm": 18.078325271606445, | |
| "learning_rate": 3.1069958847736626e-05, | |
| "loss": 3.1945, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.4140625, | |
| "grad_norm": 16.14826202392578, | |
| "learning_rate": 3.08641975308642e-05, | |
| "loss": 3.2458, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.41796875, | |
| "grad_norm": 14.381550788879395, | |
| "learning_rate": 3.065843621399177e-05, | |
| "loss": 3.1335, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.421875, | |
| "grad_norm": 11.909482955932617, | |
| "learning_rate": 3.0452674897119343e-05, | |
| "loss": 3.09, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.42578125, | |
| "grad_norm": 17.585634231567383, | |
| "learning_rate": 3.0246913580246916e-05, | |
| "loss": 3.0548, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.4296875, | |
| "grad_norm": 15.303757667541504, | |
| "learning_rate": 3.0041152263374488e-05, | |
| "loss": 2.9545, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.43359375, | |
| "grad_norm": 15.899727821350098, | |
| "learning_rate": 2.9835390946502057e-05, | |
| "loss": 3.0344, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.4375, | |
| "grad_norm": 12.18794059753418, | |
| "learning_rate": 2.962962962962963e-05, | |
| "loss": 3.1163, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.44140625, | |
| "grad_norm": 15.14059066772461, | |
| "learning_rate": 2.9423868312757202e-05, | |
| "loss": 3.0572, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.4453125, | |
| "grad_norm": 16.42298698425293, | |
| "learning_rate": 2.9218106995884775e-05, | |
| "loss": 2.9781, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.44921875, | |
| "grad_norm": 16.090763092041016, | |
| "learning_rate": 2.9012345679012347e-05, | |
| "loss": 3.1644, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.453125, | |
| "grad_norm": 18.76362419128418, | |
| "learning_rate": 2.880658436213992e-05, | |
| "loss": 3.0987, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.45703125, | |
| "grad_norm": 17.354793548583984, | |
| "learning_rate": 2.860082304526749e-05, | |
| "loss": 2.9983, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.4609375, | |
| "grad_norm": 13.282272338867188, | |
| "learning_rate": 2.839506172839506e-05, | |
| "loss": 3.0733, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.46484375, | |
| "grad_norm": 21.229665756225586, | |
| "learning_rate": 2.8189300411522634e-05, | |
| "loss": 3.0484, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.46875, | |
| "grad_norm": 16.58381462097168, | |
| "learning_rate": 2.7983539094650207e-05, | |
| "loss": 3.1174, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.47265625, | |
| "grad_norm": 15.544482231140137, | |
| "learning_rate": 2.777777777777778e-05, | |
| "loss": 3.1705, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.4765625, | |
| "grad_norm": 15.846022605895996, | |
| "learning_rate": 2.757201646090535e-05, | |
| "loss": 3.0341, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.48046875, | |
| "grad_norm": 21.867094039916992, | |
| "learning_rate": 2.736625514403292e-05, | |
| "loss": 2.9884, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.484375, | |
| "grad_norm": 13.162125587463379, | |
| "learning_rate": 2.7160493827160493e-05, | |
| "loss": 3.0612, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.48828125, | |
| "grad_norm": 11.88615608215332, | |
| "learning_rate": 2.6954732510288066e-05, | |
| "loss": 2.9845, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.4921875, | |
| "grad_norm": 11.516526222229004, | |
| "learning_rate": 2.6748971193415638e-05, | |
| "loss": 2.9974, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.49609375, | |
| "grad_norm": 13.762920379638672, | |
| "learning_rate": 2.654320987654321e-05, | |
| "loss": 3.0515, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 13.856005668640137, | |
| "learning_rate": 2.6337448559670787e-05, | |
| "loss": 3.0096, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.50390625, | |
| "grad_norm": 12.380202293395996, | |
| "learning_rate": 2.6131687242798352e-05, | |
| "loss": 3.0274, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.5078125, | |
| "grad_norm": 15.164392471313477, | |
| "learning_rate": 2.5925925925925925e-05, | |
| "loss": 2.9838, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.51171875, | |
| "grad_norm": 15.10387134552002, | |
| "learning_rate": 2.5720164609053497e-05, | |
| "loss": 3.1058, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.515625, | |
| "grad_norm": 11.87817096710205, | |
| "learning_rate": 2.551440329218107e-05, | |
| "loss": 3.0164, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.51953125, | |
| "grad_norm": 17.242656707763672, | |
| "learning_rate": 2.5308641975308646e-05, | |
| "loss": 2.9376, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.5234375, | |
| "grad_norm": 20.221240997314453, | |
| "learning_rate": 2.510288065843622e-05, | |
| "loss": 3.0894, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.52734375, | |
| "grad_norm": 12.36820125579834, | |
| "learning_rate": 2.4897119341563787e-05, | |
| "loss": 3.0347, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.53125, | |
| "grad_norm": 14.862237930297852, | |
| "learning_rate": 2.4691358024691357e-05, | |
| "loss": 3.0826, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.53515625, | |
| "grad_norm": 14.29667854309082, | |
| "learning_rate": 2.4485596707818932e-05, | |
| "loss": 3.0467, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.5390625, | |
| "grad_norm": 15.347952842712402, | |
| "learning_rate": 2.4279835390946505e-05, | |
| "loss": 2.9934, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.54296875, | |
| "grad_norm": 19.236717224121094, | |
| "learning_rate": 2.4074074074074074e-05, | |
| "loss": 2.8921, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.546875, | |
| "grad_norm": 13.943548202514648, | |
| "learning_rate": 2.3868312757201647e-05, | |
| "loss": 3.1285, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.55078125, | |
| "grad_norm": 15.554057121276855, | |
| "learning_rate": 2.366255144032922e-05, | |
| "loss": 2.9709, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.5546875, | |
| "grad_norm": 15.124194145202637, | |
| "learning_rate": 2.345679012345679e-05, | |
| "loss": 2.9845, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.55859375, | |
| "grad_norm": 15.458159446716309, | |
| "learning_rate": 2.3251028806584364e-05, | |
| "loss": 2.9581, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.5625, | |
| "grad_norm": 11.531893730163574, | |
| "learning_rate": 2.3045267489711937e-05, | |
| "loss": 3.0545, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.56640625, | |
| "grad_norm": 16.634984970092773, | |
| "learning_rate": 2.2839506172839506e-05, | |
| "loss": 2.9772, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.5703125, | |
| "grad_norm": 13.670973777770996, | |
| "learning_rate": 2.2633744855967078e-05, | |
| "loss": 2.9478, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.57421875, | |
| "grad_norm": 13.582576751708984, | |
| "learning_rate": 2.242798353909465e-05, | |
| "loss": 3.1053, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.578125, | |
| "grad_norm": 17.05980110168457, | |
| "learning_rate": 2.2222222222222223e-05, | |
| "loss": 2.9789, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.58203125, | |
| "grad_norm": 13.586709976196289, | |
| "learning_rate": 2.2016460905349796e-05, | |
| "loss": 2.9268, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.5859375, | |
| "grad_norm": 14.885200500488281, | |
| "learning_rate": 2.1810699588477368e-05, | |
| "loss": 3.0281, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.58984375, | |
| "grad_norm": 14.411493301391602, | |
| "learning_rate": 2.1604938271604937e-05, | |
| "loss": 2.9164, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.59375, | |
| "grad_norm": 16.72563934326172, | |
| "learning_rate": 2.139917695473251e-05, | |
| "loss": 2.9586, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.59765625, | |
| "grad_norm": 11.668434143066406, | |
| "learning_rate": 2.1193415637860082e-05, | |
| "loss": 3.0239, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.6015625, | |
| "grad_norm": 11.849235534667969, | |
| "learning_rate": 2.0987654320987655e-05, | |
| "loss": 2.9665, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.60546875, | |
| "grad_norm": 17.961620330810547, | |
| "learning_rate": 2.0781893004115227e-05, | |
| "loss": 3.0192, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.609375, | |
| "grad_norm": 13.321170806884766, | |
| "learning_rate": 2.05761316872428e-05, | |
| "loss": 2.9901, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.61328125, | |
| "grad_norm": 12.714600563049316, | |
| "learning_rate": 2.037037037037037e-05, | |
| "loss": 2.9933, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.6171875, | |
| "grad_norm": 11.27708625793457, | |
| "learning_rate": 2.016460905349794e-05, | |
| "loss": 2.9186, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.62109375, | |
| "grad_norm": 11.366385459899902, | |
| "learning_rate": 1.9958847736625517e-05, | |
| "loss": 2.9615, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.625, | |
| "grad_norm": 14.28786849975586, | |
| "learning_rate": 1.9753086419753087e-05, | |
| "loss": 3.0847, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.62890625, | |
| "grad_norm": 17.186941146850586, | |
| "learning_rate": 1.954732510288066e-05, | |
| "loss": 3.1361, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.6328125, | |
| "grad_norm": 12.188273429870605, | |
| "learning_rate": 1.934156378600823e-05, | |
| "loss": 3.0134, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.63671875, | |
| "grad_norm": 18.98809051513672, | |
| "learning_rate": 1.91358024691358e-05, | |
| "loss": 2.904, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.640625, | |
| "grad_norm": 12.578585624694824, | |
| "learning_rate": 1.8930041152263377e-05, | |
| "loss": 2.9072, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.64453125, | |
| "grad_norm": 13.295378684997559, | |
| "learning_rate": 1.872427983539095e-05, | |
| "loss": 3.0163, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.6484375, | |
| "grad_norm": 16.138151168823242, | |
| "learning_rate": 1.8518518518518518e-05, | |
| "loss": 2.9509, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.65234375, | |
| "grad_norm": 18.39076805114746, | |
| "learning_rate": 1.831275720164609e-05, | |
| "loss": 2.9098, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.65625, | |
| "grad_norm": 19.46346664428711, | |
| "learning_rate": 1.8106995884773663e-05, | |
| "loss": 2.9096, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.66015625, | |
| "grad_norm": 13.3604154586792, | |
| "learning_rate": 1.7901234567901236e-05, | |
| "loss": 2.8992, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.6640625, | |
| "grad_norm": 12.542205810546875, | |
| "learning_rate": 1.769547325102881e-05, | |
| "loss": 2.9293, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.66796875, | |
| "grad_norm": 12.55959415435791, | |
| "learning_rate": 1.748971193415638e-05, | |
| "loss": 2.8809, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.671875, | |
| "grad_norm": 15.57677936553955, | |
| "learning_rate": 1.728395061728395e-05, | |
| "loss": 2.7552, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.67578125, | |
| "grad_norm": 11.344679832458496, | |
| "learning_rate": 1.7078189300411522e-05, | |
| "loss": 2.9377, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.6796875, | |
| "grad_norm": 15.26870059967041, | |
| "learning_rate": 1.6872427983539095e-05, | |
| "loss": 2.9145, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.68359375, | |
| "grad_norm": 12.774370193481445, | |
| "learning_rate": 1.6666666666666667e-05, | |
| "loss": 2.9775, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.6875, | |
| "grad_norm": 19.02437400817871, | |
| "learning_rate": 1.646090534979424e-05, | |
| "loss": 2.9581, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.69140625, | |
| "grad_norm": 14.445993423461914, | |
| "learning_rate": 1.6255144032921812e-05, | |
| "loss": 2.9639, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.6953125, | |
| "grad_norm": 13.563401222229004, | |
| "learning_rate": 1.604938271604938e-05, | |
| "loss": 2.8964, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.69921875, | |
| "grad_norm": 17.80751609802246, | |
| "learning_rate": 1.5843621399176954e-05, | |
| "loss": 3.0603, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.703125, | |
| "grad_norm": 13.13770866394043, | |
| "learning_rate": 1.563786008230453e-05, | |
| "loss": 3.0473, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.70703125, | |
| "grad_norm": 15.236124038696289, | |
| "learning_rate": 1.54320987654321e-05, | |
| "loss": 2.8392, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.7109375, | |
| "grad_norm": 17.386003494262695, | |
| "learning_rate": 1.5226337448559672e-05, | |
| "loss": 2.8805, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.71484375, | |
| "grad_norm": 16.233539581298828, | |
| "learning_rate": 1.5020576131687244e-05, | |
| "loss": 2.9692, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.71875, | |
| "grad_norm": 19.721954345703125, | |
| "learning_rate": 1.4814814814814815e-05, | |
| "loss": 3.0988, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.72265625, | |
| "grad_norm": 12.52316951751709, | |
| "learning_rate": 1.4609053497942387e-05, | |
| "loss": 2.933, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.7265625, | |
| "grad_norm": 12.95042610168457, | |
| "learning_rate": 1.440329218106996e-05, | |
| "loss": 2.927, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.73046875, | |
| "grad_norm": 12.077346801757812, | |
| "learning_rate": 1.419753086419753e-05, | |
| "loss": 2.8952, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.734375, | |
| "grad_norm": 11.764900207519531, | |
| "learning_rate": 1.3991769547325103e-05, | |
| "loss": 2.8488, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.73828125, | |
| "grad_norm": 13.644368171691895, | |
| "learning_rate": 1.3786008230452676e-05, | |
| "loss": 2.8838, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.7421875, | |
| "grad_norm": 12.35655689239502, | |
| "learning_rate": 1.3580246913580247e-05, | |
| "loss": 2.9183, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.74609375, | |
| "grad_norm": 13.980887413024902, | |
| "learning_rate": 1.3374485596707819e-05, | |
| "loss": 2.8803, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 18.003271102905273, | |
| "learning_rate": 1.3168724279835393e-05, | |
| "loss": 3.0357, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.75390625, | |
| "grad_norm": 14.126523971557617, | |
| "learning_rate": 1.2962962962962962e-05, | |
| "loss": 2.9313, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.7578125, | |
| "grad_norm": 12.55939769744873, | |
| "learning_rate": 1.2757201646090535e-05, | |
| "loss": 2.9224, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.76171875, | |
| "grad_norm": 13.05750846862793, | |
| "learning_rate": 1.255144032921811e-05, | |
| "loss": 2.9167, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.765625, | |
| "grad_norm": 16.52781867980957, | |
| "learning_rate": 1.2345679012345678e-05, | |
| "loss": 2.8645, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.76953125, | |
| "grad_norm": 15.458019256591797, | |
| "learning_rate": 1.2139917695473252e-05, | |
| "loss": 2.9848, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.7734375, | |
| "grad_norm": 12.694581985473633, | |
| "learning_rate": 1.1934156378600823e-05, | |
| "loss": 3.0691, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.77734375, | |
| "grad_norm": 11.771615982055664, | |
| "learning_rate": 1.1728395061728396e-05, | |
| "loss": 2.9316, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.78125, | |
| "grad_norm": 15.438037872314453, | |
| "learning_rate": 1.1522633744855968e-05, | |
| "loss": 2.9295, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.78515625, | |
| "grad_norm": 13.371684074401855, | |
| "learning_rate": 1.1316872427983539e-05, | |
| "loss": 2.8849, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.7890625, | |
| "grad_norm": 11.904099464416504, | |
| "learning_rate": 1.1111111111111112e-05, | |
| "loss": 2.862, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.79296875, | |
| "grad_norm": 11.638395309448242, | |
| "learning_rate": 1.0905349794238684e-05, | |
| "loss": 2.949, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.796875, | |
| "grad_norm": 14.490175247192383, | |
| "learning_rate": 1.0699588477366255e-05, | |
| "loss": 2.9131, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.80078125, | |
| "grad_norm": 11.613717079162598, | |
| "learning_rate": 1.0493827160493827e-05, | |
| "loss": 2.9497, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.8046875, | |
| "grad_norm": 13.931456565856934, | |
| "learning_rate": 1.02880658436214e-05, | |
| "loss": 2.917, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.80859375, | |
| "grad_norm": 12.256906509399414, | |
| "learning_rate": 1.008230452674897e-05, | |
| "loss": 2.9048, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.8125, | |
| "grad_norm": 12.851861953735352, | |
| "learning_rate": 9.876543209876543e-06, | |
| "loss": 2.9084, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.81640625, | |
| "grad_norm": 18.05097770690918, | |
| "learning_rate": 9.670781893004116e-06, | |
| "loss": 2.802, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.8203125, | |
| "grad_norm": 12.831151962280273, | |
| "learning_rate": 9.465020576131688e-06, | |
| "loss": 2.8659, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.82421875, | |
| "grad_norm": 11.60468864440918, | |
| "learning_rate": 9.259259259259259e-06, | |
| "loss": 2.8285, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.828125, | |
| "grad_norm": 14.392627716064453, | |
| "learning_rate": 9.053497942386832e-06, | |
| "loss": 2.8605, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.83203125, | |
| "grad_norm": 17.23533821105957, | |
| "learning_rate": 8.847736625514404e-06, | |
| "loss": 2.9186, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.8359375, | |
| "grad_norm": 13.937773704528809, | |
| "learning_rate": 8.641975308641975e-06, | |
| "loss": 2.9164, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.83984375, | |
| "grad_norm": 16.57691764831543, | |
| "learning_rate": 8.436213991769547e-06, | |
| "loss": 2.8829, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.84375, | |
| "grad_norm": 12.391244888305664, | |
| "learning_rate": 8.23045267489712e-06, | |
| "loss": 2.8417, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.84765625, | |
| "grad_norm": 15.762882232666016, | |
| "learning_rate": 8.02469135802469e-06, | |
| "loss": 2.871, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.8515625, | |
| "grad_norm": 16.37859344482422, | |
| "learning_rate": 7.818930041152265e-06, | |
| "loss": 2.9265, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.85546875, | |
| "grad_norm": 21.5294132232666, | |
| "learning_rate": 7.613168724279836e-06, | |
| "loss": 2.9628, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.859375, | |
| "grad_norm": 18.281295776367188, | |
| "learning_rate": 7.4074074074074075e-06, | |
| "loss": 2.8301, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.86328125, | |
| "grad_norm": 16.0869140625, | |
| "learning_rate": 7.20164609053498e-06, | |
| "loss": 2.8179, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.8671875, | |
| "grad_norm": 21.662134170532227, | |
| "learning_rate": 6.995884773662552e-06, | |
| "loss": 2.8893, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.87109375, | |
| "grad_norm": 12.4130277633667, | |
| "learning_rate": 6.790123456790123e-06, | |
| "loss": 2.8628, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.875, | |
| "grad_norm": 15.154963493347168, | |
| "learning_rate": 6.584362139917697e-06, | |
| "loss": 2.8991, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.87890625, | |
| "grad_norm": 12.677474021911621, | |
| "learning_rate": 6.3786008230452675e-06, | |
| "loss": 2.9428, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.8828125, | |
| "grad_norm": 12.973712921142578, | |
| "learning_rate": 6.172839506172839e-06, | |
| "loss": 2.8694, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.88671875, | |
| "grad_norm": 13.162025451660156, | |
| "learning_rate": 5.967078189300412e-06, | |
| "loss": 2.8119, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.890625, | |
| "grad_norm": 13.606464385986328, | |
| "learning_rate": 5.761316872427984e-06, | |
| "loss": 2.8455, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.89453125, | |
| "grad_norm": 17.518091201782227, | |
| "learning_rate": 5.555555555555556e-06, | |
| "loss": 2.8708, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.8984375, | |
| "grad_norm": 13.689889907836914, | |
| "learning_rate": 5.3497942386831275e-06, | |
| "loss": 2.8384, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.90234375, | |
| "grad_norm": 14.69385051727295, | |
| "learning_rate": 5.1440329218107e-06, | |
| "loss": 2.8291, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.90625, | |
| "grad_norm": 17.468791961669922, | |
| "learning_rate": 4.938271604938272e-06, | |
| "loss": 2.8588, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.91015625, | |
| "grad_norm": 14.33683967590332, | |
| "learning_rate": 4.732510288065844e-06, | |
| "loss": 2.7762, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.9140625, | |
| "grad_norm": 17.65381622314453, | |
| "learning_rate": 4.526748971193416e-06, | |
| "loss": 2.8697, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.91796875, | |
| "grad_norm": 17.74317169189453, | |
| "learning_rate": 4.3209876543209875e-06, | |
| "loss": 2.8151, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.921875, | |
| "grad_norm": 20.99629020690918, | |
| "learning_rate": 4.11522633744856e-06, | |
| "loss": 2.893, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.92578125, | |
| "grad_norm": 17.19089698791504, | |
| "learning_rate": 3.9094650205761325e-06, | |
| "loss": 2.8025, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.9296875, | |
| "grad_norm": 12.898162841796875, | |
| "learning_rate": 3.7037037037037037e-06, | |
| "loss": 2.9405, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.93359375, | |
| "grad_norm": 13.902021408081055, | |
| "learning_rate": 3.497942386831276e-06, | |
| "loss": 2.8937, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.9375, | |
| "grad_norm": 13.255682945251465, | |
| "learning_rate": 3.2921810699588483e-06, | |
| "loss": 2.7604, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.94140625, | |
| "grad_norm": 14.98218822479248, | |
| "learning_rate": 3.0864197530864196e-06, | |
| "loss": 2.8931, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.9453125, | |
| "grad_norm": 15.084565162658691, | |
| "learning_rate": 2.880658436213992e-06, | |
| "loss": 2.8565, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.94921875, | |
| "grad_norm": 12.473812103271484, | |
| "learning_rate": 2.6748971193415637e-06, | |
| "loss": 2.795, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.953125, | |
| "grad_norm": 13.222654342651367, | |
| "learning_rate": 2.469135802469136e-06, | |
| "loss": 2.8734, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.95703125, | |
| "grad_norm": 13.204935073852539, | |
| "learning_rate": 2.263374485596708e-06, | |
| "loss": 2.8013, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.9609375, | |
| "grad_norm": 15.458930015563965, | |
| "learning_rate": 2.05761316872428e-06, | |
| "loss": 2.8541, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.96484375, | |
| "grad_norm": 17.476573944091797, | |
| "learning_rate": 1.8518518518518519e-06, | |
| "loss": 2.7538, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.96875, | |
| "grad_norm": 13.751199722290039, | |
| "learning_rate": 1.6460905349794242e-06, | |
| "loss": 2.8324, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.97265625, | |
| "grad_norm": 15.517616271972656, | |
| "learning_rate": 1.440329218106996e-06, | |
| "loss": 2.8402, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.9765625, | |
| "grad_norm": 16.016067504882812, | |
| "learning_rate": 1.234567901234568e-06, | |
| "loss": 2.895, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.98046875, | |
| "grad_norm": 11.978571891784668, | |
| "learning_rate": 1.02880658436214e-06, | |
| "loss": 2.9139, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 1.00390625, | |
| "grad_norm": 58.774688720703125, | |
| "learning_rate": 8.230452674897121e-07, | |
| "loss": 3.7051, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 1.0078125, | |
| "grad_norm": 52.733154296875, | |
| "learning_rate": 6.17283950617284e-07, | |
| "loss": 3.7482, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 1.01171875, | |
| "grad_norm": 68.14625549316406, | |
| "learning_rate": 4.1152263374485604e-07, | |
| "loss": 3.8065, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 1.015625, | |
| "grad_norm": 59.991546630859375, | |
| "learning_rate": 2.0576131687242802e-07, | |
| "loss": 3.6531, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 1.01953125, | |
| "grad_norm": 40.77336883544922, | |
| "learning_rate": 0.0, | |
| "loss": 3.4788, | |
| "step": 256 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 256, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 9223372036854775807, | |
| "save_steps": 1.0, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 128, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |