diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5641 @@ +{ + "best_metric": 2.631765127182007, + "best_model_checkpoint": "Meta-Llama-3.1-8B-Instruct-finetuned/checkpoint-800", + "epoch": 1.0, + "eval_steps": 500, + "global_step": 800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00125, + "grad_norm": 0.8839266300201416, + "learning_rate": 1.9975e-05, + "loss": 3.5886, + "step": 1 + }, + { + "epoch": 0.0025, + "grad_norm": 0.5743476748466492, + "learning_rate": 1.9950000000000004e-05, + "loss": 3.1499, + "step": 2 + }, + { + "epoch": 0.00375, + "grad_norm": 0.6939172148704529, + "learning_rate": 1.9925e-05, + "loss": 2.4424, + "step": 3 + }, + { + "epoch": 0.005, + "grad_norm": 0.35782289505004883, + "learning_rate": 1.9900000000000003e-05, + "loss": 1.6857, + "step": 4 + }, + { + "epoch": 0.00625, + "grad_norm": 0.30019086599349976, + "learning_rate": 1.9875000000000002e-05, + "loss": 2.4306, + "step": 5 + }, + { + "epoch": 0.0075, + "grad_norm": 0.7822341918945312, + "learning_rate": 1.985e-05, + "loss": 3.1167, + "step": 6 + }, + { + "epoch": 0.00875, + "grad_norm": 1.0825086832046509, + "learning_rate": 1.9825e-05, + "loss": 3.4788, + "step": 7 + }, + { + "epoch": 0.01, + "grad_norm": 0.6958524584770203, + "learning_rate": 1.98e-05, + "loss": 2.4278, + "step": 8 + }, + { + "epoch": 0.01125, + "grad_norm": 1.0385024547576904, + "learning_rate": 1.9775000000000003e-05, + "loss": 2.7115, + "step": 9 + }, + { + "epoch": 0.0125, + "grad_norm": 1.0596249103546143, + "learning_rate": 1.9750000000000002e-05, + "loss": 3.8705, + "step": 10 + }, + { + "epoch": 0.01375, + "grad_norm": 1.014846682548523, + "learning_rate": 1.9725000000000002e-05, + "loss": 2.7525, + "step": 11 + }, + { + "epoch": 0.015, + "grad_norm": 1.0246942043304443, + "learning_rate": 1.97e-05, + "loss": 3.2441, + "step": 12 + }, + { + "epoch": 0.01625, + "grad_norm": 0.5323541760444641, + "learning_rate": 1.9675e-05, + "loss": 2.2139, + "step": 13 + }, + { + "epoch": 0.0175, + "grad_norm": 0.6004053950309753, + "learning_rate": 1.9650000000000003e-05, + "loss": 2.7995, + "step": 14 + }, + { + "epoch": 0.01875, + "grad_norm": 0.6828798651695251, + "learning_rate": 1.9625e-05, + "loss": 2.7872, + "step": 15 + }, + { + "epoch": 0.02, + "grad_norm": 0.8382136225700378, + "learning_rate": 1.9600000000000002e-05, + "loss": 1.9668, + "step": 16 + }, + { + "epoch": 0.02125, + "grad_norm": 0.5875881910324097, + "learning_rate": 1.9575e-05, + "loss": 2.4291, + "step": 17 + }, + { + "epoch": 0.0225, + "grad_norm": 1.260549545288086, + "learning_rate": 1.955e-05, + "loss": 3.2646, + "step": 18 + }, + { + "epoch": 0.02375, + "grad_norm": 0.595811128616333, + "learning_rate": 1.9525e-05, + "loss": 2.5531, + "step": 19 + }, + { + "epoch": 0.025, + "grad_norm": 0.5156136155128479, + "learning_rate": 1.95e-05, + "loss": 2.4804, + "step": 20 + }, + { + "epoch": 0.02625, + "grad_norm": 0.4905349314212799, + "learning_rate": 1.9475000000000002e-05, + "loss": 2.1618, + "step": 21 + }, + { + "epoch": 0.0275, + "grad_norm": 0.3786165118217468, + "learning_rate": 1.9450000000000002e-05, + "loss": 2.2184, + "step": 22 + }, + { + "epoch": 0.02875, + "grad_norm": 0.6827333569526672, + "learning_rate": 1.9425e-05, + "loss": 2.2265, + "step": 23 + }, + { + "epoch": 0.03, + "grad_norm": 0.9549139738082886, + "learning_rate": 1.94e-05, + "loss": 3.0346, + "step": 24 + }, + { + "epoch": 0.03125, + "grad_norm": 0.4533088207244873, + "learning_rate": 1.9375e-05, + "loss": 1.9096, + "step": 25 + }, + { + "epoch": 0.0325, + "grad_norm": 1.1265308856964111, + "learning_rate": 1.9350000000000003e-05, + "loss": 3.1981, + "step": 26 + }, + { + "epoch": 0.03375, + "grad_norm": 0.6239346265792847, + "learning_rate": 1.9325000000000002e-05, + "loss": 2.6925, + "step": 27 + }, + { + "epoch": 0.035, + "grad_norm": 0.8297908306121826, + "learning_rate": 1.93e-05, + "loss": 2.9377, + "step": 28 + }, + { + "epoch": 0.03625, + "grad_norm": 0.5328755974769592, + "learning_rate": 1.9275e-05, + "loss": 3.5033, + "step": 29 + }, + { + "epoch": 0.0375, + "grad_norm": 0.748451292514801, + "learning_rate": 1.925e-05, + "loss": 3.3768, + "step": 30 + }, + { + "epoch": 0.03875, + "grad_norm": 0.6435258388519287, + "learning_rate": 1.9225000000000003e-05, + "loss": 2.8508, + "step": 31 + }, + { + "epoch": 0.04, + "grad_norm": 0.5679820775985718, + "learning_rate": 1.9200000000000003e-05, + "loss": 2.6533, + "step": 32 + }, + { + "epoch": 0.04125, + "grad_norm": 0.8853691220283508, + "learning_rate": 1.9175000000000002e-05, + "loss": 3.0708, + "step": 33 + }, + { + "epoch": 0.0425, + "grad_norm": 0.6417959332466125, + "learning_rate": 1.915e-05, + "loss": 1.6389, + "step": 34 + }, + { + "epoch": 0.04375, + "grad_norm": 0.5146605372428894, + "learning_rate": 1.9125000000000004e-05, + "loss": 2.7314, + "step": 35 + }, + { + "epoch": 0.045, + "grad_norm": 1.4297131299972534, + "learning_rate": 1.91e-05, + "loss": 3.6263, + "step": 36 + }, + { + "epoch": 0.04625, + "grad_norm": 1.8327345848083496, + "learning_rate": 1.9075000000000003e-05, + "loss": 3.2042, + "step": 37 + }, + { + "epoch": 0.0475, + "grad_norm": 0.9266634583473206, + "learning_rate": 1.9050000000000002e-05, + "loss": 2.8001, + "step": 38 + }, + { + "epoch": 0.04875, + "grad_norm": 0.944769024848938, + "learning_rate": 1.9025e-05, + "loss": 2.8379, + "step": 39 + }, + { + "epoch": 0.05, + "grad_norm": 0.6269248723983765, + "learning_rate": 1.9e-05, + "loss": 2.5707, + "step": 40 + }, + { + "epoch": 0.05125, + "grad_norm": 0.8288752436637878, + "learning_rate": 1.8975e-05, + "loss": 3.1964, + "step": 41 + }, + { + "epoch": 0.0525, + "grad_norm": 1.4263437986373901, + "learning_rate": 1.8950000000000003e-05, + "loss": 3.5353, + "step": 42 + }, + { + "epoch": 0.05375, + "grad_norm": 1.2455209493637085, + "learning_rate": 1.8925000000000003e-05, + "loss": 2.7257, + "step": 43 + }, + { + "epoch": 0.055, + "grad_norm": 0.5519097447395325, + "learning_rate": 1.8900000000000002e-05, + "loss": 2.2432, + "step": 44 + }, + { + "epoch": 0.05625, + "grad_norm": 0.7771982550621033, + "learning_rate": 1.8875e-05, + "loss": 2.1759, + "step": 45 + }, + { + "epoch": 0.0575, + "grad_norm": 1.3399147987365723, + "learning_rate": 1.885e-05, + "loss": 3.1654, + "step": 46 + }, + { + "epoch": 0.05875, + "grad_norm": 0.4468587040901184, + "learning_rate": 1.8825000000000004e-05, + "loss": 2.0097, + "step": 47 + }, + { + "epoch": 0.06, + "grad_norm": 0.6694964170455933, + "learning_rate": 1.88e-05, + "loss": 2.3989, + "step": 48 + }, + { + "epoch": 0.06125, + "grad_norm": 0.583661675453186, + "learning_rate": 1.8775000000000002e-05, + "loss": 2.3641, + "step": 49 + }, + { + "epoch": 0.0625, + "grad_norm": 1.7708433866500854, + "learning_rate": 1.8750000000000002e-05, + "loss": 3.4434, + "step": 50 + }, + { + "epoch": 0.06375, + "grad_norm": 1.0765807628631592, + "learning_rate": 1.8725e-05, + "loss": 3.0016, + "step": 51 + }, + { + "epoch": 0.065, + "grad_norm": 2.168282985687256, + "learning_rate": 1.8700000000000004e-05, + "loss": 3.9576, + "step": 52 + }, + { + "epoch": 0.06625, + "grad_norm": 1.4953194856643677, + "learning_rate": 1.8675e-05, + "loss": 3.1084, + "step": 53 + }, + { + "epoch": 0.0675, + "grad_norm": 0.9680989980697632, + "learning_rate": 1.8650000000000003e-05, + "loss": 2.678, + "step": 54 + }, + { + "epoch": 0.06875, + "grad_norm": 0.6683467030525208, + "learning_rate": 1.8625000000000002e-05, + "loss": 2.5912, + "step": 55 + }, + { + "epoch": 0.07, + "grad_norm": 0.5850144624710083, + "learning_rate": 1.86e-05, + "loss": 2.5303, + "step": 56 + }, + { + "epoch": 0.07125, + "grad_norm": 1.957913875579834, + "learning_rate": 1.8575e-05, + "loss": 3.5738, + "step": 57 + }, + { + "epoch": 0.0725, + "grad_norm": 0.4880407154560089, + "learning_rate": 1.855e-05, + "loss": 2.4431, + "step": 58 + }, + { + "epoch": 0.07375, + "grad_norm": 0.6582357883453369, + "learning_rate": 1.8525000000000003e-05, + "loss": 2.7631, + "step": 59 + }, + { + "epoch": 0.075, + "grad_norm": 1.223876953125, + "learning_rate": 1.8500000000000002e-05, + "loss": 2.551, + "step": 60 + }, + { + "epoch": 0.07625, + "grad_norm": 0.7573391199111938, + "learning_rate": 1.8475000000000002e-05, + "loss": 2.7722, + "step": 61 + }, + { + "epoch": 0.0775, + "grad_norm": 1.3893153667449951, + "learning_rate": 1.845e-05, + "loss": 2.6213, + "step": 62 + }, + { + "epoch": 0.07875, + "grad_norm": 0.8527727723121643, + "learning_rate": 1.8425e-05, + "loss": 2.76, + "step": 63 + }, + { + "epoch": 0.08, + "grad_norm": 0.539513885974884, + "learning_rate": 1.8400000000000003e-05, + "loss": 2.4052, + "step": 64 + }, + { + "epoch": 0.08125, + "grad_norm": 1.2717337608337402, + "learning_rate": 1.8375e-05, + "loss": 3.2137, + "step": 65 + }, + { + "epoch": 0.0825, + "grad_norm": 0.9680526852607727, + "learning_rate": 1.8350000000000002e-05, + "loss": 1.867, + "step": 66 + }, + { + "epoch": 0.08375, + "grad_norm": 0.7457209825515747, + "learning_rate": 1.8325e-05, + "loss": 2.9906, + "step": 67 + }, + { + "epoch": 0.085, + "grad_norm": 0.9684948325157166, + "learning_rate": 1.83e-05, + "loss": 2.3277, + "step": 68 + }, + { + "epoch": 0.08625, + "grad_norm": 1.1279115676879883, + "learning_rate": 1.8275e-05, + "loss": 2.9131, + "step": 69 + }, + { + "epoch": 0.0875, + "grad_norm": 0.7532200217247009, + "learning_rate": 1.825e-05, + "loss": 2.848, + "step": 70 + }, + { + "epoch": 0.08875, + "grad_norm": 2.1061675548553467, + "learning_rate": 1.8225000000000003e-05, + "loss": 3.3145, + "step": 71 + }, + { + "epoch": 0.09, + "grad_norm": 1.2701787948608398, + "learning_rate": 1.8200000000000002e-05, + "loss": 2.7297, + "step": 72 + }, + { + "epoch": 0.09125, + "grad_norm": 0.8566807508468628, + "learning_rate": 1.8175e-05, + "loss": 2.5054, + "step": 73 + }, + { + "epoch": 0.0925, + "grad_norm": 0.7943547964096069, + "learning_rate": 1.815e-05, + "loss": 2.9178, + "step": 74 + }, + { + "epoch": 0.09375, + "grad_norm": 1.2525556087493896, + "learning_rate": 1.8125e-05, + "loss": 3.1584, + "step": 75 + }, + { + "epoch": 0.095, + "grad_norm": 0.9087671041488647, + "learning_rate": 1.8100000000000003e-05, + "loss": 2.5073, + "step": 76 + }, + { + "epoch": 0.09625, + "grad_norm": 1.253976583480835, + "learning_rate": 1.8075000000000002e-05, + "loss": 2.8634, + "step": 77 + }, + { + "epoch": 0.0975, + "grad_norm": 1.6665199995040894, + "learning_rate": 1.805e-05, + "loss": 2.1963, + "step": 78 + }, + { + "epoch": 0.09875, + "grad_norm": 1.2400460243225098, + "learning_rate": 1.8025e-05, + "loss": 2.4813, + "step": 79 + }, + { + "epoch": 0.1, + "grad_norm": 1.0923187732696533, + "learning_rate": 1.8e-05, + "loss": 3.0353, + "step": 80 + }, + { + "epoch": 0.10125, + "grad_norm": 0.6350508332252502, + "learning_rate": 1.7975000000000003e-05, + "loss": 2.4959, + "step": 81 + }, + { + "epoch": 0.1025, + "grad_norm": 1.7657043933868408, + "learning_rate": 1.795e-05, + "loss": 3.2626, + "step": 82 + }, + { + "epoch": 0.10375, + "grad_norm": 2.0525500774383545, + "learning_rate": 1.7925000000000002e-05, + "loss": 2.5613, + "step": 83 + }, + { + "epoch": 0.105, + "grad_norm": 0.8630340099334717, + "learning_rate": 1.79e-05, + "loss": 2.54, + "step": 84 + }, + { + "epoch": 0.10625, + "grad_norm": 2.1113481521606445, + "learning_rate": 1.7875e-05, + "loss": 3.6185, + "step": 85 + }, + { + "epoch": 0.1075, + "grad_norm": 1.0695478916168213, + "learning_rate": 1.785e-05, + "loss": 2.9277, + "step": 86 + }, + { + "epoch": 0.10875, + "grad_norm": 1.6546651124954224, + "learning_rate": 1.7825e-05, + "loss": 2.7956, + "step": 87 + }, + { + "epoch": 0.11, + "grad_norm": 1.6290825605392456, + "learning_rate": 1.7800000000000002e-05, + "loss": 2.6147, + "step": 88 + }, + { + "epoch": 0.11125, + "grad_norm": 0.8302910923957825, + "learning_rate": 1.7775000000000002e-05, + "loss": 2.6113, + "step": 89 + }, + { + "epoch": 0.1125, + "grad_norm": 2.2139322757720947, + "learning_rate": 1.775e-05, + "loss": 2.7841, + "step": 90 + }, + { + "epoch": 0.11375, + "grad_norm": 0.5742428302764893, + "learning_rate": 1.7725e-05, + "loss": 2.6969, + "step": 91 + }, + { + "epoch": 0.115, + "grad_norm": 1.6150765419006348, + "learning_rate": 1.77e-05, + "loss": 3.3552, + "step": 92 + }, + { + "epoch": 0.11625, + "grad_norm": 1.053296685218811, + "learning_rate": 1.7675000000000003e-05, + "loss": 2.5297, + "step": 93 + }, + { + "epoch": 0.1175, + "grad_norm": 0.8964847922325134, + "learning_rate": 1.7650000000000002e-05, + "loss": 2.5295, + "step": 94 + }, + { + "epoch": 0.11875, + "grad_norm": 0.9291931390762329, + "learning_rate": 1.7625e-05, + "loss": 2.4275, + "step": 95 + }, + { + "epoch": 0.12, + "grad_norm": 1.3160396814346313, + "learning_rate": 1.76e-05, + "loss": 2.727, + "step": 96 + }, + { + "epoch": 0.12125, + "grad_norm": 3.190565824508667, + "learning_rate": 1.7575000000000004e-05, + "loss": 3.1598, + "step": 97 + }, + { + "epoch": 0.1225, + "grad_norm": 1.1592965126037598, + "learning_rate": 1.755e-05, + "loss": 1.8949, + "step": 98 + }, + { + "epoch": 0.12375, + "grad_norm": 1.942399263381958, + "learning_rate": 1.7525000000000002e-05, + "loss": 2.8239, + "step": 99 + }, + { + "epoch": 0.125, + "grad_norm": 0.8165029883384705, + "learning_rate": 1.7500000000000002e-05, + "loss": 2.0449, + "step": 100 + }, + { + "epoch": 0.12625, + "grad_norm": 0.9424604773521423, + "learning_rate": 1.7475e-05, + "loss": 2.994, + "step": 101 + }, + { + "epoch": 0.1275, + "grad_norm": 2.3983001708984375, + "learning_rate": 1.7450000000000004e-05, + "loss": 4.2536, + "step": 102 + }, + { + "epoch": 0.12875, + "grad_norm": 1.9262495040893555, + "learning_rate": 1.7425e-05, + "loss": 3.5003, + "step": 103 + }, + { + "epoch": 0.13, + "grad_norm": 0.5252268314361572, + "learning_rate": 1.7400000000000003e-05, + "loss": 2.2976, + "step": 104 + }, + { + "epoch": 0.13125, + "grad_norm": 1.0399681329727173, + "learning_rate": 1.7375000000000002e-05, + "loss": 2.5173, + "step": 105 + }, + { + "epoch": 0.1325, + "grad_norm": 0.9736959338188171, + "learning_rate": 1.735e-05, + "loss": 2.475, + "step": 106 + }, + { + "epoch": 0.13375, + "grad_norm": 2.253629446029663, + "learning_rate": 1.7325e-05, + "loss": 3.4539, + "step": 107 + }, + { + "epoch": 0.135, + "grad_norm": 0.9584936499595642, + "learning_rate": 1.73e-05, + "loss": 2.3618, + "step": 108 + }, + { + "epoch": 0.13625, + "grad_norm": 0.7614779472351074, + "learning_rate": 1.7275000000000003e-05, + "loss": 2.6198, + "step": 109 + }, + { + "epoch": 0.1375, + "grad_norm": 1.4478983879089355, + "learning_rate": 1.7250000000000003e-05, + "loss": 2.4151, + "step": 110 + }, + { + "epoch": 0.13875, + "grad_norm": 1.6698650121688843, + "learning_rate": 1.7225000000000002e-05, + "loss": 2.8188, + "step": 111 + }, + { + "epoch": 0.14, + "grad_norm": 1.1940186023712158, + "learning_rate": 1.72e-05, + "loss": 2.9268, + "step": 112 + }, + { + "epoch": 0.14125, + "grad_norm": 1.3393628597259521, + "learning_rate": 1.7175e-05, + "loss": 3.246, + "step": 113 + }, + { + "epoch": 0.1425, + "grad_norm": 1.6196234226226807, + "learning_rate": 1.7150000000000004e-05, + "loss": 3.1591, + "step": 114 + }, + { + "epoch": 0.14375, + "grad_norm": 2.1916654109954834, + "learning_rate": 1.7125e-05, + "loss": 2.6534, + "step": 115 + }, + { + "epoch": 0.145, + "grad_norm": 1.725541591644287, + "learning_rate": 1.7100000000000002e-05, + "loss": 2.5497, + "step": 116 + }, + { + "epoch": 0.14625, + "grad_norm": 1.0600862503051758, + "learning_rate": 1.7075e-05, + "loss": 2.5041, + "step": 117 + }, + { + "epoch": 0.1475, + "grad_norm": 0.9649932980537415, + "learning_rate": 1.705e-05, + "loss": 2.8991, + "step": 118 + }, + { + "epoch": 0.14875, + "grad_norm": 1.460185170173645, + "learning_rate": 1.7025e-05, + "loss": 2.7737, + "step": 119 + }, + { + "epoch": 0.15, + "grad_norm": 0.8451898694038391, + "learning_rate": 1.7e-05, + "loss": 2.3685, + "step": 120 + }, + { + "epoch": 0.15125, + "grad_norm": 1.7984133958816528, + "learning_rate": 1.6975000000000003e-05, + "loss": 2.599, + "step": 121 + }, + { + "epoch": 0.1525, + "grad_norm": 0.9329972267150879, + "learning_rate": 1.6950000000000002e-05, + "loss": 2.9886, + "step": 122 + }, + { + "epoch": 0.15375, + "grad_norm": 0.725104808807373, + "learning_rate": 1.6925e-05, + "loss": 2.5776, + "step": 123 + }, + { + "epoch": 0.155, + "grad_norm": 0.7359060049057007, + "learning_rate": 1.69e-05, + "loss": 2.8262, + "step": 124 + }, + { + "epoch": 0.15625, + "grad_norm": 1.0401649475097656, + "learning_rate": 1.6875e-05, + "loss": 2.853, + "step": 125 + }, + { + "epoch": 0.1575, + "grad_norm": 1.748463749885559, + "learning_rate": 1.6850000000000003e-05, + "loss": 2.4642, + "step": 126 + }, + { + "epoch": 0.15875, + "grad_norm": 1.278817057609558, + "learning_rate": 1.6825000000000002e-05, + "loss": 2.1242, + "step": 127 + }, + { + "epoch": 0.16, + "grad_norm": 1.4376240968704224, + "learning_rate": 1.6800000000000002e-05, + "loss": 2.7639, + "step": 128 + }, + { + "epoch": 0.16125, + "grad_norm": 1.3094562292099, + "learning_rate": 1.6775e-05, + "loss": 2.4866, + "step": 129 + }, + { + "epoch": 0.1625, + "grad_norm": 0.7014777064323425, + "learning_rate": 1.675e-05, + "loss": 3.0492, + "step": 130 + }, + { + "epoch": 0.16375, + "grad_norm": 1.578884243965149, + "learning_rate": 1.6725000000000003e-05, + "loss": 3.039, + "step": 131 + }, + { + "epoch": 0.165, + "grad_norm": 0.9808546900749207, + "learning_rate": 1.67e-05, + "loss": 2.7078, + "step": 132 + }, + { + "epoch": 0.16625, + "grad_norm": 0.7291256189346313, + "learning_rate": 1.6675000000000002e-05, + "loss": 2.3461, + "step": 133 + }, + { + "epoch": 0.1675, + "grad_norm": 0.8226473927497864, + "learning_rate": 1.665e-05, + "loss": 2.7881, + "step": 134 + }, + { + "epoch": 0.16875, + "grad_norm": 1.732428789138794, + "learning_rate": 1.6625e-05, + "loss": 2.9649, + "step": 135 + }, + { + "epoch": 0.17, + "grad_norm": 1.4389218091964722, + "learning_rate": 1.66e-05, + "loss": 2.7957, + "step": 136 + }, + { + "epoch": 0.17125, + "grad_norm": 1.5826399326324463, + "learning_rate": 1.6575e-05, + "loss": 2.911, + "step": 137 + }, + { + "epoch": 0.1725, + "grad_norm": 1.0410722494125366, + "learning_rate": 1.6550000000000002e-05, + "loss": 2.6444, + "step": 138 + }, + { + "epoch": 0.17375, + "grad_norm": 0.9773198366165161, + "learning_rate": 1.6525000000000002e-05, + "loss": 2.3028, + "step": 139 + }, + { + "epoch": 0.175, + "grad_norm": 0.9613956212997437, + "learning_rate": 1.65e-05, + "loss": 2.2155, + "step": 140 + }, + { + "epoch": 0.17625, + "grad_norm": 1.3112648725509644, + "learning_rate": 1.6475e-05, + "loss": 2.2479, + "step": 141 + }, + { + "epoch": 0.1775, + "grad_norm": 1.0890480279922485, + "learning_rate": 1.645e-05, + "loss": 2.155, + "step": 142 + }, + { + "epoch": 0.17875, + "grad_norm": 0.837094247341156, + "learning_rate": 1.6425000000000003e-05, + "loss": 2.6826, + "step": 143 + }, + { + "epoch": 0.18, + "grad_norm": 1.2103476524353027, + "learning_rate": 1.64e-05, + "loss": 3.228, + "step": 144 + }, + { + "epoch": 0.18125, + "grad_norm": 1.9761539697647095, + "learning_rate": 1.6375e-05, + "loss": 2.7196, + "step": 145 + }, + { + "epoch": 0.1825, + "grad_norm": 0.9773359894752502, + "learning_rate": 1.635e-05, + "loss": 2.9387, + "step": 146 + }, + { + "epoch": 0.18375, + "grad_norm": 0.9832570552825928, + "learning_rate": 1.6325e-05, + "loss": 2.8681, + "step": 147 + }, + { + "epoch": 0.185, + "grad_norm": 1.0604499578475952, + "learning_rate": 1.63e-05, + "loss": 2.5064, + "step": 148 + }, + { + "epoch": 0.18625, + "grad_norm": 0.48051193356513977, + "learning_rate": 1.6275e-05, + "loss": 2.4564, + "step": 149 + }, + { + "epoch": 0.1875, + "grad_norm": 1.50802481174469, + "learning_rate": 1.6250000000000002e-05, + "loss": 2.8379, + "step": 150 + }, + { + "epoch": 0.18875, + "grad_norm": 1.812436580657959, + "learning_rate": 1.6225e-05, + "loss": 1.9878, + "step": 151 + }, + { + "epoch": 0.19, + "grad_norm": 1.7669923305511475, + "learning_rate": 1.62e-05, + "loss": 2.8151, + "step": 152 + }, + { + "epoch": 0.19125, + "grad_norm": 2.281033754348755, + "learning_rate": 1.6175e-05, + "loss": 3.3304, + "step": 153 + }, + { + "epoch": 0.1925, + "grad_norm": 0.9153265357017517, + "learning_rate": 1.6150000000000003e-05, + "loss": 1.8299, + "step": 154 + }, + { + "epoch": 0.19375, + "grad_norm": 1.041798710823059, + "learning_rate": 1.6125000000000002e-05, + "loss": 2.9185, + "step": 155 + }, + { + "epoch": 0.195, + "grad_norm": 0.6834219694137573, + "learning_rate": 1.6100000000000002e-05, + "loss": 2.3889, + "step": 156 + }, + { + "epoch": 0.19625, + "grad_norm": 1.8793840408325195, + "learning_rate": 1.6075e-05, + "loss": 2.7536, + "step": 157 + }, + { + "epoch": 0.1975, + "grad_norm": 1.4421665668487549, + "learning_rate": 1.605e-05, + "loss": 2.9225, + "step": 158 + }, + { + "epoch": 0.19875, + "grad_norm": 2.3824641704559326, + "learning_rate": 1.6025000000000003e-05, + "loss": 3.4679, + "step": 159 + }, + { + "epoch": 0.2, + "grad_norm": 2.144702672958374, + "learning_rate": 1.6000000000000003e-05, + "loss": 3.194, + "step": 160 + }, + { + "epoch": 0.20125, + "grad_norm": 1.5149937868118286, + "learning_rate": 1.5975000000000002e-05, + "loss": 3.0138, + "step": 161 + }, + { + "epoch": 0.2025, + "grad_norm": 2.1063072681427, + "learning_rate": 1.595e-05, + "loss": 3.0519, + "step": 162 + }, + { + "epoch": 0.20375, + "grad_norm": 0.820692777633667, + "learning_rate": 1.5925e-05, + "loss": 1.9736, + "step": 163 + }, + { + "epoch": 0.205, + "grad_norm": 1.1006802320480347, + "learning_rate": 1.5900000000000004e-05, + "loss": 2.7983, + "step": 164 + }, + { + "epoch": 0.20625, + "grad_norm": 1.038564920425415, + "learning_rate": 1.5875e-05, + "loss": 2.506, + "step": 165 + }, + { + "epoch": 0.2075, + "grad_norm": 1.1868886947631836, + "learning_rate": 1.5850000000000002e-05, + "loss": 2.2386, + "step": 166 + }, + { + "epoch": 0.20875, + "grad_norm": 0.9923886060714722, + "learning_rate": 1.5825000000000002e-05, + "loss": 2.2732, + "step": 167 + }, + { + "epoch": 0.21, + "grad_norm": 1.5702191591262817, + "learning_rate": 1.58e-05, + "loss": 3.1675, + "step": 168 + }, + { + "epoch": 0.21125, + "grad_norm": 1.3807926177978516, + "learning_rate": 1.5775e-05, + "loss": 2.1471, + "step": 169 + }, + { + "epoch": 0.2125, + "grad_norm": 1.0479799509048462, + "learning_rate": 1.575e-05, + "loss": 2.1308, + "step": 170 + }, + { + "epoch": 0.21375, + "grad_norm": 1.8413770198822021, + "learning_rate": 1.5725000000000003e-05, + "loss": 2.9807, + "step": 171 + }, + { + "epoch": 0.215, + "grad_norm": 1.8779033422470093, + "learning_rate": 1.5700000000000002e-05, + "loss": 2.6333, + "step": 172 + }, + { + "epoch": 0.21625, + "grad_norm": 0.9915759563446045, + "learning_rate": 1.5675e-05, + "loss": 2.1059, + "step": 173 + }, + { + "epoch": 0.2175, + "grad_norm": 1.3852919340133667, + "learning_rate": 1.565e-05, + "loss": 2.8429, + "step": 174 + }, + { + "epoch": 0.21875, + "grad_norm": 0.8641107082366943, + "learning_rate": 1.5625e-05, + "loss": 2.5064, + "step": 175 + }, + { + "epoch": 0.22, + "grad_norm": 1.1318968534469604, + "learning_rate": 1.5600000000000003e-05, + "loss": 2.6325, + "step": 176 + }, + { + "epoch": 0.22125, + "grad_norm": 1.3027793169021606, + "learning_rate": 1.5575000000000002e-05, + "loss": 2.0696, + "step": 177 + }, + { + "epoch": 0.2225, + "grad_norm": 1.8702739477157593, + "learning_rate": 1.5550000000000002e-05, + "loss": 2.6865, + "step": 178 + }, + { + "epoch": 0.22375, + "grad_norm": 1.9452624320983887, + "learning_rate": 1.5525e-05, + "loss": 3.6065, + "step": 179 + }, + { + "epoch": 0.225, + "grad_norm": 2.445535182952881, + "learning_rate": 1.55e-05, + "loss": 3.6079, + "step": 180 + }, + { + "epoch": 0.22625, + "grad_norm": 1.1266591548919678, + "learning_rate": 1.5475000000000003e-05, + "loss": 2.3907, + "step": 181 + }, + { + "epoch": 0.2275, + "grad_norm": 1.3965399265289307, + "learning_rate": 1.545e-05, + "loss": 2.7698, + "step": 182 + }, + { + "epoch": 0.22875, + "grad_norm": 1.1321989297866821, + "learning_rate": 1.5425000000000002e-05, + "loss": 3.7761, + "step": 183 + }, + { + "epoch": 0.23, + "grad_norm": 1.289818525314331, + "learning_rate": 1.54e-05, + "loss": 2.7404, + "step": 184 + }, + { + "epoch": 0.23125, + "grad_norm": 1.3494073152542114, + "learning_rate": 1.5375e-05, + "loss": 2.6953, + "step": 185 + }, + { + "epoch": 0.2325, + "grad_norm": 0.6751367449760437, + "learning_rate": 1.535e-05, + "loss": 2.4991, + "step": 186 + }, + { + "epoch": 0.23375, + "grad_norm": 1.2142468690872192, + "learning_rate": 1.5325e-05, + "loss": 2.7054, + "step": 187 + }, + { + "epoch": 0.235, + "grad_norm": 1.5462883710861206, + "learning_rate": 1.5300000000000003e-05, + "loss": 3.0457, + "step": 188 + }, + { + "epoch": 0.23625, + "grad_norm": 2.16507625579834, + "learning_rate": 1.5275000000000002e-05, + "loss": 2.8668, + "step": 189 + }, + { + "epoch": 0.2375, + "grad_norm": 2.034597635269165, + "learning_rate": 1.525e-05, + "loss": 3.3775, + "step": 190 + }, + { + "epoch": 0.23875, + "grad_norm": 1.8364763259887695, + "learning_rate": 1.5225e-05, + "loss": 2.3838, + "step": 191 + }, + { + "epoch": 0.24, + "grad_norm": 2.2459280490875244, + "learning_rate": 1.5200000000000002e-05, + "loss": 2.604, + "step": 192 + }, + { + "epoch": 0.24125, + "grad_norm": 1.3169443607330322, + "learning_rate": 1.5175000000000001e-05, + "loss": 2.6639, + "step": 193 + }, + { + "epoch": 0.2425, + "grad_norm": 1.6183668375015259, + "learning_rate": 1.515e-05, + "loss": 2.663, + "step": 194 + }, + { + "epoch": 0.24375, + "grad_norm": 0.8612351417541504, + "learning_rate": 1.5125e-05, + "loss": 2.2988, + "step": 195 + }, + { + "epoch": 0.245, + "grad_norm": 1.1287574768066406, + "learning_rate": 1.5100000000000001e-05, + "loss": 2.5189, + "step": 196 + }, + { + "epoch": 0.24625, + "grad_norm": 0.8260729312896729, + "learning_rate": 1.5075000000000002e-05, + "loss": 2.1106, + "step": 197 + }, + { + "epoch": 0.2475, + "grad_norm": 1.1428359746932983, + "learning_rate": 1.505e-05, + "loss": 2.3685, + "step": 198 + }, + { + "epoch": 0.24875, + "grad_norm": 2.2927756309509277, + "learning_rate": 1.5025000000000001e-05, + "loss": 3.2356, + "step": 199 + }, + { + "epoch": 0.25, + "grad_norm": 1.6453107595443726, + "learning_rate": 1.5000000000000002e-05, + "loss": 2.9095, + "step": 200 + }, + { + "epoch": 0.25125, + "grad_norm": 2.4193787574768066, + "learning_rate": 1.4975000000000001e-05, + "loss": 3.0942, + "step": 201 + }, + { + "epoch": 0.2525, + "grad_norm": 1.536061406135559, + "learning_rate": 1.4950000000000003e-05, + "loss": 2.6016, + "step": 202 + }, + { + "epoch": 0.25375, + "grad_norm": 1.0485507249832153, + "learning_rate": 1.4925e-05, + "loss": 2.4134, + "step": 203 + }, + { + "epoch": 0.255, + "grad_norm": 2.519632577896118, + "learning_rate": 1.4900000000000001e-05, + "loss": 2.4384, + "step": 204 + }, + { + "epoch": 0.25625, + "grad_norm": 1.311072826385498, + "learning_rate": 1.4875000000000002e-05, + "loss": 3.0807, + "step": 205 + }, + { + "epoch": 0.2575, + "grad_norm": 2.9829020500183105, + "learning_rate": 1.4850000000000002e-05, + "loss": 3.4535, + "step": 206 + }, + { + "epoch": 0.25875, + "grad_norm": 1.2975739240646362, + "learning_rate": 1.4825000000000001e-05, + "loss": 2.5714, + "step": 207 + }, + { + "epoch": 0.26, + "grad_norm": 1.9030929803848267, + "learning_rate": 1.48e-05, + "loss": 2.7733, + "step": 208 + }, + { + "epoch": 0.26125, + "grad_norm": 0.9590077996253967, + "learning_rate": 1.4775000000000002e-05, + "loss": 2.6125, + "step": 209 + }, + { + "epoch": 0.2625, + "grad_norm": 0.9263010025024414, + "learning_rate": 1.4750000000000003e-05, + "loss": 2.6456, + "step": 210 + }, + { + "epoch": 0.26375, + "grad_norm": 2.1573827266693115, + "learning_rate": 1.4725e-05, + "loss": 2.4447, + "step": 211 + }, + { + "epoch": 0.265, + "grad_norm": 1.386278510093689, + "learning_rate": 1.4700000000000002e-05, + "loss": 2.5971, + "step": 212 + }, + { + "epoch": 0.26625, + "grad_norm": 1.487113118171692, + "learning_rate": 1.4675000000000001e-05, + "loss": 2.8983, + "step": 213 + }, + { + "epoch": 0.2675, + "grad_norm": 1.6664327383041382, + "learning_rate": 1.4650000000000002e-05, + "loss": 2.0951, + "step": 214 + }, + { + "epoch": 0.26875, + "grad_norm": 1.2928472757339478, + "learning_rate": 1.4625e-05, + "loss": 2.7343, + "step": 215 + }, + { + "epoch": 0.27, + "grad_norm": 1.1214581727981567, + "learning_rate": 1.46e-05, + "loss": 2.149, + "step": 216 + }, + { + "epoch": 0.27125, + "grad_norm": 1.724827766418457, + "learning_rate": 1.4575000000000002e-05, + "loss": 2.3816, + "step": 217 + }, + { + "epoch": 0.2725, + "grad_norm": 2.641063690185547, + "learning_rate": 1.4550000000000001e-05, + "loss": 3.357, + "step": 218 + }, + { + "epoch": 0.27375, + "grad_norm": 1.2942688465118408, + "learning_rate": 1.4525e-05, + "loss": 2.8008, + "step": 219 + }, + { + "epoch": 0.275, + "grad_norm": 2.02486252784729, + "learning_rate": 1.45e-05, + "loss": 2.724, + "step": 220 + }, + { + "epoch": 0.27625, + "grad_norm": 2.8531992435455322, + "learning_rate": 1.4475000000000001e-05, + "loss": 2.8367, + "step": 221 + }, + { + "epoch": 0.2775, + "grad_norm": 0.9137434363365173, + "learning_rate": 1.4450000000000002e-05, + "loss": 2.0391, + "step": 222 + }, + { + "epoch": 0.27875, + "grad_norm": 2.9269864559173584, + "learning_rate": 1.4425e-05, + "loss": 2.9638, + "step": 223 + }, + { + "epoch": 0.28, + "grad_norm": 3.105649471282959, + "learning_rate": 1.4400000000000001e-05, + "loss": 3.2201, + "step": 224 + }, + { + "epoch": 0.28125, + "grad_norm": 2.2428364753723145, + "learning_rate": 1.4375e-05, + "loss": 2.7664, + "step": 225 + }, + { + "epoch": 0.2825, + "grad_norm": 1.8540781736373901, + "learning_rate": 1.4350000000000002e-05, + "loss": 3.1237, + "step": 226 + }, + { + "epoch": 0.28375, + "grad_norm": 2.0040481090545654, + "learning_rate": 1.4325000000000003e-05, + "loss": 2.5248, + "step": 227 + }, + { + "epoch": 0.285, + "grad_norm": 1.4190040826797485, + "learning_rate": 1.43e-05, + "loss": 2.9788, + "step": 228 + }, + { + "epoch": 0.28625, + "grad_norm": 2.412309408187866, + "learning_rate": 1.4275000000000001e-05, + "loss": 2.568, + "step": 229 + }, + { + "epoch": 0.2875, + "grad_norm": 1.4534833431243896, + "learning_rate": 1.425e-05, + "loss": 2.7647, + "step": 230 + }, + { + "epoch": 0.28875, + "grad_norm": 0.662830650806427, + "learning_rate": 1.4225000000000002e-05, + "loss": 2.5177, + "step": 231 + }, + { + "epoch": 0.29, + "grad_norm": 2.5043435096740723, + "learning_rate": 1.4200000000000001e-05, + "loss": 2.8444, + "step": 232 + }, + { + "epoch": 0.29125, + "grad_norm": 1.3479336500167847, + "learning_rate": 1.4175e-05, + "loss": 2.2902, + "step": 233 + }, + { + "epoch": 0.2925, + "grad_norm": 1.9785624742507935, + "learning_rate": 1.4150000000000002e-05, + "loss": 2.8377, + "step": 234 + }, + { + "epoch": 0.29375, + "grad_norm": 3.683432102203369, + "learning_rate": 1.4125000000000003e-05, + "loss": 3.849, + "step": 235 + }, + { + "epoch": 0.295, + "grad_norm": 2.3539721965789795, + "learning_rate": 1.41e-05, + "loss": 3.0713, + "step": 236 + }, + { + "epoch": 0.29625, + "grad_norm": 2.2666311264038086, + "learning_rate": 1.4075000000000002e-05, + "loss": 2.6689, + "step": 237 + }, + { + "epoch": 0.2975, + "grad_norm": 0.7512000203132629, + "learning_rate": 1.4050000000000001e-05, + "loss": 2.1519, + "step": 238 + }, + { + "epoch": 0.29875, + "grad_norm": 1.755519986152649, + "learning_rate": 1.4025000000000002e-05, + "loss": 2.9841, + "step": 239 + }, + { + "epoch": 0.3, + "grad_norm": 0.8116776943206787, + "learning_rate": 1.4e-05, + "loss": 2.4049, + "step": 240 + }, + { + "epoch": 0.30125, + "grad_norm": 1.5843620300292969, + "learning_rate": 1.3975000000000001e-05, + "loss": 2.7027, + "step": 241 + }, + { + "epoch": 0.3025, + "grad_norm": 1.8443893194198608, + "learning_rate": 1.3950000000000002e-05, + "loss": 2.6175, + "step": 242 + }, + { + "epoch": 0.30375, + "grad_norm": 4.015542030334473, + "learning_rate": 1.3925000000000001e-05, + "loss": 3.2365, + "step": 243 + }, + { + "epoch": 0.305, + "grad_norm": 1.515284776687622, + "learning_rate": 1.39e-05, + "loss": 3.4183, + "step": 244 + }, + { + "epoch": 0.30625, + "grad_norm": 1.4685897827148438, + "learning_rate": 1.3875e-05, + "loss": 2.5062, + "step": 245 + }, + { + "epoch": 0.3075, + "grad_norm": 1.9483778476715088, + "learning_rate": 1.3850000000000001e-05, + "loss": 2.6737, + "step": 246 + }, + { + "epoch": 0.30875, + "grad_norm": 2.194448471069336, + "learning_rate": 1.3825000000000002e-05, + "loss": 2.5334, + "step": 247 + }, + { + "epoch": 0.31, + "grad_norm": 1.1393736600875854, + "learning_rate": 1.38e-05, + "loss": 2.452, + "step": 248 + }, + { + "epoch": 0.31125, + "grad_norm": 3.909176826477051, + "learning_rate": 1.3775000000000001e-05, + "loss": 3.8808, + "step": 249 + }, + { + "epoch": 0.3125, + "grad_norm": 1.2332671880722046, + "learning_rate": 1.375e-05, + "loss": 2.5233, + "step": 250 + }, + { + "epoch": 0.31375, + "grad_norm": 2.6342389583587646, + "learning_rate": 1.3725000000000002e-05, + "loss": 2.4018, + "step": 251 + }, + { + "epoch": 0.315, + "grad_norm": 2.0232162475585938, + "learning_rate": 1.3700000000000003e-05, + "loss": 2.704, + "step": 252 + }, + { + "epoch": 0.31625, + "grad_norm": 1.709433674812317, + "learning_rate": 1.3675e-05, + "loss": 3.0054, + "step": 253 + }, + { + "epoch": 0.3175, + "grad_norm": 1.425729751586914, + "learning_rate": 1.3650000000000001e-05, + "loss": 2.5461, + "step": 254 + }, + { + "epoch": 0.31875, + "grad_norm": 1.902305245399475, + "learning_rate": 1.3625e-05, + "loss": 2.7863, + "step": 255 + }, + { + "epoch": 0.32, + "grad_norm": 1.7936694622039795, + "learning_rate": 1.3600000000000002e-05, + "loss": 2.5435, + "step": 256 + }, + { + "epoch": 0.32125, + "grad_norm": 2.072840929031372, + "learning_rate": 1.3575e-05, + "loss": 2.7425, + "step": 257 + }, + { + "epoch": 0.3225, + "grad_norm": 2.37286376953125, + "learning_rate": 1.355e-05, + "loss": 3.6158, + "step": 258 + }, + { + "epoch": 0.32375, + "grad_norm": 1.3893417119979858, + "learning_rate": 1.3525000000000002e-05, + "loss": 2.2764, + "step": 259 + }, + { + "epoch": 0.325, + "grad_norm": 1.2564778327941895, + "learning_rate": 1.3500000000000001e-05, + "loss": 1.9123, + "step": 260 + }, + { + "epoch": 0.32625, + "grad_norm": 1.7227541208267212, + "learning_rate": 1.3475e-05, + "loss": 2.8062, + "step": 261 + }, + { + "epoch": 0.3275, + "grad_norm": 1.5551847219467163, + "learning_rate": 1.3450000000000002e-05, + "loss": 2.5495, + "step": 262 + }, + { + "epoch": 0.32875, + "grad_norm": 3.329219102859497, + "learning_rate": 1.3425000000000001e-05, + "loss": 3.1482, + "step": 263 + }, + { + "epoch": 0.33, + "grad_norm": 2.6194255352020264, + "learning_rate": 1.3400000000000002e-05, + "loss": 2.0326, + "step": 264 + }, + { + "epoch": 0.33125, + "grad_norm": 2.509632110595703, + "learning_rate": 1.3375e-05, + "loss": 2.49, + "step": 265 + }, + { + "epoch": 0.3325, + "grad_norm": 1.3284372091293335, + "learning_rate": 1.3350000000000001e-05, + "loss": 2.7824, + "step": 266 + }, + { + "epoch": 0.33375, + "grad_norm": 1.663150429725647, + "learning_rate": 1.3325000000000002e-05, + "loss": 2.5316, + "step": 267 + }, + { + "epoch": 0.335, + "grad_norm": 1.093685507774353, + "learning_rate": 1.3300000000000001e-05, + "loss": 2.3014, + "step": 268 + }, + { + "epoch": 0.33625, + "grad_norm": 2.4130373001098633, + "learning_rate": 1.3275e-05, + "loss": 2.3933, + "step": 269 + }, + { + "epoch": 0.3375, + "grad_norm": 1.6137781143188477, + "learning_rate": 1.325e-05, + "loss": 1.8412, + "step": 270 + }, + { + "epoch": 0.33875, + "grad_norm": 2.0770466327667236, + "learning_rate": 1.3225000000000001e-05, + "loss": 2.4478, + "step": 271 + }, + { + "epoch": 0.34, + "grad_norm": 2.511610269546509, + "learning_rate": 1.3200000000000002e-05, + "loss": 2.6513, + "step": 272 + }, + { + "epoch": 0.34125, + "grad_norm": 1.3404005765914917, + "learning_rate": 1.3175e-05, + "loss": 2.3267, + "step": 273 + }, + { + "epoch": 0.3425, + "grad_norm": 2.10129714012146, + "learning_rate": 1.3150000000000001e-05, + "loss": 2.8609, + "step": 274 + }, + { + "epoch": 0.34375, + "grad_norm": 4.970488548278809, + "learning_rate": 1.3125e-05, + "loss": 4.5557, + "step": 275 + }, + { + "epoch": 0.345, + "grad_norm": 1.0287278890609741, + "learning_rate": 1.3100000000000002e-05, + "loss": 1.9752, + "step": 276 + }, + { + "epoch": 0.34625, + "grad_norm": 1.4155839681625366, + "learning_rate": 1.3075000000000003e-05, + "loss": 2.5988, + "step": 277 + }, + { + "epoch": 0.3475, + "grad_norm": 2.1226813793182373, + "learning_rate": 1.305e-05, + "loss": 2.4375, + "step": 278 + }, + { + "epoch": 0.34875, + "grad_norm": 1.3161485195159912, + "learning_rate": 1.3025000000000002e-05, + "loss": 2.3141, + "step": 279 + }, + { + "epoch": 0.35, + "grad_norm": 1.0883145332336426, + "learning_rate": 1.3000000000000001e-05, + "loss": 2.2943, + "step": 280 + }, + { + "epoch": 0.35125, + "grad_norm": 0.6219211220741272, + "learning_rate": 1.2975000000000002e-05, + "loss": 2.7156, + "step": 281 + }, + { + "epoch": 0.3525, + "grad_norm": 3.4014060497283936, + "learning_rate": 1.295e-05, + "loss": 1.9477, + "step": 282 + }, + { + "epoch": 0.35375, + "grad_norm": 1.447366714477539, + "learning_rate": 1.2925e-05, + "loss": 3.2115, + "step": 283 + }, + { + "epoch": 0.355, + "grad_norm": 0.8695298433303833, + "learning_rate": 1.2900000000000002e-05, + "loss": 2.0062, + "step": 284 + }, + { + "epoch": 0.35625, + "grad_norm": 1.2442666292190552, + "learning_rate": 1.2875000000000001e-05, + "loss": 2.782, + "step": 285 + }, + { + "epoch": 0.3575, + "grad_norm": 4.334051609039307, + "learning_rate": 1.285e-05, + "loss": 3.7922, + "step": 286 + }, + { + "epoch": 0.35875, + "grad_norm": 1.0075643062591553, + "learning_rate": 1.2825e-05, + "loss": 2.3886, + "step": 287 + }, + { + "epoch": 0.36, + "grad_norm": 2.952709436416626, + "learning_rate": 1.2800000000000001e-05, + "loss": 3.1041, + "step": 288 + }, + { + "epoch": 0.36125, + "grad_norm": 1.9426610469818115, + "learning_rate": 1.2775000000000002e-05, + "loss": 2.5739, + "step": 289 + }, + { + "epoch": 0.3625, + "grad_norm": 1.2706924676895142, + "learning_rate": 1.275e-05, + "loss": 2.6037, + "step": 290 + }, + { + "epoch": 0.36375, + "grad_norm": 1.1551876068115234, + "learning_rate": 1.2725000000000001e-05, + "loss": 2.2662, + "step": 291 + }, + { + "epoch": 0.365, + "grad_norm": 2.400639772415161, + "learning_rate": 1.27e-05, + "loss": 3.0005, + "step": 292 + }, + { + "epoch": 0.36625, + "grad_norm": 2.8095648288726807, + "learning_rate": 1.2675000000000001e-05, + "loss": 3.0654, + "step": 293 + }, + { + "epoch": 0.3675, + "grad_norm": 0.8239273428916931, + "learning_rate": 1.2650000000000001e-05, + "loss": 1.9067, + "step": 294 + }, + { + "epoch": 0.36875, + "grad_norm": 1.5768872499465942, + "learning_rate": 1.2625e-05, + "loss": 2.4867, + "step": 295 + }, + { + "epoch": 0.37, + "grad_norm": 1.463496208190918, + "learning_rate": 1.2600000000000001e-05, + "loss": 3.0372, + "step": 296 + }, + { + "epoch": 0.37125, + "grad_norm": 1.8864333629608154, + "learning_rate": 1.2575000000000002e-05, + "loss": 2.5445, + "step": 297 + }, + { + "epoch": 0.3725, + "grad_norm": 2.185873508453369, + "learning_rate": 1.255e-05, + "loss": 2.7776, + "step": 298 + }, + { + "epoch": 0.37375, + "grad_norm": 1.242254376411438, + "learning_rate": 1.2525000000000001e-05, + "loss": 2.6274, + "step": 299 + }, + { + "epoch": 0.375, + "grad_norm": 1.5784997940063477, + "learning_rate": 1.25e-05, + "loss": 2.6328, + "step": 300 + }, + { + "epoch": 0.37625, + "grad_norm": 2.0619335174560547, + "learning_rate": 1.2475000000000002e-05, + "loss": 2.2983, + "step": 301 + }, + { + "epoch": 0.3775, + "grad_norm": 1.7328943014144897, + "learning_rate": 1.2450000000000003e-05, + "loss": 2.3851, + "step": 302 + }, + { + "epoch": 0.37875, + "grad_norm": 1.7219791412353516, + "learning_rate": 1.2425e-05, + "loss": 2.1684, + "step": 303 + }, + { + "epoch": 0.38, + "grad_norm": 1.9850049018859863, + "learning_rate": 1.2400000000000002e-05, + "loss": 3.1503, + "step": 304 + }, + { + "epoch": 0.38125, + "grad_norm": 1.193284511566162, + "learning_rate": 1.2375000000000001e-05, + "loss": 2.8272, + "step": 305 + }, + { + "epoch": 0.3825, + "grad_norm": 8.433061599731445, + "learning_rate": 1.2350000000000002e-05, + "loss": 2.7003, + "step": 306 + }, + { + "epoch": 0.38375, + "grad_norm": 1.3799391984939575, + "learning_rate": 1.2325e-05, + "loss": 2.7715, + "step": 307 + }, + { + "epoch": 0.385, + "grad_norm": 1.137082576751709, + "learning_rate": 1.23e-05, + "loss": 2.6402, + "step": 308 + }, + { + "epoch": 0.38625, + "grad_norm": 3.6281142234802246, + "learning_rate": 1.2275000000000002e-05, + "loss": 3.1791, + "step": 309 + }, + { + "epoch": 0.3875, + "grad_norm": 1.4617842435836792, + "learning_rate": 1.2250000000000001e-05, + "loss": 2.2238, + "step": 310 + }, + { + "epoch": 0.38875, + "grad_norm": 1.729715347290039, + "learning_rate": 1.2225e-05, + "loss": 3.2183, + "step": 311 + }, + { + "epoch": 0.39, + "grad_norm": 1.7599000930786133, + "learning_rate": 1.22e-05, + "loss": 2.6337, + "step": 312 + }, + { + "epoch": 0.39125, + "grad_norm": 2.0601325035095215, + "learning_rate": 1.2175000000000001e-05, + "loss": 2.7048, + "step": 313 + }, + { + "epoch": 0.3925, + "grad_norm": 1.496025562286377, + "learning_rate": 1.2150000000000002e-05, + "loss": 2.2228, + "step": 314 + }, + { + "epoch": 0.39375, + "grad_norm": 2.1809468269348145, + "learning_rate": 1.2125e-05, + "loss": 2.1678, + "step": 315 + }, + { + "epoch": 0.395, + "grad_norm": 1.5901917219161987, + "learning_rate": 1.2100000000000001e-05, + "loss": 1.8807, + "step": 316 + }, + { + "epoch": 0.39625, + "grad_norm": 1.4371182918548584, + "learning_rate": 1.2075e-05, + "loss": 2.0627, + "step": 317 + }, + { + "epoch": 0.3975, + "grad_norm": 1.4142842292785645, + "learning_rate": 1.2050000000000002e-05, + "loss": 1.93, + "step": 318 + }, + { + "epoch": 0.39875, + "grad_norm": 1.0198673009872437, + "learning_rate": 1.2025e-05, + "loss": 2.3295, + "step": 319 + }, + { + "epoch": 0.4, + "grad_norm": 1.1797082424163818, + "learning_rate": 1.2e-05, + "loss": 2.5342, + "step": 320 + }, + { + "epoch": 0.40125, + "grad_norm": 1.0422452688217163, + "learning_rate": 1.1975000000000001e-05, + "loss": 2.0141, + "step": 321 + }, + { + "epoch": 0.4025, + "grad_norm": 1.1430671215057373, + "learning_rate": 1.195e-05, + "loss": 1.8992, + "step": 322 + }, + { + "epoch": 0.40375, + "grad_norm": 2.177053928375244, + "learning_rate": 1.1925e-05, + "loss": 2.7447, + "step": 323 + }, + { + "epoch": 0.405, + "grad_norm": 2.095257520675659, + "learning_rate": 1.1900000000000001e-05, + "loss": 2.5267, + "step": 324 + }, + { + "epoch": 0.40625, + "grad_norm": 1.4478448629379272, + "learning_rate": 1.1875e-05, + "loss": 2.6989, + "step": 325 + }, + { + "epoch": 0.4075, + "grad_norm": 1.9374433755874634, + "learning_rate": 1.1850000000000002e-05, + "loss": 2.4978, + "step": 326 + }, + { + "epoch": 0.40875, + "grad_norm": 0.9773975014686584, + "learning_rate": 1.1825000000000003e-05, + "loss": 2.6889, + "step": 327 + }, + { + "epoch": 0.41, + "grad_norm": 1.5445204973220825, + "learning_rate": 1.18e-05, + "loss": 2.4233, + "step": 328 + }, + { + "epoch": 0.41125, + "grad_norm": 1.492608904838562, + "learning_rate": 1.1775000000000002e-05, + "loss": 2.6976, + "step": 329 + }, + { + "epoch": 0.4125, + "grad_norm": 2.212707996368408, + "learning_rate": 1.1750000000000001e-05, + "loss": 3.2724, + "step": 330 + }, + { + "epoch": 0.41375, + "grad_norm": 1.4107556343078613, + "learning_rate": 1.1725000000000002e-05, + "loss": 3.0866, + "step": 331 + }, + { + "epoch": 0.415, + "grad_norm": 2.913604736328125, + "learning_rate": 1.17e-05, + "loss": 2.4357, + "step": 332 + }, + { + "epoch": 0.41625, + "grad_norm": 3.2254488468170166, + "learning_rate": 1.1675000000000001e-05, + "loss": 3.5358, + "step": 333 + }, + { + "epoch": 0.4175, + "grad_norm": 1.6059589385986328, + "learning_rate": 1.1650000000000002e-05, + "loss": 2.6456, + "step": 334 + }, + { + "epoch": 0.41875, + "grad_norm": 2.4393796920776367, + "learning_rate": 1.1625000000000001e-05, + "loss": 2.0989, + "step": 335 + }, + { + "epoch": 0.42, + "grad_norm": 1.5735288858413696, + "learning_rate": 1.16e-05, + "loss": 2.979, + "step": 336 + }, + { + "epoch": 0.42125, + "grad_norm": 1.5758427381515503, + "learning_rate": 1.1575e-05, + "loss": 2.7381, + "step": 337 + }, + { + "epoch": 0.4225, + "grad_norm": 0.8109416961669922, + "learning_rate": 1.1550000000000001e-05, + "loss": 2.3608, + "step": 338 + }, + { + "epoch": 0.42375, + "grad_norm": 1.8177721500396729, + "learning_rate": 1.1525000000000002e-05, + "loss": 2.8134, + "step": 339 + }, + { + "epoch": 0.425, + "grad_norm": 1.5914677381515503, + "learning_rate": 1.15e-05, + "loss": 2.4923, + "step": 340 + }, + { + "epoch": 0.42625, + "grad_norm": 0.7988778352737427, + "learning_rate": 1.1475000000000001e-05, + "loss": 1.855, + "step": 341 + }, + { + "epoch": 0.4275, + "grad_norm": 1.4848300218582153, + "learning_rate": 1.145e-05, + "loss": 2.1767, + "step": 342 + }, + { + "epoch": 0.42875, + "grad_norm": 2.404296875, + "learning_rate": 1.1425000000000002e-05, + "loss": 2.5743, + "step": 343 + }, + { + "epoch": 0.43, + "grad_norm": 0.8775387406349182, + "learning_rate": 1.14e-05, + "loss": 2.0211, + "step": 344 + }, + { + "epoch": 0.43125, + "grad_norm": 1.3193469047546387, + "learning_rate": 1.1375e-05, + "loss": 1.3909, + "step": 345 + }, + { + "epoch": 0.4325, + "grad_norm": 2.3813071250915527, + "learning_rate": 1.1350000000000001e-05, + "loss": 3.1726, + "step": 346 + }, + { + "epoch": 0.43375, + "grad_norm": 1.7489192485809326, + "learning_rate": 1.1325e-05, + "loss": 2.557, + "step": 347 + }, + { + "epoch": 0.435, + "grad_norm": 1.4432573318481445, + "learning_rate": 1.13e-05, + "loss": 2.8275, + "step": 348 + }, + { + "epoch": 0.43625, + "grad_norm": 1.5648714303970337, + "learning_rate": 1.1275e-05, + "loss": 1.975, + "step": 349 + }, + { + "epoch": 0.4375, + "grad_norm": 1.1547632217407227, + "learning_rate": 1.125e-05, + "loss": 1.8893, + "step": 350 + }, + { + "epoch": 0.43875, + "grad_norm": 1.3992825746536255, + "learning_rate": 1.1225000000000002e-05, + "loss": 2.3774, + "step": 351 + }, + { + "epoch": 0.44, + "grad_norm": 1.0416218042373657, + "learning_rate": 1.1200000000000001e-05, + "loss": 2.3281, + "step": 352 + }, + { + "epoch": 0.44125, + "grad_norm": 1.459821105003357, + "learning_rate": 1.1175e-05, + "loss": 2.1034, + "step": 353 + }, + { + "epoch": 0.4425, + "grad_norm": 1.4830873012542725, + "learning_rate": 1.1150000000000002e-05, + "loss": 2.8088, + "step": 354 + }, + { + "epoch": 0.44375, + "grad_norm": 1.200899362564087, + "learning_rate": 1.1125000000000001e-05, + "loss": 2.3231, + "step": 355 + }, + { + "epoch": 0.445, + "grad_norm": 2.005309820175171, + "learning_rate": 1.1100000000000002e-05, + "loss": 3.5848, + "step": 356 + }, + { + "epoch": 0.44625, + "grad_norm": 1.3485474586486816, + "learning_rate": 1.1075e-05, + "loss": 2.7273, + "step": 357 + }, + { + "epoch": 0.4475, + "grad_norm": 1.5020636320114136, + "learning_rate": 1.1050000000000001e-05, + "loss": 2.5139, + "step": 358 + }, + { + "epoch": 0.44875, + "grad_norm": 1.4698582887649536, + "learning_rate": 1.1025000000000002e-05, + "loss": 2.4339, + "step": 359 + }, + { + "epoch": 0.45, + "grad_norm": 1.5619611740112305, + "learning_rate": 1.1000000000000001e-05, + "loss": 2.958, + "step": 360 + }, + { + "epoch": 0.45125, + "grad_norm": 1.1998145580291748, + "learning_rate": 1.0975e-05, + "loss": 2.4375, + "step": 361 + }, + { + "epoch": 0.4525, + "grad_norm": 3.1132500171661377, + "learning_rate": 1.095e-05, + "loss": 3.1062, + "step": 362 + }, + { + "epoch": 0.45375, + "grad_norm": 3.445772886276245, + "learning_rate": 1.0925000000000001e-05, + "loss": 3.3763, + "step": 363 + }, + { + "epoch": 0.455, + "grad_norm": 2.5865681171417236, + "learning_rate": 1.0900000000000002e-05, + "loss": 2.6878, + "step": 364 + }, + { + "epoch": 0.45625, + "grad_norm": 2.759075880050659, + "learning_rate": 1.0875e-05, + "loss": 2.9461, + "step": 365 + }, + { + "epoch": 0.4575, + "grad_norm": 2.7135589122772217, + "learning_rate": 1.0850000000000001e-05, + "loss": 2.7731, + "step": 366 + }, + { + "epoch": 0.45875, + "grad_norm": 3.1075406074523926, + "learning_rate": 1.0825e-05, + "loss": 2.9753, + "step": 367 + }, + { + "epoch": 0.46, + "grad_norm": 1.8841419219970703, + "learning_rate": 1.0800000000000002e-05, + "loss": 2.4276, + "step": 368 + }, + { + "epoch": 0.46125, + "grad_norm": 1.5538018941879272, + "learning_rate": 1.0775e-05, + "loss": 2.5934, + "step": 369 + }, + { + "epoch": 0.4625, + "grad_norm": 1.6202460527420044, + "learning_rate": 1.075e-05, + "loss": 2.2371, + "step": 370 + }, + { + "epoch": 0.46375, + "grad_norm": 1.8580071926116943, + "learning_rate": 1.0725000000000001e-05, + "loss": 2.7594, + "step": 371 + }, + { + "epoch": 0.465, + "grad_norm": 1.1591219902038574, + "learning_rate": 1.0700000000000001e-05, + "loss": 2.0254, + "step": 372 + }, + { + "epoch": 0.46625, + "grad_norm": 1.995326042175293, + "learning_rate": 1.0675e-05, + "loss": 2.7667, + "step": 373 + }, + { + "epoch": 0.4675, + "grad_norm": 1.5032466650009155, + "learning_rate": 1.065e-05, + "loss": 2.5207, + "step": 374 + }, + { + "epoch": 0.46875, + "grad_norm": 2.4473445415496826, + "learning_rate": 1.0625e-05, + "loss": 2.9158, + "step": 375 + }, + { + "epoch": 0.47, + "grad_norm": 3.215508460998535, + "learning_rate": 1.0600000000000002e-05, + "loss": 3.2581, + "step": 376 + }, + { + "epoch": 0.47125, + "grad_norm": 1.3192355632781982, + "learning_rate": 1.0575000000000001e-05, + "loss": 2.4273, + "step": 377 + }, + { + "epoch": 0.4725, + "grad_norm": 1.9035662412643433, + "learning_rate": 1.055e-05, + "loss": 2.9935, + "step": 378 + }, + { + "epoch": 0.47375, + "grad_norm": 1.2860153913497925, + "learning_rate": 1.0525e-05, + "loss": 2.6538, + "step": 379 + }, + { + "epoch": 0.475, + "grad_norm": 1.6805506944656372, + "learning_rate": 1.0500000000000001e-05, + "loss": 2.5256, + "step": 380 + }, + { + "epoch": 0.47625, + "grad_norm": 1.1306779384613037, + "learning_rate": 1.0475000000000002e-05, + "loss": 2.124, + "step": 381 + }, + { + "epoch": 0.4775, + "grad_norm": 2.3374178409576416, + "learning_rate": 1.045e-05, + "loss": 2.8561, + "step": 382 + }, + { + "epoch": 0.47875, + "grad_norm": 1.739198088645935, + "learning_rate": 1.0425000000000001e-05, + "loss": 2.6378, + "step": 383 + }, + { + "epoch": 0.48, + "grad_norm": 2.08516526222229, + "learning_rate": 1.04e-05, + "loss": 3.1051, + "step": 384 + }, + { + "epoch": 0.48125, + "grad_norm": 1.6211674213409424, + "learning_rate": 1.0375000000000001e-05, + "loss": 3.189, + "step": 385 + }, + { + "epoch": 0.4825, + "grad_norm": 1.768182396888733, + "learning_rate": 1.0350000000000001e-05, + "loss": 2.268, + "step": 386 + }, + { + "epoch": 0.48375, + "grad_norm": 1.2901172637939453, + "learning_rate": 1.0325e-05, + "loss": 2.2684, + "step": 387 + }, + { + "epoch": 0.485, + "grad_norm": 1.6452449560165405, + "learning_rate": 1.0300000000000001e-05, + "loss": 2.7506, + "step": 388 + }, + { + "epoch": 0.48625, + "grad_norm": 1.1954340934753418, + "learning_rate": 1.0275000000000002e-05, + "loss": 2.1034, + "step": 389 + }, + { + "epoch": 0.4875, + "grad_norm": 1.6171998977661133, + "learning_rate": 1.025e-05, + "loss": 2.6603, + "step": 390 + }, + { + "epoch": 0.48875, + "grad_norm": 1.2973988056182861, + "learning_rate": 1.0225000000000001e-05, + "loss": 2.6099, + "step": 391 + }, + { + "epoch": 0.49, + "grad_norm": 3.1014139652252197, + "learning_rate": 1.02e-05, + "loss": 2.9254, + "step": 392 + }, + { + "epoch": 0.49125, + "grad_norm": 1.9552756547927856, + "learning_rate": 1.0175000000000002e-05, + "loss": 2.892, + "step": 393 + }, + { + "epoch": 0.4925, + "grad_norm": 2.1882715225219727, + "learning_rate": 1.015e-05, + "loss": 3.0698, + "step": 394 + }, + { + "epoch": 0.49375, + "grad_norm": 1.013117790222168, + "learning_rate": 1.0125e-05, + "loss": 2.7269, + "step": 395 + }, + { + "epoch": 0.495, + "grad_norm": 1.3441874980926514, + "learning_rate": 1.0100000000000002e-05, + "loss": 1.7805, + "step": 396 + }, + { + "epoch": 0.49625, + "grad_norm": 1.674667477607727, + "learning_rate": 1.0075000000000001e-05, + "loss": 2.3934, + "step": 397 + }, + { + "epoch": 0.4975, + "grad_norm": 3.498429536819458, + "learning_rate": 1.005e-05, + "loss": 2.9463, + "step": 398 + }, + { + "epoch": 0.49875, + "grad_norm": 1.2237039804458618, + "learning_rate": 1.0025e-05, + "loss": 2.7709, + "step": 399 + }, + { + "epoch": 0.5, + "grad_norm": 2.075360059738159, + "learning_rate": 1e-05, + "loss": 3.304, + "step": 400 + }, + { + "epoch": 0.50125, + "grad_norm": 1.4542925357818604, + "learning_rate": 9.975000000000002e-06, + "loss": 2.1337, + "step": 401 + }, + { + "epoch": 0.5025, + "grad_norm": 2.508572816848755, + "learning_rate": 9.950000000000001e-06, + "loss": 2.842, + "step": 402 + }, + { + "epoch": 0.50375, + "grad_norm": 2.260857105255127, + "learning_rate": 9.925e-06, + "loss": 2.6903, + "step": 403 + }, + { + "epoch": 0.505, + "grad_norm": 1.9771928787231445, + "learning_rate": 9.9e-06, + "loss": 2.4159, + "step": 404 + }, + { + "epoch": 0.50625, + "grad_norm": 1.307166337966919, + "learning_rate": 9.875000000000001e-06, + "loss": 2.2289, + "step": 405 + }, + { + "epoch": 0.5075, + "grad_norm": 2.5441768169403076, + "learning_rate": 9.85e-06, + "loss": 2.4819, + "step": 406 + }, + { + "epoch": 0.50875, + "grad_norm": 1.579392433166504, + "learning_rate": 9.825000000000002e-06, + "loss": 2.5208, + "step": 407 + }, + { + "epoch": 0.51, + "grad_norm": 0.8270081281661987, + "learning_rate": 9.800000000000001e-06, + "loss": 2.2591, + "step": 408 + }, + { + "epoch": 0.51125, + "grad_norm": 1.8742762804031372, + "learning_rate": 9.775e-06, + "loss": 2.3944, + "step": 409 + }, + { + "epoch": 0.5125, + "grad_norm": 1.6326040029525757, + "learning_rate": 9.75e-06, + "loss": 2.7157, + "step": 410 + }, + { + "epoch": 0.51375, + "grad_norm": 2.675300121307373, + "learning_rate": 9.725000000000001e-06, + "loss": 2.648, + "step": 411 + }, + { + "epoch": 0.515, + "grad_norm": 1.707234263420105, + "learning_rate": 9.7e-06, + "loss": 2.2568, + "step": 412 + }, + { + "epoch": 0.51625, + "grad_norm": 1.860190987586975, + "learning_rate": 9.675000000000001e-06, + "loss": 2.3879, + "step": 413 + }, + { + "epoch": 0.5175, + "grad_norm": 2.554323673248291, + "learning_rate": 9.65e-06, + "loss": 2.4429, + "step": 414 + }, + { + "epoch": 0.51875, + "grad_norm": 3.1000308990478516, + "learning_rate": 9.625e-06, + "loss": 3.6857, + "step": 415 + }, + { + "epoch": 0.52, + "grad_norm": 1.2858200073242188, + "learning_rate": 9.600000000000001e-06, + "loss": 2.3423, + "step": 416 + }, + { + "epoch": 0.52125, + "grad_norm": 1.7305254936218262, + "learning_rate": 9.575e-06, + "loss": 2.5126, + "step": 417 + }, + { + "epoch": 0.5225, + "grad_norm": 2.0434417724609375, + "learning_rate": 9.55e-06, + "loss": 2.8066, + "step": 418 + }, + { + "epoch": 0.52375, + "grad_norm": 1.1936882734298706, + "learning_rate": 9.525000000000001e-06, + "loss": 1.9428, + "step": 419 + }, + { + "epoch": 0.525, + "grad_norm": 1.5286387205123901, + "learning_rate": 9.5e-06, + "loss": 2.0241, + "step": 420 + }, + { + "epoch": 0.52625, + "grad_norm": 1.162139654159546, + "learning_rate": 9.475000000000002e-06, + "loss": 2.8492, + "step": 421 + }, + { + "epoch": 0.5275, + "grad_norm": 2.5677266120910645, + "learning_rate": 9.450000000000001e-06, + "loss": 1.8682, + "step": 422 + }, + { + "epoch": 0.52875, + "grad_norm": 2.4863381385803223, + "learning_rate": 9.425e-06, + "loss": 2.8088, + "step": 423 + }, + { + "epoch": 0.53, + "grad_norm": 3.074288845062256, + "learning_rate": 9.4e-06, + "loss": 3.1033, + "step": 424 + }, + { + "epoch": 0.53125, + "grad_norm": 3.1781013011932373, + "learning_rate": 9.375000000000001e-06, + "loss": 2.7687, + "step": 425 + }, + { + "epoch": 0.5325, + "grad_norm": 1.8094357252120972, + "learning_rate": 9.350000000000002e-06, + "loss": 2.6157, + "step": 426 + }, + { + "epoch": 0.53375, + "grad_norm": 1.5501965284347534, + "learning_rate": 9.325000000000001e-06, + "loss": 2.5792, + "step": 427 + }, + { + "epoch": 0.535, + "grad_norm": 2.8156535625457764, + "learning_rate": 9.3e-06, + "loss": 2.4914, + "step": 428 + }, + { + "epoch": 0.53625, + "grad_norm": 1.1889698505401611, + "learning_rate": 9.275e-06, + "loss": 2.4051, + "step": 429 + }, + { + "epoch": 0.5375, + "grad_norm": 2.8830533027648926, + "learning_rate": 9.250000000000001e-06, + "loss": 2.5135, + "step": 430 + }, + { + "epoch": 0.53875, + "grad_norm": 1.8553954362869263, + "learning_rate": 9.225e-06, + "loss": 3.0061, + "step": 431 + }, + { + "epoch": 0.54, + "grad_norm": 2.38657283782959, + "learning_rate": 9.200000000000002e-06, + "loss": 2.3366, + "step": 432 + }, + { + "epoch": 0.54125, + "grad_norm": 2.5273995399475098, + "learning_rate": 9.175000000000001e-06, + "loss": 2.3687, + "step": 433 + }, + { + "epoch": 0.5425, + "grad_norm": 2.155266761779785, + "learning_rate": 9.15e-06, + "loss": 3.3555, + "step": 434 + }, + { + "epoch": 0.54375, + "grad_norm": 2.964810371398926, + "learning_rate": 9.125e-06, + "loss": 3.2155, + "step": 435 + }, + { + "epoch": 0.545, + "grad_norm": 1.2716864347457886, + "learning_rate": 9.100000000000001e-06, + "loss": 2.637, + "step": 436 + }, + { + "epoch": 0.54625, + "grad_norm": 1.1982064247131348, + "learning_rate": 9.075e-06, + "loss": 2.5646, + "step": 437 + }, + { + "epoch": 0.5475, + "grad_norm": 3.224846601486206, + "learning_rate": 9.050000000000001e-06, + "loss": 2.0353, + "step": 438 + }, + { + "epoch": 0.54875, + "grad_norm": 2.4795050621032715, + "learning_rate": 9.025e-06, + "loss": 2.4973, + "step": 439 + }, + { + "epoch": 0.55, + "grad_norm": 1.392571210861206, + "learning_rate": 9e-06, + "loss": 1.8671, + "step": 440 + }, + { + "epoch": 0.55125, + "grad_norm": 2.579519748687744, + "learning_rate": 8.975e-06, + "loss": 2.4024, + "step": 441 + }, + { + "epoch": 0.5525, + "grad_norm": 1.491835594177246, + "learning_rate": 8.95e-06, + "loss": 2.8333, + "step": 442 + }, + { + "epoch": 0.55375, + "grad_norm": 1.6103180646896362, + "learning_rate": 8.925e-06, + "loss": 2.4064, + "step": 443 + }, + { + "epoch": 0.555, + "grad_norm": 1.8389121294021606, + "learning_rate": 8.900000000000001e-06, + "loss": 2.8206, + "step": 444 + }, + { + "epoch": 0.55625, + "grad_norm": 1.8887784481048584, + "learning_rate": 8.875e-06, + "loss": 3.0003, + "step": 445 + }, + { + "epoch": 0.5575, + "grad_norm": 3.0405466556549072, + "learning_rate": 8.85e-06, + "loss": 3.1721, + "step": 446 + }, + { + "epoch": 0.55875, + "grad_norm": 1.5504568815231323, + "learning_rate": 8.825000000000001e-06, + "loss": 3.1523, + "step": 447 + }, + { + "epoch": 0.56, + "grad_norm": 2.4831838607788086, + "learning_rate": 8.8e-06, + "loss": 2.9069, + "step": 448 + }, + { + "epoch": 0.56125, + "grad_norm": 1.7960699796676636, + "learning_rate": 8.775e-06, + "loss": 2.3023, + "step": 449 + }, + { + "epoch": 0.5625, + "grad_norm": 2.359325885772705, + "learning_rate": 8.750000000000001e-06, + "loss": 2.7974, + "step": 450 + }, + { + "epoch": 0.56375, + "grad_norm": 3.2360641956329346, + "learning_rate": 8.725000000000002e-06, + "loss": 3.0734, + "step": 451 + }, + { + "epoch": 0.565, + "grad_norm": 1.5362831354141235, + "learning_rate": 8.700000000000001e-06, + "loss": 2.1028, + "step": 452 + }, + { + "epoch": 0.56625, + "grad_norm": 0.892631471157074, + "learning_rate": 8.675e-06, + "loss": 2.0199, + "step": 453 + }, + { + "epoch": 0.5675, + "grad_norm": 2.0016226768493652, + "learning_rate": 8.65e-06, + "loss": 2.5738, + "step": 454 + }, + { + "epoch": 0.56875, + "grad_norm": 1.3834383487701416, + "learning_rate": 8.625000000000001e-06, + "loss": 2.8472, + "step": 455 + }, + { + "epoch": 0.57, + "grad_norm": 2.3233683109283447, + "learning_rate": 8.6e-06, + "loss": 2.653, + "step": 456 + }, + { + "epoch": 0.57125, + "grad_norm": 1.5941835641860962, + "learning_rate": 8.575000000000002e-06, + "loss": 3.1008, + "step": 457 + }, + { + "epoch": 0.5725, + "grad_norm": 1.9550966024398804, + "learning_rate": 8.550000000000001e-06, + "loss": 2.1212, + "step": 458 + }, + { + "epoch": 0.57375, + "grad_norm": 1.5184082984924316, + "learning_rate": 8.525e-06, + "loss": 2.6667, + "step": 459 + }, + { + "epoch": 0.575, + "grad_norm": 0.9546151757240295, + "learning_rate": 8.5e-06, + "loss": 2.6412, + "step": 460 + }, + { + "epoch": 0.57625, + "grad_norm": 1.303501844406128, + "learning_rate": 8.475000000000001e-06, + "loss": 2.4247, + "step": 461 + }, + { + "epoch": 0.5775, + "grad_norm": 1.3831559419631958, + "learning_rate": 8.45e-06, + "loss": 2.315, + "step": 462 + }, + { + "epoch": 0.57875, + "grad_norm": 1.7408896684646606, + "learning_rate": 8.425000000000001e-06, + "loss": 2.9685, + "step": 463 + }, + { + "epoch": 0.58, + "grad_norm": 0.8592466115951538, + "learning_rate": 8.400000000000001e-06, + "loss": 2.3298, + "step": 464 + }, + { + "epoch": 0.58125, + "grad_norm": 1.8565177917480469, + "learning_rate": 8.375e-06, + "loss": 3.0411, + "step": 465 + }, + { + "epoch": 0.5825, + "grad_norm": 1.3421576023101807, + "learning_rate": 8.35e-06, + "loss": 3.0402, + "step": 466 + }, + { + "epoch": 0.58375, + "grad_norm": 2.391731023788452, + "learning_rate": 8.325e-06, + "loss": 2.8541, + "step": 467 + }, + { + "epoch": 0.585, + "grad_norm": 1.367205023765564, + "learning_rate": 8.3e-06, + "loss": 2.5367, + "step": 468 + }, + { + "epoch": 0.58625, + "grad_norm": 2.5906476974487305, + "learning_rate": 8.275000000000001e-06, + "loss": 3.0516, + "step": 469 + }, + { + "epoch": 0.5875, + "grad_norm": 2.569822072982788, + "learning_rate": 8.25e-06, + "loss": 2.2488, + "step": 470 + }, + { + "epoch": 0.58875, + "grad_norm": 1.5829050540924072, + "learning_rate": 8.225e-06, + "loss": 2.4019, + "step": 471 + }, + { + "epoch": 0.59, + "grad_norm": 4.332492828369141, + "learning_rate": 8.2e-06, + "loss": 3.1786, + "step": 472 + }, + { + "epoch": 0.59125, + "grad_norm": 2.3091604709625244, + "learning_rate": 8.175e-06, + "loss": 2.3487, + "step": 473 + }, + { + "epoch": 0.5925, + "grad_norm": 1.708343505859375, + "learning_rate": 8.15e-06, + "loss": 2.5667, + "step": 474 + }, + { + "epoch": 0.59375, + "grad_norm": 1.4315516948699951, + "learning_rate": 8.125000000000001e-06, + "loss": 2.4841, + "step": 475 + }, + { + "epoch": 0.595, + "grad_norm": 1.4762920141220093, + "learning_rate": 8.1e-06, + "loss": 2.5809, + "step": 476 + }, + { + "epoch": 0.59625, + "grad_norm": 1.6080222129821777, + "learning_rate": 8.075000000000001e-06, + "loss": 2.6133, + "step": 477 + }, + { + "epoch": 0.5975, + "grad_norm": 1.0008354187011719, + "learning_rate": 8.050000000000001e-06, + "loss": 2.3204, + "step": 478 + }, + { + "epoch": 0.59875, + "grad_norm": 1.433173656463623, + "learning_rate": 8.025e-06, + "loss": 2.6281, + "step": 479 + }, + { + "epoch": 0.6, + "grad_norm": 1.1544861793518066, + "learning_rate": 8.000000000000001e-06, + "loss": 2.7092, + "step": 480 + }, + { + "epoch": 0.60125, + "grad_norm": 1.2630051374435425, + "learning_rate": 7.975e-06, + "loss": 2.2522, + "step": 481 + }, + { + "epoch": 0.6025, + "grad_norm": 2.1408400535583496, + "learning_rate": 7.950000000000002e-06, + "loss": 3.1307, + "step": 482 + }, + { + "epoch": 0.60375, + "grad_norm": 1.695609211921692, + "learning_rate": 7.925000000000001e-06, + "loss": 2.4184, + "step": 483 + }, + { + "epoch": 0.605, + "grad_norm": 1.8924223184585571, + "learning_rate": 7.9e-06, + "loss": 3.2703, + "step": 484 + }, + { + "epoch": 0.60625, + "grad_norm": 4.110727787017822, + "learning_rate": 7.875e-06, + "loss": 3.0813, + "step": 485 + }, + { + "epoch": 0.6075, + "grad_norm": 1.2760142087936401, + "learning_rate": 7.850000000000001e-06, + "loss": 2.7032, + "step": 486 + }, + { + "epoch": 0.60875, + "grad_norm": 1.2902387380599976, + "learning_rate": 7.825e-06, + "loss": 2.7254, + "step": 487 + }, + { + "epoch": 0.61, + "grad_norm": 2.366107940673828, + "learning_rate": 7.800000000000002e-06, + "loss": 2.8782, + "step": 488 + }, + { + "epoch": 0.61125, + "grad_norm": 2.737114906311035, + "learning_rate": 7.775000000000001e-06, + "loss": 3.5096, + "step": 489 + }, + { + "epoch": 0.6125, + "grad_norm": 4.056971073150635, + "learning_rate": 7.75e-06, + "loss": 3.4208, + "step": 490 + }, + { + "epoch": 0.61375, + "grad_norm": 1.18091881275177, + "learning_rate": 7.725e-06, + "loss": 2.6447, + "step": 491 + }, + { + "epoch": 0.615, + "grad_norm": 5.914666652679443, + "learning_rate": 7.7e-06, + "loss": 4.1583, + "step": 492 + }, + { + "epoch": 0.61625, + "grad_norm": 1.383678913116455, + "learning_rate": 7.675e-06, + "loss": 2.685, + "step": 493 + }, + { + "epoch": 0.6175, + "grad_norm": 1.5389842987060547, + "learning_rate": 7.650000000000001e-06, + "loss": 2.9398, + "step": 494 + }, + { + "epoch": 0.61875, + "grad_norm": 1.1608572006225586, + "learning_rate": 7.625e-06, + "loss": 2.8138, + "step": 495 + }, + { + "epoch": 0.62, + "grad_norm": 1.245866298675537, + "learning_rate": 7.600000000000001e-06, + "loss": 2.2935, + "step": 496 + }, + { + "epoch": 0.62125, + "grad_norm": 1.0834670066833496, + "learning_rate": 7.575e-06, + "loss": 2.3084, + "step": 497 + }, + { + "epoch": 0.6225, + "grad_norm": 1.079750657081604, + "learning_rate": 7.5500000000000006e-06, + "loss": 1.861, + "step": 498 + }, + { + "epoch": 0.62375, + "grad_norm": 1.2369153499603271, + "learning_rate": 7.525e-06, + "loss": 2.3253, + "step": 499 + }, + { + "epoch": 0.625, + "grad_norm": 0.6367318630218506, + "learning_rate": 7.500000000000001e-06, + "loss": 2.1319, + "step": 500 + }, + { + "epoch": 0.62625, + "grad_norm": 1.771266222000122, + "learning_rate": 7.475000000000001e-06, + "loss": 2.6704, + "step": 501 + }, + { + "epoch": 0.6275, + "grad_norm": 2.236402988433838, + "learning_rate": 7.450000000000001e-06, + "loss": 3.2022, + "step": 502 + }, + { + "epoch": 0.62875, + "grad_norm": 1.600723385810852, + "learning_rate": 7.425000000000001e-06, + "loss": 2.4008, + "step": 503 + }, + { + "epoch": 0.63, + "grad_norm": 2.818192720413208, + "learning_rate": 7.4e-06, + "loss": 3.191, + "step": 504 + }, + { + "epoch": 0.63125, + "grad_norm": 2.4682326316833496, + "learning_rate": 7.375000000000001e-06, + "loss": 2.9254, + "step": 505 + }, + { + "epoch": 0.6325, + "grad_norm": 1.8725566864013672, + "learning_rate": 7.350000000000001e-06, + "loss": 2.6224, + "step": 506 + }, + { + "epoch": 0.63375, + "grad_norm": 1.5210118293762207, + "learning_rate": 7.325000000000001e-06, + "loss": 2.2774, + "step": 507 + }, + { + "epoch": 0.635, + "grad_norm": 1.918311357498169, + "learning_rate": 7.3e-06, + "loss": 2.6411, + "step": 508 + }, + { + "epoch": 0.63625, + "grad_norm": 0.7823852896690369, + "learning_rate": 7.275000000000001e-06, + "loss": 2.2566, + "step": 509 + }, + { + "epoch": 0.6375, + "grad_norm": 1.1743433475494385, + "learning_rate": 7.25e-06, + "loss": 2.7283, + "step": 510 + }, + { + "epoch": 0.63875, + "grad_norm": 4.318971633911133, + "learning_rate": 7.225000000000001e-06, + "loss": 3.1266, + "step": 511 + }, + { + "epoch": 0.64, + "grad_norm": 1.3978825807571411, + "learning_rate": 7.2000000000000005e-06, + "loss": 2.3688, + "step": 512 + }, + { + "epoch": 0.64125, + "grad_norm": 1.2514866590499878, + "learning_rate": 7.175000000000001e-06, + "loss": 1.8098, + "step": 513 + }, + { + "epoch": 0.6425, + "grad_norm": 2.5508549213409424, + "learning_rate": 7.15e-06, + "loss": 2.7809, + "step": 514 + }, + { + "epoch": 0.64375, + "grad_norm": 1.093632698059082, + "learning_rate": 7.125e-06, + "loss": 2.6663, + "step": 515 + }, + { + "epoch": 0.645, + "grad_norm": 1.880379557609558, + "learning_rate": 7.100000000000001e-06, + "loss": 2.539, + "step": 516 + }, + { + "epoch": 0.64625, + "grad_norm": 3.1032094955444336, + "learning_rate": 7.075000000000001e-06, + "loss": 2.1864, + "step": 517 + }, + { + "epoch": 0.6475, + "grad_norm": 1.828586459159851, + "learning_rate": 7.05e-06, + "loss": 2.7087, + "step": 518 + }, + { + "epoch": 0.64875, + "grad_norm": 5.738358497619629, + "learning_rate": 7.0250000000000005e-06, + "loss": 3.8585, + "step": 519 + }, + { + "epoch": 0.65, + "grad_norm": 3.2483720779418945, + "learning_rate": 7e-06, + "loss": 2.424, + "step": 520 + }, + { + "epoch": 0.65125, + "grad_norm": 1.6957286596298218, + "learning_rate": 6.975000000000001e-06, + "loss": 2.053, + "step": 521 + }, + { + "epoch": 0.6525, + "grad_norm": 1.2034648656845093, + "learning_rate": 6.95e-06, + "loss": 1.6862, + "step": 522 + }, + { + "epoch": 0.65375, + "grad_norm": 2.2950422763824463, + "learning_rate": 6.925000000000001e-06, + "loss": 2.6543, + "step": 523 + }, + { + "epoch": 0.655, + "grad_norm": 3.466404438018799, + "learning_rate": 6.9e-06, + "loss": 3.5708, + "step": 524 + }, + { + "epoch": 0.65625, + "grad_norm": 1.826366662979126, + "learning_rate": 6.875e-06, + "loss": 2.5286, + "step": 525 + }, + { + "epoch": 0.6575, + "grad_norm": 1.3622981309890747, + "learning_rate": 6.850000000000001e-06, + "loss": 2.3816, + "step": 526 + }, + { + "epoch": 0.65875, + "grad_norm": 1.7400751113891602, + "learning_rate": 6.825000000000001e-06, + "loss": 2.1949, + "step": 527 + }, + { + "epoch": 0.66, + "grad_norm": 1.261826753616333, + "learning_rate": 6.800000000000001e-06, + "loss": 2.1731, + "step": 528 + }, + { + "epoch": 0.66125, + "grad_norm": 1.299221158027649, + "learning_rate": 6.775e-06, + "loss": 2.664, + "step": 529 + }, + { + "epoch": 0.6625, + "grad_norm": 5.387721538543701, + "learning_rate": 6.750000000000001e-06, + "loss": 3.4304, + "step": 530 + }, + { + "epoch": 0.66375, + "grad_norm": 2.0014383792877197, + "learning_rate": 6.725000000000001e-06, + "loss": 2.4377, + "step": 531 + }, + { + "epoch": 0.665, + "grad_norm": 1.8447600603103638, + "learning_rate": 6.700000000000001e-06, + "loss": 3.0142, + "step": 532 + }, + { + "epoch": 0.66625, + "grad_norm": 2.1541025638580322, + "learning_rate": 6.6750000000000005e-06, + "loss": 2.3221, + "step": 533 + }, + { + "epoch": 0.6675, + "grad_norm": 2.365069627761841, + "learning_rate": 6.650000000000001e-06, + "loss": 2.8227, + "step": 534 + }, + { + "epoch": 0.66875, + "grad_norm": 2.2634615898132324, + "learning_rate": 6.625e-06, + "loss": 2.7516, + "step": 535 + }, + { + "epoch": 0.67, + "grad_norm": 2.8768017292022705, + "learning_rate": 6.600000000000001e-06, + "loss": 3.129, + "step": 536 + }, + { + "epoch": 0.67125, + "grad_norm": 2.3933587074279785, + "learning_rate": 6.5750000000000006e-06, + "loss": 2.457, + "step": 537 + }, + { + "epoch": 0.6725, + "grad_norm": 1.068185806274414, + "learning_rate": 6.550000000000001e-06, + "loss": 2.2805, + "step": 538 + }, + { + "epoch": 0.67375, + "grad_norm": 1.4682180881500244, + "learning_rate": 6.525e-06, + "loss": 2.4401, + "step": 539 + }, + { + "epoch": 0.675, + "grad_norm": 2.4375839233398438, + "learning_rate": 6.5000000000000004e-06, + "loss": 3.0402, + "step": 540 + }, + { + "epoch": 0.67625, + "grad_norm": 1.7654716968536377, + "learning_rate": 6.475e-06, + "loss": 2.4122, + "step": 541 + }, + { + "epoch": 0.6775, + "grad_norm": 2.0273666381835938, + "learning_rate": 6.450000000000001e-06, + "loss": 2.7685, + "step": 542 + }, + { + "epoch": 0.67875, + "grad_norm": 2.323498487472534, + "learning_rate": 6.425e-06, + "loss": 2.735, + "step": 543 + }, + { + "epoch": 0.68, + "grad_norm": 1.4310922622680664, + "learning_rate": 6.4000000000000006e-06, + "loss": 2.0557, + "step": 544 + }, + { + "epoch": 0.68125, + "grad_norm": 2.29921817779541, + "learning_rate": 6.375e-06, + "loss": 2.6454, + "step": 545 + }, + { + "epoch": 0.6825, + "grad_norm": 5.133302211761475, + "learning_rate": 6.35e-06, + "loss": 3.3667, + "step": 546 + }, + { + "epoch": 0.68375, + "grad_norm": 1.5254071950912476, + "learning_rate": 6.3250000000000004e-06, + "loss": 2.1213, + "step": 547 + }, + { + "epoch": 0.685, + "grad_norm": 2.4520602226257324, + "learning_rate": 6.300000000000001e-06, + "loss": 2.9527, + "step": 548 + }, + { + "epoch": 0.68625, + "grad_norm": 3.5153932571411133, + "learning_rate": 6.275e-06, + "loss": 2.6369, + "step": 549 + }, + { + "epoch": 0.6875, + "grad_norm": 1.860868215560913, + "learning_rate": 6.25e-06, + "loss": 3.4027, + "step": 550 + }, + { + "epoch": 0.68875, + "grad_norm": 2.164926290512085, + "learning_rate": 6.225000000000001e-06, + "loss": 2.5587, + "step": 551 + }, + { + "epoch": 0.69, + "grad_norm": 2.0053963661193848, + "learning_rate": 6.200000000000001e-06, + "loss": 2.7041, + "step": 552 + }, + { + "epoch": 0.69125, + "grad_norm": 1.467716932296753, + "learning_rate": 6.175000000000001e-06, + "loss": 2.4994, + "step": 553 + }, + { + "epoch": 0.6925, + "grad_norm": 1.1203712224960327, + "learning_rate": 6.15e-06, + "loss": 2.8008, + "step": 554 + }, + { + "epoch": 0.69375, + "grad_norm": 1.4410206079483032, + "learning_rate": 6.125000000000001e-06, + "loss": 2.6894, + "step": 555 + }, + { + "epoch": 0.695, + "grad_norm": 2.3052964210510254, + "learning_rate": 6.1e-06, + "loss": 2.7815, + "step": 556 + }, + { + "epoch": 0.69625, + "grad_norm": 1.6395195722579956, + "learning_rate": 6.075000000000001e-06, + "loss": 2.3493, + "step": 557 + }, + { + "epoch": 0.6975, + "grad_norm": 0.6925020217895508, + "learning_rate": 6.0500000000000005e-06, + "loss": 2.375, + "step": 558 + }, + { + "epoch": 0.69875, + "grad_norm": 2.1917574405670166, + "learning_rate": 6.025000000000001e-06, + "loss": 3.4288, + "step": 559 + }, + { + "epoch": 0.7, + "grad_norm": 1.4762030839920044, + "learning_rate": 6e-06, + "loss": 2.8223, + "step": 560 + }, + { + "epoch": 0.70125, + "grad_norm": 2.2189676761627197, + "learning_rate": 5.975e-06, + "loss": 3.0123, + "step": 561 + }, + { + "epoch": 0.7025, + "grad_norm": 2.046457052230835, + "learning_rate": 5.950000000000001e-06, + "loss": 2.1167, + "step": 562 + }, + { + "epoch": 0.70375, + "grad_norm": 2.0804660320281982, + "learning_rate": 5.925000000000001e-06, + "loss": 2.4246, + "step": 563 + }, + { + "epoch": 0.705, + "grad_norm": 2.419306993484497, + "learning_rate": 5.9e-06, + "loss": 2.2055, + "step": 564 + }, + { + "epoch": 0.70625, + "grad_norm": 1.8702435493469238, + "learning_rate": 5.8750000000000005e-06, + "loss": 2.6608, + "step": 565 + }, + { + "epoch": 0.7075, + "grad_norm": 2.246138572692871, + "learning_rate": 5.85e-06, + "loss": 2.7353, + "step": 566 + }, + { + "epoch": 0.70875, + "grad_norm": 1.2351369857788086, + "learning_rate": 5.825000000000001e-06, + "loss": 2.0033, + "step": 567 + }, + { + "epoch": 0.71, + "grad_norm": 1.4821653366088867, + "learning_rate": 5.8e-06, + "loss": 2.9512, + "step": 568 + }, + { + "epoch": 0.71125, + "grad_norm": 1.1718158721923828, + "learning_rate": 5.775000000000001e-06, + "loss": 2.5153, + "step": 569 + }, + { + "epoch": 0.7125, + "grad_norm": 1.8521480560302734, + "learning_rate": 5.75e-06, + "loss": 3.0076, + "step": 570 + }, + { + "epoch": 0.71375, + "grad_norm": 1.7520173788070679, + "learning_rate": 5.725e-06, + "loss": 2.4602, + "step": 571 + }, + { + "epoch": 0.715, + "grad_norm": 3.2388157844543457, + "learning_rate": 5.7e-06, + "loss": 3.1706, + "step": 572 + }, + { + "epoch": 0.71625, + "grad_norm": 1.5112570524215698, + "learning_rate": 5.675000000000001e-06, + "loss": 2.5468, + "step": 573 + }, + { + "epoch": 0.7175, + "grad_norm": 2.0039563179016113, + "learning_rate": 5.65e-06, + "loss": 2.98, + "step": 574 + }, + { + "epoch": 0.71875, + "grad_norm": 2.5694010257720947, + "learning_rate": 5.625e-06, + "loss": 2.454, + "step": 575 + }, + { + "epoch": 0.72, + "grad_norm": 2.8285837173461914, + "learning_rate": 5.600000000000001e-06, + "loss": 2.4644, + "step": 576 + }, + { + "epoch": 0.72125, + "grad_norm": 1.8699629306793213, + "learning_rate": 5.575000000000001e-06, + "loss": 2.0484, + "step": 577 + }, + { + "epoch": 0.7225, + "grad_norm": 1.4237912893295288, + "learning_rate": 5.550000000000001e-06, + "loss": 2.5569, + "step": 578 + }, + { + "epoch": 0.72375, + "grad_norm": 1.812650442123413, + "learning_rate": 5.5250000000000005e-06, + "loss": 2.8348, + "step": 579 + }, + { + "epoch": 0.725, + "grad_norm": 4.9448137283325195, + "learning_rate": 5.500000000000001e-06, + "loss": 4.1679, + "step": 580 + }, + { + "epoch": 0.72625, + "grad_norm": 3.0024564266204834, + "learning_rate": 5.475e-06, + "loss": 3.1915, + "step": 581 + }, + { + "epoch": 0.7275, + "grad_norm": 1.2082493305206299, + "learning_rate": 5.450000000000001e-06, + "loss": 1.7674, + "step": 582 + }, + { + "epoch": 0.72875, + "grad_norm": 2.0212562084198, + "learning_rate": 5.4250000000000006e-06, + "loss": 2.8302, + "step": 583 + }, + { + "epoch": 0.73, + "grad_norm": 1.7197890281677246, + "learning_rate": 5.400000000000001e-06, + "loss": 2.0274, + "step": 584 + }, + { + "epoch": 0.73125, + "grad_norm": 1.3871604204177856, + "learning_rate": 5.375e-06, + "loss": 2.4914, + "step": 585 + }, + { + "epoch": 0.7325, + "grad_norm": 1.2983068227767944, + "learning_rate": 5.3500000000000004e-06, + "loss": 3.1802, + "step": 586 + }, + { + "epoch": 0.73375, + "grad_norm": 1.3344168663024902, + "learning_rate": 5.325e-06, + "loss": 2.1649, + "step": 587 + }, + { + "epoch": 0.735, + "grad_norm": 1.4151086807250977, + "learning_rate": 5.300000000000001e-06, + "loss": 2.8521, + "step": 588 + }, + { + "epoch": 0.73625, + "grad_norm": 2.3366806507110596, + "learning_rate": 5.275e-06, + "loss": 2.8001, + "step": 589 + }, + { + "epoch": 0.7375, + "grad_norm": 3.088120460510254, + "learning_rate": 5.2500000000000006e-06, + "loss": 2.4478, + "step": 590 + }, + { + "epoch": 0.73875, + "grad_norm": 3.974681854248047, + "learning_rate": 5.225e-06, + "loss": 2.633, + "step": 591 + }, + { + "epoch": 0.74, + "grad_norm": 2.696179151535034, + "learning_rate": 5.2e-06, + "loss": 2.8837, + "step": 592 + }, + { + "epoch": 0.74125, + "grad_norm": 2.7638463973999023, + "learning_rate": 5.1750000000000004e-06, + "loss": 2.4203, + "step": 593 + }, + { + "epoch": 0.7425, + "grad_norm": 1.9097126722335815, + "learning_rate": 5.150000000000001e-06, + "loss": 2.1879, + "step": 594 + }, + { + "epoch": 0.74375, + "grad_norm": 1.0455313920974731, + "learning_rate": 5.125e-06, + "loss": 2.529, + "step": 595 + }, + { + "epoch": 0.745, + "grad_norm": 1.6425269842147827, + "learning_rate": 5.1e-06, + "loss": 3.3192, + "step": 596 + }, + { + "epoch": 0.74625, + "grad_norm": 0.8040964007377625, + "learning_rate": 5.075e-06, + "loss": 2.2812, + "step": 597 + }, + { + "epoch": 0.7475, + "grad_norm": 1.8883963823318481, + "learning_rate": 5.050000000000001e-06, + "loss": 3.0112, + "step": 598 + }, + { + "epoch": 0.74875, + "grad_norm": 1.580411434173584, + "learning_rate": 5.025e-06, + "loss": 2.7145, + "step": 599 + }, + { + "epoch": 0.75, + "grad_norm": 1.2164394855499268, + "learning_rate": 5e-06, + "loss": 2.0817, + "step": 600 + }, + { + "epoch": 0.75125, + "grad_norm": 1.7922728061676025, + "learning_rate": 4.975000000000001e-06, + "loss": 2.4854, + "step": 601 + }, + { + "epoch": 0.7525, + "grad_norm": 1.1481162309646606, + "learning_rate": 4.95e-06, + "loss": 2.5298, + "step": 602 + }, + { + "epoch": 0.75375, + "grad_norm": 1.3136919736862183, + "learning_rate": 4.925e-06, + "loss": 2.4225, + "step": 603 + }, + { + "epoch": 0.755, + "grad_norm": 2.7019455432891846, + "learning_rate": 4.9000000000000005e-06, + "loss": 2.8419, + "step": 604 + }, + { + "epoch": 0.75625, + "grad_norm": 1.661961555480957, + "learning_rate": 4.875e-06, + "loss": 2.097, + "step": 605 + }, + { + "epoch": 0.7575, + "grad_norm": 1.692785620689392, + "learning_rate": 4.85e-06, + "loss": 2.7115, + "step": 606 + }, + { + "epoch": 0.75875, + "grad_norm": 3.2370035648345947, + "learning_rate": 4.825e-06, + "loss": 2.7567, + "step": 607 + }, + { + "epoch": 0.76, + "grad_norm": 0.9687768220901489, + "learning_rate": 4.800000000000001e-06, + "loss": 2.362, + "step": 608 + }, + { + "epoch": 0.76125, + "grad_norm": 1.3951308727264404, + "learning_rate": 4.775e-06, + "loss": 2.2084, + "step": 609 + }, + { + "epoch": 0.7625, + "grad_norm": 2.4143717288970947, + "learning_rate": 4.75e-06, + "loss": 2.3073, + "step": 610 + }, + { + "epoch": 0.76375, + "grad_norm": 1.0696815252304077, + "learning_rate": 4.7250000000000005e-06, + "loss": 1.8974, + "step": 611 + }, + { + "epoch": 0.765, + "grad_norm": 1.8856892585754395, + "learning_rate": 4.7e-06, + "loss": 2.6322, + "step": 612 + }, + { + "epoch": 0.76625, + "grad_norm": 2.168942451477051, + "learning_rate": 4.675000000000001e-06, + "loss": 2.5815, + "step": 613 + }, + { + "epoch": 0.7675, + "grad_norm": 2.5787465572357178, + "learning_rate": 4.65e-06, + "loss": 3.1303, + "step": 614 + }, + { + "epoch": 0.76875, + "grad_norm": 1.4108450412750244, + "learning_rate": 4.625000000000001e-06, + "loss": 2.44, + "step": 615 + }, + { + "epoch": 0.77, + "grad_norm": 1.7232167720794678, + "learning_rate": 4.600000000000001e-06, + "loss": 2.4796, + "step": 616 + }, + { + "epoch": 0.77125, + "grad_norm": 1.3400771617889404, + "learning_rate": 4.575e-06, + "loss": 2.5869, + "step": 617 + }, + { + "epoch": 0.7725, + "grad_norm": 1.1327736377716064, + "learning_rate": 4.5500000000000005e-06, + "loss": 1.856, + "step": 618 + }, + { + "epoch": 0.77375, + "grad_norm": 1.3946475982666016, + "learning_rate": 4.525000000000001e-06, + "loss": 2.4039, + "step": 619 + }, + { + "epoch": 0.775, + "grad_norm": 2.2213644981384277, + "learning_rate": 4.5e-06, + "loss": 2.6722, + "step": 620 + }, + { + "epoch": 0.77625, + "grad_norm": 3.5649960041046143, + "learning_rate": 4.475e-06, + "loss": 2.5282, + "step": 621 + }, + { + "epoch": 0.7775, + "grad_norm": 1.5441259145736694, + "learning_rate": 4.450000000000001e-06, + "loss": 2.6255, + "step": 622 + }, + { + "epoch": 0.77875, + "grad_norm": 1.637509822845459, + "learning_rate": 4.425e-06, + "loss": 2.3554, + "step": 623 + }, + { + "epoch": 0.78, + "grad_norm": 1.0805779695510864, + "learning_rate": 4.4e-06, + "loss": 2.543, + "step": 624 + }, + { + "epoch": 0.78125, + "grad_norm": 2.173614740371704, + "learning_rate": 4.3750000000000005e-06, + "loss": 2.2831, + "step": 625 + }, + { + "epoch": 0.7825, + "grad_norm": 1.3933883905410767, + "learning_rate": 4.350000000000001e-06, + "loss": 2.3316, + "step": 626 + }, + { + "epoch": 0.78375, + "grad_norm": 0.9714035987854004, + "learning_rate": 4.325e-06, + "loss": 2.344, + "step": 627 + }, + { + "epoch": 0.785, + "grad_norm": 1.6793344020843506, + "learning_rate": 4.3e-06, + "loss": 2.5151, + "step": 628 + }, + { + "epoch": 0.78625, + "grad_norm": 1.507856011390686, + "learning_rate": 4.2750000000000006e-06, + "loss": 3.1518, + "step": 629 + }, + { + "epoch": 0.7875, + "grad_norm": 3.8492484092712402, + "learning_rate": 4.25e-06, + "loss": 3.2012, + "step": 630 + }, + { + "epoch": 0.78875, + "grad_norm": 2.8230409622192383, + "learning_rate": 4.225e-06, + "loss": 3.0812, + "step": 631 + }, + { + "epoch": 0.79, + "grad_norm": 1.977073311805725, + "learning_rate": 4.2000000000000004e-06, + "loss": 2.4044, + "step": 632 + }, + { + "epoch": 0.79125, + "grad_norm": 2.227055788040161, + "learning_rate": 4.175e-06, + "loss": 2.9186, + "step": 633 + }, + { + "epoch": 0.7925, + "grad_norm": 3.2962162494659424, + "learning_rate": 4.15e-06, + "loss": 3.5546, + "step": 634 + }, + { + "epoch": 0.79375, + "grad_norm": 1.9414913654327393, + "learning_rate": 4.125e-06, + "loss": 2.7475, + "step": 635 + }, + { + "epoch": 0.795, + "grad_norm": 1.6483408212661743, + "learning_rate": 4.1e-06, + "loss": 3.0241, + "step": 636 + }, + { + "epoch": 0.79625, + "grad_norm": 1.1849037408828735, + "learning_rate": 4.075e-06, + "loss": 2.6205, + "step": 637 + }, + { + "epoch": 0.7975, + "grad_norm": 1.7737408876419067, + "learning_rate": 4.05e-06, + "loss": 2.5812, + "step": 638 + }, + { + "epoch": 0.79875, + "grad_norm": 1.0205885171890259, + "learning_rate": 4.0250000000000004e-06, + "loss": 2.5972, + "step": 639 + }, + { + "epoch": 0.8, + "grad_norm": 0.9625855088233948, + "learning_rate": 4.000000000000001e-06, + "loss": 2.2021, + "step": 640 + }, + { + "epoch": 0.80125, + "grad_norm": 1.3905662298202515, + "learning_rate": 3.975000000000001e-06, + "loss": 2.1898, + "step": 641 + }, + { + "epoch": 0.8025, + "grad_norm": 2.1168220043182373, + "learning_rate": 3.95e-06, + "loss": 2.866, + "step": 642 + }, + { + "epoch": 0.80375, + "grad_norm": 3.0257530212402344, + "learning_rate": 3.9250000000000005e-06, + "loss": 2.8517, + "step": 643 + }, + { + "epoch": 0.805, + "grad_norm": 2.563762903213501, + "learning_rate": 3.900000000000001e-06, + "loss": 2.3336, + "step": 644 + }, + { + "epoch": 0.80625, + "grad_norm": 3.3193588256835938, + "learning_rate": 3.875e-06, + "loss": 3.1035, + "step": 645 + }, + { + "epoch": 0.8075, + "grad_norm": 2.2538487911224365, + "learning_rate": 3.85e-06, + "loss": 2.8489, + "step": 646 + }, + { + "epoch": 0.80875, + "grad_norm": 1.6400374174118042, + "learning_rate": 3.825000000000001e-06, + "loss": 2.7454, + "step": 647 + }, + { + "epoch": 0.81, + "grad_norm": 1.152010202407837, + "learning_rate": 3.8000000000000005e-06, + "loss": 2.5955, + "step": 648 + }, + { + "epoch": 0.81125, + "grad_norm": 1.100693941116333, + "learning_rate": 3.7750000000000003e-06, + "loss": 1.9502, + "step": 649 + }, + { + "epoch": 0.8125, + "grad_norm": 1.8576849699020386, + "learning_rate": 3.7500000000000005e-06, + "loss": 2.7005, + "step": 650 + }, + { + "epoch": 0.81375, + "grad_norm": 2.48925518989563, + "learning_rate": 3.7250000000000003e-06, + "loss": 2.7473, + "step": 651 + }, + { + "epoch": 0.815, + "grad_norm": 1.4268637895584106, + "learning_rate": 3.7e-06, + "loss": 2.4413, + "step": 652 + }, + { + "epoch": 0.81625, + "grad_norm": 0.8600934743881226, + "learning_rate": 3.6750000000000004e-06, + "loss": 2.7305, + "step": 653 + }, + { + "epoch": 0.8175, + "grad_norm": 1.7161887884140015, + "learning_rate": 3.65e-06, + "loss": 2.0339, + "step": 654 + }, + { + "epoch": 0.81875, + "grad_norm": 2.643136739730835, + "learning_rate": 3.625e-06, + "loss": 2.7862, + "step": 655 + }, + { + "epoch": 0.82, + "grad_norm": 1.1652709245681763, + "learning_rate": 3.6000000000000003e-06, + "loss": 2.3251, + "step": 656 + }, + { + "epoch": 0.82125, + "grad_norm": 1.3739567995071411, + "learning_rate": 3.575e-06, + "loss": 2.5092, + "step": 657 + }, + { + "epoch": 0.8225, + "grad_norm": 1.4893128871917725, + "learning_rate": 3.5500000000000003e-06, + "loss": 2.2714, + "step": 658 + }, + { + "epoch": 0.82375, + "grad_norm": 0.9546101093292236, + "learning_rate": 3.525e-06, + "loss": 2.106, + "step": 659 + }, + { + "epoch": 0.825, + "grad_norm": 1.596313238143921, + "learning_rate": 3.5e-06, + "loss": 2.9919, + "step": 660 + }, + { + "epoch": 0.82625, + "grad_norm": 1.6426167488098145, + "learning_rate": 3.475e-06, + "loss": 2.2044, + "step": 661 + }, + { + "epoch": 0.8275, + "grad_norm": 1.0089974403381348, + "learning_rate": 3.45e-06, + "loss": 2.5872, + "step": 662 + }, + { + "epoch": 0.82875, + "grad_norm": 1.9362369775772095, + "learning_rate": 3.4250000000000007e-06, + "loss": 2.5531, + "step": 663 + }, + { + "epoch": 0.83, + "grad_norm": 0.6755287051200867, + "learning_rate": 3.4000000000000005e-06, + "loss": 1.9512, + "step": 664 + }, + { + "epoch": 0.83125, + "grad_norm": 1.8554205894470215, + "learning_rate": 3.3750000000000003e-06, + "loss": 2.5813, + "step": 665 + }, + { + "epoch": 0.8325, + "grad_norm": 2.031695604324341, + "learning_rate": 3.3500000000000005e-06, + "loss": 2.6638, + "step": 666 + }, + { + "epoch": 0.83375, + "grad_norm": 1.817267656326294, + "learning_rate": 3.3250000000000004e-06, + "loss": 2.7071, + "step": 667 + }, + { + "epoch": 0.835, + "grad_norm": 1.8573476076126099, + "learning_rate": 3.3000000000000006e-06, + "loss": 2.9174, + "step": 668 + }, + { + "epoch": 0.83625, + "grad_norm": 1.9300901889801025, + "learning_rate": 3.2750000000000004e-06, + "loss": 3.0282, + "step": 669 + }, + { + "epoch": 0.8375, + "grad_norm": 0.8869524598121643, + "learning_rate": 3.2500000000000002e-06, + "loss": 2.4725, + "step": 670 + }, + { + "epoch": 0.83875, + "grad_norm": 1.1387746334075928, + "learning_rate": 3.2250000000000005e-06, + "loss": 2.3742, + "step": 671 + }, + { + "epoch": 0.84, + "grad_norm": 2.481876850128174, + "learning_rate": 3.2000000000000003e-06, + "loss": 2.567, + "step": 672 + }, + { + "epoch": 0.84125, + "grad_norm": 1.3130849599838257, + "learning_rate": 3.175e-06, + "loss": 2.1865, + "step": 673 + }, + { + "epoch": 0.8425, + "grad_norm": 1.5404133796691895, + "learning_rate": 3.1500000000000003e-06, + "loss": 2.6415, + "step": 674 + }, + { + "epoch": 0.84375, + "grad_norm": 2.30031418800354, + "learning_rate": 3.125e-06, + "loss": 2.6346, + "step": 675 + }, + { + "epoch": 0.845, + "grad_norm": 1.8086575269699097, + "learning_rate": 3.1000000000000004e-06, + "loss": 2.1789, + "step": 676 + }, + { + "epoch": 0.84625, + "grad_norm": 2.2194817066192627, + "learning_rate": 3.075e-06, + "loss": 2.048, + "step": 677 + }, + { + "epoch": 0.8475, + "grad_norm": 1.6533406972885132, + "learning_rate": 3.05e-06, + "loss": 2.3017, + "step": 678 + }, + { + "epoch": 0.84875, + "grad_norm": 4.319630146026611, + "learning_rate": 3.0250000000000003e-06, + "loss": 3.3279, + "step": 679 + }, + { + "epoch": 0.85, + "grad_norm": 1.66692316532135, + "learning_rate": 3e-06, + "loss": 2.6651, + "step": 680 + }, + { + "epoch": 0.85125, + "grad_norm": 0.786642849445343, + "learning_rate": 2.9750000000000003e-06, + "loss": 1.92, + "step": 681 + }, + { + "epoch": 0.8525, + "grad_norm": 1.6372087001800537, + "learning_rate": 2.95e-06, + "loss": 2.394, + "step": 682 + }, + { + "epoch": 0.85375, + "grad_norm": 1.3699547052383423, + "learning_rate": 2.925e-06, + "loss": 2.5697, + "step": 683 + }, + { + "epoch": 0.855, + "grad_norm": 2.5320935249328613, + "learning_rate": 2.9e-06, + "loss": 2.8745, + "step": 684 + }, + { + "epoch": 0.85625, + "grad_norm": 2.3555850982666016, + "learning_rate": 2.875e-06, + "loss": 2.7129, + "step": 685 + }, + { + "epoch": 0.8575, + "grad_norm": 2.2113137245178223, + "learning_rate": 2.85e-06, + "loss": 3.0649, + "step": 686 + }, + { + "epoch": 0.85875, + "grad_norm": 2.225691318511963, + "learning_rate": 2.825e-06, + "loss": 2.1795, + "step": 687 + }, + { + "epoch": 0.86, + "grad_norm": 1.7828199863433838, + "learning_rate": 2.8000000000000003e-06, + "loss": 2.5819, + "step": 688 + }, + { + "epoch": 0.86125, + "grad_norm": 1.4047306776046753, + "learning_rate": 2.7750000000000005e-06, + "loss": 2.4888, + "step": 689 + }, + { + "epoch": 0.8625, + "grad_norm": 3.13736891746521, + "learning_rate": 2.7500000000000004e-06, + "loss": 3.2848, + "step": 690 + }, + { + "epoch": 0.86375, + "grad_norm": 1.6474286317825317, + "learning_rate": 2.7250000000000006e-06, + "loss": 2.9487, + "step": 691 + }, + { + "epoch": 0.865, + "grad_norm": 1.1393457651138306, + "learning_rate": 2.7000000000000004e-06, + "loss": 2.6245, + "step": 692 + }, + { + "epoch": 0.86625, + "grad_norm": 2.4159979820251465, + "learning_rate": 2.6750000000000002e-06, + "loss": 2.5928, + "step": 693 + }, + { + "epoch": 0.8675, + "grad_norm": 2.7899107933044434, + "learning_rate": 2.6500000000000005e-06, + "loss": 3.0244, + "step": 694 + }, + { + "epoch": 0.86875, + "grad_norm": 1.7800438404083252, + "learning_rate": 2.6250000000000003e-06, + "loss": 1.7862, + "step": 695 + }, + { + "epoch": 0.87, + "grad_norm": 2.111862897872925, + "learning_rate": 2.6e-06, + "loss": 2.8517, + "step": 696 + }, + { + "epoch": 0.87125, + "grad_norm": 2.0922229290008545, + "learning_rate": 2.5750000000000003e-06, + "loss": 2.2728, + "step": 697 + }, + { + "epoch": 0.8725, + "grad_norm": 1.8772225379943848, + "learning_rate": 2.55e-06, + "loss": 2.3808, + "step": 698 + }, + { + "epoch": 0.87375, + "grad_norm": 1.4865831136703491, + "learning_rate": 2.5250000000000004e-06, + "loss": 2.814, + "step": 699 + }, + { + "epoch": 0.875, + "grad_norm": 2.728365182876587, + "learning_rate": 2.5e-06, + "loss": 3.2643, + "step": 700 + }, + { + "epoch": 0.87625, + "grad_norm": 2.335869789123535, + "learning_rate": 2.475e-06, + "loss": 2.5471, + "step": 701 + }, + { + "epoch": 0.8775, + "grad_norm": 2.8404269218444824, + "learning_rate": 2.4500000000000003e-06, + "loss": 3.1107, + "step": 702 + }, + { + "epoch": 0.87875, + "grad_norm": 2.249152898788452, + "learning_rate": 2.425e-06, + "loss": 2.2618, + "step": 703 + }, + { + "epoch": 0.88, + "grad_norm": 2.662062168121338, + "learning_rate": 2.4000000000000003e-06, + "loss": 2.6716, + "step": 704 + }, + { + "epoch": 0.88125, + "grad_norm": 0.9941419363021851, + "learning_rate": 2.375e-06, + "loss": 2.3987, + "step": 705 + }, + { + "epoch": 0.8825, + "grad_norm": 0.5474485754966736, + "learning_rate": 2.35e-06, + "loss": 2.429, + "step": 706 + }, + { + "epoch": 0.88375, + "grad_norm": 1.473949909210205, + "learning_rate": 2.325e-06, + "loss": 2.5365, + "step": 707 + }, + { + "epoch": 0.885, + "grad_norm": 1.6973087787628174, + "learning_rate": 2.3000000000000004e-06, + "loss": 2.1838, + "step": 708 + }, + { + "epoch": 0.88625, + "grad_norm": 1.3658370971679688, + "learning_rate": 2.2750000000000002e-06, + "loss": 2.7352, + "step": 709 + }, + { + "epoch": 0.8875, + "grad_norm": 1.7037274837493896, + "learning_rate": 2.25e-06, + "loss": 2.4561, + "step": 710 + }, + { + "epoch": 0.88875, + "grad_norm": 1.5743016004562378, + "learning_rate": 2.2250000000000003e-06, + "loss": 2.4667, + "step": 711 + }, + { + "epoch": 0.89, + "grad_norm": 2.2770535945892334, + "learning_rate": 2.2e-06, + "loss": 2.315, + "step": 712 + }, + { + "epoch": 0.89125, + "grad_norm": 1.4078290462493896, + "learning_rate": 2.1750000000000004e-06, + "loss": 1.8871, + "step": 713 + }, + { + "epoch": 0.8925, + "grad_norm": 4.485980987548828, + "learning_rate": 2.15e-06, + "loss": 3.6076, + "step": 714 + }, + { + "epoch": 0.89375, + "grad_norm": 1.9531930685043335, + "learning_rate": 2.125e-06, + "loss": 2.9939, + "step": 715 + }, + { + "epoch": 0.895, + "grad_norm": 1.6232836246490479, + "learning_rate": 2.1000000000000002e-06, + "loss": 2.7136, + "step": 716 + }, + { + "epoch": 0.89625, + "grad_norm": 1.2092350721359253, + "learning_rate": 2.075e-06, + "loss": 2.693, + "step": 717 + }, + { + "epoch": 0.8975, + "grad_norm": 3.6011886596679688, + "learning_rate": 2.05e-06, + "loss": 2.6018, + "step": 718 + }, + { + "epoch": 0.89875, + "grad_norm": 1.6210492849349976, + "learning_rate": 2.025e-06, + "loss": 2.3993, + "step": 719 + }, + { + "epoch": 0.9, + "grad_norm": 3.093855142593384, + "learning_rate": 2.0000000000000003e-06, + "loss": 2.8579, + "step": 720 + }, + { + "epoch": 0.90125, + "grad_norm": 1.624720811843872, + "learning_rate": 1.975e-06, + "loss": 2.5612, + "step": 721 + }, + { + "epoch": 0.9025, + "grad_norm": 2.27695631980896, + "learning_rate": 1.9500000000000004e-06, + "loss": 1.6602, + "step": 722 + }, + { + "epoch": 0.90375, + "grad_norm": 0.9946104884147644, + "learning_rate": 1.925e-06, + "loss": 2.2992, + "step": 723 + }, + { + "epoch": 0.905, + "grad_norm": 1.5145623683929443, + "learning_rate": 1.9000000000000002e-06, + "loss": 2.9268, + "step": 724 + }, + { + "epoch": 0.90625, + "grad_norm": 1.1302818059921265, + "learning_rate": 1.8750000000000003e-06, + "loss": 2.3965, + "step": 725 + }, + { + "epoch": 0.9075, + "grad_norm": 1.6978087425231934, + "learning_rate": 1.85e-06, + "loss": 2.4108, + "step": 726 + }, + { + "epoch": 0.90875, + "grad_norm": 2.2601537704467773, + "learning_rate": 1.825e-06, + "loss": 2.3154, + "step": 727 + }, + { + "epoch": 0.91, + "grad_norm": 1.95534348487854, + "learning_rate": 1.8000000000000001e-06, + "loss": 2.6331, + "step": 728 + }, + { + "epoch": 0.91125, + "grad_norm": 1.851906418800354, + "learning_rate": 1.7750000000000002e-06, + "loss": 2.5358, + "step": 729 + }, + { + "epoch": 0.9125, + "grad_norm": 1.785172700881958, + "learning_rate": 1.75e-06, + "loss": 1.9176, + "step": 730 + }, + { + "epoch": 0.91375, + "grad_norm": 1.4503031969070435, + "learning_rate": 1.725e-06, + "loss": 2.1631, + "step": 731 + }, + { + "epoch": 0.915, + "grad_norm": 1.4128133058547974, + "learning_rate": 1.7000000000000002e-06, + "loss": 2.4415, + "step": 732 + }, + { + "epoch": 0.91625, + "grad_norm": 0.8669462203979492, + "learning_rate": 1.6750000000000003e-06, + "loss": 2.3275, + "step": 733 + }, + { + "epoch": 0.9175, + "grad_norm": 2.6948771476745605, + "learning_rate": 1.6500000000000003e-06, + "loss": 2.0348, + "step": 734 + }, + { + "epoch": 0.91875, + "grad_norm": 1.6759194135665894, + "learning_rate": 1.6250000000000001e-06, + "loss": 2.2684, + "step": 735 + }, + { + "epoch": 0.92, + "grad_norm": 1.6948565244674683, + "learning_rate": 1.6000000000000001e-06, + "loss": 2.416, + "step": 736 + }, + { + "epoch": 0.92125, + "grad_norm": 2.1370370388031006, + "learning_rate": 1.5750000000000002e-06, + "loss": 1.907, + "step": 737 + }, + { + "epoch": 0.9225, + "grad_norm": 2.4928674697875977, + "learning_rate": 1.5500000000000002e-06, + "loss": 2.2468, + "step": 738 + }, + { + "epoch": 0.92375, + "grad_norm": 1.3928344249725342, + "learning_rate": 1.525e-06, + "loss": 2.9909, + "step": 739 + }, + { + "epoch": 0.925, + "grad_norm": 0.9298745393753052, + "learning_rate": 1.5e-06, + "loss": 2.2005, + "step": 740 + }, + { + "epoch": 0.92625, + "grad_norm": 1.4485491514205933, + "learning_rate": 1.475e-06, + "loss": 2.1133, + "step": 741 + }, + { + "epoch": 0.9275, + "grad_norm": 1.4372122287750244, + "learning_rate": 1.45e-06, + "loss": 2.923, + "step": 742 + }, + { + "epoch": 0.92875, + "grad_norm": 0.9330675005912781, + "learning_rate": 1.425e-06, + "loss": 2.4085, + "step": 743 + }, + { + "epoch": 0.93, + "grad_norm": 2.9367544651031494, + "learning_rate": 1.4000000000000001e-06, + "loss": 2.8133, + "step": 744 + }, + { + "epoch": 0.93125, + "grad_norm": 2.0561740398406982, + "learning_rate": 1.3750000000000002e-06, + "loss": 2.6545, + "step": 745 + }, + { + "epoch": 0.9325, + "grad_norm": 3.575596332550049, + "learning_rate": 1.3500000000000002e-06, + "loss": 2.4606, + "step": 746 + }, + { + "epoch": 0.93375, + "grad_norm": 1.240574836730957, + "learning_rate": 1.3250000000000002e-06, + "loss": 2.7273, + "step": 747 + }, + { + "epoch": 0.935, + "grad_norm": 1.4624780416488647, + "learning_rate": 1.3e-06, + "loss": 2.06, + "step": 748 + }, + { + "epoch": 0.93625, + "grad_norm": 4.1101460456848145, + "learning_rate": 1.275e-06, + "loss": 3.6805, + "step": 749 + }, + { + "epoch": 0.9375, + "grad_norm": 1.3366179466247559, + "learning_rate": 1.25e-06, + "loss": 2.5252, + "step": 750 + }, + { + "epoch": 0.93875, + "grad_norm": 1.6742165088653564, + "learning_rate": 1.2250000000000001e-06, + "loss": 2.4546, + "step": 751 + }, + { + "epoch": 0.94, + "grad_norm": 1.976860523223877, + "learning_rate": 1.2000000000000002e-06, + "loss": 2.5712, + "step": 752 + }, + { + "epoch": 0.94125, + "grad_norm": 2.7342398166656494, + "learning_rate": 1.175e-06, + "loss": 2.8187, + "step": 753 + }, + { + "epoch": 0.9425, + "grad_norm": 1.6856083869934082, + "learning_rate": 1.1500000000000002e-06, + "loss": 2.2563, + "step": 754 + }, + { + "epoch": 0.94375, + "grad_norm": 1.8753656148910522, + "learning_rate": 1.125e-06, + "loss": 2.6912, + "step": 755 + }, + { + "epoch": 0.945, + "grad_norm": 1.1665239334106445, + "learning_rate": 1.1e-06, + "loss": 2.57, + "step": 756 + }, + { + "epoch": 0.94625, + "grad_norm": 0.815209150314331, + "learning_rate": 1.075e-06, + "loss": 2.1256, + "step": 757 + }, + { + "epoch": 0.9475, + "grad_norm": 4.842214107513428, + "learning_rate": 1.0500000000000001e-06, + "loss": 2.4971, + "step": 758 + }, + { + "epoch": 0.94875, + "grad_norm": 2.0265555381774902, + "learning_rate": 1.025e-06, + "loss": 2.134, + "step": 759 + }, + { + "epoch": 0.95, + "grad_norm": 2.723125696182251, + "learning_rate": 1.0000000000000002e-06, + "loss": 2.7601, + "step": 760 + }, + { + "epoch": 0.95125, + "grad_norm": 0.8349653482437134, + "learning_rate": 9.750000000000002e-07, + "loss": 2.0944, + "step": 761 + }, + { + "epoch": 0.9525, + "grad_norm": 2.7524056434631348, + "learning_rate": 9.500000000000001e-07, + "loss": 2.9711, + "step": 762 + }, + { + "epoch": 0.95375, + "grad_norm": 3.6746034622192383, + "learning_rate": 9.25e-07, + "loss": 2.6018, + "step": 763 + }, + { + "epoch": 0.955, + "grad_norm": 4.50912618637085, + "learning_rate": 9.000000000000001e-07, + "loss": 3.2356, + "step": 764 + }, + { + "epoch": 0.95625, + "grad_norm": 2.569899797439575, + "learning_rate": 8.75e-07, + "loss": 2.3839, + "step": 765 + }, + { + "epoch": 0.9575, + "grad_norm": 2.2566895484924316, + "learning_rate": 8.500000000000001e-07, + "loss": 3.053, + "step": 766 + }, + { + "epoch": 0.95875, + "grad_norm": 1.2487592697143555, + "learning_rate": 8.250000000000001e-07, + "loss": 2.8636, + "step": 767 + }, + { + "epoch": 0.96, + "grad_norm": 2.2288849353790283, + "learning_rate": 8.000000000000001e-07, + "loss": 1.9255, + "step": 768 + }, + { + "epoch": 0.96125, + "grad_norm": 2.1028974056243896, + "learning_rate": 7.750000000000001e-07, + "loss": 2.5616, + "step": 769 + }, + { + "epoch": 0.9625, + "grad_norm": 1.2116632461547852, + "learning_rate": 7.5e-07, + "loss": 2.56, + "step": 770 + }, + { + "epoch": 0.96375, + "grad_norm": 2.2025723457336426, + "learning_rate": 7.25e-07, + "loss": 3.1438, + "step": 771 + }, + { + "epoch": 0.965, + "grad_norm": 2.4728424549102783, + "learning_rate": 7.000000000000001e-07, + "loss": 2.478, + "step": 772 + }, + { + "epoch": 0.96625, + "grad_norm": 1.092957615852356, + "learning_rate": 6.750000000000001e-07, + "loss": 2.288, + "step": 773 + }, + { + "epoch": 0.9675, + "grad_norm": 1.0330168008804321, + "learning_rate": 6.5e-07, + "loss": 2.4745, + "step": 774 + }, + { + "epoch": 0.96875, + "grad_norm": 1.3029001951217651, + "learning_rate": 6.25e-07, + "loss": 2.3561, + "step": 775 + }, + { + "epoch": 0.97, + "grad_norm": 2.4613988399505615, + "learning_rate": 6.000000000000001e-07, + "loss": 2.4335, + "step": 776 + }, + { + "epoch": 0.97125, + "grad_norm": 1.6901812553405762, + "learning_rate": 5.750000000000001e-07, + "loss": 2.2362, + "step": 777 + }, + { + "epoch": 0.9725, + "grad_norm": 2.4732651710510254, + "learning_rate": 5.5e-07, + "loss": 2.6915, + "step": 778 + }, + { + "epoch": 0.97375, + "grad_norm": 1.1935354471206665, + "learning_rate": 5.250000000000001e-07, + "loss": 2.3641, + "step": 779 + }, + { + "epoch": 0.975, + "grad_norm": 1.6522023677825928, + "learning_rate": 5.000000000000001e-07, + "loss": 2.3062, + "step": 780 + }, + { + "epoch": 0.97625, + "grad_norm": 1.62522554397583, + "learning_rate": 4.7500000000000006e-07, + "loss": 2.3801, + "step": 781 + }, + { + "epoch": 0.9775, + "grad_norm": 14.57507610321045, + "learning_rate": 4.5000000000000003e-07, + "loss": 2.5558, + "step": 782 + }, + { + "epoch": 0.97875, + "grad_norm": 3.2062268257141113, + "learning_rate": 4.2500000000000006e-07, + "loss": 2.9463, + "step": 783 + }, + { + "epoch": 0.98, + "grad_norm": 2.024563789367676, + "learning_rate": 4.0000000000000003e-07, + "loss": 1.9367, + "step": 784 + }, + { + "epoch": 0.98125, + "grad_norm": 2.408194065093994, + "learning_rate": 3.75e-07, + "loss": 2.5923, + "step": 785 + }, + { + "epoch": 0.9825, + "grad_norm": 1.4996728897094727, + "learning_rate": 3.5000000000000004e-07, + "loss": 2.7131, + "step": 786 + }, + { + "epoch": 0.98375, + "grad_norm": 0.4530770182609558, + "learning_rate": 3.25e-07, + "loss": 1.6141, + "step": 787 + }, + { + "epoch": 0.985, + "grad_norm": 1.9080215692520142, + "learning_rate": 3.0000000000000004e-07, + "loss": 2.8874, + "step": 788 + }, + { + "epoch": 0.98625, + "grad_norm": 5.6946635246276855, + "learning_rate": 2.75e-07, + "loss": 3.0954, + "step": 789 + }, + { + "epoch": 0.9875, + "grad_norm": 1.8404529094696045, + "learning_rate": 2.5000000000000004e-07, + "loss": 2.5155, + "step": 790 + }, + { + "epoch": 0.98875, + "grad_norm": 1.1621073484420776, + "learning_rate": 2.2500000000000002e-07, + "loss": 2.2214, + "step": 791 + }, + { + "epoch": 0.99, + "grad_norm": 4.576001167297363, + "learning_rate": 2.0000000000000002e-07, + "loss": 3.3722, + "step": 792 + }, + { + "epoch": 0.99125, + "grad_norm": 2.642176866531372, + "learning_rate": 1.7500000000000002e-07, + "loss": 3.2569, + "step": 793 + }, + { + "epoch": 0.9925, + "grad_norm": 2.1654837131500244, + "learning_rate": 1.5000000000000002e-07, + "loss": 2.2922, + "step": 794 + }, + { + "epoch": 0.99375, + "grad_norm": 1.3945523500442505, + "learning_rate": 1.2500000000000002e-07, + "loss": 2.2981, + "step": 795 + }, + { + "epoch": 0.995, + "grad_norm": 1.6085453033447266, + "learning_rate": 1.0000000000000001e-07, + "loss": 2.4976, + "step": 796 + }, + { + "epoch": 0.99625, + "grad_norm": 3.1184237003326416, + "learning_rate": 7.500000000000001e-08, + "loss": 2.3142, + "step": 797 + }, + { + "epoch": 0.9975, + "grad_norm": 2.5621306896209717, + "learning_rate": 5.0000000000000004e-08, + "loss": 2.8623, + "step": 798 + }, + { + "epoch": 0.99875, + "grad_norm": 3.707613468170166, + "learning_rate": 2.5000000000000002e-08, + "loss": 3.019, + "step": 799 + }, + { + "epoch": 1.0, + "grad_norm": 1.426806926727295, + "learning_rate": 0.0, + "loss": 2.7174, + "step": 800 + }, + { + "epoch": 1.0, + "eval_loss": 2.631765127182007, + "eval_runtime": 130.0102, + "eval_samples_per_second": 1.538, + "eval_steps_per_second": 1.538, + "step": 800 + } + ], + "logging_steps": 1, + "max_steps": 800, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 8.0586892345344e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}