{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.01, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1e-05, "grad_norm": 1.474351326504247, "learning_rate": 3e-06, "loss": 10.8527, "step": 1 }, { "epoch": 2e-05, "grad_norm": 1.4679683222098754, "learning_rate": 6e-06, "loss": 10.8518, "step": 2 }, { "epoch": 3e-05, "grad_norm": 1.4778785546190916, "learning_rate": 9e-06, "loss": 10.8528, "step": 3 }, { "epoch": 4e-05, "grad_norm": 1.4606433182728087, "learning_rate": 1.2e-05, "loss": 10.8514, "step": 4 }, { "epoch": 5e-05, "grad_norm": 1.495032790614139, "learning_rate": 1.5e-05, "loss": 10.8475, "step": 5 }, { "epoch": 6e-05, "grad_norm": 1.4922781628880415, "learning_rate": 1.8e-05, "loss": 10.8456, "step": 6 }, { "epoch": 7e-05, "grad_norm": 1.4624391310631026, "learning_rate": 2.1000000000000002e-05, "loss": 10.8332, "step": 7 }, { "epoch": 8e-05, "grad_norm": 1.3590231342483396, "learning_rate": 2.4e-05, "loss": 10.8048, "step": 8 }, { "epoch": 9e-05, "grad_norm": 1.3125465506408707, "learning_rate": 2.7e-05, "loss": 10.7995, "step": 9 }, { "epoch": 0.0001, "grad_norm": 1.2786530848646291, "learning_rate": 3e-05, "loss": 10.7845, "step": 10 }, { "epoch": 0.00011, "grad_norm": 1.173935963529719, "learning_rate": 3.2999999999999996e-05, "loss": 10.7675, "step": 11 }, { "epoch": 0.00012, "grad_norm": 1.1421003663969806, "learning_rate": 3.6e-05, "loss": 10.7555, "step": 12 }, { "epoch": 0.00013, "grad_norm": 1.0874053290178312, "learning_rate": 3.9e-05, "loss": 10.7368, "step": 13 }, { "epoch": 0.00014, "grad_norm": 1.0659709149955685, "learning_rate": 4.2000000000000004e-05, "loss": 10.725, "step": 14 }, { "epoch": 0.00015, "grad_norm": 1.0504212767689105, "learning_rate": 4.4999999999999996e-05, "loss": 10.7164, "step": 15 }, { "epoch": 0.00016, "grad_norm": 1.0094105445485244, "learning_rate": 4.8e-05, "loss": 10.6992, "step": 16 }, { "epoch": 0.00017, "grad_norm": 0.9821324312557077, "learning_rate": 5.1000000000000006e-05, "loss": 10.6847, "step": 17 }, { "epoch": 0.00018, "grad_norm": 0.9678398856931318, "learning_rate": 5.4e-05, "loss": 10.6682, "step": 18 }, { "epoch": 0.00019, "grad_norm": 0.9461041060244328, "learning_rate": 5.7e-05, "loss": 10.6548, "step": 19 }, { "epoch": 0.0002, "grad_norm": 0.9477034430151614, "learning_rate": 6e-05, "loss": 10.643, "step": 20 }, { "epoch": 0.00021, "grad_norm": 0.9315183579373711, "learning_rate": 6.3e-05, "loss": 10.6292, "step": 21 }, { "epoch": 0.00022, "grad_norm": 0.9306702069422275, "learning_rate": 6.599999999999999e-05, "loss": 10.6138, "step": 22 }, { "epoch": 0.00023, "grad_norm": 0.9273403280850633, "learning_rate": 6.9e-05, "loss": 10.6009, "step": 23 }, { "epoch": 0.00024, "grad_norm": 0.9220363741136194, "learning_rate": 7.2e-05, "loss": 10.5875, "step": 24 }, { "epoch": 0.00025, "grad_norm": 0.9201767792857789, "learning_rate": 7.500000000000001e-05, "loss": 10.5733, "step": 25 }, { "epoch": 0.00026, "grad_norm": 0.9175580667413662, "learning_rate": 7.8e-05, "loss": 10.5584, "step": 26 }, { "epoch": 0.00027, "grad_norm": 0.9079543629096309, "learning_rate": 8.1e-05, "loss": 10.5463, "step": 27 }, { "epoch": 0.00028, "grad_norm": 0.9048806062272016, "learning_rate": 8.400000000000001e-05, "loss": 10.5323, "step": 28 }, { "epoch": 0.00029, "grad_norm": 0.9080906603458408, "learning_rate": 8.7e-05, "loss": 10.5179, "step": 29 }, { "epoch": 0.0003, "grad_norm": 0.9143257896829334, "learning_rate": 8.999999999999999e-05, "loss": 10.5023, "step": 30 }, { "epoch": 0.00031, "grad_norm": 0.9133459401307438, "learning_rate": 9.3e-05, "loss": 10.4873, "step": 31 }, { "epoch": 0.00032, "grad_norm": 0.9088988454650742, "learning_rate": 9.6e-05, "loss": 10.4732, "step": 32 }, { "epoch": 0.00033, "grad_norm": 0.9091480228096109, "learning_rate": 9.900000000000001e-05, "loss": 10.4568, "step": 33 }, { "epoch": 0.00034, "grad_norm": 0.9093347160978491, "learning_rate": 0.00010200000000000001, "loss": 10.4401, "step": 34 }, { "epoch": 0.00035, "grad_norm": 0.9151059868353938, "learning_rate": 0.00010500000000000002, "loss": 10.4221, "step": 35 }, { "epoch": 0.00036, "grad_norm": 0.9109062872419206, "learning_rate": 0.000108, "loss": 10.4047, "step": 36 }, { "epoch": 0.00037, "grad_norm": 0.9005574122536919, "learning_rate": 0.000111, "loss": 10.3878, "step": 37 }, { "epoch": 0.00038, "grad_norm": 0.9059566976246669, "learning_rate": 0.000114, "loss": 10.3677, "step": 38 }, { "epoch": 0.00039, "grad_norm": 0.909004498745371, "learning_rate": 0.000117, "loss": 10.3483, "step": 39 }, { "epoch": 0.0004, "grad_norm": 0.9133826925572958, "learning_rate": 0.00012, "loss": 10.3258, "step": 40 }, { "epoch": 0.00041, "grad_norm": 0.9039563412032365, "learning_rate": 0.000123, "loss": 10.3065, "step": 41 }, { "epoch": 0.00042, "grad_norm": 0.9101200951645674, "learning_rate": 0.000126, "loss": 10.2853, "step": 42 }, { "epoch": 0.00043, "grad_norm": 0.9146848302450299, "learning_rate": 0.000129, "loss": 10.2609, "step": 43 }, { "epoch": 0.00044, "grad_norm": 0.9151886453844831, "learning_rate": 0.00013199999999999998, "loss": 10.2386, "step": 44 }, { "epoch": 0.00045, "grad_norm": 0.9168774496112169, "learning_rate": 0.000135, "loss": 10.2149, "step": 45 }, { "epoch": 0.00046, "grad_norm": 0.9075399781785387, "learning_rate": 0.000138, "loss": 10.1934, "step": 46 }, { "epoch": 0.00047, "grad_norm": 0.9135459535669499, "learning_rate": 0.000141, "loss": 10.1656, "step": 47 }, { "epoch": 0.00048, "grad_norm": 0.91262035081973, "learning_rate": 0.000144, "loss": 10.141, "step": 48 }, { "epoch": 0.00049, "grad_norm": 0.9110594218495042, "learning_rate": 0.000147, "loss": 10.1163, "step": 49 }, { "epoch": 0.0005, "grad_norm": 0.9132890933284591, "learning_rate": 0.00015000000000000001, "loss": 10.0897, "step": 50 }, { "epoch": 0.00051, "grad_norm": 0.9150426988347248, "learning_rate": 0.000153, "loss": 10.0624, "step": 51 }, { "epoch": 0.00052, "grad_norm": 0.9109744535674626, "learning_rate": 0.000156, "loss": 10.0358, "step": 52 }, { "epoch": 0.00053, "grad_norm": 0.901532060032359, "learning_rate": 0.000159, "loss": 10.0114, "step": 53 }, { "epoch": 0.00054, "grad_norm": 0.9209685410544358, "learning_rate": 0.000162, "loss": 9.977, "step": 54 }, { "epoch": 0.00055, "grad_norm": 0.9128557250148688, "learning_rate": 0.000165, "loss": 9.9526, "step": 55 }, { "epoch": 0.00056, "grad_norm": 0.9040444969022912, "learning_rate": 0.00016800000000000002, "loss": 9.9235, "step": 56 }, { "epoch": 0.00057, "grad_norm": 0.9096304515854158, "learning_rate": 0.000171, "loss": 9.8951, "step": 57 }, { "epoch": 0.00058, "grad_norm": 0.9067337485581307, "learning_rate": 0.000174, "loss": 9.869, "step": 58 }, { "epoch": 0.00059, "grad_norm": 0.9110609200320162, "learning_rate": 0.000177, "loss": 9.8368, "step": 59 }, { "epoch": 0.0006, "grad_norm": 0.898536280212388, "learning_rate": 0.00017999999999999998, "loss": 9.8104, "step": 60 }, { "epoch": 0.00061, "grad_norm": 0.8973231498486932, "learning_rate": 0.000183, "loss": 9.781, "step": 61 }, { "epoch": 0.00062, "grad_norm": 0.9031035316544737, "learning_rate": 0.000186, "loss": 9.749, "step": 62 }, { "epoch": 0.00063, "grad_norm": 0.895903440256181, "learning_rate": 0.000189, "loss": 9.7201, "step": 63 }, { "epoch": 0.00064, "grad_norm": 0.892797539460924, "learning_rate": 0.000192, "loss": 9.6896, "step": 64 }, { "epoch": 0.00065, "grad_norm": 0.9002961211110138, "learning_rate": 0.00019500000000000002, "loss": 9.6554, "step": 65 }, { "epoch": 0.00066, "grad_norm": 0.8937380482279099, "learning_rate": 0.00019800000000000002, "loss": 9.6306, "step": 66 }, { "epoch": 0.00067, "grad_norm": 0.8953750022876055, "learning_rate": 0.000201, "loss": 9.6003, "step": 67 }, { "epoch": 0.00068, "grad_norm": 0.9015956584917496, "learning_rate": 0.00020400000000000003, "loss": 9.5664, "step": 68 }, { "epoch": 0.00069, "grad_norm": 0.8870475418543912, "learning_rate": 0.00020700000000000002, "loss": 9.5382, "step": 69 }, { "epoch": 0.0007, "grad_norm": 0.9005981766349919, "learning_rate": 0.00021000000000000004, "loss": 9.4999, "step": 70 }, { "epoch": 0.00071, "grad_norm": 0.8908379991096952, "learning_rate": 0.00021299999999999997, "loss": 9.4723, "step": 71 }, { "epoch": 0.00072, "grad_norm": 0.89454119786981, "learning_rate": 0.000216, "loss": 9.4447, "step": 72 }, { "epoch": 0.00073, "grad_norm": 0.8950328287787978, "learning_rate": 0.00021899999999999998, "loss": 9.4098, "step": 73 }, { "epoch": 0.00074, "grad_norm": 0.8955182261448745, "learning_rate": 0.000222, "loss": 9.3823, "step": 74 }, { "epoch": 0.00075, "grad_norm": 0.8910449269411236, "learning_rate": 0.000225, "loss": 9.3459, "step": 75 }, { "epoch": 0.00076, "grad_norm": 0.8994830990969283, "learning_rate": 0.000228, "loss": 9.3173, "step": 76 }, { "epoch": 0.00077, "grad_norm": 0.8945465759020658, "learning_rate": 0.000231, "loss": 9.2874, "step": 77 }, { "epoch": 0.00078, "grad_norm": 0.8885636678908063, "learning_rate": 0.000234, "loss": 9.2563, "step": 78 }, { "epoch": 0.00079, "grad_norm": 0.8922899375029398, "learning_rate": 0.00023700000000000001, "loss": 9.2188, "step": 79 }, { "epoch": 0.0008, "grad_norm": 0.8928864834710244, "learning_rate": 0.00024, "loss": 9.1835, "step": 80 }, { "epoch": 0.00081, "grad_norm": 0.898627735558165, "learning_rate": 0.00024300000000000002, "loss": 9.1509, "step": 81 }, { "epoch": 0.00082, "grad_norm": 0.9056278867565712, "learning_rate": 0.000246, "loss": 9.126, "step": 82 }, { "epoch": 0.00083, "grad_norm": 0.8959873225353988, "learning_rate": 0.00024900000000000004, "loss": 9.0897, "step": 83 }, { "epoch": 0.00084, "grad_norm": 0.8992390398731138, "learning_rate": 0.000252, "loss": 9.0582, "step": 84 }, { "epoch": 0.00085, "grad_norm": 0.8929295183813671, "learning_rate": 0.000255, "loss": 9.0339, "step": 85 }, { "epoch": 0.00086, "grad_norm": 0.895130193545426, "learning_rate": 0.000258, "loss": 8.9969, "step": 86 }, { "epoch": 0.00087, "grad_norm": 0.8878745708200014, "learning_rate": 0.000261, "loss": 8.9749, "step": 87 }, { "epoch": 0.00088, "grad_norm": 0.8916925738718505, "learning_rate": 0.00026399999999999997, "loss": 8.9377, "step": 88 }, { "epoch": 0.00089, "grad_norm": 0.8869518010064994, "learning_rate": 0.000267, "loss": 8.9112, "step": 89 }, { "epoch": 0.0009, "grad_norm": 0.8862477905139782, "learning_rate": 0.00027, "loss": 8.8834, "step": 90 }, { "epoch": 0.00091, "grad_norm": 0.8806789869277061, "learning_rate": 0.000273, "loss": 8.8534, "step": 91 }, { "epoch": 0.00092, "grad_norm": 0.8827535811867636, "learning_rate": 0.000276, "loss": 8.8203, "step": 92 }, { "epoch": 0.00093, "grad_norm": 0.8863633123718828, "learning_rate": 0.000279, "loss": 8.7906, "step": 93 }, { "epoch": 0.00094, "grad_norm": 0.8854377864602295, "learning_rate": 0.000282, "loss": 8.7643, "step": 94 }, { "epoch": 0.00095, "grad_norm": 0.8811122089944353, "learning_rate": 0.000285, "loss": 8.7332, "step": 95 }, { "epoch": 0.00096, "grad_norm": 0.8875459668079012, "learning_rate": 0.000288, "loss": 8.701, "step": 96 }, { "epoch": 0.00097, "grad_norm": 0.8790489647826922, "learning_rate": 0.000291, "loss": 8.6763, "step": 97 }, { "epoch": 0.00098, "grad_norm": 0.8816372646189595, "learning_rate": 0.000294, "loss": 8.6393, "step": 98 }, { "epoch": 0.00099, "grad_norm": 0.8790820085994937, "learning_rate": 0.000297, "loss": 8.6187, "step": 99 }, { "epoch": 0.001, "grad_norm": 0.8752174625291169, "learning_rate": 0.00030000000000000003, "loss": 8.5928, "step": 100 }, { "epoch": 0.00101, "grad_norm": 0.8773415713400505, "learning_rate": 0.00030300000000000005, "loss": 8.5584, "step": 101 }, { "epoch": 0.00102, "grad_norm": 0.8643663486077326, "learning_rate": 0.000306, "loss": 8.5391, "step": 102 }, { "epoch": 0.00103, "grad_norm": 0.8697458804733939, "learning_rate": 0.000309, "loss": 8.5142, "step": 103 }, { "epoch": 0.00104, "grad_norm": 0.8606206924682385, "learning_rate": 0.000312, "loss": 8.4945, "step": 104 }, { "epoch": 0.00105, "grad_norm": 0.8647215352159486, "learning_rate": 0.000315, "loss": 8.4678, "step": 105 }, { "epoch": 0.00106, "grad_norm": 0.8616489218188034, "learning_rate": 0.000318, "loss": 8.4385, "step": 106 }, { "epoch": 0.00107, "grad_norm": 0.8529010606970446, "learning_rate": 0.000321, "loss": 8.4145, "step": 107 }, { "epoch": 0.00108, "grad_norm": 0.8573365553662458, "learning_rate": 0.000324, "loss": 8.3854, "step": 108 }, { "epoch": 0.00109, "grad_norm": 0.8466244623266435, "learning_rate": 0.000327, "loss": 8.3656, "step": 109 }, { "epoch": 0.0011, "grad_norm": 0.8493987341311698, "learning_rate": 0.00033, "loss": 8.3429, "step": 110 }, { "epoch": 0.00111, "grad_norm": 0.8418183624218302, "learning_rate": 0.000333, "loss": 8.3123, "step": 111 }, { "epoch": 0.00112, "grad_norm": 0.837428520328162, "learning_rate": 0.00033600000000000004, "loss": 8.2891, "step": 112 }, { "epoch": 0.00113, "grad_norm": 0.8530708116750574, "learning_rate": 0.000339, "loss": 8.2481, "step": 113 }, { "epoch": 0.00114, "grad_norm": 0.8531960306338435, "learning_rate": 0.000342, "loss": 8.2404, "step": 114 }, { "epoch": 0.00115, "grad_norm": 0.9106108649120399, "learning_rate": 0.00034500000000000004, "loss": 8.2161, "step": 115 }, { "epoch": 0.00116, "grad_norm": 0.9754034767309321, "learning_rate": 0.000348, "loss": 8.1959, "step": 116 }, { "epoch": 0.00117, "grad_norm": 0.8915473944545516, "learning_rate": 0.000351, "loss": 8.164, "step": 117 }, { "epoch": 0.00118, "grad_norm": 0.8220334698299727, "learning_rate": 0.000354, "loss": 8.146, "step": 118 }, { "epoch": 0.00119, "grad_norm": 0.8743176947954969, "learning_rate": 0.000357, "loss": 8.1361, "step": 119 }, { "epoch": 0.0012, "grad_norm": 0.8050422574835271, "learning_rate": 0.00035999999999999997, "loss": 8.1077, "step": 120 }, { "epoch": 0.00121, "grad_norm": 0.8378364817917794, "learning_rate": 0.000363, "loss": 8.0814, "step": 121 }, { "epoch": 0.00122, "grad_norm": 0.7934725047304613, "learning_rate": 0.000366, "loss": 8.0639, "step": 122 }, { "epoch": 0.00123, "grad_norm": 0.8001278864423256, "learning_rate": 0.000369, "loss": 8.0414, "step": 123 }, { "epoch": 0.00124, "grad_norm": 0.7891502944141346, "learning_rate": 0.000372, "loss": 8.0193, "step": 124 }, { "epoch": 0.00125, "grad_norm": 0.795452065739791, "learning_rate": 0.000375, "loss": 7.9938, "step": 125 }, { "epoch": 0.00126, "grad_norm": 0.7774797219919642, "learning_rate": 0.000378, "loss": 7.981, "step": 126 }, { "epoch": 0.00127, "grad_norm": 0.7551582047426328, "learning_rate": 0.000381, "loss": 7.9562, "step": 127 }, { "epoch": 0.00128, "grad_norm": 0.7770412292819664, "learning_rate": 0.000384, "loss": 7.9363, "step": 128 }, { "epoch": 0.00129, "grad_norm": 0.730245978508778, "learning_rate": 0.00038700000000000003, "loss": 7.9242, "step": 129 }, { "epoch": 0.0013, "grad_norm": 0.7581733929203945, "learning_rate": 0.00039000000000000005, "loss": 7.8975, "step": 130 }, { "epoch": 0.00131, "grad_norm": 0.7437048547743047, "learning_rate": 0.000393, "loss": 7.8642, "step": 131 }, { "epoch": 0.00132, "grad_norm": 0.7447956363319407, "learning_rate": 0.00039600000000000003, "loss": 7.8553, "step": 132 }, { "epoch": 0.00133, "grad_norm": 0.8219877092942749, "learning_rate": 0.00039900000000000005, "loss": 7.8359, "step": 133 }, { "epoch": 0.00134, "grad_norm": 1.2937247547771387, "learning_rate": 0.000402, "loss": 7.8316, "step": 134 }, { "epoch": 0.00135, "grad_norm": 1.173069005114359, "learning_rate": 0.00040500000000000003, "loss": 7.8093, "step": 135 }, { "epoch": 0.00136, "grad_norm": 0.7468130576157733, "learning_rate": 0.00040800000000000005, "loss": 7.7827, "step": 136 }, { "epoch": 0.00137, "grad_norm": 0.9811726952576713, "learning_rate": 0.000411, "loss": 7.7674, "step": 137 }, { "epoch": 0.00138, "grad_norm": 0.9462188302991427, "learning_rate": 0.00041400000000000003, "loss": 7.7401, "step": 138 }, { "epoch": 0.00139, "grad_norm": 0.8654346045111051, "learning_rate": 0.00041700000000000005, "loss": 7.7263, "step": 139 }, { "epoch": 0.0014, "grad_norm": 0.7113280167772615, "learning_rate": 0.00042000000000000007, "loss": 7.706, "step": 140 }, { "epoch": 0.00141, "grad_norm": 0.9535594232109135, "learning_rate": 0.000423, "loss": 7.6966, "step": 141 }, { "epoch": 0.00142, "grad_norm": 0.721845056784479, "learning_rate": 0.00042599999999999995, "loss": 7.6788, "step": 142 }, { "epoch": 0.00143, "grad_norm": 0.7614673366661663, "learning_rate": 0.00042899999999999997, "loss": 7.6517, "step": 143 }, { "epoch": 0.00144, "grad_norm": 0.6558320856409382, "learning_rate": 0.000432, "loss": 7.6295, "step": 144 }, { "epoch": 0.00145, "grad_norm": 0.7045333447763749, "learning_rate": 0.000435, "loss": 7.6259, "step": 145 }, { "epoch": 0.00146, "grad_norm": 0.6284166988938972, "learning_rate": 0.00043799999999999997, "loss": 7.585, "step": 146 }, { "epoch": 0.00147, "grad_norm": 0.6467600462746109, "learning_rate": 0.000441, "loss": 7.5791, "step": 147 }, { "epoch": 0.00148, "grad_norm": 0.6213738634604956, "learning_rate": 0.000444, "loss": 7.5641, "step": 148 }, { "epoch": 0.00149, "grad_norm": 0.5966591129804824, "learning_rate": 0.00044699999999999997, "loss": 7.5597, "step": 149 }, { "epoch": 0.0015, "grad_norm": 0.5636118960239422, "learning_rate": 0.00045, "loss": 7.5245, "step": 150 }, { "epoch": 0.00151, "grad_norm": 0.5909619206615454, "learning_rate": 0.000453, "loss": 7.5045, "step": 151 }, { "epoch": 0.00152, "grad_norm": 0.5383178931411042, "learning_rate": 0.000456, "loss": 7.497, "step": 152 }, { "epoch": 0.00153, "grad_norm": 0.5291626942545249, "learning_rate": 0.000459, "loss": 7.4853, "step": 153 }, { "epoch": 0.00154, "grad_norm": 0.5532016756899171, "learning_rate": 0.000462, "loss": 7.4637, "step": 154 }, { "epoch": 0.00155, "grad_norm": 0.5008942629560345, "learning_rate": 0.000465, "loss": 7.4483, "step": 155 }, { "epoch": 0.00156, "grad_norm": 0.5258951319062264, "learning_rate": 0.000468, "loss": 7.4224, "step": 156 }, { "epoch": 0.00157, "grad_norm": 0.4967086695788387, "learning_rate": 0.000471, "loss": 7.4172, "step": 157 }, { "epoch": 0.00158, "grad_norm": 0.48191095037320825, "learning_rate": 0.00047400000000000003, "loss": 7.3957, "step": 158 }, { "epoch": 0.00159, "grad_norm": 0.46801796765212106, "learning_rate": 0.000477, "loss": 7.3849, "step": 159 }, { "epoch": 0.0016, "grad_norm": 0.5255102038727893, "learning_rate": 0.00048, "loss": 7.3658, "step": 160 }, { "epoch": 0.00161, "grad_norm": 0.5299389188763052, "learning_rate": 0.00048300000000000003, "loss": 7.3428, "step": 161 }, { "epoch": 0.00162, "grad_norm": 0.5953669493848933, "learning_rate": 0.00048600000000000005, "loss": 7.3479, "step": 162 }, { "epoch": 0.00163, "grad_norm": 0.5335280557878616, "learning_rate": 0.0004890000000000001, "loss": 7.3286, "step": 163 }, { "epoch": 0.00164, "grad_norm": 0.4971558665793995, "learning_rate": 0.000492, "loss": 7.3284, "step": 164 }, { "epoch": 0.00165, "grad_norm": 0.45817405077400825, "learning_rate": 0.000495, "loss": 7.302, "step": 165 }, { "epoch": 0.00166, "grad_norm": 0.5803800594997708, "learning_rate": 0.0004980000000000001, "loss": 7.2782, "step": 166 }, { "epoch": 0.00167, "grad_norm": 0.6082011920361033, "learning_rate": 0.000501, "loss": 7.2708, "step": 167 }, { "epoch": 0.00168, "grad_norm": 0.6336998596864457, "learning_rate": 0.000504, "loss": 7.2488, "step": 168 }, { "epoch": 0.00169, "grad_norm": 0.6003268704217197, "learning_rate": 0.0005070000000000001, "loss": 7.2442, "step": 169 }, { "epoch": 0.0017, "grad_norm": 0.6942878050784774, "learning_rate": 0.00051, "loss": 7.2281, "step": 170 }, { "epoch": 0.00171, "grad_norm": 0.6828271953312538, "learning_rate": 0.000513, "loss": 7.2148, "step": 171 }, { "epoch": 0.00172, "grad_norm": 0.49790810289726045, "learning_rate": 0.000516, "loss": 7.2077, "step": 172 }, { "epoch": 0.00173, "grad_norm": 0.5828184415582448, "learning_rate": 0.0005189999999999999, "loss": 7.1934, "step": 173 }, { "epoch": 0.00174, "grad_norm": 0.5895089207006229, "learning_rate": 0.000522, "loss": 7.2103, "step": 174 }, { "epoch": 0.00175, "grad_norm": 0.7514233155115281, "learning_rate": 0.000525, "loss": 7.1773, "step": 175 }, { "epoch": 0.00176, "grad_norm": 0.4352152488230879, "learning_rate": 0.0005279999999999999, "loss": 7.144, "step": 176 }, { "epoch": 0.00177, "grad_norm": 0.4945639346042136, "learning_rate": 0.000531, "loss": 7.1524, "step": 177 }, { "epoch": 0.00178, "grad_norm": 0.4797513290452921, "learning_rate": 0.000534, "loss": 7.1405, "step": 178 }, { "epoch": 0.00179, "grad_norm": 0.4349662352142082, "learning_rate": 0.000537, "loss": 7.1325, "step": 179 }, { "epoch": 0.0018, "grad_norm": 0.38883631831142346, "learning_rate": 0.00054, "loss": 7.1029, "step": 180 }, { "epoch": 0.00181, "grad_norm": 0.49930253237147615, "learning_rate": 0.000543, "loss": 7.093, "step": 181 }, { "epoch": 0.00182, "grad_norm": 0.409986197327767, "learning_rate": 0.000546, "loss": 7.0882, "step": 182 }, { "epoch": 0.00183, "grad_norm": 0.4860898971704763, "learning_rate": 0.000549, "loss": 7.0781, "step": 183 }, { "epoch": 0.00184, "grad_norm": 0.5722795587528919, "learning_rate": 0.000552, "loss": 7.069, "step": 184 }, { "epoch": 0.00185, "grad_norm": 0.5738191189627982, "learning_rate": 0.000555, "loss": 7.0441, "step": 185 }, { "epoch": 0.00186, "grad_norm": 0.480936267251981, "learning_rate": 0.000558, "loss": 7.0527, "step": 186 }, { "epoch": 0.00187, "grad_norm": 0.6162211084806141, "learning_rate": 0.000561, "loss": 7.0307, "step": 187 }, { "epoch": 0.00188, "grad_norm": 0.5594962116008837, "learning_rate": 0.000564, "loss": 7.0081, "step": 188 }, { "epoch": 0.00189, "grad_norm": 0.5037393315861951, "learning_rate": 0.000567, "loss": 7.0028, "step": 189 }, { "epoch": 0.0019, "grad_norm": 0.4257079536133974, "learning_rate": 0.00057, "loss": 6.9836, "step": 190 }, { "epoch": 0.00191, "grad_norm": 0.4729880834795408, "learning_rate": 0.000573, "loss": 6.9775, "step": 191 }, { "epoch": 0.00192, "grad_norm": 0.35979347564719405, "learning_rate": 0.000576, "loss": 6.9672, "step": 192 }, { "epoch": 0.00193, "grad_norm": 0.3504590762022799, "learning_rate": 0.000579, "loss": 6.9673, "step": 193 }, { "epoch": 0.00194, "grad_norm": 0.40017651296086854, "learning_rate": 0.000582, "loss": 6.9599, "step": 194 }, { "epoch": 0.00195, "grad_norm": 0.4217845269426591, "learning_rate": 0.000585, "loss": 6.9408, "step": 195 }, { "epoch": 0.00196, "grad_norm": 0.3759474186154596, "learning_rate": 0.000588, "loss": 6.9274, "step": 196 }, { "epoch": 0.00197, "grad_norm": 0.38483250179950723, "learning_rate": 0.000591, "loss": 6.9235, "step": 197 }, { "epoch": 0.00198, "grad_norm": 0.5774981009451395, "learning_rate": 0.000594, "loss": 6.9171, "step": 198 }, { "epoch": 0.00199, "grad_norm": 0.7947838931607083, "learning_rate": 0.0005970000000000001, "loss": 6.905, "step": 199 }, { "epoch": 0.002, "grad_norm": 1.3095452790094997, "learning_rate": 0.0006000000000000001, "loss": 6.9171, "step": 200 }, { "epoch": 0.00201, "grad_norm": 0.8941477277896986, "learning_rate": 0.000603, "loss": 6.8832, "step": 201 }, { "epoch": 0.00202, "grad_norm": 0.6446211768569402, "learning_rate": 0.0006060000000000001, "loss": 6.8657, "step": 202 }, { "epoch": 0.00203, "grad_norm": 0.5529910396828944, "learning_rate": 0.0006090000000000001, "loss": 6.8657, "step": 203 }, { "epoch": 0.00204, "grad_norm": 0.6798364259433272, "learning_rate": 0.000612, "loss": 6.8577, "step": 204 }, { "epoch": 0.00205, "grad_norm": 0.4644570109539818, "learning_rate": 0.000615, "loss": 6.8377, "step": 205 }, { "epoch": 0.00206, "grad_norm": 0.44631259910699017, "learning_rate": 0.000618, "loss": 6.8297, "step": 206 }, { "epoch": 0.00207, "grad_norm": 0.5149457252986483, "learning_rate": 0.000621, "loss": 6.8239, "step": 207 }, { "epoch": 0.00208, "grad_norm": 0.38821898080366113, "learning_rate": 0.000624, "loss": 6.815, "step": 208 }, { "epoch": 0.00209, "grad_norm": 0.5320501684611929, "learning_rate": 0.000627, "loss": 6.8005, "step": 209 }, { "epoch": 0.0021, "grad_norm": 0.31085611137953467, "learning_rate": 0.00063, "loss": 6.7833, "step": 210 }, { "epoch": 0.00211, "grad_norm": 0.4810682865402516, "learning_rate": 0.000633, "loss": 6.7824, "step": 211 }, { "epoch": 0.00212, "grad_norm": 0.46892365964621546, "learning_rate": 0.000636, "loss": 6.7721, "step": 212 }, { "epoch": 0.00213, "grad_norm": 0.4643908337987038, "learning_rate": 0.000639, "loss": 6.7526, "step": 213 }, { "epoch": 0.00214, "grad_norm": 0.4868851924200075, "learning_rate": 0.000642, "loss": 6.7506, "step": 214 }, { "epoch": 0.00215, "grad_norm": 0.4395116450664113, "learning_rate": 0.000645, "loss": 6.7458, "step": 215 }, { "epoch": 0.00216, "grad_norm": 0.41354828342235656, "learning_rate": 0.000648, "loss": 6.74, "step": 216 }, { "epoch": 0.00217, "grad_norm": 0.3070152795385597, "learning_rate": 0.000651, "loss": 6.709, "step": 217 }, { "epoch": 0.00218, "grad_norm": 0.40442589086752323, "learning_rate": 0.000654, "loss": 6.7138, "step": 218 }, { "epoch": 0.00219, "grad_norm": 0.441644630454743, "learning_rate": 0.000657, "loss": 6.6872, "step": 219 }, { "epoch": 0.0022, "grad_norm": 0.6023964511244305, "learning_rate": 0.00066, "loss": 6.7066, "step": 220 }, { "epoch": 0.00221, "grad_norm": 0.7436617780357457, "learning_rate": 0.0006630000000000001, "loss": 6.6883, "step": 221 }, { "epoch": 0.00222, "grad_norm": 1.107124936944928, "learning_rate": 0.000666, "loss": 6.6857, "step": 222 }, { "epoch": 0.00223, "grad_norm": 0.9806174045781153, "learning_rate": 0.000669, "loss": 6.6846, "step": 223 }, { "epoch": 0.00224, "grad_norm": 0.5030190225010961, "learning_rate": 0.0006720000000000001, "loss": 6.6654, "step": 224 }, { "epoch": 0.00225, "grad_norm": 0.6877668848936607, "learning_rate": 0.000675, "loss": 6.6563, "step": 225 }, { "epoch": 0.00226, "grad_norm": 0.8124295484727102, "learning_rate": 0.000678, "loss": 6.655, "step": 226 }, { "epoch": 0.00227, "grad_norm": 0.7781712168651111, "learning_rate": 0.0006810000000000001, "loss": 6.6292, "step": 227 }, { "epoch": 0.00228, "grad_norm": 0.6536472724437511, "learning_rate": 0.000684, "loss": 6.6269, "step": 228 }, { "epoch": 0.00229, "grad_norm": 0.5457565622061897, "learning_rate": 0.000687, "loss": 6.6228, "step": 229 }, { "epoch": 0.0023, "grad_norm": 0.3628095747750561, "learning_rate": 0.0006900000000000001, "loss": 6.5952, "step": 230 }, { "epoch": 0.00231, "grad_norm": 0.5474995275109145, "learning_rate": 0.000693, "loss": 6.5849, "step": 231 }, { "epoch": 0.00232, "grad_norm": 0.40776460058160763, "learning_rate": 0.000696, "loss": 6.5921, "step": 232 }, { "epoch": 0.00233, "grad_norm": 0.35178189775679714, "learning_rate": 0.0006990000000000001, "loss": 6.5782, "step": 233 }, { "epoch": 0.00234, "grad_norm": 0.4281055336873759, "learning_rate": 0.000702, "loss": 6.5708, "step": 234 }, { "epoch": 0.00235, "grad_norm": 0.2803773137034946, "learning_rate": 0.000705, "loss": 6.5534, "step": 235 }, { "epoch": 0.00236, "grad_norm": 0.3921542773711977, "learning_rate": 0.000708, "loss": 6.5364, "step": 236 }, { "epoch": 0.00237, "grad_norm": 0.3526763779813035, "learning_rate": 0.0007109999999999999, "loss": 6.5448, "step": 237 }, { "epoch": 0.00238, "grad_norm": 0.5116945273918518, "learning_rate": 0.000714, "loss": 6.5357, "step": 238 }, { "epoch": 0.00239, "grad_norm": 0.6738279229074454, "learning_rate": 0.000717, "loss": 6.5224, "step": 239 }, { "epoch": 0.0024, "grad_norm": 0.8853347385085402, "learning_rate": 0.0007199999999999999, "loss": 6.5441, "step": 240 }, { "epoch": 0.00241, "grad_norm": 0.7775153220084405, "learning_rate": 0.000723, "loss": 6.5282, "step": 241 }, { "epoch": 0.00242, "grad_norm": 0.41795084065691346, "learning_rate": 0.000726, "loss": 6.4926, "step": 242 }, { "epoch": 0.00243, "grad_norm": 0.6402313725243838, "learning_rate": 0.000729, "loss": 6.4862, "step": 243 }, { "epoch": 0.00244, "grad_norm": 0.715394437807476, "learning_rate": 0.000732, "loss": 6.4949, "step": 244 }, { "epoch": 0.00245, "grad_norm": 0.8873416534047, "learning_rate": 0.000735, "loss": 6.487, "step": 245 }, { "epoch": 0.00246, "grad_norm": 1.0259114438292367, "learning_rate": 0.000738, "loss": 6.4771, "step": 246 }, { "epoch": 0.00247, "grad_norm": 0.6418814776118988, "learning_rate": 0.000741, "loss": 6.46, "step": 247 }, { "epoch": 0.00248, "grad_norm": 0.491074950554399, "learning_rate": 0.000744, "loss": 6.4659, "step": 248 }, { "epoch": 0.00249, "grad_norm": 0.5836449103944761, "learning_rate": 0.000747, "loss": 6.4457, "step": 249 }, { "epoch": 0.0025, "grad_norm": 0.4400008529473732, "learning_rate": 0.00075, "loss": 6.447, "step": 250 }, { "epoch": 0.00251, "grad_norm": 0.5056683711317587, "learning_rate": 0.000753, "loss": 6.4089, "step": 251 }, { "epoch": 0.00252, "grad_norm": 0.5475557445898553, "learning_rate": 0.000756, "loss": 6.4219, "step": 252 }, { "epoch": 0.00253, "grad_norm": 0.5310741382277282, "learning_rate": 0.000759, "loss": 6.41, "step": 253 }, { "epoch": 0.00254, "grad_norm": 0.5250408531352325, "learning_rate": 0.000762, "loss": 6.4057, "step": 254 }, { "epoch": 0.00255, "grad_norm": 0.3683514074420304, "learning_rate": 0.0007650000000000001, "loss": 6.4012, "step": 255 }, { "epoch": 0.00256, "grad_norm": 0.43783601316780707, "learning_rate": 0.000768, "loss": 6.3808, "step": 256 }, { "epoch": 0.00257, "grad_norm": 0.5249634850463766, "learning_rate": 0.000771, "loss": 6.3866, "step": 257 }, { "epoch": 0.00258, "grad_norm": 0.7257500993113621, "learning_rate": 0.0007740000000000001, "loss": 6.367, "step": 258 }, { "epoch": 0.00259, "grad_norm": 0.8020520964186684, "learning_rate": 0.000777, "loss": 6.3732, "step": 259 }, { "epoch": 0.0026, "grad_norm": 0.8803264650469431, "learning_rate": 0.0007800000000000001, "loss": 6.357, "step": 260 }, { "epoch": 0.00261, "grad_norm": 0.9480171721362259, "learning_rate": 0.0007830000000000001, "loss": 6.3728, "step": 261 }, { "epoch": 0.00262, "grad_norm": 0.8196081199065424, "learning_rate": 0.000786, "loss": 6.3478, "step": 262 }, { "epoch": 0.00263, "grad_norm": 0.5320550716287101, "learning_rate": 0.0007890000000000001, "loss": 6.3465, "step": 263 }, { "epoch": 0.00264, "grad_norm": 0.47601242681958134, "learning_rate": 0.0007920000000000001, "loss": 6.3294, "step": 264 }, { "epoch": 0.00265, "grad_norm": 0.5284160270316348, "learning_rate": 0.000795, "loss": 6.3216, "step": 265 }, { "epoch": 0.00266, "grad_norm": 0.45329071221457967, "learning_rate": 0.0007980000000000001, "loss": 6.3157, "step": 266 }, { "epoch": 0.00267, "grad_norm": 0.5054867442589596, "learning_rate": 0.0008010000000000001, "loss": 6.3107, "step": 267 }, { "epoch": 0.00268, "grad_norm": 0.43889982597834043, "learning_rate": 0.000804, "loss": 6.3036, "step": 268 }, { "epoch": 0.00269, "grad_norm": 0.5126978914131535, "learning_rate": 0.0008070000000000001, "loss": 6.2878, "step": 269 }, { "epoch": 0.0027, "grad_norm": 0.49623237637976697, "learning_rate": 0.0008100000000000001, "loss": 6.2881, "step": 270 }, { "epoch": 0.00271, "grad_norm": 0.44574618013664674, "learning_rate": 0.000813, "loss": 6.2929, "step": 271 }, { "epoch": 0.00272, "grad_norm": 0.5373388276032048, "learning_rate": 0.0008160000000000001, "loss": 6.2781, "step": 272 }, { "epoch": 0.00273, "grad_norm": 0.5925104788121834, "learning_rate": 0.0008190000000000001, "loss": 6.2673, "step": 273 }, { "epoch": 0.00274, "grad_norm": 0.7812486674134582, "learning_rate": 0.000822, "loss": 6.2537, "step": 274 }, { "epoch": 0.00275, "grad_norm": 1.4372466998888491, "learning_rate": 0.0008250000000000001, "loss": 6.2838, "step": 275 }, { "epoch": 0.00276, "grad_norm": 1.009790255288434, "learning_rate": 0.0008280000000000001, "loss": 6.2706, "step": 276 }, { "epoch": 0.00277, "grad_norm": 1.1366790246792218, "learning_rate": 0.0008310000000000001, "loss": 6.2794, "step": 277 }, { "epoch": 0.00278, "grad_norm": 0.7426000684998063, "learning_rate": 0.0008340000000000001, "loss": 6.2448, "step": 278 }, { "epoch": 0.00279, "grad_norm": 1.1507422621364256, "learning_rate": 0.0008370000000000001, "loss": 6.2577, "step": 279 }, { "epoch": 0.0028, "grad_norm": 0.9287326777762811, "learning_rate": 0.0008400000000000001, "loss": 6.2422, "step": 280 }, { "epoch": 0.00281, "grad_norm": 0.6961892841552818, "learning_rate": 0.0008430000000000001, "loss": 6.2248, "step": 281 }, { "epoch": 0.00282, "grad_norm": 0.5479119740633785, "learning_rate": 0.000846, "loss": 6.219, "step": 282 }, { "epoch": 0.00283, "grad_norm": 0.533852757527586, "learning_rate": 0.0008489999999999999, "loss": 6.2128, "step": 283 }, { "epoch": 0.00284, "grad_norm": 0.4914173114643737, "learning_rate": 0.0008519999999999999, "loss": 6.2012, "step": 284 }, { "epoch": 0.00285, "grad_norm": 0.5452017206249536, "learning_rate": 0.000855, "loss": 6.2041, "step": 285 }, { "epoch": 0.00286, "grad_norm": 0.5575344533377293, "learning_rate": 0.0008579999999999999, "loss": 6.1806, "step": 286 }, { "epoch": 0.00287, "grad_norm": 0.5864183738682309, "learning_rate": 0.000861, "loss": 6.1845, "step": 287 }, { "epoch": 0.00288, "grad_norm": 0.5841667976006761, "learning_rate": 0.000864, "loss": 6.1638, "step": 288 }, { "epoch": 0.00289, "grad_norm": 0.46053068944693165, "learning_rate": 0.0008669999999999999, "loss": 6.1757, "step": 289 }, { "epoch": 0.0029, "grad_norm": 0.3129456576237599, "learning_rate": 0.00087, "loss": 6.1522, "step": 290 }, { "epoch": 0.00291, "grad_norm": 0.42419235273038636, "learning_rate": 0.000873, "loss": 6.1475, "step": 291 }, { "epoch": 0.00292, "grad_norm": 0.4195983642240095, "learning_rate": 0.0008759999999999999, "loss": 6.1444, "step": 292 }, { "epoch": 0.00293, "grad_norm": 0.4481246758453938, "learning_rate": 0.000879, "loss": 6.1262, "step": 293 }, { "epoch": 0.00294, "grad_norm": 0.6187999235349624, "learning_rate": 0.000882, "loss": 6.1427, "step": 294 }, { "epoch": 0.00295, "grad_norm": 0.9782573637293742, "learning_rate": 0.0008849999999999999, "loss": 6.1183, "step": 295 }, { "epoch": 0.00296, "grad_norm": 1.1963373758150497, "learning_rate": 0.000888, "loss": 6.1501, "step": 296 }, { "epoch": 0.00297, "grad_norm": 0.599998736767153, "learning_rate": 0.000891, "loss": 6.0984, "step": 297 }, { "epoch": 0.00298, "grad_norm": 0.8013796512698059, "learning_rate": 0.0008939999999999999, "loss": 6.1246, "step": 298 }, { "epoch": 0.00299, "grad_norm": 0.5344179597512946, "learning_rate": 0.000897, "loss": 6.0868, "step": 299 }, { "epoch": 0.003, "grad_norm": 0.7019061294911672, "learning_rate": 0.0009, "loss": 6.0964, "step": 300 }, { "epoch": 0.00301, "grad_norm": 0.6084278233141631, "learning_rate": 0.0009029999999999999, "loss": 6.0828, "step": 301 }, { "epoch": 0.00302, "grad_norm": 0.8187255130373073, "learning_rate": 0.000906, "loss": 6.0734, "step": 302 }, { "epoch": 0.00303, "grad_norm": 0.8857858146685933, "learning_rate": 0.000909, "loss": 6.0899, "step": 303 }, { "epoch": 0.00304, "grad_norm": 1.2136542484144364, "learning_rate": 0.000912, "loss": 6.0938, "step": 304 }, { "epoch": 0.00305, "grad_norm": 1.177396089785494, "learning_rate": 0.000915, "loss": 6.0972, "step": 305 }, { "epoch": 0.00306, "grad_norm": 0.6904788914447576, "learning_rate": 0.000918, "loss": 6.0696, "step": 306 }, { "epoch": 0.00307, "grad_norm": 0.6208733747216669, "learning_rate": 0.000921, "loss": 6.0635, "step": 307 }, { "epoch": 0.00308, "grad_norm": 0.5825923197804476, "learning_rate": 0.000924, "loss": 6.0448, "step": 308 }, { "epoch": 0.00309, "grad_norm": 0.5505228855267935, "learning_rate": 0.000927, "loss": 6.0605, "step": 309 }, { "epoch": 0.0031, "grad_norm": 0.4641714577243504, "learning_rate": 0.00093, "loss": 6.0377, "step": 310 }, { "epoch": 0.00311, "grad_norm": 0.5505721165236824, "learning_rate": 0.000933, "loss": 6.0314, "step": 311 }, { "epoch": 0.00312, "grad_norm": 0.7061898380577587, "learning_rate": 0.000936, "loss": 6.0144, "step": 312 }, { "epoch": 0.00313, "grad_norm": 0.8674813006917282, "learning_rate": 0.0009390000000000001, "loss": 6.0347, "step": 313 }, { "epoch": 0.00314, "grad_norm": 0.9558239920779429, "learning_rate": 0.000942, "loss": 6.0427, "step": 314 }, { "epoch": 0.00315, "grad_norm": 0.8148008359990885, "learning_rate": 0.000945, "loss": 6.0164, "step": 315 }, { "epoch": 0.00316, "grad_norm": 0.8527496297887206, "learning_rate": 0.0009480000000000001, "loss": 5.992, "step": 316 }, { "epoch": 0.00317, "grad_norm": 0.9373289538679535, "learning_rate": 0.000951, "loss": 6.0135, "step": 317 }, { "epoch": 0.00318, "grad_norm": 0.915551727118195, "learning_rate": 0.000954, "loss": 6.0085, "step": 318 }, { "epoch": 0.00319, "grad_norm": 1.0100858429902202, "learning_rate": 0.0009570000000000001, "loss": 6.0051, "step": 319 }, { "epoch": 0.0032, "grad_norm": 0.6230170449860293, "learning_rate": 0.00096, "loss": 5.9891, "step": 320 }, { "epoch": 0.00321, "grad_norm": 0.7031926346795826, "learning_rate": 0.000963, "loss": 5.9829, "step": 321 }, { "epoch": 0.00322, "grad_norm": 0.6443754150070977, "learning_rate": 0.0009660000000000001, "loss": 5.9655, "step": 322 }, { "epoch": 0.00323, "grad_norm": 0.7449741143243878, "learning_rate": 0.000969, "loss": 5.9659, "step": 323 }, { "epoch": 0.00324, "grad_norm": 0.8499498801669653, "learning_rate": 0.0009720000000000001, "loss": 5.9786, "step": 324 }, { "epoch": 0.00325, "grad_norm": 0.7370021274780517, "learning_rate": 0.0009750000000000001, "loss": 5.9372, "step": 325 }, { "epoch": 0.00326, "grad_norm": 0.7692642106759962, "learning_rate": 0.0009780000000000001, "loss": 5.9561, "step": 326 }, { "epoch": 0.00327, "grad_norm": 0.9205755534728525, "learning_rate": 0.000981, "loss": 5.96, "step": 327 }, { "epoch": 0.00328, "grad_norm": 0.9462851248298534, "learning_rate": 0.000984, "loss": 5.9392, "step": 328 }, { "epoch": 0.00329, "grad_norm": 0.6598697228026301, "learning_rate": 0.000987, "loss": 5.9306, "step": 329 }, { "epoch": 0.0033, "grad_norm": 0.6819397096877848, "learning_rate": 0.00099, "loss": 5.9255, "step": 330 }, { "epoch": 0.00331, "grad_norm": 0.7335362675715908, "learning_rate": 0.0009930000000000002, "loss": 5.9282, "step": 331 }, { "epoch": 0.00332, "grad_norm": 0.7128341571534664, "learning_rate": 0.0009960000000000001, "loss": 5.9115, "step": 332 }, { "epoch": 0.00333, "grad_norm": 0.6642964548475624, "learning_rate": 0.000999, "loss": 5.8958, "step": 333 }, { "epoch": 0.00334, "grad_norm": 0.6296302431952895, "learning_rate": 0.001002, "loss": 5.9043, "step": 334 }, { "epoch": 0.00335, "grad_norm": 0.610819189925057, "learning_rate": 0.001005, "loss": 5.9015, "step": 335 }, { "epoch": 0.00336, "grad_norm": 0.5248795967011678, "learning_rate": 0.001008, "loss": 5.8916, "step": 336 }, { "epoch": 0.00337, "grad_norm": 0.474981761201143, "learning_rate": 0.0010110000000000002, "loss": 5.868, "step": 337 }, { "epoch": 0.00338, "grad_norm": 0.4982919042440728, "learning_rate": 0.0010140000000000001, "loss": 5.8835, "step": 338 }, { "epoch": 0.00339, "grad_norm": 0.4565693208938873, "learning_rate": 0.0010170000000000001, "loss": 5.8849, "step": 339 }, { "epoch": 0.0034, "grad_norm": 0.5166741693031021, "learning_rate": 0.00102, "loss": 5.8551, "step": 340 }, { "epoch": 0.00341, "grad_norm": 0.517141824320128, "learning_rate": 0.001023, "loss": 5.8368, "step": 341 }, { "epoch": 0.00342, "grad_norm": 0.5527942444154041, "learning_rate": 0.001026, "loss": 5.8429, "step": 342 }, { "epoch": 0.00343, "grad_norm": 0.6606601975038157, "learning_rate": 0.0010290000000000002, "loss": 5.8479, "step": 343 }, { "epoch": 0.00344, "grad_norm": 0.7784149171541165, "learning_rate": 0.001032, "loss": 5.8398, "step": 344 }, { "epoch": 0.00345, "grad_norm": 0.8397401648868169, "learning_rate": 0.001035, "loss": 5.8375, "step": 345 }, { "epoch": 0.00346, "grad_norm": 0.9944775596372618, "learning_rate": 0.0010379999999999999, "loss": 5.8449, "step": 346 }, { "epoch": 0.00347, "grad_norm": 1.0844541863277397, "learning_rate": 0.001041, "loss": 5.8271, "step": 347 }, { "epoch": 0.00348, "grad_norm": 1.1115702342985518, "learning_rate": 0.001044, "loss": 5.8384, "step": 348 }, { "epoch": 0.00349, "grad_norm": 1.0763373685259212, "learning_rate": 0.001047, "loss": 5.8194, "step": 349 }, { "epoch": 0.0035, "grad_norm": 1.154751185583195, "learning_rate": 0.00105, "loss": 5.8503, "step": 350 }, { "epoch": 0.00351, "grad_norm": 0.9949055141165053, "learning_rate": 0.001053, "loss": 5.8465, "step": 351 }, { "epoch": 0.00352, "grad_norm": 1.1269100337690212, "learning_rate": 0.0010559999999999999, "loss": 5.8358, "step": 352 }, { "epoch": 0.00353, "grad_norm": 0.7896283778005796, "learning_rate": 0.001059, "loss": 5.8219, "step": 353 }, { "epoch": 0.00354, "grad_norm": 0.6875113153469495, "learning_rate": 0.001062, "loss": 5.7918, "step": 354 }, { "epoch": 0.00355, "grad_norm": 0.7321424784274652, "learning_rate": 0.001065, "loss": 5.8057, "step": 355 }, { "epoch": 0.00356, "grad_norm": 0.6838178663323626, "learning_rate": 0.001068, "loss": 5.7969, "step": 356 }, { "epoch": 0.00357, "grad_norm": 0.5941172260432777, "learning_rate": 0.001071, "loss": 5.7852, "step": 357 }, { "epoch": 0.00358, "grad_norm": 0.6211494063140103, "learning_rate": 0.001074, "loss": 5.7764, "step": 358 }, { "epoch": 0.00359, "grad_norm": 0.645459064059718, "learning_rate": 0.001077, "loss": 5.7674, "step": 359 }, { "epoch": 0.0036, "grad_norm": 0.7187316170793214, "learning_rate": 0.00108, "loss": 5.7713, "step": 360 }, { "epoch": 0.00361, "grad_norm": 0.8420575980895092, "learning_rate": 0.001083, "loss": 5.7694, "step": 361 }, { "epoch": 0.00362, "grad_norm": 0.8000647457554158, "learning_rate": 0.001086, "loss": 5.7526, "step": 362 }, { "epoch": 0.00363, "grad_norm": 0.7128868464597033, "learning_rate": 0.001089, "loss": 5.7444, "step": 363 }, { "epoch": 0.00364, "grad_norm": 0.660116265321768, "learning_rate": 0.001092, "loss": 5.7546, "step": 364 }, { "epoch": 0.00365, "grad_norm": 0.7837831658386917, "learning_rate": 0.001095, "loss": 5.7547, "step": 365 }, { "epoch": 0.00366, "grad_norm": 0.7821248144040719, "learning_rate": 0.001098, "loss": 5.7348, "step": 366 }, { "epoch": 0.00367, "grad_norm": 0.8460018101445828, "learning_rate": 0.001101, "loss": 5.7355, "step": 367 }, { "epoch": 0.00368, "grad_norm": 0.962056228227274, "learning_rate": 0.001104, "loss": 5.7428, "step": 368 }, { "epoch": 0.00369, "grad_norm": 0.8851788755380371, "learning_rate": 0.001107, "loss": 5.7443, "step": 369 }, { "epoch": 0.0037, "grad_norm": 0.7906813569461731, "learning_rate": 0.00111, "loss": 5.7238, "step": 370 }, { "epoch": 0.00371, "grad_norm": 0.6753760943719406, "learning_rate": 0.001113, "loss": 5.6912, "step": 371 }, { "epoch": 0.00372, "grad_norm": 0.6449512979931109, "learning_rate": 0.001116, "loss": 5.712, "step": 372 }, { "epoch": 0.00373, "grad_norm": 0.7672882935081258, "learning_rate": 0.001119, "loss": 5.7038, "step": 373 }, { "epoch": 0.00374, "grad_norm": 0.9442109709312425, "learning_rate": 0.001122, "loss": 5.7133, "step": 374 }, { "epoch": 0.00375, "grad_norm": 0.8527878740214185, "learning_rate": 0.0011250000000000001, "loss": 5.7144, "step": 375 }, { "epoch": 0.00376, "grad_norm": 0.6469115537095235, "learning_rate": 0.001128, "loss": 5.7041, "step": 376 }, { "epoch": 0.00377, "grad_norm": 0.6476498140164245, "learning_rate": 0.001131, "loss": 5.6975, "step": 377 }, { "epoch": 0.00378, "grad_norm": 0.6293494723973451, "learning_rate": 0.001134, "loss": 5.6937, "step": 378 }, { "epoch": 0.00379, "grad_norm": 0.5839693486494109, "learning_rate": 0.001137, "loss": 5.6765, "step": 379 }, { "epoch": 0.0038, "grad_norm": 0.6735782782443015, "learning_rate": 0.00114, "loss": 5.6974, "step": 380 }, { "epoch": 0.00381, "grad_norm": 0.8231233374417803, "learning_rate": 0.0011430000000000001, "loss": 5.6678, "step": 381 }, { "epoch": 0.00382, "grad_norm": 0.9310146874963119, "learning_rate": 0.001146, "loss": 5.6605, "step": 382 }, { "epoch": 0.00383, "grad_norm": 0.8465306298864036, "learning_rate": 0.001149, "loss": 5.6696, "step": 383 }, { "epoch": 0.00384, "grad_norm": 0.7735682894418534, "learning_rate": 0.001152, "loss": 5.6728, "step": 384 }, { "epoch": 0.00385, "grad_norm": 0.7108251568334872, "learning_rate": 0.001155, "loss": 5.66, "step": 385 }, { "epoch": 0.00386, "grad_norm": 0.6607218758999341, "learning_rate": 0.001158, "loss": 5.6625, "step": 386 }, { "epoch": 0.00387, "grad_norm": 0.8710168775036707, "learning_rate": 0.0011610000000000001, "loss": 5.6396, "step": 387 }, { "epoch": 0.00388, "grad_norm": 1.2329079335292137, "learning_rate": 0.001164, "loss": 5.6555, "step": 388 }, { "epoch": 0.00389, "grad_norm": 0.9663820489145136, "learning_rate": 0.001167, "loss": 5.6567, "step": 389 }, { "epoch": 0.0039, "grad_norm": 1.1217202209674602, "learning_rate": 0.00117, "loss": 5.644, "step": 390 }, { "epoch": 0.00391, "grad_norm": 1.151101954947749, "learning_rate": 0.001173, "loss": 5.6576, "step": 391 }, { "epoch": 0.00392, "grad_norm": 0.8694084215496658, "learning_rate": 0.001176, "loss": 5.66, "step": 392 }, { "epoch": 0.00393, "grad_norm": 0.9543644853323229, "learning_rate": 0.0011790000000000001, "loss": 5.6338, "step": 393 }, { "epoch": 0.00394, "grad_norm": 0.8780145887874276, "learning_rate": 0.001182, "loss": 5.6222, "step": 394 }, { "epoch": 0.00395, "grad_norm": 0.7527231415908622, "learning_rate": 0.001185, "loss": 5.6018, "step": 395 }, { "epoch": 0.00396, "grad_norm": 0.8774478125599554, "learning_rate": 0.001188, "loss": 5.6219, "step": 396 }, { "epoch": 0.00397, "grad_norm": 0.9175690164934137, "learning_rate": 0.001191, "loss": 5.6177, "step": 397 }, { "epoch": 0.00398, "grad_norm": 0.7008521842282015, "learning_rate": 0.0011940000000000002, "loss": 5.6167, "step": 398 }, { "epoch": 0.00399, "grad_norm": 0.7914810239489021, "learning_rate": 0.0011970000000000001, "loss": 5.6198, "step": 399 }, { "epoch": 0.004, "grad_norm": 0.8769542296732423, "learning_rate": 0.0012000000000000001, "loss": 5.6078, "step": 400 }, { "epoch": 0.00401, "grad_norm": 0.9532232290382274, "learning_rate": 0.001203, "loss": 5.5929, "step": 401 }, { "epoch": 0.00402, "grad_norm": 0.7358311359997632, "learning_rate": 0.001206, "loss": 5.5932, "step": 402 }, { "epoch": 0.00403, "grad_norm": 0.5843625246076212, "learning_rate": 0.001209, "loss": 5.5883, "step": 403 }, { "epoch": 0.00404, "grad_norm": 0.5438164721501143, "learning_rate": 0.0012120000000000002, "loss": 5.5786, "step": 404 }, { "epoch": 0.00405, "grad_norm": 0.503935889546328, "learning_rate": 0.0012150000000000002, "loss": 5.5729, "step": 405 }, { "epoch": 0.00406, "grad_norm": 0.581555817136203, "learning_rate": 0.0012180000000000001, "loss": 5.5609, "step": 406 }, { "epoch": 0.00407, "grad_norm": 0.6908000438827393, "learning_rate": 0.0012209999999999999, "loss": 5.5552, "step": 407 }, { "epoch": 0.00408, "grad_norm": 0.9017213905273226, "learning_rate": 0.001224, "loss": 5.5815, "step": 408 }, { "epoch": 0.00409, "grad_norm": 0.8946003332710046, "learning_rate": 0.001227, "loss": 5.5595, "step": 409 }, { "epoch": 0.0041, "grad_norm": 0.8697402094229448, "learning_rate": 0.00123, "loss": 5.5572, "step": 410 }, { "epoch": 0.00411, "grad_norm": 0.889615246973329, "learning_rate": 0.001233, "loss": 5.5739, "step": 411 }, { "epoch": 0.00412, "grad_norm": 1.0125641304675919, "learning_rate": 0.001236, "loss": 5.5666, "step": 412 }, { "epoch": 0.00413, "grad_norm": 0.8779023797874608, "learning_rate": 0.0012389999999999999, "loss": 5.5643, "step": 413 }, { "epoch": 0.00414, "grad_norm": 0.5508909189800907, "learning_rate": 0.001242, "loss": 5.5325, "step": 414 }, { "epoch": 0.00415, "grad_norm": 0.5649392842818297, "learning_rate": 0.001245, "loss": 5.5358, "step": 415 }, { "epoch": 0.00416, "grad_norm": 0.5511318314324237, "learning_rate": 0.001248, "loss": 5.5166, "step": 416 }, { "epoch": 0.00417, "grad_norm": 0.5999723506722826, "learning_rate": 0.001251, "loss": 5.5477, "step": 417 }, { "epoch": 0.00418, "grad_norm": 0.6470183959659118, "learning_rate": 0.001254, "loss": 5.5224, "step": 418 }, { "epoch": 0.00419, "grad_norm": 0.7320078508628892, "learning_rate": 0.0012569999999999999, "loss": 5.5107, "step": 419 }, { "epoch": 0.0042, "grad_norm": 0.8456080065384579, "learning_rate": 0.00126, "loss": 5.5293, "step": 420 }, { "epoch": 0.00421, "grad_norm": 0.9939084210290385, "learning_rate": 0.001263, "loss": 5.5281, "step": 421 }, { "epoch": 0.00422, "grad_norm": 1.1479664785212111, "learning_rate": 0.001266, "loss": 5.5264, "step": 422 }, { "epoch": 0.00423, "grad_norm": 1.0903698179641128, "learning_rate": 0.001269, "loss": 5.5217, "step": 423 }, { "epoch": 0.00424, "grad_norm": 1.0214513010937782, "learning_rate": 0.001272, "loss": 5.5248, "step": 424 }, { "epoch": 0.00425, "grad_norm": 1.034943759284661, "learning_rate": 0.001275, "loss": 5.5311, "step": 425 }, { "epoch": 0.00426, "grad_norm": 0.7614861129943166, "learning_rate": 0.001278, "loss": 5.5212, "step": 426 }, { "epoch": 0.00427, "grad_norm": 0.7672748957608901, "learning_rate": 0.001281, "loss": 5.5058, "step": 427 }, { "epoch": 0.00428, "grad_norm": 0.8987012131115926, "learning_rate": 0.001284, "loss": 5.4868, "step": 428 }, { "epoch": 0.00429, "grad_norm": 0.8700946902461411, "learning_rate": 0.001287, "loss": 5.5045, "step": 429 }, { "epoch": 0.0043, "grad_norm": 0.9264807594834162, "learning_rate": 0.00129, "loss": 5.5003, "step": 430 }, { "epoch": 0.00431, "grad_norm": 0.9714177447726252, "learning_rate": 0.001293, "loss": 5.4888, "step": 431 }, { "epoch": 0.00432, "grad_norm": 0.8362399273583541, "learning_rate": 0.001296, "loss": 5.498, "step": 432 }, { "epoch": 0.00433, "grad_norm": 0.7360288767544512, "learning_rate": 0.001299, "loss": 5.4879, "step": 433 }, { "epoch": 0.00434, "grad_norm": 0.49506730909723906, "learning_rate": 0.001302, "loss": 5.4769, "step": 434 }, { "epoch": 0.00435, "grad_norm": 0.5541770965077963, "learning_rate": 0.001305, "loss": 5.4763, "step": 435 }, { "epoch": 0.00436, "grad_norm": 0.4143567360382353, "learning_rate": 0.001308, "loss": 5.448, "step": 436 }, { "epoch": 0.00437, "grad_norm": 0.5010974412085327, "learning_rate": 0.001311, "loss": 5.468, "step": 437 }, { "epoch": 0.00438, "grad_norm": 0.4773129501944819, "learning_rate": 0.001314, "loss": 5.4471, "step": 438 }, { "epoch": 0.00439, "grad_norm": 0.4480238817975887, "learning_rate": 0.001317, "loss": 5.4733, "step": 439 }, { "epoch": 0.0044, "grad_norm": 0.5102675481146428, "learning_rate": 0.00132, "loss": 5.4437, "step": 440 }, { "epoch": 0.00441, "grad_norm": 0.7603199484002093, "learning_rate": 0.001323, "loss": 5.4453, "step": 441 }, { "epoch": 0.00442, "grad_norm": 1.1458651313729942, "learning_rate": 0.0013260000000000001, "loss": 5.4537, "step": 442 }, { "epoch": 0.00443, "grad_norm": 0.8849689457299824, "learning_rate": 0.001329, "loss": 5.4514, "step": 443 }, { "epoch": 0.00444, "grad_norm": 0.8143276002862829, "learning_rate": 0.001332, "loss": 5.4477, "step": 444 }, { "epoch": 0.00445, "grad_norm": 1.014187478097306, "learning_rate": 0.001335, "loss": 5.4432, "step": 445 }, { "epoch": 0.00446, "grad_norm": 1.0652502023957744, "learning_rate": 0.001338, "loss": 5.449, "step": 446 }, { "epoch": 0.00447, "grad_norm": 0.9667242613798618, "learning_rate": 0.001341, "loss": 5.4482, "step": 447 }, { "epoch": 0.00448, "grad_norm": 0.8366191486526497, "learning_rate": 0.0013440000000000001, "loss": 5.4279, "step": 448 }, { "epoch": 0.00449, "grad_norm": 0.7799129698781961, "learning_rate": 0.001347, "loss": 5.43, "step": 449 }, { "epoch": 0.0045, "grad_norm": 0.7012481729938261, "learning_rate": 0.00135, "loss": 5.4286, "step": 450 }, { "epoch": 0.00451, "grad_norm": 0.7248552907075281, "learning_rate": 0.001353, "loss": 5.4198, "step": 451 }, { "epoch": 0.00452, "grad_norm": 0.7044649784518014, "learning_rate": 0.001356, "loss": 5.4008, "step": 452 }, { "epoch": 0.00453, "grad_norm": 0.7484508328215168, "learning_rate": 0.001359, "loss": 5.3954, "step": 453 }, { "epoch": 0.00454, "grad_norm": 0.8395085041820342, "learning_rate": 0.0013620000000000001, "loss": 5.4145, "step": 454 }, { "epoch": 0.00455, "grad_norm": 0.8415231105291262, "learning_rate": 0.0013650000000000001, "loss": 5.4127, "step": 455 }, { "epoch": 0.00456, "grad_norm": 0.7255220872264407, "learning_rate": 0.001368, "loss": 5.4058, "step": 456 }, { "epoch": 0.00457, "grad_norm": 0.9594202170388311, "learning_rate": 0.001371, "loss": 5.4022, "step": 457 }, { "epoch": 0.00458, "grad_norm": 1.3293728411767247, "learning_rate": 0.001374, "loss": 5.432, "step": 458 }, { "epoch": 0.00459, "grad_norm": 0.6757510453727773, "learning_rate": 0.0013770000000000002, "loss": 5.3986, "step": 459 }, { "epoch": 0.0046, "grad_norm": 0.642286715349539, "learning_rate": 0.0013800000000000002, "loss": 5.3842, "step": 460 }, { "epoch": 0.00461, "grad_norm": 0.718506682267116, "learning_rate": 0.0013830000000000001, "loss": 5.3809, "step": 461 }, { "epoch": 0.00462, "grad_norm": 0.8721301960351167, "learning_rate": 0.001386, "loss": 5.3962, "step": 462 }, { "epoch": 0.00463, "grad_norm": 1.0129389296679803, "learning_rate": 0.001389, "loss": 5.378, "step": 463 }, { "epoch": 0.00464, "grad_norm": 1.0088046974816025, "learning_rate": 0.001392, "loss": 5.4001, "step": 464 }, { "epoch": 0.00465, "grad_norm": 1.0306323020829953, "learning_rate": 0.0013950000000000002, "loss": 5.3734, "step": 465 }, { "epoch": 0.00466, "grad_norm": 0.8720845065856513, "learning_rate": 0.0013980000000000002, "loss": 5.3874, "step": 466 }, { "epoch": 0.00467, "grad_norm": 0.8887797743014489, "learning_rate": 0.0014010000000000001, "loss": 5.3768, "step": 467 }, { "epoch": 0.00468, "grad_norm": 0.9259005236029753, "learning_rate": 0.001404, "loss": 5.3729, "step": 468 }, { "epoch": 0.00469, "grad_norm": 0.8792163356951412, "learning_rate": 0.001407, "loss": 5.3748, "step": 469 }, { "epoch": 0.0047, "grad_norm": 0.9535575619633974, "learning_rate": 0.00141, "loss": 5.3888, "step": 470 }, { "epoch": 0.00471, "grad_norm": 0.8943151432047335, "learning_rate": 0.001413, "loss": 5.3656, "step": 471 }, { "epoch": 0.00472, "grad_norm": 0.9438166210457617, "learning_rate": 0.001416, "loss": 5.3818, "step": 472 }, { "epoch": 0.00473, "grad_norm": 0.955062498554473, "learning_rate": 0.001419, "loss": 5.3925, "step": 473 }, { "epoch": 0.00474, "grad_norm": 0.8744713005253077, "learning_rate": 0.0014219999999999999, "loss": 5.3698, "step": 474 }, { "epoch": 0.00475, "grad_norm": 0.9433538313620896, "learning_rate": 0.001425, "loss": 5.3673, "step": 475 }, { "epoch": 0.00476, "grad_norm": 1.003831968151295, "learning_rate": 0.001428, "loss": 5.4007, "step": 476 }, { "epoch": 0.00477, "grad_norm": 1.1933630822112062, "learning_rate": 0.001431, "loss": 5.3632, "step": 477 }, { "epoch": 0.00478, "grad_norm": 0.862117250818067, "learning_rate": 0.001434, "loss": 5.3781, "step": 478 }, { "epoch": 0.00479, "grad_norm": 0.8127073460113728, "learning_rate": 0.001437, "loss": 5.3546, "step": 479 }, { "epoch": 0.0048, "grad_norm": 0.7646068732813385, "learning_rate": 0.0014399999999999999, "loss": 5.3463, "step": 480 }, { "epoch": 0.00481, "grad_norm": 0.7199260331938973, "learning_rate": 0.001443, "loss": 5.3466, "step": 481 }, { "epoch": 0.00482, "grad_norm": 0.6802449871429381, "learning_rate": 0.001446, "loss": 5.3202, "step": 482 }, { "epoch": 0.00483, "grad_norm": 0.6495861656689597, "learning_rate": 0.001449, "loss": 5.3291, "step": 483 }, { "epoch": 0.00484, "grad_norm": 0.644169893492728, "learning_rate": 0.001452, "loss": 5.3277, "step": 484 }, { "epoch": 0.00485, "grad_norm": 0.7933051477323634, "learning_rate": 0.001455, "loss": 5.3368, "step": 485 }, { "epoch": 0.00486, "grad_norm": 0.8700253631116802, "learning_rate": 0.001458, "loss": 5.3096, "step": 486 }, { "epoch": 0.00487, "grad_norm": 0.7948159001624678, "learning_rate": 0.001461, "loss": 5.3257, "step": 487 }, { "epoch": 0.00488, "grad_norm": 0.5603690187831122, "learning_rate": 0.001464, "loss": 5.294, "step": 488 }, { "epoch": 0.00489, "grad_norm": 0.6185202940839952, "learning_rate": 0.001467, "loss": 5.287, "step": 489 }, { "epoch": 0.0049, "grad_norm": 0.5616341711444851, "learning_rate": 0.00147, "loss": 5.2988, "step": 490 }, { "epoch": 0.00491, "grad_norm": 0.49889379281960217, "learning_rate": 0.001473, "loss": 5.2927, "step": 491 }, { "epoch": 0.00492, "grad_norm": 0.5492592192725451, "learning_rate": 0.001476, "loss": 5.2915, "step": 492 }, { "epoch": 0.00493, "grad_norm": 0.5409556054523526, "learning_rate": 0.001479, "loss": 5.2842, "step": 493 }, { "epoch": 0.00494, "grad_norm": 0.5986439471872183, "learning_rate": 0.001482, "loss": 5.29, "step": 494 }, { "epoch": 0.00495, "grad_norm": 0.5846070904075265, "learning_rate": 0.001485, "loss": 5.2961, "step": 495 }, { "epoch": 0.00496, "grad_norm": 0.5348806476926008, "learning_rate": 0.001488, "loss": 5.2784, "step": 496 }, { "epoch": 0.00497, "grad_norm": 0.4761317629398082, "learning_rate": 0.001491, "loss": 5.2561, "step": 497 }, { "epoch": 0.00498, "grad_norm": 0.5515846127270029, "learning_rate": 0.001494, "loss": 5.2847, "step": 498 }, { "epoch": 0.00499, "grad_norm": 0.5172721915151642, "learning_rate": 0.001497, "loss": 5.2576, "step": 499 }, { "epoch": 0.005, "grad_norm": 0.5199821616229346, "learning_rate": 0.0015, "loss": 5.2619, "step": 500 }, { "epoch": 0.00501, "grad_norm": 0.6001320721934307, "learning_rate": 0.001503, "loss": 5.2644, "step": 501 }, { "epoch": 0.00502, "grad_norm": 0.6030864866826301, "learning_rate": 0.001506, "loss": 5.2703, "step": 502 }, { "epoch": 0.00503, "grad_norm": 0.5191760166109092, "learning_rate": 0.0015090000000000001, "loss": 5.2717, "step": 503 }, { "epoch": 0.00504, "grad_norm": 0.5783228415677168, "learning_rate": 0.001512, "loss": 5.254, "step": 504 }, { "epoch": 0.00505, "grad_norm": 0.6689655395334532, "learning_rate": 0.001515, "loss": 5.2492, "step": 505 }, { "epoch": 0.00506, "grad_norm": 0.6647691433281518, "learning_rate": 0.001518, "loss": 5.2491, "step": 506 }, { "epoch": 0.00507, "grad_norm": 0.8889512182842091, "learning_rate": 0.001521, "loss": 5.2524, "step": 507 }, { "epoch": 0.00508, "grad_norm": 0.890408095664645, "learning_rate": 0.001524, "loss": 5.258, "step": 508 }, { "epoch": 0.00509, "grad_norm": 0.7970662574337232, "learning_rate": 0.0015270000000000001, "loss": 5.2359, "step": 509 }, { "epoch": 0.0051, "grad_norm": 1.2307905246856345, "learning_rate": 0.0015300000000000001, "loss": 5.2751, "step": 510 }, { "epoch": 0.00511, "grad_norm": 0.7669806359491947, "learning_rate": 0.001533, "loss": 5.2366, "step": 511 }, { "epoch": 0.00512, "grad_norm": 0.791675505211477, "learning_rate": 0.001536, "loss": 5.2387, "step": 512 }, { "epoch": 0.00513, "grad_norm": 0.8098770401540116, "learning_rate": 0.001539, "loss": 5.2533, "step": 513 }, { "epoch": 0.00514, "grad_norm": 0.9791401462579269, "learning_rate": 0.001542, "loss": 5.2434, "step": 514 }, { "epoch": 0.00515, "grad_norm": 1.2171542206910853, "learning_rate": 0.0015450000000000001, "loss": 5.2622, "step": 515 }, { "epoch": 0.00516, "grad_norm": 1.023390089785043, "learning_rate": 0.0015480000000000001, "loss": 5.2555, "step": 516 }, { "epoch": 0.00517, "grad_norm": 1.1466434595757542, "learning_rate": 0.001551, "loss": 5.2575, "step": 517 }, { "epoch": 0.00518, "grad_norm": 0.8131985942906488, "learning_rate": 0.001554, "loss": 5.2332, "step": 518 }, { "epoch": 0.00519, "grad_norm": 1.0287267638756976, "learning_rate": 0.001557, "loss": 5.2441, "step": 519 }, { "epoch": 0.0052, "grad_norm": 1.2674528356226307, "learning_rate": 0.0015600000000000002, "loss": 5.2693, "step": 520 }, { "epoch": 0.00521, "grad_norm": 1.0294461133705461, "learning_rate": 0.0015630000000000002, "loss": 5.2367, "step": 521 }, { "epoch": 0.00522, "grad_norm": 0.9591121151827766, "learning_rate": 0.0015660000000000001, "loss": 5.2358, "step": 522 }, { "epoch": 0.00523, "grad_norm": 1.1408114706321955, "learning_rate": 0.001569, "loss": 5.2593, "step": 523 }, { "epoch": 0.00524, "grad_norm": 1.0111329673862832, "learning_rate": 0.001572, "loss": 5.2618, "step": 524 }, { "epoch": 0.00525, "grad_norm": 0.9661016074260314, "learning_rate": 0.001575, "loss": 5.2564, "step": 525 }, { "epoch": 0.00526, "grad_norm": 0.843793078461792, "learning_rate": 0.0015780000000000002, "loss": 5.2213, "step": 526 }, { "epoch": 0.00527, "grad_norm": 0.9400473384736733, "learning_rate": 0.0015810000000000002, "loss": 5.2433, "step": 527 }, { "epoch": 0.00528, "grad_norm": 1.243261864721202, "learning_rate": 0.0015840000000000001, "loss": 5.2328, "step": 528 }, { "epoch": 0.00529, "grad_norm": 0.8157117130861912, "learning_rate": 0.001587, "loss": 5.2295, "step": 529 }, { "epoch": 0.0053, "grad_norm": 0.7227236931041067, "learning_rate": 0.00159, "loss": 5.2376, "step": 530 }, { "epoch": 0.00531, "grad_norm": 0.7295638878978784, "learning_rate": 0.001593, "loss": 5.2183, "step": 531 }, { "epoch": 0.00532, "grad_norm": 1.0309756013631586, "learning_rate": 0.0015960000000000002, "loss": 5.2317, "step": 532 }, { "epoch": 0.00533, "grad_norm": 1.341295235532221, "learning_rate": 0.0015990000000000002, "loss": 5.2489, "step": 533 }, { "epoch": 0.00534, "grad_norm": 0.5418763696970031, "learning_rate": 0.0016020000000000001, "loss": 5.2102, "step": 534 }, { "epoch": 0.00535, "grad_norm": 0.6280107905402296, "learning_rate": 0.001605, "loss": 5.2186, "step": 535 }, { "epoch": 0.00536, "grad_norm": 0.6018626261301364, "learning_rate": 0.001608, "loss": 5.2146, "step": 536 }, { "epoch": 0.00537, "grad_norm": 0.7029103701631062, "learning_rate": 0.0016110000000000002, "loss": 5.2095, "step": 537 }, { "epoch": 0.00538, "grad_norm": 0.6704768823630471, "learning_rate": 0.0016140000000000002, "loss": 5.1871, "step": 538 }, { "epoch": 0.00539, "grad_norm": 0.5297125620314311, "learning_rate": 0.0016170000000000002, "loss": 5.1804, "step": 539 }, { "epoch": 0.0054, "grad_norm": 0.5293869004936285, "learning_rate": 0.0016200000000000001, "loss": 5.1861, "step": 540 }, { "epoch": 0.00541, "grad_norm": 0.6423736030714277, "learning_rate": 0.001623, "loss": 5.1794, "step": 541 }, { "epoch": 0.00542, "grad_norm": 0.7347991006328795, "learning_rate": 0.001626, "loss": 5.2023, "step": 542 }, { "epoch": 0.00543, "grad_norm": 0.8573526470960469, "learning_rate": 0.0016290000000000002, "loss": 5.1766, "step": 543 }, { "epoch": 0.00544, "grad_norm": 0.8407261490154571, "learning_rate": 0.0016320000000000002, "loss": 5.1927, "step": 544 }, { "epoch": 0.00545, "grad_norm": 0.880652021609057, "learning_rate": 0.0016350000000000002, "loss": 5.2041, "step": 545 }, { "epoch": 0.00546, "grad_norm": 0.8052038715443169, "learning_rate": 0.0016380000000000001, "loss": 5.1733, "step": 546 }, { "epoch": 0.00547, "grad_norm": 1.039008513503308, "learning_rate": 0.001641, "loss": 5.1801, "step": 547 }, { "epoch": 0.00548, "grad_norm": 0.9852213695140999, "learning_rate": 0.001644, "loss": 5.1858, "step": 548 }, { "epoch": 0.00549, "grad_norm": 0.8484168198653431, "learning_rate": 0.0016470000000000002, "loss": 5.1761, "step": 549 }, { "epoch": 0.0055, "grad_norm": 0.7806834244324883, "learning_rate": 0.0016500000000000002, "loss": 5.1846, "step": 550 }, { "epoch": 0.00551, "grad_norm": 0.84417611786463, "learning_rate": 0.0016530000000000002, "loss": 5.1857, "step": 551 }, { "epoch": 0.00552, "grad_norm": 0.7190100339826029, "learning_rate": 0.0016560000000000001, "loss": 5.1737, "step": 552 }, { "epoch": 0.00553, "grad_norm": 0.8870015639864556, "learning_rate": 0.001659, "loss": 5.1694, "step": 553 }, { "epoch": 0.00554, "grad_norm": 0.9374486447018174, "learning_rate": 0.0016620000000000003, "loss": 5.1739, "step": 554 }, { "epoch": 0.00555, "grad_norm": 0.8845970130105012, "learning_rate": 0.0016650000000000002, "loss": 5.1752, "step": 555 }, { "epoch": 0.00556, "grad_norm": 1.1195859643592967, "learning_rate": 0.0016680000000000002, "loss": 5.175, "step": 556 }, { "epoch": 0.00557, "grad_norm": 0.7291605041068904, "learning_rate": 0.0016710000000000002, "loss": 5.1436, "step": 557 }, { "epoch": 0.00558, "grad_norm": 0.5874484008921741, "learning_rate": 0.0016740000000000001, "loss": 5.1692, "step": 558 }, { "epoch": 0.00559, "grad_norm": 0.5085539730956898, "learning_rate": 0.001677, "loss": 5.1457, "step": 559 }, { "epoch": 0.0056, "grad_norm": 0.5291508832604556, "learning_rate": 0.0016800000000000003, "loss": 5.1235, "step": 560 }, { "epoch": 0.00561, "grad_norm": 0.5189974837553842, "learning_rate": 0.0016830000000000003, "loss": 5.1467, "step": 561 }, { "epoch": 0.00562, "grad_norm": 0.4777820071853692, "learning_rate": 0.0016860000000000002, "loss": 5.1356, "step": 562 }, { "epoch": 0.00563, "grad_norm": 0.504889791549622, "learning_rate": 0.001689, "loss": 5.1357, "step": 563 }, { "epoch": 0.00564, "grad_norm": 0.5778473577877967, "learning_rate": 0.001692, "loss": 5.1198, "step": 564 }, { "epoch": 0.00565, "grad_norm": 0.7633412851296998, "learning_rate": 0.001695, "loss": 5.1252, "step": 565 }, { "epoch": 0.00566, "grad_norm": 0.8414122298345739, "learning_rate": 0.0016979999999999999, "loss": 5.1285, "step": 566 }, { "epoch": 0.00567, "grad_norm": 0.6675623145494364, "learning_rate": 0.0017009999999999998, "loss": 5.1143, "step": 567 }, { "epoch": 0.00568, "grad_norm": 0.7239302836610552, "learning_rate": 0.0017039999999999998, "loss": 5.1279, "step": 568 }, { "epoch": 0.00569, "grad_norm": 0.811387127406939, "learning_rate": 0.001707, "loss": 5.1137, "step": 569 }, { "epoch": 0.0057, "grad_norm": 0.7754716722905325, "learning_rate": 0.00171, "loss": 5.1282, "step": 570 }, { "epoch": 0.00571, "grad_norm": 0.8562307066263974, "learning_rate": 0.001713, "loss": 5.1309, "step": 571 }, { "epoch": 0.00572, "grad_norm": 1.0627285671250943, "learning_rate": 0.0017159999999999999, "loss": 5.1191, "step": 572 }, { "epoch": 0.00573, "grad_norm": 0.8645205310284692, "learning_rate": 0.0017189999999999998, "loss": 5.1232, "step": 573 }, { "epoch": 0.00574, "grad_norm": 0.8122273817042681, "learning_rate": 0.001722, "loss": 5.1281, "step": 574 }, { "epoch": 0.00575, "grad_norm": 0.9590435469565912, "learning_rate": 0.001725, "loss": 5.1128, "step": 575 }, { "epoch": 0.00576, "grad_norm": 1.251082350842663, "learning_rate": 0.001728, "loss": 5.1225, "step": 576 }, { "epoch": 0.00577, "grad_norm": 0.7445616334136517, "learning_rate": 0.001731, "loss": 5.13, "step": 577 }, { "epoch": 0.00578, "grad_norm": 0.7845996621971332, "learning_rate": 0.0017339999999999999, "loss": 5.1117, "step": 578 }, { "epoch": 0.00579, "grad_norm": 0.886365355656363, "learning_rate": 0.0017369999999999998, "loss": 5.1037, "step": 579 }, { "epoch": 0.0058, "grad_norm": 0.8821301019883174, "learning_rate": 0.00174, "loss": 5.1259, "step": 580 }, { "epoch": 0.00581, "grad_norm": 0.7745940006716965, "learning_rate": 0.001743, "loss": 5.1161, "step": 581 }, { "epoch": 0.00582, "grad_norm": 0.8386253624392527, "learning_rate": 0.001746, "loss": 5.0979, "step": 582 }, { "epoch": 0.00583, "grad_norm": 0.8978101980978702, "learning_rate": 0.001749, "loss": 5.1236, "step": 583 }, { "epoch": 0.00584, "grad_norm": 0.9625436120988844, "learning_rate": 0.0017519999999999999, "loss": 5.1221, "step": 584 }, { "epoch": 0.00585, "grad_norm": 0.9007332505147065, "learning_rate": 0.0017549999999999998, "loss": 5.1079, "step": 585 }, { "epoch": 0.00586, "grad_norm": 0.8448788460460928, "learning_rate": 0.001758, "loss": 5.1127, "step": 586 }, { "epoch": 0.00587, "grad_norm": 0.8070040540273229, "learning_rate": 0.001761, "loss": 5.0903, "step": 587 }, { "epoch": 0.00588, "grad_norm": 0.8623574245532327, "learning_rate": 0.001764, "loss": 5.1032, "step": 588 }, { "epoch": 0.00589, "grad_norm": 0.8788484382751248, "learning_rate": 0.001767, "loss": 5.1023, "step": 589 }, { "epoch": 0.0059, "grad_norm": 0.88888081120713, "learning_rate": 0.0017699999999999999, "loss": 5.0951, "step": 590 }, { "epoch": 0.00591, "grad_norm": 0.8051815272157371, "learning_rate": 0.001773, "loss": 5.0807, "step": 591 }, { "epoch": 0.00592, "grad_norm": 0.9664766552178456, "learning_rate": 0.001776, "loss": 5.1164, "step": 592 }, { "epoch": 0.00593, "grad_norm": 0.933634159550007, "learning_rate": 0.001779, "loss": 5.1154, "step": 593 }, { "epoch": 0.00594, "grad_norm": 0.8388450047288221, "learning_rate": 0.001782, "loss": 5.1078, "step": 594 }, { "epoch": 0.00595, "grad_norm": 0.7870593334930018, "learning_rate": 0.001785, "loss": 5.0959, "step": 595 }, { "epoch": 0.00596, "grad_norm": 0.7663924438117601, "learning_rate": 0.0017879999999999999, "loss": 5.0923, "step": 596 }, { "epoch": 0.00597, "grad_norm": 0.9656629603017429, "learning_rate": 0.001791, "loss": 5.0773, "step": 597 }, { "epoch": 0.00598, "grad_norm": 0.9080147636693349, "learning_rate": 0.001794, "loss": 5.0893, "step": 598 }, { "epoch": 0.00599, "grad_norm": 0.732143169922112, "learning_rate": 0.001797, "loss": 5.0865, "step": 599 }, { "epoch": 0.006, "grad_norm": 0.5760572673229344, "learning_rate": 0.0018, "loss": 5.074, "step": 600 }, { "epoch": 0.00601, "grad_norm": 0.6602199941282026, "learning_rate": 0.001803, "loss": 5.0693, "step": 601 }, { "epoch": 0.00602, "grad_norm": 0.7069624755412358, "learning_rate": 0.0018059999999999999, "loss": 5.057, "step": 602 }, { "epoch": 0.00603, "grad_norm": 0.7134743776208471, "learning_rate": 0.001809, "loss": 5.0779, "step": 603 }, { "epoch": 0.00604, "grad_norm": 0.7455726311972387, "learning_rate": 0.001812, "loss": 5.0546, "step": 604 }, { "epoch": 0.00605, "grad_norm": 0.9135484051416195, "learning_rate": 0.001815, "loss": 5.0709, "step": 605 }, { "epoch": 0.00606, "grad_norm": 1.0710981560183195, "learning_rate": 0.001818, "loss": 5.0777, "step": 606 }, { "epoch": 0.00607, "grad_norm": 0.8059186248325771, "learning_rate": 0.001821, "loss": 5.0728, "step": 607 }, { "epoch": 0.00608, "grad_norm": 0.7260370848471782, "learning_rate": 0.001824, "loss": 5.0553, "step": 608 }, { "epoch": 0.00609, "grad_norm": 0.6501751004175685, "learning_rate": 0.001827, "loss": 5.0597, "step": 609 }, { "epoch": 0.0061, "grad_norm": 0.6148399666740725, "learning_rate": 0.00183, "loss": 5.0329, "step": 610 }, { "epoch": 0.00611, "grad_norm": 0.577060811762949, "learning_rate": 0.001833, "loss": 5.0322, "step": 611 }, { "epoch": 0.00612, "grad_norm": 0.6173055277138813, "learning_rate": 0.001836, "loss": 5.0402, "step": 612 }, { "epoch": 0.00613, "grad_norm": 0.615549230626737, "learning_rate": 0.001839, "loss": 5.0293, "step": 613 }, { "epoch": 0.00614, "grad_norm": 0.5145573627594662, "learning_rate": 0.001842, "loss": 5.0484, "step": 614 }, { "epoch": 0.00615, "grad_norm": 0.4622291494997757, "learning_rate": 0.001845, "loss": 5.0334, "step": 615 }, { "epoch": 0.00616, "grad_norm": 0.4156130098379491, "learning_rate": 0.001848, "loss": 5.0166, "step": 616 }, { "epoch": 0.00617, "grad_norm": 0.40499819493514183, "learning_rate": 0.001851, "loss": 5.0285, "step": 617 }, { "epoch": 0.00618, "grad_norm": 0.3547440364864465, "learning_rate": 0.001854, "loss": 5.0137, "step": 618 }, { "epoch": 0.00619, "grad_norm": 0.36790407868650016, "learning_rate": 0.001857, "loss": 5.0167, "step": 619 }, { "epoch": 0.0062, "grad_norm": 0.3873903126111541, "learning_rate": 0.00186, "loss": 5.0022, "step": 620 }, { "epoch": 0.00621, "grad_norm": 0.44545310636639085, "learning_rate": 0.001863, "loss": 5.0066, "step": 621 }, { "epoch": 0.00622, "grad_norm": 0.5872404207208428, "learning_rate": 0.001866, "loss": 5.0041, "step": 622 }, { "epoch": 0.00623, "grad_norm": 0.8188217453795142, "learning_rate": 0.001869, "loss": 4.9995, "step": 623 }, { "epoch": 0.00624, "grad_norm": 0.8582006227624419, "learning_rate": 0.001872, "loss": 5.0174, "step": 624 }, { "epoch": 0.00625, "grad_norm": 0.6624984450187602, "learning_rate": 0.001875, "loss": 5.0237, "step": 625 }, { "epoch": 0.00626, "grad_norm": 0.871266869538683, "learning_rate": 0.0018780000000000001, "loss": 5.0057, "step": 626 }, { "epoch": 0.00627, "grad_norm": 1.0014549660549559, "learning_rate": 0.001881, "loss": 5.0166, "step": 627 }, { "epoch": 0.00628, "grad_norm": 1.1152158811911206, "learning_rate": 0.001884, "loss": 5.0595, "step": 628 }, { "epoch": 0.00629, "grad_norm": 1.140057506672223, "learning_rate": 0.001887, "loss": 5.0271, "step": 629 }, { "epoch": 0.0063, "grad_norm": 1.1555923456286916, "learning_rate": 0.00189, "loss": 5.0294, "step": 630 }, { "epoch": 0.00631, "grad_norm": 1.0725579163260357, "learning_rate": 0.0018930000000000002, "loss": 5.0252, "step": 631 }, { "epoch": 0.00632, "grad_norm": 1.0622262545784495, "learning_rate": 0.0018960000000000001, "loss": 5.0506, "step": 632 }, { "epoch": 0.00633, "grad_norm": 1.1849469330653912, "learning_rate": 0.001899, "loss": 5.0591, "step": 633 }, { "epoch": 0.00634, "grad_norm": 1.186181168414293, "learning_rate": 0.001902, "loss": 5.0537, "step": 634 }, { "epoch": 0.00635, "grad_norm": 1.011082200052449, "learning_rate": 0.001905, "loss": 5.0328, "step": 635 }, { "epoch": 0.00636, "grad_norm": 1.1328438325981298, "learning_rate": 0.001908, "loss": 5.0454, "step": 636 }, { "epoch": 0.00637, "grad_norm": 0.9828095701538521, "learning_rate": 0.0019110000000000002, "loss": 5.0558, "step": 637 }, { "epoch": 0.00638, "grad_norm": 1.1150894310825157, "learning_rate": 0.0019140000000000001, "loss": 5.0551, "step": 638 }, { "epoch": 0.00639, "grad_norm": 0.9702746164579431, "learning_rate": 0.001917, "loss": 5.026, "step": 639 }, { "epoch": 0.0064, "grad_norm": 0.9463541889835033, "learning_rate": 0.00192, "loss": 5.0338, "step": 640 }, { "epoch": 0.00641, "grad_norm": 0.9922736299635262, "learning_rate": 0.001923, "loss": 5.0439, "step": 641 }, { "epoch": 0.00642, "grad_norm": 1.2833852757783575, "learning_rate": 0.001926, "loss": 5.0503, "step": 642 }, { "epoch": 0.00643, "grad_norm": 0.955455086522239, "learning_rate": 0.0019290000000000002, "loss": 5.0565, "step": 643 }, { "epoch": 0.00644, "grad_norm": 0.9001823386797868, "learning_rate": 0.0019320000000000001, "loss": 5.0314, "step": 644 }, { "epoch": 0.00645, "grad_norm": 0.7364586500805348, "learning_rate": 0.001935, "loss": 5.0304, "step": 645 }, { "epoch": 0.00646, "grad_norm": 0.6493450233680076, "learning_rate": 0.001938, "loss": 5.0293, "step": 646 }, { "epoch": 0.00647, "grad_norm": 0.6779095336835834, "learning_rate": 0.001941, "loss": 5.0401, "step": 647 }, { "epoch": 0.00648, "grad_norm": 0.6955209345108658, "learning_rate": 0.0019440000000000002, "loss": 5.0218, "step": 648 }, { "epoch": 0.00649, "grad_norm": 0.8304779062701974, "learning_rate": 0.0019470000000000002, "loss": 5.0072, "step": 649 }, { "epoch": 0.0065, "grad_norm": 0.820697007677232, "learning_rate": 0.0019500000000000001, "loss": 5.0315, "step": 650 }, { "epoch": 0.00651, "grad_norm": 0.7961110649030418, "learning_rate": 0.001953, "loss": 5.0121, "step": 651 }, { "epoch": 0.00652, "grad_norm": 0.8582193993973842, "learning_rate": 0.0019560000000000003, "loss": 5.009, "step": 652 }, { "epoch": 0.00653, "grad_norm": 0.8560401501333912, "learning_rate": 0.0019590000000000002, "loss": 5.0017, "step": 653 }, { "epoch": 0.00654, "grad_norm": 0.7831167086219499, "learning_rate": 0.001962, "loss": 4.9889, "step": 654 }, { "epoch": 0.00655, "grad_norm": 0.664585881803506, "learning_rate": 0.001965, "loss": 4.9976, "step": 655 }, { "epoch": 0.00656, "grad_norm": 0.6054298409810266, "learning_rate": 0.001968, "loss": 4.9857, "step": 656 }, { "epoch": 0.00657, "grad_norm": 0.557461380803133, "learning_rate": 0.001971, "loss": 4.9741, "step": 657 }, { "epoch": 0.00658, "grad_norm": 0.5329806623877416, "learning_rate": 0.001974, "loss": 4.9967, "step": 658 }, { "epoch": 0.00659, "grad_norm": 0.4733014627721161, "learning_rate": 0.001977, "loss": 4.9736, "step": 659 }, { "epoch": 0.0066, "grad_norm": 0.499559117562219, "learning_rate": 0.00198, "loss": 4.966, "step": 660 }, { "epoch": 0.00661, "grad_norm": 0.6099291254035663, "learning_rate": 0.001983, "loss": 4.9526, "step": 661 }, { "epoch": 0.00662, "grad_norm": 0.541377627450273, "learning_rate": 0.0019860000000000004, "loss": 4.9481, "step": 662 }, { "epoch": 0.00663, "grad_norm": 0.6125285838075794, "learning_rate": 0.0019890000000000003, "loss": 4.9658, "step": 663 }, { "epoch": 0.00664, "grad_norm": 0.7753721713708359, "learning_rate": 0.0019920000000000003, "loss": 4.9533, "step": 664 }, { "epoch": 0.00665, "grad_norm": 0.8561210143900951, "learning_rate": 0.0019950000000000002, "loss": 4.9333, "step": 665 }, { "epoch": 0.00666, "grad_norm": 0.7787737897544907, "learning_rate": 0.001998, "loss": 4.9543, "step": 666 }, { "epoch": 0.00667, "grad_norm": 0.693798446349636, "learning_rate": 0.002001, "loss": 4.9492, "step": 667 }, { "epoch": 0.00668, "grad_norm": 0.7363706742156494, "learning_rate": 0.002004, "loss": 4.9723, "step": 668 }, { "epoch": 0.00669, "grad_norm": 0.6301530155716453, "learning_rate": 0.002007, "loss": 4.9625, "step": 669 }, { "epoch": 0.0067, "grad_norm": 0.6196899316310271, "learning_rate": 0.00201, "loss": 4.9551, "step": 670 }, { "epoch": 0.00671, "grad_norm": 0.694548381041035, "learning_rate": 0.002013, "loss": 4.933, "step": 671 }, { "epoch": 0.00672, "grad_norm": 0.7621465658548545, "learning_rate": 0.002016, "loss": 4.9421, "step": 672 }, { "epoch": 0.00673, "grad_norm": 0.8075757173752571, "learning_rate": 0.002019, "loss": 4.9422, "step": 673 }, { "epoch": 0.00674, "grad_norm": 0.7418881135809064, "learning_rate": 0.0020220000000000004, "loss": 4.9464, "step": 674 }, { "epoch": 0.00675, "grad_norm": 0.6370116878967934, "learning_rate": 0.0020250000000000003, "loss": 4.9429, "step": 675 }, { "epoch": 0.00676, "grad_norm": 0.5146902497615841, "learning_rate": 0.0020280000000000003, "loss": 4.9317, "step": 676 }, { "epoch": 0.00677, "grad_norm": 0.5957426329332375, "learning_rate": 0.0020310000000000003, "loss": 4.9389, "step": 677 }, { "epoch": 0.00678, "grad_norm": 0.6059714528540561, "learning_rate": 0.0020340000000000002, "loss": 4.9473, "step": 678 }, { "epoch": 0.00679, "grad_norm": 0.6120329195118005, "learning_rate": 0.002037, "loss": 4.926, "step": 679 }, { "epoch": 0.0068, "grad_norm": 0.67013121421428, "learning_rate": 0.00204, "loss": 4.931, "step": 680 }, { "epoch": 0.00681, "grad_norm": 0.6629355745717784, "learning_rate": 0.002043, "loss": 4.9257, "step": 681 }, { "epoch": 0.00682, "grad_norm": 0.6203412878179155, "learning_rate": 0.002046, "loss": 4.9288, "step": 682 }, { "epoch": 0.00683, "grad_norm": 0.6526907123945536, "learning_rate": 0.002049, "loss": 4.911, "step": 683 }, { "epoch": 0.00684, "grad_norm": 0.7763655080494695, "learning_rate": 0.002052, "loss": 4.927, "step": 684 }, { "epoch": 0.00685, "grad_norm": 0.8186051850168985, "learning_rate": 0.0020550000000000004, "loss": 4.9269, "step": 685 }, { "epoch": 0.00686, "grad_norm": 0.7902721695493929, "learning_rate": 0.0020580000000000004, "loss": 4.9185, "step": 686 }, { "epoch": 0.00687, "grad_norm": 0.7114814619711178, "learning_rate": 0.0020610000000000003, "loss": 4.9075, "step": 687 }, { "epoch": 0.00688, "grad_norm": 0.6920604052885636, "learning_rate": 0.002064, "loss": 4.9133, "step": 688 }, { "epoch": 0.00689, "grad_norm": 0.757153733582835, "learning_rate": 0.002067, "loss": 4.9107, "step": 689 }, { "epoch": 0.0069, "grad_norm": 0.9661677354997352, "learning_rate": 0.00207, "loss": 4.9206, "step": 690 }, { "epoch": 0.00691, "grad_norm": 1.0541003703880292, "learning_rate": 0.0020729999999999998, "loss": 4.9045, "step": 691 }, { "epoch": 0.00692, "grad_norm": 0.8794573104042516, "learning_rate": 0.0020759999999999997, "loss": 4.923, "step": 692 }, { "epoch": 0.00693, "grad_norm": 0.9176864844747288, "learning_rate": 0.0020789999999999997, "loss": 4.9239, "step": 693 }, { "epoch": 0.00694, "grad_norm": 0.9209969930842488, "learning_rate": 0.002082, "loss": 4.9074, "step": 694 }, { "epoch": 0.00695, "grad_norm": 1.1695252405572139, "learning_rate": 0.002085, "loss": 4.9261, "step": 695 }, { "epoch": 0.00696, "grad_norm": 0.9145612208618356, "learning_rate": 0.002088, "loss": 4.919, "step": 696 }, { "epoch": 0.00697, "grad_norm": 1.1036978009332064, "learning_rate": 0.002091, "loss": 4.9333, "step": 697 }, { "epoch": 0.00698, "grad_norm": 1.0287653208524956, "learning_rate": 0.002094, "loss": 4.9255, "step": 698 }, { "epoch": 0.00699, "grad_norm": 0.9310985158925286, "learning_rate": 0.002097, "loss": 4.9194, "step": 699 }, { "epoch": 0.007, "grad_norm": 0.917395048546633, "learning_rate": 0.0021, "loss": 4.9296, "step": 700 }, { "epoch": 0.00701, "grad_norm": 0.834332437065761, "learning_rate": 0.002103, "loss": 4.9089, "step": 701 }, { "epoch": 0.00702, "grad_norm": 0.8281083071877697, "learning_rate": 0.002106, "loss": 4.9094, "step": 702 }, { "epoch": 0.00703, "grad_norm": 0.8445695596642441, "learning_rate": 0.0021089999999999998, "loss": 4.8989, "step": 703 }, { "epoch": 0.00704, "grad_norm": 0.7492253201313137, "learning_rate": 0.0021119999999999997, "loss": 4.8989, "step": 704 }, { "epoch": 0.00705, "grad_norm": 0.735879432986631, "learning_rate": 0.002115, "loss": 4.9035, "step": 705 }, { "epoch": 0.00706, "grad_norm": 0.6973072916647776, "learning_rate": 0.002118, "loss": 4.9057, "step": 706 }, { "epoch": 0.00707, "grad_norm": 0.629484514979941, "learning_rate": 0.002121, "loss": 4.8808, "step": 707 }, { "epoch": 0.00708, "grad_norm": 0.5238472656698838, "learning_rate": 0.002124, "loss": 4.8573, "step": 708 }, { "epoch": 0.00709, "grad_norm": 0.5712907279215416, "learning_rate": 0.002127, "loss": 4.889, "step": 709 }, { "epoch": 0.0071, "grad_norm": 0.7552376956739056, "learning_rate": 0.00213, "loss": 4.8786, "step": 710 }, { "epoch": 0.00711, "grad_norm": 0.9238310631482564, "learning_rate": 0.002133, "loss": 4.8637, "step": 711 }, { "epoch": 0.00712, "grad_norm": 0.9459038147293442, "learning_rate": 0.002136, "loss": 4.8869, "step": 712 }, { "epoch": 0.00713, "grad_norm": 0.960196647997872, "learning_rate": 0.002139, "loss": 4.9055, "step": 713 }, { "epoch": 0.00714, "grad_norm": 0.9883902427962058, "learning_rate": 0.002142, "loss": 4.9036, "step": 714 }, { "epoch": 0.00715, "grad_norm": 1.091153020847237, "learning_rate": 0.0021449999999999998, "loss": 4.8821, "step": 715 }, { "epoch": 0.00716, "grad_norm": 0.9092241477742985, "learning_rate": 0.002148, "loss": 4.8679, "step": 716 }, { "epoch": 0.00717, "grad_norm": 0.9933217219488532, "learning_rate": 0.002151, "loss": 4.8972, "step": 717 }, { "epoch": 0.00718, "grad_norm": 1.0107964436698889, "learning_rate": 0.002154, "loss": 4.8709, "step": 718 }, { "epoch": 0.00719, "grad_norm": 0.9093998544086449, "learning_rate": 0.002157, "loss": 4.8869, "step": 719 }, { "epoch": 0.0072, "grad_norm": 0.8253722879391543, "learning_rate": 0.00216, "loss": 4.8711, "step": 720 }, { "epoch": 0.00721, "grad_norm": 0.7950204631220138, "learning_rate": 0.002163, "loss": 4.8781, "step": 721 }, { "epoch": 0.00722, "grad_norm": 0.7616416140294759, "learning_rate": 0.002166, "loss": 4.8517, "step": 722 }, { "epoch": 0.00723, "grad_norm": 0.8494335965132823, "learning_rate": 0.002169, "loss": 4.8626, "step": 723 }, { "epoch": 0.00724, "grad_norm": 0.9441673028016423, "learning_rate": 0.002172, "loss": 4.8621, "step": 724 }, { "epoch": 0.00725, "grad_norm": 0.8751268950403425, "learning_rate": 0.002175, "loss": 4.8595, "step": 725 }, { "epoch": 0.00726, "grad_norm": 0.9089850912837197, "learning_rate": 0.002178, "loss": 4.8628, "step": 726 }, { "epoch": 0.00727, "grad_norm": 0.8888908119178632, "learning_rate": 0.0021809999999999998, "loss": 4.8505, "step": 727 }, { "epoch": 0.00728, "grad_norm": 0.9736770183257887, "learning_rate": 0.002184, "loss": 4.8689, "step": 728 }, { "epoch": 0.00729, "grad_norm": 0.8001407133698992, "learning_rate": 0.002187, "loss": 4.8549, "step": 729 }, { "epoch": 0.0073, "grad_norm": 0.6821558897279537, "learning_rate": 0.00219, "loss": 4.8548, "step": 730 }, { "epoch": 0.00731, "grad_norm": 0.6831491892882273, "learning_rate": 0.002193, "loss": 4.8443, "step": 731 }, { "epoch": 0.00732, "grad_norm": 0.7429226676106812, "learning_rate": 0.002196, "loss": 4.8496, "step": 732 }, { "epoch": 0.00733, "grad_norm": 0.7471373337600634, "learning_rate": 0.002199, "loss": 4.8306, "step": 733 }, { "epoch": 0.00734, "grad_norm": 0.7160425997494153, "learning_rate": 0.002202, "loss": 4.845, "step": 734 }, { "epoch": 0.00735, "grad_norm": 0.657968205956134, "learning_rate": 0.002205, "loss": 4.8539, "step": 735 }, { "epoch": 0.00736, "grad_norm": 0.7318419955485544, "learning_rate": 0.002208, "loss": 4.8371, "step": 736 }, { "epoch": 0.00737, "grad_norm": 0.8187387102115173, "learning_rate": 0.002211, "loss": 4.8083, "step": 737 }, { "epoch": 0.00738, "grad_norm": 1.0775553672977198, "learning_rate": 0.002214, "loss": 4.8215, "step": 738 }, { "epoch": 0.00739, "grad_norm": 1.0127446069090642, "learning_rate": 0.0022170000000000002, "loss": 4.8211, "step": 739 }, { "epoch": 0.0074, "grad_norm": 0.8746324375476564, "learning_rate": 0.00222, "loss": 4.8143, "step": 740 }, { "epoch": 0.00741, "grad_norm": 1.0041320632210426, "learning_rate": 0.002223, "loss": 4.7944, "step": 741 }, { "epoch": 0.00742, "grad_norm": 1.1063519275220866, "learning_rate": 0.002226, "loss": 4.836, "step": 742 }, { "epoch": 0.00743, "grad_norm": 0.9221104447659358, "learning_rate": 0.002229, "loss": 4.8024, "step": 743 }, { "epoch": 0.00744, "grad_norm": 0.9424830767002879, "learning_rate": 0.002232, "loss": 4.8123, "step": 744 }, { "epoch": 0.00745, "grad_norm": 1.0188289984750667, "learning_rate": 0.002235, "loss": 4.8151, "step": 745 }, { "epoch": 0.00746, "grad_norm": 1.0287722855005268, "learning_rate": 0.002238, "loss": 4.8238, "step": 746 }, { "epoch": 0.00747, "grad_norm": 1.0487862769229943, "learning_rate": 0.002241, "loss": 4.8099, "step": 747 }, { "epoch": 0.00748, "grad_norm": 0.8549676783684295, "learning_rate": 0.002244, "loss": 4.8084, "step": 748 }, { "epoch": 0.00749, "grad_norm": 0.8567053944604911, "learning_rate": 0.002247, "loss": 4.787, "step": 749 }, { "epoch": 0.0075, "grad_norm": 1.0511964127667834, "learning_rate": 0.0022500000000000003, "loss": 4.8309, "step": 750 }, { "epoch": 0.00751, "grad_norm": 0.9720966734602534, "learning_rate": 0.0022530000000000002, "loss": 4.8211, "step": 751 }, { "epoch": 0.00752, "grad_norm": 0.8427538199759678, "learning_rate": 0.002256, "loss": 4.777, "step": 752 }, { "epoch": 0.00753, "grad_norm": 1.0504906636895608, "learning_rate": 0.002259, "loss": 4.8189, "step": 753 }, { "epoch": 0.00754, "grad_norm": 0.9387358479584846, "learning_rate": 0.002262, "loss": 4.8364, "step": 754 }, { "epoch": 0.00755, "grad_norm": 0.7913656975262588, "learning_rate": 0.002265, "loss": 4.7934, "step": 755 }, { "epoch": 0.00756, "grad_norm": 0.7692929115953351, "learning_rate": 0.002268, "loss": 4.7756, "step": 756 }, { "epoch": 0.00757, "grad_norm": 0.7739955836145438, "learning_rate": 0.002271, "loss": 4.7882, "step": 757 }, { "epoch": 0.00758, "grad_norm": 0.8110040425710608, "learning_rate": 0.002274, "loss": 4.7648, "step": 758 }, { "epoch": 0.00759, "grad_norm": 0.8073791732331314, "learning_rate": 0.002277, "loss": 4.785, "step": 759 }, { "epoch": 0.0076, "grad_norm": 0.7576731140462794, "learning_rate": 0.00228, "loss": 4.7713, "step": 760 }, { "epoch": 0.00761, "grad_norm": 0.749632110976692, "learning_rate": 0.002283, "loss": 4.7377, "step": 761 }, { "epoch": 0.00762, "grad_norm": 0.7853246221677522, "learning_rate": 0.0022860000000000003, "loss": 4.7232, "step": 762 }, { "epoch": 0.00763, "grad_norm": 0.7445635461156005, "learning_rate": 0.0022890000000000002, "loss": 4.7582, "step": 763 }, { "epoch": 0.00764, "grad_norm": 0.7229992019662822, "learning_rate": 0.002292, "loss": 4.761, "step": 764 }, { "epoch": 0.00765, "grad_norm": 0.753778651079225, "learning_rate": 0.002295, "loss": 4.748, "step": 765 }, { "epoch": 0.00766, "grad_norm": 0.6446844137748907, "learning_rate": 0.002298, "loss": 4.7414, "step": 766 }, { "epoch": 0.00767, "grad_norm": 0.610956287563313, "learning_rate": 0.002301, "loss": 4.7382, "step": 767 }, { "epoch": 0.00768, "grad_norm": 0.5534988019273127, "learning_rate": 0.002304, "loss": 4.7365, "step": 768 }, { "epoch": 0.00769, "grad_norm": 0.6790892369300202, "learning_rate": 0.002307, "loss": 4.7418, "step": 769 }, { "epoch": 0.0077, "grad_norm": 0.8121255147206851, "learning_rate": 0.00231, "loss": 4.7084, "step": 770 }, { "epoch": 0.00771, "grad_norm": 0.860209240156232, "learning_rate": 0.002313, "loss": 4.7446, "step": 771 }, { "epoch": 0.00772, "grad_norm": 0.860546267364981, "learning_rate": 0.002316, "loss": 4.7297, "step": 772 }, { "epoch": 0.00773, "grad_norm": 0.8140139366688685, "learning_rate": 0.0023190000000000003, "loss": 4.7319, "step": 773 }, { "epoch": 0.00774, "grad_norm": 0.8843308751287595, "learning_rate": 0.0023220000000000003, "loss": 4.7474, "step": 774 }, { "epoch": 0.00775, "grad_norm": 0.8212427196933955, "learning_rate": 0.0023250000000000002, "loss": 4.7389, "step": 775 }, { "epoch": 0.00776, "grad_norm": 0.7590181685436337, "learning_rate": 0.002328, "loss": 4.7362, "step": 776 }, { "epoch": 0.00777, "grad_norm": 0.7163129834638192, "learning_rate": 0.002331, "loss": 4.711, "step": 777 }, { "epoch": 0.00778, "grad_norm": 0.5704385831566832, "learning_rate": 0.002334, "loss": 4.7139, "step": 778 }, { "epoch": 0.00779, "grad_norm": 0.6376822022096883, "learning_rate": 0.002337, "loss": 4.6765, "step": 779 }, { "epoch": 0.0078, "grad_norm": 0.6749602286292105, "learning_rate": 0.00234, "loss": 4.6934, "step": 780 }, { "epoch": 0.00781, "grad_norm": 0.7110957632967577, "learning_rate": 0.002343, "loss": 4.6953, "step": 781 }, { "epoch": 0.00782, "grad_norm": 0.6944526657973698, "learning_rate": 0.002346, "loss": 4.6887, "step": 782 }, { "epoch": 0.00783, "grad_norm": 0.7011179026115173, "learning_rate": 0.002349, "loss": 4.6685, "step": 783 }, { "epoch": 0.00784, "grad_norm": 0.7292703169217482, "learning_rate": 0.002352, "loss": 4.6842, "step": 784 }, { "epoch": 0.00785, "grad_norm": 0.6993372864201758, "learning_rate": 0.0023550000000000003, "loss": 4.6776, "step": 785 }, { "epoch": 0.00786, "grad_norm": 0.758602493265201, "learning_rate": 0.0023580000000000003, "loss": 4.6843, "step": 786 }, { "epoch": 0.00787, "grad_norm": 0.8960009188651508, "learning_rate": 0.0023610000000000003, "loss": 4.6883, "step": 787 }, { "epoch": 0.00788, "grad_norm": 0.9005115946196935, "learning_rate": 0.002364, "loss": 4.6894, "step": 788 }, { "epoch": 0.00789, "grad_norm": 0.9107967107186621, "learning_rate": 0.002367, "loss": 4.7167, "step": 789 }, { "epoch": 0.0079, "grad_norm": 0.8913303624885218, "learning_rate": 0.00237, "loss": 4.6905, "step": 790 }, { "epoch": 0.00791, "grad_norm": 0.9033565188226677, "learning_rate": 0.002373, "loss": 4.6536, "step": 791 }, { "epoch": 0.00792, "grad_norm": 0.876812764850389, "learning_rate": 0.002376, "loss": 4.6841, "step": 792 }, { "epoch": 0.00793, "grad_norm": 0.8853769972879193, "learning_rate": 0.002379, "loss": 4.6794, "step": 793 }, { "epoch": 0.00794, "grad_norm": 0.9106991776684247, "learning_rate": 0.002382, "loss": 4.6921, "step": 794 }, { "epoch": 0.00795, "grad_norm": 0.8863762739828056, "learning_rate": 0.002385, "loss": 4.6703, "step": 795 }, { "epoch": 0.00796, "grad_norm": 0.8699126455036565, "learning_rate": 0.0023880000000000004, "loss": 4.664, "step": 796 }, { "epoch": 0.00797, "grad_norm": 0.8543487171707993, "learning_rate": 0.0023910000000000003, "loss": 4.7077, "step": 797 }, { "epoch": 0.00798, "grad_norm": 0.8580039409853985, "learning_rate": 0.0023940000000000003, "loss": 4.6717, "step": 798 }, { "epoch": 0.00799, "grad_norm": 0.9172496139502885, "learning_rate": 0.0023970000000000003, "loss": 4.7059, "step": 799 }, { "epoch": 0.008, "grad_norm": 0.850755534424379, "learning_rate": 0.0024000000000000002, "loss": 4.6592, "step": 800 }, { "epoch": 0.00801, "grad_norm": 0.9527629416181698, "learning_rate": 0.002403, "loss": 4.6941, "step": 801 }, { "epoch": 0.00802, "grad_norm": 0.907546457744482, "learning_rate": 0.002406, "loss": 4.6836, "step": 802 }, { "epoch": 0.00803, "grad_norm": 0.8445766186911342, "learning_rate": 0.002409, "loss": 4.6598, "step": 803 }, { "epoch": 0.00804, "grad_norm": 0.8905553328902869, "learning_rate": 0.002412, "loss": 4.6598, "step": 804 }, { "epoch": 0.00805, "grad_norm": 0.8936903365864526, "learning_rate": 0.002415, "loss": 4.6624, "step": 805 }, { "epoch": 0.00806, "grad_norm": 0.8647204208273495, "learning_rate": 0.002418, "loss": 4.6755, "step": 806 }, { "epoch": 0.00807, "grad_norm": 0.8494760155293042, "learning_rate": 0.0024210000000000004, "loss": 4.6694, "step": 807 }, { "epoch": 0.00808, "grad_norm": 0.7376236468427481, "learning_rate": 0.0024240000000000004, "loss": 4.65, "step": 808 }, { "epoch": 0.00809, "grad_norm": 0.7397890518039199, "learning_rate": 0.0024270000000000003, "loss": 4.6449, "step": 809 }, { "epoch": 0.0081, "grad_norm": 0.6742672790291327, "learning_rate": 0.0024300000000000003, "loss": 4.6639, "step": 810 }, { "epoch": 0.00811, "grad_norm": 0.6344626039053871, "learning_rate": 0.0024330000000000003, "loss": 4.6372, "step": 811 }, { "epoch": 0.00812, "grad_norm": 0.6264235852250682, "learning_rate": 0.0024360000000000002, "loss": 4.6138, "step": 812 }, { "epoch": 0.00813, "grad_norm": 0.6033433029827218, "learning_rate": 0.0024389999999999998, "loss": 4.643, "step": 813 }, { "epoch": 0.00814, "grad_norm": 0.6217940219682172, "learning_rate": 0.0024419999999999997, "loss": 4.6512, "step": 814 }, { "epoch": 0.00815, "grad_norm": 0.6033575847456577, "learning_rate": 0.0024449999999999997, "loss": 4.6541, "step": 815 }, { "epoch": 0.00816, "grad_norm": 0.5100796560015926, "learning_rate": 0.002448, "loss": 4.6231, "step": 816 }, { "epoch": 0.00817, "grad_norm": 0.49585009337112085, "learning_rate": 0.002451, "loss": 4.6301, "step": 817 }, { "epoch": 0.00818, "grad_norm": 0.5266854681962923, "learning_rate": 0.002454, "loss": 4.6109, "step": 818 }, { "epoch": 0.00819, "grad_norm": 0.6082191743538553, "learning_rate": 0.002457, "loss": 4.6237, "step": 819 }, { "epoch": 0.0082, "grad_norm": 0.5983769962466982, "learning_rate": 0.00246, "loss": 4.6093, "step": 820 }, { "epoch": 0.00821, "grad_norm": 0.6452383077710647, "learning_rate": 0.002463, "loss": 4.5921, "step": 821 }, { "epoch": 0.00822, "grad_norm": 0.7904685465267983, "learning_rate": 0.002466, "loss": 4.5962, "step": 822 }, { "epoch": 0.00823, "grad_norm": 0.9334545600211804, "learning_rate": 0.002469, "loss": 4.6151, "step": 823 }, { "epoch": 0.00824, "grad_norm": 1.0173145160580905, "learning_rate": 0.002472, "loss": 4.6432, "step": 824 }, { "epoch": 0.00825, "grad_norm": 1.3805405563709947, "learning_rate": 0.0024749999999999998, "loss": 4.6688, "step": 825 }, { "epoch": 0.00826, "grad_norm": 0.8722736889569705, "learning_rate": 0.0024779999999999997, "loss": 4.608, "step": 826 }, { "epoch": 0.00827, "grad_norm": 0.9315441314299773, "learning_rate": 0.002481, "loss": 4.6415, "step": 827 }, { "epoch": 0.00828, "grad_norm": 1.0512546819946473, "learning_rate": 0.002484, "loss": 4.6408, "step": 828 }, { "epoch": 0.00829, "grad_norm": 1.0438218855109669, "learning_rate": 0.002487, "loss": 4.6506, "step": 829 }, { "epoch": 0.0083, "grad_norm": 0.9301285778060441, "learning_rate": 0.00249, "loss": 4.6477, "step": 830 }, { "epoch": 0.00831, "grad_norm": 0.9032044535603423, "learning_rate": 0.002493, "loss": 4.629, "step": 831 }, { "epoch": 0.00832, "grad_norm": 0.8730808748017225, "learning_rate": 0.002496, "loss": 4.6303, "step": 832 }, { "epoch": 0.00833, "grad_norm": 0.8092334117771479, "learning_rate": 0.002499, "loss": 4.6217, "step": 833 }, { "epoch": 0.00834, "grad_norm": 0.7831190344913049, "learning_rate": 0.002502, "loss": 4.624, "step": 834 }, { "epoch": 0.00835, "grad_norm": 0.6494366058082618, "learning_rate": 0.002505, "loss": 4.6082, "step": 835 }, { "epoch": 0.00836, "grad_norm": 0.6821091092072289, "learning_rate": 0.002508, "loss": 4.6348, "step": 836 }, { "epoch": 0.00837, "grad_norm": 0.7523253894726831, "learning_rate": 0.0025109999999999998, "loss": 4.5808, "step": 837 }, { "epoch": 0.00838, "grad_norm": 0.7788634101452747, "learning_rate": 0.0025139999999999997, "loss": 4.6077, "step": 838 }, { "epoch": 0.00839, "grad_norm": 0.8138659157639668, "learning_rate": 0.002517, "loss": 4.6083, "step": 839 }, { "epoch": 0.0084, "grad_norm": 0.8732162622457459, "learning_rate": 0.00252, "loss": 4.5955, "step": 840 }, { "epoch": 0.00841, "grad_norm": 1.019417984642289, "learning_rate": 0.002523, "loss": 4.598, "step": 841 }, { "epoch": 0.00842, "grad_norm": 0.9409472966208113, "learning_rate": 0.002526, "loss": 4.6049, "step": 842 }, { "epoch": 0.00843, "grad_norm": 0.9226761303774836, "learning_rate": 0.002529, "loss": 4.5993, "step": 843 }, { "epoch": 0.00844, "grad_norm": 0.9652698373224984, "learning_rate": 0.002532, "loss": 4.6327, "step": 844 }, { "epoch": 0.00845, "grad_norm": 0.9308804697564665, "learning_rate": 0.002535, "loss": 4.5933, "step": 845 }, { "epoch": 0.00846, "grad_norm": 0.9944671667950292, "learning_rate": 0.002538, "loss": 4.6213, "step": 846 }, { "epoch": 0.00847, "grad_norm": 0.7973628646086437, "learning_rate": 0.002541, "loss": 4.6012, "step": 847 }, { "epoch": 0.00848, "grad_norm": 0.7238143616386193, "learning_rate": 0.002544, "loss": 4.5901, "step": 848 }, { "epoch": 0.00849, "grad_norm": 0.8442325507863795, "learning_rate": 0.002547, "loss": 4.5938, "step": 849 }, { "epoch": 0.0085, "grad_norm": 0.7603726939752414, "learning_rate": 0.00255, "loss": 4.5976, "step": 850 }, { "epoch": 0.00851, "grad_norm": 0.8865277336062035, "learning_rate": 0.002553, "loss": 4.6202, "step": 851 }, { "epoch": 0.00852, "grad_norm": 0.9253666339112696, "learning_rate": 0.002556, "loss": 4.6122, "step": 852 }, { "epoch": 0.00853, "grad_norm": 0.9517307300698885, "learning_rate": 0.002559, "loss": 4.5919, "step": 853 }, { "epoch": 0.00854, "grad_norm": 1.0310567151805228, "learning_rate": 0.002562, "loss": 4.5922, "step": 854 }, { "epoch": 0.00855, "grad_norm": 1.154844172511891, "learning_rate": 0.002565, "loss": 4.5856, "step": 855 }, { "epoch": 0.00856, "grad_norm": 0.9664055608114693, "learning_rate": 0.002568, "loss": 4.6075, "step": 856 }, { "epoch": 0.00857, "grad_norm": 1.116045732107275, "learning_rate": 0.002571, "loss": 4.606, "step": 857 }, { "epoch": 0.00858, "grad_norm": 0.7861495756039047, "learning_rate": 0.002574, "loss": 4.5872, "step": 858 }, { "epoch": 0.00859, "grad_norm": 0.6214173325635484, "learning_rate": 0.002577, "loss": 4.5716, "step": 859 }, { "epoch": 0.0086, "grad_norm": 0.6164572973900683, "learning_rate": 0.00258, "loss": 4.5977, "step": 860 }, { "epoch": 0.00861, "grad_norm": 0.5506301368516001, "learning_rate": 0.0025830000000000002, "loss": 4.6069, "step": 861 }, { "epoch": 0.00862, "grad_norm": 0.601070156302512, "learning_rate": 0.002586, "loss": 4.5567, "step": 862 }, { "epoch": 0.00863, "grad_norm": 0.6954858119983386, "learning_rate": 0.002589, "loss": 4.58, "step": 863 }, { "epoch": 0.00864, "grad_norm": 0.697853484629387, "learning_rate": 0.002592, "loss": 4.533, "step": 864 }, { "epoch": 0.00865, "grad_norm": 0.6171533334128538, "learning_rate": 0.002595, "loss": 4.5507, "step": 865 }, { "epoch": 0.00866, "grad_norm": 0.6615774116930657, "learning_rate": 0.002598, "loss": 4.5725, "step": 866 }, { "epoch": 0.00867, "grad_norm": 0.7744523515971641, "learning_rate": 0.002601, "loss": 4.5651, "step": 867 }, { "epoch": 0.00868, "grad_norm": 0.820881730278172, "learning_rate": 0.002604, "loss": 4.5507, "step": 868 }, { "epoch": 0.00869, "grad_norm": 0.7803750245131083, "learning_rate": 0.002607, "loss": 4.5569, "step": 869 }, { "epoch": 0.0087, "grad_norm": 0.7384702018181832, "learning_rate": 0.00261, "loss": 4.5432, "step": 870 }, { "epoch": 0.00871, "grad_norm": 0.7238577244427861, "learning_rate": 0.002613, "loss": 4.5594, "step": 871 }, { "epoch": 0.00872, "grad_norm": 0.6596009767396808, "learning_rate": 0.002616, "loss": 4.5386, "step": 872 }, { "epoch": 0.00873, "grad_norm": 0.657018760483689, "learning_rate": 0.0026190000000000002, "loss": 4.5487, "step": 873 }, { "epoch": 0.00874, "grad_norm": 0.5096984570514209, "learning_rate": 0.002622, "loss": 4.5587, "step": 874 }, { "epoch": 0.00875, "grad_norm": 0.5917862001329391, "learning_rate": 0.002625, "loss": 4.509, "step": 875 }, { "epoch": 0.00876, "grad_norm": 0.6358204645109454, "learning_rate": 0.002628, "loss": 4.5421, "step": 876 }, { "epoch": 0.00877, "grad_norm": 0.622105352843196, "learning_rate": 0.002631, "loss": 4.5557, "step": 877 }, { "epoch": 0.00878, "grad_norm": 0.6489802918731719, "learning_rate": 0.002634, "loss": 4.4926, "step": 878 }, { "epoch": 0.00879, "grad_norm": 0.7198651946841444, "learning_rate": 0.002637, "loss": 4.5246, "step": 879 }, { "epoch": 0.0088, "grad_norm": 0.807778517531823, "learning_rate": 0.00264, "loss": 4.5381, "step": 880 }, { "epoch": 0.00881, "grad_norm": 0.6840558014472901, "learning_rate": 0.002643, "loss": 4.5363, "step": 881 }, { "epoch": 0.00882, "grad_norm": 0.6535717374503174, "learning_rate": 0.002646, "loss": 4.532, "step": 882 }, { "epoch": 0.00883, "grad_norm": 0.6598608882990905, "learning_rate": 0.002649, "loss": 4.528, "step": 883 }, { "epoch": 0.00884, "grad_norm": 0.5423501635938441, "learning_rate": 0.0026520000000000003, "loss": 4.5066, "step": 884 }, { "epoch": 0.00885, "grad_norm": 0.5323717529728587, "learning_rate": 0.0026550000000000002, "loss": 4.5028, "step": 885 }, { "epoch": 0.00886, "grad_norm": 0.6188094567817924, "learning_rate": 0.002658, "loss": 4.5234, "step": 886 }, { "epoch": 0.00887, "grad_norm": 0.6972466993378786, "learning_rate": 0.002661, "loss": 4.521, "step": 887 }, { "epoch": 0.00888, "grad_norm": 0.6171068656932988, "learning_rate": 0.002664, "loss": 4.5168, "step": 888 }, { "epoch": 0.00889, "grad_norm": 0.7028052332269225, "learning_rate": 0.002667, "loss": 4.4741, "step": 889 }, { "epoch": 0.0089, "grad_norm": 0.8226361692292167, "learning_rate": 0.00267, "loss": 4.538, "step": 890 }, { "epoch": 0.00891, "grad_norm": 1.1043997753739854, "learning_rate": 0.002673, "loss": 4.5318, "step": 891 }, { "epoch": 0.00892, "grad_norm": 0.9946974632170549, "learning_rate": 0.002676, "loss": 4.5463, "step": 892 }, { "epoch": 0.00893, "grad_norm": 0.8958086490440859, "learning_rate": 0.002679, "loss": 4.5224, "step": 893 }, { "epoch": 0.00894, "grad_norm": 0.9077012235708041, "learning_rate": 0.002682, "loss": 4.5134, "step": 894 }, { "epoch": 0.00895, "grad_norm": 0.8017398027780225, "learning_rate": 0.0026850000000000003, "loss": 4.5213, "step": 895 }, { "epoch": 0.00896, "grad_norm": 0.7045937615557313, "learning_rate": 0.0026880000000000003, "loss": 4.5092, "step": 896 }, { "epoch": 0.00897, "grad_norm": 0.6210872063297819, "learning_rate": 0.0026910000000000002, "loss": 4.5215, "step": 897 }, { "epoch": 0.00898, "grad_norm": 0.6465621097725943, "learning_rate": 0.002694, "loss": 4.5135, "step": 898 }, { "epoch": 0.00899, "grad_norm": 0.7170710385058783, "learning_rate": 0.002697, "loss": 4.5216, "step": 899 }, { "epoch": 0.009, "grad_norm": 0.6908740128767943, "learning_rate": 0.0027, "loss": 4.5448, "step": 900 }, { "epoch": 0.00901, "grad_norm": 0.6330297320675389, "learning_rate": 0.002703, "loss": 4.4948, "step": 901 }, { "epoch": 0.00902, "grad_norm": 0.6888151977010172, "learning_rate": 0.002706, "loss": 4.5079, "step": 902 }, { "epoch": 0.00903, "grad_norm": 0.7064881100348057, "learning_rate": 0.002709, "loss": 4.5094, "step": 903 }, { "epoch": 0.00904, "grad_norm": 0.6240828405072839, "learning_rate": 0.002712, "loss": 4.4821, "step": 904 }, { "epoch": 0.00905, "grad_norm": 0.5621159533559194, "learning_rate": 0.002715, "loss": 4.4943, "step": 905 }, { "epoch": 0.00906, "grad_norm": 0.6881074752193249, "learning_rate": 0.002718, "loss": 4.5117, "step": 906 }, { "epoch": 0.00907, "grad_norm": 0.8314783658771728, "learning_rate": 0.0027210000000000003, "loss": 4.5324, "step": 907 }, { "epoch": 0.00908, "grad_norm": 0.8326381841713656, "learning_rate": 0.0027240000000000003, "loss": 4.5097, "step": 908 }, { "epoch": 0.00909, "grad_norm": 0.9853385359973519, "learning_rate": 0.0027270000000000003, "loss": 4.536, "step": 909 }, { "epoch": 0.0091, "grad_norm": 1.1119575374996422, "learning_rate": 0.0027300000000000002, "loss": 4.5599, "step": 910 }, { "epoch": 0.00911, "grad_norm": 1.0405135652045494, "learning_rate": 0.002733, "loss": 4.5105, "step": 911 }, { "epoch": 0.00912, "grad_norm": 1.210969720326656, "learning_rate": 0.002736, "loss": 4.5363, "step": 912 }, { "epoch": 0.00913, "grad_norm": 0.9746238769615785, "learning_rate": 0.002739, "loss": 4.5155, "step": 913 }, { "epoch": 0.00914, "grad_norm": 1.1317971223126129, "learning_rate": 0.002742, "loss": 4.5705, "step": 914 }, { "epoch": 0.00915, "grad_norm": 0.8132490227755053, "learning_rate": 0.002745, "loss": 4.5092, "step": 915 }, { "epoch": 0.00916, "grad_norm": 1.1120180148538203, "learning_rate": 0.002748, "loss": 4.5111, "step": 916 }, { "epoch": 0.00917, "grad_norm": 1.0918567511234711, "learning_rate": 0.002751, "loss": 4.5277, "step": 917 }, { "epoch": 0.00918, "grad_norm": 0.9369545872786895, "learning_rate": 0.0027540000000000004, "loss": 4.5356, "step": 918 }, { "epoch": 0.00919, "grad_norm": 0.9217438795079191, "learning_rate": 0.0027570000000000003, "loss": 4.5218, "step": 919 }, { "epoch": 0.0092, "grad_norm": 0.862557634576406, "learning_rate": 0.0027600000000000003, "loss": 4.5305, "step": 920 }, { "epoch": 0.00921, "grad_norm": 1.1407056864076377, "learning_rate": 0.0027630000000000003, "loss": 4.533, "step": 921 }, { "epoch": 0.00922, "grad_norm": 1.0450100884646383, "learning_rate": 0.0027660000000000002, "loss": 4.5481, "step": 922 }, { "epoch": 0.00923, "grad_norm": 1.1216174249826327, "learning_rate": 0.002769, "loss": 4.5266, "step": 923 }, { "epoch": 0.00924, "grad_norm": 0.8159916660782444, "learning_rate": 0.002772, "loss": 4.534, "step": 924 }, { "epoch": 0.00925, "grad_norm": 0.8122347656711312, "learning_rate": 0.002775, "loss": 4.5145, "step": 925 }, { "epoch": 0.00926, "grad_norm": 1.0333326209619593, "learning_rate": 0.002778, "loss": 4.5431, "step": 926 }, { "epoch": 0.00927, "grad_norm": 1.028655155327321, "learning_rate": 0.002781, "loss": 4.509, "step": 927 }, { "epoch": 0.00928, "grad_norm": 0.9454951825640333, "learning_rate": 0.002784, "loss": 4.5533, "step": 928 }, { "epoch": 0.00929, "grad_norm": 1.1164441967376633, "learning_rate": 0.0027870000000000004, "loss": 4.5458, "step": 929 }, { "epoch": 0.0093, "grad_norm": 0.8645616643837938, "learning_rate": 0.0027900000000000004, "loss": 4.5148, "step": 930 }, { "epoch": 0.00931, "grad_norm": 0.9852728030510483, "learning_rate": 0.0027930000000000003, "loss": 4.5315, "step": 931 }, { "epoch": 0.00932, "grad_norm": 0.8717314730884365, "learning_rate": 0.0027960000000000003, "loss": 4.5342, "step": 932 }, { "epoch": 0.00933, "grad_norm": 0.9689543139825987, "learning_rate": 0.0027990000000000003, "loss": 4.5061, "step": 933 }, { "epoch": 0.00934, "grad_norm": 0.8011334760420642, "learning_rate": 0.0028020000000000002, "loss": 4.5228, "step": 934 }, { "epoch": 0.00935, "grad_norm": 0.7842131538830379, "learning_rate": 0.002805, "loss": 4.5333, "step": 935 }, { "epoch": 0.00936, "grad_norm": 0.8009313182730428, "learning_rate": 0.002808, "loss": 4.5328, "step": 936 }, { "epoch": 0.00937, "grad_norm": 0.6974240482478135, "learning_rate": 0.002811, "loss": 4.5386, "step": 937 }, { "epoch": 0.00938, "grad_norm": 0.5643540587859054, "learning_rate": 0.002814, "loss": 4.5083, "step": 938 }, { "epoch": 0.00939, "grad_norm": 0.5748911957324845, "learning_rate": 0.002817, "loss": 4.5002, "step": 939 }, { "epoch": 0.0094, "grad_norm": 0.5120628706209022, "learning_rate": 0.00282, "loss": 4.4948, "step": 940 }, { "epoch": 0.00941, "grad_norm": 0.4939803780583287, "learning_rate": 0.002823, "loss": 4.4711, "step": 941 }, { "epoch": 0.00942, "grad_norm": 0.5149843098575663, "learning_rate": 0.002826, "loss": 4.4896, "step": 942 }, { "epoch": 0.00943, "grad_norm": 0.5813616468237084, "learning_rate": 0.002829, "loss": 4.4876, "step": 943 }, { "epoch": 0.00944, "grad_norm": 0.6406019504044549, "learning_rate": 0.002832, "loss": 4.4895, "step": 944 }, { "epoch": 0.00945, "grad_norm": 0.5680854338022571, "learning_rate": 0.002835, "loss": 4.4913, "step": 945 }, { "epoch": 0.00946, "grad_norm": 0.5220321709660245, "learning_rate": 0.002838, "loss": 4.4951, "step": 946 }, { "epoch": 0.00947, "grad_norm": 0.631763941258451, "learning_rate": 0.0028409999999999998, "loss": 4.4705, "step": 947 }, { "epoch": 0.00948, "grad_norm": 0.6449801035821655, "learning_rate": 0.0028439999999999997, "loss": 4.4569, "step": 948 }, { "epoch": 0.00949, "grad_norm": 0.5961885502547112, "learning_rate": 0.002847, "loss": 4.4903, "step": 949 }, { "epoch": 0.0095, "grad_norm": 0.6511958288987238, "learning_rate": 0.00285, "loss": 4.4834, "step": 950 }, { "epoch": 0.00951, "grad_norm": 0.7136770979597694, "learning_rate": 0.002853, "loss": 4.4642, "step": 951 }, { "epoch": 0.00952, "grad_norm": 0.7943173544375755, "learning_rate": 0.002856, "loss": 4.4903, "step": 952 }, { "epoch": 0.00953, "grad_norm": 0.7165313674775572, "learning_rate": 0.002859, "loss": 4.4283, "step": 953 }, { "epoch": 0.00954, "grad_norm": 0.7629917982284083, "learning_rate": 0.002862, "loss": 4.49, "step": 954 }, { "epoch": 0.00955, "grad_norm": 0.9137416767731276, "learning_rate": 0.002865, "loss": 4.4809, "step": 955 }, { "epoch": 0.00956, "grad_norm": 0.9350410452513611, "learning_rate": 0.002868, "loss": 4.4685, "step": 956 }, { "epoch": 0.00957, "grad_norm": 0.8266195329744859, "learning_rate": 0.002871, "loss": 4.4866, "step": 957 }, { "epoch": 0.00958, "grad_norm": 0.7685667720087905, "learning_rate": 0.002874, "loss": 4.469, "step": 958 }, { "epoch": 0.00959, "grad_norm": 0.7114722439549117, "learning_rate": 0.002877, "loss": 4.462, "step": 959 }, { "epoch": 0.0096, "grad_norm": 0.7212359661166219, "learning_rate": 0.0028799999999999997, "loss": 4.4615, "step": 960 }, { "epoch": 0.00961, "grad_norm": 0.6952146282717937, "learning_rate": 0.002883, "loss": 4.4332, "step": 961 }, { "epoch": 0.00962, "grad_norm": 0.631327349964468, "learning_rate": 0.002886, "loss": 4.4607, "step": 962 }, { "epoch": 0.00963, "grad_norm": 0.7386203674854516, "learning_rate": 0.002889, "loss": 4.4627, "step": 963 }, { "epoch": 0.00964, "grad_norm": 0.7099703519669538, "learning_rate": 0.002892, "loss": 4.4494, "step": 964 }, { "epoch": 0.00965, "grad_norm": 0.6798689010174963, "learning_rate": 0.002895, "loss": 4.4243, "step": 965 }, { "epoch": 0.00966, "grad_norm": 0.7248321886037777, "learning_rate": 0.002898, "loss": 4.4416, "step": 966 }, { "epoch": 0.00967, "grad_norm": 0.6988539316842848, "learning_rate": 0.002901, "loss": 4.4803, "step": 967 }, { "epoch": 0.00968, "grad_norm": 0.7131963881573582, "learning_rate": 0.002904, "loss": 4.4431, "step": 968 }, { "epoch": 0.00969, "grad_norm": 0.6774288908313462, "learning_rate": 0.002907, "loss": 4.4758, "step": 969 }, { "epoch": 0.0097, "grad_norm": 0.7070891862905684, "learning_rate": 0.00291, "loss": 4.4353, "step": 970 }, { "epoch": 0.00971, "grad_norm": 0.5977635294593704, "learning_rate": 0.002913, "loss": 4.4375, "step": 971 }, { "epoch": 0.00972, "grad_norm": 0.6000392249111391, "learning_rate": 0.002916, "loss": 4.4443, "step": 972 }, { "epoch": 0.00973, "grad_norm": 0.5652705565921429, "learning_rate": 0.002919, "loss": 4.4351, "step": 973 }, { "epoch": 0.00974, "grad_norm": 0.5862941407898875, "learning_rate": 0.002922, "loss": 4.4242, "step": 974 }, { "epoch": 0.00975, "grad_norm": 0.5761106589533092, "learning_rate": 0.002925, "loss": 4.4454, "step": 975 }, { "epoch": 0.00976, "grad_norm": 0.6267951053667676, "learning_rate": 0.002928, "loss": 4.4304, "step": 976 }, { "epoch": 0.00977, "grad_norm": 0.7323160890025502, "learning_rate": 0.002931, "loss": 4.4742, "step": 977 }, { "epoch": 0.00978, "grad_norm": 0.7947149437193983, "learning_rate": 0.002934, "loss": 4.4506, "step": 978 }, { "epoch": 0.00979, "grad_norm": 0.7190657481675354, "learning_rate": 0.002937, "loss": 4.4415, "step": 979 }, { "epoch": 0.0098, "grad_norm": 0.7127444247767541, "learning_rate": 0.00294, "loss": 4.4341, "step": 980 }, { "epoch": 0.00981, "grad_norm": 0.6752689834635909, "learning_rate": 0.002943, "loss": 4.4448, "step": 981 }, { "epoch": 0.00982, "grad_norm": 0.7621208928943479, "learning_rate": 0.002946, "loss": 4.4402, "step": 982 }, { "epoch": 0.00983, "grad_norm": 0.6278907346255873, "learning_rate": 0.0029490000000000002, "loss": 4.4356, "step": 983 }, { "epoch": 0.00984, "grad_norm": 0.640256203275423, "learning_rate": 0.002952, "loss": 4.4238, "step": 984 }, { "epoch": 0.00985, "grad_norm": 0.6571839667979233, "learning_rate": 0.002955, "loss": 4.4427, "step": 985 }, { "epoch": 0.00986, "grad_norm": 0.8077439009351965, "learning_rate": 0.002958, "loss": 4.4152, "step": 986 }, { "epoch": 0.00987, "grad_norm": 0.8060461488431017, "learning_rate": 0.002961, "loss": 4.4298, "step": 987 }, { "epoch": 0.00988, "grad_norm": 0.8493960405917234, "learning_rate": 0.002964, "loss": 4.4644, "step": 988 }, { "epoch": 0.00989, "grad_norm": 0.7333952379065988, "learning_rate": 0.002967, "loss": 4.405, "step": 989 }, { "epoch": 0.0099, "grad_norm": 0.6561442246030965, "learning_rate": 0.00297, "loss": 4.4391, "step": 990 }, { "epoch": 0.00991, "grad_norm": 0.760469930109403, "learning_rate": 0.002973, "loss": 4.4396, "step": 991 }, { "epoch": 0.00992, "grad_norm": 0.7424003895138885, "learning_rate": 0.002976, "loss": 4.4355, "step": 992 }, { "epoch": 0.00993, "grad_norm": 0.7430266355026064, "learning_rate": 0.002979, "loss": 4.4325, "step": 993 }, { "epoch": 0.00994, "grad_norm": 0.7650486142940663, "learning_rate": 0.002982, "loss": 4.4406, "step": 994 }, { "epoch": 0.00995, "grad_norm": 0.7876492554203988, "learning_rate": 0.0029850000000000002, "loss": 4.4395, "step": 995 }, { "epoch": 0.00996, "grad_norm": 0.7683559647577115, "learning_rate": 0.002988, "loss": 4.4448, "step": 996 }, { "epoch": 0.00997, "grad_norm": 0.7326948857624186, "learning_rate": 0.002991, "loss": 4.432, "step": 997 }, { "epoch": 0.00998, "grad_norm": 0.8133968945705032, "learning_rate": 0.002994, "loss": 4.4447, "step": 998 }, { "epoch": 0.00999, "grad_norm": 0.9410152737222023, "learning_rate": 0.002997, "loss": 4.4373, "step": 999 }, { "epoch": 0.01, "grad_norm": 0.799133553847171, "learning_rate": 0.003, "loss": 4.4596, "step": 1000 } ], "logging_steps": 1, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.9643642855424e+16, "train_batch_size": 1024, "trial_name": null, "trial_params": null }