{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9986648865153538, "eval_steps": 500, "global_step": 374, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0026702269692923898, "grad_norm": 0.5390625, "learning_rate": 0.0, "loss": 2.534, "step": 1 }, { "epoch": 0.0053404539385847796, "grad_norm": 0.55078125, "learning_rate": 5.555555555555555e-07, "loss": 2.5367, "step": 2 }, { "epoch": 0.00801068090787717, "grad_norm": 0.5390625, "learning_rate": 1.111111111111111e-06, "loss": 2.5319, "step": 3 }, { "epoch": 0.010680907877169559, "grad_norm": 0.55078125, "learning_rate": 1.6666666666666667e-06, "loss": 2.5296, "step": 4 }, { "epoch": 0.01335113484646195, "grad_norm": 0.55859375, "learning_rate": 2.222222222222222e-06, "loss": 2.5376, "step": 5 }, { "epoch": 0.01602136181575434, "grad_norm": 0.57421875, "learning_rate": 2.7777777777777783e-06, "loss": 2.5011, "step": 6 }, { "epoch": 0.018691588785046728, "grad_norm": 0.51171875, "learning_rate": 3.3333333333333333e-06, "loss": 2.501, "step": 7 }, { "epoch": 0.021361815754339118, "grad_norm": 0.5390625, "learning_rate": 3.88888888888889e-06, "loss": 2.5337, "step": 8 }, { "epoch": 0.02403204272363151, "grad_norm": 0.50390625, "learning_rate": 4.444444444444444e-06, "loss": 2.4829, "step": 9 }, { "epoch": 0.0267022696929239, "grad_norm": 0.48828125, "learning_rate": 5e-06, "loss": 2.4611, "step": 10 }, { "epoch": 0.029372496662216287, "grad_norm": 0.46484375, "learning_rate": 5.555555555555557e-06, "loss": 2.4401, "step": 11 }, { "epoch": 0.03204272363150868, "grad_norm": 0.447265625, "learning_rate": 6.111111111111112e-06, "loss": 2.4879, "step": 12 }, { "epoch": 0.03471295060080107, "grad_norm": 0.4296875, "learning_rate": 6.666666666666667e-06, "loss": 2.5059, "step": 13 }, { "epoch": 0.037383177570093455, "grad_norm": 0.40234375, "learning_rate": 7.222222222222223e-06, "loss": 2.5041, "step": 14 }, { "epoch": 0.04005340453938585, "grad_norm": 0.400390625, "learning_rate": 7.77777777777778e-06, "loss": 2.516, "step": 15 }, { "epoch": 0.042723631508678236, "grad_norm": 0.3828125, "learning_rate": 8.333333333333334e-06, "loss": 2.504, "step": 16 }, { "epoch": 0.04539385847797063, "grad_norm": 0.37109375, "learning_rate": 8.888888888888888e-06, "loss": 2.523, "step": 17 }, { "epoch": 0.04806408544726302, "grad_norm": 0.361328125, "learning_rate": 9.444444444444445e-06, "loss": 2.5062, "step": 18 }, { "epoch": 0.050734312416555405, "grad_norm": 0.369140625, "learning_rate": 1e-05, "loss": 2.5219, "step": 19 }, { "epoch": 0.0534045393858478, "grad_norm": 0.345703125, "learning_rate": 9.999805313005946e-06, "loss": 2.5032, "step": 20 }, { "epoch": 0.056074766355140186, "grad_norm": 0.345703125, "learning_rate": 9.999221267184993e-06, "loss": 2.5027, "step": 21 }, { "epoch": 0.05874499332443257, "grad_norm": 0.337890625, "learning_rate": 9.998247908019594e-06, "loss": 2.5135, "step": 22 }, { "epoch": 0.06141522029372497, "grad_norm": 0.322265625, "learning_rate": 9.996885311309892e-06, "loss": 2.5042, "step": 23 }, { "epoch": 0.06408544726301736, "grad_norm": 0.318359375, "learning_rate": 9.995133583167833e-06, "loss": 2.513, "step": 24 }, { "epoch": 0.06675567423230974, "grad_norm": 0.322265625, "learning_rate": 9.992992860008893e-06, "loss": 2.5058, "step": 25 }, { "epoch": 0.06942590120160214, "grad_norm": 0.314453125, "learning_rate": 9.990463308541452e-06, "loss": 2.5437, "step": 26 }, { "epoch": 0.07209612817089453, "grad_norm": 0.3046875, "learning_rate": 9.987545125753818e-06, "loss": 2.4898, "step": 27 }, { "epoch": 0.07476635514018691, "grad_norm": 0.3046875, "learning_rate": 9.98423853889889e-06, "loss": 2.4943, "step": 28 }, { "epoch": 0.0774365821094793, "grad_norm": 0.30859375, "learning_rate": 9.980543805476447e-06, "loss": 2.5259, "step": 29 }, { "epoch": 0.0801068090787717, "grad_norm": 0.314453125, "learning_rate": 9.976461213213104e-06, "loss": 2.5365, "step": 30 }, { "epoch": 0.08277703604806408, "grad_norm": 0.296875, "learning_rate": 9.971991080039912e-06, "loss": 2.4535, "step": 31 }, { "epoch": 0.08544726301735647, "grad_norm": 0.302734375, "learning_rate": 9.967133754067581e-06, "loss": 2.5043, "step": 32 }, { "epoch": 0.08811748998664887, "grad_norm": 0.294921875, "learning_rate": 9.961889613559396e-06, "loss": 2.5127, "step": 33 }, { "epoch": 0.09078771695594126, "grad_norm": 0.298828125, "learning_rate": 9.956259066901733e-06, "loss": 2.5135, "step": 34 }, { "epoch": 0.09345794392523364, "grad_norm": 0.2890625, "learning_rate": 9.950242552572272e-06, "loss": 2.469, "step": 35 }, { "epoch": 0.09612817089452604, "grad_norm": 0.294921875, "learning_rate": 9.943840539105853e-06, "loss": 2.5182, "step": 36 }, { "epoch": 0.09879839786381843, "grad_norm": 0.29296875, "learning_rate": 9.937053525057977e-06, "loss": 2.5109, "step": 37 }, { "epoch": 0.10146862483311081, "grad_norm": 0.291015625, "learning_rate": 9.92988203896599e-06, "loss": 2.5071, "step": 38 }, { "epoch": 0.1041388518024032, "grad_norm": 0.287109375, "learning_rate": 9.922326639307918e-06, "loss": 2.473, "step": 39 }, { "epoch": 0.1068090787716956, "grad_norm": 0.28125, "learning_rate": 9.914387914458983e-06, "loss": 2.4875, "step": 40 }, { "epoch": 0.10947930574098798, "grad_norm": 0.28515625, "learning_rate": 9.906066482645774e-06, "loss": 2.5014, "step": 41 }, { "epoch": 0.11214953271028037, "grad_norm": 0.283203125, "learning_rate": 9.89736299189811e-06, "loss": 2.5084, "step": 42 }, { "epoch": 0.11481975967957277, "grad_norm": 0.2890625, "learning_rate": 9.888278119998573e-06, "loss": 2.4905, "step": 43 }, { "epoch": 0.11748998664886515, "grad_norm": 0.279296875, "learning_rate": 9.878812574429722e-06, "loss": 2.4696, "step": 44 }, { "epoch": 0.12016021361815754, "grad_norm": 0.298828125, "learning_rate": 9.868967092319003e-06, "loss": 2.4912, "step": 45 }, { "epoch": 0.12283044058744993, "grad_norm": 0.287109375, "learning_rate": 9.858742440381343e-06, "loss": 2.4963, "step": 46 }, { "epoch": 0.12550066755674233, "grad_norm": 0.279296875, "learning_rate": 9.848139414859441e-06, "loss": 2.5145, "step": 47 }, { "epoch": 0.12817089452603472, "grad_norm": 0.279296875, "learning_rate": 9.837158841461767e-06, "loss": 2.4842, "step": 48 }, { "epoch": 0.1308411214953271, "grad_norm": 0.283203125, "learning_rate": 9.825801575298248e-06, "loss": 2.5164, "step": 49 }, { "epoch": 0.13351134846461948, "grad_norm": 0.26953125, "learning_rate": 9.814068500813692e-06, "loss": 2.5011, "step": 50 }, { "epoch": 0.13618157543391188, "grad_norm": 0.275390625, "learning_rate": 9.801960531718898e-06, "loss": 2.5144, "step": 51 }, { "epoch": 0.13885180240320427, "grad_norm": 0.279296875, "learning_rate": 9.789478610919508e-06, "loss": 2.4887, "step": 52 }, { "epoch": 0.14152202937249667, "grad_norm": 0.275390625, "learning_rate": 9.77662371044258e-06, "loss": 2.508, "step": 53 }, { "epoch": 0.14419225634178906, "grad_norm": 0.28125, "learning_rate": 9.763396831360884e-06, "loss": 2.506, "step": 54 }, { "epoch": 0.14686248331108145, "grad_norm": 0.28125, "learning_rate": 9.749799003714954e-06, "loss": 2.5483, "step": 55 }, { "epoch": 0.14953271028037382, "grad_norm": 0.279296875, "learning_rate": 9.735831286432869e-06, "loss": 2.5136, "step": 56 }, { "epoch": 0.15220293724966621, "grad_norm": 0.2734375, "learning_rate": 9.721494767247779e-06, "loss": 2.487, "step": 57 }, { "epoch": 0.1548731642189586, "grad_norm": 0.28515625, "learning_rate": 9.70679056261322e-06, "loss": 2.5026, "step": 58 }, { "epoch": 0.157543391188251, "grad_norm": 0.27734375, "learning_rate": 9.691719817616148e-06, "loss": 2.5142, "step": 59 }, { "epoch": 0.1602136181575434, "grad_norm": 0.275390625, "learning_rate": 9.676283705887783e-06, "loss": 2.5027, "step": 60 }, { "epoch": 0.1628838451268358, "grad_norm": 0.2734375, "learning_rate": 9.660483429512198e-06, "loss": 2.5344, "step": 61 }, { "epoch": 0.16555407209612816, "grad_norm": 0.271484375, "learning_rate": 9.644320218932723e-06, "loss": 2.5107, "step": 62 }, { "epoch": 0.16822429906542055, "grad_norm": 0.267578125, "learning_rate": 9.627795332856107e-06, "loss": 2.4842, "step": 63 }, { "epoch": 0.17089452603471295, "grad_norm": 0.271484375, "learning_rate": 9.61091005815451e-06, "loss": 2.4635, "step": 64 }, { "epoch": 0.17356475300400534, "grad_norm": 0.271484375, "learning_rate": 9.59366570976528e-06, "loss": 2.5131, "step": 65 }, { "epoch": 0.17623497997329773, "grad_norm": 0.2734375, "learning_rate": 9.576063630588563e-06, "loss": 2.5098, "step": 66 }, { "epoch": 0.17890520694259013, "grad_norm": 0.275390625, "learning_rate": 9.55810519138271e-06, "loss": 2.5209, "step": 67 }, { "epoch": 0.18157543391188252, "grad_norm": 0.28125, "learning_rate": 9.53979179065754e-06, "loss": 2.5272, "step": 68 }, { "epoch": 0.1842456608811749, "grad_norm": 0.26953125, "learning_rate": 9.521124854565425e-06, "loss": 2.5067, "step": 69 }, { "epoch": 0.18691588785046728, "grad_norm": 0.26953125, "learning_rate": 9.50210583679024e-06, "loss": 2.5268, "step": 70 }, { "epoch": 0.18958611481975968, "grad_norm": 0.2734375, "learning_rate": 9.482736218434144e-06, "loss": 2.5179, "step": 71 }, { "epoch": 0.19225634178905207, "grad_norm": 0.271484375, "learning_rate": 9.463017507902245e-06, "loss": 2.4965, "step": 72 }, { "epoch": 0.19492656875834447, "grad_norm": 0.271484375, "learning_rate": 9.442951240785135e-06, "loss": 2.5109, "step": 73 }, { "epoch": 0.19759679572763686, "grad_norm": 0.265625, "learning_rate": 9.422538979739307e-06, "loss": 2.4834, "step": 74 }, { "epoch": 0.20026702269692923, "grad_norm": 0.271484375, "learning_rate": 9.401782314365458e-06, "loss": 2.4986, "step": 75 }, { "epoch": 0.20293724966622162, "grad_norm": 0.26953125, "learning_rate": 9.380682861084703e-06, "loss": 2.4779, "step": 76 }, { "epoch": 0.205607476635514, "grad_norm": 0.26953125, "learning_rate": 9.359242263012693e-06, "loss": 2.4841, "step": 77 }, { "epoch": 0.2082777036048064, "grad_norm": 0.2734375, "learning_rate": 9.33746218983167e-06, "loss": 2.4902, "step": 78 }, { "epoch": 0.2109479305740988, "grad_norm": 0.30078125, "learning_rate": 9.315344337660422e-06, "loss": 2.4984, "step": 79 }, { "epoch": 0.2136181575433912, "grad_norm": 0.271484375, "learning_rate": 9.29289042892221e-06, "loss": 2.4941, "step": 80 }, { "epoch": 0.2162883845126836, "grad_norm": 0.279296875, "learning_rate": 9.270102212210632e-06, "loss": 2.5192, "step": 81 }, { "epoch": 0.21895861148197596, "grad_norm": 0.26953125, "learning_rate": 9.246981462153456e-06, "loss": 2.4991, "step": 82 }, { "epoch": 0.22162883845126835, "grad_norm": 0.26953125, "learning_rate": 9.223529979274411e-06, "loss": 2.483, "step": 83 }, { "epoch": 0.22429906542056074, "grad_norm": 0.271484375, "learning_rate": 9.19974958985298e-06, "loss": 2.5091, "step": 84 }, { "epoch": 0.22696929238985314, "grad_norm": 0.271484375, "learning_rate": 9.175642145782179e-06, "loss": 2.5119, "step": 85 }, { "epoch": 0.22963951935914553, "grad_norm": 0.271484375, "learning_rate": 9.151209524424333e-06, "loss": 2.5313, "step": 86 }, { "epoch": 0.23230974632843793, "grad_norm": 0.28125, "learning_rate": 9.126453628464889e-06, "loss": 2.4813, "step": 87 }, { "epoch": 0.2349799732977303, "grad_norm": 0.267578125, "learning_rate": 9.10137638576423e-06, "loss": 2.489, "step": 88 }, { "epoch": 0.2376502002670227, "grad_norm": 0.2734375, "learning_rate": 9.07597974920756e-06, "loss": 2.5331, "step": 89 }, { "epoch": 0.24032042723631508, "grad_norm": 0.265625, "learning_rate": 9.05026569655281e-06, "loss": 2.4919, "step": 90 }, { "epoch": 0.24299065420560748, "grad_norm": 0.271484375, "learning_rate": 9.02423623027663e-06, "loss": 2.561, "step": 91 }, { "epoch": 0.24566088117489987, "grad_norm": 0.26953125, "learning_rate": 8.997893377418432e-06, "loss": 2.5266, "step": 92 }, { "epoch": 0.24833110814419226, "grad_norm": 0.265625, "learning_rate": 8.971239189422555e-06, "loss": 2.4969, "step": 93 }, { "epoch": 0.25100133511348466, "grad_norm": 0.2734375, "learning_rate": 8.944275741978495e-06, "loss": 2.4977, "step": 94 }, { "epoch": 0.253671562082777, "grad_norm": 0.271484375, "learning_rate": 8.917005134859263e-06, "loss": 2.4885, "step": 95 }, { "epoch": 0.25634178905206945, "grad_norm": 0.26953125, "learning_rate": 8.889429491757872e-06, "loss": 2.4995, "step": 96 }, { "epoch": 0.2590120160213618, "grad_norm": 0.271484375, "learning_rate": 8.861550960121946e-06, "loss": 2.5081, "step": 97 }, { "epoch": 0.2616822429906542, "grad_norm": 0.265625, "learning_rate": 8.833371710986493e-06, "loss": 2.4995, "step": 98 }, { "epoch": 0.2643524699599466, "grad_norm": 0.26171875, "learning_rate": 8.804893938804839e-06, "loss": 2.4881, "step": 99 }, { "epoch": 0.26702269692923897, "grad_norm": 0.271484375, "learning_rate": 8.77611986127773e-06, "loss": 2.5327, "step": 100 }, { "epoch": 0.2696929238985314, "grad_norm": 0.267578125, "learning_rate": 8.747051719180626e-06, "loss": 2.526, "step": 101 }, { "epoch": 0.27236315086782376, "grad_norm": 0.26953125, "learning_rate": 8.717691776189214e-06, "loss": 2.5366, "step": 102 }, { "epoch": 0.2750333778371162, "grad_norm": 0.259765625, "learning_rate": 8.688042318703111e-06, "loss": 2.4877, "step": 103 }, { "epoch": 0.27770360480640854, "grad_norm": 0.263671875, "learning_rate": 8.65810565566782e-06, "loss": 2.4742, "step": 104 }, { "epoch": 0.2803738317757009, "grad_norm": 0.263671875, "learning_rate": 8.627884118394913e-06, "loss": 2.4916, "step": 105 }, { "epoch": 0.28304405874499333, "grad_norm": 0.263671875, "learning_rate": 8.597380060380493e-06, "loss": 2.536, "step": 106 }, { "epoch": 0.2857142857142857, "grad_norm": 0.271484375, "learning_rate": 8.566595857121902e-06, "loss": 2.4748, "step": 107 }, { "epoch": 0.2883845126835781, "grad_norm": 0.275390625, "learning_rate": 8.535533905932739e-06, "loss": 2.4833, "step": 108 }, { "epoch": 0.2910547396528705, "grad_norm": 0.265625, "learning_rate": 8.504196625756166e-06, "loss": 2.5434, "step": 109 }, { "epoch": 0.2937249666221629, "grad_norm": 0.26953125, "learning_rate": 8.472586456976534e-06, "loss": 2.4924, "step": 110 }, { "epoch": 0.2963951935914553, "grad_norm": 0.267578125, "learning_rate": 8.440705861229344e-06, "loss": 2.5269, "step": 111 }, { "epoch": 0.29906542056074764, "grad_norm": 0.26171875, "learning_rate": 8.408557321209534e-06, "loss": 2.4687, "step": 112 }, { "epoch": 0.30173564753004006, "grad_norm": 0.263671875, "learning_rate": 8.376143340478153e-06, "loss": 2.4686, "step": 113 }, { "epoch": 0.30440587449933243, "grad_norm": 0.27734375, "learning_rate": 8.34346644326739e-06, "loss": 2.4978, "step": 114 }, { "epoch": 0.30707610146862485, "grad_norm": 0.263671875, "learning_rate": 8.310529174284004e-06, "loss": 2.5137, "step": 115 }, { "epoch": 0.3097463284379172, "grad_norm": 0.27734375, "learning_rate": 8.277334098511147e-06, "loss": 2.5235, "step": 116 }, { "epoch": 0.31241655540720964, "grad_norm": 0.275390625, "learning_rate": 8.243883801008632e-06, "loss": 2.4973, "step": 117 }, { "epoch": 0.315086782376502, "grad_norm": 0.263671875, "learning_rate": 8.210180886711603e-06, "loss": 2.5124, "step": 118 }, { "epoch": 0.3177570093457944, "grad_norm": 0.271484375, "learning_rate": 8.176227980227693e-06, "loss": 2.5167, "step": 119 }, { "epoch": 0.3204272363150868, "grad_norm": 0.267578125, "learning_rate": 8.142027725632622e-06, "loss": 2.454, "step": 120 }, { "epoch": 0.32309746328437916, "grad_norm": 0.267578125, "learning_rate": 8.107582786264299e-06, "loss": 2.5076, "step": 121 }, { "epoch": 0.3257676902536716, "grad_norm": 0.265625, "learning_rate": 8.072895844515398e-06, "loss": 2.5122, "step": 122 }, { "epoch": 0.32843791722296395, "grad_norm": 0.275390625, "learning_rate": 8.037969601624495e-06, "loss": 2.5294, "step": 123 }, { "epoch": 0.3311081441922563, "grad_norm": 0.271484375, "learning_rate": 8.002806777465685e-06, "loss": 2.537, "step": 124 }, { "epoch": 0.33377837116154874, "grad_norm": 0.263671875, "learning_rate": 7.967410110336782e-06, "loss": 2.4899, "step": 125 }, { "epoch": 0.3364485981308411, "grad_norm": 0.267578125, "learning_rate": 7.931782356746076e-06, "loss": 2.5312, "step": 126 }, { "epoch": 0.3391188251001335, "grad_norm": 0.267578125, "learning_rate": 7.895926291197667e-06, "loss": 2.4986, "step": 127 }, { "epoch": 0.3417890520694259, "grad_norm": 0.267578125, "learning_rate": 7.859844705975405e-06, "loss": 2.5286, "step": 128 }, { "epoch": 0.3444592790387183, "grad_norm": 0.26953125, "learning_rate": 7.823540410925434e-06, "loss": 2.4856, "step": 129 }, { "epoch": 0.3471295060080107, "grad_norm": 0.263671875, "learning_rate": 7.787016233237387e-06, "loss": 2.5236, "step": 130 }, { "epoch": 0.34979973297730305, "grad_norm": 0.2734375, "learning_rate": 7.750275017224208e-06, "loss": 2.5115, "step": 131 }, { "epoch": 0.35246995994659547, "grad_norm": 0.267578125, "learning_rate": 7.713319624100657e-06, "loss": 2.5059, "step": 132 }, { "epoch": 0.35514018691588783, "grad_norm": 0.26953125, "learning_rate": 7.676152931760496e-06, "loss": 2.5122, "step": 133 }, { "epoch": 0.35781041388518026, "grad_norm": 0.267578125, "learning_rate": 7.638777834552372e-06, "loss": 2.5179, "step": 134 }, { "epoch": 0.3604806408544726, "grad_norm": 0.271484375, "learning_rate": 7.601197243054411e-06, "loss": 2.4895, "step": 135 }, { "epoch": 0.36315086782376504, "grad_norm": 0.26171875, "learning_rate": 7.563414083847573e-06, "loss": 2.4843, "step": 136 }, { "epoch": 0.3658210947930574, "grad_norm": 0.271484375, "learning_rate": 7.525431299287737e-06, "loss": 2.5075, "step": 137 }, { "epoch": 0.3684913217623498, "grad_norm": 0.265625, "learning_rate": 7.4872518472765594e-06, "loss": 2.5155, "step": 138 }, { "epoch": 0.3711615487316422, "grad_norm": 0.263671875, "learning_rate": 7.4488787010311425e-06, "loss": 2.4949, "step": 139 }, { "epoch": 0.37383177570093457, "grad_norm": 0.263671875, "learning_rate": 7.4103148488524824e-06, "loss": 2.5134, "step": 140 }, { "epoch": 0.376502002670227, "grad_norm": 0.267578125, "learning_rate": 7.371563293892761e-06, "loss": 2.516, "step": 141 }, { "epoch": 0.37917222963951935, "grad_norm": 0.26953125, "learning_rate": 7.3326270539214826e-06, "loss": 2.4839, "step": 142 }, { "epoch": 0.3818424566088118, "grad_norm": 0.265625, "learning_rate": 7.293509161090453e-06, "loss": 2.4952, "step": 143 }, { "epoch": 0.38451268357810414, "grad_norm": 0.263671875, "learning_rate": 7.2542126616976596e-06, "loss": 2.4828, "step": 144 }, { "epoch": 0.3871829105473965, "grad_norm": 0.271484375, "learning_rate": 7.214740615950041e-06, "loss": 2.5365, "step": 145 }, { "epoch": 0.38985313751668893, "grad_norm": 0.267578125, "learning_rate": 7.175096097725169e-06, "loss": 2.5147, "step": 146 }, { "epoch": 0.3925233644859813, "grad_norm": 0.26171875, "learning_rate": 7.135282194331881e-06, "loss": 2.4745, "step": 147 }, { "epoch": 0.3951935914552737, "grad_norm": 0.26171875, "learning_rate": 7.095302006269842e-06, "loss": 2.4743, "step": 148 }, { "epoch": 0.3978638184245661, "grad_norm": 0.265625, "learning_rate": 7.05515864698811e-06, "loss": 2.4727, "step": 149 }, { "epoch": 0.40053404539385845, "grad_norm": 0.267578125, "learning_rate": 7.014855242642662e-06, "loss": 2.4808, "step": 150 }, { "epoch": 0.4032042723631509, "grad_norm": 0.265625, "learning_rate": 6.974394931852957e-06, "loss": 2.4861, "step": 151 }, { "epoch": 0.40587449933244324, "grad_norm": 0.26953125, "learning_rate": 6.933780865457508e-06, "loss": 2.5099, "step": 152 }, { "epoch": 0.40854472630173566, "grad_norm": 0.26171875, "learning_rate": 6.893016206268518e-06, "loss": 2.5029, "step": 153 }, { "epoch": 0.411214953271028, "grad_norm": 0.267578125, "learning_rate": 6.85210412882557e-06, "loss": 2.496, "step": 154 }, { "epoch": 0.41388518024032045, "grad_norm": 0.265625, "learning_rate": 6.811047819148413e-06, "loss": 2.4808, "step": 155 }, { "epoch": 0.4165554072096128, "grad_norm": 0.267578125, "learning_rate": 6.769850474488859e-06, "loss": 2.5062, "step": 156 }, { "epoch": 0.4192256341789052, "grad_norm": 0.263671875, "learning_rate": 6.728515303081782e-06, "loss": 2.4872, "step": 157 }, { "epoch": 0.4218958611481976, "grad_norm": 0.263671875, "learning_rate": 6.687045523895292e-06, "loss": 2.5125, "step": 158 }, { "epoch": 0.42456608811748997, "grad_norm": 0.2578125, "learning_rate": 6.64544436638005e-06, "loss": 2.4739, "step": 159 }, { "epoch": 0.4272363150867824, "grad_norm": 0.26171875, "learning_rate": 6.603715070217779e-06, "loss": 2.4561, "step": 160 }, { "epoch": 0.42990654205607476, "grad_norm": 0.267578125, "learning_rate": 6.561860885068972e-06, "loss": 2.5166, "step": 161 }, { "epoch": 0.4325767690253672, "grad_norm": 0.26171875, "learning_rate": 6.519885070319827e-06, "loss": 2.4901, "step": 162 }, { "epoch": 0.43524699599465955, "grad_norm": 0.2734375, "learning_rate": 6.477790894828422e-06, "loss": 2.4906, "step": 163 }, { "epoch": 0.4379172229639519, "grad_norm": 0.265625, "learning_rate": 6.435581636670154e-06, "loss": 2.4935, "step": 164 }, { "epoch": 0.44058744993324434, "grad_norm": 0.265625, "learning_rate": 6.393260582882462e-06, "loss": 2.4686, "step": 165 }, { "epoch": 0.4432576769025367, "grad_norm": 0.26171875, "learning_rate": 6.350831029208844e-06, "loss": 2.4516, "step": 166 }, { "epoch": 0.4459279038718291, "grad_norm": 0.26171875, "learning_rate": 6.308296279842204e-06, "loss": 2.479, "step": 167 }, { "epoch": 0.4485981308411215, "grad_norm": 0.26171875, "learning_rate": 6.265659647167542e-06, "loss": 2.5068, "step": 168 }, { "epoch": 0.4512683578104139, "grad_norm": 0.263671875, "learning_rate": 6.222924451504001e-06, "loss": 2.4598, "step": 169 }, { "epoch": 0.4539385847797063, "grad_norm": 0.271484375, "learning_rate": 6.180094020846291e-06, "loss": 2.5116, "step": 170 }, { "epoch": 0.45660881174899864, "grad_norm": 0.26953125, "learning_rate": 6.1371716906055336e-06, "loss": 2.4649, "step": 171 }, { "epoch": 0.45927903871829107, "grad_norm": 0.26171875, "learning_rate": 6.094160803349508e-06, "loss": 2.4785, "step": 172 }, { "epoch": 0.46194926568758343, "grad_norm": 0.26171875, "learning_rate": 6.051064708542357e-06, "loss": 2.5259, "step": 173 }, { "epoch": 0.46461949265687585, "grad_norm": 0.271484375, "learning_rate": 6.00788676228374e-06, "loss": 2.5047, "step": 174 }, { "epoch": 0.4672897196261682, "grad_norm": 0.267578125, "learning_rate": 5.964630327047485e-06, "loss": 2.5377, "step": 175 }, { "epoch": 0.4699599465954606, "grad_norm": 0.267578125, "learning_rate": 5.921298771419731e-06, "loss": 2.5288, "step": 176 }, { "epoch": 0.472630173564753, "grad_norm": 0.267578125, "learning_rate": 5.877895469836604e-06, "loss": 2.5045, "step": 177 }, { "epoch": 0.4753004005340454, "grad_norm": 0.265625, "learning_rate": 5.8344238023214305e-06, "loss": 2.5275, "step": 178 }, { "epoch": 0.4779706275033378, "grad_norm": 0.267578125, "learning_rate": 5.790887154221521e-06, "loss": 2.5307, "step": 179 }, { "epoch": 0.48064085447263016, "grad_norm": 0.279296875, "learning_rate": 5.747288915944533e-06, "loss": 2.4982, "step": 180 }, { "epoch": 0.4833110814419226, "grad_norm": 0.263671875, "learning_rate": 5.703632482694453e-06, "loss": 2.472, "step": 181 }, { "epoch": 0.48598130841121495, "grad_norm": 0.271484375, "learning_rate": 5.659921254207183e-06, "loss": 2.5524, "step": 182 }, { "epoch": 0.4886515353805073, "grad_norm": 0.267578125, "learning_rate": 5.616158634485793e-06, "loss": 2.4878, "step": 183 }, { "epoch": 0.49132176234979974, "grad_norm": 0.2578125, "learning_rate": 5.572348031535442e-06, "loss": 2.5004, "step": 184 }, { "epoch": 0.4939919893190921, "grad_norm": 0.255859375, "learning_rate": 5.528492857097966e-06, "loss": 2.4946, "step": 185 }, { "epoch": 0.49666221628838453, "grad_norm": 0.263671875, "learning_rate": 5.484596526386198e-06, "loss": 2.4705, "step": 186 }, { "epoch": 0.4993324432576769, "grad_norm": 0.263671875, "learning_rate": 5.44066245781801e-06, "loss": 2.5064, "step": 187 }, { "epoch": 0.5020026702269693, "grad_norm": 0.267578125, "learning_rate": 5.396694072750099e-06, "loss": 2.4749, "step": 188 }, { "epoch": 0.5046728971962616, "grad_norm": 0.263671875, "learning_rate": 5.352694795211555e-06, "loss": 2.5226, "step": 189 }, { "epoch": 0.507343124165554, "grad_norm": 0.265625, "learning_rate": 5.308668051637213e-06, "loss": 2.5068, "step": 190 }, { "epoch": 0.5100133511348465, "grad_norm": 0.2578125, "learning_rate": 5.2646172706008154e-06, "loss": 2.4547, "step": 191 }, { "epoch": 0.5126835781041389, "grad_norm": 0.265625, "learning_rate": 5.220545882548024e-06, "loss": 2.4742, "step": 192 }, { "epoch": 0.5153538050734312, "grad_norm": 0.26171875, "learning_rate": 5.176457319529264e-06, "loss": 2.5493, "step": 193 }, { "epoch": 0.5180240320427236, "grad_norm": 0.263671875, "learning_rate": 5.132355014932455e-06, "loss": 2.5024, "step": 194 }, { "epoch": 0.520694259012016, "grad_norm": 0.263671875, "learning_rate": 5.088242403215644e-06, "loss": 2.5104, "step": 195 }, { "epoch": 0.5233644859813084, "grad_norm": 0.265625, "learning_rate": 5.0441229196395416e-06, "loss": 2.4975, "step": 196 }, { "epoch": 0.5260347129506008, "grad_norm": 0.265625, "learning_rate": 5e-06, "loss": 2.4902, "step": 197 }, { "epoch": 0.5287049399198932, "grad_norm": 0.2734375, "learning_rate": 4.955877080360462e-06, "loss": 2.5088, "step": 198 }, { "epoch": 0.5313751668891856, "grad_norm": 0.267578125, "learning_rate": 4.911757596784358e-06, "loss": 2.5015, "step": 199 }, { "epoch": 0.5340453938584779, "grad_norm": 0.26953125, "learning_rate": 4.867644985067548e-06, "loss": 2.5029, "step": 200 }, { "epoch": 0.5367156208277704, "grad_norm": 0.26171875, "learning_rate": 4.823542680470738e-06, "loss": 2.5171, "step": 201 }, { "epoch": 0.5393858477970628, "grad_norm": 0.259765625, "learning_rate": 4.779454117451978e-06, "loss": 2.5242, "step": 202 }, { "epoch": 0.5420560747663551, "grad_norm": 0.263671875, "learning_rate": 4.7353827293991845e-06, "loss": 2.5121, "step": 203 }, { "epoch": 0.5447263017356475, "grad_norm": 0.26171875, "learning_rate": 4.691331948362789e-06, "loss": 2.4662, "step": 204 }, { "epoch": 0.5473965287049399, "grad_norm": 0.265625, "learning_rate": 4.647305204788445e-06, "loss": 2.5188, "step": 205 }, { "epoch": 0.5500667556742324, "grad_norm": 0.26171875, "learning_rate": 4.603305927249902e-06, "loss": 2.4847, "step": 206 }, { "epoch": 0.5527369826435247, "grad_norm": 0.2578125, "learning_rate": 4.559337542181993e-06, "loss": 2.4865, "step": 207 }, { "epoch": 0.5554072096128171, "grad_norm": 0.26171875, "learning_rate": 4.5154034736138035e-06, "loss": 2.4858, "step": 208 }, { "epoch": 0.5580774365821095, "grad_norm": 0.2578125, "learning_rate": 4.471507142902036e-06, "loss": 2.4824, "step": 209 }, { "epoch": 0.5607476635514018, "grad_norm": 0.265625, "learning_rate": 4.427651968464559e-06, "loss": 2.515, "step": 210 }, { "epoch": 0.5634178905206942, "grad_norm": 0.271484375, "learning_rate": 4.383841365514208e-06, "loss": 2.5034, "step": 211 }, { "epoch": 0.5660881174899867, "grad_norm": 0.267578125, "learning_rate": 4.340078745792818e-06, "loss": 2.4928, "step": 212 }, { "epoch": 0.5687583444592791, "grad_norm": 0.263671875, "learning_rate": 4.296367517305548e-06, "loss": 2.51, "step": 213 }, { "epoch": 0.5714285714285714, "grad_norm": 0.265625, "learning_rate": 4.252711084055468e-06, "loss": 2.4997, "step": 214 }, { "epoch": 0.5740987983978638, "grad_norm": 0.265625, "learning_rate": 4.209112845778481e-06, "loss": 2.4813, "step": 215 }, { "epoch": 0.5767690253671562, "grad_norm": 0.263671875, "learning_rate": 4.165576197678571e-06, "loss": 2.481, "step": 216 }, { "epoch": 0.5794392523364486, "grad_norm": 0.259765625, "learning_rate": 4.122104530163397e-06, "loss": 2.4856, "step": 217 }, { "epoch": 0.582109479305741, "grad_norm": 0.26171875, "learning_rate": 4.0787012285802695e-06, "loss": 2.5178, "step": 218 }, { "epoch": 0.5847797062750334, "grad_norm": 0.265625, "learning_rate": 4.035369672952516e-06, "loss": 2.5174, "step": 219 }, { "epoch": 0.5874499332443258, "grad_norm": 0.263671875, "learning_rate": 3.992113237716261e-06, "loss": 2.495, "step": 220 }, { "epoch": 0.5901201602136181, "grad_norm": 0.267578125, "learning_rate": 3.948935291457645e-06, "loss": 2.5052, "step": 221 }, { "epoch": 0.5927903871829105, "grad_norm": 0.26953125, "learning_rate": 3.905839196650494e-06, "loss": 2.5198, "step": 222 }, { "epoch": 0.595460614152203, "grad_norm": 0.267578125, "learning_rate": 3.862828309394469e-06, "loss": 2.4906, "step": 223 }, { "epoch": 0.5981308411214953, "grad_norm": 0.26171875, "learning_rate": 3.8199059791537105e-06, "loss": 2.5027, "step": 224 }, { "epoch": 0.6008010680907877, "grad_norm": 0.267578125, "learning_rate": 3.777075548496001e-06, "loss": 2.4962, "step": 225 }, { "epoch": 0.6034712950600801, "grad_norm": 0.271484375, "learning_rate": 3.7343403528324574e-06, "loss": 2.5218, "step": 226 }, { "epoch": 0.6061415220293725, "grad_norm": 0.263671875, "learning_rate": 3.6917037201577977e-06, "loss": 2.5101, "step": 227 }, { "epoch": 0.6088117489986649, "grad_norm": 0.265625, "learning_rate": 3.649168970791157e-06, "loss": 2.5235, "step": 228 }, { "epoch": 0.6114819759679573, "grad_norm": 0.259765625, "learning_rate": 3.6067394171175397e-06, "loss": 2.4863, "step": 229 }, { "epoch": 0.6141522029372497, "grad_norm": 0.267578125, "learning_rate": 3.564418363329848e-06, "loss": 2.5164, "step": 230 }, { "epoch": 0.616822429906542, "grad_norm": 0.26171875, "learning_rate": 3.5222091051715803e-06, "loss": 2.5003, "step": 231 }, { "epoch": 0.6194926568758344, "grad_norm": 0.263671875, "learning_rate": 3.480114929680176e-06, "loss": 2.5213, "step": 232 }, { "epoch": 0.6221628838451269, "grad_norm": 0.265625, "learning_rate": 3.4381391149310294e-06, "loss": 2.5545, "step": 233 }, { "epoch": 0.6248331108144193, "grad_norm": 0.27734375, "learning_rate": 3.3962849297822225e-06, "loss": 2.4738, "step": 234 }, { "epoch": 0.6275033377837116, "grad_norm": 0.26171875, "learning_rate": 3.35455563361995e-06, "loss": 2.4812, "step": 235 }, { "epoch": 0.630173564753004, "grad_norm": 0.26171875, "learning_rate": 3.3129544761047093e-06, "loss": 2.4738, "step": 236 }, { "epoch": 0.6328437917222964, "grad_norm": 0.263671875, "learning_rate": 3.271484696918218e-06, "loss": 2.5361, "step": 237 }, { "epoch": 0.6355140186915887, "grad_norm": 0.2578125, "learning_rate": 3.2301495255111426e-06, "loss": 2.4995, "step": 238 }, { "epoch": 0.6381842456608812, "grad_norm": 0.267578125, "learning_rate": 3.1889521808515888e-06, "loss": 2.5055, "step": 239 }, { "epoch": 0.6408544726301736, "grad_norm": 0.26171875, "learning_rate": 3.1478958711744324e-06, "loss": 2.4533, "step": 240 }, { "epoch": 0.6435246995994659, "grad_norm": 0.26171875, "learning_rate": 3.1069837937314846e-06, "loss": 2.4603, "step": 241 }, { "epoch": 0.6461949265687583, "grad_norm": 0.265625, "learning_rate": 3.0662191345424925e-06, "loss": 2.5259, "step": 242 }, { "epoch": 0.6488651535380507, "grad_norm": 0.26171875, "learning_rate": 3.0256050681470446e-06, "loss": 2.5498, "step": 243 }, { "epoch": 0.6515353805073432, "grad_norm": 0.259765625, "learning_rate": 2.9851447573573383e-06, "loss": 2.4606, "step": 244 }, { "epoch": 0.6542056074766355, "grad_norm": 0.26953125, "learning_rate": 2.9448413530118912e-06, "loss": 2.5477, "step": 245 }, { "epoch": 0.6568758344459279, "grad_norm": 0.265625, "learning_rate": 2.904697993730159e-06, "loss": 2.5476, "step": 246 }, { "epoch": 0.6595460614152203, "grad_norm": 0.263671875, "learning_rate": 2.8647178056681197e-06, "loss": 2.5033, "step": 247 }, { "epoch": 0.6622162883845126, "grad_norm": 0.2578125, "learning_rate": 2.8249039022748315e-06, "loss": 2.4971, "step": 248 }, { "epoch": 0.664886515353805, "grad_norm": 0.267578125, "learning_rate": 2.785259384049959e-06, "loss": 2.4596, "step": 249 }, { "epoch": 0.6675567423230975, "grad_norm": 0.263671875, "learning_rate": 2.745787338302341e-06, "loss": 2.4948, "step": 250 }, { "epoch": 0.6702269692923899, "grad_norm": 0.267578125, "learning_rate": 2.706490838909547e-06, "loss": 2.4986, "step": 251 }, { "epoch": 0.6728971962616822, "grad_norm": 0.263671875, "learning_rate": 2.6673729460785174e-06, "loss": 2.4885, "step": 252 }, { "epoch": 0.6755674232309746, "grad_norm": 0.26953125, "learning_rate": 2.628436706107238e-06, "loss": 2.5042, "step": 253 }, { "epoch": 0.678237650200267, "grad_norm": 0.26171875, "learning_rate": 2.5896851511475184e-06, "loss": 2.5229, "step": 254 }, { "epoch": 0.6809078771695594, "grad_norm": 0.2578125, "learning_rate": 2.5511212989688587e-06, "loss": 2.4748, "step": 255 }, { "epoch": 0.6835781041388518, "grad_norm": 0.26171875, "learning_rate": 2.5127481527234397e-06, "loss": 2.4837, "step": 256 }, { "epoch": 0.6862483311081442, "grad_norm": 0.26171875, "learning_rate": 2.4745687007122636e-06, "loss": 2.5272, "step": 257 }, { "epoch": 0.6889185580774366, "grad_norm": 0.26171875, "learning_rate": 2.436585916152426e-06, "loss": 2.4953, "step": 258 }, { "epoch": 0.6915887850467289, "grad_norm": 0.255859375, "learning_rate": 2.3988027569455895e-06, "loss": 2.4866, "step": 259 }, { "epoch": 0.6942590120160214, "grad_norm": 0.26171875, "learning_rate": 2.361222165447628e-06, "loss": 2.5, "step": 260 }, { "epoch": 0.6969292389853138, "grad_norm": 0.259765625, "learning_rate": 2.323847068239504e-06, "loss": 2.5211, "step": 261 }, { "epoch": 0.6995994659546061, "grad_norm": 0.271484375, "learning_rate": 2.2866803758993446e-06, "loss": 2.5103, "step": 262 }, { "epoch": 0.7022696929238985, "grad_norm": 0.265625, "learning_rate": 2.2497249827757933e-06, "loss": 2.5008, "step": 263 }, { "epoch": 0.7049399198931909, "grad_norm": 0.2578125, "learning_rate": 2.2129837667626147e-06, "loss": 2.4844, "step": 264 }, { "epoch": 0.7076101468624834, "grad_norm": 0.26953125, "learning_rate": 2.176459589074566e-06, "loss": 2.5163, "step": 265 }, { "epoch": 0.7102803738317757, "grad_norm": 0.265625, "learning_rate": 2.1401552940245962e-06, "loss": 2.5074, "step": 266 }, { "epoch": 0.7129506008010681, "grad_norm": 0.265625, "learning_rate": 2.1040737088023323e-06, "loss": 2.4936, "step": 267 }, { "epoch": 0.7156208277703605, "grad_norm": 0.265625, "learning_rate": 2.068217643253925e-06, "loss": 2.491, "step": 268 }, { "epoch": 0.7182910547396528, "grad_norm": 0.265625, "learning_rate": 2.0325898896632178e-06, "loss": 2.5246, "step": 269 }, { "epoch": 0.7209612817089452, "grad_norm": 0.267578125, "learning_rate": 1.997193222534316e-06, "loss": 2.5156, "step": 270 }, { "epoch": 0.7236315086782377, "grad_norm": 0.267578125, "learning_rate": 1.962030398375506e-06, "loss": 2.4926, "step": 271 }, { "epoch": 0.7263017356475301, "grad_norm": 0.259765625, "learning_rate": 1.927104155484602e-06, "loss": 2.5143, "step": 272 }, { "epoch": 0.7289719626168224, "grad_norm": 0.259765625, "learning_rate": 1.8924172137357038e-06, "loss": 2.4877, "step": 273 }, { "epoch": 0.7316421895861148, "grad_norm": 0.2734375, "learning_rate": 1.8579722743673773e-06, "loss": 2.513, "step": 274 }, { "epoch": 0.7343124165554072, "grad_norm": 0.263671875, "learning_rate": 1.8237720197723075e-06, "loss": 2.5352, "step": 275 }, { "epoch": 0.7369826435246996, "grad_norm": 0.263671875, "learning_rate": 1.789819113288397e-06, "loss": 2.4835, "step": 276 }, { "epoch": 0.739652870493992, "grad_norm": 0.26171875, "learning_rate": 1.75611619899137e-06, "loss": 2.4761, "step": 277 }, { "epoch": 0.7423230974632844, "grad_norm": 0.341796875, "learning_rate": 1.7226659014888548e-06, "loss": 2.4956, "step": 278 }, { "epoch": 0.7449933244325768, "grad_norm": 0.265625, "learning_rate": 1.689470825715998e-06, "loss": 2.5053, "step": 279 }, { "epoch": 0.7476635514018691, "grad_norm": 0.263671875, "learning_rate": 1.6565335567326112e-06, "loss": 2.4997, "step": 280 }, { "epoch": 0.7503337783711616, "grad_norm": 0.26171875, "learning_rate": 1.6238566595218475e-06, "loss": 2.5431, "step": 281 }, { "epoch": 0.753004005340454, "grad_norm": 0.259765625, "learning_rate": 1.591442678790467e-06, "loss": 2.4902, "step": 282 }, { "epoch": 0.7556742323097463, "grad_norm": 0.2734375, "learning_rate": 1.5592941387706562e-06, "loss": 2.5166, "step": 283 }, { "epoch": 0.7583444592790387, "grad_norm": 0.271484375, "learning_rate": 1.5274135430234654e-06, "loss": 2.501, "step": 284 }, { "epoch": 0.7610146862483311, "grad_norm": 0.26171875, "learning_rate": 1.4958033742438348e-06, "loss": 2.5154, "step": 285 }, { "epoch": 0.7636849132176236, "grad_norm": 0.294921875, "learning_rate": 1.4644660940672628e-06, "loss": 2.5048, "step": 286 }, { "epoch": 0.7663551401869159, "grad_norm": 0.26171875, "learning_rate": 1.4334041428781003e-06, "loss": 2.4991, "step": 287 }, { "epoch": 0.7690253671562083, "grad_norm": 0.26171875, "learning_rate": 1.4026199396195078e-06, "loss": 2.488, "step": 288 }, { "epoch": 0.7716955941255007, "grad_norm": 0.263671875, "learning_rate": 1.3721158816050872e-06, "loss": 2.4893, "step": 289 }, { "epoch": 0.774365821094793, "grad_norm": 0.26171875, "learning_rate": 1.3418943443321807e-06, "loss": 2.5037, "step": 290 }, { "epoch": 0.7770360480640854, "grad_norm": 0.265625, "learning_rate": 1.3119576812968893e-06, "loss": 2.4731, "step": 291 }, { "epoch": 0.7797062750333779, "grad_norm": 0.267578125, "learning_rate": 1.282308223810786e-06, "loss": 2.503, "step": 292 }, { "epoch": 0.7823765020026703, "grad_norm": 0.265625, "learning_rate": 1.252948280819375e-06, "loss": 2.5442, "step": 293 }, { "epoch": 0.7850467289719626, "grad_norm": 0.259765625, "learning_rate": 1.2238801387222716e-06, "loss": 2.4877, "step": 294 }, { "epoch": 0.787716955941255, "grad_norm": 0.26171875, "learning_rate": 1.1951060611951615e-06, "loss": 2.5067, "step": 295 }, { "epoch": 0.7903871829105474, "grad_norm": 0.26953125, "learning_rate": 1.1666282890135083e-06, "loss": 2.5179, "step": 296 }, { "epoch": 0.7930574098798397, "grad_norm": 0.26171875, "learning_rate": 1.1384490398780563e-06, "loss": 2.4758, "step": 297 }, { "epoch": 0.7957276368491322, "grad_norm": 0.259765625, "learning_rate": 1.1105705082421303e-06, "loss": 2.4833, "step": 298 }, { "epoch": 0.7983978638184246, "grad_norm": 0.263671875, "learning_rate": 1.0829948651407374e-06, "loss": 2.4751, "step": 299 }, { "epoch": 0.8010680907877169, "grad_norm": 0.255859375, "learning_rate": 1.0557242580215066e-06, "loss": 2.4916, "step": 300 }, { "epoch": 0.8037383177570093, "grad_norm": 0.263671875, "learning_rate": 1.0287608105774456e-06, "loss": 2.5196, "step": 301 }, { "epoch": 0.8064085447263017, "grad_norm": 0.255859375, "learning_rate": 1.002106622581569e-06, "loss": 2.5008, "step": 302 }, { "epoch": 0.8090787716955942, "grad_norm": 0.28515625, "learning_rate": 9.757637697233723e-07, "loss": 2.4333, "step": 303 }, { "epoch": 0.8117489986648865, "grad_norm": 0.267578125, "learning_rate": 9.497343034471896e-07, "loss": 2.5224, "step": 304 }, { "epoch": 0.8144192256341789, "grad_norm": 0.263671875, "learning_rate": 9.240202507924412e-07, "loss": 2.5119, "step": 305 }, { "epoch": 0.8170894526034713, "grad_norm": 0.259765625, "learning_rate": 8.986236142357707e-07, "loss": 2.5334, "step": 306 }, { "epoch": 0.8197596795727636, "grad_norm": 0.2578125, "learning_rate": 8.735463715351139e-07, "loss": 2.4918, "step": 307 }, { "epoch": 0.822429906542056, "grad_norm": 0.263671875, "learning_rate": 8.487904755756676e-07, "loss": 2.4838, "step": 308 }, { "epoch": 0.8251001335113485, "grad_norm": 0.265625, "learning_rate": 8.243578542178227e-07, "loss": 2.5301, "step": 309 }, { "epoch": 0.8277703604806409, "grad_norm": 0.259765625, "learning_rate": 8.002504101470204e-07, "loss": 2.5125, "step": 310 }, { "epoch": 0.8304405874499332, "grad_norm": 0.25390625, "learning_rate": 7.764700207255904e-07, "loss": 2.5022, "step": 311 }, { "epoch": 0.8331108144192256, "grad_norm": 0.263671875, "learning_rate": 7.530185378465459e-07, "loss": 2.527, "step": 312 }, { "epoch": 0.835781041388518, "grad_norm": 0.26171875, "learning_rate": 7.298977877893688e-07, "loss": 2.5099, "step": 313 }, { "epoch": 0.8384512683578104, "grad_norm": 0.26171875, "learning_rate": 7.071095710777925e-07, "loss": 2.4672, "step": 314 }, { "epoch": 0.8411214953271028, "grad_norm": 0.259765625, "learning_rate": 6.846556623395795e-07, "loss": 2.4596, "step": 315 }, { "epoch": 0.8437917222963952, "grad_norm": 0.265625, "learning_rate": 6.625378101683317e-07, "loss": 2.5339, "step": 316 }, { "epoch": 0.8464619492656876, "grad_norm": 0.259765625, "learning_rate": 6.40757736987307e-07, "loss": 2.4655, "step": 317 }, { "epoch": 0.8491321762349799, "grad_norm": 0.263671875, "learning_rate": 6.193171389152996e-07, "loss": 2.5052, "step": 318 }, { "epoch": 0.8518024032042724, "grad_norm": 0.267578125, "learning_rate": 5.982176856345445e-07, "loss": 2.5025, "step": 319 }, { "epoch": 0.8544726301735648, "grad_norm": 0.26953125, "learning_rate": 5.774610202606939e-07, "loss": 2.4683, "step": 320 }, { "epoch": 0.8571428571428571, "grad_norm": 0.26171875, "learning_rate": 5.570487592148666e-07, "loss": 2.4735, "step": 321 }, { "epoch": 0.8598130841121495, "grad_norm": 0.265625, "learning_rate": 5.369824920977567e-07, "loss": 2.5021, "step": 322 }, { "epoch": 0.8624833110814419, "grad_norm": 0.263671875, "learning_rate": 5.172637815658583e-07, "loss": 2.5157, "step": 323 }, { "epoch": 0.8651535380507344, "grad_norm": 0.2578125, "learning_rate": 4.978941632097612e-07, "loss": 2.4827, "step": 324 }, { "epoch": 0.8678237650200267, "grad_norm": 0.259765625, "learning_rate": 4.788751454345763e-07, "loss": 2.4453, "step": 325 }, { "epoch": 0.8704939919893191, "grad_norm": 0.259765625, "learning_rate": 4.60208209342462e-07, "loss": 2.4767, "step": 326 }, { "epoch": 0.8731642189586115, "grad_norm": 0.267578125, "learning_rate": 4.4189480861729137e-07, "loss": 2.5088, "step": 327 }, { "epoch": 0.8758344459279038, "grad_norm": 0.263671875, "learning_rate": 4.239363694114368e-07, "loss": 2.4893, "step": 328 }, { "epoch": 0.8785046728971962, "grad_norm": 0.26171875, "learning_rate": 4.0633429023472004e-07, "loss": 2.5054, "step": 329 }, { "epoch": 0.8811748998664887, "grad_norm": 0.263671875, "learning_rate": 3.890899418454913e-07, "loss": 2.4951, "step": 330 }, { "epoch": 0.8838451268357811, "grad_norm": 0.26171875, "learning_rate": 3.72204667143895e-07, "loss": 2.4751, "step": 331 }, { "epoch": 0.8865153538050734, "grad_norm": 0.259765625, "learning_rate": 3.556797810672785e-07, "loss": 2.488, "step": 332 }, { "epoch": 0.8891855807743658, "grad_norm": 0.26171875, "learning_rate": 3.395165704878023e-07, "loss": 2.5096, "step": 333 }, { "epoch": 0.8918558077436582, "grad_norm": 0.26171875, "learning_rate": 3.237162941122185e-07, "loss": 2.4998, "step": 334 }, { "epoch": 0.8945260347129506, "grad_norm": 0.255859375, "learning_rate": 3.082801823838527e-07, "loss": 2.4736, "step": 335 }, { "epoch": 0.897196261682243, "grad_norm": 0.265625, "learning_rate": 2.932094373867811e-07, "loss": 2.5098, "step": 336 }, { "epoch": 0.8998664886515354, "grad_norm": 0.259765625, "learning_rate": 2.785052327522214e-07, "loss": 2.4961, "step": 337 }, { "epoch": 0.9025367156208278, "grad_norm": 0.26171875, "learning_rate": 2.6416871356713224e-07, "loss": 2.5133, "step": 338 }, { "epoch": 0.9052069425901201, "grad_norm": 0.267578125, "learning_rate": 2.5020099628504603e-07, "loss": 2.4595, "step": 339 }, { "epoch": 0.9078771695594126, "grad_norm": 0.263671875, "learning_rate": 2.3660316863911682e-07, "loss": 2.5258, "step": 340 }, { "epoch": 0.910547396528705, "grad_norm": 0.265625, "learning_rate": 2.2337628955742263e-07, "loss": 2.4918, "step": 341 }, { "epoch": 0.9132176234979973, "grad_norm": 0.259765625, "learning_rate": 2.1052138908049303e-07, "loss": 2.5225, "step": 342 }, { "epoch": 0.9158878504672897, "grad_norm": 0.259765625, "learning_rate": 1.9803946828110376e-07, "loss": 2.4688, "step": 343 }, { "epoch": 0.9185580774365821, "grad_norm": 0.255859375, "learning_rate": 1.8593149918630927e-07, "loss": 2.4878, "step": 344 }, { "epoch": 0.9212283044058746, "grad_norm": 0.259765625, "learning_rate": 1.7419842470175196e-07, "loss": 2.5185, "step": 345 }, { "epoch": 0.9238985313751669, "grad_norm": 0.265625, "learning_rate": 1.6284115853823445e-07, "loss": 2.5176, "step": 346 }, { "epoch": 0.9265687583444593, "grad_norm": 0.2578125, "learning_rate": 1.5186058514055912e-07, "loss": 2.4453, "step": 347 }, { "epoch": 0.9292389853137517, "grad_norm": 0.265625, "learning_rate": 1.4125755961865827e-07, "loss": 2.5187, "step": 348 }, { "epoch": 0.931909212283044, "grad_norm": 0.271484375, "learning_rate": 1.3103290768099796e-07, "loss": 2.5128, "step": 349 }, { "epoch": 0.9345794392523364, "grad_norm": 0.259765625, "learning_rate": 1.2118742557027885e-07, "loss": 2.5018, "step": 350 }, { "epoch": 0.9372496662216289, "grad_norm": 0.26953125, "learning_rate": 1.1172188000142803e-07, "loss": 2.5313, "step": 351 }, { "epoch": 0.9399198931909212, "grad_norm": 0.263671875, "learning_rate": 1.026370081018907e-07, "loss": 2.5207, "step": 352 }, { "epoch": 0.9425901201602136, "grad_norm": 0.2578125, "learning_rate": 9.393351735422773e-08, "loss": 2.5082, "step": 353 }, { "epoch": 0.945260347129506, "grad_norm": 0.2578125, "learning_rate": 8.561208554101863e-08, "loss": 2.4809, "step": 354 }, { "epoch": 0.9479305740987984, "grad_norm": 0.267578125, "learning_rate": 7.76733606920832e-08, "loss": 2.5182, "step": 355 }, { "epoch": 0.9506008010680908, "grad_norm": 0.259765625, "learning_rate": 7.011796103401192e-08, "loss": 2.4804, "step": 356 }, { "epoch": 0.9532710280373832, "grad_norm": 0.263671875, "learning_rate": 6.294647494202444e-08, "loss": 2.4949, "step": 357 }, { "epoch": 0.9559412550066756, "grad_norm": 0.26171875, "learning_rate": 5.615946089414737e-08, "loss": 2.5131, "step": 358 }, { "epoch": 0.9586114819759679, "grad_norm": 0.259765625, "learning_rate": 4.975744742772848e-08, "loss": 2.4944, "step": 359 }, { "epoch": 0.9612817089452603, "grad_norm": 0.259765625, "learning_rate": 4.37409330982691e-08, "loss": 2.5229, "step": 360 }, { "epoch": 0.9639519359145527, "grad_norm": 0.265625, "learning_rate": 3.8110386440605164e-08, "loss": 2.5288, "step": 361 }, { "epoch": 0.9666221628838452, "grad_norm": 0.265625, "learning_rate": 3.2866245932418606e-08, "loss": 2.4581, "step": 362 }, { "epoch": 0.9692923898531375, "grad_norm": 0.267578125, "learning_rate": 2.8008919960090253e-08, "loss": 2.5052, "step": 363 }, { "epoch": 0.9719626168224299, "grad_norm": 0.265625, "learning_rate": 2.3538786786896918e-08, "loss": 2.5043, "step": 364 }, { "epoch": 0.9746328437917223, "grad_norm": 0.26171875, "learning_rate": 1.9456194523554404e-08, "loss": 2.4863, "step": 365 }, { "epoch": 0.9773030707610146, "grad_norm": 0.26171875, "learning_rate": 1.576146110111032e-08, "loss": 2.5012, "step": 366 }, { "epoch": 0.9799732977303071, "grad_norm": 0.2578125, "learning_rate": 1.2454874246181081e-08, "loss": 2.516, "step": 367 }, { "epoch": 0.9826435246995995, "grad_norm": 0.267578125, "learning_rate": 9.536691458548741e-09, "loss": 2.5053, "step": 368 }, { "epoch": 0.9853137516688919, "grad_norm": 0.26171875, "learning_rate": 7.007139991108136e-09, "loss": 2.4899, "step": 369 }, { "epoch": 0.9879839786381842, "grad_norm": 0.26171875, "learning_rate": 4.866416832167153e-09, "loss": 2.4932, "step": 370 }, { "epoch": 0.9906542056074766, "grad_norm": 0.259765625, "learning_rate": 3.1146886901090024e-09, "loss": 2.5244, "step": 371 }, { "epoch": 0.9933244325767691, "grad_norm": 0.259765625, "learning_rate": 1.7520919804075997e-09, "loss": 2.494, "step": 372 }, { "epoch": 0.9959946595460614, "grad_norm": 0.26171875, "learning_rate": 7.787328150071771e-10, "loss": 2.4716, "step": 373 }, { "epoch": 0.9986648865153538, "grad_norm": 0.26171875, "learning_rate": 1.9468699405444936e-10, "loss": 2.4657, "step": 374 } ], "logging_steps": 1, "max_steps": 374, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.698021672895119e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }