{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9986648865153538,
  "eval_steps": 500,
  "global_step": 374,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0026702269692923898,
      "grad_norm": 0.5390625,
      "learning_rate": 0.0,
      "loss": 2.534,
      "step": 1
    },
    {
      "epoch": 0.0053404539385847796,
      "grad_norm": 0.55078125,
      "learning_rate": 5.555555555555555e-07,
      "loss": 2.5367,
      "step": 2
    },
    {
      "epoch": 0.00801068090787717,
      "grad_norm": 0.5390625,
      "learning_rate": 1.111111111111111e-06,
      "loss": 2.5319,
      "step": 3
    },
    {
      "epoch": 0.010680907877169559,
      "grad_norm": 0.55078125,
      "learning_rate": 1.6666666666666667e-06,
      "loss": 2.5296,
      "step": 4
    },
    {
      "epoch": 0.01335113484646195,
      "grad_norm": 0.55859375,
      "learning_rate": 2.222222222222222e-06,
      "loss": 2.5376,
      "step": 5
    },
    {
      "epoch": 0.01602136181575434,
      "grad_norm": 0.57421875,
      "learning_rate": 2.7777777777777783e-06,
      "loss": 2.5011,
      "step": 6
    },
    {
      "epoch": 0.018691588785046728,
      "grad_norm": 0.51171875,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 2.501,
      "step": 7
    },
    {
      "epoch": 0.021361815754339118,
      "grad_norm": 0.5390625,
      "learning_rate": 3.88888888888889e-06,
      "loss": 2.5337,
      "step": 8
    },
    {
      "epoch": 0.02403204272363151,
      "grad_norm": 0.50390625,
      "learning_rate": 4.444444444444444e-06,
      "loss": 2.4829,
      "step": 9
    },
    {
      "epoch": 0.0267022696929239,
      "grad_norm": 0.48828125,
      "learning_rate": 5e-06,
      "loss": 2.4611,
      "step": 10
    },
    {
      "epoch": 0.029372496662216287,
      "grad_norm": 0.46484375,
      "learning_rate": 5.555555555555557e-06,
      "loss": 2.4401,
      "step": 11
    },
    {
      "epoch": 0.03204272363150868,
      "grad_norm": 0.447265625,
      "learning_rate": 6.111111111111112e-06,
      "loss": 2.4879,
      "step": 12
    },
    {
      "epoch": 0.03471295060080107,
      "grad_norm": 0.4296875,
      "learning_rate": 6.666666666666667e-06,
      "loss": 2.5059,
      "step": 13
    },
    {
      "epoch": 0.037383177570093455,
      "grad_norm": 0.40234375,
      "learning_rate": 7.222222222222223e-06,
      "loss": 2.5041,
      "step": 14
    },
    {
      "epoch": 0.04005340453938585,
      "grad_norm": 0.400390625,
      "learning_rate": 7.77777777777778e-06,
      "loss": 2.516,
      "step": 15
    },
    {
      "epoch": 0.042723631508678236,
      "grad_norm": 0.3828125,
      "learning_rate": 8.333333333333334e-06,
      "loss": 2.504,
      "step": 16
    },
    {
      "epoch": 0.04539385847797063,
      "grad_norm": 0.37109375,
      "learning_rate": 8.888888888888888e-06,
      "loss": 2.523,
      "step": 17
    },
    {
      "epoch": 0.04806408544726302,
      "grad_norm": 0.361328125,
      "learning_rate": 9.444444444444445e-06,
      "loss": 2.5062,
      "step": 18
    },
    {
      "epoch": 0.050734312416555405,
      "grad_norm": 0.369140625,
      "learning_rate": 1e-05,
      "loss": 2.5219,
      "step": 19
    },
    {
      "epoch": 0.0534045393858478,
      "grad_norm": 0.345703125,
      "learning_rate": 9.999805313005946e-06,
      "loss": 2.5032,
      "step": 20
    },
    {
      "epoch": 0.056074766355140186,
      "grad_norm": 0.345703125,
      "learning_rate": 9.999221267184993e-06,
      "loss": 2.5027,
      "step": 21
    },
    {
      "epoch": 0.05874499332443257,
      "grad_norm": 0.337890625,
      "learning_rate": 9.998247908019594e-06,
      "loss": 2.5135,
      "step": 22
    },
    {
      "epoch": 0.06141522029372497,
      "grad_norm": 0.322265625,
      "learning_rate": 9.996885311309892e-06,
      "loss": 2.5042,
      "step": 23
    },
    {
      "epoch": 0.06408544726301736,
      "grad_norm": 0.318359375,
      "learning_rate": 9.995133583167833e-06,
      "loss": 2.513,
      "step": 24
    },
    {
      "epoch": 0.06675567423230974,
      "grad_norm": 0.322265625,
      "learning_rate": 9.992992860008893e-06,
      "loss": 2.5058,
      "step": 25
    },
    {
      "epoch": 0.06942590120160214,
      "grad_norm": 0.314453125,
      "learning_rate": 9.990463308541452e-06,
      "loss": 2.5437,
      "step": 26
    },
    {
      "epoch": 0.07209612817089453,
      "grad_norm": 0.3046875,
      "learning_rate": 9.987545125753818e-06,
      "loss": 2.4898,
      "step": 27
    },
    {
      "epoch": 0.07476635514018691,
      "grad_norm": 0.3046875,
      "learning_rate": 9.98423853889889e-06,
      "loss": 2.4943,
      "step": 28
    },
    {
      "epoch": 0.0774365821094793,
      "grad_norm": 0.30859375,
      "learning_rate": 9.980543805476447e-06,
      "loss": 2.5259,
      "step": 29
    },
    {
      "epoch": 0.0801068090787717,
      "grad_norm": 0.314453125,
      "learning_rate": 9.976461213213104e-06,
      "loss": 2.5365,
      "step": 30
    },
    {
      "epoch": 0.08277703604806408,
      "grad_norm": 0.296875,
      "learning_rate": 9.971991080039912e-06,
      "loss": 2.4535,
      "step": 31
    },
    {
      "epoch": 0.08544726301735647,
      "grad_norm": 0.302734375,
      "learning_rate": 9.967133754067581e-06,
      "loss": 2.5043,
      "step": 32
    },
    {
      "epoch": 0.08811748998664887,
      "grad_norm": 0.294921875,
      "learning_rate": 9.961889613559396e-06,
      "loss": 2.5127,
      "step": 33
    },
    {
      "epoch": 0.09078771695594126,
      "grad_norm": 0.298828125,
      "learning_rate": 9.956259066901733e-06,
      "loss": 2.5135,
      "step": 34
    },
    {
      "epoch": 0.09345794392523364,
      "grad_norm": 0.2890625,
      "learning_rate": 9.950242552572272e-06,
      "loss": 2.469,
      "step": 35
    },
    {
      "epoch": 0.09612817089452604,
      "grad_norm": 0.294921875,
      "learning_rate": 9.943840539105853e-06,
      "loss": 2.5182,
      "step": 36
    },
    {
      "epoch": 0.09879839786381843,
      "grad_norm": 0.29296875,
      "learning_rate": 9.937053525057977e-06,
      "loss": 2.5109,
      "step": 37
    },
    {
      "epoch": 0.10146862483311081,
      "grad_norm": 0.291015625,
      "learning_rate": 9.92988203896599e-06,
      "loss": 2.5071,
      "step": 38
    },
    {
      "epoch": 0.1041388518024032,
      "grad_norm": 0.287109375,
      "learning_rate": 9.922326639307918e-06,
      "loss": 2.473,
      "step": 39
    },
    {
      "epoch": 0.1068090787716956,
      "grad_norm": 0.28125,
      "learning_rate": 9.914387914458983e-06,
      "loss": 2.4875,
      "step": 40
    },
    {
      "epoch": 0.10947930574098798,
      "grad_norm": 0.28515625,
      "learning_rate": 9.906066482645774e-06,
      "loss": 2.5014,
      "step": 41
    },
    {
      "epoch": 0.11214953271028037,
      "grad_norm": 0.283203125,
      "learning_rate": 9.89736299189811e-06,
      "loss": 2.5084,
      "step": 42
    },
    {
      "epoch": 0.11481975967957277,
      "grad_norm": 0.2890625,
      "learning_rate": 9.888278119998573e-06,
      "loss": 2.4905,
      "step": 43
    },
    {
      "epoch": 0.11748998664886515,
      "grad_norm": 0.279296875,
      "learning_rate": 9.878812574429722e-06,
      "loss": 2.4696,
      "step": 44
    },
    {
      "epoch": 0.12016021361815754,
      "grad_norm": 0.298828125,
      "learning_rate": 9.868967092319003e-06,
      "loss": 2.4912,
      "step": 45
    },
    {
      "epoch": 0.12283044058744993,
      "grad_norm": 0.287109375,
      "learning_rate": 9.858742440381343e-06,
      "loss": 2.4963,
      "step": 46
    },
    {
      "epoch": 0.12550066755674233,
      "grad_norm": 0.279296875,
      "learning_rate": 9.848139414859441e-06,
      "loss": 2.5145,
      "step": 47
    },
    {
      "epoch": 0.12817089452603472,
      "grad_norm": 0.279296875,
      "learning_rate": 9.837158841461767e-06,
      "loss": 2.4842,
      "step": 48
    },
    {
      "epoch": 0.1308411214953271,
      "grad_norm": 0.283203125,
      "learning_rate": 9.825801575298248e-06,
      "loss": 2.5164,
      "step": 49
    },
    {
      "epoch": 0.13351134846461948,
      "grad_norm": 0.26953125,
      "learning_rate": 9.814068500813692e-06,
      "loss": 2.5011,
      "step": 50
    },
    {
      "epoch": 0.13618157543391188,
      "grad_norm": 0.275390625,
      "learning_rate": 9.801960531718898e-06,
      "loss": 2.5144,
      "step": 51
    },
    {
      "epoch": 0.13885180240320427,
      "grad_norm": 0.279296875,
      "learning_rate": 9.789478610919508e-06,
      "loss": 2.4887,
      "step": 52
    },
    {
      "epoch": 0.14152202937249667,
      "grad_norm": 0.275390625,
      "learning_rate": 9.77662371044258e-06,
      "loss": 2.508,
      "step": 53
    },
    {
      "epoch": 0.14419225634178906,
      "grad_norm": 0.28125,
      "learning_rate": 9.763396831360884e-06,
      "loss": 2.506,
      "step": 54
    },
    {
      "epoch": 0.14686248331108145,
      "grad_norm": 0.28125,
      "learning_rate": 9.749799003714954e-06,
      "loss": 2.5483,
      "step": 55
    },
    {
      "epoch": 0.14953271028037382,
      "grad_norm": 0.279296875,
      "learning_rate": 9.735831286432869e-06,
      "loss": 2.5136,
      "step": 56
    },
    {
      "epoch": 0.15220293724966621,
      "grad_norm": 0.2734375,
      "learning_rate": 9.721494767247779e-06,
      "loss": 2.487,
      "step": 57
    },
    {
      "epoch": 0.1548731642189586,
      "grad_norm": 0.28515625,
      "learning_rate": 9.70679056261322e-06,
      "loss": 2.5026,
      "step": 58
    },
    {
      "epoch": 0.157543391188251,
      "grad_norm": 0.27734375,
      "learning_rate": 9.691719817616148e-06,
      "loss": 2.5142,
      "step": 59
    },
    {
      "epoch": 0.1602136181575434,
      "grad_norm": 0.275390625,
      "learning_rate": 9.676283705887783e-06,
      "loss": 2.5027,
      "step": 60
    },
    {
      "epoch": 0.1628838451268358,
      "grad_norm": 0.2734375,
      "learning_rate": 9.660483429512198e-06,
      "loss": 2.5344,
      "step": 61
    },
    {
      "epoch": 0.16555407209612816,
      "grad_norm": 0.271484375,
      "learning_rate": 9.644320218932723e-06,
      "loss": 2.5107,
      "step": 62
    },
    {
      "epoch": 0.16822429906542055,
      "grad_norm": 0.267578125,
      "learning_rate": 9.627795332856107e-06,
      "loss": 2.4842,
      "step": 63
    },
    {
      "epoch": 0.17089452603471295,
      "grad_norm": 0.271484375,
      "learning_rate": 9.61091005815451e-06,
      "loss": 2.4635,
      "step": 64
    },
    {
      "epoch": 0.17356475300400534,
      "grad_norm": 0.271484375,
      "learning_rate": 9.59366570976528e-06,
      "loss": 2.5131,
      "step": 65
    },
    {
      "epoch": 0.17623497997329773,
      "grad_norm": 0.2734375,
      "learning_rate": 9.576063630588563e-06,
      "loss": 2.5098,
      "step": 66
    },
    {
      "epoch": 0.17890520694259013,
      "grad_norm": 0.275390625,
      "learning_rate": 9.55810519138271e-06,
      "loss": 2.5209,
      "step": 67
    },
    {
      "epoch": 0.18157543391188252,
      "grad_norm": 0.28125,
      "learning_rate": 9.53979179065754e-06,
      "loss": 2.5272,
      "step": 68
    },
    {
      "epoch": 0.1842456608811749,
      "grad_norm": 0.26953125,
      "learning_rate": 9.521124854565425e-06,
      "loss": 2.5067,
      "step": 69
    },
    {
      "epoch": 0.18691588785046728,
      "grad_norm": 0.26953125,
      "learning_rate": 9.50210583679024e-06,
      "loss": 2.5268,
      "step": 70
    },
    {
      "epoch": 0.18958611481975968,
      "grad_norm": 0.2734375,
      "learning_rate": 9.482736218434144e-06,
      "loss": 2.5179,
      "step": 71
    },
    {
      "epoch": 0.19225634178905207,
      "grad_norm": 0.271484375,
      "learning_rate": 9.463017507902245e-06,
      "loss": 2.4965,
      "step": 72
    },
    {
      "epoch": 0.19492656875834447,
      "grad_norm": 0.271484375,
      "learning_rate": 9.442951240785135e-06,
      "loss": 2.5109,
      "step": 73
    },
    {
      "epoch": 0.19759679572763686,
      "grad_norm": 0.265625,
      "learning_rate": 9.422538979739307e-06,
      "loss": 2.4834,
      "step": 74
    },
    {
      "epoch": 0.20026702269692923,
      "grad_norm": 0.271484375,
      "learning_rate": 9.401782314365458e-06,
      "loss": 2.4986,
      "step": 75
    },
    {
      "epoch": 0.20293724966622162,
      "grad_norm": 0.26953125,
      "learning_rate": 9.380682861084703e-06,
      "loss": 2.4779,
      "step": 76
    },
    {
      "epoch": 0.205607476635514,
      "grad_norm": 0.26953125,
      "learning_rate": 9.359242263012693e-06,
      "loss": 2.4841,
      "step": 77
    },
    {
      "epoch": 0.2082777036048064,
      "grad_norm": 0.2734375,
      "learning_rate": 9.33746218983167e-06,
      "loss": 2.4902,
      "step": 78
    },
    {
      "epoch": 0.2109479305740988,
      "grad_norm": 0.30078125,
      "learning_rate": 9.315344337660422e-06,
      "loss": 2.4984,
      "step": 79
    },
    {
      "epoch": 0.2136181575433912,
      "grad_norm": 0.271484375,
      "learning_rate": 9.29289042892221e-06,
      "loss": 2.4941,
      "step": 80
    },
    {
      "epoch": 0.2162883845126836,
      "grad_norm": 0.279296875,
      "learning_rate": 9.270102212210632e-06,
      "loss": 2.5192,
      "step": 81
    },
    {
      "epoch": 0.21895861148197596,
      "grad_norm": 0.26953125,
      "learning_rate": 9.246981462153456e-06,
      "loss": 2.4991,
      "step": 82
    },
    {
      "epoch": 0.22162883845126835,
      "grad_norm": 0.26953125,
      "learning_rate": 9.223529979274411e-06,
      "loss": 2.483,
      "step": 83
    },
    {
      "epoch": 0.22429906542056074,
      "grad_norm": 0.271484375,
      "learning_rate": 9.19974958985298e-06,
      "loss": 2.5091,
      "step": 84
    },
    {
      "epoch": 0.22696929238985314,
      "grad_norm": 0.271484375,
      "learning_rate": 9.175642145782179e-06,
      "loss": 2.5119,
      "step": 85
    },
    {
      "epoch": 0.22963951935914553,
      "grad_norm": 0.271484375,
      "learning_rate": 9.151209524424333e-06,
      "loss": 2.5313,
      "step": 86
    },
    {
      "epoch": 0.23230974632843793,
      "grad_norm": 0.28125,
      "learning_rate": 9.126453628464889e-06,
      "loss": 2.4813,
      "step": 87
    },
    {
      "epoch": 0.2349799732977303,
      "grad_norm": 0.267578125,
      "learning_rate": 9.10137638576423e-06,
      "loss": 2.489,
      "step": 88
    },
    {
      "epoch": 0.2376502002670227,
      "grad_norm": 0.2734375,
      "learning_rate": 9.07597974920756e-06,
      "loss": 2.5331,
      "step": 89
    },
    {
      "epoch": 0.24032042723631508,
      "grad_norm": 0.265625,
      "learning_rate": 9.05026569655281e-06,
      "loss": 2.4919,
      "step": 90
    },
    {
      "epoch": 0.24299065420560748,
      "grad_norm": 0.271484375,
      "learning_rate": 9.02423623027663e-06,
      "loss": 2.561,
      "step": 91
    },
    {
      "epoch": 0.24566088117489987,
      "grad_norm": 0.26953125,
      "learning_rate": 8.997893377418432e-06,
      "loss": 2.5266,
      "step": 92
    },
    {
      "epoch": 0.24833110814419226,
      "grad_norm": 0.265625,
      "learning_rate": 8.971239189422555e-06,
      "loss": 2.4969,
      "step": 93
    },
    {
      "epoch": 0.25100133511348466,
      "grad_norm": 0.2734375,
      "learning_rate": 8.944275741978495e-06,
      "loss": 2.4977,
      "step": 94
    },
    {
      "epoch": 0.253671562082777,
      "grad_norm": 0.271484375,
      "learning_rate": 8.917005134859263e-06,
      "loss": 2.4885,
      "step": 95
    },
    {
      "epoch": 0.25634178905206945,
      "grad_norm": 0.26953125,
      "learning_rate": 8.889429491757872e-06,
      "loss": 2.4995,
      "step": 96
    },
    {
      "epoch": 0.2590120160213618,
      "grad_norm": 0.271484375,
      "learning_rate": 8.861550960121946e-06,
      "loss": 2.5081,
      "step": 97
    },
    {
      "epoch": 0.2616822429906542,
      "grad_norm": 0.265625,
      "learning_rate": 8.833371710986493e-06,
      "loss": 2.4995,
      "step": 98
    },
    {
      "epoch": 0.2643524699599466,
      "grad_norm": 0.26171875,
      "learning_rate": 8.804893938804839e-06,
      "loss": 2.4881,
      "step": 99
    },
    {
      "epoch": 0.26702269692923897,
      "grad_norm": 0.271484375,
      "learning_rate": 8.77611986127773e-06,
      "loss": 2.5327,
      "step": 100
    },
    {
      "epoch": 0.2696929238985314,
      "grad_norm": 0.267578125,
      "learning_rate": 8.747051719180626e-06,
      "loss": 2.526,
      "step": 101
    },
    {
      "epoch": 0.27236315086782376,
      "grad_norm": 0.26953125,
      "learning_rate": 8.717691776189214e-06,
      "loss": 2.5366,
      "step": 102
    },
    {
      "epoch": 0.2750333778371162,
      "grad_norm": 0.259765625,
      "learning_rate": 8.688042318703111e-06,
      "loss": 2.4877,
      "step": 103
    },
    {
      "epoch": 0.27770360480640854,
      "grad_norm": 0.263671875,
      "learning_rate": 8.65810565566782e-06,
      "loss": 2.4742,
      "step": 104
    },
    {
      "epoch": 0.2803738317757009,
      "grad_norm": 0.263671875,
      "learning_rate": 8.627884118394913e-06,
      "loss": 2.4916,
      "step": 105
    },
    {
      "epoch": 0.28304405874499333,
      "grad_norm": 0.263671875,
      "learning_rate": 8.597380060380493e-06,
      "loss": 2.536,
      "step": 106
    },
    {
      "epoch": 0.2857142857142857,
      "grad_norm": 0.271484375,
      "learning_rate": 8.566595857121902e-06,
      "loss": 2.4748,
      "step": 107
    },
    {
      "epoch": 0.2883845126835781,
      "grad_norm": 0.275390625,
      "learning_rate": 8.535533905932739e-06,
      "loss": 2.4833,
      "step": 108
    },
    {
      "epoch": 0.2910547396528705,
      "grad_norm": 0.265625,
      "learning_rate": 8.504196625756166e-06,
      "loss": 2.5434,
      "step": 109
    },
    {
      "epoch": 0.2937249666221629,
      "grad_norm": 0.26953125,
      "learning_rate": 8.472586456976534e-06,
      "loss": 2.4924,
      "step": 110
    },
    {
      "epoch": 0.2963951935914553,
      "grad_norm": 0.267578125,
      "learning_rate": 8.440705861229344e-06,
      "loss": 2.5269,
      "step": 111
    },
    {
      "epoch": 0.29906542056074764,
      "grad_norm": 0.26171875,
      "learning_rate": 8.408557321209534e-06,
      "loss": 2.4687,
      "step": 112
    },
    {
      "epoch": 0.30173564753004006,
      "grad_norm": 0.263671875,
      "learning_rate": 8.376143340478153e-06,
      "loss": 2.4686,
      "step": 113
    },
    {
      "epoch": 0.30440587449933243,
      "grad_norm": 0.27734375,
      "learning_rate": 8.34346644326739e-06,
      "loss": 2.4978,
      "step": 114
    },
    {
      "epoch": 0.30707610146862485,
      "grad_norm": 0.263671875,
      "learning_rate": 8.310529174284004e-06,
      "loss": 2.5137,
      "step": 115
    },
    {
      "epoch": 0.3097463284379172,
      "grad_norm": 0.27734375,
      "learning_rate": 8.277334098511147e-06,
      "loss": 2.5235,
      "step": 116
    },
    {
      "epoch": 0.31241655540720964,
      "grad_norm": 0.275390625,
      "learning_rate": 8.243883801008632e-06,
      "loss": 2.4973,
      "step": 117
    },
    {
      "epoch": 0.315086782376502,
      "grad_norm": 0.263671875,
      "learning_rate": 8.210180886711603e-06,
      "loss": 2.5124,
      "step": 118
    },
    {
      "epoch": 0.3177570093457944,
      "grad_norm": 0.271484375,
      "learning_rate": 8.176227980227693e-06,
      "loss": 2.5167,
      "step": 119
    },
    {
      "epoch": 0.3204272363150868,
      "grad_norm": 0.267578125,
      "learning_rate": 8.142027725632622e-06,
      "loss": 2.454,
      "step": 120
    },
    {
      "epoch": 0.32309746328437916,
      "grad_norm": 0.267578125,
      "learning_rate": 8.107582786264299e-06,
      "loss": 2.5076,
      "step": 121
    },
    {
      "epoch": 0.3257676902536716,
      "grad_norm": 0.265625,
      "learning_rate": 8.072895844515398e-06,
      "loss": 2.5122,
      "step": 122
    },
    {
      "epoch": 0.32843791722296395,
      "grad_norm": 0.275390625,
      "learning_rate": 8.037969601624495e-06,
      "loss": 2.5294,
      "step": 123
    },
    {
      "epoch": 0.3311081441922563,
      "grad_norm": 0.271484375,
      "learning_rate": 8.002806777465685e-06,
      "loss": 2.537,
      "step": 124
    },
    {
      "epoch": 0.33377837116154874,
      "grad_norm": 0.263671875,
      "learning_rate": 7.967410110336782e-06,
      "loss": 2.4899,
      "step": 125
    },
    {
      "epoch": 0.3364485981308411,
      "grad_norm": 0.267578125,
      "learning_rate": 7.931782356746076e-06,
      "loss": 2.5312,
      "step": 126
    },
    {
      "epoch": 0.3391188251001335,
      "grad_norm": 0.267578125,
      "learning_rate": 7.895926291197667e-06,
      "loss": 2.4986,
      "step": 127
    },
    {
      "epoch": 0.3417890520694259,
      "grad_norm": 0.267578125,
      "learning_rate": 7.859844705975405e-06,
      "loss": 2.5286,
      "step": 128
    },
    {
      "epoch": 0.3444592790387183,
      "grad_norm": 0.26953125,
      "learning_rate": 7.823540410925434e-06,
      "loss": 2.4856,
      "step": 129
    },
    {
      "epoch": 0.3471295060080107,
      "grad_norm": 0.263671875,
      "learning_rate": 7.787016233237387e-06,
      "loss": 2.5236,
      "step": 130
    },
    {
      "epoch": 0.34979973297730305,
      "grad_norm": 0.2734375,
      "learning_rate": 7.750275017224208e-06,
      "loss": 2.5115,
      "step": 131
    },
    {
      "epoch": 0.35246995994659547,
      "grad_norm": 0.267578125,
      "learning_rate": 7.713319624100657e-06,
      "loss": 2.5059,
      "step": 132
    },
    {
      "epoch": 0.35514018691588783,
      "grad_norm": 0.26953125,
      "learning_rate": 7.676152931760496e-06,
      "loss": 2.5122,
      "step": 133
    },
    {
      "epoch": 0.35781041388518026,
      "grad_norm": 0.267578125,
      "learning_rate": 7.638777834552372e-06,
      "loss": 2.5179,
      "step": 134
    },
    {
      "epoch": 0.3604806408544726,
      "grad_norm": 0.271484375,
      "learning_rate": 7.601197243054411e-06,
      "loss": 2.4895,
      "step": 135
    },
    {
      "epoch": 0.36315086782376504,
      "grad_norm": 0.26171875,
      "learning_rate": 7.563414083847573e-06,
      "loss": 2.4843,
      "step": 136
    },
    {
      "epoch": 0.3658210947930574,
      "grad_norm": 0.271484375,
      "learning_rate": 7.525431299287737e-06,
      "loss": 2.5075,
      "step": 137
    },
    {
      "epoch": 0.3684913217623498,
      "grad_norm": 0.265625,
      "learning_rate": 7.4872518472765594e-06,
      "loss": 2.5155,
      "step": 138
    },
    {
      "epoch": 0.3711615487316422,
      "grad_norm": 0.263671875,
      "learning_rate": 7.4488787010311425e-06,
      "loss": 2.4949,
      "step": 139
    },
    {
      "epoch": 0.37383177570093457,
      "grad_norm": 0.263671875,
      "learning_rate": 7.4103148488524824e-06,
      "loss": 2.5134,
      "step": 140
    },
    {
      "epoch": 0.376502002670227,
      "grad_norm": 0.267578125,
      "learning_rate": 7.371563293892761e-06,
      "loss": 2.516,
      "step": 141
    },
    {
      "epoch": 0.37917222963951935,
      "grad_norm": 0.26953125,
      "learning_rate": 7.3326270539214826e-06,
      "loss": 2.4839,
      "step": 142
    },
    {
      "epoch": 0.3818424566088118,
      "grad_norm": 0.265625,
      "learning_rate": 7.293509161090453e-06,
      "loss": 2.4952,
      "step": 143
    },
    {
      "epoch": 0.38451268357810414,
      "grad_norm": 0.263671875,
      "learning_rate": 7.2542126616976596e-06,
      "loss": 2.4828,
      "step": 144
    },
    {
      "epoch": 0.3871829105473965,
      "grad_norm": 0.271484375,
      "learning_rate": 7.214740615950041e-06,
      "loss": 2.5365,
      "step": 145
    },
    {
      "epoch": 0.38985313751668893,
      "grad_norm": 0.267578125,
      "learning_rate": 7.175096097725169e-06,
      "loss": 2.5147,
      "step": 146
    },
    {
      "epoch": 0.3925233644859813,
      "grad_norm": 0.26171875,
      "learning_rate": 7.135282194331881e-06,
      "loss": 2.4745,
      "step": 147
    },
    {
      "epoch": 0.3951935914552737,
      "grad_norm": 0.26171875,
      "learning_rate": 7.095302006269842e-06,
      "loss": 2.4743,
      "step": 148
    },
    {
      "epoch": 0.3978638184245661,
      "grad_norm": 0.265625,
      "learning_rate": 7.05515864698811e-06,
      "loss": 2.4727,
      "step": 149
    },
    {
      "epoch": 0.40053404539385845,
      "grad_norm": 0.267578125,
      "learning_rate": 7.014855242642662e-06,
      "loss": 2.4808,
      "step": 150
    },
    {
      "epoch": 0.4032042723631509,
      "grad_norm": 0.265625,
      "learning_rate": 6.974394931852957e-06,
      "loss": 2.4861,
      "step": 151
    },
    {
      "epoch": 0.40587449933244324,
      "grad_norm": 0.26953125,
      "learning_rate": 6.933780865457508e-06,
      "loss": 2.5099,
      "step": 152
    },
    {
      "epoch": 0.40854472630173566,
      "grad_norm": 0.26171875,
      "learning_rate": 6.893016206268518e-06,
      "loss": 2.5029,
      "step": 153
    },
    {
      "epoch": 0.411214953271028,
      "grad_norm": 0.267578125,
      "learning_rate": 6.85210412882557e-06,
      "loss": 2.496,
      "step": 154
    },
    {
      "epoch": 0.41388518024032045,
      "grad_norm": 0.265625,
      "learning_rate": 6.811047819148413e-06,
      "loss": 2.4808,
      "step": 155
    },
    {
      "epoch": 0.4165554072096128,
      "grad_norm": 0.267578125,
      "learning_rate": 6.769850474488859e-06,
      "loss": 2.5062,
      "step": 156
    },
    {
      "epoch": 0.4192256341789052,
      "grad_norm": 0.263671875,
      "learning_rate": 6.728515303081782e-06,
      "loss": 2.4872,
      "step": 157
    },
    {
      "epoch": 0.4218958611481976,
      "grad_norm": 0.263671875,
      "learning_rate": 6.687045523895292e-06,
      "loss": 2.5125,
      "step": 158
    },
    {
      "epoch": 0.42456608811748997,
      "grad_norm": 0.2578125,
      "learning_rate": 6.64544436638005e-06,
      "loss": 2.4739,
      "step": 159
    },
    {
      "epoch": 0.4272363150867824,
      "grad_norm": 0.26171875,
      "learning_rate": 6.603715070217779e-06,
      "loss": 2.4561,
      "step": 160
    },
    {
      "epoch": 0.42990654205607476,
      "grad_norm": 0.267578125,
      "learning_rate": 6.561860885068972e-06,
      "loss": 2.5166,
      "step": 161
    },
    {
      "epoch": 0.4325767690253672,
      "grad_norm": 0.26171875,
      "learning_rate": 6.519885070319827e-06,
      "loss": 2.4901,
      "step": 162
    },
    {
      "epoch": 0.43524699599465955,
      "grad_norm": 0.2734375,
      "learning_rate": 6.477790894828422e-06,
      "loss": 2.4906,
      "step": 163
    },
    {
      "epoch": 0.4379172229639519,
      "grad_norm": 0.265625,
      "learning_rate": 6.435581636670154e-06,
      "loss": 2.4935,
      "step": 164
    },
    {
      "epoch": 0.44058744993324434,
      "grad_norm": 0.265625,
      "learning_rate": 6.393260582882462e-06,
      "loss": 2.4686,
      "step": 165
    },
    {
      "epoch": 0.4432576769025367,
      "grad_norm": 0.26171875,
      "learning_rate": 6.350831029208844e-06,
      "loss": 2.4516,
      "step": 166
    },
    {
      "epoch": 0.4459279038718291,
      "grad_norm": 0.26171875,
      "learning_rate": 6.308296279842204e-06,
      "loss": 2.479,
      "step": 167
    },
    {
      "epoch": 0.4485981308411215,
      "grad_norm": 0.26171875,
      "learning_rate": 6.265659647167542e-06,
      "loss": 2.5068,
      "step": 168
    },
    {
      "epoch": 0.4512683578104139,
      "grad_norm": 0.263671875,
      "learning_rate": 6.222924451504001e-06,
      "loss": 2.4598,
      "step": 169
    },
    {
      "epoch": 0.4539385847797063,
      "grad_norm": 0.271484375,
      "learning_rate": 6.180094020846291e-06,
      "loss": 2.5116,
      "step": 170
    },
    {
      "epoch": 0.45660881174899864,
      "grad_norm": 0.26953125,
      "learning_rate": 6.1371716906055336e-06,
      "loss": 2.4649,
      "step": 171
    },
    {
      "epoch": 0.45927903871829107,
      "grad_norm": 0.26171875,
      "learning_rate": 6.094160803349508e-06,
      "loss": 2.4785,
      "step": 172
    },
    {
      "epoch": 0.46194926568758343,
      "grad_norm": 0.26171875,
      "learning_rate": 6.051064708542357e-06,
      "loss": 2.5259,
      "step": 173
    },
    {
      "epoch": 0.46461949265687585,
      "grad_norm": 0.271484375,
      "learning_rate": 6.00788676228374e-06,
      "loss": 2.5047,
      "step": 174
    },
    {
      "epoch": 0.4672897196261682,
      "grad_norm": 0.267578125,
      "learning_rate": 5.964630327047485e-06,
      "loss": 2.5377,
      "step": 175
    },
    {
      "epoch": 0.4699599465954606,
      "grad_norm": 0.267578125,
      "learning_rate": 5.921298771419731e-06,
      "loss": 2.5288,
      "step": 176
    },
    {
      "epoch": 0.472630173564753,
      "grad_norm": 0.267578125,
      "learning_rate": 5.877895469836604e-06,
      "loss": 2.5045,
      "step": 177
    },
    {
      "epoch": 0.4753004005340454,
      "grad_norm": 0.265625,
      "learning_rate": 5.8344238023214305e-06,
      "loss": 2.5275,
      "step": 178
    },
    {
      "epoch": 0.4779706275033378,
      "grad_norm": 0.267578125,
      "learning_rate": 5.790887154221521e-06,
      "loss": 2.5307,
      "step": 179
    },
    {
      "epoch": 0.48064085447263016,
      "grad_norm": 0.279296875,
      "learning_rate": 5.747288915944533e-06,
      "loss": 2.4982,
      "step": 180
    },
    {
      "epoch": 0.4833110814419226,
      "grad_norm": 0.263671875,
      "learning_rate": 5.703632482694453e-06,
      "loss": 2.472,
      "step": 181
    },
    {
      "epoch": 0.48598130841121495,
      "grad_norm": 0.271484375,
      "learning_rate": 5.659921254207183e-06,
      "loss": 2.5524,
      "step": 182
    },
    {
      "epoch": 0.4886515353805073,
      "grad_norm": 0.267578125,
      "learning_rate": 5.616158634485793e-06,
      "loss": 2.4878,
      "step": 183
    },
    {
      "epoch": 0.49132176234979974,
      "grad_norm": 0.2578125,
      "learning_rate": 5.572348031535442e-06,
      "loss": 2.5004,
      "step": 184
    },
    {
      "epoch": 0.4939919893190921,
      "grad_norm": 0.255859375,
      "learning_rate": 5.528492857097966e-06,
      "loss": 2.4946,
      "step": 185
    },
    {
      "epoch": 0.49666221628838453,
      "grad_norm": 0.263671875,
      "learning_rate": 5.484596526386198e-06,
      "loss": 2.4705,
      "step": 186
    },
    {
      "epoch": 0.4993324432576769,
      "grad_norm": 0.263671875,
      "learning_rate": 5.44066245781801e-06,
      "loss": 2.5064,
      "step": 187
    },
    {
      "epoch": 0.5020026702269693,
      "grad_norm": 0.267578125,
      "learning_rate": 5.396694072750099e-06,
      "loss": 2.4749,
      "step": 188
    },
    {
      "epoch": 0.5046728971962616,
      "grad_norm": 0.263671875,
      "learning_rate": 5.352694795211555e-06,
      "loss": 2.5226,
      "step": 189
    },
    {
      "epoch": 0.507343124165554,
      "grad_norm": 0.265625,
      "learning_rate": 5.308668051637213e-06,
      "loss": 2.5068,
      "step": 190
    },
    {
      "epoch": 0.5100133511348465,
      "grad_norm": 0.2578125,
      "learning_rate": 5.2646172706008154e-06,
      "loss": 2.4547,
      "step": 191
    },
    {
      "epoch": 0.5126835781041389,
      "grad_norm": 0.265625,
      "learning_rate": 5.220545882548024e-06,
      "loss": 2.4742,
      "step": 192
    },
    {
      "epoch": 0.5153538050734312,
      "grad_norm": 0.26171875,
      "learning_rate": 5.176457319529264e-06,
      "loss": 2.5493,
      "step": 193
    },
    {
      "epoch": 0.5180240320427236,
      "grad_norm": 0.263671875,
      "learning_rate": 5.132355014932455e-06,
      "loss": 2.5024,
      "step": 194
    },
    {
      "epoch": 0.520694259012016,
      "grad_norm": 0.263671875,
      "learning_rate": 5.088242403215644e-06,
      "loss": 2.5104,
      "step": 195
    },
    {
      "epoch": 0.5233644859813084,
      "grad_norm": 0.265625,
      "learning_rate": 5.0441229196395416e-06,
      "loss": 2.4975,
      "step": 196
    },
    {
      "epoch": 0.5260347129506008,
      "grad_norm": 0.265625,
      "learning_rate": 5e-06,
      "loss": 2.4902,
      "step": 197
    },
    {
      "epoch": 0.5287049399198932,
      "grad_norm": 0.2734375,
      "learning_rate": 4.955877080360462e-06,
      "loss": 2.5088,
      "step": 198
    },
    {
      "epoch": 0.5313751668891856,
      "grad_norm": 0.267578125,
      "learning_rate": 4.911757596784358e-06,
      "loss": 2.5015,
      "step": 199
    },
    {
      "epoch": 0.5340453938584779,
      "grad_norm": 0.26953125,
      "learning_rate": 4.867644985067548e-06,
| "loss": 2.5029, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.5367156208277704, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 4.823542680470738e-06, | |
| "loss": 2.5171, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.5393858477970628, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 4.779454117451978e-06, | |
| "loss": 2.5242, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.5420560747663551, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 4.7353827293991845e-06, | |
| "loss": 2.5121, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.5447263017356475, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 4.691331948362789e-06, | |
| "loss": 2.4662, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.5473965287049399, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 4.647305204788445e-06, | |
| "loss": 2.5188, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.5500667556742324, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 4.603305927249902e-06, | |
| "loss": 2.4847, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.5527369826435247, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 4.559337542181993e-06, | |
| "loss": 2.4865, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.5554072096128171, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 4.5154034736138035e-06, | |
| "loss": 2.4858, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.5580774365821095, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 4.471507142902036e-06, | |
| "loss": 2.4824, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.5607476635514018, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 4.427651968464559e-06, | |
| "loss": 2.515, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.5634178905206942, | |
| "grad_norm": 0.271484375, | |
| "learning_rate": 4.383841365514208e-06, | |
| "loss": 2.5034, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.5660881174899867, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 4.340078745792818e-06, | |
| "loss": 2.4928, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.5687583444592791, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 4.296367517305548e-06, | |
| "loss": 2.51, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 4.252711084055468e-06, | |
| "loss": 2.4997, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.5740987983978638, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 4.209112845778481e-06, | |
| "loss": 2.4813, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.5767690253671562, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 4.165576197678571e-06, | |
| "loss": 2.481, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.5794392523364486, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 4.122104530163397e-06, | |
| "loss": 2.4856, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.582109479305741, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 4.0787012285802695e-06, | |
| "loss": 2.5178, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.5847797062750334, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 4.035369672952516e-06, | |
| "loss": 2.5174, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.5874499332443258, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 3.992113237716261e-06, | |
| "loss": 2.495, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.5901201602136181, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 3.948935291457645e-06, | |
| "loss": 2.5052, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.5927903871829105, | |
| "grad_norm": 0.26953125, | |
| "learning_rate": 3.905839196650494e-06, | |
| "loss": 2.5198, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.595460614152203, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 3.862828309394469e-06, | |
| "loss": 2.4906, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.5981308411214953, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 3.8199059791537105e-06, | |
| "loss": 2.5027, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.6008010680907877, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 3.777075548496001e-06, | |
| "loss": 2.4962, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.6034712950600801, | |
| "grad_norm": 0.271484375, | |
| "learning_rate": 3.7343403528324574e-06, | |
| "loss": 2.5218, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.6061415220293725, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 3.6917037201577977e-06, | |
| "loss": 2.5101, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.6088117489986649, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 3.649168970791157e-06, | |
| "loss": 2.5235, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.6114819759679573, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 3.6067394171175397e-06, | |
| "loss": 2.4863, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.6141522029372497, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 3.564418363329848e-06, | |
| "loss": 2.5164, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.616822429906542, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 3.5222091051715803e-06, | |
| "loss": 2.5003, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.6194926568758344, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 3.480114929680176e-06, | |
| "loss": 2.5213, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.6221628838451269, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 3.4381391149310294e-06, | |
| "loss": 2.5545, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.6248331108144193, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 3.3962849297822225e-06, | |
| "loss": 2.4738, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.6275033377837116, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 3.35455563361995e-06, | |
| "loss": 2.4812, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.630173564753004, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 3.3129544761047093e-06, | |
| "loss": 2.4738, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.6328437917222964, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 3.271484696918218e-06, | |
| "loss": 2.5361, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.6355140186915887, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 3.2301495255111426e-06, | |
| "loss": 2.4995, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.6381842456608812, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 3.1889521808515888e-06, | |
| "loss": 2.5055, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.6408544726301736, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 3.1478958711744324e-06, | |
| "loss": 2.4533, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.6435246995994659, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 3.1069837937314846e-06, | |
| "loss": 2.4603, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.6461949265687583, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 3.0662191345424925e-06, | |
| "loss": 2.5259, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.6488651535380507, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 3.0256050681470446e-06, | |
| "loss": 2.5498, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.6515353805073432, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 2.9851447573573383e-06, | |
| "loss": 2.4606, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.6542056074766355, | |
| "grad_norm": 0.26953125, | |
| "learning_rate": 2.9448413530118912e-06, | |
| "loss": 2.5477, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.6568758344459279, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 2.904697993730159e-06, | |
| "loss": 2.5476, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.6595460614152203, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 2.8647178056681197e-06, | |
| "loss": 2.5033, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.6622162883845126, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 2.8249039022748315e-06, | |
| "loss": 2.4971, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.664886515353805, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 2.785259384049959e-06, | |
| "loss": 2.4596, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.6675567423230975, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 2.745787338302341e-06, | |
| "loss": 2.4948, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.6702269692923899, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 2.706490838909547e-06, | |
| "loss": 2.4986, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.6728971962616822, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 2.6673729460785174e-06, | |
| "loss": 2.4885, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.6755674232309746, | |
| "grad_norm": 0.26953125, | |
| "learning_rate": 2.628436706107238e-06, | |
| "loss": 2.5042, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.678237650200267, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 2.5896851511475184e-06, | |
| "loss": 2.5229, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.6809078771695594, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 2.5511212989688587e-06, | |
| "loss": 2.4748, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.6835781041388518, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 2.5127481527234397e-06, | |
| "loss": 2.4837, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.6862483311081442, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 2.4745687007122636e-06, | |
| "loss": 2.5272, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.6889185580774366, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 2.436585916152426e-06, | |
| "loss": 2.4953, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.6915887850467289, | |
| "grad_norm": 0.255859375, | |
| "learning_rate": 2.3988027569455895e-06, | |
| "loss": 2.4866, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.6942590120160214, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 2.361222165447628e-06, | |
| "loss": 2.5, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.6969292389853138, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 2.323847068239504e-06, | |
| "loss": 2.5211, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.6995994659546061, | |
| "grad_norm": 0.271484375, | |
| "learning_rate": 2.2866803758993446e-06, | |
| "loss": 2.5103, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.7022696929238985, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 2.2497249827757933e-06, | |
| "loss": 2.5008, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.7049399198931909, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 2.2129837667626147e-06, | |
| "loss": 2.4844, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.7076101468624834, | |
| "grad_norm": 0.26953125, | |
| "learning_rate": 2.176459589074566e-06, | |
| "loss": 2.5163, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.7102803738317757, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 2.1401552940245962e-06, | |
| "loss": 2.5074, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.7129506008010681, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 2.1040737088023323e-06, | |
| "loss": 2.4936, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.7156208277703605, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 2.068217643253925e-06, | |
| "loss": 2.491, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.7182910547396528, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 2.0325898896632178e-06, | |
| "loss": 2.5246, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.7209612817089452, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 1.997193222534316e-06, | |
| "loss": 2.5156, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.7236315086782377, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 1.962030398375506e-06, | |
| "loss": 2.4926, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.7263017356475301, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 1.927104155484602e-06, | |
| "loss": 2.5143, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.7289719626168224, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 1.8924172137357038e-06, | |
| "loss": 2.4877, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.7316421895861148, | |
| "grad_norm": 0.2734375, | |
| "learning_rate": 1.8579722743673773e-06, | |
| "loss": 2.513, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.7343124165554072, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 1.8237720197723075e-06, | |
| "loss": 2.5352, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.7369826435246996, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 1.789819113288397e-06, | |
| "loss": 2.4835, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.739652870493992, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 1.75611619899137e-06, | |
| "loss": 2.4761, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.7423230974632844, | |
| "grad_norm": 0.341796875, | |
| "learning_rate": 1.7226659014888548e-06, | |
| "loss": 2.4956, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.7449933244325768, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 1.689470825715998e-06, | |
| "loss": 2.5053, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.7476635514018691, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 1.6565335567326112e-06, | |
| "loss": 2.4997, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.7503337783711616, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 1.6238566595218475e-06, | |
| "loss": 2.5431, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.753004005340454, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 1.591442678790467e-06, | |
| "loss": 2.4902, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.7556742323097463, | |
| "grad_norm": 0.2734375, | |
| "learning_rate": 1.5592941387706562e-06, | |
| "loss": 2.5166, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 0.7583444592790387, | |
| "grad_norm": 0.271484375, | |
| "learning_rate": 1.5274135430234654e-06, | |
| "loss": 2.501, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.7610146862483311, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 1.4958033742438348e-06, | |
| "loss": 2.5154, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.7636849132176236, | |
| "grad_norm": 0.294921875, | |
| "learning_rate": 1.4644660940672628e-06, | |
| "loss": 2.5048, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.7663551401869159, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 1.4334041428781003e-06, | |
| "loss": 2.4991, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.7690253671562083, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 1.4026199396195078e-06, | |
| "loss": 2.488, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.7716955941255007, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 1.3721158816050872e-06, | |
| "loss": 2.4893, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.774365821094793, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 1.3418943443321807e-06, | |
| "loss": 2.5037, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.7770360480640854, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 1.3119576812968893e-06, | |
| "loss": 2.4731, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.7797062750333779, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 1.282308223810786e-06, | |
| "loss": 2.503, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.7823765020026703, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 1.252948280819375e-06, | |
| "loss": 2.5442, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 0.7850467289719626, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 1.2238801387222716e-06, | |
| "loss": 2.4877, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.787716955941255, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 1.1951060611951615e-06, | |
| "loss": 2.5067, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.7903871829105474, | |
| "grad_norm": 0.26953125, | |
| "learning_rate": 1.1666282890135083e-06, | |
| "loss": 2.5179, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.7930574098798397, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 1.1384490398780563e-06, | |
| "loss": 2.4758, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.7957276368491322, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 1.1105705082421303e-06, | |
| "loss": 2.4833, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.7983978638184246, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 1.0829948651407374e-06, | |
| "loss": 2.4751, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.8010680907877169, | |
| "grad_norm": 0.255859375, | |
| "learning_rate": 1.0557242580215066e-06, | |
| "loss": 2.4916, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.8037383177570093, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 1.0287608105774456e-06, | |
| "loss": 2.5196, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 0.8064085447263017, | |
| "grad_norm": 0.255859375, | |
| "learning_rate": 1.002106622581569e-06, | |
| "loss": 2.5008, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 0.8090787716955942, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 9.757637697233723e-07, | |
| "loss": 2.4333, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.8117489986648865, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 9.497343034471896e-07, | |
| "loss": 2.5224, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.8144192256341789, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 9.240202507924412e-07, | |
| "loss": 2.5119, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.8170894526034713, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 8.986236142357707e-07, | |
| "loss": 2.5334, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.8197596795727636, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 8.735463715351139e-07, | |
| "loss": 2.4918, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 0.822429906542056, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 8.487904755756676e-07, | |
| "loss": 2.4838, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.8251001335113485, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 8.243578542178227e-07, | |
| "loss": 2.5301, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.8277703604806409, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 8.002504101470204e-07, | |
| "loss": 2.5125, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.8304405874499332, | |
| "grad_norm": 0.25390625, | |
| "learning_rate": 7.764700207255904e-07, | |
| "loss": 2.5022, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 0.8331108144192256, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 7.530185378465459e-07, | |
| "loss": 2.527, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 0.835781041388518, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 7.298977877893688e-07, | |
| "loss": 2.5099, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 0.8384512683578104, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 7.071095710777925e-07, | |
| "loss": 2.4672, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 0.8411214953271028, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 6.846556623395795e-07, | |
| "loss": 2.4596, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.8437917222963952, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 6.625378101683317e-07, | |
| "loss": 2.5339, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 0.8464619492656876, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 6.40757736987307e-07, | |
| "loss": 2.4655, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 0.8491321762349799, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 6.193171389152996e-07, | |
| "loss": 2.5052, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 0.8518024032042724, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 5.982176856345445e-07, | |
| "loss": 2.5025, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 0.8544726301735648, | |
| "grad_norm": 0.26953125, | |
| "learning_rate": 5.774610202606939e-07, | |
| "loss": 2.4683, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.8571428571428571, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 5.570487592148666e-07, | |
| "loss": 2.4735, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 0.8598130841121495, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 5.369824920977567e-07, | |
| "loss": 2.5021, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 0.8624833110814419, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 5.172637815658583e-07, | |
| "loss": 2.5157, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 0.8651535380507344, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 4.978941632097612e-07, | |
| "loss": 2.4827, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 0.8678237650200267, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 4.788751454345763e-07, | |
| "loss": 2.4453, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.8704939919893191, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 4.60208209342462e-07, | |
| "loss": 2.4767, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 0.8731642189586115, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 4.4189480861729137e-07, | |
| "loss": 2.5088, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 0.8758344459279038, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 4.239363694114368e-07, | |
| "loss": 2.4893, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 0.8785046728971962, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 4.0633429023472004e-07, | |
| "loss": 2.5054, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 0.8811748998664887, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 3.890899418454913e-07, | |
| "loss": 2.4951, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.8838451268357811, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 3.72204667143895e-07, | |
| "loss": 2.4751, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 0.8865153538050734, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 3.556797810672785e-07, | |
| "loss": 2.488, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 0.8891855807743658, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 3.395165704878023e-07, | |
| "loss": 2.5096, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 0.8918558077436582, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 3.237162941122185e-07, | |
| "loss": 2.4998, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 0.8945260347129506, | |
| "grad_norm": 0.255859375, | |
| "learning_rate": 3.082801823838527e-07, | |
| "loss": 2.4736, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.897196261682243, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 2.932094373867811e-07, | |
| "loss": 2.5098, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 0.8998664886515354, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 2.785052327522214e-07, | |
| "loss": 2.4961, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 0.9025367156208278, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 2.6416871356713224e-07, | |
| "loss": 2.5133, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 0.9052069425901201, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 2.5020099628504603e-07, | |
| "loss": 2.4595, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 0.9078771695594126, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 2.3660316863911682e-07, | |
| "loss": 2.5258, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.910547396528705, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 2.2337628955742263e-07, | |
| "loss": 2.4918, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 0.9132176234979973, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 2.1052138908049303e-07, | |
| "loss": 2.5225, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 0.9158878504672897, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 1.9803946828110376e-07, | |
| "loss": 2.4688, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 0.9185580774365821, | |
| "grad_norm": 0.255859375, | |
| "learning_rate": 1.8593149918630927e-07, | |
| "loss": 2.4878, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 0.9212283044058746, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 1.7419842470175196e-07, | |
| "loss": 2.5185, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.9238985313751669, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 1.6284115853823445e-07, | |
| "loss": 2.5176, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 0.9265687583444593, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 1.5186058514055912e-07, | |
| "loss": 2.4453, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 0.9292389853137517, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 1.4125755961865827e-07, | |
| "loss": 2.5187, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 0.931909212283044, | |
| "grad_norm": 0.271484375, | |
| "learning_rate": 1.3103290768099796e-07, | |
| "loss": 2.5128, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 0.9345794392523364, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 1.2118742557027885e-07, | |
| "loss": 2.5018, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.9372496662216289, | |
| "grad_norm": 0.26953125, | |
| "learning_rate": 1.1172188000142803e-07, | |
| "loss": 2.5313, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 0.9399198931909212, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 1.026370081018907e-07, | |
| "loss": 2.5207, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 0.9425901201602136, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 9.393351735422773e-08, | |
| "loss": 2.5082, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 0.945260347129506, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 8.561208554101863e-08, | |
| "loss": 2.4809, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 0.9479305740987984, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 7.76733606920832e-08, | |
| "loss": 2.5182, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.9506008010680908, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 7.011796103401192e-08, | |
| "loss": 2.4804, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 0.9532710280373832, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 6.294647494202444e-08, | |
| "loss": 2.4949, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 0.9559412550066756, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 5.615946089414737e-08, | |
| "loss": 2.5131, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 0.9586114819759679, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 4.975744742772848e-08, | |
| "loss": 2.4944, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 0.9612817089452603, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 4.37409330982691e-08, | |
| "loss": 2.5229, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.9639519359145527, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 3.8110386440605164e-08, | |
| "loss": 2.5288, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 0.9666221628838452, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 3.2866245932418606e-08, | |
| "loss": 2.4581, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 0.9692923898531375, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 2.8008919960090253e-08, | |
| "loss": 2.5052, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 0.9719626168224299, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 2.3538786786896918e-08, | |
| "loss": 2.5043, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 0.9746328437917223, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 1.9456194523554404e-08, | |
| "loss": 2.4863, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.9773030707610146, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 1.576146110111032e-08, | |
| "loss": 2.5012, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 0.9799732977303071, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 1.2454874246181081e-08, | |
| "loss": 2.516, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 0.9826435246995995, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 9.536691458548741e-09, | |
| "loss": 2.5053, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 0.9853137516688919, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 7.007139991108136e-09, | |
| "loss": 2.4899, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 0.9879839786381842, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 4.866416832167153e-09, | |
| "loss": 2.4932, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.9906542056074766, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 3.1146886901090024e-09, | |
| "loss": 2.5244, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 0.9933244325767691, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 1.7520919804075997e-09, | |
| "loss": 2.494, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.9959946595460614, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 7.787328150071771e-10, | |
| "loss": 2.4716, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 0.9986648865153538, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 1.9468699405444936e-10, | |
| "loss": 2.4657, | |
| "step": 374 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 374, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4.698021672895119e+18, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |