| { | |
| "best_global_step": 175000, | |
| "best_metric": 0.0006260189693421125, | |
| "best_model_checkpoint": "/data/bozos/models/f8d245da3b0d0e66db4c97688fe67d8c31303d4f662c4b64e5da18eb8964c893/checkpoints/checkpoint-175000", | |
| "epoch": 4.08, | |
| "eval_steps": 5000, | |
| "global_step": 255000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.016, | |
| "grad_norm": 2.213568925857544, | |
| "learning_rate": 0.00022758072642650628, | |
| "loss": 1.732, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": 0.6694766879081726, | |
| "learning_rate": 0.00022757708507668012, | |
| "loss": 0.4826, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "grad_norm": 2.43005633354187, | |
| "learning_rate": 0.00022757344372685396, | |
| "loss": 0.1817, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": 0.4847993850708008, | |
| "learning_rate": 0.0002275698023770278, | |
| "loss": 0.1264, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.162339448928833, | |
| "learning_rate": 0.00022756616102720165, | |
| "loss": 0.0816, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "eval_loss": 0.05317778140306473, | |
| "eval_runtime": 27.3774, | |
| "eval_samples_per_second": 36.526, | |
| "eval_steps_per_second": 4.566, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "grad_norm": 0.13133646547794342, | |
| "learning_rate": 0.0002275625196773755, | |
| "loss": 0.0531, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.112, | |
| "grad_norm": 0.16350027918815613, | |
| "learning_rate": 0.00022755887832754933, | |
| "loss": 0.0471, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "grad_norm": 0.15041467547416687, | |
| "learning_rate": 0.00022755523697772317, | |
| "loss": 0.0303, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.144, | |
| "grad_norm": 0.22431999444961548, | |
| "learning_rate": 0.000227551595627897, | |
| "loss": 0.0213, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.09702177345752716, | |
| "learning_rate": 0.00022754795427807082, | |
| "loss": 0.016, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_loss": 0.011971595697104931, | |
| "eval_runtime": 27.2488, | |
| "eval_samples_per_second": 36.699, | |
| "eval_steps_per_second": 4.587, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.176, | |
| "grad_norm": 0.22383971512317657, | |
| "learning_rate": 0.0002275443129282447, | |
| "loss": 0.0182, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.192, | |
| "grad_norm": 0.10483340919017792, | |
| "learning_rate": 0.0002275406715784185, | |
| "loss": 0.0099, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.208, | |
| "grad_norm": 0.06974471360445023, | |
| "learning_rate": 0.00022753703022859238, | |
| "loss": 0.0102, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.224, | |
| "grad_norm": 0.08179257810115814, | |
| "learning_rate": 0.0002275333888787662, | |
| "loss": 0.0095, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.06491447985172272, | |
| "learning_rate": 0.00022752974752894006, | |
| "loss": 0.0108, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "eval_loss": 0.007058731280267239, | |
| "eval_runtime": 27.4538, | |
| "eval_samples_per_second": 36.425, | |
| "eval_steps_per_second": 4.553, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "grad_norm": 0.07762598991394043, | |
| "learning_rate": 0.00022752610617911387, | |
| "loss": 0.0083, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.272, | |
| "grad_norm": 0.09719238430261612, | |
| "learning_rate": 0.00022752246482928774, | |
| "loss": 0.0088, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.288, | |
| "grad_norm": 0.0923658162355423, | |
| "learning_rate": 0.00022751882347946155, | |
| "loss": 0.0058, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.304, | |
| "grad_norm": 0.07014696300029755, | |
| "learning_rate": 0.00022751518212963542, | |
| "loss": 0.0075, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.060413043946027756, | |
| "learning_rate": 0.00022751154077980924, | |
| "loss": 0.006, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_loss": 0.004977255128324032, | |
| "eval_runtime": 27.1268, | |
| "eval_samples_per_second": 36.864, | |
| "eval_steps_per_second": 4.608, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.336, | |
| "grad_norm": 0.07979925721883774, | |
| "learning_rate": 0.0002275078994299831, | |
| "loss": 0.0074, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.352, | |
| "grad_norm": 0.07468965649604797, | |
| "learning_rate": 0.00022750425808015692, | |
| "loss": 0.0061, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.368, | |
| "grad_norm": 0.11379896104335785, | |
| "learning_rate": 0.00022750061673033079, | |
| "loss": 0.0066, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 0.384, | |
| "grad_norm": 0.14029712975025177, | |
| "learning_rate": 0.0002274969753805046, | |
| "loss": 0.0048, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.025649070739746094, | |
| "learning_rate": 0.00022749333403067844, | |
| "loss": 0.0062, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "eval_loss": 0.00377740734256804, | |
| "eval_runtime": 27.2272, | |
| "eval_samples_per_second": 36.728, | |
| "eval_steps_per_second": 4.591, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 0.416, | |
| "grad_norm": 0.07835651934146881, | |
| "learning_rate": 0.00022748969268085228, | |
| "loss": 0.0057, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 0.432, | |
| "grad_norm": 0.037621937692165375, | |
| "learning_rate": 0.00022748605133102612, | |
| "loss": 0.0043, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 0.448, | |
| "grad_norm": 0.05530184134840965, | |
| "learning_rate": 0.00022748240998119996, | |
| "loss": 0.0078, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 0.464, | |
| "grad_norm": 0.2537539601325989, | |
| "learning_rate": 0.0002274787686313738, | |
| "loss": 0.004, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.08855901658535004, | |
| "learning_rate": 0.00022747512728154765, | |
| "loss": 0.0055, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_loss": 0.00407881336286664, | |
| "eval_runtime": 27.3314, | |
| "eval_samples_per_second": 36.588, | |
| "eval_steps_per_second": 4.573, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.496, | |
| "grad_norm": 0.01860993541777134, | |
| "learning_rate": 0.0002274714859317215, | |
| "loss": 0.0044, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 0.512, | |
| "grad_norm": 0.030549678951501846, | |
| "learning_rate": 0.00022746784458189533, | |
| "loss": 0.0062, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 0.528, | |
| "grad_norm": 0.07974190264940262, | |
| "learning_rate": 0.00022746420323206917, | |
| "loss": 0.0044, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 0.544, | |
| "grad_norm": 0.07146530598402023, | |
| "learning_rate": 0.000227460561882243, | |
| "loss": 0.0033, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.03786474093794823, | |
| "learning_rate": 0.00022745692053241685, | |
| "loss": 0.0064, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "eval_loss": 0.002913910197094083, | |
| "eval_runtime": 27.2895, | |
| "eval_samples_per_second": 36.644, | |
| "eval_steps_per_second": 4.581, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 0.576, | |
| "grad_norm": 1.5708693265914917, | |
| "learning_rate": 0.0002274532791825907, | |
| "loss": 0.0048, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 0.592, | |
| "grad_norm": 0.04259568825364113, | |
| "learning_rate": 0.0002274496378327645, | |
| "loss": 0.0027, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 0.608, | |
| "grad_norm": 0.029481125995516777, | |
| "learning_rate": 0.00022744599648293838, | |
| "loss": 0.0049, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 0.624, | |
| "grad_norm": 0.3993789553642273, | |
| "learning_rate": 0.0002274423551331122, | |
| "loss": 0.0048, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.03810903802514076, | |
| "learning_rate": 0.00022743871378328606, | |
| "loss": 0.0027, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_loss": 0.00209135003387928, | |
| "eval_runtime": 27.0955, | |
| "eval_samples_per_second": 36.906, | |
| "eval_steps_per_second": 4.613, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 0.656, | |
| "grad_norm": 0.027073705568909645, | |
| "learning_rate": 0.00022743507243345987, | |
| "loss": 0.0033, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 0.672, | |
| "grad_norm": 0.04906334728002548, | |
| "learning_rate": 0.00022743143108363374, | |
| "loss": 0.0033, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 0.688, | |
| "grad_norm": 0.05806988850235939, | |
| "learning_rate": 0.00022742778973380755, | |
| "loss": 0.0039, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 0.704, | |
| "grad_norm": 0.022845715284347534, | |
| "learning_rate": 0.00022742414838398142, | |
| "loss": 0.0031, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.06443994492292404, | |
| "learning_rate": 0.00022742050703415524, | |
| "loss": 0.0027, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "eval_loss": 0.002537691965699196, | |
| "eval_runtime": 27.3857, | |
| "eval_samples_per_second": 36.515, | |
| "eval_steps_per_second": 4.564, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 0.736, | |
| "grad_norm": 0.1143941730260849, | |
| "learning_rate": 0.0002274168656843291, | |
| "loss": 0.0034, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 0.752, | |
| "grad_norm": 0.04524613544344902, | |
| "learning_rate": 0.00022741322433450292, | |
| "loss": 0.0034, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 0.768, | |
| "grad_norm": 0.027965977787971497, | |
| "learning_rate": 0.00022740958298467676, | |
| "loss": 0.0031, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 0.784, | |
| "grad_norm": 0.033201005309820175, | |
| "learning_rate": 0.0002274059416348506, | |
| "loss": 0.0032, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.11329031735658646, | |
| "learning_rate": 0.00022740230028502444, | |
| "loss": 0.0034, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_loss": 0.00293481582775712, | |
| "eval_runtime": 27.0954, | |
| "eval_samples_per_second": 36.907, | |
| "eval_steps_per_second": 4.613, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 0.816, | |
| "grad_norm": 0.08998037129640579, | |
| "learning_rate": 0.00022739865893519828, | |
| "loss": 0.003, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 0.832, | |
| "grad_norm": 0.034506551921367645, | |
| "learning_rate": 0.00022739501758537212, | |
| "loss": 0.0039, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 0.848, | |
| "grad_norm": 0.10205531865358353, | |
| "learning_rate": 0.00022739137623554596, | |
| "loss": 0.0031, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 0.864, | |
| "grad_norm": 0.016757190227508545, | |
| "learning_rate": 0.0002273877348857198, | |
| "loss": 0.0024, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.038212958723306656, | |
| "learning_rate": 0.00022738409353589365, | |
| "loss": 0.0035, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "eval_loss": 0.003101126756519079, | |
| "eval_runtime": 27.2079, | |
| "eval_samples_per_second": 36.754, | |
| "eval_steps_per_second": 4.594, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 0.896, | |
| "grad_norm": 0.20447635650634766, | |
| "learning_rate": 0.0002273804521860675, | |
| "loss": 0.0029, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 0.912, | |
| "grad_norm": 0.029786735773086548, | |
| "learning_rate": 0.00022737681083624133, | |
| "loss": 0.0028, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 0.928, | |
| "grad_norm": 0.1717972755432129, | |
| "learning_rate": 0.00022737316948641517, | |
| "loss": 0.0035, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 0.944, | |
| "grad_norm": 0.051670778542757034, | |
| "learning_rate": 0.000227369528136589, | |
| "loss": 0.0026, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.18315136432647705, | |
| "learning_rate": 0.00022736588678676285, | |
| "loss": 0.0021, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "eval_loss": 0.0022276523523032665, | |
| "eval_runtime": 27.2912, | |
| "eval_samples_per_second": 36.642, | |
| "eval_steps_per_second": 4.58, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 0.976, | |
| "grad_norm": 0.2137993574142456, | |
| "learning_rate": 0.0002273622454369367, | |
| "loss": 0.0031, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 0.992, | |
| "grad_norm": 0.02584846317768097, | |
| "learning_rate": 0.00022735860408711053, | |
| "loss": 0.0033, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 1.008, | |
| "grad_norm": 0.054690442979335785, | |
| "learning_rate": 0.00022735496273728438, | |
| "loss": 0.0024, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 1.024, | |
| "grad_norm": 0.01702144928276539, | |
| "learning_rate": 0.00022735132138745822, | |
| "loss": 0.0032, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 0.02373000793159008, | |
| "learning_rate": 0.00022734768003763206, | |
| "loss": 0.0032, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "eval_loss": 0.0020411296281963587, | |
| "eval_runtime": 27.2492, | |
| "eval_samples_per_second": 36.698, | |
| "eval_steps_per_second": 4.587, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 1.056, | |
| "grad_norm": 0.012987160123884678, | |
| "learning_rate": 0.0002273440386878059, | |
| "loss": 0.002, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 1.072, | |
| "grad_norm": 0.029065946117043495, | |
| "learning_rate": 0.0002273403973379797, | |
| "loss": 0.0031, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 1.088, | |
| "grad_norm": 0.17107020318508148, | |
| "learning_rate": 0.00022733675598815358, | |
| "loss": 0.0022, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 1.104, | |
| "grad_norm": 0.019081389531493187, | |
| "learning_rate": 0.0002273331146383274, | |
| "loss": 0.0031, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 0.008192900568246841, | |
| "learning_rate": 0.00022732947328850126, | |
| "loss": 0.0022, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "eval_loss": 0.0018265106482431293, | |
| "eval_runtime": 27.5029, | |
| "eval_samples_per_second": 36.36, | |
| "eval_steps_per_second": 4.545, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 1.1360000000000001, | |
| "grad_norm": 0.009845556691288948, | |
| "learning_rate": 0.00022732583193867508, | |
| "loss": 0.0033, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 1.152, | |
| "grad_norm": 0.1637999713420868, | |
| "learning_rate": 0.00022732219058884895, | |
| "loss": 0.0024, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 1.168, | |
| "grad_norm": 0.03215477615594864, | |
| "learning_rate": 0.00022731854923902276, | |
| "loss": 0.0026, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 1.184, | |
| "grad_norm": 0.028977178037166595, | |
| "learning_rate": 0.00022731490788919663, | |
| "loss": 0.0023, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 0.057766951620578766, | |
| "learning_rate": 0.00022731126653937044, | |
| "loss": 0.0034, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "eval_loss": 0.0021767348516732454, | |
| "eval_runtime": 27.526, | |
| "eval_samples_per_second": 36.329, | |
| "eval_steps_per_second": 4.541, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 1.216, | |
| "grad_norm": 0.00946386530995369, | |
| "learning_rate": 0.0002273076251895443, | |
| "loss": 0.0021, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 1.232, | |
| "grad_norm": 0.12553413212299347, | |
| "learning_rate": 0.00022730398383971812, | |
| "loss": 0.0019, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 1.248, | |
| "grad_norm": 0.0369916595518589, | |
| "learning_rate": 0.000227300342489892, | |
| "loss": 0.003, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 1.264, | |
| "grad_norm": 0.19732122123241425, | |
| "learning_rate": 0.0002272967011400658, | |
| "loss": 0.0024, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 0.02471228875219822, | |
| "learning_rate": 0.00022729305979023967, | |
| "loss": 0.0024, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "eval_loss": 0.002593559678643942, | |
| "eval_runtime": 27.2923, | |
| "eval_samples_per_second": 36.64, | |
| "eval_steps_per_second": 4.58, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 1.296, | |
| "grad_norm": 0.5299795269966125, | |
| "learning_rate": 0.0002272894184404135, | |
| "loss": 0.0019, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 1.312, | |
| "grad_norm": 0.03472663834691048, | |
| "learning_rate": 0.00022728577709058736, | |
| "loss": 0.003, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 1.328, | |
| "grad_norm": 0.09357739239931107, | |
| "learning_rate": 0.00022728213574076117, | |
| "loss": 0.0022, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 1.3439999999999999, | |
| "grad_norm": 0.01810472272336483, | |
| "learning_rate": 0.00022727849439093504, | |
| "loss": 0.0019, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 1.3599999999999999, | |
| "grad_norm": 0.024920647963881493, | |
| "learning_rate": 0.00022727485304110885, | |
| "loss": 0.0021, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 1.3599999999999999, | |
| "eval_loss": 0.0022232765331864357, | |
| "eval_runtime": 27.3645, | |
| "eval_samples_per_second": 36.544, | |
| "eval_steps_per_second": 4.568, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 1.376, | |
| "grad_norm": 0.03085111826658249, | |
| "learning_rate": 0.00022727121169128272, | |
| "loss": 0.0023, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 1.392, | |
| "grad_norm": 0.010742255486547947, | |
| "learning_rate": 0.00022726757034145654, | |
| "loss": 0.0019, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 1.408, | |
| "grad_norm": 0.03559265285730362, | |
| "learning_rate": 0.00022726392899163038, | |
| "loss": 0.0022, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 1.424, | |
| "grad_norm": 0.0898701399564743, | |
| "learning_rate": 0.00022726028764180422, | |
| "loss": 0.0028, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 0.026589710265398026, | |
| "learning_rate": 0.00022725664629197806, | |
| "loss": 0.0016, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "eval_loss": 0.00150102109182626, | |
| "eval_runtime": 27.6944, | |
| "eval_samples_per_second": 36.108, | |
| "eval_steps_per_second": 4.514, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 1.456, | |
| "grad_norm": 0.016303159296512604, | |
| "learning_rate": 0.0002272530049421519, | |
| "loss": 0.0024, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 1.472, | |
| "grad_norm": 0.01823027804493904, | |
| "learning_rate": 0.00022724936359232574, | |
| "loss": 0.0018, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 1.488, | |
| "grad_norm": 0.15236489474773407, | |
| "learning_rate": 0.00022724572224249958, | |
| "loss": 0.0024, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 1.504, | |
| "grad_norm": 0.03902558609843254, | |
| "learning_rate": 0.00022724208089267342, | |
| "loss": 0.0021, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 0.020767396315932274, | |
| "learning_rate": 0.00022723843954284726, | |
| "loss": 0.002, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "eval_loss": 0.001406910945661366, | |
| "eval_runtime": 27.6364, | |
| "eval_samples_per_second": 36.184, | |
| "eval_steps_per_second": 4.523, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 1.536, | |
| "grad_norm": 0.09269700944423676, | |
| "learning_rate": 0.0002272347981930211, | |
| "loss": 0.0023, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 1.552, | |
| "grad_norm": 0.04058321192860603, | |
| "learning_rate": 0.00022723115684319495, | |
| "loss": 0.0019, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 1.568, | |
| "grad_norm": 0.04894057661294937, | |
| "learning_rate": 0.0002272275154933688, | |
| "loss": 0.0018, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 1.584, | |
| "grad_norm": 0.04043205827474594, | |
| "learning_rate": 0.00022722387414354263, | |
| "loss": 0.0022, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 0.1002797931432724, | |
| "learning_rate": 0.00022722023279371647, | |
| "loss": 0.002, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_loss": 0.0017908032750710845, | |
| "eval_runtime": 27.5843, | |
| "eval_samples_per_second": 36.253, | |
| "eval_steps_per_second": 4.532, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 1.616, | |
| "grad_norm": 0.02161436155438423, | |
| "learning_rate": 0.0002272165914438903, | |
| "loss": 0.0018, | |
| "step": 101000 | |
| }, | |
| { | |
| "epoch": 1.6320000000000001, | |
| "grad_norm": 0.010246573947370052, | |
| "learning_rate": 0.00022721295009406415, | |
| "loss": 0.002, | |
| "step": 102000 | |
| }, | |
| { | |
| "epoch": 1.6480000000000001, | |
| "grad_norm": 0.06802576035261154, | |
| "learning_rate": 0.000227209308744238, | |
| "loss": 0.0015, | |
| "step": 103000 | |
| }, | |
| { | |
| "epoch": 1.6640000000000001, | |
| "grad_norm": 0.013391965068876743, | |
| "learning_rate": 0.00022720566739441183, | |
| "loss": 0.0025, | |
| "step": 104000 | |
| }, | |
| { | |
| "epoch": 1.6800000000000002, | |
| "grad_norm": 0.10946637392044067, | |
| "learning_rate": 0.00022720202604458568, | |
| "loss": 0.0018, | |
| "step": 105000 | |
| }, | |
| { | |
| "epoch": 1.6800000000000002, | |
| "eval_loss": 0.0015562042826786637, | |
| "eval_runtime": 27.7549, | |
| "eval_samples_per_second": 36.03, | |
| "eval_steps_per_second": 4.504, | |
| "step": 105000 | |
| }, | |
| { | |
| "epoch": 1.696, | |
| "grad_norm": 0.028942033648490906, | |
| "learning_rate": 0.00022719838469475952, | |
| "loss": 0.002, | |
| "step": 106000 | |
| }, | |
| { | |
| "epoch": 1.712, | |
| "grad_norm": 0.023039843887090683, | |
| "learning_rate": 0.00022719474334493333, | |
| "loss": 0.0014, | |
| "step": 107000 | |
| }, | |
| { | |
| "epoch": 1.728, | |
| "grad_norm": 0.010488491505384445, | |
| "learning_rate": 0.0002271911019951072, | |
| "loss": 0.0016, | |
| "step": 108000 | |
| }, | |
| { | |
| "epoch": 1.744, | |
| "grad_norm": 0.019485417753458023, | |
| "learning_rate": 0.000227187460645281, | |
| "loss": 0.002, | |
| "step": 109000 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 0.010597913525998592, | |
| "learning_rate": 0.00022718381929545488, | |
| "loss": 0.002, | |
| "step": 110000 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "eval_loss": 0.000878525257576257, | |
| "eval_runtime": 27.823, | |
| "eval_samples_per_second": 35.941, | |
| "eval_steps_per_second": 4.493, | |
| "step": 110000 | |
| }, | |
| { | |
| "epoch": 1.776, | |
| "grad_norm": 0.02870281971991062, | |
| "learning_rate": 0.0002271801779456287, | |
| "loss": 0.0023, | |
| "step": 111000 | |
| }, | |
| { | |
| "epoch": 1.792, | |
| "grad_norm": 0.041255537420511246, | |
| "learning_rate": 0.00022717653659580256, | |
| "loss": 0.0014, | |
| "step": 112000 | |
| }, | |
| { | |
| "epoch": 1.808, | |
| "grad_norm": 0.04701690748333931, | |
| "learning_rate": 0.00022717289524597638, | |
| "loss": 0.0015, | |
| "step": 113000 | |
| }, | |
| { | |
| "epoch": 1.8239999999999998, | |
| "grad_norm": 0.059342917054891586, | |
| "learning_rate": 0.00022716925389615024, | |
| "loss": 0.0028, | |
| "step": 114000 | |
| }, | |
| { | |
| "epoch": 1.8399999999999999, | |
| "grad_norm": 0.040327928960323334, | |
| "learning_rate": 0.00022716561254632406, | |
| "loss": 0.0014, | |
| "step": 115000 | |
| }, | |
| { | |
| "epoch": 1.8399999999999999, | |
| "eval_loss": 0.0016007705125957727, | |
| "eval_runtime": 27.7178, | |
| "eval_samples_per_second": 36.078, | |
| "eval_steps_per_second": 4.51, | |
| "step": 115000 | |
| }, | |
| { | |
| "epoch": 1.8559999999999999, | |
| "grad_norm": 0.018858684226870537, | |
| "learning_rate": 0.00022716197119649793, | |
| "loss": 0.002, | |
| "step": 116000 | |
| }, | |
| { | |
| "epoch": 1.8719999999999999, | |
| "grad_norm": 0.026660999283194542, | |
| "learning_rate": 0.00022715832984667174, | |
| "loss": 0.0019, | |
| "step": 117000 | |
| }, | |
| { | |
| "epoch": 1.888, | |
| "grad_norm": 0.08471547812223434, | |
| "learning_rate": 0.0002271546884968456, | |
| "loss": 0.0016, | |
| "step": 118000 | |
| }, | |
| { | |
| "epoch": 1.904, | |
| "grad_norm": 0.03236541524529457, | |
| "learning_rate": 0.00022715104714701942, | |
| "loss": 0.0014, | |
| "step": 119000 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 0.015728328377008438, | |
| "learning_rate": 0.0002271474057971933, | |
| "loss": 0.0022, | |
| "step": 120000 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "eval_loss": 0.0017823727102950215, | |
| "eval_runtime": 27.6601, | |
| "eval_samples_per_second": 36.153, | |
| "eval_steps_per_second": 4.519, | |
| "step": 120000 | |
| }, | |
| { | |
| "epoch": 1.936, | |
| "grad_norm": 0.2575147747993469, | |
| "learning_rate": 0.0002271437644473671, | |
| "loss": 0.0015, | |
| "step": 121000 | |
| }, | |
| { | |
| "epoch": 1.952, | |
| "grad_norm": 0.03020591102540493, | |
| "learning_rate": 0.00022714012309754097, | |
| "loss": 0.0015, | |
| "step": 122000 | |
| }, | |
| { | |
| "epoch": 1.968, | |
| "grad_norm": 0.011387010104954243, | |
| "learning_rate": 0.0002271364817477148, | |
| "loss": 0.0015, | |
| "step": 123000 | |
| }, | |
| { | |
| "epoch": 1.984, | |
| "grad_norm": 0.033326998353004456, | |
| "learning_rate": 0.00022713284039788866, | |
| "loss": 0.0019, | |
| "step": 124000 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.234897643327713, | |
| "learning_rate": 0.00022712919904806247, | |
| "loss": 0.0029, | |
| "step": 125000 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.00910487212240696, | |
| "eval_runtime": 27.7906, | |
| "eval_samples_per_second": 35.983, | |
| "eval_steps_per_second": 4.498, | |
| "step": 125000 | |
| }, | |
| { | |
| "epoch": 2.016, | |
| "grad_norm": 0.05067163705825806, | |
| "learning_rate": 0.0002271255576982363, | |
| "loss": 0.0019, | |
| "step": 126000 | |
| }, | |
| { | |
| "epoch": 2.032, | |
| "grad_norm": 0.015078851021826267, | |
| "learning_rate": 0.00022712191634841015, | |
| "loss": 0.0012, | |
| "step": 127000 | |
| }, | |
| { | |
| "epoch": 2.048, | |
| "grad_norm": 0.03365013748407364, | |
| "learning_rate": 0.000227118274998584, | |
| "loss": 0.0018, | |
| "step": 128000 | |
| }, | |
| { | |
| "epoch": 2.064, | |
| "grad_norm": 0.00802704505622387, | |
| "learning_rate": 0.00022711463364875783, | |
| "loss": 0.0013, | |
| "step": 129000 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 0.011523068882524967, | |
| "learning_rate": 0.00022711099229893168, | |
| "loss": 0.0021, | |
| "step": 130000 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "eval_loss": 0.0009301243117079139, | |
| "eval_runtime": 27.505, | |
| "eval_samples_per_second": 36.357, | |
| "eval_steps_per_second": 4.545, | |
| "step": 130000 | |
| }, | |
| { | |
| "epoch": 2.096, | |
| "grad_norm": 0.012680677697062492, | |
| "learning_rate": 0.00022710735094910552, | |
| "loss": 0.0014, | |
| "step": 131000 | |
| }, | |
| { | |
| "epoch": 2.112, | |
| "grad_norm": 0.0508689247071743, | |
| "learning_rate": 0.00022710370959927936, | |
| "loss": 0.002, | |
| "step": 132000 | |
| }, | |
| { | |
| "epoch": 2.128, | |
| "grad_norm": 0.014830244705080986, | |
| "learning_rate": 0.0002271000682494532, | |
| "loss": 0.001, | |
| "step": 133000 | |
| }, | |
| { | |
| "epoch": 2.144, | |
| "grad_norm": 0.028912167996168137, | |
| "learning_rate": 0.00022709642689962704, | |
| "loss": 0.0019, | |
| "step": 134000 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 0.06254349648952484, | |
| "learning_rate": 0.00022709278554980088, | |
| "loss": 0.0012, | |
| "step": 135000 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "eval_loss": 0.0014802517835050821, | |
| "eval_runtime": 27.695, | |
| "eval_samples_per_second": 36.108, | |
| "eval_steps_per_second": 4.513, | |
| "step": 135000 | |
| }, | |
| { | |
| "epoch": 2.176, | |
| "grad_norm": 0.01877821609377861, | |
| "learning_rate": 0.00022708914419997472, | |
| "loss": 0.0015, | |
| "step": 136000 | |
| }, | |
| { | |
| "epoch": 2.192, | |
| "grad_norm": 0.18786460161209106, | |
| "learning_rate": 0.00022708550285014856, | |
| "loss": 0.0018, | |
| "step": 137000 | |
| }, | |
| { | |
| "epoch": 2.208, | |
| "grad_norm": 0.016280388459563255, | |
| "learning_rate": 0.0002270818615003224, | |
| "loss": 0.0015, | |
| "step": 138000 | |
| }, | |
| { | |
| "epoch": 2.224, | |
| "grad_norm": 0.009028231725096703, | |
| "learning_rate": 0.00022707822015049625, | |
| "loss": 0.0022, | |
| "step": 139000 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 0.02473852038383484, | |
| "learning_rate": 0.0002270745788006701, | |
| "loss": 0.0011, | |
| "step": 140000 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "eval_loss": 0.0011171329533681273, | |
| "eval_runtime": 27.6717, | |
| "eval_samples_per_second": 36.138, | |
| "eval_steps_per_second": 4.517, | |
| "step": 140000 | |
| }, | |
| { | |
| "epoch": 2.2560000000000002, | |
| "grad_norm": 0.015900999307632446, | |
| "learning_rate": 0.00022707093745084393, | |
| "loss": 0.0015, | |
| "step": 141000 | |
| }, | |
| { | |
| "epoch": 2.2720000000000002, | |
| "grad_norm": 0.018436668440699577, | |
| "learning_rate": 0.00022706729610101774, | |
| "loss": 0.0015, | |
| "step": 142000 | |
| }, | |
| { | |
| "epoch": 2.288, | |
| "grad_norm": 0.268839567899704, | |
| "learning_rate": 0.0002270636547511916, | |
| "loss": 0.0013, | |
| "step": 143000 | |
| }, | |
| { | |
| "epoch": 2.304, | |
| "grad_norm": 0.024980826303362846, | |
| "learning_rate": 0.00022706001340136542, | |
| "loss": 0.0017, | |
| "step": 144000 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 0.025631515309214592, | |
| "learning_rate": 0.00022705637205153926, | |
| "loss": 0.0009, | |
| "step": 145000 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "eval_loss": 0.0012023365125060081, | |
| "eval_runtime": 27.5991, | |
| "eval_samples_per_second": 36.233, | |
| "eval_steps_per_second": 4.529, | |
| "step": 145000 | |
| }, | |
| { | |
| "epoch": 2.336, | |
| "grad_norm": 0.010165953077375889, | |
| "learning_rate": 0.0002270527307017131, | |
| "loss": 0.0018, | |
| "step": 146000 | |
| }, | |
| { | |
| "epoch": 2.352, | |
| "grad_norm": 0.012398986145853996, | |
| "learning_rate": 0.00022704908935188695, | |
| "loss": 0.001, | |
| "step": 147000 | |
| }, | |
| { | |
| "epoch": 2.368, | |
| "grad_norm": 0.02246440201997757, | |
| "learning_rate": 0.0002270454480020608, | |
| "loss": 0.0025, | |
| "step": 148000 | |
| }, | |
| { | |
| "epoch": 2.384, | |
| "grad_norm": 0.018412381410598755, | |
| "learning_rate": 0.00022704180665223463, | |
| "loss": 0.0008, | |
| "step": 149000 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 0.025599336251616478, | |
| "learning_rate": 0.00022703816530240847, | |
| "loss": 0.0025, | |
| "step": 150000 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "eval_loss": 0.000995820271782577, | |
| "eval_runtime": 27.7548, | |
| "eval_samples_per_second": 36.03, | |
| "eval_steps_per_second": 4.504, | |
| "step": 150000 | |
| }, | |
| { | |
| "epoch": 2.416, | |
| "grad_norm": 0.03476562350988388, | |
| "learning_rate": 0.0002270345239525823, | |
| "loss": 0.0016, | |
| "step": 151000 | |
| }, | |
| { | |
| "epoch": 2.432, | |
| "grad_norm": 0.002502072835341096, | |
| "learning_rate": 0.00022703088260275615, | |
| "loss": 0.001, | |
| "step": 152000 | |
| }, | |
| { | |
| "epoch": 2.448, | |
| "grad_norm": 0.09545526653528214, | |
| "learning_rate": 0.00022702724125293, | |
| "loss": 0.0019, | |
| "step": 153000 | |
| }, | |
| { | |
| "epoch": 2.464, | |
| "grad_norm": 0.026374874636530876, | |
| "learning_rate": 0.00022702359990310383, | |
| "loss": 0.0027, | |
| "step": 154000 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 0.02330603636801243, | |
| "learning_rate": 0.00022701995855327768, | |
| "loss": 0.0013, | |
| "step": 155000 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "eval_loss": 0.0009146310039795935, | |
| "eval_runtime": 27.6699, | |
| "eval_samples_per_second": 36.14, | |
| "eval_steps_per_second": 4.518, | |
| "step": 155000 | |
| }, | |
| { | |
| "epoch": 2.496, | |
| "grad_norm": 0.042115718126297, | |
| "learning_rate": 0.00022701631720345152, | |
| "loss": 0.001, | |
| "step": 156000 | |
| }, | |
| { | |
| "epoch": 2.512, | |
| "grad_norm": 0.006467332132160664, | |
| "learning_rate": 0.00022701267585362536, | |
| "loss": 0.0013, | |
| "step": 157000 | |
| }, | |
| { | |
| "epoch": 2.528, | |
| "grad_norm": 0.039700523018836975, | |
| "learning_rate": 0.0002270090345037992, | |
| "loss": 0.0012, | |
| "step": 158000 | |
| }, | |
| { | |
| "epoch": 2.544, | |
| "grad_norm": 0.006177098024636507, | |
| "learning_rate": 0.00022700539315397304, | |
| "loss": 0.0032, | |
| "step": 159000 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 0.016644610092043877, | |
| "learning_rate": 0.00022700175180414688, | |
| "loss": 0.0007, | |
| "step": 160000 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "eval_loss": 0.0010344331385567784, | |
| "eval_runtime": 27.8065, | |
| "eval_samples_per_second": 35.963, | |
| "eval_steps_per_second": 4.495, | |
| "step": 160000 | |
| }, | |
| { | |
| "epoch": 2.576, | |
| "grad_norm": 0.01400495320558548, | |
| "learning_rate": 0.00022699811045432072, | |
| "loss": 0.0012, | |
| "step": 161000 | |
| }, | |
| { | |
| "epoch": 2.592, | |
| "grad_norm": 0.016703518107533455, | |
| "learning_rate": 0.00022699446910449456, | |
| "loss": 0.0012, | |
| "step": 162000 | |
| }, | |
| { | |
| "epoch": 2.608, | |
| "grad_norm": 0.006359017454087734, | |
| "learning_rate": 0.0002269908277546684, | |
| "loss": 0.0012, | |
| "step": 163000 | |
| }, | |
| { | |
| "epoch": 2.624, | |
| "grad_norm": 0.01771441660821438, | |
| "learning_rate": 0.00022698718640484222, | |
| "loss": 0.0016, | |
| "step": 164000 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 0.01094936951994896, | |
| "learning_rate": 0.0002269835450550161, | |
| "loss": 0.0011, | |
| "step": 165000 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "eval_loss": 0.0007599141681566834, | |
| "eval_runtime": 27.7146, | |
| "eval_samples_per_second": 36.082, | |
| "eval_steps_per_second": 4.51, | |
| "step": 165000 | |
| }, | |
| { | |
| "epoch": 2.656, | |
| "grad_norm": 0.09152177721261978, | |
| "learning_rate": 0.0002269799037051899, | |
| "loss": 0.0024, | |
| "step": 166000 | |
| }, | |
| { | |
| "epoch": 2.672, | |
| "grad_norm": 0.012105804868042469, | |
| "learning_rate": 0.00022697626235536377, | |
| "loss": 0.0009, | |
| "step": 167000 | |
| }, | |
| { | |
| "epoch": 2.6879999999999997, | |
| "grad_norm": 0.01530654076486826, | |
| "learning_rate": 0.00022697262100553758, | |
| "loss": 0.0011, | |
| "step": 168000 | |
| }, | |
| { | |
| "epoch": 2.7039999999999997, | |
| "grad_norm": 0.031053414568305016, | |
| "learning_rate": 0.00022696897965571145, | |
| "loss": 0.0015, | |
| "step": 169000 | |
| }, | |
| { | |
| "epoch": 2.7199999999999998, | |
| "grad_norm": 0.01557753887027502, | |
| "learning_rate": 0.00022696533830588527, | |
| "loss": 0.001, | |
| "step": 170000 | |
| }, | |
| { | |
| "epoch": 2.7199999999999998, | |
| "eval_loss": 0.0008088626782409847, | |
| "eval_runtime": 27.776, | |
| "eval_samples_per_second": 36.002, | |
| "eval_steps_per_second": 4.5, | |
| "step": 170000 | |
| }, | |
| { | |
| "epoch": 2.7359999999999998, | |
| "grad_norm": 0.02831295132637024, | |
| "learning_rate": 0.00022696169695605913, | |
| "loss": 0.0014, | |
| "step": 171000 | |
| }, | |
| { | |
| "epoch": 2.752, | |
| "grad_norm": 0.017672572284936905, | |
| "learning_rate": 0.00022695805560623295, | |
| "loss": 0.0011, | |
| "step": 172000 | |
| }, | |
| { | |
| "epoch": 2.768, | |
| "grad_norm": 0.018164193257689476, | |
| "learning_rate": 0.00022695441425640682, | |
| "loss": 0.0019, | |
| "step": 173000 | |
| }, | |
| { | |
| "epoch": 2.784, | |
| "grad_norm": 0.017383994534611702, | |
| "learning_rate": 0.00022695077290658063, | |
| "loss": 0.001, | |
| "step": 174000 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 0.006576849147677422, | |
| "learning_rate": 0.0002269471315567545, | |
| "loss": 0.0011, | |
| "step": 175000 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "eval_loss": 0.0006260189693421125, | |
| "eval_runtime": 27.3919, | |
| "eval_samples_per_second": 36.507, | |
| "eval_steps_per_second": 4.563, | |
| "step": 175000 | |
| }, | |
| { | |
| "epoch": 2.816, | |
| "grad_norm": 0.019615883007645607, | |
| "learning_rate": 0.0002269434902069283, | |
| "loss": 0.0012, | |
| "step": 176000 | |
| }, | |
| { | |
| "epoch": 2.832, | |
| "grad_norm": 0.03926165774464607, | |
| "learning_rate": 0.00022693984885710218, | |
| "loss": 0.0014, | |
| "step": 177000 | |
| }, | |
| { | |
| "epoch": 2.848, | |
| "grad_norm": 0.021534917876124382, | |
| "learning_rate": 0.000226936207507276, | |
| "loss": 0.0012, | |
| "step": 178000 | |
| }, | |
| { | |
| "epoch": 2.864, | |
| "grad_norm": 0.04047563299536705, | |
| "learning_rate": 0.00022693256615744986, | |
| "loss": 0.001, | |
| "step": 179000 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 0.04712160676717758, | |
| "learning_rate": 0.00022692892480762368, | |
| "loss": 0.0015, | |
| "step": 180000 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "eval_loss": 0.0013630291214212775, | |
| "eval_runtime": 27.4056, | |
| "eval_samples_per_second": 36.489, | |
| "eval_steps_per_second": 4.561, | |
| "step": 180000 | |
| }, | |
| { | |
| "epoch": 2.896, | |
| "grad_norm": 0.21584591269493103, | |
| "learning_rate": 0.00022692528345779754, | |
| "loss": 0.0019, | |
| "step": 181000 | |
| }, | |
| { | |
| "epoch": 2.912, | |
| "grad_norm": 0.015519549138844013, | |
| "learning_rate": 0.00022692164210797136, | |
| "loss": 0.0012, | |
| "step": 182000 | |
| }, | |
| { | |
| "epoch": 2.928, | |
| "grad_norm": 0.0314391665160656, | |
| "learning_rate": 0.00022691800075814523, | |
| "loss": 0.0009, | |
| "step": 183000 | |
| }, | |
| { | |
| "epoch": 2.944, | |
| "grad_norm": 0.16906876862049103, | |
| "learning_rate": 0.00022691435940831904, | |
| "loss": 0.0013, | |
| "step": 184000 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 0.04538990557193756, | |
| "learning_rate": 0.00022691071805849288, | |
| "loss": 0.001, | |
| "step": 185000 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "eval_loss": 0.0014080323744565248, | |
| "eval_runtime": 27.3828, | |
| "eval_samples_per_second": 36.519, | |
| "eval_steps_per_second": 4.565, | |
| "step": 185000 | |
| }, | |
| { | |
| "epoch": 2.976, | |
| "grad_norm": 0.008023149333894253, | |
| "learning_rate": 0.00022690707670866672, | |
| "loss": 0.0013, | |
| "step": 186000 | |
| }, | |
| { | |
| "epoch": 2.992, | |
| "grad_norm": 0.011926773004233837, | |
| "learning_rate": 0.00022690343535884056, | |
| "loss": 0.0012, | |
| "step": 187000 | |
| }, | |
| { | |
| "epoch": 3.008, | |
| "grad_norm": 0.01701526716351509, | |
| "learning_rate": 0.0002268997940090144, | |
| "loss": 0.0011, | |
| "step": 188000 | |
| }, | |
| { | |
| "epoch": 3.024, | |
| "grad_norm": 0.015581037849187851, | |
| "learning_rate": 0.00022689615265918825, | |
| "loss": 0.0013, | |
| "step": 189000 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "grad_norm": 0.012046800926327705, | |
| "learning_rate": 0.0002268925113093621, | |
| "loss": 0.001, | |
| "step": 190000 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "eval_loss": 0.0010119588114321232, | |
| "eval_runtime": 27.6665, | |
| "eval_samples_per_second": 36.145, | |
| "eval_steps_per_second": 4.518, | |
| "step": 190000 | |
| }, | |
| { | |
| "epoch": 3.056, | |
| "grad_norm": 0.009263888001441956, | |
| "learning_rate": 0.00022688886995953593, | |
| "loss": 0.001, | |
| "step": 191000 | |
| }, | |
| { | |
| "epoch": 3.072, | |
| "grad_norm": 0.0538918599486351, | |
| "learning_rate": 0.00022688522860970977, | |
| "loss": 0.0012, | |
| "step": 192000 | |
| }, | |
| { | |
| "epoch": 3.088, | |
| "grad_norm": 0.0521121546626091, | |
| "learning_rate": 0.0002268815872598836, | |
| "loss": 0.0017, | |
| "step": 193000 | |
| }, | |
| { | |
| "epoch": 3.104, | |
| "grad_norm": 0.05000779777765274, | |
| "learning_rate": 0.00022687794591005745, | |
| "loss": 0.0008, | |
| "step": 194000 | |
| }, | |
| { | |
| "epoch": 3.12, | |
| "grad_norm": 0.06467895954847336, | |
| "learning_rate": 0.0002268743045602313, | |
| "loss": 0.0011, | |
| "step": 195000 | |
| }, | |
| { | |
| "epoch": 3.12, | |
| "eval_loss": 0.0008815609035082161, | |
| "eval_runtime": 27.5652, | |
| "eval_samples_per_second": 36.278, | |
| "eval_steps_per_second": 4.535, | |
| "step": 195000 | |
| }, | |
| { | |
| "epoch": 3.136, | |
| "grad_norm": 0.01422048918902874, | |
| "learning_rate": 0.00022687066321040513, | |
| "loss": 0.0011, | |
| "step": 196000 | |
| }, | |
| { | |
| "epoch": 3.152, | |
| "grad_norm": 0.02482694387435913, | |
| "learning_rate": 0.00022686702186057897, | |
| "loss": 0.0011, | |
| "step": 197000 | |
| }, | |
| { | |
| "epoch": 3.168, | |
| "grad_norm": 0.03517874330282211, | |
| "learning_rate": 0.00022686338051075282, | |
| "loss": 0.0017, | |
| "step": 198000 | |
| }, | |
| { | |
| "epoch": 3.184, | |
| "grad_norm": 0.027310600504279137, | |
| "learning_rate": 0.00022685973916092666, | |
| "loss": 0.0008, | |
| "step": 199000 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "grad_norm": 0.06521017849445343, | |
| "learning_rate": 0.0002268560978111005, | |
| "loss": 0.002, | |
| "step": 200000 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "eval_loss": 0.00754576688632369, | |
| "eval_runtime": 27.5143, | |
| "eval_samples_per_second": 36.345, | |
| "eval_steps_per_second": 4.543, | |
| "step": 200000 | |
| }, | |
| { | |
| "epoch": 3.216, | |
| "grad_norm": 0.24959920346736908, | |
| "learning_rate": 0.00022685245646127434, | |
| "loss": 0.0008, | |
| "step": 201000 | |
| }, | |
| { | |
| "epoch": 3.232, | |
| "grad_norm": 0.010456324554979801, | |
| "learning_rate": 0.00022684881511144818, | |
| "loss": 0.0011, | |
| "step": 202000 | |
| }, | |
| { | |
| "epoch": 3.248, | |
| "grad_norm": 0.010797294788062572, | |
| "learning_rate": 0.00022684517376162202, | |
| "loss": 0.0011, | |
| "step": 203000 | |
| }, | |
| { | |
| "epoch": 3.2640000000000002, | |
| "grad_norm": 0.04222773015499115, | |
| "learning_rate": 0.00022684153241179584, | |
| "loss": 0.001, | |
| "step": 204000 | |
| }, | |
| { | |
| "epoch": 3.2800000000000002, | |
| "grad_norm": 0.03277302905917168, | |
| "learning_rate": 0.0002268378910619697, | |
| "loss": 0.0015, | |
| "step": 205000 | |
| }, | |
| { | |
| "epoch": 3.2800000000000002, | |
| "eval_loss": 0.0007634469075128436, | |
| "eval_runtime": 27.7742, | |
| "eval_samples_per_second": 36.005, | |
| "eval_steps_per_second": 4.501, | |
| "step": 205000 | |
| }, | |
| { | |
| "epoch": 3.296, | |
| "grad_norm": 0.0069810631684958935, | |
| "learning_rate": 0.00022683424971214352, | |
| "loss": 0.001, | |
| "step": 206000 | |
| }, | |
| { | |
| "epoch": 3.312, | |
| "grad_norm": 0.01147681474685669, | |
| "learning_rate": 0.00022683060836231739, | |
| "loss": 0.0009, | |
| "step": 207000 | |
| }, | |
| { | |
| "epoch": 3.328, | |
| "grad_norm": 0.009766928851604462, | |
| "learning_rate": 0.0002268269670124912, | |
| "loss": 0.0019, | |
| "step": 208000 | |
| }, | |
| { | |
| "epoch": 3.344, | |
| "grad_norm": 0.03460145741701126, | |
| "learning_rate": 0.00022682332566266507, | |
| "loss": 0.0008, | |
| "step": 209000 | |
| }, | |
| { | |
| "epoch": 3.36, | |
| "grad_norm": 0.016247229650616646, | |
| "learning_rate": 0.00022681968431283888, | |
| "loss": 0.001, | |
| "step": 210000 | |
| }, | |
| { | |
| "epoch": 3.36, | |
| "eval_loss": 0.0010268606711179018, | |
| "eval_runtime": 26.9593, | |
| "eval_samples_per_second": 37.093, | |
| "eval_steps_per_second": 4.637, | |
| "step": 210000 | |
| }, | |
| { | |
| "epoch": 3.376, | |
| "grad_norm": 0.012766228057444096, | |
| "learning_rate": 0.00022681604296301275, | |
| "loss": 0.0008, | |
| "step": 211000 | |
| }, | |
| { | |
| "epoch": 3.392, | |
| "grad_norm": 0.005086794961243868, | |
| "learning_rate": 0.00022681240161318656, | |
| "loss": 0.0014, | |
| "step": 212000 | |
| }, | |
| { | |
| "epoch": 3.408, | |
| "grad_norm": 0.028264038264751434, | |
| "learning_rate": 0.00022680876026336043, | |
| "loss": 0.0011, | |
| "step": 213000 | |
| }, | |
| { | |
| "epoch": 3.424, | |
| "grad_norm": 0.05160939320921898, | |
| "learning_rate": 0.00022680511891353425, | |
| "loss": 0.0009, | |
| "step": 214000 | |
| }, | |
| { | |
| "epoch": 3.44, | |
| "grad_norm": 0.02259020321071148, | |
| "learning_rate": 0.00022680147756370811, | |
| "loss": 0.0012, | |
| "step": 215000 | |
| }, | |
| { | |
| "epoch": 3.44, | |
| "eval_loss": 0.0007602461846545339, | |
| "eval_runtime": 26.8881, | |
| "eval_samples_per_second": 37.191, | |
| "eval_steps_per_second": 4.649, | |
| "step": 215000 | |
| }, | |
| { | |
| "epoch": 3.456, | |
| "grad_norm": 0.03077981248497963, | |
| "learning_rate": 0.00022679783621388193, | |
| "loss": 0.0012, | |
| "step": 216000 | |
| }, | |
| { | |
| "epoch": 3.472, | |
| "grad_norm": 0.027997983619570732, | |
| "learning_rate": 0.0002267941948640558, | |
| "loss": 0.0008, | |
| "step": 217000 | |
| }, | |
| { | |
| "epoch": 3.488, | |
| "grad_norm": 0.009089149534702301, | |
| "learning_rate": 0.0002267905535142296, | |
| "loss": 0.0011, | |
| "step": 218000 | |
| }, | |
| { | |
| "epoch": 3.504, | |
| "grad_norm": 0.09043902903795242, | |
| "learning_rate": 0.00022678691216440348, | |
| "loss": 0.0011, | |
| "step": 219000 | |
| }, | |
| { | |
| "epoch": 3.52, | |
| "grad_norm": 0.06199198588728905, | |
| "learning_rate": 0.0002267832708145773, | |
| "loss": 0.0011, | |
| "step": 220000 | |
| }, | |
| { | |
| "epoch": 3.52, | |
| "eval_loss": 0.001106478739529848, | |
| "eval_runtime": 27.0055, | |
| "eval_samples_per_second": 37.029, | |
| "eval_steps_per_second": 4.629, | |
| "step": 220000 | |
| }, | |
| { | |
| "epoch": 3.536, | |
| "grad_norm": 0.013115255162119865, | |
| "learning_rate": 0.00022677962946475116, | |
| "loss": 0.0015, | |
| "step": 221000 | |
| }, | |
| { | |
| "epoch": 3.552, | |
| "grad_norm": 0.030206598341464996, | |
| "learning_rate": 0.00022677598811492498, | |
| "loss": 0.001, | |
| "step": 222000 | |
| }, | |
| { | |
| "epoch": 3.568, | |
| "grad_norm": 0.014335270039737225, | |
| "learning_rate": 0.00022677234676509882, | |
| "loss": 0.0008, | |
| "step": 223000 | |
| }, | |
| { | |
| "epoch": 3.584, | |
| "grad_norm": 0.04320364445447922, | |
| "learning_rate": 0.00022676870541527266, | |
| "loss": 0.001, | |
| "step": 224000 | |
| }, | |
| { | |
| "epoch": 3.6, | |
| "grad_norm": 0.01011396199464798, | |
| "learning_rate": 0.0002267650640654465, | |
| "loss": 0.0014, | |
| "step": 225000 | |
| }, | |
| { | |
| "epoch": 3.6, | |
| "eval_loss": 0.0008724904037080705, | |
| "eval_runtime": 26.7598, | |
| "eval_samples_per_second": 37.37, | |
| "eval_steps_per_second": 4.671, | |
| "step": 225000 | |
| }, | |
| { | |
| "epoch": 3.616, | |
| "grad_norm": 0.06343936175107956, | |
| "learning_rate": 0.00022676142271562034, | |
| "loss": 0.0009, | |
| "step": 226000 | |
| }, | |
| { | |
| "epoch": 3.632, | |
| "grad_norm": 0.04553668946027756, | |
| "learning_rate": 0.00022675778136579418, | |
| "loss": 0.001, | |
| "step": 227000 | |
| }, | |
| { | |
| "epoch": 3.648, | |
| "grad_norm": 0.0029150221962481737, | |
| "learning_rate": 0.00022675414001596802, | |
| "loss": 0.0018, | |
| "step": 228000 | |
| }, | |
| { | |
| "epoch": 3.664, | |
| "grad_norm": 0.03533324971795082, | |
| "learning_rate": 0.00022675049866614186, | |
| "loss": 0.0017, | |
| "step": 229000 | |
| }, | |
| { | |
| "epoch": 3.68, | |
| "grad_norm": 0.020134087651968002, | |
| "learning_rate": 0.0002267468573163157, | |
| "loss": 0.0013, | |
| "step": 230000 | |
| }, | |
| { | |
| "epoch": 3.68, | |
| "eval_loss": 0.001037033973261714, | |
| "eval_runtime": 27.1191, | |
| "eval_samples_per_second": 36.874, | |
| "eval_steps_per_second": 4.609, | |
| "step": 230000 | |
| }, | |
| { | |
| "epoch": 3.6959999999999997, | |
| "grad_norm": 0.01976308599114418, | |
| "learning_rate": 0.00022674321596648955, | |
| "loss": 0.0009, | |
| "step": 231000 | |
| }, | |
| { | |
| "epoch": 3.7119999999999997, | |
| "grad_norm": 0.05415629222989082, | |
| "learning_rate": 0.00022673957461666339, | |
| "loss": 0.0012, | |
| "step": 232000 | |
| }, | |
| { | |
| "epoch": 3.7279999999999998, | |
| "grad_norm": 0.020477378740906715, | |
| "learning_rate": 0.00022673593326683723, | |
| "loss": 0.001, | |
| "step": 233000 | |
| }, | |
| { | |
| "epoch": 3.7439999999999998, | |
| "grad_norm": 0.014153924770653248, | |
| "learning_rate": 0.00022673229191701107, | |
| "loss": 0.0017, | |
| "step": 234000 | |
| }, | |
| { | |
| "epoch": 3.76, | |
| "grad_norm": 0.02030963823199272, | |
| "learning_rate": 0.0002267286505671849, | |
| "loss": 0.0007, | |
| "step": 235000 | |
| }, | |
| { | |
| "epoch": 3.76, | |
| "eval_loss": 0.0007908450206741691, | |
| "eval_runtime": 27.0159, | |
| "eval_samples_per_second": 37.015, | |
| "eval_steps_per_second": 4.627, | |
| "step": 235000 | |
| }, | |
| { | |
| "epoch": 3.776, | |
| "grad_norm": 0.03953304514288902, | |
| "learning_rate": 0.00022672500921735875, | |
| "loss": 0.0008, | |
| "step": 236000 | |
| }, | |
| { | |
| "epoch": 3.792, | |
| "grad_norm": 0.007172519341111183, | |
| "learning_rate": 0.0002267213678675326, | |
| "loss": 0.0016, | |
| "step": 237000 | |
| }, | |
| { | |
| "epoch": 3.808, | |
| "grad_norm": 0.03694753348827362, | |
| "learning_rate": 0.00022671772651770643, | |
| "loss": 0.0008, | |
| "step": 238000 | |
| }, | |
| { | |
| "epoch": 3.824, | |
| "grad_norm": 0.04899757727980614, | |
| "learning_rate": 0.00022671408516788027, | |
| "loss": 0.0011, | |
| "step": 239000 | |
| }, | |
| { | |
| "epoch": 3.84, | |
| "grad_norm": 0.05499159172177315, | |
| "learning_rate": 0.00022671044381805412, | |
| "loss": 0.0013, | |
| "step": 240000 | |
| }, | |
| { | |
| "epoch": 3.84, | |
| "eval_loss": 0.0008275896543636918, | |
| "eval_runtime": 27.099, | |
| "eval_samples_per_second": 36.902, | |
| "eval_steps_per_second": 4.613, | |
| "step": 240000 | |
| }, | |
| { | |
| "epoch": 3.856, | |
| "grad_norm": 0.02498927153646946, | |
| "learning_rate": 0.00022670680246822796, | |
| "loss": 0.0008, | |
| "step": 241000 | |
| }, | |
| { | |
| "epoch": 3.872, | |
| "grad_norm": 0.02703891508281231, | |
| "learning_rate": 0.00022670316111840177, | |
| "loss": 0.0009, | |
| "step": 242000 | |
| }, | |
| { | |
| "epoch": 3.888, | |
| "grad_norm": 0.010871395468711853, | |
| "learning_rate": 0.00022669951976857564, | |
| "loss": 0.0009, | |
| "step": 243000 | |
| }, | |
| { | |
| "epoch": 3.904, | |
| "grad_norm": 0.006647611036896706, | |
| "learning_rate": 0.00022669587841874945, | |
| "loss": 0.0019, | |
| "step": 244000 | |
| }, | |
| { | |
| "epoch": 3.92, | |
| "grad_norm": 0.11232209205627441, | |
| "learning_rate": 0.00022669223706892332, | |
| "loss": 0.0006, | |
| "step": 245000 | |
| }, | |
| { | |
| "epoch": 3.92, | |
| "eval_loss": 0.004233696032315493, | |
| "eval_runtime": 27.2042, | |
| "eval_samples_per_second": 36.759, | |
| "eval_steps_per_second": 4.595, | |
| "step": 245000 | |
| }, | |
| { | |
| "epoch": 3.936, | |
| "grad_norm": 0.03585943579673767, | |
| "learning_rate": 0.00022668859571909713, | |
| "loss": 0.0012, | |
| "step": 246000 | |
| }, | |
| { | |
| "epoch": 3.952, | |
| "grad_norm": 0.028422392904758453, | |
| "learning_rate": 0.000226684954369271, | |
| "loss": 0.0009, | |
| "step": 247000 | |
| }, | |
| { | |
| "epoch": 3.968, | |
| "grad_norm": 0.029626131057739258, | |
| "learning_rate": 0.00022668131301944482, | |
| "loss": 0.0009, | |
| "step": 248000 | |
| }, | |
| { | |
| "epoch": 3.984, | |
| "grad_norm": 0.01423815730959177, | |
| "learning_rate": 0.00022667767166961866, | |
| "loss": 0.0011, | |
| "step": 249000 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 0.028744470328092575, | |
| "learning_rate": 0.0002266740303197925, | |
| "loss": 0.0012, | |
| "step": 250000 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 0.0009512793621979654, | |
| "eval_runtime": 27.0826, | |
| "eval_samples_per_second": 36.924, | |
| "eval_steps_per_second": 4.616, | |
| "step": 250000 | |
| }, | |
| { | |
| "epoch": 4.016, | |
| "grad_norm": 0.05679468810558319, | |
| "learning_rate": 0.00022667038896996634, | |
| "loss": 0.0008, | |
| "step": 251000 | |
| }, | |
| { | |
| "epoch": 4.032, | |
| "grad_norm": 0.01259209681302309, | |
| "learning_rate": 0.00022666674762014018, | |
| "loss": 0.0012, | |
| "step": 252000 | |
| }, | |
| { | |
| "epoch": 4.048, | |
| "grad_norm": 0.02058994211256504, | |
| "learning_rate": 0.00022666310627031402, | |
| "loss": 0.0007, | |
| "step": 253000 | |
| }, | |
| { | |
| "epoch": 4.064, | |
| "grad_norm": 0.028425488620996475, | |
| "learning_rate": 0.00022665946492048786, | |
| "loss": 0.0017, | |
| "step": 254000 | |
| }, | |
| { | |
| "epoch": 4.08, | |
| "grad_norm": 0.035576559603214264, | |
| "learning_rate": 0.0002266558235706617, | |
| "loss": 0.0008, | |
| "step": 255000 | |
| }, | |
| { | |
| "epoch": 4.08, | |
| "eval_loss": 0.0006453625974245369, | |
| "eval_runtime": 27.5028, | |
| "eval_samples_per_second": 36.36, | |
| "eval_steps_per_second": 4.545, | |
| "step": 255000 | |
| } | |
| ], | |
| "logging_steps": 1000, | |
| "max_steps": 62500000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1000, | |
| "save_steps": 5000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.9094798804101104e+19, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |