{ "best_global_step": 175000, "best_metric": 0.0006260189693421125, "best_model_checkpoint": "/data/bozos/models/f8d245da3b0d0e66db4c97688fe67d8c31303d4f662c4b64e5da18eb8964c893/checkpoints/checkpoint-175000", "epoch": 4.08, "eval_steps": 5000, "global_step": 255000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016, "grad_norm": 2.213568925857544, "learning_rate": 0.00022758072642650628, "loss": 1.732, "step": 1000 }, { "epoch": 0.032, "grad_norm": 0.6694766879081726, "learning_rate": 0.00022757708507668012, "loss": 0.4826, "step": 2000 }, { "epoch": 0.048, "grad_norm": 2.43005633354187, "learning_rate": 0.00022757344372685396, "loss": 0.1817, "step": 3000 }, { "epoch": 0.064, "grad_norm": 0.4847993850708008, "learning_rate": 0.0002275698023770278, "loss": 0.1264, "step": 4000 }, { "epoch": 0.08, "grad_norm": 0.162339448928833, "learning_rate": 0.00022756616102720165, "loss": 0.0816, "step": 5000 }, { "epoch": 0.08, "eval_loss": 0.05317778140306473, "eval_runtime": 27.3774, "eval_samples_per_second": 36.526, "eval_steps_per_second": 4.566, "step": 5000 }, { "epoch": 0.096, "grad_norm": 0.13133646547794342, "learning_rate": 0.0002275625196773755, "loss": 0.0531, "step": 6000 }, { "epoch": 0.112, "grad_norm": 0.16350027918815613, "learning_rate": 0.00022755887832754933, "loss": 0.0471, "step": 7000 }, { "epoch": 0.128, "grad_norm": 0.15041467547416687, "learning_rate": 0.00022755523697772317, "loss": 0.0303, "step": 8000 }, { "epoch": 0.144, "grad_norm": 0.22431999444961548, "learning_rate": 0.000227551595627897, "loss": 0.0213, "step": 9000 }, { "epoch": 0.16, "grad_norm": 0.09702177345752716, "learning_rate": 0.00022754795427807082, "loss": 0.016, "step": 10000 }, { "epoch": 0.16, "eval_loss": 0.011971595697104931, "eval_runtime": 27.2488, "eval_samples_per_second": 36.699, "eval_steps_per_second": 4.587, "step": 10000 }, { "epoch": 0.176, "grad_norm": 0.22383971512317657, "learning_rate": 0.0002275443129282447, "loss": 0.0182, "step": 11000 }, { "epoch": 0.192, "grad_norm": 0.10483340919017792, "learning_rate": 0.0002275406715784185, "loss": 0.0099, "step": 12000 }, { "epoch": 0.208, "grad_norm": 0.06974471360445023, "learning_rate": 0.00022753703022859238, "loss": 0.0102, "step": 13000 }, { "epoch": 0.224, "grad_norm": 0.08179257810115814, "learning_rate": 0.0002275333888787662, "loss": 0.0095, "step": 14000 }, { "epoch": 0.24, "grad_norm": 0.06491447985172272, "learning_rate": 0.00022752974752894006, "loss": 0.0108, "step": 15000 }, { "epoch": 0.24, "eval_loss": 0.007058731280267239, "eval_runtime": 27.4538, "eval_samples_per_second": 36.425, "eval_steps_per_second": 4.553, "step": 15000 }, { "epoch": 0.256, "grad_norm": 0.07762598991394043, "learning_rate": 0.00022752610617911387, "loss": 0.0083, "step": 16000 }, { "epoch": 0.272, "grad_norm": 0.09719238430261612, "learning_rate": 0.00022752246482928774, "loss": 0.0088, "step": 17000 }, { "epoch": 0.288, "grad_norm": 0.0923658162355423, "learning_rate": 0.00022751882347946155, "loss": 0.0058, "step": 18000 }, { "epoch": 0.304, "grad_norm": 0.07014696300029755, "learning_rate": 0.00022751518212963542, "loss": 0.0075, "step": 19000 }, { "epoch": 0.32, "grad_norm": 0.060413043946027756, "learning_rate": 0.00022751154077980924, "loss": 0.006, "step": 20000 }, { "epoch": 0.32, "eval_loss": 0.004977255128324032, "eval_runtime": 27.1268, "eval_samples_per_second": 36.864, "eval_steps_per_second": 4.608, "step": 20000 }, { "epoch": 0.336, "grad_norm": 0.07979925721883774, "learning_rate": 0.0002275078994299831, "loss": 0.0074, "step": 21000 }, { "epoch": 0.352, "grad_norm": 0.07468965649604797, "learning_rate": 0.00022750425808015692, "loss": 0.0061, "step": 22000 }, { "epoch": 0.368, "grad_norm": 0.11379896104335785, "learning_rate": 0.00022750061673033079, "loss": 0.0066, "step": 23000 }, { "epoch": 0.384, "grad_norm": 0.14029712975025177, "learning_rate": 0.0002274969753805046, "loss": 0.0048, "step": 24000 }, { "epoch": 0.4, "grad_norm": 0.025649070739746094, "learning_rate": 0.00022749333403067844, "loss": 0.0062, "step": 25000 }, { "epoch": 0.4, "eval_loss": 0.00377740734256804, "eval_runtime": 27.2272, "eval_samples_per_second": 36.728, "eval_steps_per_second": 4.591, "step": 25000 }, { "epoch": 0.416, "grad_norm": 0.07835651934146881, "learning_rate": 0.00022748969268085228, "loss": 0.0057, "step": 26000 }, { "epoch": 0.432, "grad_norm": 0.037621937692165375, "learning_rate": 0.00022748605133102612, "loss": 0.0043, "step": 27000 }, { "epoch": 0.448, "grad_norm": 0.05530184134840965, "learning_rate": 0.00022748240998119996, "loss": 0.0078, "step": 28000 }, { "epoch": 0.464, "grad_norm": 0.2537539601325989, "learning_rate": 0.0002274787686313738, "loss": 0.004, "step": 29000 }, { "epoch": 0.48, "grad_norm": 0.08855901658535004, "learning_rate": 0.00022747512728154765, "loss": 0.0055, "step": 30000 }, { "epoch": 0.48, "eval_loss": 0.00407881336286664, "eval_runtime": 27.3314, "eval_samples_per_second": 36.588, "eval_steps_per_second": 4.573, "step": 30000 }, { "epoch": 0.496, "grad_norm": 0.01860993541777134, "learning_rate": 0.0002274714859317215, "loss": 0.0044, "step": 31000 }, { "epoch": 0.512, "grad_norm": 0.030549678951501846, "learning_rate": 0.00022746784458189533, "loss": 0.0062, "step": 32000 }, { "epoch": 0.528, "grad_norm": 0.07974190264940262, "learning_rate": 0.00022746420323206917, "loss": 0.0044, "step": 33000 }, { "epoch": 0.544, "grad_norm": 0.07146530598402023, "learning_rate": 0.000227460561882243, "loss": 0.0033, "step": 34000 }, { "epoch": 0.56, "grad_norm": 0.03786474093794823, "learning_rate": 0.00022745692053241685, "loss": 0.0064, "step": 35000 }, { "epoch": 0.56, "eval_loss": 0.002913910197094083, "eval_runtime": 27.2895, "eval_samples_per_second": 36.644, "eval_steps_per_second": 4.581, "step": 35000 }, { "epoch": 0.576, "grad_norm": 1.5708693265914917, "learning_rate": 0.0002274532791825907, "loss": 0.0048, "step": 36000 }, { "epoch": 0.592, "grad_norm": 0.04259568825364113, "learning_rate": 0.0002274496378327645, "loss": 0.0027, "step": 37000 }, { "epoch": 0.608, "grad_norm": 0.029481125995516777, "learning_rate": 0.00022744599648293838, "loss": 0.0049, "step": 38000 }, { "epoch": 0.624, "grad_norm": 0.3993789553642273, "learning_rate": 0.0002274423551331122, "loss": 0.0048, "step": 39000 }, { "epoch": 0.64, "grad_norm": 0.03810903802514076, "learning_rate": 0.00022743871378328606, "loss": 0.0027, "step": 40000 }, { "epoch": 0.64, "eval_loss": 0.00209135003387928, "eval_runtime": 27.0955, "eval_samples_per_second": 36.906, "eval_steps_per_second": 4.613, "step": 40000 }, { "epoch": 0.656, "grad_norm": 0.027073705568909645, "learning_rate": 0.00022743507243345987, "loss": 0.0033, "step": 41000 }, { "epoch": 0.672, "grad_norm": 0.04906334728002548, "learning_rate": 0.00022743143108363374, "loss": 0.0033, "step": 42000 }, { "epoch": 0.688, "grad_norm": 0.05806988850235939, "learning_rate": 0.00022742778973380755, "loss": 0.0039, "step": 43000 }, { "epoch": 0.704, "grad_norm": 0.022845715284347534, "learning_rate": 0.00022742414838398142, "loss": 0.0031, "step": 44000 }, { "epoch": 0.72, "grad_norm": 0.06443994492292404, "learning_rate": 0.00022742050703415524, "loss": 0.0027, "step": 45000 }, { "epoch": 0.72, "eval_loss": 0.002537691965699196, "eval_runtime": 27.3857, "eval_samples_per_second": 36.515, "eval_steps_per_second": 4.564, "step": 45000 }, { "epoch": 0.736, "grad_norm": 0.1143941730260849, "learning_rate": 0.0002274168656843291, "loss": 0.0034, "step": 46000 }, { "epoch": 0.752, "grad_norm": 0.04524613544344902, "learning_rate": 0.00022741322433450292, "loss": 0.0034, "step": 47000 }, { "epoch": 0.768, "grad_norm": 0.027965977787971497, "learning_rate": 0.00022740958298467676, "loss": 0.0031, "step": 48000 }, { "epoch": 0.784, "grad_norm": 0.033201005309820175, "learning_rate": 0.0002274059416348506, "loss": 0.0032, "step": 49000 }, { "epoch": 0.8, "grad_norm": 0.11329031735658646, "learning_rate": 0.00022740230028502444, "loss": 0.0034, "step": 50000 }, { "epoch": 0.8, "eval_loss": 0.00293481582775712, "eval_runtime": 27.0954, "eval_samples_per_second": 36.907, "eval_steps_per_second": 4.613, "step": 50000 }, { "epoch": 0.816, "grad_norm": 0.08998037129640579, "learning_rate": 0.00022739865893519828, "loss": 0.003, "step": 51000 }, { "epoch": 0.832, "grad_norm": 0.034506551921367645, "learning_rate": 0.00022739501758537212, "loss": 0.0039, "step": 52000 }, { "epoch": 0.848, "grad_norm": 0.10205531865358353, "learning_rate": 0.00022739137623554596, "loss": 0.0031, "step": 53000 }, { "epoch": 0.864, "grad_norm": 0.016757190227508545, "learning_rate": 0.0002273877348857198, "loss": 0.0024, "step": 54000 }, { "epoch": 0.88, "grad_norm": 0.038212958723306656, "learning_rate": 0.00022738409353589365, "loss": 0.0035, "step": 55000 }, { "epoch": 0.88, "eval_loss": 0.003101126756519079, "eval_runtime": 27.2079, "eval_samples_per_second": 36.754, "eval_steps_per_second": 4.594, "step": 55000 }, { "epoch": 0.896, "grad_norm": 0.20447635650634766, "learning_rate": 0.0002273804521860675, "loss": 0.0029, "step": 56000 }, { "epoch": 0.912, "grad_norm": 0.029786735773086548, "learning_rate": 0.00022737681083624133, "loss": 0.0028, "step": 57000 }, { "epoch": 0.928, "grad_norm": 0.1717972755432129, "learning_rate": 0.00022737316948641517, "loss": 0.0035, "step": 58000 }, { "epoch": 0.944, "grad_norm": 0.051670778542757034, "learning_rate": 0.000227369528136589, "loss": 0.0026, "step": 59000 }, { "epoch": 0.96, "grad_norm": 0.18315136432647705, "learning_rate": 0.00022736588678676285, "loss": 0.0021, "step": 60000 }, { "epoch": 0.96, "eval_loss": 0.0022276523523032665, "eval_runtime": 27.2912, "eval_samples_per_second": 36.642, "eval_steps_per_second": 4.58, "step": 60000 }, { "epoch": 0.976, "grad_norm": 0.2137993574142456, "learning_rate": 0.0002273622454369367, "loss": 0.0031, "step": 61000 }, { "epoch": 0.992, "grad_norm": 0.02584846317768097, "learning_rate": 0.00022735860408711053, "loss": 0.0033, "step": 62000 }, { "epoch": 1.008, "grad_norm": 0.054690442979335785, "learning_rate": 0.00022735496273728438, "loss": 0.0024, "step": 63000 }, { "epoch": 1.024, "grad_norm": 0.01702144928276539, "learning_rate": 0.00022735132138745822, "loss": 0.0032, "step": 64000 }, { "epoch": 1.04, "grad_norm": 0.02373000793159008, "learning_rate": 0.00022734768003763206, "loss": 0.0032, "step": 65000 }, { "epoch": 1.04, "eval_loss": 0.0020411296281963587, "eval_runtime": 27.2492, "eval_samples_per_second": 36.698, "eval_steps_per_second": 4.587, "step": 65000 }, { "epoch": 1.056, "grad_norm": 0.012987160123884678, "learning_rate": 0.0002273440386878059, "loss": 0.002, "step": 66000 }, { "epoch": 1.072, "grad_norm": 0.029065946117043495, "learning_rate": 0.0002273403973379797, "loss": 0.0031, "step": 67000 }, { "epoch": 1.088, "grad_norm": 0.17107020318508148, "learning_rate": 0.00022733675598815358, "loss": 0.0022, "step": 68000 }, { "epoch": 1.104, "grad_norm": 0.019081389531493187, "learning_rate": 0.0002273331146383274, "loss": 0.0031, "step": 69000 }, { "epoch": 1.12, "grad_norm": 0.008192900568246841, "learning_rate": 0.00022732947328850126, "loss": 0.0022, "step": 70000 }, { "epoch": 1.12, "eval_loss": 0.0018265106482431293, "eval_runtime": 27.5029, "eval_samples_per_second": 36.36, "eval_steps_per_second": 4.545, "step": 70000 }, { "epoch": 1.1360000000000001, "grad_norm": 0.009845556691288948, "learning_rate": 0.00022732583193867508, "loss": 0.0033, "step": 71000 }, { "epoch": 1.152, "grad_norm": 0.1637999713420868, "learning_rate": 0.00022732219058884895, "loss": 0.0024, "step": 72000 }, { "epoch": 1.168, "grad_norm": 0.03215477615594864, "learning_rate": 0.00022731854923902276, "loss": 0.0026, "step": 73000 }, { "epoch": 1.184, "grad_norm": 0.028977178037166595, "learning_rate": 0.00022731490788919663, "loss": 0.0023, "step": 74000 }, { "epoch": 1.2, "grad_norm": 0.057766951620578766, "learning_rate": 0.00022731126653937044, "loss": 0.0034, "step": 75000 }, { "epoch": 1.2, "eval_loss": 0.0021767348516732454, "eval_runtime": 27.526, "eval_samples_per_second": 36.329, "eval_steps_per_second": 4.541, "step": 75000 }, { "epoch": 1.216, "grad_norm": 0.00946386530995369, "learning_rate": 0.0002273076251895443, "loss": 0.0021, "step": 76000 }, { "epoch": 1.232, "grad_norm": 0.12553413212299347, "learning_rate": 0.00022730398383971812, "loss": 0.0019, "step": 77000 }, { "epoch": 1.248, "grad_norm": 0.0369916595518589, "learning_rate": 0.000227300342489892, "loss": 0.003, "step": 78000 }, { "epoch": 1.264, "grad_norm": 0.19732122123241425, "learning_rate": 0.0002272967011400658, "loss": 0.0024, "step": 79000 }, { "epoch": 1.28, "grad_norm": 0.02471228875219822, "learning_rate": 0.00022729305979023967, "loss": 0.0024, "step": 80000 }, { "epoch": 1.28, "eval_loss": 0.002593559678643942, "eval_runtime": 27.2923, "eval_samples_per_second": 36.64, "eval_steps_per_second": 4.58, "step": 80000 }, { "epoch": 1.296, "grad_norm": 0.5299795269966125, "learning_rate": 0.0002272894184404135, "loss": 0.0019, "step": 81000 }, { "epoch": 1.312, "grad_norm": 0.03472663834691048, "learning_rate": 0.00022728577709058736, "loss": 0.003, "step": 82000 }, { "epoch": 1.328, "grad_norm": 0.09357739239931107, "learning_rate": 0.00022728213574076117, "loss": 0.0022, "step": 83000 }, { "epoch": 1.3439999999999999, "grad_norm": 0.01810472272336483, "learning_rate": 0.00022727849439093504, "loss": 0.0019, "step": 84000 }, { "epoch": 1.3599999999999999, "grad_norm": 0.024920647963881493, "learning_rate": 0.00022727485304110885, "loss": 0.0021, "step": 85000 }, { "epoch": 1.3599999999999999, "eval_loss": 0.0022232765331864357, "eval_runtime": 27.3645, "eval_samples_per_second": 36.544, "eval_steps_per_second": 4.568, "step": 85000 }, { "epoch": 1.376, "grad_norm": 0.03085111826658249, "learning_rate": 0.00022727121169128272, "loss": 0.0023, "step": 86000 }, { "epoch": 1.392, "grad_norm": 0.010742255486547947, "learning_rate": 0.00022726757034145654, "loss": 0.0019, "step": 87000 }, { "epoch": 1.408, "grad_norm": 0.03559265285730362, "learning_rate": 0.00022726392899163038, "loss": 0.0022, "step": 88000 }, { "epoch": 1.424, "grad_norm": 0.0898701399564743, "learning_rate": 0.00022726028764180422, "loss": 0.0028, "step": 89000 }, { "epoch": 1.44, "grad_norm": 0.026589710265398026, "learning_rate": 0.00022725664629197806, "loss": 0.0016, "step": 90000 }, { "epoch": 1.44, "eval_loss": 0.00150102109182626, "eval_runtime": 27.6944, "eval_samples_per_second": 36.108, "eval_steps_per_second": 4.514, "step": 90000 }, { "epoch": 1.456, "grad_norm": 0.016303159296512604, "learning_rate": 0.0002272530049421519, "loss": 0.0024, "step": 91000 }, { "epoch": 1.472, "grad_norm": 0.01823027804493904, "learning_rate": 0.00022724936359232574, "loss": 0.0018, "step": 92000 }, { "epoch": 1.488, "grad_norm": 0.15236489474773407, "learning_rate": 0.00022724572224249958, "loss": 0.0024, "step": 93000 }, { "epoch": 1.504, "grad_norm": 0.03902558609843254, "learning_rate": 0.00022724208089267342, "loss": 0.0021, "step": 94000 }, { "epoch": 1.52, "grad_norm": 0.020767396315932274, "learning_rate": 0.00022723843954284726, "loss": 0.002, "step": 95000 }, { "epoch": 1.52, "eval_loss": 0.001406910945661366, "eval_runtime": 27.6364, "eval_samples_per_second": 36.184, "eval_steps_per_second": 4.523, "step": 95000 }, { "epoch": 1.536, "grad_norm": 0.09269700944423676, "learning_rate": 0.0002272347981930211, "loss": 0.0023, "step": 96000 }, { "epoch": 1.552, "grad_norm": 0.04058321192860603, "learning_rate": 0.00022723115684319495, "loss": 0.0019, "step": 97000 }, { "epoch": 1.568, "grad_norm": 0.04894057661294937, "learning_rate": 0.0002272275154933688, "loss": 0.0018, "step": 98000 }, { "epoch": 1.584, "grad_norm": 0.04043205827474594, "learning_rate": 0.00022722387414354263, "loss": 0.0022, "step": 99000 }, { "epoch": 1.6, "grad_norm": 0.1002797931432724, "learning_rate": 0.00022722023279371647, "loss": 0.002, "step": 100000 }, { "epoch": 1.6, "eval_loss": 0.0017908032750710845, "eval_runtime": 27.5843, "eval_samples_per_second": 36.253, "eval_steps_per_second": 4.532, "step": 100000 }, { "epoch": 1.616, "grad_norm": 0.02161436155438423, "learning_rate": 0.0002272165914438903, "loss": 0.0018, "step": 101000 }, { "epoch": 1.6320000000000001, "grad_norm": 0.010246573947370052, "learning_rate": 0.00022721295009406415, "loss": 0.002, "step": 102000 }, { "epoch": 1.6480000000000001, "grad_norm": 0.06802576035261154, "learning_rate": 0.000227209308744238, "loss": 0.0015, "step": 103000 }, { "epoch": 1.6640000000000001, "grad_norm": 0.013391965068876743, "learning_rate": 0.00022720566739441183, "loss": 0.0025, "step": 104000 }, { "epoch": 1.6800000000000002, "grad_norm": 0.10946637392044067, "learning_rate": 0.00022720202604458568, "loss": 0.0018, "step": 105000 }, { "epoch": 1.6800000000000002, "eval_loss": 0.0015562042826786637, "eval_runtime": 27.7549, "eval_samples_per_second": 36.03, "eval_steps_per_second": 4.504, "step": 105000 }, { "epoch": 1.696, "grad_norm": 0.028942033648490906, "learning_rate": 0.00022719838469475952, "loss": 0.002, "step": 106000 }, { "epoch": 1.712, "grad_norm": 0.023039843887090683, "learning_rate": 0.00022719474334493333, "loss": 0.0014, "step": 107000 }, { "epoch": 1.728, "grad_norm": 0.010488491505384445, "learning_rate": 0.0002271911019951072, "loss": 0.0016, "step": 108000 }, { "epoch": 1.744, "grad_norm": 0.019485417753458023, "learning_rate": 0.000227187460645281, "loss": 0.002, "step": 109000 }, { "epoch": 1.76, "grad_norm": 0.010597913525998592, "learning_rate": 0.00022718381929545488, "loss": 0.002, "step": 110000 }, { "epoch": 1.76, "eval_loss": 0.000878525257576257, "eval_runtime": 27.823, "eval_samples_per_second": 35.941, "eval_steps_per_second": 4.493, "step": 110000 }, { "epoch": 1.776, "grad_norm": 0.02870281971991062, "learning_rate": 0.0002271801779456287, "loss": 0.0023, "step": 111000 }, { "epoch": 1.792, "grad_norm": 0.041255537420511246, "learning_rate": 0.00022717653659580256, "loss": 0.0014, "step": 112000 }, { "epoch": 1.808, "grad_norm": 0.04701690748333931, "learning_rate": 0.00022717289524597638, "loss": 0.0015, "step": 113000 }, { "epoch": 1.8239999999999998, "grad_norm": 0.059342917054891586, "learning_rate": 0.00022716925389615024, "loss": 0.0028, "step": 114000 }, { "epoch": 1.8399999999999999, "grad_norm": 0.040327928960323334, "learning_rate": 0.00022716561254632406, "loss": 0.0014, "step": 115000 }, { "epoch": 1.8399999999999999, "eval_loss": 0.0016007705125957727, "eval_runtime": 27.7178, "eval_samples_per_second": 36.078, "eval_steps_per_second": 4.51, "step": 115000 }, { "epoch": 1.8559999999999999, "grad_norm": 0.018858684226870537, "learning_rate": 0.00022716197119649793, "loss": 0.002, "step": 116000 }, { "epoch": 1.8719999999999999, "grad_norm": 0.026660999283194542, "learning_rate": 0.00022715832984667174, "loss": 0.0019, "step": 117000 }, { "epoch": 1.888, "grad_norm": 0.08471547812223434, "learning_rate": 0.0002271546884968456, "loss": 0.0016, "step": 118000 }, { "epoch": 1.904, "grad_norm": 0.03236541524529457, "learning_rate": 0.00022715104714701942, "loss": 0.0014, "step": 119000 }, { "epoch": 1.92, "grad_norm": 0.015728328377008438, "learning_rate": 0.0002271474057971933, "loss": 0.0022, "step": 120000 }, { "epoch": 1.92, "eval_loss": 0.0017823727102950215, "eval_runtime": 27.6601, "eval_samples_per_second": 36.153, "eval_steps_per_second": 4.519, "step": 120000 }, { "epoch": 1.936, "grad_norm": 0.2575147747993469, "learning_rate": 0.0002271437644473671, "loss": 0.0015, "step": 121000 }, { "epoch": 1.952, "grad_norm": 0.03020591102540493, "learning_rate": 0.00022714012309754097, "loss": 0.0015, "step": 122000 }, { "epoch": 1.968, "grad_norm": 0.011387010104954243, "learning_rate": 0.0002271364817477148, "loss": 0.0015, "step": 123000 }, { "epoch": 1.984, "grad_norm": 0.033326998353004456, "learning_rate": 0.00022713284039788866, "loss": 0.0019, "step": 124000 }, { "epoch": 2.0, "grad_norm": 0.234897643327713, "learning_rate": 0.00022712919904806247, "loss": 0.0029, "step": 125000 }, { "epoch": 2.0, "eval_loss": 0.00910487212240696, "eval_runtime": 27.7906, "eval_samples_per_second": 35.983, "eval_steps_per_second": 4.498, "step": 125000 }, { "epoch": 2.016, "grad_norm": 0.05067163705825806, "learning_rate": 0.0002271255576982363, "loss": 0.0019, "step": 126000 }, { "epoch": 2.032, "grad_norm": 0.015078851021826267, "learning_rate": 0.00022712191634841015, "loss": 0.0012, "step": 127000 }, { "epoch": 2.048, "grad_norm": 0.03365013748407364, "learning_rate": 0.000227118274998584, "loss": 0.0018, "step": 128000 }, { "epoch": 2.064, "grad_norm": 0.00802704505622387, "learning_rate": 0.00022711463364875783, "loss": 0.0013, "step": 129000 }, { "epoch": 2.08, "grad_norm": 0.011523068882524967, "learning_rate": 0.00022711099229893168, "loss": 0.0021, "step": 130000 }, { "epoch": 2.08, "eval_loss": 0.0009301243117079139, "eval_runtime": 27.505, "eval_samples_per_second": 36.357, "eval_steps_per_second": 4.545, "step": 130000 }, { "epoch": 2.096, "grad_norm": 0.012680677697062492, "learning_rate": 0.00022710735094910552, "loss": 0.0014, "step": 131000 }, { "epoch": 2.112, "grad_norm": 0.0508689247071743, "learning_rate": 0.00022710370959927936, "loss": 0.002, "step": 132000 }, { "epoch": 2.128, "grad_norm": 0.014830244705080986, "learning_rate": 0.0002271000682494532, "loss": 0.001, "step": 133000 }, { "epoch": 2.144, "grad_norm": 0.028912167996168137, "learning_rate": 0.00022709642689962704, "loss": 0.0019, "step": 134000 }, { "epoch": 2.16, "grad_norm": 0.06254349648952484, "learning_rate": 0.00022709278554980088, "loss": 0.0012, "step": 135000 }, { "epoch": 2.16, "eval_loss": 0.0014802517835050821, "eval_runtime": 27.695, "eval_samples_per_second": 36.108, "eval_steps_per_second": 4.513, "step": 135000 }, { "epoch": 2.176, "grad_norm": 0.01877821609377861, "learning_rate": 0.00022708914419997472, "loss": 0.0015, "step": 136000 }, { "epoch": 2.192, "grad_norm": 0.18786460161209106, "learning_rate": 0.00022708550285014856, "loss": 0.0018, "step": 137000 }, { "epoch": 2.208, "grad_norm": 0.016280388459563255, "learning_rate": 0.0002270818615003224, "loss": 0.0015, "step": 138000 }, { "epoch": 2.224, "grad_norm": 0.009028231725096703, "learning_rate": 0.00022707822015049625, "loss": 0.0022, "step": 139000 }, { "epoch": 2.24, "grad_norm": 0.02473852038383484, "learning_rate": 0.0002270745788006701, "loss": 0.0011, "step": 140000 }, { "epoch": 2.24, "eval_loss": 0.0011171329533681273, "eval_runtime": 27.6717, "eval_samples_per_second": 36.138, "eval_steps_per_second": 4.517, "step": 140000 }, { "epoch": 2.2560000000000002, "grad_norm": 0.015900999307632446, "learning_rate": 0.00022707093745084393, "loss": 0.0015, "step": 141000 }, { "epoch": 2.2720000000000002, "grad_norm": 0.018436668440699577, "learning_rate": 0.00022706729610101774, "loss": 0.0015, "step": 142000 }, { "epoch": 2.288, "grad_norm": 0.268839567899704, "learning_rate": 0.0002270636547511916, "loss": 0.0013, "step": 143000 }, { "epoch": 2.304, "grad_norm": 0.024980826303362846, "learning_rate": 0.00022706001340136542, "loss": 0.0017, "step": 144000 }, { "epoch": 2.32, "grad_norm": 0.025631515309214592, "learning_rate": 0.00022705637205153926, "loss": 0.0009, "step": 145000 }, { "epoch": 2.32, "eval_loss": 0.0012023365125060081, "eval_runtime": 27.5991, "eval_samples_per_second": 36.233, "eval_steps_per_second": 4.529, "step": 145000 }, { "epoch": 2.336, "grad_norm": 0.010165953077375889, "learning_rate": 0.0002270527307017131, "loss": 0.0018, "step": 146000 }, { "epoch": 2.352, "grad_norm": 0.012398986145853996, "learning_rate": 0.00022704908935188695, "loss": 0.001, "step": 147000 }, { "epoch": 2.368, "grad_norm": 0.02246440201997757, "learning_rate": 0.0002270454480020608, "loss": 0.0025, "step": 148000 }, { "epoch": 2.384, "grad_norm": 0.018412381410598755, "learning_rate": 0.00022704180665223463, "loss": 0.0008, "step": 149000 }, { "epoch": 2.4, "grad_norm": 0.025599336251616478, "learning_rate": 0.00022703816530240847, "loss": 0.0025, "step": 150000 }, { "epoch": 2.4, "eval_loss": 0.000995820271782577, "eval_runtime": 27.7548, "eval_samples_per_second": 36.03, "eval_steps_per_second": 4.504, "step": 150000 }, { "epoch": 2.416, "grad_norm": 0.03476562350988388, "learning_rate": 0.0002270345239525823, "loss": 0.0016, "step": 151000 }, { "epoch": 2.432, "grad_norm": 0.002502072835341096, "learning_rate": 0.00022703088260275615, "loss": 0.001, "step": 152000 }, { "epoch": 2.448, "grad_norm": 0.09545526653528214, "learning_rate": 0.00022702724125293, "loss": 0.0019, "step": 153000 }, { "epoch": 2.464, "grad_norm": 0.026374874636530876, "learning_rate": 0.00022702359990310383, "loss": 0.0027, "step": 154000 }, { "epoch": 2.48, "grad_norm": 0.02330603636801243, "learning_rate": 0.00022701995855327768, "loss": 0.0013, "step": 155000 }, { "epoch": 2.48, "eval_loss": 0.0009146310039795935, "eval_runtime": 27.6699, "eval_samples_per_second": 36.14, "eval_steps_per_second": 4.518, "step": 155000 }, { "epoch": 2.496, "grad_norm": 0.042115718126297, "learning_rate": 0.00022701631720345152, "loss": 0.001, "step": 156000 }, { "epoch": 2.512, "grad_norm": 0.006467332132160664, "learning_rate": 0.00022701267585362536, "loss": 0.0013, "step": 157000 }, { "epoch": 2.528, "grad_norm": 0.039700523018836975, "learning_rate": 0.0002270090345037992, "loss": 0.0012, "step": 158000 }, { "epoch": 2.544, "grad_norm": 0.006177098024636507, "learning_rate": 0.00022700539315397304, "loss": 0.0032, "step": 159000 }, { "epoch": 2.56, "grad_norm": 0.016644610092043877, "learning_rate": 0.00022700175180414688, "loss": 0.0007, "step": 160000 }, { "epoch": 2.56, "eval_loss": 0.0010344331385567784, "eval_runtime": 27.8065, "eval_samples_per_second": 35.963, "eval_steps_per_second": 4.495, "step": 160000 }, { "epoch": 2.576, "grad_norm": 0.01400495320558548, "learning_rate": 0.00022699811045432072, "loss": 0.0012, "step": 161000 }, { "epoch": 2.592, "grad_norm": 0.016703518107533455, "learning_rate": 0.00022699446910449456, "loss": 0.0012, "step": 162000 }, { "epoch": 2.608, "grad_norm": 0.006359017454087734, "learning_rate": 0.0002269908277546684, "loss": 0.0012, "step": 163000 }, { "epoch": 2.624, "grad_norm": 0.01771441660821438, "learning_rate": 0.00022698718640484222, "loss": 0.0016, "step": 164000 }, { "epoch": 2.64, "grad_norm": 0.01094936951994896, "learning_rate": 0.0002269835450550161, "loss": 0.0011, "step": 165000 }, { "epoch": 2.64, "eval_loss": 0.0007599141681566834, "eval_runtime": 27.7146, "eval_samples_per_second": 36.082, "eval_steps_per_second": 4.51, "step": 165000 }, { "epoch": 2.656, "grad_norm": 0.09152177721261978, "learning_rate": 0.0002269799037051899, "loss": 0.0024, "step": 166000 }, { "epoch": 2.672, "grad_norm": 0.012105804868042469, "learning_rate": 0.00022697626235536377, "loss": 0.0009, "step": 167000 }, { "epoch": 2.6879999999999997, "grad_norm": 0.01530654076486826, "learning_rate": 0.00022697262100553758, "loss": 0.0011, "step": 168000 }, { "epoch": 2.7039999999999997, "grad_norm": 0.031053414568305016, "learning_rate": 0.00022696897965571145, "loss": 0.0015, "step": 169000 }, { "epoch": 2.7199999999999998, "grad_norm": 0.01557753887027502, "learning_rate": 0.00022696533830588527, "loss": 0.001, "step": 170000 }, { "epoch": 2.7199999999999998, "eval_loss": 0.0008088626782409847, "eval_runtime": 27.776, "eval_samples_per_second": 36.002, "eval_steps_per_second": 4.5, "step": 170000 }, { "epoch": 2.7359999999999998, "grad_norm": 0.02831295132637024, "learning_rate": 0.00022696169695605913, "loss": 0.0014, "step": 171000 }, { "epoch": 2.752, "grad_norm": 0.017672572284936905, "learning_rate": 0.00022695805560623295, "loss": 0.0011, "step": 172000 }, { "epoch": 2.768, "grad_norm": 0.018164193257689476, "learning_rate": 0.00022695441425640682, "loss": 0.0019, "step": 173000 }, { "epoch": 2.784, "grad_norm": 0.017383994534611702, "learning_rate": 0.00022695077290658063, "loss": 0.001, "step": 174000 }, { "epoch": 2.8, "grad_norm": 0.006576849147677422, "learning_rate": 0.0002269471315567545, "loss": 0.0011, "step": 175000 }, { "epoch": 2.8, "eval_loss": 0.0006260189693421125, "eval_runtime": 27.3919, "eval_samples_per_second": 36.507, "eval_steps_per_second": 4.563, "step": 175000 }, { "epoch": 2.816, "grad_norm": 0.019615883007645607, "learning_rate": 0.0002269434902069283, "loss": 0.0012, "step": 176000 }, { "epoch": 2.832, "grad_norm": 0.03926165774464607, "learning_rate": 0.00022693984885710218, "loss": 0.0014, "step": 177000 }, { "epoch": 2.848, "grad_norm": 0.021534917876124382, "learning_rate": 0.000226936207507276, "loss": 0.0012, "step": 178000 }, { "epoch": 2.864, "grad_norm": 0.04047563299536705, "learning_rate": 0.00022693256615744986, "loss": 0.001, "step": 179000 }, { "epoch": 2.88, "grad_norm": 0.04712160676717758, "learning_rate": 0.00022692892480762368, "loss": 0.0015, "step": 180000 }, { "epoch": 2.88, "eval_loss": 0.0013630291214212775, "eval_runtime": 27.4056, "eval_samples_per_second": 36.489, "eval_steps_per_second": 4.561, "step": 180000 }, { "epoch": 2.896, "grad_norm": 0.21584591269493103, "learning_rate": 0.00022692528345779754, "loss": 0.0019, "step": 181000 }, { "epoch": 2.912, "grad_norm": 0.015519549138844013, "learning_rate": 0.00022692164210797136, "loss": 0.0012, "step": 182000 }, { "epoch": 2.928, "grad_norm": 0.0314391665160656, "learning_rate": 0.00022691800075814523, "loss": 0.0009, "step": 183000 }, { "epoch": 2.944, "grad_norm": 0.16906876862049103, "learning_rate": 0.00022691435940831904, "loss": 0.0013, "step": 184000 }, { "epoch": 2.96, "grad_norm": 0.04538990557193756, "learning_rate": 0.00022691071805849288, "loss": 0.001, "step": 185000 }, { "epoch": 2.96, "eval_loss": 0.0014080323744565248, "eval_runtime": 27.3828, "eval_samples_per_second": 36.519, "eval_steps_per_second": 4.565, "step": 185000 }, { "epoch": 2.976, "grad_norm": 0.008023149333894253, "learning_rate": 0.00022690707670866672, "loss": 0.0013, "step": 186000 }, { "epoch": 2.992, "grad_norm": 0.011926773004233837, "learning_rate": 0.00022690343535884056, "loss": 0.0012, "step": 187000 }, { "epoch": 3.008, "grad_norm": 0.01701526716351509, "learning_rate": 0.0002268997940090144, "loss": 0.0011, "step": 188000 }, { "epoch": 3.024, "grad_norm": 0.015581037849187851, "learning_rate": 0.00022689615265918825, "loss": 0.0013, "step": 189000 }, { "epoch": 3.04, "grad_norm": 0.012046800926327705, "learning_rate": 0.0002268925113093621, "loss": 0.001, "step": 190000 }, { "epoch": 3.04, "eval_loss": 0.0010119588114321232, "eval_runtime": 27.6665, "eval_samples_per_second": 36.145, "eval_steps_per_second": 4.518, "step": 190000 }, { "epoch": 3.056, "grad_norm": 0.009263888001441956, "learning_rate": 0.00022688886995953593, "loss": 0.001, "step": 191000 }, { "epoch": 3.072, "grad_norm": 0.0538918599486351, "learning_rate": 0.00022688522860970977, "loss": 0.0012, "step": 192000 }, { "epoch": 3.088, "grad_norm": 0.0521121546626091, "learning_rate": 0.0002268815872598836, "loss": 0.0017, "step": 193000 }, { "epoch": 3.104, "grad_norm": 0.05000779777765274, "learning_rate": 0.00022687794591005745, "loss": 0.0008, "step": 194000 }, { "epoch": 3.12, "grad_norm": 0.06467895954847336, "learning_rate": 0.0002268743045602313, "loss": 0.0011, "step": 195000 }, { "epoch": 3.12, "eval_loss": 0.0008815609035082161, "eval_runtime": 27.5652, "eval_samples_per_second": 36.278, "eval_steps_per_second": 4.535, "step": 195000 }, { "epoch": 3.136, "grad_norm": 0.01422048918902874, "learning_rate": 0.00022687066321040513, "loss": 0.0011, "step": 196000 }, { "epoch": 3.152, "grad_norm": 0.02482694387435913, "learning_rate": 0.00022686702186057897, "loss": 0.0011, "step": 197000 }, { "epoch": 3.168, "grad_norm": 0.03517874330282211, "learning_rate": 0.00022686338051075282, "loss": 0.0017, "step": 198000 }, { "epoch": 3.184, "grad_norm": 0.027310600504279137, "learning_rate": 0.00022685973916092666, "loss": 0.0008, "step": 199000 }, { "epoch": 3.2, "grad_norm": 0.06521017849445343, "learning_rate": 0.0002268560978111005, "loss": 0.002, "step": 200000 }, { "epoch": 3.2, "eval_loss": 0.00754576688632369, "eval_runtime": 27.5143, "eval_samples_per_second": 36.345, "eval_steps_per_second": 4.543, "step": 200000 }, { "epoch": 3.216, "grad_norm": 0.24959920346736908, "learning_rate": 0.00022685245646127434, "loss": 0.0008, "step": 201000 }, { "epoch": 3.232, "grad_norm": 0.010456324554979801, "learning_rate": 0.00022684881511144818, "loss": 0.0011, "step": 202000 }, { "epoch": 3.248, "grad_norm": 0.010797294788062572, "learning_rate": 0.00022684517376162202, "loss": 0.0011, "step": 203000 }, { "epoch": 3.2640000000000002, "grad_norm": 0.04222773015499115, "learning_rate": 0.00022684153241179584, "loss": 0.001, "step": 204000 }, { "epoch": 3.2800000000000002, "grad_norm": 0.03277302905917168, "learning_rate": 0.0002268378910619697, "loss": 0.0015, "step": 205000 }, { "epoch": 3.2800000000000002, "eval_loss": 0.0007634469075128436, "eval_runtime": 27.7742, "eval_samples_per_second": 36.005, "eval_steps_per_second": 4.501, "step": 205000 }, { "epoch": 3.296, "grad_norm": 0.0069810631684958935, "learning_rate": 0.00022683424971214352, "loss": 0.001, "step": 206000 }, { "epoch": 3.312, "grad_norm": 0.01147681474685669, "learning_rate": 0.00022683060836231739, "loss": 0.0009, "step": 207000 }, { "epoch": 3.328, "grad_norm": 0.009766928851604462, "learning_rate": 0.0002268269670124912, "loss": 0.0019, "step": 208000 }, { "epoch": 3.344, "grad_norm": 0.03460145741701126, "learning_rate": 0.00022682332566266507, "loss": 0.0008, "step": 209000 }, { "epoch": 3.36, "grad_norm": 0.016247229650616646, "learning_rate": 0.00022681968431283888, "loss": 0.001, "step": 210000 }, { "epoch": 3.36, "eval_loss": 0.0010268606711179018, "eval_runtime": 26.9593, "eval_samples_per_second": 37.093, "eval_steps_per_second": 4.637, "step": 210000 }, { "epoch": 3.376, "grad_norm": 0.012766228057444096, "learning_rate": 0.00022681604296301275, "loss": 0.0008, "step": 211000 }, { "epoch": 3.392, "grad_norm": 0.005086794961243868, "learning_rate": 0.00022681240161318656, "loss": 0.0014, "step": 212000 }, { "epoch": 3.408, "grad_norm": 0.028264038264751434, "learning_rate": 0.00022680876026336043, "loss": 0.0011, "step": 213000 }, { "epoch": 3.424, "grad_norm": 0.05160939320921898, "learning_rate": 0.00022680511891353425, "loss": 0.0009, "step": 214000 }, { "epoch": 3.44, "grad_norm": 0.02259020321071148, "learning_rate": 0.00022680147756370811, "loss": 0.0012, "step": 215000 }, { "epoch": 3.44, "eval_loss": 0.0007602461846545339, "eval_runtime": 26.8881, "eval_samples_per_second": 37.191, "eval_steps_per_second": 4.649, "step": 215000 }, { "epoch": 3.456, "grad_norm": 0.03077981248497963, "learning_rate": 0.00022679783621388193, "loss": 0.0012, "step": 216000 }, { "epoch": 3.472, "grad_norm": 0.027997983619570732, "learning_rate": 0.0002267941948640558, "loss": 0.0008, "step": 217000 }, { "epoch": 3.488, "grad_norm": 0.009089149534702301, "learning_rate": 0.0002267905535142296, "loss": 0.0011, "step": 218000 }, { "epoch": 3.504, "grad_norm": 0.09043902903795242, "learning_rate": 0.00022678691216440348, "loss": 0.0011, "step": 219000 }, { "epoch": 3.52, "grad_norm": 0.06199198588728905, "learning_rate": 0.0002267832708145773, "loss": 0.0011, "step": 220000 }, { "epoch": 3.52, "eval_loss": 0.001106478739529848, "eval_runtime": 27.0055, "eval_samples_per_second": 37.029, "eval_steps_per_second": 4.629, "step": 220000 }, { "epoch": 3.536, "grad_norm": 0.013115255162119865, "learning_rate": 0.00022677962946475116, "loss": 0.0015, "step": 221000 }, { "epoch": 3.552, "grad_norm": 0.030206598341464996, "learning_rate": 0.00022677598811492498, "loss": 0.001, "step": 222000 }, { "epoch": 3.568, "grad_norm": 0.014335270039737225, "learning_rate": 0.00022677234676509882, "loss": 0.0008, "step": 223000 }, { "epoch": 3.584, "grad_norm": 0.04320364445447922, "learning_rate": 0.00022676870541527266, "loss": 0.001, "step": 224000 }, { "epoch": 3.6, "grad_norm": 0.01011396199464798, "learning_rate": 0.0002267650640654465, "loss": 0.0014, "step": 225000 }, { "epoch": 3.6, "eval_loss": 0.0008724904037080705, "eval_runtime": 26.7598, "eval_samples_per_second": 37.37, "eval_steps_per_second": 4.671, "step": 225000 }, { "epoch": 3.616, "grad_norm": 0.06343936175107956, "learning_rate": 0.00022676142271562034, "loss": 0.0009, "step": 226000 }, { "epoch": 3.632, "grad_norm": 0.04553668946027756, "learning_rate": 0.00022675778136579418, "loss": 0.001, "step": 227000 }, { "epoch": 3.648, "grad_norm": 0.0029150221962481737, "learning_rate": 0.00022675414001596802, "loss": 0.0018, "step": 228000 }, { "epoch": 3.664, "grad_norm": 0.03533324971795082, "learning_rate": 0.00022675049866614186, "loss": 0.0017, "step": 229000 }, { "epoch": 3.68, "grad_norm": 0.020134087651968002, "learning_rate": 0.0002267468573163157, "loss": 0.0013, "step": 230000 }, { "epoch": 3.68, "eval_loss": 0.001037033973261714, "eval_runtime": 27.1191, "eval_samples_per_second": 36.874, "eval_steps_per_second": 4.609, "step": 230000 }, { "epoch": 3.6959999999999997, "grad_norm": 0.01976308599114418, "learning_rate": 0.00022674321596648955, "loss": 0.0009, "step": 231000 }, { "epoch": 3.7119999999999997, "grad_norm": 0.05415629222989082, "learning_rate": 0.00022673957461666339, "loss": 0.0012, "step": 232000 }, { "epoch": 3.7279999999999998, "grad_norm": 0.020477378740906715, "learning_rate": 0.00022673593326683723, "loss": 0.001, "step": 233000 }, { "epoch": 3.7439999999999998, "grad_norm": 0.014153924770653248, "learning_rate": 0.00022673229191701107, "loss": 0.0017, "step": 234000 }, { "epoch": 3.76, "grad_norm": 0.02030963823199272, "learning_rate": 0.0002267286505671849, "loss": 0.0007, "step": 235000 }, { "epoch": 3.76, "eval_loss": 0.0007908450206741691, "eval_runtime": 27.0159, "eval_samples_per_second": 37.015, "eval_steps_per_second": 4.627, "step": 235000 }, { "epoch": 3.776, "grad_norm": 0.03953304514288902, "learning_rate": 0.00022672500921735875, "loss": 0.0008, "step": 236000 }, { "epoch": 3.792, "grad_norm": 0.007172519341111183, "learning_rate": 0.0002267213678675326, "loss": 0.0016, "step": 237000 }, { "epoch": 3.808, "grad_norm": 0.03694753348827362, "learning_rate": 0.00022671772651770643, "loss": 0.0008, "step": 238000 }, { "epoch": 3.824, "grad_norm": 0.04899757727980614, "learning_rate": 0.00022671408516788027, "loss": 0.0011, "step": 239000 }, { "epoch": 3.84, "grad_norm": 0.05499159172177315, "learning_rate": 0.00022671044381805412, "loss": 0.0013, "step": 240000 }, { "epoch": 3.84, "eval_loss": 0.0008275896543636918, "eval_runtime": 27.099, "eval_samples_per_second": 36.902, "eval_steps_per_second": 4.613, "step": 240000 }, { "epoch": 3.856, "grad_norm": 0.02498927153646946, "learning_rate": 0.00022670680246822796, "loss": 0.0008, "step": 241000 }, { "epoch": 3.872, "grad_norm": 0.02703891508281231, "learning_rate": 0.00022670316111840177, "loss": 0.0009, "step": 242000 }, { "epoch": 3.888, "grad_norm": 0.010871395468711853, "learning_rate": 0.00022669951976857564, "loss": 0.0009, "step": 243000 }, { "epoch": 3.904, "grad_norm": 0.006647611036896706, "learning_rate": 0.00022669587841874945, "loss": 0.0019, "step": 244000 }, { "epoch": 3.92, "grad_norm": 0.11232209205627441, "learning_rate": 0.00022669223706892332, "loss": 0.0006, "step": 245000 }, { "epoch": 3.92, "eval_loss": 0.004233696032315493, "eval_runtime": 27.2042, "eval_samples_per_second": 36.759, "eval_steps_per_second": 4.595, "step": 245000 }, { "epoch": 3.936, "grad_norm": 0.03585943579673767, "learning_rate": 0.00022668859571909713, "loss": 0.0012, "step": 246000 }, { "epoch": 3.952, "grad_norm": 0.028422392904758453, "learning_rate": 0.000226684954369271, "loss": 0.0009, "step": 247000 }, { "epoch": 3.968, "grad_norm": 0.029626131057739258, "learning_rate": 0.00022668131301944482, "loss": 0.0009, "step": 248000 }, { "epoch": 3.984, "grad_norm": 0.01423815730959177, "learning_rate": 0.00022667767166961866, "loss": 0.0011, "step": 249000 }, { "epoch": 4.0, "grad_norm": 0.028744470328092575, "learning_rate": 0.0002266740303197925, "loss": 0.0012, "step": 250000 }, { "epoch": 4.0, "eval_loss": 0.0009512793621979654, "eval_runtime": 27.0826, "eval_samples_per_second": 36.924, "eval_steps_per_second": 4.616, "step": 250000 }, { "epoch": 4.016, "grad_norm": 0.05679468810558319, "learning_rate": 0.00022667038896996634, "loss": 0.0008, "step": 251000 }, { "epoch": 4.032, "grad_norm": 0.01259209681302309, "learning_rate": 0.00022666674762014018, "loss": 0.0012, "step": 252000 }, { "epoch": 4.048, "grad_norm": 0.02058994211256504, "learning_rate": 0.00022666310627031402, "loss": 0.0007, "step": 253000 }, { "epoch": 4.064, "grad_norm": 0.028425488620996475, "learning_rate": 0.00022665946492048786, "loss": 0.0017, "step": 254000 }, { "epoch": 4.08, "grad_norm": 0.035576559603214264, "learning_rate": 0.0002266558235706617, "loss": 0.0008, "step": 255000 }, { "epoch": 4.08, "eval_loss": 0.0006453625974245369, "eval_runtime": 27.5028, "eval_samples_per_second": 36.36, "eval_steps_per_second": 4.545, "step": 255000 } ], "logging_steps": 1000, "max_steps": 62500000, "num_input_tokens_seen": 0, "num_train_epochs": 1000, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.9094798804101104e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }