{
  "best_global_step": 50000,
  "best_metric": 2.1760547161102295,
  "best_model_checkpoint": "./sky-389m-tx-project/checkpoint-50000",
  "epoch": 1.7269964078474715,
  "eval_steps": 1000,
  "global_step": 50000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0034539928156949434,
      "grad_norm": 3.3880717754364014,
      "learning_rate": 2.4750000000000002e-05,
      "loss": 8.5386,
      "step": 100
    },
    {
      "epoch": 0.006907985631389887,
      "grad_norm": 2.3290209770202637,
      "learning_rate": 4.975e-05,
      "loss": 6.7703,
      "step": 200
    },
    {
      "epoch": 0.01036197844708483,
      "grad_norm": 2.278391122817993,
      "learning_rate": 7.475e-05,
      "loss": 6.0778,
      "step": 300
    },
    {
      "epoch": 0.013815971262779773,
      "grad_norm": 1.8386310338974,
      "learning_rate": 9.975000000000001e-05,
      "loss": 5.7081,
      "step": 400
    },
    {
      "epoch": 0.017269964078474715,
      "grad_norm": 1.0589734315872192,
      "learning_rate": 0.00012475,
      "loss": 5.3779,
      "step": 500
    },
    {
      "epoch": 0.02072395689416966,
      "grad_norm": 1.060039758682251,
      "learning_rate": 0.00014975,
      "loss": 5.0624,
      "step": 600
    },
    {
      "epoch": 0.024177949709864602,
      "grad_norm": 1.0565826892852783,
      "learning_rate": 0.00017475,
      "loss": 4.8215,
      "step": 700
    },
    {
      "epoch": 0.027631942525559547,
      "grad_norm": 0.9314415454864502,
      "learning_rate": 0.00019975,
      "loss": 4.5897,
      "step": 800
    },
    {
      "epoch": 0.03108593534125449,
      "grad_norm": 0.9964447021484375,
      "learning_rate": 0.00022475000000000001,
      "loss": 4.4049,
      "step": 900
    },
    {
      "epoch": 0.03453992815694943,
      "grad_norm": 0.9212857484817505,
      "learning_rate": 0.00024975,
      "loss": 4.2051,
      "step": 1000
    },
    {
      "epoch": 0.03453992815694943,
      "eval_loss": 4.235040187835693,
      "eval_runtime": 935.7314,
      "eval_samples_per_second": 162.843,
      "eval_steps_per_second": 1.629,
      "step": 1000
    },
    {
      "epoch": 0.037993920972644375,
      "grad_norm": 0.8937363028526306,
      "learning_rate": 0.00027475,
      "loss": 4.0112,
      "step": 1100
    },
    {
      "epoch": 0.04144791378833932,
      "grad_norm": 0.8522709012031555,
      "learning_rate": 0.00029975000000000005,
      "loss": 3.8575,
      "step": 1200
    },
    {
      "epoch": 0.044901906604034265,
      "grad_norm": 0.8380929231643677,
      "learning_rate": 0.00032475,
      "loss": 3.7379,
      "step": 1300
    },
    {
      "epoch": 0.048355899419729204,
      "grad_norm": 0.704521894454956,
      "learning_rate": 0.00034975,
      "loss": 3.6267,
      "step": 1400
    },
    {
      "epoch": 0.05180989223542415,
      "grad_norm": 0.7526060938835144,
      "learning_rate": 0.00037475000000000003,
      "loss": 3.5288,
      "step": 1500
    },
    {
      "epoch": 0.055263885051119094,
      "grad_norm": 0.8213881254196167,
      "learning_rate": 0.00039975,
      "loss": 3.4769,
      "step": 1600
    },
    {
      "epoch": 0.05871787786681404,
      "grad_norm": 0.6610364317893982,
      "learning_rate": 0.00042475000000000005,
      "loss": 3.4406,
      "step": 1700
    },
    {
      "epoch": 0.06217187068250898,
      "grad_norm": 0.8810706734657288,
      "learning_rate": 0.00044975,
      "loss": 3.3477,
      "step": 1800
    },
    {
      "epoch": 0.06562586349820393,
      "grad_norm": 1.3641518354415894,
      "learning_rate": 0.00047475,
      "loss": 3.3183,
      "step": 1900
    },
    {
      "epoch": 0.06907985631389886,
      "grad_norm": 0.6155968904495239,
      "learning_rate": 0.0004997500000000001,
      "loss": 3.3016,
      "step": 2000
    },
    {
      "epoch": 0.06907985631389886,
      "eval_loss": 3.551506280899048,
      "eval_runtime": 937.0574,
      "eval_samples_per_second": 162.612,
      "eval_steps_per_second": 1.626,
      "step": 2000
    },
    {
      "epoch": 0.0725338491295938,
      "grad_norm": 0.5413870215415955,
      "learning_rate": 0.0004994375,
      "loss": 3.2393,
      "step": 2100
    },
    {
      "epoch": 0.07598784194528875,
      "grad_norm": 0.5018215775489807,
      "learning_rate": 0.0004988693181818182,
      "loss": 3.1889,
      "step": 2200
    },
    {
      "epoch": 0.0794418347609837,
      "grad_norm": 0.5307313203811646,
      "learning_rate": 0.0004983011363636364,
      "loss": 3.1391,
      "step": 2300
    },
    {
      "epoch": 0.08289582757667864,
      "grad_norm": 0.5371428728103638,
      "learning_rate": 0.0004977329545454545,
      "loss": 3.1102,
      "step": 2400
    },
    {
      "epoch": 0.08634982039237359,
      "grad_norm": 0.5079624652862549,
      "learning_rate": 0.0004971647727272727,
      "loss": 3.0868,
      "step": 2500
    },
    {
      "epoch": 0.08980381320806853,
      "grad_norm": 0.4562855660915375,
      "learning_rate": 0.000496596590909091,
      "loss": 3.0448,
      "step": 2600
    },
    {
      "epoch": 0.09325780602376348,
      "grad_norm": 0.4660443067550659,
      "learning_rate": 0.0004960284090909092,
      "loss": 3.0341,
      "step": 2700
    },
    {
      "epoch": 0.09671179883945841,
      "grad_norm": 0.48204541206359863,
      "learning_rate": 0.0004954602272727273,
      "loss": 2.9917,
      "step": 2800
    },
    {
      "epoch": 0.10016579165515335,
      "grad_norm": 0.43557730317115784,
      "learning_rate": 0.0004948920454545454,
      "loss": 2.9464,
      "step": 2900
    },
    {
      "epoch": 0.1036197844708483,
      "grad_norm": 0.5063506364822388,
      "learning_rate": 0.0004943238636363637,
      "loss": 2.9463,
      "step": 3000
    },
    {
      "epoch": 0.1036197844708483,
      "eval_loss": 3.2142741680145264,
      "eval_runtime": 937.7254,
      "eval_samples_per_second": 162.496,
      "eval_steps_per_second": 1.625,
      "step": 3000
    },
    {
      "epoch": 0.10707377728654324,
      "grad_norm": 0.4616130590438843,
      "learning_rate": 0.0004937556818181818,
      "loss": 2.9168,
      "step": 3100
    },
    {
      "epoch": 0.11052777010223819,
      "grad_norm": 0.447933554649353,
      "learning_rate": 0.0004931875,
      "loss": 2.9172,
      "step": 3200
    },
    {
      "epoch": 0.11398176291793313,
      "grad_norm": 0.4423768222332001,
      "learning_rate": 0.0004926193181818183,
      "loss": 2.8891,
      "step": 3300
    },
    {
      "epoch": 0.11743575573362808,
      "grad_norm": 0.4318563640117645,
      "learning_rate": 0.0004920511363636364,
      "loss": 2.8479,
      "step": 3400
    },
    {
      "epoch": 0.12088974854932302,
      "grad_norm": 0.41672056913375854,
      "learning_rate": 0.0004914829545454545,
      "loss": 2.8462,
      "step": 3500
    },
    {
      "epoch": 0.12434374136501795,
      "grad_norm": 0.3930817246437073,
      "learning_rate": 0.0004909147727272727,
      "loss": 2.8219,
      "step": 3600
    },
    {
      "epoch": 0.1277977341807129,
      "grad_norm": 0.4133651852607727,
      "learning_rate": 0.0004903465909090909,
      "loss": 2.7957,
      "step": 3700
    },
    {
      "epoch": 0.13125172699640786,
      "grad_norm": 0.40811586380004883,
      "learning_rate": 0.0004897784090909091,
      "loss": 2.7879,
      "step": 3800
    },
    {
      "epoch": 0.1347057198121028,
      "grad_norm": 0.42227643728256226,
      "learning_rate": 0.0004892102272727273,
      "loss": 2.7716,
      "step": 3900
    },
    {
      "epoch": 0.13815971262779772,
      "grad_norm": 0.41413313150405884,
      "learning_rate": 0.0004886420454545455,
      "loss": 2.7606,
      "step": 4000
    },
    {
      "epoch": 0.13815971262779772,
      "eval_loss": 3.061166286468506,
      "eval_runtime": 936.0728,
      "eval_samples_per_second": 162.783,
      "eval_steps_per_second": 1.628,
      "step": 4000
    },
    {
      "epoch": 0.14161370544349267,
      "grad_norm": 0.40972092747688293,
      "learning_rate": 0.00048807386363636365,
      "loss": 2.7386,
      "step": 4100
    },
    {
      "epoch": 0.1450676982591876,
      "grad_norm": 0.4020697772502899,
      "learning_rate": 0.00048750568181818183,
      "loss": 2.7522,
      "step": 4200
    },
    {
      "epoch": 0.14852169107488256,
      "grad_norm": 0.40231621265411377,
      "learning_rate": 0.0004869375,
      "loss": 2.7442,
      "step": 4300
    },
    {
      "epoch": 0.1519756838905775,
      "grad_norm": 0.455773264169693,
      "learning_rate": 0.0004863693181818182,
      "loss": 2.7146,
      "step": 4400
    },
    {
      "epoch": 0.15542967670627245,
      "grad_norm": 0.38691282272338867,
      "learning_rate": 0.00048580113636363637,
      "loss": 2.6924,
      "step": 4500
    },
    {
      "epoch": 0.1588836695219674,
      "grad_norm": 0.3897066116333008,
      "learning_rate": 0.00048523295454545455,
      "loss": 2.6964,
      "step": 4600
    },
    {
      "epoch": 0.16233766233766234,
      "grad_norm": 0.3786475360393524,
      "learning_rate": 0.00048466477272727273,
      "loss": 2.6566,
      "step": 4700
    },
    {
      "epoch": 0.16579165515335728,
      "grad_norm": 0.3838929235935211,
      "learning_rate": 0.0004840965909090909,
      "loss": 2.6634,
      "step": 4800
    },
    {
      "epoch": 0.16924564796905223,
      "grad_norm": 0.3646841049194336,
      "learning_rate": 0.0004835284090909091,
      "loss": 2.6708,
      "step": 4900
    },
    {
      "epoch": 0.17269964078474717,
      "grad_norm": 0.37178680300712585,
      "learning_rate": 0.0004829602272727273,
      "loss": 2.6389,
      "step": 5000
    },
    {
      "epoch": 0.17269964078474717,
      "eval_loss": 2.940995216369629,
      "eval_runtime": 935.4231,
      "eval_samples_per_second": 162.896,
      "eval_steps_per_second": 1.629,
      "step": 5000
    },
    {
      "epoch": 0.17615363360044212,
      "grad_norm": 0.37742722034454346,
      "learning_rate": 0.00048239204545454545,
      "loss": 2.644,
      "step": 5100
    },
    {
      "epoch": 0.17960762641613706,
      "grad_norm": 0.3702583611011505,
      "learning_rate": 0.00048182386363636363,
      "loss": 2.624,
      "step": 5200
    },
    {
      "epoch": 0.183061619231832,
      "grad_norm": 0.4044618308544159,
      "learning_rate": 0.0004812556818181818,
      "loss": 2.6197,
      "step": 5300
    },
    {
      "epoch": 0.18651561204752695,
      "grad_norm": 0.3829458951950073,
      "learning_rate": 0.0004806875,
      "loss": 2.614,
      "step": 5400
    },
    {
      "epoch": 0.1899696048632219,
      "grad_norm": 0.3829841911792755,
      "learning_rate": 0.0004801193181818182,
      "loss": 2.6118,
      "step": 5500
    },
    {
      "epoch": 0.19342359767891681,
      "grad_norm": 0.3528871238231659,
      "learning_rate": 0.0004795511363636364,
      "loss": 2.6041,
      "step": 5600
    },
    {
      "epoch": 0.19687759049461176,
      "grad_norm": 0.3476055860519409,
      "learning_rate": 0.00047898295454545454,
      "loss": 2.5908,
      "step": 5700
    },
    {
      "epoch": 0.2003315833103067,
      "grad_norm": 0.3490158021450043,
      "learning_rate": 0.0004784147727272727,
      "loss": 2.569,
      "step": 5800
    },
    {
      "epoch": 0.20378557612600165,
      "grad_norm": 0.3507535457611084,
      "learning_rate": 0.00047784659090909095,
      "loss": 2.5502,
      "step": 5900
    },
    {
      "epoch": 0.2072395689416966,
      "grad_norm": 0.37472763657569885,
      "learning_rate": 0.0004772784090909091,
      "loss": 2.5656,
      "step": 6000
    },
    {
      "epoch": 0.2072395689416966,
      "eval_loss": 2.869264602661133,
      "eval_runtime": 927.5758,
      "eval_samples_per_second": 164.274,
      "eval_steps_per_second": 1.027,
      "step": 6000
    },
    {
      "epoch": 0.21069356175739154,
      "grad_norm": 0.34653300046920776,
      "learning_rate": 0.00047671022727272726,
      "loss": 2.5509,
      "step": 6100
    },
    {
      "epoch": 0.21414755457308648,
      "grad_norm": 0.3335779011249542,
      "learning_rate": 0.0004761420454545455,
      "loss": 2.5421,
      "step": 6200
    },
    {
      "epoch": 0.21760154738878143,
      "grad_norm": 0.37146443128585815,
      "learning_rate": 0.0004755738636363636,
      "loss": 2.5438,
      "step": 6300
    },
    {
      "epoch": 0.22105554020447638,
      "grad_norm": 0.33024120330810547,
      "learning_rate": 0.0004750056818181818,
      "loss": 2.5318,
      "step": 6400
    },
    {
      "epoch": 0.22450953302017132,
      "grad_norm": 0.3545812666416168,
      "learning_rate": 0.00047443750000000003,
      "loss": 2.5167,
      "step": 6500
    },
    {
      "epoch": 0.22796352583586627,
      "grad_norm": 0.3502351641654968,
      "learning_rate": 0.00047386931818181816,
      "loss": 2.5247,
      "step": 6600
    },
    {
      "epoch": 0.2314175186515612,
      "grad_norm": 0.35102933645248413,
      "learning_rate": 0.00047330113636363634,
      "loss": 2.5271,
      "step": 6700
    },
    {
      "epoch": 0.23487151146725616,
      "grad_norm": 0.34355252981185913,
      "learning_rate": 0.0004727329545454546,
      "loss": 2.536,
      "step": 6800
    },
    {
      "epoch": 0.2383255042829511,
      "grad_norm": 0.3270651400089264,
      "learning_rate": 0.00047216477272727275,
      "loss": 2.5081,
      "step": 6900
    },
    {
      "epoch": 0.24177949709864605,
      "grad_norm": 0.35053566098213196,
      "learning_rate": 0.0004715965909090909,
      "loss": 2.4945,
      "step": 7000
    },
    {
      "epoch": 0.24177949709864605,
      "eval_loss": 2.804372549057007,
      "eval_runtime": 927.2887,
      "eval_samples_per_second": 164.325,
      "eval_steps_per_second": 1.028,
      "step": 7000
    },
    {
      "epoch": 0.245233489914341,
      "grad_norm": 0.3321439325809479,
      "learning_rate": 0.0004710284090909091,
      "loss": 2.482,
      "step": 7100
    },
    {
      "epoch": 0.2486874827300359,
      "grad_norm": 0.3228578567504883,
      "learning_rate": 0.0004704602272727273,
      "loss": 2.4787,
      "step": 7200
    },
    {
      "epoch": 0.25214147554573085,
      "grad_norm": 0.3319440186023712,
      "learning_rate": 0.0004698920454545454,
      "loss": 2.4704,
      "step": 7300
    },
    {
      "epoch": 0.2555954683614258,
      "grad_norm": 0.34676915407180786,
      "learning_rate": 0.00046932386363636366,
      "loss": 2.479,
      "step": 7400
    },
    {
      "epoch": 0.25904946117712074,
      "grad_norm": 0.3456803560256958,
      "learning_rate": 0.00046875568181818184,
      "loss": 2.462,
      "step": 7500
    },
    {
      "epoch": 0.2625034539928157,
      "grad_norm": 0.330388605594635,
      "learning_rate": 0.00046818749999999996,
      "loss": 2.4638,
      "step": 7600
    },
    {
      "epoch": 0.26595744680851063,
      "grad_norm": 0.3278537690639496,
      "learning_rate": 0.0004676193181818182,
      "loss": 2.456,
      "step": 7700
    },
    {
      "epoch": 0.2694114396242056,
      "grad_norm": 0.331632137298584,
      "learning_rate": 0.0004670511363636364,
      "loss": 2.4459,
      "step": 7800
    },
    {
      "epoch": 0.2728654324399005,
      "grad_norm": 0.34204795956611633,
      "learning_rate": 0.0004664829545454545,
      "loss": 2.4545,
      "step": 7900
    },
    {
      "epoch": 0.27631942525559544,
      "grad_norm": 0.33582791686058044,
      "learning_rate": 0.00046591477272727274,
      "loss": 2.4377,
      "step": 8000
    },
    {
      "epoch": 0.27631942525559544,
      "eval_loss": 2.753157138824463,
      "eval_runtime": 927.4163,
      "eval_samples_per_second": 164.303,
      "eval_steps_per_second": 1.028,
      "step": 8000
    },
    {
      "epoch": 0.2797734180712904,
      "grad_norm": 0.3626213073730469,
      "learning_rate": 0.0004653465909090909,
      "loss": 2.4395,
      "step": 8100
    },
    {
      "epoch": 0.28322741088698533,
      "grad_norm": 0.33439400792121887,
      "learning_rate": 0.00046477840909090915,
      "loss": 2.4267,
      "step": 8200
    },
    {
      "epoch": 0.2866814037026803,
      "grad_norm": 0.31855249404907227,
      "learning_rate": 0.0004642102272727273,
      "loss": 2.4349,
      "step": 8300
    },
    {
      "epoch": 0.2901353965183752,
      "grad_norm": 0.3519601821899414,
      "learning_rate": 0.00046364204545454546,
      "loss": 2.4248,
      "step": 8400
    },
    {
      "epoch": 0.2935893893340702,
      "grad_norm": 0.31838154792785645,
      "learning_rate": 0.0004630738636363637,
      "loss": 2.3968,
      "step": 8500
    },
    {
      "epoch": 0.2970433821497651,
      "grad_norm": 0.3294484317302704,
      "learning_rate": 0.0004625056818181818,
      "loss": 2.4162,
      "step": 8600
    },
    {
      "epoch": 0.3004973749654601,
      "grad_norm": 0.31714752316474915,
      "learning_rate": 0.0004619375,
      "loss": 2.4073,
      "step": 8700
    },
    {
      "epoch": 0.303951367781155,
      "grad_norm": 0.32918691635131836,
      "learning_rate": 0.00046136931818181823,
      "loss": 2.4229,
      "step": 8800
    },
    {
      "epoch": 0.30740536059685,
      "grad_norm": 0.3097570538520813,
      "learning_rate": 0.00046080113636363636,
      "loss": 2.3966,
      "step": 8900
    },
    {
      "epoch": 0.3108593534125449,
      "grad_norm": 0.3247508406639099,
      "learning_rate": 0.00046023295454545454,
      "loss": 2.3925,
      "step": 9000
    },
    {
      "epoch": 0.3108593534125449,
      "eval_loss": 2.705599308013916,
      "eval_runtime": 927.4293,
      "eval_samples_per_second": 164.3,
      "eval_steps_per_second": 1.028,
      "step": 9000
    },
    {
      "epoch": 0.31431334622823986,
      "grad_norm": 0.3189142048358917,
      "learning_rate": 0.0004596647727272728,
      "loss": 2.3932,
      "step": 9100
    },
    {
      "epoch": 0.3177673390439348,
      "grad_norm": 0.3028543293476105,
      "learning_rate": 0.0004590965909090909,
      "loss": 2.3787,
      "step": 9200
    },
    {
      "epoch": 0.32122133185962976,
      "grad_norm": 0.3109678030014038,
      "learning_rate": 0.0004585284090909091,
      "loss": 2.3665,
      "step": 9300
    },
    {
      "epoch": 0.3246753246753247,
      "grad_norm": 0.31394320726394653,
      "learning_rate": 0.0004579602272727273,
      "loss": 2.3722,
      "step": 9400
    },
    {
      "epoch": 0.32812931749101965,
      "grad_norm": 0.3214563727378845,
      "learning_rate": 0.0004573920454545455,
      "loss": 2.3586,
      "step": 9500
    },
    {
      "epoch": 0.33158331030671456,
      "grad_norm": 0.33052927255630493,
      "learning_rate": 0.0004568238636363636,
      "loss": 2.3838,
      "step": 9600
    },
    {
      "epoch": 0.3350373031224095,
      "grad_norm": 0.3511188328266144,
      "learning_rate": 0.00045625568181818186,
      "loss": 2.3588,
      "step": 9700
    },
    {
      "epoch": 0.33849129593810445,
      "grad_norm": 0.31076422333717346,
      "learning_rate": 0.00045568750000000004,
      "loss": 2.3552,
      "step": 9800
    },
    {
      "epoch": 0.34194528875379937,
      "grad_norm": 0.32571229338645935,
      "learning_rate": 0.00045511931818181816,
      "loss": 2.3496,
      "step": 9900
    },
    {
      "epoch": 0.34539928156949434,
      "grad_norm": 0.3375560939311981,
      "learning_rate": 0.0004545511363636364,
      "loss": 2.3526,
      "step": 10000
    },
    {
      "epoch": 0.34539928156949434,
      "eval_loss": 2.6650397777557373,
      "eval_runtime": 927.3482,
      "eval_samples_per_second": 164.315,
      "eval_steps_per_second": 1.028,
      "step": 10000
    },
    {
      "epoch": 0.34885327438518926,
      "grad_norm": 0.3159004747867584,
      "learning_rate": 0.0004539829545454546,
      "loss": 2.3606,
      "step": 10100
    },
    {
      "epoch": 0.35230726720088423,
      "grad_norm": 0.32001519203186035,
      "learning_rate": 0.0004534147727272727,
      "loss": 2.37,
      "step": 10200
    },
    {
      "epoch": 0.35576126001657915,
      "grad_norm": 0.328933447599411,
      "learning_rate": 0.00045284659090909094,
      "loss": 2.3515,
      "step": 10300
    },
    {
      "epoch": 0.3592152528322741,
      "grad_norm": 0.3076813817024231,
      "learning_rate": 0.0004522784090909091,
      "loss": 2.3276,
      "step": 10400
    },
    {
      "epoch": 0.36266924564796904,
      "grad_norm": 0.3153810501098633,
      "learning_rate": 0.00045171022727272725,
      "loss": 2.3373,
      "step": 10500
    },
    {
      "epoch": 0.366123238463664,
      "grad_norm": 0.32247108221054077,
      "learning_rate": 0.0004511420454545455,
      "loss": 2.3335,
      "step": 10600
    },
    {
      "epoch": 0.36957723127935893,
      "grad_norm": 0.3074076771736145,
      "learning_rate": 0.00045057386363636366,
      "loss": 2.325,
      "step": 10700
    },
    {
      "epoch": 0.3730312240950539,
      "grad_norm": 0.31907033920288086,
      "learning_rate": 0.0004500056818181818,
      "loss": 2.3155,
      "step": 10800
    },
    {
      "epoch": 0.3764852169107488,
      "grad_norm": 0.32912886142730713,
      "learning_rate": 0.0004494375,
      "loss": 2.324,
      "step": 10900
    },
    {
      "epoch": 0.3799392097264438,
      "grad_norm": 0.3003767430782318,
      "learning_rate": 0.0004488693181818182,
      "loss": 2.3222,
      "step": 11000
    },
    {
      "epoch": 0.3799392097264438,
      "eval_loss": 2.633434772491455,
      "eval_runtime": 927.254,
      "eval_samples_per_second": 164.331,
      "eval_steps_per_second": 1.028,
      "step": 11000
    },
    {
      "epoch": 0.3833932025421387,
      "grad_norm": 0.3144666254520416,
      "learning_rate": 0.0004483011363636364,
      "loss": 2.3251,
      "step": 11100
    },
    {
      "epoch": 0.38684719535783363,
      "grad_norm": 0.3284156322479248,
      "learning_rate": 0.00044773295454545456,
      "loss": 2.3033,
      "step": 11200
    },
    {
      "epoch": 0.3903011881735286,
      "grad_norm": 0.3226972222328186,
      "learning_rate": 0.00044716477272727274,
      "loss": 2.296,
      "step": 11300
    },
    {
      "epoch": 0.3937551809892235,
      "grad_norm": 0.34044623374938965,
      "learning_rate": 0.0004465965909090909,
      "loss": 2.3198,
      "step": 11400
    },
    {
      "epoch": 0.3972091738049185,
      "grad_norm": 0.3101319372653961,
      "learning_rate": 0.0004460284090909091,
      "loss": 2.3107,
      "step": 11500
    },
    {
      "epoch": 0.4006631666206134,
      "grad_norm": 0.3044012486934662,
      "learning_rate": 0.0004454602272727273,
      "loss": 2.2984,
      "step": 11600
    },
    {
      "epoch": 0.4041171594363084,
      "grad_norm": 0.3155890107154846,
      "learning_rate": 0.00044489204545454546,
      "loss": 2.2968,
      "step": 11700
    },
    {
      "epoch": 0.4075711522520033,
      "grad_norm": 0.33918723464012146,
      "learning_rate": 0.00044432386363636364,
      "loss": 2.2707,
      "step": 11800
    },
    {
      "epoch": 0.41102514506769827,
      "grad_norm": 0.30243411660194397,
      "learning_rate": 0.0004437556818181818,
      "loss": 2.2979,
      "step": 11900
    },
    {
      "epoch": 0.4144791378833932,
      "grad_norm": 0.3046514391899109,
      "learning_rate": 0.0004431875,
      "loss": 2.2809,
      "step": 12000
    },
    {
      "epoch": 0.4144791378833932,
      "eval_loss": 2.61051344871521,
      "eval_runtime": 927.0521,
      "eval_samples_per_second": 164.367,
      "eval_steps_per_second": 1.028,
      "step": 12000
    },
    {
      "epoch": 0.41793313069908816,
      "grad_norm": 0.32584163546562195,
      "learning_rate": 0.0004426193181818182,
      "loss": 2.2876,
      "step": 12100
    },
    {
      "epoch": 0.4213871235147831,
      "grad_norm": 0.34489238262176514,
      "learning_rate": 0.00044205113636363637,
      "loss": 2.298,
      "step": 12200
    },
    {
      "epoch": 0.42484111633047805,
      "grad_norm": 0.30355241894721985,
      "learning_rate": 0.00044148295454545455,
      "loss": 2.2767,
      "step": 12300
    },
    {
      "epoch": 0.42829510914617297,
      "grad_norm": 0.3140780031681061,
      "learning_rate": 0.0004409147727272728,
      "loss": 2.2779,
      "step": 12400
    },
    {
      "epoch": 0.43174910196186794,
      "grad_norm": 0.31298449635505676,
      "learning_rate": 0.0004403465909090909,
      "loss": 2.2831,
      "step": 12500
    },
    {
      "epoch": 0.43520309477756286,
      "grad_norm": 0.32630786299705505,
      "learning_rate": 0.0004397784090909091,
      "loss": 2.2698,
      "step": 12600
    },
    {
      "epoch": 0.43865708759325783,
      "grad_norm": 0.303371399641037,
      "learning_rate": 0.0004392102272727273,
      "loss": 2.2767,
      "step": 12700
    },
    {
      "epoch": 0.44211108040895275,
      "grad_norm": 0.30070436000823975,
      "learning_rate": 0.00043864204545454545,
      "loss": 2.2449,
      "step": 12800
    },
    {
      "epoch": 0.44556507322464767,
      "grad_norm": 0.2887287139892578,
      "learning_rate": 0.00043807386363636363,
      "loss": 2.2688,
      "step": 12900
    },
    {
      "epoch": 0.44901906604034264,
      "grad_norm": 0.306916743516922,
      "learning_rate": 0.00043750568181818186,
      "loss": 2.2557,
      "step": 13000
    },
    {
      "epoch": 0.44901906604034264,
      "eval_loss": 2.5854439735412598,
      "eval_runtime": 926.6658,
      "eval_samples_per_second": 164.436,
      "eval_steps_per_second": 1.028,
      "step": 13000
    },
    {
      "epoch": 0.45247305885603756,
      "grad_norm": 0.34850597381591797,
      "learning_rate": 0.0004369375,
      "loss": 2.2423,
      "step": 13100
    },
    {
      "epoch": 0.45592705167173253,
      "grad_norm": 0.35393500328063965,
      "learning_rate": 0.00043636931818181817,
      "loss": 2.2543,
      "step": 13200
    },
    {
      "epoch": 0.45938104448742745,
      "grad_norm": 0.3059336543083191,
      "learning_rate": 0.0004358011363636364,
      "loss": 2.2516,
      "step": 13300
    },
    {
      "epoch": 0.4628350373031224,
      "grad_norm": 0.3357197344303131,
      "learning_rate": 0.00043523295454545453,
      "loss": 2.2328,
      "step": 13400
    },
    {
      "epoch": 0.46628903011881734,
      "grad_norm": 0.31849172711372375,
      "learning_rate": 0.0004346647727272727,
      "loss": 2.2424,
      "step": 13500
    },
    {
      "epoch": 0.4697430229345123,
      "grad_norm": 0.31968438625335693,
      "learning_rate": 0.00043409659090909094,
      "loss": 2.2228,
      "step": 13600
    },
    {
      "epoch": 0.47319701575020723,
      "grad_norm": 0.3293677568435669,
      "learning_rate": 0.0004335284090909091,
      "loss": 2.2555,
      "step": 13700
    },
    {
      "epoch": 0.4766510085659022,
      "grad_norm": 0.3031880855560303,
      "learning_rate": 0.00043296022727272725,
      "loss": 2.2387,
      "step": 13800
    },
    {
      "epoch": 0.4801050013815971,
      "grad_norm": 0.2914179563522339,
      "learning_rate": 0.0004323920454545455,
      "loss": 2.2494,
      "step": 13900
    },
    {
      "epoch": 0.4835589941972921,
      "grad_norm": 0.3345280587673187,
      "learning_rate": 0.00043182386363636367,
      "loss": 2.2322,
      "step": 14000
    },
    {
      "epoch": 0.4835589941972921,
      "eval_loss": 2.5606906414031982,
      "eval_runtime": 929.2705,
      "eval_samples_per_second": 163.975,
      "eval_steps_per_second": 1.026,
      "step": 14000
    },
    {
      "epoch": 0.487012987012987,
      "grad_norm": 0.3165434002876282,
      "learning_rate": 0.0004312556818181818,
      "loss": 2.2266,
      "step": 14100
    },
    {
      "epoch": 0.490466979828682,
      "grad_norm": 0.30577775835990906,
      "learning_rate": 0.0004306875,
      "loss": 2.2401,
      "step": 14200
    },
    {
      "epoch": 0.4939209726443769,
      "grad_norm": 0.2920292019844055,
      "learning_rate": 0.0004301193181818182,
      "loss": 2.2203,
      "step": 14300
    },
    {
      "epoch": 0.4973749654600718,
      "grad_norm": 0.32168078422546387,
      "learning_rate": 0.00042955113636363633,
      "loss": 2.2064,
      "step": 14400
    },
    {
      "epoch": 0.5008289582757668,
      "grad_norm": 0.31879886984825134,
      "learning_rate": 0.00042898295454545457,
      "loss": 2.219,
      "step": 14500
    },
    {
      "epoch": 0.5042829510914617,
      "grad_norm": 0.2906869351863861,
      "learning_rate": 0.00042841477272727275,
      "loss": 2.2055,
      "step": 14600
    },
    {
      "epoch": 0.5077369439071566,
      "grad_norm": 0.3648407757282257,
      "learning_rate": 0.0004278465909090909,
      "loss": 2.2157,
      "step": 14700
    },
    {
      "epoch": 0.5111909367228517,
      "grad_norm": 0.30823054909706116,
      "learning_rate": 0.0004272784090909091,
      "loss": 2.2158,
      "step": 14800
    },
    {
      "epoch": 0.5146449295385466,
      "grad_norm": 0.3004588782787323,
      "learning_rate": 0.0004267102272727273,
      "loss": 2.2009,
      "step": 14900
    },
    {
      "epoch": 0.5180989223542415,
      "grad_norm": 0.29552149772644043,
      "learning_rate": 0.00042614204545454547,
      "loss": 2.2194,
      "step": 15000
    },
    {
      "epoch": 0.5180989223542415,
      "eval_loss": 2.537440538406372,
      "eval_runtime": 928.9697,
      "eval_samples_per_second": 164.028,
      "eval_steps_per_second": 1.026,
      "step": 15000
    },
    {
      "epoch": 0.5215529151699364,
      "grad_norm": 0.3077145516872406,
      "learning_rate": 0.00042557386363636365,
      "loss": 2.199,
      "step": 15100
    },
    {
      "epoch": 0.5250069079856314,
      "grad_norm": 0.32205095887184143,
      "learning_rate": 0.00042500568181818183,
      "loss": 2.2045,
      "step": 15200
    },
    {
      "epoch": 0.5284609008013263,
      "grad_norm": 0.30157867074012756,
      "learning_rate": 0.0004244375,
      "loss": 2.1926,
      "step": 15300
    },
    {
      "epoch": 0.5319148936170213,
      "grad_norm": 0.35868486762046814,
      "learning_rate": 0.0004238693181818182,
      "loss": 2.1911,
      "step": 15400
    },
    {
      "epoch": 0.5353688864327162,
      "grad_norm": 0.3132970631122589,
      "learning_rate": 0.00042330113636363637,
      "loss": 2.193,
      "step": 15500
    },
    {
      "epoch": 0.5388228792484112,
      "grad_norm": 0.31356823444366455,
      "learning_rate": 0.00042273295454545455,
      "loss": 2.1959,
      "step": 15600
    },
    {
      "epoch": 0.5422768720641061,
      "grad_norm": 0.31471192836761475,
      "learning_rate": 0.00042216477272727273,
      "loss": 2.2069,
      "step": 15700
    },
    {
      "epoch": 0.545730864879801,
      "grad_norm": 0.33163174986839294,
      "learning_rate": 0.0004215965909090909,
      "loss": 2.1929,
      "step": 15800
    },
    {
      "epoch": 0.549184857695496,
      "grad_norm": 0.31774455308914185,
      "learning_rate": 0.0004210284090909091,
      "loss": 2.1816,
      "step": 15900
    },
    {
      "epoch": 0.5526388505111909,
      "grad_norm": 0.30572381615638733,
      "learning_rate": 0.00042046022727272727,
      "loss": 2.206,
      "step": 16000
    },
    {
      "epoch": 0.5526388505111909,
      "eval_loss": 2.52417254447937,
      "eval_runtime": 928.7275,
      "eval_samples_per_second": 164.071,
      "eval_steps_per_second": 1.026,
      "step": 16000
    },
    {
      "epoch": 0.5560928433268859,
      "grad_norm": 0.3196762502193451,
      "learning_rate": 0.00041989204545454545,
      "loss": 2.1801,
      "step": 16100
    },
    {
      "epoch": 0.5595468361425808,
      "grad_norm": 0.3148038685321808,
      "learning_rate": 0.00041932386363636363,
      "loss": 2.1722,
      "step": 16200
    },
    {
      "epoch": 0.5630008289582757,
      "grad_norm": 0.32507434487342834,
      "learning_rate": 0.0004187556818181818,
      "loss": 2.1836,
      "step": 16300
    },
    {
      "epoch": 0.5664548217739707,
      "grad_norm": 0.3227043151855469,
      "learning_rate": 0.0004181875,
      "loss": 2.1794,
      "step": 16400
    },
    {
      "epoch": 0.5699088145896657,
      "grad_norm": 0.3271748721599579,
      "learning_rate": 0.0004176193181818182,
      "loss": 2.1786,
      "step": 16500
    },
    {
      "epoch": 0.5733628074053606,
      "grad_norm": 0.31076040863990784,
      "learning_rate": 0.0004170511363636364,
      "loss": 2.1616,
      "step": 16600
    },
    {
      "epoch": 0.5768168002210555,
      "grad_norm": 0.32442960143089294,
      "learning_rate": 0.00041648295454545453,
      "loss": 2.1642,
      "step": 16700
    },
    {
      "epoch": 0.5802707930367504,
      "grad_norm": 0.2945985794067383,
      "learning_rate": 0.0004159147727272727,
      "loss": 2.1641,
      "step": 16800
    },
    {
      "epoch": 0.5837247858524455,
      "grad_norm": 0.32005414366722107,
      "learning_rate": 0.00041534659090909095,
      "loss": 2.1968,
      "step": 16900
    },
    {
      "epoch": 0.5871787786681404,
      "grad_norm": 0.31035128235816956,
      "learning_rate": 0.0004147784090909091,
      "loss": 2.1735,
      "step": 17000
    },
    {
      "epoch": 0.5871787786681404,
      "eval_loss": 2.5003573894500732,
      "eval_runtime": 929.6579,
      "eval_samples_per_second": 163.907,
      "eval_steps_per_second": 1.025,
      "step": 17000
    },
    {
      "epoch": 0.5906327714838353,
      "grad_norm": 0.3103092908859253,
      "learning_rate": 0.00041421022727272726,
      "loss": 2.1625,
      "step": 17100
    },
    {
      "epoch": 0.5940867642995302,
      "grad_norm": 0.3217906057834625,
      "learning_rate": 0.0004136420454545455,
      "loss": 2.1485,
      "step": 17200
    },
    {
      "epoch": 0.5975407571152253,
      "grad_norm": 0.2988424301147461,
      "learning_rate": 0.0004130738636363636,
      "loss": 2.1628,
      "step": 17300
    },
    {
      "epoch": 0.6009947499309202,
      "grad_norm": 0.3058546185493469,
      "learning_rate": 0.0004125056818181818,
      "loss": 2.1701,
      "step": 17400
    },
    {
      "epoch": 0.6044487427466151,
      "grad_norm": 0.3056589961051941,
      "learning_rate": 0.00041193750000000003,
      "loss": 2.1515,
      "step": 17500
    },
    {
      "epoch": 0.60790273556231,
      "grad_norm": 0.31840789318084717,
      "learning_rate": 0.00041136931818181816,
      "loss": 2.1536,
      "step": 17600
    },
    {
      "epoch": 0.6113567283780049,
      "grad_norm": 0.3044828772544861,
      "learning_rate": 0.00041080113636363634,
      "loss": 2.162,
      "step": 17700
    },
    {
      "epoch": 0.6148107211937,
      "grad_norm": 0.30973371863365173,
      "learning_rate": 0.00041023295454545457,
      "loss": 2.1498,
      "step": 17800
    },
    {
      "epoch": 0.6182647140093949,
      "grad_norm": 0.30947718024253845,
      "learning_rate": 0.00040966477272727275,
      "loss": 2.1435,
      "step": 17900
    },
    {
      "epoch": 0.6217187068250898,
      "grad_norm": 0.34587281942367554,
      "learning_rate": 0.0004090965909090909,
      "loss": 2.1504,
      "step": 18000
    },
    {
      "epoch": 0.6217187068250898,
      "eval_loss": 2.484160900115967,
      "eval_runtime": 927.1891,
      "eval_samples_per_second": 164.343,
      "eval_steps_per_second": 1.028,
      "step": 18000
    },
    {
      "epoch": 0.6251726996407847,
      "grad_norm": 0.30945053696632385,
      "learning_rate": 0.0004085284090909091,
      "loss": 2.1545,
      "step": 18100
    },
    {
      "epoch": 0.6286266924564797,
      "grad_norm": 0.3018719255924225,
      "learning_rate": 0.0004079602272727273,
      "loss": 2.1439,
      "step": 18200
    },
    {
      "epoch": 0.6320806852721746,
      "grad_norm": 0.3113386332988739,
      "learning_rate": 0.0004073920454545454,
      "loss": 2.1225,
      "step": 18300
    },
    {
      "epoch": 0.6355346780878696,
      "grad_norm": 0.29737088084220886,
      "learning_rate": 0.00040682386363636365,
      "loss": 2.1286,
      "step": 18400
    },
    {
      "epoch": 0.6389886709035645,
      "grad_norm": 0.31960177421569824,
      "learning_rate": 0.00040625568181818183,
      "loss": 2.1249,
      "step": 18500
    },
    {
      "epoch": 0.6424426637192595,
      "grad_norm": 0.3072162866592407,
      "learning_rate": 0.00040568749999999996,
      "loss": 2.1348,
      "step": 18600
    },
    {
      "epoch": 0.6458966565349544,
      "grad_norm": 0.3196597397327423,
      "learning_rate": 0.0004051193181818182,
      "loss": 2.1408,
      "step": 18700
    },
    {
      "epoch": 0.6493506493506493,
      "grad_norm": 0.3315812051296234,
      "learning_rate": 0.0004045511363636364,
      "loss": 2.1439,
      "step": 18800
    },
    {
      "epoch": 0.6528046421663443,
      "grad_norm": 0.2933200001716614,
      "learning_rate": 0.0004039829545454545,
      "loss": 2.1465,
      "step": 18900
    },
    {
      "epoch": 0.6562586349820393,
      "grad_norm": 0.33558085560798645,
      "learning_rate": 0.00040341477272727274,
      "loss": 2.1416,
      "step": 19000
    },
    {
      "epoch": 0.6562586349820393,
      "eval_loss": 2.473646640777588,
      "eval_runtime": 926.9255,
      "eval_samples_per_second": 164.39,
      "eval_steps_per_second": 1.028,
      "step": 19000
    },
    {
      "epoch": 0.6597126277977342,
      "grad_norm": 0.2992997169494629,
      "learning_rate": 0.0004028465909090909,
      "loss": 2.1386,
      "step": 19100
    },
    {
      "epoch": 0.6631666206134291,
      "grad_norm": 0.3051714599132538,
      "learning_rate": 0.00040227840909090915,
      "loss": 2.1225,
      "step": 19200
    },
    {
      "epoch": 0.666620613429124,
      "grad_norm": 0.31724849343299866,
      "learning_rate": 0.0004017102272727273,
      "loss": 2.1174,
      "step": 19300
    },
    {
      "epoch": 0.670074606244819,
      "grad_norm": 0.2937643826007843,
      "learning_rate": 0.00040114204545454546,
      "loss": 2.1279,
      "step": 19400
    },
    {
      "epoch": 0.673528599060514,
      "grad_norm": 0.31908687949180603,
      "learning_rate": 0.0004005738636363637,
      "loss": 2.1151,
      "step": 19500
    },
    {
      "epoch": 0.6769825918762089,
      "grad_norm": 0.31399762630462646,
      "learning_rate": 0.0004000056818181818,
      "loss": 2.1378,
      "step": 19600
    },
    {
      "epoch": 0.6804365846919038,
      "grad_norm": 0.3157575726509094,
      "learning_rate": 0.0003994375,
      "loss": 2.1149,
      "step": 19700
    },
    {
      "epoch": 0.6838905775075987,
      "grad_norm": 0.32018882036209106,
      "learning_rate": 0.00039886931818181823,
      "loss": 2.0993,
      "step": 19800
    },
    {
      "epoch": 0.6873445703232938,
      "grad_norm": 0.31708574295043945,
      "learning_rate": 0.00039830113636363636,
      "loss": 2.1132,
      "step": 19900
    },
    {
      "epoch": 0.6907985631389887,
      "grad_norm": 0.2904827892780304,
      "learning_rate": 0.00039773295454545454,
      "loss": 2.1088,
      "step": 20000
    },
    {
      "epoch": 0.6907985631389887,
      "eval_loss": 2.4520211219787598,
      "eval_runtime": 926.891,
      "eval_samples_per_second": 164.396,
      "eval_steps_per_second": 1.028,
      "step": 20000
    },
    {
      "epoch": 0.6942525559546836,
      "grad_norm": 0.3298169672489166,
      "learning_rate": 0.0003971647727272728,
      "loss": 2.1177,
      "step": 20100
    },
    {
      "epoch": 0.6977065487703785,
      "grad_norm": 0.291166752576828,
      "learning_rate": 0.0003965965909090909,
      "loss": 2.0954,
      "step": 20200
    },
    {
      "epoch": 0.7011605415860735,
      "grad_norm": 0.3211086094379425,
      "learning_rate": 0.0003960284090909091,
      "loss": 2.1209,
      "step": 20300
    },
    {
      "epoch": 0.7046145344017685,
      "grad_norm": 0.3161545395851135,
      "learning_rate": 0.0003954602272727273,
      "loss": 2.1149,
      "step": 20400
    },
    {
      "epoch": 0.7080685272174634,
      "grad_norm": 0.3262562155723572,
      "learning_rate": 0.0003948920454545455,
      "loss": 2.1204,
      "step": 20500
    },
    {
      "epoch": 0.7115225200331583,
      "grad_norm": 0.3347005546092987,
      "learning_rate": 0.0003943238636363636,
      "loss": 2.104,
      "step": 20600
    },
    {
      "epoch": 0.7149765128488533,
      "grad_norm": 0.30474451184272766,
      "learning_rate": 0.00039375568181818186,
      "loss": 2.0955,
      "step": 20700
    },
    {
      "epoch": 0.7184305056645482,
      "grad_norm": 0.32672184705734253,
      "learning_rate": 0.00039318750000000004,
      "loss": 2.0998,
      "step": 20800
    },
    {
      "epoch": 0.7218844984802432,
      "grad_norm": 0.3041098713874817,
      "learning_rate": 0.00039261931818181816,
      "loss": 2.0897,
      "step": 20900
    },
    {
      "epoch": 0.7253384912959381,
      "grad_norm": 0.351904034614563,
      "learning_rate": 0.0003920511363636364,
      "loss": 2.0925,
      "step": 21000
    },
    {
      "epoch": 0.7253384912959381,
      "eval_loss": 2.4404454231262207,
      "eval_runtime": 927.0383,
      "eval_samples_per_second": 164.37,
      "eval_steps_per_second": 1.028,
      "step": 21000
    },
    {
      "epoch": 0.728792484111633,
      "grad_norm": 0.34308210015296936,
      "learning_rate": 0.0003914829545454546,
      "loss": 2.1039,
      "step": 21100
    },
    {
      "epoch": 0.732246476927328,
      "grad_norm": 0.3298318088054657,
      "learning_rate": 0.0003909147727272727,
      "loss": 2.0774,
      "step": 21200
    },
    {
      "epoch": 0.735700469743023,
      "grad_norm": 0.3102123737335205,
      "learning_rate": 0.00039034659090909094,
      "loss": 2.1111,
      "step": 21300
    },
    {
      "epoch": 0.7391544625587179,
      "grad_norm": 0.3186514973640442,
      "learning_rate": 0.0003897784090909091,
      "loss": 2.084,
      "step": 21400
    },
    {
      "epoch": 0.7426084553744128,
      "grad_norm": 0.31114721298217773,
      "learning_rate": 0.00038921022727272724,
      "loss": 2.1037,
      "step": 21500
    },
    {
      "epoch": 0.7460624481901078,
      "grad_norm": 0.330563485622406,
      "learning_rate": 0.0003886420454545455,
      "loss": 2.0831,
      "step": 21600
    },
    {
      "epoch": 0.7495164410058027,
      "grad_norm": 0.3088129460811615,
      "learning_rate": 0.00038807386363636366,
      "loss": 2.0914,
      "step": 21700
    },
    {
      "epoch": 0.7529704338214976,
      "grad_norm": 0.28733545541763306,
      "learning_rate": 0.0003875056818181818,
      "loss": 2.0955,
      "step": 21800
    },
    {
      "epoch": 0.7564244266371926,
      "grad_norm": 0.3190239667892456,
      "learning_rate": 0.0003869375,
      "loss": 2.0828,
      "step": 21900
    },
    {
      "epoch": 0.7598784194528876,
      "grad_norm": 0.3163771331310272,
      "learning_rate": 0.0003863693181818182,
      "loss": 2.0786,
      "step": 22000
    },
    {
      "epoch": 0.7598784194528876,
      "eval_loss": 2.4309139251708984,
      "eval_runtime": 926.6419,
      "eval_samples_per_second": 164.44,
      "eval_steps_per_second": 1.028,
      "step": 22000
    },
    {
      "epoch": 0.7633324122685825,
      "grad_norm": 0.2819238603115082,
      "learning_rate": 0.0003858011363636364,
      "loss": 2.092,
      "step": 22100
    },
    {
      "epoch": 0.7667864050842774,
      "grad_norm": 0.31991979479789734,
      "learning_rate": 0.00038523295454545456,
      "loss": 2.0628,
      "step": 22200
    },
    {
      "epoch": 0.7702403978999723,
      "grad_norm": 0.3094194233417511,
      "learning_rate": 0.00038466477272727274,
      "loss": 2.0826,
      "step": 22300
    },
    {
      "epoch": 0.7736943907156673,
      "grad_norm": 0.30959707498550415,
      "learning_rate": 0.0003840965909090909,
      "loss": 2.0858,
      "step": 22400
    },
    {
      "epoch": 0.7771483835313623,
      "grad_norm": 0.30589380860328674,
      "learning_rate": 0.0003835284090909091,
      "loss": 2.0864,
      "step": 22500
    },
    {
      "epoch": 0.7806023763470572,
      "grad_norm": 0.3400673270225525,
      "learning_rate": 0.0003829602272727273,
      "loss": 2.069,
      "step": 22600
    },
    {
      "epoch": 0.7840563691627521,
      "grad_norm": 0.3428845703601837,
      "learning_rate": 0.00038239204545454546,
      "loss": 2.0622,
      "step": 22700
    },
    {
      "epoch": 0.787510361978447,
      "grad_norm": 0.3274592459201813,
      "learning_rate": 0.00038182386363636364,
      "loss": 2.0714,
      "step": 22800
    },
    {
      "epoch": 0.7909643547941421,
      "grad_norm": 0.3281017243862152,
      "learning_rate": 0.0003812556818181818,
      "loss": 2.0856,
      "step": 22900
    },
    {
      "epoch": 0.794418347609837,
      "grad_norm": 0.32381513714790344,
      "learning_rate": 0.0003806875,
      "loss": 2.0687,
      "step": 23000
    },
    {
      "epoch": 0.794418347609837,
      "eval_loss": 2.416405439376831,
      "eval_runtime": 926.9677,
      "eval_samples_per_second": 164.382,
      "eval_steps_per_second": 1.028,
      "step": 23000
    },
    {
      "epoch": 0.7978723404255319,
      "grad_norm": 0.31997501850128174,
      "learning_rate": 0.0003801193181818182,
      "loss": 2.0923,
      "step": 23100
    },
    {
      "epoch": 0.8013263332412268,
      "grad_norm": 0.315775603055954,
      "learning_rate": 0.00037955113636363636,
      "loss": 2.0578,
      "step": 23200
    },
    {
      "epoch": 0.8047803260569218,
      "grad_norm": 0.3135242462158203,
      "learning_rate": 0.00037898295454545454,
      "loss": 2.0604,
      "step": 23300
    },
    {
      "epoch": 0.8082343188726168,
      "grad_norm": 0.33324697613716125,
      "learning_rate": 0.0003784147727272728,
      "loss": 2.0776,
      "step": 23400
    },
    {
      "epoch": 0.8116883116883117,
      "grad_norm": 0.3114740252494812,
      "learning_rate": 0.0003778465909090909,
      "loss": 2.0679,
      "step": 23500
    },
    {
      "epoch": 0.8151423045040066,
      "grad_norm": 0.37432342767715454,
      "learning_rate": 0.0003772784090909091,
      "loss": 2.0685,
      "step": 23600
    },
    {
      "epoch": 0.8185962973197016,
      "grad_norm": 0.31538712978363037,
      "learning_rate": 0.0003767102272727273,
      "loss": 2.0687,
      "step": 23700
    },
    {
      "epoch": 0.8220502901353965,
      "grad_norm": 0.3598659336566925,
      "learning_rate": 0.00037614204545454545,
      "loss": 2.0909,
      "step": 23800
    },
    {
      "epoch": 0.8255042829510915,
      "grad_norm": 0.3034459948539734,
      "learning_rate": 0.0003755738636363636,
      "loss": 2.0588,
      "step": 23900
    },
    {
      "epoch": 0.8289582757667864,
      "grad_norm": 0.3221229016780853,
      "learning_rate": 0.00037500568181818186,
      "loss": 2.0661,
      "step": 24000
    },
    {
      "epoch": 0.8289582757667864,
      "eval_loss": 2.4008617401123047,
      "eval_runtime": 927.2556,
      "eval_samples_per_second": 164.331,
      "eval_steps_per_second": 1.028,
      "step": 24000
    },
    {
      "epoch": 0.8324122685824813,
      "grad_norm": 0.3049459755420685,
      "learning_rate": 0.0003744375,
      "loss": 2.0428,
      "step": 24100
    },
    {
      "epoch": 0.8358662613981763,
      "grad_norm": 0.3034842908382416,
      "learning_rate": 0.00037386931818181817,
      "loss": 2.0639,
      "step": 24200
    },
    {
      "epoch": 0.8393202542138712,
      "grad_norm": 0.3170601427555084,
      "learning_rate": 0.0003733011363636364,
      "loss": 2.0606,
      "step": 24300
    },
    {
      "epoch": 0.8427742470295662,
      "grad_norm": 0.3232339918613434,
      "learning_rate": 0.00037273295454545453,
      "loss": 2.0394,
      "step": 24400
    },
    {
      "epoch": 0.8462282398452611,
      "grad_norm": 0.3366962671279907,
      "learning_rate": 0.0003721647727272727,
      "loss": 2.0415,
      "step": 24500
    },
    {
      "epoch": 0.8496822326609561,
      "grad_norm": 0.3091275095939636,
      "learning_rate": 0.00037159659090909094,
      "loss": 2.0789,
      "step": 24600
    },
    {
      "epoch": 0.853136225476651,
      "grad_norm": 0.3144051432609558,
      "learning_rate": 0.0003710284090909091,
      "loss": 2.059,
      "step": 24700
    },
    {
      "epoch": 0.8565902182923459,
      "grad_norm": 0.3365747332572937,
      "learning_rate": 0.00037046022727272725,
      "loss": 2.0388,
      "step": 24800
    },
    {
      "epoch": 0.8600442111080409,
      "grad_norm": 0.2965666949748993,
      "learning_rate": 0.0003698920454545455,
      "loss": 2.0576,
      "step": 24900
    },
    {
      "epoch": 0.8634982039237359,
      "grad_norm": 0.3322639465332031,
      "learning_rate": 0.00036932386363636366,
      "loss": 2.0633,
      "step": 25000
    },
    {
      "epoch": 0.8634982039237359,
      "eval_loss": 2.392946243286133,
      "eval_runtime": 926.5204,
      "eval_samples_per_second": 164.462,
      "eval_steps_per_second": 1.029,
      "step": 25000
    },
    {
      "epoch": 0.8669521967394308,
      "grad_norm": 0.3184923827648163,
      "learning_rate": 0.0003687556818181818,
      "loss": 2.0442,
      "step": 25100
    },
    {
      "epoch": 0.8704061895551257,
      "grad_norm": 0.30526450276374817,
      "learning_rate": 0.0003681875,
      "loss": 2.0364,
      "step": 25200
    },
    {
      "epoch": 0.8738601823708206,
      "grad_norm": 0.3035339117050171,
      "learning_rate": 0.0003676193181818182,
      "loss": 2.0399,
      "step": 25300
    },
    {
      "epoch": 0.8773141751865157,
      "grad_norm": 0.3300335705280304,
      "learning_rate": 0.00036705113636363633,
      "loss": 2.0388,
      "step": 25400
    },
    {
      "epoch": 0.8807681680022106,
      "grad_norm": 0.33707037568092346,
      "learning_rate": 0.00036648295454545457,
      "loss": 2.0364,
      "step": 25500
    },
    {
      "epoch": 0.8842221608179055,
      "grad_norm": 0.3057771623134613,
      "learning_rate": 0.00036591477272727275,
      "loss": 2.0377,
      "step": 25600
    },
    {
      "epoch": 0.8876761536336004,
      "grad_norm": 0.33993765711784363,
      "learning_rate": 0.00036534659090909087,
      "loss": 2.0485,
      "step": 25700
    },
    {
      "epoch": 0.8911301464492953,
      "grad_norm": 0.3075715899467468,
      "learning_rate": 0.0003647784090909091,
      "loss": 2.0256,
      "step": 25800
    },
    {
      "epoch": 0.8945841392649904,
      "grad_norm": 0.30490240454673767,
      "learning_rate": 0.0003642102272727273,
      "loss": 2.0489,
      "step": 25900
    },
    {
      "epoch": 0.8980381320806853,
      "grad_norm": 0.3403315246105194,
      "learning_rate": 0.00036364204545454547,
      "loss": 2.0476,
      "step": 26000
    },
    {
      "epoch": 0.8980381320806853,
      "eval_loss": 2.382169008255005,
      "eval_runtime": 932.2661,
      "eval_samples_per_second": 163.448,
      "eval_steps_per_second": 1.635,
      "step": 26000
    },
    {
      "epoch": 0.9014921248963802,
      "grad_norm": 0.31369808316230774,
      "learning_rate": 0.00036307386363636365,
      "loss": 2.0265,
      "step": 26100
    },
    {
      "epoch": 0.9049461177120751,
      "grad_norm": 0.30494198203086853,
      "learning_rate": 0.00036250568181818183,
      "loss": 2.0328,
      "step": 26200
    },
    {
      "epoch": 0.9084001105277701,
      "grad_norm": 0.2981790006160736,
      "learning_rate": 0.0003619375,
      "loss": 2.0196,
      "step": 26300
    },
    {
      "epoch": 0.9118541033434651,
      "grad_norm": 0.3235887587070465,
      "learning_rate": 0.0003613693181818182,
      "loss": 2.0224,
      "step": 26400
    },
    {
      "epoch": 0.91530809615916,
      "grad_norm": 0.32602986693382263,
      "learning_rate": 0.00036080113636363637,
      "loss": 2.0169,
      "step": 26500
    },
    {
      "epoch": 0.9187620889748549,
      "grad_norm": 0.3355056643486023,
      "learning_rate": 0.00036023295454545455,
      "loss": 2.0338,
      "step": 26600
    },
    {
      "epoch": 0.9222160817905499,
      "grad_norm": 0.3180111348628998,
      "learning_rate": 0.00035966477272727273,
      "loss": 2.0297,
      "step": 26700
    },
    {
      "epoch": 0.9256700746062448,
      "grad_norm": 0.2988349199295044,
      "learning_rate": 0.0003590965909090909,
      "loss": 2.0189,
      "step": 26800
    },
    {
      "epoch": 0.9291240674219398,
      "grad_norm": 0.30824485421180725,
      "learning_rate": 0.0003585284090909091,
      "loss": 2.0086,
      "step": 26900
    },
    {
      "epoch": 0.9325780602376347,
      "grad_norm": 0.33140483498573303,
      "learning_rate": 0.00035796022727272727,
      "loss": 2.0127,
      "step": 27000
    },
    {
      "epoch": 0.9325780602376347,
      "eval_loss": 2.3799469470977783,
      "eval_runtime": 931.9535,
      "eval_samples_per_second": 163.503,
      "eval_steps_per_second": 1.635,
      "step": 27000
    },
    {
      "epoch": 0.9360320530533297,
      "grad_norm": 0.31175485253334045,
      "learning_rate": 0.00035739204545454545,
      "loss": 2.027,
      "step": 27100
    },
    {
      "epoch": 0.9394860458690246,
      "grad_norm": 0.3109052777290344,
      "learning_rate": 0.00035682386363636363,
      "loss": 2.029,
      "step": 27200
    },
    {
      "epoch": 0.9429400386847195,
      "grad_norm": 0.3299388885498047,
      "learning_rate": 0.0003562556818181818,
      "loss": 2.0194,
      "step": 27300
    },
    {
      "epoch": 0.9463940315004145,
      "grad_norm": 0.35121017694473267,
      "learning_rate": 0.0003556875,
      "loss": 2.0158,
      "step": 27400
    },
    {
      "epoch": 0.9498480243161094,
      "grad_norm": 0.3052006959915161,
      "learning_rate": 0.00035511931818181817,
      "loss": 2.0109,
      "step": 27500
    },
    {
      "epoch": 0.9533020171318044,
      "grad_norm": 0.3126027584075928,
      "learning_rate": 0.0003545511363636364,
      "loss": 2.0215,
      "step": 27600
    },
    {
      "epoch": 0.9567560099474993,
      "grad_norm": 0.32444655895233154,
      "learning_rate": 0.00035398295454545453,
      "loss": 2.0108,
      "step": 27700
    },
    {
      "epoch": 0.9602100027631942,
      "grad_norm": 0.31381282210350037,
      "learning_rate": 0.0003534147727272727,
      "loss": 2.0151,
      "step": 27800
    },
    {
      "epoch": 0.9636639955788892,
      "grad_norm": 0.3093770444393158,
      "learning_rate": 0.00035284659090909095,
      "loss": 1.9959,
      "step": 27900
    },
    {
      "epoch": 0.9671179883945842,
      "grad_norm": 0.3137684762477875,
      "learning_rate": 0.0003522784090909091,
      "loss": 2.0223,
      "step": 28000
    },
    {
      "epoch": 0.9671179883945842,
      "eval_loss": 2.3616411685943604,
      "eval_runtime": 936.0723,
      "eval_samples_per_second": 162.783,
      "eval_steps_per_second": 1.628,
      "step": 28000
    },
    {
      "epoch": 0.9705719812102791,
      "grad_norm": 0.3130528926849365,
      "learning_rate": 0.00035171022727272725,
      "loss": 2.0078,
      "step": 28100
    },
    {
      "epoch": 0.974025974025974,
      "grad_norm": 0.33664995431900024,
      "learning_rate": 0.0003511420454545455,
      "loss": 2.0087,
      "step": 28200
    },
    {
      "epoch": 0.9774799668416689,
      "grad_norm": 0.32277122139930725,
      "learning_rate": 0.0003505738636363636,
      "loss": 2.0106,
      "step": 28300
    },
    {
      "epoch": 0.980933959657364,
      "grad_norm": 0.33459389209747314,
      "learning_rate": 0.0003500056818181818,
      "loss": 2.019,
      "step": 28400
    },
    {
      "epoch": 0.9843879524730589,
      "grad_norm": 0.31769075989723206,
      "learning_rate": 0.00034943750000000003,
      "loss": 2.0105,
      "step": 28500
    },
    {
      "epoch": 0.9878419452887538,
      "grad_norm": 0.3090764582157135,
      "learning_rate": 0.00034886931818181816,
      "loss": 2.0121,
      "step": 28600
    },
    {
      "epoch": 0.9912959381044487,
      "grad_norm": 0.3254571557044983,
      "learning_rate": 0.00034830113636363634,
      "loss": 2.0069,
      "step": 28700
    },
    {
      "epoch": 0.9947499309201436,
      "grad_norm": 0.3087945878505707,
      "learning_rate": 0.00034773295454545457,
      "loss": 1.9956,
      "step": 28800
    },
    {
      "epoch": 0.9982039237358387,
      "grad_norm": 0.2959256172180176,
      "learning_rate": 0.00034716477272727275,
      "loss": 2.0202,
      "step": 28900
    },
    {
      "epoch": 1.0016579165515336,
      "grad_norm": 0.3626255691051483,
      "learning_rate": 0.0003465965909090909,
      "loss": 1.9733,
      "step": 29000
    },
    {
      "epoch": 1.0016579165515336,
      "eval_loss": 2.3518831729888916,
      "eval_runtime": 933.6764,
      "eval_samples_per_second": 163.201,
      "eval_steps_per_second": 1.632,
      "step": 29000
    },
    {
      "epoch": 1.0051119093672285,
      "grad_norm": 0.3299137353897095,
      "learning_rate": 0.0003460284090909091,
      "loss": 1.9406,
      "step": 29100
    },
    {
      "epoch": 1.0085659021829234,
      "grad_norm": 0.3189757168292999,
      "learning_rate": 0.0003454602272727273,
      "loss": 1.9547,
      "step": 29200
    },
    {
      "epoch": 1.0120198949986183,
      "grad_norm": 0.33895236253738403,
      "learning_rate": 0.0003448920454545454,
      "loss": 1.9462,
      "step": 29300
    },
    {
      "epoch": 1.0154738878143132,
      "grad_norm": 0.3329538106918335,
      "learning_rate": 0.00034432386363636365,
      "loss": 1.9445,
      "step": 29400
    },
    {
      "epoch": 1.0189278806300084,
      "grad_norm": 0.33972305059432983,
      "learning_rate": 0.00034375568181818183,
      "loss": 1.9482,
      "step": 29500
    },
    {
      "epoch": 1.0223818734457033,
      "grad_norm": 0.3170960545539856,
      "learning_rate": 0.00034318749999999996,
      "loss": 1.9322,
      "step": 29600
    },
    {
      "epoch": 1.0258358662613982,
      "grad_norm": 0.3435528576374054,
      "learning_rate": 0.0003426193181818182,
      "loss": 1.9651,
      "step": 29700
    },
    {
      "epoch": 1.0292898590770931,
      "grad_norm": 0.3118680715560913,
      "learning_rate": 0.0003420511363636364,
      "loss": 1.9553,
      "step": 29800
    },
    {
      "epoch": 1.032743851892788,
      "grad_norm": 0.30952584743499756,
      "learning_rate": 0.0003414829545454545,
      "loss": 1.9594,
      "step": 29900
    },
    {
      "epoch": 1.036197844708483,
      "grad_norm": 0.3205563724040985,
      "learning_rate": 0.00034091477272727274,
      "loss": 1.951,
      "step": 30000
    },
    {
      "epoch": 1.036197844708483,
      "eval_loss": 2.3421385288238525,
      "eval_runtime": 931.9003,
| "eval_samples_per_second": 163.512, | |
| "eval_steps_per_second": 1.635, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 1.039651837524178, | |
| "grad_norm": 0.3193325400352478, | |
| "learning_rate": 0.0003403465909090909, | |
| "loss": 1.9781, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 1.0431058303398728, | |
| "grad_norm": 0.3476419448852539, | |
| "learning_rate": 0.00033977840909090915, | |
| "loss": 1.9804, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 1.046559823155568, | |
| "grad_norm": 0.334945946931839, | |
| "learning_rate": 0.0003392102272727273, | |
| "loss": 1.9956, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 1.0500138159712629, | |
| "grad_norm": 0.3205523192882538, | |
| "learning_rate": 0.00033864204545454546, | |
| "loss": 1.9738, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 1.0534678087869578, | |
| "grad_norm": 0.3324650824069977, | |
| "learning_rate": 0.0003380738636363637, | |
| "loss": 1.9851, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 1.0569218016026527, | |
| "grad_norm": 0.3181789815425873, | |
| "learning_rate": 0.0003375056818181818, | |
| "loss": 1.9993, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 1.0603757944183476, | |
| "grad_norm": 0.3182109594345093, | |
| "learning_rate": 0.0003369375, | |
| "loss": 1.9808, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 1.0638297872340425, | |
| "grad_norm": 0.3040473163127899, | |
| "learning_rate": 0.00033636931818181823, | |
| "loss": 1.9697, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 1.0672837800497375, | |
| "grad_norm": 0.3187369108200073, | |
| "learning_rate": 0.00033580113636363636, | |
| "loss": 1.9668, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 1.0707377728654324, | |
| "grad_norm": 0.31757599115371704, | |
| "learning_rate": 0.00033523295454545454, | |
| "loss": 1.9797, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 1.0707377728654324, | |
| "eval_loss": 2.334416151046753, | |
| "eval_runtime": 932.4289, | |
| "eval_samples_per_second": 163.419, | |
| "eval_steps_per_second": 1.634, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 1.0741917656811273, | |
| "grad_norm": 0.3234330713748932, | |
| "learning_rate": 0.0003346647727272728, | |
| "loss": 1.9646, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 1.0776457584968224, | |
| "grad_norm": 0.346343457698822, | |
| "learning_rate": 0.0003340965909090909, | |
| "loss": 1.9633, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 1.0810997513125173, | |
| "grad_norm": 0.33652421832084656, | |
| "learning_rate": 0.0003335284090909091, | |
| "loss": 1.9635, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 1.0845537441282123, | |
| "grad_norm": 0.3355984091758728, | |
| "learning_rate": 0.0003329602272727273, | |
| "loss": 1.9714, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 1.0880077369439072, | |
| "grad_norm": 0.3155532479286194, | |
| "learning_rate": 0.0003323920454545455, | |
| "loss": 1.9579, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 1.091461729759602, | |
| "grad_norm": 0.3124435842037201, | |
| "learning_rate": 0.0003318238636363636, | |
| "loss": 1.9896, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 1.094915722575297, | |
| "grad_norm": 0.3473125100135803, | |
| "learning_rate": 0.00033125568181818185, | |
| "loss": 1.9604, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 1.098369715390992, | |
| "grad_norm": 0.33051636815071106, | |
| "learning_rate": 0.00033068750000000004, | |
| "loss": 1.9703, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 1.1018237082066868, | |
| "grad_norm": 0.3092711865901947, | |
| "learning_rate": 0.00033011931818181816, | |
| "loss": 1.9583, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 1.105277701022382, | |
| "grad_norm": 0.32419732213020325, | |
| "learning_rate": 0.0003295511363636364, | |
| "loss": 1.9603, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 1.105277701022382, | |
| "eval_loss": 2.3255245685577393, | |
| "eval_runtime": 932.6931, | |
| "eval_samples_per_second": 163.373, | |
| "eval_steps_per_second": 1.634, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 1.108731693838077, | |
| "grad_norm": 0.332787424325943, | |
| "learning_rate": 0.0003289829545454546, | |
| "loss": 1.992, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 1.1121856866537718, | |
| "grad_norm": 0.3273712992668152, | |
| "learning_rate": 0.0003284147727272727, | |
| "loss": 1.9632, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 1.1156396794694667, | |
| "grad_norm": 0.32147789001464844, | |
| "learning_rate": 0.00032784659090909094, | |
| "loss": 1.9838, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 1.1190936722851617, | |
| "grad_norm": 0.3235771358013153, | |
| "learning_rate": 0.0003272784090909091, | |
| "loss": 1.9594, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 1.1225476651008566, | |
| "grad_norm": 0.31604549288749695, | |
| "learning_rate": 0.00032671022727272724, | |
| "loss": 1.9716, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 1.1260016579165515, | |
| "grad_norm": 0.3200394809246063, | |
| "learning_rate": 0.0003261420454545455, | |
| "loss": 1.9598, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 1.1294556507322464, | |
| "grad_norm": 0.31569093465805054, | |
| "learning_rate": 0.00032557386363636366, | |
| "loss": 1.9598, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 1.1329096435479413, | |
| "grad_norm": 0.3108920753002167, | |
| "learning_rate": 0.0003250056818181818, | |
| "loss": 1.9333, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 1.1363636363636362, | |
| "grad_norm": 0.31714916229248047, | |
| "learning_rate": 0.0003244375, | |
| "loss": 1.9665, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 1.1398176291793314, | |
| "grad_norm": 0.3428919017314911, | |
| "learning_rate": 0.0003238693181818182, | |
| "loss": 1.9367, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 1.1398176291793314, | |
| "eval_loss": 2.314099073410034, | |
| "eval_runtime": 933.8501, | |
| "eval_samples_per_second": 163.171, | |
| "eval_steps_per_second": 1.632, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 1.1432716219950263, | |
| "grad_norm": 0.31503021717071533, | |
| "learning_rate": 0.0003233011363636364, | |
| "loss": 1.952, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 1.1467256148107212, | |
| "grad_norm": 0.3151177763938904, | |
| "learning_rate": 0.00032273295454545456, | |
| "loss": 1.9711, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 1.1501796076264161, | |
| "grad_norm": 0.33299991488456726, | |
| "learning_rate": 0.00032216477272727274, | |
| "loss": 1.966, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 1.153633600442111, | |
| "grad_norm": 0.35912394523620605, | |
| "learning_rate": 0.0003215965909090909, | |
| "loss": 1.9345, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 1.157087593257806, | |
| "grad_norm": 0.3316855728626251, | |
| "learning_rate": 0.0003210284090909091, | |
| "loss": 1.9473, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 1.1605415860735009, | |
| "grad_norm": 0.32025349140167236, | |
| "learning_rate": 0.0003204602272727273, | |
| "loss": 1.9512, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 1.163995578889196, | |
| "grad_norm": 0.31566309928894043, | |
| "learning_rate": 0.00031989204545454546, | |
| "loss": 1.9451, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 1.167449571704891, | |
| "grad_norm": 0.32200607657432556, | |
| "learning_rate": 0.00031932386363636364, | |
| "loss": 1.9382, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 1.1709035645205859, | |
| "grad_norm": 0.3362364172935486, | |
| "learning_rate": 0.0003187556818181818, | |
| "loss": 1.9504, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 1.1743575573362808, | |
| "grad_norm": 0.3156588077545166, | |
| "learning_rate": 0.0003181875, | |
| "loss": 1.9488, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 1.1743575573362808, | |
| "eval_loss": 2.308772563934326, | |
| "eval_runtime": 932.5965, | |
| "eval_samples_per_second": 163.39, | |
| "eval_steps_per_second": 1.634, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 1.1778115501519757, | |
| "grad_norm": 0.3278816342353821, | |
| "learning_rate": 0.0003176193181818182, | |
| "loss": 1.9547, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 1.1812655429676706, | |
| "grad_norm": 0.3398403227329254, | |
| "learning_rate": 0.00031705113636363636, | |
| "loss": 1.9293, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 1.1847195357833655, | |
| "grad_norm": 0.34434807300567627, | |
| "learning_rate": 0.00031648295454545454, | |
| "loss": 1.9497, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 1.1881735285990604, | |
| "grad_norm": 0.33737897872924805, | |
| "learning_rate": 0.0003159147727272728, | |
| "loss": 1.9471, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 1.1916275214147554, | |
| "grad_norm": 0.3157757520675659, | |
| "learning_rate": 0.0003153465909090909, | |
| "loss": 1.9395, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 1.1950815142304503, | |
| "grad_norm": 0.3554360866546631, | |
| "learning_rate": 0.0003147784090909091, | |
| "loss": 1.9589, | |
| "step": 34600 | |
| }, | |
| { | |
| "epoch": 1.1985355070461454, | |
| "grad_norm": 0.31714192032814026, | |
| "learning_rate": 0.0003142102272727273, | |
| "loss": 1.9382, | |
| "step": 34700 | |
| }, | |
| { | |
| "epoch": 1.2019894998618403, | |
| "grad_norm": 0.3395540416240692, | |
| "learning_rate": 0.00031364204545454545, | |
| "loss": 1.9245, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 1.2054434926775353, | |
| "grad_norm": 0.38380250334739685, | |
| "learning_rate": 0.0003130738636363636, | |
| "loss": 1.9379, | |
| "step": 34900 | |
| }, | |
| { | |
| "epoch": 1.2088974854932302, | |
| "grad_norm": 0.3237415552139282, | |
| "learning_rate": 0.00031250568181818186, | |
| "loss": 1.9433, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 1.2088974854932302, | |
| "eval_loss": 2.299807548522949, | |
| "eval_runtime": 932.2028, | |
| "eval_samples_per_second": 163.459, | |
| "eval_steps_per_second": 1.635, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 1.212351478308925, | |
| "grad_norm": 0.3568110764026642, | |
| "learning_rate": 0.0003119375, | |
| "loss": 1.9359, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 1.21580547112462, | |
| "grad_norm": 0.3228346109390259, | |
| "learning_rate": 0.00031136931818181817, | |
| "loss": 1.9398, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 1.219259463940315, | |
| "grad_norm": 0.4409060478210449, | |
| "learning_rate": 0.0003108011363636364, | |
| "loss": 1.9271, | |
| "step": 35300 | |
| }, | |
| { | |
| "epoch": 1.22271345675601, | |
| "grad_norm": 0.3323960602283478, | |
| "learning_rate": 0.0003102329545454545, | |
| "loss": 1.9351, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 1.226167449571705, | |
| "grad_norm": 0.33286628127098083, | |
| "learning_rate": 0.0003096647727272727, | |
| "loss": 1.9261, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 1.2296214423874, | |
| "grad_norm": 0.32433241605758667, | |
| "learning_rate": 0.00030909659090909094, | |
| "loss": 1.9235, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 1.2330754352030948, | |
| "grad_norm": 0.33505016565322876, | |
| "learning_rate": 0.0003085284090909091, | |
| "loss": 1.9463, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 1.2365294280187897, | |
| "grad_norm": 0.33028197288513184, | |
| "learning_rate": 0.00030796022727272725, | |
| "loss": 1.9425, | |
| "step": 35800 | |
| }, | |
| { | |
| "epoch": 1.2399834208344847, | |
| "grad_norm": 0.32460519671440125, | |
| "learning_rate": 0.0003073920454545455, | |
| "loss": 1.9237, | |
| "step": 35900 | |
| }, | |
| { | |
| "epoch": 1.2434374136501796, | |
| "grad_norm": 0.34961310029029846, | |
| "learning_rate": 0.00030682386363636366, | |
| "loss": 1.927, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 1.2434374136501796, | |
| "eval_loss": 2.2926623821258545, | |
| "eval_runtime": 933.7737, | |
| "eval_samples_per_second": 163.184, | |
| "eval_steps_per_second": 1.632, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 1.2468914064658745, | |
| "grad_norm": 0.3421266973018646, | |
| "learning_rate": 0.0003062556818181818, | |
| "loss": 1.9172, | |
| "step": 36100 | |
| }, | |
| { | |
| "epoch": 1.2503453992815694, | |
| "grad_norm": 0.31496691703796387, | |
| "learning_rate": 0.0003056875, | |
| "loss": 1.9283, | |
| "step": 36200 | |
| }, | |
| { | |
| "epoch": 1.2537993920972643, | |
| "grad_norm": 0.3333700895309448, | |
| "learning_rate": 0.0003051193181818182, | |
| "loss": 1.9083, | |
| "step": 36300 | |
| }, | |
| { | |
| "epoch": 1.2572533849129595, | |
| "grad_norm": 0.33785733580589294, | |
| "learning_rate": 0.00030455113636363633, | |
| "loss": 1.9364, | |
| "step": 36400 | |
| }, | |
| { | |
| "epoch": 1.2607073777286544, | |
| "grad_norm": 0.3140362799167633, | |
| "learning_rate": 0.00030398295454545456, | |
| "loss": 1.9202, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 1.2641613705443493, | |
| "grad_norm": 0.332356721162796, | |
| "learning_rate": 0.00030341477272727275, | |
| "loss": 1.9219, | |
| "step": 36600 | |
| }, | |
| { | |
| "epoch": 1.2676153633600442, | |
| "grad_norm": 0.30988287925720215, | |
| "learning_rate": 0.00030284659090909087, | |
| "loss": 1.9247, | |
| "step": 36700 | |
| }, | |
| { | |
| "epoch": 1.2710693561757391, | |
| "grad_norm": 0.3257978856563568, | |
| "learning_rate": 0.0003022784090909091, | |
| "loss": 1.9274, | |
| "step": 36800 | |
| }, | |
| { | |
| "epoch": 1.274523348991434, | |
| "grad_norm": 0.3108922243118286, | |
| "learning_rate": 0.0003017102272727273, | |
| "loss": 1.9182, | |
| "step": 36900 | |
| }, | |
| { | |
| "epoch": 1.277977341807129, | |
| "grad_norm": 0.32838690280914307, | |
| "learning_rate": 0.00030114204545454547, | |
| "loss": 1.921, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 1.277977341807129, | |
| "eval_loss": 2.28013014793396, | |
| "eval_runtime": 932.9674, | |
| "eval_samples_per_second": 163.325, | |
| "eval_steps_per_second": 1.633, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 1.281431334622824, | |
| "grad_norm": 0.33043205738067627, | |
| "learning_rate": 0.00030057386363636365, | |
| "loss": 1.9282, | |
| "step": 37100 | |
| }, | |
| { | |
| "epoch": 1.284885327438519, | |
| "grad_norm": 0.3355056047439575, | |
| "learning_rate": 0.00030000568181818183, | |
| "loss": 1.9146, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 1.288339320254214, | |
| "grad_norm": 0.34499314427375793, | |
| "learning_rate": 0.0002994375, | |
| "loss": 1.9121, | |
| "step": 37300 | |
| }, | |
| { | |
| "epoch": 1.2917933130699089, | |
| "grad_norm": 0.33857813477516174, | |
| "learning_rate": 0.0002988693181818182, | |
| "loss": 1.906, | |
| "step": 37400 | |
| }, | |
| { | |
| "epoch": 1.2952473058856038, | |
| "grad_norm": 0.34451091289520264, | |
| "learning_rate": 0.00029830113636363637, | |
| "loss": 1.9069, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 1.2987012987012987, | |
| "grad_norm": 0.31819987297058105, | |
| "learning_rate": 0.00029773295454545455, | |
| "loss": 1.905, | |
| "step": 37600 | |
| }, | |
| { | |
| "epoch": 1.3021552915169936, | |
| "grad_norm": 0.32892873883247375, | |
| "learning_rate": 0.00029716477272727273, | |
| "loss": 1.9358, | |
| "step": 37700 | |
| }, | |
| { | |
| "epoch": 1.3056092843326885, | |
| "grad_norm": 0.3139948844909668, | |
| "learning_rate": 0.0002965965909090909, | |
| "loss": 1.917, | |
| "step": 37800 | |
| }, | |
| { | |
| "epoch": 1.3090632771483834, | |
| "grad_norm": 0.3358207046985626, | |
| "learning_rate": 0.0002960284090909091, | |
| "loss": 1.8979, | |
| "step": 37900 | |
| }, | |
| { | |
| "epoch": 1.3125172699640784, | |
| "grad_norm": 0.3274485468864441, | |
| "learning_rate": 0.00029546022727272727, | |
| "loss": 1.9147, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 1.3125172699640784, | |
| "eval_loss": 2.2716429233551025, | |
| "eval_runtime": 932.4022, | |
| "eval_samples_per_second": 163.424, | |
| "eval_steps_per_second": 1.634, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 1.3159712627797735, | |
| "grad_norm": 0.3326353430747986, | |
| "learning_rate": 0.00029489204545454545, | |
| "loss": 1.9151, | |
| "step": 38100 | |
| }, | |
| { | |
| "epoch": 1.3194252555954684, | |
| "grad_norm": 0.33048099279403687, | |
| "learning_rate": 0.00029432386363636363, | |
| "loss": 1.9003, | |
| "step": 38200 | |
| }, | |
| { | |
| "epoch": 1.3228792484111633, | |
| "grad_norm": 0.3198449909687042, | |
| "learning_rate": 0.0002937556818181818, | |
| "loss": 1.9012, | |
| "step": 38300 | |
| }, | |
| { | |
| "epoch": 1.3263332412268583, | |
| "grad_norm": 0.3347759246826172, | |
| "learning_rate": 0.0002931875, | |
| "loss": 1.889, | |
| "step": 38400 | |
| }, | |
| { | |
| "epoch": 1.3297872340425532, | |
| "grad_norm": 0.344235360622406, | |
| "learning_rate": 0.00029261931818181817, | |
| "loss": 1.9096, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 1.333241226858248, | |
| "grad_norm": 0.34197336435317993, | |
| "learning_rate": 0.0002920511363636364, | |
| "loss": 1.9083, | |
| "step": 38600 | |
| }, | |
| { | |
| "epoch": 1.336695219673943, | |
| "grad_norm": 0.3257678747177124, | |
| "learning_rate": 0.00029148295454545453, | |
| "loss": 1.9007, | |
| "step": 38700 | |
| }, | |
| { | |
| "epoch": 1.3401492124896381, | |
| "grad_norm": 0.3299179971218109, | |
| "learning_rate": 0.0002909147727272727, | |
| "loss": 1.8992, | |
| "step": 38800 | |
| }, | |
| { | |
| "epoch": 1.343603205305333, | |
| "grad_norm": 0.32206007838249207, | |
| "learning_rate": 0.00029034659090909095, | |
| "loss": 1.8853, | |
| "step": 38900 | |
| }, | |
| { | |
| "epoch": 1.347057198121028, | |
| "grad_norm": 0.3281271159648895, | |
| "learning_rate": 0.0002897784090909091, | |
| "loss": 1.9075, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 1.347057198121028, | |
| "eval_loss": 2.266144275665283, | |
| "eval_runtime": 933.0379, | |
| "eval_samples_per_second": 163.313, | |
| "eval_steps_per_second": 1.633, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 1.350511190936723, | |
| "grad_norm": 0.32982590794563293, | |
| "learning_rate": 0.00028921022727272725, | |
| "loss": 1.9255, | |
| "step": 39100 | |
| }, | |
| { | |
| "epoch": 1.3539651837524178, | |
| "grad_norm": 0.33906838297843933, | |
| "learning_rate": 0.0002886420454545455, | |
| "loss": 1.9119, | |
| "step": 39200 | |
| }, | |
| { | |
| "epoch": 1.3574191765681127, | |
| "grad_norm": 0.32768332958221436, | |
| "learning_rate": 0.0002880738636363636, | |
| "loss": 1.8838, | |
| "step": 39300 | |
| }, | |
| { | |
| "epoch": 1.3608731693838076, | |
| "grad_norm": 0.3550179600715637, | |
| "learning_rate": 0.0002875056818181818, | |
| "loss": 1.8889, | |
| "step": 39400 | |
| }, | |
| { | |
| "epoch": 1.3643271621995026, | |
| "grad_norm": 0.32649099826812744, | |
| "learning_rate": 0.00028693750000000003, | |
| "loss": 1.8983, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 1.3677811550151975, | |
| "grad_norm": 0.33756542205810547, | |
| "learning_rate": 0.00028636931818181816, | |
| "loss": 1.8982, | |
| "step": 39600 | |
| }, | |
| { | |
| "epoch": 1.3712351478308924, | |
| "grad_norm": 0.3554450571537018, | |
| "learning_rate": 0.00028580113636363634, | |
| "loss": 1.8831, | |
| "step": 39700 | |
| }, | |
| { | |
| "epoch": 1.3746891406465875, | |
| "grad_norm": 0.3348751962184906, | |
| "learning_rate": 0.00028523295454545457, | |
| "loss": 1.9022, | |
| "step": 39800 | |
| }, | |
| { | |
| "epoch": 1.3781431334622825, | |
| "grad_norm": 0.3384929895401001, | |
| "learning_rate": 0.00028466477272727275, | |
| "loss": 1.8973, | |
| "step": 39900 | |
| }, | |
| { | |
| "epoch": 1.3815971262779774, | |
| "grad_norm": 0.3346748352050781, | |
| "learning_rate": 0.0002840965909090909, | |
| "loss": 1.897, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 1.3815971262779774, | |
| "eval_loss": 2.258094072341919, | |
| "eval_runtime": 932.2639, | |
| "eval_samples_per_second": 163.448, | |
| "eval_steps_per_second": 1.635, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 1.3850511190936723, | |
| "grad_norm": 0.3488174378871918, | |
| "learning_rate": 0.0002835284090909091, | |
| "loss": 1.899, | |
| "step": 40100 | |
| }, | |
| { | |
| "epoch": 1.3885051119093672, | |
| "grad_norm": 0.357048898935318, | |
| "learning_rate": 0.0002829602272727273, | |
| "loss": 1.8874, | |
| "step": 40200 | |
| }, | |
| { | |
| "epoch": 1.3919591047250621, | |
| "grad_norm": 0.34619608521461487, | |
| "learning_rate": 0.0002823920454545454, | |
| "loss": 1.8971, | |
| "step": 40300 | |
| }, | |
| { | |
| "epoch": 1.395413097540757, | |
| "grad_norm": 0.3450053930282593, | |
| "learning_rate": 0.00028182386363636365, | |
| "loss": 1.8951, | |
| "step": 40400 | |
| }, | |
| { | |
| "epoch": 1.3988670903564522, | |
| "grad_norm": 0.3244158923625946, | |
| "learning_rate": 0.00028125568181818183, | |
| "loss": 1.887, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 1.402321083172147, | |
| "grad_norm": 0.36656075716018677, | |
| "learning_rate": 0.00028068749999999996, | |
| "loss": 1.8961, | |
| "step": 40600 | |
| }, | |
| { | |
| "epoch": 1.405775075987842, | |
| "grad_norm": 0.3427944481372833, | |
| "learning_rate": 0.0002801193181818182, | |
| "loss": 1.8801, | |
| "step": 40700 | |
| }, | |
| { | |
| "epoch": 1.409229068803537, | |
| "grad_norm": 0.3511246144771576, | |
| "learning_rate": 0.0002795511363636364, | |
| "loss": 1.8856, | |
| "step": 40800 | |
| }, | |
| { | |
| "epoch": 1.4126830616192319, | |
| "grad_norm": 0.34178775548934937, | |
| "learning_rate": 0.0002789829545454545, | |
| "loss": 1.8888, | |
| "step": 40900 | |
| }, | |
| { | |
| "epoch": 1.4161370544349268, | |
| "grad_norm": 0.35453692078590393, | |
| "learning_rate": 0.00027841477272727273, | |
| "loss": 1.8867, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 1.4161370544349268, | |
| "eval_loss": 2.2483203411102295, | |
| "eval_runtime": 932.6422, | |
| "eval_samples_per_second": 163.382, | |
| "eval_steps_per_second": 1.634, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 1.4195910472506217, | |
| "grad_norm": 0.38095447421073914, | |
| "learning_rate": 0.0002778465909090909, | |
| "loss": 1.8847, | |
| "step": 41100 | |
| }, | |
| { | |
| "epoch": 1.4230450400663166, | |
| "grad_norm": 0.3299073576927185, | |
| "learning_rate": 0.00027727840909090915, | |
| "loss": 1.8848, | |
| "step": 41200 | |
| }, | |
| { | |
| "epoch": 1.4264990328820115, | |
| "grad_norm": 0.3188841640949249, | |
| "learning_rate": 0.0002767102272727273, | |
| "loss": 1.9009, | |
| "step": 41300 | |
| }, | |
| { | |
| "epoch": 1.4299530256977064, | |
| "grad_norm": 0.3500712811946869, | |
| "learning_rate": 0.00027614204545454546, | |
| "loss": 1.885, | |
| "step": 41400 | |
| }, | |
| { | |
| "epoch": 1.4334070185134014, | |
| "grad_norm": 0.34655386209487915, | |
| "learning_rate": 0.0002755738636363637, | |
| "loss": 1.8862, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 1.4368610113290965, | |
| "grad_norm": 0.34666162729263306, | |
| "learning_rate": 0.0002750056818181818, | |
| "loss": 1.8859, | |
| "step": 41600 | |
| }, | |
| { | |
| "epoch": 1.4403150041447914, | |
| "grad_norm": 0.3630838692188263, | |
| "learning_rate": 0.0002744375, | |
| "loss": 1.8796, | |
| "step": 41700 | |
| }, | |
| { | |
| "epoch": 1.4437689969604863, | |
| "grad_norm": 0.40710654854774475, | |
| "learning_rate": 0.00027386931818181823, | |
| "loss": 1.8822, | |
| "step": 41800 | |
| }, | |
| { | |
| "epoch": 1.4472229897761812, | |
| "grad_norm": 0.33801448345184326, | |
| "learning_rate": 0.00027330113636363636, | |
| "loss": 1.8788, | |
| "step": 41900 | |
| }, | |
| { | |
| "epoch": 1.4506769825918762, | |
| "grad_norm": 0.3448280692100525, | |
| "learning_rate": 0.00027273295454545454, | |
| "loss": 1.8685, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 1.4506769825918762, | |
| "eval_loss": 2.24458909034729, | |
| "eval_runtime": 932.8306, | |
| "eval_samples_per_second": 163.349, | |
| "eval_steps_per_second": 1.634, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 1.454130975407571, | |
| "grad_norm": 0.35361775755882263, | |
| "learning_rate": 0.00027216477272727277, | |
| "loss": 1.8657, | |
| "step": 42100 | |
| }, | |
| { | |
| "epoch": 1.4575849682232662, | |
| "grad_norm": 0.3468896448612213, | |
| "learning_rate": 0.0002715965909090909, | |
| "loss": 1.8701, | |
| "step": 42200 | |
| }, | |
| { | |
| "epoch": 1.4610389610389611, | |
| "grad_norm": 0.3501305878162384, | |
| "learning_rate": 0.0002710284090909091, | |
| "loss": 1.8729, | |
| "step": 42300 | |
| }, | |
| { | |
| "epoch": 1.464492953854656, | |
| "grad_norm": 0.3370625078678131, | |
| "learning_rate": 0.0002704602272727273, | |
| "loss": 1.8723, | |
| "step": 42400 | |
| }, | |
| { | |
| "epoch": 1.467946946670351, | |
| "grad_norm": 0.33096930384635925, | |
| "learning_rate": 0.0002698920454545455, | |
| "loss": 1.8642, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 1.471400939486046, | |
| "grad_norm": 0.3265809118747711, | |
| "learning_rate": 0.0002693238636363636, | |
| "loss": 1.8757, | |
| "step": 42600 | |
| }, | |
| { | |
| "epoch": 1.4748549323017408, | |
| "grad_norm": 0.3586813509464264, | |
| "learning_rate": 0.00026875568181818185, | |
| "loss": 1.8639, | |
| "step": 42700 | |
| }, | |
| { | |
| "epoch": 1.4783089251174357, | |
| "grad_norm": 0.3498245179653168, | |
| "learning_rate": 0.00026818750000000003, | |
| "loss": 1.888, | |
| "step": 42800 | |
| }, | |
| { | |
| "epoch": 1.4817629179331306, | |
| "grad_norm": 0.34165388345718384, | |
| "learning_rate": 0.00026761931818181816, | |
| "loss": 1.8644, | |
| "step": 42900 | |
| }, | |
| { | |
| "epoch": 1.4852169107488256, | |
| "grad_norm": 0.32099393010139465, | |
| "learning_rate": 0.0002670511363636364, | |
| "loss": 1.8747, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 1.4852169107488256, | |
| "eval_loss": 2.237309455871582, | |
| "eval_runtime": 932.7872, | |
| "eval_samples_per_second": 163.357, | |
| "eval_steps_per_second": 1.634, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 1.4886709035645205, | |
| "grad_norm": 0.4318270981311798, | |
| "learning_rate": 0.0002664829545454546, | |
| "loss": 1.8684, | |
| "step": 43100 | |
| }, | |
| { | |
| "epoch": 1.4921248963802154, | |
| "grad_norm": 0.34946203231811523, | |
| "learning_rate": 0.0002659147727272727, | |
| "loss": 1.8813, | |
| "step": 43200 | |
| }, | |
| { | |
| "epoch": 1.4955788891959105, | |
| "grad_norm": 0.33623960614204407, | |
| "learning_rate": 0.00026534659090909094, | |
| "loss": 1.8566, | |
| "step": 43300 | |
| }, | |
| { | |
| "epoch": 1.4990328820116054, | |
| "grad_norm": 0.3431924283504486, | |
| "learning_rate": 0.0002647784090909091, | |
| "loss": 1.8555, | |
| "step": 43400 | |
| }, | |
| { | |
| "epoch": 1.5024868748273004, | |
| "grad_norm": 0.3669569492340088, | |
| "learning_rate": 0.00026421022727272724, | |
| "loss": 1.8656, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 1.5059408676429953, | |
| "grad_norm": 0.3411414623260498, | |
| "learning_rate": 0.0002636420454545455, | |
| "loss": 1.8533, | |
| "step": 43600 | |
| }, | |
| { | |
| "epoch": 1.5093948604586902, | |
| "grad_norm": 0.348023384809494, | |
| "learning_rate": 0.00026307386363636366, | |
| "loss": 1.8583, | |
| "step": 43700 | |
| }, | |
| { | |
| "epoch": 1.5128488532743853, | |
| "grad_norm": 0.3822565972805023, | |
| "learning_rate": 0.0002625056818181818, | |
| "loss": 1.8669, | |
| "step": 43800 | |
| }, | |
| { | |
| "epoch": 1.5163028460900803, | |
| "grad_norm": 0.34821194410324097, | |
| "learning_rate": 0.0002619375, | |
| "loss": 1.8513, | |
| "step": 43900 | |
| }, | |
| { | |
| "epoch": 1.5197568389057752, | |
| "grad_norm": 0.35662829875946045, | |
| "learning_rate": 0.0002613693181818182, | |
| "loss": 1.8699, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 1.5197568389057752, | |
| "eval_loss": 2.2242226600646973, | |
| "eval_runtime": 933.1522, | |
| "eval_samples_per_second": 163.293, | |
| "eval_steps_per_second": 1.633, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 1.52321083172147, | |
| "grad_norm": 0.34279394149780273, | |
| "learning_rate": 0.0002608011363636364, | |
| "loss": 1.8583, | |
| "step": 44100 | |
| }, | |
| { | |
| "epoch": 1.526664824537165, | |
| "grad_norm": 0.35233989357948303, | |
| "learning_rate": 0.00026023295454545456, | |
| "loss": 1.8434, | |
| "step": 44200 | |
| }, | |
| { | |
| "epoch": 1.53011881735286, | |
| "grad_norm": 0.34149396419525146, | |
| "learning_rate": 0.00025966477272727274, | |
| "loss": 1.8593, | |
| "step": 44300 | |
| }, | |
| { | |
| "epoch": 1.5335728101685548, | |
| "grad_norm": 0.35298213362693787, | |
| "learning_rate": 0.0002590965909090909, | |
| "loss": 1.8439, | |
| "step": 44400 | |
| }, | |
| { | |
| "epoch": 1.5370268029842498, | |
| "grad_norm": 0.3766247630119324, | |
| "learning_rate": 0.0002585284090909091, | |
| "loss": 1.8645, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 1.5404807957999447, | |
| "grad_norm": 0.3492392301559448, | |
| "learning_rate": 0.0002579602272727273, | |
| "loss": 1.8551, | |
| "step": 44600 | |
| }, | |
| { | |
| "epoch": 1.5439347886156396, | |
| "grad_norm": 0.324101060628891, | |
| "learning_rate": 0.00025739204545454546, | |
| "loss": 1.8657, | |
| "step": 44700 | |
| }, | |
| { | |
| "epoch": 1.5473887814313345, | |
| "grad_norm": 0.3346399664878845, | |
| "learning_rate": 0.00025682386363636364, | |
| "loss": 1.8483, | |
| "step": 44800 | |
| }, | |
| { | |
| "epoch": 1.5508427742470294, | |
| "grad_norm": 0.35447120666503906, | |
| "learning_rate": 0.0002562556818181818, | |
| "loss": 1.8424, | |
| "step": 44900 | |
| }, | |
| { | |
| "epoch": 1.5542967670627243, | |
| "grad_norm": 0.3583132326602936, | |
| "learning_rate": 0.0002556875, | |
| "loss": 1.8619, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 1.5542967670627243, | |
| "eval_loss": 2.2166972160339355, | |
| "eval_runtime": 933.428, | |
| "eval_samples_per_second": 163.245, | |
| "eval_steps_per_second": 1.633, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 1.5577507598784195, | |
| "grad_norm": 0.34049317240715027, | |
| "learning_rate": 0.0002551193181818182, | |
| "loss": 1.8577, | |
| "step": 45100 | |
| }, | |
| { | |
| "epoch": 1.5612047526941144, | |
| "grad_norm": 0.3376822769641876, | |
| "learning_rate": 0.00025455113636363636, | |
| "loss": 1.8448, | |
| "step": 45200 | |
| }, | |
| { | |
| "epoch": 1.5646587455098093, | |
| "grad_norm": 0.3559693396091461, | |
| "learning_rate": 0.00025398295454545454, | |
| "loss": 1.8366, | |
| "step": 45300 | |
| }, | |
| { | |
| "epoch": 1.5681127383255042, | |
| "grad_norm": 0.34435904026031494, | |
| "learning_rate": 0.0002534147727272728, | |
| "loss": 1.8485, | |
| "step": 45400 | |
| }, | |
| { | |
| "epoch": 1.5715667311411994, | |
| "grad_norm": 0.35500675439834595, | |
| "learning_rate": 0.0002528465909090909, | |
| "loss": 1.8516, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 1.5750207239568943, | |
| "grad_norm": 0.34272322058677673, | |
| "learning_rate": 0.0002522784090909091, | |
| "loss": 1.8296, | |
| "step": 45600 | |
| }, | |
| { | |
| "epoch": 1.5784747167725892, | |
| "grad_norm": 0.36497625708580017, | |
| "learning_rate": 0.0002517102272727273, | |
| "loss": 1.8255, | |
| "step": 45700 | |
| }, | |
| { | |
| "epoch": 1.5819287095882841, | |
| "grad_norm": 0.31943902373313904, | |
| "learning_rate": 0.00025114204545454544, | |
| "loss": 1.8657, | |
| "step": 45800 | |
| }, | |
| { | |
| "epoch": 1.585382702403979, | |
| "grad_norm": 0.3567992150783539, | |
| "learning_rate": 0.0002505738636363636, | |
| "loss": 1.8727, | |
| "step": 45900 | |
| }, | |
| { | |
| "epoch": 1.588836695219674, | |
| "grad_norm": 0.3523275554180145, | |
| "learning_rate": 0.00025000568181818186, | |
| "loss": 1.8348, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 1.588836695219674, | |
| "eval_loss": 2.211845874786377, | |
| "eval_runtime": 932.6736, | |
| "eval_samples_per_second": 163.377, | |
| "eval_steps_per_second": 1.634, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 1.5922906880353689, | |
| "grad_norm": 0.3533009886741638, | |
| "learning_rate": 0.0002494375, | |
| "loss": 1.8324, | |
| "step": 46100 | |
| }, | |
| { | |
| "epoch": 1.5957446808510638, | |
| "grad_norm": 0.35436585545539856, | |
| "learning_rate": 0.00024886931818181817, | |
| "loss": 1.8329, | |
| "step": 46200 | |
| }, | |
| { | |
| "epoch": 1.5991986736667587, | |
| "grad_norm": 0.35463017225265503, | |
| "learning_rate": 0.0002483011363636364, | |
| "loss": 1.848, | |
| "step": 46300 | |
| }, | |
| { | |
| "epoch": 1.6026526664824536, | |
| "grad_norm": 0.33948197960853577, | |
| "learning_rate": 0.0002477329545454546, | |
| "loss": 1.8416, | |
| "step": 46400 | |
| }, | |
| { | |
| "epoch": 1.6061066592981486, | |
| "grad_norm": 0.3487997353076935, | |
| "learning_rate": 0.0002471647727272727, | |
| "loss": 1.8331, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 1.6095606521138435, | |
| "grad_norm": 0.3553692698478699, | |
| "learning_rate": 0.00024659659090909094, | |
| "loss": 1.8443, | |
| "step": 46600 | |
| }, | |
| { | |
| "epoch": 1.6130146449295384, | |
| "grad_norm": 0.3699355721473694, | |
| "learning_rate": 0.0002460284090909091, | |
| "loss": 1.8396, | |
| "step": 46700 | |
| }, | |
| { | |
| "epoch": 1.6164686377452335, | |
| "grad_norm": 0.33341851830482483, | |
| "learning_rate": 0.00024546022727272725, | |
| "loss": 1.8266, | |
| "step": 46800 | |
| }, | |
| { | |
| "epoch": 1.6199226305609284, | |
| "grad_norm": 0.3703523874282837, | |
| "learning_rate": 0.0002448920454545455, | |
| "loss": 1.8356, | |
| "step": 46900 | |
| }, | |
| { | |
| "epoch": 1.6233766233766234, | |
| "grad_norm": 0.34331998229026794, | |
| "learning_rate": 0.00024432386363636366, | |
| "loss": 1.8506, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 1.6233766233766234, | |
| "eval_loss": 2.201261281967163, | |
| "eval_runtime": 932.8465, | |
| "eval_samples_per_second": 163.346, | |
| "eval_steps_per_second": 1.634, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 1.6268306161923183, | |
| "grad_norm": 0.3524048924446106, | |
| "learning_rate": 0.00024375568181818184, | |
| "loss": 1.8276, | |
| "step": 47100 | |
| }, | |
| { | |
| "epoch": 1.6302846090080134, | |
| "grad_norm": 0.6397112607955933, | |
| "learning_rate": 0.0002431875, | |
| "loss": 1.8358, | |
| "step": 47200 | |
| }, | |
| { | |
| "epoch": 1.6337386018237083, | |
| "grad_norm": 0.3624354600906372, | |
| "learning_rate": 0.00024261931818181818, | |
| "loss": 1.819, | |
| "step": 47300 | |
| }, | |
| { | |
| "epoch": 1.6371925946394033, | |
| "grad_norm": 0.3678456246852875, | |
| "learning_rate": 0.00024205113636363638, | |
| "loss": 1.8151, | |
| "step": 47400 | |
| }, | |
| { | |
| "epoch": 1.6406465874550982, | |
| "grad_norm": 0.38248035311698914, | |
| "learning_rate": 0.00024148295454545454, | |
| "loss": 1.8303, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 1.644100580270793, | |
| "grad_norm": 0.36703070998191833, | |
| "learning_rate": 0.00024091477272727272, | |
| "loss": 1.8375, | |
| "step": 47600 | |
| }, | |
| { | |
| "epoch": 1.647554573086488, | |
| "grad_norm": 0.34606924653053284, | |
| "learning_rate": 0.00024034659090909092, | |
| "loss": 1.8261, | |
| "step": 47700 | |
| }, | |
| { | |
| "epoch": 1.651008565902183, | |
| "grad_norm": 0.35459455847740173, | |
| "learning_rate": 0.00023977840909090908, | |
| "loss": 1.8541, | |
| "step": 47800 | |
| }, | |
| { | |
| "epoch": 1.6544625587178778, | |
| "grad_norm": 0.35106080770492554, | |
| "learning_rate": 0.00023921022727272728, | |
| "loss": 1.8434, | |
| "step": 47900 | |
| }, | |
| { | |
| "epoch": 1.6579165515335728, | |
| "grad_norm": 0.3380804657936096, | |
| "learning_rate": 0.00023864204545454547, | |
| "loss": 1.8323, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 1.6579165515335728, | |
| "eval_loss": 2.1915159225463867, | |
| "eval_runtime": 932.8372, | |
| "eval_samples_per_second": 163.348, | |
| "eval_steps_per_second": 1.634, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 1.6613705443492677, | |
| "grad_norm": 0.36180025339126587, | |
| "learning_rate": 0.00023807386363636362, | |
| "loss": 1.8347, | |
| "step": 48100 | |
| }, | |
| { | |
| "epoch": 1.6648245371649626, | |
| "grad_norm": 0.33836793899536133, | |
| "learning_rate": 0.00023750568181818183, | |
| "loss": 1.8169, | |
| "step": 48200 | |
| }, | |
| { | |
| "epoch": 1.6682785299806575, | |
| "grad_norm": 0.34874165058135986, | |
| "learning_rate": 0.0002369375, | |
| "loss": 1.8206, | |
| "step": 48300 | |
| }, | |
| { | |
| "epoch": 1.6717325227963524, | |
| "grad_norm": 0.3255716562271118, | |
| "learning_rate": 0.0002363693181818182, | |
| "loss": 1.8319, | |
| "step": 48400 | |
| }, | |
| { | |
| "epoch": 1.6751865156120476, | |
| "grad_norm": 0.3886810839176178, | |
| "learning_rate": 0.00023580113636363637, | |
| "loss": 1.8208, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 1.6786405084277425, | |
| "grad_norm": 0.38673707842826843, | |
| "learning_rate": 0.00023523295454545455, | |
| "loss": 1.8294, | |
| "step": 48600 | |
| }, | |
| { | |
| "epoch": 1.6820945012434374, | |
| "grad_norm": 0.3884912431240082, | |
| "learning_rate": 0.00023466477272727273, | |
| "loss": 1.8137, | |
| "step": 48700 | |
| }, | |
| { | |
| "epoch": 1.6855484940591323, | |
| "grad_norm": 0.35155996680259705, | |
| "learning_rate": 0.0002340965909090909, | |
| "loss": 1.8135, | |
| "step": 48800 | |
| }, | |
| { | |
| "epoch": 1.6890024868748275, | |
| "grad_norm": 0.34583061933517456, | |
| "learning_rate": 0.0002335284090909091, | |
| "loss": 1.8125, | |
| "step": 48900 | |
| }, | |
| { | |
| "epoch": 1.6924564796905224, | |
| "grad_norm": 0.3412420451641083, | |
| "learning_rate": 0.00023296022727272727, | |
| "loss": 1.8238, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 1.6924564796905224, | |
| "eval_loss": 2.1860053539276123, | |
| "eval_runtime": 932.5574, | |
| "eval_samples_per_second": 163.397, | |
| "eval_steps_per_second": 1.634, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 1.6959104725062173, | |
| "grad_norm": 0.36108842492103577, | |
| "learning_rate": 0.00023239204545454545, | |
| "loss": 1.8195, | |
| "step": 49100 | |
| }, | |
| { | |
| "epoch": 1.6993644653219122, | |
| "grad_norm": 0.3617706000804901, | |
| "learning_rate": 0.00023182386363636366, | |
| "loss": 1.8032, | |
| "step": 49200 | |
| }, | |
| { | |
| "epoch": 1.7028184581376071, | |
| "grad_norm": 0.36145681142807007, | |
| "learning_rate": 0.00023125568181818184, | |
| "loss": 1.8441, | |
| "step": 49300 | |
| }, | |
| { | |
| "epoch": 1.706272450953302, | |
| "grad_norm": 0.3923262059688568, | |
| "learning_rate": 0.0002306875, | |
| "loss": 1.8136, | |
| "step": 49400 | |
| }, | |
| { | |
| "epoch": 1.709726443768997, | |
| "grad_norm": 0.3287799656391144, | |
| "learning_rate": 0.0002301193181818182, | |
| "loss": 1.8211, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 1.7131804365846919, | |
| "grad_norm": 0.35752880573272705, | |
| "learning_rate": 0.00022955113636363638, | |
| "loss": 1.8108, | |
| "step": 49600 | |
| }, | |
| { | |
| "epoch": 1.7166344294003868, | |
| "grad_norm": 0.3737923204898834, | |
| "learning_rate": 0.00022898295454545456, | |
| "loss": 1.8033, | |
| "step": 49700 | |
| }, | |
| { | |
| "epoch": 1.7200884222160817, | |
| "grad_norm": 0.374796599149704, | |
| "learning_rate": 0.00022841477272727274, | |
| "loss": 1.8097, | |
| "step": 49800 | |
| }, | |
| { | |
| "epoch": 1.7235424150317766, | |
| "grad_norm": 0.386203408241272, | |
| "learning_rate": 0.00022784659090909092, | |
| "loss": 1.811, | |
| "step": 49900 | |
| }, | |
| { | |
| "epoch": 1.7269964078474715, | |
| "grad_norm": 0.3648054003715515, | |
| "learning_rate": 0.0002272784090909091, | |
| "loss": 1.8061, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 1.7269964078474715, | |
| "eval_loss": 2.1760547161102295, | |
| "eval_runtime": 932.5357, | |
| "eval_samples_per_second": 163.401, | |
| "eval_steps_per_second": 1.634, | |
| "step": 50000 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 90000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 5000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4.64606111435273e+18, | |
| "train_batch_size": 100, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
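
A minimal sketch for inspecting a `trainer_state.json` like the one above offline, assuming the file sits in the checkpoint directory named in `best_model_checkpoint` (the path below is an assumption; adjust it to wherever the file actually lives). It splits `log_history` into training records (which carry `loss`) and evaluation records (which carry `eval_loss`), prints the headline fields, and sanity-checks that the constant learning-rate decrement seen in the log extrapolates to zero near `max_steps`.

```python
import json

# Assumed location of the state file shown above; adjust as needed.
STATE_PATH = "./sky-389m-tx-project/checkpoint-50000/trainer_state.json"

with open(STATE_PATH) as f:
    state = json.load(f)

# log_history interleaves training records (with "loss") and eval
# records (with "eval_loss"); separate them before summarizing.
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(f"best checkpoint : {state['best_model_checkpoint']}")
print(f"best eval_loss  : {state['best_metric']:.4f} at step {state['best_global_step']}")
print(f"progress        : step {state['global_step']}/{state['max_steps']}, "
      f"epoch {state['epoch']:.3f}/{state['num_train_epochs']}")

# Eval-loss trajectory for the last few checkpoints, paired with the
# learning rate logged at the same step (evals land on logging steps here).
lr_by_step = {e["step"]: e["learning_rate"] for e in train_logs}
for e in eval_logs[-5:]:
    print(f"step {e['step']:>6}  eval_loss={e['eval_loss']:.4f}"
          f"  lr={lr_by_step.get(e['step'], float('nan')):.2e}")

# Sanity-check the LR schedule: the logged decrement is constant
# (linear decay), so extrapolating to lr == 0 should land close to
# max_steps (90000 in this run).
(s0, lr0), (s1, lr1) = [(e["step"], e["learning_rate"]) for e in train_logs[-2:]]
slope = (lr1 - lr0) / (s1 - s0)   # ~ -5.68e-9 per step in this log
zero_step = s1 - lr1 / slope      # step at which lr extrapolates to zero
print(f"linear-decay slope ~ {slope:.3e}/step; lr reaches 0 near step {zero_step:.0f}")
```

On this file the sketch reports the best checkpoint at step 50000 with eval_loss 2.1760, shows eval_loss falling monotonically across the last checkpoints, and extrapolates the learning rate to zero at roughly step 90000, consistent with a linear decay schedule ending at `max_steps`.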