{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.8566756864731733,
"eval_steps": 500,
"global_step": 39000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001098302162145094,
"grad_norm": 1.8242720365524292,
"learning_rate": 1e-05,
"loss": 10.2008,
"num_input_tokens_seen": 52428800,
"step": 50
},
{
"epoch": 0.002196604324290188,
"grad_norm": 1.7588739395141602,
"learning_rate": 2e-05,
"loss": 9.6579,
"num_input_tokens_seen": 104857600,
"step": 100
},
{
"epoch": 0.003294906486435282,
"grad_norm": 1.4990028142929077,
"learning_rate": 3e-05,
"loss": 8.9277,
"num_input_tokens_seen": 157286400,
"step": 150
},
{
"epoch": 0.004393208648580376,
"grad_norm": 0.9206030368804932,
"learning_rate": 4e-05,
"loss": 8.0739,
"num_input_tokens_seen": 209715200,
"step": 200
},
{
"epoch": 0.00549151081072547,
"grad_norm": 0.4887239933013916,
"learning_rate": 5e-05,
"loss": 7.406,
"num_input_tokens_seen": 262144000,
"step": 250
},
{
"epoch": 0.006589812972870564,
"grad_norm": 0.7044657468795776,
"learning_rate": 6e-05,
"loss": 6.9708,
"num_input_tokens_seen": 314572800,
"step": 300
},
{
"epoch": 0.007688115135015658,
"grad_norm": 0.9420009255409241,
"learning_rate": 7.000000000000001e-05,
"loss": 6.6177,
"num_input_tokens_seen": 367001600,
"step": 350
},
{
"epoch": 0.008786417297160752,
"grad_norm": 1.4098254442214966,
"learning_rate": 8e-05,
"loss": 6.3285,
"num_input_tokens_seen": 419430400,
"step": 400
},
{
"epoch": 0.009884719459305845,
"grad_norm": 0.5596774220466614,
"learning_rate": 8.999999999999999e-05,
"loss": 6.0918,
"num_input_tokens_seen": 471859200,
"step": 450
},
{
"epoch": 0.01098302162145094,
"grad_norm": 0.5934723615646362,
"learning_rate": 0.0001,
"loss": 5.8926,
"num_input_tokens_seen": 524288000,
"step": 500
},
{
"epoch": 0.01098302162145094,
"eval_loss": 5.72648286819458,
"eval_runtime": 65.255,
"eval_samples_per_second": 76.622,
"eval_steps_per_second": 19.156,
"num_input_tokens_seen": 524288000,
"step": 500
},
{
"epoch": 0.012081323783596033,
"grad_norm": 0.6180132627487183,
"learning_rate": 0.00011,
"loss": 5.713,
"num_input_tokens_seen": 576716800,
"step": 550
},
{
"epoch": 0.013179625945741128,
"grad_norm": 0.7194430232048035,
"learning_rate": 0.00012,
"loss": 5.5604,
"num_input_tokens_seen": 629145600,
"step": 600
},
{
"epoch": 0.014277928107886221,
"grad_norm": 0.7763974070549011,
"learning_rate": 0.00013000000000000002,
"loss": 5.4212,
"num_input_tokens_seen": 681574400,
"step": 650
},
{
"epoch": 0.015376230270031316,
"grad_norm": 0.7948254942893982,
"learning_rate": 0.00014000000000000001,
"loss": 5.2875,
"num_input_tokens_seen": 734003200,
"step": 700
},
{
"epoch": 0.01647453243217641,
"grad_norm": 0.7185749411582947,
"learning_rate": 0.00015,
"loss": 5.1765,
"num_input_tokens_seen": 786432000,
"step": 750
},
{
"epoch": 0.017572834594321504,
"grad_norm": 0.673218846321106,
"learning_rate": 0.00016,
"loss": 5.0599,
"num_input_tokens_seen": 838860800,
"step": 800
},
{
"epoch": 0.018671136756466596,
"grad_norm": 0.6499584317207336,
"learning_rate": 0.00017,
"loss": 4.9475,
"num_input_tokens_seen": 891289600,
"step": 850
},
{
"epoch": 0.01976943891861169,
"grad_norm": 0.9044798016548157,
"learning_rate": 0.00017999999999999998,
"loss": 4.8334,
"num_input_tokens_seen": 943718400,
"step": 900
},
{
"epoch": 0.020867741080756785,
"grad_norm": 0.886431872844696,
"learning_rate": 0.00019,
"loss": 4.723,
"num_input_tokens_seen": 996147200,
"step": 950
},
{
"epoch": 0.02196604324290188,
"grad_norm": 0.6721145510673523,
"learning_rate": 0.0002,
"loss": 4.6106,
"num_input_tokens_seen": 1048576000,
"step": 1000
},
{
"epoch": 0.02196604324290188,
"eval_loss": 4.456684589385986,
"eval_runtime": 66.2606,
"eval_samples_per_second": 75.46,
"eval_steps_per_second": 18.865,
"num_input_tokens_seen": 1048576000,
"step": 1000
},
{
"epoch": 0.02306434540504697,
"grad_norm": 0.6067565083503723,
"learning_rate": 0.00021,
"loss": 4.5355,
"num_input_tokens_seen": 1101004800,
"step": 1050
},
{
"epoch": 0.024162647567192067,
"grad_norm": 0.6668316721916199,
"learning_rate": 0.00022,
"loss": 4.4383,
"num_input_tokens_seen": 1153433600,
"step": 1100
},
{
"epoch": 0.02526094972933716,
"grad_norm": 0.3714616000652313,
"learning_rate": 0.00023,
"loss": 4.3538,
"num_input_tokens_seen": 1205862400,
"step": 1150
},
{
"epoch": 0.026359251891482256,
"grad_norm": 0.439012348651886,
"learning_rate": 0.00024,
"loss": 4.2848,
"num_input_tokens_seen": 1258291200,
"step": 1200
},
{
"epoch": 0.027457554053627348,
"grad_norm": 0.5026286840438843,
"learning_rate": 0.00025,
"loss": 4.2181,
"num_input_tokens_seen": 1310720000,
"step": 1250
},
{
"epoch": 0.028555856215772443,
"grad_norm": 0.4865541160106659,
"learning_rate": 0.00026000000000000003,
"loss": 4.1495,
"num_input_tokens_seen": 1363148800,
"step": 1300
},
{
"epoch": 0.029654158377917537,
"grad_norm": 0.5259677767753601,
"learning_rate": 0.00027,
"loss": 4.0873,
"num_input_tokens_seen": 1415577600,
"step": 1350
},
{
"epoch": 0.030752460540062632,
"grad_norm": 0.4151704013347626,
"learning_rate": 0.00028000000000000003,
"loss": 4.0369,
"num_input_tokens_seen": 1468006400,
"step": 1400
},
{
"epoch": 0.03185076270220773,
"grad_norm": 0.5806245803833008,
"learning_rate": 0.00029,
"loss": 3.9881,
"num_input_tokens_seen": 1520435200,
"step": 1450
},
{
"epoch": 0.03294906486435282,
"grad_norm": 0.46140730381011963,
"learning_rate": 0.0003,
"loss": 3.9311,
"num_input_tokens_seen": 1572864000,
"step": 1500
},
{
"epoch": 0.03294906486435282,
"eval_loss": 3.8112432956695557,
"eval_runtime": 65.8947,
"eval_samples_per_second": 75.879,
"eval_steps_per_second": 18.97,
"num_input_tokens_seen": 1572864000,
"step": 1500
},
{
"epoch": 0.03404736702649791,
"grad_norm": 0.4219188392162323,
"learning_rate": 0.00031,
"loss": 3.8972,
"num_input_tokens_seen": 1625292800,
"step": 1550
},
{
"epoch": 0.03514566918864301,
"grad_norm": 0.3506027162075043,
"learning_rate": 0.00032,
"loss": 3.8596,
"num_input_tokens_seen": 1677721600,
"step": 1600
},
{
"epoch": 0.0362439713507881,
"grad_norm": 0.5210819840431213,
"learning_rate": 0.00033,
"loss": 3.8182,
"num_input_tokens_seen": 1730150400,
"step": 1650
},
{
"epoch": 0.03734227351293319,
"grad_norm": 0.5830159783363342,
"learning_rate": 0.00034,
"loss": 3.7766,
"num_input_tokens_seen": 1782579200,
"step": 1700
},
{
"epoch": 0.03844057567507829,
"grad_norm": 0.4602348804473877,
"learning_rate": 0.00035,
"loss": 3.7362,
"num_input_tokens_seen": 1835008000,
"step": 1750
},
{
"epoch": 0.03953887783722338,
"grad_norm": 0.40075036883354187,
"learning_rate": 0.00035999999999999997,
"loss": 3.7136,
"num_input_tokens_seen": 1887436800,
"step": 1800
},
{
"epoch": 0.04063717999936848,
"grad_norm": 0.3893415629863739,
"learning_rate": 0.00037,
"loss": 3.6809,
"num_input_tokens_seen": 1939865600,
"step": 1850
},
{
"epoch": 0.04173548216151357,
"grad_norm": 0.2921469211578369,
"learning_rate": 0.00038,
"loss": 3.6565,
"num_input_tokens_seen": 1992294400,
"step": 1900
},
{
"epoch": 0.04283378432365866,
"grad_norm": 0.49007460474967957,
"learning_rate": 0.00039000000000000005,
"loss": 3.6215,
"num_input_tokens_seen": 2044723200,
"step": 1950
},
{
"epoch": 0.04393208648580376,
"grad_norm": 0.2980474531650543,
"learning_rate": 0.0004,
"loss": 3.591,
"num_input_tokens_seen": 2097152000,
"step": 2000
},
{
"epoch": 0.04393208648580376,
"eval_loss": 3.4769670963287354,
"eval_runtime": 62.8853,
"eval_samples_per_second": 79.51,
"eval_steps_per_second": 19.877,
"num_input_tokens_seen": 2097152000,
"step": 2000
},
{
"epoch": 0.04503038864794885,
"grad_norm": 0.33002936840057373,
"learning_rate": 0.00041,
"loss": 3.5684,
"num_input_tokens_seen": 2149580800,
"step": 2050
},
{
"epoch": 0.04612869081009394,
"grad_norm": 0.43806758522987366,
"learning_rate": 0.00042,
"loss": 3.5436,
"num_input_tokens_seen": 2202009600,
"step": 2100
},
{
"epoch": 0.04722699297223904,
"grad_norm": 0.32842758297920227,
"learning_rate": 0.00043,
"loss": 3.5191,
"num_input_tokens_seen": 2254438400,
"step": 2150
},
{
"epoch": 0.04832529513438413,
"grad_norm": 0.3068505525588989,
"learning_rate": 0.00044,
"loss": 3.5009,
"num_input_tokens_seen": 2306867200,
"step": 2200
},
{
"epoch": 0.049423597296529224,
"grad_norm": 0.2950410544872284,
"learning_rate": 0.00045000000000000004,
"loss": 3.4796,
"num_input_tokens_seen": 2359296000,
"step": 2250
},
{
"epoch": 0.05052189945867432,
"grad_norm": 0.29731425642967224,
"learning_rate": 0.00046,
"loss": 3.4583,
"num_input_tokens_seen": 2411724800,
"step": 2300
},
{
"epoch": 0.051620201620819414,
"grad_norm": 0.2702693045139313,
"learning_rate": 0.00047,
"loss": 3.4385,
"num_input_tokens_seen": 2464153600,
"step": 2350
},
{
"epoch": 0.05271850378296451,
"grad_norm": 0.2418452948331833,
"learning_rate": 0.00048,
"loss": 3.4244,
"num_input_tokens_seen": 2516582400,
"step": 2400
},
{
"epoch": 0.053816805945109604,
"grad_norm": 0.28668686747550964,
"learning_rate": 0.00049,
"loss": 3.3977,
"num_input_tokens_seen": 2569011200,
"step": 2450
},
{
"epoch": 0.054915108107254695,
"grad_norm": 0.3115544319152832,
"learning_rate": 0.0005,
"loss": 3.3881,
"num_input_tokens_seen": 2621440000,
"step": 2500
},
{
"epoch": 0.054915108107254695,
"eval_loss": 3.2789928913116455,
"eval_runtime": 62.6749,
"eval_samples_per_second": 79.777,
"eval_steps_per_second": 19.944,
"num_input_tokens_seen": 2621440000,
"step": 2500
},
{
"epoch": 0.056013410269399794,
"grad_norm": 0.32340022921562195,
"learning_rate": 0.00051,
"loss": 3.3667,
"num_input_tokens_seen": 2673868800,
"step": 2550
},
{
"epoch": 0.057111712431544885,
"grad_norm": 0.2612442970275879,
"learning_rate": 0.0005200000000000001,
"loss": 3.3612,
"num_input_tokens_seen": 2726297600,
"step": 2600
},
{
"epoch": 0.05821001459368998,
"grad_norm": 0.29934820532798767,
"learning_rate": 0.0005300000000000001,
"loss": 3.3386,
"num_input_tokens_seen": 2778726400,
"step": 2650
},
{
"epoch": 0.059308316755835075,
"grad_norm": 0.2737022042274475,
"learning_rate": 0.00054,
"loss": 3.3274,
"num_input_tokens_seen": 2831155200,
"step": 2700
},
{
"epoch": 0.060406618917980166,
"grad_norm": 0.2101408988237381,
"learning_rate": 0.00055,
"loss": 3.3153,
"num_input_tokens_seen": 2883584000,
"step": 2750
},
{
"epoch": 0.061504921080125265,
"grad_norm": 0.3240911066532135,
"learning_rate": 0.0005600000000000001,
"loss": 3.2978,
"num_input_tokens_seen": 2936012800,
"step": 2800
},
{
"epoch": 0.06260322324227036,
"grad_norm": 0.20592735707759857,
"learning_rate": 0.00057,
"loss": 3.2984,
"num_input_tokens_seen": 2988441600,
"step": 2850
},
{
"epoch": 0.06370152540441545,
"grad_norm": 0.263443261384964,
"learning_rate": 0.00058,
"loss": 3.2706,
"num_input_tokens_seen": 3040870400,
"step": 2900
},
{
"epoch": 0.06479982756656054,
"grad_norm": 0.24249990284442902,
"learning_rate": 0.00059,
"loss": 3.2673,
"num_input_tokens_seen": 3093299200,
"step": 2950
},
{
"epoch": 0.06589812972870564,
"grad_norm": 0.25961214303970337,
"learning_rate": 0.0006,
"loss": 3.2512,
"num_input_tokens_seen": 3145728000,
"step": 3000
},
{
"epoch": 0.06589812972870564,
"eval_loss": 3.150442600250244,
"eval_runtime": 65.9549,
"eval_samples_per_second": 75.809,
"eval_steps_per_second": 18.952,
"num_input_tokens_seen": 3145728000,
"step": 3000
},
{
"epoch": 0.06699643189085074,
"grad_norm": 0.21884848177433014,
"learning_rate": 0.00061,
"loss": 3.2437,
"num_input_tokens_seen": 3198156800,
"step": 3050
},
{
"epoch": 0.06809473405299582,
"grad_norm": 0.2534893751144409,
"learning_rate": 0.00062,
"loss": 3.2366,
"num_input_tokens_seen": 3250585600,
"step": 3100
},
{
"epoch": 0.06919303621514092,
"grad_norm": 0.2408875823020935,
"learning_rate": 0.00063,
"loss": 3.2264,
"num_input_tokens_seen": 3303014400,
"step": 3150
},
{
"epoch": 0.07029133837728602,
"grad_norm": 0.22240856289863586,
"learning_rate": 0.00064,
"loss": 3.2102,
"num_input_tokens_seen": 3355443200,
"step": 3200
},
{
"epoch": 0.0713896405394311,
"grad_norm": 0.21527299284934998,
"learning_rate": 0.0006500000000000001,
"loss": 3.1985,
"num_input_tokens_seen": 3407872000,
"step": 3250
},
{
"epoch": 0.0724879427015762,
"grad_norm": 0.26642242074012756,
"learning_rate": 0.00066,
"loss": 3.1923,
"num_input_tokens_seen": 3460300800,
"step": 3300
},
{
"epoch": 0.0735862448637213,
"grad_norm": 0.22164040803909302,
"learning_rate": 0.00067,
"loss": 3.1848,
"num_input_tokens_seen": 3512729600,
"step": 3350
},
{
"epoch": 0.07468454702586638,
"grad_norm": 0.21594341099262238,
"learning_rate": 0.00068,
"loss": 3.1764,
"num_input_tokens_seen": 3565158400,
"step": 3400
},
{
"epoch": 0.07578284918801148,
"grad_norm": 0.1921539604663849,
"learning_rate": 0.00069,
"loss": 3.1643,
"num_input_tokens_seen": 3617587200,
"step": 3450
},
{
"epoch": 0.07688115135015658,
"grad_norm": 0.2266080528497696,
"learning_rate": 0.0007,
"loss": 3.1647,
"num_input_tokens_seen": 3670016000,
"step": 3500
},
{
"epoch": 0.07688115135015658,
"eval_loss": 3.061373472213745,
"eval_runtime": 63.388,
"eval_samples_per_second": 78.879,
"eval_steps_per_second": 19.72,
"num_input_tokens_seen": 3670016000,
"step": 3500
},
{
"epoch": 0.07797945351230168,
"grad_norm": 0.19900226593017578,
"learning_rate": 0.00071,
"loss": 3.1557,
"num_input_tokens_seen": 3722444800,
"step": 3550
},
{
"epoch": 0.07907775567444676,
"grad_norm": 0.20299012959003448,
"learning_rate": 0.0007199999999999999,
"loss": 3.1503,
"num_input_tokens_seen": 3774873600,
"step": 3600
},
{
"epoch": 0.08017605783659186,
"grad_norm": 0.232399120926857,
"learning_rate": 0.00073,
"loss": 3.1387,
"num_input_tokens_seen": 3827302400,
"step": 3650
},
{
"epoch": 0.08127435999873696,
"grad_norm": 0.2127719670534134,
"learning_rate": 0.00074,
"loss": 3.1388,
"num_input_tokens_seen": 3879731200,
"step": 3700
},
{
"epoch": 0.08237266216088204,
"grad_norm": 0.22336533665657043,
"learning_rate": 0.00075,
"loss": 3.1247,
"num_input_tokens_seen": 3932160000,
"step": 3750
},
{
"epoch": 0.08347096432302714,
"grad_norm": 0.18270662426948547,
"learning_rate": 0.00076,
"loss": 3.1192,
"num_input_tokens_seen": 3984588800,
"step": 3800
},
{
"epoch": 0.08456926648517224,
"grad_norm": 0.16843897104263306,
"learning_rate": 0.0007700000000000001,
"loss": 3.1153,
"num_input_tokens_seen": 4037017600,
"step": 3850
},
{
"epoch": 0.08566756864731732,
"grad_norm": 0.19947747886180878,
"learning_rate": 0.0007800000000000001,
"loss": 3.1048,
"num_input_tokens_seen": 4089446400,
"step": 3900
},
{
"epoch": 0.08676587080946242,
"grad_norm": 0.17078733444213867,
"learning_rate": 0.00079,
"loss": 3.1014,
"num_input_tokens_seen": 4141875200,
"step": 3950
},
{
"epoch": 0.08786417297160752,
"grad_norm": 0.22091113030910492,
"learning_rate": 0.0008,
"loss": 3.0982,
"num_input_tokens_seen": 4194304000,
"step": 4000
},
{
"epoch": 0.08786417297160752,
"eval_loss": 2.9978296756744385,
"eval_runtime": 65.6064,
"eval_samples_per_second": 76.212,
"eval_steps_per_second": 19.053,
"num_input_tokens_seen": 4194304000,
"step": 4000
},
{
"epoch": 0.0889624751337526,
"grad_norm": 0.1839856207370758,
"learning_rate": 0.0008100000000000001,
"loss": 3.0862,
"num_input_tokens_seen": 4246732800,
"step": 4050
},
{
"epoch": 0.0900607772958977,
"grad_norm": 0.17331145703792572,
"learning_rate": 0.00082,
"loss": 3.087,
"num_input_tokens_seen": 4299161600,
"step": 4100
},
{
"epoch": 0.0911590794580428,
"grad_norm": 0.18384258449077606,
"learning_rate": 0.00083,
"loss": 3.076,
"num_input_tokens_seen": 4351590400,
"step": 4150
},
{
"epoch": 0.09225738162018789,
"grad_norm": 0.17061170935630798,
"learning_rate": 0.00084,
"loss": 3.0693,
"num_input_tokens_seen": 4404019200,
"step": 4200
},
{
"epoch": 0.09335568378233298,
"grad_norm": 0.18157647550106049,
"learning_rate": 0.00085,
"loss": 3.0698,
"num_input_tokens_seen": 4456448000,
"step": 4250
},
{
"epoch": 0.09445398594447808,
"grad_norm": 0.15678547322750092,
"learning_rate": 0.00086,
"loss": 3.064,
"num_input_tokens_seen": 4508876800,
"step": 4300
},
{
"epoch": 0.09555228810662317,
"grad_norm": 0.19118325412273407,
"learning_rate": 0.00087,
"loss": 3.0541,
"num_input_tokens_seen": 4561305600,
"step": 4350
},
{
"epoch": 0.09665059026876827,
"grad_norm": 0.17620691657066345,
"learning_rate": 0.00088,
"loss": 3.0532,
"num_input_tokens_seen": 4613734400,
"step": 4400
},
{
"epoch": 0.09774889243091336,
"grad_norm": 0.17351101338863373,
"learning_rate": 0.0008900000000000001,
"loss": 3.0549,
"num_input_tokens_seen": 4666163200,
"step": 4450
},
{
"epoch": 0.09884719459305845,
"grad_norm": 0.15183581411838531,
"learning_rate": 0.0009000000000000001,
"loss": 3.0485,
"num_input_tokens_seen": 4718592000,
"step": 4500
},
{
"epoch": 0.09884719459305845,
"eval_loss": 2.9479379653930664,
"eval_runtime": 66.5611,
"eval_samples_per_second": 75.119,
"eval_steps_per_second": 18.78,
"num_input_tokens_seen": 4718592000,
"step": 4500
},
{
"epoch": 0.09994549675520355,
"grad_norm": 0.1681961864233017,
"learning_rate": 0.00091,
"loss": 3.0395,
"num_input_tokens_seen": 4771020800,
"step": 4550
},
{
"epoch": 0.10104379891734865,
"grad_norm": 0.17382557690143585,
"learning_rate": 0.00092,
"loss": 3.0371,
"num_input_tokens_seen": 4823449600,
"step": 4600
},
{
"epoch": 0.10214210107949374,
"grad_norm": 0.14377906918525696,
"learning_rate": 0.00093,
"loss": 3.0377,
"num_input_tokens_seen": 4875878400,
"step": 4650
},
{
"epoch": 0.10324040324163883,
"grad_norm": 0.1590214967727661,
"learning_rate": 0.00094,
"loss": 3.0305,
"num_input_tokens_seen": 4928307200,
"step": 4700
},
{
"epoch": 0.10433870540378393,
"grad_norm": 0.15563353896141052,
"learning_rate": 0.00095,
"loss": 3.0254,
"num_input_tokens_seen": 4980736000,
"step": 4750
},
{
"epoch": 0.10543700756592903,
"grad_norm": 0.16002103686332703,
"learning_rate": 0.00096,
"loss": 3.0222,
"num_input_tokens_seen": 5033164800,
"step": 4800
},
{
"epoch": 0.10653530972807411,
"grad_norm": 0.1406039148569107,
"learning_rate": 0.0009699999999999999,
"loss": 3.0185,
"num_input_tokens_seen": 5085593600,
"step": 4850
},
{
"epoch": 0.10763361189021921,
"grad_norm": 0.14609627425670624,
"learning_rate": 0.00098,
"loss": 3.0177,
"num_input_tokens_seen": 5138022400,
"step": 4900
},
{
"epoch": 0.1087319140523643,
"grad_norm": 0.16061657667160034,
"learning_rate": 0.00099,
"loss": 3.0137,
"num_input_tokens_seen": 5190451200,
"step": 4950
},
{
"epoch": 0.10983021621450939,
"grad_norm": 0.18423974514007568,
"learning_rate": 0.001,
"loss": 3.016,
"num_input_tokens_seen": 5242880000,
"step": 5000
},
{
"epoch": 0.10983021621450939,
"eval_loss": 2.9132862091064453,
"eval_runtime": 65.7163,
"eval_samples_per_second": 76.085,
"eval_steps_per_second": 19.021,
"num_input_tokens_seen": 5242880000,
"step": 5000
},
{
"epoch": 0.11092851837665449,
"grad_norm": 0.15302155911922455,
"learning_rate": 0.001,
"loss": 3.0037,
"num_input_tokens_seen": 5295308800,
"step": 5050
},
{
"epoch": 0.11202682053879959,
"grad_norm": 0.1474563181400299,
"learning_rate": 0.001,
"loss": 3.0063,
"num_input_tokens_seen": 5347737600,
"step": 5100
},
{
"epoch": 0.11312512270094467,
"grad_norm": 0.14318443834781647,
"learning_rate": 0.001,
"loss": 3.0011,
"num_input_tokens_seen": 5400166400,
"step": 5150
},
{
"epoch": 0.11422342486308977,
"grad_norm": 0.1521013379096985,
"learning_rate": 0.001,
"loss": 2.9946,
"num_input_tokens_seen": 5452595200,
"step": 5200
},
{
"epoch": 0.11532172702523487,
"grad_norm": 0.14434175193309784,
"learning_rate": 0.001,
"loss": 2.9909,
"num_input_tokens_seen": 5505024000,
"step": 5250
},
{
"epoch": 0.11642002918737995,
"grad_norm": 0.16284991800785065,
"learning_rate": 0.001,
"loss": 2.9846,
"num_input_tokens_seen": 5557452800,
"step": 5300
},
{
"epoch": 0.11751833134952505,
"grad_norm": 0.15281164646148682,
"learning_rate": 0.001,
"loss": 2.9843,
"num_input_tokens_seen": 5609881600,
"step": 5350
},
{
"epoch": 0.11861663351167015,
"grad_norm": 0.1227719634771347,
"learning_rate": 0.001,
"loss": 2.9778,
"num_input_tokens_seen": 5662310400,
"step": 5400
},
{
"epoch": 0.11971493567381523,
"grad_norm": 0.1346055269241333,
"learning_rate": 0.001,
"loss": 2.9745,
"num_input_tokens_seen": 5714739200,
"step": 5450
},
{
"epoch": 0.12081323783596033,
"grad_norm": 0.15828204154968262,
"learning_rate": 0.001,
"loss": 2.9723,
"num_input_tokens_seen": 5767168000,
"step": 5500
},
{
"epoch": 0.12081323783596033,
"eval_loss": 2.8801000118255615,
"eval_runtime": 65.3935,
"eval_samples_per_second": 76.46,
"eval_steps_per_second": 19.115,
"num_input_tokens_seen": 5767168000,
"step": 5500
},
{
"epoch": 0.12191153999810543,
"grad_norm": 0.1391400694847107,
"learning_rate": 0.001,
"loss": 2.9609,
"num_input_tokens_seen": 5819596800,
"step": 5550
},
{
"epoch": 0.12300984216025053,
"grad_norm": 0.14347107708454132,
"learning_rate": 0.001,
"loss": 2.9697,
"num_input_tokens_seen": 5872025600,
"step": 5600
},
{
"epoch": 0.12410814432239561,
"grad_norm": 0.13779127597808838,
"learning_rate": 0.001,
"loss": 2.9609,
"num_input_tokens_seen": 5924454400,
"step": 5650
},
{
"epoch": 0.1252064464845407,
"grad_norm": 0.13017955422401428,
"learning_rate": 0.001,
"loss": 2.9545,
"num_input_tokens_seen": 5976883200,
"step": 5700
},
{
"epoch": 0.1263047486466858,
"grad_norm": 0.12697578966617584,
"learning_rate": 0.001,
"loss": 2.9563,
"num_input_tokens_seen": 6029312000,
"step": 5750
},
{
"epoch": 0.1274030508088309,
"grad_norm": 0.15175020694732666,
"learning_rate": 0.001,
"loss": 2.9502,
"num_input_tokens_seen": 6081740800,
"step": 5800
},
{
"epoch": 0.12850135297097598,
"grad_norm": 0.1209852397441864,
"learning_rate": 0.001,
"loss": 2.9516,
"num_input_tokens_seen": 6134169600,
"step": 5850
},
{
"epoch": 0.12959965513312108,
"grad_norm": 0.16521666944026947,
"learning_rate": 0.001,
"loss": 2.9528,
"num_input_tokens_seen": 6186598400,
"step": 5900
},
{
"epoch": 0.13069795729526618,
"grad_norm": 0.12271756678819656,
"learning_rate": 0.001,
"loss": 2.9382,
"num_input_tokens_seen": 6239027200,
"step": 5950
},
{
"epoch": 0.13179625945741127,
"grad_norm": 0.1376461535692215,
"learning_rate": 0.001,
"loss": 2.9464,
"num_input_tokens_seen": 6291456000,
"step": 6000
},
{
"epoch": 0.13179625945741127,
"eval_loss": 2.84769606590271,
"eval_runtime": 65.8814,
"eval_samples_per_second": 75.894,
"eval_steps_per_second": 18.973,
"num_input_tokens_seen": 6291456000,
"step": 6000
},
{
"epoch": 0.13289456161955637,
"grad_norm": 0.11629872024059296,
"learning_rate": 0.001,
"loss": 2.9406,
"num_input_tokens_seen": 6343884800,
"step": 6050
},
{
"epoch": 0.13399286378170147,
"grad_norm": 0.13740529119968414,
"learning_rate": 0.001,
"loss": 2.9343,
"num_input_tokens_seen": 6396313600,
"step": 6100
},
{
"epoch": 0.13509116594384657,
"grad_norm": 0.11548039317131042,
"learning_rate": 0.001,
"loss": 2.9374,
"num_input_tokens_seen": 6448742400,
"step": 6150
},
{
"epoch": 0.13618946810599164,
"grad_norm": 0.11710146814584732,
"learning_rate": 0.001,
"loss": 2.9376,
"num_input_tokens_seen": 6501171200,
"step": 6200
},
{
"epoch": 0.13728777026813674,
"grad_norm": 0.11223472654819489,
"learning_rate": 0.001,
"loss": 2.9284,
"num_input_tokens_seen": 6553600000,
"step": 6250
},
{
"epoch": 0.13838607243028184,
"grad_norm": 0.12880656123161316,
"learning_rate": 0.001,
"loss": 2.9303,
"num_input_tokens_seen": 6606028800,
"step": 6300
},
{
"epoch": 0.13948437459242694,
"grad_norm": 0.11898139119148254,
"learning_rate": 0.001,
"loss": 2.9246,
"num_input_tokens_seen": 6658457600,
"step": 6350
},
{
"epoch": 0.14058267675457203,
"grad_norm": 0.11154898256063461,
"learning_rate": 0.001,
"loss": 2.9254,
"num_input_tokens_seen": 6710886400,
"step": 6400
},
{
"epoch": 0.14168097891671713,
"grad_norm": 0.12669232487678528,
"learning_rate": 0.001,
"loss": 2.9162,
"num_input_tokens_seen": 6763315200,
"step": 6450
},
{
"epoch": 0.1427792810788622,
"grad_norm": 0.12259842455387115,
"learning_rate": 0.001,
"loss": 2.9179,
"num_input_tokens_seen": 6815744000,
"step": 6500
},
{
"epoch": 0.1427792810788622,
"eval_loss": 2.8220207691192627,
"eval_runtime": 65.2868,
"eval_samples_per_second": 76.585,
"eval_steps_per_second": 19.146,
"num_input_tokens_seen": 6815744000,
"step": 6500
},
{
"epoch": 0.1438775832410073,
"grad_norm": 0.13403092324733734,
"learning_rate": 0.001,
"loss": 2.9102,
"num_input_tokens_seen": 6868172800,
"step": 6550
},
{
"epoch": 0.1449758854031524,
"grad_norm": 0.13063696026802063,
"learning_rate": 0.001,
"loss": 2.9112,
"num_input_tokens_seen": 6920601600,
"step": 6600
},
{
"epoch": 0.1460741875652975,
"grad_norm": 0.11871635168790817,
"learning_rate": 0.001,
"loss": 2.9085,
"num_input_tokens_seen": 6973030400,
"step": 6650
},
{
"epoch": 0.1471724897274426,
"grad_norm": 0.11007633060216904,
"learning_rate": 0.001,
"loss": 2.9098,
"num_input_tokens_seen": 7025459200,
"step": 6700
},
{
"epoch": 0.1482707918895877,
"grad_norm": 0.10521857440471649,
"learning_rate": 0.001,
"loss": 2.9086,
"num_input_tokens_seen": 7077888000,
"step": 6750
},
{
"epoch": 0.14936909405173276,
"grad_norm": 0.11179310083389282,
"learning_rate": 0.001,
"loss": 2.9066,
"num_input_tokens_seen": 7130316800,
"step": 6800
},
{
"epoch": 0.15046739621387786,
"grad_norm": 0.1192353144288063,
"learning_rate": 0.001,
"loss": 2.9135,
"num_input_tokens_seen": 7182745600,
"step": 6850
},
{
"epoch": 0.15156569837602296,
"grad_norm": 0.11084350198507309,
"learning_rate": 0.001,
"loss": 2.9054,
"num_input_tokens_seen": 7235174400,
"step": 6900
},
{
"epoch": 0.15266400053816806,
"grad_norm": 0.11826325207948685,
"learning_rate": 0.001,
"loss": 2.9054,
"num_input_tokens_seen": 7287603200,
"step": 6950
},
{
"epoch": 0.15376230270031316,
"grad_norm": 0.12597590684890747,
"learning_rate": 0.001,
"loss": 2.8945,
"num_input_tokens_seen": 7340032000,
"step": 7000
},
{
"epoch": 0.15376230270031316,
"eval_loss": 2.802734851837158,
"eval_runtime": 65.3332,
"eval_samples_per_second": 76.531,
"eval_steps_per_second": 19.133,
"num_input_tokens_seen": 7340032000,
"step": 7000
},
{
"epoch": 0.15486060486245826,
"grad_norm": 0.11222469806671143,
"learning_rate": 0.001,
"loss": 2.8997,
"num_input_tokens_seen": 7392460800,
"step": 7050
},
{
"epoch": 0.15595890702460335,
"grad_norm": 0.11488104611635208,
"learning_rate": 0.001,
"loss": 2.8965,
"num_input_tokens_seen": 7444889600,
"step": 7100
},
{
"epoch": 0.15705720918674843,
"grad_norm": 0.1285555213689804,
"learning_rate": 0.001,
"loss": 2.8909,
"num_input_tokens_seen": 7497318400,
"step": 7150
},
{
"epoch": 0.15815551134889352,
"grad_norm": 0.12659265100955963,
"learning_rate": 0.001,
"loss": 2.8833,
"num_input_tokens_seen": 7549747200,
"step": 7200
},
{
"epoch": 0.15925381351103862,
"grad_norm": 0.10823842883110046,
"learning_rate": 0.001,
"loss": 2.9031,
"num_input_tokens_seen": 7602176000,
"step": 7250
},
{
"epoch": 0.16035211567318372,
"grad_norm": 0.12597811222076416,
"learning_rate": 0.001,
"loss": 2.8831,
"num_input_tokens_seen": 7654604800,
"step": 7300
},
{
"epoch": 0.16145041783532882,
"grad_norm": 0.1285410374403,
"learning_rate": 0.001,
"loss": 2.8931,
"num_input_tokens_seen": 7707033600,
"step": 7350
},
{
"epoch": 0.16254871999747392,
"grad_norm": 0.11170299351215363,
"learning_rate": 0.001,
"loss": 2.8861,
"num_input_tokens_seen": 7759462400,
"step": 7400
},
{
"epoch": 0.163647022159619,
"grad_norm": 0.11146055907011032,
"learning_rate": 0.001,
"loss": 2.8756,
"num_input_tokens_seen": 7811891200,
"step": 7450
},
{
"epoch": 0.1647453243217641,
"grad_norm": 0.10750412940979004,
"learning_rate": 0.001,
"loss": 2.8808,
"num_input_tokens_seen": 7864320000,
"step": 7500
},
{
"epoch": 0.1647453243217641,
"eval_loss": 2.785506248474121,
"eval_runtime": 65.0661,
"eval_samples_per_second": 76.845,
"eval_steps_per_second": 19.211,
"num_input_tokens_seen": 7864320000,
"step": 7500
},
{
"epoch": 0.16584362648390918,
"grad_norm": 0.11221355944871902,
"learning_rate": 0.001,
"loss": 2.8834,
"num_input_tokens_seen": 7916748800,
"step": 7550
},
{
"epoch": 0.16694192864605428,
"grad_norm": 0.1089220717549324,
"learning_rate": 0.001,
"loss": 2.8796,
"num_input_tokens_seen": 7969177600,
"step": 7600
},
{
"epoch": 0.16804023080819938,
"grad_norm": 0.11125486344099045,
"learning_rate": 0.001,
"loss": 2.8836,
"num_input_tokens_seen": 8021606400,
"step": 7650
},
{
"epoch": 0.16913853297034448,
"grad_norm": 0.12804660201072693,
"learning_rate": 0.001,
"loss": 2.8754,
"num_input_tokens_seen": 8074035200,
"step": 7700
},
{
"epoch": 0.17023683513248955,
"grad_norm": 0.11395713686943054,
"learning_rate": 0.001,
"loss": 2.8736,
"num_input_tokens_seen": 8126464000,
"step": 7750
},
{
"epoch": 0.17133513729463465,
"grad_norm": 0.1095738559961319,
"learning_rate": 0.001,
"loss": 2.8743,
"num_input_tokens_seen": 8178892800,
"step": 7800
},
{
"epoch": 0.17243343945677975,
"grad_norm": 0.10545111447572708,
"learning_rate": 0.001,
"loss": 2.8718,
"num_input_tokens_seen": 8231321600,
"step": 7850
},
{
"epoch": 0.17353174161892485,
"grad_norm": 0.13135021924972534,
"learning_rate": 0.001,
"loss": 2.8648,
"num_input_tokens_seen": 8283750400,
"step": 7900
},
{
"epoch": 0.17463004378106994,
"grad_norm": 0.12348899990320206,
"learning_rate": 0.001,
"loss": 2.8628,
"num_input_tokens_seen": 8336179200,
"step": 7950
},
{
"epoch": 0.17572834594321504,
"grad_norm": 0.10604492574930191,
"learning_rate": 0.001,
"loss": 2.8676,
"num_input_tokens_seen": 8388608000,
"step": 8000
},
{
"epoch": 0.17572834594321504,
"eval_loss": 2.7698919773101807,
"eval_runtime": 65.5096,
"eval_samples_per_second": 76.325,
"eval_steps_per_second": 19.081,
"num_input_tokens_seen": 8388608000,
"step": 8000
},
{
"epoch": 0.17682664810536014,
"grad_norm": 0.12299258261919022,
"learning_rate": 0.001,
"loss": 2.8626,
"num_input_tokens_seen": 8441036800,
"step": 8050
},
{
"epoch": 0.1779249502675052,
"grad_norm": 0.11638012528419495,
"learning_rate": 0.001,
"loss": 2.864,
"num_input_tokens_seen": 8493465600,
"step": 8100
},
{
"epoch": 0.1790232524296503,
"grad_norm": 0.10978250205516815,
"learning_rate": 0.001,
"loss": 2.8589,
"num_input_tokens_seen": 8545894400,
"step": 8150
},
{
"epoch": 0.1801215545917954,
"grad_norm": 0.11229872703552246,
"learning_rate": 0.001,
"loss": 2.8671,
"num_input_tokens_seen": 8598323200,
"step": 8200
},
{
"epoch": 0.1812198567539405,
"grad_norm": 0.13177119195461273,
"learning_rate": 0.001,
"loss": 2.8524,
"num_input_tokens_seen": 8650752000,
"step": 8250
},
{
"epoch": 0.1823181589160856,
"grad_norm": 0.11021032929420471,
"learning_rate": 0.001,
"loss": 2.8552,
"num_input_tokens_seen": 8703180800,
"step": 8300
},
{
"epoch": 0.1834164610782307,
"grad_norm": 0.11381058394908905,
"learning_rate": 0.001,
"loss": 2.8529,
"num_input_tokens_seen": 8755609600,
"step": 8350
},
{
"epoch": 0.18451476324037577,
"grad_norm": 0.10889217257499695,
"learning_rate": 0.001,
"loss": 2.8581,
"num_input_tokens_seen": 8808038400,
"step": 8400
},
{
"epoch": 0.18561306540252087,
"grad_norm": 0.13519708812236786,
"learning_rate": 0.001,
"loss": 2.8518,
"num_input_tokens_seen": 8860467200,
"step": 8450
},
{
"epoch": 0.18671136756466597,
"grad_norm": 0.1265636533498764,
"learning_rate": 0.001,
"loss": 2.8452,
"num_input_tokens_seen": 8912896000,
"step": 8500
},
{
"epoch": 0.18671136756466597,
"eval_loss": 2.754452705383301,
"eval_runtime": 65.4439,
"eval_samples_per_second": 76.401,
"eval_steps_per_second": 19.1,
"num_input_tokens_seen": 8912896000,
"step": 8500
},
{
"epoch": 0.18780966972681107,
"grad_norm": 0.12250006198883057,
"learning_rate": 0.001,
"loss": 2.8506,
"num_input_tokens_seen": 8965324800,
"step": 8550
},
{
"epoch": 0.18890797188895617,
"grad_norm": 0.1371607929468155,
"learning_rate": 0.001,
"loss": 2.8472,
"num_input_tokens_seen": 9017753600,
"step": 8600
},
{
"epoch": 0.19000627405110126,
"grad_norm": 0.11844755709171295,
"learning_rate": 0.001,
"loss": 2.8492,
"num_input_tokens_seen": 9070182400,
"step": 8650
},
{
"epoch": 0.19110457621324634,
"grad_norm": 0.38294216990470886,
"learning_rate": 0.001,
"loss": 6.3226,
"num_input_tokens_seen": 9122611200,
"step": 8700
},
{
"epoch": 0.19220287837539143,
"grad_norm": 0.44077590107917786,
"learning_rate": 0.001,
"loss": 6.7001,
"num_input_tokens_seen": 9175040000,
"step": 8750
},
{
"epoch": 0.19330118053753653,
"grad_norm": 0.4238772392272949,
"learning_rate": 0.001,
"loss": 5.8714,
"num_input_tokens_seen": 9227468800,
"step": 8800
},
{
"epoch": 0.19439948269968163,
"grad_norm": 0.2830688953399658,
"learning_rate": 0.001,
"loss": 4.8951,
"num_input_tokens_seen": 9279897600,
"step": 8850
},
{
"epoch": 0.19549778486182673,
"grad_norm": 0.2485039383172989,
"learning_rate": 0.001,
"loss": 3.928,
"num_input_tokens_seen": 9332326400,
"step": 8900
},
{
"epoch": 0.19659608702397183,
"grad_norm": 0.20515842735767365,
"learning_rate": 0.001,
"loss": 3.4277,
"num_input_tokens_seen": 9384755200,
"step": 8950
},
{
"epoch": 0.1976943891861169,
"grad_norm": 0.13605651259422302,
"learning_rate": 0.001,
"loss": 3.2263,
"num_input_tokens_seen": 9437184000,
"step": 9000
},
{
"epoch": 0.1976943891861169,
"eval_loss": 3.014314889907837,
"eval_runtime": 65.8851,
"eval_samples_per_second": 75.89,
"eval_steps_per_second": 18.972,
"num_input_tokens_seen": 9437184000,
"step": 9000
},
{
"epoch": 0.198792691348262,
"grad_norm": 0.17666102945804596,
"learning_rate": 0.001,
"loss": 3.0728,
"num_input_tokens_seen": 9489612800,
"step": 9050
},
{
"epoch": 0.1998909935104071,
"grad_norm": 0.202484592795372,
"learning_rate": 0.001,
"loss": 2.9818,
"num_input_tokens_seen": 9542041600,
"step": 9100
},
{
"epoch": 0.2009892956725522,
"grad_norm": 0.15095236897468567,
"learning_rate": 0.001,
"loss": 2.9423,
"num_input_tokens_seen": 9594470400,
"step": 9150
},
{
"epoch": 0.2020875978346973,
"grad_norm": 0.13089850544929504,
"learning_rate": 0.001,
"loss": 2.9227,
"num_input_tokens_seen": 9646899200,
"step": 9200
},
{
"epoch": 0.2031858999968424,
"grad_norm": 0.14022304117679596,
"learning_rate": 0.001,
"loss": 2.8988,
"num_input_tokens_seen": 9699328000,
"step": 9250
},
{
"epoch": 0.2042842021589875,
"grad_norm": 0.13116785883903503,
"learning_rate": 0.001,
"loss": 2.8716,
"num_input_tokens_seen": 9751756800,
"step": 9300
},
{
"epoch": 0.20538250432113256,
"grad_norm": 0.1395471841096878,
"learning_rate": 0.001,
"loss": 2.8727,
"num_input_tokens_seen": 9804185600,
"step": 9350
},
{
"epoch": 0.20648080648327766,
"grad_norm": 0.1271878033876419,
"learning_rate": 0.001,
"loss": 2.864,
"num_input_tokens_seen": 9856614400,
"step": 9400
},
{
"epoch": 0.20757910864542276,
"grad_norm": 0.14148685336112976,
"learning_rate": 0.001,
"loss": 2.8604,
"num_input_tokens_seen": 9909043200,
"step": 9450
},
{
"epoch": 0.20867741080756785,
"grad_norm": 0.1292584091424942,
"learning_rate": 0.001,
"loss": 2.8547,
"num_input_tokens_seen": 9961472000,
"step": 9500
},
{
"epoch": 0.20867741080756785,
"eval_loss": 2.756131649017334,
"eval_runtime": 65.0495,
"eval_samples_per_second": 76.865,
"eval_steps_per_second": 19.216,
"num_input_tokens_seen": 9961472000,
"step": 9500
},
{
"epoch": 0.20977571296971295,
"grad_norm": 0.10929372161626816,
"learning_rate": 0.001,
"loss": 2.8467,
"num_input_tokens_seen": 10013900800,
"step": 9550
},
{
"epoch": 0.21087401513185805,
"grad_norm": 0.1180899515748024,
"learning_rate": 0.001,
"loss": 2.8501,
"num_input_tokens_seen": 10066329600,
"step": 9600
},
{
"epoch": 0.21197231729400312,
"grad_norm": 0.12041448056697845,
"learning_rate": 0.001,
"loss": 2.8438,
"num_input_tokens_seen": 10118758400,
"step": 9650
},
{
"epoch": 0.21307061945614822,
"grad_norm": 0.13195224106311798,
"learning_rate": 0.001,
"loss": 2.8341,
"num_input_tokens_seen": 10171187200,
"step": 9700
},
{
"epoch": 0.21416892161829332,
"grad_norm": 0.11887054890394211,
"learning_rate": 0.001,
"loss": 2.8349,
"num_input_tokens_seen": 10223616000,
"step": 9750
},
{
"epoch": 0.21526722378043842,
"grad_norm": 0.1044996827840805,
"learning_rate": 0.001,
"loss": 2.8428,
"num_input_tokens_seen": 10276044800,
"step": 9800
},
{
"epoch": 0.21636552594258351,
"grad_norm": 0.11951665580272675,
"learning_rate": 0.001,
"loss": 2.8323,
"num_input_tokens_seen": 10328473600,
"step": 9850
},
{
"epoch": 0.2174638281047286,
"grad_norm": 0.11673793941736221,
"learning_rate": 0.001,
"loss": 2.8271,
"num_input_tokens_seen": 10380902400,
"step": 9900
},
{
"epoch": 0.21856213026687368,
"grad_norm": 0.1178969219326973,
"learning_rate": 0.001,
"loss": 2.8328,
"num_input_tokens_seen": 10433331200,
"step": 9950
},
{
"epoch": 0.21966043242901878,
"grad_norm": 0.11995361745357513,
"learning_rate": 0.001,
"loss": 2.8182,
"num_input_tokens_seen": 10485760000,
"step": 10000
},
{
"epoch": 0.21966043242901878,
"eval_loss": 2.732673168182373,
"eval_runtime": 66.3377,
"eval_samples_per_second": 75.372,
"eval_steps_per_second": 18.843,
"num_input_tokens_seen": 10485760000,
"step": 10000
},
{
"epoch": 0.22075873459116388,
"grad_norm": 0.13463908433914185,
"learning_rate": 0.001,
"loss": 2.8242,
"num_input_tokens_seen": 10538188800,
"step": 10050
},
{
"epoch": 0.22185703675330898,
"grad_norm": 0.11778156459331512,
"learning_rate": 0.001,
"loss": 2.8234,
"num_input_tokens_seen": 10590617600,
"step": 10100
},
{
"epoch": 0.22295533891545408,
"grad_norm": 0.11393869668245316,
"learning_rate": 0.001,
"loss": 2.8204,
"num_input_tokens_seen": 10643046400,
"step": 10150
},
{
"epoch": 0.22405364107759917,
"grad_norm": 0.12454303354024887,
"learning_rate": 0.001,
"loss": 2.8185,
"num_input_tokens_seen": 10695475200,
"step": 10200
},
{
"epoch": 0.22515194323974427,
"grad_norm": 0.1148439347743988,
"learning_rate": 0.001,
"loss": 2.8219,
"num_input_tokens_seen": 10747904000,
"step": 10250
},
{
"epoch": 0.22625024540188934,
"grad_norm": 0.13888292014598846,
"learning_rate": 0.001,
"loss": 2.8157,
"num_input_tokens_seen": 10800332800,
"step": 10300
},
{
"epoch": 0.22734854756403444,
"grad_norm": 0.12242749333381653,
"learning_rate": 0.001,
"loss": 2.8165,
"num_input_tokens_seen": 10852761600,
"step": 10350
},
{
"epoch": 0.22844684972617954,
"grad_norm": 0.13651017844676971,
"learning_rate": 0.001,
"loss": 2.8165,
"num_input_tokens_seen": 10905190400,
"step": 10400
},
{
"epoch": 0.22954515188832464,
"grad_norm": 0.12349703162908554,
"learning_rate": 0.001,
"loss": 2.8126,
"num_input_tokens_seen": 10957619200,
"step": 10450
},
{
"epoch": 0.23064345405046974,
"grad_norm": 0.13448943197727203,
"learning_rate": 0.001,
"loss": 2.8162,
"num_input_tokens_seen": 11010048000,
"step": 10500
},
{
"epoch": 0.23064345405046974,
"eval_loss": 2.720102071762085,
"eval_runtime": 65.0663,
"eval_samples_per_second": 76.845,
"eval_steps_per_second": 19.211,
"num_input_tokens_seen": 11010048000,
"step": 10500
},
{
"epoch": 0.23174175621261484,
"grad_norm": 0.1171165183186531,
"learning_rate": 0.001,
"loss": 2.817,
"num_input_tokens_seen": 11062476800,
"step": 10550
},
{
"epoch": 0.2328400583747599,
"grad_norm": 0.1417781263589859,
"learning_rate": 0.001,
"loss": 2.8159,
"num_input_tokens_seen": 11114905600,
"step": 10600
},
{
"epoch": 0.233938360536905,
"grad_norm": 0.13051685690879822,
"learning_rate": 0.001,
"loss": 2.8062,
"num_input_tokens_seen": 11167334400,
"step": 10650
},
{
"epoch": 0.2350366626990501,
"grad_norm": 0.12536808848381042,
"learning_rate": 0.001,
"loss": 2.8166,
"num_input_tokens_seen": 11219763200,
"step": 10700
},
{
"epoch": 0.2361349648611952,
"grad_norm": 0.11859289556741714,
"learning_rate": 0.001,
"loss": 2.8075,
"num_input_tokens_seen": 11272192000,
"step": 10750
},
{
"epoch": 0.2372332670233403,
"grad_norm": 0.14844287931919098,
"learning_rate": 0.001,
"loss": 2.8139,
"num_input_tokens_seen": 11324620800,
"step": 10800
},
{
"epoch": 0.2383315691854854,
"grad_norm": 0.12877844274044037,
"learning_rate": 0.001,
"loss": 2.8031,
"num_input_tokens_seen": 11377049600,
"step": 10850
},
{
"epoch": 0.23942987134763047,
"grad_norm": 0.13911722600460052,
"learning_rate": 0.001,
"loss": 2.7992,
"num_input_tokens_seen": 11429478400,
"step": 10900
},
{
"epoch": 0.24052817350977557,
"grad_norm": 0.156200110912323,
"learning_rate": 0.001,
"loss": 2.8059,
"num_input_tokens_seen": 11481907200,
"step": 10950
},
{
"epoch": 0.24162647567192067,
"grad_norm": 0.12990960478782654,
"learning_rate": 0.001,
"loss": 2.7984,
"num_input_tokens_seen": 11534336000,
"step": 11000
},
{
"epoch": 0.24162647567192067,
"eval_loss": 2.7103493213653564,
"eval_runtime": 65.6611,
"eval_samples_per_second": 76.149,
"eval_steps_per_second": 19.037,
"num_input_tokens_seen": 11534336000,
"step": 11000
},
{
"epoch": 0.24272477783406576,
"grad_norm": 0.1190350204706192,
"learning_rate": 0.001,
"loss": 2.7994,
"num_input_tokens_seen": 11586764800,
"step": 11050
},
{
"epoch": 0.24382307999621086,
"grad_norm": 0.12825961410999298,
"learning_rate": 0.001,
"loss": 2.7992,
"num_input_tokens_seen": 11639193600,
"step": 11100
},
{
"epoch": 0.24492138215835596,
"grad_norm": 0.12561525404453278,
"learning_rate": 0.001,
"loss": 2.8009,
"num_input_tokens_seen": 11691622400,
"step": 11150
},
{
"epoch": 0.24601968432050106,
"grad_norm": 0.12596049904823303,
"learning_rate": 0.001,
"loss": 2.8002,
"num_input_tokens_seen": 11744051200,
"step": 11200
},
{
"epoch": 0.24711798648264613,
"grad_norm": 0.1415141373872757,
"learning_rate": 0.001,
"loss": 2.8004,
"num_input_tokens_seen": 11796480000,
"step": 11250
},
{
"epoch": 0.24821628864479123,
"grad_norm": 0.1359766125679016,
"learning_rate": 0.001,
"loss": 2.7988,
"num_input_tokens_seen": 11848908800,
"step": 11300
},
{
"epoch": 0.24931459080693633,
"grad_norm": 0.13459013402462006,
"learning_rate": 0.001,
"loss": 2.7991,
"num_input_tokens_seen": 11901337600,
"step": 11350
},
{
"epoch": 0.2504128929690814,
"grad_norm": 0.1344253420829773,
"learning_rate": 0.001,
"loss": 2.805,
"num_input_tokens_seen": 11953766400,
"step": 11400
},
{
"epoch": 0.2515111951312265,
"grad_norm": 0.13629016280174255,
"learning_rate": 0.001,
"loss": 2.7954,
"num_input_tokens_seen": 12006195200,
"step": 11450
},
{
"epoch": 0.2526094972933716,
"grad_norm": 0.12940892577171326,
"learning_rate": 0.001,
"loss": 2.8009,
"num_input_tokens_seen": 12058624000,
"step": 11500
},
{
"epoch": 0.2526094972933716,
"eval_loss": 2.7012581825256348,
"eval_runtime": 65.7039,
"eval_samples_per_second": 76.099,
"eval_steps_per_second": 19.025,
"num_input_tokens_seen": 12058624000,
"step": 11500
},
{
"epoch": 0.2537077994555167,
"grad_norm": 0.15021966397762299,
"learning_rate": 0.001,
"loss": 2.7963,
"num_input_tokens_seen": 12111052800,
"step": 11550
},
{
"epoch": 0.2548061016176618,
"grad_norm": 0.12381847202777863,
"learning_rate": 0.001,
"loss": 2.7954,
"num_input_tokens_seen": 12163481600,
"step": 11600
},
{
"epoch": 0.2559044037798069,
"grad_norm": 0.14849607646465302,
"learning_rate": 0.001,
"loss": 2.7837,
"num_input_tokens_seen": 12215910400,
"step": 11650
},
{
"epoch": 0.25700270594195196,
"grad_norm": 0.1286240816116333,
"learning_rate": 0.001,
"loss": 2.7999,
"num_input_tokens_seen": 12268339200,
"step": 11700
},
{
"epoch": 0.2581010081040971,
"grad_norm": 0.11861539632081985,
"learning_rate": 0.001,
"loss": 2.7979,
"num_input_tokens_seen": 12320768000,
"step": 11750
},
{
"epoch": 0.25919931026624216,
"grad_norm": 0.11512617021799088,
"learning_rate": 0.001,
"loss": 2.7926,
"num_input_tokens_seen": 12373196800,
"step": 11800
},
{
"epoch": 0.2602976124283873,
"grad_norm": 0.13469178974628448,
"learning_rate": 0.001,
"loss": 2.7881,
"num_input_tokens_seen": 12425625600,
"step": 11850
},
{
"epoch": 0.26139591459053235,
"grad_norm": 0.15504290163516998,
"learning_rate": 0.001,
"loss": 2.7917,
"num_input_tokens_seen": 12478054400,
"step": 11900
},
{
"epoch": 0.2624942167526775,
"grad_norm": 0.1363905370235443,
"learning_rate": 0.001,
"loss": 2.7869,
"num_input_tokens_seen": 12530483200,
"step": 11950
},
{
"epoch": 0.26359251891482255,
"grad_norm": 0.11095720529556274,
"learning_rate": 0.001,
"loss": 2.7883,
"num_input_tokens_seen": 12582912000,
"step": 12000
},
{
"epoch": 0.26359251891482255,
"eval_loss": 2.6911227703094482,
"eval_runtime": 65.4928,
"eval_samples_per_second": 76.344,
"eval_steps_per_second": 19.086,
"num_input_tokens_seen": 12582912000,
"step": 12000
},
{
"epoch": 0.2646908210769676,
"grad_norm": 0.1443321257829666,
"learning_rate": 0.001,
"loss": 2.7866,
"num_input_tokens_seen": 12635340800,
"step": 12050
},
{
"epoch": 0.26578912323911275,
"grad_norm": 0.12249191850423813,
"learning_rate": 0.001,
"loss": 2.8,
"num_input_tokens_seen": 12687769600,
"step": 12100
},
{
"epoch": 0.2668874254012578,
"grad_norm": 0.1505623608827591,
"learning_rate": 0.001,
"loss": 2.7934,
"num_input_tokens_seen": 12740198400,
"step": 12150
},
{
"epoch": 0.26798572756340294,
"grad_norm": 0.17367833852767944,
"learning_rate": 0.001,
"loss": 2.7905,
"num_input_tokens_seen": 12792627200,
"step": 12200
},
{
"epoch": 0.269084029725548,
"grad_norm": 0.12189670652151108,
"learning_rate": 0.001,
"loss": 2.7878,
"num_input_tokens_seen": 12845056000,
"step": 12250
},
{
"epoch": 0.27018233188769314,
"grad_norm": 0.12834201753139496,
"learning_rate": 0.001,
"loss": 2.7822,
"num_input_tokens_seen": 12897484800,
"step": 12300
},
{
"epoch": 0.2712806340498382,
"grad_norm": 0.1277332305908203,
"learning_rate": 0.001,
"loss": 2.7846,
"num_input_tokens_seen": 12949913600,
"step": 12350
},
{
"epoch": 0.2723789362119833,
"grad_norm": 0.14190761744976044,
"learning_rate": 0.001,
"loss": 2.7845,
"num_input_tokens_seen": 13002342400,
"step": 12400
},
{
"epoch": 0.2734772383741284,
"grad_norm": 0.14843693375587463,
"learning_rate": 0.001,
"loss": 2.7847,
"num_input_tokens_seen": 13054771200,
"step": 12450
},
{
"epoch": 0.2745755405362735,
"grad_norm": 0.14427120983600616,
"learning_rate": 0.001,
"loss": 2.78,
"num_input_tokens_seen": 13107200000,
"step": 12500
},
{
"epoch": 0.2745755405362735,
"eval_loss": 2.6847124099731445,
"eval_runtime": 65.0448,
"eval_samples_per_second": 76.87,
"eval_steps_per_second": 19.218,
"num_input_tokens_seen": 13107200000,
"step": 12500
},
{
"epoch": 0.2756738426984186,
"grad_norm": 0.14408434927463531,
"learning_rate": 0.001,
"loss": 2.7794,
"num_input_tokens_seen": 13159628800,
"step": 12550
},
{
"epoch": 0.2767721448605637,
"grad_norm": 0.1557396501302719,
"learning_rate": 0.001,
"loss": 2.7754,
"num_input_tokens_seen": 13212057600,
"step": 12600
},
{
"epoch": 0.27787044702270874,
"grad_norm": 0.11494632810354233,
"learning_rate": 0.001,
"loss": 2.7839,
"num_input_tokens_seen": 13264486400,
"step": 12650
},
{
"epoch": 0.27896874918485387,
"grad_norm": 0.12402207404375076,
"learning_rate": 0.001,
"loss": 2.7773,
"num_input_tokens_seen": 13316915200,
"step": 12700
},
{
"epoch": 0.28006705134699894,
"grad_norm": 0.1308801770210266,
"learning_rate": 0.001,
"loss": 2.7864,
"num_input_tokens_seen": 13369344000,
"step": 12750
},
{
"epoch": 0.28116535350914407,
"grad_norm": 0.13596223294734955,
"learning_rate": 0.001,
"loss": 2.7763,
"num_input_tokens_seen": 13421772800,
"step": 12800
},
{
"epoch": 0.28226365567128914,
"grad_norm": 0.13256165385246277,
"learning_rate": 0.001,
"loss": 2.7762,
"num_input_tokens_seen": 13474201600,
"step": 12850
},
{
"epoch": 0.28336195783343426,
"grad_norm": 0.12955094873905182,
"learning_rate": 0.001,
"loss": 2.7823,
"num_input_tokens_seen": 13526630400,
"step": 12900
},
{
"epoch": 0.28446025999557933,
"grad_norm": 0.13506431877613068,
"learning_rate": 0.001,
"loss": 2.774,
"num_input_tokens_seen": 13579059200,
"step": 12950
},
{
"epoch": 0.2855585621577244,
"grad_norm": 0.14323291182518005,
"learning_rate": 0.001,
"loss": 2.7755,
"num_input_tokens_seen": 13631488000,
"step": 13000
},
{
"epoch": 0.2855585621577244,
"eval_loss": 2.6779518127441406,
"eval_runtime": 66.0334,
"eval_samples_per_second": 75.719,
"eval_steps_per_second": 18.93,
"num_input_tokens_seen": 13631488000,
"step": 13000
},
{
"epoch": 0.28665686431986953,
"grad_norm": 0.13635839521884918,
"learning_rate": 0.001,
"loss": 2.7705,
"num_input_tokens_seen": 13683916800,
"step": 13050
},
{
"epoch": 0.2877551664820146,
"grad_norm": 0.1449163854122162,
"learning_rate": 0.001,
"loss": 2.775,
"num_input_tokens_seen": 13736345600,
"step": 13100
},
{
"epoch": 0.2888534686441597,
"grad_norm": 0.1385536640882492,
"learning_rate": 0.001,
"loss": 2.7705,
"num_input_tokens_seen": 13788774400,
"step": 13150
},
{
"epoch": 0.2899517708063048,
"grad_norm": 0.14647842943668365,
"learning_rate": 0.001,
"loss": 2.7709,
"num_input_tokens_seen": 13841203200,
"step": 13200
},
{
"epoch": 0.2910500729684499,
"grad_norm": 0.14193060994148254,
"learning_rate": 0.001,
"loss": 2.7753,
"num_input_tokens_seen": 13893632000,
"step": 13250
},
{
"epoch": 0.292148375130595,
"grad_norm": 0.15065765380859375,
"learning_rate": 0.001,
"loss": 2.7725,
"num_input_tokens_seen": 13946060800,
"step": 13300
},
{
"epoch": 0.29324667729274007,
"grad_norm": 0.1726570725440979,
"learning_rate": 0.001,
"loss": 2.7677,
"num_input_tokens_seen": 13998489600,
"step": 13350
},
{
"epoch": 0.2943449794548852,
"grad_norm": 0.13577735424041748,
"learning_rate": 0.001,
"loss": 2.7661,
"num_input_tokens_seen": 14050918400,
"step": 13400
},
{
"epoch": 0.29544328161703026,
"grad_norm": 0.1286347657442093,
"learning_rate": 0.001,
"loss": 2.7642,
"num_input_tokens_seen": 14103347200,
"step": 13450
},
{
"epoch": 0.2965415837791754,
"grad_norm": 0.12374001741409302,
"learning_rate": 0.001,
"loss": 2.7651,
"num_input_tokens_seen": 14155776000,
"step": 13500
},
{
"epoch": 0.2965415837791754,
"eval_loss": 2.6711983680725098,
"eval_runtime": 65.6737,
"eval_samples_per_second": 76.134,
"eval_steps_per_second": 19.033,
"num_input_tokens_seen": 14155776000,
"step": 13500
},
{
"epoch": 0.29763988594132046,
"grad_norm": 0.1733749508857727,
"learning_rate": 0.001,
"loss": 2.765,
"num_input_tokens_seen": 14208204800,
"step": 13550
},
{
"epoch": 0.29873818810346553,
"grad_norm": 0.1459003984928131,
"learning_rate": 0.001,
"loss": 2.7683,
"num_input_tokens_seen": 14260633600,
"step": 13600
},
{
"epoch": 0.29983649026561066,
"grad_norm": 0.1527784913778305,
"learning_rate": 0.001,
"loss": 2.7678,
"num_input_tokens_seen": 14313062400,
"step": 13650
},
{
"epoch": 0.3009347924277557,
"grad_norm": 0.1344996690750122,
"learning_rate": 0.001,
"loss": 2.7613,
"num_input_tokens_seen": 14365491200,
"step": 13700
},
{
"epoch": 0.30203309458990085,
"grad_norm": 0.1291748583316803,
"learning_rate": 0.001,
"loss": 2.7682,
"num_input_tokens_seen": 14417920000,
"step": 13750
},
{
"epoch": 0.3031313967520459,
"grad_norm": 0.1352360099554062,
"learning_rate": 0.001,
"loss": 2.764,
"num_input_tokens_seen": 14470348800,
"step": 13800
},
{
"epoch": 0.30422969891419105,
"grad_norm": 0.13686618208885193,
"learning_rate": 0.001,
"loss": 2.7638,
"num_input_tokens_seen": 14522777600,
"step": 13850
},
{
"epoch": 0.3053280010763361,
"grad_norm": 0.15377116203308105,
"learning_rate": 0.001,
"loss": 2.7639,
"num_input_tokens_seen": 14575206400,
"step": 13900
},
{
"epoch": 0.3064263032384812,
"grad_norm": 0.13904446363449097,
"learning_rate": 0.001,
"loss": 2.7666,
"num_input_tokens_seen": 14627635200,
"step": 13950
},
{
"epoch": 0.3075246054006263,
"grad_norm": 0.12402611970901489,
"learning_rate": 0.001,
"loss": 2.759,
"num_input_tokens_seen": 14680064000,
"step": 14000
},
{
"epoch": 0.3075246054006263,
"eval_loss": 2.6654388904571533,
"eval_runtime": 65.2775,
"eval_samples_per_second": 76.596,
"eval_steps_per_second": 19.149,
"num_input_tokens_seen": 14680064000,
"step": 14000
},
{
"epoch": 0.3086229075627714,
"grad_norm": 0.13326038420200348,
"learning_rate": 0.001,
"loss": 2.7622,
"num_input_tokens_seen": 14732492800,
"step": 14050
},
{
"epoch": 0.3097212097249165,
"grad_norm": 0.14305976033210754,
"learning_rate": 0.001,
"loss": 2.7597,
"num_input_tokens_seen": 14784921600,
"step": 14100
},
{
"epoch": 0.3108195118870616,
"grad_norm": 0.1182415783405304,
"learning_rate": 0.001,
"loss": 2.758,
"num_input_tokens_seen": 14837350400,
"step": 14150
},
{
"epoch": 0.3119178140492067,
"grad_norm": 0.12919387221336365,
"learning_rate": 0.001,
"loss": 2.759,
"num_input_tokens_seen": 14889779200,
"step": 14200
},
{
"epoch": 0.3130161162113518,
"grad_norm": 0.1420537382364273,
"learning_rate": 0.001,
"loss": 2.7519,
"num_input_tokens_seen": 14942208000,
"step": 14250
},
{
"epoch": 0.31411441837349685,
"grad_norm": 0.14349806308746338,
"learning_rate": 0.001,
"loss": 2.7653,
"num_input_tokens_seen": 14994636800,
"step": 14300
},
{
"epoch": 0.315212720535642,
"grad_norm": 0.16453324258327484,
"learning_rate": 0.001,
"loss": 2.7642,
"num_input_tokens_seen": 15047065600,
"step": 14350
},
{
"epoch": 0.31631102269778705,
"grad_norm": 0.11806487292051315,
"learning_rate": 0.001,
"loss": 2.7605,
"num_input_tokens_seen": 15099494400,
"step": 14400
},
{
"epoch": 0.3174093248599322,
"grad_norm": 0.12850746512413025,
"learning_rate": 0.001,
"loss": 2.7539,
"num_input_tokens_seen": 15151923200,
"step": 14450
},
{
"epoch": 0.31850762702207724,
"grad_norm": 0.1480904221534729,
"learning_rate": 0.001,
"loss": 2.7574,
"num_input_tokens_seen": 15204352000,
"step": 14500
},
{
"epoch": 0.31850762702207724,
"eval_loss": 2.6607398986816406,
"eval_runtime": 65.6281,
"eval_samples_per_second": 76.187,
"eval_steps_per_second": 19.047,
"num_input_tokens_seen": 15204352000,
"step": 14500
},
{
"epoch": 0.3196059291842223,
"grad_norm": 0.13606210052967072,
"learning_rate": 0.001,
"loss": 2.763,
"num_input_tokens_seen": 15256780800,
"step": 14550
},
{
"epoch": 0.32070423134636744,
"grad_norm": 0.12546846270561218,
"learning_rate": 0.001,
"loss": 2.7556,
"num_input_tokens_seen": 15309209600,
"step": 14600
},
{
"epoch": 0.3218025335085125,
"grad_norm": 0.1267230361700058,
"learning_rate": 0.001,
"loss": 2.7617,
"num_input_tokens_seen": 15361638400,
"step": 14650
},
{
"epoch": 0.32290083567065764,
"grad_norm": 0.13812699913978577,
"learning_rate": 0.001,
"loss": 2.7533,
"num_input_tokens_seen": 15414067200,
"step": 14700
},
{
"epoch": 0.3239991378328027,
"grad_norm": 0.12577973306179047,
"learning_rate": 0.001,
"loss": 2.7519,
"num_input_tokens_seen": 15466496000,
"step": 14750
},
{
"epoch": 0.32509743999494783,
"grad_norm": 0.14296036958694458,
"learning_rate": 0.001,
"loss": 2.7479,
"num_input_tokens_seen": 15518924800,
"step": 14800
},
{
"epoch": 0.3261957421570929,
"grad_norm": 0.12737593054771423,
"learning_rate": 0.001,
"loss": 2.7546,
"num_input_tokens_seen": 15571353600,
"step": 14850
},
{
"epoch": 0.327294044319238,
"grad_norm": 0.1349722445011139,
"learning_rate": 0.001,
"loss": 2.7477,
"num_input_tokens_seen": 15623782400,
"step": 14900
},
{
"epoch": 0.3283923464813831,
"grad_norm": 0.12827487289905548,
"learning_rate": 0.001,
"loss": 2.7492,
"num_input_tokens_seen": 15676211200,
"step": 14950
},
{
"epoch": 0.3294906486435282,
"grad_norm": 0.13282813131809235,
"learning_rate": 0.001,
"loss": 2.7466,
"num_input_tokens_seen": 15728640000,
"step": 15000
},
{
"epoch": 0.3294906486435282,
"eval_loss": 2.6524744033813477,
"eval_runtime": 65.8996,
"eval_samples_per_second": 75.873,
"eval_steps_per_second": 18.968,
"num_input_tokens_seen": 15728640000,
"step": 15000
},
{
"epoch": 0.3305889508056733,
"grad_norm": 0.11965218186378479,
"learning_rate": 0.001,
"loss": 2.7443,
"num_input_tokens_seen": 15781068800,
"step": 15050
},
{
"epoch": 0.33168725296781837,
"grad_norm": 0.14668309688568115,
"learning_rate": 0.001,
"loss": 2.7496,
"num_input_tokens_seen": 15833497600,
"step": 15100
},
{
"epoch": 0.3327855551299635,
"grad_norm": 0.12492749840021133,
"learning_rate": 0.001,
"loss": 2.7485,
"num_input_tokens_seen": 15885926400,
"step": 15150
},
{
"epoch": 0.33388385729210857,
"grad_norm": 0.1333470493555069,
"learning_rate": 0.001,
"loss": 2.7511,
"num_input_tokens_seen": 15938355200,
"step": 15200
},
{
"epoch": 0.33498215945425364,
"grad_norm": 0.14136457443237305,
"learning_rate": 0.001,
"loss": 2.74,
"num_input_tokens_seen": 15990784000,
"step": 15250
},
{
"epoch": 0.33608046161639876,
"grad_norm": 0.14975622296333313,
"learning_rate": 0.001,
"loss": 2.7543,
"num_input_tokens_seen": 16043212800,
"step": 15300
},
{
"epoch": 0.33717876377854383,
"grad_norm": 0.1193549856543541,
"learning_rate": 0.001,
"loss": 2.7497,
"num_input_tokens_seen": 16095641600,
"step": 15350
},
{
"epoch": 0.33827706594068896,
"grad_norm": 0.1429223120212555,
"learning_rate": 0.001,
"loss": 2.7463,
"num_input_tokens_seen": 16148070400,
"step": 15400
},
{
"epoch": 0.33937536810283403,
"grad_norm": 0.16827304661273956,
"learning_rate": 0.001,
"loss": 2.7415,
"num_input_tokens_seen": 16200499200,
"step": 15450
},
{
"epoch": 0.3404736702649791,
"grad_norm": 0.13952937722206116,
"learning_rate": 0.001,
"loss": 2.7388,
"num_input_tokens_seen": 16252928000,
"step": 15500
},
{
"epoch": 0.3404736702649791,
"eval_loss": 2.6472089290618896,
"eval_runtime": 65.4943,
"eval_samples_per_second": 76.343,
"eval_steps_per_second": 19.086,
"num_input_tokens_seen": 16252928000,
"step": 15500
},
{
"epoch": 0.3415719724271242,
"grad_norm": 0.13359376788139343,
"learning_rate": 0.001,
"loss": 2.7522,
"num_input_tokens_seen": 16305356800,
"step": 15550
},
{
"epoch": 0.3426702745892693,
"grad_norm": 0.13101224601268768,
"learning_rate": 0.001,
"loss": 2.7483,
"num_input_tokens_seen": 16357785600,
"step": 15600
},
{
"epoch": 0.3437685767514144,
"grad_norm": 0.14006133377552032,
"learning_rate": 0.001,
"loss": 2.7439,
"num_input_tokens_seen": 16410214400,
"step": 15650
},
{
"epoch": 0.3448668789135595,
"grad_norm": 0.15062059462070465,
"learning_rate": 0.001,
"loss": 2.7454,
"num_input_tokens_seen": 16462643200,
"step": 15700
},
{
"epoch": 0.3459651810757046,
"grad_norm": 0.13822610676288605,
"learning_rate": 0.001,
"loss": 2.74,
"num_input_tokens_seen": 16515072000,
"step": 15750
},
{
"epoch": 0.3470634832378497,
"grad_norm": 0.1368207335472107,
"learning_rate": 0.001,
"loss": 2.745,
"num_input_tokens_seen": 16567500800,
"step": 15800
},
{
"epoch": 0.34816178539999476,
"grad_norm": 0.14573991298675537,
"learning_rate": 0.001,
"loss": 2.742,
"num_input_tokens_seen": 16619929600,
"step": 15850
},
{
"epoch": 0.3492600875621399,
"grad_norm": 12.025542259216309,
"learning_rate": 0.001,
"loss": 3.3278,
"num_input_tokens_seen": 16672358400,
"step": 15900
},
{
"epoch": 0.35035838972428496,
"grad_norm": 0.15699023008346558,
"learning_rate": 0.001,
"loss": 4.04,
"num_input_tokens_seen": 16724787200,
"step": 15950
},
{
"epoch": 0.3514566918864301,
"grad_norm": 0.13041897118091583,
"learning_rate": 0.001,
"loss": 2.8233,
"num_input_tokens_seen": 16777216000,
"step": 16000
},
{
"epoch": 0.3514566918864301,
"eval_loss": 2.689638614654541,
"eval_runtime": 66.0949,
"eval_samples_per_second": 75.649,
"eval_steps_per_second": 18.912,
"num_input_tokens_seen": 16777216000,
"step": 16000
},
{
"epoch": 0.35255499404857515,
"grad_norm": 0.1446143537759781,
"learning_rate": 0.001,
"loss": 2.7837,
"num_input_tokens_seen": 16829644800,
"step": 16050
},
{
"epoch": 0.3536532962107203,
"grad_norm": 0.12466421723365784,
"learning_rate": 0.001,
"loss": 2.7808,
"num_input_tokens_seen": 16882073600,
"step": 16100
},
{
"epoch": 0.35475159837286535,
"grad_norm": 0.13154324889183044,
"learning_rate": 0.001,
"loss": 2.7608,
"num_input_tokens_seen": 16934502400,
"step": 16150
},
{
"epoch": 0.3558499005350104,
"grad_norm": 0.12929347157478333,
"learning_rate": 0.001,
"loss": 2.7599,
"num_input_tokens_seen": 16986931200,
"step": 16200
},
{
"epoch": 0.35694820269715555,
"grad_norm": 0.12805528938770294,
"learning_rate": 0.001,
"loss": 2.7562,
"num_input_tokens_seen": 17039360000,
"step": 16250
},
{
"epoch": 0.3580465048593006,
"grad_norm": 0.12885579466819763,
"learning_rate": 0.001,
"loss": 2.7498,
"num_input_tokens_seen": 17091788800,
"step": 16300
},
{
"epoch": 0.35914480702144574,
"grad_norm": 0.14422497153282166,
"learning_rate": 0.001,
"loss": 2.7518,
"num_input_tokens_seen": 17144217600,
"step": 16350
},
{
"epoch": 0.3602431091835908,
"grad_norm": 0.13284224271774292,
"learning_rate": 0.001,
"loss": 2.7453,
"num_input_tokens_seen": 17196646400,
"step": 16400
},
{
"epoch": 0.3613414113457359,
"grad_norm": 0.1408185362815857,
"learning_rate": 0.001,
"loss": 2.7422,
"num_input_tokens_seen": 17249075200,
"step": 16450
},
{
"epoch": 0.362439713507881,
"grad_norm": 0.1295713484287262,
"learning_rate": 0.001,
"loss": 2.7394,
"num_input_tokens_seen": 17301504000,
"step": 16500
},
{
"epoch": 0.362439713507881,
"eval_loss": 2.6431446075439453,
"eval_runtime": 65.9239,
"eval_samples_per_second": 75.845,
"eval_steps_per_second": 18.961,
"num_input_tokens_seen": 17301504000,
"step": 16500
},
{
"epoch": 0.3635380156700261,
"grad_norm": 0.1245918869972229,
"learning_rate": 0.001,
"loss": 2.7434,
"num_input_tokens_seen": 17353932800,
"step": 16550
},
{
"epoch": 0.3646363178321712,
"grad_norm": 0.15865615010261536,
"learning_rate": 0.001,
"loss": 2.7378,
"num_input_tokens_seen": 17406361600,
"step": 16600
},
{
"epoch": 0.3657346199943163,
"grad_norm": 0.1391313523054123,
"learning_rate": 0.001,
"loss": 2.7415,
"num_input_tokens_seen": 17458790400,
"step": 16650
},
{
"epoch": 0.3668329221564614,
"grad_norm": 0.13604389131069183,
"learning_rate": 0.001,
"loss": 2.7394,
"num_input_tokens_seen": 17511219200,
"step": 16700
},
{
"epoch": 0.3679312243186065,
"grad_norm": 0.14926299452781677,
"learning_rate": 0.001,
"loss": 2.732,
"num_input_tokens_seen": 17563648000,
"step": 16750
},
{
"epoch": 0.36902952648075155,
"grad_norm": 0.12619628012180328,
"learning_rate": 0.001,
"loss": 2.7275,
"num_input_tokens_seen": 17616076800,
"step": 16800
},
{
"epoch": 0.3701278286428967,
"grad_norm": 0.1268402636051178,
"learning_rate": 0.001,
"loss": 2.7309,
"num_input_tokens_seen": 17668505600,
"step": 16850
},
{
"epoch": 0.37122613080504174,
"grad_norm": 0.1379624754190445,
"learning_rate": 0.001,
"loss": 2.7266,
"num_input_tokens_seen": 17720934400,
"step": 16900
},
{
"epoch": 0.37232443296718687,
"grad_norm": 0.1443478763103485,
"learning_rate": 0.001,
"loss": 2.7321,
"num_input_tokens_seen": 17773363200,
"step": 16950
},
{
"epoch": 0.37342273512933194,
"grad_norm": 0.15214091539382935,
"learning_rate": 0.001,
"loss": 2.7284,
"num_input_tokens_seen": 17825792000,
"step": 17000
},
{
"epoch": 0.37342273512933194,
"eval_loss": 2.63478946685791,
"eval_runtime": 65.141,
"eval_samples_per_second": 76.757,
"eval_steps_per_second": 19.189,
"num_input_tokens_seen": 17825792000,
"step": 17000
},
{
"epoch": 0.374521037291477,
"grad_norm": 0.1361106038093567,
"learning_rate": 0.001,
"loss": 2.7342,
"num_input_tokens_seen": 17878220800,
"step": 17050
},
{
"epoch": 0.37561933945362214,
"grad_norm": 0.13839572668075562,
"learning_rate": 0.001,
"loss": 2.7259,
"num_input_tokens_seen": 17930649600,
"step": 17100
},
{
"epoch": 0.3767176416157672,
"grad_norm": 0.13055244088172913,
"learning_rate": 0.001,
"loss": 2.7306,
"num_input_tokens_seen": 17983078400,
"step": 17150
},
{
"epoch": 0.37781594377791233,
"grad_norm": 0.1444411724805832,
"learning_rate": 0.001,
"loss": 2.7315,
"num_input_tokens_seen": 18035507200,
"step": 17200
},
{
"epoch": 0.3789142459400574,
"grad_norm": 0.151028573513031,
"learning_rate": 0.001,
"loss": 2.7211,
"num_input_tokens_seen": 18087936000,
"step": 17250
},
{
"epoch": 0.38001254810220253,
"grad_norm": 0.15638011693954468,
"learning_rate": 0.001,
"loss": 2.7269,
"num_input_tokens_seen": 18140364800,
"step": 17300
},
{
"epoch": 0.3811108502643476,
"grad_norm": 0.1508658230304718,
"learning_rate": 0.001,
"loss": 2.7263,
"num_input_tokens_seen": 18192793600,
"step": 17350
},
{
"epoch": 0.38220915242649267,
"grad_norm": 0.13167701661586761,
"learning_rate": 0.001,
"loss": 2.7296,
"num_input_tokens_seen": 18245222400,
"step": 17400
},
{
"epoch": 0.3833074545886378,
"grad_norm": 0.14609253406524658,
"learning_rate": 0.001,
"loss": 2.7249,
"num_input_tokens_seen": 18297651200,
"step": 17450
},
{
"epoch": 0.38440575675078287,
"grad_norm": 0.13172782957553864,
"learning_rate": 0.001,
"loss": 2.7252,
"num_input_tokens_seen": 18350080000,
"step": 17500
},
{
"epoch": 0.38440575675078287,
"eval_loss": 2.630176544189453,
"eval_runtime": 66.0667,
"eval_samples_per_second": 75.681,
"eval_steps_per_second": 18.92,
"num_input_tokens_seen": 18350080000,
"step": 17500
},
{
"epoch": 0.385504058912928,
"grad_norm": 0.149306520819664,
"learning_rate": 0.001,
"loss": 2.7245,
"num_input_tokens_seen": 18402508800,
"step": 17550
},
{
"epoch": 0.38660236107507306,
"grad_norm": 0.14191772043704987,
"learning_rate": 0.001,
"loss": 2.7204,
"num_input_tokens_seen": 18454937600,
"step": 17600
},
{
"epoch": 0.3877006632372182,
"grad_norm": 0.13731072843074799,
"learning_rate": 0.001,
"loss": 2.7243,
"num_input_tokens_seen": 18507366400,
"step": 17650
},
{
"epoch": 0.38879896539936326,
"grad_norm": 0.1466369777917862,
"learning_rate": 0.001,
"loss": 2.7262,
"num_input_tokens_seen": 18559795200,
"step": 17700
},
{
"epoch": 0.38989726756150833,
"grad_norm": 0.13290658593177795,
"learning_rate": 0.001,
"loss": 2.7314,
"num_input_tokens_seen": 18612224000,
"step": 17750
},
{
"epoch": 0.39099556972365346,
"grad_norm": 0.13785040378570557,
"learning_rate": 0.001,
"loss": 2.7252,
"num_input_tokens_seen": 18664652800,
"step": 17800
},
{
"epoch": 0.39209387188579853,
"grad_norm": 0.13384000957012177,
"learning_rate": 0.001,
"loss": 2.7321,
"num_input_tokens_seen": 18717081600,
"step": 17850
},
{
"epoch": 0.39319217404794365,
"grad_norm": 0.14927875995635986,
"learning_rate": 0.001,
"loss": 2.7236,
"num_input_tokens_seen": 18769510400,
"step": 17900
},
{
"epoch": 0.3942904762100887,
"grad_norm": 0.13494938611984253,
"learning_rate": 0.001,
"loss": 2.7234,
"num_input_tokens_seen": 18821939200,
"step": 17950
},
{
"epoch": 0.3953887783722338,
"grad_norm": 0.15054813027381897,
"learning_rate": 0.001,
"loss": 2.7236,
"num_input_tokens_seen": 18874368000,
"step": 18000
},
{
"epoch": 0.3953887783722338,
"eval_loss": 2.62626051902771,
"eval_runtime": 65.3965,
"eval_samples_per_second": 76.457,
"eval_steps_per_second": 19.114,
"num_input_tokens_seen": 18874368000,
"step": 18000
},
{
"epoch": 0.3964870805343789,
"grad_norm": 0.1353403478860855,
"learning_rate": 0.001,
"loss": 2.724,
"num_input_tokens_seen": 18926796800,
"step": 18050
},
{
"epoch": 0.397585382696524,
"grad_norm": 0.15004459023475647,
"learning_rate": 0.001,
"loss": 2.717,
"num_input_tokens_seen": 18979225600,
"step": 18100
},
{
"epoch": 0.3986836848586691,
"grad_norm": 0.1293007880449295,
"learning_rate": 0.001,
"loss": 2.7187,
"num_input_tokens_seen": 19031654400,
"step": 18150
},
{
"epoch": 0.3997819870208142,
"grad_norm": 0.16373878717422485,
"learning_rate": 0.001,
"loss": 2.7217,
"num_input_tokens_seen": 19084083200,
"step": 18200
},
{
"epoch": 0.4008802891829593,
"grad_norm": 0.1529611349105835,
"learning_rate": 0.001,
"loss": 2.722,
"num_input_tokens_seen": 19136512000,
"step": 18250
},
{
"epoch": 0.4019785913451044,
"grad_norm": 0.14109951257705688,
"learning_rate": 0.001,
"loss": 2.7232,
"num_input_tokens_seen": 19188940800,
"step": 18300
},
{
"epoch": 0.40307689350724946,
"grad_norm": 0.13841493427753448,
"learning_rate": 0.001,
"loss": 2.7195,
"num_input_tokens_seen": 19241369600,
"step": 18350
},
{
"epoch": 0.4041751956693946,
"grad_norm": 0.13508476316928864,
"learning_rate": 0.001,
"loss": 2.7166,
"num_input_tokens_seen": 19293798400,
"step": 18400
},
{
"epoch": 0.40527349783153965,
"grad_norm": 0.1372646540403366,
"learning_rate": 0.001,
"loss": 2.7212,
"num_input_tokens_seen": 19346227200,
"step": 18450
},
{
"epoch": 0.4063717999936848,
"grad_norm": 0.1485033482313156,
"learning_rate": 0.001,
"loss": 2.7186,
"num_input_tokens_seen": 19398656000,
"step": 18500
},
{
"epoch": 0.4063717999936848,
"eval_loss": 2.622330904006958,
"eval_runtime": 66.3601,
"eval_samples_per_second": 75.346,
"eval_steps_per_second": 18.837,
"num_input_tokens_seen": 19398656000,
"step": 18500
},
{
"epoch": 0.40747010215582985,
"grad_norm": 0.1484711617231369,
"learning_rate": 0.001,
"loss": 2.7235,
"num_input_tokens_seen": 19451084800,
"step": 18550
},
{
"epoch": 0.408568404317975,
"grad_norm": 0.141770601272583,
"learning_rate": 0.001,
"loss": 2.7225,
"num_input_tokens_seen": 19503513600,
"step": 18600
},
{
"epoch": 0.40966670648012005,
"grad_norm": 0.1213323250412941,
"learning_rate": 0.001,
"loss": 2.7212,
"num_input_tokens_seen": 19555942400,
"step": 18650
},
{
"epoch": 0.4107650086422651,
"grad_norm": 0.14149373769760132,
"learning_rate": 0.001,
"loss": 2.7181,
"num_input_tokens_seen": 19608371200,
"step": 18700
},
{
"epoch": 0.41186331080441024,
"grad_norm": 0.13964049518108368,
"learning_rate": 0.001,
"loss": 2.7147,
"num_input_tokens_seen": 19660800000,
"step": 18750
},
{
"epoch": 0.4129616129665553,
"grad_norm": 0.1384592205286026,
"learning_rate": 0.001,
"loss": 2.7141,
"num_input_tokens_seen": 19713228800,
"step": 18800
},
{
"epoch": 0.41405991512870044,
"grad_norm": 0.15027381479740143,
"learning_rate": 0.001,
"loss": 2.7185,
"num_input_tokens_seen": 19765657600,
"step": 18850
},
{
"epoch": 0.4151582172908455,
"grad_norm": 0.15221597254276276,
"learning_rate": 0.001,
"loss": 2.7206,
"num_input_tokens_seen": 19818086400,
"step": 18900
},
{
"epoch": 0.4162565194529906,
"grad_norm": 0.1272735893726349,
"learning_rate": 0.001,
"loss": 2.7183,
"num_input_tokens_seen": 19870515200,
"step": 18950
},
{
"epoch": 0.4173548216151357,
"grad_norm": 0.1258268654346466,
"learning_rate": 0.001,
"loss": 2.7117,
"num_input_tokens_seen": 19922944000,
"step": 19000
},
{
"epoch": 0.4173548216151357,
"eval_loss": 2.619187116622925,
"eval_runtime": 65.7537,
"eval_samples_per_second": 76.041,
"eval_steps_per_second": 19.01,
"num_input_tokens_seen": 19922944000,
"step": 19000
},
{
"epoch": 0.4184531237772808,
"grad_norm": 0.12389284372329712,
"learning_rate": 0.001,
"loss": 2.7222,
"num_input_tokens_seen": 19975372800,
"step": 19050
},
{
"epoch": 0.4195514259394259,
"grad_norm": 0.14157339930534363,
"learning_rate": 0.001,
"loss": 2.7178,
"num_input_tokens_seen": 20027801600,
"step": 19100
},
{
"epoch": 0.420649728101571,
"grad_norm": 0.1490466445684433,
"learning_rate": 0.001,
"loss": 2.7185,
"num_input_tokens_seen": 20080230400,
"step": 19150
},
{
"epoch": 0.4217480302637161,
"grad_norm": 0.14112494885921478,
"learning_rate": 0.001,
"loss": 2.7166,
"num_input_tokens_seen": 20132659200,
"step": 19200
},
{
"epoch": 0.42284633242586117,
"grad_norm": 0.13986504077911377,
"learning_rate": 0.001,
"loss": 2.7201,
"num_input_tokens_seen": 20185088000,
"step": 19250
},
{
"epoch": 0.42394463458800624,
"grad_norm": 0.14087803661823273,
"learning_rate": 0.001,
"loss": 2.7175,
"num_input_tokens_seen": 20237516800,
"step": 19300
},
{
"epoch": 0.42504293675015137,
"grad_norm": 0.165438711643219,
"learning_rate": 0.001,
"loss": 2.7155,
"num_input_tokens_seen": 20289945600,
"step": 19350
},
{
"epoch": 0.42614123891229644,
"grad_norm": 0.132109135389328,
"learning_rate": 0.001,
"loss": 2.7116,
"num_input_tokens_seen": 20342374400,
"step": 19400
},
{
"epoch": 0.42723954107444156,
"grad_norm": 0.1372772753238678,
"learning_rate": 0.001,
"loss": 2.7137,
"num_input_tokens_seen": 20394803200,
"step": 19450
},
{
"epoch": 0.42833784323658664,
"grad_norm": 0.1470147669315338,
"learning_rate": 0.001,
"loss": 2.7081,
"num_input_tokens_seen": 20447232000,
"step": 19500
},
{
"epoch": 0.42833784323658664,
"eval_loss": 2.615947961807251,
"eval_runtime": 65.588,
"eval_samples_per_second": 76.233,
"eval_steps_per_second": 19.058,
"num_input_tokens_seen": 20447232000,
"step": 19500
},
{
"epoch": 0.42943614539873176,
"grad_norm": 0.15671676397323608,
"learning_rate": 0.001,
"loss": 2.7176,
"num_input_tokens_seen": 20499660800,
"step": 19550
},
{
"epoch": 0.43053444756087683,
"grad_norm": 0.13104794919490814,
"learning_rate": 0.001,
"loss": 2.7108,
"num_input_tokens_seen": 20552089600,
"step": 19600
},
{
"epoch": 0.4316327497230219,
"grad_norm": 0.14532406628131866,
"learning_rate": 0.001,
"loss": 2.7087,
"num_input_tokens_seen": 20604518400,
"step": 19650
},
{
"epoch": 0.43273105188516703,
"grad_norm": 0.16199354827404022,
"learning_rate": 0.001,
"loss": 2.7178,
"num_input_tokens_seen": 20656947200,
"step": 19700
},
{
"epoch": 0.4338293540473121,
"grad_norm": 0.13537316024303436,
"learning_rate": 0.001,
"loss": 2.7124,
"num_input_tokens_seen": 20709376000,
"step": 19750
},
{
"epoch": 0.4349276562094572,
"grad_norm": 0.15098537504673004,
"learning_rate": 0.001,
"loss": 2.7119,
"num_input_tokens_seen": 20761804800,
"step": 19800
},
{
"epoch": 0.4360259583716023,
"grad_norm": 0.21563659608364105,
"learning_rate": 0.001,
"loss": 2.7118,
"num_input_tokens_seen": 20814233600,
"step": 19850
},
{
"epoch": 0.43712426053374737,
"grad_norm": 0.15981121361255646,
"learning_rate": 0.001,
"loss": 2.7043,
"num_input_tokens_seen": 20866662400,
"step": 19900
},
{
"epoch": 0.4382225626958925,
"grad_norm": 0.15192069113254547,
"learning_rate": 0.001,
"loss": 2.7137,
"num_input_tokens_seen": 20919091200,
"step": 19950
},
{
"epoch": 0.43932086485803756,
"grad_norm": 0.14211437106132507,
"learning_rate": 0.001,
"loss": 2.7128,
"num_input_tokens_seen": 20971520000,
"step": 20000
},
{
"epoch": 0.43932086485803756,
"eval_loss": 2.611689567565918,
"eval_runtime": 66.3456,
"eval_samples_per_second": 75.363,
"eval_steps_per_second": 18.841,
"num_input_tokens_seen": 20971520000,
"step": 20000
},
{
"epoch": 0.4404191670201827,
"grad_norm": 0.14489957690238953,
"learning_rate": 0.001,
"loss": 2.7139,
"num_input_tokens_seen": 21023948800,
"step": 20050
},
{
"epoch": 0.44151746918232776,
"grad_norm": 0.13994646072387695,
"learning_rate": 0.001,
"loss": 2.7091,
"num_input_tokens_seen": 21076377600,
"step": 20100
},
{
"epoch": 0.4426157713444729,
"grad_norm": 0.17211903631687164,
"learning_rate": 0.001,
"loss": 2.7176,
"num_input_tokens_seen": 21128806400,
"step": 20150
},
{
"epoch": 0.44371407350661796,
"grad_norm": 0.16364862024784088,
"learning_rate": 0.001,
"loss": 2.7181,
"num_input_tokens_seen": 21181235200,
"step": 20200
},
{
"epoch": 0.444812375668763,
"grad_norm": 0.14166216552257538,
"learning_rate": 0.001,
"loss": 2.7127,
"num_input_tokens_seen": 21233664000,
"step": 20250
},
{
"epoch": 0.44591067783090815,
"grad_norm": 0.12995755672454834,
"learning_rate": 0.001,
"loss": 2.7085,
"num_input_tokens_seen": 21286092800,
"step": 20300
},
{
"epoch": 0.4470089799930532,
"grad_norm": 0.15717202425003052,
"learning_rate": 0.001,
"loss": 2.7071,
"num_input_tokens_seen": 21338521600,
"step": 20350
},
{
"epoch": 0.44810728215519835,
"grad_norm": 0.13354860246181488,
"learning_rate": 0.001,
"loss": 2.7094,
"num_input_tokens_seen": 21390950400,
"step": 20400
},
{
"epoch": 0.4492055843173434,
"grad_norm": 0.16004188358783722,
"learning_rate": 0.001,
"loss": 2.7109,
"num_input_tokens_seen": 21443379200,
"step": 20450
},
{
"epoch": 0.45030388647948855,
"grad_norm": 0.148077592253685,
"learning_rate": 0.001,
"loss": 2.7058,
"num_input_tokens_seen": 21495808000,
"step": 20500
},
{
"epoch": 0.45030388647948855,
"eval_loss": 2.6089115142822266,
"eval_runtime": 65.5589,
"eval_samples_per_second": 76.267,
"eval_steps_per_second": 19.067,
"num_input_tokens_seen": 21495808000,
"step": 20500
},
{
"epoch": 0.4514021886416336,
"grad_norm": 0.16992634534835815,
"learning_rate": 0.001,
"loss": 2.7026,
"num_input_tokens_seen": 21548236800,
"step": 20550
},
{
"epoch": 0.4525004908037787,
"grad_norm": 0.14876551926136017,
"learning_rate": 0.001,
"loss": 2.7105,
"num_input_tokens_seen": 21600665600,
"step": 20600
},
{
"epoch": 0.4535987929659238,
"grad_norm": 0.16025613248348236,
"learning_rate": 0.001,
"loss": 2.707,
"num_input_tokens_seen": 21653094400,
"step": 20650
},
{
"epoch": 0.4546970951280689,
"grad_norm": 0.14609012007713318,
"learning_rate": 0.001,
"loss": 2.7086,
"num_input_tokens_seen": 21705523200,
"step": 20700
},
{
"epoch": 0.455795397290214,
"grad_norm": 0.14725832641124725,
"learning_rate": 0.001,
"loss": 2.7075,
"num_input_tokens_seen": 21757952000,
"step": 20750
},
{
"epoch": 0.4568936994523591,
"grad_norm": 0.1736454963684082,
"learning_rate": 0.001,
"loss": 2.7033,
"num_input_tokens_seen": 21810380800,
"step": 20800
},
{
"epoch": 0.45799200161450415,
"grad_norm": 0.14904257655143738,
"learning_rate": 0.001,
"loss": 2.7012,
"num_input_tokens_seen": 21862809600,
"step": 20850
},
{
"epoch": 0.4590903037766493,
"grad_norm": 0.14407765865325928,
"learning_rate": 0.001,
"loss": 2.7055,
"num_input_tokens_seen": 21915238400,
"step": 20900
},
{
"epoch": 0.46018860593879435,
"grad_norm": 0.13943473994731903,
"learning_rate": 0.001,
"loss": 2.6999,
"num_input_tokens_seen": 21967667200,
"step": 20950
},
{
"epoch": 0.4612869081009395,
"grad_norm": 0.1592896729707718,
"learning_rate": 0.001,
"loss": 2.7072,
"num_input_tokens_seen": 22020096000,
"step": 21000
},
{
"epoch": 0.4612869081009395,
"eval_loss": 2.605719566345215,
"eval_runtime": 65.6879,
"eval_samples_per_second": 76.117,
"eval_steps_per_second": 19.029,
"num_input_tokens_seen": 22020096000,
"step": 21000
},
{
"epoch": 0.46238521026308455,
"grad_norm": 0.1428702473640442,
"learning_rate": 0.001,
"loss": 2.7042,
"num_input_tokens_seen": 22072524800,
"step": 21050
},
{
"epoch": 0.46348351242522967,
"grad_norm": 0.13529072701931,
"learning_rate": 0.001,
"loss": 2.7093,
"num_input_tokens_seen": 22124953600,
"step": 21100
},
{
"epoch": 0.46458181458737474,
"grad_norm": 0.17529748380184174,
"learning_rate": 0.001,
"loss": 2.713,
"num_input_tokens_seen": 22177382400,
"step": 21150
},
{
"epoch": 0.4656801167495198,
"grad_norm": 0.1479254513978958,
"learning_rate": 0.001,
"loss": 2.6984,
"num_input_tokens_seen": 22229811200,
"step": 21200
},
{
"epoch": 0.46677841891166494,
"grad_norm": 0.15110637247562408,
"learning_rate": 0.001,
"loss": 2.7128,
"num_input_tokens_seen": 22282240000,
"step": 21250
},
{
"epoch": 0.46787672107381,
"grad_norm": 0.13746944069862366,
"learning_rate": 0.001,
"loss": 2.7036,
"num_input_tokens_seen": 22334668800,
"step": 21300
},
{
"epoch": 0.46897502323595514,
"grad_norm": 0.17940136790275574,
"learning_rate": 0.001,
"loss": 2.7048,
"num_input_tokens_seen": 22387097600,
"step": 21350
},
{
"epoch": 0.4700733253981002,
"grad_norm": 0.14203256368637085,
"learning_rate": 0.001,
"loss": 2.6997,
"num_input_tokens_seen": 22439526400,
"step": 21400
},
{
"epoch": 0.47117162756024533,
"grad_norm": 0.14260704815387726,
"learning_rate": 0.001,
"loss": 2.7092,
"num_input_tokens_seen": 22491955200,
"step": 21450
},
{
"epoch": 0.4722699297223904,
"grad_norm": 0.16455897688865662,
"learning_rate": 0.001,
"loss": 2.6969,
"num_input_tokens_seen": 22544384000,
"step": 21500
},
{
"epoch": 0.4722699297223904,
"eval_loss": 2.60367751121521,
"eval_runtime": 65.4304,
"eval_samples_per_second": 76.417,
"eval_steps_per_second": 19.104,
"num_input_tokens_seen": 22544384000,
"step": 21500
},
{
"epoch": 0.4733682318845355,
"grad_norm": 0.1529170274734497,
"learning_rate": 0.001,
"loss": 2.7003,
"num_input_tokens_seen": 22596812800,
"step": 21550
},
{
"epoch": 0.4744665340466806,
"grad_norm": 0.1921636164188385,
"learning_rate": 0.001,
"loss": 2.7014,
"num_input_tokens_seen": 22649241600,
"step": 21600
},
{
"epoch": 0.47556483620882567,
"grad_norm": 0.16029173135757446,
"learning_rate": 0.001,
"loss": 2.7028,
"num_input_tokens_seen": 22701670400,
"step": 21650
},
{
"epoch": 0.4766631383709708,
"grad_norm": 0.14740578830242157,
"learning_rate": 0.001,
"loss": 2.7019,
"num_input_tokens_seen": 22754099200,
"step": 21700
},
{
"epoch": 0.47776144053311587,
"grad_norm": 0.1734548658132553,
"learning_rate": 0.001,
"loss": 2.6985,
"num_input_tokens_seen": 22806528000,
"step": 21750
},
{
"epoch": 0.47885974269526094,
"grad_norm": 0.15502890944480896,
"learning_rate": 0.001,
"loss": 2.6973,
"num_input_tokens_seen": 22858956800,
"step": 21800
},
{
"epoch": 0.47995804485740606,
"grad_norm": 0.16783900558948517,
"learning_rate": 0.001,
"loss": 2.7003,
"num_input_tokens_seen": 22911385600,
"step": 21850
},
{
"epoch": 0.48105634701955113,
"grad_norm": 0.14911381900310516,
"learning_rate": 0.001,
"loss": 2.6992,
"num_input_tokens_seen": 22963814400,
"step": 21900
},
{
"epoch": 0.48215464918169626,
"grad_norm": 0.15027394890785217,
"learning_rate": 0.001,
"loss": 2.6957,
"num_input_tokens_seen": 23016243200,
"step": 21950
},
{
"epoch": 0.48325295134384133,
"grad_norm": 0.1261301189661026,
"learning_rate": 0.001,
"loss": 2.7064,
"num_input_tokens_seen": 23068672000,
"step": 22000
},
{
"epoch": 0.48325295134384133,
"eval_loss": 2.6012015342712402,
"eval_runtime": 64.9701,
"eval_samples_per_second": 76.958,
"eval_steps_per_second": 19.24,
"num_input_tokens_seen": 23068672000,
"step": 22000
},
{
"epoch": 0.48435125350598646,
"grad_norm": 0.15728288888931274,
"learning_rate": 0.001,
"loss": 2.703,
"num_input_tokens_seen": 23121100800,
"step": 22050
},
{
"epoch": 0.4854495556681315,
"grad_norm": 0.13599443435668945,
"learning_rate": 0.001,
"loss": 2.6984,
"num_input_tokens_seen": 23173529600,
"step": 22100
},
{
"epoch": 0.4865478578302766,
"grad_norm": 0.25702551007270813,
"learning_rate": 0.001,
"loss": 2.9388,
"num_input_tokens_seen": 23225958400,
"step": 22150
},
{
"epoch": 0.4876461599924217,
"grad_norm": 0.12942279875278473,
"learning_rate": 0.001,
"loss": 2.7568,
"num_input_tokens_seen": 23278383360,
"step": 22200
},
{
"epoch": 0.4887444621545668,
"grad_norm": 0.12908817827701569,
"learning_rate": 0.001,
"loss": 2.7195,
"num_input_tokens_seen": 23330812160,
"step": 22250
},
{
"epoch": 0.4898427643167119,
"grad_norm": 0.1351587176322937,
"learning_rate": 0.001,
"loss": 2.7155,
"num_input_tokens_seen": 23383240960,
"step": 22300
},
{
"epoch": 0.490941066478857,
"grad_norm": 0.1245250552892685,
"learning_rate": 0.001,
"loss": 2.7074,
"num_input_tokens_seen": 23435669760,
"step": 22350
},
{
"epoch": 0.4920393686410021,
"grad_norm": 0.13818837702274323,
"learning_rate": 0.001,
"loss": 2.7064,
"num_input_tokens_seen": 23488098560,
"step": 22400
},
{
"epoch": 0.4931376708031472,
"grad_norm": 0.15505041182041168,
"learning_rate": 0.001,
"loss": 2.7044,
"num_input_tokens_seen": 23540527360,
"step": 22450
},
{
"epoch": 0.49423597296529226,
"grad_norm": 0.14414137601852417,
"learning_rate": 0.001,
"loss": 2.7046,
"num_input_tokens_seen": 23592956160,
"step": 22500
},
{
"epoch": 0.49423597296529226,
"eval_loss": 2.60188627243042,
"eval_runtime": 67.3268,
"eval_samples_per_second": 74.265,
"eval_steps_per_second": 18.566,
"num_input_tokens_seen": 23592956160,
"step": 22500
},
{
"epoch": 0.4953342751274374,
"grad_norm": 0.14763414859771729,
"learning_rate": 0.001,
"loss": 2.695,
"num_input_tokens_seen": 23645384960,
"step": 22550
},
{
"epoch": 0.49643257728958246,
"grad_norm": 0.14800110459327698,
"learning_rate": 0.001,
"loss": 2.6939,
"num_input_tokens_seen": 23697813760,
"step": 22600
},
{
"epoch": 0.4975308794517276,
"grad_norm": 0.13590902090072632,
"learning_rate": 0.001,
"loss": 2.6967,
"num_input_tokens_seen": 23750242560,
"step": 22650
},
{
"epoch": 0.49862918161387265,
"grad_norm": 0.1315733939409256,
"learning_rate": 0.001,
"loss": 2.6909,
"num_input_tokens_seen": 23802671360,
"step": 22700
},
{
"epoch": 0.4997274837760177,
"grad_norm": 0.13714700937271118,
"learning_rate": 0.001,
"loss": 2.6957,
"num_input_tokens_seen": 23855100160,
"step": 22750
},
{
"epoch": 0.5008257859381628,
"grad_norm": 0.1412438154220581,
"learning_rate": 0.001,
"loss": 2.6977,
"num_input_tokens_seen": 23907528960,
"step": 22800
},
{
"epoch": 0.501924088100308,
"grad_norm": 0.15368172526359558,
"learning_rate": 0.001,
"loss": 2.6977,
"num_input_tokens_seen": 23959957760,
"step": 22850
},
{
"epoch": 0.503022390262453,
"grad_norm": 0.14018824696540833,
"learning_rate": 0.001,
"loss": 2.6992,
"num_input_tokens_seen": 24012386560,
"step": 22900
},
{
"epoch": 0.5041206924245981,
"grad_norm": 0.1284814178943634,
"learning_rate": 0.001,
"loss": 2.6962,
"num_input_tokens_seen": 24064815360,
"step": 22950
},
{
"epoch": 0.5052189945867432,
"grad_norm": 0.15145835280418396,
"learning_rate": 0.001,
"loss": 2.692,
"num_input_tokens_seen": 24117244160,
"step": 23000
},
{
"epoch": 0.5052189945867432,
"eval_loss": 2.5970778465270996,
"eval_runtime": 66.1666,
"eval_samples_per_second": 75.567,
"eval_steps_per_second": 18.892,
"num_input_tokens_seen": 24117244160,
"step": 23000
},
{
"epoch": 0.5063172967488883,
"grad_norm": 0.15117652714252472,
"learning_rate": 0.001,
"loss": 2.696,
"num_input_tokens_seen": 24169672960,
"step": 23050
},
{
"epoch": 0.5074155989110334,
"grad_norm": 0.15605470538139343,
"learning_rate": 0.001,
"loss": 2.6918,
"num_input_tokens_seen": 24222101760,
"step": 23100
},
{
"epoch": 0.5085139010731785,
"grad_norm": 0.17503651976585388,
"learning_rate": 0.001,
"loss": 2.688,
"num_input_tokens_seen": 24274530560,
"step": 23150
},
{
"epoch": 0.5096122032353236,
"grad_norm": 0.1622135490179062,
"learning_rate": 0.001,
"loss": 2.6949,
"num_input_tokens_seen": 24326959360,
"step": 23200
},
{
"epoch": 0.5107105053974687,
"grad_norm": 0.1331271231174469,
"learning_rate": 0.001,
"loss": 2.6876,
"num_input_tokens_seen": 24379388160,
"step": 23250
},
{
"epoch": 0.5118088075596138,
"grad_norm": 0.14365510642528534,
"learning_rate": 0.001,
"loss": 2.7027,
"num_input_tokens_seen": 24431816960,
"step": 23300
},
{
"epoch": 0.5129071097217589,
"grad_norm": 0.13621902465820312,
"learning_rate": 0.001,
"loss": 2.6946,
"num_input_tokens_seen": 24484245760,
"step": 23350
},
{
"epoch": 0.5140054118839039,
"grad_norm": 0.12506547570228577,
"learning_rate": 0.001,
"loss": 2.6864,
"num_input_tokens_seen": 24536674560,
"step": 23400
},
{
"epoch": 0.515103714046049,
"grad_norm": 0.12824128568172455,
"learning_rate": 0.001,
"loss": 2.6871,
"num_input_tokens_seen": 24589103360,
"step": 23450
},
{
"epoch": 0.5162020162081942,
"grad_norm": 0.14310036599636078,
"learning_rate": 0.001,
"loss": 2.6936,
"num_input_tokens_seen": 24641532160,
"step": 23500
},
{
"epoch": 0.5162020162081942,
"eval_loss": 2.592362880706787,
"eval_runtime": 66.663,
"eval_samples_per_second": 75.004,
"eval_steps_per_second": 18.751,
"num_input_tokens_seen": 24641532160,
"step": 23500
},
{
"epoch": 0.5173003183703393,
"grad_norm": 0.1362077295780182,
"learning_rate": 0.001,
"loss": 2.6924,
"num_input_tokens_seen": 24693960960,
"step": 23550
},
{
"epoch": 0.5183986205324843,
"grad_norm": 0.13662473857402802,
"learning_rate": 0.001,
"loss": 2.6972,
"num_input_tokens_seen": 24746389760,
"step": 23600
},
{
"epoch": 0.5194969226946294,
"grad_norm": 0.12603560090065002,
"learning_rate": 0.001,
"loss": 2.6908,
"num_input_tokens_seen": 24798818560,
"step": 23650
},
{
"epoch": 0.5205952248567746,
"grad_norm": 0.16597150266170502,
"learning_rate": 0.001,
"loss": 2.6882,
"num_input_tokens_seen": 24851247360,
"step": 23700
},
{
"epoch": 0.5216935270189196,
"grad_norm": 0.13665246963500977,
"learning_rate": 0.001,
"loss": 2.6958,
"num_input_tokens_seen": 24903676160,
"step": 23750
},
{
"epoch": 0.5227918291810647,
"grad_norm": 0.14349523186683655,
"learning_rate": 0.001,
"loss": 2.6874,
"num_input_tokens_seen": 24956104960,
"step": 23800
},
{
"epoch": 0.5238901313432098,
"grad_norm": 0.15857954323291779,
"learning_rate": 0.001,
"loss": 2.6882,
"num_input_tokens_seen": 25008533760,
"step": 23850
},
{
"epoch": 0.524988433505355,
"grad_norm": 0.15056300163269043,
"learning_rate": 0.001,
"loss": 2.694,
"num_input_tokens_seen": 25060962560,
"step": 23900
},
{
"epoch": 0.5260867356675,
"grad_norm": 0.12861080467700958,
"learning_rate": 0.001,
"loss": 2.6899,
"num_input_tokens_seen": 25113391360,
"step": 23950
},
{
"epoch": 0.5271850378296451,
"grad_norm": 0.14443258941173553,
"learning_rate": 0.001,
"loss": 2.6929,
"num_input_tokens_seen": 25165820160,
"step": 24000
},
{
"epoch": 0.5271850378296451,
"eval_loss": 2.5910630226135254,
"eval_runtime": 66.9014,
"eval_samples_per_second": 74.737,
"eval_steps_per_second": 18.684,
"num_input_tokens_seen": 25165820160,
"step": 24000
},
{
"epoch": 0.5282833399917902,
"grad_norm": 0.14083649218082428,
"learning_rate": 0.001,
"loss": 2.6851,
"num_input_tokens_seen": 25218248960,
"step": 24050
},
{
"epoch": 0.5293816421539352,
"grad_norm": 0.13934968411922455,
"learning_rate": 0.001,
"loss": 2.6863,
"num_input_tokens_seen": 25270677760,
"step": 24100
},
{
"epoch": 0.5304799443160804,
"grad_norm": 0.15416787564754486,
"learning_rate": 0.001,
"loss": 2.6894,
"num_input_tokens_seen": 25323106560,
"step": 24150
},
{
"epoch": 0.5315782464782255,
"grad_norm": 0.17290246486663818,
"learning_rate": 0.001,
"loss": 2.6907,
"num_input_tokens_seen": 25375535360,
"step": 24200
},
{
"epoch": 0.5326765486403706,
"grad_norm": 0.14260552823543549,
"learning_rate": 0.001,
"loss": 2.6832,
"num_input_tokens_seen": 25427964160,
"step": 24250
},
{
"epoch": 0.5337748508025156,
"grad_norm": 0.14795690774917603,
"learning_rate": 0.001,
"loss": 2.6895,
"num_input_tokens_seen": 25480392960,
"step": 24300
},
{
"epoch": 0.5348731529646608,
"grad_norm": 0.15009699761867523,
"learning_rate": 0.001,
"loss": 2.6819,
"num_input_tokens_seen": 25532821760,
"step": 24350
},
{
"epoch": 0.5359714551268059,
"grad_norm": 0.15425953269004822,
"learning_rate": 0.001,
"loss": 2.6874,
"num_input_tokens_seen": 25585250560,
"step": 24400
},
{
"epoch": 0.5370697572889509,
"grad_norm": 0.14639410376548767,
"learning_rate": 0.001,
"loss": 2.6878,
"num_input_tokens_seen": 25637679360,
"step": 24450
},
{
"epoch": 0.538168059451096,
"grad_norm": 0.14785613119602203,
"learning_rate": 0.001,
"loss": 2.6841,
"num_input_tokens_seen": 25690108160,
"step": 24500
},
{
"epoch": 0.538168059451096,
"eval_loss": 2.5875706672668457,
"eval_runtime": 66.9296,
"eval_samples_per_second": 74.705,
"eval_steps_per_second": 18.676,
"num_input_tokens_seen": 25690108160,
"step": 24500
},
{
"epoch": 0.5392663616132412,
"grad_norm": 0.14224180579185486,
"learning_rate": 0.001,
"loss": 2.6876,
"num_input_tokens_seen": 25742536960,
"step": 24550
},
{
"epoch": 0.5403646637753863,
"grad_norm": 0.14881493151187897,
"learning_rate": 0.001,
"loss": 2.6827,
"num_input_tokens_seen": 25794965760,
"step": 24600
},
{
"epoch": 0.5414629659375313,
"grad_norm": 0.17951786518096924,
"learning_rate": 0.001,
"loss": 2.688,
"num_input_tokens_seen": 25847394560,
"step": 24650
},
{
"epoch": 0.5425612680996764,
"grad_norm": 0.1400926560163498,
"learning_rate": 0.001,
"loss": 2.6945,
"num_input_tokens_seen": 25899823360,
"step": 24700
},
{
"epoch": 0.5436595702618215,
"grad_norm": 0.1421627402305603,
"learning_rate": 0.001,
"loss": 2.6852,
"num_input_tokens_seen": 25952252160,
"step": 24750
},
{
"epoch": 0.5447578724239666,
"grad_norm": 0.1617737114429474,
"learning_rate": 0.001,
"loss": 2.686,
"num_input_tokens_seen": 26004680960,
"step": 24800
},
{
"epoch": 0.5458561745861117,
"grad_norm": 0.1523471176624298,
"learning_rate": 0.001,
"loss": 2.6945,
"num_input_tokens_seen": 26057109760,
"step": 24850
},
{
"epoch": 0.5469544767482568,
"grad_norm": 0.13078247010707855,
"learning_rate": 0.001,
"loss": 2.6829,
"num_input_tokens_seen": 26109538560,
"step": 24900
},
{
"epoch": 0.5480527789104018,
"grad_norm": 0.14831651747226715,
"learning_rate": 0.001,
"loss": 2.6898,
"num_input_tokens_seen": 26161967360,
"step": 24950
},
{
"epoch": 0.549151081072547,
"grad_norm": 0.1782410740852356,
"learning_rate": 0.001,
"loss": 2.6871,
"num_input_tokens_seen": 26214396160,
"step": 25000
},
{
"epoch": 0.549151081072547,
"eval_loss": 2.5877788066864014,
"eval_runtime": 67.2223,
"eval_samples_per_second": 74.38,
"eval_steps_per_second": 18.595,
"num_input_tokens_seen": 26214396160,
"step": 25000
},
{
"epoch": 0.5502493832346921,
"grad_norm": 0.16484692692756653,
"learning_rate": 0.001,
"loss": 2.6843,
"num_input_tokens_seen": 26266824960,
"step": 25050
},
{
"epoch": 0.5513476853968372,
"grad_norm": 0.1583317369222641,
"learning_rate": 0.001,
"loss": 2.6825,
"num_input_tokens_seen": 26319253760,
"step": 25100
},
{
"epoch": 0.5524459875589822,
"grad_norm": 0.1569424867630005,
"learning_rate": 0.001,
"loss": 2.6787,
"num_input_tokens_seen": 26371682560,
"step": 25150
},
{
"epoch": 0.5535442897211273,
"grad_norm": 0.13633306324481964,
"learning_rate": 0.001,
"loss": 2.6872,
"num_input_tokens_seen": 26424111360,
"step": 25200
},
{
"epoch": 0.5546425918832725,
"grad_norm": 0.1480533927679062,
"learning_rate": 0.001,
"loss": 2.6842,
"num_input_tokens_seen": 26476540160,
"step": 25250
},
{
"epoch": 0.5557408940454175,
"grad_norm": 0.1267666518688202,
"learning_rate": 0.001,
"loss": 2.6839,
"num_input_tokens_seen": 26528968960,
"step": 25300
},
{
"epoch": 0.5568391962075626,
"grad_norm": 0.13951599597930908,
"learning_rate": 0.001,
"loss": 2.6799,
"num_input_tokens_seen": 26581397760,
"step": 25350
},
{
"epoch": 0.5579374983697077,
"grad_norm": 0.15044580399990082,
"learning_rate": 0.001,
"loss": 2.6846,
"num_input_tokens_seen": 26633826560,
"step": 25400
},
{
"epoch": 0.5590358005318529,
"grad_norm": 0.12891829013824463,
"learning_rate": 0.001,
"loss": 2.682,
"num_input_tokens_seen": 26686255360,
"step": 25450
},
{
"epoch": 0.5601341026939979,
"grad_norm": 0.12812241911888123,
"learning_rate": 0.001,
"loss": 2.684,
"num_input_tokens_seen": 26738684160,
"step": 25500
},
{
"epoch": 0.5601341026939979,
"eval_loss": 2.5832085609436035,
"eval_runtime": 66.9038,
"eval_samples_per_second": 74.734,
"eval_steps_per_second": 18.684,
"num_input_tokens_seen": 26738684160,
"step": 25500
},
{
"epoch": 0.561232404856143,
"grad_norm": 0.14243654906749725,
"learning_rate": 0.001,
"loss": 2.6883,
"num_input_tokens_seen": 26791112960,
"step": 25550
},
{
"epoch": 0.5623307070182881,
"grad_norm": 0.14436320960521698,
"learning_rate": 0.001,
"loss": 2.6835,
"num_input_tokens_seen": 26843541760,
"step": 25600
},
{
"epoch": 0.5634290091804331,
"grad_norm": 0.1516960710287094,
"learning_rate": 0.001,
"loss": 2.6752,
"num_input_tokens_seen": 26895970560,
"step": 25650
},
{
"epoch": 0.5645273113425783,
"grad_norm": 0.14002515375614166,
"learning_rate": 0.001,
"loss": 2.6817,
"num_input_tokens_seen": 26948399360,
"step": 25700
},
{
"epoch": 0.5656256135047234,
"grad_norm": 0.1379036009311676,
"learning_rate": 0.001,
"loss": 2.6904,
"num_input_tokens_seen": 27000828160,
"step": 25750
},
{
"epoch": 0.5667239156668685,
"grad_norm": 0.16127964854240417,
"learning_rate": 0.001,
"loss": 2.6813,
"num_input_tokens_seen": 27053256960,
"step": 25800
},
{
"epoch": 0.5678222178290135,
"grad_norm": 0.15714125335216522,
"learning_rate": 0.001,
"loss": 2.6851,
"num_input_tokens_seen": 27105685760,
"step": 25850
},
{
"epoch": 0.5689205199911587,
"grad_norm": 0.15288160741329193,
"learning_rate": 0.001,
"loss": 2.6832,
"num_input_tokens_seen": 27158114560,
"step": 25900
},
{
"epoch": 0.5700188221533038,
"grad_norm": 0.1398363709449768,
"learning_rate": 0.001,
"loss": 2.6814,
"num_input_tokens_seen": 27210543360,
"step": 25950
},
{
"epoch": 0.5711171243154488,
"grad_norm": 0.15253235399723053,
"learning_rate": 0.001,
"loss": 2.6755,
"num_input_tokens_seen": 27262972160,
"step": 26000
},
{
"epoch": 0.5711171243154488,
"eval_loss": 2.5809168815612793,
"eval_runtime": 66.151,
"eval_samples_per_second": 75.585,
"eval_steps_per_second": 18.896,
"num_input_tokens_seen": 27262972160,
"step": 26000
},
{
"epoch": 0.5722154264775939,
"grad_norm": 0.1538383513689041,
"learning_rate": 0.001,
"loss": 2.6783,
"num_input_tokens_seen": 27315400960,
"step": 26050
},
{
"epoch": 0.5733137286397391,
"grad_norm": 0.15545998513698578,
"learning_rate": 0.001,
"loss": 2.6798,
"num_input_tokens_seen": 27367829760,
"step": 26100
},
{
"epoch": 0.5744120308018842,
"grad_norm": 0.15456970036029816,
"learning_rate": 0.001,
"loss": 2.6836,
"num_input_tokens_seen": 27420258560,
"step": 26150
},
{
"epoch": 0.5755103329640292,
"grad_norm": 0.1353277862071991,
"learning_rate": 0.001,
"loss": 2.6777,
"num_input_tokens_seen": 27472687360,
"step": 26200
},
{
"epoch": 0.5766086351261743,
"grad_norm": 0.15124258399009705,
"learning_rate": 0.001,
"loss": 2.681,
"num_input_tokens_seen": 27525116160,
"step": 26250
},
{
"epoch": 0.5777069372883195,
"grad_norm": 0.14200901985168457,
"learning_rate": 0.001,
"loss": 2.6827,
"num_input_tokens_seen": 27577544960,
"step": 26300
},
{
"epoch": 0.5788052394504645,
"grad_norm": 0.15356388688087463,
"learning_rate": 0.001,
"loss": 2.6802,
"num_input_tokens_seen": 27629973760,
"step": 26350
},
{
"epoch": 0.5799035416126096,
"grad_norm": 0.17395390570163727,
"learning_rate": 0.001,
"loss": 2.6921,
"num_input_tokens_seen": 27682402560,
"step": 26400
},
{
"epoch": 0.5810018437747547,
"grad_norm": 0.1507692188024521,
"learning_rate": 0.001,
"loss": 2.6811,
"num_input_tokens_seen": 27734831360,
"step": 26450
},
{
"epoch": 0.5821001459368998,
"grad_norm": 0.14512786269187927,
"learning_rate": 0.001,
"loss": 2.6798,
"num_input_tokens_seen": 27787260160,
"step": 26500
},
{
"epoch": 0.5821001459368998,
"eval_loss": 2.5802626609802246,
"eval_runtime": 67.1032,
"eval_samples_per_second": 74.512,
"eval_steps_per_second": 18.628,
"num_input_tokens_seen": 27787260160,
"step": 26500
},
{
"epoch": 0.5831984480990449,
"grad_norm": 0.15365912020206451,
"learning_rate": 0.001,
"loss": 2.6813,
"num_input_tokens_seen": 27839688960,
"step": 26550
},
{
"epoch": 0.58429675026119,
"grad_norm": 0.14015646278858185,
"learning_rate": 0.001,
"loss": 2.6774,
"num_input_tokens_seen": 27892117760,
"step": 26600
},
{
"epoch": 0.5853950524233351,
"grad_norm": 0.1529797911643982,
"learning_rate": 0.001,
"loss": 2.6751,
"num_input_tokens_seen": 27944546560,
"step": 26650
},
{
"epoch": 0.5864933545854801,
"grad_norm": 0.16909636557102203,
"learning_rate": 0.001,
"loss": 2.6795,
"num_input_tokens_seen": 27996975360,
"step": 26700
},
{
"epoch": 0.5875916567476253,
"grad_norm": 0.14130276441574097,
"learning_rate": 0.001,
"loss": 2.6809,
"num_input_tokens_seen": 28049404160,
"step": 26750
},
{
"epoch": 0.5886899589097704,
"grad_norm": 0.15182790160179138,
"learning_rate": 0.001,
"loss": 2.685,
"num_input_tokens_seen": 28101832960,
"step": 26800
},
{
"epoch": 0.5897882610719154,
"grad_norm": 0.12757331132888794,
"learning_rate": 0.001,
"loss": 2.6766,
"num_input_tokens_seen": 28154261760,
"step": 26850
},
{
"epoch": 0.5908865632340605,
"grad_norm": 0.1527504026889801,
"learning_rate": 0.001,
"loss": 2.6767,
"num_input_tokens_seen": 28206690560,
"step": 26900
},
{
"epoch": 0.5919848653962057,
"grad_norm": 0.18337304890155792,
"learning_rate": 0.001,
"loss": 2.6752,
"num_input_tokens_seen": 28259119360,
"step": 26950
},
{
"epoch": 0.5930831675583508,
"grad_norm": 0.1472473442554474,
"learning_rate": 0.001,
"loss": 2.6717,
"num_input_tokens_seen": 28311548160,
"step": 27000
},
{
"epoch": 0.5930831675583508,
"eval_loss": 2.5781941413879395,
"eval_runtime": 66.2194,
"eval_samples_per_second": 75.507,
"eval_steps_per_second": 18.877,
"num_input_tokens_seen": 28311548160,
"step": 27000
},
{
"epoch": 0.5941814697204958,
"grad_norm": 0.15350718796253204,
"learning_rate": 0.001,
"loss": 2.6787,
"num_input_tokens_seen": 28363976960,
"step": 27050
},
{
"epoch": 0.5952797718826409,
"grad_norm": 0.1393333077430725,
"learning_rate": 0.001,
"loss": 2.6759,
"num_input_tokens_seen": 28416405760,
"step": 27100
},
{
"epoch": 0.596378074044786,
"grad_norm": 0.1485709846019745,
"learning_rate": 0.001,
"loss": 2.6772,
"num_input_tokens_seen": 28468834560,
"step": 27150
},
{
"epoch": 0.5974763762069311,
"grad_norm": 0.13909003138542175,
"learning_rate": 0.001,
"loss": 2.6729,
"num_input_tokens_seen": 28521263360,
"step": 27200
},
{
"epoch": 0.5985746783690762,
"grad_norm": 0.15117496252059937,
"learning_rate": 0.001,
"loss": 2.6704,
"num_input_tokens_seen": 28573692160,
"step": 27250
},
{
"epoch": 0.5996729805312213,
"grad_norm": 0.14054876565933228,
"learning_rate": 0.001,
"loss": 2.6748,
"num_input_tokens_seen": 28626120960,
"step": 27300
},
{
"epoch": 0.6007712826933664,
"grad_norm": 0.15437620878219604,
"learning_rate": 0.001,
"loss": 2.6778,
"num_input_tokens_seen": 28678549760,
"step": 27350
},
{
"epoch": 0.6018695848555115,
"grad_norm": 0.15858007967472076,
"learning_rate": 0.001,
"loss": 2.6763,
"num_input_tokens_seen": 28730978560,
"step": 27400
},
{
"epoch": 0.6029678870176566,
"grad_norm": 0.14459487795829773,
"learning_rate": 0.001,
"loss": 2.6726,
"num_input_tokens_seen": 28783407360,
"step": 27450
},
{
"epoch": 0.6040661891798017,
"grad_norm": 0.17691345512866974,
"learning_rate": 0.001,
"loss": 2.678,
"num_input_tokens_seen": 28835836160,
"step": 27500
},
{
"epoch": 0.6040661891798017,
"eval_loss": 2.576051950454712,
"eval_runtime": 66.9387,
"eval_samples_per_second": 74.695,
"eval_steps_per_second": 18.674,
"num_input_tokens_seen": 28835836160,
"step": 27500
},
{
"epoch": 0.6051644913419467,
"grad_norm": 0.16200922429561615,
"learning_rate": 0.001,
"loss": 2.6763,
"num_input_tokens_seen": 28888264960,
"step": 27550
},
{
"epoch": 0.6062627935040918,
"grad_norm": 0.14567038416862488,
"learning_rate": 0.001,
"loss": 2.6795,
"num_input_tokens_seen": 28940693760,
"step": 27600
},
{
"epoch": 0.607361095666237,
"grad_norm": 0.16075611114501953,
"learning_rate": 0.001,
"loss": 2.6746,
"num_input_tokens_seen": 28993122560,
"step": 27650
},
{
"epoch": 0.6084593978283821,
"grad_norm": 0.1386987417936325,
"learning_rate": 0.001,
"loss": 2.6771,
"num_input_tokens_seen": 29045551360,
"step": 27700
},
{
"epoch": 0.6095576999905271,
"grad_norm": 0.14672614634037018,
"learning_rate": 0.001,
"loss": 2.6792,
"num_input_tokens_seen": 29097980160,
"step": 27750
},
{
"epoch": 0.6106560021526722,
"grad_norm": 0.22614523768424988,
"learning_rate": 0.001,
"loss": 2.6728,
"num_input_tokens_seen": 29150408960,
"step": 27800
},
{
"epoch": 0.6117543043148174,
"grad_norm": 0.15554341673851013,
"learning_rate": 0.001,
"loss": 2.676,
"num_input_tokens_seen": 29202837760,
"step": 27850
},
{
"epoch": 0.6128526064769624,
"grad_norm": 0.17181837558746338,
"learning_rate": 0.001,
"loss": 2.6811,
"num_input_tokens_seen": 29255266560,
"step": 27900
},
{
"epoch": 0.6139509086391075,
"grad_norm": 0.15763437747955322,
"learning_rate": 0.001,
"loss": 2.6797,
"num_input_tokens_seen": 29307695360,
"step": 27950
},
{
"epoch": 0.6150492108012526,
"grad_norm": 0.14721135795116425,
"learning_rate": 0.001,
"loss": 2.6762,
"num_input_tokens_seen": 29360124160,
"step": 28000
},
{
"epoch": 0.6150492108012526,
"eval_loss": 2.5763511657714844,
"eval_runtime": 66.3236,
"eval_samples_per_second": 75.388,
"eval_steps_per_second": 18.847,
"num_input_tokens_seen": 29360124160,
"step": 28000
},
{
"epoch": 0.6161475129633978,
"grad_norm": 0.13857993483543396,
"learning_rate": 0.001,
"loss": 2.677,
"num_input_tokens_seen": 29412552960,
"step": 28050
},
{
"epoch": 0.6172458151255428,
"grad_norm": 0.14276473224163055,
"learning_rate": 0.001,
"loss": 2.6669,
"num_input_tokens_seen": 29464981760,
"step": 28100
},
{
"epoch": 0.6183441172876879,
"grad_norm": 0.1536131203174591,
"learning_rate": 0.001,
"loss": 2.6757,
"num_input_tokens_seen": 29517410560,
"step": 28150
},
{
"epoch": 0.619442419449833,
"grad_norm": 0.15733414888381958,
"learning_rate": 0.001,
"loss": 2.6735,
"num_input_tokens_seen": 29569839360,
"step": 28200
},
{
"epoch": 0.620540721611978,
"grad_norm": 0.14553523063659668,
"learning_rate": 0.001,
"loss": 2.6683,
"num_input_tokens_seen": 29622268160,
"step": 28250
},
{
"epoch": 0.6216390237741232,
"grad_norm": 0.15685459971427917,
"learning_rate": 0.001,
"loss": 2.6692,
"num_input_tokens_seen": 29674696960,
"step": 28300
},
{
"epoch": 0.6227373259362683,
"grad_norm": 0.16553767025470734,
"learning_rate": 0.001,
"loss": 2.6778,
"num_input_tokens_seen": 29727125760,
"step": 28350
},
{
"epoch": 0.6238356280984134,
"grad_norm": 0.1619853973388672,
"learning_rate": 0.001,
"loss": 2.6807,
"num_input_tokens_seen": 29779554560,
"step": 28400
},
{
"epoch": 0.6249339302605584,
"grad_norm": 0.12794817984104156,
"learning_rate": 0.001,
"loss": 2.6776,
"num_input_tokens_seen": 29831983360,
"step": 28450
},
{
"epoch": 0.6260322324227036,
"grad_norm": 0.17001128196716309,
"learning_rate": 0.001,
"loss": 2.6797,
"num_input_tokens_seen": 29884412160,
"step": 28500
},
{
"epoch": 0.6260322324227036,
"eval_loss": 2.5728061199188232,
"eval_runtime": 66.7752,
"eval_samples_per_second": 74.878,
"eval_steps_per_second": 18.72,
"num_input_tokens_seen": 29884412160,
"step": 28500
},
{
"epoch": 0.6271305345848487,
"grad_norm": 0.12936875224113464,
"learning_rate": 0.001,
"loss": 2.6677,
"num_input_tokens_seen": 29936840960,
"step": 28550
},
{
"epoch": 0.6282288367469937,
"grad_norm": 0.14839358627796173,
"learning_rate": 0.001,
"loss": 2.6681,
"num_input_tokens_seen": 29989269760,
"step": 28600
},
{
"epoch": 0.6293271389091388,
"grad_norm": 0.1526126265525818,
"learning_rate": 0.001,
"loss": 2.6711,
"num_input_tokens_seen": 30041698560,
"step": 28650
},
{
"epoch": 0.630425441071284,
"grad_norm": 11.806962013244629,
"learning_rate": 0.001,
"loss": 2.7543,
"num_input_tokens_seen": 30094127360,
"step": 28700
},
{
"epoch": 0.631523743233429,
"grad_norm": 0.13446328043937683,
"learning_rate": 0.001,
"loss": 2.9466,
"num_input_tokens_seen": 30146556160,
"step": 28750
},
{
"epoch": 0.6326220453955741,
"grad_norm": 0.1319582760334015,
"learning_rate": 0.001,
"loss": 2.7002,
"num_input_tokens_seen": 30198984960,
"step": 28800
},
{
"epoch": 0.6337203475577192,
"grad_norm": 0.13955356180667877,
"learning_rate": 0.001,
"loss": 2.6814,
"num_input_tokens_seen": 30251413760,
"step": 28850
},
{
"epoch": 0.6348186497198643,
"grad_norm": 0.1295064240694046,
"learning_rate": 0.001,
"loss": 2.676,
"num_input_tokens_seen": 30303842560,
"step": 28900
},
{
"epoch": 0.6359169518820094,
"grad_norm": 0.1440495401620865,
"learning_rate": 0.001,
"loss": 2.6778,
"num_input_tokens_seen": 30356271360,
"step": 28950
},
{
"epoch": 0.6370152540441545,
"grad_norm": 0.13806115090847015,
"learning_rate": 0.001,
"loss": 2.6712,
"num_input_tokens_seen": 30408700160,
"step": 29000
},
{
"epoch": 0.6370152540441545,
"eval_loss": 2.576237440109253,
"eval_runtime": 66.9761,
"eval_samples_per_second": 74.653,
"eval_steps_per_second": 18.663,
"num_input_tokens_seen": 30408700160,
"step": 29000
},
{
"epoch": 0.6381135562062996,
"grad_norm": 0.13853897154331207,
"learning_rate": 0.001,
"loss": 2.6719,
"num_input_tokens_seen": 30461128960,
"step": 29050
},
{
"epoch": 0.6392118583684446,
"grad_norm": 0.14228977262973785,
"learning_rate": 0.001,
"loss": 2.6788,
"num_input_tokens_seen": 30513557760,
"step": 29100
},
{
"epoch": 0.6403101605305898,
"grad_norm": 0.13464143872261047,
"learning_rate": 0.001,
"loss": 2.6743,
"num_input_tokens_seen": 30565986560,
"step": 29150
},
{
"epoch": 0.6414084626927349,
"grad_norm": 0.15960821509361267,
"learning_rate": 0.001,
"loss": 2.6729,
"num_input_tokens_seen": 30618415360,
"step": 29200
},
{
"epoch": 0.64250676485488,
"grad_norm": 0.13830585777759552,
"learning_rate": 0.001,
"loss": 2.6723,
"num_input_tokens_seen": 30670844160,
"step": 29250
},
{
"epoch": 0.643605067017025,
"grad_norm": 0.14440728724002838,
"learning_rate": 0.001,
"loss": 2.664,
"num_input_tokens_seen": 30723272960,
"step": 29300
},
{
"epoch": 0.6447033691791701,
"grad_norm": 0.14259463548660278,
"learning_rate": 0.001,
"loss": 2.6675,
"num_input_tokens_seen": 30775701760,
"step": 29350
},
{
"epoch": 0.6458016713413153,
"grad_norm": 0.1462564468383789,
"learning_rate": 0.001,
"loss": 2.6671,
"num_input_tokens_seen": 30828130560,
"step": 29400
},
{
"epoch": 0.6468999735034603,
"grad_norm": 0.1443469077348709,
"learning_rate": 0.001,
"loss": 2.6667,
"num_input_tokens_seen": 30880559360,
"step": 29450
},
{
"epoch": 0.6479982756656054,
"grad_norm": 0.143255814909935,
"learning_rate": 0.001,
"loss": 2.6652,
"num_input_tokens_seen": 30932988160,
"step": 29500
},
{
"epoch": 0.6479982756656054,
"eval_loss": 2.569544792175293,
"eval_runtime": 66.8674,
"eval_samples_per_second": 74.775,
"eval_steps_per_second": 18.694,
"num_input_tokens_seen": 30932988160,
"step": 29500
},
{
"epoch": 0.6490965778277505,
"grad_norm": 0.15149758756160736,
"learning_rate": 0.001,
"loss": 2.6681,
"num_input_tokens_seen": 30985416960,
"step": 29550
},
{
"epoch": 0.6501948799898957,
"grad_norm": 0.15703468024730682,
"learning_rate": 0.001,
"loss": 2.6681,
"num_input_tokens_seen": 31037845760,
"step": 29600
},
{
"epoch": 0.6512931821520407,
"grad_norm": 0.14332515001296997,
"learning_rate": 0.001,
"loss": 2.6622,
"num_input_tokens_seen": 31090274560,
"step": 29650
},
{
"epoch": 0.6523914843141858,
"grad_norm": 0.13763870298862457,
"learning_rate": 0.001,
"loss": 2.6724,
"num_input_tokens_seen": 31142703360,
"step": 29700
},
{
"epoch": 0.6534897864763309,
"grad_norm": 0.11858976632356644,
"learning_rate": 0.001,
"loss": 2.6743,
"num_input_tokens_seen": 31195132160,
"step": 29750
},
{
"epoch": 0.654588088638476,
"grad_norm": 0.15627937018871307,
"learning_rate": 0.001,
"loss": 2.6653,
"num_input_tokens_seen": 31247560960,
"step": 29800
},
{
"epoch": 0.6556863908006211,
"grad_norm": 0.15052759647369385,
"learning_rate": 0.001,
"loss": 2.6684,
"num_input_tokens_seen": 31299989760,
"step": 29850
},
{
"epoch": 0.6567846929627662,
"grad_norm": 0.1648450791835785,
"learning_rate": 0.001,
"loss": 2.6783,
"num_input_tokens_seen": 31352418560,
"step": 29900
},
{
"epoch": 0.6578829951249113,
"grad_norm": 0.13318586349487305,
"learning_rate": 0.001,
"loss": 2.6712,
"num_input_tokens_seen": 31404847360,
"step": 29950
},
{
"epoch": 0.6589812972870563,
"grad_norm": 0.1517287641763687,
"learning_rate": 0.001,
"loss": 2.6688,
"num_input_tokens_seen": 31457276160,
"step": 30000
},
{
"epoch": 0.6589812972870563,
"eval_loss": 2.5676708221435547,
"eval_runtime": 66.0876,
"eval_samples_per_second": 75.657,
"eval_steps_per_second": 18.914,
"num_input_tokens_seen": 31457276160,
"step": 30000
},
{
"epoch": 0.6600795994492015,
"grad_norm": 0.14465224742889404,
"learning_rate": 0.001,
"loss": 2.6657,
"num_input_tokens_seen": 31509704960,
"step": 30050
},
{
"epoch": 0.6611779016113466,
"grad_norm": 0.16096332669258118,
"learning_rate": 0.001,
"loss": 2.6612,
"num_input_tokens_seen": 31562133760,
"step": 30100
},
{
"epoch": 0.6622762037734916,
"grad_norm": 0.1434296816587448,
"learning_rate": 0.001,
"loss": 2.6695,
"num_input_tokens_seen": 31614562560,
"step": 30150
},
{
"epoch": 0.6633745059356367,
"grad_norm": 0.13844367861747742,
"learning_rate": 0.001,
"loss": 2.6649,
"num_input_tokens_seen": 31666991360,
"step": 30200
},
{
"epoch": 0.6644728080977819,
"grad_norm": 0.1579446643590927,
"learning_rate": 0.001,
"loss": 2.6701,
"num_input_tokens_seen": 31719420160,
"step": 30250
},
{
"epoch": 0.665571110259927,
"grad_norm": 0.1585385501384735,
"learning_rate": 0.001,
"loss": 2.665,
"num_input_tokens_seen": 31771848960,
"step": 30300
},
{
"epoch": 0.666669412422072,
"grad_norm": 0.18768636882305145,
"learning_rate": 0.001,
"loss": 2.6708,
"num_input_tokens_seen": 31824277760,
"step": 30350
},
{
"epoch": 0.6677677145842171,
"grad_norm": 0.13027966022491455,
"learning_rate": 0.001,
"loss": 2.6657,
"num_input_tokens_seen": 31876706560,
"step": 30400
},
{
"epoch": 0.6688660167463623,
"grad_norm": 0.13473722338676453,
"learning_rate": 0.001,
"loss": 2.6658,
"num_input_tokens_seen": 31929135360,
"step": 30450
},
{
"epoch": 0.6699643189085073,
"grad_norm": 0.14617317914962769,
"learning_rate": 0.001,
"loss": 2.664,
"num_input_tokens_seen": 31981564160,
"step": 30500
},
{
"epoch": 0.6699643189085073,
"eval_loss": 2.5658769607543945,
"eval_runtime": 67.5011,
"eval_samples_per_second": 74.073,
"eval_steps_per_second": 18.518,
"num_input_tokens_seen": 31981564160,
"step": 30500
},
{
"epoch": 0.6710626210706524,
"grad_norm": 0.14581717550754547,
"learning_rate": 0.001,
"loss": 2.6654,
"num_input_tokens_seen": 32033992960,
"step": 30550
},
{
"epoch": 0.6721609232327975,
"grad_norm": 0.12281567603349686,
"learning_rate": 0.001,
"loss": 2.6649,
"num_input_tokens_seen": 32086421760,
"step": 30600
},
{
"epoch": 0.6732592253949425,
"grad_norm": 0.14368072152137756,
"learning_rate": 0.001,
"loss": 2.6605,
"num_input_tokens_seen": 32138850560,
"step": 30650
},
{
"epoch": 0.6743575275570877,
"grad_norm": 0.14596907794475555,
"learning_rate": 0.001,
"loss": 2.6651,
"num_input_tokens_seen": 32191279360,
"step": 30700
},
{
"epoch": 0.6754558297192328,
"grad_norm": 0.15414392948150635,
"learning_rate": 0.001,
"loss": 2.6696,
"num_input_tokens_seen": 32243708160,
"step": 30750
},
{
"epoch": 0.6765541318813779,
"grad_norm": 0.14875884354114532,
"learning_rate": 0.001,
"loss": 2.6662,
"num_input_tokens_seen": 32296136960,
"step": 30800
},
{
"epoch": 0.6776524340435229,
"grad_norm": 0.13774773478507996,
"learning_rate": 0.001,
"loss": 2.6649,
"num_input_tokens_seen": 32348565760,
"step": 30850
},
{
"epoch": 0.6787507362056681,
"grad_norm": 0.1647578626871109,
"learning_rate": 0.001,
"loss": 2.6693,
"num_input_tokens_seen": 32400994560,
"step": 30900
},
{
"epoch": 0.6798490383678132,
"grad_norm": 0.1620490700006485,
"learning_rate": 0.001,
"loss": 2.6726,
"num_input_tokens_seen": 32453423360,
"step": 30950
},
{
"epoch": 0.6809473405299582,
"grad_norm": 0.14238062500953674,
"learning_rate": 0.001,
"loss": 2.6681,
"num_input_tokens_seen": 32505852160,
"step": 31000
},
{
"epoch": 0.6809473405299582,
"eval_loss": 2.5645763874053955,
"eval_runtime": 65.7725,
"eval_samples_per_second": 76.02,
"eval_steps_per_second": 19.005,
"num_input_tokens_seen": 32505852160,
"step": 31000
},
{
"epoch": 0.6820456426921033,
"grad_norm": 0.143716499209404,
"learning_rate": 0.001,
"loss": 2.6591,
"num_input_tokens_seen": 32558280960,
"step": 31050
},
{
"epoch": 0.6831439448542485,
"grad_norm": 0.16048283874988556,
"learning_rate": 0.001,
"loss": 2.659,
"num_input_tokens_seen": 32610709760,
"step": 31100
},
{
"epoch": 0.6842422470163936,
"grad_norm": 0.15203309059143066,
"learning_rate": 0.001,
"loss": 2.6703,
"num_input_tokens_seen": 32663138560,
"step": 31150
},
{
"epoch": 0.6853405491785386,
"grad_norm": 0.14977113902568817,
"learning_rate": 0.001,
"loss": 2.6657,
"num_input_tokens_seen": 32715567360,
"step": 31200
},
{
"epoch": 0.6864388513406837,
"grad_norm": 0.15292279422283173,
"learning_rate": 0.001,
"loss": 2.6629,
"num_input_tokens_seen": 32767996160,
"step": 31250
},
{
"epoch": 0.6875371535028288,
"grad_norm": 0.13721971213817596,
"learning_rate": 0.001,
"loss": 2.6641,
"num_input_tokens_seen": 32820424960,
"step": 31300
},
{
"epoch": 0.6886354556649739,
"grad_norm": 0.15564891695976257,
"learning_rate": 0.001,
"loss": 2.6673,
"num_input_tokens_seen": 32872853760,
"step": 31350
},
{
"epoch": 0.689733757827119,
"grad_norm": 0.15267717838287354,
"learning_rate": 0.001,
"loss": 2.6624,
"num_input_tokens_seen": 32925282560,
"step": 31400
},
{
"epoch": 0.6908320599892641,
"grad_norm": 0.15039384365081787,
"learning_rate": 0.001,
"loss": 2.6615,
"num_input_tokens_seen": 32977711360,
"step": 31450
},
{
"epoch": 0.6919303621514092,
"grad_norm": 0.14114901423454285,
"learning_rate": 0.001,
"loss": 2.6663,
"num_input_tokens_seen": 33030140160,
"step": 31500
},
{
"epoch": 0.6919303621514092,
"eval_loss": 2.5618767738342285,
"eval_runtime": 66.9611,
"eval_samples_per_second": 74.67,
"eval_steps_per_second": 18.668,
"num_input_tokens_seen": 33030140160,
"step": 31500
},
{
"epoch": 0.6930286643135543,
"grad_norm": 0.1415725201368332,
"learning_rate": 0.001,
"loss": 2.6606,
"num_input_tokens_seen": 33082568960,
"step": 31550
},
{
"epoch": 0.6941269664756994,
"grad_norm": 0.14324156939983368,
"learning_rate": 0.001,
"loss": 2.6616,
"num_input_tokens_seen": 33134997760,
"step": 31600
},
{
"epoch": 0.6952252686378445,
"grad_norm": 0.1544431746006012,
"learning_rate": 0.001,
"loss": 2.6567,
"num_input_tokens_seen": 33187426560,
"step": 31650
},
{
"epoch": 0.6963235707999895,
"grad_norm": 0.14641186594963074,
"learning_rate": 0.001,
"loss": 2.6605,
"num_input_tokens_seen": 33239855360,
"step": 31700
},
{
"epoch": 0.6974218729621346,
"grad_norm": 0.13757406175136566,
"learning_rate": 0.001,
"loss": 2.673,
"num_input_tokens_seen": 33292284160,
"step": 31750
},
{
"epoch": 0.6985201751242798,
"grad_norm": 0.14516425132751465,
"learning_rate": 0.001,
"loss": 2.6781,
"num_input_tokens_seen": 33344712960,
"step": 31800
},
{
"epoch": 0.6996184772864249,
"grad_norm": 0.15246887505054474,
"learning_rate": 0.001,
"loss": 2.6683,
"num_input_tokens_seen": 33397141760,
"step": 31850
},
{
"epoch": 0.7007167794485699,
"grad_norm": 0.1413787305355072,
"learning_rate": 0.001,
"loss": 2.6591,
"num_input_tokens_seen": 33449570560,
"step": 31900
},
{
"epoch": 0.701815081610715,
"grad_norm": 0.16077399253845215,
"learning_rate": 0.001,
"loss": 2.6628,
"num_input_tokens_seen": 33501999360,
"step": 31950
},
{
"epoch": 0.7029133837728602,
"grad_norm": 0.1555839478969574,
"learning_rate": 0.001,
"loss": 2.6631,
"num_input_tokens_seen": 33554428160,
"step": 32000
},
{
"epoch": 0.7029133837728602,
"eval_loss": 2.561042547225952,
"eval_runtime": 66.7879,
"eval_samples_per_second": 74.864,
"eval_steps_per_second": 18.716,
"num_input_tokens_seen": 33554428160,
"step": 32000
},
{
"epoch": 0.7040116859350052,
"grad_norm": 0.15333816409111023,
"learning_rate": 0.001,
"loss": 2.6605,
"num_input_tokens_seen": 33606856960,
"step": 32050
},
{
"epoch": 0.7051099880971503,
"grad_norm": 0.14965052902698517,
"learning_rate": 0.001,
"loss": 2.6551,
"num_input_tokens_seen": 33659285760,
"step": 32100
},
{
"epoch": 0.7062082902592954,
"grad_norm": 0.1994074285030365,
"learning_rate": 0.001,
"loss": 2.6652,
"num_input_tokens_seen": 33711714560,
"step": 32150
},
{
"epoch": 0.7073065924214406,
"grad_norm": 0.3089894652366638,
"learning_rate": 0.001,
"loss": 2.6814,
"num_input_tokens_seen": 33764143360,
"step": 32200
},
{
"epoch": 0.7084048945835856,
"grad_norm": 0.14903652667999268,
"learning_rate": 0.001,
"loss": 2.6834,
"num_input_tokens_seen": 33816572160,
"step": 32250
},
{
"epoch": 0.7095031967457307,
"grad_norm": 0.17594854533672333,
"learning_rate": 0.001,
"loss": 2.6618,
"num_input_tokens_seen": 33869000960,
"step": 32300
},
{
"epoch": 0.7106014989078758,
"grad_norm": 0.15634667873382568,
"learning_rate": 0.001,
"loss": 2.6663,
"num_input_tokens_seen": 33921429760,
"step": 32350
},
{
"epoch": 0.7116998010700208,
"grad_norm": 0.13893702626228333,
"learning_rate": 0.001,
"loss": 2.67,
"num_input_tokens_seen": 33973858560,
"step": 32400
},
{
"epoch": 0.712798103232166,
"grad_norm": 0.16974663734436035,
"learning_rate": 0.001,
"loss": 2.6686,
"num_input_tokens_seen": 34026287360,
"step": 32450
},
{
"epoch": 0.7138964053943111,
"grad_norm": 0.15336968004703522,
"learning_rate": 0.001,
"loss": 2.6703,
"num_input_tokens_seen": 34078716160,
"step": 32500
},
{
"epoch": 0.7138964053943111,
"eval_loss": 2.5648574829101562,
"eval_runtime": 66.0796,
"eval_samples_per_second": 75.666,
"eval_steps_per_second": 18.917,
"num_input_tokens_seen": 34078716160,
"step": 32500
},
{
"epoch": 0.7149947075564561,
"grad_norm": 1.428727626800537,
"learning_rate": 0.001,
"loss": 2.8433,
"num_input_tokens_seen": 34131144960,
"step": 32550
},
{
"epoch": 0.7160930097186012,
"grad_norm": 0.1666879504919052,
"learning_rate": 0.001,
"loss": 2.7236,
"num_input_tokens_seen": 34183573760,
"step": 32600
},
{
"epoch": 0.7171913118807464,
"grad_norm": 0.16038021445274353,
"learning_rate": 0.001,
"loss": 2.6876,
"num_input_tokens_seen": 34236002560,
"step": 32650
},
{
"epoch": 0.7182896140428915,
"grad_norm": 0.1514110267162323,
"learning_rate": 0.001,
"loss": 2.6717,
"num_input_tokens_seen": 34288431360,
"step": 32700
},
{
"epoch": 0.7193879162050365,
"grad_norm": 0.13304661214351654,
"learning_rate": 0.001,
"loss": 2.6664,
"num_input_tokens_seen": 34340860160,
"step": 32750
},
{
"epoch": 0.7204862183671816,
"grad_norm": 0.15957415103912354,
"learning_rate": 0.001,
"loss": 2.6683,
"num_input_tokens_seen": 34393288960,
"step": 32800
},
{
"epoch": 0.7215845205293268,
"grad_norm": 0.14532499015331268,
"learning_rate": 0.001,
"loss": 2.6632,
"num_input_tokens_seen": 34445717760,
"step": 32850
},
{
"epoch": 0.7226828226914718,
"grad_norm": 0.1402454972267151,
"learning_rate": 0.001,
"loss": 2.6631,
"num_input_tokens_seen": 34498146560,
"step": 32900
},
{
"epoch": 0.7237811248536169,
"grad_norm": 0.17248420417308807,
"learning_rate": 0.001,
"loss": 2.6743,
"num_input_tokens_seen": 34550575360,
"step": 32950
},
{
"epoch": 0.724879427015762,
"grad_norm": 0.1455400288105011,
"learning_rate": 0.001,
"loss": 2.6598,
"num_input_tokens_seen": 34603004160,
"step": 33000
},
{
"epoch": 0.724879427015762,
"eval_loss": 2.5639312267303467,
"eval_runtime": 66.9575,
"eval_samples_per_second": 74.674,
"eval_steps_per_second": 18.669,
"num_input_tokens_seen": 34603004160,
"step": 33000
},
{
"epoch": 0.7259777291779071,
"grad_norm": 0.14448963105678558,
"learning_rate": 0.001,
"loss": 2.6579,
"num_input_tokens_seen": 34655432960,
"step": 33050
},
{
"epoch": 0.7270760313400522,
"grad_norm": 0.15785731375217438,
"learning_rate": 0.001,
"loss": 2.6641,
"num_input_tokens_seen": 34707861760,
"step": 33100
},
{
"epoch": 0.7281743335021973,
"grad_norm": 0.14524365961551666,
"learning_rate": 0.001,
"loss": 2.6639,
"num_input_tokens_seen": 34760290560,
"step": 33150
},
{
"epoch": 0.7292726356643424,
"grad_norm": 0.17661139369010925,
"learning_rate": 0.001,
"loss": 2.666,
"num_input_tokens_seen": 34812719360,
"step": 33200
},
{
"epoch": 0.7303709378264874,
"grad_norm": 0.14052839577198029,
"learning_rate": 0.001,
"loss": 2.6638,
"num_input_tokens_seen": 34865148160,
"step": 33250
},
{
"epoch": 0.7314692399886326,
"grad_norm": 0.14182330667972565,
"learning_rate": 0.001,
"loss": 2.6618,
"num_input_tokens_seen": 34917576960,
"step": 33300
},
{
"epoch": 0.7325675421507777,
"grad_norm": 0.168069988489151,
"learning_rate": 0.001,
"loss": 2.6655,
"num_input_tokens_seen": 34970005760,
"step": 33350
},
{
"epoch": 0.7336658443129228,
"grad_norm": 0.1627034991979599,
"learning_rate": 0.001,
"loss": 2.6646,
"num_input_tokens_seen": 35022434560,
"step": 33400
},
{
"epoch": 0.7347641464750678,
"grad_norm": 0.1257403939962387,
"learning_rate": 0.001,
"loss": 2.6682,
"num_input_tokens_seen": 35074863360,
"step": 33450
},
{
"epoch": 0.735862448637213,
"grad_norm": 0.15367744863033295,
"learning_rate": 0.001,
"loss": 2.6693,
"num_input_tokens_seen": 35127292160,
"step": 33500
},
{
"epoch": 0.735862448637213,
"eval_loss": 2.5610554218292236,
"eval_runtime": 67.0185,
"eval_samples_per_second": 74.606,
"eval_steps_per_second": 18.652,
"num_input_tokens_seen": 35127292160,
"step": 33500
},
{
"epoch": 0.7369607507993581,
"grad_norm": 0.16001376509666443,
"learning_rate": 0.001,
"loss": 2.6594,
"num_input_tokens_seen": 35179720960,
"step": 33550
},
{
"epoch": 0.7380590529615031,
"grad_norm": 0.14694422483444214,
"learning_rate": 0.001,
"loss": 2.6635,
"num_input_tokens_seen": 35232149760,
"step": 33600
},
{
"epoch": 0.7391573551236482,
"grad_norm": 0.15586304664611816,
"learning_rate": 0.001,
"loss": 2.6565,
"num_input_tokens_seen": 35284578560,
"step": 33650
},
{
"epoch": 0.7402556572857933,
"grad_norm": 0.16455145180225372,
"learning_rate": 0.001,
"loss": 2.6621,
"num_input_tokens_seen": 35337007360,
"step": 33700
},
{
"epoch": 0.7413539594479385,
"grad_norm": 0.13630282878875732,
"learning_rate": 0.001,
"loss": 2.6658,
"num_input_tokens_seen": 35389436160,
"step": 33750
},
{
"epoch": 0.7424522616100835,
"grad_norm": 0.15180189907550812,
"learning_rate": 0.001,
"loss": 2.6593,
"num_input_tokens_seen": 35441864960,
"step": 33800
},
{
"epoch": 0.7435505637722286,
"grad_norm": 0.16608890891075134,
"learning_rate": 0.001,
"loss": 2.6777,
"num_input_tokens_seen": 35494293760,
"step": 33850
},
{
"epoch": 0.7446488659343737,
"grad_norm": 0.31720519065856934,
"learning_rate": 0.001,
"loss": 2.6685,
"num_input_tokens_seen": 35546722560,
"step": 33900
},
{
"epoch": 0.7457471680965188,
"grad_norm": 0.24131393432617188,
"learning_rate": 0.001,
"loss": 2.6682,
"num_input_tokens_seen": 35599151360,
"step": 33950
},
{
"epoch": 0.7468454702586639,
"grad_norm": 0.1594172567129135,
"learning_rate": 0.001,
"loss": 2.6575,
"num_input_tokens_seen": 35651580160,
"step": 34000
},
{
"epoch": 0.7468454702586639,
"eval_loss": 2.5587804317474365,
"eval_runtime": 66.6197,
"eval_samples_per_second": 75.053,
"eval_steps_per_second": 18.763,
"num_input_tokens_seen": 35651580160,
"step": 34000
},
{
"epoch": 0.747943772420809,
"grad_norm": 0.1586858183145523,
"learning_rate": 0.001,
"loss": 2.6654,
"num_input_tokens_seen": 35704008960,
"step": 34050
},
{
"epoch": 0.749042074582954,
"grad_norm": 0.1376073956489563,
"learning_rate": 0.001,
"loss": 2.6627,
"num_input_tokens_seen": 35756437760,
"step": 34100
},
{
"epoch": 0.7501403767450991,
"grad_norm": 0.13904818892478943,
"learning_rate": 0.001,
"loss": 2.6605,
"num_input_tokens_seen": 35808866560,
"step": 34150
},
{
"epoch": 0.7512386789072443,
"grad_norm": 0.14543947577476501,
"learning_rate": 0.001,
"loss": 2.6589,
"num_input_tokens_seen": 35861295360,
"step": 34200
},
{
"epoch": 0.7523369810693894,
"grad_norm": 0.14855198562145233,
"learning_rate": 0.001,
"loss": 2.6612,
"num_input_tokens_seen": 35913724160,
"step": 34250
},
{
"epoch": 0.7534352832315344,
"grad_norm": 0.14492908120155334,
"learning_rate": 0.001,
"loss": 2.6561,
"num_input_tokens_seen": 35966152960,
"step": 34300
},
{
"epoch": 0.7545335853936795,
"grad_norm": 0.1388978660106659,
"learning_rate": 0.001,
"loss": 2.6551,
"num_input_tokens_seen": 36018581760,
"step": 34350
},
{
"epoch": 0.7556318875558247,
"grad_norm": 0.14582422375679016,
"learning_rate": 0.001,
"loss": 2.6521,
"num_input_tokens_seen": 36071010560,
"step": 34400
},
{
"epoch": 0.7567301897179697,
"grad_norm": 0.17488695681095123,
"learning_rate": 0.001,
"loss": 2.6516,
"num_input_tokens_seen": 36123439360,
"step": 34450
},
{
"epoch": 0.7578284918801148,
"grad_norm": 0.12302416563034058,
"learning_rate": 0.001,
"loss": 2.6617,
"num_input_tokens_seen": 36175868160,
"step": 34500
},
{
"epoch": 0.7578284918801148,
"eval_loss": 2.5549991130828857,
"eval_runtime": 67.5095,
"eval_samples_per_second": 74.064,
"eval_steps_per_second": 18.516,
"num_input_tokens_seen": 36175868160,
"step": 34500
},
{
"epoch": 0.7589267940422599,
"grad_norm": 0.14238396286964417,
"learning_rate": 0.001,
"loss": 2.6609,
"num_input_tokens_seen": 36228296960,
"step": 34550
},
{
"epoch": 0.7600250962044051,
"grad_norm": 0.17919403314590454,
"learning_rate": 0.001,
"loss": 2.6621,
"num_input_tokens_seen": 36280725760,
"step": 34600
},
{
"epoch": 0.7611233983665501,
"grad_norm": 0.13188666105270386,
"learning_rate": 0.001,
"loss": 2.6529,
"num_input_tokens_seen": 36333154560,
"step": 34650
},
{
"epoch": 0.7622217005286952,
"grad_norm": 0.16191646456718445,
"learning_rate": 0.001,
"loss": 2.6584,
"num_input_tokens_seen": 36385583360,
"step": 34700
},
{
"epoch": 0.7633200026908403,
"grad_norm": 0.14606165885925293,
"learning_rate": 0.001,
"loss": 2.6567,
"num_input_tokens_seen": 36438012160,
"step": 34750
},
{
"epoch": 0.7644183048529853,
"grad_norm": 0.1648443192243576,
"learning_rate": 0.001,
"loss": 2.6587,
"num_input_tokens_seen": 36490440960,
"step": 34800
},
{
"epoch": 0.7655166070151305,
"grad_norm": 0.19523674249649048,
"learning_rate": 0.001,
"loss": 2.6662,
"num_input_tokens_seen": 36542869760,
"step": 34850
},
{
"epoch": 0.7666149091772756,
"grad_norm": 0.1713179498910904,
"learning_rate": 0.001,
"loss": 2.6683,
"num_input_tokens_seen": 36595298560,
"step": 34900
},
{
"epoch": 0.7677132113394207,
"grad_norm": 0.14923711121082306,
"learning_rate": 0.001,
"loss": 2.6629,
"num_input_tokens_seen": 36647727360,
"step": 34950
},
{
"epoch": 0.7688115135015657,
"grad_norm": 0.13948023319244385,
"learning_rate": 0.001,
"loss": 2.6619,
"num_input_tokens_seen": 36700156160,
"step": 35000
},
{
"epoch": 0.7688115135015657,
"eval_loss": 2.5569379329681396,
"eval_runtime": 67.9393,
"eval_samples_per_second": 73.595,
"eval_steps_per_second": 18.399,
"num_input_tokens_seen": 36700156160,
"step": 35000
},
{
"epoch": 0.7699098156637109,
"grad_norm": 0.14624406397342682,
"learning_rate": 0.001,
"loss": 2.657,
"num_input_tokens_seen": 36752584960,
"step": 35050
},
{
"epoch": 0.771008117825856,
"grad_norm": 0.16855786740779877,
"learning_rate": 0.001,
"loss": 2.6585,
"num_input_tokens_seen": 36805013760,
"step": 35100
},
{
"epoch": 0.772106419988001,
"grad_norm": 0.1439932882785797,
"learning_rate": 0.001,
"loss": 2.6653,
"num_input_tokens_seen": 36857442560,
"step": 35150
},
{
"epoch": 0.7732047221501461,
"grad_norm": 0.16299331188201904,
"learning_rate": 0.001,
"loss": 2.6621,
"num_input_tokens_seen": 36909871360,
"step": 35200
},
{
"epoch": 0.7743030243122913,
"grad_norm": 0.16961826384067535,
"learning_rate": 0.001,
"loss": 2.6545,
"num_input_tokens_seen": 36962300160,
"step": 35250
},
{
"epoch": 0.7754013264744364,
"grad_norm": 0.13337954878807068,
"learning_rate": 0.001,
"loss": 2.652,
"num_input_tokens_seen": 37014728960,
"step": 35300
},
{
"epoch": 0.7764996286365814,
"grad_norm": 0.1728074699640274,
"learning_rate": 0.001,
"loss": 2.6631,
"num_input_tokens_seen": 37067157760,
"step": 35350
},
{
"epoch": 0.7775979307987265,
"grad_norm": 0.16615192592144012,
"learning_rate": 0.001,
"loss": 2.6551,
"num_input_tokens_seen": 37119586560,
"step": 35400
},
{
"epoch": 0.7786962329608716,
"grad_norm": 0.1515650749206543,
"learning_rate": 0.001,
"loss": 2.6529,
"num_input_tokens_seen": 37172015360,
"step": 35450
},
{
"epoch": 0.7797945351230167,
"grad_norm": 0.1534053236246109,
"learning_rate": 0.001,
"loss": 2.6567,
"num_input_tokens_seen": 37224444160,
"step": 35500
},
{
"epoch": 0.7797945351230167,
"eval_loss": 2.55454683303833,
"eval_runtime": 67.0727,
"eval_samples_per_second": 74.546,
"eval_steps_per_second": 18.637,
"num_input_tokens_seen": 37224444160,
"step": 35500
},
{
"epoch": 0.7808928372851618,
"grad_norm": 0.16377541422843933,
"learning_rate": 0.001,
"loss": 2.6552,
"num_input_tokens_seen": 37276872960,
"step": 35550
},
{
"epoch": 0.7819911394473069,
"grad_norm": 0.14807477593421936,
"learning_rate": 0.001,
"loss": 2.6563,
"num_input_tokens_seen": 37329301760,
"step": 35600
},
{
"epoch": 0.783089441609452,
"grad_norm": 0.13599660992622375,
"learning_rate": 0.001,
"loss": 2.6575,
"num_input_tokens_seen": 37381730560,
"step": 35650
},
{
"epoch": 0.7841877437715971,
"grad_norm": 0.16653482615947723,
"learning_rate": 0.001,
"loss": 2.6515,
"num_input_tokens_seen": 37434159360,
"step": 35700
},
{
"epoch": 0.7852860459337422,
"grad_norm": 0.15467293560504913,
"learning_rate": 0.001,
"loss": 2.6548,
"num_input_tokens_seen": 37486588160,
"step": 35750
},
{
"epoch": 0.7863843480958873,
"grad_norm": 0.4751467704772949,
"learning_rate": 0.001,
"loss": 2.6592,
"num_input_tokens_seen": 37539016960,
"step": 35800
},
{
"epoch": 0.7874826502580323,
"grad_norm": 0.15940867364406586,
"learning_rate": 0.001,
"loss": 2.6624,
"num_input_tokens_seen": 37591445760,
"step": 35850
},
{
"epoch": 0.7885809524201775,
"grad_norm": 0.137634739279747,
"learning_rate": 0.001,
"loss": 2.6559,
"num_input_tokens_seen": 37643874560,
"step": 35900
},
{
"epoch": 0.7896792545823226,
"grad_norm": 0.16022460162639618,
"learning_rate": 0.001,
"loss": 2.6555,
"num_input_tokens_seen": 37696303360,
"step": 35950
},
{
"epoch": 0.7907775567444676,
"grad_norm": 0.147109717130661,
"learning_rate": 0.001,
"loss": 2.663,
"num_input_tokens_seen": 37748732160,
"step": 36000
},
{
"epoch": 0.7907775567444676,
"eval_loss": 2.556107521057129,
"eval_runtime": 67.1814,
"eval_samples_per_second": 74.425,
"eval_steps_per_second": 18.606,
"num_input_tokens_seen": 37748732160,
"step": 36000
},
{
"epoch": 0.7918758589066127,
"grad_norm": 0.16054154932498932,
"learning_rate": 0.001,
"loss": 2.6516,
"num_input_tokens_seen": 37801160960,
"step": 36050
},
{
"epoch": 0.7929741610687578,
"grad_norm": 0.15180550515651703,
"learning_rate": 0.001,
"loss": 2.6508,
"num_input_tokens_seen": 37853589760,
"step": 36100
},
{
"epoch": 0.794072463230903,
"grad_norm": 0.19564937055110931,
"learning_rate": 0.001,
"loss": 2.6532,
"num_input_tokens_seen": 37906018560,
"step": 36150
},
{
"epoch": 0.795170765393048,
"grad_norm": 0.15047501027584076,
"learning_rate": 0.001,
"loss": 2.6567,
"num_input_tokens_seen": 37958447360,
"step": 36200
},
{
"epoch": 0.7962690675551931,
"grad_norm": 0.1420314759016037,
"learning_rate": 0.001,
"loss": 2.6511,
"num_input_tokens_seen": 38010876160,
"step": 36250
},
{
"epoch": 0.7973673697173382,
"grad_norm": 0.14328153431415558,
"learning_rate": 0.001,
"loss": 2.6601,
"num_input_tokens_seen": 38063304960,
"step": 36300
},
{
"epoch": 0.7984656718794833,
"grad_norm": 0.15527622401714325,
"learning_rate": 0.001,
"loss": 2.6598,
"num_input_tokens_seen": 38115733760,
"step": 36350
},
{
"epoch": 0.7995639740416284,
"grad_norm": 0.15956974029541016,
"learning_rate": 0.001,
"loss": 2.6522,
"num_input_tokens_seen": 38168162560,
"step": 36400
},
{
"epoch": 0.8006622762037735,
"grad_norm": 0.15193034708499908,
"learning_rate": 0.001,
"loss": 2.6561,
"num_input_tokens_seen": 38220591360,
"step": 36450
},
{
"epoch": 0.8017605783659186,
"grad_norm": 0.1692439615726471,
"learning_rate": 0.001,
"loss": 2.653,
"num_input_tokens_seen": 38273020160,
"step": 36500
},
{
"epoch": 0.8017605783659186,
"eval_loss": 2.553743362426758,
"eval_runtime": 66.3488,
"eval_samples_per_second": 75.359,
"eval_steps_per_second": 18.84,
"num_input_tokens_seen": 38273020160,
"step": 36500
},
{
"epoch": 0.8028588805280636,
"grad_norm": 0.473707377910614,
"learning_rate": 0.001,
"loss": 2.6604,
"num_input_tokens_seen": 38325448960,
"step": 36550
},
{
"epoch": 0.8039571826902088,
"grad_norm": 0.16226574778556824,
"learning_rate": 0.001,
"loss": 2.6615,
"num_input_tokens_seen": 38377877760,
"step": 36600
},
{
"epoch": 0.8050554848523539,
"grad_norm": 0.17274035513401031,
"learning_rate": 0.001,
"loss": 2.6616,
"num_input_tokens_seen": 38430306560,
"step": 36650
},
{
"epoch": 0.8061537870144989,
"grad_norm": 0.14171990752220154,
"learning_rate": 0.001,
"loss": 2.6628,
"num_input_tokens_seen": 38482735360,
"step": 36700
},
{
"epoch": 0.807252089176644,
"grad_norm": 0.3828020989894867,
"learning_rate": 0.001,
"loss": 2.6717,
"num_input_tokens_seen": 38535164160,
"step": 36750
},
{
"epoch": 0.8083503913387892,
"grad_norm": 0.20836575329303741,
"learning_rate": 0.001,
"loss": 2.685,
"num_input_tokens_seen": 38587592960,
"step": 36800
},
{
"epoch": 0.8094486935009343,
"grad_norm": 0.14613227546215057,
"learning_rate": 0.001,
"loss": 2.6687,
"num_input_tokens_seen": 38640021760,
"step": 36850
},
{
"epoch": 0.8105469956630793,
"grad_norm": 0.16505028307437897,
"learning_rate": 0.001,
"loss": 2.6654,
"num_input_tokens_seen": 38692450560,
"step": 36900
},
{
"epoch": 0.8116452978252244,
"grad_norm": 0.15305323898792267,
"learning_rate": 0.001,
"loss": 2.6612,
"num_input_tokens_seen": 38744879360,
"step": 36950
},
{
"epoch": 0.8127435999873696,
"grad_norm": 0.2416296899318695,
"learning_rate": 0.001,
"loss": 2.6614,
"num_input_tokens_seen": 38797308160,
"step": 37000
},
{
"epoch": 0.8127435999873696,
"eval_loss": 2.5642571449279785,
"eval_runtime": 66.5631,
"eval_samples_per_second": 75.117,
"eval_steps_per_second": 18.779,
"num_input_tokens_seen": 38797308160,
"step": 37000
},
{
"epoch": 0.8138419021495146,
"grad_norm": 0.1504666954278946,
"learning_rate": 0.001,
"loss": 2.6625,
"num_input_tokens_seen": 38849736960,
"step": 37050
},
{
"epoch": 0.8149402043116597,
"grad_norm": 0.15831789374351501,
"learning_rate": 0.001,
"loss": 2.6566,
"num_input_tokens_seen": 38902165760,
"step": 37100
},
{
"epoch": 0.8160385064738048,
"grad_norm": 0.1391575187444687,
"learning_rate": 0.001,
"loss": 2.6609,
"num_input_tokens_seen": 38954594560,
"step": 37150
},
{
"epoch": 0.81713680863595,
"grad_norm": 0.22168035805225372,
"learning_rate": 0.001,
"loss": 2.6768,
"num_input_tokens_seen": 39007023360,
"step": 37200
},
{
"epoch": 0.818235110798095,
"grad_norm": 0.1874976009130478,
"learning_rate": 0.001,
"loss": 2.679,
"num_input_tokens_seen": 39059452160,
"step": 37250
},
{
"epoch": 0.8193334129602401,
"grad_norm": 0.1796240657567978,
"learning_rate": 0.001,
"loss": 2.6644,
"num_input_tokens_seen": 39111880960,
"step": 37300
},
{
"epoch": 0.8204317151223852,
"grad_norm": 0.3271934986114502,
"learning_rate": 0.001,
"loss": 2.6695,
"num_input_tokens_seen": 39164309760,
"step": 37350
},
{
"epoch": 0.8215300172845302,
"grad_norm": 0.13447704911231995,
"learning_rate": 0.001,
"loss": 2.6656,
"num_input_tokens_seen": 39216738560,
"step": 37400
},
{
"epoch": 0.8226283194466754,
"grad_norm": 0.1367628127336502,
"learning_rate": 0.001,
"loss": 2.6505,
"num_input_tokens_seen": 39269167360,
"step": 37450
},
{
"epoch": 0.8237266216088205,
"grad_norm": 0.1498686671257019,
"learning_rate": 0.001,
"loss": 2.6594,
"num_input_tokens_seen": 39321596160,
"step": 37500
},
{
"epoch": 0.8237266216088205,
"eval_loss": 2.5516529083251953,
"eval_runtime": 66.8213,
"eval_samples_per_second": 74.826,
"eval_steps_per_second": 18.707,
"num_input_tokens_seen": 39321596160,
"step": 37500
},
{
"epoch": 0.8248249237709656,
"grad_norm": 0.14790424704551697,
"learning_rate": 0.001,
"loss": 2.6519,
"num_input_tokens_seen": 39374024960,
"step": 37550
},
{
"epoch": 0.8259232259331106,
"grad_norm": 0.15297918021678925,
"learning_rate": 0.001,
"loss": 2.6533,
"num_input_tokens_seen": 39426453760,
"step": 37600
},
{
"epoch": 0.8270215280952558,
"grad_norm": 0.15760953724384308,
"learning_rate": 0.001,
"loss": 2.6584,
"num_input_tokens_seen": 39478882560,
"step": 37650
},
{
"epoch": 0.8281198302574009,
"grad_norm": 0.1545770913362503,
"learning_rate": 0.001,
"loss": 2.6453,
"num_input_tokens_seen": 39531311360,
"step": 37700
},
{
"epoch": 0.8292181324195459,
"grad_norm": 0.17809870839118958,
"learning_rate": 0.001,
"loss": 2.6547,
"num_input_tokens_seen": 39583740160,
"step": 37750
},
{
"epoch": 0.830316434581691,
"grad_norm": 0.2712576687335968,
"learning_rate": 0.001,
"loss": 2.6489,
"num_input_tokens_seen": 39636168960,
"step": 37800
},
{
"epoch": 0.8314147367438361,
"grad_norm": 0.1525331437587738,
"learning_rate": 0.001,
"loss": 2.6558,
"num_input_tokens_seen": 39688597760,
"step": 37850
},
{
"epoch": 0.8325130389059812,
"grad_norm": 0.1624525785446167,
"learning_rate": 0.001,
"loss": 2.6465,
"num_input_tokens_seen": 39741026560,
"step": 37900
},
{
"epoch": 0.8336113410681263,
"grad_norm": 0.14974552392959595,
"learning_rate": 0.001,
"loss": 2.6595,
"num_input_tokens_seen": 39793455360,
"step": 37950
},
{
"epoch": 0.8347096432302714,
"grad_norm": 0.15206202864646912,
"learning_rate": 0.001,
"loss": 2.6525,
"num_input_tokens_seen": 39845884160,
"step": 38000
},
{
"epoch": 0.8347096432302714,
"eval_loss": 2.549203395843506,
"eval_runtime": 66.3732,
"eval_samples_per_second": 75.332,
"eval_steps_per_second": 18.833,
"num_input_tokens_seen": 39845884160,
"step": 38000
},
{
"epoch": 0.8358079453924165,
"grad_norm": 0.15346269309520721,
"learning_rate": 0.001,
"loss": 2.645,
"num_input_tokens_seen": 39898312960,
"step": 38050
},
{
"epoch": 0.8369062475545616,
"grad_norm": 0.1504630148410797,
"learning_rate": 0.001,
"loss": 2.666,
"num_input_tokens_seen": 39950741760,
"step": 38100
},
{
"epoch": 0.8380045497167067,
"grad_norm": 0.19098903238773346,
"learning_rate": 0.001,
"loss": 2.6649,
"num_input_tokens_seen": 40003170560,
"step": 38150
},
{
"epoch": 0.8391028518788518,
"grad_norm": 0.15553973615169525,
"learning_rate": 0.001,
"loss": 2.6565,
"num_input_tokens_seen": 40055599360,
"step": 38200
},
{
"epoch": 0.8402011540409968,
"grad_norm": 0.15650159120559692,
"learning_rate": 0.001,
"loss": 2.6568,
"num_input_tokens_seen": 40108028160,
"step": 38250
},
{
"epoch": 0.841299456203142,
"grad_norm": 0.17787836492061615,
"learning_rate": 0.001,
"loss": 2.6497,
"num_input_tokens_seen": 40160456960,
"step": 38300
},
{
"epoch": 0.8423977583652871,
"grad_norm": 0.1535162478685379,
"learning_rate": 0.001,
"loss": 2.6492,
"num_input_tokens_seen": 40212885760,
"step": 38350
},
{
"epoch": 0.8434960605274322,
"grad_norm": 0.16713359951972961,
"learning_rate": 0.001,
"loss": 2.6534,
"num_input_tokens_seen": 40265314560,
"step": 38400
},
{
"epoch": 0.8445943626895772,
"grad_norm": 0.17087998986244202,
"learning_rate": 0.001,
"loss": 2.6602,
"num_input_tokens_seen": 40317743360,
"step": 38450
},
{
"epoch": 0.8456926648517223,
"grad_norm": 0.15651412308216095,
"learning_rate": 0.001,
"loss": 2.6547,
"num_input_tokens_seen": 40370172160,
"step": 38500
},
{
"epoch": 0.8456926648517223,
"eval_loss": 2.5524706840515137,
"eval_runtime": 66.5023,
"eval_samples_per_second": 75.185,
"eval_steps_per_second": 18.796,
"num_input_tokens_seen": 40370172160,
"step": 38500
},
{
"epoch": 0.8467909670138675,
"grad_norm": 0.15205898880958557,
"learning_rate": 0.001,
"loss": 2.6541,
"num_input_tokens_seen": 40422600960,
"step": 38550
},
{
"epoch": 0.8478892691760125,
"grad_norm": 0.15865832567214966,
"learning_rate": 0.001,
"loss": 2.6536,
"num_input_tokens_seen": 40475029760,
"step": 38600
},
{
"epoch": 0.8489875713381576,
"grad_norm": 0.133284330368042,
"learning_rate": 0.001,
"loss": 2.6531,
"num_input_tokens_seen": 40527458560,
"step": 38650
},
{
"epoch": 0.8500858735003027,
"grad_norm": 0.1421806663274765,
"learning_rate": 0.001,
"loss": 2.6558,
"num_input_tokens_seen": 40579887360,
"step": 38700
},
{
"epoch": 0.8511841756624479,
"grad_norm": 0.19429996609687805,
"learning_rate": 0.001,
"loss": 2.6628,
"num_input_tokens_seen": 40632316160,
"step": 38750
},
{
"epoch": 0.8522824778245929,
"grad_norm": 0.14661937952041626,
"learning_rate": 0.001,
"loss": 2.6594,
"num_input_tokens_seen": 40684744960,
"step": 38800
},
{
"epoch": 0.853380779986738,
"grad_norm": 0.1694687008857727,
"learning_rate": 0.001,
"loss": 2.6571,
"num_input_tokens_seen": 40737173760,
"step": 38850
},
{
"epoch": 0.8544790821488831,
"grad_norm": 0.152188241481781,
"learning_rate": 0.001,
"loss": 2.6534,
"num_input_tokens_seen": 40789602560,
"step": 38900
},
{
"epoch": 0.8555773843110281,
"grad_norm": 0.1554640680551529,
"learning_rate": 0.001,
"loss": 2.649,
"num_input_tokens_seen": 40842031360,
"step": 38950
},
{
"epoch": 0.8566756864731733,
"grad_norm": 0.1481955647468567,
"learning_rate": 0.001,
"loss": 2.6527,
"num_input_tokens_seen": 40894460160,
"step": 39000
},
{
"epoch": 0.8566756864731733,
"eval_loss": 2.547664165496826,
"eval_runtime": 66.2874,
"eval_samples_per_second": 75.429,
"eval_steps_per_second": 18.857,
"num_input_tokens_seen": 40894460160,
"step": 39000
}
],
"logging_steps": 50,
"max_steps": 200000,
"num_input_tokens_seen": 40894460160,
"num_train_epochs": 5,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.3289694735724052e+19,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}