{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.1295465243998117,
"eval_steps": 500,
"global_step": 4500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002510591558135886,
"grad_norm": 2.53125,
"learning_rate": 1.9964e-05,
"loss": 1.2975,
"step": 10
},
{
"epoch": 0.005021183116271772,
"grad_norm": 1.859375,
"learning_rate": 1.9924e-05,
"loss": 0.6792,
"step": 20
},
{
"epoch": 0.007531774674407657,
"grad_norm": 1.921875,
"learning_rate": 1.9884e-05,
"loss": 0.6571,
"step": 30
},
{
"epoch": 0.010042366232543544,
"grad_norm": 1.8046875,
"learning_rate": 1.9844000000000002e-05,
"loss": 0.6431,
"step": 40
},
{
"epoch": 0.012552957790679428,
"grad_norm": 1.8984375,
"learning_rate": 1.9804000000000002e-05,
"loss": 0.6739,
"step": 50
},
{
"epoch": 0.015063549348815314,
"grad_norm": 1.6953125,
"learning_rate": 1.9764000000000003e-05,
"loss": 0.6491,
"step": 60
},
{
"epoch": 0.0175741409069512,
"grad_norm": 1.6796875,
"learning_rate": 1.9724e-05,
"loss": 0.5911,
"step": 70
},
{
"epoch": 0.020084732465087088,
"grad_norm": 2.0,
"learning_rate": 1.9684e-05,
"loss": 0.6117,
"step": 80
},
{
"epoch": 0.022595324023222972,
"grad_norm": 1.734375,
"learning_rate": 1.9644e-05,
"loss": 0.5892,
"step": 90
},
{
"epoch": 0.025105915581358856,
"grad_norm": 2.125,
"learning_rate": 1.9604e-05,
"loss": 0.5893,
"step": 100
},
{
"epoch": 0.027616507139494744,
"grad_norm": 1.953125,
"learning_rate": 1.9564e-05,
"loss": 0.6412,
"step": 110
},
{
"epoch": 0.03012709869763063,
"grad_norm": 1.796875,
"learning_rate": 1.9524e-05,
"loss": 0.5514,
"step": 120
},
{
"epoch": 0.032637690255766516,
"grad_norm": 1.7421875,
"learning_rate": 1.9484000000000002e-05,
"loss": 0.6029,
"step": 130
},
{
"epoch": 0.0351482818139024,
"grad_norm": 1.78125,
"learning_rate": 1.9444000000000002e-05,
"loss": 0.5723,
"step": 140
},
{
"epoch": 0.037658873372038285,
"grad_norm": 1.9453125,
"learning_rate": 1.9404e-05,
"loss": 0.5721,
"step": 150
},
{
"epoch": 0.040169464930174176,
"grad_norm": 2.0625,
"learning_rate": 1.9364e-05,
"loss": 0.5911,
"step": 160
},
{
"epoch": 0.04268005648831006,
"grad_norm": 1.7265625,
"learning_rate": 1.9324e-05,
"loss": 0.5532,
"step": 170
},
{
"epoch": 0.045190648046445944,
"grad_norm": 1.796875,
"learning_rate": 1.9284e-05,
"loss": 0.5836,
"step": 180
},
{
"epoch": 0.04770123960458183,
"grad_norm": 1.890625,
"learning_rate": 1.9244000000000004e-05,
"loss": 0.5294,
"step": 190
},
{
"epoch": 0.05021183116271771,
"grad_norm": 1.8125,
"learning_rate": 1.9204e-05,
"loss": 0.5675,
"step": 200
},
{
"epoch": 0.052722422720853604,
"grad_norm": 1.421875,
"learning_rate": 1.9164e-05,
"loss": 0.4689,
"step": 210
},
{
"epoch": 0.05523301427898949,
"grad_norm": 1.6484375,
"learning_rate": 1.9124000000000002e-05,
"loss": 0.5765,
"step": 220
},
{
"epoch": 0.05774360583712537,
"grad_norm": 2.078125,
"learning_rate": 1.9084000000000002e-05,
"loss": 0.5369,
"step": 230
},
{
"epoch": 0.06025419739526126,
"grad_norm": 1.65625,
"learning_rate": 1.9044000000000003e-05,
"loss": 0.4895,
"step": 240
},
{
"epoch": 0.06276478895339714,
"grad_norm": 2.234375,
"learning_rate": 1.9004000000000003e-05,
"loss": 0.5191,
"step": 250
},
{
"epoch": 0.06527538051153303,
"grad_norm": 1.90625,
"learning_rate": 1.8964000000000003e-05,
"loss": 0.5655,
"step": 260
},
{
"epoch": 0.06778597206966891,
"grad_norm": 1.9921875,
"learning_rate": 1.8924000000000004e-05,
"loss": 0.5454,
"step": 270
},
{
"epoch": 0.0702965636278048,
"grad_norm": 1.9765625,
"learning_rate": 1.8884e-05,
"loss": 0.5656,
"step": 280
},
{
"epoch": 0.07280715518594069,
"grad_norm": 1.7890625,
"learning_rate": 1.8844e-05,
"loss": 0.481,
"step": 290
},
{
"epoch": 0.07531774674407657,
"grad_norm": 1.6328125,
"learning_rate": 1.8804e-05,
"loss": 0.5365,
"step": 300
},
{
"epoch": 0.07782833830221246,
"grad_norm": 1.7578125,
"learning_rate": 1.8764000000000002e-05,
"loss": 0.5271,
"step": 310
},
{
"epoch": 0.08033892986034835,
"grad_norm": 1.65625,
"learning_rate": 1.8724000000000002e-05,
"loss": 0.5145,
"step": 320
},
{
"epoch": 0.08284952141848423,
"grad_norm": 1.90625,
"learning_rate": 1.8684000000000003e-05,
"loss": 0.5129,
"step": 330
},
{
"epoch": 0.08536011297662012,
"grad_norm": 1.6484375,
"learning_rate": 1.8644000000000003e-05,
"loss": 0.5539,
"step": 340
},
{
"epoch": 0.087870704534756,
"grad_norm": 1.9453125,
"learning_rate": 1.8604000000000003e-05,
"loss": 0.5051,
"step": 350
},
{
"epoch": 0.09038129609289189,
"grad_norm": 2.09375,
"learning_rate": 1.8564e-05,
"loss": 0.5034,
"step": 360
},
{
"epoch": 0.09289188765102778,
"grad_norm": 1.625,
"learning_rate": 1.8524e-05,
"loss": 0.4867,
"step": 370
},
{
"epoch": 0.09540247920916366,
"grad_norm": 1.71875,
"learning_rate": 1.8484e-05,
"loss": 0.5237,
"step": 380
},
{
"epoch": 0.09791307076729955,
"grad_norm": 2.109375,
"learning_rate": 1.8444e-05,
"loss": 0.5305,
"step": 390
},
{
"epoch": 0.10042366232543543,
"grad_norm": 1.859375,
"learning_rate": 1.8404000000000002e-05,
"loss": 0.498,
"step": 400
},
{
"epoch": 0.10293425388357132,
"grad_norm": 1.921875,
"learning_rate": 1.8364000000000002e-05,
"loss": 0.512,
"step": 410
},
{
"epoch": 0.10544484544170721,
"grad_norm": 2.046875,
"learning_rate": 1.8324000000000003e-05,
"loss": 0.4874,
"step": 420
},
{
"epoch": 0.10795543699984309,
"grad_norm": 2.03125,
"learning_rate": 1.8284000000000003e-05,
"loss": 0.5123,
"step": 430
},
{
"epoch": 0.11046602855797898,
"grad_norm": 1.78125,
"learning_rate": 1.8244e-05,
"loss": 0.4691,
"step": 440
},
{
"epoch": 0.11297662011611485,
"grad_norm": 1.640625,
"learning_rate": 1.8204e-05,
"loss": 0.4485,
"step": 450
},
{
"epoch": 0.11548721167425074,
"grad_norm": 1.7890625,
"learning_rate": 1.8164e-05,
"loss": 0.4647,
"step": 460
},
{
"epoch": 0.11799780323238664,
"grad_norm": 1.7265625,
"learning_rate": 1.8124e-05,
"loss": 0.4518,
"step": 470
},
{
"epoch": 0.12050839479052251,
"grad_norm": 2.34375,
"learning_rate": 1.8084e-05,
"loss": 0.4848,
"step": 480
},
{
"epoch": 0.1230189863486584,
"grad_norm": 1.828125,
"learning_rate": 1.8044000000000002e-05,
"loss": 0.4624,
"step": 490
},
{
"epoch": 0.12552957790679428,
"grad_norm": 2.3125,
"learning_rate": 1.8004000000000002e-05,
"loss": 0.489,
"step": 500
},
{
"epoch": 0.1280401694649302,
"grad_norm": 1.828125,
"learning_rate": 1.7964000000000003e-05,
"loss": 0.4556,
"step": 510
},
{
"epoch": 0.13055076102306606,
"grad_norm": 2.15625,
"learning_rate": 1.7924e-05,
"loss": 0.4726,
"step": 520
},
{
"epoch": 0.13306135258120194,
"grad_norm": 2.03125,
"learning_rate": 1.7884e-05,
"loss": 0.4686,
"step": 530
},
{
"epoch": 0.13557194413933782,
"grad_norm": 1.96875,
"learning_rate": 1.7844e-05,
"loss": 0.4228,
"step": 540
},
{
"epoch": 0.13808253569747372,
"grad_norm": 2.125,
"learning_rate": 1.7804e-05,
"loss": 0.4523,
"step": 550
},
{
"epoch": 0.1405931272556096,
"grad_norm": 1.9921875,
"learning_rate": 1.7764e-05,
"loss": 0.4235,
"step": 560
},
{
"epoch": 0.14310371881374548,
"grad_norm": 1.875,
"learning_rate": 1.7724000000000002e-05,
"loss": 0.4474,
"step": 570
},
{
"epoch": 0.14561431037188138,
"grad_norm": 2.0625,
"learning_rate": 1.7684000000000002e-05,
"loss": 0.4576,
"step": 580
},
{
"epoch": 0.14812490193001726,
"grad_norm": 2.296875,
"learning_rate": 1.7644000000000003e-05,
"loss": 0.4345,
"step": 590
},
{
"epoch": 0.15063549348815314,
"grad_norm": 2.0,
"learning_rate": 1.7604e-05,
"loss": 0.4124,
"step": 600
},
{
"epoch": 0.15314608504628904,
"grad_norm": 1.9453125,
"learning_rate": 1.7564e-05,
"loss": 0.4457,
"step": 610
},
{
"epoch": 0.15565667660442492,
"grad_norm": 2.125,
"learning_rate": 1.7524e-05,
"loss": 0.4569,
"step": 620
},
{
"epoch": 0.1581672681625608,
"grad_norm": 1.8203125,
"learning_rate": 1.7484e-05,
"loss": 0.4065,
"step": 630
},
{
"epoch": 0.1606778597206967,
"grad_norm": 1.9921875,
"learning_rate": 1.7444e-05,
"loss": 0.4231,
"step": 640
},
{
"epoch": 0.16318845127883258,
"grad_norm": 2.328125,
"learning_rate": 1.7404e-05,
"loss": 0.4306,
"step": 650
},
{
"epoch": 0.16569904283696846,
"grad_norm": 1.8359375,
"learning_rate": 1.7364000000000002e-05,
"loss": 0.4261,
"step": 660
},
{
"epoch": 0.16820963439510433,
"grad_norm": 1.9140625,
"learning_rate": 1.7324000000000002e-05,
"loss": 0.3939,
"step": 670
},
{
"epoch": 0.17072022595324024,
"grad_norm": 2.34375,
"learning_rate": 1.7284e-05,
"loss": 0.3979,
"step": 680
},
{
"epoch": 0.17323081751137612,
"grad_norm": 2.0,
"learning_rate": 1.7244e-05,
"loss": 0.419,
"step": 690
},
{
"epoch": 0.175741409069512,
"grad_norm": 2.078125,
"learning_rate": 1.7204e-05,
"loss": 0.4181,
"step": 700
},
{
"epoch": 0.1782520006276479,
"grad_norm": 2.0625,
"learning_rate": 1.7164e-05,
"loss": 0.4345,
"step": 710
},
{
"epoch": 0.18076259218578378,
"grad_norm": 2.078125,
"learning_rate": 1.7124e-05,
"loss": 0.436,
"step": 720
},
{
"epoch": 0.18327318374391965,
"grad_norm": 2.15625,
"learning_rate": 1.7084e-05,
"loss": 0.3904,
"step": 730
},
{
"epoch": 0.18578377530205556,
"grad_norm": 1.7265625,
"learning_rate": 1.7044e-05,
"loss": 0.3941,
"step": 740
},
{
"epoch": 0.18829436686019144,
"grad_norm": 1.9453125,
"learning_rate": 1.7004000000000002e-05,
"loss": 0.403,
"step": 750
},
{
"epoch": 0.19080495841832731,
"grad_norm": 2.09375,
"learning_rate": 1.6964e-05,
"loss": 0.397,
"step": 760
},
{
"epoch": 0.1933155499764632,
"grad_norm": 2.078125,
"learning_rate": 1.6924e-05,
"loss": 0.3869,
"step": 770
},
{
"epoch": 0.1958261415345991,
"grad_norm": 2.046875,
"learning_rate": 1.6884e-05,
"loss": 0.428,
"step": 780
},
{
"epoch": 0.19833673309273497,
"grad_norm": 2.375,
"learning_rate": 1.6844e-05,
"loss": 0.4024,
"step": 790
},
{
"epoch": 0.20084732465087085,
"grad_norm": 2.03125,
"learning_rate": 1.6804e-05,
"loss": 0.4061,
"step": 800
},
{
"epoch": 0.20335791620900676,
"grad_norm": 2.171875,
"learning_rate": 1.6764e-05,
"loss": 0.3882,
"step": 810
},
{
"epoch": 0.20586850776714263,
"grad_norm": 2.140625,
"learning_rate": 1.6724e-05,
"loss": 0.3973,
"step": 820
},
{
"epoch": 0.2083790993252785,
"grad_norm": 2.203125,
"learning_rate": 1.6684e-05,
"loss": 0.3871,
"step": 830
},
{
"epoch": 0.21088969088341442,
"grad_norm": 2.09375,
"learning_rate": 1.6644000000000002e-05,
"loss": 0.3666,
"step": 840
},
{
"epoch": 0.2134002824415503,
"grad_norm": 2.046875,
"learning_rate": 1.6604000000000002e-05,
"loss": 0.3853,
"step": 850
},
{
"epoch": 0.21591087399968617,
"grad_norm": 2.078125,
"learning_rate": 1.6564000000000003e-05,
"loss": 0.3772,
"step": 860
},
{
"epoch": 0.21842146555782208,
"grad_norm": 2.3125,
"learning_rate": 1.6524000000000003e-05,
"loss": 0.3926,
"step": 870
},
{
"epoch": 0.22093205711595795,
"grad_norm": 2.0625,
"learning_rate": 1.6484000000000003e-05,
"loss": 0.3824,
"step": 880
},
{
"epoch": 0.22344264867409383,
"grad_norm": 2.078125,
"learning_rate": 1.6444000000000004e-05,
"loss": 0.3777,
"step": 890
},
{
"epoch": 0.2259532402322297,
"grad_norm": 1.8984375,
"learning_rate": 1.6404e-05,
"loss": 0.3886,
"step": 900
},
{
"epoch": 0.2284638317903656,
"grad_norm": 2.25,
"learning_rate": 1.6364e-05,
"loss": 0.4085,
"step": 910
},
{
"epoch": 0.2309744233485015,
"grad_norm": 2.125,
"learning_rate": 1.6324e-05,
"loss": 0.3692,
"step": 920
},
{
"epoch": 0.23348501490663737,
"grad_norm": 2.125,
"learning_rate": 1.6284000000000002e-05,
"loss": 0.3528,
"step": 930
},
{
"epoch": 0.23599560646477327,
"grad_norm": 2.0625,
"learning_rate": 1.6244000000000002e-05,
"loss": 0.3708,
"step": 940
},
{
"epoch": 0.23850619802290915,
"grad_norm": 1.9453125,
"learning_rate": 1.6204000000000003e-05,
"loss": 0.3625,
"step": 950
},
{
"epoch": 0.24101678958104503,
"grad_norm": 2.140625,
"learning_rate": 1.6164000000000003e-05,
"loss": 0.3596,
"step": 960
},
{
"epoch": 0.24352738113918093,
"grad_norm": 1.7265625,
"learning_rate": 1.6124000000000004e-05,
"loss": 0.3576,
"step": 970
},
{
"epoch": 0.2460379726973168,
"grad_norm": 1.6015625,
"learning_rate": 1.6084e-05,
"loss": 0.3481,
"step": 980
},
{
"epoch": 0.2485485642554527,
"grad_norm": 2.296875,
"learning_rate": 1.6044e-05,
"loss": 0.3899,
"step": 990
},
{
"epoch": 0.25105915581358856,
"grad_norm": 2.0,
"learning_rate": 1.6004e-05,
"loss": 0.3838,
"step": 1000
},
{
"epoch": 0.25356974737172444,
"grad_norm": 1.78125,
"learning_rate": 1.5964e-05,
"loss": 0.3522,
"step": 1010
},
{
"epoch": 0.2560803389298604,
"grad_norm": 2.0625,
"learning_rate": 1.5924000000000002e-05,
"loss": 0.3277,
"step": 1020
},
{
"epoch": 0.25859093048799625,
"grad_norm": 2.109375,
"learning_rate": 1.5884000000000002e-05,
"loss": 0.3532,
"step": 1030
},
{
"epoch": 0.26110152204613213,
"grad_norm": 2.125,
"learning_rate": 1.5844000000000003e-05,
"loss": 0.3558,
"step": 1040
},
{
"epoch": 0.263612113604268,
"grad_norm": 2.375,
"learning_rate": 1.5804000000000003e-05,
"loss": 0.3617,
"step": 1050
},
{
"epoch": 0.2661227051624039,
"grad_norm": 2.421875,
"learning_rate": 1.5764e-05,
"loss": 0.3607,
"step": 1060
},
{
"epoch": 0.26863329672053976,
"grad_norm": 2.140625,
"learning_rate": 1.5724e-05,
"loss": 0.3948,
"step": 1070
},
{
"epoch": 0.27114388827867564,
"grad_norm": 1.8203125,
"learning_rate": 1.5684e-05,
"loss": 0.3776,
"step": 1080
},
{
"epoch": 0.27365447983681157,
"grad_norm": 2.15625,
"learning_rate": 1.5644e-05,
"loss": 0.3324,
"step": 1090
},
{
"epoch": 0.27616507139494745,
"grad_norm": 2.125,
"learning_rate": 1.5604000000000002e-05,
"loss": 0.3536,
"step": 1100
},
{
"epoch": 0.2786756629530833,
"grad_norm": 2.34375,
"learning_rate": 1.5564000000000002e-05,
"loss": 0.3591,
"step": 1110
},
{
"epoch": 0.2811862545112192,
"grad_norm": 2.015625,
"learning_rate": 1.5524000000000002e-05,
"loss": 0.3598,
"step": 1120
},
{
"epoch": 0.2836968460693551,
"grad_norm": 1.734375,
"learning_rate": 1.5484000000000003e-05,
"loss": 0.323,
"step": 1130
},
{
"epoch": 0.28620743762749096,
"grad_norm": 1.421875,
"learning_rate": 1.5444e-05,
"loss": 0.3496,
"step": 1140
},
{
"epoch": 0.2887180291856269,
"grad_norm": 2.1875,
"learning_rate": 1.5404e-05,
"loss": 0.3442,
"step": 1150
},
{
"epoch": 0.29122862074376277,
"grad_norm": 2.09375,
"learning_rate": 1.5364e-05,
"loss": 0.3392,
"step": 1160
},
{
"epoch": 0.29373921230189864,
"grad_norm": 1.9296875,
"learning_rate": 1.5324e-05,
"loss": 0.3689,
"step": 1170
},
{
"epoch": 0.2962498038600345,
"grad_norm": 2.296875,
"learning_rate": 1.5284e-05,
"loss": 0.347,
"step": 1180
},
{
"epoch": 0.2987603954181704,
"grad_norm": 1.96875,
"learning_rate": 1.5244000000000002e-05,
"loss": 0.3487,
"step": 1190
},
{
"epoch": 0.3012709869763063,
"grad_norm": 2.46875,
"learning_rate": 1.5204e-05,
"loss": 0.3182,
"step": 1200
},
{
"epoch": 0.30378157853444215,
"grad_norm": 2.109375,
"learning_rate": 1.5164e-05,
"loss": 0.3298,
"step": 1210
},
{
"epoch": 0.3062921700925781,
"grad_norm": 2.40625,
"learning_rate": 1.5124000000000001e-05,
"loss": 0.3299,
"step": 1220
},
{
"epoch": 0.30880276165071396,
"grad_norm": 2.53125,
"learning_rate": 1.5084000000000002e-05,
"loss": 0.3685,
"step": 1230
},
{
"epoch": 0.31131335320884984,
"grad_norm": 2.09375,
"learning_rate": 1.5044e-05,
"loss": 0.3321,
"step": 1240
},
{
"epoch": 0.3138239447669857,
"grad_norm": 1.9453125,
"learning_rate": 1.5004e-05,
"loss": 0.3397,
"step": 1250
},
{
"epoch": 0.3163345363251216,
"grad_norm": 1.8359375,
"learning_rate": 1.4964000000000001e-05,
"loss": 0.3296,
"step": 1260
},
{
"epoch": 0.3188451278832575,
"grad_norm": 1.875,
"learning_rate": 1.4924000000000001e-05,
"loss": 0.3122,
"step": 1270
},
{
"epoch": 0.3213557194413934,
"grad_norm": 2.0,
"learning_rate": 1.4884e-05,
"loss": 0.3325,
"step": 1280
},
{
"epoch": 0.3238663109995293,
"grad_norm": 1.9453125,
"learning_rate": 1.4844e-05,
"loss": 0.343,
"step": 1290
},
{
"epoch": 0.32637690255766516,
"grad_norm": 2.21875,
"learning_rate": 1.4804000000000001e-05,
"loss": 0.332,
"step": 1300
},
{
"epoch": 0.32888749411580104,
"grad_norm": 2.234375,
"learning_rate": 1.4764000000000001e-05,
"loss": 0.3303,
"step": 1310
},
{
"epoch": 0.3313980856739369,
"grad_norm": 2.25,
"learning_rate": 1.4724e-05,
"loss": 0.3302,
"step": 1320
},
{
"epoch": 0.3339086772320728,
"grad_norm": 1.6796875,
"learning_rate": 1.4684e-05,
"loss": 0.3304,
"step": 1330
},
{
"epoch": 0.33641926879020867,
"grad_norm": 1.7734375,
"learning_rate": 1.4644e-05,
"loss": 0.3247,
"step": 1340
},
{
"epoch": 0.3389298603483446,
"grad_norm": 1.9375,
"learning_rate": 1.4604000000000001e-05,
"loss": 0.324,
"step": 1350
},
{
"epoch": 0.3414404519064805,
"grad_norm": 1.5390625,
"learning_rate": 1.4564e-05,
"loss": 0.3353,
"step": 1360
},
{
"epoch": 0.34395104346461636,
"grad_norm": 2.03125,
"learning_rate": 1.4524e-05,
"loss": 0.321,
"step": 1370
},
{
"epoch": 0.34646163502275223,
"grad_norm": 2.5625,
"learning_rate": 1.4484e-05,
"loss": 0.3133,
"step": 1380
},
{
"epoch": 0.3489722265808881,
"grad_norm": 2.34375,
"learning_rate": 1.4444000000000001e-05,
"loss": 0.3516,
"step": 1390
},
{
"epoch": 0.351482818139024,
"grad_norm": 1.6015625,
"learning_rate": 1.4404e-05,
"loss": 0.3093,
"step": 1400
},
{
"epoch": 0.35399340969715987,
"grad_norm": 1.4765625,
"learning_rate": 1.4364e-05,
"loss": 0.3171,
"step": 1410
},
{
"epoch": 0.3565040012552958,
"grad_norm": 2.0,
"learning_rate": 1.4324e-05,
"loss": 0.3321,
"step": 1420
},
{
"epoch": 0.3590145928134317,
"grad_norm": 2.0625,
"learning_rate": 1.4284e-05,
"loss": 0.3111,
"step": 1430
},
{
"epoch": 0.36152518437156755,
"grad_norm": 1.9375,
"learning_rate": 1.4244000000000003e-05,
"loss": 0.3403,
"step": 1440
},
{
"epoch": 0.36403577592970343,
"grad_norm": 1.859375,
"learning_rate": 1.4204000000000002e-05,
"loss": 0.3174,
"step": 1450
},
{
"epoch": 0.3665463674878393,
"grad_norm": 2.03125,
"learning_rate": 1.4164000000000002e-05,
"loss": 0.3332,
"step": 1460
},
{
"epoch": 0.3690569590459752,
"grad_norm": 1.7734375,
"learning_rate": 1.4124000000000002e-05,
"loss": 0.3139,
"step": 1470
},
{
"epoch": 0.3715675506041111,
"grad_norm": 1.9375,
"learning_rate": 1.4084000000000003e-05,
"loss": 0.328,
"step": 1480
},
{
"epoch": 0.374078142162247,
"grad_norm": 2.015625,
"learning_rate": 1.4044000000000001e-05,
"loss": 0.3301,
"step": 1490
},
{
"epoch": 0.3765887337203829,
"grad_norm": 2.6875,
"learning_rate": 1.4004000000000002e-05,
"loss": 0.3218,
"step": 1500
},
{
"epoch": 0.37909932527851875,
"grad_norm": 2.453125,
"learning_rate": 1.3964000000000002e-05,
"loss": 0.3371,
"step": 1510
},
{
"epoch": 0.38160991683665463,
"grad_norm": 1.515625,
"learning_rate": 1.3924000000000003e-05,
"loss": 0.3162,
"step": 1520
},
{
"epoch": 0.3841205083947905,
"grad_norm": 1.9765625,
"learning_rate": 1.3884000000000001e-05,
"loss": 0.3092,
"step": 1530
},
{
"epoch": 0.3866310999529264,
"grad_norm": 2.1875,
"learning_rate": 1.3844000000000002e-05,
"loss": 0.3224,
"step": 1540
},
{
"epoch": 0.3891416915110623,
"grad_norm": 2.390625,
"learning_rate": 1.3804000000000002e-05,
"loss": 0.3027,
"step": 1550
},
{
"epoch": 0.3916522830691982,
"grad_norm": 2.375,
"learning_rate": 1.3764000000000002e-05,
"loss": 0.3303,
"step": 1560
},
{
"epoch": 0.39416287462733407,
"grad_norm": 1.828125,
"learning_rate": 1.3724000000000001e-05,
"loss": 0.3317,
"step": 1570
},
{
"epoch": 0.39667346618546995,
"grad_norm": 1.9140625,
"learning_rate": 1.3684000000000001e-05,
"loss": 0.3195,
"step": 1580
},
{
"epoch": 0.3991840577436058,
"grad_norm": 2.03125,
"learning_rate": 1.3644000000000002e-05,
"loss": 0.3185,
"step": 1590
},
{
"epoch": 0.4016946493017417,
"grad_norm": 1.75,
"learning_rate": 1.3604000000000002e-05,
"loss": 0.2818,
"step": 1600
},
{
"epoch": 0.40420524085987763,
"grad_norm": 1.8515625,
"learning_rate": 1.3564000000000001e-05,
"loss": 0.2996,
"step": 1610
},
{
"epoch": 0.4067158324180135,
"grad_norm": 1.7578125,
"learning_rate": 1.3524000000000001e-05,
"loss": 0.3167,
"step": 1620
},
{
"epoch": 0.4092264239761494,
"grad_norm": 2.25,
"learning_rate": 1.3484000000000002e-05,
"loss": 0.3257,
"step": 1630
},
{
"epoch": 0.41173701553428527,
"grad_norm": 1.671875,
"learning_rate": 1.3444000000000002e-05,
"loss": 0.3145,
"step": 1640
},
{
"epoch": 0.41424760709242114,
"grad_norm": 1.59375,
"learning_rate": 1.3404e-05,
"loss": 0.2936,
"step": 1650
},
{
"epoch": 0.416758198650557,
"grad_norm": 1.953125,
"learning_rate": 1.3364000000000001e-05,
"loss": 0.3158,
"step": 1660
},
{
"epoch": 0.4192687902086929,
"grad_norm": 1.53125,
"learning_rate": 1.3324000000000002e-05,
"loss": 0.3233,
"step": 1670
},
{
"epoch": 0.42177938176682883,
"grad_norm": 2.28125,
"learning_rate": 1.3284000000000002e-05,
"loss": 0.2975,
"step": 1680
},
{
"epoch": 0.4242899733249647,
"grad_norm": 2.046875,
"learning_rate": 1.3244e-05,
"loss": 0.3058,
"step": 1690
},
{
"epoch": 0.4268005648831006,
"grad_norm": 2.15625,
"learning_rate": 1.3204000000000001e-05,
"loss": 0.3107,
"step": 1700
},
{
"epoch": 0.42931115644123646,
"grad_norm": 1.8203125,
"learning_rate": 1.3164000000000001e-05,
"loss": 0.3096,
"step": 1710
},
{
"epoch": 0.43182174799937234,
"grad_norm": 1.3828125,
"learning_rate": 1.3124000000000002e-05,
"loss": 0.3052,
"step": 1720
},
{
"epoch": 0.4343323395575082,
"grad_norm": 1.8515625,
"learning_rate": 1.3084e-05,
"loss": 0.3029,
"step": 1730
},
{
"epoch": 0.43684293111564415,
"grad_norm": 1.8203125,
"learning_rate": 1.3044e-05,
"loss": 0.3147,
"step": 1740
},
{
"epoch": 0.43935352267378003,
"grad_norm": 1.984375,
"learning_rate": 1.3004000000000001e-05,
"loss": 0.3053,
"step": 1750
},
{
"epoch": 0.4418641142319159,
"grad_norm": 1.953125,
"learning_rate": 1.2964000000000002e-05,
"loss": 0.2913,
"step": 1760
},
{
"epoch": 0.4443747057900518,
"grad_norm": 1.8515625,
"learning_rate": 1.2924e-05,
"loss": 0.2916,
"step": 1770
},
{
"epoch": 0.44688529734818766,
"grad_norm": 2.046875,
"learning_rate": 1.2884e-05,
"loss": 0.3159,
"step": 1780
},
{
"epoch": 0.44939588890632354,
"grad_norm": 1.90625,
"learning_rate": 1.2844000000000001e-05,
"loss": 0.2987,
"step": 1790
},
{
"epoch": 0.4519064804644594,
"grad_norm": 1.953125,
"learning_rate": 1.2804000000000001e-05,
"loss": 0.3136,
"step": 1800
},
{
"epoch": 0.45441707202259535,
"grad_norm": 1.6015625,
"learning_rate": 1.2764e-05,
"loss": 0.3038,
"step": 1810
},
{
"epoch": 0.4569276635807312,
"grad_norm": 2.0,
"learning_rate": 1.2724e-05,
"loss": 0.2916,
"step": 1820
},
{
"epoch": 0.4594382551388671,
"grad_norm": 1.9765625,
"learning_rate": 1.2684000000000001e-05,
"loss": 0.3113,
"step": 1830
},
{
"epoch": 0.461948846697003,
"grad_norm": 1.5625,
"learning_rate": 1.2644000000000001e-05,
"loss": 0.2728,
"step": 1840
},
{
"epoch": 0.46445943825513886,
"grad_norm": 1.9296875,
"learning_rate": 1.2604e-05,
"loss": 0.2868,
"step": 1850
},
{
"epoch": 0.46697002981327473,
"grad_norm": 2.140625,
"learning_rate": 1.2564e-05,
"loss": 0.289,
"step": 1860
},
{
"epoch": 0.4694806213714106,
"grad_norm": 1.4921875,
"learning_rate": 1.2524e-05,
"loss": 0.2881,
"step": 1870
},
{
"epoch": 0.47199121292954654,
"grad_norm": 1.8984375,
"learning_rate": 1.2484000000000001e-05,
"loss": 0.2803,
"step": 1880
},
{
"epoch": 0.4745018044876824,
"grad_norm": 1.9375,
"learning_rate": 1.2444e-05,
"loss": 0.2785,
"step": 1890
},
{
"epoch": 0.4770123960458183,
"grad_norm": 1.71875,
"learning_rate": 1.2404e-05,
"loss": 0.2976,
"step": 1900
},
{
"epoch": 0.4795229876039542,
"grad_norm": 1.9296875,
"learning_rate": 1.2364e-05,
"loss": 0.2737,
"step": 1910
},
{
"epoch": 0.48203357916209005,
"grad_norm": 1.7421875,
"learning_rate": 1.2324000000000001e-05,
"loss": 0.3237,
"step": 1920
},
{
"epoch": 0.48454417072022593,
"grad_norm": 1.859375,
"learning_rate": 1.2284e-05,
"loss": 0.297,
"step": 1930
},
{
"epoch": 0.48705476227836186,
"grad_norm": 2.046875,
"learning_rate": 1.2244e-05,
"loss": 0.2745,
"step": 1940
},
{
"epoch": 0.48956535383649774,
"grad_norm": 2.1875,
"learning_rate": 1.2204e-05,
"loss": 0.3059,
"step": 1950
},
{
"epoch": 0.4920759453946336,
"grad_norm": 2.359375,
"learning_rate": 1.2164e-05,
"loss": 0.2876,
"step": 1960
},
{
"epoch": 0.4945865369527695,
"grad_norm": 2.6875,
"learning_rate": 1.2124e-05,
"loss": 0.2801,
"step": 1970
},
{
"epoch": 0.4970971285109054,
"grad_norm": 1.921875,
"learning_rate": 1.2084e-05,
"loss": 0.2971,
"step": 1980
},
{
"epoch": 0.49960772006904125,
"grad_norm": 1.6484375,
"learning_rate": 1.2044e-05,
"loss": 0.2979,
"step": 1990
},
{
"epoch": 0.5021183116271771,
"grad_norm": 1.9296875,
"learning_rate": 1.2004e-05,
"loss": 0.306,
"step": 2000
},
{
"epoch": 0.5046289031853131,
"grad_norm": 1.5859375,
"learning_rate": 1.1964e-05,
"loss": 0.2975,
"step": 2010
},
{
"epoch": 0.5071394947434489,
"grad_norm": 1.9296875,
"learning_rate": 1.1924e-05,
"loss": 0.2771,
"step": 2020
},
{
"epoch": 0.5096500863015848,
"grad_norm": 2.265625,
"learning_rate": 1.1884e-05,
"loss": 0.2903,
"step": 2030
},
{
"epoch": 0.5121606778597207,
"grad_norm": 1.7109375,
"learning_rate": 1.1844e-05,
"loss": 0.2808,
"step": 2040
},
{
"epoch": 0.5146712694178566,
"grad_norm": 1.7890625,
"learning_rate": 1.1803999999999999e-05,
"loss": 0.2856,
"step": 2050
},
{
"epoch": 0.5171818609759925,
"grad_norm": 1.6796875,
"learning_rate": 1.1764e-05,
"loss": 0.2868,
"step": 2060
},
{
"epoch": 0.5196924525341283,
"grad_norm": 1.7109375,
"learning_rate": 1.1724000000000002e-05,
"loss": 0.2973,
"step": 2070
},
{
"epoch": 0.5222030440922643,
"grad_norm": 1.8828125,
"learning_rate": 1.1684000000000002e-05,
"loss": 0.3097,
"step": 2080
},
{
"epoch": 0.5247136356504001,
"grad_norm": 1.6953125,
"learning_rate": 1.1644000000000002e-05,
"loss": 0.2731,
"step": 2090
},
{
"epoch": 0.527224227208536,
"grad_norm": 2.0625,
"learning_rate": 1.1604000000000003e-05,
"loss": 0.2849,
"step": 2100
},
{
"epoch": 0.5297348187666719,
"grad_norm": 2.203125,
"learning_rate": 1.1564000000000001e-05,
"loss": 0.2949,
"step": 2110
},
{
"epoch": 0.5322454103248078,
"grad_norm": 2.203125,
"learning_rate": 1.1524000000000002e-05,
"loss": 0.3049,
"step": 2120
},
{
"epoch": 0.5347560018829437,
"grad_norm": 1.7421875,
"learning_rate": 1.1484000000000002e-05,
"loss": 0.2851,
"step": 2130
},
{
"epoch": 0.5372665934410795,
"grad_norm": 1.84375,
"learning_rate": 1.1444000000000003e-05,
"loss": 0.2919,
"step": 2140
},
{
"epoch": 0.5397771849992155,
"grad_norm": 1.765625,
"learning_rate": 1.1404000000000001e-05,
"loss": 0.2967,
"step": 2150
},
{
"epoch": 0.5422877765573513,
"grad_norm": 1.5546875,
"learning_rate": 1.1364000000000002e-05,
"loss": 0.2801,
"step": 2160
},
{
"epoch": 0.5447983681154872,
"grad_norm": 2.0,
"learning_rate": 1.1324000000000002e-05,
"loss": 0.3012,
"step": 2170
},
{
"epoch": 0.5473089596736231,
"grad_norm": 1.8984375,
"learning_rate": 1.1284000000000002e-05,
"loss": 0.2925,
"step": 2180
},
{
"epoch": 0.549819551231759,
"grad_norm": 1.609375,
"learning_rate": 1.1244000000000001e-05,
"loss": 0.2954,
"step": 2190
},
{
"epoch": 0.5523301427898949,
"grad_norm": 1.734375,
"learning_rate": 1.1204000000000001e-05,
"loss": 0.2799,
"step": 2200
},
{
"epoch": 0.5548407343480307,
"grad_norm": 1.515625,
"learning_rate": 1.1164000000000002e-05,
"loss": 0.2704,
"step": 2210
},
{
"epoch": 0.5573513259061667,
"grad_norm": 1.6953125,
"learning_rate": 1.1124000000000002e-05,
"loss": 0.2876,
"step": 2220
},
{
"epoch": 0.5598619174643025,
"grad_norm": 1.7421875,
"learning_rate": 1.1084000000000001e-05,
"loss": 0.2633,
"step": 2230
},
{
"epoch": 0.5623725090224384,
"grad_norm": 1.796875,
"learning_rate": 1.1044000000000001e-05,
"loss": 0.2867,
"step": 2240
},
{
"epoch": 0.5648831005805743,
"grad_norm": 1.4609375,
"learning_rate": 1.1004000000000002e-05,
"loss": 0.2702,
"step": 2250
},
{
"epoch": 0.5673936921387102,
"grad_norm": 1.828125,
"learning_rate": 1.0964000000000002e-05,
"loss": 0.2852,
"step": 2260
},
{
"epoch": 0.5699042836968461,
"grad_norm": 1.703125,
"learning_rate": 1.0924e-05,
"loss": 0.2746,
"step": 2270
},
{
"epoch": 0.5724148752549819,
"grad_norm": 1.90625,
"learning_rate": 1.0884000000000001e-05,
"loss": 0.2932,
"step": 2280
},
{
"epoch": 0.5749254668131178,
"grad_norm": 1.84375,
"learning_rate": 1.0844000000000002e-05,
"loss": 0.2903,
"step": 2290
},
{
"epoch": 0.5774360583712538,
"grad_norm": 2.15625,
"learning_rate": 1.0804000000000002e-05,
"loss": 0.2851,
"step": 2300
},
{
"epoch": 0.5799466499293896,
"grad_norm": 2.0625,
"learning_rate": 1.0764e-05,
"loss": 0.2923,
"step": 2310
},
{
"epoch": 0.5824572414875255,
"grad_norm": 2.109375,
"learning_rate": 1.0724000000000001e-05,
"loss": 0.3003,
"step": 2320
},
{
"epoch": 0.5849678330456614,
"grad_norm": 2.046875,
"learning_rate": 1.0684000000000001e-05,
"loss": 0.291,
"step": 2330
},
{
"epoch": 0.5874784246037973,
"grad_norm": 1.8828125,
"learning_rate": 1.0644000000000002e-05,
"loss": 0.2787,
"step": 2340
},
{
"epoch": 0.5899890161619331,
"grad_norm": 1.8203125,
"learning_rate": 1.0604e-05,
"loss": 0.2764,
"step": 2350
},
{
"epoch": 0.592499607720069,
"grad_norm": 1.65625,
"learning_rate": 1.0564e-05,
"loss": 0.2842,
"step": 2360
},
{
"epoch": 0.595010199278205,
"grad_norm": 1.78125,
"learning_rate": 1.0524000000000001e-05,
"loss": 0.2922,
"step": 2370
},
{
"epoch": 0.5975207908363408,
"grad_norm": 1.5625,
"learning_rate": 1.0484000000000002e-05,
"loss": 0.283,
"step": 2380
},
{
"epoch": 0.6000313823944767,
"grad_norm": 1.5703125,
"learning_rate": 1.0444e-05,
"loss": 0.2796,
"step": 2390
},
{
"epoch": 0.6025419739526126,
"grad_norm": 1.96875,
"learning_rate": 1.0404e-05,
"loss": 0.2828,
"step": 2400
},
{
"epoch": 0.6050525655107485,
"grad_norm": 1.5703125,
"learning_rate": 1.0364000000000001e-05,
"loss": 0.2866,
"step": 2410
},
{
"epoch": 0.6075631570688843,
"grad_norm": 2.109375,
"learning_rate": 1.0324000000000001e-05,
"loss": 0.2836,
"step": 2420
},
{
"epoch": 0.6100737486270202,
"grad_norm": 1.875,
"learning_rate": 1.0284e-05,
"loss": 0.3037,
"step": 2430
},
{
"epoch": 0.6125843401851562,
"grad_norm": 2.203125,
"learning_rate": 1.0244e-05,
"loss": 0.2774,
"step": 2440
},
{
"epoch": 0.615094931743292,
"grad_norm": 1.8671875,
"learning_rate": 1.0204000000000001e-05,
"loss": 0.2922,
"step": 2450
},
{
"epoch": 0.6176055233014279,
"grad_norm": 1.7109375,
"learning_rate": 1.0164000000000001e-05,
"loss": 0.283,
"step": 2460
},
{
"epoch": 0.6201161148595637,
"grad_norm": 1.9453125,
"learning_rate": 1.0124e-05,
"loss": 0.2725,
"step": 2470
},
{
"epoch": 0.6226267064176997,
"grad_norm": 1.6875,
"learning_rate": 1.0084e-05,
"loss": 0.2749,
"step": 2480
},
{
"epoch": 0.6251372979758355,
"grad_norm": 1.703125,
"learning_rate": 1.0044e-05,
"loss": 0.2823,
"step": 2490
},
{
"epoch": 0.6276478895339714,
"grad_norm": 1.9296875,
"learning_rate": 1.0004000000000001e-05,
"loss": 0.281,
"step": 2500
},
{
"epoch": 0.6301584810921074,
"grad_norm": 1.9140625,
"learning_rate": 9.964e-06,
"loss": 0.2856,
"step": 2510
},
{
"epoch": 0.6326690726502432,
"grad_norm": 1.4921875,
"learning_rate": 9.924e-06,
"loss": 0.28,
"step": 2520
},
{
"epoch": 0.6351796642083791,
"grad_norm": 1.53125,
"learning_rate": 9.884e-06,
"loss": 0.267,
"step": 2530
},
{
"epoch": 0.637690255766515,
"grad_norm": 1.5703125,
"learning_rate": 9.844000000000001e-06,
"loss": 0.2563,
"step": 2540
},
{
"epoch": 0.6402008473246509,
"grad_norm": 1.46875,
"learning_rate": 9.804000000000001e-06,
"loss": 0.2911,
"step": 2550
},
{
"epoch": 0.6427114388827868,
"grad_norm": 2.46875,
"learning_rate": 9.764000000000002e-06,
"loss": 0.2897,
"step": 2560
},
{
"epoch": 0.6452220304409226,
"grad_norm": 1.90625,
"learning_rate": 9.724e-06,
"loss": 0.2975,
"step": 2570
},
{
"epoch": 0.6477326219990586,
"grad_norm": 1.890625,
"learning_rate": 9.684e-06,
"loss": 0.2722,
"step": 2580
},
{
"epoch": 0.6502432135571944,
"grad_norm": 1.6015625,
"learning_rate": 9.644000000000001e-06,
"loss": 0.2787,
"step": 2590
},
{
"epoch": 0.6527538051153303,
"grad_norm": 2.046875,
"learning_rate": 9.604000000000002e-06,
"loss": 0.2822,
"step": 2600
},
{
"epoch": 0.6552643966734661,
"grad_norm": 1.9296875,
"learning_rate": 9.564e-06,
"loss": 0.2903,
"step": 2610
},
{
"epoch": 0.6577749882316021,
"grad_norm": 1.78125,
"learning_rate": 9.524e-06,
"loss": 0.2693,
"step": 2620
},
{
"epoch": 0.660285579789738,
"grad_norm": 1.46875,
"learning_rate": 9.484000000000001e-06,
"loss": 0.2768,
"step": 2630
},
{
"epoch": 0.6627961713478738,
"grad_norm": 1.734375,
"learning_rate": 9.444000000000001e-06,
"loss": 0.2707,
"step": 2640
},
{
"epoch": 0.6653067629060098,
"grad_norm": 1.5703125,
"learning_rate": 9.404e-06,
"loss": 0.2791,
"step": 2650
},
{
"epoch": 0.6678173544641456,
"grad_norm": 1.703125,
"learning_rate": 9.364e-06,
"loss": 0.2924,
"step": 2660
},
{
"epoch": 0.6703279460222815,
"grad_norm": 1.9296875,
"learning_rate": 9.324000000000001e-06,
"loss": 0.2875,
"step": 2670
},
{
"epoch": 0.6728385375804173,
"grad_norm": 1.78125,
"learning_rate": 9.284000000000001e-06,
"loss": 0.2602,
"step": 2680
},
{
"epoch": 0.6753491291385533,
"grad_norm": 1.6875,
"learning_rate": 9.244e-06,
"loss": 0.2924,
"step": 2690
},
{
"epoch": 0.6778597206966892,
"grad_norm": 1.578125,
"learning_rate": 9.204e-06,
"loss": 0.2841,
"step": 2700
},
{
"epoch": 0.680370312254825,
"grad_norm": 1.390625,
"learning_rate": 9.164e-06,
"loss": 0.2745,
"step": 2710
},
{
"epoch": 0.682880903812961,
"grad_norm": 2.28125,
"learning_rate": 9.124000000000001e-06,
"loss": 0.2876,
"step": 2720
},
{
"epoch": 0.6853914953710968,
"grad_norm": 1.546875,
"learning_rate": 9.084e-06,
"loss": 0.2549,
"step": 2730
},
{
"epoch": 0.6879020869292327,
"grad_norm": 1.3515625,
"learning_rate": 9.044e-06,
"loss": 0.2512,
"step": 2740
},
{
"epoch": 0.6904126784873685,
"grad_norm": 2.203125,
"learning_rate": 9.004e-06,
"loss": 0.2686,
"step": 2750
},
{
"epoch": 0.6929232700455045,
"grad_norm": 1.6484375,
"learning_rate": 8.964000000000001e-06,
"loss": 0.2714,
"step": 2760
},
{
"epoch": 0.6954338616036404,
"grad_norm": 1.6796875,
"learning_rate": 8.924e-06,
"loss": 0.275,
"step": 2770
},
{
"epoch": 0.6979444531617762,
"grad_norm": 1.8359375,
"learning_rate": 8.884e-06,
"loss": 0.2631,
"step": 2780
},
{
"epoch": 0.7004550447199122,
"grad_norm": 1.3828125,
"learning_rate": 8.844e-06,
"loss": 0.2639,
"step": 2790
},
{
"epoch": 0.702965636278048,
"grad_norm": 1.59375,
"learning_rate": 8.804e-06,
"loss": 0.2755,
"step": 2800
},
{
"epoch": 0.7054762278361839,
"grad_norm": 1.8828125,
"learning_rate": 8.764e-06,
"loss": 0.282,
"step": 2810
},
{
"epoch": 0.7079868193943197,
"grad_norm": 1.9140625,
"learning_rate": 8.724e-06,
"loss": 0.2672,
"step": 2820
},
{
"epoch": 0.7104974109524557,
"grad_norm": 1.5859375,
"learning_rate": 8.684e-06,
"loss": 0.2663,
"step": 2830
},
{
"epoch": 0.7130080025105916,
"grad_norm": 1.75,
"learning_rate": 8.644e-06,
"loss": 0.263,
"step": 2840
},
{
"epoch": 0.7155185940687274,
"grad_norm": 1.9765625,
"learning_rate": 8.604000000000001e-06,
"loss": 0.276,
"step": 2850
},
{
"epoch": 0.7180291856268634,
"grad_norm": 1.5078125,
"learning_rate": 8.564000000000001e-06,
"loss": 0.2622,
"step": 2860
},
{
"epoch": 0.7205397771849992,
"grad_norm": 1.890625,
"learning_rate": 8.524000000000002e-06,
"loss": 0.2574,
"step": 2870
},
{
"epoch": 0.7230503687431351,
"grad_norm": 1.65625,
"learning_rate": 8.484e-06,
"loss": 0.2586,
"step": 2880
},
{
"epoch": 0.725560960301271,
"grad_norm": 2.234375,
"learning_rate": 8.444e-06,
"loss": 0.2694,
"step": 2890
},
{
"epoch": 0.7280715518594069,
"grad_norm": 1.6953125,
"learning_rate": 8.404000000000001e-06,
"loss": 0.2817,
"step": 2900
},
{
"epoch": 0.7305821434175428,
"grad_norm": 1.828125,
"learning_rate": 8.364000000000002e-06,
"loss": 0.2767,
"step": 2910
},
{
"epoch": 0.7330927349756786,
"grad_norm": 1.40625,
"learning_rate": 8.324e-06,
"loss": 0.2547,
"step": 2920
},
{
"epoch": 0.7356033265338145,
"grad_norm": 1.65625,
"learning_rate": 8.284e-06,
"loss": 0.2724,
"step": 2930
},
{
"epoch": 0.7381139180919504,
"grad_norm": 2.078125,
"learning_rate": 8.244000000000001e-06,
"loss": 0.2972,
"step": 2940
},
{
"epoch": 0.7406245096500863,
"grad_norm": 1.484375,
"learning_rate": 8.204000000000001e-06,
"loss": 0.2601,
"step": 2950
},
{
"epoch": 0.7431351012082222,
"grad_norm": 2.015625,
"learning_rate": 8.164e-06,
"loss": 0.2552,
"step": 2960
},
{
"epoch": 0.7456456927663581,
"grad_norm": 2.234375,
"learning_rate": 8.124e-06,
"loss": 0.274,
"step": 2970
},
{
"epoch": 0.748156284324494,
"grad_norm": 1.8125,
"learning_rate": 8.084000000000001e-06,
"loss": 0.272,
"step": 2980
},
{
"epoch": 0.7506668758826298,
"grad_norm": 1.6953125,
"learning_rate": 8.044000000000001e-06,
"loss": 0.2826,
"step": 2990
},
{
"epoch": 0.7531774674407657,
"grad_norm": 1.7109375,
"learning_rate": 8.004e-06,
"loss": 0.2799,
"step": 3000
},
{
"epoch": 0.7556880589989016,
"grad_norm": 1.640625,
"learning_rate": 7.964e-06,
"loss": 0.2919,
"step": 3010
},
{
"epoch": 0.7581986505570375,
"grad_norm": 1.6953125,
"learning_rate": 7.924e-06,
"loss": 0.2634,
"step": 3020
},
{
"epoch": 0.7607092421151734,
"grad_norm": 1.8515625,
"learning_rate": 7.884000000000001e-06,
"loss": 0.259,
"step": 3030
},
{
"epoch": 0.7632198336733093,
"grad_norm": 1.7109375,
"learning_rate": 7.844e-06,
"loss": 0.2687,
"step": 3040
},
{
"epoch": 0.7657304252314452,
"grad_norm": 1.6015625,
"learning_rate": 7.804e-06,
"loss": 0.2644,
"step": 3050
},
{
"epoch": 0.768241016789581,
"grad_norm": 1.7421875,
"learning_rate": 7.764e-06,
"loss": 0.2541,
"step": 3060
},
{
"epoch": 0.7707516083477169,
"grad_norm": 1.3203125,
"learning_rate": 7.724000000000001e-06,
"loss": 0.2555,
"step": 3070
},
{
"epoch": 0.7732621999058528,
"grad_norm": 1.4609375,
"learning_rate": 7.684e-06,
"loss": 0.2745,
"step": 3080
},
{
"epoch": 0.7757727914639887,
"grad_norm": 1.5546875,
"learning_rate": 7.644e-06,
"loss": 0.2924,
"step": 3090
},
{
"epoch": 0.7782833830221246,
"grad_norm": 1.78125,
"learning_rate": 7.604e-06,
"loss": 0.2859,
"step": 3100
},
{
"epoch": 0.7807939745802605,
"grad_norm": 1.75,
"learning_rate": 7.564e-06,
"loss": 0.2752,
"step": 3110
},
{
"epoch": 0.7833045661383964,
"grad_norm": 1.6171875,
"learning_rate": 7.524e-06,
"loss": 0.2611,
"step": 3120
},
{
"epoch": 0.7858151576965322,
"grad_norm": 1.578125,
"learning_rate": 7.484e-06,
"loss": 0.256,
"step": 3130
},
{
"epoch": 0.7883257492546681,
"grad_norm": 1.578125,
"learning_rate": 7.444e-06,
"loss": 0.2707,
"step": 3140
},
{
"epoch": 0.790836340812804,
"grad_norm": 1.765625,
"learning_rate": 7.404e-06,
"loss": 0.2711,
"step": 3150
},
{
"epoch": 0.7933469323709399,
"grad_norm": 1.6484375,
"learning_rate": 7.364000000000001e-06,
"loss": 0.2588,
"step": 3160
},
{
"epoch": 0.7958575239290758,
"grad_norm": 1.7890625,
"learning_rate": 7.324000000000001e-06,
"loss": 0.2618,
"step": 3170
},
{
"epoch": 0.7983681154872116,
"grad_norm": 1.7734375,
"learning_rate": 7.284000000000001e-06,
"loss": 0.2755,
"step": 3180
},
{
"epoch": 0.8008787070453476,
"grad_norm": 1.5703125,
"learning_rate": 7.244000000000001e-06,
"loss": 0.2727,
"step": 3190
},
{
"epoch": 0.8033892986034834,
"grad_norm": 1.453125,
"learning_rate": 7.204000000000001e-06,
"loss": 0.2584,
"step": 3200
},
{
"epoch": 0.8058998901616193,
"grad_norm": 1.7265625,
"learning_rate": 7.164000000000001e-06,
"loss": 0.2767,
"step": 3210
},
{
"epoch": 0.8084104817197553,
"grad_norm": 1.5703125,
"learning_rate": 7.124000000000001e-06,
"loss": 0.2735,
"step": 3220
},
{
"epoch": 0.8109210732778911,
"grad_norm": 1.8203125,
"learning_rate": 7.084000000000001e-06,
"loss": 0.277,
"step": 3230
},
{
"epoch": 0.813431664836027,
"grad_norm": 1.8125,
"learning_rate": 7.044000000000001e-06,
"loss": 0.2743,
"step": 3240
},
{
"epoch": 0.8159422563941628,
"grad_norm": 1.953125,
"learning_rate": 7.004000000000001e-06,
"loss": 0.2804,
"step": 3250
},
{
"epoch": 0.8184528479522988,
"grad_norm": 1.71875,
"learning_rate": 6.964000000000001e-06,
"loss": 0.27,
"step": 3260
},
{
"epoch": 0.8209634395104346,
"grad_norm": 1.578125,
"learning_rate": 6.924000000000001e-06,
"loss": 0.2672,
"step": 3270
},
{
"epoch": 0.8234740310685705,
"grad_norm": 1.671875,
"learning_rate": 6.8840000000000005e-06,
"loss": 0.2787,
"step": 3280
},
{
"epoch": 0.8259846226267065,
"grad_norm": 2.34375,
"learning_rate": 6.844000000000001e-06,
"loss": 0.2624,
"step": 3290
},
{
"epoch": 0.8284952141848423,
"grad_norm": 1.9765625,
"learning_rate": 6.804e-06,
"loss": 0.2655,
"step": 3300
},
{
"epoch": 0.8310058057429782,
"grad_norm": 1.65625,
"learning_rate": 6.764000000000001e-06,
"loss": 0.2677,
"step": 3310
},
{
"epoch": 0.833516397301114,
"grad_norm": 2.015625,
"learning_rate": 6.724e-06,
"loss": 0.2655,
"step": 3320
},
{
"epoch": 0.83602698885925,
"grad_norm": 1.5703125,
"learning_rate": 6.684000000000001e-06,
"loss": 0.27,
"step": 3330
},
{
"epoch": 0.8385375804173858,
"grad_norm": 1.6015625,
"learning_rate": 6.644e-06,
"loss": 0.2503,
"step": 3340
},
{
"epoch": 0.8410481719755217,
"grad_norm": 2.15625,
"learning_rate": 6.604000000000001e-06,
"loss": 0.2773,
"step": 3350
},
{
"epoch": 0.8435587635336577,
"grad_norm": 1.609375,
"learning_rate": 6.564e-06,
"loss": 0.274,
"step": 3360
},
{
"epoch": 0.8460693550917935,
"grad_norm": 1.578125,
"learning_rate": 6.5240000000000006e-06,
"loss": 0.2587,
"step": 3370
},
{
"epoch": 0.8485799466499294,
"grad_norm": 1.828125,
"learning_rate": 6.484e-06,
"loss": 0.2699,
"step": 3380
},
{
"epoch": 0.8510905382080652,
"grad_norm": 1.8125,
"learning_rate": 6.4440000000000005e-06,
"loss": 0.255,
"step": 3390
},
{
"epoch": 0.8536011297662012,
"grad_norm": 1.65625,
"learning_rate": 6.404e-06,
"loss": 0.2532,
"step": 3400
},
{
"epoch": 0.856111721324337,
"grad_norm": 1.6171875,
"learning_rate": 6.364e-06,
"loss": 0.2745,
"step": 3410
},
{
"epoch": 0.8586223128824729,
"grad_norm": 1.7421875,
"learning_rate": 6.324e-06,
"loss": 0.2713,
"step": 3420
},
{
"epoch": 0.8611329044406089,
"grad_norm": 1.6875,
"learning_rate": 6.284e-06,
"loss": 0.271,
"step": 3430
},
{
"epoch": 0.8636434959987447,
"grad_norm": 1.4375,
"learning_rate": 6.244e-06,
"loss": 0.2485,
"step": 3440
},
{
"epoch": 0.8661540875568806,
"grad_norm": 1.3203125,
"learning_rate": 6.204e-06,
"loss": 0.2648,
"step": 3450
},
{
"epoch": 0.8686646791150164,
"grad_norm": 1.8203125,
"learning_rate": 6.164e-06,
"loss": 0.2683,
"step": 3460
},
{
"epoch": 0.8711752706731524,
"grad_norm": 1.5703125,
"learning_rate": 6.124000000000001e-06,
"loss": 0.261,
"step": 3470
},
{
"epoch": 0.8736858622312883,
"grad_norm": 1.4296875,
"learning_rate": 6.084000000000001e-06,
"loss": 0.2541,
"step": 3480
},
{
"epoch": 0.8761964537894241,
"grad_norm": 1.859375,
"learning_rate": 6.044000000000001e-06,
"loss": 0.2607,
"step": 3490
},
{
"epoch": 0.8787070453475601,
"grad_norm": 2.71875,
"learning_rate": 6.004000000000001e-06,
"loss": 0.2516,
"step": 3500
},
{
"epoch": 0.8812176369056959,
"grad_norm": 1.578125,
"learning_rate": 5.964000000000001e-06,
"loss": 0.2678,
"step": 3510
},
{
"epoch": 0.8837282284638318,
"grad_norm": 1.65625,
"learning_rate": 5.924000000000001e-06,
"loss": 0.2628,
"step": 3520
},
{
"epoch": 0.8862388200219676,
"grad_norm": 1.5546875,
"learning_rate": 5.884000000000001e-06,
"loss": 0.2557,
"step": 3530
},
{
"epoch": 0.8887494115801036,
"grad_norm": 1.8828125,
"learning_rate": 5.844000000000001e-06,
"loss": 0.2663,
"step": 3540
},
{
"epoch": 0.8912600031382395,
"grad_norm": 2.125,
"learning_rate": 5.804000000000001e-06,
"loss": 0.2789,
"step": 3550
},
{
"epoch": 0.8937705946963753,
"grad_norm": 1.78125,
"learning_rate": 5.764000000000001e-06,
"loss": 0.2646,
"step": 3560
},
{
"epoch": 0.8962811862545113,
"grad_norm": 1.796875,
"learning_rate": 5.724000000000001e-06,
"loss": 0.2627,
"step": 3570
},
{
"epoch": 0.8987917778126471,
"grad_norm": 1.609375,
"learning_rate": 5.684000000000001e-06,
"loss": 0.281,
"step": 3580
},
{
"epoch": 0.901302369370783,
"grad_norm": 1.8515625,
"learning_rate": 5.6440000000000005e-06,
"loss": 0.2588,
"step": 3590
},
{
"epoch": 0.9038129609289188,
"grad_norm": 1.9140625,
"learning_rate": 5.604000000000001e-06,
"loss": 0.267,
"step": 3600
},
{
"epoch": 0.9063235524870548,
"grad_norm": 1.75,
"learning_rate": 5.5640000000000004e-06,
"loss": 0.2546,
"step": 3610
},
{
"epoch": 0.9088341440451907,
"grad_norm": 1.484375,
"learning_rate": 5.524000000000001e-06,
"loss": 0.2607,
"step": 3620
},
{
"epoch": 0.9113447356033265,
"grad_norm": 1.5703125,
"learning_rate": 5.484e-06,
"loss": 0.2804,
"step": 3630
},
{
"epoch": 0.9138553271614624,
"grad_norm": 1.6328125,
"learning_rate": 5.444000000000001e-06,
"loss": 0.2595,
"step": 3640
},
{
"epoch": 0.9163659187195983,
"grad_norm": 1.84375,
"learning_rate": 5.404e-06,
"loss": 0.2697,
"step": 3650
},
{
"epoch": 0.9188765102777342,
"grad_norm": 1.5703125,
"learning_rate": 5.364000000000001e-06,
"loss": 0.2725,
"step": 3660
},
{
"epoch": 0.92138710183587,
"grad_norm": 1.3984375,
"learning_rate": 5.324e-06,
"loss": 0.2613,
"step": 3670
},
{
"epoch": 0.923897693394006,
"grad_norm": 1.8046875,
"learning_rate": 5.2840000000000006e-06,
"loss": 0.2708,
"step": 3680
},
{
"epoch": 0.9264082849521419,
"grad_norm": 1.578125,
"learning_rate": 5.244e-06,
"loss": 0.2685,
"step": 3690
},
{
"epoch": 0.9289188765102777,
"grad_norm": 1.8828125,
"learning_rate": 5.2040000000000005e-06,
"loss": 0.272,
"step": 3700
},
{
"epoch": 0.9314294680684136,
"grad_norm": 1.6875,
"learning_rate": 5.164e-06,
"loss": 0.2801,
"step": 3710
},
{
"epoch": 0.9339400596265495,
"grad_norm": 1.6015625,
"learning_rate": 5.124e-06,
"loss": 0.257,
"step": 3720
},
{
"epoch": 0.9364506511846854,
"grad_norm": 1.5,
"learning_rate": 5.084e-06,
"loss": 0.2511,
"step": 3730
},
{
"epoch": 0.9389612427428212,
"grad_norm": 1.5390625,
"learning_rate": 5.044e-06,
"loss": 0.2645,
"step": 3740
},
{
"epoch": 0.9414718343009572,
"grad_norm": 1.6640625,
"learning_rate": 5.004e-06,
"loss": 0.2711,
"step": 3750
},
{
"epoch": 0.9439824258590931,
"grad_norm": 1.7421875,
"learning_rate": 4.964e-06,
"loss": 0.2547,
"step": 3760
},
{
"epoch": 0.9464930174172289,
"grad_norm": 1.7734375,
"learning_rate": 4.924000000000001e-06,
"loss": 0.2728,
"step": 3770
},
{
"epoch": 0.9490036089753648,
"grad_norm": 1.421875,
"learning_rate": 4.884e-06,
"loss": 0.2666,
"step": 3780
},
{
"epoch": 0.9515142005335007,
"grad_norm": 1.3515625,
"learning_rate": 4.8440000000000005e-06,
"loss": 0.2509,
"step": 3790
},
{
"epoch": 0.9540247920916366,
"grad_norm": 1.6640625,
"learning_rate": 4.804e-06,
"loss": 0.2643,
"step": 3800
},
{
"epoch": 0.9565353836497725,
"grad_norm": 1.5703125,
"learning_rate": 4.7640000000000005e-06,
"loss": 0.2568,
"step": 3810
},
{
"epoch": 0.9590459752079084,
"grad_norm": 1.6796875,
"learning_rate": 4.724e-06,
"loss": 0.2673,
"step": 3820
},
{
"epoch": 0.9615565667660443,
"grad_norm": 1.625,
"learning_rate": 4.684e-06,
"loss": 0.2622,
"step": 3830
},
{
"epoch": 0.9640671583241801,
"grad_norm": 1.5625,
"learning_rate": 4.644e-06,
"loss": 0.2656,
"step": 3840
},
{
"epoch": 0.966577749882316,
"grad_norm": 1.4609375,
"learning_rate": 4.604e-06,
"loss": 0.2581,
"step": 3850
},
{
"epoch": 0.9690883414404519,
"grad_norm": 1.6875,
"learning_rate": 4.564e-06,
"loss": 0.2529,
"step": 3860
},
{
"epoch": 0.9715989329985878,
"grad_norm": 1.8125,
"learning_rate": 4.524e-06,
"loss": 0.284,
"step": 3870
},
{
"epoch": 0.9741095245567237,
"grad_norm": 2.484375,
"learning_rate": 4.484000000000001e-06,
"loss": 0.2596,
"step": 3880
},
{
"epoch": 0.9766201161148595,
"grad_norm": 1.5390625,
"learning_rate": 4.444e-06,
"loss": 0.2759,
"step": 3890
},
{
"epoch": 0.9791307076729955,
"grad_norm": 1.46875,
"learning_rate": 4.4040000000000005e-06,
"loss": 0.2563,
"step": 3900
},
{
"epoch": 0.9816412992311313,
"grad_norm": 1.53125,
"learning_rate": 4.364e-06,
"loss": 0.25,
"step": 3910
},
{
"epoch": 0.9841518907892672,
"grad_norm": 1.875,
"learning_rate": 4.3240000000000004e-06,
"loss": 0.2747,
"step": 3920
},
{
"epoch": 0.9866624823474031,
"grad_norm": 1.6640625,
"learning_rate": 4.284e-06,
"loss": 0.2691,
"step": 3930
},
{
"epoch": 0.989173073905539,
"grad_norm": 1.53125,
"learning_rate": 4.244e-06,
"loss": 0.2431,
"step": 3940
},
{
"epoch": 0.9916836654636749,
"grad_norm": 1.46875,
"learning_rate": 4.204e-06,
"loss": 0.2422,
"step": 3950
},
{
"epoch": 0.9941942570218107,
"grad_norm": 1.984375,
"learning_rate": 4.164e-06,
"loss": 0.2554,
"step": 3960
},
{
"epoch": 0.9967048485799467,
"grad_norm": 1.671875,
"learning_rate": 4.124e-06,
"loss": 0.2659,
"step": 3970
},
{
"epoch": 0.9992154401380825,
"grad_norm": 1.875,
"learning_rate": 4.084e-06,
"loss": 0.2563,
"step": 3980
},
{
"epoch": 1.0015063549348815,
"grad_norm": 1.53125,
"learning_rate": 4.044e-06,
"loss": 0.2382,
"step": 3990
},
{
"epoch": 1.0040169464930173,
"grad_norm": 1.34375,
"learning_rate": 4.004e-06,
"loss": 0.2385,
"step": 4000
},
{
"epoch": 1.0065275380511534,
"grad_norm": 1.4921875,
"learning_rate": 3.964e-06,
"loss": 0.2354,
"step": 4010
},
{
"epoch": 1.0090381296092892,
"grad_norm": 1.578125,
"learning_rate": 3.924000000000001e-06,
"loss": 0.2446,
"step": 4020
},
{
"epoch": 1.011548721167425,
"grad_norm": 1.5703125,
"learning_rate": 3.884e-06,
"loss": 0.2355,
"step": 4030
},
{
"epoch": 1.014059312725561,
"grad_norm": 1.4609375,
"learning_rate": 3.844000000000001e-06,
"loss": 0.2358,
"step": 4040
},
{
"epoch": 1.0165699042836969,
"grad_norm": 1.6171875,
"learning_rate": 3.8040000000000003e-06,
"loss": 0.2387,
"step": 4050
},
{
"epoch": 1.0190804958418327,
"grad_norm": 1.6015625,
"learning_rate": 3.7640000000000003e-06,
"loss": 0.2342,
"step": 4060
},
{
"epoch": 1.0215910873999685,
"grad_norm": 1.5546875,
"learning_rate": 3.7240000000000003e-06,
"loss": 0.2381,
"step": 4070
},
{
"epoch": 1.0241016789581046,
"grad_norm": 1.765625,
"learning_rate": 3.6840000000000002e-06,
"loss": 0.2392,
"step": 4080
},
{
"epoch": 1.0266122705162404,
"grad_norm": 1.515625,
"learning_rate": 3.644e-06,
"loss": 0.2335,
"step": 4090
},
{
"epoch": 1.0291228620743762,
"grad_norm": 1.453125,
"learning_rate": 3.604e-06,
"loss": 0.2544,
"step": 4100
},
{
"epoch": 1.0316334536325122,
"grad_norm": 1.4296875,
"learning_rate": 3.564e-06,
"loss": 0.2254,
"step": 4110
},
{
"epoch": 1.034144045190648,
"grad_norm": 1.7734375,
"learning_rate": 3.524e-06,
"loss": 0.2443,
"step": 4120
},
{
"epoch": 1.0366546367487839,
"grad_norm": 1.6640625,
"learning_rate": 3.484e-06,
"loss": 0.2492,
"step": 4130
},
{
"epoch": 1.0391652283069197,
"grad_norm": 1.4296875,
"learning_rate": 3.444e-06,
"loss": 0.2285,
"step": 4140
},
{
"epoch": 1.0416758198650558,
"grad_norm": 1.7109375,
"learning_rate": 3.404e-06,
"loss": 0.2309,
"step": 4150
},
{
"epoch": 1.0441864114231916,
"grad_norm": 1.78125,
"learning_rate": 3.364e-06,
"loss": 0.2477,
"step": 4160
},
{
"epoch": 1.0466970029813274,
"grad_norm": 1.7265625,
"learning_rate": 3.324e-06,
"loss": 0.2395,
"step": 4170
},
{
"epoch": 1.0492075945394634,
"grad_norm": 1.5234375,
"learning_rate": 3.2840000000000007e-06,
"loss": 0.2266,
"step": 4180
},
{
"epoch": 1.0517181860975993,
"grad_norm": 2.046875,
"learning_rate": 3.2440000000000006e-06,
"loss": 0.2382,
"step": 4190
},
{
"epoch": 1.054228777655735,
"grad_norm": 1.8125,
"learning_rate": 3.2040000000000006e-06,
"loss": 0.2315,
"step": 4200
},
{
"epoch": 1.056739369213871,
"grad_norm": 1.7109375,
"learning_rate": 3.1640000000000005e-06,
"loss": 0.2399,
"step": 4210
},
{
"epoch": 1.059249960772007,
"grad_norm": 1.5703125,
"learning_rate": 3.1240000000000005e-06,
"loss": 0.2287,
"step": 4220
},
{
"epoch": 1.0617605523301428,
"grad_norm": 1.4453125,
"learning_rate": 3.0840000000000005e-06,
"loss": 0.2399,
"step": 4230
},
{
"epoch": 1.0642711438882786,
"grad_norm": 1.6171875,
"learning_rate": 3.0440000000000004e-06,
"loss": 0.2334,
"step": 4240
},
{
"epoch": 1.0667817354464146,
"grad_norm": 1.671875,
"learning_rate": 3.0040000000000004e-06,
"loss": 0.2324,
"step": 4250
},
{
"epoch": 1.0692923270045505,
"grad_norm": 1.5859375,
"learning_rate": 2.9640000000000003e-06,
"loss": 0.231,
"step": 4260
},
{
"epoch": 1.0718029185626863,
"grad_norm": 1.640625,
"learning_rate": 2.9240000000000003e-06,
"loss": 0.2465,
"step": 4270
},
{
"epoch": 1.074313510120822,
"grad_norm": 1.796875,
"learning_rate": 2.8840000000000003e-06,
"loss": 0.2465,
"step": 4280
},
{
"epoch": 1.0768241016789581,
"grad_norm": 1.4375,
"learning_rate": 2.8440000000000002e-06,
"loss": 0.2514,
"step": 4290
},
{
"epoch": 1.079334693237094,
"grad_norm": 1.71875,
"learning_rate": 2.804e-06,
"loss": 0.2318,
"step": 4300
},
{
"epoch": 1.0818452847952298,
"grad_norm": 1.5234375,
"learning_rate": 2.764e-06,
"loss": 0.2366,
"step": 4310
},
{
"epoch": 1.0843558763533658,
"grad_norm": 1.46875,
"learning_rate": 2.724e-06,
"loss": 0.2328,
"step": 4320
},
{
"epoch": 1.0868664679115017,
"grad_norm": 1.734375,
"learning_rate": 2.6840000000000005e-06,
"loss": 0.2566,
"step": 4330
},
{
"epoch": 1.0893770594696375,
"grad_norm": 1.4296875,
"learning_rate": 2.6440000000000004e-06,
"loss": 0.2243,
"step": 4340
},
{
"epoch": 1.0918876510277735,
"grad_norm": 1.703125,
"learning_rate": 2.6040000000000004e-06,
"loss": 0.2234,
"step": 4350
},
{
"epoch": 1.0943982425859093,
"grad_norm": 1.7421875,
"learning_rate": 2.5640000000000004e-06,
"loss": 0.2287,
"step": 4360
},
{
"epoch": 1.0969088341440452,
"grad_norm": 1.5703125,
"learning_rate": 2.5240000000000003e-06,
"loss": 0.2365,
"step": 4370
},
{
"epoch": 1.099419425702181,
"grad_norm": 1.4296875,
"learning_rate": 2.4840000000000003e-06,
"loss": 0.2299,
"step": 4380
},
{
"epoch": 1.101930017260317,
"grad_norm": 1.8359375,
"learning_rate": 2.4440000000000002e-06,
"loss": 0.24,
"step": 4390
},
{
"epoch": 1.1044406088184529,
"grad_norm": 1.5703125,
"learning_rate": 2.404e-06,
"loss": 0.2292,
"step": 4400
},
{
"epoch": 1.1069512003765887,
"grad_norm": 1.59375,
"learning_rate": 2.364e-06,
"loss": 0.2356,
"step": 4410
},
{
"epoch": 1.1094617919347247,
"grad_norm": 1.8671875,
"learning_rate": 2.324e-06,
"loss": 0.2395,
"step": 4420
},
{
"epoch": 1.1119723834928605,
"grad_norm": 1.7734375,
"learning_rate": 2.284e-06,
"loss": 0.2534,
"step": 4430
},
{
"epoch": 1.1144829750509964,
"grad_norm": 1.6328125,
"learning_rate": 2.244e-06,
"loss": 0.2294,
"step": 4440
},
{
"epoch": 1.1169935666091322,
"grad_norm": 1.6796875,
"learning_rate": 2.2040000000000004e-06,
"loss": 0.2428,
"step": 4450
},
{
"epoch": 1.1195041581672682,
"grad_norm": 1.5,
"learning_rate": 2.1640000000000004e-06,
"loss": 0.2486,
"step": 4460
},
{
"epoch": 1.122014749725404,
"grad_norm": 1.515625,
"learning_rate": 2.1240000000000003e-06,
"loss": 0.2428,
"step": 4470
},
{
"epoch": 1.1245253412835399,
"grad_norm": 1.65625,
"learning_rate": 2.0840000000000003e-06,
"loss": 0.2438,
"step": 4480
},
{
"epoch": 1.127035932841676,
"grad_norm": 1.796875,
"learning_rate": 2.0440000000000003e-06,
"loss": 0.2278,
"step": 4490
},
{
"epoch": 1.1295465243998117,
"grad_norm": 1.640625,
"learning_rate": 2.004e-06,
"loss": 0.2306,
"step": 4500
}
],
"logging_steps": 10,
"max_steps": 5000,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.194052832115884e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}