sort_trash_real_2 / trainer_state.json
theconstruct-ai's picture
Upload folder using huggingface_hub
d7a47a9 verified
Raw
History Blame Contribute Delete
42.2 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5,
"eval_steps": 500,
"global_step": 3000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"grad_norm": 0.18099799752235413,
"learning_rate": 3e-06,
"loss": 1.2088,
"step": 10
},
{
"grad_norm": 0.20619013905525208,
"learning_rate": 6.333333333333334e-06,
"loss": 1.1961,
"step": 20
},
{
"grad_norm": 0.14481662213802338,
"learning_rate": 9.666666666666667e-06,
"loss": 1.1631,
"step": 30
},
{
"grad_norm": 0.14480064809322357,
"learning_rate": 1.3000000000000001e-05,
"loss": 1.12,
"step": 40
},
{
"grad_norm": 0.1346646249294281,
"learning_rate": 1.6333333333333335e-05,
"loss": 1.0839,
"step": 50
},
{
"grad_norm": 0.1372521072626114,
"learning_rate": 1.9666666666666666e-05,
"loss": 1.0649,
"step": 60
},
{
"grad_norm": 0.2891208529472351,
"learning_rate": 2.3000000000000003e-05,
"loss": 1.0489,
"step": 70
},
{
"grad_norm": 0.2914903163909912,
"learning_rate": 2.633333333333333e-05,
"loss": 1.0182,
"step": 80
},
{
"grad_norm": 0.4698174297809601,
"learning_rate": 2.9666666666666672e-05,
"loss": 0.9654,
"step": 90
},
{
"grad_norm": 0.8523975014686584,
"learning_rate": 3.3e-05,
"loss": 0.9062,
"step": 100
},
{
"grad_norm": 0.5535483956336975,
"learning_rate": 3.633333333333333e-05,
"loss": 0.8529,
"step": 110
},
{
"grad_norm": 0.6099287271499634,
"learning_rate": 3.966666666666667e-05,
"loss": 0.8047,
"step": 120
},
{
"grad_norm": 0.6395930051803589,
"learning_rate": 4.3e-05,
"loss": 0.7568,
"step": 130
},
{
"grad_norm": 0.7526710033416748,
"learning_rate": 4.633333333333333e-05,
"loss": 0.7168,
"step": 140
},
{
"grad_norm": 0.7764474153518677,
"learning_rate": 4.966666666666667e-05,
"loss": 0.6804,
"step": 150
},
{
"grad_norm": 0.9331451654434204,
"learning_rate": 5.300000000000001e-05,
"loss": 0.6504,
"step": 160
},
{
"grad_norm": 0.702415943145752,
"learning_rate": 5.633333333333334e-05,
"loss": 0.625,
"step": 170
},
{
"grad_norm": 1.1818166971206665,
"learning_rate": 5.966666666666667e-05,
"loss": 0.6007,
"step": 180
},
{
"grad_norm": 1.1777819395065308,
"learning_rate": 6.3e-05,
"loss": 0.5718,
"step": 190
},
{
"grad_norm": 1.0473010540008545,
"learning_rate": 6.633333333333334e-05,
"loss": 0.5543,
"step": 200
},
{
"grad_norm": 0.873248815536499,
"learning_rate": 6.966666666666668e-05,
"loss": 0.5387,
"step": 210
},
{
"grad_norm": 0.8804641962051392,
"learning_rate": 7.3e-05,
"loss": 0.5221,
"step": 220
},
{
"grad_norm": 0.915578305721283,
"learning_rate": 7.633333333333334e-05,
"loss": 0.5033,
"step": 230
},
{
"grad_norm": 1.316872000694275,
"learning_rate": 7.966666666666666e-05,
"loss": 0.4825,
"step": 240
},
{
"grad_norm": 1.1146589517593384,
"learning_rate": 8.3e-05,
"loss": 0.4626,
"step": 250
},
{
"grad_norm": 1.5606942176818848,
"learning_rate": 8.633333333333334e-05,
"loss": 0.4351,
"step": 260
},
{
"grad_norm": 1.0714901685714722,
"learning_rate": 8.966666666666666e-05,
"loss": 0.4128,
"step": 270
},
{
"grad_norm": 1.3251256942749023,
"learning_rate": 9.300000000000001e-05,
"loss": 0.3907,
"step": 280
},
{
"grad_norm": 1.0007801055908203,
"learning_rate": 9.633333333333335e-05,
"loss": 0.3783,
"step": 290
},
{
"grad_norm": 1.3502074480056763,
"learning_rate": 9.966666666666667e-05,
"loss": 0.3597,
"step": 300
},
{
"grad_norm": 1.1962398290634155,
"learning_rate": 9.999938485971279e-05,
"loss": 0.3383,
"step": 310
},
{
"grad_norm": 1.634523630142212,
"learning_rate": 9.999725846827562e-05,
"loss": 0.3226,
"step": 320
},
{
"grad_norm": 1.4946134090423584,
"learning_rate": 9.999361329594254e-05,
"loss": 0.2976,
"step": 330
},
{
"grad_norm": 1.2499020099639893,
"learning_rate": 9.998844945344405e-05,
"loss": 0.2673,
"step": 340
},
{
"grad_norm": 1.481104850769043,
"learning_rate": 9.99817670976436e-05,
"loss": 0.2459,
"step": 350
},
{
"grad_norm": 1.554062008857727,
"learning_rate": 9.997356643153303e-05,
"loss": 0.2272,
"step": 360
},
{
"grad_norm": 1.8656765222549438,
"learning_rate": 9.996384770422629e-05,
"loss": 0.2146,
"step": 370
},
{
"grad_norm": 1.0278624296188354,
"learning_rate": 9.995261121095194e-05,
"loss": 0.1967,
"step": 380
},
{
"grad_norm": 1.3932757377624512,
"learning_rate": 9.993985729304408e-05,
"loss": 0.1756,
"step": 390
},
{
"grad_norm": 1.618746042251587,
"learning_rate": 9.992558633793212e-05,
"loss": 0.1501,
"step": 400
},
{
"grad_norm": 1.1260430812835693,
"learning_rate": 9.990979877912891e-05,
"loss": 0.1335,
"step": 410
},
{
"grad_norm": 1.0491538047790527,
"learning_rate": 9.989249509621759e-05,
"loss": 0.1259,
"step": 420
},
{
"grad_norm": 1.284968614578247,
"learning_rate": 9.987367581483705e-05,
"loss": 0.122,
"step": 430
},
{
"grad_norm": 1.0914318561553955,
"learning_rate": 9.985334150666592e-05,
"loss": 0.1172,
"step": 440
},
{
"grad_norm": 1.1247401237487793,
"learning_rate": 9.983149278940526e-05,
"loss": 0.1097,
"step": 450
},
{
"grad_norm": 1.192862868309021,
"learning_rate": 9.980813032675974e-05,
"loss": 0.1,
"step": 460
},
{
"grad_norm": 0.8659683465957642,
"learning_rate": 9.978325482841753e-05,
"loss": 0.1027,
"step": 470
},
{
"grad_norm": 1.0352890491485596,
"learning_rate": 9.975686705002867e-05,
"loss": 0.1003,
"step": 480
},
{
"grad_norm": 1.2678529024124146,
"learning_rate": 9.972896779318219e-05,
"loss": 0.0971,
"step": 490
},
{
"grad_norm": 1.1098188161849976,
"learning_rate": 9.969955790538175e-05,
"loss": 0.0992,
"step": 500
},
{
"grad_norm": 1.181512713432312,
"learning_rate": 9.966863828001982e-05,
"loss": 0.0965,
"step": 510
},
{
"grad_norm": 1.0185215473175049,
"learning_rate": 9.963620985635065e-05,
"loss": 0.0877,
"step": 520
},
{
"grad_norm": 1.1182595491409302,
"learning_rate": 9.960227361946164e-05,
"loss": 0.0843,
"step": 530
},
{
"grad_norm": 0.9148073196411133,
"learning_rate": 9.95668306002435e-05,
"loss": 0.0923,
"step": 540
},
{
"grad_norm": 0.9969584941864014,
"learning_rate": 9.952988187535886e-05,
"loss": 0.0849,
"step": 550
},
{
"grad_norm": 1.2698750495910645,
"learning_rate": 9.949142856720961e-05,
"loss": 0.0835,
"step": 560
},
{
"grad_norm": 1.0291186571121216,
"learning_rate": 9.945147184390278e-05,
"loss": 0.0878,
"step": 570
},
{
"grad_norm": 0.9522308111190796,
"learning_rate": 9.941001291921512e-05,
"loss": 0.0823,
"step": 580
},
{
"grad_norm": 1.0067387819290161,
"learning_rate": 9.936705305255612e-05,
"loss": 0.084,
"step": 590
},
{
"grad_norm": 1.1268168687820435,
"learning_rate": 9.932259354892984e-05,
"loss": 0.0824,
"step": 600
},
{
"grad_norm": 0.8802523016929626,
"learning_rate": 9.927663575889521e-05,
"loss": 0.0792,
"step": 610
},
{
"grad_norm": 1.3417941331863403,
"learning_rate": 9.922918107852504e-05,
"loss": 0.0811,
"step": 620
},
{
"grad_norm": 1.097968578338623,
"learning_rate": 9.918023094936363e-05,
"loss": 0.077,
"step": 630
},
{
"grad_norm": 1.0577588081359863,
"learning_rate": 9.912978685838294e-05,
"loss": 0.0802,
"step": 640
},
{
"grad_norm": 1.1714197397232056,
"learning_rate": 9.90778503379374e-05,
"loss": 0.078,
"step": 650
},
{
"grad_norm": 0.8812937140464783,
"learning_rate": 9.902442296571743e-05,
"loss": 0.0708,
"step": 660
},
{
"grad_norm": 0.9380112886428833,
"learning_rate": 9.896950636470147e-05,
"loss": 0.0803,
"step": 670
},
{
"grad_norm": 1.1852452754974365,
"learning_rate": 9.891310220310666e-05,
"loss": 0.0757,
"step": 680
},
{
"grad_norm": 1.0475136041641235,
"learning_rate": 9.885521219433823e-05,
"loss": 0.0727,
"step": 690
},
{
"grad_norm": 1.1130269765853882,
"learning_rate": 9.879583809693738e-05,
"loss": 0.0711,
"step": 700
},
{
"grad_norm": 0.9928076863288879,
"learning_rate": 9.873498171452789e-05,
"loss": 0.0703,
"step": 710
},
{
"grad_norm": 0.979897141456604,
"learning_rate": 9.867264489576135e-05,
"loss": 0.0687,
"step": 720
},
{
"grad_norm": 1.0989563465118408,
"learning_rate": 9.860882953426099e-05,
"loss": 0.0747,
"step": 730
},
{
"grad_norm": 1.0301982164382935,
"learning_rate": 9.854353756856412e-05,
"loss": 0.0699,
"step": 740
},
{
"grad_norm": 1.101908802986145,
"learning_rate": 9.847677098206332e-05,
"loss": 0.069,
"step": 750
},
{
"grad_norm": 0.8733094334602356,
"learning_rate": 9.840853180294608e-05,
"loss": 0.0672,
"step": 760
},
{
"grad_norm": 1.0546811819076538,
"learning_rate": 9.833882210413332e-05,
"loss": 0.0706,
"step": 770
},
{
"grad_norm": 0.8678887486457825,
"learning_rate": 9.826764400321633e-05,
"loss": 0.0702,
"step": 780
},
{
"grad_norm": 0.8769698739051819,
"learning_rate": 9.819499966239243e-05,
"loss": 0.0678,
"step": 790
},
{
"grad_norm": 1.1157478094100952,
"learning_rate": 9.812089128839938e-05,
"loss": 0.0693,
"step": 800
},
{
"grad_norm": 1.0164200067520142,
"learning_rate": 9.804532113244828e-05,
"loss": 0.0624,
"step": 810
},
{
"grad_norm": 0.915485680103302,
"learning_rate": 9.796829149015517e-05,
"loss": 0.0647,
"step": 820
},
{
"grad_norm": 0.8830865025520325,
"learning_rate": 9.788980470147132e-05,
"loss": 0.0613,
"step": 830
},
{
"grad_norm": 1.0174789428710938,
"learning_rate": 9.780986315061218e-05,
"loss": 0.0641,
"step": 840
},
{
"grad_norm": 0.7468952536582947,
"learning_rate": 9.772846926598491e-05,
"loss": 0.0716,
"step": 850
},
{
"grad_norm": 0.7474204301834106,
"learning_rate": 9.76456255201146e-05,
"loss": 0.0636,
"step": 860
},
{
"grad_norm": 1.0349617004394531,
"learning_rate": 9.756133442956923e-05,
"loss": 0.0612,
"step": 870
},
{
"grad_norm": 0.8907390236854553,
"learning_rate": 9.747559855488313e-05,
"loss": 0.0656,
"step": 880
},
{
"grad_norm": 0.8625577092170715,
"learning_rate": 9.73884205004793e-05,
"loss": 0.0637,
"step": 890
},
{
"grad_norm": 0.895968496799469,
"learning_rate": 9.729980291459019e-05,
"loss": 0.0635,
"step": 900
},
{
"grad_norm": 0.9742909073829651,
"learning_rate": 9.720974848917735e-05,
"loss": 0.0596,
"step": 910
},
{
"grad_norm": 0.7080522775650024,
"learning_rate": 9.711825995984957e-05,
"loss": 0.0604,
"step": 920
},
{
"grad_norm": 0.7485001087188721,
"learning_rate": 9.702534010577991e-05,
"loss": 0.0627,
"step": 930
},
{
"grad_norm": 0.8010299801826477,
"learning_rate": 9.693099174962103e-05,
"loss": 0.0584,
"step": 940
},
{
"grad_norm": 0.8207157254219055,
"learning_rate": 9.683521775741977e-05,
"loss": 0.0606,
"step": 950
},
{
"grad_norm": 0.7718232870101929,
"learning_rate": 9.673802103852979e-05,
"loss": 0.0586,
"step": 960
},
{
"grad_norm": 1.0081161260604858,
"learning_rate": 9.663940454552342e-05,
"loss": 0.0595,
"step": 970
},
{
"grad_norm": 0.8325558304786682,
"learning_rate": 9.65393712741018e-05,
"loss": 0.0581,
"step": 980
},
{
"grad_norm": 0.9128422737121582,
"learning_rate": 9.6437924263004e-05,
"loss": 0.0613,
"step": 990
},
{
"grad_norm": 0.8013613224029541,
"learning_rate": 9.63350665939146e-05,
"loss": 0.059,
"step": 1000
},
{
"grad_norm": 0.8024020791053772,
"learning_rate": 9.623080139137023e-05,
"loss": 0.0585,
"step": 1010
},
{
"grad_norm": 0.8608654737472534,
"learning_rate": 9.612513182266447e-05,
"loss": 0.0573,
"step": 1020
},
{
"grad_norm": 0.7992358803749084,
"learning_rate": 9.601806109775179e-05,
"loss": 0.0588,
"step": 1030
},
{
"grad_norm": 0.9951710104942322,
"learning_rate": 9.590959246914995e-05,
"loss": 0.0549,
"step": 1040
},
{
"grad_norm": 0.7153400778770447,
"learning_rate": 9.579972923184122e-05,
"loss": 0.0581,
"step": 1050
},
{
"grad_norm": 0.8274824023246765,
"learning_rate": 9.568847472317232e-05,
"loss": 0.0528,
"step": 1060
},
{
"grad_norm": 0.6790134906768799,
"learning_rate": 9.557583232275303e-05,
"loss": 0.0554,
"step": 1070
},
{
"grad_norm": 0.7177821397781372,
"learning_rate": 9.546180545235344e-05,
"loss": 0.0525,
"step": 1080
},
{
"grad_norm": 0.8989811539649963,
"learning_rate": 9.534639757580013e-05,
"loss": 0.0515,
"step": 1090
},
{
"grad_norm": 0.8031622767448425,
"learning_rate": 9.522961219887092e-05,
"loss": 0.0564,
"step": 1100
},
{
"grad_norm": 0.8315763473510742,
"learning_rate": 9.511145286918828e-05,
"loss": 0.0567,
"step": 1110
},
{
"grad_norm": 0.7631978988647461,
"learning_rate": 9.499192317611167e-05,
"loss": 0.0524,
"step": 1120
},
{
"grad_norm": 0.8047354817390442,
"learning_rate": 9.487102675062851e-05,
"loss": 0.0563,
"step": 1130
},
{
"grad_norm": 0.5823233127593994,
"learning_rate": 9.474876726524374e-05,
"loss": 0.0507,
"step": 1140
},
{
"grad_norm": 0.7840980887413025,
"learning_rate": 9.462514843386845e-05,
"loss": 0.0522,
"step": 1150
},
{
"grad_norm": 0.7950931191444397,
"learning_rate": 9.450017401170689e-05,
"loss": 0.0544,
"step": 1160
},
{
"grad_norm": 0.7812637090682983,
"learning_rate": 9.437384779514256e-05,
"loss": 0.0538,
"step": 1170
},
{
"grad_norm": 0.8743076920509338,
"learning_rate": 9.424617362162271e-05,
"loss": 0.0551,
"step": 1180
},
{
"grad_norm": 0.7778111100196838,
"learning_rate": 9.411715536954196e-05,
"loss": 0.0515,
"step": 1190
},
{
"grad_norm": 0.799373209476471,
"learning_rate": 9.39867969581243e-05,
"loss": 0.0514,
"step": 1200
},
{
"grad_norm": 0.7291685342788696,
"learning_rate": 9.385510234730415e-05,
"loss": 0.0524,
"step": 1210
},
{
"grad_norm": 0.830124020576477,
"learning_rate": 9.372207553760603e-05,
"loss": 0.0506,
"step": 1220
},
{
"grad_norm": 0.6252336502075195,
"learning_rate": 9.358772057002312e-05,
"loss": 0.0502,
"step": 1230
},
{
"grad_norm": 0.7802227735519409,
"learning_rate": 9.345204152589428e-05,
"loss": 0.0505,
"step": 1240
},
{
"grad_norm": 0.7094554901123047,
"learning_rate": 9.331504252678037e-05,
"loss": 0.0537,
"step": 1250
},
{
"grad_norm": 0.7272769808769226,
"learning_rate": 9.317672773433876e-05,
"loss": 0.0506,
"step": 1260
},
{
"grad_norm": 0.6488326191902161,
"learning_rate": 9.30371013501972e-05,
"loss": 0.0484,
"step": 1270
},
{
"grad_norm": 0.6355553865432739,
"learning_rate": 9.289616761582587e-05,
"loss": 0.0544,
"step": 1280
},
{
"grad_norm": 0.769917905330658,
"learning_rate": 9.275393081240882e-05,
"loss": 0.048,
"step": 1290
},
{
"grad_norm": 0.6785501837730408,
"learning_rate": 9.261039526071374e-05,
"loss": 0.0484,
"step": 1300
},
{
"grad_norm": 0.7995139360427856,
"learning_rate": 9.246556532096078e-05,
"loss": 0.05,
"step": 1310
},
{
"grad_norm": 0.7101492285728455,
"learning_rate": 9.231944539269009e-05,
"loss": 0.0499,
"step": 1320
},
{
"grad_norm": 0.6292925477027893,
"learning_rate": 9.217203991462815e-05,
"loss": 0.0499,
"step": 1330
},
{
"grad_norm": 0.6308528780937195,
"learning_rate": 9.202335336455296e-05,
"loss": 0.049,
"step": 1340
},
{
"grad_norm": 0.6749176979064941,
"learning_rate": 9.187339025915802e-05,
"loss": 0.0488,
"step": 1350
},
{
"grad_norm": 0.5973607897758484,
"learning_rate": 9.17221551539151e-05,
"loss": 0.0475,
"step": 1360
},
{
"grad_norm": 0.6545643210411072,
"learning_rate": 9.156965264293586e-05,
"loss": 0.0476,
"step": 1370
},
{
"grad_norm": 0.6092913746833801,
"learning_rate": 9.141588735883232e-05,
"loss": 0.0433,
"step": 1380
},
{
"grad_norm": 0.5947241187095642,
"learning_rate": 9.126086397257612e-05,
"loss": 0.0471,
"step": 1390
},
{
"grad_norm": 0.5612359046936035,
"learning_rate": 9.110458719335659e-05,
"loss": 0.0463,
"step": 1400
},
{
"grad_norm": 0.654656708240509,
"learning_rate": 9.094706176843777e-05,
"loss": 0.0486,
"step": 1410
},
{
"grad_norm": 0.7321748733520508,
"learning_rate": 9.078829248301417e-05,
"loss": 0.0451,
"step": 1420
},
{
"grad_norm": 0.7481226325035095,
"learning_rate": 9.062828416006539e-05,
"loss": 0.0503,
"step": 1430
},
{
"grad_norm": 0.6706563234329224,
"learning_rate": 9.046704166020961e-05,
"loss": 0.0472,
"step": 1440
},
{
"grad_norm": 0.6942538619041443,
"learning_rate": 9.030456988155596e-05,
"loss": 0.0462,
"step": 1450
},
{
"grad_norm": 0.65287184715271,
"learning_rate": 9.014087375955573e-05,
"loss": 0.0469,
"step": 1460
},
{
"grad_norm": 0.7019280195236206,
"learning_rate": 8.997595826685243e-05,
"loss": 0.0514,
"step": 1470
},
{
"grad_norm": 0.6150776147842407,
"learning_rate": 8.980982841313074e-05,
"loss": 0.0466,
"step": 1480
},
{
"grad_norm": 0.784782350063324,
"learning_rate": 8.964248924496435e-05,
"loss": 0.0434,
"step": 1490
},
{
"grad_norm": 0.6784024834632874,
"learning_rate": 8.947394584566258e-05,
"loss": 0.0438,
"step": 1500
},
{
"grad_norm": 0.5981051921844482,
"learning_rate": 8.930420333511606e-05,
"loss": 0.0427,
"step": 1510
},
{
"grad_norm": 0.7331579923629761,
"learning_rate": 8.913326686964117e-05,
"loss": 0.0432,
"step": 1520
},
{
"grad_norm": 0.6730307936668396,
"learning_rate": 8.89611416418234e-05,
"loss": 0.0424,
"step": 1530
},
{
"grad_norm": 0.5771387219429016,
"learning_rate": 8.878783288035957e-05,
"loss": 0.0432,
"step": 1540
},
{
"grad_norm": 0.7928068041801453,
"learning_rate": 8.86133458498991e-05,
"loss": 0.0475,
"step": 1550
},
{
"grad_norm": 0.6628245115280151,
"learning_rate": 8.843768585088393e-05,
"loss": 0.0432,
"step": 1560
},
{
"grad_norm": 0.7262830138206482,
"learning_rate": 8.82608582193877e-05,
"loss": 0.0451,
"step": 1570
},
{
"grad_norm": 0.6896581649780273,
"learning_rate": 8.80828683269535e-05,
"loss": 0.0429,
"step": 1580
},
{
"grad_norm": 0.6019271016120911,
"learning_rate": 8.790372158043074e-05,
"loss": 0.0416,
"step": 1590
},
{
"grad_norm": 0.6586809754371643,
"learning_rate": 8.772342342181095e-05,
"loss": 0.0435,
"step": 1600
},
{
"grad_norm": 0.741075336933136,
"learning_rate": 8.75419793280624e-05,
"loss": 0.0428,
"step": 1610
},
{
"grad_norm": 0.7138071656227112,
"learning_rate": 8.735939481096378e-05,
"loss": 0.0415,
"step": 1620
},
{
"grad_norm": 0.665623128414154,
"learning_rate": 8.717567541693673e-05,
"loss": 0.0437,
"step": 1630
},
{
"grad_norm": 0.6723113059997559,
"learning_rate": 8.699082672687734e-05,
"loss": 0.0442,
"step": 1640
},
{
"grad_norm": 0.5757609605789185,
"learning_rate": 8.680485435598673e-05,
"loss": 0.0473,
"step": 1650
},
{
"grad_norm": 0.646248459815979,
"learning_rate": 8.661776395360029e-05,
"loss": 0.0443,
"step": 1660
},
{
"grad_norm": 0.7440095543861389,
"learning_rate": 8.642956120301626e-05,
"loss": 0.0414,
"step": 1670
},
{
"grad_norm": 0.6682982444763184,
"learning_rate": 8.624025182132292e-05,
"loss": 0.042,
"step": 1680
},
{
"grad_norm": 0.6209063529968262,
"learning_rate": 8.604984155922506e-05,
"loss": 0.0422,
"step": 1690
},
{
"grad_norm": 0.6250181198120117,
"learning_rate": 8.585833620086918e-05,
"loss": 0.042,
"step": 1700
},
{
"grad_norm": 0.709252655506134,
"learning_rate": 8.566574156366784e-05,
"loss": 0.0369,
"step": 1710
},
{
"grad_norm": 0.783593475818634,
"learning_rate": 8.547206349812298e-05,
"loss": 0.0445,
"step": 1720
},
{
"grad_norm": 0.5931394100189209,
"learning_rate": 8.527730788764805e-05,
"loss": 0.0449,
"step": 1730
},
{
"grad_norm": 0.5985734462738037,
"learning_rate": 8.508148064838948e-05,
"loss": 0.0412,
"step": 1740
},
{
"grad_norm": 0.528599739074707,
"learning_rate": 8.488458772904684e-05,
"loss": 0.0398,
"step": 1750
},
{
"grad_norm": 0.6593722701072693,
"learning_rate": 8.468663511069217e-05,
"loss": 0.0408,
"step": 1760
},
{
"grad_norm": 0.5931499600410461,
"learning_rate": 8.448762880658825e-05,
"loss": 0.0414,
"step": 1770
},
{
"grad_norm": 0.5673992037773132,
"learning_rate": 8.428757486200603e-05,
"loss": 0.041,
"step": 1780
},
{
"grad_norm": 0.7802947759628296,
"learning_rate": 8.40864793540409e-05,
"loss": 0.0421,
"step": 1790
},
{
"grad_norm": 0.5950642228126526,
"learning_rate": 8.388434839142813e-05,
"loss": 0.0424,
"step": 1800
},
{
"grad_norm": 0.6841787099838257,
"learning_rate": 8.368118811435726e-05,
"loss": 0.0391,
"step": 1810
},
{
"grad_norm": 0.5789716839790344,
"learning_rate": 8.347700469428564e-05,
"loss": 0.0386,
"step": 1820
},
{
"grad_norm": 0.6306881904602051,
"learning_rate": 8.327180433375091e-05,
"loss": 0.0404,
"step": 1830
},
{
"grad_norm": 0.5804703831672668,
"learning_rate": 8.306559326618259e-05,
"loss": 0.0392,
"step": 1840
},
{
"grad_norm": 0.6453599333763123,
"learning_rate": 8.285837775571276e-05,
"loss": 0.0398,
"step": 1850
},
{
"grad_norm": 0.5413265228271484,
"learning_rate": 8.265016409698573e-05,
"loss": 0.0389,
"step": 1860
},
{
"grad_norm": 0.5259561538696289,
"learning_rate": 8.244095861496686e-05,
"loss": 0.0389,
"step": 1870
},
{
"grad_norm": 0.6392974853515625,
"learning_rate": 8.223076766475035e-05,
"loss": 0.0404,
"step": 1880
},
{
"grad_norm": 0.7087792754173279,
"learning_rate": 8.201959763136633e-05,
"loss": 0.0388,
"step": 1890
},
{
"grad_norm": 0.7540794610977173,
"learning_rate": 8.180745492958674e-05,
"loss": 0.0419,
"step": 1900
},
{
"grad_norm": 0.5628899335861206,
"learning_rate": 8.159434600373061e-05,
"loss": 0.0375,
"step": 1910
},
{
"grad_norm": 0.5828471779823303,
"learning_rate": 8.138027732746818e-05,
"loss": 0.0394,
"step": 1920
},
{
"grad_norm": 0.6918069124221802,
"learning_rate": 8.116525540362434e-05,
"loss": 0.0377,
"step": 1930
},
{
"grad_norm": 0.5691211819648743,
"learning_rate": 8.094928676398101e-05,
"loss": 0.0389,
"step": 1940
},
{
"grad_norm": 0.5968996286392212,
"learning_rate": 8.073237796907882e-05,
"loss": 0.0361,
"step": 1950
},
{
"grad_norm": 0.5921427011489868,
"learning_rate": 8.051453560801772e-05,
"loss": 0.0433,
"step": 1960
},
{
"grad_norm": 0.5701543688774109,
"learning_rate": 8.029576629825687e-05,
"loss": 0.0368,
"step": 1970
},
{
"grad_norm": 0.6130271553993225,
"learning_rate": 8.007607668541362e-05,
"loss": 0.0395,
"step": 1980
},
{
"grad_norm": 0.6060221195220947,
"learning_rate": 7.985547344306161e-05,
"loss": 0.0438,
"step": 1990
},
{
"grad_norm": 0.709045946598053,
"learning_rate": 7.963396327252812e-05,
"loss": 0.0414,
"step": 2000
},
{
"grad_norm": 0.6804901361465454,
"learning_rate": 7.941155290269038e-05,
"loss": 0.0394,
"step": 2010
},
{
"grad_norm": 0.5408011078834534,
"learning_rate": 7.918824908977123e-05,
"loss": 0.0367,
"step": 2020
},
{
"grad_norm": 0.554338812828064,
"learning_rate": 7.896405861713394e-05,
"loss": 0.0356,
"step": 2030
},
{
"grad_norm": 0.711392879486084,
"learning_rate": 7.873898829507606e-05,
"loss": 0.0371,
"step": 2040
},
{
"grad_norm": 0.6779384613037109,
"learning_rate": 7.851304496062254e-05,
"loss": 0.038,
"step": 2050
},
{
"grad_norm": 0.6775013208389282,
"learning_rate": 7.828623547731818e-05,
"loss": 0.038,
"step": 2060
},
{
"grad_norm": 0.5738393664360046,
"learning_rate": 7.80585667350189e-05,
"loss": 0.0388,
"step": 2070
},
{
"grad_norm": 0.5050686001777649,
"learning_rate": 7.783004564968263e-05,
"loss": 0.0381,
"step": 2080
},
{
"grad_norm": 0.6223453283309937,
"learning_rate": 7.760067916315921e-05,
"loss": 0.0382,
"step": 2090
},
{
"grad_norm": 0.6240858435630798,
"learning_rate": 7.737047424297941e-05,
"loss": 0.0345,
"step": 2100
},
{
"grad_norm": 0.5866036415100098,
"learning_rate": 7.713943788214337e-05,
"loss": 0.0341,
"step": 2110
},
{
"grad_norm": 0.6695197224617004,
"learning_rate": 7.690757709890812e-05,
"loss": 0.0354,
"step": 2120
},
{
"grad_norm": 0.5520651340484619,
"learning_rate": 7.66748989365744e-05,
"loss": 0.0366,
"step": 2130
},
{
"grad_norm": 0.5425397157669067,
"learning_rate": 7.644141046327271e-05,
"loss": 0.0339,
"step": 2140
},
{
"grad_norm": 0.5396847128868103,
"learning_rate": 7.620711877174866e-05,
"loss": 0.037,
"step": 2150
},
{
"grad_norm": 0.633583128452301,
"learning_rate": 7.597203097914732e-05,
"loss": 0.0358,
"step": 2160
},
{
"grad_norm": 0.5030661821365356,
"learning_rate": 7.573615422679726e-05,
"loss": 0.0372,
"step": 2170
},
{
"grad_norm": 0.7198052406311035,
"learning_rate": 7.549949567999345e-05,
"loss": 0.0344,
"step": 2180
},
{
"grad_norm": 0.5248534679412842,
"learning_rate": 7.526206252777968e-05,
"loss": 0.0382,
"step": 2190
},
{
"grad_norm": 0.6668030619621277,
"learning_rate": 7.50238619827301e-05,
"loss": 0.0375,
"step": 2200
},
{
"grad_norm": 0.6512902975082397,
"learning_rate": 7.478490128073022e-05,
"loss": 0.0365,
"step": 2210
},
{
"grad_norm": 0.5244461894035339,
"learning_rate": 7.454518768075704e-05,
"loss": 0.0369,
"step": 2220
},
{
"grad_norm": 0.5693942308425903,
"learning_rate": 7.430472846465856e-05,
"loss": 0.0344,
"step": 2230
},
{
"grad_norm": 0.6084948182106018,
"learning_rate": 7.406353093693253e-05,
"loss": 0.035,
"step": 2240
},
{
"grad_norm": 0.536939263343811,
"learning_rate": 7.382160242450469e-05,
"loss": 0.0356,
"step": 2250
},
{
"grad_norm": 0.5331522226333618,
"learning_rate": 7.357895027650598e-05,
"loss": 0.031,
"step": 2260
},
{
"grad_norm": 0.45928630232810974,
"learning_rate": 7.333558186404958e-05,
"loss": 0.0327,
"step": 2270
},
{
"grad_norm": 0.528089165687561,
"learning_rate": 7.309150458000668e-05,
"loss": 0.0359,
"step": 2280
},
{
"grad_norm": 0.581315279006958,
"learning_rate": 7.284672583878219e-05,
"loss": 0.0343,
"step": 2290
},
{
"grad_norm": 0.558525800704956,
"learning_rate": 7.260125307608929e-05,
"loss": 0.0367,
"step": 2300
},
{
"grad_norm": 0.47152066230773926,
"learning_rate": 7.235509374872373e-05,
"loss": 0.035,
"step": 2310
},
{
"grad_norm": 0.6010111570358276,
"learning_rate": 7.210825533433719e-05,
"loss": 0.0335,
"step": 2320
},
{
"grad_norm": 0.6121062636375427,
"learning_rate": 7.186074533121013e-05,
"loss": 0.0336,
"step": 2330
},
{
"grad_norm": 0.5350408554077148,
"learning_rate": 7.161257125802413e-05,
"loss": 0.0353,
"step": 2340
},
{
"grad_norm": 0.5239009857177734,
"learning_rate": 7.136374065363334e-05,
"loss": 0.037,
"step": 2350
},
{
"grad_norm": 0.4956763982772827,
"learning_rate": 7.11142610768356e-05,
"loss": 0.0354,
"step": 2360
},
{
"grad_norm": 0.5018975138664246,
"learning_rate": 7.086414010614276e-05,
"loss": 0.0338,
"step": 2370
},
{
"grad_norm": 0.4875252842903137,
"learning_rate": 7.061338533955043e-05,
"loss": 0.0362,
"step": 2380
},
{
"grad_norm": 0.47811827063560486,
"learning_rate": 7.036200439430725e-05,
"loss": 0.0376,
"step": 2390
},
{
"grad_norm": 0.5614078044891357,
"learning_rate": 7.01100049066835e-05,
"loss": 0.0339,
"step": 2400
},
{
"grad_norm": 0.6021232008934021,
"learning_rate": 6.985739453173903e-05,
"loss": 0.0372,
"step": 2410
},
{
"grad_norm": 0.5856548547744751,
"learning_rate": 6.960418094309085e-05,
"loss": 0.0353,
"step": 2420
},
{
"grad_norm": 0.46249493956565857,
"learning_rate": 6.93503718326799e-05,
"loss": 0.0334,
"step": 2430
},
{
"grad_norm": 0.5227417945861816,
"learning_rate": 6.909597491053751e-05,
"loss": 0.0342,
"step": 2440
},
{
"grad_norm": 0.607357382774353,
"learning_rate": 6.884099790455113e-05,
"loss": 0.0324,
"step": 2450
},
{
"grad_norm": 0.485953152179718,
"learning_rate": 6.858544856022952e-05,
"loss": 0.0348,
"step": 2460
},
{
"grad_norm": 0.571148157119751,
"learning_rate": 6.83293346404676e-05,
"loss": 0.0347,
"step": 2470
},
{
"grad_norm": 0.5217222571372986,
"learning_rate": 6.80726639253105e-05,
"loss": 0.0368,
"step": 2480
},
{
"grad_norm": 0.4487457573413849,
"learning_rate": 6.781544421171732e-05,
"loss": 0.0355,
"step": 2490
},
{
"grad_norm": 0.47729650139808655,
"learning_rate": 6.755768331332424e-05,
"loss": 0.0343,
"step": 2500
},
{
"grad_norm": 0.4894144535064697,
"learning_rate": 6.729938906020713e-05,
"loss": 0.0353,
"step": 2510
},
{
"grad_norm": 0.544179379940033,
"learning_rate": 6.704056929864376e-05,
"loss": 0.0331,
"step": 2520
},
{
"grad_norm": 0.6115988492965698,
"learning_rate": 6.67812318908754e-05,
"loss": 0.0326,
"step": 2530
},
{
"grad_norm": 0.5752000212669373,
"learning_rate": 6.6521384714868e-05,
"loss": 0.0312,
"step": 2540
},
{
"grad_norm": 0.47528618574142456,
"learning_rate": 6.626103566407295e-05,
"loss": 0.0331,
"step": 2550
},
{
"grad_norm": 0.5542522072792053,
"learning_rate": 6.600019264718713e-05,
"loss": 0.0327,
"step": 2560
},
{
"grad_norm": 0.5280784368515015,
"learning_rate": 6.573886358791285e-05,
"loss": 0.0347,
"step": 2570
},
{
"grad_norm": 0.5374977588653564,
"learning_rate": 6.547705642471703e-05,
"loss": 0.0331,
"step": 2580
},
{
"grad_norm": 0.3995784521102905,
"learning_rate": 6.521477911059008e-05,
"loss": 0.0287,
"step": 2590
},
{
"grad_norm": 0.43667104840278625,
"learning_rate": 6.495203961280434e-05,
"loss": 0.0327,
"step": 2600
},
{
"grad_norm": 0.5405910611152649,
"learning_rate": 6.468884591267204e-05,
"loss": 0.0325,
"step": 2610
},
{
"grad_norm": 0.46785178780555725,
"learning_rate": 6.44252060053028e-05,
"loss": 0.0318,
"step": 2620
},
{
"grad_norm": 0.45796945691108704,
"learning_rate": 6.416112789936086e-05,
"loss": 0.0331,
"step": 2630
},
{
"grad_norm": 0.4898403286933899,
"learning_rate": 6.389661961682173e-05,
"loss": 0.0317,
"step": 2640
},
{
"grad_norm": 0.5258901119232178,
"learning_rate": 6.363168919272846e-05,
"loss": 0.0317,
"step": 2650
},
{
"grad_norm": 0.492632120847702,
"learning_rate": 6.336634467494768e-05,
"loss": 0.0306,
"step": 2660
},
{
"grad_norm": 0.5009192824363708,
"learning_rate": 6.310059412392505e-05,
"loss": 0.0304,
"step": 2670
},
{
"grad_norm": 0.6297652721405029,
"learning_rate": 6.283444561244042e-05,
"loss": 0.0304,
"step": 2680
},
{
"grad_norm": 0.4868377149105072,
"learning_rate": 6.256790722536251e-05,
"loss": 0.0313,
"step": 2690
},
{
"grad_norm": 0.5541006922721863,
"learning_rate": 6.230098705940354e-05,
"loss": 0.0316,
"step": 2700
},
{
"grad_norm": 0.42766621708869934,
"learning_rate": 6.203369322287306e-05,
"loss": 0.0327,
"step": 2710
},
{
"grad_norm": 0.5170658826828003,
"learning_rate": 6.17660338354317e-05,
"loss": 0.0293,
"step": 2720
},
{
"grad_norm": 0.4898792505264282,
"learning_rate": 6.149801702784456e-05,
"loss": 0.0288,
"step": 2730
},
{
"grad_norm": 0.4858188033103943,
"learning_rate": 6.122965094173424e-05,
"loss": 0.031,
"step": 2740
},
{
"grad_norm": 0.5073441863059998,
"learning_rate": 6.0960943729333374e-05,
"loss": 0.034,
"step": 2750
},
{
"grad_norm": 0.4941282570362091,
"learning_rate": 6.069190355323717e-05,
"loss": 0.0305,
"step": 2760
},
{
"grad_norm": 0.4680149257183075,
"learning_rate": 6.042253858615532e-05,
"loss": 0.0308,
"step": 2770
},
{
"grad_norm": 0.4339468777179718,
"learning_rate": 6.015285701066382e-05,
"loss": 0.0333,
"step": 2780
},
{
"grad_norm": 0.46258655190467834,
"learning_rate": 5.988286701895631e-05,
"loss": 0.0349,
"step": 2790
},
{
"grad_norm": 0.490296870470047,
"learning_rate": 5.961257681259535e-05,
"loss": 0.0343,
"step": 2800
},
{
"grad_norm": 0.5121153593063354,
"learning_rate": 5.934199460226317e-05,
"loss": 0.0332,
"step": 2810
},
{
"grad_norm": 0.4576858878135681,
"learning_rate": 5.9071128607512285e-05,
"loss": 0.0308,
"step": 2820
},
{
"grad_norm": 0.4811716675758362,
"learning_rate": 5.8799987056515804e-05,
"loss": 0.0304,
"step": 2830
},
{
"grad_norm": 0.6127643585205078,
"learning_rate": 5.8528578185817514e-05,
"loss": 0.0318,
"step": 2840
},
{
"grad_norm": 0.48503780364990234,
"learning_rate": 5.825691024008162e-05,
"loss": 0.0294,
"step": 2850
},
{
"grad_norm": 0.555530846118927,
"learning_rate": 5.798499147184233e-05,
"loss": 0.0307,
"step": 2860
},
{
"grad_norm": 0.562579870223999,
"learning_rate": 5.771283014125317e-05,
"loss": 0.0338,
"step": 2870
},
{
"grad_norm": 0.5244818925857544,
"learning_rate": 5.7440434515836064e-05,
"loss": 0.0284,
"step": 2880
},
{
"grad_norm": 0.3919405937194824,
"learning_rate": 5.7167812870230094e-05,
"loss": 0.0305,
"step": 2890
},
{
"grad_norm": 0.46723607182502747,
"learning_rate": 5.689497348594035e-05,
"loss": 0.0292,
"step": 2900
},
{
"grad_norm": 0.47963953018188477,
"learning_rate": 5.662192465108613e-05,
"loss": 0.0303,
"step": 2910
},
{
"grad_norm": 0.4416669011116028,
"learning_rate": 5.634867466014932e-05,
"loss": 0.0282,
"step": 2920
},
{
"grad_norm": 0.3962218761444092,
"learning_rate": 5.607523181372234e-05,
"loss": 0.0308,
"step": 2930
},
{
"grad_norm": 0.4772116243839264,
"learning_rate": 5.5801604418256117e-05,
"loss": 0.0292,
"step": 2940
},
{
"grad_norm": 0.40191689133644104,
"learning_rate": 5.552780078580756e-05,
"loss": 0.0275,
"step": 2950
},
{
"grad_norm": 0.4422965943813324,
"learning_rate": 5.525382923378728e-05,
"loss": 0.0292,
"step": 2960
},
{
"grad_norm": 0.4391031563282013,
"learning_rate": 5.49796980847068e-05,
"loss": 0.0311,
"step": 2970
},
{
"grad_norm": 0.4302864372730255,
"learning_rate": 5.470541566592573e-05,
"loss": 0.0303,
"step": 2980
},
{
"grad_norm": 0.4752635359764099,
"learning_rate": 5.443099030939887e-05,
"loss": 0.0284,
"step": 2990
},
{
"grad_norm": 0.42526647448539734,
"learning_rate": 5.415643035142309e-05,
"loss": 0.0279,
"step": 3000
}
],
"logging_steps": 10,
"max_steps": 6000,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 1500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 128,
"trial_name": null,
"trial_params": null
}