{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.973821989528796,
"eval_steps": 500,
"global_step": 950,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.05235602094240838,
"grad_norm": NaN,
"learning_rate": 0.0,
"loss": 9.7408,
"step": 10
},
{
"epoch": 0.10471204188481675,
"grad_norm": 202209.765625,
"learning_rate": 3.1413612565445024e-08,
"loss": 9.1548,
"step": 20
},
{
"epoch": 0.15706806282722513,
"grad_norm": 27189.787109375,
"learning_rate": 1.3612565445026178e-07,
"loss": 3.8451,
"step": 30
},
{
"epoch": 0.2094240837696335,
"grad_norm": 10457.34765625,
"learning_rate": 2.4083769633507854e-07,
"loss": 3.267,
"step": 40
},
{
"epoch": 0.2617801047120419,
"grad_norm": 8087.2939453125,
"learning_rate": 3.4554973821989523e-07,
"loss": 3.0939,
"step": 50
},
{
"epoch": 0.31413612565445026,
"grad_norm": 7214.744140625,
"learning_rate": 4.50261780104712e-07,
"loss": 3.0211,
"step": 60
},
{
"epoch": 0.36649214659685864,
"grad_norm": 6162.826171875,
"learning_rate": 5.549738219895288e-07,
"loss": 2.846,
"step": 70
},
{
"epoch": 0.418848167539267,
"grad_norm": 4688.05615234375,
"learning_rate": 6.596858638743455e-07,
"loss": 2.8104,
"step": 80
},
{
"epoch": 0.4712041884816754,
"grad_norm": 3856.7578125,
"learning_rate": 7.643979057591623e-07,
"loss": 2.8735,
"step": 90
},
{
"epoch": 0.5235602094240838,
"grad_norm": 3529.413330078125,
"learning_rate": 8.691099476439791e-07,
"loss": 2.8117,
"step": 100
},
{
"epoch": 0.5759162303664922,
"grad_norm": 2830.52734375,
"learning_rate": 9.738219895287958e-07,
"loss": 2.7099,
"step": 110
},
{
"epoch": 0.6282722513089005,
"grad_norm": 2316.537353515625,
"learning_rate": 1.0785340314136124e-06,
"loss": 2.6387,
"step": 120
},
{
"epoch": 0.680628272251309,
"grad_norm": 2685.246826171875,
"learning_rate": 1.1832460732984293e-06,
"loss": 2.6667,
"step": 130
},
{
"epoch": 0.7329842931937173,
"grad_norm": 2066.593017578125,
"learning_rate": 1.2879581151832458e-06,
"loss": 2.5786,
"step": 140
},
{
"epoch": 0.7853403141361257,
"grad_norm": 2110.41748046875,
"learning_rate": 1.3926701570680628e-06,
"loss": 2.4927,
"step": 150
},
{
"epoch": 0.837696335078534,
"grad_norm": 1557.745849609375,
"learning_rate": 1.4973821989528795e-06,
"loss": 2.6125,
"step": 160
},
{
"epoch": 0.8900523560209425,
"grad_norm": 1510.9991455078125,
"learning_rate": 1.6020942408376963e-06,
"loss": 2.5048,
"step": 170
},
{
"epoch": 0.9424083769633508,
"grad_norm": 1395.5841064453125,
"learning_rate": 1.706806282722513e-06,
"loss": 2.5049,
"step": 180
},
{
"epoch": 0.9947643979057592,
"grad_norm": 1400.4466552734375,
"learning_rate": 1.8115183246073297e-06,
"loss": 2.4902,
"step": 190
},
{
"epoch": 1.0471204188481675,
"grad_norm": 1328.171142578125,
"learning_rate": 1.9162303664921463e-06,
"loss": 2.3063,
"step": 200
},
{
"epoch": 1.0994764397905759,
"grad_norm": 1169.1490478515625,
"learning_rate": 1.997673065735893e-06,
"loss": 2.3826,
"step": 210
},
{
"epoch": 1.1518324607329844,
"grad_norm": 1007.3028564453125,
"learning_rate": 1.9860383944153577e-06,
"loss": 2.2646,
"step": 220
},
{
"epoch": 1.2041884816753927,
"grad_norm": 905.8086547851562,
"learning_rate": 1.9744037230948225e-06,
"loss": 2.3065,
"step": 230
},
{
"epoch": 1.256544502617801,
"grad_norm": 904.2677001953125,
"learning_rate": 1.9627690517742874e-06,
"loss": 2.369,
"step": 240
},
{
"epoch": 1.3089005235602094,
"grad_norm": 878.70751953125,
"learning_rate": 1.951134380453752e-06,
"loss": 2.2916,
"step": 250
},
{
"epoch": 1.3612565445026177,
"grad_norm": 785.525146484375,
"learning_rate": 1.9394997091332166e-06,
"loss": 2.2916,
"step": 260
},
{
"epoch": 1.4136125654450262,
"grad_norm": 715.8485107421875,
"learning_rate": 1.927865037812682e-06,
"loss": 2.247,
"step": 270
},
{
"epoch": 1.4659685863874345,
"grad_norm": 742.1319580078125,
"learning_rate": 1.9162303664921463e-06,
"loss": 2.2293,
"step": 280
},
{
"epoch": 1.518324607329843,
"grad_norm": 777.41259765625,
"learning_rate": 1.9045956951716113e-06,
"loss": 2.1447,
"step": 290
},
{
"epoch": 1.5706806282722514,
"grad_norm": 693.8157348632812,
"learning_rate": 1.8929610238510761e-06,
"loss": 2.1851,
"step": 300
},
{
"epoch": 1.6230366492146597,
"grad_norm": 707.2672119140625,
"learning_rate": 1.881326352530541e-06,
"loss": 2.1879,
"step": 310
},
{
"epoch": 1.675392670157068,
"grad_norm": 727.61767578125,
"learning_rate": 1.8696916812100056e-06,
"loss": 2.1962,
"step": 320
},
{
"epoch": 1.7277486910994764,
"grad_norm": 695.4833984375,
"learning_rate": 1.8580570098894706e-06,
"loss": 2.2057,
"step": 330
},
{
"epoch": 1.7801047120418847,
"grad_norm": 614.199462890625,
"learning_rate": 1.8464223385689352e-06,
"loss": 2.0654,
"step": 340
},
{
"epoch": 1.8324607329842932,
"grad_norm": 724.0316162109375,
"learning_rate": 1.8347876672484e-06,
"loss": 2.0803,
"step": 350
},
{
"epoch": 1.8848167539267016,
"grad_norm": 664.735595703125,
"learning_rate": 1.823152995927865e-06,
"loss": 1.8995,
"step": 360
},
{
"epoch": 1.93717277486911,
"grad_norm": 725.57373046875,
"learning_rate": 1.8115183246073297e-06,
"loss": 1.9195,
"step": 370
},
{
"epoch": 1.9895287958115184,
"grad_norm": 680.0363159179688,
"learning_rate": 1.7998836532867946e-06,
"loss": 1.9157,
"step": 380
},
{
"epoch": 2.0418848167539267,
"grad_norm": 656.7247314453125,
"learning_rate": 1.7882489819662594e-06,
"loss": 1.8435,
"step": 390
},
{
"epoch": 2.094240837696335,
"grad_norm": 754.1705322265625,
"learning_rate": 1.776614310645724e-06,
"loss": 1.8308,
"step": 400
},
{
"epoch": 2.1465968586387434,
"grad_norm": 811.6585693359375,
"learning_rate": 1.764979639325189e-06,
"loss": 1.8349,
"step": 410
},
{
"epoch": 2.1989528795811517,
"grad_norm": 743.0385131835938,
"learning_rate": 1.7533449680046537e-06,
"loss": 1.8507,
"step": 420
},
{
"epoch": 2.25130890052356,
"grad_norm": 748.2722778320312,
"learning_rate": 1.7417102966841187e-06,
"loss": 1.7967,
"step": 430
},
{
"epoch": 2.303664921465969,
"grad_norm": 587.875732421875,
"learning_rate": 1.7300756253635833e-06,
"loss": 1.9958,
"step": 440
},
{
"epoch": 2.356020942408377,
"grad_norm": 623.1217651367188,
"learning_rate": 1.7184409540430482e-06,
"loss": 1.8716,
"step": 450
},
{
"epoch": 2.4083769633507854,
"grad_norm": 689.55126953125,
"learning_rate": 1.706806282722513e-06,
"loss": 1.8947,
"step": 460
},
{
"epoch": 2.4607329842931938,
"grad_norm": 656.4078369140625,
"learning_rate": 1.6951716114019778e-06,
"loss": 1.8584,
"step": 470
},
{
"epoch": 2.513089005235602,
"grad_norm": 672.116455078125,
"learning_rate": 1.6835369400814424e-06,
"loss": 1.8129,
"step": 480
},
{
"epoch": 2.5654450261780104,
"grad_norm": 586.6629638671875,
"learning_rate": 1.6719022687609075e-06,
"loss": 1.8214,
"step": 490
},
{
"epoch": 2.6178010471204187,
"grad_norm": 593.3026123046875,
"learning_rate": 1.6602675974403721e-06,
"loss": 1.802,
"step": 500
},
{
"epoch": 2.670157068062827,
"grad_norm": 591.8192749023438,
"learning_rate": 1.6486329261198371e-06,
"loss": 1.8523,
"step": 510
},
{
"epoch": 2.7225130890052354,
"grad_norm": 655.689453125,
"learning_rate": 1.6369982547993018e-06,
"loss": 1.7537,
"step": 520
},
{
"epoch": 2.774869109947644,
"grad_norm": 727.9883422851562,
"learning_rate": 1.6253635834787666e-06,
"loss": 1.7047,
"step": 530
},
{
"epoch": 2.8272251308900525,
"grad_norm": 547.6102905273438,
"learning_rate": 1.6137289121582314e-06,
"loss": 1.8178,
"step": 540
},
{
"epoch": 2.8795811518324608,
"grad_norm": 565.3403930664062,
"learning_rate": 1.6020942408376963e-06,
"loss": 1.7221,
"step": 550
},
{
"epoch": 2.931937172774869,
"grad_norm": 623.0109252929688,
"learning_rate": 1.5904595695171609e-06,
"loss": 1.7912,
"step": 560
},
{
"epoch": 2.9842931937172774,
"grad_norm": 600.0060424804688,
"learning_rate": 1.578824898196626e-06,
"loss": 1.8453,
"step": 570
},
{
"epoch": 3.0366492146596857,
"grad_norm": 719.7506103515625,
"learning_rate": 1.5671902268760905e-06,
"loss": 1.5766,
"step": 580
},
{
"epoch": 3.089005235602094,
"grad_norm": 836.7677612304688,
"learning_rate": 1.5555555555555556e-06,
"loss": 1.5165,
"step": 590
},
{
"epoch": 3.141361256544503,
"grad_norm": 736.6253662109375,
"learning_rate": 1.5439208842350202e-06,
"loss": 1.5065,
"step": 600
},
{
"epoch": 3.193717277486911,
"grad_norm": 796.0474243164062,
"learning_rate": 1.532286212914485e-06,
"loss": 1.4979,
"step": 610
},
{
"epoch": 3.2460732984293195,
"grad_norm": 717.6380615234375,
"learning_rate": 1.5206515415939499e-06,
"loss": 1.488,
"step": 620
},
{
"epoch": 3.298429319371728,
"grad_norm": 726.650634765625,
"learning_rate": 1.5090168702734147e-06,
"loss": 1.4923,
"step": 630
},
{
"epoch": 3.350785340314136,
"grad_norm": 660.4285278320312,
"learning_rate": 1.4973821989528795e-06,
"loss": 1.5328,
"step": 640
},
{
"epoch": 3.4031413612565444,
"grad_norm": 717.77490234375,
"learning_rate": 1.4857475276323443e-06,
"loss": 1.5207,
"step": 650
},
{
"epoch": 3.4554973821989527,
"grad_norm": 665.8229370117188,
"learning_rate": 1.474112856311809e-06,
"loss": 1.497,
"step": 660
},
{
"epoch": 3.507853403141361,
"grad_norm": 726.3001098632812,
"learning_rate": 1.462478184991274e-06,
"loss": 1.5739,
"step": 670
},
{
"epoch": 3.5602094240837694,
"grad_norm": 805.164306640625,
"learning_rate": 1.4508435136707386e-06,
"loss": 1.5005,
"step": 680
},
{
"epoch": 3.612565445026178,
"grad_norm": 672.5020751953125,
"learning_rate": 1.4392088423502037e-06,
"loss": 1.4879,
"step": 690
},
{
"epoch": 3.6649214659685865,
"grad_norm": 731.792236328125,
"learning_rate": 1.4275741710296683e-06,
"loss": 1.4545,
"step": 700
},
{
"epoch": 3.717277486910995,
"grad_norm": 764.9650268554688,
"learning_rate": 1.4159394997091331e-06,
"loss": 1.5199,
"step": 710
},
{
"epoch": 3.769633507853403,
"grad_norm": 888.060302734375,
"learning_rate": 1.404304828388598e-06,
"loss": 1.4832,
"step": 720
},
{
"epoch": 3.8219895287958114,
"grad_norm": 691.344970703125,
"learning_rate": 1.3926701570680628e-06,
"loss": 1.5045,
"step": 730
},
{
"epoch": 3.8743455497382198,
"grad_norm": 706.8125,
"learning_rate": 1.3810354857475274e-06,
"loss": 1.5049,
"step": 740
},
{
"epoch": 3.9267015706806285,
"grad_norm": 744.5066528320312,
"learning_rate": 1.3694008144269924e-06,
"loss": 1.4492,
"step": 750
},
{
"epoch": 3.979057591623037,
"grad_norm": 752.1239624023438,
"learning_rate": 1.357766143106457e-06,
"loss": 1.5103,
"step": 760
},
{
"epoch": 4.031413612565445,
"grad_norm": 684.4227294921875,
"learning_rate": 1.346131471785922e-06,
"loss": 1.2956,
"step": 770
},
{
"epoch": 4.0837696335078535,
"grad_norm": 712.1319580078125,
"learning_rate": 1.3344968004653867e-06,
"loss": 1.1727,
"step": 780
},
{
"epoch": 4.136125654450262,
"grad_norm": 802.2329711914062,
"learning_rate": 1.3228621291448515e-06,
"loss": 1.0637,
"step": 790
},
{
"epoch": 4.18848167539267,
"grad_norm": 782.2645263671875,
"learning_rate": 1.3112274578243164e-06,
"loss": 1.2483,
"step": 800
},
{
"epoch": 4.2408376963350785,
"grad_norm": 756.0220947265625,
"learning_rate": 1.2995927865037812e-06,
"loss": 1.135,
"step": 810
},
{
"epoch": 4.293193717277487,
"grad_norm": 726.4359741210938,
"learning_rate": 1.2879581151832458e-06,
"loss": 1.1676,
"step": 820
},
{
"epoch": 4.345549738219895,
"grad_norm": 803.60791015625,
"learning_rate": 1.2763234438627109e-06,
"loss": 1.1466,
"step": 830
},
{
"epoch": 4.397905759162303,
"grad_norm": 964.1234741210938,
"learning_rate": 1.2646887725421755e-06,
"loss": 1.243,
"step": 840
},
{
"epoch": 4.450261780104712,
"grad_norm": 774.6426391601562,
"learning_rate": 1.2530541012216405e-06,
"loss": 1.2794,
"step": 850
},
{
"epoch": 4.50261780104712,
"grad_norm": 954.2877197265625,
"learning_rate": 1.2414194299011051e-06,
"loss": 1.2563,
"step": 860
},
{
"epoch": 4.554973821989529,
"grad_norm": 854.4068603515625,
"learning_rate": 1.22978475858057e-06,
"loss": 1.2186,
"step": 870
},
{
"epoch": 4.607329842931938,
"grad_norm": 842.273193359375,
"learning_rate": 1.2181500872600348e-06,
"loss": 1.1172,
"step": 880
},
{
"epoch": 4.659685863874346,
"grad_norm": 794.1563720703125,
"learning_rate": 1.2065154159394996e-06,
"loss": 1.2818,
"step": 890
},
{
"epoch": 4.712041884816754,
"grad_norm": 864.4095458984375,
"learning_rate": 1.1948807446189645e-06,
"loss": 1.2335,
"step": 900
},
{
"epoch": 4.7643979057591626,
"grad_norm": 781.428955078125,
"learning_rate": 1.1832460732984293e-06,
"loss": 1.2563,
"step": 910
},
{
"epoch": 4.816753926701571,
"grad_norm": 808.9722900390625,
"learning_rate": 1.171611401977894e-06,
"loss": 1.2301,
"step": 920
},
{
"epoch": 4.869109947643979,
"grad_norm": 838.6384887695312,
"learning_rate": 1.159976730657359e-06,
"loss": 1.1707,
"step": 930
},
{
"epoch": 4.9214659685863875,
"grad_norm": 852.7664184570312,
"learning_rate": 1.1483420593368236e-06,
"loss": 1.1906,
"step": 940
},
{
"epoch": 4.973821989528796,
"grad_norm": 689.58154296875,
"learning_rate": 1.1367073880162884e-06,
"loss": 1.2978,
"step": 950
}
],
"logging_steps": 10,
"max_steps": 1910,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.260298693445818e+16,
"train_batch_size": 5,
"trial_name": null,
"trial_params": null
}