agentlans's picture
Upload 12 files
17628df verified
{
"best_global_step": 6505,
"best_metric": 0.19649724662303925,
"best_model_checkpoint": "/media/user/Expansion1/snowflake-arctic-embed-xs-refusal/checkpoint-6505",
"epoch": 5.0,
"eval_steps": 500,
"global_step": 32525,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.07686395080707148,
"grad_norm": 2.155881881713867,
"learning_rate": 4.923289777094543e-05,
"loss": 0.3511,
"num_input_tokens_seen": 512000,
"step": 500,
"train_runtime": 5.3031,
"train_tokens_per_second": 96546.489
},
{
"epoch": 0.15372790161414296,
"grad_norm": 0.6236560344696045,
"learning_rate": 4.846425826287471e-05,
"loss": 0.2593,
"num_input_tokens_seen": 1024000,
"step": 1000,
"train_runtime": 10.3436,
"train_tokens_per_second": 98998.124
},
{
"epoch": 0.23059185242121444,
"grad_norm": 3.8176653385162354,
"learning_rate": 4.7695618754804e-05,
"loss": 0.2546,
"num_input_tokens_seen": 1536000,
"step": 1500,
"train_runtime": 15.3626,
"train_tokens_per_second": 99983.128
},
{
"epoch": 0.3074558032282859,
"grad_norm": 2.538367986679077,
"learning_rate": 4.692697924673328e-05,
"loss": 0.2377,
"num_input_tokens_seen": 2048000,
"step": 2000,
"train_runtime": 20.351,
"train_tokens_per_second": 100634.117
},
{
"epoch": 0.3843197540353574,
"grad_norm": 3.922593832015991,
"learning_rate": 4.615833973866257e-05,
"loss": 0.2411,
"num_input_tokens_seen": 2560000,
"step": 2500,
"train_runtime": 25.363,
"train_tokens_per_second": 100934.428
},
{
"epoch": 0.4611837048424289,
"grad_norm": 0.728330135345459,
"learning_rate": 4.5389700230591855e-05,
"loss": 0.2278,
"num_input_tokens_seen": 3072000,
"step": 3000,
"train_runtime": 30.387,
"train_tokens_per_second": 101095.729
},
{
"epoch": 0.5380476556495004,
"grad_norm": 0.5299816131591797,
"learning_rate": 4.462106072252114e-05,
"loss": 0.2422,
"num_input_tokens_seen": 3584000,
"step": 3500,
"train_runtime": 35.3784,
"train_tokens_per_second": 101304.819
},
{
"epoch": 0.6149116064565718,
"grad_norm": 9.030647277832031,
"learning_rate": 4.3852421214450424e-05,
"loss": 0.2269,
"num_input_tokens_seen": 4096000,
"step": 4000,
"train_runtime": 40.3905,
"train_tokens_per_second": 101409.866
},
{
"epoch": 0.6917755572636434,
"grad_norm": 1.8069450855255127,
"learning_rate": 4.308378170637971e-05,
"loss": 0.2378,
"num_input_tokens_seen": 4608000,
"step": 4500,
"train_runtime": 45.4094,
"train_tokens_per_second": 101476.838
},
{
"epoch": 0.7686395080707148,
"grad_norm": 2.4024786949157715,
"learning_rate": 4.231514219830899e-05,
"loss": 0.2129,
"num_input_tokens_seen": 5120000,
"step": 5000,
"train_runtime": 50.4114,
"train_tokens_per_second": 101564.317
},
{
"epoch": 0.8455034588777863,
"grad_norm": 0.8753976821899414,
"learning_rate": 4.1546502690238284e-05,
"loss": 0.2354,
"num_input_tokens_seen": 5632000,
"step": 5500,
"train_runtime": 55.4722,
"train_tokens_per_second": 101528.336
},
{
"epoch": 0.9223674096848578,
"grad_norm": 4.247986316680908,
"learning_rate": 4.077786318216756e-05,
"loss": 0.237,
"num_input_tokens_seen": 6144000,
"step": 6000,
"train_runtime": 60.5329,
"train_tokens_per_second": 101498.514
},
{
"epoch": 0.9992313604919293,
"grad_norm": 7.1489973068237305,
"learning_rate": 4.000922367409685e-05,
"loss": 0.2286,
"num_input_tokens_seen": 6656000,
"step": 6500,
"train_runtime": 65.5361,
"train_tokens_per_second": 101562.357
},
{
"epoch": 1.0,
"eval_accuracy": 0.919369715603382,
"eval_loss": 0.19649724662303925,
"eval_runtime": 5.1167,
"eval_samples_per_second": 2542.665,
"eval_steps_per_second": 317.98,
"num_input_tokens_seen": 6661120,
"step": 6505
},
{
"epoch": 1.0760953112990008,
"grad_norm": 10.231003761291504,
"learning_rate": 3.9240584166026136e-05,
"loss": 0.1751,
"num_input_tokens_seen": 7168000,
"step": 7000,
"train_runtime": 75.9746,
"train_tokens_per_second": 94347.276
},
{
"epoch": 1.1529592621060722,
"grad_norm": 0.037300433963537216,
"learning_rate": 3.847194465795542e-05,
"loss": 0.1739,
"num_input_tokens_seen": 7680000,
"step": 7500,
"train_runtime": 81.0147,
"train_tokens_per_second": 94797.624
},
{
"epoch": 1.2298232129131437,
"grad_norm": 10.88604736328125,
"learning_rate": 3.7703305149884705e-05,
"loss": 0.1923,
"num_input_tokens_seen": 8192000,
"step": 8000,
"train_runtime": 86.0728,
"train_tokens_per_second": 95175.201
},
{
"epoch": 1.3066871637202153,
"grad_norm": 14.25737190246582,
"learning_rate": 3.693466564181399e-05,
"loss": 0.1809,
"num_input_tokens_seen": 8704000,
"step": 8500,
"train_runtime": 91.0822,
"train_tokens_per_second": 95562.051
},
{
"epoch": 1.3835511145272867,
"grad_norm": 0.3044818639755249,
"learning_rate": 3.6166026133743274e-05,
"loss": 0.1922,
"num_input_tokens_seen": 9216000,
"step": 9000,
"train_runtime": 96.1257,
"train_tokens_per_second": 95874.452
},
{
"epoch": 1.4604150653343582,
"grad_norm": 15.291511535644531,
"learning_rate": 3.5397386625672565e-05,
"loss": 0.171,
"num_input_tokens_seen": 9728000,
"step": 9500,
"train_runtime": 101.1575,
"train_tokens_per_second": 96166.85
},
{
"epoch": 1.5372790161414298,
"grad_norm": 12.030097007751465,
"learning_rate": 3.462874711760184e-05,
"loss": 0.1779,
"num_input_tokens_seen": 10240000,
"step": 10000,
"train_runtime": 106.2268,
"train_tokens_per_second": 96397.497
},
{
"epoch": 1.614142966948501,
"grad_norm": 0.17212723195552826,
"learning_rate": 3.386010760953113e-05,
"loss": 0.176,
"num_input_tokens_seen": 10752000,
"step": 10500,
"train_runtime": 111.2959,
"train_tokens_per_second": 96607.35
},
{
"epoch": 1.6910069177555727,
"grad_norm": 35.843482971191406,
"learning_rate": 3.309146810146042e-05,
"loss": 0.1879,
"num_input_tokens_seen": 11264000,
"step": 11000,
"train_runtime": 116.3574,
"train_tokens_per_second": 96805.219
},
{
"epoch": 1.767870868562644,
"grad_norm": 0.055776312947273254,
"learning_rate": 3.23228285933897e-05,
"loss": 0.1749,
"num_input_tokens_seen": 11776000,
"step": 11500,
"train_runtime": 121.386,
"train_tokens_per_second": 97012.845
},
{
"epoch": 1.8447348193697155,
"grad_norm": 0.48420748114585876,
"learning_rate": 3.1554189085318986e-05,
"loss": 0.1716,
"num_input_tokens_seen": 12288000,
"step": 12000,
"train_runtime": 126.4198,
"train_tokens_per_second": 97200.001
},
{
"epoch": 1.9215987701767872,
"grad_norm": 0.3808608949184418,
"learning_rate": 3.078554957724827e-05,
"loss": 0.1819,
"num_input_tokens_seen": 12800000,
"step": 12500,
"train_runtime": 131.456,
"train_tokens_per_second": 97370.996
},
{
"epoch": 1.9984627209838586,
"grad_norm": 0.15483863651752472,
"learning_rate": 3.0016910069177555e-05,
"loss": 0.1718,
"num_input_tokens_seen": 13312000,
"step": 13000,
"train_runtime": 136.5089,
"train_tokens_per_second": 97517.416
},
{
"epoch": 2.0,
"eval_accuracy": 0.9259031514219831,
"eval_loss": 0.27698734402656555,
"eval_runtime": 5.1343,
"eval_samples_per_second": 2533.949,
"eval_steps_per_second": 316.89,
"num_input_tokens_seen": 13322240,
"step": 13010
},
{
"epoch": 2.07532667179093,
"grad_norm": 0.06390306353569031,
"learning_rate": 2.9248270561106846e-05,
"loss": 0.1384,
"num_input_tokens_seen": 13824000,
"step": 13500,
"train_runtime": 146.9662,
"train_tokens_per_second": 94062.42
},
{
"epoch": 2.1521906225980016,
"grad_norm": 0.09185440093278885,
"learning_rate": 2.8479631053036127e-05,
"loss": 0.123,
"num_input_tokens_seen": 14336000,
"step": 14000,
"train_runtime": 152.0064,
"train_tokens_per_second": 94311.832
},
{
"epoch": 2.229054573405073,
"grad_norm": 0.05354034900665283,
"learning_rate": 2.7710991544965414e-05,
"loss": 0.1265,
"num_input_tokens_seen": 14848000,
"step": 14500,
"train_runtime": 157.0491,
"train_tokens_per_second": 94543.682
},
{
"epoch": 2.3059185242121445,
"grad_norm": 6.2592926025390625,
"learning_rate": 2.6942352036894695e-05,
"loss": 0.1214,
"num_input_tokens_seen": 15360000,
"step": 15000,
"train_runtime": 162.0991,
"train_tokens_per_second": 94756.854
},
{
"epoch": 2.382782475019216,
"grad_norm": 0.3284030854701996,
"learning_rate": 2.6173712528823986e-05,
"loss": 0.1298,
"num_input_tokens_seen": 15872000,
"step": 15500,
"train_runtime": 167.141,
"train_tokens_per_second": 94961.759
},
{
"epoch": 2.4596464258262873,
"grad_norm": 0.2101190984249115,
"learning_rate": 2.5405073020753267e-05,
"loss": 0.1279,
"num_input_tokens_seen": 16384000,
"step": 16000,
"train_runtime": 172.1787,
"train_tokens_per_second": 95156.958
},
{
"epoch": 2.536510376633359,
"grad_norm": 7.672014236450195,
"learning_rate": 2.463643351268255e-05,
"loss": 0.1435,
"num_input_tokens_seen": 16896000,
"step": 16500,
"train_runtime": 177.2341,
"train_tokens_per_second": 95331.566
},
{
"epoch": 2.6133743274404306,
"grad_norm": 37.9052734375,
"learning_rate": 2.3867794004611836e-05,
"loss": 0.123,
"num_input_tokens_seen": 17408000,
"step": 17000,
"train_runtime": 182.2827,
"train_tokens_per_second": 95500.03
},
{
"epoch": 2.690238278247502,
"grad_norm": 0.08578933030366898,
"learning_rate": 2.3099154496541124e-05,
"loss": 0.1289,
"num_input_tokens_seen": 17920000,
"step": 17500,
"train_runtime": 187.3355,
"train_tokens_per_second": 95657.272
},
{
"epoch": 2.7671022290545735,
"grad_norm": 0.08860859274864197,
"learning_rate": 2.2330514988470408e-05,
"loss": 0.1296,
"num_input_tokens_seen": 18432000,
"step": 18000,
"train_runtime": 192.3781,
"train_tokens_per_second": 95811.329
},
{
"epoch": 2.8439661798616447,
"grad_norm": 0.41104796528816223,
"learning_rate": 2.1561875480399692e-05,
"loss": 0.1124,
"num_input_tokens_seen": 18944000,
"step": 18500,
"train_runtime": 197.7595,
"train_tokens_per_second": 95793.134
},
{
"epoch": 2.9208301306687163,
"grad_norm": 37.97283172607422,
"learning_rate": 2.079323597232898e-05,
"loss": 0.1382,
"num_input_tokens_seen": 19456000,
"step": 19000,
"train_runtime": 203.2967,
"train_tokens_per_second": 95702.474
},
{
"epoch": 2.997694081475788,
"grad_norm": 0.0325402170419693,
"learning_rate": 2.0024596464258264e-05,
"loss": 0.1388,
"num_input_tokens_seen": 19968000,
"step": 19500,
"train_runtime": 208.8029,
"train_tokens_per_second": 95630.843
},
{
"epoch": 3.0,
"eval_accuracy": 0.9287471176018447,
"eval_loss": 0.31825903058052063,
"eval_runtime": 5.4038,
"eval_samples_per_second": 2407.562,
"eval_steps_per_second": 301.084,
"num_input_tokens_seen": 19983360,
"step": 19515
},
{
"epoch": 3.074558032282859,
"grad_norm": 0.05180477350950241,
"learning_rate": 1.925595695618755e-05,
"loss": 0.0823,
"num_input_tokens_seen": 20480000,
"step": 20000,
"train_runtime": 219.9739,
"train_tokens_per_second": 93101.956
},
{
"epoch": 3.151421983089931,
"grad_norm": 0.005111335311084986,
"learning_rate": 1.8487317448116833e-05,
"loss": 0.0716,
"num_input_tokens_seen": 20992000,
"step": 20500,
"train_runtime": 225.3567,
"train_tokens_per_second": 93150.099
},
{
"epoch": 3.2282859338970025,
"grad_norm": 0.012623129412531853,
"learning_rate": 1.771867794004612e-05,
"loss": 0.0915,
"num_input_tokens_seen": 21504000,
"step": 21000,
"train_runtime": 230.6352,
"train_tokens_per_second": 93238.15
},
{
"epoch": 3.3051498847040737,
"grad_norm": 10.89956283569336,
"learning_rate": 1.6950038431975405e-05,
"loss": 0.0783,
"num_input_tokens_seen": 22016000,
"step": 21500,
"train_runtime": 236.059,
"train_tokens_per_second": 93264.831
},
{
"epoch": 3.3820138355111453,
"grad_norm": 0.010125258006155491,
"learning_rate": 1.618139892390469e-05,
"loss": 0.0882,
"num_input_tokens_seen": 22528000,
"step": 22000,
"train_runtime": 241.5242,
"train_tokens_per_second": 93274.31
},
{
"epoch": 3.458877786318217,
"grad_norm": 0.04097803309559822,
"learning_rate": 1.5412759415833973e-05,
"loss": 0.095,
"num_input_tokens_seen": 23040000,
"step": 22500,
"train_runtime": 247.0361,
"train_tokens_per_second": 93265.737
},
{
"epoch": 3.535741737125288,
"grad_norm": 0.3172767758369446,
"learning_rate": 1.464411990776326e-05,
"loss": 0.0847,
"num_input_tokens_seen": 23552000,
"step": 23000,
"train_runtime": 252.366,
"train_tokens_per_second": 93324.783
},
{
"epoch": 3.61260568793236,
"grad_norm": 0.0049354820512235165,
"learning_rate": 1.3875480399692545e-05,
"loss": 0.0758,
"num_input_tokens_seen": 24064000,
"step": 23500,
"train_runtime": 257.7685,
"train_tokens_per_second": 93355.1
},
{
"epoch": 3.689469638739431,
"grad_norm": 0.09353843331336975,
"learning_rate": 1.310684089162183e-05,
"loss": 0.0812,
"num_input_tokens_seen": 24576000,
"step": 24000,
"train_runtime": 263.0331,
"train_tokens_per_second": 93433.102
},
{
"epoch": 3.7663335895465027,
"grad_norm": 13.593195915222168,
"learning_rate": 1.2338201383551116e-05,
"loss": 0.0882,
"num_input_tokens_seen": 25088000,
"step": 24500,
"train_runtime": 268.3188,
"train_tokens_per_second": 93500.701
},
{
"epoch": 3.8431975403535743,
"grad_norm": 0.12645399570465088,
"learning_rate": 1.15695618754804e-05,
"loss": 0.0862,
"num_input_tokens_seen": 25600000,
"step": 25000,
"train_runtime": 273.6907,
"train_tokens_per_second": 93536.259
},
{
"epoch": 3.9200614911606455,
"grad_norm": 0.015061162412166595,
"learning_rate": 1.0800922367409686e-05,
"loss": 0.0825,
"num_input_tokens_seen": 26112000,
"step": 25500,
"train_runtime": 279.0408,
"train_tokens_per_second": 93577.699
},
{
"epoch": 3.996925441967717,
"grad_norm": 0.020776506513357162,
"learning_rate": 1.003228285933897e-05,
"loss": 0.0772,
"num_input_tokens_seen": 26624000,
"step": 26000,
"train_runtime": 284.4194,
"train_tokens_per_second": 93608.25
},
{
"epoch": 4.0,
"eval_accuracy": 0.9270561106840891,
"eval_loss": 0.43025368452072144,
"eval_runtime": 5.3244,
"eval_samples_per_second": 2443.487,
"eval_steps_per_second": 305.577,
"num_input_tokens_seen": 26644480,
"step": 26020
},
{
"epoch": 4.073789392774788,
"grad_norm": 0.09127756953239441,
"learning_rate": 9.263643351268256e-06,
"loss": 0.0615,
"num_input_tokens_seen": 27136000,
"step": 26500,
"train_runtime": 295.1257,
"train_tokens_per_second": 91947.265
},
{
"epoch": 4.15065334358186,
"grad_norm": 0.11556842923164368,
"learning_rate": 8.49500384319754e-06,
"loss": 0.0444,
"num_input_tokens_seen": 27648000,
"step": 27000,
"train_runtime": 300.149,
"train_tokens_per_second": 92114.259
},
{
"epoch": 4.227517294388932,
"grad_norm": 0.09005430340766907,
"learning_rate": 7.726364335126826e-06,
"loss": 0.0434,
"num_input_tokens_seen": 28160000,
"step": 27500,
"train_runtime": 305.2327,
"train_tokens_per_second": 92257.495
},
{
"epoch": 4.304381245196003,
"grad_norm": 0.004569609649479389,
"learning_rate": 6.9577248270561115e-06,
"loss": 0.0352,
"num_input_tokens_seen": 28672000,
"step": 28000,
"train_runtime": 310.2962,
"train_tokens_per_second": 92402.026
},
{
"epoch": 4.381245196003075,
"grad_norm": 1.524936318397522,
"learning_rate": 6.189085318985397e-06,
"loss": 0.06,
"num_input_tokens_seen": 29184000,
"step": 28500,
"train_runtime": 315.365,
"train_tokens_per_second": 92540.388
},
{
"epoch": 4.458109146810146,
"grad_norm": 0.014427268877625465,
"learning_rate": 5.420445810914681e-06,
"loss": 0.0492,
"num_input_tokens_seen": 29696000,
"step": 29000,
"train_runtime": 320.4274,
"train_tokens_per_second": 92676.208
},
{
"epoch": 4.534973097617217,
"grad_norm": 0.07355033606290817,
"learning_rate": 4.651806302843966e-06,
"loss": 0.0572,
"num_input_tokens_seen": 30208000,
"step": 29500,
"train_runtime": 325.4966,
"train_tokens_per_second": 92805.883
},
{
"epoch": 4.611837048424289,
"grad_norm": 7.176478385925293,
"learning_rate": 3.883166794773251e-06,
"loss": 0.0442,
"num_input_tokens_seen": 30720000,
"step": 30000,
"train_runtime": 330.568,
"train_tokens_per_second": 92930.968
},
{
"epoch": 4.688700999231361,
"grad_norm": 0.030576860532164574,
"learning_rate": 3.114527286702537e-06,
"loss": 0.0465,
"num_input_tokens_seen": 31232000,
"step": 30500,
"train_runtime": 335.6493,
"train_tokens_per_second": 93049.488
},
{
"epoch": 4.765564950038432,
"grad_norm": 0.005597515497356653,
"learning_rate": 2.345887778631822e-06,
"loss": 0.0487,
"num_input_tokens_seen": 31744000,
"step": 31000,
"train_runtime": 340.8069,
"train_tokens_per_second": 93143.641
},
{
"epoch": 4.842428900845503,
"grad_norm": 0.021089155226945877,
"learning_rate": 1.5772482705611067e-06,
"loss": 0.0557,
"num_input_tokens_seen": 32256000,
"step": 31500,
"train_runtime": 346.0097,
"train_tokens_per_second": 93222.824
},
{
"epoch": 4.919292851652575,
"grad_norm": 0.023516027256846428,
"learning_rate": 8.086087624903922e-07,
"loss": 0.0399,
"num_input_tokens_seen": 32768000,
"step": 32000,
"train_runtime": 351.2595,
"train_tokens_per_second": 93287.165
},
{
"epoch": 4.996156802459646,
"grad_norm": 0.007581554353237152,
"learning_rate": 3.996925441967718e-08,
"loss": 0.0532,
"num_input_tokens_seen": 33280000,
"step": 32500,
"train_runtime": 356.5514,
"train_tokens_per_second": 93338.566
},
{
"epoch": 5.0,
"eval_accuracy": 0.9264411990776326,
"eval_loss": 0.4613732397556305,
"eval_runtime": 5.2997,
"eval_samples_per_second": 2454.857,
"eval_steps_per_second": 306.999,
"num_input_tokens_seen": 33305600,
"step": 32525
},
{
"epoch": 5.0,
"num_input_tokens_seen": 33305600,
"step": 32525,
"total_flos": 2157435918643200.0,
"train_loss": 0.1371641572881533,
"train_runtime": 362.4532,
"train_samples_per_second": 717.886,
"train_steps_per_second": 89.736
}
],
"logging_steps": 500,
"max_steps": 32525,
"num_input_tokens_seen": 33305600,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2157435918643200.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}