{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 7908,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.012645422357106728,
"grad_norm": 0.11559224128723145,
"learning_rate": 2.5316455696202535e-06,
"loss": 2.4624,
"step": 100
},
{
"epoch": 0.025290844714213456,
"grad_norm": 0.13693679869174957,
"learning_rate": 5.063291139240507e-06,
"loss": 2.4073,
"step": 200
},
{
"epoch": 0.03793626707132018,
"grad_norm": 0.18691261112689972,
"learning_rate": 7.5949367088607605e-06,
"loss": 2.4027,
"step": 300
},
{
"epoch": 0.05058168942842691,
"grad_norm": 0.3437875509262085,
"learning_rate": 1.0126582278481014e-05,
"loss": 2.3508,
"step": 400
},
{
"epoch": 0.06322711178553364,
"grad_norm": 0.3929010033607483,
"learning_rate": 1.2658227848101268e-05,
"loss": 2.3569,
"step": 500
},
{
"epoch": 0.07587253414264036,
"grad_norm": 0.4519222676753998,
"learning_rate": 1.5189873417721521e-05,
"loss": 2.3237,
"step": 600
},
{
"epoch": 0.08851795649974709,
"grad_norm": 0.42425811290740967,
"learning_rate": 1.7721518987341772e-05,
"loss": 2.2495,
"step": 700
},
{
"epoch": 0.10116337885685382,
"grad_norm": 0.4894159436225891,
"learning_rate": 1.9999902601318596e-05,
"loss": 2.2785,
"step": 800
},
{
"epoch": 0.11380880121396054,
"grad_norm": 0.5537558794021606,
"learning_rate": 1.9988217055109233e-05,
"loss": 2.2217,
"step": 900
},
{
"epoch": 0.12645422357106728,
"grad_norm": 0.6574133038520813,
"learning_rate": 1.9957077852147003e-05,
"loss": 2.2403,
"step": 1000
},
{
"epoch": 0.139099645928174,
"grad_norm": 0.7620156407356262,
"learning_rate": 1.9906545641030418e-05,
"loss": 2.2379,
"step": 1100
},
{
"epoch": 0.15174506828528073,
"grad_norm": 0.7593836784362793,
"learning_rate": 1.983671884135574e-05,
"loss": 2.2384,
"step": 1200
},
{
"epoch": 0.16439049064238745,
"grad_norm": 0.637991189956665,
"learning_rate": 1.9747733452029044e-05,
"loss": 2.1626,
"step": 1300
},
{
"epoch": 0.17703591299949417,
"grad_norm": 0.7157115340232849,
"learning_rate": 1.9639762786386466e-05,
"loss": 2.2157,
"step": 1400
},
{
"epoch": 0.18968133535660092,
"grad_norm": 0.8079652190208435,
"learning_rate": 1.9513017134638686e-05,
"loss": 2.1978,
"step": 1500
},
{
"epoch": 0.20232675771370764,
"grad_norm": 0.7162272334098816,
"learning_rate": 1.9367743354296953e-05,
"loss": 2.1615,
"step": 1600
},
{
"epoch": 0.21497218007081437,
"grad_norm": 0.7601909637451172,
"learning_rate": 1.9204224389378434e-05,
"loss": 2.1563,
"step": 1700
},
{
"epoch": 0.2276176024279211,
"grad_norm": 0.8401700258255005,
"learning_rate": 1.902277871932732e-05,
"loss": 2.1535,
"step": 1800
},
{
"epoch": 0.2402630247850278,
"grad_norm": 0.8605347275733948,
"learning_rate": 1.882375973872494e-05,
"loss": 2.1561,
"step": 1900
},
{
"epoch": 0.25290844714213456,
"grad_norm": 0.7509819865226746,
"learning_rate": 1.8607555068997083e-05,
"loss": 2.1439,
"step": 2000
},
{
"epoch": 0.26555386949924126,
"grad_norm": 0.7537213563919067,
"learning_rate": 1.8374585803459005e-05,
"loss": 2.1637,
"step": 2100
},
{
"epoch": 0.278199291856348,
"grad_norm": 0.7172447443008423,
"learning_rate": 1.8125305687168578e-05,
"loss": 2.1378,
"step": 2200
},
{
"epoch": 0.2908447142134547,
"grad_norm": 1.2094899415969849,
"learning_rate": 1.7860200233184934e-05,
"loss": 2.116,
"step": 2300
},
{
"epoch": 0.30349013657056145,
"grad_norm": 0.6232509613037109,
"learning_rate": 1.7579785776953788e-05,
"loss": 2.1388,
"step": 2400
},
{
"epoch": 0.3161355589276682,
"grad_norm": 0.7145382761955261,
"learning_rate": 1.728460847066126e-05,
"loss": 2.122,
"step": 2500
},
{
"epoch": 0.3287809812847749,
"grad_norm": 0.7406628727912903,
"learning_rate": 1.6975243219514772e-05,
"loss": 2.0997,
"step": 2600
},
{
"epoch": 0.34142640364188165,
"grad_norm": 0.7270051836967468,
"learning_rate": 1.6652292562022838e-05,
"loss": 2.1062,
"step": 2700
},
{
"epoch": 0.35407182599898834,
"grad_norm": 0.7675819993019104,
"learning_rate": 1.6316385496454543e-05,
"loss": 2.0954,
"step": 2800
},
{
"epoch": 0.3667172483560951,
"grad_norm": 1.0367164611816406,
"learning_rate": 1.596817625576442e-05,
"loss": 2.0791,
"step": 2900
},
{
"epoch": 0.37936267071320184,
"grad_norm": 0.9357011318206787,
"learning_rate": 1.5608343033368685e-05,
"loss": 2.0908,
"step": 3000
},
{
"epoch": 0.39200809307030854,
"grad_norm": 1.0484192371368408,
"learning_rate": 1.5237586662254664e-05,
"loss": 2.114,
"step": 3100
},
{
"epoch": 0.4046535154274153,
"grad_norm": 0.8791719079017639,
"learning_rate": 1.4856629249995986e-05,
"loss": 2.1022,
"step": 3200
},
{
"epoch": 0.417298937784522,
"grad_norm": 0.8439493179321289,
"learning_rate": 1.446621277233214e-05,
"loss": 2.093,
"step": 3300
},
{
"epoch": 0.42994436014162873,
"grad_norm": 0.9357572793960571,
"learning_rate": 1.4067097628051532e-05,
"loss": 2.0881,
"step": 3400
},
{
"epoch": 0.4425897824987355,
"grad_norm": 0.7724614143371582,
"learning_rate": 1.3660061157992763e-05,
"loss": 2.09,
"step": 3500
},
{
"epoch": 0.4552352048558422,
"grad_norm": 0.7469125986099243,
"learning_rate": 1.3245896131048493e-05,
"loss": 2.1347,
"step": 3600
},
{
"epoch": 0.46788062721294893,
"grad_norm": 1.0939433574676514,
"learning_rate": 1.282540920012071e-05,
"loss": 2.0754,
"step": 3700
},
{
"epoch": 0.4805260495700556,
"grad_norm": 0.8267479538917542,
"learning_rate": 1.2399419331034666e-05,
"loss": 2.0978,
"step": 3800
},
{
"epoch": 0.4931714719271624,
"grad_norm": 0.9732351303100586,
"learning_rate": 1.1968756207471413e-05,
"loss": 2.0973,
"step": 3900
},
{
"epoch": 0.5058168942842691,
"grad_norm": 0.7001142501831055,
"learning_rate": 1.1534258615025584e-05,
"loss": 2.0911,
"step": 4000
},
{
"epoch": 0.5184623166413759,
"grad_norm": 0.7966179847717285,
"learning_rate": 1.1096772807535755e-05,
"loss": 2.0964,
"step": 4100
},
{
"epoch": 0.5311077389984825,
"grad_norm": 0.8109586834907532,
"learning_rate": 1.065715085886918e-05,
"loss": 2.0704,
"step": 4200
},
{
"epoch": 0.5437531613555893,
"grad_norm": 0.9050929546356201,
"learning_rate": 1.0216249003371113e-05,
"loss": 2.1378,
"step": 4300
},
{
"epoch": 0.556398583712696,
"grad_norm": 0.9542393684387207,
"learning_rate": 9.774925968210892e-06,
"loss": 2.074,
"step": 4400
},
{
"epoch": 0.5690440060698028,
"grad_norm": 0.8393438458442688,
"learning_rate": 9.334041300872904e-06,
"loss": 2.0886,
"step": 4500
},
{
"epoch": 0.5816894284269094,
"grad_norm": 1.006131649017334,
"learning_rate": 8.894453695049792e-06,
"loss": 2.0545,
"step": 4600
},
{
"epoch": 0.5943348507840162,
"grad_norm": 1.2745147943496704,
"learning_rate": 8.4570193181986e-06,
"loss": 2.0421,
"step": 4700
},
{
"epoch": 0.6069802731411229,
"grad_norm": 0.7968528270721436,
"learning_rate": 8.022590144017162e-06,
"loss": 2.1115,
"step": 4800
},
{
"epoch": 0.6196256954982297,
"grad_norm": 1.0543171167373657,
"learning_rate": 7.592012293088485e-06,
"loss": 2.1035,
"step": 4900
},
{
"epoch": 0.6322711178553364,
"grad_norm": 1.2739633321762085,
"learning_rate": 7.166124384925069e-06,
"loss": 2.1336,
"step": 5000
},
{
"epoch": 0.644916540212443,
"grad_norm": 1.05385422706604,
"learning_rate": 6.745755904622678e-06,
"loss": 2.1043,
"step": 5100
},
{
"epoch": 0.6575619625695498,
"grad_norm": 0.854491651058197,
"learning_rate": 6.3317255873049535e-06,
"loss": 2.0562,
"step": 5200
},
{
"epoch": 0.6702073849266565,
"grad_norm": 0.9591374397277832,
"learning_rate": 5.9248398235052566e-06,
"loss": 2.0872,
"step": 5300
},
{
"epoch": 0.6828528072837633,
"grad_norm": 1.0265579223632812,
"learning_rate": 5.525891088591604e-06,
"loss": 2.0493,
"step": 5400
},
{
"epoch": 0.69549822964087,
"grad_norm": 1.3940016031265259,
"learning_rate": 5.135656399293624e-06,
"loss": 2.046,
"step": 5500
},
{
"epoch": 0.7081436519979767,
"grad_norm": 1.0076895952224731,
"learning_rate": 4.754895800337698e-06,
"loss": 2.1039,
"step": 5600
},
{
"epoch": 0.7207890743550834,
"grad_norm": 1.1425988674163818,
"learning_rate": 4.384350884137794e-06,
"loss": 2.0724,
"step": 5700
},
{
"epoch": 0.7334344967121902,
"grad_norm": 1.011376142501831,
"learning_rate": 4.024743346425134e-06,
"loss": 2.0797,
"step": 5800
},
{
"epoch": 0.7460799190692969,
"grad_norm": 1.2228542566299438,
"learning_rate": 3.6767735806298833e-06,
"loss": 2.0745,
"step": 5900
},
{
"epoch": 0.7587253414264037,
"grad_norm": 1.220448613166809,
"learning_rate": 3.3411193137524458e-06,
"loss": 2.0349,
"step": 6000
},
{
"epoch": 0.7713707637835103,
"grad_norm": 0.8894066214561462,
"learning_rate": 3.0184342863813044e-06,
"loss": 2.0935,
"step": 6100
},
{
"epoch": 0.7840161861406171,
"grad_norm": 0.8592280745506287,
"learning_rate": 2.7093469794282246e-06,
"loss": 2.0997,
"step": 6200
},
{
"epoch": 0.7966616084977238,
"grad_norm": 1.1828413009643555,
"learning_rate": 2.4144593900607706e-06,
"loss": 2.0485,
"step": 6300
},
{
"epoch": 0.8093070308548306,
"grad_norm": 0.8146592378616333,
"learning_rate": 2.134345859216118e-06,
"loss": 2.0963,
"step": 6400
},
{
"epoch": 0.8219524532119373,
"grad_norm": 0.8691930174827576,
"learning_rate": 1.8695519529798789e-06,
"loss": 2.0619,
"step": 6500
},
{
"epoch": 0.834597875569044,
"grad_norm": 0.8871398568153381,
"learning_rate": 1.6205934000084966e-06,
"loss": 2.0537,
"step": 6600
},
{
"epoch": 0.8472432979261507,
"grad_norm": 0.9184776544570923,
"learning_rate": 1.387955087064895e-06,
"loss": 2.0731,
"step": 6700
},
{
"epoch": 0.8598887202832575,
"grad_norm": 1.0037082433700562,
"learning_rate": 1.1720901146236207e-06,
"loss": 2.0834,
"step": 6800
},
{
"epoch": 0.8725341426403642,
"grad_norm": 1.014647126197815,
"learning_rate": 9.734189143849126e-07,
"loss": 2.0622,
"step": 6900
},
{
"epoch": 0.885179564997471,
"grad_norm": 0.8669779300689697,
"learning_rate": 7.923284304164502e-07,
"loss": 2.0733,
"step": 7000
},
{
"epoch": 0.8978249873545776,
"grad_norm": 1.2026468515396118,
"learning_rate": 6.291713655176257e-07,
"loss": 2.0768,
"step": 7100
},
{
"epoch": 0.9104704097116844,
"grad_norm": 0.8568145036697388,
"learning_rate": 4.84265494274222e-07,
"loss": 2.0677,
"step": 7200
},
{
"epoch": 0.9231158320687911,
"grad_norm": 0.9718058705329895,
"learning_rate": 3.578930441413542e-07,
"loss": 2.08,
"step": 7300
},
{
"epoch": 0.9357612544258979,
"grad_norm": 0.8524600267410278,
"learning_rate": 2.503001457601928e-07,
"loss": 2.0769,
"step": 7400
},
{
"epoch": 0.9484066767830045,
"grad_norm": 0.7981248497962952,
"learning_rate": 1.6169635357900505e-07,
"loss": 2.0827,
"step": 7500
},
{
"epoch": 0.9610520991401112,
"grad_norm": 0.9334998726844788,
"learning_rate": 9.225423771221598e-08,
"loss": 2.0611,
"step": 7600
},
{
"epoch": 0.973697521497218,
"grad_norm": 0.7538848519325256,
"learning_rate": 4.210904783239378e-08,
"loss": 2.0664,
"step": 7700
},
{
"epoch": 0.9863429438543247,
"grad_norm": 0.8826780915260315,
"learning_rate": 1.1358449749798717e-08,
"loss": 2.0641,
"step": 7800
},
{
"epoch": 0.9989883662114315,
"grad_norm": 0.9787418246269226,
"learning_rate": 6.233519252774045e-11,
"loss": 2.0665,
"step": 7900
},
{
"epoch": 1.0,
"eval_loss": 1.9169105291366577,
"eval_runtime": 98.8888,
"eval_samples_per_second": 6.502,
"eval_steps_per_second": 0.819,
"step": 7908
},
{
"epoch": 1.0,
"step": 7908,
"total_flos": 1.4404222844928e+17,
"train_loss": 2.1297678035741874,
"train_runtime": 3846.3945,
"train_samples_per_second": 4.112,
"train_steps_per_second": 2.056
}
],
"logging_steps": 100,
"max_steps": 7908,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.4404222844928e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}