{
  "best_global_step": 6108,
  "best_metric": 0.6058866381645203,
  "best_model_checkpoint": "mamba_nli_ensemble/checkpoint-6108",
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 6108,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.016371971185330715,
      "grad_norm": 15.273209571838379,
      "learning_rate": 1.6371971185330716e-06,
      "loss": 0.7553,
      "step": 100
    },
    {
      "epoch": 0.03274394237066143,
      "grad_norm": 17.171979904174805,
      "learning_rate": 3.2743942370661432e-06,
      "loss": 0.7151,
      "step": 200
    },
    {
      "epoch": 0.04911591355599214,
      "grad_norm": 5.574963569641113,
      "learning_rate": 4.911591355599214e-06,
      "loss": 0.7065,
      "step": 300
    },
    {
      "epoch": 0.06548788474132286,
      "grad_norm": 7.3369059562683105,
      "learning_rate": 6.5487884741322864e-06,
      "loss": 0.6996,
      "step": 400
    },
    {
      "epoch": 0.08185985592665357,
      "grad_norm": 7.506777286529541,
      "learning_rate": 8.185985592665357e-06,
      "loss": 0.7071,
      "step": 500
    },
    {
      "epoch": 0.09823182711198428,
      "grad_norm": 6.177253723144531,
      "learning_rate": 9.823182711198428e-06,
      "loss": 0.6926,
      "step": 600
    },
    {
      "epoch": 0.114603798297315,
      "grad_norm": 5.600251197814941,
      "learning_rate": 1.14603798297315e-05,
      "loss": 0.7145,
      "step": 700
    },
    {
      "epoch": 0.13097576948264572,
      "grad_norm": 2.5524227619171143,
      "learning_rate": 1.3097576948264573e-05,
      "loss": 0.6919,
      "step": 800
    },
    {
      "epoch": 0.14734774066797643,
      "grad_norm": 8.479023933410645,
      "learning_rate": 1.4734774066797644e-05,
      "loss": 0.6819,
      "step": 900
    },
    {
      "epoch": 0.16371971185330714,
      "grad_norm": 5.893022060394287,
      "learning_rate": 1.6371971185330713e-05,
      "loss": 0.6944,
      "step": 1000
    },
    {
      "epoch": 0.18009168303863785,
      "grad_norm": 4.31400728225708,
      "learning_rate": 1.8009168303863786e-05,
      "loss": 0.6787,
      "step": 1100
    },
    {
      "epoch": 0.19646365422396855,
      "grad_norm": 1.9604185819625854,
      "learning_rate": 1.9646365422396855e-05,
      "loss": 0.6804,
      "step": 1200
    },
    {
      "epoch": 0.2128356254092993,
      "grad_norm": 14.416400909423828,
      "learning_rate": 2.128356254092993e-05,
      "loss": 0.6979,
      "step": 1300
    },
    {
      "epoch": 0.22920759659463,
      "grad_norm": 2.0943357944488525,
      "learning_rate": 2.2920759659463e-05,
      "loss": 0.6689,
      "step": 1400
    },
    {
      "epoch": 0.2455795677799607,
      "grad_norm": 4.136998176574707,
      "learning_rate": 2.4557956777996073e-05,
      "loss": 0.6694,
      "step": 1500
    },
    {
      "epoch": 0.26195153896529144,
      "grad_norm": 3.5071325302124023,
      "learning_rate": 2.6195153896529146e-05,
      "loss": 0.6168,
      "step": 1600
    },
    {
      "epoch": 0.2783235101506221,
      "grad_norm": 7.638752460479736,
      "learning_rate": 2.7832351015062215e-05,
      "loss": 0.6237,
      "step": 1700
    },
    {
      "epoch": 0.29469548133595286,
      "grad_norm": 1.9127601385116577,
      "learning_rate": 2.9469548133595288e-05,
      "loss": 0.6163,
      "step": 1800
    },
    {
      "epoch": 0.31106745252128354,
      "grad_norm": 66.4927749633789,
      "learning_rate": 3.110674525212836e-05,
      "loss": 0.6006,
      "step": 1900
    },
    {
      "epoch": 0.3274394237066143,
      "grad_norm": 52.92075729370117,
      "learning_rate": 3.2743942370661426e-05,
      "loss": 0.6598,
      "step": 2000
    },
    {
      "epoch": 0.343811394891945,
      "grad_norm": 11.339043617248535,
      "learning_rate": 3.43811394891945e-05,
      "loss": 0.5987,
      "step": 2100
    },
    {
      "epoch": 0.3601833660772757,
      "grad_norm": 28.995885848999023,
      "learning_rate": 3.601833660772757e-05,
      "loss": 0.6509,
      "step": 2200
    },
    {
      "epoch": 0.3765553372626064,
      "grad_norm": 23.708646774291992,
      "learning_rate": 3.765553372626065e-05,
      "loss": 0.6729,
      "step": 2300
    },
    {
      "epoch": 0.3929273084479371,
      "grad_norm": 3.438246726989746,
      "learning_rate": 3.929273084479371e-05,
      "loss": 0.5537,
      "step": 2400
    },
    {
      "epoch": 0.40929927963326784,
      "grad_norm": 10.562445640563965,
      "learning_rate": 4.0929927963326786e-05,
      "loss": 0.6228,
      "step": 2500
    },
    {
      "epoch": 0.4256712508185986,
      "grad_norm": 9.508832931518555,
      "learning_rate": 4.256712508185986e-05,
      "loss": 0.5776,
      "step": 2600
    },
    {
      "epoch": 0.44204322200392926,
      "grad_norm": 12.658103942871094,
      "learning_rate": 4.4204322200392925e-05,
      "loss": 0.5455,
      "step": 2700
    },
    {
      "epoch": 0.45841519318926,
      "grad_norm": 8.46078109741211,
      "learning_rate": 4.5841519318926e-05,
      "loss": 0.5583,
      "step": 2800
    },
    {
      "epoch": 0.4747871643745907,
      "grad_norm": 5.642892360687256,
      "learning_rate": 4.747871643745907e-05,
      "loss": 0.5556,
      "step": 2900
    },
    {
      "epoch": 0.4911591355599214,
      "grad_norm": 3.8212382793426514,
      "learning_rate": 4.9115913555992146e-05,
      "loss": 0.5552,
      "step": 3000
    },
    {
      "epoch": 0.5075311067452521,
      "grad_norm": 8.145768165588379,
      "learning_rate": 4.999965445760666e-05,
      "loss": 0.5488,
      "step": 3100
    },
    {
      "epoch": 0.5239030779305829,
      "grad_norm": 12.39121150970459,
      "learning_rate": 4.999651917405523e-05,
      "loss": 0.5595,
      "step": 3200
    },
    {
      "epoch": 0.5402750491159135,
      "grad_norm": 6.998423099517822,
      "learning_rate": 4.999011837711028e-05,
      "loss": 0.5111,
      "step": 3300
    },
    {
      "epoch": 0.5566470203012442,
      "grad_norm": 31.633630752563477,
      "learning_rate": 4.998045290296376e-05,
      "loss": 0.553,
      "step": 3400
    },
    {
      "epoch": 0.573018991486575,
      "grad_norm": 56.126251220703125,
      "learning_rate": 4.9967524014300896e-05,
      "loss": 0.5713,
      "step": 3500
    },
    {
      "epoch": 0.5893909626719057,
      "grad_norm": 6.04685640335083,
      "learning_rate": 4.995133340013522e-05,
      "loss": 0.526,
      "step": 3600
    },
    {
      "epoch": 0.6057629338572365,
      "grad_norm": 9.806577682495117,
      "learning_rate": 4.993188317558791e-05,
      "loss": 0.6185,
      "step": 3700
    },
    {
      "epoch": 0.6221349050425671,
      "grad_norm": 3.9068918228149414,
      "learning_rate": 4.9909175881611514e-05,
      "loss": 0.5086,
      "step": 3800
    },
    {
      "epoch": 0.6385068762278978,
      "grad_norm": 19.12666130065918,
      "learning_rate": 4.9883214484657957e-05,
      "loss": 0.515,
      "step": 3900
    },
    {
      "epoch": 0.6548788474132285,
      "grad_norm": 6.140756607055664,
      "learning_rate": 4.9854002376291046e-05,
      "loss": 0.5581,
      "step": 4000
    },
    {
      "epoch": 0.6712508185985593,
      "grad_norm": 12.521078109741211,
      "learning_rate": 4.9821543372743355e-05,
      "loss": 0.5192,
      "step": 4100
    },
    {
      "epoch": 0.68762278978389,
      "grad_norm": 18.783933639526367,
      "learning_rate": 4.9785841714417734e-05,
      "loss": 0.5334,
      "step": 4200
    },
    {
      "epoch": 0.7039947609692206,
      "grad_norm": 7.139877796173096,
      "learning_rate": 4.97469020653333e-05,
      "loss": 0.5334,
      "step": 4300
    },
    {
      "epoch": 0.7203667321545514,
      "grad_norm": 7.0137834548950195,
      "learning_rate": 4.970472951251617e-05,
      "loss": 0.5019,
      "step": 4400
    },
    {
      "epoch": 0.7367387033398821,
      "grad_norm": 51.292449951171875,
      "learning_rate": 4.9659329565334854e-05,
      "loss": 0.4813,
      "step": 4500
    },
    {
      "epoch": 0.7531106745252129,
      "grad_norm": 7.053626537322998,
      "learning_rate": 4.9610708154780585e-05,
      "loss": 0.6834,
      "step": 4600
    },
    {
      "epoch": 0.7694826457105436,
      "grad_norm": 0.5209086537361145,
      "learning_rate": 4.955887163269243e-05,
      "loss": 0.4802,
      "step": 4700
    },
    {
      "epoch": 0.7858546168958742,
      "grad_norm": 0.9966021776199341,
      "learning_rate": 4.950382677092754e-05,
      "loss": 0.5673,
      "step": 4800
    },
    {
      "epoch": 0.802226588081205,
      "grad_norm": 1.4829602241516113,
      "learning_rate": 4.944558076047649e-05,
      "loss": 0.4976,
      "step": 4900
    },
    {
      "epoch": 0.8185985592665357,
      "grad_norm": 63.20207214355469,
      "learning_rate": 4.9384141210523804e-05,
      "loss": 0.5398,
      "step": 5000
    },
    {
      "epoch": 0.8349705304518664,
      "grad_norm": 9.008106231689453,
      "learning_rate": 4.931951614745395e-05,
      "loss": 0.5906,
      "step": 5100
    },
    {
      "epoch": 0.8513425016371972,
      "grad_norm": 9.714171409606934,
      "learning_rate": 4.925171401380278e-05,
      "loss": 0.4833,
      "step": 5200
    },
    {
      "epoch": 0.8677144728225278,
      "grad_norm": 1.0515024662017822,
      "learning_rate": 4.918074366715457e-05,
      "loss": 0.5046,
      "step": 5300
    },
    {
      "epoch": 0.8840864440078585,
      "grad_norm": 0.32931941747665405,
      "learning_rate": 4.910661437898493e-05,
      "loss": 0.6394,
      "step": 5400
    },
    {
      "epoch": 0.9004584151931893,
      "grad_norm": 19.654884338378906,
      "learning_rate": 4.902933583344954e-05,
      "loss": 0.5572,
      "step": 5500
    },
    {
      "epoch": 0.91683038637852,
      "grad_norm": 6.547713279724121,
      "learning_rate": 4.8948918126119056e-05,
      "loss": 0.5898,
      "step": 5600
    },
    {
      "epoch": 0.9332023575638507,
      "grad_norm": 0.6314940452575684,
      "learning_rate": 4.886537176266024e-05,
      "loss": 0.4681,
      "step": 5700
    },
    {
      "epoch": 0.9495743287491814,
      "grad_norm": 9.13287353515625,
      "learning_rate": 4.877870765746347e-05,
      "loss": 0.4678,
      "step": 5800
    },
    {
      "epoch": 0.9659462999345121,
      "grad_norm": 8.16297721862793,
      "learning_rate": 4.8688937132216966e-05,
      "loss": 0.5657,
      "step": 5900
    },
    {
      "epoch": 0.9823182711198428,
      "grad_norm": 19.567949295043945,
      "learning_rate": 4.859607191442768e-05,
      "loss": 0.5164,
      "step": 6000
    },
    {
      "epoch": 0.9986902423051736,
      "grad_norm": 20.028736114501953,
      "learning_rate": 4.850012413588926e-05,
      "loss": 0.4769,
      "step": 6100
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.8083209509658247,
      "eval_loss": 0.6058866381645203,
      "eval_mcc": 0.6161859428296993,
      "eval_runtime": 5.3649,
      "eval_samples_per_second": 125.446,
      "eval_steps_per_second": 8.015,
      "step": 6108
    }
  ],
  "logging_steps": 100,
  "max_steps": 30540,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}