{ "best_global_step": 6108, "best_metric": 0.6058866381645203, "best_model_checkpoint": "mamba_nli_ensemble/checkpoint-6108", "epoch": 1.0, "eval_steps": 500, "global_step": 6108, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016371971185330715, "grad_norm": 15.273209571838379, "learning_rate": 1.6371971185330716e-06, "loss": 0.7553, "step": 100 }, { "epoch": 0.03274394237066143, "grad_norm": 17.171979904174805, "learning_rate": 3.2743942370661432e-06, "loss": 0.7151, "step": 200 }, { "epoch": 0.04911591355599214, "grad_norm": 5.574963569641113, "learning_rate": 4.911591355599214e-06, "loss": 0.7065, "step": 300 }, { "epoch": 0.06548788474132286, "grad_norm": 7.3369059562683105, "learning_rate": 6.5487884741322864e-06, "loss": 0.6996, "step": 400 }, { "epoch": 0.08185985592665357, "grad_norm": 7.506777286529541, "learning_rate": 8.185985592665357e-06, "loss": 0.7071, "step": 500 }, { "epoch": 0.09823182711198428, "grad_norm": 6.177253723144531, "learning_rate": 9.823182711198428e-06, "loss": 0.6926, "step": 600 }, { "epoch": 0.114603798297315, "grad_norm": 5.600251197814941, "learning_rate": 1.14603798297315e-05, "loss": 0.7145, "step": 700 }, { "epoch": 0.13097576948264572, "grad_norm": 2.5524227619171143, "learning_rate": 1.3097576948264573e-05, "loss": 0.6919, "step": 800 }, { "epoch": 0.14734774066797643, "grad_norm": 8.479023933410645, "learning_rate": 1.4734774066797644e-05, "loss": 0.6819, "step": 900 }, { "epoch": 0.16371971185330714, "grad_norm": 5.893022060394287, "learning_rate": 1.6371971185330713e-05, "loss": 0.6944, "step": 1000 }, { "epoch": 0.18009168303863785, "grad_norm": 4.31400728225708, "learning_rate": 1.8009168303863786e-05, "loss": 0.6787, "step": 1100 }, { "epoch": 0.19646365422396855, "grad_norm": 1.9604185819625854, "learning_rate": 1.9646365422396855e-05, "loss": 0.6804, "step": 1200 }, { "epoch": 0.2128356254092993, "grad_norm": 14.416400909423828, "learning_rate": 2.128356254092993e-05, "loss": 0.6979, "step": 1300 }, { "epoch": 0.22920759659463, "grad_norm": 2.0943357944488525, "learning_rate": 2.2920759659463e-05, "loss": 0.6689, "step": 1400 }, { "epoch": 0.2455795677799607, "grad_norm": 4.136998176574707, "learning_rate": 2.4557956777996073e-05, "loss": 0.6694, "step": 1500 }, { "epoch": 0.26195153896529144, "grad_norm": 3.5071325302124023, "learning_rate": 2.6195153896529146e-05, "loss": 0.6168, "step": 1600 }, { "epoch": 0.2783235101506221, "grad_norm": 7.638752460479736, "learning_rate": 2.7832351015062215e-05, "loss": 0.6237, "step": 1700 }, { "epoch": 0.29469548133595286, "grad_norm": 1.9127601385116577, "learning_rate": 2.9469548133595288e-05, "loss": 0.6163, "step": 1800 }, { "epoch": 0.31106745252128354, "grad_norm": 66.4927749633789, "learning_rate": 3.110674525212836e-05, "loss": 0.6006, "step": 1900 }, { "epoch": 0.3274394237066143, "grad_norm": 52.92075729370117, "learning_rate": 3.2743942370661426e-05, "loss": 0.6598, "step": 2000 }, { "epoch": 0.343811394891945, "grad_norm": 11.339043617248535, "learning_rate": 3.43811394891945e-05, "loss": 0.5987, "step": 2100 }, { "epoch": 0.3601833660772757, "grad_norm": 28.995885848999023, "learning_rate": 3.601833660772757e-05, "loss": 0.6509, "step": 2200 }, { "epoch": 0.3765553372626064, "grad_norm": 23.708646774291992, "learning_rate": 3.765553372626065e-05, "loss": 0.6729, "step": 2300 }, { "epoch": 0.3929273084479371, "grad_norm": 3.438246726989746, "learning_rate": 3.929273084479371e-05, "loss": 0.5537, "step": 2400 }, { "epoch": 0.40929927963326784, "grad_norm": 10.562445640563965, "learning_rate": 4.0929927963326786e-05, "loss": 0.6228, "step": 2500 }, { "epoch": 0.4256712508185986, "grad_norm": 9.508832931518555, "learning_rate": 4.256712508185986e-05, "loss": 0.5776, "step": 2600 }, { "epoch": 0.44204322200392926, "grad_norm": 12.658103942871094, "learning_rate": 4.4204322200392925e-05, "loss": 0.5455, "step": 2700 }, { "epoch": 0.45841519318926, "grad_norm": 8.46078109741211, "learning_rate": 4.5841519318926e-05, "loss": 0.5583, "step": 2800 }, { "epoch": 0.4747871643745907, "grad_norm": 5.642892360687256, "learning_rate": 4.747871643745907e-05, "loss": 0.5556, "step": 2900 }, { "epoch": 0.4911591355599214, "grad_norm": 3.8212382793426514, "learning_rate": 4.9115913555992146e-05, "loss": 0.5552, "step": 3000 }, { "epoch": 0.5075311067452521, "grad_norm": 8.145768165588379, "learning_rate": 4.999965445760666e-05, "loss": 0.5488, "step": 3100 }, { "epoch": 0.5239030779305829, "grad_norm": 12.39121150970459, "learning_rate": 4.999651917405523e-05, "loss": 0.5595, "step": 3200 }, { "epoch": 0.5402750491159135, "grad_norm": 6.998423099517822, "learning_rate": 4.999011837711028e-05, "loss": 0.5111, "step": 3300 }, { "epoch": 0.5566470203012442, "grad_norm": 31.633630752563477, "learning_rate": 4.998045290296376e-05, "loss": 0.553, "step": 3400 }, { "epoch": 0.573018991486575, "grad_norm": 56.126251220703125, "learning_rate": 4.9967524014300896e-05, "loss": 0.5713, "step": 3500 }, { "epoch": 0.5893909626719057, "grad_norm": 6.04685640335083, "learning_rate": 4.995133340013522e-05, "loss": 0.526, "step": 3600 }, { "epoch": 0.6057629338572365, "grad_norm": 9.806577682495117, "learning_rate": 4.993188317558791e-05, "loss": 0.6185, "step": 3700 }, { "epoch": 0.6221349050425671, "grad_norm": 3.9068918228149414, "learning_rate": 4.9909175881611514e-05, "loss": 0.5086, "step": 3800 }, { "epoch": 0.6385068762278978, "grad_norm": 19.12666130065918, "learning_rate": 4.9883214484657957e-05, "loss": 0.515, "step": 3900 }, { "epoch": 0.6548788474132285, "grad_norm": 6.140756607055664, "learning_rate": 4.9854002376291046e-05, "loss": 0.5581, "step": 4000 }, { "epoch": 0.6712508185985593, "grad_norm": 12.521078109741211, "learning_rate": 4.9821543372743355e-05, "loss": 0.5192, "step": 4100 }, { "epoch": 0.68762278978389, "grad_norm": 18.783933639526367, "learning_rate": 4.9785841714417734e-05, "loss": 0.5334, "step": 4200 }, { "epoch": 0.7039947609692206, "grad_norm": 7.139877796173096, "learning_rate": 4.97469020653333e-05, "loss": 0.5334, "step": 4300 }, { "epoch": 0.7203667321545514, "grad_norm": 7.0137834548950195, "learning_rate": 4.970472951251617e-05, "loss": 0.5019, "step": 4400 }, { "epoch": 0.7367387033398821, "grad_norm": 51.292449951171875, "learning_rate": 4.9659329565334854e-05, "loss": 0.4813, "step": 4500 }, { "epoch": 0.7531106745252129, "grad_norm": 7.053626537322998, "learning_rate": 4.9610708154780585e-05, "loss": 0.6834, "step": 4600 }, { "epoch": 0.7694826457105436, "grad_norm": 0.5209086537361145, "learning_rate": 4.955887163269243e-05, "loss": 0.4802, "step": 4700 }, { "epoch": 0.7858546168958742, "grad_norm": 0.9966021776199341, "learning_rate": 4.950382677092754e-05, "loss": 0.5673, "step": 4800 }, { "epoch": 0.802226588081205, "grad_norm": 1.4829602241516113, "learning_rate": 4.944558076047649e-05, "loss": 0.4976, "step": 4900 }, { "epoch": 0.8185985592665357, "grad_norm": 63.20207214355469, "learning_rate": 4.9384141210523804e-05, "loss": 0.5398, "step": 5000 }, { "epoch": 0.8349705304518664, "grad_norm": 9.008106231689453, "learning_rate": 4.931951614745395e-05, "loss": 0.5906, "step": 5100 }, { "epoch": 0.8513425016371972, "grad_norm": 9.714171409606934, "learning_rate": 4.925171401380278e-05, "loss": 0.4833, "step": 5200 }, { "epoch": 0.8677144728225278, "grad_norm": 1.0515024662017822, "learning_rate": 4.918074366715457e-05, "loss": 0.5046, "step": 5300 }, { "epoch": 0.8840864440078585, "grad_norm": 0.32931941747665405, "learning_rate": 4.910661437898493e-05, "loss": 0.6394, "step": 5400 }, { "epoch": 0.9004584151931893, "grad_norm": 19.654884338378906, "learning_rate": 4.902933583344954e-05, "loss": 0.5572, "step": 5500 }, { "epoch": 0.91683038637852, "grad_norm": 6.547713279724121, "learning_rate": 4.8948918126119056e-05, "loss": 0.5898, "step": 5600 }, { "epoch": 0.9332023575638507, "grad_norm": 0.6314940452575684, "learning_rate": 4.886537176266024e-05, "loss": 0.4681, "step": 5700 }, { "epoch": 0.9495743287491814, "grad_norm": 9.13287353515625, "learning_rate": 4.877870765746347e-05, "loss": 0.4678, "step": 5800 }, { "epoch": 0.9659462999345121, "grad_norm": 8.16297721862793, "learning_rate": 4.8688937132216966e-05, "loss": 0.5657, "step": 5900 }, { "epoch": 0.9823182711198428, "grad_norm": 19.567949295043945, "learning_rate": 4.859607191442768e-05, "loss": 0.5164, "step": 6000 }, { "epoch": 0.9986902423051736, "grad_norm": 20.028736114501953, "learning_rate": 4.850012413588926e-05, "loss": 0.4769, "step": 6100 }, { "epoch": 1.0, "eval_accuracy": 0.8083209509658247, "eval_loss": 0.6058866381645203, "eval_mcc": 0.6161859428296993, "eval_runtime": 5.3649, "eval_samples_per_second": 125.446, "eval_steps_per_second": 8.015, "step": 6108 } ], "logging_steps": 100, "max_steps": 30540, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }