{
  "best_metric": 0.8554500158408583,
  "best_model_checkpoint": "cn_output/run-0/checkpoint-5775",
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 5775,
  "is_hyper_param_search": true,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.05,
      "grad_norm": 43.52214813232422,
      "learning_rate": 2.7348179693000015e-05,
      "loss": 1.6704,
      "step": 100
    },
    {
      "epoch": 0.1,
      "grad_norm": 16.620838165283203,
      "learning_rate": 2.6866273442903096e-05,
      "loss": 1.3854,
      "step": 200
    },
    {
      "epoch": 0.16,
      "grad_norm": 14.869579315185547,
      "learning_rate": 2.638436719280618e-05,
      "loss": 1.2394,
      "step": 300
    },
    {
      "epoch": 0.21,
      "grad_norm": 16.870759963989258,
      "learning_rate": 2.5902460942709264e-05,
      "loss": 1.1392,
      "step": 400
    },
    {
      "epoch": 0.26,
      "grad_norm": 12.891343116760254,
      "learning_rate": 2.542055469261235e-05,
      "loss": 1.0319,
      "step": 500
    },
    {
      "epoch": 0.31,
      "grad_norm": 7.518686771392822,
      "learning_rate": 2.493864844251543e-05,
      "loss": 1.0726,
      "step": 600
    },
    {
      "epoch": 0.36,
      "grad_norm": 9.59931755065918,
      "learning_rate": 2.4456742192418514e-05,
      "loss": 1.0907,
      "step": 700
    },
    {
      "epoch": 0.42,
      "grad_norm": 15.09188461303711,
      "learning_rate": 2.3974835942321598e-05,
      "loss": 1.038,
      "step": 800
    },
    {
      "epoch": 0.47,
      "grad_norm": 12.715774536132812,
      "learning_rate": 2.3492929692224682e-05,
      "loss": 0.9121,
      "step": 900
    },
    {
      "epoch": 0.52,
      "grad_norm": 23.419095993041992,
      "learning_rate": 2.3011023442127766e-05,
      "loss": 0.8541,
      "step": 1000
    },
    {
      "epoch": 0.57,
      "grad_norm": 24.277725219726562,
      "learning_rate": 2.252911719203085e-05,
      "loss": 0.8723,
      "step": 1100
    },
    {
      "epoch": 0.62,
      "grad_norm": 22.929729461669922,
      "learning_rate": 2.2047210941933934e-05,
      "loss": 0.8625,
      "step": 1200
    },
    {
      "epoch": 0.68,
      "grad_norm": 12.207918167114258,
      "learning_rate": 2.1565304691837015e-05,
      "loss": 0.8384,
      "step": 1300
    },
    {
      "epoch": 0.73,
      "grad_norm": 26.28716278076172,
      "learning_rate": 2.10833984417401e-05,
      "loss": 0.8521,
      "step": 1400
    },
    {
      "epoch": 0.78,
      "grad_norm": 9.733681678771973,
      "learning_rate": 2.0601492191643184e-05,
      "loss": 0.7984,
      "step": 1500
    },
    {
      "epoch": 0.83,
      "grad_norm": 25.5473690032959,
      "learning_rate": 2.0119585941546264e-05,
      "loss": 0.7452,
      "step": 1600
    },
    {
      "epoch": 0.88,
      "grad_norm": 28.462318420410156,
      "learning_rate": 1.963767969144935e-05,
      "loss": 0.7016,
      "step": 1700
    },
    {
      "epoch": 0.94,
      "grad_norm": 30.851057052612305,
      "learning_rate": 1.9155773441352433e-05,
      "loss": 0.6933,
      "step": 1800
    },
    {
      "epoch": 0.99,
      "grad_norm": 20.66396141052246,
      "learning_rate": 1.8673867191255517e-05,
      "loss": 0.6987,
      "step": 1900
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.7933753943217665,
      "eval_f1": 0.7855091479380132,
      "eval_loss": 0.635335385799408,
      "eval_runtime": 15.4807,
      "eval_samples_per_second": 122.863,
      "eval_steps_per_second": 3.876,
      "step": 1925
    },
    {
      "epoch": 1.04,
      "grad_norm": 18.86337661743164,
      "learning_rate": 1.8191960941158598e-05,
      "loss": 0.5131,
      "step": 2000
    },
    {
      "epoch": 1.09,
      "grad_norm": 23.430830001831055,
      "learning_rate": 1.7710054691061682e-05,
      "loss": 0.4268,
      "step": 2100
    },
    {
      "epoch": 1.14,
      "grad_norm": 11.48133373260498,
      "learning_rate": 1.7228148440964766e-05,
      "loss": 0.4388,
      "step": 2200
    },
    {
      "epoch": 1.19,
      "grad_norm": 21.7901668548584,
      "learning_rate": 1.674624219086785e-05,
      "loss": 0.4276,
      "step": 2300
    },
    {
      "epoch": 1.25,
      "grad_norm": 3.37796688079834,
      "learning_rate": 1.6264335940770934e-05,
      "loss": 0.3975,
      "step": 2400
    },
    {
      "epoch": 1.3,
      "grad_norm": 1.8390240669250488,
      "learning_rate": 1.578242969067402e-05,
      "loss": 0.4863,
      "step": 2500
    },
    {
      "epoch": 1.35,
      "grad_norm": 0.9483298063278198,
      "learning_rate": 1.5300523440577103e-05,
      "loss": 0.4216,
      "step": 2600
    },
    {
      "epoch": 1.4,
      "grad_norm": 13.241854667663574,
      "learning_rate": 1.4818617190480184e-05,
      "loss": 0.4522,
      "step": 2700
    },
    {
      "epoch": 1.45,
      "grad_norm": 13.804593086242676,
      "learning_rate": 1.4336710940383268e-05,
      "loss": 0.3998,
      "step": 2800
    },
    {
      "epoch": 1.51,
      "grad_norm": 20.864044189453125,
      "learning_rate": 1.3854804690286352e-05,
      "loss": 0.3561,
      "step": 2900
    },
    {
      "epoch": 1.56,
      "grad_norm": 11.546530723571777,
      "learning_rate": 1.3372898440189434e-05,
      "loss": 0.4525,
      "step": 3000
    },
    {
      "epoch": 1.61,
      "grad_norm": 21.35649871826172,
      "learning_rate": 1.2890992190092519e-05,
      "loss": 0.422,
      "step": 3100
    },
    {
      "epoch": 1.66,
      "grad_norm": 9.798705101013184,
      "learning_rate": 1.2409085939995601e-05,
      "loss": 0.3685,
      "step": 3200
    },
    {
      "epoch": 1.71,
      "grad_norm": 1.4076740741729736,
      "learning_rate": 1.1927179689898684e-05,
      "loss": 0.3969,
      "step": 3300
    },
    {
      "epoch": 1.77,
      "grad_norm": 2.5313684940338135,
      "learning_rate": 1.1445273439801768e-05,
      "loss": 0.3632,
      "step": 3400
    },
    {
      "epoch": 1.82,
      "grad_norm": 4.284488677978516,
      "learning_rate": 1.0963367189704852e-05,
      "loss": 0.3758,
      "step": 3500
    },
    {
      "epoch": 1.87,
      "grad_norm": 87.45575714111328,
      "learning_rate": 1.0481460939607936e-05,
      "loss": 0.3407,
      "step": 3600
    },
    {
      "epoch": 1.92,
      "grad_norm": 18.49854850769043,
      "learning_rate": 9.999554689511019e-06,
      "loss": 0.4158,
      "step": 3700
    },
    {
      "epoch": 1.97,
      "grad_norm": 10.284146308898926,
      "learning_rate": 9.517648439414103e-06,
      "loss": 0.3669,
      "step": 3800
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.8496319663512093,
      "eval_f1": 0.8396298133039037,
      "eval_loss": 0.6180713176727295,
      "eval_runtime": 15.5038,
      "eval_samples_per_second": 122.679,
      "eval_steps_per_second": 3.87,
      "step": 3850
    },
    {
      "epoch": 2.03,
      "grad_norm": 2.4232983589172363,
      "learning_rate": 9.035742189317185e-06,
      "loss": 0.2593,
      "step": 3900
    },
    {
      "epoch": 2.08,
      "grad_norm": 14.904269218444824,
      "learning_rate": 8.553835939220268e-06,
      "loss": 0.1672,
      "step": 4000
    },
    {
      "epoch": 2.13,
      "grad_norm": 0.02660948596894741,
      "learning_rate": 8.071929689123352e-06,
      "loss": 0.1319,
      "step": 4100
    },
    {
      "epoch": 2.18,
      "grad_norm": 0.1543588936328888,
      "learning_rate": 7.590023439026435e-06,
      "loss": 0.1873,
      "step": 4200
    },
    {
      "epoch": 2.23,
      "grad_norm": 0.04116074740886688,
      "learning_rate": 7.108117188929519e-06,
      "loss": 0.1706,
      "step": 4300
    },
    {
      "epoch": 2.29,
      "grad_norm": 21.272350311279297,
      "learning_rate": 6.626210938832602e-06,
      "loss": 0.2408,
      "step": 4400
    },
    {
      "epoch": 2.34,
      "grad_norm": 0.016322173178195953,
      "learning_rate": 6.144304688735686e-06,
      "loss": 0.1878,
      "step": 4500
    },
    {
      "epoch": 2.39,
      "grad_norm": 0.12828181684017181,
      "learning_rate": 5.662398438638769e-06,
      "loss": 0.1664,
      "step": 4600
    },
    {
      "epoch": 2.44,
      "grad_norm": 20.833871841430664,
      "learning_rate": 5.180492188541853e-06,
      "loss": 0.1066,
      "step": 4700
    },
    {
      "epoch": 2.49,
      "grad_norm": 0.006888058967888355,
      "learning_rate": 4.698585938444936e-06,
      "loss": 0.1298,
      "step": 4800
    },
    {
      "epoch": 2.55,
      "grad_norm": 0.0369136743247509,
      "learning_rate": 4.21667968834802e-06,
      "loss": 0.1151,
      "step": 4900
    },
    {
      "epoch": 2.6,
      "grad_norm": 0.13510233163833618,
      "learning_rate": 3.7347734382511036e-06,
      "loss": 0.173,
      "step": 5000
    },
    {
      "epoch": 2.65,
      "grad_norm": 0.05811930075287819,
      "learning_rate": 3.252867188154187e-06,
      "loss": 0.1359,
      "step": 5100
    },
    {
      "epoch": 2.7,
      "grad_norm": 0.07640138268470764,
      "learning_rate": 2.7709609380572702e-06,
      "loss": 0.1293,
      "step": 5200
    },
    {
      "epoch": 2.75,
      "grad_norm": 0.019262025132775307,
      "learning_rate": 2.289054687960354e-06,
      "loss": 0.1205,
      "step": 5300
    },
    {
      "epoch": 2.81,
      "grad_norm": 16.199689865112305,
      "learning_rate": 1.8071484378634369e-06,
      "loss": 0.1713,
      "step": 5400
    },
    {
      "epoch": 2.86,
      "grad_norm": 0.010484320111572742,
      "learning_rate": 1.3252421877665204e-06,
      "loss": 0.145,
      "step": 5500
    },
    {
      "epoch": 2.91,
      "grad_norm": 0.06876770406961441,
      "learning_rate": 8.43335937669604e-07,
      "loss": 0.1405,
      "step": 5600
    },
    {
      "epoch": 2.96,
      "grad_norm": 0.3863673210144043,
      "learning_rate": 3.6142968757268745e-07,
      "loss": 0.1295,
      "step": 5700
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.8654048370136698,
      "eval_f1": 0.8554500158408583,
      "eval_loss": 0.7277432680130005,
      "eval_runtime": 15.4912,
      "eval_samples_per_second": 122.779,
      "eval_steps_per_second": 3.873,
      "step": 5775
    }
  ],
  "logging_steps": 100,
  "max_steps": 5775,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "total_flos": 4219575531135264.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": {
    "learning_rate": 2.783008594309693e-05,
    "lr_scheduler_type": "linear",
    "num_train_epochs": 3,
    "per_device_train_batch_size": 8
  }
}