{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9942004971002486,
  "eval_steps": 500,
  "global_step": 1200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02,
      "grad_norm": 0.7059583067893982,
      "learning_rate": 0.0002,
      "loss": 1.1809,
      "step": 25
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.6619198322296143,
      "learning_rate": 0.0002,
      "loss": 0.5511,
      "step": 50
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.37209564447402954,
      "learning_rate": 0.0002,
      "loss": 0.833,
      "step": 75
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.5066412091255188,
      "learning_rate": 0.0002,
      "loss": 0.497,
      "step": 100
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.5964956283569336,
      "learning_rate": 0.0002,
      "loss": 0.77,
      "step": 125
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.6351748108863831,
      "learning_rate": 0.0002,
      "loss": 0.4792,
      "step": 150
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.33565786480903625,
      "learning_rate": 0.0002,
      "loss": 0.6867,
      "step": 175
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.5637449622154236,
      "learning_rate": 0.0002,
      "loss": 0.4725,
      "step": 200
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.29314109683036804,
      "learning_rate": 0.0002,
      "loss": 0.6842,
      "step": 225
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.5340172052383423,
      "learning_rate": 0.0002,
      "loss": 0.4269,
      "step": 250
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.38599228858947754,
      "learning_rate": 0.0002,
      "loss": 0.6939,
      "step": 275
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.5987310409545898,
      "learning_rate": 0.0002,
      "loss": 0.4435,
      "step": 300
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.3571007549762726,
      "learning_rate": 0.0002,
      "loss": 0.6976,
      "step": 325
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.5330923199653625,
      "learning_rate": 0.0002,
      "loss": 0.415,
      "step": 350
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.4177188277244568,
      "learning_rate": 0.0002,
      "loss": 0.6663,
      "step": 375
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.5220038890838623,
      "learning_rate": 0.0002,
      "loss": 0.4305,
      "step": 400
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.40320834517478943,
      "learning_rate": 0.0002,
      "loss": 0.6978,
      "step": 425
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.3986828625202179,
      "learning_rate": 0.0002,
      "loss": 0.392,
      "step": 450
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.3662169873714447,
      "learning_rate": 0.0002,
      "loss": 0.6647,
      "step": 475
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.4708826541900635,
      "learning_rate": 0.0002,
      "loss": 0.4119,
      "step": 500
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.3558133840560913,
      "learning_rate": 0.0002,
      "loss": 0.6364,
      "step": 525
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.5110255479812622,
      "learning_rate": 0.0002,
      "loss": 0.3949,
      "step": 550
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.4202500581741333,
      "learning_rate": 0.0002,
      "loss": 0.6034,
      "step": 575
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.7778056263923645,
      "learning_rate": 0.0002,
      "loss": 0.389,
      "step": 600
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.42499253153800964,
      "learning_rate": 0.0002,
      "loss": 0.6453,
      "step": 625
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.4130938947200775,
      "learning_rate": 0.0002,
      "loss": 0.3776,
      "step": 650
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.38256368041038513,
      "learning_rate": 0.0002,
      "loss": 0.5631,
      "step": 675
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.6242409944534302,
      "learning_rate": 0.0002,
      "loss": 0.3771,
      "step": 700
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.3617693781852722,
      "learning_rate": 0.0002,
      "loss": 0.5804,
      "step": 725
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.3799505829811096,
      "learning_rate": 0.0002,
      "loss": 0.3513,
      "step": 750
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.32906627655029297,
      "learning_rate": 0.0002,
      "loss": 0.5393,
      "step": 775
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.4239194989204407,
      "learning_rate": 0.0002,
      "loss": 0.3636,
      "step": 800
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.4123442769050598,
      "learning_rate": 0.0002,
      "loss": 0.5279,
      "step": 825
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.4623079001903534,
      "learning_rate": 0.0002,
      "loss": 0.3821,
      "step": 850
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.3831407427787781,
      "learning_rate": 0.0002,
      "loss": 0.5286,
      "step": 875
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.48481571674346924,
      "learning_rate": 0.0002,
      "loss": 0.3588,
      "step": 900
    },
    {
      "epoch": 0.77,
      "grad_norm": 0.36328190565109253,
      "learning_rate": 0.0002,
      "loss": 0.5519,
      "step": 925
    },
    {
      "epoch": 0.79,
      "grad_norm": 0.4488459527492523,
      "learning_rate": 0.0002,
      "loss": 0.3833,
      "step": 950
    },
    {
      "epoch": 0.81,
      "grad_norm": 0.39116260409355164,
      "learning_rate": 0.0002,
      "loss": 0.5035,
      "step": 975
    },
    {
      "epoch": 0.83,
      "grad_norm": 0.4923652410507202,
      "learning_rate": 0.0002,
      "loss": 0.3178,
      "step": 1000
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.38706672191619873,
      "learning_rate": 0.0002,
      "loss": 0.5357,
      "step": 1025
    },
    {
      "epoch": 0.87,
      "grad_norm": 0.3919411897659302,
      "learning_rate": 0.0002,
      "loss": 0.3329,
      "step": 1050
    },
    {
      "epoch": 0.89,
      "grad_norm": 0.44446566700935364,
      "learning_rate": 0.0002,
      "loss": 0.4754,
      "step": 1075
    },
    {
      "epoch": 0.91,
      "grad_norm": 0.3222676217556,
      "learning_rate": 0.0002,
      "loss": 0.3128,
      "step": 1100
    },
    {
      "epoch": 0.93,
      "grad_norm": 0.3832922577857971,
      "learning_rate": 0.0002,
      "loss": 0.5655,
      "step": 1125
    },
    {
      "epoch": 0.95,
      "grad_norm": 0.5052505135536194,
      "learning_rate": 0.0002,
      "loss": 0.3673,
      "step": 1150
    },
    {
      "epoch": 0.97,
      "grad_norm": 0.3186209797859192,
      "learning_rate": 0.0002,
      "loss": 0.4993,
      "step": 1175
    },
    {
      "epoch": 0.99,
      "grad_norm": 0.5416242480278015,
      "learning_rate": 0.0002,
      "loss": 0.3399,
      "step": 1200
    }
  ],
  "logging_steps": 25,
  "max_steps": 1207,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 25,
  "total_flos": 6.735879205561958e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}