| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.9942004971002486, |
| "eval_steps": 500, |
| "global_step": 1200, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.02, |
| "grad_norm": 0.7059583067893982, |
| "learning_rate": 0.0002, |
| "loss": 1.1809, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 0.6619198322296143, |
| "learning_rate": 0.0002, |
| "loss": 0.5511, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 0.37209564447402954, |
| "learning_rate": 0.0002, |
| "loss": 0.833, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 0.5066412091255188, |
| "learning_rate": 0.0002, |
| "loss": 0.497, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 0.5964956283569336, |
| "learning_rate": 0.0002, |
| "loss": 0.77, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 0.6351748108863831, |
| "learning_rate": 0.0002, |
| "loss": 0.4792, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 0.33565786480903625, |
| "learning_rate": 0.0002, |
| "loss": 0.6867, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 0.5637449622154236, |
| "learning_rate": 0.0002, |
| "loss": 0.4725, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 0.29314109683036804, |
| "learning_rate": 0.0002, |
| "loss": 0.6842, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.21, |
| "grad_norm": 0.5340172052383423, |
| "learning_rate": 0.0002, |
| "loss": 0.4269, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.23, |
| "grad_norm": 0.38599228858947754, |
| "learning_rate": 0.0002, |
| "loss": 0.6939, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 0.5987310409545898, |
| "learning_rate": 0.0002, |
| "loss": 0.4435, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.27, |
| "grad_norm": 0.3571007549762726, |
| "learning_rate": 0.0002, |
| "loss": 0.6976, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.29, |
| "grad_norm": 0.5330923199653625, |
| "learning_rate": 0.0002, |
| "loss": 0.415, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.31, |
| "grad_norm": 0.4177188277244568, |
| "learning_rate": 0.0002, |
| "loss": 0.6663, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.33, |
| "grad_norm": 0.5220038890838623, |
| "learning_rate": 0.0002, |
| "loss": 0.4305, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.35, |
| "grad_norm": 0.40320834517478943, |
| "learning_rate": 0.0002, |
| "loss": 0.6978, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.37, |
| "grad_norm": 0.3986828625202179, |
| "learning_rate": 0.0002, |
| "loss": 0.392, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.39, |
| "grad_norm": 0.3662169873714447, |
| "learning_rate": 0.0002, |
| "loss": 0.6647, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.41, |
| "grad_norm": 0.4708826541900635, |
| "learning_rate": 0.0002, |
| "loss": 0.4119, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.43, |
| "grad_norm": 0.3558133840560913, |
| "learning_rate": 0.0002, |
| "loss": 0.6364, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.46, |
| "grad_norm": 0.5110255479812622, |
| "learning_rate": 0.0002, |
| "loss": 0.3949, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 0.4202500581741333, |
| "learning_rate": 0.0002, |
| "loss": 0.6034, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 0.7778056263923645, |
| "learning_rate": 0.0002, |
| "loss": 0.389, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.52, |
| "grad_norm": 0.42499253153800964, |
| "learning_rate": 0.0002, |
| "loss": 0.6453, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.54, |
| "grad_norm": 0.4130938947200775, |
| "learning_rate": 0.0002, |
| "loss": 0.3776, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.56, |
| "grad_norm": 0.38256368041038513, |
| "learning_rate": 0.0002, |
| "loss": 0.5631, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.58, |
| "grad_norm": 0.6242409944534302, |
| "learning_rate": 0.0002, |
| "loss": 0.3771, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 0.3617693781852722, |
| "learning_rate": 0.0002, |
| "loss": 0.5804, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.62, |
| "grad_norm": 0.3799505829811096, |
| "learning_rate": 0.0002, |
| "loss": 0.3513, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.64, |
| "grad_norm": 0.32906627655029297, |
| "learning_rate": 0.0002, |
| "loss": 0.5393, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.66, |
| "grad_norm": 0.4239194989204407, |
| "learning_rate": 0.0002, |
| "loss": 0.3636, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.68, |
| "grad_norm": 0.4123442769050598, |
| "learning_rate": 0.0002, |
| "loss": 0.5279, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.7, |
| "grad_norm": 0.4623079001903534, |
| "learning_rate": 0.0002, |
| "loss": 0.3821, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 0.3831407427787781, |
| "learning_rate": 0.0002, |
| "loss": 0.5286, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 0.48481571674346924, |
| "learning_rate": 0.0002, |
| "loss": 0.3588, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.77, |
| "grad_norm": 0.36328190565109253, |
| "learning_rate": 0.0002, |
| "loss": 0.5519, |
| "step": 925 |
| }, |
| { |
| "epoch": 0.79, |
| "grad_norm": 0.4488459527492523, |
| "learning_rate": 0.0002, |
| "loss": 0.3833, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.81, |
| "grad_norm": 0.39116260409355164, |
| "learning_rate": 0.0002, |
| "loss": 0.5035, |
| "step": 975 |
| }, |
| { |
| "epoch": 0.83, |
| "grad_norm": 0.4923652410507202, |
| "learning_rate": 0.0002, |
| "loss": 0.3178, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.85, |
| "grad_norm": 0.38706672191619873, |
| "learning_rate": 0.0002, |
| "loss": 0.5357, |
| "step": 1025 |
| }, |
| { |
| "epoch": 0.87, |
| "grad_norm": 0.3919411897659302, |
| "learning_rate": 0.0002, |
| "loss": 0.3329, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.89, |
| "grad_norm": 0.44446566700935364, |
| "learning_rate": 0.0002, |
| "loss": 0.4754, |
| "step": 1075 |
| }, |
| { |
| "epoch": 0.91, |
| "grad_norm": 0.3222676217556, |
| "learning_rate": 0.0002, |
| "loss": 0.3128, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.93, |
| "grad_norm": 0.3832922577857971, |
| "learning_rate": 0.0002, |
| "loss": 0.5655, |
| "step": 1125 |
| }, |
| { |
| "epoch": 0.95, |
| "grad_norm": 0.5052505135536194, |
| "learning_rate": 0.0002, |
| "loss": 0.3673, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.97, |
| "grad_norm": 0.3186209797859192, |
| "learning_rate": 0.0002, |
| "loss": 0.4993, |
| "step": 1175 |
| }, |
| { |
| "epoch": 0.99, |
| "grad_norm": 0.5416242480278015, |
| "learning_rate": 0.0002, |
| "loss": 0.3399, |
| "step": 1200 |
| } |
| ], |
| "logging_steps": 25, |
| "max_steps": 1207, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 25, |
| "total_flos": 6.735879205561958e+16, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|