{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 902,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.06,
      "grad_norm": 2.5633599758148193,
      "learning_rate": 5.494505494505495e-05,
      "loss": 3.1831,
      "step": 25
    },
    {
      "epoch": 0.11,
      "grad_norm": 2.523214340209961,
      "learning_rate": 0.0001098901098901099,
      "loss": 2.9698,
      "step": 50
    },
    {
      "epoch": 0.17,
      "grad_norm": 2.6384470462799072,
      "learning_rate": 0.00016483516483516484,
      "loss": 2.7894,
      "step": 75
    },
    {
      "epoch": 0.22,
      "grad_norm": 2.2789688110351562,
      "learning_rate": 0.00019778051787916153,
      "loss": 2.7214,
      "step": 100
    },
    {
      "epoch": 0.28,
      "grad_norm": 1.6615029573440552,
      "learning_rate": 0.00019161528976572133,
      "loss": 2.7224,
      "step": 125
    },
    {
      "epoch": 0.33,
      "grad_norm": 1.7445863485336304,
      "learning_rate": 0.00018545006165228113,
      "loss": 2.7081,
      "step": 150
    },
    {
      "epoch": 0.39,
      "grad_norm": 1.5059261322021484,
      "learning_rate": 0.00017928483353884094,
      "loss": 2.668,
      "step": 175
    },
    {
      "epoch": 0.44,
      "grad_norm": 1.2297817468643188,
      "learning_rate": 0.00017311960542540076,
      "loss": 2.6149,
      "step": 200
    },
    {
      "epoch": 0.5,
      "grad_norm": 1.3275938034057617,
      "learning_rate": 0.00016695437731196054,
      "loss": 2.591,
      "step": 225
    },
    {
      "epoch": 0.55,
      "grad_norm": 1.3491621017456055,
      "learning_rate": 0.00016078914919852034,
      "loss": 2.583,
      "step": 250
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.9846614003181458,
      "learning_rate": 0.00015462392108508014,
      "loss": 2.5189,
      "step": 275
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.9622399806976318,
      "learning_rate": 0.00014845869297163997,
      "loss": 2.549,
      "step": 300
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.9879335761070251,
      "learning_rate": 0.00014229346485819977,
      "loss": 2.4825,
      "step": 325
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.9367666840553284,
      "learning_rate": 0.00013612823674475957,
      "loss": 2.4431,
      "step": 350
    },
    {
      "epoch": 0.83,
      "grad_norm": 0.8422412276268005,
      "learning_rate": 0.00012996300863131935,
      "loss": 2.399,
      "step": 375
    },
    {
      "epoch": 0.89,
      "grad_norm": 1.3478554487228394,
      "learning_rate": 0.00012379778051787915,
      "loss": 2.4595,
      "step": 400
    },
    {
      "epoch": 0.94,
      "grad_norm": 0.7269034385681152,
      "learning_rate": 0.00011763255240443898,
      "loss": 2.4464,
      "step": 425
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.9614003896713257,
      "learning_rate": 0.00011146732429099878,
      "loss": 2.474,
      "step": 450
    },
    {
      "epoch": 1.05,
      "grad_norm": 0.8424810767173767,
      "learning_rate": 0.00010530209617755857,
      "loss": 2.3071,
      "step": 475
    },
    {
      "epoch": 1.11,
      "grad_norm": 0.7542237639427185,
      "learning_rate": 9.913686806411838e-05,
      "loss": 2.2797,
      "step": 500
    },
    {
      "epoch": 1.16,
      "grad_norm": 1.0208224058151245,
      "learning_rate": 9.297163995067819e-05,
      "loss": 2.2255,
      "step": 525
    },
    {
      "epoch": 1.22,
      "grad_norm": 0.8640455007553101,
      "learning_rate": 8.680641183723797e-05,
      "loss": 2.3139,
      "step": 550
    },
    {
      "epoch": 1.27,
      "grad_norm": 0.8734548687934875,
      "learning_rate": 8.064118372379779e-05,
      "loss": 2.321,
      "step": 575
    },
    {
      "epoch": 1.33,
      "grad_norm": 0.7769907116889954,
      "learning_rate": 7.447595561035759e-05,
      "loss": 2.2748,
      "step": 600
    },
    {
      "epoch": 1.39,
      "grad_norm": 0.7717509865760803,
      "learning_rate": 6.831072749691739e-05,
      "loss": 2.2641,
      "step": 625
    },
    {
      "epoch": 1.44,
      "grad_norm": 0.8536700010299683,
      "learning_rate": 6.214549938347719e-05,
      "loss": 2.2658,
      "step": 650
    },
    {
      "epoch": 1.5,
      "grad_norm": 0.8191765546798706,
      "learning_rate": 5.5980271270037e-05,
      "loss": 2.2729,
      "step": 675
    },
    {
      "epoch": 1.55,
      "grad_norm": 1.0077537298202515,
      "learning_rate": 4.9815043156596796e-05,
      "loss": 2.2726,
      "step": 700
    },
    {
      "epoch": 1.61,
      "grad_norm": 0.8226682543754578,
      "learning_rate": 4.36498150431566e-05,
      "loss": 2.2624,
      "step": 725
    },
    {
      "epoch": 1.66,
      "grad_norm": 0.8827477693557739,
      "learning_rate": 3.7484586929716406e-05,
      "loss": 2.1979,
      "step": 750
    },
    {
      "epoch": 1.72,
      "grad_norm": 0.8012568354606628,
      "learning_rate": 3.131935881627621e-05,
      "loss": 2.2563,
      "step": 775
    },
    {
      "epoch": 1.77,
      "grad_norm": 0.8210929036140442,
      "learning_rate": 2.5154130702836005e-05,
      "loss": 2.2923,
      "step": 800
    },
    {
      "epoch": 1.83,
      "grad_norm": 0.8134270906448364,
      "learning_rate": 1.8988902589395807e-05,
      "loss": 2.2719,
      "step": 825
    },
    {
      "epoch": 1.88,
      "grad_norm": 0.8563103079795837,
      "learning_rate": 1.282367447595561e-05,
      "loss": 2.2821,
      "step": 850
    },
    {
      "epoch": 1.94,
      "grad_norm": 0.8670592308044434,
      "learning_rate": 6.6584463625154135e-06,
      "loss": 2.2923,
      "step": 875
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.7817544341087341,
      "learning_rate": 4.932182490752158e-07,
      "loss": 2.2142,
      "step": 900
    }
  ],
  "logging_steps": 25,
  "max_steps": 902,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "total_flos": 5650178899968000.0,
  "train_batch_size": 12,
  "trial_name": null,
  "trial_params": null
}