{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 200,
  "global_step": 530,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03784295175023652,
      "grad_norm": 12.928278923034668,
      "learning_rate": 1.6981132075471698e-06,
      "loss": 1.4503,
      "step": 10
    },
    {
      "epoch": 0.07568590350047304,
      "grad_norm": 4.828810214996338,
      "learning_rate": 3.5849056603773586e-06,
      "loss": 0.8761,
      "step": 20
    },
    {
      "epoch": 0.11352885525070956,
      "grad_norm": 3.0466108322143555,
      "learning_rate": 5.4716981132075475e-06,
      "loss": 0.5782,
      "step": 30
    },
    {
      "epoch": 0.15137180700094607,
      "grad_norm": 2.7793734073638916,
      "learning_rate": 7.358490566037736e-06,
      "loss": 0.4365,
      "step": 40
    },
    {
      "epoch": 0.1892147587511826,
      "grad_norm": 2.6989638805389404,
      "learning_rate": 9.245283018867926e-06,
      "loss": 0.3779,
      "step": 50
    },
    {
      "epoch": 0.22705771050141912,
      "grad_norm": 2.662125825881958,
      "learning_rate": 9.99609654676786e-06,
      "loss": 0.3345,
      "step": 60
    },
    {
      "epoch": 0.26490066225165565,
      "grad_norm": 2.1630754470825195,
      "learning_rate": 9.9722641784023e-06,
      "loss": 0.3279,
      "step": 70
    },
    {
      "epoch": 0.30274361400189215,
      "grad_norm": 2.4073753356933594,
      "learning_rate": 9.92687124249773e-06,
      "loss": 0.3057,
      "step": 80
    },
    {
      "epoch": 0.34058656575212864,
      "grad_norm": 2.1243770122528076,
      "learning_rate": 9.860114570402055e-06,
      "loss": 0.2971,
      "step": 90
    },
    {
      "epoch": 0.3784295175023652,
      "grad_norm": 2.0397891998291016,
      "learning_rate": 9.772283630189727e-06,
      "loss": 0.2811,
      "step": 100
    },
    {
      "epoch": 0.4162724692526017,
      "grad_norm": 2.167642116546631,
      "learning_rate": 9.663759271479858e-06,
      "loss": 0.2664,
      "step": 110
    },
    {
      "epoch": 0.45411542100283825,
      "grad_norm": 2.2073721885681152,
      "learning_rate": 9.535012074008688e-06,
      "loss": 0.2721,
      "step": 120
    },
    {
      "epoch": 0.49195837275307475,
      "grad_norm": 2.113102912902832,
      "learning_rate": 9.386600307117293e-06,
      "loss": 0.2549,
      "step": 130
    },
    {
      "epoch": 0.5298013245033113,
      "grad_norm": 1.9171561002731323,
      "learning_rate": 9.219167509002526e-06,
      "loss": 0.2534,
      "step": 140
    },
    {
      "epoch": 0.5676442762535477,
      "grad_norm": 1.8712971210479736,
      "learning_rate": 9.033439696227966e-06,
      "loss": 0.2447,
      "step": 150
    },
    {
      "epoch": 0.6054872280037843,
      "grad_norm": 2.0216710567474365,
      "learning_rate": 8.83022221559489e-06,
      "loss": 0.2491,
      "step": 160
    },
    {
      "epoch": 0.6433301797540208,
      "grad_norm": 1.623536229133606,
      "learning_rate": 8.610396252024113e-06,
      "loss": 0.2397,
      "step": 170
    },
    {
      "epoch": 0.6811731315042573,
      "grad_norm": 1.9642530679702759,
      "learning_rate": 8.374915007591053e-06,
      "loss": 0.2172,
      "step": 180
    },
    {
      "epoch": 0.7190160832544938,
      "grad_norm": 1.723118543624878,
      "learning_rate": 8.124799568282418e-06,
      "loss": 0.234,
      "step": 190
    },
    {
      "epoch": 0.7568590350047304,
      "grad_norm": 1.7284762859344482,
      "learning_rate": 7.861134476396903e-06,
      "loss": 0.2217,
      "step": 200
    },
    {
      "epoch": 0.7568590350047304,
      "eval_loss": 0.2220437228679657,
      "eval_runtime": 93.3051,
      "eval_samples_per_second": 8.821,
      "eval_steps_per_second": 1.265,
      "step": 200
    },
    {
      "epoch": 0.7947019867549668,
      "grad_norm": 1.6118355989456177,
      "learning_rate": 7.58506302778873e-06,
      "loss": 0.2129,
      "step": 210
    },
    {
      "epoch": 0.8325449385052034,
      "grad_norm": 1.7828593254089355,
      "learning_rate": 7.297782314345972e-06,
      "loss": 0.2146,
      "step": 220
    },
    {
      "epoch": 0.8703878902554399,
      "grad_norm": 1.4980388879776,
      "learning_rate": 7.00053803320028e-06,
      "loss": 0.1948,
      "step": 230
    },
    {
      "epoch": 0.9082308420056765,
      "grad_norm": 1.5348376035690308,
      "learning_rate": 6.694619085176159e-06,
      "loss": 0.2056,
      "step": 240
    },
    {
      "epoch": 0.9460737937559129,
      "grad_norm": 1.3809292316436768,
      "learning_rate": 6.381351985901842e-06,
      "loss": 0.1986,
      "step": 250
    },
    {
      "epoch": 0.9839167455061495,
      "grad_norm": 1.3319153785705566,
      "learning_rate": 6.062095113816069e-06,
      "loss": 0.1962,
      "step": 260
    },
    {
      "epoch": 1.0189214758751182,
      "grad_norm": 1.361677885055542,
      "learning_rate": 5.738232820012407e-06,
      "loss": 0.1666,
      "step": 270
    },
    {
      "epoch": 1.0567644276253547,
      "grad_norm": 1.4107803106307983,
      "learning_rate": 5.411169425461822e-06,
      "loss": 0.1459,
      "step": 280
    },
    {
      "epoch": 1.0946073793755913,
      "grad_norm": 1.373854637145996,
      "learning_rate": 5.082323131642496e-06,
      "loss": 0.1453,
      "step": 290
    },
    {
      "epoch": 1.1324503311258278,
      "grad_norm": 1.3363025188446045,
      "learning_rate": 4.753119870981486e-06,
      "loss": 0.1401,
      "step": 300
    },
    {
      "epoch": 1.1702932828760644,
      "grad_norm": 1.3609431982040405,
      "learning_rate": 4.424987123773654e-06,
      "loss": 0.132,
      "step": 310
    },
    {
      "epoch": 1.208136234626301,
      "grad_norm": 1.300492286682129,
      "learning_rate": 4.0993477283888264e-06,
      "loss": 0.1342,
      "step": 320
    },
    {
      "epoch": 1.2459791863765375,
      "grad_norm": 1.1971514225006104,
      "learning_rate": 3.777613711607087e-06,
      "loss": 0.1322,
      "step": 330
    },
    {
      "epoch": 1.2838221381267738,
      "grad_norm": 1.4877253770828247,
      "learning_rate": 3.46118016583494e-06,
      "loss": 0.1368,
      "step": 340
    },
    {
      "epoch": 1.3216650898770104,
      "grad_norm": 1.2842473983764648,
      "learning_rate": 3.1514191997517387e-06,
      "loss": 0.1301,
      "step": 350
    },
    {
      "epoch": 1.359508041627247,
      "grad_norm": 1.3692116737365723,
      "learning_rate": 2.8496739886173994e-06,
      "loss": 0.1314,
      "step": 360
    },
    {
      "epoch": 1.3973509933774835,
      "grad_norm": 1.254835605621338,
      "learning_rate": 2.5572529500402365e-06,
      "loss": 0.129,
      "step": 370
    },
    {
      "epoch": 1.43519394512772,
      "grad_norm": 1.2396975755691528,
      "learning_rate": 2.275424070459803e-06,
      "loss": 0.1205,
      "step": 380
    },
    {
      "epoch": 1.4730368968779564,
      "grad_norm": 1.3641499280929565,
      "learning_rate": 2.005409406946e-06,
      "loss": 0.129,
      "step": 390
    },
    {
      "epoch": 1.5108798486281931,
      "grad_norm": 1.4750701189041138,
      "learning_rate": 1.7483797881556175e-06,
      "loss": 0.1209,
      "step": 400
    },
    {
      "epoch": 1.5108798486281931,
      "eval_loss": 0.1692177653312683,
      "eval_runtime": 94.0494,
      "eval_samples_per_second": 8.751,
      "eval_steps_per_second": 1.255,
      "step": 400
    },
    {
      "epoch": 1.5487228003784295,
      "grad_norm": 1.1940029859542847,
      "learning_rate": 1.5054497374238275e-06,
      "loss": 0.1155,
      "step": 410
    },
    {
      "epoch": 1.586565752128666,
      "grad_norm": 1.2632642984390259,
      "learning_rate": 1.277672640004936e-06,
      "loss": 0.1193,
      "step": 420
    },
    {
      "epoch": 1.6244087038789026,
      "grad_norm": 1.2763444185256958,
      "learning_rate": 1.066036175418082e-06,
      "loss": 0.115,
      "step": 430
    },
    {
      "epoch": 1.6622516556291391,
      "grad_norm": 1.2933067083358765,
      "learning_rate": 8.714580347039491e-07,
      "loss": 0.1162,
      "step": 440
    },
    {
      "epoch": 1.7000946073793757,
      "grad_norm": 1.197369933128357,
      "learning_rate": 6.947819411632223e-07,
      "loss": 0.1138,
      "step": 450
    },
    {
      "epoch": 1.737937559129612,
      "grad_norm": 1.1830955743789673,
      "learning_rate": 5.367739918315068e-07,
      "loss": 0.112,
      "step": 460
    },
    {
      "epoch": 1.7757805108798488,
      "grad_norm": 1.2318562269210815,
      "learning_rate": 3.98119335554687e-07,
      "loss": 0.1098,
      "step": 470
    },
    {
      "epoch": 1.8136234626300851,
      "grad_norm": 1.2451008558273315,
      "learning_rate": 2.7941920206915443e-07,
      "loss": 0.1152,
      "step": 480
    },
    {
      "epoch": 1.8514664143803217,
      "grad_norm": 1.2878820896148682,
      "learning_rate": 1.8118829496930557e-07,
      "loss": 0.1051,
      "step": 490
    },
    {
      "epoch": 1.8893093661305582,
      "grad_norm": 1.2973670959472656,
      "learning_rate": 1.0385255986682718e-07,
      "loss": 0.1053,
      "step": 500
    },
    {
      "epoch": 1.9271523178807946,
      "grad_norm": 1.1544054746627808,
      "learning_rate": 4.774733741942206e-08,
      "loss": 0.1044,
      "step": 510
    },
    {
      "epoch": 1.9649952696310313,
      "grad_norm": 1.1692767143249512,
      "learning_rate": 1.3115909237734203e-08,
      "loss": 0.1083,
      "step": 520
    },
    {
      "epoch": 2.0,
      "grad_norm": 2.188850164413452,
      "learning_rate": 1.0844297567258466e-10,
      "loss": 0.1132,
      "step": 530
    },
    {
      "epoch": 2.0,
      "step": 530,
      "total_flos": 71607640850432.0,
      "train_loss": 0.2318565127984533,
      "train_runtime": 10363.9156,
      "train_samples_per_second": 1.428,
      "train_steps_per_second": 0.051
    }
  ],
  "logging_steps": 10,
  "max_steps": 530,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 71607640850432.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}