{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 200, "global_step": 530, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03784295175023652, "grad_norm": 12.928278923034668, "learning_rate": 1.6981132075471698e-06, "loss": 1.4503, "step": 10 }, { "epoch": 0.07568590350047304, "grad_norm": 4.828810214996338, "learning_rate": 3.5849056603773586e-06, "loss": 0.8761, "step": 20 }, { "epoch": 0.11352885525070956, "grad_norm": 3.0466108322143555, "learning_rate": 5.4716981132075475e-06, "loss": 0.5782, "step": 30 }, { "epoch": 0.15137180700094607, "grad_norm": 2.7793734073638916, "learning_rate": 7.358490566037736e-06, "loss": 0.4365, "step": 40 }, { "epoch": 0.1892147587511826, "grad_norm": 2.6989638805389404, "learning_rate": 9.245283018867926e-06, "loss": 0.3779, "step": 50 }, { "epoch": 0.22705771050141912, "grad_norm": 2.662125825881958, "learning_rate": 9.99609654676786e-06, "loss": 0.3345, "step": 60 }, { "epoch": 0.26490066225165565, "grad_norm": 2.1630754470825195, "learning_rate": 9.9722641784023e-06, "loss": 0.3279, "step": 70 }, { "epoch": 0.30274361400189215, "grad_norm": 2.4073753356933594, "learning_rate": 9.92687124249773e-06, "loss": 0.3057, "step": 80 }, { "epoch": 0.34058656575212864, "grad_norm": 2.1243770122528076, "learning_rate": 9.860114570402055e-06, "loss": 0.2971, "step": 90 }, { "epoch": 0.3784295175023652, "grad_norm": 2.0397891998291016, "learning_rate": 9.772283630189727e-06, "loss": 0.2811, "step": 100 }, { "epoch": 0.4162724692526017, "grad_norm": 2.167642116546631, "learning_rate": 9.663759271479858e-06, "loss": 0.2664, "step": 110 }, { "epoch": 0.45411542100283825, "grad_norm": 2.2073721885681152, "learning_rate": 9.535012074008688e-06, "loss": 0.2721, "step": 120 }, { "epoch": 0.49195837275307475, "grad_norm": 2.113102912902832, "learning_rate": 9.386600307117293e-06, "loss": 0.2549, "step": 130 }, { "epoch": 0.5298013245033113, "grad_norm": 1.9171561002731323, "learning_rate": 9.219167509002526e-06, "loss": 0.2534, "step": 140 }, { "epoch": 0.5676442762535477, "grad_norm": 1.8712971210479736, "learning_rate": 9.033439696227966e-06, "loss": 0.2447, "step": 150 }, { "epoch": 0.6054872280037843, "grad_norm": 2.0216710567474365, "learning_rate": 8.83022221559489e-06, "loss": 0.2491, "step": 160 }, { "epoch": 0.6433301797540208, "grad_norm": 1.623536229133606, "learning_rate": 8.610396252024113e-06, "loss": 0.2397, "step": 170 }, { "epoch": 0.6811731315042573, "grad_norm": 1.9642530679702759, "learning_rate": 8.374915007591053e-06, "loss": 0.2172, "step": 180 }, { "epoch": 0.7190160832544938, "grad_norm": 1.723118543624878, "learning_rate": 8.124799568282418e-06, "loss": 0.234, "step": 190 }, { "epoch": 0.7568590350047304, "grad_norm": 1.7284762859344482, "learning_rate": 7.861134476396903e-06, "loss": 0.2217, "step": 200 }, { "epoch": 0.7568590350047304, "eval_loss": 0.2220437228679657, "eval_runtime": 93.3051, "eval_samples_per_second": 8.821, "eval_steps_per_second": 1.265, "step": 200 }, { "epoch": 0.7947019867549668, "grad_norm": 1.6118355989456177, "learning_rate": 7.58506302778873e-06, "loss": 0.2129, "step": 210 }, { "epoch": 0.8325449385052034, "grad_norm": 1.7828593254089355, "learning_rate": 7.297782314345972e-06, "loss": 0.2146, "step": 220 }, { "epoch": 0.8703878902554399, "grad_norm": 1.4980388879776, "learning_rate": 7.00053803320028e-06, "loss": 0.1948, "step": 230 }, { "epoch": 0.9082308420056765, "grad_norm": 1.5348376035690308, "learning_rate": 6.694619085176159e-06, "loss": 0.2056, "step": 240 }, { "epoch": 0.9460737937559129, "grad_norm": 1.3809292316436768, "learning_rate": 6.381351985901842e-06, "loss": 0.1986, "step": 250 }, { "epoch": 0.9839167455061495, "grad_norm": 1.3319153785705566, "learning_rate": 6.062095113816069e-06, "loss": 0.1962, "step": 260 }, { "epoch": 1.0189214758751182, "grad_norm": 1.361677885055542, "learning_rate": 5.738232820012407e-06, "loss": 0.1666, "step": 270 }, { "epoch": 1.0567644276253547, "grad_norm": 1.4107803106307983, "learning_rate": 5.411169425461822e-06, "loss": 0.1459, "step": 280 }, { "epoch": 1.0946073793755913, "grad_norm": 1.373854637145996, "learning_rate": 5.082323131642496e-06, "loss": 0.1453, "step": 290 }, { "epoch": 1.1324503311258278, "grad_norm": 1.3363025188446045, "learning_rate": 4.753119870981486e-06, "loss": 0.1401, "step": 300 }, { "epoch": 1.1702932828760644, "grad_norm": 1.3609431982040405, "learning_rate": 4.424987123773654e-06, "loss": 0.132, "step": 310 }, { "epoch": 1.208136234626301, "grad_norm": 1.300492286682129, "learning_rate": 4.0993477283888264e-06, "loss": 0.1342, "step": 320 }, { "epoch": 1.2459791863765375, "grad_norm": 1.1971514225006104, "learning_rate": 3.777613711607087e-06, "loss": 0.1322, "step": 330 }, { "epoch": 1.2838221381267738, "grad_norm": 1.4877253770828247, "learning_rate": 3.46118016583494e-06, "loss": 0.1368, "step": 340 }, { "epoch": 1.3216650898770104, "grad_norm": 1.2842473983764648, "learning_rate": 3.1514191997517387e-06, "loss": 0.1301, "step": 350 }, { "epoch": 1.359508041627247, "grad_norm": 1.3692116737365723, "learning_rate": 2.8496739886173994e-06, "loss": 0.1314, "step": 360 }, { "epoch": 1.3973509933774835, "grad_norm": 1.254835605621338, "learning_rate": 2.5572529500402365e-06, "loss": 0.129, "step": 370 }, { "epoch": 1.43519394512772, "grad_norm": 1.2396975755691528, "learning_rate": 2.275424070459803e-06, "loss": 0.1205, "step": 380 }, { "epoch": 1.4730368968779564, "grad_norm": 1.3641499280929565, "learning_rate": 2.005409406946e-06, "loss": 0.129, "step": 390 }, { "epoch": 1.5108798486281931, "grad_norm": 1.4750701189041138, "learning_rate": 1.7483797881556175e-06, "loss": 0.1209, "step": 400 }, { "epoch": 1.5108798486281931, "eval_loss": 0.1692177653312683, "eval_runtime": 94.0494, "eval_samples_per_second": 8.751, "eval_steps_per_second": 1.255, "step": 400 }, { "epoch": 1.5487228003784295, "grad_norm": 1.1940029859542847, "learning_rate": 1.5054497374238275e-06, "loss": 0.1155, "step": 410 }, { "epoch": 1.586565752128666, "grad_norm": 1.2632642984390259, "learning_rate": 1.277672640004936e-06, "loss": 0.1193, "step": 420 }, { "epoch": 1.6244087038789026, "grad_norm": 1.2763444185256958, "learning_rate": 1.066036175418082e-06, "loss": 0.115, "step": 430 }, { "epoch": 1.6622516556291391, "grad_norm": 1.2933067083358765, "learning_rate": 8.714580347039491e-07, "loss": 0.1162, "step": 440 }, { "epoch": 1.7000946073793757, "grad_norm": 1.197369933128357, "learning_rate": 6.947819411632223e-07, "loss": 0.1138, "step": 450 }, { "epoch": 1.737937559129612, "grad_norm": 1.1830955743789673, "learning_rate": 5.367739918315068e-07, "loss": 0.112, "step": 460 }, { "epoch": 1.7757805108798488, "grad_norm": 1.2318562269210815, "learning_rate": 3.98119335554687e-07, "loss": 0.1098, "step": 470 }, { "epoch": 1.8136234626300851, "grad_norm": 1.2451008558273315, "learning_rate": 2.7941920206915443e-07, "loss": 0.1152, "step": 480 }, { "epoch": 1.8514664143803217, "grad_norm": 1.2878820896148682, "learning_rate": 1.8118829496930557e-07, "loss": 0.1051, "step": 490 }, { "epoch": 1.8893093661305582, "grad_norm": 1.2973670959472656, "learning_rate": 1.0385255986682718e-07, "loss": 0.1053, "step": 500 }, { "epoch": 1.9271523178807946, "grad_norm": 1.1544054746627808, "learning_rate": 4.774733741942206e-08, "loss": 0.1044, "step": 510 }, { "epoch": 1.9649952696310313, "grad_norm": 1.1692767143249512, "learning_rate": 1.3115909237734203e-08, "loss": 0.1083, "step": 520 }, { "epoch": 2.0, "grad_norm": 2.188850164413452, "learning_rate": 1.0844297567258466e-10, "loss": 0.1132, "step": 530 }, { "epoch": 2.0, "step": 530, "total_flos": 71607640850432.0, "train_loss": 0.2318565127984533, "train_runtime": 10363.9156, "train_samples_per_second": 1.428, "train_steps_per_second": 0.051 } ], "logging_steps": 10, "max_steps": 530, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 71607640850432.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }