| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.14629240193837434, |
| "eval_steps": 500, |
| "global_step": 50, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0, |
| "grad_norm": 18.54335018044506, |
| "learning_rate": 9.090909090909092e-05, |
| "loss": 6.4244, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 18.204878497494867, |
| "learning_rate": 0.00018181818181818183, |
| "loss": 6.4841, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 7.638311117651696, |
| "learning_rate": 0.00027272727272727274, |
| "loss": 4.3156, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 3.236258184290763, |
| "learning_rate": 0.00036363636363636367, |
| "loss": 3.8398, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 1.4816791146245016, |
| "learning_rate": 0.00045454545454545455, |
| "loss": 3.4542, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 2.4160385130751787, |
| "learning_rate": 0.0005454545454545455, |
| "loss": 3.0333, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 2.3449118849582677, |
| "learning_rate": 0.0006363636363636364, |
| "loss": 2.847, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 0.782966544742558, |
| "learning_rate": 0.0007272727272727273, |
| "loss": 2.4424, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 0.4286720843729141, |
| "learning_rate": 0.0008181818181818183, |
| "loss": 2.2581, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 0.19557058746571676, |
| "learning_rate": 0.0009090909090909091, |
| "loss": 2.1925, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 0.19629394764775782, |
| "learning_rate": 0.001, |
| "loss": 2.15, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 0.24283548114329223, |
| "learning_rate": 0.0009999773426770863, |
| "loss": 2.11, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 0.1602137829534043, |
| "learning_rate": 0.000999909372761763, |
| "loss": 2.1018, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 0.9265558670447405, |
| "learning_rate": 0.0009997960964140947, |
| "loss": 2.156, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 0.15041539304459745, |
| "learning_rate": 0.0009996375239002368, |
| "loss": 2.1039, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 0.08444195746666726, |
| "learning_rate": 0.000999433669591504, |
| "loss": 2.0826, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 0.05336799505560437, |
| "learning_rate": 0.0009991845519630679, |
| "loss": 2.0558, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 0.04503459780125264, |
| "learning_rate": 0.0009988901935922825, |
| "loss": 2.0479, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 0.03827244634244726, |
| "learning_rate": 0.0009985506211566387, |
| "loss": 2.0445, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 0.040159263058230144, |
| "learning_rate": 0.0009981658654313456, |
| "loss": 2.0618, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 0.04362572422821177, |
| "learning_rate": 0.0009977359612865424, |
| "loss": 2.04, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 0.04512109012272362, |
| "learning_rate": 0.0009972609476841367, |
| "loss": 2.035, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 0.0460336532256244, |
| "learning_rate": 0.0009967408676742752, |
| "loss": 2.0438, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 0.04161179529868031, |
| "learning_rate": 0.0009961757683914405, |
| "loss": 2.0075, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 0.03913485984103564, |
| "learning_rate": 0.0009955657010501807, |
| "loss": 1.9991, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 0.036980743783326483, |
| "learning_rate": 0.0009949107209404665, |
| "loss": 1.9939, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 0.03318251717420227, |
| "learning_rate": 0.0009942108874226813, |
| "loss": 1.9927, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 0.0331961384143108, |
| "learning_rate": 0.0009934662639222412, |
| "loss": 1.9806, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 0.03562839770417466, |
| "learning_rate": 0.0009926769179238466, |
| "loss": 1.9833, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 0.05177166624815994, |
| "learning_rate": 0.0009918429209653662, |
| "loss": 1.987, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 0.04047010737996018, |
| "learning_rate": 0.0009909643486313534, |
| "loss": 1.9683, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 0.04112376138396604, |
| "learning_rate": 0.0009900412805461966, |
| "loss": 1.9509, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 0.045127786908756885, |
| "learning_rate": 0.0009890738003669028, |
| "loss": 1.934, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 0.03348389008468475, |
| "learning_rate": 0.000988061995775515, |
| "loss": 1.9435, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 0.0369790335444681, |
| "learning_rate": 0.0009870059584711668, |
| "loss": 1.9304, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 0.03732257647443908, |
| "learning_rate": 0.000985905784161771, |
| "loss": 1.9318, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 0.034825381944822265, |
| "learning_rate": 0.0009847615725553456, |
| "loss": 1.9093, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 0.03380767656809614, |
| "learning_rate": 0.0009835734273509786, |
| "loss": 1.9031, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 0.03270535048583525, |
| "learning_rate": 0.000982341456229428, |
| "loss": 1.8901, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 0.02874837099611008, |
| "learning_rate": 0.0009810657708433637, |
| "loss": 1.9126, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 0.02842928929645405, |
| "learning_rate": 0.0009797464868072487, |
| "loss": 1.8894, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 0.02584963545732469, |
| "learning_rate": 0.0009783837236868609, |
| "loss": 1.8641, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 0.028295949533072753, |
| "learning_rate": 0.0009769776049884564, |
| "loss": 1.8857, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 0.030410135452704438, |
| "learning_rate": 0.0009755282581475768, |
| "loss": 1.8934, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 0.02966185510977657, |
| "learning_rate": 0.0009740358145174998, |
| "loss": 1.8535, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 0.030063842481471717, |
| "learning_rate": 0.0009725004093573342, |
| "loss": 1.8481, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 0.02556650721323588, |
| "learning_rate": 0.0009709221818197624, |
| "loss": 1.8638, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 0.026845238542832082, |
| "learning_rate": 0.0009693012749384279, |
| "loss": 1.8409, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 0.026180343066426557, |
| "learning_rate": 0.0009676378356149733, |
| "loss": 1.8536, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 0.026565196661024762, |
| "learning_rate": 0.0009659320146057262, |
| "loss": 1.8584, |
| "step": 50 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 341, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 25, |
| "total_flos": 1.0991121077706424e+18, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|