| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 500, |
| "global_step": 490, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.04094165813715456, |
| "grad_norm": 12.65462875366211, |
| "learning_rate": 9.000000000000001e-07, |
| "loss": 0.84, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.08188331627430911, |
| "grad_norm": 15.826253890991211, |
| "learning_rate": 1.9000000000000002e-06, |
| "loss": 0.9075, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.12282497441146366, |
| "grad_norm": 12.505751609802246, |
| "learning_rate": 2.9e-06, |
| "loss": 0.9015, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.16376663254861823, |
| "grad_norm": 15.053168296813965, |
| "learning_rate": 3.900000000000001e-06, |
| "loss": 0.8386, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.2047082906857728, |
| "grad_norm": 15.304803848266602, |
| "learning_rate": 4.9000000000000005e-06, |
| "loss": 0.8111, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.24564994882292732, |
| "grad_norm": 11.536665916442871, |
| "learning_rate": 5.9e-06, |
| "loss": 1.0011, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.2865916069600819, |
| "grad_norm": 15.474630355834961, |
| "learning_rate": 6.9e-06, |
| "loss": 0.9621, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.32753326509723646, |
| "grad_norm": 16.180992126464844, |
| "learning_rate": 7.9e-06, |
| "loss": 0.9414, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.368474923234391, |
| "grad_norm": 19.92721939086914, |
| "learning_rate": 8.900000000000001e-06, |
| "loss": 0.8467, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.4094165813715456, |
| "grad_norm": 13.27595043182373, |
| "learning_rate": 9.9e-06, |
| "loss": 0.8855, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.4503582395087001, |
| "grad_norm": 17.830875396728516, |
| "learning_rate": 9.961702127659575e-06, |
| "loss": 0.8637, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.49129989764585463, |
| "grad_norm": 13.432883262634277, |
| "learning_rate": 9.919148936170213e-06, |
| "loss": 0.8807, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.5322415557830092, |
| "grad_norm": 19.632150650024414, |
| "learning_rate": 9.876595744680851e-06, |
| "loss": 0.7876, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.5731832139201638, |
| "grad_norm": 8.750787734985352, |
| "learning_rate": 9.834042553191491e-06, |
| "loss": 0.8812, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.6141248720573184, |
| "grad_norm": 11.828136444091797, |
| "learning_rate": 9.79148936170213e-06, |
| "loss": 0.7476, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.6550665301944729, |
| "grad_norm": 8.025004386901855, |
| "learning_rate": 9.748936170212768e-06, |
| "loss": 0.7658, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.6960081883316275, |
| "grad_norm": 17.197458267211914, |
| "learning_rate": 9.706382978723406e-06, |
| "loss": 0.8227, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.736949846468782, |
| "grad_norm": 18.768630981445312, |
| "learning_rate": 9.663829787234044e-06, |
| "loss": 0.8761, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.7778915046059366, |
| "grad_norm": 11.378251075744629, |
| "learning_rate": 9.621276595744682e-06, |
| "loss": 0.7956, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.8188331627430911, |
| "grad_norm": 12.888134956359863, |
| "learning_rate": 9.57872340425532e-06, |
| "loss": 0.7741, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.8597748208802457, |
| "grad_norm": 9.256698608398438, |
| "learning_rate": 9.536170212765959e-06, |
| "loss": 0.8193, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.9007164790174002, |
| "grad_norm": 9.790871620178223, |
| "learning_rate": 9.493617021276597e-06, |
| "loss": 0.8279, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.9416581371545547, |
| "grad_norm": 17.922643661499023, |
| "learning_rate": 9.451063829787235e-06, |
| "loss": 0.8167, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.9825997952917093, |
| "grad_norm": 15.21545696258545, |
| "learning_rate": 9.408510638297873e-06, |
| "loss": 0.678, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_general_loss": 0.7405520677566528, |
| "eval_general_runtime": 257.7451, |
| "eval_general_samples_per_second": 3.55, |
| "eval_general_steps_per_second": 0.888, |
| "step": 245 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_code_loss": 0.8061306476593018, |
| "eval_code_runtime": 300.2792, |
| "eval_code_samples_per_second": 3.057, |
| "eval_code_steps_per_second": 0.766, |
| "step": 245 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_stem_loss": 0.7527948021888733, |
| "eval_stem_runtime": 253.8295, |
| "eval_stem_samples_per_second": 3.601, |
| "eval_stem_steps_per_second": 0.902, |
| "step": 245 |
| }, |
| { |
| "epoch": 1.0204708290685773, |
| "grad_norm": 9.183212280273438, |
| "learning_rate": 9.365957446808511e-06, |
| "loss": 0.6701, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.0614124872057318, |
| "grad_norm": 11.502631187438965, |
| "learning_rate": 9.32340425531915e-06, |
| "loss": 0.6915, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.1023541453428864, |
| "grad_norm": 14.637332916259766, |
| "learning_rate": 9.280851063829788e-06, |
| "loss": 0.7034, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.143295803480041, |
| "grad_norm": 22.042236328125, |
| "learning_rate": 9.238297872340426e-06, |
| "loss": 0.7613, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.1842374616171956, |
| "grad_norm": 10.717690467834473, |
| "learning_rate": 9.195744680851064e-06, |
| "loss": 0.7115, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.22517911975435, |
| "grad_norm": 12.595451354980469, |
| "learning_rate": 9.153191489361702e-06, |
| "loss": 0.6753, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.2661207778915047, |
| "grad_norm": 15.185698509216309, |
| "learning_rate": 9.11063829787234e-06, |
| "loss": 0.6543, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.3070624360286591, |
| "grad_norm": 8.96109676361084, |
| "learning_rate": 9.06808510638298e-06, |
| "loss": 0.7118, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.3480040941658138, |
| "grad_norm": 11.143041610717773, |
| "learning_rate": 9.025531914893619e-06, |
| "loss": 0.7155, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.3889457523029682, |
| "grad_norm": 13.331513404846191, |
| "learning_rate": 8.982978723404257e-06, |
| "loss": 0.5873, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.429887410440123, |
| "grad_norm": 7.453923225402832, |
| "learning_rate": 8.940425531914895e-06, |
| "loss": 0.6085, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.4708290685772774, |
| "grad_norm": 10.874267578125, |
| "learning_rate": 8.897872340425533e-06, |
| "loss": 0.7046, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.511770726714432, |
| "grad_norm": 18.965225219726562, |
| "learning_rate": 8.855319148936171e-06, |
| "loss": 0.7275, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.5527123848515865, |
| "grad_norm": 11.133731842041016, |
| "learning_rate": 8.81276595744681e-06, |
| "loss": 0.7185, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.593654042988741, |
| "grad_norm": 9.591411590576172, |
| "learning_rate": 8.770212765957448e-06, |
| "loss": 0.6325, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.6345957011258956, |
| "grad_norm": 9.676285743713379, |
| "learning_rate": 8.727659574468086e-06, |
| "loss": 0.5229, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.6755373592630503, |
| "grad_norm": 17.216745376586914, |
| "learning_rate": 8.685106382978724e-06, |
| "loss": 0.5627, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.7164790174002047, |
| "grad_norm": 12.413490295410156, |
| "learning_rate": 8.642553191489362e-06, |
| "loss": 0.6627, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.7574206755373591, |
| "grad_norm": 18.200937271118164, |
| "learning_rate": 8.6e-06, |
| "loss": 0.821, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.7983623336745138, |
| "grad_norm": 7.938803195953369, |
| "learning_rate": 8.557446808510639e-06, |
| "loss": 0.5181, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.8393039918116685, |
| "grad_norm": 16.005313873291016, |
| "learning_rate": 8.514893617021277e-06, |
| "loss": 0.5963, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.880245649948823, |
| "grad_norm": 7.592184066772461, |
| "learning_rate": 8.472340425531915e-06, |
| "loss": 0.5118, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.9211873080859774, |
| "grad_norm": 7.5901384353637695, |
| "learning_rate": 8.429787234042553e-06, |
| "loss": 0.6951, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.962128966223132, |
| "grad_norm": 15.962983131408691, |
| "learning_rate": 8.387234042553192e-06, |
| "loss": 0.5939, |
| "step": 480 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 21.49174690246582, |
| "learning_rate": 8.34468085106383e-06, |
| "loss": 0.5632, |
| "step": 490 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_general_loss": 0.6747614741325378, |
| "eval_general_runtime": 258.8315, |
| "eval_general_samples_per_second": 3.535, |
| "eval_general_steps_per_second": 0.885, |
| "step": 490 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_code_loss": 0.7175714373588562, |
| "eval_code_runtime": 301.597, |
| "eval_code_samples_per_second": 3.044, |
| "eval_code_steps_per_second": 0.763, |
| "step": 490 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_stem_loss": 0.6549679040908813, |
| "eval_stem_runtime": 254.6647, |
| "eval_stem_samples_per_second": 3.589, |
| "eval_stem_steps_per_second": 0.899, |
| "step": 490 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 2450, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 10, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.0645387161391596e+18, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|