{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 401, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012484394506866416, "grad_norm": 1.0609335899353027, "learning_rate": 1.1881188118811881e-06, "loss": 1.2283, "step": 5 }, { "epoch": 0.024968789013732832, "grad_norm": 0.8323203921318054, "learning_rate": 2.673267326732673e-06, "loss": 1.2561, "step": 10 }, { "epoch": 0.03745318352059925, "grad_norm": 0.8071620464324951, "learning_rate": 4.158415841584159e-06, "loss": 1.2571, "step": 15 }, { "epoch": 0.049937578027465665, "grad_norm": 0.6458085775375366, "learning_rate": 5.643564356435644e-06, "loss": 1.2301, "step": 20 }, { "epoch": 0.062421972534332085, "grad_norm": 0.6387593746185303, "learning_rate": 7.128712871287129e-06, "loss": 1.1905, "step": 25 }, { "epoch": 0.0749063670411985, "grad_norm": 0.5484127402305603, "learning_rate": 8.613861386138613e-06, "loss": 1.2693, "step": 30 }, { "epoch": 0.08739076154806492, "grad_norm": 0.6438347101211548, "learning_rate": 1.00990099009901e-05, "loss": 1.1677, "step": 35 }, { "epoch": 0.09987515605493133, "grad_norm": 0.503511905670166, "learning_rate": 1.1584158415841584e-05, "loss": 1.1914, "step": 40 }, { "epoch": 0.11235955056179775, "grad_norm": 0.5014557242393494, "learning_rate": 1.306930693069307e-05, "loss": 1.2152, "step": 45 }, { "epoch": 0.12484394506866417, "grad_norm": 0.49721699953079224, "learning_rate": 1.4554455445544554e-05, "loss": 1.1607, "step": 50 }, { "epoch": 0.1373283395755306, "grad_norm": 0.5247049927711487, "learning_rate": 1.6039603960396042e-05, "loss": 1.1722, "step": 55 }, { "epoch": 0.149812734082397, "grad_norm": 0.4929521083831787, "learning_rate": 1.7524752475247524e-05, "loss": 1.1115, "step": 60 }, { "epoch": 0.16229712858926343, "grad_norm": 0.4400411546230316, "learning_rate": 1.900990099009901e-05, "loss": 1.1327, "step": 65 }, { "epoch": 0.17478152309612985, "grad_norm": 0.5146467089653015, "learning_rate": 2.0495049504950496e-05, "loss": 1.0995, "step": 70 }, { "epoch": 0.18726591760299627, "grad_norm": 0.4721396565437317, "learning_rate": 2.198019801980198e-05, "loss": 1.0986, "step": 75 }, { "epoch": 0.19975031210986266, "grad_norm": 0.5145531296730042, "learning_rate": 2.3465346534653467e-05, "loss": 1.052, "step": 80 }, { "epoch": 0.21223470661672908, "grad_norm": 0.4975873827934265, "learning_rate": 2.495049504950495e-05, "loss": 1.1328, "step": 85 }, { "epoch": 0.2247191011235955, "grad_norm": 0.5143810510635376, "learning_rate": 2.6435643564356436e-05, "loss": 1.1032, "step": 90 }, { "epoch": 0.23720349563046192, "grad_norm": 0.5382490754127502, "learning_rate": 2.792079207920792e-05, "loss": 1.1121, "step": 95 }, { "epoch": 0.24968789013732834, "grad_norm": 0.5149545073509216, "learning_rate": 2.9405940594059407e-05, "loss": 1.0955, "step": 100 }, { "epoch": 0.26217228464419473, "grad_norm": 0.5805860757827759, "learning_rate": 2.999981623248469e-05, "loss": 1.0344, "step": 105 }, { "epoch": 0.2746566791510612, "grad_norm": 0.524933934211731, "learning_rate": 2.9998693225086163e-05, "loss": 1.0875, "step": 110 }, { "epoch": 0.28714107365792757, "grad_norm": 0.5526661276817322, "learning_rate": 2.9996549379694418e-05, "loss": 1.1151, "step": 115 }, { "epoch": 0.299625468164794, "grad_norm": 0.5277731418609619, "learning_rate": 2.9993384842223426e-05, "loss": 1.04, "step": 120 }, { "epoch": 0.3121098626716604, "grad_norm": 0.5590221285820007, "learning_rate": 2.998919982805729e-05, "loss": 1.0067, "step": 125 }, { "epoch": 0.32459425717852686, "grad_norm": 0.596834659576416, "learning_rate": 2.998399462203559e-05, "loss": 0.9961, "step": 130 }, { "epoch": 0.33707865168539325, "grad_norm": 0.5012202262878418, "learning_rate": 2.9977769578433987e-05, "loss": 1.0435, "step": 135 }, { "epoch": 0.3495630461922597, "grad_norm": 0.6139566898345947, "learning_rate": 2.9970525120940132e-05, "loss": 0.9714, "step": 140 }, { "epoch": 0.3620474406991261, "grad_norm": 0.5993323922157288, "learning_rate": 2.9962261742624813e-05, "loss": 1.0804, "step": 145 }, { "epoch": 0.37453183520599254, "grad_norm": 0.6254947185516357, "learning_rate": 2.995298000590839e-05, "loss": 0.9821, "step": 150 }, { "epoch": 0.38701622971285893, "grad_norm": 0.6110448241233826, "learning_rate": 2.9942680542522535e-05, "loss": 0.9785, "step": 155 }, { "epoch": 0.3995006242197253, "grad_norm": 0.5888521075248718, "learning_rate": 2.993136405346722e-05, "loss": 0.9882, "step": 160 }, { "epoch": 0.41198501872659177, "grad_norm": 0.6749442219734192, "learning_rate": 2.991903130896301e-05, "loss": 1.0322, "step": 165 }, { "epoch": 0.42446941323345816, "grad_norm": 0.7961056232452393, "learning_rate": 2.9905683148398642e-05, "loss": 0.9149, "step": 170 }, { "epoch": 0.4369538077403246, "grad_norm": 0.6190369129180908, "learning_rate": 2.9891320480273886e-05, "loss": 0.9756, "step": 175 }, { "epoch": 0.449438202247191, "grad_norm": 0.6471540927886963, "learning_rate": 2.9875944282137734e-05, "loss": 0.9204, "step": 180 }, { "epoch": 0.46192259675405745, "grad_norm": 0.6503486037254333, "learning_rate": 2.9859555600521838e-05, "loss": 0.8895, "step": 185 }, { "epoch": 0.47440699126092384, "grad_norm": 0.6892472505569458, "learning_rate": 2.9842155550869294e-05, "loss": 0.9435, "step": 190 }, { "epoch": 0.4868913857677903, "grad_norm": 0.7145974636077881, "learning_rate": 2.982374531745873e-05, "loss": 0.9135, "step": 195 }, { "epoch": 0.4993757802746567, "grad_norm": 0.7395890951156616, "learning_rate": 2.9804326153323683e-05, "loss": 0.8777, "step": 200 }, { "epoch": 0.5118601747815231, "grad_norm": 0.8736094832420349, "learning_rate": 2.9783899380167345e-05, "loss": 0.8953, "step": 205 }, { "epoch": 0.5243445692883895, "grad_norm": 0.7746868133544922, "learning_rate": 2.976246638827257e-05, "loss": 0.8839, "step": 210 }, { "epoch": 0.5368289637952559, "grad_norm": 0.7751911878585815, "learning_rate": 2.974002863640727e-05, "loss": 0.8941, "step": 215 }, { "epoch": 0.5493133583021224, "grad_norm": 0.8329525589942932, "learning_rate": 2.9716587651725134e-05, "loss": 0.8562, "step": 220 }, { "epoch": 0.5617977528089888, "grad_norm": 0.8044184446334839, "learning_rate": 2.9692145029661664e-05, "loss": 0.8543, "step": 225 }, { "epoch": 0.5742821473158551, "grad_norm": 0.7520190477371216, "learning_rate": 2.9666702433825614e-05, "loss": 0.839, "step": 230 }, { "epoch": 0.5867665418227216, "grad_norm": 0.7825169563293457, "learning_rate": 2.9640261595885735e-05, "loss": 0.8368, "step": 235 }, { "epoch": 0.599250936329588, "grad_norm": 0.7438607215881348, "learning_rate": 2.9612824315452943e-05, "loss": 0.8619, "step": 240 }, { "epoch": 0.6117353308364545, "grad_norm": 0.7354984283447266, "learning_rate": 2.958439245995781e-05, "loss": 0.8337, "step": 245 }, { "epoch": 0.6242197253433208, "grad_norm": 0.7580518126487732, "learning_rate": 2.955496796452347e-05, "loss": 0.7538, "step": 250 }, { "epoch": 0.6367041198501873, "grad_norm": 0.7653853893280029, "learning_rate": 2.9524552831833924e-05, "loss": 0.8178, "step": 255 }, { "epoch": 0.6491885143570537, "grad_norm": 0.7680398225784302, "learning_rate": 2.9493149131997713e-05, "loss": 0.795, "step": 260 }, { "epoch": 0.66167290886392, "grad_norm": 0.7689065337181091, "learning_rate": 2.946075900240704e-05, "loss": 0.849, "step": 265 }, { "epoch": 0.6741573033707865, "grad_norm": 0.8538303971290588, "learning_rate": 2.942738464759229e-05, "loss": 0.7875, "step": 270 }, { "epoch": 0.686641697877653, "grad_norm": 0.9141761064529419, "learning_rate": 2.939302833907197e-05, "loss": 0.7989, "step": 275 }, { "epoch": 0.6991260923845194, "grad_norm": 0.9288301467895508, "learning_rate": 2.9357692415198136e-05, "loss": 0.7606, "step": 280 }, { "epoch": 0.7116104868913857, "grad_norm": 0.8967108130455017, "learning_rate": 2.932137928099722e-05, "loss": 0.782, "step": 285 }, { "epoch": 0.7240948813982522, "grad_norm": 0.7744588851928711, "learning_rate": 2.928409140800634e-05, "loss": 0.7793, "step": 290 }, { "epoch": 0.7365792759051186, "grad_norm": 0.9789696335792542, "learning_rate": 2.9245831334105084e-05, "loss": 0.78, "step": 295 }, { "epoch": 0.7490636704119851, "grad_norm": 0.853340744972229, "learning_rate": 2.9206601663342786e-05, "loss": 0.7659, "step": 300 }, { "epoch": 0.7615480649188514, "grad_norm": 0.9357976317405701, "learning_rate": 2.916640506576128e-05, "loss": 0.7468, "step": 305 }, { "epoch": 0.7740324594257179, "grad_norm": 0.8526910543441772, "learning_rate": 2.9125244277213176e-05, "loss": 0.7384, "step": 310 }, { "epoch": 0.7865168539325843, "grad_norm": 0.8397050499916077, "learning_rate": 2.908312209917565e-05, "loss": 0.7512, "step": 315 }, { "epoch": 0.7990012484394506, "grad_norm": 0.8564719557762146, "learning_rate": 2.904004139855978e-05, "loss": 0.7304, "step": 320 }, { "epoch": 0.8114856429463171, "grad_norm": 1.0210206508636475, "learning_rate": 2.8996005107515405e-05, "loss": 0.6818, "step": 325 }, { "epoch": 0.8239700374531835, "grad_norm": 0.9247426986694336, "learning_rate": 2.8951016223231567e-05, "loss": 0.7076, "step": 330 }, { "epoch": 0.83645443196005, "grad_norm": 1.1422244310379028, "learning_rate": 2.890507780773252e-05, "loss": 0.659, "step": 335 }, { "epoch": 0.8489388264669163, "grad_norm": 0.8729642629623413, "learning_rate": 2.8858192987669303e-05, "loss": 0.7419, "step": 340 }, { "epoch": 0.8614232209737828, "grad_norm": 0.9443491101264954, "learning_rate": 2.881036495410696e-05, "loss": 0.6977, "step": 345 }, { "epoch": 0.8739076154806492, "grad_norm": 0.9886714816093445, "learning_rate": 2.8761596962307343e-05, "loss": 0.6996, "step": 350 }, { "epoch": 0.8863920099875156, "grad_norm": 0.9167611002922058, "learning_rate": 2.871189233150754e-05, "loss": 0.6976, "step": 355 }, { "epoch": 0.898876404494382, "grad_norm": 0.9194567799568176, "learning_rate": 2.8661254444693975e-05, "loss": 0.6719, "step": 360 }, { "epoch": 0.9113607990012484, "grad_norm": 0.9292520880699158, "learning_rate": 2.8609686748372155e-05, "loss": 0.6539, "step": 365 }, { "epoch": 0.9238451935081149, "grad_norm": 0.8527399301528931, "learning_rate": 2.8557192752332082e-05, "loss": 0.6709, "step": 370 }, { "epoch": 0.9363295880149812, "grad_norm": 1.0521934032440186, "learning_rate": 2.8503776029409398e-05, "loss": 0.6857, "step": 375 }, { "epoch": 0.9488139825218477, "grad_norm": 0.9660723209381104, "learning_rate": 2.844944021524217e-05, "loss": 0.6418, "step": 380 }, { "epoch": 0.9612983770287141, "grad_norm": 0.9831718802452087, "learning_rate": 2.8394189008023482e-05, "loss": 0.6334, "step": 385 }, { "epoch": 0.9737827715355806, "grad_norm": 0.9556214809417725, "learning_rate": 2.833802616824972e-05, "loss": 0.6385, "step": 390 }, { "epoch": 0.9862671660424469, "grad_norm": 0.9405916929244995, "learning_rate": 2.8280955518464587e-05, "loss": 0.6673, "step": 395 }, { "epoch": 0.9987515605493134, "grad_norm": 0.9383732676506042, "learning_rate": 2.8222980942999005e-05, "loss": 0.618, "step": 400 } ], "logging_steps": 5, "max_steps": 2005, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.842838055865549e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }