{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 401,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.012484394506866416,
      "grad_norm": 1.0609335899353027,
      "learning_rate": 1.1881188118811881e-06,
      "loss": 1.2283,
      "step": 5
    },
    {
      "epoch": 0.024968789013732832,
      "grad_norm": 0.8323203921318054,
      "learning_rate": 2.673267326732673e-06,
      "loss": 1.2561,
      "step": 10
    },
    {
      "epoch": 0.03745318352059925,
      "grad_norm": 0.8071620464324951,
      "learning_rate": 4.158415841584159e-06,
      "loss": 1.2571,
      "step": 15
    },
    {
      "epoch": 0.049937578027465665,
      "grad_norm": 0.6458085775375366,
      "learning_rate": 5.643564356435644e-06,
      "loss": 1.2301,
      "step": 20
    },
    {
      "epoch": 0.062421972534332085,
      "grad_norm": 0.6387593746185303,
      "learning_rate": 7.128712871287129e-06,
      "loss": 1.1905,
      "step": 25
    },
    {
      "epoch": 0.0749063670411985,
      "grad_norm": 0.5484127402305603,
      "learning_rate": 8.613861386138613e-06,
      "loss": 1.2693,
      "step": 30
    },
    {
      "epoch": 0.08739076154806492,
      "grad_norm": 0.6438347101211548,
      "learning_rate": 1.00990099009901e-05,
      "loss": 1.1677,
      "step": 35
    },
    {
      "epoch": 0.09987515605493133,
      "grad_norm": 0.503511905670166,
      "learning_rate": 1.1584158415841584e-05,
      "loss": 1.1914,
      "step": 40
    },
    {
      "epoch": 0.11235955056179775,
      "grad_norm": 0.5014557242393494,
      "learning_rate": 1.306930693069307e-05,
      "loss": 1.2152,
      "step": 45
    },
    {
      "epoch": 0.12484394506866417,
      "grad_norm": 0.49721699953079224,
      "learning_rate": 1.4554455445544554e-05,
      "loss": 1.1607,
      "step": 50
    },
    {
      "epoch": 0.1373283395755306,
      "grad_norm": 0.5247049927711487,
      "learning_rate": 1.6039603960396042e-05,
      "loss": 1.1722,
      "step": 55
    },
    {
      "epoch": 0.149812734082397,
      "grad_norm": 0.4929521083831787,
      "learning_rate": 1.7524752475247524e-05,
      "loss": 1.1115,
      "step": 60
    },
    {
      "epoch": 0.16229712858926343,
      "grad_norm": 0.4400411546230316,
      "learning_rate": 1.900990099009901e-05,
      "loss": 1.1327,
      "step": 65
    },
    {
      "epoch": 0.17478152309612985,
      "grad_norm": 0.5146467089653015,
      "learning_rate": 2.0495049504950496e-05,
      "loss": 1.0995,
      "step": 70
    },
    {
      "epoch": 0.18726591760299627,
      "grad_norm": 0.4721396565437317,
      "learning_rate": 2.198019801980198e-05,
      "loss": 1.0986,
      "step": 75
    },
    {
      "epoch": 0.19975031210986266,
      "grad_norm": 0.5145531296730042,
      "learning_rate": 2.3465346534653467e-05,
      "loss": 1.052,
      "step": 80
    },
    {
      "epoch": 0.21223470661672908,
      "grad_norm": 0.4975873827934265,
      "learning_rate": 2.495049504950495e-05,
      "loss": 1.1328,
      "step": 85
    },
    {
      "epoch": 0.2247191011235955,
      "grad_norm": 0.5143810510635376,
      "learning_rate": 2.6435643564356436e-05,
      "loss": 1.1032,
      "step": 90
    },
    {
      "epoch": 0.23720349563046192,
      "grad_norm": 0.5382490754127502,
      "learning_rate": 2.792079207920792e-05,
      "loss": 1.1121,
      "step": 95
    },
    {
      "epoch": 0.24968789013732834,
      "grad_norm": 0.5149545073509216,
      "learning_rate": 2.9405940594059407e-05,
      "loss": 1.0955,
      "step": 100
    },
    {
      "epoch": 0.26217228464419473,
      "grad_norm": 0.5805860757827759,
      "learning_rate": 2.999981623248469e-05,
      "loss": 1.0344,
      "step": 105
    },
    {
      "epoch": 0.2746566791510612,
      "grad_norm": 0.524933934211731,
      "learning_rate": 2.9998693225086163e-05,
      "loss": 1.0875,
      "step": 110
    },
    {
      "epoch": 0.28714107365792757,
      "grad_norm": 0.5526661276817322,
      "learning_rate": 2.9996549379694418e-05,
      "loss": 1.1151,
      "step": 115
    },
    {
      "epoch": 0.299625468164794,
      "grad_norm": 0.5277731418609619,
      "learning_rate": 2.9993384842223426e-05,
      "loss": 1.04,
      "step": 120
    },
    {
      "epoch": 0.3121098626716604,
      "grad_norm": 0.5590221285820007,
      "learning_rate": 2.998919982805729e-05,
      "loss": 1.0067,
      "step": 125
    },
    {
      "epoch": 0.32459425717852686,
      "grad_norm": 0.596834659576416,
      "learning_rate": 2.998399462203559e-05,
      "loss": 0.9961,
      "step": 130
    },
    {
      "epoch": 0.33707865168539325,
      "grad_norm": 0.5012202262878418,
      "learning_rate": 2.9977769578433987e-05,
      "loss": 1.0435,
      "step": 135
    },
    {
      "epoch": 0.3495630461922597,
      "grad_norm": 0.6139566898345947,
      "learning_rate": 2.9970525120940132e-05,
      "loss": 0.9714,
      "step": 140
    },
    {
      "epoch": 0.3620474406991261,
      "grad_norm": 0.5993323922157288,
      "learning_rate": 2.9962261742624813e-05,
      "loss": 1.0804,
      "step": 145
    },
    {
      "epoch": 0.37453183520599254,
      "grad_norm": 0.6254947185516357,
      "learning_rate": 2.995298000590839e-05,
      "loss": 0.9821,
      "step": 150
    },
    {
      "epoch": 0.38701622971285893,
      "grad_norm": 0.6110448241233826,
      "learning_rate": 2.9942680542522535e-05,
      "loss": 0.9785,
      "step": 155
    },
    {
      "epoch": 0.3995006242197253,
      "grad_norm": 0.5888521075248718,
      "learning_rate": 2.993136405346722e-05,
      "loss": 0.9882,
      "step": 160
    },
    {
      "epoch": 0.41198501872659177,
      "grad_norm": 0.6749442219734192,
      "learning_rate": 2.991903130896301e-05,
      "loss": 1.0322,
      "step": 165
    },
    {
      "epoch": 0.42446941323345816,
      "grad_norm": 0.7961056232452393,
      "learning_rate": 2.9905683148398642e-05,
      "loss": 0.9149,
      "step": 170
    },
    {
      "epoch": 0.4369538077403246,
      "grad_norm": 0.6190369129180908,
      "learning_rate": 2.9891320480273886e-05,
      "loss": 0.9756,
      "step": 175
    },
    {
      "epoch": 0.449438202247191,
      "grad_norm": 0.6471540927886963,
      "learning_rate": 2.9875944282137734e-05,
      "loss": 0.9204,
      "step": 180
    },
    {
      "epoch": 0.46192259675405745,
      "grad_norm": 0.6503486037254333,
      "learning_rate": 2.9859555600521838e-05,
      "loss": 0.8895,
      "step": 185
    },
    {
      "epoch": 0.47440699126092384,
      "grad_norm": 0.6892472505569458,
      "learning_rate": 2.9842155550869294e-05,
      "loss": 0.9435,
      "step": 190
    },
    {
      "epoch": 0.4868913857677903,
      "grad_norm": 0.7145974636077881,
      "learning_rate": 2.982374531745873e-05,
      "loss": 0.9135,
      "step": 195
    },
    {
      "epoch": 0.4993757802746567,
      "grad_norm": 0.7395890951156616,
      "learning_rate": 2.9804326153323683e-05,
      "loss": 0.8777,
      "step": 200
    },
    {
      "epoch": 0.5118601747815231,
      "grad_norm": 0.8736094832420349,
      "learning_rate": 2.9783899380167345e-05,
      "loss": 0.8953,
      "step": 205
    },
    {
      "epoch": 0.5243445692883895,
      "grad_norm": 0.7746868133544922,
      "learning_rate": 2.976246638827257e-05,
      "loss": 0.8839,
      "step": 210
    },
    {
      "epoch": 0.5368289637952559,
      "grad_norm": 0.7751911878585815,
      "learning_rate": 2.974002863640727e-05,
      "loss": 0.8941,
      "step": 215
    },
    {
      "epoch": 0.5493133583021224,
      "grad_norm": 0.8329525589942932,
      "learning_rate": 2.9716587651725134e-05,
      "loss": 0.8562,
      "step": 220
    },
    {
      "epoch": 0.5617977528089888,
      "grad_norm": 0.8044184446334839,
      "learning_rate": 2.9692145029661664e-05,
      "loss": 0.8543,
      "step": 225
    },
    {
      "epoch": 0.5742821473158551,
      "grad_norm": 0.7520190477371216,
      "learning_rate": 2.9666702433825614e-05,
      "loss": 0.839,
      "step": 230
    },
    {
      "epoch": 0.5867665418227216,
      "grad_norm": 0.7825169563293457,
      "learning_rate": 2.9640261595885735e-05,
      "loss": 0.8368,
      "step": 235
    },
    {
      "epoch": 0.599250936329588,
      "grad_norm": 0.7438607215881348,
      "learning_rate": 2.9612824315452943e-05,
      "loss": 0.8619,
      "step": 240
    },
    {
      "epoch": 0.6117353308364545,
      "grad_norm": 0.7354984283447266,
      "learning_rate": 2.958439245995781e-05,
      "loss": 0.8337,
      "step": 245
    },
    {
      "epoch": 0.6242197253433208,
      "grad_norm": 0.7580518126487732,
      "learning_rate": 2.955496796452347e-05,
      "loss": 0.7538,
      "step": 250
    },
    {
      "epoch": 0.6367041198501873,
      "grad_norm": 0.7653853893280029,
      "learning_rate": 2.9524552831833924e-05,
      "loss": 0.8178,
      "step": 255
    },
    {
      "epoch": 0.6491885143570537,
      "grad_norm": 0.7680398225784302,
      "learning_rate": 2.9493149131997713e-05,
      "loss": 0.795,
      "step": 260
    },
    {
      "epoch": 0.66167290886392,
      "grad_norm": 0.7689065337181091,
      "learning_rate": 2.946075900240704e-05,
      "loss": 0.849,
      "step": 265
    },
    {
      "epoch": 0.6741573033707865,
      "grad_norm": 0.8538303971290588,
      "learning_rate": 2.942738464759229e-05,
      "loss": 0.7875,
      "step": 270
    },
    {
      "epoch": 0.686641697877653,
      "grad_norm": 0.9141761064529419,
      "learning_rate": 2.939302833907197e-05,
      "loss": 0.7989,
      "step": 275
    },
    {
      "epoch": 0.6991260923845194,
      "grad_norm": 0.9288301467895508,
      "learning_rate": 2.9357692415198136e-05,
      "loss": 0.7606,
      "step": 280
    },
    {
      "epoch": 0.7116104868913857,
      "grad_norm": 0.8967108130455017,
      "learning_rate": 2.932137928099722e-05,
      "loss": 0.782,
      "step": 285
    },
    {
      "epoch": 0.7240948813982522,
      "grad_norm": 0.7744588851928711,
      "learning_rate": 2.928409140800634e-05,
      "loss": 0.7793,
      "step": 290
    },
    {
      "epoch": 0.7365792759051186,
      "grad_norm": 0.9789696335792542,
      "learning_rate": 2.9245831334105084e-05,
      "loss": 0.78,
      "step": 295
    },
    {
      "epoch": 0.7490636704119851,
      "grad_norm": 0.853340744972229,
      "learning_rate": 2.9206601663342786e-05,
      "loss": 0.7659,
      "step": 300
    },
    {
      "epoch": 0.7615480649188514,
      "grad_norm": 0.9357976317405701,
      "learning_rate": 2.916640506576128e-05,
      "loss": 0.7468,
      "step": 305
    },
    {
      "epoch": 0.7740324594257179,
      "grad_norm": 0.8526910543441772,
      "learning_rate": 2.9125244277213176e-05,
      "loss": 0.7384,
      "step": 310
    },
    {
      "epoch": 0.7865168539325843,
      "grad_norm": 0.8397050499916077,
      "learning_rate": 2.908312209917565e-05,
      "loss": 0.7512,
      "step": 315
    },
    {
      "epoch": 0.7990012484394506,
      "grad_norm": 0.8564719557762146,
      "learning_rate": 2.904004139855978e-05,
      "loss": 0.7304,
      "step": 320
    },
    {
      "epoch": 0.8114856429463171,
      "grad_norm": 1.0210206508636475,
      "learning_rate": 2.8996005107515405e-05,
      "loss": 0.6818,
      "step": 325
    },
    {
      "epoch": 0.8239700374531835,
      "grad_norm": 0.9247426986694336,
      "learning_rate": 2.8951016223231567e-05,
      "loss": 0.7076,
      "step": 330
    },
    {
      "epoch": 0.83645443196005,
      "grad_norm": 1.1422244310379028,
      "learning_rate": 2.890507780773252e-05,
      "loss": 0.659,
      "step": 335
    },
    {
      "epoch": 0.8489388264669163,
      "grad_norm": 0.8729642629623413,
      "learning_rate": 2.8858192987669303e-05,
      "loss": 0.7419,
      "step": 340
    },
    {
      "epoch": 0.8614232209737828,
      "grad_norm": 0.9443491101264954,
      "learning_rate": 2.881036495410696e-05,
      "loss": 0.6977,
      "step": 345
    },
    {
      "epoch": 0.8739076154806492,
      "grad_norm": 0.9886714816093445,
      "learning_rate": 2.8761596962307343e-05,
      "loss": 0.6996,
      "step": 350
    },
    {
      "epoch": 0.8863920099875156,
      "grad_norm": 0.9167611002922058,
      "learning_rate": 2.871189233150754e-05,
      "loss": 0.6976,
      "step": 355
    },
    {
      "epoch": 0.898876404494382,
      "grad_norm": 0.9194567799568176,
      "learning_rate": 2.8661254444693975e-05,
      "loss": 0.6719,
      "step": 360
    },
    {
      "epoch": 0.9113607990012484,
      "grad_norm": 0.9292520880699158,
      "learning_rate": 2.8609686748372155e-05,
      "loss": 0.6539,
      "step": 365
    },
    {
      "epoch": 0.9238451935081149,
      "grad_norm": 0.8527399301528931,
      "learning_rate": 2.8557192752332082e-05,
      "loss": 0.6709,
      "step": 370
    },
    {
      "epoch": 0.9363295880149812,
      "grad_norm": 1.0521934032440186,
      "learning_rate": 2.8503776029409398e-05,
      "loss": 0.6857,
      "step": 375
    },
    {
      "epoch": 0.9488139825218477,
      "grad_norm": 0.9660723209381104,
      "learning_rate": 2.844944021524217e-05,
      "loss": 0.6418,
      "step": 380
    },
    {
      "epoch": 0.9612983770287141,
      "grad_norm": 0.9831718802452087,
      "learning_rate": 2.8394189008023482e-05,
      "loss": 0.6334,
      "step": 385
    },
    {
      "epoch": 0.9737827715355806,
      "grad_norm": 0.9556214809417725,
      "learning_rate": 2.833802616824972e-05,
      "loss": 0.6385,
      "step": 390
    },
    {
      "epoch": 0.9862671660424469,
      "grad_norm": 0.9405916929244995,
      "learning_rate": 2.8280955518464587e-05,
      "loss": 0.6673,
      "step": 395
    },
    {
      "epoch": 0.9987515605493134,
      "grad_norm": 0.9383732676506042,
      "learning_rate": 2.8222980942999005e-05,
      "loss": 0.618,
      "step": 400
    }
  ],
  "logging_steps": 5,
  "max_steps": 2005,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 5.842838055865549e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}