| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.9910313901345291, | |
| "eval_steps": 500, | |
| "global_step": 444, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.02242152466367713, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 3.225012776176047e-05, | |
| "loss": 0.8841, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.04484304932735426, | |
| "grad_norm": 1.2890625, | |
| "learning_rate": 7.256278746396105e-05, | |
| "loss": 0.703, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.06726457399103139, | |
| "grad_norm": 1.4375, | |
| "learning_rate": 0.00011287544716616165, | |
| "loss": 0.6704, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.08968609865470852, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 0.00015318810686836223, | |
| "loss": 0.6641, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.11210762331838565, | |
| "grad_norm": 0.95703125, | |
| "learning_rate": 0.00019350076657056282, | |
| "loss": 0.6811, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.13452914798206278, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 0.00023381342627276343, | |
| "loss": 0.662, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.15695067264573992, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 0.000274126085974964, | |
| "loss": 0.6823, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.17937219730941703, | |
| "grad_norm": 8.1875, | |
| "learning_rate": 0.0002821678320668541, | |
| "loss": 0.9132, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.20179372197309417, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 0.00028208340355189316, | |
| "loss": 0.9479, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.2242152466367713, | |
| "grad_norm": 1.390625, | |
| "learning_rate": 0.00028193408503878413, | |
| "loss": 0.7504, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.24663677130044842, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 0.00028171996818179184, | |
| "loss": 0.7178, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.26905829596412556, | |
| "grad_norm": 0.765625, | |
| "learning_rate": 0.00028144118440951647, | |
| "loss": 0.7022, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.2914798206278027, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 0.00028109790484422007, | |
| "loss": 0.6894, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.31390134529147984, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0002806903401967889, | |
| "loss": 0.6873, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.336322869955157, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 0.0002802187406373956, | |
| "loss": 0.6881, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.35874439461883406, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 0.0002796833956419402, | |
| "loss": 0.6827, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.3811659192825112, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 0.00027908463381436484, | |
| "loss": 0.6601, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.40358744394618834, | |
| "grad_norm": 0.515625, | |
| "learning_rate": 0.00027842282268495153, | |
| "loss": 0.6652, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.4260089686098655, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 0.000277698368484725, | |
| "loss": 0.6648, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.4484304932735426, | |
| "grad_norm": 0.625, | |
| "learning_rate": 0.0002769117158961017, | |
| "loss": 0.668, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.47085201793721976, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 0.000276063347779936, | |
| "loss": 0.6516, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.49327354260089684, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 0.00027515378487913156, | |
| "loss": 0.651, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.515695067264574, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 0.00027418358549900095, | |
| "loss": 0.6457, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.5381165919282511, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 0.0002731533451645679, | |
| "loss": 0.6383, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.5605381165919282, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 0.0002720636962550239, | |
| "loss": 0.6304, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.5829596412556054, | |
| "grad_norm": 0.458984375, | |
| "learning_rate": 0.000270915307615563, | |
| "loss": 0.6478, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.6053811659192825, | |
| "grad_norm": 0.474609375, | |
| "learning_rate": 0.0002697088841468332, | |
| "loss": 0.621, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.6278026905829597, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 0.00026844516637225627, | |
| "loss": 0.644, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.6502242152466368, | |
| "grad_norm": 0.44140625, | |
| "learning_rate": 0.0002671249299834821, | |
| "loss": 0.6311, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.672645739910314, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 0.00026574898536425504, | |
| "loss": 0.6328, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.695067264573991, | |
| "grad_norm": 0.49609375, | |
| "learning_rate": 0.00026431817709298756, | |
| "loss": 0.607, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.7174887892376681, | |
| "grad_norm": 0.4375, | |
| "learning_rate": 0.0002628333834243426, | |
| "loss": 0.6266, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.7399103139013453, | |
| "grad_norm": 0.482421875, | |
| "learning_rate": 0.0002612955157501462, | |
| "loss": 0.6151, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.7623318385650224, | |
| "grad_norm": 0.443359375, | |
| "learning_rate": 0.00025970551803995967, | |
| "loss": 0.6259, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.7847533632286996, | |
| "grad_norm": 0.4609375, | |
| "learning_rate": 0.0002580643662616546, | |
| "loss": 0.6136, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.8071748878923767, | |
| "grad_norm": 0.4765625, | |
| "learning_rate": 0.0002563730677823472, | |
| "loss": 0.6075, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.8295964125560538, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 0.00025463266075005897, | |
| "loss": 0.6133, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.852017937219731, | |
| "grad_norm": 0.47265625, | |
| "learning_rate": 0.0002528442134564838, | |
| "loss": 0.597, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.874439461883408, | |
| "grad_norm": 0.431640625, | |
| "learning_rate": 0.0002510088236812521, | |
| "loss": 0.5931, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.8968609865470852, | |
| "grad_norm": 0.4609375, | |
| "learning_rate": 0.00024912761801809485, | |
| "loss": 0.6266, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.9192825112107623, | |
| "grad_norm": 0.458984375, | |
| "learning_rate": 0.0002472017511833209, | |
| "loss": 0.6032, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.9417040358744395, | |
| "grad_norm": 0.4296875, | |
| "learning_rate": 0.00024523240530703263, | |
| "loss": 0.5887, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.9641255605381166, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 0.0002432207892075138, | |
| "loss": 0.596, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.9865470852017937, | |
| "grad_norm": 0.443359375, | |
| "learning_rate": 0.00024116813764923656, | |
| "loss": 0.6081, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.9955156950672646, | |
| "eval_loss": 0.5990473628044128, | |
| "eval_runtime": 3.2112, | |
| "eval_samples_per_second": 13.079, | |
| "eval_steps_per_second": 13.079, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 1.0089686098654709, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 0.00023907571058494136, | |
| "loss": 0.5292, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 1.031390134529148, | |
| "grad_norm": 0.46875, | |
| "learning_rate": 0.0002369447923822569, | |
| "loss": 0.4347, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.053811659192825, | |
| "grad_norm": 0.458984375, | |
| "learning_rate": 0.00023477669103533273, | |
| "loss": 0.4224, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 1.0762331838565022, | |
| "grad_norm": 0.47265625, | |
| "learning_rate": 0.0002325727373619704, | |
| "loss": 0.4291, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.0986547085201794, | |
| "grad_norm": 0.4609375, | |
| "learning_rate": 0.0002303342841867443, | |
| "loss": 0.4202, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 1.1210762331838564, | |
| "grad_norm": 0.423828125, | |
| "learning_rate": 0.0002280627055106151, | |
| "loss": 0.42, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.1434977578475336, | |
| "grad_norm": 0.388671875, | |
| "learning_rate": 0.0002257593956675441, | |
| "loss": 0.434, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 1.1659192825112108, | |
| "grad_norm": 0.4453125, | |
| "learning_rate": 0.0002234257684686273, | |
| "loss": 0.4278, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.188340807174888, | |
| "grad_norm": 0.396484375, | |
| "learning_rate": 0.00022106325633427373, | |
| "loss": 0.4261, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 1.210762331838565, | |
| "grad_norm": 0.42578125, | |
| "learning_rate": 0.00021867330941496144, | |
| "loss": 0.4233, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.2331838565022422, | |
| "grad_norm": 0.40625, | |
| "learning_rate": 0.00021625739470111004, | |
| "loss": 0.4301, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 1.2556053811659194, | |
| "grad_norm": 0.384765625, | |
| "learning_rate": 0.00021381699512261696, | |
| "loss": 0.4405, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.2780269058295963, | |
| "grad_norm": 0.421875, | |
| "learning_rate": 0.00021135360863860965, | |
| "loss": 0.4323, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 1.3004484304932735, | |
| "grad_norm": 0.373046875, | |
| "learning_rate": 0.00020886874731797242, | |
| "loss": 0.4345, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.3228699551569507, | |
| "grad_norm": 0.37890625, | |
| "learning_rate": 0.00020636393641121277, | |
| "loss": 0.4321, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 1.3452914798206277, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 0.0002038407134142364, | |
| "loss": 0.4273, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.3677130044843049, | |
| "grad_norm": 0.388671875, | |
| "learning_rate": 0.0002013006271246058, | |
| "loss": 0.4319, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 1.390134529147982, | |
| "grad_norm": 0.3828125, | |
| "learning_rate": 0.00019874523669086193, | |
| "loss": 0.4301, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.4125560538116593, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 0.0001961761106554918, | |
| "loss": 0.4327, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 1.4349775784753362, | |
| "grad_norm": 0.39453125, | |
| "learning_rate": 0.00019359482599213036, | |
| "loss": 0.4285, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.4573991031390134, | |
| "grad_norm": 0.375, | |
| "learning_rate": 0.0001910029671375871, | |
| "loss": 0.4392, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 1.4798206278026906, | |
| "grad_norm": 0.37890625, | |
| "learning_rate": 0.00018840212501929134, | |
| "loss": 0.4245, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.5022421524663678, | |
| "grad_norm": 0.39453125, | |
| "learning_rate": 0.00018579389607875394, | |
| "loss": 0.4231, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 1.5246636771300448, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 0.0001831798812916439, | |
| "loss": 0.4239, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.547085201793722, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 0.00018056168518508213, | |
| "loss": 0.4122, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 1.5695067264573992, | |
| "grad_norm": 0.369140625, | |
| "learning_rate": 0.0001779409148527551, | |
| "loss": 0.4199, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.5919282511210762, | |
| "grad_norm": 0.39453125, | |
| "learning_rate": 0.0001753191789684532, | |
| "loss": 0.4191, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 1.6143497757847534, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 0.00017269808679863934, | |
| "loss": 0.4109, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.6367713004484306, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 0.00017007924721465324, | |
| "loss": 0.4178, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 1.6591928251121075, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 0.00016746426770515897, | |
| "loss": 0.4236, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.6816143497757847, | |
| "grad_norm": 0.384765625, | |
| "learning_rate": 0.0001648547533894405, | |
| "loss": 0.4221, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 1.704035874439462, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 0.00016225230603215222, | |
| "loss": 0.4144, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.726457399103139, | |
| "grad_norm": 0.353515625, | |
| "learning_rate": 0.0001596585230601281, | |
| "loss": 0.4171, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 1.7488789237668163, | |
| "grad_norm": 0.380859375, | |
| "learning_rate": 0.00015707499658185395, | |
| "loss": 0.4144, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.7713004484304933, | |
| "grad_norm": 0.33984375, | |
| "learning_rate": 0.0001545033124102035, | |
| "loss": 0.3981, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 1.7937219730941703, | |
| "grad_norm": 0.34765625, | |
| "learning_rate": 0.00015194504908903986, | |
| "loss": 0.4076, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.8161434977578477, | |
| "grad_norm": 0.369140625, | |
| "learning_rate": 0.0001494017769242775, | |
| "loss": 0.4093, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 1.8385650224215246, | |
| "grad_norm": 0.337890625, | |
| "learning_rate": 0.00014687505702000225, | |
| "loss": 0.4092, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.8609865470852018, | |
| "grad_norm": 0.345703125, | |
| "learning_rate": 0.00014436644032023824, | |
| "loss": 0.3973, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 1.883408071748879, | |
| "grad_norm": 0.349609375, | |
| "learning_rate": 0.0001418774666569522, | |
| "loss": 0.397, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.905829596412556, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 0.00013940966380487815, | |
| "loss": 0.4015, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 1.9282511210762332, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 0.00013696454654374285, | |
| "loss": 0.4088, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.9506726457399104, | |
| "grad_norm": 0.330078125, | |
| "learning_rate": 0.00013454361572846828, | |
| "loss": 0.3922, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 1.9730941704035874, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 0.00013214835736792096, | |
| "loss": 0.3984, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.9820627802690582, | |
| "eval_loss": 0.5519173741340637, | |
| "eval_runtime": 3.2083, | |
| "eval_samples_per_second": 13.091, | |
| "eval_steps_per_second": 13.091, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 1.9910313901345291, | |
| "eval_loss": 0.5516598224639893, | |
| "eval_runtime": 3.2302, | |
| "eval_samples_per_second": 13.002, | |
| "eval_steps_per_second": 13.002, | |
| "step": 444 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 669, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 7.145669325918044e+17, | |
| "train_batch_size": 100, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |