{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 366, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0412796697626419, "grad_norm": 16.710033810394073, "learning_rate": 2.1621621621621622e-07, "loss": 1.0133, "mean_token_accuracy": 0.7553978890180588, "num_tokens": 474561.0, "step": 5 }, { "epoch": 0.0825593395252838, "grad_norm": 14.528834570178129, "learning_rate": 4.864864864864865e-07, "loss": 0.9896, "mean_token_accuracy": 0.7556369215250015, "num_tokens": 947546.0, "step": 10 }, { "epoch": 0.1238390092879257, "grad_norm": 8.780161837177058, "learning_rate": 7.567567567567568e-07, "loss": 0.8241, "mean_token_accuracy": 0.7811301812529564, "num_tokens": 1418691.0, "step": 15 }, { "epoch": 0.1651186790505676, "grad_norm": 2.7100520665485894, "learning_rate": 1.0270270270270269e-06, "loss": 0.7011, "mean_token_accuracy": 0.794181127846241, "num_tokens": 1892249.0, "step": 20 }, { "epoch": 0.20639834881320948, "grad_norm": 1.7196639977337644, "learning_rate": 1.2972972972972972e-06, "loss": 0.6437, "mean_token_accuracy": 0.800345453619957, "num_tokens": 2366197.0, "step": 25 }, { "epoch": 0.2476780185758514, "grad_norm": 1.680207863114678, "learning_rate": 1.5675675675675676e-06, "loss": 0.5926, "mean_token_accuracy": 0.8132463812828064, "num_tokens": 2838384.0, "step": 30 }, { "epoch": 0.2889576883384933, "grad_norm": 1.164814654682159, "learning_rate": 1.837837837837838e-06, "loss": 0.5987, "mean_token_accuracy": 0.8100811287760734, "num_tokens": 3309793.0, "step": 35 }, { "epoch": 0.3302373581011352, "grad_norm": 1.198281328185356, "learning_rate": 1.9998176420316e-06, "loss": 0.5949, "mean_token_accuracy": 0.809085787832737, "num_tokens": 3784285.0, "step": 40 }, { "epoch": 0.3715170278637771, "grad_norm": 1.284722675046685, "learning_rate": 1.9977668786231533e-06, "loss": 0.6033, "mean_token_accuracy": 0.8060941636562348, "num_tokens": 4259188.0, "step": 45 }, { "epoch": 0.41279669762641896, "grad_norm": 1.155248536480801, "learning_rate": 1.993442093851331e-06, "loss": 0.5767, "mean_token_accuracy": 0.8136543348431587, "num_tokens": 4733149.0, "step": 50 }, { "epoch": 0.4540763673890609, "grad_norm": 1.1147204245893136, "learning_rate": 1.986853144380224e-06, "loss": 0.5667, "mean_token_accuracy": 0.8162353426218033, "num_tokens": 5207210.0, "step": 55 }, { "epoch": 0.4953560371517028, "grad_norm": 1.112840928320902, "learning_rate": 1.9780150471563555e-06, "loss": 0.5869, "mean_token_accuracy": 0.810538823902607, "num_tokens": 5683326.0, "step": 60 }, { "epoch": 0.5366357069143447, "grad_norm": 1.0402844533961697, "learning_rate": 1.9669479451833974e-06, "loss": 0.5756, "mean_token_accuracy": 0.8130813702940941, "num_tokens": 6156332.0, "step": 65 }, { "epoch": 0.5779153766769866, "grad_norm": 1.0779121215536756, "learning_rate": 1.9536770616140275e-06, "loss": 0.5658, "mean_token_accuracy": 0.8173741087317467, "num_tokens": 6628529.0, "step": 70 }, { "epoch": 0.6191950464396285, "grad_norm": 1.0854741991980508, "learning_rate": 1.9382326422635704e-06, "loss": 0.5613, "mean_token_accuracy": 0.8183763369917869, "num_tokens": 7104675.0, "step": 75 }, { "epoch": 0.6604747162022704, "grad_norm": 1.03078206062344, "learning_rate": 1.920649886676429e-06, "loss": 0.5246, "mean_token_accuracy": 0.8290216967463493, "num_tokens": 7575339.0, "step": 80 }, { "epoch": 0.7017543859649122, "grad_norm": 1.0790498772904935, "learning_rate": 1.9009688679024189e-06, "loss": 0.5674, "mean_token_accuracy": 0.8155619785189628, "num_tokens": 8049585.0, "step": 85 }, { "epoch": 0.7430340557275542, "grad_norm": 1.0644009479188448, "learning_rate": 1.8792344411658468e-06, "loss": 0.5484, "mean_token_accuracy": 0.8202491089701652, "num_tokens": 8523864.0, "step": 90 }, { "epoch": 0.7843137254901961, "grad_norm": 1.0597366929661158, "learning_rate": 1.8554961416354758e-06, "loss": 0.5495, "mean_token_accuracy": 0.8222089603543281, "num_tokens": 8993860.0, "step": 95 }, { "epoch": 0.8255933952528379, "grad_norm": 1.0650070028025487, "learning_rate": 1.8298080715283857e-06, "loss": 0.5536, "mean_token_accuracy": 0.8196913883090019, "num_tokens": 9467969.0, "step": 100 }, { "epoch": 0.8668730650154799, "grad_norm": 1.0912797921529367, "learning_rate": 1.80222877680502e-06, "loss": 0.5694, "mean_token_accuracy": 0.8143000498414039, "num_tokens": 9943019.0, "step": 105 }, { "epoch": 0.9081527347781218, "grad_norm": 1.099585144155188, "learning_rate": 1.7728211137364486e-06, "loss": 0.5469, "mean_token_accuracy": 0.820717391371727, "num_tokens": 10416469.0, "step": 110 }, { "epoch": 0.9494324045407637, "grad_norm": 1.1065390692517416, "learning_rate": 1.7416521056479575e-06, "loss": 0.5654, "mean_token_accuracy": 0.8159454494714737, "num_tokens": 10891463.0, "step": 115 }, { "epoch": 0.9907120743034056, "grad_norm": 1.0600231806965752, "learning_rate": 1.7087927901654556e-06, "loss": 0.5343, "mean_token_accuracy": 0.8255413874983788, "num_tokens": 11362461.0, "step": 120 }, { "epoch": 1.0247678018575852, "grad_norm": 1.075516597064286, "learning_rate": 1.6743180573128493e-06, "loss": 0.5489, "mean_token_accuracy": 0.8342628081639608, "num_tokens": 11754365.0, "step": 125 }, { "epoch": 1.066047471620227, "grad_norm": 1.1024958520463706, "learning_rate": 1.6383064788293728e-06, "loss": 0.4588, "mean_token_accuracy": 0.8454525545239449, "num_tokens": 12226786.0, "step": 130 }, { "epoch": 1.107327141382869, "grad_norm": 1.1227030716421933, "learning_rate": 1.6008401290958805e-06, "loss": 0.4695, "mean_token_accuracy": 0.8423517674207688, "num_tokens": 12699204.0, "step": 135 }, { "epoch": 1.1486068111455108, "grad_norm": 1.1177693050643365, "learning_rate": 1.5620043980782325e-06, "loss": 0.4656, "mean_token_accuracy": 0.8432464152574539, "num_tokens": 13176184.0, "step": 140 }, { "epoch": 1.1898864809081529, "grad_norm": 1.1746918069579293, "learning_rate": 1.521887796714092e-06, "loss": 0.4549, "mean_token_accuracy": 0.845870116353035, "num_tokens": 13647750.0, "step": 145 }, { "epoch": 1.2311661506707947, "grad_norm": 1.1505216678817896, "learning_rate": 1.4805817551866838e-06, "loss": 0.4701, "mean_token_accuracy": 0.8416765749454498, "num_tokens": 14123743.0, "step": 150 }, { "epoch": 1.2724458204334366, "grad_norm": 1.1436761762759031, "learning_rate": 1.438180414545267e-06, "loss": 0.438, "mean_token_accuracy": 0.8506157398223877, "num_tokens": 14596023.0, "step": 155 }, { "epoch": 1.3137254901960784, "grad_norm": 1.1647013953489513, "learning_rate": 1.394780412147245e-06, "loss": 0.4522, "mean_token_accuracy": 0.8473987281322479, "num_tokens": 15071031.0, "step": 160 }, { "epoch": 1.3550051599587203, "grad_norm": 1.1333033999243753, "learning_rate": 1.3504806614109097e-06, "loss": 0.437, "mean_token_accuracy": 0.8531909629702568, "num_tokens": 15543244.0, "step": 165 }, { "epoch": 1.3962848297213624, "grad_norm": 1.1120619686788833, "learning_rate": 1.3053821263807945e-06, "loss": 0.4634, "mean_token_accuracy": 0.8448714122176171, "num_tokens": 16017065.0, "step": 170 }, { "epoch": 1.437564499484004, "grad_norm": 1.111129800110557, "learning_rate": 1.2595875916194184e-06, "loss": 0.4417, "mean_token_accuracy": 0.849587918817997, "num_tokens": 16491281.0, "step": 175 }, { "epoch": 1.478844169246646, "grad_norm": 1.150776254338656, "learning_rate": 1.21320142794987e-06, "loss": 0.4562, "mean_token_accuracy": 0.8464704275131225, "num_tokens": 16968313.0, "step": 180 }, { "epoch": 1.520123839009288, "grad_norm": 1.2131274987610103, "learning_rate": 1.16632935458313e-06, "loss": 0.45, "mean_token_accuracy": 0.8496235758066177, "num_tokens": 17441110.0, "step": 185 }, { "epoch": 1.5614035087719298, "grad_norm": 1.2050312039209545, "learning_rate": 1.119078198172262e-06, "loss": 0.4607, "mean_token_accuracy": 0.8451412498950959, "num_tokens": 17916232.0, "step": 190 }, { "epoch": 1.6026831785345719, "grad_norm": 1.1254347771320186, "learning_rate": 1.071555649342626e-06, "loss": 0.4633, "mean_token_accuracy": 0.8458669915795326, "num_tokens": 18390551.0, "step": 195 }, { "epoch": 1.6439628482972135, "grad_norm": 1.1206711136434166, "learning_rate": 1.0238700172530007e-06, "loss": 0.4512, "mean_token_accuracy": 0.8467774465680122, "num_tokens": 18864159.0, "step": 200 }, { "epoch": 1.6852425180598556, "grad_norm": 1.1206183234114782, "learning_rate": 9.761299827469992e-07, "loss": 0.4546, "mean_token_accuracy": 0.8466342076659202, "num_tokens": 19335203.0, "step": 205 }, { "epoch": 1.7265221878224974, "grad_norm": 1.1225182789231243, "learning_rate": 9.284443506573739e-07, "loss": 0.453, "mean_token_accuracy": 0.8467301100492477, "num_tokens": 19807808.0, "step": 210 }, { "epoch": 1.7678018575851393, "grad_norm": 1.0862991936704458, "learning_rate": 8.809218018277377e-07, "loss": 0.4345, "mean_token_accuracy": 0.852784389257431, "num_tokens": 20281710.0, "step": 215 }, { "epoch": 1.8090815273477814, "grad_norm": 1.048025349811373, "learning_rate": 8.336706454168699e-07, "loss": 0.4465, "mean_token_accuracy": 0.8481804400682449, "num_tokens": 20751414.0, "step": 220 }, { "epoch": 1.850361197110423, "grad_norm": 1.106143000786653, "learning_rate": 7.8679857205013e-07, "loss": 0.4481, "mean_token_accuracy": 0.848122601211071, "num_tokens": 21223522.0, "step": 225 }, { "epoch": 1.891640866873065, "grad_norm": 1.143127191810739, "learning_rate": 7.404124083805818e-07, "loss": 0.4531, "mean_token_accuracy": 0.8467795923352242, "num_tokens": 21697455.0, "step": 230 }, { "epoch": 1.932920536635707, "grad_norm": 1.2019232454749589, "learning_rate": 6.946178736192052e-07, "loss": 0.463, "mean_token_accuracy": 0.8440817475318909, "num_tokens": 22171241.0, "step": 235 }, { "epoch": 1.9742002063983488, "grad_norm": 1.1622964289091862, "learning_rate": 6.495193385890901e-07, "loss": 0.45, "mean_token_accuracy": 0.8478259801864624, "num_tokens": 22643133.0, "step": 240 }, { "epoch": 2.0082559339525283, "grad_norm": 1.3310228664644796, "learning_rate": 6.052195878527549e-07, "loss": 0.4939, "mean_token_accuracy": 0.8443521983695753, "num_tokens": 23036728.0, "step": 245 }, { "epoch": 2.0495356037151704, "grad_norm": 1.1421506298255963, "learning_rate": 5.618195854547332e-07, "loss": 0.401, "mean_token_accuracy": 0.8647464781999588, "num_tokens": 23512046.0, "step": 250 }, { "epoch": 2.090815273477812, "grad_norm": 1.0987074279262417, "learning_rate": 5.194182448133162e-07, "loss": 0.3902, "mean_token_accuracy": 0.8662704512476921, "num_tokens": 23988482.0, "step": 255 }, { "epoch": 2.132094943240454, "grad_norm": 1.135056862447882, "learning_rate": 4.781122032859079e-07, "loss": 0.3869, "mean_token_accuracy": 0.8672365352511406, "num_tokens": 24462226.0, "step": 260 }, { "epoch": 2.173374613003096, "grad_norm": 1.144024286056472, "learning_rate": 4.379956019217674e-07, "loss": 0.3816, "mean_token_accuracy": 0.868879072368145, "num_tokens": 24935522.0, "step": 265 }, { "epoch": 2.214654282765738, "grad_norm": 1.095313893833739, "learning_rate": 3.991598709041195e-07, "loss": 0.398, "mean_token_accuracy": 0.8641743138432503, "num_tokens": 25412409.0, "step": 270 }, { "epoch": 2.25593395252838, "grad_norm": 1.146038956350082, "learning_rate": 3.6169352117062745e-07, "loss": 0.3963, "mean_token_accuracy": 0.8645552083849907, "num_tokens": 25887799.0, "step": 275 }, { "epoch": 2.2972136222910216, "grad_norm": 1.1485726690588176, "learning_rate": 3.2568194268715065e-07, "loss": 0.3639, "mean_token_accuracy": 0.873411850631237, "num_tokens": 26358464.0, "step": 280 }, { "epoch": 2.3384932920536636, "grad_norm": 1.1969513148119446, "learning_rate": 2.912072098345446e-07, "loss": 0.3641, "mean_token_accuracy": 0.8735954254865647, "num_tokens": 26830555.0, "step": 285 }, { "epoch": 2.3797729618163057, "grad_norm": 1.1692613851184464, "learning_rate": 2.583478943520424e-07, "loss": 0.3813, "mean_token_accuracy": 0.8684006243944168, "num_tokens": 27301887.0, "step": 290 }, { "epoch": 2.4210526315789473, "grad_norm": 1.1756633565591612, "learning_rate": 2.271788862635513e-07, "loss": 0.3689, "mean_token_accuracy": 0.8724981382489204, "num_tokens": 27770674.0, "step": 295 }, { "epoch": 2.4623323013415894, "grad_norm": 1.1723451227641641, "learning_rate": 1.9777122319497986e-07, "loss": 0.4079, "mean_token_accuracy": 0.8619420573115348, "num_tokens": 28245209.0, "step": 300 }, { "epoch": 2.503611971104231, "grad_norm": 1.1497630056766186, "learning_rate": 1.7019192847161423e-07, "loss": 0.3652, "mean_token_accuracy": 0.872562825679779, "num_tokens": 28717418.0, "step": 305 }, { "epoch": 2.544891640866873, "grad_norm": 1.1389041755051739, "learning_rate": 1.4450385836452428e-07, "loss": 0.3915, "mean_token_accuracy": 0.8656164303421974, "num_tokens": 29191351.0, "step": 310 }, { "epoch": 2.586171310629515, "grad_norm": 1.1966708809689266, "learning_rate": 1.207655588341534e-07, "loss": 0.3851, "mean_token_accuracy": 0.8667084857821464, "num_tokens": 29664862.0, "step": 315 }, { "epoch": 2.627450980392157, "grad_norm": 1.1811552059060793, "learning_rate": 9.903113209758096e-08, "loss": 0.3768, "mean_token_accuracy": 0.8696465089917182, "num_tokens": 30139751.0, "step": 320 }, { "epoch": 2.6687306501547985, "grad_norm": 1.185142290815498, "learning_rate": 7.93501133235711e-08, "loss": 0.3823, "mean_token_accuracy": 0.8686287999153137, "num_tokens": 30609687.0, "step": 325 }, { "epoch": 2.7100103199174406, "grad_norm": 1.1477492199007593, "learning_rate": 6.17673577364296e-08, "loss": 0.3779, "mean_token_accuracy": 0.8701810359954834, "num_tokens": 31082327.0, "step": 330 }, { "epoch": 2.7512899896800826, "grad_norm": 1.1198458468611587, "learning_rate": 4.632293838597246e-08, "loss": 0.3726, "mean_token_accuracy": 0.8715211614966393, "num_tokens": 31558388.0, "step": 335 }, { "epoch": 2.7925696594427247, "grad_norm": 1.153660136950001, "learning_rate": 3.305205481660245e-08, "loss": 0.3933, "mean_token_accuracy": 0.8645625025033951, "num_tokens": 32034515.0, "step": 340 }, { "epoch": 2.8338493292053664, "grad_norm": 1.1426440702388894, "learning_rate": 2.19849528436441e-08, "loss": 0.3854, "mean_token_accuracy": 0.8678223595023156, "num_tokens": 32507410.0, "step": 345 }, { "epoch": 2.875128998968008, "grad_norm": 1.1521506874308294, "learning_rate": 1.3146855619776132e-08, "loss": 0.3604, "mean_token_accuracy": 0.8755264401435852, "num_tokens": 32978318.0, "step": 350 }, { "epoch": 2.91640866873065, "grad_norm": 1.1215252031109357, "learning_rate": 6.557906148669023e-09, "loss": 0.3876, "mean_token_accuracy": 0.8669898718595505, "num_tokens": 33451754.0, "step": 355 }, { "epoch": 2.957688338493292, "grad_norm": 1.1622153487674391, "learning_rate": 2.233121376846836e-09, "loss": 0.3952, "mean_token_accuracy": 0.8642757371068001, "num_tokens": 33925172.0, "step": 360 }, { "epoch": 2.998968008255934, "grad_norm": 1.1751426215214322, "learning_rate": 1.8235796839982664e-10, "loss": 0.3871, "mean_token_accuracy": 0.8679782792925834, "num_tokens": 34399819.0, "step": 365 }, { "epoch": 3.0, "mean_token_accuracy": 0.8686857223510742, "num_tokens": 34411845.0, "step": 366, "total_flos": 131252961591296.0, "train_loss": 0.4884359234017753, "train_runtime": 11375.5467, "train_samples_per_second": 4.088, "train_steps_per_second": 0.032 } ], "logging_steps": 5, "max_steps": 366, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 131252961591296.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }