{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 974, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02053915275994865, "grad_norm": 0.6241949796676636, "learning_rate": 9.99789342773948e-06, "loss": 0.1517, "step": 10 }, { "epoch": 0.0410783055198973, "grad_norm": 0.3109052777290344, "learning_rate": 9.99061372843304e-06, "loss": 0.1347, "step": 20 }, { "epoch": 0.06161745827984596, "grad_norm": 0.28893721103668213, "learning_rate": 9.978142466031648e-06, "loss": 0.12, "step": 30 }, { "epoch": 0.0821566110397946, "grad_norm": 0.2942233979701996, "learning_rate": 9.960492613958845e-06, "loss": 0.1187, "step": 40 }, { "epoch": 0.10269576379974327, "grad_norm": 0.26697084307670593, "learning_rate": 9.937682532746152e-06, "loss": 0.1146, "step": 50 }, { "epoch": 0.12323491655969192, "grad_norm": 0.26033908128738403, "learning_rate": 9.90973595093323e-06, "loss": 0.1111, "step": 60 }, { "epoch": 0.14377406931964057, "grad_norm": 0.24977026879787445, "learning_rate": 9.87668194038392e-06, "loss": 0.1073, "step": 70 }, { "epoch": 0.1643132220795892, "grad_norm": 0.2998938262462616, "learning_rate": 9.838554886043768e-06, "loss": 0.1077, "step": 80 }, { "epoch": 0.18485237483953787, "grad_norm": 0.30642247200012207, "learning_rate": 9.795394450170566e-06, "loss": 0.1087, "step": 90 }, { "epoch": 0.20539152759948653, "grad_norm": 0.2964343726634979, "learning_rate": 9.747245531075069e-06, "loss": 0.1052, "step": 100 }, { "epoch": 0.22593068035943517, "grad_norm": 0.2638753056526184, "learning_rate": 9.694158216414836e-06, "loss": 0.1061, "step": 110 }, { "epoch": 0.24646983311938384, "grad_norm": 0.2852879464626312, "learning_rate": 9.63618773108977e-06, "loss": 0.1024, "step": 120 }, { "epoch": 0.26700898587933247, "grad_norm": 0.2863173484802246, "learning_rate": 9.573394379793557e-06, "loss": 0.1032, "step": 130 }, { "epoch": 0.28754813863928114, "grad_norm": 0.2960529625415802, "learning_rate": 9.505843484280783e-06, "loss": 0.1063, "step": 140 }, { "epoch": 0.3080872913992298, "grad_norm": 0.28948211669921875, "learning_rate": 9.433605315414967e-06, "loss": 0.1027, "step": 150 }, { "epoch": 0.3286264441591784, "grad_norm": 0.2555496096611023, "learning_rate": 9.356755020068201e-06, "loss": 0.0995, "step": 160 }, { "epoch": 0.3491655969191271, "grad_norm": 0.2884163558483124, "learning_rate": 9.275372542948466e-06, "loss": 0.1017, "step": 170 }, { "epoch": 0.36970474967907574, "grad_norm": 0.2388547658920288, "learning_rate": 9.189542543435909e-06, "loss": 0.1015, "step": 180 }, { "epoch": 0.3902439024390244, "grad_norm": 0.2760099768638611, "learning_rate": 9.099354307514607e-06, "loss": 0.099, "step": 190 }, { "epoch": 0.41078305519897307, "grad_norm": 0.29273074865341187, "learning_rate": 9.004901654891453e-06, "loss": 0.0989, "step": 200 }, { "epoch": 0.4313222079589217, "grad_norm": 0.31362560391426086, "learning_rate": 8.906282841398747e-06, "loss": 0.096, "step": 210 }, { "epoch": 0.45186136071887034, "grad_norm": 0.27469882369041443, "learning_rate": 8.803600456782053e-06, "loss": 0.0957, "step": 220 }, { "epoch": 0.472400513478819, "grad_norm": 0.2949489653110504, "learning_rate": 8.696961317979622e-06, "loss": 0.0959, "step": 230 }, { "epoch": 0.49293966623876767, "grad_norm": 0.2868664264678955, "learning_rate": 8.586476358004427e-06, "loss": 0.0954, "step": 240 }, { "epoch": 0.5134788189987163, "grad_norm": 0.28068357706069946, "learning_rate": 8.472260510544383e-06, "loss": 0.0958, "step": 250 }, { "epoch": 0.5340179717586649, "grad_norm": 0.2683391273021698, "learning_rate": 8.354432590400792e-06, "loss": 0.0934, "step": 260 }, { "epoch": 0.5545571245186136, "grad_norm": 0.26564258337020874, "learning_rate": 8.23311516988942e-06, "loss": 0.0964, "step": 270 }, { "epoch": 0.5750962772785623, "grad_norm": 0.3143099844455719, "learning_rate": 8.10843445133274e-06, "loss": 0.094, "step": 280 }, { "epoch": 0.5956354300385109, "grad_norm": 0.25333961844444275, "learning_rate": 7.980520135776023e-06, "loss": 0.0908, "step": 290 }, { "epoch": 0.6161745827984596, "grad_norm": 0.2577582001686096, "learning_rate": 7.849505288063808e-06, "loss": 0.092, "step": 300 }, { "epoch": 0.6367137355584083, "grad_norm": 0.2629285752773285, "learning_rate": 7.71552619841716e-06, "loss": 0.0889, "step": 310 }, { "epoch": 0.6572528883183568, "grad_norm": 0.24951164424419403, "learning_rate": 7.578722240655645e-06, "loss": 0.0914, "step": 320 }, { "epoch": 0.6777920410783055, "grad_norm": 0.264588862657547, "learning_rate": 7.43923572721156e-06, "loss": 0.0885, "step": 330 }, { "epoch": 0.6983311938382541, "grad_norm": 0.2667712867259979, "learning_rate": 7.297211761087229e-06, "loss": 0.0918, "step": 340 }, { "epoch": 0.7188703465982028, "grad_norm": 0.2499169409275055, "learning_rate": 7.152798084909365e-06, "loss": 0.0865, "step": 350 }, { "epoch": 0.7394094993581515, "grad_norm": 0.27517884969711304, "learning_rate": 7.006144927237505e-06, "loss": 0.0866, "step": 360 }, { "epoch": 0.7599486521181001, "grad_norm": 0.2483636885881424, "learning_rate": 6.857404846286444e-06, "loss": 0.0871, "step": 370 }, { "epoch": 0.7804878048780488, "grad_norm": 0.30694103240966797, "learning_rate": 6.706732571225183e-06, "loss": 0.0884, "step": 380 }, { "epoch": 0.8010269576379975, "grad_norm": 0.23266981542110443, "learning_rate": 6.554284841217519e-06, "loss": 0.085, "step": 390 }, { "epoch": 0.8215661103979461, "grad_norm": 0.27542293071746826, "learning_rate": 6.400220242371714e-06, "loss": 0.0861, "step": 400 }, { "epoch": 0.8421052631578947, "grad_norm": 0.2664916515350342, "learning_rate": 6.244699042768835e-06, "loss": 0.0852, "step": 410 }, { "epoch": 0.8626444159178434, "grad_norm": 0.24229811131954193, "learning_rate": 6.087883025741408e-06, "loss": 0.0838, "step": 420 }, { "epoch": 0.883183568677792, "grad_norm": 0.23243330419063568, "learning_rate": 5.929935321575806e-06, "loss": 0.0865, "step": 430 }, { "epoch": 0.9037227214377407, "grad_norm": 0.2365313470363617, "learning_rate": 5.771020237813448e-06, "loss": 0.0837, "step": 440 }, { "epoch": 0.9242618741976893, "grad_norm": 0.24980725347995758, "learning_rate": 5.611303088327331e-06, "loss": 0.0879, "step": 450 }, { "epoch": 0.944801026957638, "grad_norm": 0.2599492371082306, "learning_rate": 5.450950021351725e-06, "loss": 0.0876, "step": 460 }, { "epoch": 0.9653401797175867, "grad_norm": 0.27633577585220337, "learning_rate": 5.290127846643903e-06, "loss": 0.0851, "step": 470 }, { "epoch": 0.9858793324775353, "grad_norm": 0.25800392031669617, "learning_rate": 5.129003861957706e-06, "loss": 0.0827, "step": 480 }, { "epoch": 1.0061617458279846, "grad_norm": 0.22503100335597992, "learning_rate": 4.967745679009474e-06, "loss": 0.0749, "step": 490 }, { "epoch": 1.0267008985879333, "grad_norm": 0.23040170967578888, "learning_rate": 4.806521049117363e-06, "loss": 0.0626, "step": 500 }, { "epoch": 1.047240051347882, "grad_norm": 0.23996306955814362, "learning_rate": 4.645497688695444e-06, "loss": 0.0621, "step": 510 }, { "epoch": 1.0677792041078304, "grad_norm": 0.2363160401582718, "learning_rate": 4.484843104784106e-06, "loss": 0.0626, "step": 520 }, { "epoch": 1.0883183568677792, "grad_norm": 0.24278973042964935, "learning_rate": 4.32472442079828e-06, "loss": 0.0608, "step": 530 }, { "epoch": 1.1088575096277278, "grad_norm": 0.2526680827140808, "learning_rate": 4.165308202674704e-06, "loss": 0.0626, "step": 540 }, { "epoch": 1.1293966623876766, "grad_norm": 0.260307639837265, "learning_rate": 4.006760285599146e-06, "loss": 0.0594, "step": 550 }, { "epoch": 1.149935815147625, "grad_norm": 0.2719564139842987, "learning_rate": 3.849245601493784e-06, "loss": 0.0563, "step": 560 }, { "epoch": 1.1704749679075739, "grad_norm": 0.23317693173885345, "learning_rate": 3.69292800744422e-06, "loss": 0.0589, "step": 570 }, { "epoch": 1.1910141206675224, "grad_norm": 0.23016999661922455, "learning_rate": 3.5379701152446184e-06, "loss": 0.0602, "step": 580 }, { "epoch": 1.2115532734274712, "grad_norm": 0.21775227785110474, "learning_rate": 3.384533122238267e-06, "loss": 0.0572, "step": 590 }, { "epoch": 1.2320924261874198, "grad_norm": 0.247865229845047, "learning_rate": 3.2327766436295618e-06, "loss": 0.0601, "step": 600 }, { "epoch": 1.2526315789473683, "grad_norm": 0.23535579442977905, "learning_rate": 3.0828585464418013e-06, "loss": 0.0605, "step": 610 }, { "epoch": 1.273170731707317, "grad_norm": 0.2479717880487442, "learning_rate": 2.934934785293593e-06, "loss": 0.0597, "step": 620 }, { "epoch": 1.2937098844672656, "grad_norm": 0.24478904902935028, "learning_rate": 2.7891592401646406e-06, "loss": 0.058, "step": 630 }, { "epoch": 1.3142490372272144, "grad_norm": 0.24038828909397125, "learning_rate": 2.6456835563197194e-06, "loss": 0.0574, "step": 640 }, { "epoch": 1.334788189987163, "grad_norm": 0.24659165740013123, "learning_rate": 2.504656986557342e-06, "loss": 0.0591, "step": 650 }, { "epoch": 1.3553273427471118, "grad_norm": 0.23365946114063263, "learning_rate": 2.3662262359472326e-06, "loss": 0.0587, "step": 660 }, { "epoch": 1.3758664955070603, "grad_norm": 0.2665676772594452, "learning_rate": 2.2305353092181014e-06, "loss": 0.0592, "step": 670 }, { "epoch": 1.396405648267009, "grad_norm": 0.24800005555152893, "learning_rate": 2.09772536095451e-06, "loss": 0.0588, "step": 680 }, { "epoch": 1.4169448010269576, "grad_norm": 0.2330596148967743, "learning_rate": 1.967934548758621e-06, "loss": 0.0588, "step": 690 }, { "epoch": 1.4374839537869062, "grad_norm": 0.2489139586687088, "learning_rate": 1.8412978895296223e-06, "loss": 0.0555, "step": 700 }, { "epoch": 1.458023106546855, "grad_norm": 0.2403998076915741, "learning_rate": 1.7179471190103103e-06, "loss": 0.0568, "step": 710 }, { "epoch": 1.4785622593068035, "grad_norm": 0.23150111734867096, "learning_rate": 1.5980105547469443e-06, "loss": 0.0584, "step": 720 }, { "epoch": 1.4991014120667523, "grad_norm": 0.2429497092962265, "learning_rate": 1.4816129626049365e-06, "loss": 0.0578, "step": 730 }, { "epoch": 1.5196405648267008, "grad_norm": 0.23500065505504608, "learning_rate": 1.3688754269792297e-06, "loss": 0.0579, "step": 740 }, { "epoch": 1.5401797175866494, "grad_norm": 0.24009016156196594, "learning_rate": 1.2599152248343848e-06, "loss": 0.0567, "step": 750 }, { "epoch": 1.5607188703465982, "grad_norm": 0.2493337094783783, "learning_rate": 1.1548457037054089e-06, "loss": 0.0581, "step": 760 }, { "epoch": 1.581258023106547, "grad_norm": 0.25276979804039, "learning_rate": 1.0537761637862198e-06, "loss": 0.0563, "step": 770 }, { "epoch": 1.6017971758664955, "grad_norm": 0.23559069633483887, "learning_rate": 9.568117442284375e-07, "loss": 0.0556, "step": 780 }, { "epoch": 1.622336328626444, "grad_norm": 0.22181905806064606, "learning_rate": 8.640533137687523e-07, "loss": 0.0556, "step": 790 }, { "epoch": 1.6428754813863928, "grad_norm": 0.25861266255378723, "learning_rate": 7.755973657986648e-07, "loss": 0.0571, "step": 800 }, { "epoch": 1.6634146341463416, "grad_norm": 0.24212630093097687, "learning_rate": 6.91535917985739e-07, "loss": 0.0555, "step": 810 }, { "epoch": 1.6839537869062902, "grad_norm": 0.23966702818870544, "learning_rate": 6.119564165508057e-07, "loss": 0.0553, "step": 820 }, { "epoch": 1.7044929396662387, "grad_norm": 0.2524842619895935, "learning_rate": 5.369416453006764e-07, "loss": 0.0575, "step": 830 }, { "epoch": 1.7250320924261873, "grad_norm": 0.2377074807882309, "learning_rate": 4.665696395110081e-07, "loss": 0.054, "step": 840 }, { "epoch": 1.745571245186136, "grad_norm": 0.23431187868118286, "learning_rate": 4.009136047489026e-07, "loss": 0.0563, "step": 850 }, { "epoch": 1.7661103979460848, "grad_norm": 0.2580386698246002, "learning_rate": 3.400418407196826e-07, "loss": 0.054, "step": 860 }, { "epoch": 1.7866495507060334, "grad_norm": 0.23353375494480133, "learning_rate": 2.840176702170683e-07, "loss": 0.054, "step": 870 }, { "epoch": 1.807188703465982, "grad_norm": 0.24874167144298553, "learning_rate": 2.3289937325065902e-07, "loss": 0.0566, "step": 880 }, { "epoch": 1.8277278562259307, "grad_norm": 0.2422235757112503, "learning_rate": 1.8674012641925642e-07, "loss": 0.0543, "step": 890 }, { "epoch": 1.8482670089858795, "grad_norm": 0.24417363107204437, "learning_rate": 1.455879475930827e-07, "loss": 0.057, "step": 900 }, { "epoch": 1.868806161745828, "grad_norm": 0.23857787251472473, "learning_rate": 1.0948564596244937e-07, "loss": 0.0572, "step": 910 }, { "epoch": 1.8893453145057766, "grad_norm": 0.25707000494003296, "learning_rate": 7.847077750483489e-08, "loss": 0.0553, "step": 920 }, { "epoch": 1.9098844672657251, "grad_norm": 0.2240484207868576, "learning_rate": 5.257560591669764e-08, "loss": 0.0546, "step": 930 }, { "epoch": 1.930423620025674, "grad_norm": 0.22742733359336853, "learning_rate": 3.182706905067001e-08, "loss": 0.0571, "step": 940 }, { "epoch": 1.9509627727856227, "grad_norm": 0.24693480134010315, "learning_rate": 1.624675089303529e-08, "loss": 0.0559, "step": 950 }, { "epoch": 1.9715019255455712, "grad_norm": 0.22910530865192413, "learning_rate": 5.850859110658191e-09, "loss": 0.0572, "step": 960 }, { "epoch": 1.9920410783055198, "grad_norm": 0.24262285232543945, "learning_rate": 6.502081907039381e-10, "loss": 0.0544, "step": 970 }, { "epoch": 2.0, "step": 974, "total_flos": 1.635906903930726e+19, "train_loss": 0.07772063925954106, "train_runtime": 22087.4933, "train_samples_per_second": 11.286, "train_steps_per_second": 0.044 } ], "logging_steps": 10, "max_steps": 974, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 10000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.635906903930726e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }