{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2493, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00802447464767541, "grad_norm": 1.7389945502862767, "learning_rate": 3.8e-06, "loss": 0.4595, "step": 20 }, { "epoch": 0.01604894929535082, "grad_norm": 1.21721434066401, "learning_rate": 7.8e-06, "loss": 0.34, "step": 40 }, { "epoch": 0.02407342394302623, "grad_norm": 1.4115052448687924, "learning_rate": 1.18e-05, "loss": 0.3013, "step": 60 }, { "epoch": 0.03209789859070164, "grad_norm": 1.1854124218870574, "learning_rate": 1.58e-05, "loss": 0.2841, "step": 80 }, { "epoch": 0.04012237323837705, "grad_norm": 1.0821081494365905, "learning_rate": 1.9800000000000004e-05, "loss": 0.2679, "step": 100 }, { "epoch": 0.04814684788605246, "grad_norm": 1.1396877768521696, "learning_rate": 2.38e-05, "loss": 0.2983, "step": 120 }, { "epoch": 0.05617132253372787, "grad_norm": 0.8863724861851471, "learning_rate": 2.7800000000000005e-05, "loss": 0.2976, "step": 140 }, { "epoch": 0.06419579718140328, "grad_norm": 1.1286540636393274, "learning_rate": 3.18e-05, "loss": 0.3047, "step": 160 }, { "epoch": 0.07222027182907868, "grad_norm": 1.6326246678745442, "learning_rate": 3.58e-05, "loss": 0.314, "step": 180 }, { "epoch": 0.0802447464767541, "grad_norm": 1.012185268707376, "learning_rate": 3.9800000000000005e-05, "loss": 0.3085, "step": 200 }, { "epoch": 0.0882692211244295, "grad_norm": 0.8913802081695356, "learning_rate": 4.38e-05, "loss": 0.3193, "step": 220 }, { "epoch": 0.09629369577210492, "grad_norm": 0.9943286125063995, "learning_rate": 4.78e-05, "loss": 0.3397, "step": 240 }, { "epoch": 0.10431817041978032, "grad_norm": 1.9971168907457575, "learning_rate": 4.999801376569132e-05, "loss": 0.3527, "step": 260 }, { "epoch": 0.11234264506745574, "grad_norm": 0.8841449553244364, "learning_rate": 4.9979380055256516e-05, "loss": 0.3249, "step": 280 }, { "epoch": 0.12036711971513114, "grad_norm": 0.7868550214589912, "learning_rate": 4.99411464181739e-05, "loss": 0.342, "step": 300 }, { "epoch": 0.12839159436280656, "grad_norm": 0.8832077761817314, "learning_rate": 4.988334285424654e-05, "loss": 0.3471, "step": 320 }, { "epoch": 0.13641606901048198, "grad_norm": 0.83820196705535, "learning_rate": 4.980601471870785e-05, "loss": 0.346, "step": 340 }, { "epoch": 0.14444054365815737, "grad_norm": 0.9195620174146593, "learning_rate": 4.970922268663387e-05, "loss": 0.341, "step": 360 }, { "epoch": 0.15246501830583278, "grad_norm": 0.8652054536281212, "learning_rate": 4.9593042705334926e-05, "loss": 0.3415, "step": 380 }, { "epoch": 0.1604894929535082, "grad_norm": 0.7223602683001062, "learning_rate": 4.945756593476401e-05, "loss": 0.34, "step": 400 }, { "epoch": 0.16851396760118362, "grad_norm": 0.7592581825293532, "learning_rate": 4.9302898675988616e-05, "loss": 0.3454, "step": 420 }, { "epoch": 0.176538442248859, "grad_norm": 0.8985377633784642, "learning_rate": 4.912916228778228e-05, "loss": 0.354, "step": 440 }, { "epoch": 0.18456291689653442, "grad_norm": 0.8407832804783139, "learning_rate": 4.893649309140107e-05, "loss": 0.3381, "step": 460 }, { "epoch": 0.19258739154420984, "grad_norm": 0.7817366357087974, "learning_rate": 4.8725042263619896e-05, "loss": 0.3384, "step": 480 }, { "epoch": 0.20061186619188526, "grad_norm": 0.7998744498632891, "learning_rate": 4.849497571811257e-05, "loss": 0.3433, "step": 500 }, { "epoch": 0.20863634083956065, "grad_norm": 0.7366878614464625, "learning_rate": 4.824647397526854e-05, "loss": 0.332, "step": 520 }, { "epoch": 0.21666081548723606, "grad_norm": 0.9257245781477709, "learning_rate": 4.797973202054865e-05, "loss": 0.3578, "step": 540 }, { "epoch": 0.22468529013491148, "grad_norm": 0.7229221912787754, "learning_rate": 4.769495915149091e-05, "loss": 0.3318, "step": 560 }, { "epoch": 0.2327097647825869, "grad_norm": 0.7896861351942223, "learning_rate": 4.7392378813486374e-05, "loss": 0.3239, "step": 580 }, { "epoch": 0.2407342394302623, "grad_norm": 0.7057182651650622, "learning_rate": 4.707222842445401e-05, "loss": 0.3367, "step": 600 }, { "epoch": 0.2487587140779377, "grad_norm": 0.7646297450840412, "learning_rate": 4.6734759188552116e-05, "loss": 0.3345, "step": 620 }, { "epoch": 0.2567831887256131, "grad_norm": 0.8144984197146983, "learning_rate": 4.638023589907239e-05, "loss": 0.3264, "step": 640 }, { "epoch": 0.26480766337328854, "grad_norm": 0.799665735836617, "learning_rate": 4.6008936730671414e-05, "loss": 0.339, "step": 660 }, { "epoch": 0.27283213802096395, "grad_norm": 0.888997665044708, "learning_rate": 4.562115302110254e-05, "loss": 0.3413, "step": 680 }, { "epoch": 0.28085661266863937, "grad_norm": 0.8011920131999447, "learning_rate": 4.5217189042619345e-05, "loss": 0.3389, "step": 700 }, { "epoch": 0.28888108731631473, "grad_norm": 0.8067168553032359, "learning_rate": 4.4797361763230224e-05, "loss": 0.3185, "step": 720 }, { "epoch": 0.29690556196399015, "grad_norm": 0.6665228849255169, "learning_rate": 4.436200059799121e-05, "loss": 0.3232, "step": 740 }, { "epoch": 0.30493003661166557, "grad_norm": 0.6617968011184702, "learning_rate": 4.391144715053238e-05, "loss": 0.318, "step": 760 }, { "epoch": 0.312954511259341, "grad_norm": 0.7925748113649975, "learning_rate": 4.344605494502053e-05, "loss": 0.3438, "step": 780 }, { "epoch": 0.3209789859070164, "grad_norm": 0.7874288584383095, "learning_rate": 4.2966189148768474e-05, "loss": 0.3218, "step": 800 }, { "epoch": 0.3290034605546918, "grad_norm": 0.5964012085577759, "learning_rate": 4.247222628570862e-05, "loss": 0.325, "step": 820 }, { "epoch": 0.33702793520236723, "grad_norm": 0.7901232690113225, "learning_rate": 4.196455394095561e-05, "loss": 0.3259, "step": 840 }, { "epoch": 0.34505240985004265, "grad_norm": 0.6161692661704234, "learning_rate": 4.144357045668993e-05, "loss": 0.3282, "step": 860 }, { "epoch": 0.353076884497718, "grad_norm": 0.7071927191922839, "learning_rate": 4.0909684619601e-05, "loss": 0.3215, "step": 880 }, { "epoch": 0.36110135914539343, "grad_norm": 0.7540766905998272, "learning_rate": 4.036331534013502e-05, "loss": 0.3141, "step": 900 }, { "epoch": 0.36912583379306885, "grad_norm": 0.7828196756015018, "learning_rate": 3.980489132379937e-05, "loss": 0.3099, "step": 920 }, { "epoch": 0.37715030844074426, "grad_norm": 0.7665378450501007, "learning_rate": 3.923485073478123e-05, "loss": 0.3154, "step": 940 }, { "epoch": 0.3851747830884197, "grad_norm": 0.7064791446421703, "learning_rate": 3.8653640852144643e-05, "loss": 0.3192, "step": 960 }, { "epoch": 0.3931992577360951, "grad_norm": 0.6894942843454174, "learning_rate": 3.80617177188755e-05, "loss": 0.3274, "step": 980 }, { "epoch": 0.4012237323837705, "grad_norm": 0.6747469008968356, "learning_rate": 3.745954578405012e-05, "loss": 0.3216, "step": 1000 }, { "epoch": 0.40924820703144593, "grad_norm": 0.8112185519554851, "learning_rate": 3.684759753840789e-05, "loss": 0.3239, "step": 1020 }, { "epoch": 0.4172726816791213, "grad_norm": 0.6811337824140572, "learning_rate": 3.622635314361416e-05, "loss": 0.3125, "step": 1040 }, { "epoch": 0.4252971563267967, "grad_norm": 0.7484799717463894, "learning_rate": 3.559630005550416e-05, "loss": 0.3209, "step": 1060 }, { "epoch": 0.4333216309744721, "grad_norm": 0.6914997047255979, "learning_rate": 3.495793264160359e-05, "loss": 0.3061, "step": 1080 }, { "epoch": 0.44134610562214754, "grad_norm": 0.6177004112053062, "learning_rate": 3.4311751793225964e-05, "loss": 0.3081, "step": 1100 }, { "epoch": 0.44937058026982296, "grad_norm": 0.7656940899490844, "learning_rate": 3.365826453245115e-05, "loss": 0.3258, "step": 1120 }, { "epoch": 0.4573950549174984, "grad_norm": 0.7072757457639977, "learning_rate": 3.2997983614293404e-05, "loss": 0.3158, "step": 1140 }, { "epoch": 0.4654195295651738, "grad_norm": 0.7029721091978121, "learning_rate": 3.2331427124371115e-05, "loss": 0.3155, "step": 1160 }, { "epoch": 0.4734440042128492, "grad_norm": 0.742312775553165, "learning_rate": 3.1659118072393906e-05, "loss": 0.3151, "step": 1180 }, { "epoch": 0.4814684788605246, "grad_norm": 0.7586510237201778, "learning_rate": 3.098158398178606e-05, "loss": 0.2921, "step": 1200 }, { "epoch": 0.4894929535082, "grad_norm": 0.6302386060292237, "learning_rate": 3.0299356475768326e-05, "loss": 0.3042, "step": 1220 }, { "epoch": 0.4975174281558754, "grad_norm": 0.7177873646521117, "learning_rate": 2.9612970860222816e-05, "loss": 0.3074, "step": 1240 }, { "epoch": 0.5055419028035508, "grad_norm": 0.6442414947030823, "learning_rate": 2.8922965703668337e-05, "loss": 0.3047, "step": 1260 }, { "epoch": 0.5135663774512262, "grad_norm": 0.7076061109007938, "learning_rate": 2.8229882414675672e-05, "loss": 0.3033, "step": 1280 }, { "epoch": 0.5215908520989017, "grad_norm": 0.6578079486576341, "learning_rate": 2.75342648170545e-05, "loss": 0.3051, "step": 1300 }, { "epoch": 0.5296153267465771, "grad_norm": 0.7945155922763601, "learning_rate": 2.6836658723145175e-05, "loss": 0.2967, "step": 1320 }, { "epoch": 0.5376398013942525, "grad_norm": 0.6487682986757239, "learning_rate": 2.613761150555019e-05, "loss": 0.3044, "step": 1340 }, { "epoch": 0.5456642760419279, "grad_norm": 0.6958050009061036, "learning_rate": 2.5437671667641445e-05, "loss": 0.3061, "step": 1360 }, { "epoch": 0.5536887506896033, "grad_norm": 0.6266867011506455, "learning_rate": 2.4737388413180217e-05, "loss": 0.2897, "step": 1380 }, { "epoch": 0.5617132253372787, "grad_norm": 0.7124320790824831, "learning_rate": 2.403731121538762e-05, "loss": 0.2862, "step": 1400 }, { "epoch": 0.5697376999849542, "grad_norm": 0.7470494242121043, "learning_rate": 2.3337989385803567e-05, "loss": 0.3018, "step": 1420 }, { "epoch": 0.5777621746326295, "grad_norm": 0.6734120153458941, "learning_rate": 2.2639971643272688e-05, "loss": 0.2927, "step": 1440 }, { "epoch": 0.5857866492803049, "grad_norm": 0.6694340995198956, "learning_rate": 2.194380568339519e-05, "loss": 0.2912, "step": 1460 }, { "epoch": 0.5938111239279803, "grad_norm": 0.6416254443296999, "learning_rate": 2.1250037748780706e-05, "loss": 0.2833, "step": 1480 }, { "epoch": 0.6018355985756557, "grad_norm": 0.6576796287869531, "learning_rate": 2.0559212200442152e-05, "loss": 0.3005, "step": 1500 }, { "epoch": 0.6098600732233311, "grad_norm": 0.7534672667085268, "learning_rate": 1.9871871090666023e-05, "loss": 0.2902, "step": 1520 }, { "epoch": 0.6178845478710066, "grad_norm": 0.683888843023483, "learning_rate": 1.9188553737694142e-05, "loss": 0.2802, "step": 1540 }, { "epoch": 0.625909022518682, "grad_norm": 0.6505090735155789, "learning_rate": 1.8509796302550763e-05, "loss": 0.287, "step": 1560 }, { "epoch": 0.6339334971663574, "grad_norm": 0.7052076493927231, "learning_rate": 1.783613136834688e-05, "loss": 0.281, "step": 1580 }, { "epoch": 0.6419579718140328, "grad_norm": 0.6384510613508823, "learning_rate": 1.7168087522391958e-05, "loss": 0.2763, "step": 1600 }, { "epoch": 0.6499824464617082, "grad_norm": 0.6633378463426743, "learning_rate": 1.6506188941440958e-05, "loss": 0.285, "step": 1620 }, { "epoch": 0.6580069211093836, "grad_norm": 0.6982839509670195, "learning_rate": 1.585095498040205e-05, "loss": 0.2804, "step": 1640 }, { "epoch": 0.666031395757059, "grad_norm": 0.6270272887976673, "learning_rate": 1.5202899764827799e-05, "loss": 0.2785, "step": 1660 }, { "epoch": 0.6740558704047345, "grad_norm": 0.6172418320458167, "learning_rate": 1.4562531787509504e-05, "loss": 0.2734, "step": 1680 }, { "epoch": 0.6820803450524099, "grad_norm": 0.7230716801753746, "learning_rate": 1.3930353509491225e-05, "loss": 0.2921, "step": 1700 }, { "epoch": 0.6901048197000853, "grad_norm": 0.624829631926983, "learning_rate": 1.3306860965816687e-05, "loss": 0.2745, "step": 1720 }, { "epoch": 0.6981292943477607, "grad_norm": 0.6057171773833744, "learning_rate": 1.2692543376318172e-05, "loss": 0.2812, "step": 1740 }, { "epoch": 0.706153768995436, "grad_norm": 0.6600106987481957, "learning_rate": 1.2087882761753052e-05, "loss": 0.2762, "step": 1760 }, { "epoch": 0.7141782436431114, "grad_norm": 0.6513027790812009, "learning_rate": 1.1493353565588946e-05, "loss": 0.2824, "step": 1780 }, { "epoch": 0.7222027182907869, "grad_norm": 0.6464858105319229, "learning_rate": 1.0909422281734372e-05, "loss": 0.2772, "step": 1800 }, { "epoch": 0.7302271929384623, "grad_norm": 0.5987416675990789, "learning_rate": 1.033654708850704e-05, "loss": 0.2729, "step": 1820 }, { "epoch": 0.7382516675861377, "grad_norm": 0.6700627827898618, "learning_rate": 9.775177489126839e-06, "loss": 0.2662, "step": 1840 }, { "epoch": 0.7462761422338131, "grad_norm": 0.6252184614146273, "learning_rate": 9.225753959015726e-06, "loss": 0.2757, "step": 1860 }, { "epoch": 0.7543006168814885, "grad_norm": 0.6775143307950428, "learning_rate": 8.688707600181237e-06, "loss": 0.2714, "step": 1880 }, { "epoch": 0.762325091529164, "grad_norm": 0.6952554867252799, "learning_rate": 8.164459802954771e-06, "loss": 0.2782, "step": 1900 }, { "epoch": 0.7703495661768394, "grad_norm": 0.6783881222495053, "learning_rate": 7.653421915350096e-06, "loss": 0.2699, "step": 1920 }, { "epoch": 0.7783740408245148, "grad_norm": 0.6556587479775633, "learning_rate": 7.1559949203015005e-06, "loss": 0.2674, "step": 1940 }, { "epoch": 0.7863985154721902, "grad_norm": 0.7269086771080256, "learning_rate": 6.67256912103485e-06, "loss": 0.2696, "step": 1960 }, { "epoch": 0.7944229901198656, "grad_norm": 0.6782398065845676, "learning_rate": 6.203523834818395e-06, "loss": 0.2616, "step": 1980 }, { "epoch": 0.802447464767541, "grad_norm": 0.6401864638918741, "learning_rate": 5.749227095333684e-06, "loss": 0.2727, "step": 2000 }, { "epoch": 0.8104719394152164, "grad_norm": 0.6920549327363028, "learning_rate": 5.310035363900029e-06, "loss": 0.2613, "step": 2020 }, { "epoch": 0.8184964140628919, "grad_norm": 0.7544225062042402, "learning_rate": 4.886293249779203e-06, "loss": 0.2649, "step": 2040 }, { "epoch": 0.8265208887105673, "grad_norm": 0.7784252166653262, "learning_rate": 4.4783332397797725e-06, "loss": 0.2646, "step": 2060 }, { "epoch": 0.8345453633582426, "grad_norm": 0.7372167602245259, "learning_rate": 4.086475437373222e-06, "loss": 0.2663, "step": 2080 }, { "epoch": 0.842569838005918, "grad_norm": 0.5940476502560079, "learning_rate": 3.711027311526605e-06, "loss": 0.2474, "step": 2100 }, { "epoch": 0.8505943126535934, "grad_norm": 0.7426516397535643, "learning_rate": 3.3522834554488265e-06, "loss": 0.2667, "step": 2120 }, { "epoch": 0.8586187873012688, "grad_norm": 0.6474191378185361, "learning_rate": 3.010525355439739e-06, "loss": 0.2633, "step": 2140 }, { "epoch": 0.8666432619489443, "grad_norm": 0.7138316369428281, "learning_rate": 2.6860211700235616e-06, "loss": 0.2653, "step": 2160 }, { "epoch": 0.8746677365966197, "grad_norm": 0.7205767765435191, "learning_rate": 2.3790255195398293e-06, "loss": 0.269, "step": 2180 }, { "epoch": 0.8826922112442951, "grad_norm": 0.6866728998175627, "learning_rate": 2.0897792863570133e-06, "loss": 0.2781, "step": 2200 }, { "epoch": 0.8907166858919705, "grad_norm": 0.6386115100985029, "learning_rate": 1.818509425865561e-06, "loss": 0.248, "step": 2220 }, { "epoch": 0.8987411605396459, "grad_norm": 0.5816063089909334, "learning_rate": 1.5654287883986568e-06, "loss": 0.2495, "step": 2240 }, { "epoch": 0.9067656351873213, "grad_norm": 0.6044106984921237, "learning_rate": 1.3307359522204187e-06, "loss": 0.2546, "step": 2260 }, { "epoch": 0.9147901098349968, "grad_norm": 0.6487114272956552, "learning_rate": 1.1146150677126321e-06, "loss": 0.2493, "step": 2280 }, { "epoch": 0.9228145844826722, "grad_norm": 0.7113121988343898, "learning_rate": 9.172357128822001e-07, "loss": 0.2649, "step": 2300 }, { "epoch": 0.9308390591303476, "grad_norm": 0.684922616701092, "learning_rate": 7.387527603027383e-07, "loss": 0.2619, "step": 2320 }, { "epoch": 0.938863533778023, "grad_norm": 0.6512571317668248, "learning_rate": 5.793062555946999e-07, "loss": 0.2626, "step": 2340 }, { "epoch": 0.9468880084256984, "grad_norm": 0.9231979423940574, "learning_rate": 4.390213075393973e-07, "loss": 0.268, "step": 2360 }, { "epoch": 0.9549124830733738, "grad_norm": 0.5874482600389693, "learning_rate": 3.1800798991309944e-07, "loss": 0.2627, "step": 2380 }, { "epoch": 0.9629369577210491, "grad_norm": 0.827108940722453, "learning_rate": 2.163612551182942e-07, "loss": 0.2586, "step": 2400 }, { "epoch": 0.9709614323687246, "grad_norm": 0.5907349717505673, "learning_rate": 1.3416085967982994e-07, "loss": 0.2642, "step": 2420 }, { "epoch": 0.9789859070164, "grad_norm": 0.7334026096749088, "learning_rate": 7.14713016644053e-08, "loss": 0.2649, "step": 2440 }, { "epoch": 0.9870103816640754, "grad_norm": 0.523367756897521, "learning_rate": 2.8341770072548567e-08, "loss": 0.2682, "step": 2460 }, { "epoch": 0.9950348563117508, "grad_norm": 0.6000740449055824, "learning_rate": 4.806106242741249e-09, "loss": 0.2627, "step": 2480 } ], "logging_steps": 20, "max_steps": 2493, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 41482954211328.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }