{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0035124532433173, "eval_steps": 500, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012544475868990056, "grad_norm": 0.13098102807998657, "learning_rate": 4.6226415094339625e-05, "loss": 0.9457, "step": 50 }, { "epoch": 0.025088951737980113, "grad_norm": 0.09060654789209366, "learning_rate": 4.980675516719879e-05, "loss": 0.3123, "step": 100 }, { "epoch": 0.037633427606970166, "grad_norm": 0.1386842280626297, "learning_rate": 4.959670643589313e-05, "loss": 0.2832, "step": 150 }, { "epoch": 0.050177903475960225, "grad_norm": 0.18612022697925568, "learning_rate": 4.9386657704587466e-05, "loss": 0.2708, "step": 200 }, { "epoch": 0.06272237934495027, "grad_norm": 0.17801949381828308, "learning_rate": 4.91766089732818e-05, "loss": 0.2662, "step": 250 }, { "epoch": 0.07526685521394033, "grad_norm": 0.10166127979755402, "learning_rate": 4.896656024197614e-05, "loss": 0.2599, "step": 300 }, { "epoch": 0.08781133108293039, "grad_norm": 0.21376383304595947, "learning_rate": 4.8756511510670475e-05, "loss": 0.2549, "step": 350 }, { "epoch": 0.10035580695192045, "grad_norm": 0.18035802245140076, "learning_rate": 4.8546462779364816e-05, "loss": 0.2515, "step": 400 }, { "epoch": 0.1129002828209105, "grad_norm": 0.17301003634929657, "learning_rate": 4.833641404805915e-05, "loss": 0.2492, "step": 450 }, { "epoch": 0.12544475868990054, "grad_norm": 0.15648286044597626, "learning_rate": 4.812636531675349e-05, "loss": 0.2476, "step": 500 }, { "epoch": 0.12544475868990054, "eval_loss": 0.40483859181404114, "eval_runtime": 304.1505, "eval_samples_per_second": 65.034, "eval_steps_per_second": 1.019, "step": 500 }, { "epoch": 0.13798923455889062, "grad_norm": 0.16616199910640717, "learning_rate": 4.7916316585447826e-05, "loss": 0.245, "step": 550 }, { "epoch": 0.15053371042788066, "grad_norm": 0.6291008591651917, "learning_rate": 4.770626785414216e-05, "loss": 0.2434, "step": 600 }, { "epoch": 0.16307818629687074, "grad_norm": 0.27643585205078125, "learning_rate": 4.74962191228365e-05, "loss": 0.2414, "step": 650 }, { "epoch": 0.17562266216586078, "grad_norm": 0.27261775732040405, "learning_rate": 4.7286170391530835e-05, "loss": 0.2407, "step": 700 }, { "epoch": 0.18816713803485083, "grad_norm": 0.1961769014596939, "learning_rate": 4.7076121660225176e-05, "loss": 0.2389, "step": 750 }, { "epoch": 0.2007116139038409, "grad_norm": 0.13879896700382233, "learning_rate": 4.686607292891951e-05, "loss": 0.237, "step": 800 }, { "epoch": 0.21325608977283095, "grad_norm": 0.20926761627197266, "learning_rate": 4.665602419761385e-05, "loss": 0.2372, "step": 850 }, { "epoch": 0.225800565641821, "grad_norm": 0.22533361613750458, "learning_rate": 4.6445975466308186e-05, "loss": 0.2351, "step": 900 }, { "epoch": 0.23834504151081107, "grad_norm": 0.1977718323469162, "learning_rate": 4.623592673500252e-05, "loss": 0.2348, "step": 950 }, { "epoch": 0.2508895173798011, "grad_norm": 0.26971426606178284, "learning_rate": 4.602587800369686e-05, "loss": 0.2334, "step": 1000 }, { "epoch": 0.2508895173798011, "eval_loss": 0.4072725474834442, "eval_runtime": 304.8492, "eval_samples_per_second": 64.885, "eval_steps_per_second": 1.017, "step": 1000 }, { "epoch": 0.2634339932487912, "grad_norm": 0.2769719958305359, "learning_rate": 4.5815829272391195e-05, "loss": 0.2334, "step": 1050 }, { "epoch": 0.27597846911778123, "grad_norm": 0.28794065117836, "learning_rate": 4.5605780541085536e-05, "loss": 0.2318, "step": 1100 }, { "epoch": 0.2885229449867713, "grad_norm": 0.37538444995880127, "learning_rate": 4.539573180977987e-05, "loss": 0.231, "step": 1150 }, { "epoch": 0.3010674208557613, "grad_norm": 0.17134840786457062, "learning_rate": 4.5185683078474204e-05, "loss": 0.231, "step": 1200 }, { "epoch": 0.31361189672475137, "grad_norm": 0.28663370013237, "learning_rate": 4.4975634347168545e-05, "loss": 0.2294, "step": 1250 }, { "epoch": 0.3261563725937415, "grad_norm": 0.7121312022209167, "learning_rate": 4.476558561586288e-05, "loss": 0.2297, "step": 1300 }, { "epoch": 0.3387008484627315, "grad_norm": 0.2550923526287079, "learning_rate": 4.455553688455722e-05, "loss": 0.2283, "step": 1350 }, { "epoch": 0.35124532433172156, "grad_norm": 1.0167971849441528, "learning_rate": 4.4345488153251555e-05, "loss": 0.2263, "step": 1400 }, { "epoch": 0.3637898002007116, "grad_norm": 0.6048764586448669, "learning_rate": 4.4135439421945896e-05, "loss": 0.2277, "step": 1450 }, { "epoch": 0.37633427606970166, "grad_norm": 0.31545552611351013, "learning_rate": 4.392539069064023e-05, "loss": 0.2269, "step": 1500 }, { "epoch": 0.37633427606970166, "eval_loss": 0.4088518023490906, "eval_runtime": 305.124, "eval_samples_per_second": 64.826, "eval_steps_per_second": 1.016, "step": 1500 }, { "epoch": 0.3888787519386917, "grad_norm": 0.37552791833877563, "learning_rate": 4.3715341959334564e-05, "loss": 0.2246, "step": 1550 }, { "epoch": 0.4014232278076818, "grad_norm": 0.2993505299091339, "learning_rate": 4.3505293228028905e-05, "loss": 0.2261, "step": 1600 }, { "epoch": 0.41396770367667185, "grad_norm": 0.15790335834026337, "learning_rate": 4.329524449672324e-05, "loss": 0.2251, "step": 1650 }, { "epoch": 0.4265121795456619, "grad_norm": 0.47013625502586365, "learning_rate": 4.308519576541758e-05, "loss": 0.2243, "step": 1700 }, { "epoch": 0.43905665541465194, "grad_norm": 0.2053990662097931, "learning_rate": 4.2875147034111915e-05, "loss": 0.2237, "step": 1750 }, { "epoch": 0.451601131283642, "grad_norm": 0.17550259828567505, "learning_rate": 4.2665098302806256e-05, "loss": 0.2228, "step": 1800 }, { "epoch": 0.46414560715263203, "grad_norm": 0.5729805827140808, "learning_rate": 4.245504957150059e-05, "loss": 0.2228, "step": 1850 }, { "epoch": 0.47669008302162214, "grad_norm": 0.3008301854133606, "learning_rate": 4.2245000840194924e-05, "loss": 0.2217, "step": 1900 }, { "epoch": 0.4892345588906122, "grad_norm": 0.2061658799648285, "learning_rate": 4.2034952108889265e-05, "loss": 0.223, "step": 1950 }, { "epoch": 0.5017790347596022, "grad_norm": 0.2295321226119995, "learning_rate": 4.18249033775836e-05, "loss": 0.2219, "step": 2000 }, { "epoch": 0.5017790347596022, "eval_loss": 0.4091717004776001, "eval_runtime": 304.785, "eval_samples_per_second": 64.898, "eval_steps_per_second": 1.017, "step": 2000 }, { "epoch": 0.5143235106285923, "grad_norm": 0.22435450553894043, "learning_rate": 4.161485464627794e-05, "loss": 0.2215, "step": 2050 }, { "epoch": 0.5268679864975824, "grad_norm": 0.185350701212883, "learning_rate": 4.1404805914972275e-05, "loss": 0.2207, "step": 2100 }, { "epoch": 0.5394124623665724, "grad_norm": 0.46742141246795654, "learning_rate": 4.119475718366661e-05, "loss": 0.2197, "step": 2150 }, { "epoch": 0.5519569382355625, "grad_norm": 0.20891498029232025, "learning_rate": 4.098470845236095e-05, "loss": 0.2194, "step": 2200 }, { "epoch": 0.5645014141045525, "grad_norm": 0.4283987581729889, "learning_rate": 4.0774659721055284e-05, "loss": 0.2192, "step": 2250 }, { "epoch": 0.5770458899735426, "grad_norm": 0.32103636860847473, "learning_rate": 4.0564610989749625e-05, "loss": 0.2185, "step": 2300 }, { "epoch": 0.5895903658425327, "grad_norm": 0.20490871369838715, "learning_rate": 4.035456225844396e-05, "loss": 0.2183, "step": 2350 }, { "epoch": 0.6021348417115227, "grad_norm": 0.3914024233818054, "learning_rate": 4.01445135271383e-05, "loss": 0.2184, "step": 2400 }, { "epoch": 0.6146793175805128, "grad_norm": 0.18293343484401703, "learning_rate": 3.9934464795832635e-05, "loss": 0.2186, "step": 2450 }, { "epoch": 0.6272237934495027, "grad_norm": 0.20402023196220398, "learning_rate": 3.972441606452697e-05, "loss": 0.2179, "step": 2500 }, { "epoch": 0.6272237934495027, "eval_loss": 0.40875929594039917, "eval_runtime": 304.5578, "eval_samples_per_second": 64.947, "eval_steps_per_second": 1.018, "step": 2500 }, { "epoch": 0.6397682693184928, "grad_norm": 0.48965466022491455, "learning_rate": 3.951436733322131e-05, "loss": 0.2168, "step": 2550 }, { "epoch": 0.652312745187483, "grad_norm": 0.5581162571907043, "learning_rate": 3.9304318601915644e-05, "loss": 0.2175, "step": 2600 }, { "epoch": 0.6648572210564729, "grad_norm": 0.23750029504299164, "learning_rate": 3.9094269870609985e-05, "loss": 0.217, "step": 2650 }, { "epoch": 0.677401696925463, "grad_norm": 0.5061260461807251, "learning_rate": 3.888422113930432e-05, "loss": 0.2151, "step": 2700 }, { "epoch": 0.689946172794453, "grad_norm": 0.1854904741048813, "learning_rate": 3.867417240799866e-05, "loss": 0.216, "step": 2750 }, { "epoch": 0.7024906486634431, "grad_norm": 0.22555580735206604, "learning_rate": 3.8464123676692995e-05, "loss": 0.2157, "step": 2800 }, { "epoch": 0.7150351245324331, "grad_norm": 0.4870660901069641, "learning_rate": 3.825407494538733e-05, "loss": 0.2151, "step": 2850 }, { "epoch": 0.7275796004014232, "grad_norm": 0.37115806341171265, "learning_rate": 3.804402621408167e-05, "loss": 0.2146, "step": 2900 }, { "epoch": 0.7401240762704133, "grad_norm": 0.34767332673072815, "learning_rate": 3.7833977482776004e-05, "loss": 0.2139, "step": 2950 }, { "epoch": 0.7526685521394033, "grad_norm": 0.2617909610271454, "learning_rate": 3.7623928751470345e-05, "loss": 0.2149, "step": 3000 }, { "epoch": 0.7526685521394033, "eval_loss": 0.40570953488349915, "eval_runtime": 304.7965, "eval_samples_per_second": 64.896, "eval_steps_per_second": 1.017, "step": 3000 }, { "epoch": 0.7652130280083934, "grad_norm": 0.6052380204200745, "learning_rate": 3.741388002016468e-05, "loss": 0.2141, "step": 3050 }, { "epoch": 0.7777575038773834, "grad_norm": 0.3745960295200348, "learning_rate": 3.7203831288859014e-05, "loss": 0.213, "step": 3100 }, { "epoch": 0.7903019797463735, "grad_norm": 0.24974456429481506, "learning_rate": 3.6993782557553355e-05, "loss": 0.2142, "step": 3150 }, { "epoch": 0.8028464556153636, "grad_norm": 0.4550504684448242, "learning_rate": 3.678373382624769e-05, "loss": 0.2133, "step": 3200 }, { "epoch": 0.8153909314843536, "grad_norm": 0.8576037287712097, "learning_rate": 3.657368509494203e-05, "loss": 0.2121, "step": 3250 }, { "epoch": 0.8279354073533437, "grad_norm": 0.4864007532596588, "learning_rate": 3.6363636363636364e-05, "loss": 0.2131, "step": 3300 }, { "epoch": 0.8404798832223337, "grad_norm": 0.6007568836212158, "learning_rate": 3.6153587632330705e-05, "loss": 0.2134, "step": 3350 }, { "epoch": 0.8530243590913238, "grad_norm": 0.2667822241783142, "learning_rate": 3.594353890102504e-05, "loss": 0.2123, "step": 3400 }, { "epoch": 0.8655688349603139, "grad_norm": 0.16192808747291565, "learning_rate": 3.5733490169719374e-05, "loss": 0.2102, "step": 3450 }, { "epoch": 0.8781133108293039, "grad_norm": 0.4632836580276489, "learning_rate": 3.5523441438413715e-05, "loss": 0.2127, "step": 3500 }, { "epoch": 0.8781133108293039, "eval_loss": 0.40804293751716614, "eval_runtime": 305.2396, "eval_samples_per_second": 64.802, "eval_steps_per_second": 1.016, "step": 3500 }, { "epoch": 0.890657786698294, "grad_norm": 0.1812131106853485, "learning_rate": 3.531339270710805e-05, "loss": 0.2124, "step": 3550 }, { "epoch": 0.903202262567284, "grad_norm": 0.29924267530441284, "learning_rate": 3.510334397580239e-05, "loss": 0.2116, "step": 3600 }, { "epoch": 0.9157467384362741, "grad_norm": 0.30432143807411194, "learning_rate": 3.4893295244496724e-05, "loss": 0.2096, "step": 3650 }, { "epoch": 0.9282912143052641, "grad_norm": 0.17945361137390137, "learning_rate": 3.4683246513191065e-05, "loss": 0.211, "step": 3700 }, { "epoch": 0.9408356901742542, "grad_norm": 0.2670902907848358, "learning_rate": 3.44731977818854e-05, "loss": 0.2118, "step": 3750 }, { "epoch": 0.9533801660432443, "grad_norm": 0.350669801235199, "learning_rate": 3.4263149050579734e-05, "loss": 0.2094, "step": 3800 }, { "epoch": 0.9659246419122343, "grad_norm": 0.3146061599254608, "learning_rate": 3.4053100319274075e-05, "loss": 0.2092, "step": 3850 }, { "epoch": 0.9784691177812244, "grad_norm": 0.16551902890205383, "learning_rate": 3.384305158796841e-05, "loss": 0.2103, "step": 3900 }, { "epoch": 0.9910135936502144, "grad_norm": 0.34425023198127747, "learning_rate": 3.363300285666275e-05, "loss": 0.2095, "step": 3950 }, { "epoch": 1.0035124532433173, "grad_norm": 0.44446665048599243, "learning_rate": 3.3422954125357084e-05, "loss": 0.2098, "step": 4000 }, { "epoch": 1.0035124532433173, "eval_loss": 0.40903258323669434, "eval_runtime": 305.036, "eval_samples_per_second": 64.845, "eval_steps_per_second": 1.016, "step": 4000 } ], "logging_steps": 50, "max_steps": 11955, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.237451667319271e+20, "train_batch_size": 128, "trial_name": null, "trial_params": null }