{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 1000, "global_step": 19048, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02624947501049979, "grad_norm": 2.3986732959747314, "learning_rate": 4.868752624947501e-05, "loss": 4.3975, "step": 500 }, { "epoch": 0.05249895002099958, "grad_norm": 2.5253329277038574, "learning_rate": 4.7375052498950025e-05, "loss": 2.9127, "step": 1000 }, { "epoch": 0.05249895002099958, "eval_accuracy": 0.44588644304866, "eval_loss": 2.4536643028259277, "eval_runtime": 52.5922, "eval_samples_per_second": 116.5, "eval_steps_per_second": 3.651, "step": 1000 }, { "epoch": 0.07874842503149937, "grad_norm": 1.6817123889923096, "learning_rate": 4.606257874842503e-05, "loss": 2.2923, "step": 1500 }, { "epoch": 0.10499790004199916, "grad_norm": 2.106494665145874, "learning_rate": 4.475010499790005e-05, "loss": 1.9813, "step": 2000 }, { "epoch": 0.10499790004199916, "eval_accuracy": 0.5655891323454779, "eval_loss": 1.8068230152130127, "eval_runtime": 53.0303, "eval_samples_per_second": 115.538, "eval_steps_per_second": 3.621, "step": 2000 }, { "epoch": 0.13124737505249895, "grad_norm": 1.5576270818710327, "learning_rate": 4.3437631247375055e-05, "loss": 1.8298, "step": 2500 }, { "epoch": 0.15749685006299874, "grad_norm": 1.4892126321792603, "learning_rate": 4.212515749685006e-05, "loss": 1.7348, "step": 3000 }, { "epoch": 0.15749685006299874, "eval_accuracy": 0.5999509247165049, "eval_loss": 1.615663766860962, "eval_runtime": 52.9302, "eval_samples_per_second": 115.756, "eval_steps_per_second": 3.627, "step": 3000 }, { "epoch": 0.18374632507349853, "grad_norm": 1.2966831922531128, "learning_rate": 4.081268374632508e-05, "loss": 1.663, "step": 3500 }, { "epoch": 0.20999580008399832, "grad_norm": 1.2723957300186157, "learning_rate": 3.9500209995800084e-05, "loss": 1.6121, "step": 4000 }, { "epoch": 0.20999580008399832, "eval_accuracy": 0.6179099896121856, "eval_loss": 1.5170389413833618, "eval_runtime": 53.8279, "eval_samples_per_second": 113.826, "eval_steps_per_second": 3.567, "step": 4000 }, { "epoch": 0.2362452750944981, "grad_norm": 1.2891942262649536, "learning_rate": 3.81877362452751e-05, "loss": 1.5731, "step": 4500 }, { "epoch": 0.2624947501049979, "grad_norm": 1.2763597965240479, "learning_rate": 3.687526249475011e-05, "loss": 1.539, "step": 5000 }, { "epoch": 0.2624947501049979, "eval_accuracy": 0.6299712456490757, "eval_loss": 1.45320725440979, "eval_runtime": 53.4488, "eval_samples_per_second": 114.633, "eval_steps_per_second": 3.592, "step": 5000 }, { "epoch": 0.2887442251154977, "grad_norm": 1.19612717628479, "learning_rate": 3.5562788744225114e-05, "loss": 1.5094, "step": 5500 }, { "epoch": 0.3149937001259975, "grad_norm": 1.3306570053100586, "learning_rate": 3.425031499370013e-05, "loss": 1.4852, "step": 6000 }, { "epoch": 0.3149937001259975, "eval_accuracy": 0.6385916478526134, "eval_loss": 1.4079374074935913, "eval_runtime": 53.6371, "eval_samples_per_second": 114.231, "eval_steps_per_second": 3.58, "step": 6000 }, { "epoch": 0.34124317513649727, "grad_norm": 1.191298484802246, "learning_rate": 3.2937841243175137e-05, "loss": 1.468, "step": 6500 }, { "epoch": 0.36749265014699706, "grad_norm": 1.1890417337417603, "learning_rate": 3.162536749265015e-05, "loss": 1.4478, "step": 7000 }, { "epoch": 0.36749265014699706, "eval_accuracy": 0.6450839121935328, "eval_loss": 1.3743374347686768, "eval_runtime": 53.9027, "eval_samples_per_second": 113.668, "eval_steps_per_second": 3.562, "step": 7000 }, { "epoch": 0.39374212515749685, "grad_norm": 1.1638110876083374, "learning_rate": 3.031289374212516e-05, "loss": 1.4309, "step": 7500 }, { "epoch": 0.41999160016799664, "grad_norm": 1.2325953245162964, "learning_rate": 2.900041999160017e-05, "loss": 1.4187, "step": 8000 }, { "epoch": 0.41999160016799664, "eval_accuracy": 0.6506158900215877, "eval_loss": 1.3458136320114136, "eval_runtime": 53.3568, "eval_samples_per_second": 114.831, "eval_steps_per_second": 3.598, "step": 8000 }, { "epoch": 0.4462410751784964, "grad_norm": 1.1118295192718506, "learning_rate": 2.768794624107518e-05, "loss": 1.403, "step": 8500 }, { "epoch": 0.4724905501889962, "grad_norm": 1.198486328125, "learning_rate": 2.6375472490550192e-05, "loss": 1.3891, "step": 9000 }, { "epoch": 0.4724905501889962, "eval_accuracy": 0.65459121134424, "eval_loss": 1.3246122598648071, "eval_runtime": 53.4313, "eval_samples_per_second": 114.671, "eval_steps_per_second": 3.593, "step": 9000 }, { "epoch": 0.498740025199496, "grad_norm": 1.0942541360855103, "learning_rate": 2.50629987400252e-05, "loss": 1.3793, "step": 9500 }, { "epoch": 0.5249895002099958, "grad_norm": 1.1538808345794678, "learning_rate": 2.375052498950021e-05, "loss": 1.3687, "step": 10000 }, { "epoch": 0.5249895002099958, "eval_accuracy": 0.6589060391794983, "eval_loss": 1.3028844594955444, "eval_runtime": 53.8932, "eval_samples_per_second": 113.688, "eval_steps_per_second": 3.563, "step": 10000 }, { "epoch": 0.5512389752204956, "grad_norm": 1.0931830406188965, "learning_rate": 2.2438051238975222e-05, "loss": 1.3626, "step": 10500 }, { "epoch": 0.5774884502309954, "grad_norm": 1.116744041442871, "learning_rate": 2.1125577488450233e-05, "loss": 1.3523, "step": 11000 }, { "epoch": 0.5774884502309954, "eval_accuracy": 0.6622822782865323, "eval_loss": 1.2859536409378052, "eval_runtime": 53.9756, "eval_samples_per_second": 113.514, "eval_steps_per_second": 3.557, "step": 11000 }, { "epoch": 0.6037379252414952, "grad_norm": 1.0872825384140015, "learning_rate": 1.9813103737925244e-05, "loss": 1.3427, "step": 11500 }, { "epoch": 0.629987400251995, "grad_norm": 1.135183334350586, "learning_rate": 1.8500629987400252e-05, "loss": 1.3367, "step": 12000 }, { "epoch": 0.629987400251995, "eval_accuracy": 0.6648805879972004, "eval_loss": 1.2726922035217285, "eval_runtime": 53.5196, "eval_samples_per_second": 114.482, "eval_steps_per_second": 3.587, "step": 12000 }, { "epoch": 0.6562368752624947, "grad_norm": 1.1367709636688232, "learning_rate": 1.7188156236875263e-05, "loss": 1.3302, "step": 12500 }, { "epoch": 0.6824863502729945, "grad_norm": 1.1728744506835938, "learning_rate": 1.5875682486350274e-05, "loss": 1.3201, "step": 13000 }, { "epoch": 0.6824863502729945, "eval_accuracy": 0.6678633952150961, "eval_loss": 1.2587724924087524, "eval_runtime": 52.6916, "eval_samples_per_second": 116.28, "eval_steps_per_second": 3.644, "step": 13000 }, { "epoch": 0.7087358252834943, "grad_norm": 1.1379096508026123, "learning_rate": 1.4563208735825285e-05, "loss": 1.3131, "step": 13500 }, { "epoch": 0.7349853002939941, "grad_norm": 1.1188551187515259, "learning_rate": 1.3250734985300295e-05, "loss": 1.3107, "step": 14000 }, { "epoch": 0.7349853002939941, "eval_accuracy": 0.6699808437279283, "eval_loss": 1.2476825714111328, "eval_runtime": 54.1602, "eval_samples_per_second": 113.127, "eval_steps_per_second": 3.545, "step": 14000 }, { "epoch": 0.7612347753044939, "grad_norm": 1.1443243026733398, "learning_rate": 1.1938261234775306e-05, "loss": 1.3028, "step": 14500 }, { "epoch": 0.7874842503149937, "grad_norm": 1.0995649099349976, "learning_rate": 1.0625787484250315e-05, "loss": 1.2997, "step": 15000 }, { "epoch": 0.7874842503149937, "eval_accuracy": 0.6720863265507016, "eval_loss": 1.2376983165740967, "eval_runtime": 53.9473, "eval_samples_per_second": 113.574, "eval_steps_per_second": 3.559, "step": 15000 }, { "epoch": 0.8137337253254935, "grad_norm": 1.107537865638733, "learning_rate": 9.313313733725326e-06, "loss": 1.2937, "step": 15500 }, { "epoch": 0.8399832003359933, "grad_norm": 1.1298577785491943, "learning_rate": 8.000839983200337e-06, "loss": 1.2894, "step": 16000 }, { "epoch": 0.8399832003359933, "eval_accuracy": 0.673507690987171, "eval_loss": 1.230576515197754, "eval_runtime": 53.7584, "eval_samples_per_second": 113.973, "eval_steps_per_second": 3.572, "step": 16000 }, { "epoch": 0.8662326753464931, "grad_norm": 1.1056915521621704, "learning_rate": 6.6883662326753475e-06, "loss": 1.286, "step": 16500 }, { "epoch": 0.8924821503569929, "grad_norm": 1.1191200017929077, "learning_rate": 5.375892482150358e-06, "loss": 1.2849, "step": 17000 }, { "epoch": 0.8924821503569929, "eval_accuracy": 0.6750201542106227, "eval_loss": 1.2236781120300293, "eval_runtime": 53.3659, "eval_samples_per_second": 114.811, "eval_steps_per_second": 3.598, "step": 17000 }, { "epoch": 0.9187316253674926, "grad_norm": 1.1233047246932983, "learning_rate": 4.063418731625368e-06, "loss": 1.2786, "step": 17500 }, { "epoch": 0.9449811003779924, "grad_norm": 1.1161458492279053, "learning_rate": 2.7509449811003783e-06, "loss": 1.2738, "step": 18000 }, { "epoch": 0.9449811003779924, "eval_accuracy": 0.6761171686752274, "eval_loss": 1.2184966802597046, "eval_runtime": 53.3087, "eval_samples_per_second": 114.934, "eval_steps_per_second": 3.602, "step": 18000 }, { "epoch": 0.9712305753884922, "grad_norm": 1.147400975227356, "learning_rate": 1.4384712305753885e-06, "loss": 1.2733, "step": 18500 }, { "epoch": 0.997480050398992, "grad_norm": 1.1077104806900024, "learning_rate": 1.25997480050399e-07, "loss": 1.2725, "step": 19000 }, { "epoch": 0.997480050398992, "eval_accuracy": 0.6769313142268385, "eval_loss": 1.2148301601409912, "eval_runtime": 52.7052, "eval_samples_per_second": 116.25, "eval_steps_per_second": 3.643, "step": 19000 }, { "epoch": 1.0, "step": 19048, "total_flos": 3.18526483857408e+17, "train_loss": 1.56135786425812, "train_runtime": 7465.8111, "train_samples_per_second": 81.642, "train_steps_per_second": 2.551 } ], "logging_steps": 500, "max_steps": 19048, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.18526483857408e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }