{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 1000,
  "global_step": 19048,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02624947501049979,
      "grad_norm": 2.3986732959747314,
      "learning_rate": 4.868752624947501e-05,
      "loss": 4.3975,
      "step": 500
    },
    {
      "epoch": 0.05249895002099958,
      "grad_norm": 2.5253329277038574,
      "learning_rate": 4.7375052498950025e-05,
      "loss": 2.9127,
      "step": 1000
    },
    {
      "epoch": 0.05249895002099958,
      "eval_accuracy": 0.44588644304866,
      "eval_loss": 2.4536643028259277,
      "eval_runtime": 52.5922,
      "eval_samples_per_second": 116.5,
      "eval_steps_per_second": 3.651,
      "step": 1000
    },
    {
      "epoch": 0.07874842503149937,
      "grad_norm": 1.6817123889923096,
      "learning_rate": 4.606257874842503e-05,
      "loss": 2.2923,
      "step": 1500
    },
    {
      "epoch": 0.10499790004199916,
      "grad_norm": 2.106494665145874,
      "learning_rate": 4.475010499790005e-05,
      "loss": 1.9813,
      "step": 2000
    },
    {
      "epoch": 0.10499790004199916,
      "eval_accuracy": 0.5655891323454779,
      "eval_loss": 1.8068230152130127,
      "eval_runtime": 53.0303,
      "eval_samples_per_second": 115.538,
      "eval_steps_per_second": 3.621,
      "step": 2000
    },
    {
      "epoch": 0.13124737505249895,
      "grad_norm": 1.5576270818710327,
      "learning_rate": 4.3437631247375055e-05,
      "loss": 1.8298,
      "step": 2500
    },
    {
      "epoch": 0.15749685006299874,
      "grad_norm": 1.4892126321792603,
      "learning_rate": 4.212515749685006e-05,
      "loss": 1.7348,
      "step": 3000
    },
    {
      "epoch": 0.15749685006299874,
      "eval_accuracy": 0.5999509247165049,
      "eval_loss": 1.615663766860962,
      "eval_runtime": 52.9302,
      "eval_samples_per_second": 115.756,
      "eval_steps_per_second": 3.627,
      "step": 3000
    },
    {
      "epoch": 0.18374632507349853,
      "grad_norm": 1.2966831922531128,
      "learning_rate": 4.081268374632508e-05,
      "loss": 1.663,
      "step": 3500
    },
    {
      "epoch": 0.20999580008399832,
      "grad_norm": 1.2723957300186157,
      "learning_rate": 3.9500209995800084e-05,
      "loss": 1.6121,
      "step": 4000
    },
    {
      "epoch": 0.20999580008399832,
      "eval_accuracy": 0.6179099896121856,
      "eval_loss": 1.5170389413833618,
      "eval_runtime": 53.8279,
      "eval_samples_per_second": 113.826,
      "eval_steps_per_second": 3.567,
      "step": 4000
    },
    {
      "epoch": 0.2362452750944981,
      "grad_norm": 1.2891942262649536,
      "learning_rate": 3.81877362452751e-05,
      "loss": 1.5731,
      "step": 4500
    },
    {
      "epoch": 0.2624947501049979,
      "grad_norm": 1.2763597965240479,
      "learning_rate": 3.687526249475011e-05,
      "loss": 1.539,
      "step": 5000
    },
    {
      "epoch": 0.2624947501049979,
      "eval_accuracy": 0.6299712456490757,
      "eval_loss": 1.45320725440979,
      "eval_runtime": 53.4488,
      "eval_samples_per_second": 114.633,
      "eval_steps_per_second": 3.592,
      "step": 5000
    },
    {
      "epoch": 0.2887442251154977,
      "grad_norm": 1.19612717628479,
      "learning_rate": 3.5562788744225114e-05,
      "loss": 1.5094,
      "step": 5500
    },
    {
      "epoch": 0.3149937001259975,
      "grad_norm": 1.3306570053100586,
      "learning_rate": 3.425031499370013e-05,
      "loss": 1.4852,
      "step": 6000
    },
    {
      "epoch": 0.3149937001259975,
      "eval_accuracy": 0.6385916478526134,
      "eval_loss": 1.4079374074935913,
      "eval_runtime": 53.6371,
      "eval_samples_per_second": 114.231,
      "eval_steps_per_second": 3.58,
      "step": 6000
    },
    {
      "epoch": 0.34124317513649727,
      "grad_norm": 1.191298484802246,
      "learning_rate": 3.2937841243175137e-05,
      "loss": 1.468,
      "step": 6500
    },
    {
      "epoch": 0.36749265014699706,
      "grad_norm": 1.1890417337417603,
      "learning_rate": 3.162536749265015e-05,
      "loss": 1.4478,
      "step": 7000
    },
    {
      "epoch": 0.36749265014699706,
      "eval_accuracy": 0.6450839121935328,
      "eval_loss": 1.3743374347686768,
      "eval_runtime": 53.9027,
      "eval_samples_per_second": 113.668,
      "eval_steps_per_second": 3.562,
      "step": 7000
    },
    {
      "epoch": 0.39374212515749685,
      "grad_norm": 1.1638110876083374,
      "learning_rate": 3.031289374212516e-05,
      "loss": 1.4309,
      "step": 7500
    },
    {
      "epoch": 0.41999160016799664,
      "grad_norm": 1.2325953245162964,
      "learning_rate": 2.900041999160017e-05,
      "loss": 1.4187,
      "step": 8000
    },
    {
      "epoch": 0.41999160016799664,
      "eval_accuracy": 0.6506158900215877,
      "eval_loss": 1.3458136320114136,
      "eval_runtime": 53.3568,
      "eval_samples_per_second": 114.831,
      "eval_steps_per_second": 3.598,
      "step": 8000
    },
    {
      "epoch": 0.4462410751784964,
      "grad_norm": 1.1118295192718506,
      "learning_rate": 2.768794624107518e-05,
      "loss": 1.403,
      "step": 8500
    },
    {
      "epoch": 0.4724905501889962,
      "grad_norm": 1.198486328125,
      "learning_rate": 2.6375472490550192e-05,
      "loss": 1.3891,
      "step": 9000
    },
    {
      "epoch": 0.4724905501889962,
      "eval_accuracy": 0.65459121134424,
      "eval_loss": 1.3246122598648071,
      "eval_runtime": 53.4313,
      "eval_samples_per_second": 114.671,
      "eval_steps_per_second": 3.593,
      "step": 9000
    },
    {
      "epoch": 0.498740025199496,
      "grad_norm": 1.0942541360855103,
      "learning_rate": 2.50629987400252e-05,
      "loss": 1.3793,
      "step": 9500
    },
    {
      "epoch": 0.5249895002099958,
      "grad_norm": 1.1538808345794678,
      "learning_rate": 2.375052498950021e-05,
      "loss": 1.3687,
      "step": 10000
    },
    {
      "epoch": 0.5249895002099958,
      "eval_accuracy": 0.6589060391794983,
      "eval_loss": 1.3028844594955444,
      "eval_runtime": 53.8932,
      "eval_samples_per_second": 113.688,
      "eval_steps_per_second": 3.563,
      "step": 10000
    },
    {
      "epoch": 0.5512389752204956,
      "grad_norm": 1.0931830406188965,
      "learning_rate": 2.2438051238975222e-05,
      "loss": 1.3626,
      "step": 10500
    },
    {
      "epoch": 0.5774884502309954,
      "grad_norm": 1.116744041442871,
      "learning_rate": 2.1125577488450233e-05,
      "loss": 1.3523,
      "step": 11000
    },
    {
      "epoch": 0.5774884502309954,
      "eval_accuracy": 0.6622822782865323,
      "eval_loss": 1.2859536409378052,
      "eval_runtime": 53.9756,
      "eval_samples_per_second": 113.514,
      "eval_steps_per_second": 3.557,
      "step": 11000
    },
    {
      "epoch": 0.6037379252414952,
      "grad_norm": 1.0872825384140015,
      "learning_rate": 1.9813103737925244e-05,
      "loss": 1.3427,
      "step": 11500
    },
    {
      "epoch": 0.629987400251995,
      "grad_norm": 1.135183334350586,
      "learning_rate": 1.8500629987400252e-05,
      "loss": 1.3367,
      "step": 12000
    },
    {
      "epoch": 0.629987400251995,
      "eval_accuracy": 0.6648805879972004,
      "eval_loss": 1.2726922035217285,
      "eval_runtime": 53.5196,
      "eval_samples_per_second": 114.482,
      "eval_steps_per_second": 3.587,
      "step": 12000
    },
    {
      "epoch": 0.6562368752624947,
      "grad_norm": 1.1367709636688232,
      "learning_rate": 1.7188156236875263e-05,
      "loss": 1.3302,
      "step": 12500
    },
    {
      "epoch": 0.6824863502729945,
      "grad_norm": 1.1728744506835938,
      "learning_rate": 1.5875682486350274e-05,
      "loss": 1.3201,
      "step": 13000
    },
    {
      "epoch": 0.6824863502729945,
      "eval_accuracy": 0.6678633952150961,
      "eval_loss": 1.2587724924087524,
      "eval_runtime": 52.6916,
      "eval_samples_per_second": 116.28,
      "eval_steps_per_second": 3.644,
      "step": 13000
    },
    {
      "epoch": 0.7087358252834943,
      "grad_norm": 1.1379096508026123,
      "learning_rate": 1.4563208735825285e-05,
      "loss": 1.3131,
      "step": 13500
    },
    {
      "epoch": 0.7349853002939941,
      "grad_norm": 1.1188551187515259,
      "learning_rate": 1.3250734985300295e-05,
      "loss": 1.3107,
      "step": 14000
    },
    {
      "epoch": 0.7349853002939941,
      "eval_accuracy": 0.6699808437279283,
      "eval_loss": 1.2476825714111328,
      "eval_runtime": 54.1602,
      "eval_samples_per_second": 113.127,
      "eval_steps_per_second": 3.545,
      "step": 14000
    },
    {
      "epoch": 0.7612347753044939,
      "grad_norm": 1.1443243026733398,
      "learning_rate": 1.1938261234775306e-05,
      "loss": 1.3028,
      "step": 14500
    },
    {
      "epoch": 0.7874842503149937,
      "grad_norm": 1.0995649099349976,
      "learning_rate": 1.0625787484250315e-05,
      "loss": 1.2997,
      "step": 15000
    },
    {
      "epoch": 0.7874842503149937,
      "eval_accuracy": 0.6720863265507016,
      "eval_loss": 1.2376983165740967,
      "eval_runtime": 53.9473,
      "eval_samples_per_second": 113.574,
      "eval_steps_per_second": 3.559,
      "step": 15000
    },
    {
      "epoch": 0.8137337253254935,
      "grad_norm": 1.107537865638733,
      "learning_rate": 9.313313733725326e-06,
      "loss": 1.2937,
      "step": 15500
    },
    {
      "epoch": 0.8399832003359933,
      "grad_norm": 1.1298577785491943,
      "learning_rate": 8.000839983200337e-06,
      "loss": 1.2894,
      "step": 16000
    },
    {
      "epoch": 0.8399832003359933,
      "eval_accuracy": 0.673507690987171,
      "eval_loss": 1.230576515197754,
      "eval_runtime": 53.7584,
      "eval_samples_per_second": 113.973,
      "eval_steps_per_second": 3.572,
      "step": 16000
    },
    {
      "epoch": 0.8662326753464931,
      "grad_norm": 1.1056915521621704,
      "learning_rate": 6.6883662326753475e-06,
      "loss": 1.286,
      "step": 16500
    },
    {
      "epoch": 0.8924821503569929,
      "grad_norm": 1.1191200017929077,
      "learning_rate": 5.375892482150358e-06,
      "loss": 1.2849,
      "step": 17000
    },
    {
      "epoch": 0.8924821503569929,
      "eval_accuracy": 0.6750201542106227,
      "eval_loss": 1.2236781120300293,
      "eval_runtime": 53.3659,
      "eval_samples_per_second": 114.811,
      "eval_steps_per_second": 3.598,
      "step": 17000
    },
    {
      "epoch": 0.9187316253674926,
      "grad_norm": 1.1233047246932983,
      "learning_rate": 4.063418731625368e-06,
      "loss": 1.2786,
      "step": 17500
    },
    {
      "epoch": 0.9449811003779924,
      "grad_norm": 1.1161458492279053,
      "learning_rate": 2.7509449811003783e-06,
      "loss": 1.2738,
      "step": 18000
    },
    {
      "epoch": 0.9449811003779924,
      "eval_accuracy": 0.6761171686752274,
      "eval_loss": 1.2184966802597046,
      "eval_runtime": 53.3087,
      "eval_samples_per_second": 114.934,
      "eval_steps_per_second": 3.602,
      "step": 18000
    },
    {
      "epoch": 0.9712305753884922,
      "grad_norm": 1.147400975227356,
      "learning_rate": 1.4384712305753885e-06,
      "loss": 1.2733,
      "step": 18500
    },
    {
      "epoch": 0.997480050398992,
      "grad_norm": 1.1077104806900024,
      "learning_rate": 1.25997480050399e-07,
      "loss": 1.2725,
      "step": 19000
    },
    {
      "epoch": 0.997480050398992,
      "eval_accuracy": 0.6769313142268385,
      "eval_loss": 1.2148301601409912,
      "eval_runtime": 52.7052,
      "eval_samples_per_second": 116.25,
      "eval_steps_per_second": 3.643,
      "step": 19000
    },
    {
      "epoch": 1.0,
      "step": 19048,
      "total_flos": 3.18526483857408e+17,
      "train_loss": 1.56135786425812,
      "train_runtime": 7465.8111,
      "train_samples_per_second": 81.642,
      "train_steps_per_second": 2.551
    }
  ],
  "logging_steps": 500,
  "max_steps": 19048,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.18526483857408e+17,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}