| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 500, |
| "global_step": 17184, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.00011638733705772812, |
| "grad_norm": 8.116299629211426, |
| "learning_rate": 0.0, |
| "loss": 2.7266, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.05819366852886406, |
| "grad_norm": 57.76313781738281, |
| "learning_rate": 1.732045163543243e-06, |
| "loss": 3.7373, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.05819366852886406, |
| "eval_cosine_accuracy": 0.9404736757278442, |
| "eval_loss": 2.536726713180542, |
| "eval_runtime": 23.2615, |
| "eval_samples_per_second": 406.594, |
| "eval_steps_per_second": 1.591, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.11638733705772812, |
| "grad_norm": 5.569820880889893, |
| "learning_rate": 3.4780584332440925e-06, |
| "loss": 2.8655, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.11638733705772812, |
| "eval_cosine_accuracy": 0.9364559054374695, |
| "eval_loss": 2.6046745777130127, |
| "eval_runtime": 23.4136, |
| "eval_samples_per_second": 403.953, |
| "eval_steps_per_second": 1.58, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.17458100558659218, |
| "grad_norm": 5.892590045928955, |
| "learning_rate": 5.224071702944943e-06, |
| "loss": 2.3859, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.17458100558659218, |
| "eval_cosine_accuracy": 0.9412137866020203, |
| "eval_loss": 2.540637969970703, |
| "eval_runtime": 23.458, |
| "eval_samples_per_second": 403.189, |
| "eval_steps_per_second": 1.577, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.23277467411545624, |
| "grad_norm": 41.33493423461914, |
| "learning_rate": 6.970084972645793e-06, |
| "loss": 2.1884, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.23277467411545624, |
| "eval_cosine_accuracy": 0.9451258182525635, |
| "eval_loss": 2.5318424701690674, |
| "eval_runtime": 23.2789, |
| "eval_samples_per_second": 406.29, |
| "eval_steps_per_second": 1.589, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.2909683426443203, |
| "grad_norm": 15.76939582824707, |
| "learning_rate": 8.71260621580724e-06, |
| "loss": 1.9576, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.2909683426443203, |
| "eval_cosine_accuracy": 0.9467117786407471, |
| "eval_loss": 2.474266767501831, |
| "eval_runtime": 23.3675, |
| "eval_samples_per_second": 404.75, |
| "eval_steps_per_second": 1.583, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.34916201117318435, |
| "grad_norm": 7.4116950035095215, |
| "learning_rate": 1.045861948550809e-05, |
| "loss": 1.8211, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.34916201117318435, |
| "eval_cosine_accuracy": 0.9435398578643799, |
| "eval_loss": 2.547072649002075, |
| "eval_runtime": 23.3771, |
| "eval_samples_per_second": 404.583, |
| "eval_steps_per_second": 1.583, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.4073556797020484, |
| "grad_norm": 25.99747657775879, |
| "learning_rate": 1.2204632755208939e-05, |
| "loss": 1.6603, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.4073556797020484, |
| "eval_cosine_accuracy": 0.9489321112632751, |
| "eval_loss": 2.472174644470215, |
| "eval_runtime": 23.4652, |
| "eval_samples_per_second": 403.065, |
| "eval_steps_per_second": 1.577, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.4655493482309125, |
| "grad_norm": 5.763104438781738, |
| "learning_rate": 1.395064602490979e-05, |
| "loss": 1.596, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.4655493482309125, |
| "eval_cosine_accuracy": 0.9438570737838745, |
| "eval_loss": 2.5426251888275146, |
| "eval_runtime": 23.3963, |
| "eval_samples_per_second": 404.252, |
| "eval_steps_per_second": 1.581, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.5237430167597765, |
| "grad_norm": 4.893315315246582, |
| "learning_rate": 1.5693167268071237e-05, |
| "loss": 1.5379, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.5237430167597765, |
| "eval_cosine_accuracy": 0.9473461508750916, |
| "eval_loss": 2.4768149852752686, |
| "eval_runtime": 23.5926, |
| "eval_samples_per_second": 400.888, |
| "eval_steps_per_second": 1.568, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.5819366852886406, |
| "grad_norm": 43.368614196777344, |
| "learning_rate": 1.7439180537772086e-05, |
| "loss": 1.5397, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.5819366852886406, |
| "eval_cosine_accuracy": 0.9487206339836121, |
| "eval_loss": 2.4771170616149902, |
| "eval_runtime": 23.566, |
| "eval_samples_per_second": 401.341, |
| "eval_steps_per_second": 1.57, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.6401303538175046, |
| "grad_norm": 0.21546457707881927, |
| "learning_rate": 1.9185193807472936e-05, |
| "loss": 1.381, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.6401303538175046, |
| "eval_cosine_accuracy": 0.9412137866020203, |
| "eval_loss": 2.6126925945281982, |
| "eval_runtime": 23.871, |
| "eval_samples_per_second": 396.213, |
| "eval_steps_per_second": 1.55, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.6983240223463687, |
| "grad_norm": 27.70214080810547, |
| "learning_rate": 2.0931207077173788e-05, |
| "loss": 1.4407, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.6983240223463687, |
| "eval_cosine_accuracy": 0.9492493271827698, |
| "eval_loss": 2.457711935043335, |
| "eval_runtime": 23.5818, |
| "eval_samples_per_second": 401.072, |
| "eval_steps_per_second": 1.569, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.7565176908752328, |
| "grad_norm": 0.6600456237792969, |
| "learning_rate": 2.2677220346874637e-05, |
| "loss": 1.3692, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.7565176908752328, |
| "eval_cosine_accuracy": 0.9438570737838745, |
| "eval_loss": 2.48168683052063, |
| "eval_runtime": 23.5688, |
| "eval_samples_per_second": 401.294, |
| "eval_steps_per_second": 1.57, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.8147113594040968, |
| "grad_norm": 15.486236572265625, |
| "learning_rate": 2.4419741590036085e-05, |
| "loss": 1.2731, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.8147113594040968, |
| "eval_cosine_accuracy": 0.943751335144043, |
| "eval_loss": 2.5139832496643066, |
| "eval_runtime": 23.6217, |
| "eval_samples_per_second": 400.394, |
| "eval_steps_per_second": 1.566, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.8729050279329609, |
| "grad_norm": 13.570350646972656, |
| "learning_rate": 2.6165754859736934e-05, |
| "loss": 1.223, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.8729050279329609, |
| "eval_cosine_accuracy": 0.9494607448577881, |
| "eval_loss": 2.4431588649749756, |
| "eval_runtime": 23.5387, |
| "eval_samples_per_second": 401.807, |
| "eval_steps_per_second": 1.572, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.931098696461825, |
| "grad_norm": 4.8542633056640625, |
| "learning_rate": 2.7911768129437783e-05, |
| "loss": 1.1982, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.931098696461825, |
| "eval_cosine_accuracy": 0.9420596361160278, |
| "eval_loss": 2.5187907218933105, |
| "eval_runtime": 23.6462, |
| "eval_samples_per_second": 399.98, |
| "eval_steps_per_second": 1.565, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.9892923649906891, |
| "grad_norm": 36.649906158447266, |
| "learning_rate": 2.9657781399138632e-05, |
| "loss": 1.1693, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.9892923649906891, |
| "eval_cosine_accuracy": 0.9469232559204102, |
| "eval_loss": 2.4668424129486084, |
| "eval_runtime": 23.9878, |
| "eval_samples_per_second": 394.284, |
| "eval_steps_per_second": 1.542, |
| "step": 8500 |
| }, |
| { |
| "epoch": 1.047486033519553, |
| "grad_norm": 14.853631019592285, |
| "learning_rate": 2.9982089877066498e-05, |
| "loss": 1.1045, |
| "step": 9000 |
| }, |
| { |
| "epoch": 1.047486033519553, |
| "eval_cosine_accuracy": 0.9422711133956909, |
| "eval_loss": 2.6691627502441406, |
| "eval_runtime": 23.6926, |
| "eval_samples_per_second": 399.197, |
| "eval_steps_per_second": 1.562, |
| "step": 9000 |
| }, |
| { |
| "epoch": 1.105679702048417, |
| "grad_norm": 15.238059997558594, |
| "learning_rate": 2.9909854135876818e-05, |
| "loss": 0.9695, |
| "step": 9500 |
| }, |
| { |
| "epoch": 1.105679702048417, |
| "eval_cosine_accuracy": 0.9454430341720581, |
| "eval_loss": 2.5817720890045166, |
| "eval_runtime": 23.7575, |
| "eval_samples_per_second": 398.106, |
| "eval_steps_per_second": 1.557, |
| "step": 9500 |
| }, |
| { |
| "epoch": 1.1638733705772812, |
| "grad_norm": 0.052569806575775146, |
| "learning_rate": 2.9782179657431337e-05, |
| "loss": 0.9498, |
| "step": 10000 |
| }, |
| { |
| "epoch": 1.1638733705772812, |
| "eval_cosine_accuracy": 0.9443857073783875, |
| "eval_loss": 2.5811285972595215, |
| "eval_runtime": 24.7703, |
| "eval_samples_per_second": 381.828, |
| "eval_steps_per_second": 1.494, |
| "step": 10000 |
| }, |
| { |
| "epoch": 1.2220670391061452, |
| "grad_norm": 5.478404998779297, |
| "learning_rate": 2.9599629603266307e-05, |
| "loss": 1.0137, |
| "step": 10500 |
| }, |
| { |
| "epoch": 1.2220670391061452, |
| "eval_cosine_accuracy": 0.9443857073783875, |
| "eval_loss": 2.588257312774658, |
| "eval_runtime": 23.6811, |
| "eval_samples_per_second": 399.391, |
| "eval_steps_per_second": 1.562, |
| "step": 10500 |
| }, |
| { |
| "epoch": 1.2802607076350094, |
| "grad_norm": 22.226593017578125, |
| "learning_rate": 2.93628816501217e-05, |
| "loss": 1.0031, |
| "step": 11000 |
| }, |
| { |
| "epoch": 1.2802607076350094, |
| "eval_cosine_accuracy": 0.943751335144043, |
| "eval_loss": 2.626814842224121, |
| "eval_runtime": 24.0203, |
| "eval_samples_per_second": 393.75, |
| "eval_steps_per_second": 1.54, |
| "step": 11000 |
| }, |
| { |
| "epoch": 1.3384543761638734, |
| "grad_norm": 18.21111488342285, |
| "learning_rate": 2.907281467246193e-05, |
| "loss": 0.9391, |
| "step": 11500 |
| }, |
| { |
| "epoch": 1.3384543761638734, |
| "eval_cosine_accuracy": 0.9466060400009155, |
| "eval_loss": 2.5278029441833496, |
| "eval_runtime": 23.6597, |
| "eval_samples_per_second": 399.751, |
| "eval_steps_per_second": 1.564, |
| "step": 11500 |
| }, |
| { |
| "epoch": 1.3966480446927374, |
| "grad_norm": 13.534646987915039, |
| "learning_rate": 2.873050547984859e-05, |
| "loss": 0.9508, |
| "step": 12000 |
| }, |
| { |
| "epoch": 1.3966480446927374, |
| "eval_cosine_accuracy": 0.9545358419418335, |
| "eval_loss": 2.387155532836914, |
| "eval_runtime": 24.4172, |
| "eval_samples_per_second": 387.35, |
| "eval_steps_per_second": 1.515, |
| "step": 12000 |
| }, |
| { |
| "epoch": 1.4548417132216014, |
| "grad_norm": 4.7890801429748535, |
| "learning_rate": 2.8337224819522882e-05, |
| "loss": 0.8975, |
| "step": 12500 |
| }, |
| { |
| "epoch": 1.4548417132216014, |
| "eval_cosine_accuracy": 0.9511524438858032, |
| "eval_loss": 2.531130313873291, |
| "eval_runtime": 23.694, |
| "eval_samples_per_second": 399.173, |
| "eval_steps_per_second": 1.562, |
| "step": 12500 |
| }, |
| { |
| "epoch": 1.5130353817504656, |
| "grad_norm": 39.141204833984375, |
| "learning_rate": 2.7894432659037298e-05, |
| "loss": 0.9006, |
| "step": 13000 |
| }, |
| { |
| "epoch": 1.5130353817504656, |
| "eval_cosine_accuracy": 0.9498836994171143, |
| "eval_loss": 2.4939699172973633, |
| "eval_runtime": 23.6563, |
| "eval_samples_per_second": 399.809, |
| "eval_steps_per_second": 1.564, |
| "step": 13000 |
| }, |
| { |
| "epoch": 1.5712290502793296, |
| "grad_norm": 5.199061393737793, |
| "learning_rate": 2.740480067649311e-05, |
| "loss": 0.9293, |
| "step": 13500 |
| }, |
| { |
| "epoch": 1.5712290502793296, |
| "eval_cosine_accuracy": 0.9558045864105225, |
| "eval_loss": 2.3089163303375244, |
| "eval_runtime": 23.8068, |
| "eval_samples_per_second": 397.282, |
| "eval_steps_per_second": 1.554, |
| "step": 13500 |
| }, |
| { |
| "epoch": 1.6294227188081938, |
| "grad_norm": 13.1704683303833, |
| "learning_rate": 2.6868184678138455e-05, |
| "loss": 0.9289, |
| "step": 14000 |
| }, |
| { |
| "epoch": 1.6294227188081938, |
| "eval_cosine_accuracy": 0.9548530578613281, |
| "eval_loss": 2.4148993492126465, |
| "eval_runtime": 23.6698, |
| "eval_samples_per_second": 399.58, |
| "eval_steps_per_second": 1.563, |
| "step": 14000 |
| }, |
| { |
| "epoch": 1.6876163873370578, |
| "grad_norm": 4.7075276374816895, |
| "learning_rate": 2.628751066655086e-05, |
| "loss": 0.9206, |
| "step": 14500 |
| }, |
| { |
| "epoch": 1.6876163873370578, |
| "eval_cosine_accuracy": 0.9537957310676575, |
| "eval_loss": 2.3580262660980225, |
| "eval_runtime": 23.8237, |
| "eval_samples_per_second": 396.999, |
| "eval_steps_per_second": 1.553, |
| "step": 14500 |
| }, |
| { |
| "epoch": 1.7458100558659218, |
| "grad_norm": 17.35055923461914, |
| "learning_rate": 2.5664934265693082e-05, |
| "loss": 0.9212, |
| "step": 15000 |
| }, |
| { |
| "epoch": 1.7458100558659218, |
| "eval_cosine_accuracy": 0.9540072083473206, |
| "eval_loss": 2.4263856410980225, |
| "eval_runtime": 24.2294, |
| "eval_samples_per_second": 390.352, |
| "eval_steps_per_second": 1.527, |
| "step": 15000 |
| }, |
| { |
| "epoch": 1.8040037243947857, |
| "grad_norm": 4.250364303588867, |
| "learning_rate": 2.5002766652889314e-05, |
| "loss": 0.8754, |
| "step": 15500 |
| }, |
| { |
| "epoch": 1.8040037243947857, |
| "eval_cosine_accuracy": 0.9536899924278259, |
| "eval_loss": 2.4231128692626953, |
| "eval_runtime": 23.6253, |
| "eval_samples_per_second": 400.334, |
| "eval_steps_per_second": 1.566, |
| "step": 15500 |
| }, |
| { |
| "epoch": 1.86219739292365, |
| "grad_norm": 7.852247714996338, |
| "learning_rate": 2.4303465979090062e-05, |
| "loss": 0.8594, |
| "step": 16000 |
| }, |
| { |
| "epoch": 1.86219739292365, |
| "eval_cosine_accuracy": 0.954430103302002, |
| "eval_loss": 2.395444393157959, |
| "eval_runtime": 23.5537, |
| "eval_samples_per_second": 401.551, |
| "eval_steps_per_second": 1.571, |
| "step": 16000 |
| }, |
| { |
| "epoch": 1.920391061452514, |
| "grad_norm": 0.0026893599424511194, |
| "learning_rate": 2.3571128596645755e-05, |
| "loss": 0.8732, |
| "step": 16500 |
| }, |
| { |
| "epoch": 1.920391061452514, |
| "eval_cosine_accuracy": 0.9555931687355042, |
| "eval_loss": 2.3798062801361084, |
| "eval_runtime": 23.6528, |
| "eval_samples_per_second": 399.868, |
| "eval_steps_per_second": 1.564, |
| "step": 16500 |
| }, |
| { |
| "epoch": 1.9785847299813781, |
| "grad_norm": 1.7342342138290405, |
| "learning_rate": 2.2805538836485382e-05, |
| "loss": 0.8562, |
| "step": 17000 |
| }, |
| { |
| "epoch": 1.9785847299813781, |
| "eval_cosine_accuracy": 0.9586593508720398, |
| "eval_loss": 2.3469178676605225, |
| "eval_runtime": 23.7353, |
| "eval_samples_per_second": 398.479, |
| "eval_steps_per_second": 1.559, |
| "step": 17000 |
| } |
| ], |
| "logging_steps": 500, |
| "max_steps": 34368, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 4, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 256, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|