{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 819,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03663003663003663,
      "grad_norm": 7.746065665628135,
      "learning_rate": 5e-06,
      "loss": 1.0405,
      "step": 10
    },
    {
      "epoch": 0.07326007326007326,
      "grad_norm": 2.858450692340174,
      "learning_rate": 5e-06,
      "loss": 0.9101,
      "step": 20
    },
    {
      "epoch": 0.10989010989010989,
      "grad_norm": 5.616678502031101,
      "learning_rate": 5e-06,
      "loss": 0.8722,
      "step": 30
    },
    {
      "epoch": 0.14652014652014653,
      "grad_norm": 1.8086289320620221,
      "learning_rate": 5e-06,
      "loss": 0.8529,
      "step": 40
    },
    {
      "epoch": 0.18315018315018314,
      "grad_norm": 1.1491569593758013,
      "learning_rate": 5e-06,
      "loss": 0.8259,
      "step": 50
    },
    {
      "epoch": 0.21978021978021978,
      "grad_norm": 1.1648582184563463,
      "learning_rate": 5e-06,
      "loss": 0.8144,
      "step": 60
    },
    {
      "epoch": 0.2564102564102564,
      "grad_norm": 1.0006985861658089,
      "learning_rate": 5e-06,
      "loss": 0.7986,
      "step": 70
    },
    {
      "epoch": 0.29304029304029305,
      "grad_norm": 1.1821538573997477,
      "learning_rate": 5e-06,
      "loss": 0.7911,
      "step": 80
    },
    {
      "epoch": 0.32967032967032966,
      "grad_norm": 1.5897294264378354,
      "learning_rate": 5e-06,
      "loss": 0.7807,
      "step": 90
    },
    {
      "epoch": 0.3663003663003663,
      "grad_norm": 1.9362698717241806,
      "learning_rate": 5e-06,
      "loss": 0.7819,
      "step": 100
    },
    {
      "epoch": 0.40293040293040294,
      "grad_norm": 1.0281035872096098,
      "learning_rate": 5e-06,
      "loss": 0.7791,
      "step": 110
    },
    {
      "epoch": 0.43956043956043955,
      "grad_norm": 1.6454534962205227,
      "learning_rate": 5e-06,
      "loss": 0.7688,
      "step": 120
    },
    {
      "epoch": 0.47619047619047616,
      "grad_norm": 0.9583089645191222,
      "learning_rate": 5e-06,
      "loss": 0.7669,
      "step": 130
    },
    {
      "epoch": 0.5128205128205128,
      "grad_norm": 0.7745194128699995,
      "learning_rate": 5e-06,
      "loss": 0.764,
      "step": 140
    },
    {
      "epoch": 0.5494505494505495,
      "grad_norm": 0.7276307011919524,
      "learning_rate": 5e-06,
      "loss": 0.7586,
      "step": 150
    },
    {
      "epoch": 0.5860805860805861,
      "grad_norm": 0.8554951216643023,
      "learning_rate": 5e-06,
      "loss": 0.7572,
      "step": 160
    },
    {
      "epoch": 0.6227106227106227,
      "grad_norm": 0.7121498821814782,
      "learning_rate": 5e-06,
      "loss": 0.7574,
      "step": 170
    },
    {
      "epoch": 0.6593406593406593,
      "grad_norm": 0.6839163656685533,
      "learning_rate": 5e-06,
      "loss": 0.7568,
      "step": 180
    },
    {
      "epoch": 0.6959706959706959,
      "grad_norm": 0.9093255308667819,
      "learning_rate": 5e-06,
      "loss": 0.7472,
      "step": 190
    },
    {
      "epoch": 0.7326007326007326,
      "grad_norm": 0.6638464031533008,
      "learning_rate": 5e-06,
      "loss": 0.751,
      "step": 200
    },
    {
      "epoch": 0.7692307692307693,
      "grad_norm": 0.5927695218746816,
      "learning_rate": 5e-06,
      "loss": 0.7477,
      "step": 210
    },
    {
      "epoch": 0.8058608058608059,
      "grad_norm": 0.6423957250273598,
      "learning_rate": 5e-06,
      "loss": 0.7473,
      "step": 220
    },
    {
      "epoch": 0.8424908424908425,
      "grad_norm": 0.6576759690318968,
      "learning_rate": 5e-06,
      "loss": 0.7422,
      "step": 230
    },
    {
      "epoch": 0.8791208791208791,
      "grad_norm": 0.6476360077544144,
      "learning_rate": 5e-06,
      "loss": 0.7417,
      "step": 240
    },
    {
      "epoch": 0.9157509157509157,
      "grad_norm": 0.6320860393471378,
      "learning_rate": 5e-06,
      "loss": 0.7409,
      "step": 250
    },
    {
      "epoch": 0.9523809523809523,
      "grad_norm": 0.5912257284209596,
      "learning_rate": 5e-06,
      "loss": 0.7431,
      "step": 260
    },
    {
      "epoch": 0.989010989010989,
      "grad_norm": 0.6319178097643761,
      "learning_rate": 5e-06,
      "loss": 0.742,
      "step": 270
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.7415268421173096,
      "eval_runtime": 26.4114,
      "eval_samples_per_second": 278.062,
      "eval_steps_per_second": 1.098,
      "step": 273
    },
    {
      "epoch": 1.0256410256410255,
      "grad_norm": 0.7382077696725204,
      "learning_rate": 5e-06,
      "loss": 0.7069,
      "step": 280
    },
    {
      "epoch": 1.0622710622710623,
      "grad_norm": 0.7190349411351752,
      "learning_rate": 5e-06,
      "loss": 0.6897,
      "step": 290
    },
    {
      "epoch": 1.098901098901099,
      "grad_norm": 0.7067376226598984,
      "learning_rate": 5e-06,
      "loss": 0.683,
      "step": 300
    },
    {
      "epoch": 1.1355311355311355,
      "grad_norm": 0.5710281059044071,
      "learning_rate": 5e-06,
      "loss": 0.6868,
      "step": 310
    },
    {
      "epoch": 1.1721611721611722,
      "grad_norm": 0.6626952213065733,
      "learning_rate": 5e-06,
      "loss": 0.6831,
      "step": 320
    },
    {
      "epoch": 1.2087912087912087,
      "grad_norm": 0.7005105428895475,
      "learning_rate": 5e-06,
      "loss": 0.6869,
      "step": 330
    },
    {
      "epoch": 1.2454212454212454,
      "grad_norm": 0.6619897882086011,
      "learning_rate": 5e-06,
      "loss": 0.6878,
      "step": 340
    },
    {
      "epoch": 1.282051282051282,
      "grad_norm": 0.6508408490480377,
      "learning_rate": 5e-06,
      "loss": 0.6925,
      "step": 350
    },
    {
      "epoch": 1.3186813186813187,
      "grad_norm": 0.5432662746572642,
      "learning_rate": 5e-06,
      "loss": 0.6876,
      "step": 360
    },
    {
      "epoch": 1.3553113553113554,
      "grad_norm": 0.6107105594531901,
      "learning_rate": 5e-06,
      "loss": 0.6931,
      "step": 370
    },
    {
      "epoch": 1.3919413919413919,
      "grad_norm": 0.739761075105529,
      "learning_rate": 5e-06,
      "loss": 0.6866,
      "step": 380
    },
    {
      "epoch": 1.4285714285714286,
      "grad_norm": 0.8191906114187061,
      "learning_rate": 5e-06,
      "loss": 0.6906,
      "step": 390
    },
    {
      "epoch": 1.4652014652014653,
      "grad_norm": 0.7122072456112296,
      "learning_rate": 5e-06,
      "loss": 0.6887,
      "step": 400
    },
    {
      "epoch": 1.5018315018315018,
      "grad_norm": 0.636634621554325,
      "learning_rate": 5e-06,
      "loss": 0.6873,
      "step": 410
    },
    {
      "epoch": 1.5384615384615383,
      "grad_norm": 0.6281126431822524,
      "learning_rate": 5e-06,
      "loss": 0.6843,
      "step": 420
    },
    {
      "epoch": 1.575091575091575,
      "grad_norm": 0.6457324290442519,
      "learning_rate": 5e-06,
      "loss": 0.6864,
      "step": 430
    },
    {
      "epoch": 1.6117216117216118,
      "grad_norm": 0.6262969556153274,
      "learning_rate": 5e-06,
      "loss": 0.6839,
      "step": 440
    },
    {
      "epoch": 1.6483516483516483,
      "grad_norm": 0.5690984785192538,
      "learning_rate": 5e-06,
      "loss": 0.6928,
      "step": 450
    },
    {
      "epoch": 1.684981684981685,
      "grad_norm": 0.6172111079980963,
      "learning_rate": 5e-06,
      "loss": 0.6834,
      "step": 460
    },
    {
      "epoch": 1.7216117216117217,
      "grad_norm": 0.7070197339375575,
      "learning_rate": 5e-06,
      "loss": 0.6871,
      "step": 470
    },
    {
      "epoch": 1.7582417582417582,
      "grad_norm": 0.6047694711574751,
      "learning_rate": 5e-06,
      "loss": 0.6871,
      "step": 480
    },
    {
      "epoch": 1.7948717948717947,
      "grad_norm": 0.5464670796066172,
      "learning_rate": 5e-06,
      "loss": 0.6853,
      "step": 490
    },
    {
      "epoch": 1.8315018315018317,
      "grad_norm": 0.7356761200809793,
      "learning_rate": 5e-06,
      "loss": 0.6894,
      "step": 500
    },
    {
      "epoch": 1.8681318681318682,
      "grad_norm": 0.7493300019111138,
      "learning_rate": 5e-06,
      "loss": 0.6857,
      "step": 510
    },
    {
      "epoch": 1.9047619047619047,
      "grad_norm": 0.5706346610150999,
      "learning_rate": 5e-06,
      "loss": 0.6821,
      "step": 520
    },
    {
      "epoch": 1.9413919413919414,
      "grad_norm": 0.6273732081753551,
      "learning_rate": 5e-06,
      "loss": 0.6867,
      "step": 530
    },
    {
      "epoch": 1.978021978021978,
      "grad_norm": 0.608424571172591,
      "learning_rate": 5e-06,
      "loss": 0.6821,
      "step": 540
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.7290822267532349,
      "eval_runtime": 26.4229,
      "eval_samples_per_second": 277.941,
      "eval_steps_per_second": 1.098,
      "step": 546
    },
    {
      "epoch": 2.0146520146520146,
      "grad_norm": 1.1435676214887527,
      "learning_rate": 5e-06,
      "loss": 0.6593,
      "step": 550
    },
    {
      "epoch": 2.051282051282051,
      "grad_norm": 0.8302719819904981,
      "learning_rate": 5e-06,
      "loss": 0.6276,
      "step": 560
    },
    {
      "epoch": 2.087912087912088,
      "grad_norm": 0.8821536275021126,
      "learning_rate": 5e-06,
      "loss": 0.6319,
      "step": 570
    },
    {
      "epoch": 2.1245421245421245,
      "grad_norm": 0.7567116572690076,
      "learning_rate": 5e-06,
      "loss": 0.6317,
      "step": 580
    },
    {
      "epoch": 2.161172161172161,
      "grad_norm": 0.7854569193284114,
      "learning_rate": 5e-06,
      "loss": 0.6303,
      "step": 590
    },
    {
      "epoch": 2.197802197802198,
      "grad_norm": 0.7432443286862768,
      "learning_rate": 5e-06,
      "loss": 0.6336,
      "step": 600
    },
    {
      "epoch": 2.2344322344322345,
      "grad_norm": 0.6585567960235167,
      "learning_rate": 5e-06,
      "loss": 0.6324,
      "step": 610
    },
    {
      "epoch": 2.271062271062271,
      "grad_norm": 0.655030138483751,
      "learning_rate": 5e-06,
      "loss": 0.6354,
      "step": 620
    },
    {
      "epoch": 2.3076923076923075,
      "grad_norm": 0.7081546371367903,
      "learning_rate": 5e-06,
      "loss": 0.6355,
      "step": 630
    },
    {
      "epoch": 2.3443223443223444,
      "grad_norm": 0.6817413886775435,
      "learning_rate": 5e-06,
      "loss": 0.6323,
      "step": 640
    },
    {
      "epoch": 2.380952380952381,
      "grad_norm": 0.7087190344600633,
      "learning_rate": 5e-06,
      "loss": 0.6351,
      "step": 650
    },
    {
      "epoch": 2.4175824175824174,
      "grad_norm": 0.6282200808159406,
      "learning_rate": 5e-06,
      "loss": 0.6309,
      "step": 660
    },
    {
      "epoch": 2.4542124542124544,
      "grad_norm": 0.7257484686385395,
      "learning_rate": 5e-06,
      "loss": 0.6335,
      "step": 670
    },
    {
      "epoch": 2.490842490842491,
      "grad_norm": 0.7901264231040732,
      "learning_rate": 5e-06,
      "loss": 0.6383,
      "step": 680
    },
    {
      "epoch": 2.5274725274725274,
      "grad_norm": 0.5844396831283855,
      "learning_rate": 5e-06,
      "loss": 0.6342,
      "step": 690
    },
    {
      "epoch": 2.564102564102564,
      "grad_norm": 0.7132030040768085,
      "learning_rate": 5e-06,
      "loss": 0.6354,
      "step": 700
    },
    {
      "epoch": 2.600732600732601,
      "grad_norm": 0.7576742849969995,
      "learning_rate": 5e-06,
      "loss": 0.6332,
      "step": 710
    },
    {
      "epoch": 2.6373626373626373,
      "grad_norm": 0.9618623866692995,
      "learning_rate": 5e-06,
      "loss": 0.6382,
      "step": 720
    },
    {
      "epoch": 2.6739926739926743,
      "grad_norm": 0.7036643018293387,
      "learning_rate": 5e-06,
      "loss": 0.637,
      "step": 730
    },
    {
      "epoch": 2.7106227106227108,
      "grad_norm": 0.594473573698193,
      "learning_rate": 5e-06,
      "loss": 0.6336,
      "step": 740
    },
    {
      "epoch": 2.7472527472527473,
      "grad_norm": 0.6437871707069581,
      "learning_rate": 5e-06,
      "loss": 0.6371,
      "step": 750
    },
    {
      "epoch": 2.7838827838827838,
      "grad_norm": 0.6660532045165727,
      "learning_rate": 5e-06,
      "loss": 0.6333,
      "step": 760
    },
    {
      "epoch": 2.8205128205128203,
      "grad_norm": 0.592751889739025,
      "learning_rate": 5e-06,
      "loss": 0.6356,
      "step": 770
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 0.5902791558514751,
      "learning_rate": 5e-06,
      "loss": 0.6336,
      "step": 780
    },
    {
      "epoch": 2.8937728937728937,
      "grad_norm": 0.5484108433220659,
      "learning_rate": 5e-06,
      "loss": 0.6356,
      "step": 790
    },
    {
      "epoch": 2.9304029304029307,
      "grad_norm": 0.6657396541061575,
      "learning_rate": 5e-06,
      "loss": 0.6352,
      "step": 800
    },
    {
      "epoch": 2.967032967032967,
      "grad_norm": 0.68952795495605,
      "learning_rate": 5e-06,
      "loss": 0.6339,
      "step": 810
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.7330417037010193,
      "eval_runtime": 25.8793,
      "eval_samples_per_second": 283.778,
      "eval_steps_per_second": 1.121,
      "step": 819
    },
    {
      "epoch": 3.0,
      "step": 819,
      "total_flos": 1371855504015360.0,
      "train_loss": 0.7030115820403792,
      "train_runtime": 5219.8536,
      "train_samples_per_second": 80.188,
      "train_steps_per_second": 0.157
    }
  ],
  "logging_steps": 10,
  "max_steps": 819,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1371855504015360.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}