| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.9972041006523766, |
| "eval_steps": 500, |
| "global_step": 804, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.03727865796831314, |
| "grad_norm": 2.863966708076063, |
| "learning_rate": 5e-06, |
| "loss": 1.026, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.07455731593662628, |
| "grad_norm": 1.3926538767403338, |
| "learning_rate": 5e-06, |
| "loss": 0.9052, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.11183597390493942, |
| "grad_norm": 1.2844171360679957, |
| "learning_rate": 5e-06, |
| "loss": 0.8595, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.14911463187325255, |
| "grad_norm": 1.2804154175067717, |
| "learning_rate": 5e-06, |
| "loss": 0.8369, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.1863932898415657, |
| "grad_norm": 1.2295434002540855, |
| "learning_rate": 5e-06, |
| "loss": 0.8189, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.22367194780987884, |
| "grad_norm": 0.9533085893615539, |
| "learning_rate": 5e-06, |
| "loss": 0.8029, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.26095060577819196, |
| "grad_norm": 0.9173543178817729, |
| "learning_rate": 5e-06, |
| "loss": 0.7942, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.2982292637465051, |
| "grad_norm": 1.1035649744908476, |
| "learning_rate": 5e-06, |
| "loss": 0.7819, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.33550792171481825, |
| "grad_norm": 1.0858240553462641, |
| "learning_rate": 5e-06, |
| "loss": 0.7749, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.3727865796831314, |
| "grad_norm": 0.7409831961561815, |
| "learning_rate": 5e-06, |
| "loss": 0.7747, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.41006523765144454, |
| "grad_norm": 0.7383266991759111, |
| "learning_rate": 5e-06, |
| "loss": 0.771, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.4473438956197577, |
| "grad_norm": 0.729136950048857, |
| "learning_rate": 5e-06, |
| "loss": 0.7661, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.4846225535880708, |
| "grad_norm": 0.7091298599299047, |
| "learning_rate": 5e-06, |
| "loss": 0.7686, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.5219012115563839, |
| "grad_norm": 0.8897155658891847, |
| "learning_rate": 5e-06, |
| "loss": 0.7585, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.5591798695246971, |
| "grad_norm": 0.991237889992658, |
| "learning_rate": 5e-06, |
| "loss": 0.7596, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.5964585274930102, |
| "grad_norm": 0.6956773386703347, |
| "learning_rate": 5e-06, |
| "loss": 0.7567, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.6337371854613234, |
| "grad_norm": 0.6627254486504695, |
| "learning_rate": 5e-06, |
| "loss": 0.7553, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.6710158434296365, |
| "grad_norm": 0.8294084043245143, |
| "learning_rate": 5e-06, |
| "loss": 0.7507, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.7082945013979497, |
| "grad_norm": 0.8385421799416569, |
| "learning_rate": 5e-06, |
| "loss": 0.752, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.7455731593662628, |
| "grad_norm": 1.2138081527115805, |
| "learning_rate": 5e-06, |
| "loss": 0.7478, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.782851817334576, |
| "grad_norm": 0.6817842666893509, |
| "learning_rate": 5e-06, |
| "loss": 0.7459, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.8201304753028891, |
| "grad_norm": 0.6011497469129173, |
| "learning_rate": 5e-06, |
| "loss": 0.7457, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.8574091332712023, |
| "grad_norm": 0.737363078092173, |
| "learning_rate": 5e-06, |
| "loss": 0.7454, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.8946877912395154, |
| "grad_norm": 0.9165088326689114, |
| "learning_rate": 5e-06, |
| "loss": 0.7399, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.9319664492078286, |
| "grad_norm": 0.631133171846698, |
| "learning_rate": 5e-06, |
| "loss": 0.7459, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.9692451071761417, |
| "grad_norm": 0.6095804573659136, |
| "learning_rate": 5e-06, |
| "loss": 0.7451, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.9990680335507922, |
| "eval_loss": 0.7414608001708984, |
| "eval_runtime": 285.7705, |
| "eval_samples_per_second": 25.286, |
| "eval_steps_per_second": 0.395, |
| "step": 268 |
| }, |
| { |
| "epoch": 1.0065237651444547, |
| "grad_norm": 0.991121087496217, |
| "learning_rate": 5e-06, |
| "loss": 0.7852, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.0438024231127678, |
| "grad_norm": 0.9766463352882261, |
| "learning_rate": 5e-06, |
| "loss": 0.6895, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.0810810810810811, |
| "grad_norm": 0.9726663140123732, |
| "learning_rate": 5e-06, |
| "loss": 0.6941, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.1183597390493942, |
| "grad_norm": 0.7674442614556171, |
| "learning_rate": 5e-06, |
| "loss": 0.6928, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.1556383970177073, |
| "grad_norm": 0.686949635032861, |
| "learning_rate": 5e-06, |
| "loss": 0.6868, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.1929170549860204, |
| "grad_norm": 0.7040486633398215, |
| "learning_rate": 5e-06, |
| "loss": 0.6923, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.2301957129543337, |
| "grad_norm": 0.757865125886295, |
| "learning_rate": 5e-06, |
| "loss": 0.6901, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.2674743709226468, |
| "grad_norm": 0.9846801239791743, |
| "learning_rate": 5e-06, |
| "loss": 0.6918, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.30475302889096, |
| "grad_norm": 0.6230374762078432, |
| "learning_rate": 5e-06, |
| "loss": 0.6894, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.342031686859273, |
| "grad_norm": 0.6833293101908209, |
| "learning_rate": 5e-06, |
| "loss": 0.6882, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.3793103448275863, |
| "grad_norm": 0.7286800883906255, |
| "learning_rate": 5e-06, |
| "loss": 0.6895, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.4165890027958994, |
| "grad_norm": 0.7749966544687281, |
| "learning_rate": 5e-06, |
| "loss": 0.6913, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.4538676607642125, |
| "grad_norm": 0.77958879320336, |
| "learning_rate": 5e-06, |
| "loss": 0.6897, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.4911463187325256, |
| "grad_norm": 0.7140550646519259, |
| "learning_rate": 5e-06, |
| "loss": 0.685, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.5284249767008387, |
| "grad_norm": 0.6584304607146931, |
| "learning_rate": 5e-06, |
| "loss": 0.6902, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.565703634669152, |
| "grad_norm": 0.7452382115118451, |
| "learning_rate": 5e-06, |
| "loss": 0.6848, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.602982292637465, |
| "grad_norm": 0.8519625836288258, |
| "learning_rate": 5e-06, |
| "loss": 0.6888, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.6402609506057781, |
| "grad_norm": 0.562624844847511, |
| "learning_rate": 5e-06, |
| "loss": 0.6864, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.6775396085740915, |
| "grad_norm": 0.7282578944985719, |
| "learning_rate": 5e-06, |
| "loss": 0.6925, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.7148182665424043, |
| "grad_norm": 0.8007629798945024, |
| "learning_rate": 5e-06, |
| "loss": 0.6922, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.7520969245107176, |
| "grad_norm": 0.6847302844270698, |
| "learning_rate": 5e-06, |
| "loss": 0.6873, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.7893755824790307, |
| "grad_norm": 0.6294374666555999, |
| "learning_rate": 5e-06, |
| "loss": 0.6845, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.8266542404473438, |
| "grad_norm": 0.7641590639056253, |
| "learning_rate": 5e-06, |
| "loss": 0.6862, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.8639328984156571, |
| "grad_norm": 0.666418306068689, |
| "learning_rate": 5e-06, |
| "loss": 0.6899, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.9012115563839702, |
| "grad_norm": 0.5887624682915402, |
| "learning_rate": 5e-06, |
| "loss": 0.6839, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.9384902143522833, |
| "grad_norm": 0.6878912984211528, |
| "learning_rate": 5e-06, |
| "loss": 0.6822, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.9757688723205966, |
| "grad_norm": 0.6704088372022132, |
| "learning_rate": 5e-06, |
| "loss": 0.6865, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.9981360671015844, |
| "eval_loss": 0.7287164330482483, |
| "eval_runtime": 286.215, |
| "eval_samples_per_second": 25.247, |
| "eval_steps_per_second": 0.395, |
| "step": 536 |
| }, |
| { |
| "epoch": 2.0130475302889095, |
| "grad_norm": 0.7652802012798461, |
| "learning_rate": 5e-06, |
| "loss": 0.7188, |
| "step": 540 |
| }, |
| { |
| "epoch": 2.050326188257223, |
| "grad_norm": 0.6143634312705478, |
| "learning_rate": 5e-06, |
| "loss": 0.6374, |
| "step": 550 |
| }, |
| { |
| "epoch": 2.0876048462255357, |
| "grad_norm": 0.7815576376342297, |
| "learning_rate": 5e-06, |
| "loss": 0.627, |
| "step": 560 |
| }, |
| { |
| "epoch": 2.124883504193849, |
| "grad_norm": 0.6461505452236371, |
| "learning_rate": 5e-06, |
| "loss": 0.6339, |
| "step": 570 |
| }, |
| { |
| "epoch": 2.1621621621621623, |
| "grad_norm": 0.6479801052923935, |
| "learning_rate": 5e-06, |
| "loss": 0.6367, |
| "step": 580 |
| }, |
| { |
| "epoch": 2.199440820130475, |
| "grad_norm": 0.795040730406223, |
| "learning_rate": 5e-06, |
| "loss": 0.6361, |
| "step": 590 |
| }, |
| { |
| "epoch": 2.2367194780987885, |
| "grad_norm": 0.718949197084576, |
| "learning_rate": 5e-06, |
| "loss": 0.6361, |
| "step": 600 |
| }, |
| { |
| "epoch": 2.2739981360671018, |
| "grad_norm": 0.6166412328819074, |
| "learning_rate": 5e-06, |
| "loss": 0.6369, |
| "step": 610 |
| }, |
| { |
| "epoch": 2.3112767940354146, |
| "grad_norm": 0.6766175977068568, |
| "learning_rate": 5e-06, |
| "loss": 0.639, |
| "step": 620 |
| }, |
| { |
| "epoch": 2.348555452003728, |
| "grad_norm": 0.7034913016991963, |
| "learning_rate": 5e-06, |
| "loss": 0.6411, |
| "step": 630 |
| }, |
| { |
| "epoch": 2.385834109972041, |
| "grad_norm": 0.6509863784269144, |
| "learning_rate": 5e-06, |
| "loss": 0.6368, |
| "step": 640 |
| }, |
| { |
| "epoch": 2.423112767940354, |
| "grad_norm": 0.715734199522274, |
| "learning_rate": 5e-06, |
| "loss": 0.6351, |
| "step": 650 |
| }, |
| { |
| "epoch": 2.4603914259086674, |
| "grad_norm": 0.6552323242186081, |
| "learning_rate": 5e-06, |
| "loss": 0.634, |
| "step": 660 |
| }, |
| { |
| "epoch": 2.4976700838769803, |
| "grad_norm": 0.6632367791952104, |
| "learning_rate": 5e-06, |
| "loss": 0.6404, |
| "step": 670 |
| }, |
| { |
| "epoch": 2.5349487418452936, |
| "grad_norm": 0.7300194823086572, |
| "learning_rate": 5e-06, |
| "loss": 0.6421, |
| "step": 680 |
| }, |
| { |
| "epoch": 2.572227399813607, |
| "grad_norm": 0.5964985990739192, |
| "learning_rate": 5e-06, |
| "loss": 0.6369, |
| "step": 690 |
| }, |
| { |
| "epoch": 2.60950605778192, |
| "grad_norm": 0.8180239269430768, |
| "learning_rate": 5e-06, |
| "loss": 0.6376, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.646784715750233, |
| "grad_norm": 0.6860649388075701, |
| "learning_rate": 5e-06, |
| "loss": 0.6368, |
| "step": 710 |
| }, |
| { |
| "epoch": 2.684063373718546, |
| "grad_norm": 1.1051628648207943, |
| "learning_rate": 5e-06, |
| "loss": 0.6375, |
| "step": 720 |
| }, |
| { |
| "epoch": 2.7213420316868593, |
| "grad_norm": 0.6214860543682473, |
| "learning_rate": 5e-06, |
| "loss": 0.6398, |
| "step": 730 |
| }, |
| { |
| "epoch": 2.7586206896551726, |
| "grad_norm": 0.7826068718341928, |
| "learning_rate": 5e-06, |
| "loss": 0.6378, |
| "step": 740 |
| }, |
| { |
| "epoch": 2.7958993476234855, |
| "grad_norm": 0.801079615885771, |
| "learning_rate": 5e-06, |
| "loss": 0.6413, |
| "step": 750 |
| }, |
| { |
| "epoch": 2.8331780055917988, |
| "grad_norm": 0.5970154468549519, |
| "learning_rate": 5e-06, |
| "loss": 0.641, |
| "step": 760 |
| }, |
| { |
| "epoch": 2.8704566635601116, |
| "grad_norm": 0.7276565987601038, |
| "learning_rate": 5e-06, |
| "loss": 0.6388, |
| "step": 770 |
| }, |
| { |
| "epoch": 2.907735321528425, |
| "grad_norm": 0.690824403756736, |
| "learning_rate": 5e-06, |
| "loss": 0.6391, |
| "step": 780 |
| }, |
| { |
| "epoch": 2.9450139794967383, |
| "grad_norm": 0.761736653534516, |
| "learning_rate": 5e-06, |
| "loss": 0.6352, |
| "step": 790 |
| }, |
| { |
| "epoch": 2.982292637465051, |
| "grad_norm": 0.6454438184790852, |
| "learning_rate": 5e-06, |
| "loss": 0.6415, |
| "step": 800 |
| }, |
| { |
| "epoch": 2.9972041006523766, |
| "eval_loss": 0.7303594946861267, |
| "eval_runtime": 285.1186, |
| "eval_samples_per_second": 25.344, |
| "eval_steps_per_second": 0.396, |
| "step": 804 |
| }, |
| { |
| "epoch": 2.9972041006523766, |
| "step": 804, |
| "total_flos": 1346520565678080.0, |
| "train_loss": 0.7048911662837166, |
| "train_runtime": 47750.7809, |
| "train_samples_per_second": 8.626, |
| "train_steps_per_second": 0.017 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 804, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1346520565678080.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |