{ "best_global_step": 4000, "best_metric": 0.7102092504501343, "best_model_checkpoint": "/Data/data_chess/checkpoint-4000", "epoch": 0.2694691457828079, "eval_steps": 500, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006736728644570197, "grad_norm": 0.7572135925292969, "learning_rate": 0.0006644295302013424, "loss": 2.9711, "step": 100 }, { "epoch": 0.013473457289140393, "grad_norm": 0.6671218276023865, "learning_rate": 0.0009999714348540981, "loss": 1.607, "step": 200 }, { "epoch": 0.02021018593371059, "grad_norm": 0.5963013172149658, "learning_rate": 0.0009997429332697315, "loss": 1.2186, "step": 300 }, { "epoch": 0.026946914578280787, "grad_norm": 0.46841374039649963, "learning_rate": 0.0009992860345329128, "loss": 1.0664, "step": 400 }, { "epoch": 0.03368364322285099, "grad_norm": 0.44179075956344604, "learning_rate": 0.0009986009474597423, "loss": 1.0087, "step": 500 }, { "epoch": 0.03368364322285099, "eval_loss": 0.919044017791748, "eval_runtime": 4.0503, "eval_samples_per_second": 1234.464, "eval_steps_per_second": 19.505, "step": 500 }, { "epoch": 0.04042037186742118, "grad_norm": 0.396930456161499, "learning_rate": 0.0009976879851550707, "loss": 0.9759, "step": 600 }, { "epoch": 0.04715710051199138, "grad_norm": 0.39025160670280457, "learning_rate": 0.0009965475648694023, "loss": 0.9506, "step": 700 }, { "epoch": 0.05389382915656157, "grad_norm": 0.3577525317668915, "learning_rate": 0.0009951802078081975, "loss": 0.9323, "step": 800 }, { "epoch": 0.060630557801131774, "grad_norm": 0.3586902618408203, "learning_rate": 0.0009935865388936683, "loss": 0.9148, "step": 900 }, { "epoch": 0.06736728644570197, "grad_norm": 0.3558438718318939, "learning_rate": 0.0009917672864791695, "loss": 0.8993, "step": 1000 }, { "epoch": 0.06736728644570197, "eval_loss": 0.8303639888763428, "eval_runtime": 4.0434, "eval_samples_per_second": 1236.578, "eval_steps_per_second": 19.538, "step": 1000 }, { "epoch": 0.07410401509027216, "grad_norm": 0.33415552973747253, "learning_rate": 0.0009897232820163203, "loss": 0.887, "step": 1100 }, { "epoch": 0.08084074373484236, "grad_norm": 0.3328455686569214, "learning_rate": 0.0009874554596750068, "loss": 0.8764, "step": 1200 }, { "epoch": 0.08757747237941256, "grad_norm": 0.34494221210479736, "learning_rate": 0.000984964855916438, "loss": 0.8687, "step": 1300 }, { "epoch": 0.09431420102398276, "grad_norm": 0.3205359876155853, "learning_rate": 0.0009822526090194543, "loss": 0.8616, "step": 1400 }, { "epoch": 0.10105092966855295, "grad_norm": 0.31778866052627563, "learning_rate": 0.0009793199585602988, "loss": 0.8531, "step": 1500 }, { "epoch": 0.10105092966855295, "eval_loss": 0.7877992391586304, "eval_runtime": 4.0459, "eval_samples_per_second": 1235.828, "eval_steps_per_second": 19.526, "step": 1500 }, { "epoch": 0.10778765831312315, "grad_norm": 0.2896178066730499, "learning_rate": 0.0009761682448460969, "loss": 0.845, "step": 1600 }, { "epoch": 0.11452438695769335, "grad_norm": 0.30994096398353577, "learning_rate": 0.0009727989083022944, "loss": 0.8422, "step": 1700 }, { "epoch": 0.12126111560226355, "grad_norm": 0.2902910113334656, "learning_rate": 0.0009692134888143424, "loss": 0.8343, "step": 1800 }, { "epoch": 0.12799784424683375, "grad_norm": 0.30568113923072815, "learning_rate": 0.0009654136250239245, "loss": 0.8302, "step": 1900 }, { "epoch": 0.13473457289140395, "grad_norm": 0.2924252152442932, "learning_rate": 0.0009614010535800489, "loss": 0.8237, "step": 2000 }, { "epoch": 0.13473457289140395, "eval_loss": 0.7636091113090515, "eval_runtime": 4.0523, "eval_samples_per_second": 1233.862, "eval_steps_per_second": 19.495, "step": 2000 }, { "epoch": 0.14147130153597412, "grad_norm": 0.2940412163734436, "learning_rate": 0.0009571776083453492, "loss": 0.8196, "step": 2100 }, { "epoch": 0.14820803018054432, "grad_norm": 0.2964770495891571, "learning_rate": 0.0009527452195579558, "loss": 0.8155, "step": 2200 }, { "epoch": 0.15494475882511452, "grad_norm": 0.29473844170570374, "learning_rate": 0.0009481059129493202, "loss": 0.8126, "step": 2300 }, { "epoch": 0.16168148746968472, "grad_norm": 0.26849669218063354, "learning_rate": 0.0009432618088183964, "loss": 0.81, "step": 2400 }, { "epoch": 0.16841821611425492, "grad_norm": 0.30280986428260803, "learning_rate": 0.0009382151210626026, "loss": 0.8071, "step": 2500 }, { "epoch": 0.16841821611425492, "eval_loss": 0.7478171586990356, "eval_runtime": 4.0503, "eval_samples_per_second": 1234.482, "eval_steps_per_second": 19.505, "step": 2500 }, { "epoch": 0.17515494475882512, "grad_norm": 0.31329289078712463, "learning_rate": 0.0009329681561660051, "loss": 0.8021, "step": 2600 }, { "epoch": 0.18189167340339532, "grad_norm": 0.2696886360645294, "learning_rate": 0.0009275233121451872, "loss": 0.8008, "step": 2700 }, { "epoch": 0.18862840204796552, "grad_norm": 0.26912084221839905, "learning_rate": 0.0009218830774532855, "loss": 0.7973, "step": 2800 }, { "epoch": 0.1953651306925357, "grad_norm": 0.27081477642059326, "learning_rate": 0.0009160500298426945, "loss": 0.7935, "step": 2900 }, { "epoch": 0.2021018593371059, "grad_norm": 0.27368906140327454, "learning_rate": 0.0009100268351869579, "loss": 0.7908, "step": 3000 }, { "epoch": 0.2021018593371059, "eval_loss": 0.7335129976272583, "eval_runtime": 4.0548, "eval_samples_per_second": 1233.113, "eval_steps_per_second": 19.483, "step": 3000 }, { "epoch": 0.2088385879816761, "grad_norm": 0.32002657651901245, "learning_rate": 0.0009038162462623858, "loss": 0.787, "step": 3100 }, { "epoch": 0.2155753166262463, "grad_norm": 0.25296279788017273, "learning_rate": 0.0008974211014899564, "loss": 0.7873, "step": 3200 }, { "epoch": 0.2223120452708165, "grad_norm": 0.27432960271835327, "learning_rate": 0.0008908443236380743, "loss": 0.7827, "step": 3300 }, { "epoch": 0.2290487739153867, "grad_norm": 0.2552671730518341, "learning_rate": 0.0008840889184867782, "loss": 0.7793, "step": 3400 }, { "epoch": 0.2357855025599569, "grad_norm": 0.27509957551956177, "learning_rate": 0.0008771579734540138, "loss": 0.7781, "step": 3500 }, { "epoch": 0.2357855025599569, "eval_loss": 0.7217926383018494, "eval_runtime": 4.0438, "eval_samples_per_second": 1236.453, "eval_steps_per_second": 19.536, "step": 3500 }, { "epoch": 0.2425222312045271, "grad_norm": 0.25811567902565, "learning_rate": 0.0008700546561845919, "loss": 0.7764, "step": 3600 }, { "epoch": 0.24925895984909727, "grad_norm": 0.26611506938934326, "learning_rate": 0.000862782213102482, "loss": 0.7744, "step": 3700 }, { "epoch": 0.2559956884936675, "grad_norm": 0.2390570193529129, "learning_rate": 0.0008553439679271025, "loss": 0.7717, "step": 3800 }, { "epoch": 0.2627324171382377, "grad_norm": 0.26289165019989014, "learning_rate": 0.0008477433201542824, "loss": 0.769, "step": 3900 }, { "epoch": 0.2694691457828079, "grad_norm": 0.2603146731853485, "learning_rate": 0.0008399837435025926, "loss": 0.7658, "step": 4000 }, { "epoch": 0.2694691457828079, "eval_loss": 0.7102092504501343, "eval_runtime": 4.0458, "eval_samples_per_second": 1235.853, "eval_steps_per_second": 19.526, "step": 4000 } ], "logging_steps": 100, "max_steps": 14844, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 493989888000000.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }