{ "best_metric": 1.6572628021240234, "best_model_checkpoint": "./modele-socratique-sft\\checkpoint-171", "epoch": 6.95906432748538, "eval_steps": 500, "global_step": 595, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11695906432748537, "grad_norm": 4.373384475708008, "learning_rate": 3.3333333333333335e-05, "loss": 5.2688, "step": 10 }, { "epoch": 0.23391812865497075, "grad_norm": 7.579962253570557, "learning_rate": 6.666666666666667e-05, "loss": 4.2978, "step": 20 }, { "epoch": 0.3508771929824561, "grad_norm": 3.2114369869232178, "learning_rate": 0.0001, "loss": 2.864, "step": 30 }, { "epoch": 0.4678362573099415, "grad_norm": 2.231703758239746, "learning_rate": 0.00013333333333333334, "loss": 2.2745, "step": 40 }, { "epoch": 0.5847953216374269, "grad_norm": 1.8353674411773682, "learning_rate": 0.0001666666666666667, "loss": 2.0056, "step": 50 }, { "epoch": 0.7017543859649122, "grad_norm": 3.654597282409668, "learning_rate": 0.0002, "loss": 1.9087, "step": 60 }, { "epoch": 0.8187134502923976, "grad_norm": 1.4905436038970947, "learning_rate": 0.00019626168224299065, "loss": 1.751, "step": 70 }, { "epoch": 0.935672514619883, "grad_norm": 1.4564584493637085, "learning_rate": 0.00019252336448598133, "loss": 1.7465, "step": 80 }, { "epoch": 0.9941520467836257, "eval_loss": 1.743960976600647, "eval_runtime": 1.1773, "eval_samples_per_second": 64.554, "eval_steps_per_second": 8.494, "step": 85 }, { "epoch": 1.0526315789473684, "grad_norm": 1.3438069820404053, "learning_rate": 0.00018878504672897197, "loss": 1.6509, "step": 90 }, { "epoch": 1.1695906432748537, "grad_norm": 1.4461097717285156, "learning_rate": 0.00018504672897196262, "loss": 1.5477, "step": 100 }, { "epoch": 1.286549707602339, "grad_norm": 1.7722139358520508, "learning_rate": 0.0001813084112149533, "loss": 1.6173, "step": 110 }, { "epoch": 1.4035087719298245, "grad_norm": 1.3352079391479492, "learning_rate": 0.00017757009345794393, "loss": 1.6119, "step": 120 }, { "epoch": 1.52046783625731, "grad_norm": 1.5055865049362183, "learning_rate": 0.00017383177570093458, "loss": 1.5867, "step": 130 }, { "epoch": 1.6374269005847952, "grad_norm": 1.6256341934204102, "learning_rate": 0.00017009345794392523, "loss": 1.5639, "step": 140 }, { "epoch": 1.7543859649122808, "grad_norm": 1.4370089769363403, "learning_rate": 0.0001663551401869159, "loss": 1.526, "step": 150 }, { "epoch": 1.871345029239766, "grad_norm": 1.7688274383544922, "learning_rate": 0.00016261682242990654, "loss": 1.5438, "step": 160 }, { "epoch": 1.9883040935672516, "grad_norm": 1.3985792398452759, "learning_rate": 0.0001588785046728972, "loss": 1.5231, "step": 170 }, { "epoch": 2.0, "eval_loss": 1.6572628021240234, "eval_runtime": 1.1323, "eval_samples_per_second": 67.121, "eval_steps_per_second": 8.832, "step": 171 }, { "epoch": 2.1052631578947367, "grad_norm": 1.5881729125976562, "learning_rate": 0.00015514018691588786, "loss": 1.3743, "step": 180 }, { "epoch": 2.2222222222222223, "grad_norm": 1.75332510471344, "learning_rate": 0.0001514018691588785, "loss": 1.3126, "step": 190 }, { "epoch": 2.3391812865497075, "grad_norm": 1.8260974884033203, "learning_rate": 0.00014766355140186915, "loss": 1.3309, "step": 200 }, { "epoch": 2.456140350877193, "grad_norm": 1.6168378591537476, "learning_rate": 0.00014392523364485982, "loss": 1.3181, "step": 210 }, { "epoch": 2.573099415204678, "grad_norm": 1.8509645462036133, "learning_rate": 0.00014018691588785047, "loss": 1.3092, "step": 220 }, { "epoch": 2.690058479532164, "grad_norm": 1.8677273988723755, "learning_rate": 0.0001364485981308411, "loss": 1.3226, "step": 230 }, { "epoch": 2.807017543859649, "grad_norm": 1.8391717672348022, "learning_rate": 0.00013271028037383179, "loss": 1.3218, "step": 240 }, { "epoch": 2.9239766081871346, "grad_norm": 1.6857693195343018, "learning_rate": 0.00012897196261682243, "loss": 1.2798, "step": 250 }, { "epoch": 2.9941520467836256, "eval_loss": 1.668265461921692, "eval_runtime": 1.1808, "eval_samples_per_second": 64.361, "eval_steps_per_second": 8.469, "step": 256 }, { "epoch": 3.0409356725146197, "grad_norm": 1.7038238048553467, "learning_rate": 0.00012523364485981308, "loss": 1.2419, "step": 260 }, { "epoch": 3.1578947368421053, "grad_norm": 2.1333186626434326, "learning_rate": 0.00012149532710280373, "loss": 1.1094, "step": 270 }, { "epoch": 3.2748538011695905, "grad_norm": 2.1007728576660156, "learning_rate": 0.00011775700934579439, "loss": 1.0783, "step": 280 }, { "epoch": 3.391812865497076, "grad_norm": 2.2706727981567383, "learning_rate": 0.00011401869158878504, "loss": 1.0856, "step": 290 }, { "epoch": 3.5087719298245617, "grad_norm": 2.1818716526031494, "learning_rate": 0.0001102803738317757, "loss": 1.1019, "step": 300 }, { "epoch": 3.625730994152047, "grad_norm": 2.2601561546325684, "learning_rate": 0.00010654205607476636, "loss": 1.1007, "step": 310 }, { "epoch": 3.742690058479532, "grad_norm": 2.215036153793335, "learning_rate": 0.000102803738317757, "loss": 1.0918, "step": 320 }, { "epoch": 3.8596491228070176, "grad_norm": 2.1949431896209717, "learning_rate": 9.906542056074767e-05, "loss": 1.0702, "step": 330 }, { "epoch": 3.976608187134503, "grad_norm": 2.5513603687286377, "learning_rate": 9.532710280373832e-05, "loss": 1.0959, "step": 340 }, { "epoch": 4.0, "eval_loss": 1.7493380308151245, "eval_runtime": 1.2179, "eval_samples_per_second": 62.404, "eval_steps_per_second": 8.211, "step": 342 }, { "epoch": 4.093567251461988, "grad_norm": 2.321434497833252, "learning_rate": 9.158878504672898e-05, "loss": 0.9406, "step": 350 }, { "epoch": 4.2105263157894735, "grad_norm": 2.5859365463256836, "learning_rate": 8.785046728971964e-05, "loss": 0.8669, "step": 360 }, { "epoch": 4.3274853801169595, "grad_norm": 2.6224827766418457, "learning_rate": 8.411214953271028e-05, "loss": 0.871, "step": 370 }, { "epoch": 4.444444444444445, "grad_norm": 2.5626139640808105, "learning_rate": 8.037383177570094e-05, "loss": 0.9072, "step": 380 }, { "epoch": 4.56140350877193, "grad_norm": 2.6079370975494385, "learning_rate": 7.663551401869158e-05, "loss": 0.9142, "step": 390 }, { "epoch": 4.678362573099415, "grad_norm": 2.648815631866455, "learning_rate": 7.289719626168224e-05, "loss": 0.8995, "step": 400 }, { "epoch": 4.7953216374269, "grad_norm": 2.6691548824310303, "learning_rate": 6.91588785046729e-05, "loss": 0.884, "step": 410 }, { "epoch": 4.912280701754386, "grad_norm": 2.8678014278411865, "learning_rate": 6.542056074766355e-05, "loss": 0.8708, "step": 420 }, { "epoch": 4.994152046783626, "eval_loss": 1.87999427318573, "eval_runtime": 1.2454, "eval_samples_per_second": 61.025, "eval_steps_per_second": 8.03, "step": 427 }, { "epoch": 5.029239766081871, "grad_norm": 2.3341574668884277, "learning_rate": 6.16822429906542e-05, "loss": 0.8429, "step": 430 }, { "epoch": 5.146198830409356, "grad_norm": 2.833981513977051, "learning_rate": 5.794392523364486e-05, "loss": 0.717, "step": 440 }, { "epoch": 5.2631578947368425, "grad_norm": 2.6604554653167725, "learning_rate": 5.420560747663551e-05, "loss": 0.7133, "step": 450 }, { "epoch": 5.380116959064328, "grad_norm": 2.6542723178863525, "learning_rate": 5.046728971962617e-05, "loss": 0.7169, "step": 460 }, { "epoch": 5.497076023391813, "grad_norm": 2.9539148807525635, "learning_rate": 4.672897196261683e-05, "loss": 0.7134, "step": 470 }, { "epoch": 5.614035087719298, "grad_norm": 3.140651226043701, "learning_rate": 4.299065420560748e-05, "loss": 0.7021, "step": 480 }, { "epoch": 5.730994152046784, "grad_norm": 3.199179172515869, "learning_rate": 3.925233644859813e-05, "loss": 0.7289, "step": 490 }, { "epoch": 5.847953216374269, "grad_norm": 2.914994478225708, "learning_rate": 3.551401869158878e-05, "loss": 0.7574, "step": 500 }, { "epoch": 5.964912280701754, "grad_norm": 2.9280600547790527, "learning_rate": 3.177570093457944e-05, "loss": 0.727, "step": 510 }, { "epoch": 6.0, "eval_loss": 2.0164217948913574, "eval_runtime": 1.1308, "eval_samples_per_second": 67.209, "eval_steps_per_second": 8.843, "step": 513 }, { "epoch": 6.081871345029239, "grad_norm": 2.5345840454101562, "learning_rate": 2.8037383177570094e-05, "loss": 0.6034, "step": 520 }, { "epoch": 6.1988304093567255, "grad_norm": 2.790266990661621, "learning_rate": 2.429906542056075e-05, "loss": 0.5992, "step": 530 }, { "epoch": 6.315789473684211, "grad_norm": 2.85329532623291, "learning_rate": 2.05607476635514e-05, "loss": 0.605, "step": 540 }, { "epoch": 6.432748538011696, "grad_norm": 2.943007230758667, "learning_rate": 1.6822429906542056e-05, "loss": 0.6168, "step": 550 }, { "epoch": 6.549707602339181, "grad_norm": 2.8062615394592285, "learning_rate": 1.308411214953271e-05, "loss": 0.5825, "step": 560 }, { "epoch": 6.666666666666667, "grad_norm": 2.97013258934021, "learning_rate": 9.345794392523365e-06, "loss": 0.6344, "step": 570 }, { "epoch": 6.783625730994152, "grad_norm": 2.7482662200927734, "learning_rate": 5.607476635514019e-06, "loss": 0.6138, "step": 580 }, { "epoch": 6.900584795321637, "grad_norm": 3.1583168506622314, "learning_rate": 1.8691588785046728e-06, "loss": 0.6043, "step": 590 }, { "epoch": 6.95906432748538, "eval_loss": 2.1435160636901855, "eval_runtime": 1.2421, "eval_samples_per_second": 61.187, "eval_steps_per_second": 8.051, "step": 595 } ], "logging_steps": 10, "max_steps": 595, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3262279482728448.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }