{
  "best_metric": 0.15780216455459595,
  "best_model_checkpoint": "./mistral/01-03-24-Weni-ZeroShot-3.3.18-Mistral-7b-Multilanguage-3.2.0_Zeroshot-2_max_steps-800_batch_256_2024-03-01_ppid_7/checkpoint-800",
  "epoch": 7.920792079207921,
  "eval_steps": 100,
  "global_step": 800,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.2,
      "grad_norm": 2.141737699508667,
      "learning_rate": 4.5e-05,
      "loss": 1.4271,
      "step": 20
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.741797924041748,
      "learning_rate": 9.5e-05,
      "loss": 0.5555,
      "step": 40
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.7615867853164673,
      "learning_rate": 0.00014250000000000002,
      "loss": 0.4788,
      "step": 60
    },
    {
      "epoch": 0.79,
      "grad_norm": null,
      "learning_rate": 0.0001775,
      "loss": 0.442,
      "step": 80
    },
    {
      "epoch": 0.99,
      "grad_norm": 134.0851593017578,
      "learning_rate": 0.0001999533590836713,
      "loss": 1.6564,
      "step": 100
    },
    {
      "epoch": 0.99,
      "eval_loss": 0.7399550676345825,
      "eval_runtime": 93.2428,
      "eval_samples_per_second": 30.748,
      "eval_steps_per_second": 0.965,
      "step": 100
    },
    {
      "epoch": 1.19,
      "grad_norm": 0.21864017844200134,
      "learning_rate": 0.00019930684569549264,
      "loss": 0.4734,
      "step": 120
    },
    {
      "epoch": 1.39,
      "grad_norm": 0.25633054971694946,
      "learning_rate": 0.0001979045472484584,
      "loss": 0.4004,
      "step": 140
    },
    {
      "epoch": 1.58,
      "grad_norm": 0.21116402745246887,
      "learning_rate": 0.00019575713608048145,
      "loss": 0.3924,
      "step": 160
    },
    {
      "epoch": 1.78,
      "grad_norm": 0.2366013079881668,
      "learning_rate": 0.00019288095528719243,
      "loss": 0.3735,
      "step": 180
    },
    {
      "epoch": 1.98,
      "grad_norm": 0.39251914620399475,
      "learning_rate": 0.00018929789434111373,
      "loss": 0.3627,
      "step": 200
    },
    {
      "epoch": 1.98,
      "eval_loss": 0.35110002756118774,
      "eval_runtime": 93.3064,
      "eval_samples_per_second": 30.727,
      "eval_steps_per_second": 0.965,
      "step": 200
    },
    {
      "epoch": 2.18,
      "grad_norm": 0.2366660237312317,
      "learning_rate": 0.0001850352224995563,
      "loss": 0.3503,
      "step": 220
    },
    {
      "epoch": 2.38,
      "grad_norm": 0.29862603545188904,
      "learning_rate": 0.00018012538126910608,
      "loss": 0.3338,
      "step": 240
    },
    {
      "epoch": 2.57,
      "grad_norm": 0.28093767166137695,
      "learning_rate": 0.00017460573750616994,
      "loss": 0.322,
      "step": 260
    },
    {
      "epoch": 2.77,
      "grad_norm": 0.2897046208381653,
      "learning_rate": 0.0001685182990326359,
      "loss": 0.3162,
      "step": 280
    },
    {
      "epoch": 2.97,
      "grad_norm": 0.2848748564720154,
      "learning_rate": 0.00016190939493098344,
      "loss": 0.298,
      "step": 300
    },
    {
      "epoch": 2.97,
      "eval_loss": 0.2971964478492737,
      "eval_runtime": 93.2405,
      "eval_samples_per_second": 30.748,
      "eval_steps_per_second": 0.965,
      "step": 300
    },
    {
      "epoch": 3.17,
      "grad_norm": 0.29757335782051086,
      "learning_rate": 0.0001548293229519914,
      "loss": 0.2821,
      "step": 320
    },
    {
      "epoch": 3.37,
      "grad_norm": 0.3013056218624115,
      "learning_rate": 0.00014733196671848435,
      "loss": 0.273,
      "step": 340
    },
    {
      "epoch": 3.56,
      "grad_norm": 0.3030945360660553,
      "learning_rate": 0.0001394743856384267,
      "loss": 0.2629,
      "step": 360
    },
    {
      "epoch": 3.76,
      "grad_norm": 0.3085506856441498,
      "learning_rate": 0.00013131638064837494,
      "loss": 0.2554,
      "step": 380
    },
    {
      "epoch": 3.96,
      "grad_norm": 0.31287622451782227,
      "learning_rate": 0.00012292003909224143,
      "loss": 0.2466,
      "step": 400
    },
    {
      "epoch": 3.96,
      "eval_loss": 0.24578672647476196,
      "eval_runtime": 93.3116,
      "eval_samples_per_second": 30.725,
      "eval_steps_per_second": 0.965,
      "step": 400
    },
    {
      "epoch": 4.16,
      "grad_norm": 0.32288551330566406,
      "learning_rate": 0.00011434926219911793,
      "loss": 0.2305,
      "step": 420
    },
    {
      "epoch": 4.36,
      "grad_norm": 0.35880547761917114,
      "learning_rate": 0.00010566927875633776,
      "loss": 0.2223,
      "step": 440
    },
    {
      "epoch": 4.55,
      "grad_norm": 0.32476606965065,
      "learning_rate": 9.694614867901775e-05,
      "loss": 0.2172,
      "step": 460
    },
    {
      "epoch": 4.75,
      "grad_norm": 0.34872496128082275,
      "learning_rate": 8.824626025421626e-05,
      "loss": 0.2088,
      "step": 480
    },
    {
      "epoch": 4.95,
      "grad_norm": 0.36370012164115906,
      "learning_rate": 7.963582488598227e-05,
      "loss": 0.2044,
      "step": 500
    },
    {
      "epoch": 4.95,
      "eval_loss": 0.20646865665912628,
      "eval_runtime": 93.2566,
      "eval_samples_per_second": 30.743,
      "eval_steps_per_second": 0.965,
      "step": 500
    },
    {
      "epoch": 5.15,
      "grad_norm": 0.34507113695144653,
      "learning_rate": 7.118037318659108e-05,
      "loss": 0.1908,
      "step": 520
    },
    {
      "epoch": 5.35,
      "grad_norm": 0.3758356273174286,
      "learning_rate": 6.294425624901638e-05,
      "loss": 0.1843,
      "step": 540
    },
    {
      "epoch": 5.54,
      "grad_norm": 0.35137322545051575,
      "learning_rate": 5.499015589625649e-05,
      "loss": 0.1775,
      "step": 560
    },
    {
      "epoch": 5.74,
      "grad_norm": 0.3667621910572052,
      "learning_rate": 4.7378607634813043e-05,
      "loss": 0.1727,
      "step": 580
    },
    {
      "epoch": 5.94,
      "grad_norm": 0.36913108825683594,
      "learning_rate": 4.01675399429341e-05,
      "loss": 0.1695,
      "step": 600
    },
    {
      "epoch": 5.94,
      "eval_loss": 0.1782875508069992,
      "eval_runtime": 93.2778,
      "eval_samples_per_second": 30.736,
      "eval_steps_per_second": 0.965,
      "step": 600
    },
    {
      "epoch": 6.14,
      "grad_norm": 0.3702784776687622,
      "learning_rate": 3.341183339991658e-05,
      "loss": 0.1619,
      "step": 620
    },
    {
      "epoch": 6.34,
      "grad_norm": 0.36967843770980835,
      "learning_rate": 2.7162903011759987e-05,
      "loss": 0.155,
      "step": 640
    },
    {
      "epoch": 6.53,
      "grad_norm": 0.40270093083381653,
      "learning_rate": 2.146830691192553e-05,
      "loss": 0.1554,
      "step": 660
    },
    {
      "epoch": 6.73,
      "grad_norm": 0.3911834955215454,
      "learning_rate": 1.6371384415224045e-05,
      "loss": 0.152,
      "step": 680
    },
    {
      "epoch": 6.93,
      "grad_norm": 0.36315953731536865,
      "learning_rate": 1.1910926179461446e-05,
      "loss": 0.1491,
      "step": 700
    },
    {
      "epoch": 6.93,
      "eval_loss": 0.16189825534820557,
      "eval_runtime": 93.2524,
      "eval_samples_per_second": 30.745,
      "eval_steps_per_second": 0.965,
      "step": 700
    },
    {
      "epoch": 7.13,
      "grad_norm": 0.367480993270874,
      "learning_rate": 8.12087898511018e-06,
      "loss": 0.1443,
      "step": 720
    },
    {
      "epoch": 7.33,
      "grad_norm": 0.3658962547779083,
      "learning_rate": 5.030087379812298e-06,
      "loss": 0.142,
      "step": 740
    },
    {
      "epoch": 7.52,
      "grad_norm": 0.3797013461589813,
      "learning_rate": 2.662074153955152e-06,
      "loss": 0.1433,
      "step": 760
    },
    {
      "epoch": 7.72,
      "grad_norm": 0.36631888151168823,
      "learning_rate": 1.0348613180329757e-06,
      "loss": 0.1422,
      "step": 780
    },
    {
      "epoch": 7.92,
      "grad_norm": 0.3723652958869934,
      "learning_rate": 1.608329442651213e-07,
      "loss": 0.1426,
      "step": 800
    },
    {
      "epoch": 7.92,
      "eval_loss": 0.15780216455459595,
      "eval_runtime": 93.294,
      "eval_samples_per_second": 30.731,
      "eval_steps_per_second": 0.965,
      "step": 800
    }
  ],
  "logging_steps": 20,
  "max_steps": 800,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 8,
  "save_steps": 100,
  "total_flos": 7.274143615381668e+18,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}