{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.1, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "grad_norm": 0.9758233428001404, "learning_rate": 1.8e-06, "loss": 1.1603, "step": 10 }, { "grad_norm": 0.42593392729759216, "learning_rate": 3.8e-06, "loss": 1.1479, "step": 20 }, { "grad_norm": 0.21856296062469482, "learning_rate": 5.8e-06, "loss": 1.1204, "step": 30 }, { "grad_norm": 0.1426735520362854, "learning_rate": 7.8e-06, "loss": 1.1026, "step": 40 }, { "grad_norm": 0.13335095345973969, "learning_rate": 9.800000000000001e-06, "loss": 1.0891, "step": 50 }, { "grad_norm": 0.11337289214134216, "learning_rate": 1.18e-05, "loss": 1.0658, "step": 60 }, { "grad_norm": 0.09717453271150589, "learning_rate": 1.3800000000000002e-05, "loss": 1.0543, "step": 70 }, { "grad_norm": 0.10322871059179306, "learning_rate": 1.58e-05, "loss": 1.0375, "step": 80 }, { "grad_norm": 0.17508411407470703, "learning_rate": 1.78e-05, "loss": 1.0261, "step": 90 }, { "grad_norm": 0.29097145795822144, "learning_rate": 1.9800000000000004e-05, "loss": 1.0143, "step": 100 }, { "grad_norm": 0.28556156158447266, "learning_rate": 2.18e-05, "loss": 0.9888, "step": 110 }, { "grad_norm": 0.7807521224021912, "learning_rate": 2.38e-05, "loss": 0.943, "step": 120 }, { "grad_norm": 0.6720882654190063, "learning_rate": 2.58e-05, "loss": 0.8726, "step": 130 }, { "grad_norm": 0.867290198802948, "learning_rate": 2.7800000000000005e-05, "loss": 0.7941, "step": 140 }, { "grad_norm": 0.7180277705192566, "learning_rate": 2.98e-05, "loss": 0.731, "step": 150 }, { "grad_norm": 0.6411461234092712, "learning_rate": 3.18e-05, "loss": 0.6729, "step": 160 }, { "grad_norm": 0.8856205344200134, "learning_rate": 3.38e-05, "loss": 0.6213, "step": 170 }, { "grad_norm": 1.1326690912246704, "learning_rate": 3.58e-05, "loss": 0.5673, "step": 180 }, { "grad_norm": 0.8645981550216675, "learning_rate": 3.7800000000000004e-05, "loss": 0.5007, "step": 190 }, { "grad_norm": 1.4590177536010742, "learning_rate": 3.9800000000000005e-05, "loss": 0.4514, "step": 200 }, { "grad_norm": 1.0651799440383911, "learning_rate": 4.18e-05, "loss": 0.4098, "step": 210 }, { "grad_norm": 0.9970216155052185, "learning_rate": 4.38e-05, "loss": 0.3782, "step": 220 }, { "grad_norm": 1.4383465051651, "learning_rate": 4.58e-05, "loss": 0.3505, "step": 230 }, { "grad_norm": 1.0782465934753418, "learning_rate": 4.78e-05, "loss": 0.3158, "step": 240 }, { "grad_norm": 1.101946234703064, "learning_rate": 4.9800000000000004e-05, "loss": 0.2916, "step": 250 }, { "grad_norm": 1.587511420249939, "learning_rate": 5.1800000000000005e-05, "loss": 0.261, "step": 260 }, { "grad_norm": 1.435607671737671, "learning_rate": 5.380000000000001e-05, "loss": 0.2437, "step": 270 }, { "grad_norm": 0.9712788462638855, "learning_rate": 5.580000000000001e-05, "loss": 0.2171, "step": 280 }, { "grad_norm": 1.7632523775100708, "learning_rate": 5.7799999999999995e-05, "loss": 0.2055, "step": 290 }, { "grad_norm": 1.7171282768249512, "learning_rate": 5.9800000000000003e-05, "loss": 0.1883, "step": 300 }, { "grad_norm": 1.374088168144226, "learning_rate": 6.18e-05, "loss": 0.1745, "step": 310 }, { "grad_norm": 1.4401962757110596, "learning_rate": 6.38e-05, "loss": 0.1576, "step": 320 }, { "grad_norm": 1.3337668180465698, "learning_rate": 6.58e-05, "loss": 0.1363, "step": 330 }, { "grad_norm": 0.9742352366447449, "learning_rate": 6.780000000000001e-05, "loss": 0.1193, "step": 340 }, { "grad_norm": 1.369019865989685, "learning_rate": 6.98e-05, "loss": 0.1097, "step": 350 }, { "grad_norm": 1.2735106945037842, "learning_rate": 7.18e-05, "loss": 0.0989, "step": 360 }, { "grad_norm": 1.1764349937438965, "learning_rate": 7.38e-05, "loss": 0.0924, "step": 370 }, { "grad_norm": 1.46502685546875, "learning_rate": 7.58e-05, "loss": 0.0892, "step": 380 }, { "grad_norm": 1.2585327625274658, "learning_rate": 7.780000000000001e-05, "loss": 0.0867, "step": 390 }, { "grad_norm": 1.3663945198059082, "learning_rate": 7.98e-05, "loss": 0.0823, "step": 400 }, { "grad_norm": 1.3091000318527222, "learning_rate": 8.18e-05, "loss": 0.0749, "step": 410 }, { "grad_norm": 1.0578632354736328, "learning_rate": 8.38e-05, "loss": 0.0772, "step": 420 }, { "grad_norm": 1.354399561882019, "learning_rate": 8.58e-05, "loss": 0.0747, "step": 430 }, { "grad_norm": 0.9889944195747375, "learning_rate": 8.78e-05, "loss": 0.0741, "step": 440 }, { "grad_norm": 1.1401726007461548, "learning_rate": 8.98e-05, "loss": 0.0722, "step": 450 }, { "grad_norm": 0.9166666269302368, "learning_rate": 9.180000000000001e-05, "loss": 0.0656, "step": 460 }, { "grad_norm": 1.3585307598114014, "learning_rate": 9.38e-05, "loss": 0.0714, "step": 470 }, { "grad_norm": 1.211781620979309, "learning_rate": 9.58e-05, "loss": 0.0692, "step": 480 }, { "grad_norm": 1.2247947454452515, "learning_rate": 9.78e-05, "loss": 0.0648, "step": 490 }, { "grad_norm": 0.8823159337043762, "learning_rate": 9.98e-05, "loss": 0.0669, "step": 500 }, { "grad_norm": 1.1154427528381348, "learning_rate": 9.9999778549206e-05, "loss": 0.0639, "step": 510 }, { "grad_norm": 0.8653926253318787, "learning_rate": 9.999901304280685e-05, "loss": 0.0567, "step": 520 }, { "grad_norm": 0.831631064414978, "learning_rate": 9.999770075521164e-05, "loss": 0.0538, "step": 530 }, { "grad_norm": 0.8719817399978638, "learning_rate": 9.99958417007713e-05, "loss": 0.0591, "step": 540 }, { "grad_norm": 0.9873416423797607, "learning_rate": 9.999343589981615e-05, "loss": 0.0542, "step": 550 }, { "grad_norm": 0.9876612424850464, "learning_rate": 9.999048337865568e-05, "loss": 0.0522, "step": 560 }, { "grad_norm": 1.1858694553375244, "learning_rate": 9.998698416957815e-05, "loss": 0.0558, "step": 570 }, { "grad_norm": 0.646891176700592, "learning_rate": 9.998293831085037e-05, "loss": 0.0517, "step": 580 }, { "grad_norm": 0.6390032768249512, "learning_rate": 9.997834584671719e-05, "loss": 0.0537, "step": 590 }, { "grad_norm": 0.8343250751495361, "learning_rate": 9.997320682740107e-05, "loss": 0.0517, "step": 600 }, { "grad_norm": 0.7489078640937805, "learning_rate": 9.996752130910149e-05, "loss": 0.0489, "step": 610 }, { "grad_norm": 0.8712443113327026, "learning_rate": 9.99612893539944e-05, "loss": 0.0501, "step": 620 }, { "grad_norm": 0.9857075810432434, "learning_rate": 9.995451103023144e-05, "loss": 0.0477, "step": 630 }, { "grad_norm": 0.7690502405166626, "learning_rate": 9.994718641193928e-05, "loss": 0.0493, "step": 640 }, { "grad_norm": 1.0277982950210571, "learning_rate": 9.993931557921874e-05, "loss": 0.0481, "step": 650 }, { "grad_norm": 0.7521364688873291, "learning_rate": 9.993089861814402e-05, "loss": 0.0445, "step": 660 }, { "grad_norm": 0.7378376722335815, "learning_rate": 9.992193562076166e-05, "loss": 0.0519, "step": 670 }, { "grad_norm": 0.848283052444458, "learning_rate": 9.991242668508954e-05, "loss": 0.0468, "step": 680 }, { "grad_norm": 0.8540059328079224, "learning_rate": 9.990237191511587e-05, "loss": 0.0438, "step": 690 }, { "grad_norm": 0.8540256023406982, "learning_rate": 9.989177142079802e-05, "loss": 0.0436, "step": 700 }, { "grad_norm": 0.8738433718681335, "learning_rate": 9.988062531806126e-05, "loss": 0.0423, "step": 710 }, { "grad_norm": 0.7137842178344727, "learning_rate": 9.986893372879762e-05, "loss": 0.0423, "step": 720 }, { "grad_norm": 0.8221761584281921, "learning_rate": 9.985669678086443e-05, "loss": 0.0454, "step": 730 }, { "grad_norm": 0.7644378542900085, "learning_rate": 9.984391460808298e-05, "loss": 0.043, "step": 740 }, { "grad_norm": 0.9529057145118713, "learning_rate": 9.983058735023709e-05, "loss": 0.0412, "step": 750 }, { "grad_norm": 0.5925287008285522, "learning_rate": 9.98167151530715e-05, "loss": 0.0412, "step": 760 }, { "grad_norm": 0.9713786840438843, "learning_rate": 9.980229816829034e-05, "loss": 0.0438, "step": 770 }, { "grad_norm": 0.8202999830245972, "learning_rate": 9.978733655355544e-05, "loss": 0.0426, "step": 780 }, { "grad_norm": 0.8735260367393494, "learning_rate": 9.977183047248464e-05, "loss": 0.0417, "step": 790 }, { "grad_norm": 0.780419647693634, "learning_rate": 9.975578009464992e-05, "loss": 0.0442, "step": 800 }, { "grad_norm": 0.6911860108375549, "learning_rate": 9.97391855955757e-05, "loss": 0.0387, "step": 810 }, { "grad_norm": 0.7028207182884216, "learning_rate": 9.972204715673669e-05, "loss": 0.0406, "step": 820 }, { "grad_norm": 0.6863601207733154, "learning_rate": 9.970436496555617e-05, "loss": 0.0365, "step": 830 }, { "grad_norm": 0.8116464614868164, "learning_rate": 9.968613921540373e-05, "loss": 0.0403, "step": 840 }, { "grad_norm": 0.5927073359489441, "learning_rate": 9.966737010559326e-05, "loss": 0.0457, "step": 850 }, { "grad_norm": 0.7573205828666687, "learning_rate": 9.964805784138072e-05, "loss": 0.0386, "step": 860 }, { "grad_norm": 0.777636706829071, "learning_rate": 9.962820263396195e-05, "loss": 0.0377, "step": 870 }, { "grad_norm": 0.6754245758056641, "learning_rate": 9.960780470047033e-05, "loss": 0.0402, "step": 880 }, { "grad_norm": 0.7781431674957275, "learning_rate": 9.958686426397437e-05, "loss": 0.0392, "step": 890 }, { "grad_norm": 0.7408170104026794, "learning_rate": 9.956538155347534e-05, "loss": 0.0397, "step": 900 }, { "grad_norm": 0.7765600085258484, "learning_rate": 9.95433568039047e-05, "loss": 0.0368, "step": 910 }, { "grad_norm": 0.7626947164535522, "learning_rate": 9.952079025612162e-05, "loss": 0.0375, "step": 920 }, { "grad_norm": 0.7383814454078674, "learning_rate": 9.949768215691022e-05, "loss": 0.0396, "step": 930 }, { "grad_norm": 0.5346885919570923, "learning_rate": 9.9474032758977e-05, "loss": 0.038, "step": 940 }, { "grad_norm": 0.649907648563385, "learning_rate": 9.944984232094794e-05, "loss": 0.0377, "step": 950 }, { "grad_norm": 0.7877050042152405, "learning_rate": 9.942511110736584e-05, "loss": 0.0359, "step": 960 }, { "grad_norm": 0.7351949214935303, "learning_rate": 9.939983938868726e-05, "loss": 0.0372, "step": 970 }, { "grad_norm": 1.1521971225738525, "learning_rate": 9.93740274412797e-05, "loss": 0.0361, "step": 980 }, { "grad_norm": 0.817077100276947, "learning_rate": 9.934767554741846e-05, "loss": 0.0377, "step": 990 }, { "grad_norm": 0.5828076601028442, "learning_rate": 9.932078399528361e-05, "loss": 0.0365, "step": 1000 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 128, "trial_name": null, "trial_params": null }