{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 50.0, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.5, "grad_norm": 1.8985621929168701, "learning_rate": 9.910000000000001e-05, "loss": 0.2229, "step": 10 }, { "epoch": 1.0, "grad_norm": 2.6959447860717773, "learning_rate": 9.81e-05, "loss": 0.0395, "step": 20 }, { "epoch": 1.5, "grad_norm": 6.28496789932251, "learning_rate": 9.71e-05, "loss": 0.0261, "step": 30 }, { "epoch": 2.0, "grad_norm": 0.8198316097259521, "learning_rate": 9.61e-05, "loss": 0.0346, "step": 40 }, { "epoch": 2.5, "grad_norm": 3.930227518081665, "learning_rate": 9.51e-05, "loss": 0.0237, "step": 50 }, { "epoch": 3.0, "grad_norm": 0.24117116630077362, "learning_rate": 9.41e-05, "loss": 0.0144, "step": 60 }, { "epoch": 3.5, "grad_norm": 2.9592466354370117, "learning_rate": 9.310000000000001e-05, "loss": 0.0136, "step": 70 }, { "epoch": 4.0, "grad_norm": 6.1739702224731445, "learning_rate": 9.21e-05, "loss": 0.0143, "step": 80 }, { "epoch": 4.5, "grad_norm": 1.1602582931518555, "learning_rate": 9.11e-05, "loss": 0.0166, "step": 90 }, { "epoch": 5.0, "grad_norm": 3.7479774951934814, "learning_rate": 9.010000000000001e-05, "loss": 0.0174, "step": 100 }, { "epoch": 5.5, "grad_norm": 1.2943713665008545, "learning_rate": 8.910000000000001e-05, "loss": 0.0127, "step": 110 }, { "epoch": 6.0, "grad_norm": 1.9738529920578003, "learning_rate": 8.81e-05, "loss": 0.0056, "step": 120 }, { "epoch": 6.5, "grad_norm": 1.6200770139694214, "learning_rate": 8.71e-05, "loss": 0.0053, "step": 130 }, { "epoch": 7.0, "grad_norm": 2.116046667098999, "learning_rate": 8.61e-05, "loss": 0.0058, "step": 140 }, { "epoch": 7.5, "grad_norm": 1.7189213037490845, "learning_rate": 8.510000000000001e-05, "loss": 0.0035, "step": 150 }, { "epoch": 8.0, "grad_norm": 0.13006743788719177, "learning_rate": 8.41e-05, "loss": 0.0025, "step": 160 }, { "epoch": 8.5, "grad_norm": 0.35229212045669556, "learning_rate": 8.31e-05, "loss": 0.002, "step": 170 }, { "epoch": 9.0, "grad_norm": 0.5283310413360596, "learning_rate": 8.21e-05, "loss": 0.0026, "step": 180 }, { "epoch": 9.5, "grad_norm": 1.4066272974014282, "learning_rate": 8.11e-05, "loss": 0.0028, "step": 190 }, { "epoch": 10.0, "grad_norm": 0.6277774572372437, "learning_rate": 8.010000000000001e-05, "loss": 0.0018, "step": 200 }, { "epoch": 10.5, "grad_norm": 1.3672561645507812, "learning_rate": 7.910000000000001e-05, "loss": 0.0019, "step": 210 }, { "epoch": 11.0, "grad_norm": 0.849717915058136, "learning_rate": 7.81e-05, "loss": 0.0014, "step": 220 }, { "epoch": 11.5, "grad_norm": 1.2382935285568237, "learning_rate": 7.71e-05, "loss": 0.0017, "step": 230 }, { "epoch": 12.0, "grad_norm": 0.3568917512893677, "learning_rate": 7.61e-05, "loss": 0.0015, "step": 240 }, { "epoch": 12.5, "grad_norm": 0.02996239811182022, "learning_rate": 7.510000000000001e-05, "loss": 0.0009, "step": 250 }, { "epoch": 13.0, "grad_norm": 0.3278927206993103, "learning_rate": 7.41e-05, "loss": 0.0016, "step": 260 }, { "epoch": 13.5, "grad_norm": 0.78089839220047, "learning_rate": 7.31e-05, "loss": 0.0015, "step": 270 }, { "epoch": 14.0, "grad_norm": 1.049814224243164, "learning_rate": 7.21e-05, "loss": 0.0008, "step": 280 }, { "epoch": 14.5, "grad_norm": 0.05354577675461769, "learning_rate": 7.11e-05, "loss": 0.0012, "step": 290 }, { "epoch": 15.0, "grad_norm": 0.7721472978591919, "learning_rate": 7.01e-05, "loss": 0.0007, "step": 300 }, { "epoch": 15.5, "grad_norm": 0.5011315941810608, "learning_rate": 6.91e-05, "loss": 0.0006, "step": 310 }, { "epoch": 16.0, "grad_norm": 0.2097933441400528, "learning_rate": 6.81e-05, "loss": 0.001, "step": 320 }, { "epoch": 16.5, "grad_norm": 0.6485989689826965, "learning_rate": 6.71e-05, "loss": 0.0011, "step": 330 }, { "epoch": 17.0, "grad_norm": 0.46603426337242126, "learning_rate": 6.610000000000001e-05, "loss": 0.0008, "step": 340 }, { "epoch": 17.5, "grad_norm": 1.16153883934021, "learning_rate": 6.510000000000001e-05, "loss": 0.0012, "step": 350 }, { "epoch": 18.0, "grad_norm": 0.509899377822876, "learning_rate": 6.41e-05, "loss": 0.001, "step": 360 }, { "epoch": 18.5, "grad_norm": 0.22210121154785156, "learning_rate": 6.31e-05, "loss": 0.0006, "step": 370 }, { "epoch": 19.0, "grad_norm": 1.4337605237960815, "learning_rate": 6.21e-05, "loss": 0.0014, "step": 380 }, { "epoch": 19.5, "grad_norm": 0.6034935116767883, "learning_rate": 6.110000000000001e-05, "loss": 0.0006, "step": 390 }, { "epoch": 20.0, "grad_norm": 0.27057749032974243, "learning_rate": 6.0100000000000004e-05, "loss": 0.0009, "step": 400 }, { "epoch": 20.5, "grad_norm": 0.6695419549942017, "learning_rate": 5.91e-05, "loss": 0.0004, "step": 410 }, { "epoch": 21.0, "grad_norm": 1.8617427349090576, "learning_rate": 5.8099999999999996e-05, "loss": 0.002, "step": 420 }, { "epoch": 21.5, "grad_norm": 0.7135971784591675, "learning_rate": 5.71e-05, "loss": 0.0022, "step": 430 }, { "epoch": 22.0, "grad_norm": 1.104410171508789, "learning_rate": 5.610000000000001e-05, "loss": 0.0018, "step": 440 }, { "epoch": 22.5, "grad_norm": 0.5932751297950745, "learning_rate": 5.5100000000000004e-05, "loss": 0.0014, "step": 450 }, { "epoch": 23.0, "grad_norm": 0.4027913510799408, "learning_rate": 5.410000000000001e-05, "loss": 0.0008, "step": 460 }, { "epoch": 23.5, "grad_norm": 0.7953199148178101, "learning_rate": 5.31e-05, "loss": 0.0011, "step": 470 }, { "epoch": 24.0, "grad_norm": 0.25498589873313904, "learning_rate": 5.2100000000000006e-05, "loss": 0.0004, "step": 480 }, { "epoch": 24.5, "grad_norm": 0.8258208632469177, "learning_rate": 5.11e-05, "loss": 0.0007, "step": 490 }, { "epoch": 25.0, "grad_norm": 0.18510620296001434, "learning_rate": 5.0100000000000005e-05, "loss": 0.0007, "step": 500 }, { "epoch": 25.5, "grad_norm": 0.8307366371154785, "learning_rate": 4.91e-05, "loss": 0.0007, "step": 510 }, { "epoch": 26.0, "grad_norm": 0.7541914582252502, "learning_rate": 4.8100000000000004e-05, "loss": 0.0007, "step": 520 }, { "epoch": 26.5, "grad_norm": 0.3338281214237213, "learning_rate": 4.71e-05, "loss": 0.0004, "step": 530 }, { "epoch": 27.0, "grad_norm": 0.06798086315393448, "learning_rate": 4.61e-05, "loss": 0.0007, "step": 540 }, { "epoch": 27.5, "grad_norm": 0.15522146224975586, "learning_rate": 4.5100000000000005e-05, "loss": 0.0004, "step": 550 }, { "epoch": 28.0, "grad_norm": 0.4013696610927582, "learning_rate": 4.41e-05, "loss": 0.0006, "step": 560 }, { "epoch": 28.5, "grad_norm": 0.40657514333724976, "learning_rate": 4.3100000000000004e-05, "loss": 0.0007, "step": 570 }, { "epoch": 29.0, "grad_norm": 0.14566798508167267, "learning_rate": 4.21e-05, "loss": 0.0001, "step": 580 }, { "epoch": 29.5, "grad_norm": 0.4930070638656616, "learning_rate": 4.11e-05, "loss": 0.0003, "step": 590 }, { "epoch": 30.0, "grad_norm": 0.2327234447002411, "learning_rate": 4.0100000000000006e-05, "loss": 0.0005, "step": 600 }, { "epoch": 30.5, "grad_norm": 0.5080268383026123, "learning_rate": 3.91e-05, "loss": 0.0003, "step": 610 }, { "epoch": 31.0, "grad_norm": 0.5666539669036865, "learning_rate": 3.8100000000000005e-05, "loss": 0.0004, "step": 620 }, { "epoch": 31.5, "grad_norm": 0.03347349166870117, "learning_rate": 3.71e-05, "loss": 0.0006, "step": 630 }, { "epoch": 32.0, "grad_norm": 0.4377991557121277, "learning_rate": 3.61e-05, "loss": 0.0001, "step": 640 }, { "epoch": 32.5, "grad_norm": 0.19413357973098755, "learning_rate": 3.51e-05, "loss": 0.0005, "step": 650 }, { "epoch": 33.0, "grad_norm": 0.18701113760471344, "learning_rate": 3.41e-05, "loss": 0.0001, "step": 660 }, { "epoch": 33.5, "grad_norm": 0.31651440262794495, "learning_rate": 3.3100000000000005e-05, "loss": 0.0005, "step": 670 }, { "epoch": 34.0, "grad_norm": 0.02828579768538475, "learning_rate": 3.21e-05, "loss": 0.0, "step": 680 }, { "epoch": 34.5, "grad_norm": 0.34173232316970825, "learning_rate": 3.1100000000000004e-05, "loss": 0.0006, "step": 690 }, { "epoch": 35.0, "grad_norm": 0.09027812629938126, "learning_rate": 3.01e-05, "loss": 0.0001, "step": 700 }, { "epoch": 35.5, "grad_norm": 0.22914953529834747, "learning_rate": 2.91e-05, "loss": 0.0001, "step": 710 }, { "epoch": 36.0, "grad_norm": 0.2684924900531769, "learning_rate": 2.8100000000000005e-05, "loss": 0.0006, "step": 720 }, { "epoch": 36.5, "grad_norm": 0.29140010476112366, "learning_rate": 2.7100000000000005e-05, "loss": 0.0003, "step": 730 }, { "epoch": 37.0, "grad_norm": 0.39679646492004395, "learning_rate": 2.61e-05, "loss": 0.0003, "step": 740 }, { "epoch": 37.5, "grad_norm": 0.08030686527490616, "learning_rate": 2.51e-05, "loss": 0.0005, "step": 750 }, { "epoch": 38.0, "grad_norm": 0.06952106207609177, "learning_rate": 2.41e-05, "loss": 0.0, "step": 760 }, { "epoch": 38.5, "grad_norm": 0.03430015593767166, "learning_rate": 2.3100000000000002e-05, "loss": 0.0003, "step": 770 }, { "epoch": 39.0, "grad_norm": 0.31544044613838196, "learning_rate": 2.2100000000000002e-05, "loss": 0.0003, "step": 780 }, { "epoch": 39.5, "grad_norm": 0.20336127281188965, "learning_rate": 2.11e-05, "loss": 0.0, "step": 790 }, { "epoch": 40.0, "grad_norm": 0.028785208240151405, "learning_rate": 2.01e-05, "loss": 0.0005, "step": 800 }, { "epoch": 40.5, "grad_norm": 0.1161634773015976, "learning_rate": 1.91e-05, "loss": 0.0004, "step": 810 }, { "epoch": 41.0, "grad_norm": 0.11442368477582932, "learning_rate": 1.81e-05, "loss": 0.0, "step": 820 }, { "epoch": 41.5, "grad_norm": 0.010891513898968697, "learning_rate": 1.7100000000000002e-05, "loss": 0.0002, "step": 830 }, { "epoch": 42.0, "grad_norm": 0.2547859251499176, "learning_rate": 1.6100000000000002e-05, "loss": 0.0003, "step": 840 }, { "epoch": 42.5, "grad_norm": 0.006857059430330992, "learning_rate": 1.51e-05, "loss": 0.0002, "step": 850 }, { "epoch": 43.0, "grad_norm": 0.4951632618904114, "learning_rate": 1.4099999999999999e-05, "loss": 0.0003, "step": 860 }, { "epoch": 43.5, "grad_norm": 0.041084468364715576, "learning_rate": 1.3100000000000002e-05, "loss": 0.0004, "step": 870 }, { "epoch": 44.0, "grad_norm": 0.11843769252300262, "learning_rate": 1.2100000000000001e-05, "loss": 0.0, "step": 880 }, { "epoch": 44.5, "grad_norm": 0.03217615187168121, "learning_rate": 1.11e-05, "loss": 0.0005, "step": 890 }, { "epoch": 45.0, "grad_norm": 0.018256094306707382, "learning_rate": 1.0100000000000002e-05, "loss": 0.0, "step": 900 }, { "epoch": 45.5, "grad_norm": 0.042205873876810074, "learning_rate": 9.100000000000001e-06, "loss": 0.0, "step": 910 }, { "epoch": 46.0, "grad_norm": 0.05937511846423149, "learning_rate": 8.1e-06, "loss": 0.0004, "step": 920 }, { "epoch": 46.5, "grad_norm": 0.10673025250434875, "learning_rate": 7.1e-06, "loss": 0.0002, "step": 930 }, { "epoch": 47.0, "grad_norm": 0.1283574253320694, "learning_rate": 6.1e-06, "loss": 0.0002, "step": 940 }, { "epoch": 47.5, "grad_norm": 0.04566584527492523, "learning_rate": 5.1e-06, "loss": 0.0004, "step": 950 }, { "epoch": 48.0, "grad_norm": 0.08355249464511871, "learning_rate": 4.1000000000000006e-06, "loss": 0.0, "step": 960 }, { "epoch": 48.5, "grad_norm": 0.12251041829586029, "learning_rate": 3.1e-06, "loss": 0.0002, "step": 970 }, { "epoch": 49.0, "grad_norm": 0.16482886672019958, "learning_rate": 2.1000000000000002e-06, "loss": 0.0002, "step": 980 }, { "epoch": 49.5, "grad_norm": 0.008726044557988644, "learning_rate": 1.1e-06, "loss": 0.0002, "step": 990 }, { "epoch": 50.0, "grad_norm": 0.0353192463517189, "learning_rate": 1.0000000000000001e-07, "loss": 0.0002, "step": 1000 } ], "logging_steps": 10, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }