| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 50.0, |
| "eval_steps": 500, |
| "global_step": 1000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.5, |
| "grad_norm": 1.8985621929168701, |
| "learning_rate": 9.910000000000001e-05, |
| "loss": 0.2229, |
| "step": 10 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 2.6959447860717773, |
| "learning_rate": 9.81e-05, |
| "loss": 0.0395, |
| "step": 20 |
| }, |
| { |
| "epoch": 1.5, |
| "grad_norm": 6.28496789932251, |
| "learning_rate": 9.71e-05, |
| "loss": 0.0261, |
| "step": 30 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.8198316097259521, |
| "learning_rate": 9.61e-05, |
| "loss": 0.0346, |
| "step": 40 |
| }, |
| { |
| "epoch": 2.5, |
| "grad_norm": 3.930227518081665, |
| "learning_rate": 9.51e-05, |
| "loss": 0.0237, |
| "step": 50 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 0.24117116630077362, |
| "learning_rate": 9.41e-05, |
| "loss": 0.0144, |
| "step": 60 |
| }, |
| { |
| "epoch": 3.5, |
| "grad_norm": 2.9592466354370117, |
| "learning_rate": 9.310000000000001e-05, |
| "loss": 0.0136, |
| "step": 70 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 6.1739702224731445, |
| "learning_rate": 9.21e-05, |
| "loss": 0.0143, |
| "step": 80 |
| }, |
| { |
| "epoch": 4.5, |
| "grad_norm": 1.1602582931518555, |
| "learning_rate": 9.11e-05, |
| "loss": 0.0166, |
| "step": 90 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 3.7479774951934814, |
| "learning_rate": 9.010000000000001e-05, |
| "loss": 0.0174, |
| "step": 100 |
| }, |
| { |
| "epoch": 5.5, |
| "grad_norm": 1.2943713665008545, |
| "learning_rate": 8.910000000000001e-05, |
| "loss": 0.0127, |
| "step": 110 |
| }, |
| { |
| "epoch": 6.0, |
| "grad_norm": 1.9738529920578003, |
| "learning_rate": 8.81e-05, |
| "loss": 0.0056, |
| "step": 120 |
| }, |
| { |
| "epoch": 6.5, |
| "grad_norm": 1.6200770139694214, |
| "learning_rate": 8.71e-05, |
| "loss": 0.0053, |
| "step": 130 |
| }, |
| { |
| "epoch": 7.0, |
| "grad_norm": 2.116046667098999, |
| "learning_rate": 8.61e-05, |
| "loss": 0.0058, |
| "step": 140 |
| }, |
| { |
| "epoch": 7.5, |
| "grad_norm": 1.7189213037490845, |
| "learning_rate": 8.510000000000001e-05, |
| "loss": 0.0035, |
| "step": 150 |
| }, |
| { |
| "epoch": 8.0, |
| "grad_norm": 0.13006743788719177, |
| "learning_rate": 8.41e-05, |
| "loss": 0.0025, |
| "step": 160 |
| }, |
| { |
| "epoch": 8.5, |
| "grad_norm": 0.35229212045669556, |
| "learning_rate": 8.31e-05, |
| "loss": 0.002, |
| "step": 170 |
| }, |
| { |
| "epoch": 9.0, |
| "grad_norm": 0.5283310413360596, |
| "learning_rate": 8.21e-05, |
| "loss": 0.0026, |
| "step": 180 |
| }, |
| { |
| "epoch": 9.5, |
| "grad_norm": 1.4066272974014282, |
| "learning_rate": 8.11e-05, |
| "loss": 0.0028, |
| "step": 190 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 0.6277774572372437, |
| "learning_rate": 8.010000000000001e-05, |
| "loss": 0.0018, |
| "step": 200 |
| }, |
| { |
| "epoch": 10.5, |
| "grad_norm": 1.3672561645507812, |
| "learning_rate": 7.910000000000001e-05, |
| "loss": 0.0019, |
| "step": 210 |
| }, |
| { |
| "epoch": 11.0, |
| "grad_norm": 0.849717915058136, |
| "learning_rate": 7.81e-05, |
| "loss": 0.0014, |
| "step": 220 |
| }, |
| { |
| "epoch": 11.5, |
| "grad_norm": 1.2382935285568237, |
| "learning_rate": 7.71e-05, |
| "loss": 0.0017, |
| "step": 230 |
| }, |
| { |
| "epoch": 12.0, |
| "grad_norm": 0.3568917512893677, |
| "learning_rate": 7.61e-05, |
| "loss": 0.0015, |
| "step": 240 |
| }, |
| { |
| "epoch": 12.5, |
| "grad_norm": 0.02996239811182022, |
| "learning_rate": 7.510000000000001e-05, |
| "loss": 0.0009, |
| "step": 250 |
| }, |
| { |
| "epoch": 13.0, |
| "grad_norm": 0.3278927206993103, |
| "learning_rate": 7.41e-05, |
| "loss": 0.0016, |
| "step": 260 |
| }, |
| { |
| "epoch": 13.5, |
| "grad_norm": 0.78089839220047, |
| "learning_rate": 7.31e-05, |
| "loss": 0.0015, |
| "step": 270 |
| }, |
| { |
| "epoch": 14.0, |
| "grad_norm": 1.049814224243164, |
| "learning_rate": 7.21e-05, |
| "loss": 0.0008, |
| "step": 280 |
| }, |
| { |
| "epoch": 14.5, |
| "grad_norm": 0.05354577675461769, |
| "learning_rate": 7.11e-05, |
| "loss": 0.0012, |
| "step": 290 |
| }, |
| { |
| "epoch": 15.0, |
| "grad_norm": 0.7721472978591919, |
| "learning_rate": 7.01e-05, |
| "loss": 0.0007, |
| "step": 300 |
| }, |
| { |
| "epoch": 15.5, |
| "grad_norm": 0.5011315941810608, |
| "learning_rate": 6.91e-05, |
| "loss": 0.0006, |
| "step": 310 |
| }, |
| { |
| "epoch": 16.0, |
| "grad_norm": 0.2097933441400528, |
| "learning_rate": 6.81e-05, |
| "loss": 0.001, |
| "step": 320 |
| }, |
| { |
| "epoch": 16.5, |
| "grad_norm": 0.6485989689826965, |
| "learning_rate": 6.71e-05, |
| "loss": 0.0011, |
| "step": 330 |
| }, |
| { |
| "epoch": 17.0, |
| "grad_norm": 0.46603426337242126, |
| "learning_rate": 6.610000000000001e-05, |
| "loss": 0.0008, |
| "step": 340 |
| }, |
| { |
| "epoch": 17.5, |
| "grad_norm": 1.16153883934021, |
| "learning_rate": 6.510000000000001e-05, |
| "loss": 0.0012, |
| "step": 350 |
| }, |
| { |
| "epoch": 18.0, |
| "grad_norm": 0.509899377822876, |
| "learning_rate": 6.41e-05, |
| "loss": 0.001, |
| "step": 360 |
| }, |
| { |
| "epoch": 18.5, |
| "grad_norm": 0.22210121154785156, |
| "learning_rate": 6.31e-05, |
| "loss": 0.0006, |
| "step": 370 |
| }, |
| { |
| "epoch": 19.0, |
| "grad_norm": 1.4337605237960815, |
| "learning_rate": 6.21e-05, |
| "loss": 0.0014, |
| "step": 380 |
| }, |
| { |
| "epoch": 19.5, |
| "grad_norm": 0.6034935116767883, |
| "learning_rate": 6.110000000000001e-05, |
| "loss": 0.0006, |
| "step": 390 |
| }, |
| { |
| "epoch": 20.0, |
| "grad_norm": 0.27057749032974243, |
| "learning_rate": 6.0100000000000004e-05, |
| "loss": 0.0009, |
| "step": 400 |
| }, |
| { |
| "epoch": 20.5, |
| "grad_norm": 0.6695419549942017, |
| "learning_rate": 5.91e-05, |
| "loss": 0.0004, |
| "step": 410 |
| }, |
| { |
| "epoch": 21.0, |
| "grad_norm": 1.8617427349090576, |
| "learning_rate": 5.8099999999999996e-05, |
| "loss": 0.002, |
| "step": 420 |
| }, |
| { |
| "epoch": 21.5, |
| "grad_norm": 0.7135971784591675, |
| "learning_rate": 5.71e-05, |
| "loss": 0.0022, |
| "step": 430 |
| }, |
| { |
| "epoch": 22.0, |
| "grad_norm": 1.104410171508789, |
| "learning_rate": 5.610000000000001e-05, |
| "loss": 0.0018, |
| "step": 440 |
| }, |
| { |
| "epoch": 22.5, |
| "grad_norm": 0.5932751297950745, |
| "learning_rate": 5.5100000000000004e-05, |
| "loss": 0.0014, |
| "step": 450 |
| }, |
| { |
| "epoch": 23.0, |
| "grad_norm": 0.4027913510799408, |
| "learning_rate": 5.410000000000001e-05, |
| "loss": 0.0008, |
| "step": 460 |
| }, |
| { |
| "epoch": 23.5, |
| "grad_norm": 0.7953199148178101, |
| "learning_rate": 5.31e-05, |
| "loss": 0.0011, |
| "step": 470 |
| }, |
| { |
| "epoch": 24.0, |
| "grad_norm": 0.25498589873313904, |
| "learning_rate": 5.2100000000000006e-05, |
| "loss": 0.0004, |
| "step": 480 |
| }, |
| { |
| "epoch": 24.5, |
| "grad_norm": 0.8258208632469177, |
| "learning_rate": 5.11e-05, |
| "loss": 0.0007, |
| "step": 490 |
| }, |
| { |
| "epoch": 25.0, |
| "grad_norm": 0.18510620296001434, |
| "learning_rate": 5.0100000000000005e-05, |
| "loss": 0.0007, |
| "step": 500 |
| }, |
| { |
| "epoch": 25.5, |
| "grad_norm": 0.8307366371154785, |
| "learning_rate": 4.91e-05, |
| "loss": 0.0007, |
| "step": 510 |
| }, |
| { |
| "epoch": 26.0, |
| "grad_norm": 0.7541914582252502, |
| "learning_rate": 4.8100000000000004e-05, |
| "loss": 0.0007, |
| "step": 520 |
| }, |
| { |
| "epoch": 26.5, |
| "grad_norm": 0.3338281214237213, |
| "learning_rate": 4.71e-05, |
| "loss": 0.0004, |
| "step": 530 |
| }, |
| { |
| "epoch": 27.0, |
| "grad_norm": 0.06798086315393448, |
| "learning_rate": 4.61e-05, |
| "loss": 0.0007, |
| "step": 540 |
| }, |
| { |
| "epoch": 27.5, |
| "grad_norm": 0.15522146224975586, |
| "learning_rate": 4.5100000000000005e-05, |
| "loss": 0.0004, |
| "step": 550 |
| }, |
| { |
| "epoch": 28.0, |
| "grad_norm": 0.4013696610927582, |
| "learning_rate": 4.41e-05, |
| "loss": 0.0006, |
| "step": 560 |
| }, |
| { |
| "epoch": 28.5, |
| "grad_norm": 0.40657514333724976, |
| "learning_rate": 4.3100000000000004e-05, |
| "loss": 0.0007, |
| "step": 570 |
| }, |
| { |
| "epoch": 29.0, |
| "grad_norm": 0.14566798508167267, |
| "learning_rate": 4.21e-05, |
| "loss": 0.0001, |
| "step": 580 |
| }, |
| { |
| "epoch": 29.5, |
| "grad_norm": 0.4930070638656616, |
| "learning_rate": 4.11e-05, |
| "loss": 0.0003, |
| "step": 590 |
| }, |
| { |
| "epoch": 30.0, |
| "grad_norm": 0.2327234447002411, |
| "learning_rate": 4.0100000000000006e-05, |
| "loss": 0.0005, |
| "step": 600 |
| }, |
| { |
| "epoch": 30.5, |
| "grad_norm": 0.5080268383026123, |
| "learning_rate": 3.91e-05, |
| "loss": 0.0003, |
| "step": 610 |
| }, |
| { |
| "epoch": 31.0, |
| "grad_norm": 0.5666539669036865, |
| "learning_rate": 3.8100000000000005e-05, |
| "loss": 0.0004, |
| "step": 620 |
| }, |
| { |
| "epoch": 31.5, |
| "grad_norm": 0.03347349166870117, |
| "learning_rate": 3.71e-05, |
| "loss": 0.0006, |
| "step": 630 |
| }, |
| { |
| "epoch": 32.0, |
| "grad_norm": 0.4377991557121277, |
| "learning_rate": 3.61e-05, |
| "loss": 0.0001, |
| "step": 640 |
| }, |
| { |
| "epoch": 32.5, |
| "grad_norm": 0.19413357973098755, |
| "learning_rate": 3.51e-05, |
| "loss": 0.0005, |
| "step": 650 |
| }, |
| { |
| "epoch": 33.0, |
| "grad_norm": 0.18701113760471344, |
| "learning_rate": 3.41e-05, |
| "loss": 0.0001, |
| "step": 660 |
| }, |
| { |
| "epoch": 33.5, |
| "grad_norm": 0.31651440262794495, |
| "learning_rate": 3.3100000000000005e-05, |
| "loss": 0.0005, |
| "step": 670 |
| }, |
| { |
| "epoch": 34.0, |
| "grad_norm": 0.02828579768538475, |
| "learning_rate": 3.21e-05, |
| "loss": 0.0, |
| "step": 680 |
| }, |
| { |
| "epoch": 34.5, |
| "grad_norm": 0.34173232316970825, |
| "learning_rate": 3.1100000000000004e-05, |
| "loss": 0.0006, |
| "step": 690 |
| }, |
| { |
| "epoch": 35.0, |
| "grad_norm": 0.09027812629938126, |
| "learning_rate": 3.01e-05, |
| "loss": 0.0001, |
| "step": 700 |
| }, |
| { |
| "epoch": 35.5, |
| "grad_norm": 0.22914953529834747, |
| "learning_rate": 2.91e-05, |
| "loss": 0.0001, |
| "step": 710 |
| }, |
| { |
| "epoch": 36.0, |
| "grad_norm": 0.2684924900531769, |
| "learning_rate": 2.8100000000000005e-05, |
| "loss": 0.0006, |
| "step": 720 |
| }, |
| { |
| "epoch": 36.5, |
| "grad_norm": 0.29140010476112366, |
| "learning_rate": 2.7100000000000005e-05, |
| "loss": 0.0003, |
| "step": 730 |
| }, |
| { |
| "epoch": 37.0, |
| "grad_norm": 0.39679646492004395, |
| "learning_rate": 2.61e-05, |
| "loss": 0.0003, |
| "step": 740 |
| }, |
| { |
| "epoch": 37.5, |
| "grad_norm": 0.08030686527490616, |
| "learning_rate": 2.51e-05, |
| "loss": 0.0005, |
| "step": 750 |
| }, |
| { |
| "epoch": 38.0, |
| "grad_norm": 0.06952106207609177, |
| "learning_rate": 2.41e-05, |
| "loss": 0.0, |
| "step": 760 |
| }, |
| { |
| "epoch": 38.5, |
| "grad_norm": 0.03430015593767166, |
| "learning_rate": 2.3100000000000002e-05, |
| "loss": 0.0003, |
| "step": 770 |
| }, |
| { |
| "epoch": 39.0, |
| "grad_norm": 0.31544044613838196, |
| "learning_rate": 2.2100000000000002e-05, |
| "loss": 0.0003, |
| "step": 780 |
| }, |
| { |
| "epoch": 39.5, |
| "grad_norm": 0.20336127281188965, |
| "learning_rate": 2.11e-05, |
| "loss": 0.0, |
| "step": 790 |
| }, |
| { |
| "epoch": 40.0, |
| "grad_norm": 0.028785208240151405, |
| "learning_rate": 2.01e-05, |
| "loss": 0.0005, |
| "step": 800 |
| }, |
| { |
| "epoch": 40.5, |
| "grad_norm": 0.1161634773015976, |
| "learning_rate": 1.91e-05, |
| "loss": 0.0004, |
| "step": 810 |
| }, |
| { |
| "epoch": 41.0, |
| "grad_norm": 0.11442368477582932, |
| "learning_rate": 1.81e-05, |
| "loss": 0.0, |
| "step": 820 |
| }, |
| { |
| "epoch": 41.5, |
| "grad_norm": 0.010891513898968697, |
| "learning_rate": 1.7100000000000002e-05, |
| "loss": 0.0002, |
| "step": 830 |
| }, |
| { |
| "epoch": 42.0, |
| "grad_norm": 0.2547859251499176, |
| "learning_rate": 1.6100000000000002e-05, |
| "loss": 0.0003, |
| "step": 840 |
| }, |
| { |
| "epoch": 42.5, |
| "grad_norm": 0.006857059430330992, |
| "learning_rate": 1.51e-05, |
| "loss": 0.0002, |
| "step": 850 |
| }, |
| { |
| "epoch": 43.0, |
| "grad_norm": 0.4951632618904114, |
| "learning_rate": 1.4099999999999999e-05, |
| "loss": 0.0003, |
| "step": 860 |
| }, |
| { |
| "epoch": 43.5, |
| "grad_norm": 0.041084468364715576, |
| "learning_rate": 1.3100000000000002e-05, |
| "loss": 0.0004, |
| "step": 870 |
| }, |
| { |
| "epoch": 44.0, |
| "grad_norm": 0.11843769252300262, |
| "learning_rate": 1.2100000000000001e-05, |
| "loss": 0.0, |
| "step": 880 |
| }, |
| { |
| "epoch": 44.5, |
| "grad_norm": 0.03217615187168121, |
| "learning_rate": 1.11e-05, |
| "loss": 0.0005, |
| "step": 890 |
| }, |
| { |
| "epoch": 45.0, |
| "grad_norm": 0.018256094306707382, |
| "learning_rate": 1.0100000000000002e-05, |
| "loss": 0.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 45.5, |
| "grad_norm": 0.042205873876810074, |
| "learning_rate": 9.100000000000001e-06, |
| "loss": 0.0, |
| "step": 910 |
| }, |
| { |
| "epoch": 46.0, |
| "grad_norm": 0.05937511846423149, |
| "learning_rate": 8.1e-06, |
| "loss": 0.0004, |
| "step": 920 |
| }, |
| { |
| "epoch": 46.5, |
| "grad_norm": 0.10673025250434875, |
| "learning_rate": 7.1e-06, |
| "loss": 0.0002, |
| "step": 930 |
| }, |
| { |
| "epoch": 47.0, |
| "grad_norm": 0.1283574253320694, |
| "learning_rate": 6.1e-06, |
| "loss": 0.0002, |
| "step": 940 |
| }, |
| { |
| "epoch": 47.5, |
| "grad_norm": 0.04566584527492523, |
| "learning_rate": 5.1e-06, |
| "loss": 0.0004, |
| "step": 950 |
| }, |
| { |
| "epoch": 48.0, |
| "grad_norm": 0.08355249464511871, |
| "learning_rate": 4.1000000000000006e-06, |
| "loss": 0.0, |
| "step": 960 |
| }, |
| { |
| "epoch": 48.5, |
| "grad_norm": 0.12251041829586029, |
| "learning_rate": 3.1e-06, |
| "loss": 0.0002, |
| "step": 970 |
| }, |
| { |
| "epoch": 49.0, |
| "grad_norm": 0.16482886672019958, |
| "learning_rate": 2.1000000000000002e-06, |
| "loss": 0.0002, |
| "step": 980 |
| }, |
| { |
| "epoch": 49.5, |
| "grad_norm": 0.008726044557988644, |
| "learning_rate": 1.1e-06, |
| "loss": 0.0002, |
| "step": 990 |
| }, |
| { |
| "epoch": 50.0, |
| "grad_norm": 0.0353192463517189, |
| "learning_rate": 1.0000000000000001e-07, |
| "loss": 0.0002, |
| "step": 1000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 50, |
| "save_steps": 10, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 32, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|