{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 942,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03189792663476874,
      "grad_norm": 3.013042151093929,
      "learning_rate": 9.473684210526317e-07,
      "loss": 0.565,
      "step": 10
    },
    {
      "epoch": 0.06379585326953748,
      "grad_norm": 1.2701924445501063,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 0.5229,
      "step": 20
    },
    {
      "epoch": 0.09569377990430622,
      "grad_norm": 0.7768953365324198,
      "learning_rate": 3.052631578947369e-06,
      "loss": 0.474,
      "step": 30
    },
    {
      "epoch": 0.12759170653907495,
      "grad_norm": 0.45169335889949896,
      "learning_rate": 4.105263157894737e-06,
      "loss": 0.4322,
      "step": 40
    },
    {
      "epoch": 0.1594896331738437,
      "grad_norm": 0.26445344786633457,
      "learning_rate": 5.157894736842106e-06,
      "loss": 0.4053,
      "step": 50
    },
    {
      "epoch": 0.19138755980861244,
      "grad_norm": 0.25831272135029637,
      "learning_rate": 6.2105263157894745e-06,
      "loss": 0.3901,
      "step": 60
    },
    {
      "epoch": 0.22328548644338117,
      "grad_norm": 0.17538527283658426,
      "learning_rate": 7.263157894736843e-06,
      "loss": 0.3822,
      "step": 70
    },
    {
      "epoch": 0.2551834130781499,
      "grad_norm": 0.18527453951415881,
      "learning_rate": 8.315789473684212e-06,
      "loss": 0.3722,
      "step": 80
    },
    {
      "epoch": 0.28708133971291866,
      "grad_norm": 0.20196203327464468,
      "learning_rate": 9.36842105263158e-06,
      "loss": 0.3678,
      "step": 90
    },
    {
      "epoch": 0.3189792663476874,
      "grad_norm": 0.18902867456468747,
      "learning_rate": 9.999449718452724e-06,
      "loss": 0.3584,
      "step": 100
    },
    {
      "epoch": 0.3508771929824561,
      "grad_norm": 0.17356792823733477,
      "learning_rate": 9.993260441994116e-06,
      "loss": 0.3583,
      "step": 110
    },
    {
      "epoch": 0.3827751196172249,
      "grad_norm": 0.1892746680615421,
      "learning_rate": 9.980202579323212e-06,
      "loss": 0.3538,
      "step": 120
    },
    {
      "epoch": 0.41467304625199364,
      "grad_norm": 0.20189481557460487,
      "learning_rate": 9.960294092462332e-06,
      "loss": 0.3518,
      "step": 130
    },
    {
      "epoch": 0.44657097288676234,
      "grad_norm": 0.18519395971058,
      "learning_rate": 9.933562366956445e-06,
      "loss": 0.3516,
      "step": 140
    },
    {
      "epoch": 0.4784688995215311,
      "grad_norm": 0.18001196763912958,
      "learning_rate": 9.900044174202389e-06,
      "loss": 0.3437,
      "step": 150
    },
    {
      "epoch": 0.5103668261562998,
      "grad_norm": 0.1789402994798246,
      "learning_rate": 9.859785620867197e-06,
      "loss": 0.3437,
      "step": 160
    },
    {
      "epoch": 0.5422647527910686,
      "grad_norm": 0.19853820288193685,
      "learning_rate": 9.812842085465086e-06,
      "loss": 0.3433,
      "step": 170
    },
    {
      "epoch": 0.5741626794258373,
      "grad_norm": 0.18843581504920484,
      "learning_rate": 9.759278142180348e-06,
      "loss": 0.3403,
      "step": 180
    },
    {
      "epoch": 0.6060606060606061,
      "grad_norm": 0.2082648683775011,
      "learning_rate": 9.699167472040965e-06,
      "loss": 0.3415,
      "step": 190
    },
    {
      "epoch": 0.6379585326953748,
      "grad_norm": 0.19886672216676665,
      "learning_rate": 9.632592761565078e-06,
      "loss": 0.3408,
      "step": 200
    },
    {
      "epoch": 0.6698564593301436,
      "grad_norm": 0.21716648325335103,
      "learning_rate": 9.559645589019787e-06,
      "loss": 0.3378,
      "step": 210
    },
    {
      "epoch": 0.7017543859649122,
      "grad_norm": 0.18928950327295563,
      "learning_rate": 9.480426298448706e-06,
      "loss": 0.3367,
      "step": 220
    },
    {
      "epoch": 0.733652312599681,
      "grad_norm": 0.21004277195552987,
      "learning_rate": 9.39504386164157e-06,
      "loss": 0.3336,
      "step": 230
    },
    {
      "epoch": 0.7655502392344498,
      "grad_norm": 0.2163242152067136,
      "learning_rate": 9.303615728235753e-06,
      "loss": 0.335,
      "step": 240
    },
    {
      "epoch": 0.7974481658692185,
      "grad_norm": 0.22611368075174626,
      "learning_rate": 9.206267664155906e-06,
      "loss": 0.3351,
      "step": 250
    },
    {
      "epoch": 0.8293460925039873,
      "grad_norm": 0.20610831325174112,
      "learning_rate": 9.103133578613959e-06,
      "loss": 0.3318,
      "step": 260
    },
    {
      "epoch": 0.861244019138756,
      "grad_norm": 0.21802776432957696,
      "learning_rate": 8.994355339907429e-06,
      "loss": 0.3313,
      "step": 270
    },
    {
      "epoch": 0.8931419457735247,
      "grad_norm": 0.2197523854167031,
      "learning_rate": 8.880082580269454e-06,
      "loss": 0.3301,
      "step": 280
    },
    {
      "epoch": 0.9250398724082934,
      "grad_norm": 0.18393297959566468,
      "learning_rate": 8.760472490038974e-06,
      "loss": 0.3301,
      "step": 290
    },
    {
      "epoch": 0.9569377990430622,
      "grad_norm": 0.19027330099124132,
      "learning_rate": 8.635689601434197e-06,
      "loss": 0.3296,
      "step": 300
    },
    {
      "epoch": 0.988835725677831,
      "grad_norm": 0.19413716292373118,
      "learning_rate": 8.505905562226784e-06,
      "loss": 0.3304,
      "step": 310
    },
    {
      "epoch": 1.0191387559808613,
      "grad_norm": 0.19397058012134533,
      "learning_rate": 8.371298899628091e-06,
      "loss": 0.3213,
      "step": 320
    },
    {
      "epoch": 1.0510366826156299,
      "grad_norm": 0.22864286624931157,
      "learning_rate": 8.232054774712239e-06,
      "loss": 0.3174,
      "step": 330
    },
    {
      "epoch": 1.0829346092503986,
      "grad_norm": 0.18619768509667967,
      "learning_rate": 8.08836472771384e-06,
      "loss": 0.3164,
      "step": 340
    },
    {
      "epoch": 1.1148325358851674,
      "grad_norm": 0.1903227419221792,
      "learning_rate": 7.940426414550732e-06,
      "loss": 0.3192,
      "step": 350
    },
    {
      "epoch": 1.1467304625199362,
      "grad_norm": 0.18207715314228023,
      "learning_rate": 7.788443334934148e-06,
      "loss": 0.3152,
      "step": 360
    },
    {
      "epoch": 1.178628389154705,
      "grad_norm": 0.22287934362831285,
      "learning_rate": 7.632624552440337e-06,
      "loss": 0.3149,
      "step": 370
    },
    {
      "epoch": 1.2105263157894737,
      "grad_norm": 0.1930728154008719,
      "learning_rate": 7.4731844069286965e-06,
      "loss": 0.3137,
      "step": 380
    },
    {
      "epoch": 1.2424242424242424,
      "grad_norm": 0.199264104206528,
      "learning_rate": 7.310342219701981e-06,
      "loss": 0.3155,
      "step": 390
    },
    {
      "epoch": 1.2743221690590112,
      "grad_norm": 0.18515835862301386,
      "learning_rate": 7.144321991814205e-06,
      "loss": 0.317,
      "step": 400
    },
    {
      "epoch": 1.30622009569378,
      "grad_norm": 0.1900258258750772,
      "learning_rate": 6.975352095941194e-06,
      "loss": 0.3144,
      "step": 410
    },
    {
      "epoch": 1.3381180223285487,
      "grad_norm": 0.1893070126950014,
      "learning_rate": 6.803664962237665e-06,
      "loss": 0.3112,
      "step": 420
    },
    {
      "epoch": 1.3700159489633175,
      "grad_norm": 0.1898407610646885,
      "learning_rate": 6.6294967586129614e-06,
      "loss": 0.3165,
      "step": 430
    },
    {
      "epoch": 1.401913875598086,
      "grad_norm": 0.19737522250412162,
      "learning_rate": 6.453087065865229e-06,
      "loss": 0.3133,
      "step": 440
    },
    {
      "epoch": 1.4338118022328548,
      "grad_norm": 0.17191277709169925,
      "learning_rate": 6.274678548120921e-06,
      "loss": 0.3159,
      "step": 450
    },
    {
      "epoch": 1.4657097288676235,
      "grad_norm": 0.19203495843105348,
      "learning_rate": 6.094516619032975e-06,
      "loss": 0.3096,
      "step": 460
    },
    {
      "epoch": 1.4976076555023923,
      "grad_norm": 0.18040319361097096,
      "learning_rate": 5.91284910419681e-06,
      "loss": 0.3119,
      "step": 470
    },
    {
      "epoch": 1.529505582137161,
      "grad_norm": 0.18262929744020603,
      "learning_rate": 5.729925900248524e-06,
      "loss": 0.3143,
      "step": 480
    },
    {
      "epoch": 1.5614035087719298,
      "grad_norm": 0.18840512451062935,
      "learning_rate": 5.5459986311142365e-06,
      "loss": 0.3116,
      "step": 490
    },
    {
      "epoch": 1.5933014354066986,
      "grad_norm": 0.17012758923055069,
      "learning_rate": 5.361320301883413e-06,
      "loss": 0.3099,
      "step": 500
    },
    {
      "epoch": 1.6251993620414673,
      "grad_norm": 0.17689153860721135,
      "learning_rate": 5.176144950782296e-06,
      "loss": 0.3104,
      "step": 510
    },
    {
      "epoch": 1.657097288676236,
      "grad_norm": 0.1763445989704775,
      "learning_rate": 4.990727299726196e-06,
      "loss": 0.3123,
      "step": 520
    },
    {
      "epoch": 1.6889952153110048,
      "grad_norm": 0.1749063493157416,
      "learning_rate": 4.805322403931312e-06,
      "loss": 0.3125,
      "step": 530
    },
    {
      "epoch": 1.7208931419457736,
      "grad_norm": 0.16356744694700953,
      "learning_rate": 4.620185301068067e-06,
      "loss": 0.3117,
      "step": 540
    },
    {
      "epoch": 1.7527910685805423,
      "grad_norm": 0.17728652885848495,
      "learning_rate": 4.4355706604385905e-06,
      "loss": 0.3091,
      "step": 550
    },
    {
      "epoch": 1.784688995215311,
      "grad_norm": 0.15681863001918459,
      "learning_rate": 4.251732432660909e-06,
      "loss": 0.3074,
      "step": 560
    },
    {
      "epoch": 1.8165869218500799,
      "grad_norm": 0.15662044378658432,
      "learning_rate": 4.0689235003417425e-06,
      "loss": 0.3117,
      "step": 570
    },
    {
      "epoch": 1.8484848484848486,
      "grad_norm": 0.165028679777127,
      "learning_rate": 3.887395330218429e-06,
      "loss": 0.3088,
      "step": 580
    },
    {
      "epoch": 1.8803827751196174,
      "grad_norm": 0.15246983219476976,
      "learning_rate": 3.7073976272484647e-06,
      "loss": 0.3101,
      "step": 590
    },
    {
      "epoch": 1.912280701754386,
      "grad_norm": 0.15887139892428426,
      "learning_rate": 3.529177991122519e-06,
      "loss": 0.3106,
      "step": 600
    },
    {
      "epoch": 1.9441786283891547,
      "grad_norm": 0.1833365263259369,
      "learning_rate": 3.3529815756733773e-06,
      "loss": 0.3097,
      "step": 610
    },
    {
      "epoch": 1.9760765550239234,
      "grad_norm": 0.18558945586201675,
      "learning_rate": 3.1790507516493473e-06,
      "loss": 0.308,
      "step": 620
    },
    {
      "epoch": 2.006379585326954,
      "grad_norm": 0.17610108001200678,
      "learning_rate": 3.0076247733159846e-06,
      "loss": 0.3084,
      "step": 630
    },
    {
      "epoch": 2.0382775119617227,
      "grad_norm": 0.16117110116102118,
      "learning_rate": 2.8389394493447732e-06,
      "loss": 0.302,
      "step": 640
    },
    {
      "epoch": 2.0701754385964914,
      "grad_norm": 0.18050551147297098,
      "learning_rate": 2.6732268184414695e-06,
      "loss": 0.2986,
      "step": 650
    },
    {
      "epoch": 2.1020733652312598,
      "grad_norm": 0.15542898064111021,
      "learning_rate": 2.5107148301602825e-06,
      "loss": 0.3017,
      "step": 660
    },
    {
      "epoch": 2.1339712918660285,
      "grad_norm": 0.15732161869282196,
      "learning_rate": 2.3516270313430085e-06,
      "loss": 0.3014,
      "step": 670
    },
    {
      "epoch": 2.1658692185007973,
      "grad_norm": 0.15069055388397343,
      "learning_rate": 2.196182258614365e-06,
      "loss": 0.301,
      "step": 680
    },
    {
      "epoch": 2.197767145135566,
      "grad_norm": 0.14621726869728996,
      "learning_rate": 2.0445943373566178e-06,
      "loss": 0.2995,
      "step": 690
    },
    {
      "epoch": 2.229665071770335,
      "grad_norm": 0.15611592766595808,
      "learning_rate": 1.897071787577482e-06,
      "loss": 0.2993,
      "step": 700
    },
    {
      "epoch": 2.2615629984051036,
      "grad_norm": 0.16311648279426832,
      "learning_rate": 1.7538175370759797e-06,
      "loss": 0.2987,
      "step": 710
    },
    {
      "epoch": 2.2934609250398723,
      "grad_norm": 0.1518132702014371,
      "learning_rate": 1.6150286423007472e-06,
      "loss": 0.2988,
      "step": 720
    },
    {
      "epoch": 2.325358851674641,
      "grad_norm": 0.14623956587378453,
      "learning_rate": 1.480896017284843e-06,
      "loss": 0.2987,
      "step": 730
    },
    {
      "epoch": 2.35725677830941,
      "grad_norm": 0.1450791346600745,
      "learning_rate": 1.35160417102985e-06,
      "loss": 0.2975,
      "step": 740
    },
    {
      "epoch": 2.3891547049441786,
      "grad_norm": 0.14245338861258353,
      "learning_rate": 1.2273309537005801e-06,
      "loss": 0.2979,
      "step": 750
    },
    {
      "epoch": 2.4210526315789473,
      "grad_norm": 0.1407504601934374,
      "learning_rate": 1.1082473119794695e-06,
      "loss": 0.2995,
      "step": 760
    },
    {
      "epoch": 2.452950558213716,
      "grad_norm": 0.14848521642896026,
      "learning_rate": 9.945170539172171e-07,
      "loss": 0.2988,
      "step": 770
    },
    {
      "epoch": 2.484848484848485,
      "grad_norm": 0.14731141992992397,
      "learning_rate": 8.862966236031151e-07,
      "loss": 0.3018,
      "step": 780
    },
    {
      "epoch": 2.5167464114832536,
      "grad_norm": 0.14962017404876418,
      "learning_rate": 7.837348859650367e-07,
      "loss": 0.2993,
      "step": 790
    },
    {
      "epoch": 2.5486443381180224,
      "grad_norm": 0.17642521589827595,
      "learning_rate": 6.86972921995096e-07,
      "loss": 0.2976,
      "step": 800
    },
    {
      "epoch": 2.580542264752791,
      "grad_norm": 0.14352847715232434,
      "learning_rate": 5.961438346826792e-07,
      "loss": 0.2998,
      "step": 810
    },
    {
      "epoch": 2.61244019138756,
      "grad_norm": 0.13822751785341988,
      "learning_rate": 5.113725659217689e-07,
      "loss": 0.3006,
      "step": 820
    },
    {
      "epoch": 2.6443381180223287,
      "grad_norm": 0.15068234135380182,
      "learning_rate": 4.3277572464446247e-07,
      "loss": 0.3007,
      "step": 830
    },
    {
      "epoch": 2.6762360446570974,
      "grad_norm": 0.259420590664316,
      "learning_rate": 3.604614264170486e-07,
      "loss": 0.2978,
      "step": 840
    },
    {
      "epoch": 2.708133971291866,
      "grad_norm": 0.13787699442689774,
      "learning_rate": 2.945291447193399e-07,
      "loss": 0.2978,
      "step": 850
    },
    {
      "epoch": 2.740031897926635,
      "grad_norm": 0.13973454335478264,
      "learning_rate": 2.3506957411178478e-07,
      "loss": 0.2939,
      "step": 860
    },
    {
      "epoch": 2.7719298245614032,
      "grad_norm": 0.13749755308347614,
      "learning_rate": 1.821645054786414e-07,
      "loss": 0.3004,
      "step": 870
    },
    {
      "epoch": 2.803827751196172,
      "grad_norm": 0.15736317208256428,
      "learning_rate": 1.358867135187636e-07,
      "loss": 0.2998,
      "step": 880
    },
    {
      "epoch": 2.8357256778309408,
      "grad_norm": 0.1332226119002476,
      "learning_rate": 9.629985663881514e-08,
      "loss": 0.2995,
      "step": 890
    },
    {
      "epoch": 2.8676236044657095,
      "grad_norm": 0.14370883034646106,
      "learning_rate": 6.34583893865831e-08,
      "loss": 0.3009,
      "step": 900
    },
    {
      "epoch": 2.8995215311004783,
      "grad_norm": 0.1378486858698825,
      "learning_rate": 3.7407487544861565e-08,
      "loss": 0.3002,
      "step": 910
    },
    {
      "epoch": 2.931419457735247,
      "grad_norm": 0.13698177398845934,
      "learning_rate": 1.818298598893542e-08,
      "loss": 0.2967,
      "step": 920
    },
    {
      "epoch": 2.963317384370016,
      "grad_norm": 0.14400015647475053,
      "learning_rate": 5.81132939315443e-09,
      "loss": 0.2949,
      "step": 930
    },
    {
      "epoch": 2.9952153110047846,
      "grad_norm": 0.14118769704303924,
      "learning_rate": 3.095358543914184e-10,
      "loss": 0.2986,
      "step": 940
    },
    {
      "epoch": 3.0,
      "step": 942,
      "total_flos": 9110918151536640.0,
      "train_loss": 0.32638987993738455,
      "train_runtime": 18097.0555,
      "train_samples_per_second": 3.325,
      "train_steps_per_second": 0.052
    }
  ],
  "logging_steps": 10,
  "max_steps": 942,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 10000000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9110918151536640.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}