{
  "best_global_step": 2000,
  "best_metric": 0.09745433926582336,
  "best_model_checkpoint": "/content/drive/MyDrive/UC DAVIS/ECS289A-LLM/prm_project/checkpoints/checkpoint-2000",
  "epoch": 2.254948210439955,
  "eval_steps": 250,
  "global_step": 2000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01128089426725464,
      "grad_norm": 12.25,
      "learning_rate": 2.25e-05,
      "loss": 0.7492,
      "step": 10
    },
    {
      "epoch": 0.02256178853450928,
      "grad_norm": 10.25,
      "learning_rate": 4.75e-05,
      "loss": 0.6591,
      "step": 20
    },
    {
      "epoch": 0.033842682801763925,
      "grad_norm": 4.75,
      "learning_rate": 7.25e-05,
      "loss": 0.5451,
      "step": 30
    },
    {
      "epoch": 0.04512357706901856,
      "grad_norm": 4.40625,
      "learning_rate": 9.75e-05,
      "loss": 0.4973,
      "step": 40
    },
    {
      "epoch": 0.0564044713362732,
      "grad_norm": 5.90625,
      "learning_rate": 0.00012250000000000002,
      "loss": 0.4536,
      "step": 50
    },
    {
      "epoch": 0.06768536560352785,
      "grad_norm": 3.90625,
      "learning_rate": 0.0001475,
      "loss": 0.4242,
      "step": 60
    },
    {
      "epoch": 0.07896625987078248,
      "grad_norm": 1.734375,
      "learning_rate": 0.00017250000000000002,
      "loss": 0.409,
      "step": 70
    },
    {
      "epoch": 0.09024715413803712,
      "grad_norm": 3.390625,
      "learning_rate": 0.00019750000000000003,
      "loss": 0.3968,
      "step": 80
    },
    {
      "epoch": 0.10152804840529177,
      "grad_norm": 6.0,
      "learning_rate": 0.0001993025958930647,
      "loss": 0.3769,
      "step": 90
    },
    {
      "epoch": 0.1128089426725464,
      "grad_norm": 4.875,
      "learning_rate": 0.0001985277024409144,
      "loss": 0.3902,
      "step": 100
    },
    {
      "epoch": 0.12408983693980105,
      "grad_norm": 7.0,
      "learning_rate": 0.00019775280898876404,
      "loss": 0.3347,
      "step": 110
    },
    {
      "epoch": 0.1353707312070557,
      "grad_norm": 5.53125,
      "learning_rate": 0.00019697791553661373,
      "loss": 0.3346,
      "step": 120
    },
    {
      "epoch": 0.14665162547431032,
      "grad_norm": 6.34375,
      "learning_rate": 0.0001962030220844634,
      "loss": 0.3517,
      "step": 130
    },
    {
      "epoch": 0.15793251974156497,
      "grad_norm": 4.96875,
      "learning_rate": 0.00019542812863231307,
      "loss": 0.3614,
      "step": 140
    },
    {
      "epoch": 0.16921341400881962,
      "grad_norm": 9.25,
      "learning_rate": 0.00019465323518016273,
      "loss": 0.3342,
      "step": 150
    },
    {
      "epoch": 0.18049430827607424,
      "grad_norm": 1.78125,
      "learning_rate": 0.0001938783417280124,
      "loss": 0.3328,
      "step": 160
    },
    {
      "epoch": 0.1917752025433289,
      "grad_norm": 5.4375,
      "learning_rate": 0.0001931034482758621,
      "loss": 0.3238,
      "step": 170
    },
    {
      "epoch": 0.20305609681058354,
      "grad_norm": 1.015625,
      "learning_rate": 0.00019232855482371176,
      "loss": 0.3256,
      "step": 180
    },
    {
      "epoch": 0.21433699107783816,
      "grad_norm": 2.390625,
      "learning_rate": 0.0001915536613715614,
      "loss": 0.3487,
      "step": 190
    },
    {
      "epoch": 0.2256178853450928,
      "grad_norm": 8.5625,
      "learning_rate": 0.0001907787679194111,
      "loss": 0.3266,
      "step": 200
    },
    {
      "epoch": 0.23689877961234745,
      "grad_norm": 1.5703125,
      "learning_rate": 0.00019000387446726076,
      "loss": 0.316,
      "step": 210
    },
    {
      "epoch": 0.2481796738796021,
      "grad_norm": 1.2890625,
      "learning_rate": 0.00018922898101511045,
      "loss": 0.3118,
      "step": 220
    },
    {
      "epoch": 0.25946056814685675,
      "grad_norm": 2.3125,
      "learning_rate": 0.0001884540875629601,
      "loss": 0.3226,
      "step": 230
    },
    {
      "epoch": 0.2707414624141114,
      "grad_norm": 1.125,
      "learning_rate": 0.00018767919411080976,
      "loss": 0.3407,
      "step": 240
    },
    {
      "epoch": 0.282022356681366,
      "grad_norm": 12.6875,
      "learning_rate": 0.00018690430065865945,
      "loss": 0.3253,
      "step": 250
    },
    {
      "epoch": 0.282022356681366,
      "eval_loss": 0.35227087140083313,
      "eval_runtime": 3939.2496,
      "eval_samples_per_second": 29.703,
      "eval_steps_per_second": 0.928,
      "step": 250
    },
    {
      "epoch": 0.29330325094862064,
      "grad_norm": 11.8125,
      "learning_rate": 0.00018612940720650912,
      "loss": 0.3383,
      "step": 260
    },
    {
      "epoch": 0.3045841452158753,
      "grad_norm": 3.25,
      "learning_rate": 0.00018535451375435879,
      "loss": 0.3237,
      "step": 270
    },
    {
      "epoch": 0.31586503948312994,
      "grad_norm": 3.65625,
      "learning_rate": 0.00018457962030220845,
      "loss": 0.3091,
      "step": 280
    },
    {
      "epoch": 0.3271459337503846,
      "grad_norm": 7.9375,
      "learning_rate": 0.00018380472685005812,
      "loss": 0.3179,
      "step": 290
    },
    {
      "epoch": 0.33842682801763924,
      "grad_norm": 5.21875,
      "learning_rate": 0.0001830298333979078,
      "loss": 0.334,
      "step": 300
    },
    {
      "epoch": 0.34970772228489383,
      "grad_norm": 2.765625,
      "learning_rate": 0.00018225493994575745,
      "loss": 0.3292,
      "step": 310
    },
    {
      "epoch": 0.3609886165521485,
      "grad_norm": 8.375,
      "learning_rate": 0.00018148004649360715,
      "loss": 0.3083,
      "step": 320
    },
    {
      "epoch": 0.3722695108194031,
      "grad_norm": 2.046875,
      "learning_rate": 0.0001807051530414568,
      "loss": 0.2957,
      "step": 330
    },
    {
      "epoch": 0.3835504050866578,
      "grad_norm": 2.34375,
      "learning_rate": 0.00017993025958930648,
      "loss": 0.3152,
      "step": 340
    },
    {
      "epoch": 0.3948312993539124,
      "grad_norm": 4.375,
      "learning_rate": 0.00017915536613715614,
      "loss": 0.2846,
      "step": 350
    },
    {
      "epoch": 0.4061121936211671,
      "grad_norm": 9.1875,
      "learning_rate": 0.0001783804726850058,
      "loss": 0.2714,
      "step": 360
    },
    {
      "epoch": 0.4173930878884217,
      "grad_norm": 2.46875,
      "learning_rate": 0.0001776055792328555,
      "loss": 0.2834,
      "step": 370
    },
    {
      "epoch": 0.4286739821556763,
      "grad_norm": 2.625,
      "learning_rate": 0.00017683068578070517,
      "loss": 0.2854,
      "step": 380
    },
    {
      "epoch": 0.43995487642293096,
      "grad_norm": 1.5546875,
      "learning_rate": 0.00017605579232855484,
      "loss": 0.276,
      "step": 390
    },
    {
      "epoch": 0.4512357706901856,
      "grad_norm": 10.375,
      "learning_rate": 0.0001752808988764045,
      "loss": 0.2842,
      "step": 400
    },
    {
      "epoch": 0.46251666495744026,
      "grad_norm": 9.75,
      "learning_rate": 0.00017450600542425417,
      "loss": 0.3073,
      "step": 410
    },
    {
      "epoch": 0.4737975592246949,
      "grad_norm": 8.875,
      "learning_rate": 0.00017373111197210386,
      "loss": 0.2636,
      "step": 420
    },
    {
      "epoch": 0.48507845349194956,
      "grad_norm": 3.234375,
      "learning_rate": 0.0001729562185199535,
      "loss": 0.2467,
      "step": 430
    },
    {
      "epoch": 0.4963593477592042,
      "grad_norm": 1.3671875,
      "learning_rate": 0.00017218132506780317,
      "loss": 0.277,
      "step": 440
    },
    {
      "epoch": 0.5076402420264589,
      "grad_norm": 5.9375,
      "learning_rate": 0.00017140643161565286,
      "loss": 0.2831,
      "step": 450
    },
    {
      "epoch": 0.5189211362937135,
      "grad_norm": 1.7421875,
      "learning_rate": 0.00017063153816350253,
      "loss": 0.2432,
      "step": 460
    },
    {
      "epoch": 0.5302020305609682,
      "grad_norm": 2.6875,
      "learning_rate": 0.0001698566447113522,
      "loss": 0.2442,
      "step": 470
    },
    {
      "epoch": 0.5414829248282228,
      "grad_norm": 2.921875,
      "learning_rate": 0.00016908175125920186,
      "loss": 0.2655,
      "step": 480
    },
    {
      "epoch": 0.5527638190954773,
      "grad_norm": 1.6796875,
      "learning_rate": 0.00016830685780705153,
      "loss": 0.2463,
      "step": 490
    },
    {
      "epoch": 0.564044713362732,
      "grad_norm": 10.75,
      "learning_rate": 0.00016753196435490122,
      "loss": 0.2679,
      "step": 500
    },
    {
      "epoch": 0.564044713362732,
      "eval_loss": 0.24632702767848969,
      "eval_runtime": 3940.4364,
      "eval_samples_per_second": 29.694,
      "eval_steps_per_second": 0.928,
      "step": 500
    },
    {
      "epoch": 0.5753256076299866,
      "grad_norm": 4.53125,
      "learning_rate": 0.0001667570709027509,
      "loss": 0.27,
      "step": 510
    },
    {
      "epoch": 0.5866065018972413,
      "grad_norm": 3.203125,
      "learning_rate": 0.00016598217745060053,
      "loss": 0.2807,
      "step": 520
    },
    {
      "epoch": 0.5978873961644959,
      "grad_norm": 7.75,
      "learning_rate": 0.00016520728399845022,
      "loss": 0.2615,
      "step": 530
    },
    {
      "epoch": 0.6091682904317506,
      "grad_norm": 4.78125,
      "learning_rate": 0.0001644323905462999,
      "loss": 0.2567,
      "step": 540
    },
    {
      "epoch": 0.6204491846990052,
      "grad_norm": 3.375,
      "learning_rate": 0.00016365749709414956,
      "loss": 0.2562,
      "step": 550
    },
    {
      "epoch": 0.6317300789662599,
      "grad_norm": 1.8828125,
      "learning_rate": 0.00016288260364199922,
      "loss": 0.2389,
      "step": 560
    },
    {
      "epoch": 0.6430109732335145,
      "grad_norm": 2.46875,
      "learning_rate": 0.0001621077101898489,
      "loss": 0.2331,
      "step": 570
    },
    {
      "epoch": 0.6542918675007692,
      "grad_norm": 3.03125,
      "learning_rate": 0.00016133281673769858,
      "loss": 0.2522,
      "step": 580
    },
    {
      "epoch": 0.6655727617680238,
      "grad_norm": 1.28125,
      "learning_rate": 0.00016055792328554825,
      "loss": 0.2196,
      "step": 590
    },
    {
      "epoch": 0.6768536560352785,
      "grad_norm": 3.1875,
      "learning_rate": 0.00015978302983339792,
      "loss": 0.2523,
      "step": 600
    },
    {
      "epoch": 0.6881345503025331,
      "grad_norm": 3.96875,
      "learning_rate": 0.00015900813638124758,
      "loss": 0.263,
      "step": 610
    },
    {
      "epoch": 0.6994154445697877,
      "grad_norm": 3.21875,
      "learning_rate": 0.00015823324292909725,
      "loss": 0.2148,
      "step": 620
    },
    {
      "epoch": 0.7106963388370423,
      "grad_norm": 1.6796875,
      "learning_rate": 0.00015745834947694694,
      "loss": 0.22,
      "step": 630
    },
    {
      "epoch": 0.721977233104297,
      "grad_norm": 4.34375,
      "learning_rate": 0.00015668345602479658,
      "loss": 0.2439,
      "step": 640
    },
    {
      "epoch": 0.7332581273715516,
      "grad_norm": 1.484375,
      "learning_rate": 0.00015590856257264628,
      "loss": 0.2244,
      "step": 650
    },
    {
      "epoch": 0.7445390216388063,
      "grad_norm": 1.6015625,
      "learning_rate": 0.00015513366912049594,
      "loss": 0.1983,
      "step": 660
    },
    {
      "epoch": 0.7558199159060609,
      "grad_norm": 6.65625,
      "learning_rate": 0.0001543587756683456,
      "loss": 0.2232,
      "step": 670
    },
    {
      "epoch": 0.7671008101733156,
      "grad_norm": 1.03125,
      "learning_rate": 0.00015358388221619528,
      "loss": 0.2558,
      "step": 680
    },
    {
      "epoch": 0.7783817044405702,
      "grad_norm": 4.40625,
      "learning_rate": 0.00015280898876404494,
      "loss": 0.249,
      "step": 690
    },
    {
      "epoch": 0.7896625987078248,
      "grad_norm": 3.4375,
      "learning_rate": 0.00015203409531189464,
      "loss": 0.2103,
      "step": 700
    },
    {
      "epoch": 0.8009434929750795,
      "grad_norm": 1.640625,
      "learning_rate": 0.0001512592018597443,
      "loss": 0.2328,
      "step": 710
    },
    {
      "epoch": 0.8122243872423341,
      "grad_norm": 2.421875,
      "learning_rate": 0.00015048430840759394,
      "loss": 0.2327,
      "step": 720
    },
    {
      "epoch": 0.8235052815095888,
      "grad_norm": 7.1875,
      "learning_rate": 0.00014970941495544364,
      "loss": 0.2027,
      "step": 730
    },
    {
      "epoch": 0.8347861757768434,
      "grad_norm": 4.5,
      "learning_rate": 0.0001489345215032933,
      "loss": 0.2416,
      "step": 740
    },
    {
      "epoch": 0.8460670700440981,
      "grad_norm": 2.078125,
      "learning_rate": 0.000148159628051143,
      "loss": 0.2004,
      "step": 750
    },
    {
      "epoch": 0.8460670700440981,
      "eval_loss": 0.1825282871723175,
      "eval_runtime": 3937.2667,
      "eval_samples_per_second": 29.718,
      "eval_steps_per_second": 0.929,
      "step": 750
    },
    {
      "epoch": 0.8573479643113526,
      "grad_norm": 1.734375,
      "learning_rate": 0.00014738473459899264,
      "loss": 0.2186,
      "step": 760
    },
    {
      "epoch": 0.8686288585786073,
      "grad_norm": 3.25,
      "learning_rate": 0.0001466098411468423,
      "loss": 0.2004,
      "step": 770
    },
    {
      "epoch": 0.8799097528458619,
      "grad_norm": 1.9140625,
      "learning_rate": 0.000145834947694692,
      "loss": 0.2121,
      "step": 780
    },
    {
      "epoch": 0.8911906471131166,
      "grad_norm": 3.125,
      "learning_rate": 0.00014506005424254166,
      "loss": 0.2116,
      "step": 790
    },
    {
      "epoch": 0.9024715413803712,
      "grad_norm": 9.75,
      "learning_rate": 0.00014428516079039133,
      "loss": 0.2407,
      "step": 800
    },
    {
      "epoch": 0.9137524356476259,
      "grad_norm": 3.5625,
      "learning_rate": 0.000143510267338241,
      "loss": 0.2077,
      "step": 810
    },
    {
      "epoch": 0.9250333299148805,
      "grad_norm": 5.25,
      "learning_rate": 0.00014273537388609066,
      "loss": 0.2267,
      "step": 820
    },
    {
      "epoch": 0.9363142241821352,
      "grad_norm": 2.5625,
      "learning_rate": 0.00014196048043394036,
      "loss": 0.1969,
      "step": 830
    },
    {
      "epoch": 0.9475951184493898,
      "grad_norm": 1.53125,
      "learning_rate": 0.00014118558698179,
      "loss": 0.1998,
      "step": 840
    },
    {
      "epoch": 0.9588760127166445,
      "grad_norm": 1.0390625,
      "learning_rate": 0.0001404106935296397,
      "loss": 0.2008,
      "step": 850
    },
    {
      "epoch": 0.9701569069838991,
      "grad_norm": 6.0,
      "learning_rate": 0.00013963580007748935,
      "loss": 0.1914,
      "step": 860
    },
    {
      "epoch": 0.9814378012511538,
      "grad_norm": 1.4609375,
      "learning_rate": 0.00013886090662533902,
      "loss": 0.191,
      "step": 870
    },
    {
      "epoch": 0.9927186955184084,
      "grad_norm": 2.390625,
      "learning_rate": 0.0001380860131731887,
      "loss": 0.1816,
      "step": 880
    },
    {
      "epoch": 1.0033842682801764,
      "grad_norm": 3.859375,
      "learning_rate": 0.00013731111972103835,
      "loss": 0.2074,
      "step": 890
    },
    {
      "epoch": 1.014665162547431,
      "grad_norm": 4.28125,
      "learning_rate": 0.00013653622626888802,
      "loss": 0.1984,
      "step": 900
    },
    {
      "epoch": 1.0259460568146856,
      "grad_norm": 1.3203125,
      "learning_rate": 0.00013576133281673771,
      "loss": 0.1648,
      "step": 910
    },
    {
      "epoch": 1.0372269510819403,
      "grad_norm": 3.5,
      "learning_rate": 0.00013498643936458738,
      "loss": 0.192,
      "step": 920
    },
    {
      "epoch": 1.048507845349195,
      "grad_norm": 6.53125,
      "learning_rate": 0.00013421154591243705,
      "loss": 0.1522,
      "step": 930
    },
    {
      "epoch": 1.0597887396164496,
      "grad_norm": 4.625,
      "learning_rate": 0.00013343665246028671,
      "loss": 0.1715,
      "step": 940
    },
    {
      "epoch": 1.0710696338837042,
      "grad_norm": 3.171875,
      "learning_rate": 0.00013266175900813638,
      "loss": 0.1593,
      "step": 950
    },
    {
      "epoch": 1.082350528150959,
      "grad_norm": 3.65625,
      "learning_rate": 0.00013188686555598607,
      "loss": 0.1439,
      "step": 960
    },
    {
      "epoch": 1.0936314224182135,
      "grad_norm": 2.140625,
      "learning_rate": 0.0001311119721038357,
      "loss": 0.1328,
      "step": 970
    },
    {
      "epoch": 1.1049123166854682,
      "grad_norm": 2.359375,
      "learning_rate": 0.0001303370786516854,
      "loss": 0.1763,
      "step": 980
    },
    {
      "epoch": 1.1161932109527228,
      "grad_norm": 1.484375,
      "learning_rate": 0.00012956218519953507,
      "loss": 0.1689,
      "step": 990
    },
    {
      "epoch": 1.1274741052199775,
      "grad_norm": 1.59375,
      "learning_rate": 0.00012878729174738474,
      "loss": 0.1391,
      "step": 1000
    },
    {
      "epoch": 1.1274741052199775,
      "eval_loss": 0.16099952161312103,
      "eval_runtime": 3944.2805,
      "eval_samples_per_second": 29.665,
      "eval_steps_per_second": 0.927,
      "step": 1000
    },
    {
      "epoch": 1.1387549994872321,
      "grad_norm": 2.375,
      "learning_rate": 0.0001280123982952344,
      "loss": 0.1669,
      "step": 1010
    },
    {
      "epoch": 1.1500358937544868,
      "grad_norm": 0.89453125,
      "learning_rate": 0.00012723750484308407,
      "loss": 0.1542,
      "step": 1020
    },
    {
      "epoch": 1.1613167880217414,
      "grad_norm": 2.90625,
      "learning_rate": 0.00012646261139093377,
      "loss": 0.1566,
      "step": 1030
    },
    {
      "epoch": 1.172597682288996,
      "grad_norm": 2.359375,
      "learning_rate": 0.00012568771793878343,
      "loss": 0.1865,
      "step": 1040
    },
    {
      "epoch": 1.1838785765562507,
      "grad_norm": 1.6484375,
      "learning_rate": 0.00012491282448663307,
      "loss": 0.1729,
      "step": 1050
    },
    {
      "epoch": 1.1951594708235054,
      "grad_norm": 3.546875,
      "learning_rate": 0.00012413793103448277,
      "loss": 0.1755,
      "step": 1060
    },
    {
      "epoch": 1.20644036509076,
      "grad_norm": 3.390625,
      "learning_rate": 0.00012336303758233243,
      "loss": 0.1397,
      "step": 1070
    },
    {
      "epoch": 1.2177212593580147,
      "grad_norm": 2.125,
      "learning_rate": 0.00012258814413018213,
      "loss": 0.1583,
      "step": 1080
    },
    {
      "epoch": 1.2290021536252693,
      "grad_norm": 3.5,
      "learning_rate": 0.00012181325067803178,
      "loss": 0.1782,
      "step": 1090
    },
    {
      "epoch": 1.2402830478925237,
      "grad_norm": 0.99609375,
      "learning_rate": 0.00012103835722588143,
      "loss": 0.1693,
      "step": 1100
    },
    {
      "epoch": 1.2515639421597786,
      "grad_norm": 3.296875,
      "learning_rate": 0.00012026346377373113,
      "loss": 0.1591,
      "step": 1110
    },
    {
      "epoch": 1.262844836427033,
      "grad_norm": 2.375,
      "learning_rate": 0.00011948857032158078,
      "loss": 0.1741,
      "step": 1120
    },
    {
      "epoch": 1.2741257306942877,
      "grad_norm": 1.2421875,
      "learning_rate": 0.00011871367686943047,
      "loss": 0.1446,
      "step": 1130
    },
    {
      "epoch": 1.2854066249615423,
      "grad_norm": 1.875,
      "learning_rate": 0.00011793878341728013,
      "loss": 0.1627,
      "step": 1140
    },
    {
      "epoch": 1.296687519228797,
      "grad_norm": 2.484375,
      "learning_rate": 0.00011716388996512979,
      "loss": 0.1509,
      "step": 1150
    },
    {
      "epoch": 1.3079684134960516,
      "grad_norm": 1.6875,
      "learning_rate": 0.00011638899651297947,
      "loss": 0.1545,
      "step": 1160
    },
    {
      "epoch": 1.3192493077633063,
      "grad_norm": 4.9375,
      "learning_rate": 0.00011561410306082914,
      "loss": 0.1503,
      "step": 1170
    },
    {
      "epoch": 1.330530202030561,
      "grad_norm": 1.9375,
      "learning_rate": 0.00011483920960867882,
      "loss": 0.1597,
      "step": 1180
    },
    {
      "epoch": 1.3418110962978156,
      "grad_norm": 1.171875,
      "learning_rate": 0.00011406431615652849,
      "loss": 0.1547,
      "step": 1190
    },
    {
      "epoch": 1.3530919905650702,
      "grad_norm": 2.34375,
      "learning_rate": 0.00011328942270437815,
      "loss": 0.1701,
      "step": 1200
    },
    {
      "epoch": 1.3643728848323249,
      "grad_norm": 1.0625,
      "learning_rate": 0.00011251452925222783,
      "loss": 0.1655,
      "step": 1210
    },
    {
      "epoch": 1.3756537790995795,
      "grad_norm": 3.234375,
      "learning_rate": 0.00011173963580007749,
      "loss": 0.1526,
      "step": 1220
    },
    {
      "epoch": 1.3869346733668342,
      "grad_norm": 1.2890625,
      "learning_rate": 0.00011096474234792718,
      "loss": 0.1539,
      "step": 1230
    },
    {
      "epoch": 1.3982155676340888,
      "grad_norm": 3.640625,
      "learning_rate": 0.00011018984889577683,
      "loss": 0.1421,
      "step": 1240
    },
    {
      "epoch": 1.4094964619013435,
      "grad_norm": 3.046875,
      "learning_rate": 0.0001094149554436265,
      "loss": 0.1549,
      "step": 1250
    },
    {
      "epoch": 1.4094964619013435,
      "eval_loss": 0.13532690703868866,
      "eval_runtime": 3940.6219,
      "eval_samples_per_second": 29.693,
      "eval_steps_per_second": 0.928,
      "step": 1250
    },
    {
      "epoch": 1.4207773561685981,
      "grad_norm": 1.765625,
      "learning_rate": 0.00010864006199147618,
      "loss": 0.1361,
      "step": 1260
    },
    {
      "epoch": 1.4320582504358528,
      "grad_norm": 2.171875,
      "learning_rate": 0.00010786516853932584,
      "loss": 0.1735,
      "step": 1270
    },
    {
      "epoch": 1.4433391447031074,
      "grad_norm": 1.40625,
      "learning_rate": 0.00010709027508717552,
      "loss": 0.1747,
      "step": 1280
    },
    {
      "epoch": 1.454620038970362,
      "grad_norm": 1.5078125,
      "learning_rate": 0.00010631538163502519,
      "loss": 0.1461,
      "step": 1290
    },
    {
      "epoch": 1.4659009332376167,
      "grad_norm": 2.453125,
      "learning_rate": 0.00010554048818287486,
      "loss": 0.1701,
      "step": 1300
    },
    {
      "epoch": 1.4771818275048714,
      "grad_norm": 3.046875,
      "learning_rate": 0.00010476559473072454,
      "loss": 0.1473,
      "step": 1310
    },
    {
      "epoch": 1.488462721772126,
      "grad_norm": 2.140625,
      "learning_rate": 0.0001039907012785742,
      "loss": 0.1292,
      "step": 1320
    },
    {
      "epoch": 1.4997436160393804,
      "grad_norm": 2.90625,
      "learning_rate": 0.00010321580782642388,
      "loss": 0.1572,
      "step": 1330
    },
    {
      "epoch": 1.511024510306635,
      "grad_norm": 2.15625,
      "learning_rate": 0.00010244091437427355,
      "loss": 0.1446,
      "step": 1340
    },
    {
      "epoch": 1.5223054045738897,
      "grad_norm": 3.40625,
      "learning_rate": 0.0001016660209221232,
      "loss": 0.1469,
      "step": 1350
    },
    {
      "epoch": 1.5335862988411444,
      "grad_norm": 1.109375,
      "learning_rate": 0.00010089112746997288,
      "loss": 0.1774,
      "step": 1360
    },
    {
      "epoch": 1.544867193108399,
      "grad_norm": 1.984375,
      "learning_rate": 0.00010011623401782255,
      "loss": 0.1371,
      "step": 1370
    },
    {
      "epoch": 1.5561480873756537,
      "grad_norm": 2.15625,
      "learning_rate": 9.934134056567223e-05,
      "loss": 0.1459,
      "step": 1380
    },
    {
      "epoch": 1.5674289816429083,
      "grad_norm": 1.6953125,
      "learning_rate": 9.85664471135219e-05,
      "loss": 0.138,
      "step": 1390
    },
    {
      "epoch": 1.578709875910163,
      "grad_norm": 1.421875,
      "learning_rate": 9.779155366137158e-05,
      "loss": 0.1395,
      "step": 1400
    },
    {
      "epoch": 1.5899907701774176,
      "grad_norm": 3.0,
      "learning_rate": 9.701666020922123e-05,
      "loss": 0.1526,
      "step": 1410
    },
    {
      "epoch": 1.6012716644446723,
      "grad_norm": 1.1953125,
      "learning_rate": 9.624176675707091e-05,
      "loss": 0.1612,
      "step": 1420
    },
    {
      "epoch": 1.612552558711927,
      "grad_norm": 1.1171875,
      "learning_rate": 9.546687330492058e-05,
      "loss": 0.1355,
      "step": 1430
    },
    {
      "epoch": 1.6238334529791816,
      "grad_norm": 1.171875,
      "learning_rate": 9.469197985277026e-05,
      "loss": 0.1607,
      "step": 1440
    },
    {
      "epoch": 1.6351143472464362,
      "grad_norm": 3.078125,
      "learning_rate": 9.391708640061992e-05,
      "loss": 0.1562,
      "step": 1450
    },
    {
      "epoch": 1.6463952415136909,
      "grad_norm": 1.3359375,
      "learning_rate": 9.314219294846959e-05,
      "loss": 0.146,
      "step": 1460
    },
    {
      "epoch": 1.6576761357809455,
      "grad_norm": 2.125,
      "learning_rate": 9.236729949631926e-05,
      "loss": 0.1405,
      "step": 1470
    },
    {
      "epoch": 1.6689570300482002,
      "grad_norm": 1.5859375,
      "learning_rate": 9.159240604416894e-05,
      "loss": 0.137,
      "step": 1480
    },
    {
      "epoch": 1.6802379243154548,
      "grad_norm": 2.984375,
      "learning_rate": 9.08175125920186e-05,
      "loss": 0.1674,
      "step": 1490
    },
    {
      "epoch": 1.6915188185827095,
      "grad_norm": 2.296875,
      "learning_rate": 9.004261913986827e-05,
      "loss": 0.1452,
      "step": 1500
    },
    {
      "epoch": 1.6915188185827095,
      "eval_loss": 0.1216062381863594,
      "eval_runtime": 3942.5719,
      "eval_samples_per_second": 29.678,
      "eval_steps_per_second": 0.928,
      "step": 1500
    },
    {
      "epoch": 1.7027997128499641,
      "grad_norm": 2.625,
      "learning_rate": 8.926772568771794e-05,
      "loss": 0.1491,
      "step": 1510
    },
    {
      "epoch": 1.7140806071172188,
      "grad_norm": 2.75,
      "learning_rate": 8.849283223556762e-05,
      "loss": 0.1304,
      "step": 1520
    },
    {
      "epoch": 1.7253615013844734,
      "grad_norm": 1.8359375,
      "learning_rate": 8.771793878341728e-05,
      "loss": 0.1367,
      "step": 1530
    },
    {
      "epoch": 1.736642395651728,
      "grad_norm": 2.453125,
      "learning_rate": 8.694304533126696e-05,
      "loss": 0.1321,
      "step": 1540
    },
    {
      "epoch": 1.7479232899189827,
      "grad_norm": 2.53125,
      "learning_rate": 8.616815187911662e-05,
      "loss": 0.1413,
      "step": 1550
    },
    {
      "epoch": 1.7592041841862374,
      "grad_norm": 2.296875,
      "learning_rate": 8.53932584269663e-05,
      "loss": 0.1593,
      "step": 1560
    },
    {
      "epoch": 1.770485078453492,
      "grad_norm": 3.90625,
      "learning_rate": 8.461836497481596e-05,
      "loss": 0.1472,
      "step": 1570
    },
    {
      "epoch": 1.7817659727207467,
      "grad_norm": 1.703125,
      "learning_rate": 8.384347152266564e-05,
      "loss": 0.144,
      "step": 1580
    },
    {
      "epoch": 1.7930468669880013,
      "grad_norm": 1.34375,
      "learning_rate": 8.306857807051531e-05,
      "loss": 0.1431,
      "step": 1590
    },
    {
      "epoch": 1.804327761255256,
      "grad_norm": 4.5,
      "learning_rate": 8.229368461836498e-05,
      "loss": 0.1455,
      "step": 1600
    },
    {
      "epoch": 1.8156086555225106,
      "grad_norm": 2.015625,
      "learning_rate": 8.151879116621464e-05,
      "loss": 0.1142,
      "step": 1610
    },
    {
      "epoch": 1.8268895497897653,
      "grad_norm": 2.25,
      "learning_rate": 8.074389771406432e-05,
      "loss": 0.1554,
      "step": 1620
    },
    {
      "epoch": 1.83817044405702,
      "grad_norm": 1.6328125,
      "learning_rate": 7.996900426191399e-05,
      "loss": 0.1376,
      "step": 1630
    },
    {
      "epoch": 1.8494513383242746,
      "grad_norm": 1.5234375,
      "learning_rate": 7.919411080976367e-05,
      "loss": 0.1308,
      "step": 1640
    },
    {
      "epoch": 1.8607322325915292,
      "grad_norm": 1.78125,
      "learning_rate": 7.841921735761332e-05,
      "loss": 0.1599,
      "step": 1650
    },
    {
      "epoch": 1.8720131268587838,
      "grad_norm": 1.5625,
      "learning_rate": 7.7644323905463e-05,
      "loss": 0.1291,
      "step": 1660
    },
    {
      "epoch": 1.8832940211260385,
      "grad_norm": 2.203125,
      "learning_rate": 7.686943045331267e-05,
      "loss": 0.1368,
      "step": 1670
    },
    {
      "epoch": 1.8945749153932931,
      "grad_norm": 2.828125,
      "learning_rate": 7.609453700116235e-05,
      "loss": 0.1585,
      "step": 1680
    },
    {
      "epoch": 1.9058558096605478,
      "grad_norm": 4.4375,
      "learning_rate": 7.531964354901202e-05,
      "loss": 0.1426,
      "step": 1690
    },
    {
      "epoch": 1.9171367039278022,
      "grad_norm": 1.890625,
      "learning_rate": 7.454475009686168e-05,
      "loss": 0.1578,
      "step": 1700
    },
    {
      "epoch": 1.9284175981950569,
      "grad_norm": 1.8984375,
      "learning_rate": 7.376985664471135e-05,
      "loss": 0.1251,
      "step": 1710
    },
    {
      "epoch": 1.9396984924623115,
      "grad_norm": 1.953125,
      "learning_rate": 7.299496319256103e-05,
      "loss": 0.1229,
      "step": 1720
    },
    {
      "epoch": 1.9509793867295662,
      "grad_norm": 1.640625,
      "learning_rate": 7.22200697404107e-05,
      "loss": 0.1122,
      "step": 1730
    },
    {
      "epoch": 1.9622602809968208,
      "grad_norm": 1.2734375,
      "learning_rate": 7.144517628826036e-05,
      "loss": 0.144,
      "step": 1740
    },
    {
      "epoch": 1.9735411752640755,
      "grad_norm": 0.921875,
      "learning_rate": 7.067028283611004e-05,
      "loss": 0.1526,
      "step": 1750
    },
    {
      "epoch": 1.9735411752640755,
      "eval_loss": 0.10789535939693451,
      "eval_runtime": 3942.9784,
      "eval_samples_per_second": 29.675,
      "eval_steps_per_second": 0.927,
      "step": 1750
    },
    {
      "epoch": 1.9848220695313301,
      "grad_norm": 2.125,
      "learning_rate": 6.989538938395971e-05,
      "loss": 0.1409,
      "step": 1760
    },
    {
      "epoch": 1.9961029637985848,
      "grad_norm": 0.75,
      "learning_rate": 6.912049593180939e-05,
      "loss": 0.1317,
      "step": 1770
    },
    {
      "epoch": 2.0067685365603527,
      "grad_norm": 4.03125,
      "learning_rate": 6.834560247965905e-05,
      "loss": 0.1106,
      "step": 1780
    },
    {
      "epoch": 2.0180494308276073,
      "grad_norm": 2.9375,
      "learning_rate": 6.757070902750872e-05,
      "loss": 0.0853,
      "step": 1790
    },
    {
      "epoch": 2.029330325094862,
      "grad_norm": 1.078125,
      "learning_rate": 6.679581557535839e-05,
      "loss": 0.0947,
      "step": 1800
    },
    {
      "epoch": 2.0406112193621166,
      "grad_norm": 1.734375,
      "learning_rate": 6.602092212320807e-05,
      "loss": 0.0886,
      "step": 1810
    },
    {
      "epoch": 2.0518921136293713,
      "grad_norm": 1.21875,
      "learning_rate": 6.524602867105773e-05,
      "loss": 0.1107,
      "step": 1820
    },
    {
      "epoch": 2.063173007896626,
      "grad_norm": 0.93359375,
      "learning_rate": 6.447113521890741e-05,
      "loss": 0.085,
      "step": 1830
    },
    {
      "epoch": 2.0744539021638806,
      "grad_norm": 1.5078125,
      "learning_rate": 6.369624176675707e-05,
      "loss": 0.087,
      "step": 1840
    },
    {
      "epoch": 2.0857347964311352,
      "grad_norm": 2.21875,
      "learning_rate": 6.292134831460675e-05,
      "loss": 0.1305,
      "step": 1850
    },
    {
      "epoch": 2.09701569069839,
      "grad_norm": 1.6953125,
      "learning_rate": 6.214645486245641e-05,
      "loss": 0.1178,
      "step": 1860
    },
    {
      "epoch": 2.1082965849656445,
      "grad_norm": 2.078125,
      "learning_rate": 6.13715614103061e-05,
      "loss": 0.1124,
      "step": 1870
    },
    {
      "epoch": 2.119577479232899,
      "grad_norm": 0.81640625,
      "learning_rate": 6.059666795815576e-05,
      "loss": 0.1044,
      "step": 1880
    },
    {
      "epoch": 2.130858373500154,
      "grad_norm": 1.8828125,
      "learning_rate": 5.982177450600542e-05,
      "loss": 0.0873,
      "step": 1890
    },
    {
      "epoch": 2.1421392677674085,
      "grad_norm": 1.65625,
      "learning_rate": 5.9046881053855094e-05,
      "loss": 0.1124,
      "step": 1900
    },
    {
      "epoch": 2.153420162034663,
      "grad_norm": 1.4921875,
      "learning_rate": 5.827198760170477e-05,
      "loss": 0.0943,
      "step": 1910
    },
    {
      "epoch": 2.164701056301918,
      "grad_norm": 1.15625,
      "learning_rate": 5.749709414955444e-05,
      "loss": 0.0892,
      "step": 1920
    },
    {
      "epoch": 2.1759819505691724,
      "grad_norm": 2.328125,
      "learning_rate": 5.672220069740411e-05,
      "loss": 0.1028,
      "step": 1930
    },
    {
      "epoch": 2.187262844836427,
      "grad_norm": 1.359375,
      "learning_rate": 5.594730724525378e-05,
      "loss": 0.1033,
      "step": 1940
    },
    {
      "epoch": 2.1985437391036817,
      "grad_norm": 1.1171875,
      "learning_rate": 5.517241379310345e-05,
      "loss": 0.1093,
      "step": 1950
    },
    {
      "epoch": 2.2098246333709364,
      "grad_norm": 1.515625,
      "learning_rate": 5.439752034095312e-05,
      "loss": 0.1123,
      "step": 1960
    },
    {
      "epoch": 2.221105527638191,
      "grad_norm": 1.765625,
      "learning_rate": 5.362262688880279e-05,
      "loss": 0.1139,
      "step": 1970
    },
    {
      "epoch": 2.2323864219054457,
      "grad_norm": 1.7890625,
      "learning_rate": 5.284773343665246e-05,
      "loss": 0.0987,
      "step": 1980
    },
    {
      "epoch": 2.2436673161727003,
      "grad_norm": 2.140625,
      "learning_rate": 5.207283998450213e-05,
      "loss": 0.1008,
      "step": 1990
    },
    {
      "epoch": 2.254948210439955,
      "grad_norm": 2.3125,
      "learning_rate": 5.1297946532351806e-05,
      "loss": 0.0925,
      "step": 2000
    },
    {
      "epoch": 2.254948210439955,
      "eval_loss": 0.09745433926582336,
      "eval_runtime": 3942.1232,
      "eval_samples_per_second": 29.682,
      "eval_steps_per_second": 0.928,
      "step": 2000
    }
  ],
  "logging_steps": 10,
  "max_steps": 2661,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 12,
  "trial_name": null,
  "trial_params": null
}