{ "best_global_step": 2000, "best_metric": 0.09745433926582336, "best_model_checkpoint": "/content/drive/MyDrive/UC DAVIS/ECS289A-LLM/prm_project/checkpoints/checkpoint-2000", "epoch": 2.254948210439955, "eval_steps": 250, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01128089426725464, "grad_norm": 12.25, "learning_rate": 2.25e-05, "loss": 0.7492, "step": 10 }, { "epoch": 0.02256178853450928, "grad_norm": 10.25, "learning_rate": 4.75e-05, "loss": 0.6591, "step": 20 }, { "epoch": 0.033842682801763925, "grad_norm": 4.75, "learning_rate": 7.25e-05, "loss": 0.5451, "step": 30 }, { "epoch": 0.04512357706901856, "grad_norm": 4.40625, "learning_rate": 9.75e-05, "loss": 0.4973, "step": 40 }, { "epoch": 0.0564044713362732, "grad_norm": 5.90625, "learning_rate": 0.00012250000000000002, "loss": 0.4536, "step": 50 }, { "epoch": 0.06768536560352785, "grad_norm": 3.90625, "learning_rate": 0.0001475, "loss": 0.4242, "step": 60 }, { "epoch": 0.07896625987078248, "grad_norm": 1.734375, "learning_rate": 0.00017250000000000002, "loss": 0.409, "step": 70 }, { "epoch": 0.09024715413803712, "grad_norm": 3.390625, "learning_rate": 0.00019750000000000003, "loss": 0.3968, "step": 80 }, { "epoch": 0.10152804840529177, "grad_norm": 6.0, "learning_rate": 0.0001993025958930647, "loss": 0.3769, "step": 90 }, { "epoch": 0.1128089426725464, "grad_norm": 4.875, "learning_rate": 0.0001985277024409144, "loss": 0.3902, "step": 100 }, { "epoch": 0.12408983693980105, "grad_norm": 7.0, "learning_rate": 0.00019775280898876404, "loss": 0.3347, "step": 110 }, { "epoch": 0.1353707312070557, "grad_norm": 5.53125, "learning_rate": 0.00019697791553661373, "loss": 0.3346, "step": 120 }, { "epoch": 0.14665162547431032, "grad_norm": 6.34375, "learning_rate": 0.0001962030220844634, "loss": 0.3517, "step": 130 }, { "epoch": 0.15793251974156497, "grad_norm": 4.96875, "learning_rate": 0.00019542812863231307, "loss": 0.3614, "step": 140 }, { "epoch": 0.16921341400881962, "grad_norm": 9.25, "learning_rate": 0.00019465323518016273, "loss": 0.3342, "step": 150 }, { "epoch": 0.18049430827607424, "grad_norm": 1.78125, "learning_rate": 0.0001938783417280124, "loss": 0.3328, "step": 160 }, { "epoch": 0.1917752025433289, "grad_norm": 5.4375, "learning_rate": 0.0001931034482758621, "loss": 0.3238, "step": 170 }, { "epoch": 0.20305609681058354, "grad_norm": 1.015625, "learning_rate": 0.00019232855482371176, "loss": 0.3256, "step": 180 }, { "epoch": 0.21433699107783816, "grad_norm": 2.390625, "learning_rate": 0.0001915536613715614, "loss": 0.3487, "step": 190 }, { "epoch": 0.2256178853450928, "grad_norm": 8.5625, "learning_rate": 0.0001907787679194111, "loss": 0.3266, "step": 200 }, { "epoch": 0.23689877961234745, "grad_norm": 1.5703125, "learning_rate": 0.00019000387446726076, "loss": 0.316, "step": 210 }, { "epoch": 0.2481796738796021, "grad_norm": 1.2890625, "learning_rate": 0.00018922898101511045, "loss": 0.3118, "step": 220 }, { "epoch": 0.25946056814685675, "grad_norm": 2.3125, "learning_rate": 0.0001884540875629601, "loss": 0.3226, "step": 230 }, { "epoch": 0.2707414624141114, "grad_norm": 1.125, "learning_rate": 0.00018767919411080976, "loss": 0.3407, "step": 240 }, { "epoch": 0.282022356681366, "grad_norm": 12.6875, "learning_rate": 0.00018690430065865945, "loss": 0.3253, "step": 250 }, { "epoch": 0.282022356681366, "eval_loss": 0.35227087140083313, "eval_runtime": 3939.2496, "eval_samples_per_second": 29.703, "eval_steps_per_second": 0.928, "step": 250 }, { "epoch": 0.29330325094862064, "grad_norm": 11.8125, "learning_rate": 0.00018612940720650912, "loss": 0.3383, "step": 260 }, { "epoch": 0.3045841452158753, "grad_norm": 3.25, "learning_rate": 0.00018535451375435879, "loss": 0.3237, "step": 270 }, { "epoch": 0.31586503948312994, "grad_norm": 3.65625, "learning_rate": 0.00018457962030220845, "loss": 0.3091, "step": 280 }, { "epoch": 0.3271459337503846, "grad_norm": 7.9375, "learning_rate": 0.00018380472685005812, "loss": 0.3179, "step": 290 }, { "epoch": 0.33842682801763924, "grad_norm": 5.21875, "learning_rate": 0.0001830298333979078, "loss": 0.334, "step": 300 }, { "epoch": 0.34970772228489383, "grad_norm": 2.765625, "learning_rate": 0.00018225493994575745, "loss": 0.3292, "step": 310 }, { "epoch": 0.3609886165521485, "grad_norm": 8.375, "learning_rate": 0.00018148004649360715, "loss": 0.3083, "step": 320 }, { "epoch": 0.3722695108194031, "grad_norm": 2.046875, "learning_rate": 0.0001807051530414568, "loss": 0.2957, "step": 330 }, { "epoch": 0.3835504050866578, "grad_norm": 2.34375, "learning_rate": 0.00017993025958930648, "loss": 0.3152, "step": 340 }, { "epoch": 0.3948312993539124, "grad_norm": 4.375, "learning_rate": 0.00017915536613715614, "loss": 0.2846, "step": 350 }, { "epoch": 0.4061121936211671, "grad_norm": 9.1875, "learning_rate": 0.0001783804726850058, "loss": 0.2714, "step": 360 }, { "epoch": 0.4173930878884217, "grad_norm": 2.46875, "learning_rate": 0.0001776055792328555, "loss": 0.2834, "step": 370 }, { "epoch": 0.4286739821556763, "grad_norm": 2.625, "learning_rate": 0.00017683068578070517, "loss": 0.2854, "step": 380 }, { "epoch": 0.43995487642293096, "grad_norm": 1.5546875, "learning_rate": 0.00017605579232855484, "loss": 0.276, "step": 390 }, { "epoch": 0.4512357706901856, "grad_norm": 10.375, "learning_rate": 0.0001752808988764045, "loss": 0.2842, "step": 400 }, { "epoch": 0.46251666495744026, "grad_norm": 9.75, "learning_rate": 0.00017450600542425417, "loss": 0.3073, "step": 410 }, { "epoch": 0.4737975592246949, "grad_norm": 8.875, "learning_rate": 0.00017373111197210386, "loss": 0.2636, "step": 420 }, { "epoch": 0.48507845349194956, "grad_norm": 3.234375, "learning_rate": 0.0001729562185199535, "loss": 0.2467, "step": 430 }, { "epoch": 0.4963593477592042, "grad_norm": 1.3671875, "learning_rate": 0.00017218132506780317, "loss": 0.277, "step": 440 }, { "epoch": 0.5076402420264589, "grad_norm": 5.9375, "learning_rate": 0.00017140643161565286, "loss": 0.2831, "step": 450 }, { "epoch": 0.5189211362937135, "grad_norm": 1.7421875, "learning_rate": 0.00017063153816350253, "loss": 0.2432, "step": 460 }, { "epoch": 0.5302020305609682, "grad_norm": 2.6875, "learning_rate": 0.0001698566447113522, "loss": 0.2442, "step": 470 }, { "epoch": 0.5414829248282228, "grad_norm": 2.921875, "learning_rate": 0.00016908175125920186, "loss": 0.2655, "step": 480 }, { "epoch": 0.5527638190954773, "grad_norm": 1.6796875, "learning_rate": 0.00016830685780705153, "loss": 0.2463, "step": 490 }, { "epoch": 0.564044713362732, "grad_norm": 10.75, "learning_rate": 0.00016753196435490122, "loss": 0.2679, "step": 500 }, { "epoch": 0.564044713362732, "eval_loss": 0.24632702767848969, "eval_runtime": 3940.4364, "eval_samples_per_second": 29.694, "eval_steps_per_second": 0.928, "step": 500 }, { "epoch": 0.5753256076299866, "grad_norm": 4.53125, "learning_rate": 0.0001667570709027509, "loss": 0.27, "step": 510 }, { "epoch": 0.5866065018972413, "grad_norm": 3.203125, "learning_rate": 0.00016598217745060053, "loss": 0.2807, "step": 520 }, { "epoch": 0.5978873961644959, "grad_norm": 7.75, "learning_rate": 0.00016520728399845022, "loss": 0.2615, "step": 530 }, { "epoch": 0.6091682904317506, "grad_norm": 4.78125, "learning_rate": 0.0001644323905462999, "loss": 0.2567, "step": 540 }, { "epoch": 0.6204491846990052, "grad_norm": 3.375, "learning_rate": 0.00016365749709414956, "loss": 0.2562, "step": 550 }, { "epoch": 0.6317300789662599, "grad_norm": 1.8828125, "learning_rate": 0.00016288260364199922, "loss": 0.2389, "step": 560 }, { "epoch": 0.6430109732335145, "grad_norm": 2.46875, "learning_rate": 0.0001621077101898489, "loss": 0.2331, "step": 570 }, { "epoch": 0.6542918675007692, "grad_norm": 3.03125, "learning_rate": 0.00016133281673769858, "loss": 0.2522, "step": 580 }, { "epoch": 0.6655727617680238, "grad_norm": 1.28125, "learning_rate": 0.00016055792328554825, "loss": 0.2196, "step": 590 }, { "epoch": 0.6768536560352785, "grad_norm": 3.1875, "learning_rate": 0.00015978302983339792, "loss": 0.2523, "step": 600 }, { "epoch": 0.6881345503025331, "grad_norm": 3.96875, "learning_rate": 0.00015900813638124758, "loss": 0.263, "step": 610 }, { "epoch": 0.6994154445697877, "grad_norm": 3.21875, "learning_rate": 0.00015823324292909725, "loss": 0.2148, "step": 620 }, { "epoch": 0.7106963388370423, "grad_norm": 1.6796875, "learning_rate": 0.00015745834947694694, "loss": 0.22, "step": 630 }, { "epoch": 0.721977233104297, "grad_norm": 4.34375, "learning_rate": 0.00015668345602479658, "loss": 0.2439, "step": 640 }, { "epoch": 0.7332581273715516, "grad_norm": 1.484375, "learning_rate": 0.00015590856257264628, "loss": 0.2244, "step": 650 }, { "epoch": 0.7445390216388063, "grad_norm": 1.6015625, "learning_rate": 0.00015513366912049594, "loss": 0.1983, "step": 660 }, { "epoch": 0.7558199159060609, "grad_norm": 6.65625, "learning_rate": 0.0001543587756683456, "loss": 0.2232, "step": 670 }, { "epoch": 0.7671008101733156, "grad_norm": 1.03125, "learning_rate": 0.00015358388221619528, "loss": 0.2558, "step": 680 }, { "epoch": 0.7783817044405702, "grad_norm": 4.40625, "learning_rate": 0.00015280898876404494, "loss": 0.249, "step": 690 }, { "epoch": 0.7896625987078248, "grad_norm": 3.4375, "learning_rate": 0.00015203409531189464, "loss": 0.2103, "step": 700 }, { "epoch": 0.8009434929750795, "grad_norm": 1.640625, "learning_rate": 0.0001512592018597443, "loss": 0.2328, "step": 710 }, { "epoch": 0.8122243872423341, "grad_norm": 2.421875, "learning_rate": 0.00015048430840759394, "loss": 0.2327, "step": 720 }, { "epoch": 0.8235052815095888, "grad_norm": 7.1875, "learning_rate": 0.00014970941495544364, "loss": 0.2027, "step": 730 }, { "epoch": 0.8347861757768434, "grad_norm": 4.5, "learning_rate": 0.0001489345215032933, "loss": 0.2416, "step": 740 }, { "epoch": 0.8460670700440981, "grad_norm": 2.078125, "learning_rate": 0.000148159628051143, "loss": 0.2004, "step": 750 }, { "epoch": 0.8460670700440981, "eval_loss": 0.1825282871723175, "eval_runtime": 3937.2667, "eval_samples_per_second": 29.718, "eval_steps_per_second": 0.929, "step": 750 }, { "epoch": 0.8573479643113526, "grad_norm": 1.734375, "learning_rate": 0.00014738473459899264, "loss": 0.2186, "step": 760 }, { "epoch": 0.8686288585786073, "grad_norm": 3.25, "learning_rate": 0.0001466098411468423, "loss": 0.2004, "step": 770 }, { "epoch": 0.8799097528458619, "grad_norm": 1.9140625, "learning_rate": 0.000145834947694692, "loss": 0.2121, "step": 780 }, { "epoch": 0.8911906471131166, "grad_norm": 3.125, "learning_rate": 0.00014506005424254166, "loss": 0.2116, "step": 790 }, { "epoch": 0.9024715413803712, "grad_norm": 9.75, "learning_rate": 0.00014428516079039133, "loss": 0.2407, "step": 800 }, { "epoch": 0.9137524356476259, "grad_norm": 3.5625, "learning_rate": 0.000143510267338241, "loss": 0.2077, "step": 810 }, { "epoch": 0.9250333299148805, "grad_norm": 5.25, "learning_rate": 0.00014273537388609066, "loss": 0.2267, "step": 820 }, { "epoch": 0.9363142241821352, "grad_norm": 2.5625, "learning_rate": 0.00014196048043394036, "loss": 0.1969, "step": 830 }, { "epoch": 0.9475951184493898, "grad_norm": 1.53125, "learning_rate": 0.00014118558698179, "loss": 0.1998, "step": 840 }, { "epoch": 0.9588760127166445, "grad_norm": 1.0390625, "learning_rate": 0.0001404106935296397, "loss": 0.2008, "step": 850 }, { "epoch": 0.9701569069838991, "grad_norm": 6.0, "learning_rate": 0.00013963580007748935, "loss": 0.1914, "step": 860 }, { "epoch": 0.9814378012511538, "grad_norm": 1.4609375, "learning_rate": 0.00013886090662533902, "loss": 0.191, "step": 870 }, { "epoch": 0.9927186955184084, "grad_norm": 2.390625, "learning_rate": 0.0001380860131731887, "loss": 0.1816, "step": 880 }, { "epoch": 1.0033842682801764, "grad_norm": 3.859375, "learning_rate": 0.00013731111972103835, "loss": 0.2074, "step": 890 }, { "epoch": 1.014665162547431, "grad_norm": 4.28125, "learning_rate": 0.00013653622626888802, "loss": 0.1984, "step": 900 }, { "epoch": 1.0259460568146856, "grad_norm": 1.3203125, "learning_rate": 0.00013576133281673771, "loss": 0.1648, "step": 910 }, { "epoch": 1.0372269510819403, "grad_norm": 3.5, "learning_rate": 0.00013498643936458738, "loss": 0.192, "step": 920 }, { "epoch": 1.048507845349195, "grad_norm": 6.53125, "learning_rate": 0.00013421154591243705, "loss": 0.1522, "step": 930 }, { "epoch": 1.0597887396164496, "grad_norm": 4.625, "learning_rate": 0.00013343665246028671, "loss": 0.1715, "step": 940 }, { "epoch": 1.0710696338837042, "grad_norm": 3.171875, "learning_rate": 0.00013266175900813638, "loss": 0.1593, "step": 950 }, { "epoch": 1.082350528150959, "grad_norm": 3.65625, "learning_rate": 0.00013188686555598607, "loss": 0.1439, "step": 960 }, { "epoch": 1.0936314224182135, "grad_norm": 2.140625, "learning_rate": 0.0001311119721038357, "loss": 0.1328, "step": 970 }, { "epoch": 1.1049123166854682, "grad_norm": 2.359375, "learning_rate": 0.0001303370786516854, "loss": 0.1763, "step": 980 }, { "epoch": 1.1161932109527228, "grad_norm": 1.484375, "learning_rate": 0.00012956218519953507, "loss": 0.1689, "step": 990 }, { "epoch": 1.1274741052199775, "grad_norm": 1.59375, "learning_rate": 0.00012878729174738474, "loss": 0.1391, "step": 1000 }, { "epoch": 1.1274741052199775, "eval_loss": 0.16099952161312103, "eval_runtime": 3944.2805, "eval_samples_per_second": 29.665, "eval_steps_per_second": 0.927, "step": 1000 }, { "epoch": 1.1387549994872321, "grad_norm": 2.375, "learning_rate": 0.0001280123982952344, "loss": 0.1669, "step": 1010 }, { "epoch": 1.1500358937544868, "grad_norm": 0.89453125, "learning_rate": 0.00012723750484308407, "loss": 0.1542, "step": 1020 }, { "epoch": 1.1613167880217414, "grad_norm": 2.90625, "learning_rate": 0.00012646261139093377, "loss": 0.1566, "step": 1030 }, { "epoch": 1.172597682288996, "grad_norm": 2.359375, "learning_rate": 0.00012568771793878343, "loss": 0.1865, "step": 1040 }, { "epoch": 1.1838785765562507, "grad_norm": 1.6484375, "learning_rate": 0.00012491282448663307, "loss": 0.1729, "step": 1050 }, { "epoch": 1.1951594708235054, "grad_norm": 3.546875, "learning_rate": 0.00012413793103448277, "loss": 0.1755, "step": 1060 }, { "epoch": 1.20644036509076, "grad_norm": 3.390625, "learning_rate": 0.00012336303758233243, "loss": 0.1397, "step": 1070 }, { "epoch": 1.2177212593580147, "grad_norm": 2.125, "learning_rate": 0.00012258814413018213, "loss": 0.1583, "step": 1080 }, { "epoch": 1.2290021536252693, "grad_norm": 3.5, "learning_rate": 0.00012181325067803178, "loss": 0.1782, "step": 1090 }, { "epoch": 1.2402830478925237, "grad_norm": 0.99609375, "learning_rate": 0.00012103835722588143, "loss": 0.1693, "step": 1100 }, { "epoch": 1.2515639421597786, "grad_norm": 3.296875, "learning_rate": 0.00012026346377373113, "loss": 0.1591, "step": 1110 }, { "epoch": 1.262844836427033, "grad_norm": 2.375, "learning_rate": 0.00011948857032158078, "loss": 0.1741, "step": 1120 }, { "epoch": 1.2741257306942877, "grad_norm": 1.2421875, "learning_rate": 0.00011871367686943047, "loss": 0.1446, "step": 1130 }, { "epoch": 1.2854066249615423, "grad_norm": 1.875, "learning_rate": 0.00011793878341728013, "loss": 0.1627, "step": 1140 }, { "epoch": 1.296687519228797, "grad_norm": 2.484375, "learning_rate": 0.00011716388996512979, "loss": 0.1509, "step": 1150 }, { "epoch": 1.3079684134960516, "grad_norm": 1.6875, "learning_rate": 0.00011638899651297947, "loss": 0.1545, "step": 1160 }, { "epoch": 1.3192493077633063, "grad_norm": 4.9375, "learning_rate": 0.00011561410306082914, "loss": 0.1503, "step": 1170 }, { "epoch": 1.330530202030561, "grad_norm": 1.9375, "learning_rate": 0.00011483920960867882, "loss": 0.1597, "step": 1180 }, { "epoch": 1.3418110962978156, "grad_norm": 1.171875, "learning_rate": 0.00011406431615652849, "loss": 0.1547, "step": 1190 }, { "epoch": 1.3530919905650702, "grad_norm": 2.34375, "learning_rate": 0.00011328942270437815, "loss": 0.1701, "step": 1200 }, { "epoch": 1.3643728848323249, "grad_norm": 1.0625, "learning_rate": 0.00011251452925222783, "loss": 0.1655, "step": 1210 }, { "epoch": 1.3756537790995795, "grad_norm": 3.234375, "learning_rate": 0.00011173963580007749, "loss": 0.1526, "step": 1220 }, { "epoch": 1.3869346733668342, "grad_norm": 1.2890625, "learning_rate": 0.00011096474234792718, "loss": 0.1539, "step": 1230 }, { "epoch": 1.3982155676340888, "grad_norm": 3.640625, "learning_rate": 0.00011018984889577683, "loss": 0.1421, "step": 1240 }, { "epoch": 1.4094964619013435, "grad_norm": 3.046875, "learning_rate": 0.0001094149554436265, "loss": 0.1549, "step": 1250 }, { "epoch": 1.4094964619013435, "eval_loss": 0.13532690703868866, "eval_runtime": 3940.6219, "eval_samples_per_second": 29.693, "eval_steps_per_second": 0.928, "step": 1250 }, { "epoch": 1.4207773561685981, "grad_norm": 1.765625, "learning_rate": 0.00010864006199147618, "loss": 0.1361, "step": 1260 }, { "epoch": 1.4320582504358528, "grad_norm": 2.171875, "learning_rate": 0.00010786516853932584, "loss": 0.1735, "step": 1270 }, { "epoch": 1.4433391447031074, "grad_norm": 1.40625, "learning_rate": 0.00010709027508717552, "loss": 0.1747, "step": 1280 }, { "epoch": 1.454620038970362, "grad_norm": 1.5078125, "learning_rate": 0.00010631538163502519, "loss": 0.1461, "step": 1290 }, { "epoch": 1.4659009332376167, "grad_norm": 2.453125, "learning_rate": 0.00010554048818287486, "loss": 0.1701, "step": 1300 }, { "epoch": 1.4771818275048714, "grad_norm": 3.046875, "learning_rate": 0.00010476559473072454, "loss": 0.1473, "step": 1310 }, { "epoch": 1.488462721772126, "grad_norm": 2.140625, "learning_rate": 0.0001039907012785742, "loss": 0.1292, "step": 1320 }, { "epoch": 1.4997436160393804, "grad_norm": 2.90625, "learning_rate": 0.00010321580782642388, "loss": 0.1572, "step": 1330 }, { "epoch": 1.511024510306635, "grad_norm": 2.15625, "learning_rate": 0.00010244091437427355, "loss": 0.1446, "step": 1340 }, { "epoch": 1.5223054045738897, "grad_norm": 3.40625, "learning_rate": 0.0001016660209221232, "loss": 0.1469, "step": 1350 }, { "epoch": 1.5335862988411444, "grad_norm": 1.109375, "learning_rate": 0.00010089112746997288, "loss": 0.1774, "step": 1360 }, { "epoch": 1.544867193108399, "grad_norm": 1.984375, "learning_rate": 0.00010011623401782255, "loss": 0.1371, "step": 1370 }, { "epoch": 1.5561480873756537, "grad_norm": 2.15625, "learning_rate": 9.934134056567223e-05, "loss": 0.1459, "step": 1380 }, { "epoch": 1.5674289816429083, "grad_norm": 1.6953125, "learning_rate": 9.85664471135219e-05, "loss": 0.138, "step": 1390 }, { "epoch": 1.578709875910163, "grad_norm": 1.421875, "learning_rate": 9.779155366137158e-05, "loss": 0.1395, "step": 1400 }, { "epoch": 1.5899907701774176, "grad_norm": 3.0, "learning_rate": 9.701666020922123e-05, "loss": 0.1526, "step": 1410 }, { "epoch": 1.6012716644446723, "grad_norm": 1.1953125, "learning_rate": 9.624176675707091e-05, "loss": 0.1612, "step": 1420 }, { "epoch": 1.612552558711927, "grad_norm": 1.1171875, "learning_rate": 9.546687330492058e-05, "loss": 0.1355, "step": 1430 }, { "epoch": 1.6238334529791816, "grad_norm": 1.171875, "learning_rate": 9.469197985277026e-05, "loss": 0.1607, "step": 1440 }, { "epoch": 1.6351143472464362, "grad_norm": 3.078125, "learning_rate": 9.391708640061992e-05, "loss": 0.1562, "step": 1450 }, { "epoch": 1.6463952415136909, "grad_norm": 1.3359375, "learning_rate": 9.314219294846959e-05, "loss": 0.146, "step": 1460 }, { "epoch": 1.6576761357809455, "grad_norm": 2.125, "learning_rate": 9.236729949631926e-05, "loss": 0.1405, "step": 1470 }, { "epoch": 1.6689570300482002, "grad_norm": 1.5859375, "learning_rate": 9.159240604416894e-05, "loss": 0.137, "step": 1480 }, { "epoch": 1.6802379243154548, "grad_norm": 2.984375, "learning_rate": 9.08175125920186e-05, "loss": 0.1674, "step": 1490 }, { "epoch": 1.6915188185827095, "grad_norm": 2.296875, "learning_rate": 9.004261913986827e-05, "loss": 0.1452, "step": 1500 }, { "epoch": 1.6915188185827095, "eval_loss": 0.1216062381863594, "eval_runtime": 3942.5719, "eval_samples_per_second": 29.678, "eval_steps_per_second": 0.928, "step": 1500 }, { "epoch": 1.7027997128499641, "grad_norm": 2.625, "learning_rate": 8.926772568771794e-05, "loss": 0.1491, "step": 1510 }, { "epoch": 1.7140806071172188, "grad_norm": 2.75, "learning_rate": 8.849283223556762e-05, "loss": 0.1304, "step": 1520 }, { "epoch": 1.7253615013844734, "grad_norm": 1.8359375, "learning_rate": 8.771793878341728e-05, "loss": 0.1367, "step": 1530 }, { "epoch": 1.736642395651728, "grad_norm": 2.453125, "learning_rate": 8.694304533126696e-05, "loss": 0.1321, "step": 1540 }, { "epoch": 1.7479232899189827, "grad_norm": 2.53125, "learning_rate": 8.616815187911662e-05, "loss": 0.1413, "step": 1550 }, { "epoch": 1.7592041841862374, "grad_norm": 2.296875, "learning_rate": 8.53932584269663e-05, "loss": 0.1593, "step": 1560 }, { "epoch": 1.770485078453492, "grad_norm": 3.90625, "learning_rate": 8.461836497481596e-05, "loss": 0.1472, "step": 1570 }, { "epoch": 1.7817659727207467, "grad_norm": 1.703125, "learning_rate": 8.384347152266564e-05, "loss": 0.144, "step": 1580 }, { "epoch": 1.7930468669880013, "grad_norm": 1.34375, "learning_rate": 8.306857807051531e-05, "loss": 0.1431, "step": 1590 }, { "epoch": 1.804327761255256, "grad_norm": 4.5, "learning_rate": 8.229368461836498e-05, "loss": 0.1455, "step": 1600 }, { "epoch": 1.8156086555225106, "grad_norm": 2.015625, "learning_rate": 8.151879116621464e-05, "loss": 0.1142, "step": 1610 }, { "epoch": 1.8268895497897653, "grad_norm": 2.25, "learning_rate": 8.074389771406432e-05, "loss": 0.1554, "step": 1620 }, { "epoch": 1.83817044405702, "grad_norm": 1.6328125, "learning_rate": 7.996900426191399e-05, "loss": 0.1376, "step": 1630 }, { "epoch": 1.8494513383242746, "grad_norm": 1.5234375, "learning_rate": 7.919411080976367e-05, "loss": 0.1308, "step": 1640 }, { "epoch": 1.8607322325915292, "grad_norm": 1.78125, "learning_rate": 7.841921735761332e-05, "loss": 0.1599, "step": 1650 }, { "epoch": 1.8720131268587838, "grad_norm": 1.5625, "learning_rate": 7.7644323905463e-05, "loss": 0.1291, "step": 1660 }, { "epoch": 1.8832940211260385, "grad_norm": 2.203125, "learning_rate": 7.686943045331267e-05, "loss": 0.1368, "step": 1670 }, { "epoch": 1.8945749153932931, "grad_norm": 2.828125, "learning_rate": 7.609453700116235e-05, "loss": 0.1585, "step": 1680 }, { "epoch": 1.9058558096605478, "grad_norm": 4.4375, "learning_rate": 7.531964354901202e-05, "loss": 0.1426, "step": 1690 }, { "epoch": 1.9171367039278022, "grad_norm": 1.890625, "learning_rate": 7.454475009686168e-05, "loss": 0.1578, "step": 1700 }, { "epoch": 1.9284175981950569, "grad_norm": 1.8984375, "learning_rate": 7.376985664471135e-05, "loss": 0.1251, "step": 1710 }, { "epoch": 1.9396984924623115, "grad_norm": 1.953125, "learning_rate": 7.299496319256103e-05, "loss": 0.1229, "step": 1720 }, { "epoch": 1.9509793867295662, "grad_norm": 1.640625, "learning_rate": 7.22200697404107e-05, "loss": 0.1122, "step": 1730 }, { "epoch": 1.9622602809968208, "grad_norm": 1.2734375, "learning_rate": 7.144517628826036e-05, "loss": 0.144, "step": 1740 }, { "epoch": 1.9735411752640755, "grad_norm": 0.921875, "learning_rate": 7.067028283611004e-05, "loss": 0.1526, "step": 1750 }, { "epoch": 1.9735411752640755, "eval_loss": 0.10789535939693451, "eval_runtime": 3942.9784, "eval_samples_per_second": 29.675, "eval_steps_per_second": 0.927, "step": 1750 }, { "epoch": 1.9848220695313301, "grad_norm": 2.125, "learning_rate": 6.989538938395971e-05, "loss": 0.1409, "step": 1760 }, { "epoch": 1.9961029637985848, "grad_norm": 0.75, "learning_rate": 6.912049593180939e-05, "loss": 0.1317, "step": 1770 }, { "epoch": 2.0067685365603527, "grad_norm": 4.03125, "learning_rate": 6.834560247965905e-05, "loss": 0.1106, "step": 1780 }, { "epoch": 2.0180494308276073, "grad_norm": 2.9375, "learning_rate": 6.757070902750872e-05, "loss": 0.0853, "step": 1790 }, { "epoch": 2.029330325094862, "grad_norm": 1.078125, "learning_rate": 6.679581557535839e-05, "loss": 0.0947, "step": 1800 }, { "epoch": 2.0406112193621166, "grad_norm": 1.734375, "learning_rate": 6.602092212320807e-05, "loss": 0.0886, "step": 1810 }, { "epoch": 2.0518921136293713, "grad_norm": 1.21875, "learning_rate": 6.524602867105773e-05, "loss": 0.1107, "step": 1820 }, { "epoch": 2.063173007896626, "grad_norm": 0.93359375, "learning_rate": 6.447113521890741e-05, "loss": 0.085, "step": 1830 }, { "epoch": 2.0744539021638806, "grad_norm": 1.5078125, "learning_rate": 6.369624176675707e-05, "loss": 0.087, "step": 1840 }, { "epoch": 2.0857347964311352, "grad_norm": 2.21875, "learning_rate": 6.292134831460675e-05, "loss": 0.1305, "step": 1850 }, { "epoch": 2.09701569069839, "grad_norm": 1.6953125, "learning_rate": 6.214645486245641e-05, "loss": 0.1178, "step": 1860 }, { "epoch": 2.1082965849656445, "grad_norm": 2.078125, "learning_rate": 6.13715614103061e-05, "loss": 0.1124, "step": 1870 }, { "epoch": 2.119577479232899, "grad_norm": 0.81640625, "learning_rate": 6.059666795815576e-05, "loss": 0.1044, "step": 1880 }, { "epoch": 2.130858373500154, "grad_norm": 1.8828125, "learning_rate": 5.982177450600542e-05, "loss": 0.0873, "step": 1890 }, { "epoch": 2.1421392677674085, "grad_norm": 1.65625, "learning_rate": 5.9046881053855094e-05, "loss": 0.1124, "step": 1900 }, { "epoch": 2.153420162034663, "grad_norm": 1.4921875, "learning_rate": 5.827198760170477e-05, "loss": 0.0943, "step": 1910 }, { "epoch": 2.164701056301918, "grad_norm": 1.15625, "learning_rate": 5.749709414955444e-05, "loss": 0.0892, "step": 1920 }, { "epoch": 2.1759819505691724, "grad_norm": 2.328125, "learning_rate": 5.672220069740411e-05, "loss": 0.1028, "step": 1930 }, { "epoch": 2.187262844836427, "grad_norm": 1.359375, "learning_rate": 5.594730724525378e-05, "loss": 0.1033, "step": 1940 }, { "epoch": 2.1985437391036817, "grad_norm": 1.1171875, "learning_rate": 5.517241379310345e-05, "loss": 0.1093, "step": 1950 }, { "epoch": 2.2098246333709364, "grad_norm": 1.515625, "learning_rate": 5.439752034095312e-05, "loss": 0.1123, "step": 1960 }, { "epoch": 2.221105527638191, "grad_norm": 1.765625, "learning_rate": 5.362262688880279e-05, "loss": 0.1139, "step": 1970 }, { "epoch": 2.2323864219054457, "grad_norm": 1.7890625, "learning_rate": 5.284773343665246e-05, "loss": 0.0987, "step": 1980 }, { "epoch": 2.2436673161727003, "grad_norm": 2.140625, "learning_rate": 5.207283998450213e-05, "loss": 0.1008, "step": 1990 }, { "epoch": 2.254948210439955, "grad_norm": 2.3125, "learning_rate": 5.1297946532351806e-05, "loss": 0.0925, "step": 2000 }, { "epoch": 2.254948210439955, "eval_loss": 0.09745433926582336, "eval_runtime": 3942.1232, "eval_samples_per_second": 29.682, "eval_steps_per_second": 0.928, "step": 2000 } ], "logging_steps": 10, "max_steps": 2661, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 12, "trial_name": null, "trial_params": null }