{ "best_metric": 0.20129592716693878, "best_model_checkpoint": "./fine-tuned/checkpoint-5500", "epoch": 3.9985950122936424, "eval_steps": 100, "global_step": 5692, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.035124692658939236, "grad_norm": 31298.91015625, "learning_rate": 2.9736472241742796e-05, "loss": 0.2772, "step": 50 }, { "epoch": 0.07024938531787847, "grad_norm": 28423.171875, "learning_rate": 2.9472944483485594e-05, "loss": 0.2575, "step": 100 }, { "epoch": 0.07024938531787847, "eval_loss": 0.22961987555027008, "eval_runtime": 67.6563, "eval_samples_per_second": 65.921, "eval_steps_per_second": 2.069, "step": 100 }, { "epoch": 0.1053740779768177, "grad_norm": 28882.9609375, "learning_rate": 2.9209416725228392e-05, "loss": 0.24, "step": 150 }, { "epoch": 0.14049877063575694, "grad_norm": 44492.234375, "learning_rate": 2.894588896697119e-05, "loss": 0.2427, "step": 200 }, { "epoch": 0.14049877063575694, "eval_loss": 0.22477279603481293, "eval_runtime": 67.2438, "eval_samples_per_second": 66.326, "eval_steps_per_second": 2.082, "step": 200 }, { "epoch": 0.17562346329469616, "grad_norm": 23385.271484375, "learning_rate": 2.8682361208713985e-05, "loss": 0.237, "step": 250 }, { "epoch": 0.2107481559536354, "grad_norm": 65184.7578125, "learning_rate": 2.841883345045678e-05, "loss": 0.2351, "step": 300 }, { "epoch": 0.2107481559536354, "eval_loss": 0.22264569997787476, "eval_runtime": 67.1557, "eval_samples_per_second": 66.413, "eval_steps_per_second": 2.085, "step": 300 }, { "epoch": 0.24587284861257463, "grad_norm": 26510.09375, "learning_rate": 2.8155305692199578e-05, "loss": 0.2387, "step": 350 }, { "epoch": 0.2809975412715139, "grad_norm": 35873.625, "learning_rate": 2.7891777933942376e-05, "loss": 0.239, "step": 400 }, { "epoch": 0.2809975412715139, "eval_loss": 0.22040367126464844, "eval_runtime": 67.2556, "eval_samples_per_second": 66.314, "eval_steps_per_second": 2.082, "step": 400 }, { "epoch": 0.31612223393045313, "grad_norm": 190454.703125, "learning_rate": 2.7628250175685175e-05, "loss": 0.2343, "step": 450 }, { "epoch": 0.3512469265893923, "grad_norm": 27248.146484375, "learning_rate": 2.736472241742797e-05, "loss": 0.2349, "step": 500 }, { "epoch": 0.3512469265893923, "eval_loss": 0.21807625889778137, "eval_runtime": 67.3281, "eval_samples_per_second": 66.243, "eval_steps_per_second": 2.079, "step": 500 }, { "epoch": 0.3863716192483316, "grad_norm": 21019.255859375, "learning_rate": 2.7101194659170764e-05, "loss": 0.2286, "step": 550 }, { "epoch": 0.4214963119072708, "grad_norm": 23071.5703125, "learning_rate": 2.6837666900913563e-05, "loss": 0.2311, "step": 600 }, { "epoch": 0.4214963119072708, "eval_loss": 0.21645724773406982, "eval_runtime": 67.1857, "eval_samples_per_second": 66.383, "eval_steps_per_second": 2.084, "step": 600 }, { "epoch": 0.45662100456621, "grad_norm": 21536.572265625, "learning_rate": 2.657413914265636e-05, "loss": 0.2249, "step": 650 }, { "epoch": 0.49174569722514927, "grad_norm": 22037.119140625, "learning_rate": 2.631061138439916e-05, "loss": 0.2302, "step": 700 }, { "epoch": 0.49174569722514927, "eval_loss": 0.21522314846515656, "eval_runtime": 67.377, "eval_samples_per_second": 66.195, "eval_steps_per_second": 2.078, "step": 700 }, { "epoch": 0.5268703898840885, "grad_norm": 24826.04296875, "learning_rate": 2.6047083626141954e-05, "loss": 0.2295, "step": 750 }, { "epoch": 0.5619950825430278, "grad_norm": 21309.46875, "learning_rate": 2.578355586788475e-05, "loss": 0.2265, "step": 800 }, { "epoch": 0.5619950825430278, "eval_loss": 0.21485908329486847, "eval_runtime": 67.9456, "eval_samples_per_second": 65.641, "eval_steps_per_second": 2.06, "step": 800 }, { "epoch": 0.597119775201967, "grad_norm": 21253.212890625, "learning_rate": 2.5520028109627547e-05, "loss": 0.2255, "step": 850 }, { "epoch": 0.6322444678609063, "grad_norm": 25884.013671875, "learning_rate": 2.5256500351370345e-05, "loss": 0.2189, "step": 900 }, { "epoch": 0.6322444678609063, "eval_loss": 0.21369116008281708, "eval_runtime": 67.5126, "eval_samples_per_second": 66.062, "eval_steps_per_second": 2.074, "step": 900 }, { "epoch": 0.6673691605198454, "grad_norm": 32345.33203125, "learning_rate": 2.4992972593113144e-05, "loss": 0.2177, "step": 950 }, { "epoch": 0.7024938531787847, "grad_norm": 22764.255859375, "learning_rate": 2.472944483485594e-05, "loss": 0.2205, "step": 1000 }, { "epoch": 0.7024938531787847, "eval_loss": 0.2125701606273651, "eval_runtime": 67.5281, "eval_samples_per_second": 66.047, "eval_steps_per_second": 2.073, "step": 1000 }, { "epoch": 0.7376185458377239, "grad_norm": 26256.35546875, "learning_rate": 2.4465917076598737e-05, "loss": 0.2224, "step": 1050 }, { "epoch": 0.7727432384966632, "grad_norm": 29107.78515625, "learning_rate": 2.420238931834153e-05, "loss": 0.2211, "step": 1100 }, { "epoch": 0.7727432384966632, "eval_loss": 0.2117428034543991, "eval_runtime": 67.5369, "eval_samples_per_second": 66.038, "eval_steps_per_second": 2.073, "step": 1100 }, { "epoch": 0.8078679311556024, "grad_norm": 98354.15625, "learning_rate": 2.393886156008433e-05, "loss": 0.215, "step": 1150 }, { "epoch": 0.8429926238145417, "grad_norm": 22886.3984375, "learning_rate": 2.3675333801827128e-05, "loss": 0.2229, "step": 1200 }, { "epoch": 0.8429926238145417, "eval_loss": 0.2107735425233841, "eval_runtime": 67.6295, "eval_samples_per_second": 65.948, "eval_steps_per_second": 2.07, "step": 1200 }, { "epoch": 0.8781173164734809, "grad_norm": 20510.26171875, "learning_rate": 2.3411806043569923e-05, "loss": 0.2105, "step": 1250 }, { "epoch": 0.91324200913242, "grad_norm": 20053.85546875, "learning_rate": 2.314827828531272e-05, "loss": 0.2195, "step": 1300 }, { "epoch": 0.91324200913242, "eval_loss": 0.20966531336307526, "eval_runtime": 67.6112, "eval_samples_per_second": 65.965, "eval_steps_per_second": 2.071, "step": 1300 }, { "epoch": 0.9483667017913593, "grad_norm": 28154.595703125, "learning_rate": 2.2884750527055516e-05, "loss": 0.2215, "step": 1350 }, { "epoch": 0.9834913944502985, "grad_norm": 28011.71484375, "learning_rate": 2.2621222768798314e-05, "loss": 0.2172, "step": 1400 }, { "epoch": 0.9834913944502985, "eval_loss": 0.20960816740989685, "eval_runtime": 67.6089, "eval_samples_per_second": 65.968, "eval_steps_per_second": 2.071, "step": 1400 }, { "epoch": 1.0186160871092378, "grad_norm": 26518.01171875, "learning_rate": 2.2357695010541112e-05, "loss": 0.21, "step": 1450 }, { "epoch": 1.053740779768177, "grad_norm": 20411.26171875, "learning_rate": 2.2094167252283907e-05, "loss": 0.2139, "step": 1500 }, { "epoch": 1.053740779768177, "eval_loss": 0.20940540730953217, "eval_runtime": 67.4684, "eval_samples_per_second": 66.105, "eval_steps_per_second": 2.075, "step": 1500 }, { "epoch": 1.0888654724271163, "grad_norm": 25448.7734375, "learning_rate": 2.1830639494026705e-05, "loss": 0.2119, "step": 1550 }, { "epoch": 1.1239901650860555, "grad_norm": 20371.7109375, "learning_rate": 2.15671117357695e-05, "loss": 0.2074, "step": 1600 }, { "epoch": 1.1239901650860555, "eval_loss": 0.2086929827928543, "eval_runtime": 67.511, "eval_samples_per_second": 66.063, "eval_steps_per_second": 2.074, "step": 1600 }, { "epoch": 1.1591148577449948, "grad_norm": 24624.9609375, "learning_rate": 2.13035839775123e-05, "loss": 0.2109, "step": 1650 }, { "epoch": 1.194239550403934, "grad_norm": 28790.974609375, "learning_rate": 2.1040056219255097e-05, "loss": 0.2126, "step": 1700 }, { "epoch": 1.194239550403934, "eval_loss": 0.20832768082618713, "eval_runtime": 67.1973, "eval_samples_per_second": 66.372, "eval_steps_per_second": 2.083, "step": 1700 }, { "epoch": 1.2293642430628733, "grad_norm": 22134.93359375, "learning_rate": 2.077652846099789e-05, "loss": 0.2118, "step": 1750 }, { "epoch": 1.2644889357218125, "grad_norm": 22432.322265625, "learning_rate": 2.051300070274069e-05, "loss": 0.2128, "step": 1800 }, { "epoch": 1.2644889357218125, "eval_loss": 0.20813611149787903, "eval_runtime": 67.1539, "eval_samples_per_second": 66.415, "eval_steps_per_second": 2.085, "step": 1800 }, { "epoch": 1.2996136283807518, "grad_norm": 21562.96484375, "learning_rate": 2.0249472944483485e-05, "loss": 0.2135, "step": 1850 }, { "epoch": 1.334738321039691, "grad_norm": 22612.58203125, "learning_rate": 1.9985945186226283e-05, "loss": 0.2081, "step": 1900 }, { "epoch": 1.334738321039691, "eval_loss": 0.2073371410369873, "eval_runtime": 67.1629, "eval_samples_per_second": 66.406, "eval_steps_per_second": 2.084, "step": 1900 }, { "epoch": 1.36986301369863, "grad_norm": 22550.556640625, "learning_rate": 1.972241742796908e-05, "loss": 0.2037, "step": 1950 }, { "epoch": 1.4049877063575693, "grad_norm": 24281.9140625, "learning_rate": 1.9458889669711876e-05, "loss": 0.2111, "step": 2000 }, { "epoch": 1.4049877063575693, "eval_loss": 0.20700447261333466, "eval_runtime": 67.2893, "eval_samples_per_second": 66.281, "eval_steps_per_second": 2.081, "step": 2000 }, { "epoch": 1.4401123990165086, "grad_norm": 25767.197265625, "learning_rate": 1.9195361911454674e-05, "loss": 0.2054, "step": 2050 }, { "epoch": 1.4752370916754478, "grad_norm": 22215.111328125, "learning_rate": 1.893183415319747e-05, "loss": 0.2082, "step": 2100 }, { "epoch": 1.4752370916754478, "eval_loss": 0.20631250739097595, "eval_runtime": 67.1038, "eval_samples_per_second": 66.464, "eval_steps_per_second": 2.086, "step": 2100 }, { "epoch": 1.510361784334387, "grad_norm": 27927.373046875, "learning_rate": 1.8668306394940267e-05, "loss": 0.2128, "step": 2150 }, { "epoch": 1.5454864769933263, "grad_norm": 25635.267578125, "learning_rate": 1.8404778636683066e-05, "loss": 0.2078, "step": 2200 }, { "epoch": 1.5454864769933263, "eval_loss": 0.20582793653011322, "eval_runtime": 67.2723, "eval_samples_per_second": 66.298, "eval_steps_per_second": 2.081, "step": 2200 }, { "epoch": 1.5806111696522656, "grad_norm": 25550.1171875, "learning_rate": 1.814125087842586e-05, "loss": 0.2058, "step": 2250 }, { "epoch": 1.6157358623112048, "grad_norm": 21671.251953125, "learning_rate": 1.787772312016866e-05, "loss": 0.206, "step": 2300 }, { "epoch": 1.6157358623112048, "eval_loss": 0.2059122622013092, "eval_runtime": 67.4662, "eval_samples_per_second": 66.107, "eval_steps_per_second": 2.075, "step": 2300 }, { "epoch": 1.650860554970144, "grad_norm": 21685.947265625, "learning_rate": 1.7614195361911453e-05, "loss": 0.2086, "step": 2350 }, { "epoch": 1.685985247629083, "grad_norm": 24516.828125, "learning_rate": 1.7350667603654252e-05, "loss": 0.2069, "step": 2400 }, { "epoch": 1.685985247629083, "eval_loss": 0.20495346188545227, "eval_runtime": 67.1671, "eval_samples_per_second": 66.402, "eval_steps_per_second": 2.084, "step": 2400 }, { "epoch": 1.7211099402880223, "grad_norm": 22610.7734375, "learning_rate": 1.708713984539705e-05, "loss": 0.2052, "step": 2450 }, { "epoch": 1.7562346329469616, "grad_norm": 35525.84765625, "learning_rate": 1.6823612087139845e-05, "loss": 0.2051, "step": 2500 }, { "epoch": 1.7562346329469616, "eval_loss": 0.20481644570827484, "eval_runtime": 67.1059, "eval_samples_per_second": 66.462, "eval_steps_per_second": 2.086, "step": 2500 }, { "epoch": 1.7913593256059008, "grad_norm": 20207.35546875, "learning_rate": 1.6560084328882643e-05, "loss": 0.2049, "step": 2550 }, { "epoch": 1.82648401826484, "grad_norm": 17453.359375, "learning_rate": 1.6296556570625438e-05, "loss": 0.2101, "step": 2600 }, { "epoch": 1.82648401826484, "eval_loss": 0.20485134422779083, "eval_runtime": 67.202, "eval_samples_per_second": 66.367, "eval_steps_per_second": 2.083, "step": 2600 }, { "epoch": 1.8616087109237793, "grad_norm": 24568.439453125, "learning_rate": 1.603302881236824e-05, "loss": 0.2081, "step": 2650 }, { "epoch": 1.8967334035827186, "grad_norm": 22425.1875, "learning_rate": 1.5769501054111034e-05, "loss": 0.2032, "step": 2700 }, { "epoch": 1.8967334035827186, "eval_loss": 0.2041337788105011, "eval_runtime": 67.2372, "eval_samples_per_second": 66.332, "eval_steps_per_second": 2.082, "step": 2700 }, { "epoch": 1.9318580962416578, "grad_norm": 21858.3828125, "learning_rate": 1.550597329585383e-05, "loss": 0.2074, "step": 2750 }, { "epoch": 1.966982788900597, "grad_norm": 17712.39453125, "learning_rate": 1.5242445537596626e-05, "loss": 0.205, "step": 2800 }, { "epoch": 1.966982788900597, "eval_loss": 0.20371712744235992, "eval_runtime": 67.1299, "eval_samples_per_second": 66.438, "eval_steps_per_second": 2.086, "step": 2800 }, { "epoch": 2.0021074815595363, "grad_norm": 20413.91796875, "learning_rate": 1.4978917779339424e-05, "loss": 0.203, "step": 2850 }, { "epoch": 2.0372321742184756, "grad_norm": 21380.130859375, "learning_rate": 1.471539002108222e-05, "loss": 0.199, "step": 2900 }, { "epoch": 2.0372321742184756, "eval_loss": 0.20416177809238434, "eval_runtime": 67.1771, "eval_samples_per_second": 66.392, "eval_steps_per_second": 2.084, "step": 2900 }, { "epoch": 2.072356866877415, "grad_norm": 28436.697265625, "learning_rate": 1.4451862262825019e-05, "loss": 0.1989, "step": 2950 }, { "epoch": 2.107481559536354, "grad_norm": 18739.8359375, "learning_rate": 1.4188334504567815e-05, "loss": 0.1982, "step": 3000 }, { "epoch": 2.107481559536354, "eval_loss": 0.2037852257490158, "eval_runtime": 67.2417, "eval_samples_per_second": 66.328, "eval_steps_per_second": 2.082, "step": 3000 }, { "epoch": 2.1426062521952933, "grad_norm": 26514.828125, "learning_rate": 1.3924806746310612e-05, "loss": 0.2032, "step": 3050 }, { "epoch": 2.1777309448542326, "grad_norm": 22808.0234375, "learning_rate": 1.3661278988053408e-05, "loss": 0.1944, "step": 3100 }, { "epoch": 2.1777309448542326, "eval_loss": 0.20371171832084656, "eval_runtime": 67.0231, "eval_samples_per_second": 66.544, "eval_steps_per_second": 2.089, "step": 3100 }, { "epoch": 2.212855637513172, "grad_norm": 24228.18359375, "learning_rate": 1.3397751229796205e-05, "loss": 0.2056, "step": 3150 }, { "epoch": 2.247980330172111, "grad_norm": 20969.25390625, "learning_rate": 1.3134223471539003e-05, "loss": 0.1948, "step": 3200 }, { "epoch": 2.247980330172111, "eval_loss": 0.20387396216392517, "eval_runtime": 66.9567, "eval_samples_per_second": 66.61, "eval_steps_per_second": 2.091, "step": 3200 }, { "epoch": 2.2831050228310503, "grad_norm": 42587.73046875, "learning_rate": 1.28706957132818e-05, "loss": 0.2072, "step": 3250 }, { "epoch": 2.3182297154899896, "grad_norm": 22174.130859375, "learning_rate": 1.2607167955024596e-05, "loss": 0.2023, "step": 3300 }, { "epoch": 2.3182297154899896, "eval_loss": 0.20358328521251678, "eval_runtime": 67.1207, "eval_samples_per_second": 66.447, "eval_steps_per_second": 2.086, "step": 3300 }, { "epoch": 2.353354408148929, "grad_norm": 28607.568359375, "learning_rate": 1.2343640196767393e-05, "loss": 0.1964, "step": 3350 }, { "epoch": 2.388479100807868, "grad_norm": 27227.3203125, "learning_rate": 1.208011243851019e-05, "loss": 0.2075, "step": 3400 }, { "epoch": 2.388479100807868, "eval_loss": 0.20336925983428955, "eval_runtime": 67.2613, "eval_samples_per_second": 66.309, "eval_steps_per_second": 2.081, "step": 3400 }, { "epoch": 2.4236037934668073, "grad_norm": 24440.291015625, "learning_rate": 1.1816584680252988e-05, "loss": 0.1999, "step": 3450 }, { "epoch": 2.4587284861257466, "grad_norm": 23327.6328125, "learning_rate": 1.1553056921995784e-05, "loss": 0.2041, "step": 3500 }, { "epoch": 2.4587284861257466, "eval_loss": 0.2032385915517807, "eval_runtime": 67.0192, "eval_samples_per_second": 66.548, "eval_steps_per_second": 2.089, "step": 3500 }, { "epoch": 2.493853178784686, "grad_norm": 23787.681640625, "learning_rate": 1.128952916373858e-05, "loss": 0.1984, "step": 3550 }, { "epoch": 2.528977871443625, "grad_norm": 24526.529296875, "learning_rate": 1.1026001405481377e-05, "loss": 0.1971, "step": 3600 }, { "epoch": 2.528977871443625, "eval_loss": 0.20272360742092133, "eval_runtime": 66.8824, "eval_samples_per_second": 66.684, "eval_steps_per_second": 2.093, "step": 3600 }, { "epoch": 2.564102564102564, "grad_norm": 23948.60546875, "learning_rate": 1.0762473647224174e-05, "loss": 0.1904, "step": 3650 }, { "epoch": 2.5992272567615036, "grad_norm": 17924.513671875, "learning_rate": 1.0498945888966972e-05, "loss": 0.1968, "step": 3700 }, { "epoch": 2.5992272567615036, "eval_loss": 0.20258785784244537, "eval_runtime": 67.0213, "eval_samples_per_second": 66.546, "eval_steps_per_second": 2.089, "step": 3700 }, { "epoch": 2.6343519494204424, "grad_norm": 18695.21875, "learning_rate": 1.0235418130709768e-05, "loss": 0.1961, "step": 3750 }, { "epoch": 2.669476642079382, "grad_norm": 23424.083984375, "learning_rate": 9.971890372452565e-06, "loss": 0.1961, "step": 3800 }, { "epoch": 2.669476642079382, "eval_loss": 0.2024257928133011, "eval_runtime": 67.1877, "eval_samples_per_second": 66.381, "eval_steps_per_second": 2.084, "step": 3800 }, { "epoch": 2.704601334738321, "grad_norm": 18417.158203125, "learning_rate": 9.708362614195362e-06, "loss": 0.2004, "step": 3850 }, { "epoch": 2.73972602739726, "grad_norm": 29204.578125, "learning_rate": 9.444834855938158e-06, "loss": 0.2, "step": 3900 }, { "epoch": 2.73972602739726, "eval_loss": 0.20261028409004211, "eval_runtime": 67.145, "eval_samples_per_second": 66.423, "eval_steps_per_second": 2.085, "step": 3900 }, { "epoch": 2.7748507200561994, "grad_norm": 22810.859375, "learning_rate": 9.181307097680956e-06, "loss": 0.1955, "step": 3950 }, { "epoch": 2.8099754127151386, "grad_norm": 20385.189453125, "learning_rate": 8.917779339423753e-06, "loss": 0.1902, "step": 4000 }, { "epoch": 2.8099754127151386, "eval_loss": 0.20224925875663757, "eval_runtime": 66.8567, "eval_samples_per_second": 66.71, "eval_steps_per_second": 2.094, "step": 4000 }, { "epoch": 2.845100105374078, "grad_norm": 60070.58984375, "learning_rate": 8.65425158116655e-06, "loss": 0.1969, "step": 4050 }, { "epoch": 2.880224798033017, "grad_norm": 20594.654296875, "learning_rate": 8.390723822909348e-06, "loss": 0.2009, "step": 4100 }, { "epoch": 2.880224798033017, "eval_loss": 0.20173698663711548, "eval_runtime": 66.8679, "eval_samples_per_second": 66.699, "eval_steps_per_second": 2.094, "step": 4100 }, { "epoch": 2.9153494906919564, "grad_norm": 22764.1640625, "learning_rate": 8.127196064652143e-06, "loss": 0.1939, "step": 4150 }, { "epoch": 2.9504741833508956, "grad_norm": 22604.9375, "learning_rate": 7.86366830639494e-06, "loss": 0.1991, "step": 4200 }, { "epoch": 2.9504741833508956, "eval_loss": 0.20178209245204926, "eval_runtime": 67.157, "eval_samples_per_second": 66.412, "eval_steps_per_second": 2.085, "step": 4200 }, { "epoch": 2.985598876009835, "grad_norm": 23427.0, "learning_rate": 7.600140548137737e-06, "loss": 0.1982, "step": 4250 }, { "epoch": 3.020723568668774, "grad_norm": 22872.943359375, "learning_rate": 7.336612789880535e-06, "loss": 0.1905, "step": 4300 }, { "epoch": 3.020723568668774, "eval_loss": 0.20212285220623016, "eval_runtime": 66.9569, "eval_samples_per_second": 66.61, "eval_steps_per_second": 2.091, "step": 4300 }, { "epoch": 3.0558482613277134, "grad_norm": 20360.029296875, "learning_rate": 7.073085031623331e-06, "loss": 0.2011, "step": 4350 }, { "epoch": 3.0909729539866526, "grad_norm": 26769.02734375, "learning_rate": 6.809557273366128e-06, "loss": 0.1939, "step": 4400 }, { "epoch": 3.0909729539866526, "eval_loss": 0.20202863216400146, "eval_runtime": 66.9701, "eval_samples_per_second": 66.597, "eval_steps_per_second": 2.09, "step": 4400 }, { "epoch": 3.126097646645592, "grad_norm": 34976.171875, "learning_rate": 6.546029515108924e-06, "loss": 0.1912, "step": 4450 }, { "epoch": 3.161222339304531, "grad_norm": 50123.8671875, "learning_rate": 6.282501756851722e-06, "loss": 0.1934, "step": 4500 }, { "epoch": 3.161222339304531, "eval_loss": 0.20200392603874207, "eval_runtime": 66.9822, "eval_samples_per_second": 66.585, "eval_steps_per_second": 2.09, "step": 4500 }, { "epoch": 3.1963470319634704, "grad_norm": 30103.6484375, "learning_rate": 6.018973998594519e-06, "loss": 0.1891, "step": 4550 }, { "epoch": 3.2314717246224096, "grad_norm": 22014.908203125, "learning_rate": 5.755446240337316e-06, "loss": 0.1933, "step": 4600 }, { "epoch": 3.2314717246224096, "eval_loss": 0.20177535712718964, "eval_runtime": 66.8767, "eval_samples_per_second": 66.69, "eval_steps_per_second": 2.093, "step": 4600 }, { "epoch": 3.266596417281349, "grad_norm": 24894.115234375, "learning_rate": 5.491918482080113e-06, "loss": 0.1921, "step": 4650 }, { "epoch": 3.301721109940288, "grad_norm": 21648.677734375, "learning_rate": 5.2283907238229096e-06, "loss": 0.1914, "step": 4700 }, { "epoch": 3.301721109940288, "eval_loss": 0.20187227427959442, "eval_runtime": 66.9001, "eval_samples_per_second": 66.667, "eval_steps_per_second": 2.093, "step": 4700 }, { "epoch": 3.3368458025992274, "grad_norm": 24555.294921875, "learning_rate": 4.964862965565706e-06, "loss": 0.1914, "step": 4750 }, { "epoch": 3.3719704952581666, "grad_norm": 44338.69921875, "learning_rate": 4.7013352073085035e-06, "loss": 0.1936, "step": 4800 }, { "epoch": 3.3719704952581666, "eval_loss": 0.20171089470386505, "eval_runtime": 67.0479, "eval_samples_per_second": 66.52, "eval_steps_per_second": 2.088, "step": 4800 }, { "epoch": 3.407095187917106, "grad_norm": 23296.537109375, "learning_rate": 4.4378074490513e-06, "loss": 0.1949, "step": 4850 }, { "epoch": 3.442219880576045, "grad_norm": 21337.087890625, "learning_rate": 4.1742796907940974e-06, "loss": 0.1902, "step": 4900 }, { "epoch": 3.442219880576045, "eval_loss": 0.20151035487651825, "eval_runtime": 66.9445, "eval_samples_per_second": 66.622, "eval_steps_per_second": 2.091, "step": 4900 }, { "epoch": 3.4773445732349844, "grad_norm": 20258.736328125, "learning_rate": 3.910751932536894e-06, "loss": 0.1966, "step": 4950 }, { "epoch": 3.512469265893923, "grad_norm": 22937.763671875, "learning_rate": 3.647224174279691e-06, "loss": 0.1949, "step": 5000 }, { "epoch": 3.512469265893923, "eval_loss": 0.2013118416070938, "eval_runtime": 67.0166, "eval_samples_per_second": 66.551, "eval_steps_per_second": 2.089, "step": 5000 }, { "epoch": 3.547593958552863, "grad_norm": 27274.357421875, "learning_rate": 3.383696416022488e-06, "loss": 0.1968, "step": 5050 }, { "epoch": 3.5827186512118017, "grad_norm": 26782.548828125, "learning_rate": 3.1201686577652844e-06, "loss": 0.1878, "step": 5100 }, { "epoch": 3.5827186512118017, "eval_loss": 0.20154449343681335, "eval_runtime": 67.1325, "eval_samples_per_second": 66.436, "eval_steps_per_second": 2.085, "step": 5100 }, { "epoch": 3.6178433438707414, "grad_norm": 18810.177734375, "learning_rate": 2.8566408995080814e-06, "loss": 0.1912, "step": 5150 }, { "epoch": 3.65296803652968, "grad_norm": 26744.78515625, "learning_rate": 2.593113141250879e-06, "loss": 0.1975, "step": 5200 }, { "epoch": 3.65296803652968, "eval_loss": 0.20147912204265594, "eval_runtime": 67.0091, "eval_samples_per_second": 66.558, "eval_steps_per_second": 2.089, "step": 5200 }, { "epoch": 3.68809272918862, "grad_norm": 23326.36328125, "learning_rate": 2.3295853829936753e-06, "loss": 0.1995, "step": 5250 }, { "epoch": 3.7232174218475587, "grad_norm": 21197.091796875, "learning_rate": 2.0660576247364723e-06, "loss": 0.1894, "step": 5300 }, { "epoch": 3.7232174218475587, "eval_loss": 0.20139345526695251, "eval_runtime": 66.9887, "eval_samples_per_second": 66.578, "eval_steps_per_second": 2.09, "step": 5300 }, { "epoch": 3.758342114506498, "grad_norm": 23258.3671875, "learning_rate": 1.8025298664792693e-06, "loss": 0.1941, "step": 5350 }, { "epoch": 3.793466807165437, "grad_norm": 25702.90234375, "learning_rate": 1.539002108222066e-06, "loss": 0.1952, "step": 5400 }, { "epoch": 3.793466807165437, "eval_loss": 0.20133435726165771, "eval_runtime": 67.0042, "eval_samples_per_second": 66.563, "eval_steps_per_second": 2.089, "step": 5400 }, { "epoch": 3.8285914998243764, "grad_norm": 22600.765625, "learning_rate": 1.275474349964863e-06, "loss": 0.1912, "step": 5450 }, { "epoch": 3.8637161924833157, "grad_norm": 25134.44921875, "learning_rate": 1.0119465917076597e-06, "loss": 0.197, "step": 5500 }, { "epoch": 3.8637161924833157, "eval_loss": 0.20129592716693878, "eval_runtime": 67.1868, "eval_samples_per_second": 66.382, "eval_steps_per_second": 2.084, "step": 5500 }, { "epoch": 3.898840885142255, "grad_norm": 22639.22265625, "learning_rate": 7.484188334504568e-07, "loss": 0.1898, "step": 5550 }, { "epoch": 3.933965577801194, "grad_norm": 108627.9453125, "learning_rate": 4.848910751932538e-07, "loss": 0.1887, "step": 5600 }, { "epoch": 3.933965577801194, "eval_loss": 0.2013484090566635, "eval_runtime": 67.1981, "eval_samples_per_second": 66.371, "eval_steps_per_second": 2.083, "step": 5600 }, { "epoch": 3.9690902704601334, "grad_norm": 28155.427734375, "learning_rate": 2.213633169360506e-07, "loss": 0.1955, "step": 5650 } ], "logging_steps": 50, "max_steps": 5692, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.545216223281152e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }