| { | |
| "best_metric": 0.20129592716693878, | |
| "best_model_checkpoint": "./fine-tuned/checkpoint-5500", | |
| "epoch": 3.9985950122936424, | |
| "eval_steps": 100, | |
| "global_step": 5692, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.035124692658939236, | |
| "grad_norm": 31298.91015625, | |
| "learning_rate": 2.9736472241742796e-05, | |
| "loss": 0.2772, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.07024938531787847, | |
| "grad_norm": 28423.171875, | |
| "learning_rate": 2.9472944483485594e-05, | |
| "loss": 0.2575, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.07024938531787847, | |
| "eval_loss": 0.22961987555027008, | |
| "eval_runtime": 67.6563, | |
| "eval_samples_per_second": 65.921, | |
| "eval_steps_per_second": 2.069, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.1053740779768177, | |
| "grad_norm": 28882.9609375, | |
| "learning_rate": 2.9209416725228392e-05, | |
| "loss": 0.24, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.14049877063575694, | |
| "grad_norm": 44492.234375, | |
| "learning_rate": 2.894588896697119e-05, | |
| "loss": 0.2427, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.14049877063575694, | |
| "eval_loss": 0.22477279603481293, | |
| "eval_runtime": 67.2438, | |
| "eval_samples_per_second": 66.326, | |
| "eval_steps_per_second": 2.082, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.17562346329469616, | |
| "grad_norm": 23385.271484375, | |
| "learning_rate": 2.8682361208713985e-05, | |
| "loss": 0.237, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.2107481559536354, | |
| "grad_norm": 65184.7578125, | |
| "learning_rate": 2.841883345045678e-05, | |
| "loss": 0.2351, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.2107481559536354, | |
| "eval_loss": 0.22264569997787476, | |
| "eval_runtime": 67.1557, | |
| "eval_samples_per_second": 66.413, | |
| "eval_steps_per_second": 2.085, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.24587284861257463, | |
| "grad_norm": 26510.09375, | |
| "learning_rate": 2.8155305692199578e-05, | |
| "loss": 0.2387, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.2809975412715139, | |
| "grad_norm": 35873.625, | |
| "learning_rate": 2.7891777933942376e-05, | |
| "loss": 0.239, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.2809975412715139, | |
| "eval_loss": 0.22040367126464844, | |
| "eval_runtime": 67.2556, | |
| "eval_samples_per_second": 66.314, | |
| "eval_steps_per_second": 2.082, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.31612223393045313, | |
| "grad_norm": 190454.703125, | |
| "learning_rate": 2.7628250175685175e-05, | |
| "loss": 0.2343, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.3512469265893923, | |
| "grad_norm": 27248.146484375, | |
| "learning_rate": 2.736472241742797e-05, | |
| "loss": 0.2349, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.3512469265893923, | |
| "eval_loss": 0.21807625889778137, | |
| "eval_runtime": 67.3281, | |
| "eval_samples_per_second": 66.243, | |
| "eval_steps_per_second": 2.079, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.3863716192483316, | |
| "grad_norm": 21019.255859375, | |
| "learning_rate": 2.7101194659170764e-05, | |
| "loss": 0.2286, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.4214963119072708, | |
| "grad_norm": 23071.5703125, | |
| "learning_rate": 2.6837666900913563e-05, | |
| "loss": 0.2311, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.4214963119072708, | |
| "eval_loss": 0.21645724773406982, | |
| "eval_runtime": 67.1857, | |
| "eval_samples_per_second": 66.383, | |
| "eval_steps_per_second": 2.084, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.45662100456621, | |
| "grad_norm": 21536.572265625, | |
| "learning_rate": 2.657413914265636e-05, | |
| "loss": 0.2249, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.49174569722514927, | |
| "grad_norm": 22037.119140625, | |
| "learning_rate": 2.631061138439916e-05, | |
| "loss": 0.2302, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.49174569722514927, | |
| "eval_loss": 0.21522314846515656, | |
| "eval_runtime": 67.377, | |
| "eval_samples_per_second": 66.195, | |
| "eval_steps_per_second": 2.078, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.5268703898840885, | |
| "grad_norm": 24826.04296875, | |
| "learning_rate": 2.6047083626141954e-05, | |
| "loss": 0.2295, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.5619950825430278, | |
| "grad_norm": 21309.46875, | |
| "learning_rate": 2.578355586788475e-05, | |
| "loss": 0.2265, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.5619950825430278, | |
| "eval_loss": 0.21485908329486847, | |
| "eval_runtime": 67.9456, | |
| "eval_samples_per_second": 65.641, | |
| "eval_steps_per_second": 2.06, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.597119775201967, | |
| "grad_norm": 21253.212890625, | |
| "learning_rate": 2.5520028109627547e-05, | |
| "loss": 0.2255, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.6322444678609063, | |
| "grad_norm": 25884.013671875, | |
| "learning_rate": 2.5256500351370345e-05, | |
| "loss": 0.2189, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.6322444678609063, | |
| "eval_loss": 0.21369116008281708, | |
| "eval_runtime": 67.5126, | |
| "eval_samples_per_second": 66.062, | |
| "eval_steps_per_second": 2.074, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.6673691605198454, | |
| "grad_norm": 32345.33203125, | |
| "learning_rate": 2.4992972593113144e-05, | |
| "loss": 0.2177, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.7024938531787847, | |
| "grad_norm": 22764.255859375, | |
| "learning_rate": 2.472944483485594e-05, | |
| "loss": 0.2205, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.7024938531787847, | |
| "eval_loss": 0.2125701606273651, | |
| "eval_runtime": 67.5281, | |
| "eval_samples_per_second": 66.047, | |
| "eval_steps_per_second": 2.073, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.7376185458377239, | |
| "grad_norm": 26256.35546875, | |
| "learning_rate": 2.4465917076598737e-05, | |
| "loss": 0.2224, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.7727432384966632, | |
| "grad_norm": 29107.78515625, | |
| "learning_rate": 2.420238931834153e-05, | |
| "loss": 0.2211, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.7727432384966632, | |
| "eval_loss": 0.2117428034543991, | |
| "eval_runtime": 67.5369, | |
| "eval_samples_per_second": 66.038, | |
| "eval_steps_per_second": 2.073, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.8078679311556024, | |
| "grad_norm": 98354.15625, | |
| "learning_rate": 2.393886156008433e-05, | |
| "loss": 0.215, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.8429926238145417, | |
| "grad_norm": 22886.3984375, | |
| "learning_rate": 2.3675333801827128e-05, | |
| "loss": 0.2229, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.8429926238145417, | |
| "eval_loss": 0.2107735425233841, | |
| "eval_runtime": 67.6295, | |
| "eval_samples_per_second": 65.948, | |
| "eval_steps_per_second": 2.07, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.8781173164734809, | |
| "grad_norm": 20510.26171875, | |
| "learning_rate": 2.3411806043569923e-05, | |
| "loss": 0.2105, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.91324200913242, | |
| "grad_norm": 20053.85546875, | |
| "learning_rate": 2.314827828531272e-05, | |
| "loss": 0.2195, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.91324200913242, | |
| "eval_loss": 0.20966531336307526, | |
| "eval_runtime": 67.6112, | |
| "eval_samples_per_second": 65.965, | |
| "eval_steps_per_second": 2.071, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.9483667017913593, | |
| "grad_norm": 28154.595703125, | |
| "learning_rate": 2.2884750527055516e-05, | |
| "loss": 0.2215, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.9834913944502985, | |
| "grad_norm": 28011.71484375, | |
| "learning_rate": 2.2621222768798314e-05, | |
| "loss": 0.2172, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.9834913944502985, | |
| "eval_loss": 0.20960816740989685, | |
| "eval_runtime": 67.6089, | |
| "eval_samples_per_second": 65.968, | |
| "eval_steps_per_second": 2.071, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.0186160871092378, | |
| "grad_norm": 26518.01171875, | |
| "learning_rate": 2.2357695010541112e-05, | |
| "loss": 0.21, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.053740779768177, | |
| "grad_norm": 20411.26171875, | |
| "learning_rate": 2.2094167252283907e-05, | |
| "loss": 0.2139, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.053740779768177, | |
| "eval_loss": 0.20940540730953217, | |
| "eval_runtime": 67.4684, | |
| "eval_samples_per_second": 66.105, | |
| "eval_steps_per_second": 2.075, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.0888654724271163, | |
| "grad_norm": 25448.7734375, | |
| "learning_rate": 2.1830639494026705e-05, | |
| "loss": 0.2119, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.1239901650860555, | |
| "grad_norm": 20371.7109375, | |
| "learning_rate": 2.15671117357695e-05, | |
| "loss": 0.2074, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.1239901650860555, | |
| "eval_loss": 0.2086929827928543, | |
| "eval_runtime": 67.511, | |
| "eval_samples_per_second": 66.063, | |
| "eval_steps_per_second": 2.074, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.1591148577449948, | |
| "grad_norm": 24624.9609375, | |
| "learning_rate": 2.13035839775123e-05, | |
| "loss": 0.2109, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.194239550403934, | |
| "grad_norm": 28790.974609375, | |
| "learning_rate": 2.1040056219255097e-05, | |
| "loss": 0.2126, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.194239550403934, | |
| "eval_loss": 0.20832768082618713, | |
| "eval_runtime": 67.1973, | |
| "eval_samples_per_second": 66.372, | |
| "eval_steps_per_second": 2.083, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.2293642430628733, | |
| "grad_norm": 22134.93359375, | |
| "learning_rate": 2.077652846099789e-05, | |
| "loss": 0.2118, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.2644889357218125, | |
| "grad_norm": 22432.322265625, | |
| "learning_rate": 2.051300070274069e-05, | |
| "loss": 0.2128, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.2644889357218125, | |
| "eval_loss": 0.20813611149787903, | |
| "eval_runtime": 67.1539, | |
| "eval_samples_per_second": 66.415, | |
| "eval_steps_per_second": 2.085, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.2996136283807518, | |
| "grad_norm": 21562.96484375, | |
| "learning_rate": 2.0249472944483485e-05, | |
| "loss": 0.2135, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.334738321039691, | |
| "grad_norm": 22612.58203125, | |
| "learning_rate": 1.9985945186226283e-05, | |
| "loss": 0.2081, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.334738321039691, | |
| "eval_loss": 0.2073371410369873, | |
| "eval_runtime": 67.1629, | |
| "eval_samples_per_second": 66.406, | |
| "eval_steps_per_second": 2.084, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.36986301369863, | |
| "grad_norm": 22550.556640625, | |
| "learning_rate": 1.972241742796908e-05, | |
| "loss": 0.2037, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.4049877063575693, | |
| "grad_norm": 24281.9140625, | |
| "learning_rate": 1.9458889669711876e-05, | |
| "loss": 0.2111, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.4049877063575693, | |
| "eval_loss": 0.20700447261333466, | |
| "eval_runtime": 67.2893, | |
| "eval_samples_per_second": 66.281, | |
| "eval_steps_per_second": 2.081, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.4401123990165086, | |
| "grad_norm": 25767.197265625, | |
| "learning_rate": 1.9195361911454674e-05, | |
| "loss": 0.2054, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 1.4752370916754478, | |
| "grad_norm": 22215.111328125, | |
| "learning_rate": 1.893183415319747e-05, | |
| "loss": 0.2082, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.4752370916754478, | |
| "eval_loss": 0.20631250739097595, | |
| "eval_runtime": 67.1038, | |
| "eval_samples_per_second": 66.464, | |
| "eval_steps_per_second": 2.086, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.510361784334387, | |
| "grad_norm": 27927.373046875, | |
| "learning_rate": 1.8668306394940267e-05, | |
| "loss": 0.2128, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 1.5454864769933263, | |
| "grad_norm": 25635.267578125, | |
| "learning_rate": 1.8404778636683066e-05, | |
| "loss": 0.2078, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.5454864769933263, | |
| "eval_loss": 0.20582793653011322, | |
| "eval_runtime": 67.2723, | |
| "eval_samples_per_second": 66.298, | |
| "eval_steps_per_second": 2.081, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.5806111696522656, | |
| "grad_norm": 25550.1171875, | |
| "learning_rate": 1.814125087842586e-05, | |
| "loss": 0.2058, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 1.6157358623112048, | |
| "grad_norm": 21671.251953125, | |
| "learning_rate": 1.787772312016866e-05, | |
| "loss": 0.206, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.6157358623112048, | |
| "eval_loss": 0.2059122622013092, | |
| "eval_runtime": 67.4662, | |
| "eval_samples_per_second": 66.107, | |
| "eval_steps_per_second": 2.075, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.650860554970144, | |
| "grad_norm": 21685.947265625, | |
| "learning_rate": 1.7614195361911453e-05, | |
| "loss": 0.2086, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 1.685985247629083, | |
| "grad_norm": 24516.828125, | |
| "learning_rate": 1.7350667603654252e-05, | |
| "loss": 0.2069, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.685985247629083, | |
| "eval_loss": 0.20495346188545227, | |
| "eval_runtime": 67.1671, | |
| "eval_samples_per_second": 66.402, | |
| "eval_steps_per_second": 2.084, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.7211099402880223, | |
| "grad_norm": 22610.7734375, | |
| "learning_rate": 1.708713984539705e-05, | |
| "loss": 0.2052, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 1.7562346329469616, | |
| "grad_norm": 35525.84765625, | |
| "learning_rate": 1.6823612087139845e-05, | |
| "loss": 0.2051, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.7562346329469616, | |
| "eval_loss": 0.20481644570827484, | |
| "eval_runtime": 67.1059, | |
| "eval_samples_per_second": 66.462, | |
| "eval_steps_per_second": 2.086, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.7913593256059008, | |
| "grad_norm": 20207.35546875, | |
| "learning_rate": 1.6560084328882643e-05, | |
| "loss": 0.2049, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 1.82648401826484, | |
| "grad_norm": 17453.359375, | |
| "learning_rate": 1.6296556570625438e-05, | |
| "loss": 0.2101, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 1.82648401826484, | |
| "eval_loss": 0.20485134422779083, | |
| "eval_runtime": 67.202, | |
| "eval_samples_per_second": 66.367, | |
| "eval_steps_per_second": 2.083, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 1.8616087109237793, | |
| "grad_norm": 24568.439453125, | |
| "learning_rate": 1.603302881236824e-05, | |
| "loss": 0.2081, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 1.8967334035827186, | |
| "grad_norm": 22425.1875, | |
| "learning_rate": 1.5769501054111034e-05, | |
| "loss": 0.2032, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.8967334035827186, | |
| "eval_loss": 0.2041337788105011, | |
| "eval_runtime": 67.2372, | |
| "eval_samples_per_second": 66.332, | |
| "eval_steps_per_second": 2.082, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.9318580962416578, | |
| "grad_norm": 21858.3828125, | |
| "learning_rate": 1.550597329585383e-05, | |
| "loss": 0.2074, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 1.966982788900597, | |
| "grad_norm": 17712.39453125, | |
| "learning_rate": 1.5242445537596626e-05, | |
| "loss": 0.205, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.966982788900597, | |
| "eval_loss": 0.20371712744235992, | |
| "eval_runtime": 67.1299, | |
| "eval_samples_per_second": 66.438, | |
| "eval_steps_per_second": 2.086, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 2.0021074815595363, | |
| "grad_norm": 20413.91796875, | |
| "learning_rate": 1.4978917779339424e-05, | |
| "loss": 0.203, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 2.0372321742184756, | |
| "grad_norm": 21380.130859375, | |
| "learning_rate": 1.471539002108222e-05, | |
| "loss": 0.199, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 2.0372321742184756, | |
| "eval_loss": 0.20416177809238434, | |
| "eval_runtime": 67.1771, | |
| "eval_samples_per_second": 66.392, | |
| "eval_steps_per_second": 2.084, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 2.072356866877415, | |
| "grad_norm": 28436.697265625, | |
| "learning_rate": 1.4451862262825019e-05, | |
| "loss": 0.1989, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 2.107481559536354, | |
| "grad_norm": 18739.8359375, | |
| "learning_rate": 1.4188334504567815e-05, | |
| "loss": 0.1982, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 2.107481559536354, | |
| "eval_loss": 0.2037852257490158, | |
| "eval_runtime": 67.2417, | |
| "eval_samples_per_second": 66.328, | |
| "eval_steps_per_second": 2.082, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 2.1426062521952933, | |
| "grad_norm": 26514.828125, | |
| "learning_rate": 1.3924806746310612e-05, | |
| "loss": 0.2032, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 2.1777309448542326, | |
| "grad_norm": 22808.0234375, | |
| "learning_rate": 1.3661278988053408e-05, | |
| "loss": 0.1944, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 2.1777309448542326, | |
| "eval_loss": 0.20371171832084656, | |
| "eval_runtime": 67.0231, | |
| "eval_samples_per_second": 66.544, | |
| "eval_steps_per_second": 2.089, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 2.212855637513172, | |
| "grad_norm": 24228.18359375, | |
| "learning_rate": 1.3397751229796205e-05, | |
| "loss": 0.2056, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 2.247980330172111, | |
| "grad_norm": 20969.25390625, | |
| "learning_rate": 1.3134223471539003e-05, | |
| "loss": 0.1948, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 2.247980330172111, | |
| "eval_loss": 0.20387396216392517, | |
| "eval_runtime": 66.9567, | |
| "eval_samples_per_second": 66.61, | |
| "eval_steps_per_second": 2.091, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 2.2831050228310503, | |
| "grad_norm": 42587.73046875, | |
| "learning_rate": 1.28706957132818e-05, | |
| "loss": 0.2072, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 2.3182297154899896, | |
| "grad_norm": 22174.130859375, | |
| "learning_rate": 1.2607167955024596e-05, | |
| "loss": 0.2023, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 2.3182297154899896, | |
| "eval_loss": 0.20358328521251678, | |
| "eval_runtime": 67.1207, | |
| "eval_samples_per_second": 66.447, | |
| "eval_steps_per_second": 2.086, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 2.353354408148929, | |
| "grad_norm": 28607.568359375, | |
| "learning_rate": 1.2343640196767393e-05, | |
| "loss": 0.1964, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 2.388479100807868, | |
| "grad_norm": 27227.3203125, | |
| "learning_rate": 1.208011243851019e-05, | |
| "loss": 0.2075, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 2.388479100807868, | |
| "eval_loss": 0.20336925983428955, | |
| "eval_runtime": 67.2613, | |
| "eval_samples_per_second": 66.309, | |
| "eval_steps_per_second": 2.081, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 2.4236037934668073, | |
| "grad_norm": 24440.291015625, | |
| "learning_rate": 1.1816584680252988e-05, | |
| "loss": 0.1999, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 2.4587284861257466, | |
| "grad_norm": 23327.6328125, | |
| "learning_rate": 1.1553056921995784e-05, | |
| "loss": 0.2041, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 2.4587284861257466, | |
| "eval_loss": 0.2032385915517807, | |
| "eval_runtime": 67.0192, | |
| "eval_samples_per_second": 66.548, | |
| "eval_steps_per_second": 2.089, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 2.493853178784686, | |
| "grad_norm": 23787.681640625, | |
| "learning_rate": 1.128952916373858e-05, | |
| "loss": 0.1984, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 2.528977871443625, | |
| "grad_norm": 24526.529296875, | |
| "learning_rate": 1.1026001405481377e-05, | |
| "loss": 0.1971, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 2.528977871443625, | |
| "eval_loss": 0.20272360742092133, | |
| "eval_runtime": 66.8824, | |
| "eval_samples_per_second": 66.684, | |
| "eval_steps_per_second": 2.093, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 2.564102564102564, | |
| "grad_norm": 23948.60546875, | |
| "learning_rate": 1.0762473647224174e-05, | |
| "loss": 0.1904, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 2.5992272567615036, | |
| "grad_norm": 17924.513671875, | |
| "learning_rate": 1.0498945888966972e-05, | |
| "loss": 0.1968, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 2.5992272567615036, | |
| "eval_loss": 0.20258785784244537, | |
| "eval_runtime": 67.0213, | |
| "eval_samples_per_second": 66.546, | |
| "eval_steps_per_second": 2.089, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 2.6343519494204424, | |
| "grad_norm": 18695.21875, | |
| "learning_rate": 1.0235418130709768e-05, | |
| "loss": 0.1961, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 2.669476642079382, | |
| "grad_norm": 23424.083984375, | |
| "learning_rate": 9.971890372452565e-06, | |
| "loss": 0.1961, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 2.669476642079382, | |
| "eval_loss": 0.2024257928133011, | |
| "eval_runtime": 67.1877, | |
| "eval_samples_per_second": 66.381, | |
| "eval_steps_per_second": 2.084, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 2.704601334738321, | |
| "grad_norm": 18417.158203125, | |
| "learning_rate": 9.708362614195362e-06, | |
| "loss": 0.2004, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 2.73972602739726, | |
| "grad_norm": 29204.578125, | |
| "learning_rate": 9.444834855938158e-06, | |
| "loss": 0.2, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 2.73972602739726, | |
| "eval_loss": 0.20261028409004211, | |
| "eval_runtime": 67.145, | |
| "eval_samples_per_second": 66.423, | |
| "eval_steps_per_second": 2.085, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 2.7748507200561994, | |
| "grad_norm": 22810.859375, | |
| "learning_rate": 9.181307097680956e-06, | |
| "loss": 0.1955, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 2.8099754127151386, | |
| "grad_norm": 20385.189453125, | |
| "learning_rate": 8.917779339423753e-06, | |
| "loss": 0.1902, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 2.8099754127151386, | |
| "eval_loss": 0.20224925875663757, | |
| "eval_runtime": 66.8567, | |
| "eval_samples_per_second": 66.71, | |
| "eval_steps_per_second": 2.094, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 2.845100105374078, | |
| "grad_norm": 60070.58984375, | |
| "learning_rate": 8.65425158116655e-06, | |
| "loss": 0.1969, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 2.880224798033017, | |
| "grad_norm": 20594.654296875, | |
| "learning_rate": 8.390723822909348e-06, | |
| "loss": 0.2009, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 2.880224798033017, | |
| "eval_loss": 0.20173698663711548, | |
| "eval_runtime": 66.8679, | |
| "eval_samples_per_second": 66.699, | |
| "eval_steps_per_second": 2.094, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 2.9153494906919564, | |
| "grad_norm": 22764.1640625, | |
| "learning_rate": 8.127196064652143e-06, | |
| "loss": 0.1939, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 2.9504741833508956, | |
| "grad_norm": 22604.9375, | |
| "learning_rate": 7.86366830639494e-06, | |
| "loss": 0.1991, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 2.9504741833508956, | |
| "eval_loss": 0.20178209245204926, | |
| "eval_runtime": 67.157, | |
| "eval_samples_per_second": 66.412, | |
| "eval_steps_per_second": 2.085, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 2.985598876009835, | |
| "grad_norm": 23427.0, | |
| "learning_rate": 7.600140548137737e-06, | |
| "loss": 0.1982, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 3.020723568668774, | |
| "grad_norm": 22872.943359375, | |
| "learning_rate": 7.336612789880535e-06, | |
| "loss": 0.1905, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 3.020723568668774, | |
| "eval_loss": 0.20212285220623016, | |
| "eval_runtime": 66.9569, | |
| "eval_samples_per_second": 66.61, | |
| "eval_steps_per_second": 2.091, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 3.0558482613277134, | |
| "grad_norm": 20360.029296875, | |
| "learning_rate": 7.073085031623331e-06, | |
| "loss": 0.2011, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 3.0909729539866526, | |
| "grad_norm": 26769.02734375, | |
| "learning_rate": 6.809557273366128e-06, | |
| "loss": 0.1939, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 3.0909729539866526, | |
| "eval_loss": 0.20202863216400146, | |
| "eval_runtime": 66.9701, | |
| "eval_samples_per_second": 66.597, | |
| "eval_steps_per_second": 2.09, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 3.126097646645592, | |
| "grad_norm": 34976.171875, | |
| "learning_rate": 6.546029515108924e-06, | |
| "loss": 0.1912, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 3.161222339304531, | |
| "grad_norm": 50123.8671875, | |
| "learning_rate": 6.282501756851722e-06, | |
| "loss": 0.1934, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 3.161222339304531, | |
| "eval_loss": 0.20200392603874207, | |
| "eval_runtime": 66.9822, | |
| "eval_samples_per_second": 66.585, | |
| "eval_steps_per_second": 2.09, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 3.1963470319634704, | |
| "grad_norm": 30103.6484375, | |
| "learning_rate": 6.018973998594519e-06, | |
| "loss": 0.1891, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 3.2314717246224096, | |
| "grad_norm": 22014.908203125, | |
| "learning_rate": 5.755446240337316e-06, | |
| "loss": 0.1933, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 3.2314717246224096, | |
| "eval_loss": 0.20177535712718964, | |
| "eval_runtime": 66.8767, | |
| "eval_samples_per_second": 66.69, | |
| "eval_steps_per_second": 2.093, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 3.266596417281349, | |
| "grad_norm": 24894.115234375, | |
| "learning_rate": 5.491918482080113e-06, | |
| "loss": 0.1921, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 3.301721109940288, | |
| "grad_norm": 21648.677734375, | |
| "learning_rate": 5.2283907238229096e-06, | |
| "loss": 0.1914, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 3.301721109940288, | |
| "eval_loss": 0.20187227427959442, | |
| "eval_runtime": 66.9001, | |
| "eval_samples_per_second": 66.667, | |
| "eval_steps_per_second": 2.093, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 3.3368458025992274, | |
| "grad_norm": 24555.294921875, | |
| "learning_rate": 4.964862965565706e-06, | |
| "loss": 0.1914, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 3.3719704952581666, | |
| "grad_norm": 44338.69921875, | |
| "learning_rate": 4.7013352073085035e-06, | |
| "loss": 0.1936, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 3.3719704952581666, | |
| "eval_loss": 0.20171089470386505, | |
| "eval_runtime": 67.0479, | |
| "eval_samples_per_second": 66.52, | |
| "eval_steps_per_second": 2.088, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 3.407095187917106, | |
| "grad_norm": 23296.537109375, | |
| "learning_rate": 4.4378074490513e-06, | |
| "loss": 0.1949, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 3.442219880576045, | |
| "grad_norm": 21337.087890625, | |
| "learning_rate": 4.1742796907940974e-06, | |
| "loss": 0.1902, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 3.442219880576045, | |
| "eval_loss": 0.20151035487651825, | |
| "eval_runtime": 66.9445, | |
| "eval_samples_per_second": 66.622, | |
| "eval_steps_per_second": 2.091, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 3.4773445732349844, | |
| "grad_norm": 20258.736328125, | |
| "learning_rate": 3.910751932536894e-06, | |
| "loss": 0.1966, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 3.512469265893923, | |
| "grad_norm": 22937.763671875, | |
| "learning_rate": 3.647224174279691e-06, | |
| "loss": 0.1949, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 3.512469265893923, | |
| "eval_loss": 0.2013118416070938, | |
| "eval_runtime": 67.0166, | |
| "eval_samples_per_second": 66.551, | |
| "eval_steps_per_second": 2.089, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 3.547593958552863, | |
| "grad_norm": 27274.357421875, | |
| "learning_rate": 3.383696416022488e-06, | |
| "loss": 0.1968, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 3.5827186512118017, | |
| "grad_norm": 26782.548828125, | |
| "learning_rate": 3.1201686577652844e-06, | |
| "loss": 0.1878, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 3.5827186512118017, | |
| "eval_loss": 0.20154449343681335, | |
| "eval_runtime": 67.1325, | |
| "eval_samples_per_second": 66.436, | |
| "eval_steps_per_second": 2.085, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 3.6178433438707414, | |
| "grad_norm": 18810.177734375, | |
| "learning_rate": 2.8566408995080814e-06, | |
| "loss": 0.1912, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 3.65296803652968, | |
| "grad_norm": 26744.78515625, | |
| "learning_rate": 2.593113141250879e-06, | |
| "loss": 0.1975, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 3.65296803652968, | |
| "eval_loss": 0.20147912204265594, | |
| "eval_runtime": 67.0091, | |
| "eval_samples_per_second": 66.558, | |
| "eval_steps_per_second": 2.089, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 3.68809272918862, | |
| "grad_norm": 23326.36328125, | |
| "learning_rate": 2.3295853829936753e-06, | |
| "loss": 0.1995, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 3.7232174218475587, | |
| "grad_norm": 21197.091796875, | |
| "learning_rate": 2.0660576247364723e-06, | |
| "loss": 0.1894, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 3.7232174218475587, | |
| "eval_loss": 0.20139345526695251, | |
| "eval_runtime": 66.9887, | |
| "eval_samples_per_second": 66.578, | |
| "eval_steps_per_second": 2.09, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 3.758342114506498, | |
| "grad_norm": 23258.3671875, | |
| "learning_rate": 1.8025298664792693e-06, | |
| "loss": 0.1941, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 3.793466807165437, | |
| "grad_norm": 25702.90234375, | |
| "learning_rate": 1.539002108222066e-06, | |
| "loss": 0.1952, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 3.793466807165437, | |
| "eval_loss": 0.20133435726165771, | |
| "eval_runtime": 67.0042, | |
| "eval_samples_per_second": 66.563, | |
| "eval_steps_per_second": 2.089, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 3.8285914998243764, | |
| "grad_norm": 22600.765625, | |
| "learning_rate": 1.275474349964863e-06, | |
| "loss": 0.1912, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 3.8637161924833157, | |
| "grad_norm": 25134.44921875, | |
| "learning_rate": 1.0119465917076597e-06, | |
| "loss": 0.197, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 3.8637161924833157, | |
| "eval_loss": 0.20129592716693878, | |
| "eval_runtime": 67.1868, | |
| "eval_samples_per_second": 66.382, | |
| "eval_steps_per_second": 2.084, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 3.898840885142255, | |
| "grad_norm": 22639.22265625, | |
| "learning_rate": 7.484188334504568e-07, | |
| "loss": 0.1898, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 3.933965577801194, | |
| "grad_norm": 108627.9453125, | |
| "learning_rate": 4.848910751932538e-07, | |
| "loss": 0.1887, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 3.933965577801194, | |
| "eval_loss": 0.2013484090566635, | |
| "eval_runtime": 67.1981, | |
| "eval_samples_per_second": 66.371, | |
| "eval_steps_per_second": 2.083, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 3.9690902704601334, | |
| "grad_norm": 28155.427734375, | |
| "learning_rate": 2.213633169360506e-07, | |
| "loss": 0.1955, | |
| "step": 5650 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 5692, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.545216223281152e+16, | |
| "train_batch_size": 32, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |