| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.992108229988726, | |
| "eval_steps": 500, | |
| "global_step": 110, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.009019165727170236, | |
| "grad_norm": 0.8173778653144836, | |
| "learning_rate": 4.9989804820704735e-05, | |
| "loss": 0.8968, | |
| "num_input_tokens_seen": 2097152, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.018038331454340473, | |
| "grad_norm": 0.5661506056785583, | |
| "learning_rate": 4.995922759815339e-05, | |
| "loss": 0.8202, | |
| "num_input_tokens_seen": 4194304, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.02705749718151071, | |
| "grad_norm": 0.4142468571662903, | |
| "learning_rate": 4.9908293271567286e-05, | |
| "loss": 0.7783, | |
| "num_input_tokens_seen": 6291456, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.036076662908680945, | |
| "grad_norm": 0.2738800048828125, | |
| "learning_rate": 4.9837043383713753e-05, | |
| "loss": 0.7642, | |
| "num_input_tokens_seen": 8388608, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.04509582863585118, | |
| "grad_norm": 0.2011900395154953, | |
| "learning_rate": 4.9745536047023324e-05, | |
| "loss": 0.7475, | |
| "num_input_tokens_seen": 10485760, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.05411499436302142, | |
| "grad_norm": 0.161564901471138, | |
| "learning_rate": 4.963384589619233e-05, | |
| "loss": 0.7319, | |
| "num_input_tokens_seen": 12582912, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.06313416009019165, | |
| "grad_norm": 0.14298053085803986, | |
| "learning_rate": 4.9502064027309836e-05, | |
| "loss": 0.7125, | |
| "num_input_tokens_seen": 14680064, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.07215332581736189, | |
| "grad_norm": 0.14268024265766144, | |
| "learning_rate": 4.935029792355834e-05, | |
| "loss": 0.7117, | |
| "num_input_tokens_seen": 16777216, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.08117249154453213, | |
| "grad_norm": 0.1712426245212555, | |
| "learning_rate": 4.917867136754893e-05, | |
| "loss": 0.724, | |
| "num_input_tokens_seen": 18874368, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.09019165727170236, | |
| "grad_norm": 0.15666206181049347, | |
| "learning_rate": 4.898732434036244e-05, | |
| "loss": 0.7144, | |
| "num_input_tokens_seen": 20971520, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0992108229988726, | |
| "grad_norm": 0.139165461063385, | |
| "learning_rate": 4.877641290737884e-05, | |
| "loss": 0.697, | |
| "num_input_tokens_seen": 23068672, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.10822998872604284, | |
| "grad_norm": 0.15043741464614868, | |
| "learning_rate": 4.854610909098812e-05, | |
| "loss": 0.7117, | |
| "num_input_tokens_seen": 25165824, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.11724915445321307, | |
| "grad_norm": 0.13478276133537292, | |
| "learning_rate": 4.829660073028631e-05, | |
| "loss": 0.6996, | |
| "num_input_tokens_seen": 27262976, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.1262683201803833, | |
| "grad_norm": 0.12838861346244812, | |
| "learning_rate": 4.802809132787125e-05, | |
| "loss": 0.7172, | |
| "num_input_tokens_seen": 29360128, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.13528748590755355, | |
| "grad_norm": 0.11803996562957764, | |
| "learning_rate": 4.774079988386296e-05, | |
| "loss": 0.725, | |
| "num_input_tokens_seen": 31457280, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.14430665163472378, | |
| "grad_norm": 0.11841780692338943, | |
| "learning_rate": 4.743496071728396e-05, | |
| "loss": 0.7073, | |
| "num_input_tokens_seen": 33554432, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.15332581736189402, | |
| "grad_norm": 0.11901742964982986, | |
| "learning_rate": 4.711082327494536e-05, | |
| "loss": 0.7021, | |
| "num_input_tokens_seen": 35651584, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.16234498308906425, | |
| "grad_norm": 0.11239754408597946, | |
| "learning_rate": 4.6768651927994434e-05, | |
| "loss": 0.6894, | |
| "num_input_tokens_seen": 37748736, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.1713641488162345, | |
| "grad_norm": 0.10254418104887009, | |
| "learning_rate": 4.640872575628973e-05, | |
| "loss": 0.7031, | |
| "num_input_tokens_seen": 39845888, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.18038331454340473, | |
| "grad_norm": 0.09802790731191635, | |
| "learning_rate": 4.6031338320779534e-05, | |
| "loss": 0.6629, | |
| "num_input_tokens_seen": 41943040, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.18940248027057496, | |
| "grad_norm": 0.0993066355586052, | |
| "learning_rate": 4.563679742406935e-05, | |
| "loss": 0.6774, | |
| "num_input_tokens_seen": 44040192, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.1984216459977452, | |
| "grad_norm": 0.09300491958856583, | |
| "learning_rate": 4.522542485937369e-05, | |
| "loss": 0.7041, | |
| "num_input_tokens_seen": 46137344, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.20744081172491544, | |
| "grad_norm": 0.08633296191692352, | |
| "learning_rate": 4.479755614805688e-05, | |
| "loss": 0.6894, | |
| "num_input_tokens_seen": 48234496, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.21645997745208567, | |
| "grad_norm": 0.0841744914650917, | |
| "learning_rate": 4.4353540265977064e-05, | |
| "loss": 0.673, | |
| "num_input_tokens_seen": 50331648, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.2254791431792559, | |
| "grad_norm": 0.09111865609884262, | |
| "learning_rate": 4.389373935885646e-05, | |
| "loss": 0.6691, | |
| "num_input_tokens_seen": 52428800, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.23449830890642615, | |
| "grad_norm": 0.08103901892900467, | |
| "learning_rate": 4.341852844691012e-05, | |
| "loss": 0.6794, | |
| "num_input_tokens_seen": 54525952, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.24351747463359638, | |
| "grad_norm": 0.08423160016536713, | |
| "learning_rate": 4.292829511897409e-05, | |
| "loss": 0.6946, | |
| "num_input_tokens_seen": 56623104, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.2525366403607666, | |
| "grad_norm": 0.09218155592679977, | |
| "learning_rate": 4.242343921638234e-05, | |
| "loss": 0.6939, | |
| "num_input_tokens_seen": 58720256, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.2615558060879369, | |
| "grad_norm": 0.07839576154947281, | |
| "learning_rate": 4.1904372506850484e-05, | |
| "loss": 0.6759, | |
| "num_input_tokens_seen": 60817408, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.2705749718151071, | |
| "grad_norm": 0.08427103608846664, | |
| "learning_rate": 4.137151834863213e-05, | |
| "loss": 0.6856, | |
| "num_input_tokens_seen": 62914560, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.27959413754227735, | |
| "grad_norm": 0.08769369125366211, | |
| "learning_rate": 4.082531134522176e-05, | |
| "loss": 0.6753, | |
| "num_input_tokens_seen": 65011712, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.28861330326944756, | |
| "grad_norm": 0.09144359081983566, | |
| "learning_rate": 4.0266196990885955e-05, | |
| "loss": 0.6769, | |
| "num_input_tokens_seen": 67108864, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.2976324689966178, | |
| "grad_norm": 0.08329298347234726, | |
| "learning_rate": 3.969463130731183e-05, | |
| "loss": 0.6664, | |
| "num_input_tokens_seen": 69206016, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.30665163472378804, | |
| "grad_norm": 0.07759370654821396, | |
| "learning_rate": 3.911108047166924e-05, | |
| "loss": 0.6383, | |
| "num_input_tokens_seen": 71303168, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.3156708004509583, | |
| "grad_norm": 0.07686188071966171, | |
| "learning_rate": 3.851602043638994e-05, | |
| "loss": 0.6726, | |
| "num_input_tokens_seen": 73400320, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.3246899661781285, | |
| "grad_norm": 0.0853535607457161, | |
| "learning_rate": 3.790993654097405e-05, | |
| "loss": 0.6612, | |
| "num_input_tokens_seen": 75497472, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.3337091319052988, | |
| "grad_norm": 0.08194194734096527, | |
| "learning_rate": 3.72933231161401e-05, | |
| "loss": 0.6763, | |
| "num_input_tokens_seen": 77594624, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.342728297632469, | |
| "grad_norm": 0.0810617133975029, | |
| "learning_rate": 3.6666683080641846e-05, | |
| "loss": 0.6533, | |
| "num_input_tokens_seen": 79691776, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.35174746335963925, | |
| "grad_norm": 0.08145678043365479, | |
| "learning_rate": 3.603052753108053e-05, | |
| "loss": 0.6598, | |
| "num_input_tokens_seen": 81788928, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.36076662908680945, | |
| "grad_norm": 0.07560884952545166, | |
| "learning_rate": 3.5385375325047166e-05, | |
| "loss": 0.6801, | |
| "num_input_tokens_seen": 83886080, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.3697857948139797, | |
| "grad_norm": 0.06700561195611954, | |
| "learning_rate": 3.4731752657934794e-05, | |
| "loss": 0.6668, | |
| "num_input_tokens_seen": 85983232, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.3788049605411499, | |
| "grad_norm": 0.076134592294693, | |
| "learning_rate": 3.4070192633766025e-05, | |
| "loss": 0.6756, | |
| "num_input_tokens_seen": 88080384, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.3878241262683202, | |
| "grad_norm": 0.06908991187810898, | |
| "learning_rate": 3.3401234830385756e-05, | |
| "loss": 0.6589, | |
| "num_input_tokens_seen": 90177536, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.3968432919954904, | |
| "grad_norm": 0.07035559415817261, | |
| "learning_rate": 3.272542485937369e-05, | |
| "loss": 0.6435, | |
| "num_input_tokens_seen": 92274688, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.40586245772266066, | |
| "grad_norm": 0.06883817166090012, | |
| "learning_rate": 3.2043313921035743e-05, | |
| "loss": 0.6602, | |
| "num_input_tokens_seen": 94371840, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.41488162344983087, | |
| "grad_norm": 0.06745729595422745, | |
| "learning_rate": 3.135545835483718e-05, | |
| "loss": 0.648, | |
| "num_input_tokens_seen": 96468992, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.42390078917700114, | |
| "grad_norm": 0.06895168870687485, | |
| "learning_rate": 3.0662419185644115e-05, | |
| "loss": 0.6493, | |
| "num_input_tokens_seen": 98566144, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.43291995490417134, | |
| "grad_norm": 0.06701447814702988, | |
| "learning_rate": 2.996476166614364e-05, | |
| "loss": 0.6701, | |
| "num_input_tokens_seen": 100663296, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.4419391206313416, | |
| "grad_norm": 0.07372091710567474, | |
| "learning_rate": 2.92630548158156e-05, | |
| "loss": 0.6674, | |
| "num_input_tokens_seen": 102760448, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.4509582863585118, | |
| "grad_norm": 0.07131503522396088, | |
| "learning_rate": 2.8557870956832132e-05, | |
| "loss": 0.6635, | |
| "num_input_tokens_seen": 104857600, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.4599774520856821, | |
| "grad_norm": 0.0702953040599823, | |
| "learning_rate": 2.7849785247263515e-05, | |
| "loss": 0.6598, | |
| "num_input_tokens_seen": 106954752, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.4689966178128523, | |
| "grad_norm": 0.06502864509820938, | |
| "learning_rate": 2.7139375211970996e-05, | |
| "loss": 0.6732, | |
| "num_input_tokens_seen": 109051904, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.47801578354002255, | |
| "grad_norm": 0.06253138929605484, | |
| "learning_rate": 2.6427220271569203e-05, | |
| "loss": 0.6546, | |
| "num_input_tokens_seen": 111149056, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.48703494926719276, | |
| "grad_norm": 0.06724034994840622, | |
| "learning_rate": 2.5713901269842404e-05, | |
| "loss": 0.6705, | |
| "num_input_tokens_seen": 113246208, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.496054114994363, | |
| "grad_norm": 0.06660095602273941, | |
| "learning_rate": 2.5e-05, | |
| "loss": 0.6463, | |
| "num_input_tokens_seen": 115343360, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.5050732807215332, | |
| "grad_norm": 0.06747590005397797, | |
| "learning_rate": 2.42860987301576e-05, | |
| "loss": 0.6505, | |
| "num_input_tokens_seen": 117440512, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.5140924464487034, | |
| "grad_norm": 0.07006718963384628, | |
| "learning_rate": 2.35727797284308e-05, | |
| "loss": 0.6524, | |
| "num_input_tokens_seen": 119537664, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.5231116121758738, | |
| "grad_norm": 0.06662806868553162, | |
| "learning_rate": 2.2860624788029013e-05, | |
| "loss": 0.6559, | |
| "num_input_tokens_seen": 121634816, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.532130777903044, | |
| "grad_norm": 0.06757567077875137, | |
| "learning_rate": 2.2150214752736488e-05, | |
| "loss": 0.6511, | |
| "num_input_tokens_seen": 123731968, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.5411499436302142, | |
| "grad_norm": 0.07193508744239807, | |
| "learning_rate": 2.1442129043167874e-05, | |
| "loss": 0.6457, | |
| "num_input_tokens_seen": 125829120, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.5501691093573844, | |
| "grad_norm": 0.06620261073112488, | |
| "learning_rate": 2.0736945184184405e-05, | |
| "loss": 0.6492, | |
| "num_input_tokens_seen": 127926272, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.5591882750845547, | |
| "grad_norm": 0.06846100836992264, | |
| "learning_rate": 2.003523833385637e-05, | |
| "loss": 0.6539, | |
| "num_input_tokens_seen": 130023424, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.5682074408117249, | |
| "grad_norm": 0.06885959208011627, | |
| "learning_rate": 1.9337580814355888e-05, | |
| "loss": 0.6417, | |
| "num_input_tokens_seen": 132120576, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.5772266065388951, | |
| "grad_norm": 0.06715461611747742, | |
| "learning_rate": 1.8644541645162834e-05, | |
| "loss": 0.6663, | |
| "num_input_tokens_seen": 134217728, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.5862457722660653, | |
| "grad_norm": 0.06593496352434158, | |
| "learning_rate": 1.795668607896426e-05, | |
| "loss": 0.6572, | |
| "num_input_tokens_seen": 136314880, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.5952649379932357, | |
| "grad_norm": 0.06741371005773544, | |
| "learning_rate": 1.7274575140626318e-05, | |
| "loss": 0.6825, | |
| "num_input_tokens_seen": 138412032, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.6042841037204059, | |
| "grad_norm": 0.06627509742975235, | |
| "learning_rate": 1.6598765169614243e-05, | |
| "loss": 0.6509, | |
| "num_input_tokens_seen": 140509184, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.6133032694475761, | |
| "grad_norm": 0.06725791096687317, | |
| "learning_rate": 1.5929807366233977e-05, | |
| "loss": 0.6619, | |
| "num_input_tokens_seen": 142606336, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.6223224351747464, | |
| "grad_norm": 0.06500604748725891, | |
| "learning_rate": 1.5268247342065215e-05, | |
| "loss": 0.6759, | |
| "num_input_tokens_seen": 144703488, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.6313416009019166, | |
| "grad_norm": 0.07281672209501266, | |
| "learning_rate": 1.4614624674952842e-05, | |
| "loss": 0.6568, | |
| "num_input_tokens_seen": 146800640, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.6403607666290868, | |
| "grad_norm": 0.0661671832203865, | |
| "learning_rate": 1.3969472468919461e-05, | |
| "loss": 0.6472, | |
| "num_input_tokens_seen": 148897792, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.649379932356257, | |
| "grad_norm": 0.06678120791912079, | |
| "learning_rate": 1.3333316919358157e-05, | |
| "loss": 0.6473, | |
| "num_input_tokens_seen": 150994944, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.6583990980834273, | |
| "grad_norm": 0.06216076388955116, | |
| "learning_rate": 1.2706676883859903e-05, | |
| "loss": 0.6485, | |
| "num_input_tokens_seen": 153092096, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.6674182638105975, | |
| "grad_norm": 0.06877604126930237, | |
| "learning_rate": 1.2090063459025955e-05, | |
| "loss": 0.6544, | |
| "num_input_tokens_seen": 155189248, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.6764374295377678, | |
| "grad_norm": 0.06808489561080933, | |
| "learning_rate": 1.148397956361007e-05, | |
| "loss": 0.6763, | |
| "num_input_tokens_seen": 157286400, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.685456595264938, | |
| "grad_norm": 0.06282905489206314, | |
| "learning_rate": 1.0888919528330777e-05, | |
| "loss": 0.6406, | |
| "num_input_tokens_seen": 159383552, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.6944757609921083, | |
| "grad_norm": 0.06371884793043137, | |
| "learning_rate": 1.0305368692688174e-05, | |
| "loss": 0.6502, | |
| "num_input_tokens_seen": 161480704, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.7034949267192785, | |
| "grad_norm": 0.06734833121299744, | |
| "learning_rate": 9.733803009114045e-06, | |
| "loss": 0.6495, | |
| "num_input_tokens_seen": 163577856, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.7125140924464487, | |
| "grad_norm": 0.06442791223526001, | |
| "learning_rate": 9.174688654778243e-06, | |
| "loss": 0.6469, | |
| "num_input_tokens_seen": 165675008, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.7215332581736189, | |
| "grad_norm": 0.06670290976762772, | |
| "learning_rate": 8.628481651367876e-06, | |
| "loss": 0.6642, | |
| "num_input_tokens_seen": 167772160, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.7305524239007892, | |
| "grad_norm": 0.06524420529603958, | |
| "learning_rate": 8.09562749314952e-06, | |
| "loss": 0.6598, | |
| "num_input_tokens_seen": 169869312, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.7395715896279594, | |
| "grad_norm": 0.06147584691643715, | |
| "learning_rate": 7.576560783617668e-06, | |
| "loss": 0.6461, | |
| "num_input_tokens_seen": 171966464, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.7485907553551296, | |
| "grad_norm": 0.06680367887020111, | |
| "learning_rate": 7.071704881025915e-06, | |
| "loss": 0.6706, | |
| "num_input_tokens_seen": 174063616, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.7576099210822999, | |
| "grad_norm": 0.0657731294631958, | |
| "learning_rate": 6.5814715530898745e-06, | |
| "loss": 0.6768, | |
| "num_input_tokens_seen": 176160768, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.7666290868094702, | |
| "grad_norm": 0.06921912729740143, | |
| "learning_rate": 6.106260641143546e-06, | |
| "loss": 0.6446, | |
| "num_input_tokens_seen": 178257920, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.7756482525366404, | |
| "grad_norm": 0.06477085500955582, | |
| "learning_rate": 5.646459734022938e-06, | |
| "loss": 0.6393, | |
| "num_input_tokens_seen": 180355072, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.7846674182638106, | |
| "grad_norm": 0.06648603081703186, | |
| "learning_rate": 5.202443851943126e-06, | |
| "loss": 0.6585, | |
| "num_input_tokens_seen": 182452224, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.7936865839909808, | |
| "grad_norm": 0.06899042427539825, | |
| "learning_rate": 4.7745751406263165e-06, | |
| "loss": 0.6393, | |
| "num_input_tokens_seen": 184549376, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.8027057497181511, | |
| "grad_norm": 0.06467007100582123, | |
| "learning_rate": 4.36320257593065e-06, | |
| "loss": 0.6613, | |
| "num_input_tokens_seen": 186646528, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.8117249154453213, | |
| "grad_norm": 0.06616765260696411, | |
| "learning_rate": 3.968661679220468e-06, | |
| "loss": 0.6631, | |
| "num_input_tokens_seen": 188743680, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.8207440811724915, | |
| "grad_norm": 0.06439518183469772, | |
| "learning_rate": 3.591274243710277e-06, | |
| "loss": 0.6356, | |
| "num_input_tokens_seen": 190840832, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.8297632468996617, | |
| "grad_norm": 0.06294091790914536, | |
| "learning_rate": 3.2313480720055745e-06, | |
| "loss": 0.6475, | |
| "num_input_tokens_seen": 192937984, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.8387824126268321, | |
| "grad_norm": 0.06395729631185532, | |
| "learning_rate": 2.889176725054643e-06, | |
| "loss": 0.6387, | |
| "num_input_tokens_seen": 195035136, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.8478015783540023, | |
| "grad_norm": 0.06324164569377899, | |
| "learning_rate": 2.565039282716045e-06, | |
| "loss": 0.6533, | |
| "num_input_tokens_seen": 197132288, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.8568207440811725, | |
| "grad_norm": 0.06285514682531357, | |
| "learning_rate": 2.2592001161370392e-06, | |
| "loss": 0.6606, | |
| "num_input_tokens_seen": 199229440, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.8658399098083427, | |
| "grad_norm": 0.06468215584754944, | |
| "learning_rate": 1.97190867212875e-06, | |
| "loss": 0.6667, | |
| "num_input_tokens_seen": 201326592, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.874859075535513, | |
| "grad_norm": 0.06431297212839127, | |
| "learning_rate": 1.703399269713693e-06, | |
| "loss": 0.6599, | |
| "num_input_tokens_seen": 203423744, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.8838782412626832, | |
| "grad_norm": 0.06363896280527115, | |
| "learning_rate": 1.4538909090118846e-06, | |
| "loss": 0.6499, | |
| "num_input_tokens_seen": 205520896, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.8928974069898534, | |
| "grad_norm": 0.06607792526483536, | |
| "learning_rate": 1.2235870926211619e-06, | |
| "loss": 0.649, | |
| "num_input_tokens_seen": 207618048, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.9019165727170236, | |
| "grad_norm": 0.06387382745742798, | |
| "learning_rate": 1.0126756596375686e-06, | |
| "loss": 0.6551, | |
| "num_input_tokens_seen": 209715200, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.910935738444194, | |
| "grad_norm": 0.07034407556056976, | |
| "learning_rate": 8.213286324510738e-07, | |
| "loss": 0.668, | |
| "num_input_tokens_seen": 211812352, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.9199549041713642, | |
| "grad_norm": 0.07087666541337967, | |
| "learning_rate": 6.497020764416633e-07, | |
| "loss": 0.6505, | |
| "num_input_tokens_seen": 213909504, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.9289740698985344, | |
| "grad_norm": 0.07003826647996902, | |
| "learning_rate": 4.979359726901639e-07, | |
| "loss": 0.6504, | |
| "num_input_tokens_seen": 216006656, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.9379932356257046, | |
| "grad_norm": 0.06608197838068008, | |
| "learning_rate": 3.6615410380767544e-07, | |
| "loss": 0.6504, | |
| "num_input_tokens_seen": 218103808, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.9470124013528749, | |
| "grad_norm": 0.0653860792517662, | |
| "learning_rate": 2.544639529766829e-07, | |
| "loss": 0.6491, | |
| "num_input_tokens_seen": 220200960, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.9560315670800451, | |
| "grad_norm": 0.06089319288730621, | |
| "learning_rate": 1.6295661628624447e-07, | |
| "loss": 0.6449, | |
| "num_input_tokens_seen": 222298112, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.9650507328072153, | |
| "grad_norm": 0.06895752251148224, | |
| "learning_rate": 9.170672843271666e-08, | |
| "loss": 0.6729, | |
| "num_input_tokens_seen": 224395264, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.9740698985343855, | |
| "grad_norm": 0.06209622696042061, | |
| "learning_rate": 4.07724018466088e-08, | |
| "loss": 0.6308, | |
| "num_input_tokens_seen": 226492416, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.9830890642615558, | |
| "grad_norm": 0.06596114486455917, | |
| "learning_rate": 1.0195179295269252e-08, | |
| "loss": 0.6695, | |
| "num_input_tokens_seen": 228589568, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.992108229988726, | |
| "grad_norm": 0.06238327547907829, | |
| "learning_rate": 0.0, | |
| "loss": 0.6467, | |
| "num_input_tokens_seen": 230686720, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.992108229988726, | |
| "num_input_tokens_seen": 230686720, | |
| "step": 110, | |
| "total_flos": 9.786587384895242e+18, | |
| "train_loss": 0.6728460962122137, | |
| "train_runtime": 17352.9545, | |
| "train_samples_per_second": 3.271, | |
| "train_steps_per_second": 0.006 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 110, | |
| "num_input_tokens_seen": 230686720, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 9.786587384895242e+18, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |