{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 12.698412698412698,
  "eval_steps": 500,
  "global_step": 800,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.15873015873015872,
      "grad_norm": 0.42645490169525146,
      "learning_rate": 4.999720254525684e-05,
      "loss": 1.3067,
      "num_input_tokens_seen": 269280,
      "step": 10
    },
    {
      "epoch": 0.31746031746031744,
      "grad_norm": 0.10797163844108582,
      "learning_rate": 4.9987533135093934e-05,
      "loss": 0.2064,
      "num_input_tokens_seen": 536656,
      "step": 20
    },
    {
      "epoch": 0.47619047619047616,
      "grad_norm": 0.10832954943180084,
      "learning_rate": 4.997095990396411e-05,
      "loss": 0.2025,
      "num_input_tokens_seen": 804720,
      "step": 30
    },
    {
      "epoch": 0.6349206349206349,
      "grad_norm": 0.11103782057762146,
      "learning_rate": 4.994748743089566e-05,
      "loss": 0.2011,
      "num_input_tokens_seen": 1073520,
      "step": 40
    },
    {
      "epoch": 0.7936507936507936,
      "grad_norm": 0.09958792477846146,
      "learning_rate": 4.9917122201112656e-05,
      "loss": 0.2028,
      "num_input_tokens_seen": 1341184,
      "step": 50
    },
    {
      "epoch": 0.9523809523809523,
      "grad_norm": 0.6581681370735168,
      "learning_rate": 4.9879872604243184e-05,
      "loss": 0.1993,
      "num_input_tokens_seen": 1609968,
      "step": 60
    },
    {
      "epoch": 1.1111111111111112,
      "grad_norm": 0.2799030542373657,
      "learning_rate": 4.983574893200139e-05,
      "loss": 0.1979,
      "num_input_tokens_seen": 1878240,
      "step": 70
    },
    {
      "epoch": 1.2698412698412698,
      "grad_norm": 0.17586013674736023,
      "learning_rate": 4.978476337534393e-05,
      "loss": 0.1931,
      "num_input_tokens_seen": 2146528,
      "step": 80
    },
    {
      "epoch": 1.4285714285714286,
      "grad_norm": 0.24899278581142426,
      "learning_rate": 4.972693002110176e-05,
      "loss": 0.1931,
      "num_input_tokens_seen": 2415696,
      "step": 90
    },
    {
      "epoch": 1.5873015873015874,
      "grad_norm": 0.16181747615337372,
      "learning_rate": 4.9662264848088034e-05,
      "loss": 0.192,
      "num_input_tokens_seen": 2683600,
      "step": 100
    },
    {
      "epoch": 1.746031746031746,
      "grad_norm": 0.18402352929115295,
      "learning_rate": 4.959078572268337e-05,
      "loss": 0.1874,
      "num_input_tokens_seen": 2950720,
      "step": 110
    },
    {
      "epoch": 1.9047619047619047,
      "grad_norm": 0.2943824827671051,
      "learning_rate": 4.951251239389948e-05,
      "loss": 0.1871,
      "num_input_tokens_seen": 3219792,
      "step": 120
    },
    {
      "epoch": 2.0634920634920633,
      "grad_norm": 0.18450967967510223,
      "learning_rate": 4.942746648792274e-05,
      "loss": 0.1887,
      "num_input_tokens_seen": 3488400,
      "step": 130
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 0.2516356408596039,
      "learning_rate": 4.9335671502139024e-05,
      "loss": 0.1876,
      "num_input_tokens_seen": 3757952,
      "step": 140
    },
    {
      "epoch": 2.380952380952381,
      "grad_norm": 0.23607608675956726,
      "learning_rate": 4.9237152798641696e-05,
      "loss": 0.1843,
      "num_input_tokens_seen": 4025536,
      "step": 150
    },
    {
      "epoch": 2.5396825396825395,
      "grad_norm": 0.1812293380498886,
      "learning_rate": 4.9131937597224185e-05,
      "loss": 0.1791,
      "num_input_tokens_seen": 4294240,
      "step": 160
    },
    {
      "epoch": 2.6984126984126986,
      "grad_norm": 0.1874535083770752,
      "learning_rate": 4.902005496785951e-05,
      "loss": 0.1851,
      "num_input_tokens_seen": 4563376,
      "step": 170
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 0.25721630454063416,
      "learning_rate": 4.8901535822668446e-05,
      "loss": 0.1836,
      "num_input_tokens_seen": 4831168,
      "step": 180
    },
    {
      "epoch": 3.015873015873016,
      "grad_norm": 0.22797122597694397,
      "learning_rate": 4.877641290737884e-05,
      "loss": 0.1834,
      "num_input_tokens_seen": 5098496,
      "step": 190
    },
    {
      "epoch": 3.1746031746031744,
      "grad_norm": 0.16337507963180542,
      "learning_rate": 4.8644720792278264e-05,
      "loss": 0.186,
      "num_input_tokens_seen": 5368864,
      "step": 200
    },
    {
      "epoch": 3.3333333333333335,
      "grad_norm": 0.17769697308540344,
      "learning_rate": 4.850649586266255e-05,
      "loss": 0.1803,
      "num_input_tokens_seen": 5637456,
      "step": 210
    },
    {
      "epoch": 3.492063492063492,
      "grad_norm": 0.2481444925069809,
      "learning_rate": 4.836177630878289e-05,
      "loss": 0.1798,
      "num_input_tokens_seen": 5905104,
      "step": 220
    },
    {
      "epoch": 3.6507936507936507,
      "grad_norm": 0.22745923697948456,
      "learning_rate": 4.821060211529424e-05,
      "loss": 0.1815,
      "num_input_tokens_seen": 6174032,
      "step": 230
    },
    {
      "epoch": 3.8095238095238093,
      "grad_norm": 0.16727988421916962,
      "learning_rate": 4.8053015050207915e-05,
      "loss": 0.1811,
      "num_input_tokens_seen": 6442896,
      "step": 240
    },
    {
      "epoch": 3.9682539682539684,
      "grad_norm": 0.3471706807613373,
      "learning_rate": 4.7889058653351485e-05,
      "loss": 0.1795,
      "num_input_tokens_seen": 6710352,
      "step": 250
    },
    {
      "epoch": 4.1269841269841265,
      "grad_norm": 0.23989547789096832,
      "learning_rate": 4.771877822433911e-05,
      "loss": 0.1769,
      "num_input_tokens_seen": 6977744,
      "step": 260
    },
    {
      "epoch": 4.285714285714286,
      "grad_norm": 0.23704519867897034,
      "learning_rate": 4.754222081005574e-05,
      "loss": 0.174,
      "num_input_tokens_seen": 7246272,
      "step": 270
    },
    {
      "epoch": 4.444444444444445,
      "grad_norm": 0.2272966355085373,
      "learning_rate": 4.7359435191658425e-05,
      "loss": 0.1716,
      "num_input_tokens_seen": 7512592,
      "step": 280
    },
    {
      "epoch": 4.603174603174603,
      "grad_norm": 0.23121878504753113,
      "learning_rate": 4.717047187109861e-05,
      "loss": 0.1804,
      "num_input_tokens_seen": 7780144,
      "step": 290
    },
    {
      "epoch": 4.761904761904762,
      "grad_norm": 0.31674066185951233,
      "learning_rate": 4.697538305716885e-05,
      "loss": 0.1784,
      "num_input_tokens_seen": 8049392,
      "step": 300
    },
    {
      "epoch": 4.920634920634921,
      "grad_norm": 0.2399132400751114,
      "learning_rate": 4.6774222651078106e-05,
      "loss": 0.1796,
      "num_input_tokens_seen": 8316912,
      "step": 310
    },
    {
      "epoch": 5.079365079365079,
      "grad_norm": 0.2677905261516571,
      "learning_rate": 4.656704623155922e-05,
      "loss": 0.1736,
      "num_input_tokens_seen": 8586544,
      "step": 320
    },
    {
      "epoch": 5.238095238095238,
      "grad_norm": 0.33959662914276123,
      "learning_rate": 4.6353911039513145e-05,
      "loss": 0.1766,
      "num_input_tokens_seen": 8855680,
      "step": 330
    },
    {
      "epoch": 5.396825396825397,
      "grad_norm": 0.26891693472862244,
      "learning_rate": 4.613487596219376e-05,
      "loss": 0.1724,
      "num_input_tokens_seen": 9123808,
      "step": 340
    },
    {
      "epoch": 5.555555555555555,
      "grad_norm": 0.2796987295150757,
      "learning_rate": 4.591000151693789e-05,
      "loss": 0.1721,
      "num_input_tokens_seen": 9392560,
      "step": 350
    },
    {
      "epoch": 5.714285714285714,
      "grad_norm": 0.257348895072937,
      "learning_rate": 4.567934983444495e-05,
      "loss": 0.1718,
      "num_input_tokens_seen": 9660480,
      "step": 360
    },
    {
      "epoch": 5.8730158730158735,
      "grad_norm": 0.2910774052143097,
      "learning_rate": 4.544298464161079e-05,
      "loss": 0.1718,
      "num_input_tokens_seen": 9927936,
      "step": 370
    },
    {
      "epoch": 6.031746031746032,
      "grad_norm": 0.3452795445919037,
      "learning_rate": 4.520097124392055e-05,
      "loss": 0.1711,
      "num_input_tokens_seen": 10197520,
      "step": 380
    },
    {
      "epoch": 6.190476190476191,
      "grad_norm": 0.46368861198425293,
      "learning_rate": 4.49533765074054e-05,
      "loss": 0.1652,
      "num_input_tokens_seen": 10466240,
      "step": 390
    },
    {
      "epoch": 6.349206349206349,
      "grad_norm": 0.42205390334129333,
      "learning_rate": 4.4700268840168045e-05,
      "loss": 0.1677,
      "num_input_tokens_seen": 10734496,
      "step": 400
    },
    {
      "epoch": 6.507936507936508,
      "grad_norm": 0.25223520398139954,
      "learning_rate": 4.444171817348225e-05,
      "loss": 0.1684,
      "num_input_tokens_seen": 11004416,
      "step": 410
    },
    {
      "epoch": 6.666666666666667,
      "grad_norm": 0.4380488991737366,
      "learning_rate": 4.417779594247143e-05,
      "loss": 0.1655,
      "num_input_tokens_seen": 11272656,
      "step": 420
    },
    {
      "epoch": 6.825396825396825,
      "grad_norm": 0.2701490819454193,
      "learning_rate": 4.3908575066371835e-05,
      "loss": 0.1722,
      "num_input_tokens_seen": 11540112,
      "step": 430
    },
    {
      "epoch": 6.984126984126984,
      "grad_norm": 0.3422671854496002,
      "learning_rate": 4.363412992838566e-05,
      "loss": 0.1676,
      "num_input_tokens_seen": 11808816,
      "step": 440
    },
    {
      "epoch": 7.142857142857143,
      "grad_norm": 0.6143015623092651,
      "learning_rate": 4.335453635512961e-05,
      "loss": 0.1538,
      "num_input_tokens_seen": 12077648,
      "step": 450
    },
    {
      "epoch": 7.301587301587301,
      "grad_norm": 0.44244784116744995,
      "learning_rate": 4.306987159568479e-05,
      "loss": 0.1572,
      "num_input_tokens_seen": 12346240,
      "step": 460
    },
    {
      "epoch": 7.4603174603174605,
      "grad_norm": 0.441853404045105,
      "learning_rate": 4.278021430025343e-05,
      "loss": 0.1587,
      "num_input_tokens_seen": 12614864,
      "step": 470
    },
    {
      "epoch": 7.619047619047619,
      "grad_norm": 0.520702600479126,
      "learning_rate": 4.248564449842864e-05,
      "loss": 0.1616,
      "num_input_tokens_seen": 12883088,
      "step": 480
    },
    {
      "epoch": 7.777777777777778,
      "grad_norm": 0.473958283662796,
      "learning_rate": 4.2186243577082954e-05,
      "loss": 0.1602,
      "num_input_tokens_seen": 13151264,
      "step": 490
    },
    {
      "epoch": 7.936507936507937,
      "grad_norm": 0.4550235867500305,
      "learning_rate": 4.1882094257881885e-05,
      "loss": 0.1597,
      "num_input_tokens_seen": 13419344,
      "step": 500
    },
    {
      "epoch": 8.095238095238095,
      "grad_norm": 0.7338590025901794,
      "learning_rate": 4.157328057442874e-05,
      "loss": 0.1473,
      "num_input_tokens_seen": 13686752,
      "step": 510
    },
    {
      "epoch": 8.253968253968253,
      "grad_norm": 0.6510297060012817,
      "learning_rate": 4.1259887849046906e-05,
      "loss": 0.1363,
      "num_input_tokens_seen": 13954352,
      "step": 520
    },
    {
      "epoch": 8.412698412698413,
      "grad_norm": 0.767859160900116,
      "learning_rate": 4.0942002669206085e-05,
      "loss": 0.1408,
      "num_input_tokens_seen": 14222352,
      "step": 530
    },
    {
      "epoch": 8.571428571428571,
      "grad_norm": 0.7285030484199524,
      "learning_rate": 4.0619712863599e-05,
      "loss": 0.1422,
      "num_input_tokens_seen": 14491920,
      "step": 540
    },
    {
      "epoch": 8.73015873015873,
      "grad_norm": 0.6987579464912415,
      "learning_rate": 4.029310747787516e-05,
      "loss": 0.1483,
      "num_input_tokens_seen": 14760400,
      "step": 550
    },
    {
      "epoch": 8.88888888888889,
      "grad_norm": 0.7618018984794617,
      "learning_rate": 3.996227675003834e-05,
      "loss": 0.1437,
      "num_input_tokens_seen": 15029280,
      "step": 560
    },
    {
      "epoch": 9.047619047619047,
      "grad_norm": 0.7082319855690002,
      "learning_rate": 3.962731208551474e-05,
      "loss": 0.1386,
      "num_input_tokens_seen": 15298416,
      "step": 570
    },
    {
      "epoch": 9.206349206349206,
      "grad_norm": 0.9523563385009766,
      "learning_rate": 3.928830603189844e-05,
      "loss": 0.1034,
      "num_input_tokens_seen": 15567104,
      "step": 580
    },
    {
      "epoch": 9.365079365079366,
      "grad_norm": 1.1607928276062012,
      "learning_rate": 3.894535225338143e-05,
      "loss": 0.1073,
      "num_input_tokens_seen": 15835952,
      "step": 590
    },
    {
      "epoch": 9.523809523809524,
      "grad_norm": 1.0483174324035645,
      "learning_rate": 3.859854550487506e-05,
      "loss": 0.1124,
      "num_input_tokens_seen": 16103648,
      "step": 600
    },
    {
      "epoch": 9.682539682539682,
      "grad_norm": 0.9111513495445251,
      "learning_rate": 3.824798160583012e-05,
      "loss": 0.1202,
      "num_input_tokens_seen": 16373888,
      "step": 610
    },
    {
      "epoch": 9.841269841269842,
      "grad_norm": 1.031439185142517,
      "learning_rate": 3.789375741376286e-05,
      "loss": 0.1194,
      "num_input_tokens_seen": 16642320,
      "step": 620
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.9815431237220764,
      "learning_rate": 3.7535970797494136e-05,
      "loss": 0.117,
      "num_input_tokens_seen": 16910032,
      "step": 630
    },
    {
      "epoch": 10.158730158730158,
      "grad_norm": 1.4907585382461548,
      "learning_rate": 3.717472061010918e-05,
      "loss": 0.0739,
      "num_input_tokens_seen": 17178576,
      "step": 640
    },
    {
      "epoch": 10.317460317460318,
      "grad_norm": 1.1762831211090088,
      "learning_rate": 3.681010666164546e-05,
      "loss": 0.0704,
      "num_input_tokens_seen": 17448288,
      "step": 650
    },
    {
      "epoch": 10.476190476190476,
      "grad_norm": 1.2105902433395386,
      "learning_rate": 3.644222969151605e-05,
      "loss": 0.0735,
      "num_input_tokens_seen": 17716784,
      "step": 660
    },
    {
      "epoch": 10.634920634920634,
      "grad_norm": 1.1394544839859009,
      "learning_rate": 3.607119134067629e-05,
      "loss": 0.077,
      "num_input_tokens_seen": 17984944,
      "step": 670
    },
    {
      "epoch": 10.793650793650794,
      "grad_norm": 1.2243598699569702,
      "learning_rate": 3.569709412354136e-05,
      "loss": 0.0763,
      "num_input_tokens_seen": 18252080,
      "step": 680
    },
    {
      "epoch": 10.952380952380953,
      "grad_norm": 1.0364540815353394,
      "learning_rate": 3.5320041399662494e-05,
      "loss": 0.0762,
      "num_input_tokens_seen": 18520464,
      "step": 690
    },
    {
      "epoch": 11.11111111111111,
      "grad_norm": 1.0455269813537598,
      "learning_rate": 3.494013734516971e-05,
      "loss": 0.0514,
      "num_input_tokens_seen": 18786528,
      "step": 700
    },
    {
      "epoch": 11.26984126984127,
      "grad_norm": 1.2155787944793701,
      "learning_rate": 3.4557486923988924e-05,
      "loss": 0.0375,
      "num_input_tokens_seen": 19055536,
      "step": 710
    },
    {
      "epoch": 11.428571428571429,
      "grad_norm": 1.1954303979873657,
      "learning_rate": 3.4172195858841404e-05,
      "loss": 0.0389,
      "num_input_tokens_seen": 19324304,
      "step": 720
    },
    {
      "epoch": 11.587301587301587,
      "grad_norm": 1.1928291320800781,
      "learning_rate": 3.378437060203357e-05,
      "loss": 0.0374,
      "num_input_tokens_seen": 19593552,
      "step": 730
    },
    {
      "epoch": 11.746031746031747,
      "grad_norm": 1.192438006401062,
      "learning_rate": 3.3394118306045217e-05,
      "loss": 0.0426,
      "num_input_tokens_seen": 19862784,
      "step": 740
    },
    {
      "epoch": 11.904761904761905,
      "grad_norm": 1.1554771661758423,
      "learning_rate": 3.3001546793924285e-05,
      "loss": 0.0432,
      "num_input_tokens_seen": 20131584,
      "step": 750
    },
    {
      "epoch": 12.063492063492063,
      "grad_norm": 0.7850580215454102,
      "learning_rate": 3.260676452949641e-05,
      "loss": 0.0348,
      "num_input_tokens_seen": 20401120,
      "step": 760
    },
    {
      "epoch": 12.222222222222221,
      "grad_norm": 0.6133368611335754,
      "learning_rate": 3.22098805873973e-05,
      "loss": 0.0165,
      "num_input_tokens_seen": 20670080,
      "step": 770
    },
    {
      "epoch": 12.380952380952381,
      "grad_norm": 0.9954155087471008,
      "learning_rate": 3.1811004622936525e-05,
      "loss": 0.0192,
      "num_input_tokens_seen": 20938000,
      "step": 780
    },
    {
      "epoch": 12.53968253968254,
      "grad_norm": 0.9651346206665039,
      "learning_rate": 3.141024684180071e-05,
      "loss": 0.0212,
      "num_input_tokens_seen": 21206432,
      "step": 790
    },
    {
      "epoch": 12.698412698412698,
      "grad_norm": 1.0618289709091187,
      "learning_rate": 3.10077179696048e-05,
      "loss": 0.0231,
      "num_input_tokens_seen": 21476960,
      "step": 800
    }
  ],
  "logging_steps": 10,
  "max_steps": 1890,
  "num_input_tokens_seen": 21476960,
  "num_train_epochs": 30,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 9.216970364477768e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}