{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 12.698412698412698, "eval_steps": 500, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.15873015873015872, "grad_norm": 0.42645490169525146, "learning_rate": 4.999720254525684e-05, "loss": 1.3067, "num_input_tokens_seen": 269280, "step": 10 }, { "epoch": 0.31746031746031744, "grad_norm": 0.10797163844108582, "learning_rate": 4.9987533135093934e-05, "loss": 0.2064, "num_input_tokens_seen": 536656, "step": 20 }, { "epoch": 0.47619047619047616, "grad_norm": 0.10832954943180084, "learning_rate": 4.997095990396411e-05, "loss": 0.2025, "num_input_tokens_seen": 804720, "step": 30 }, { "epoch": 0.6349206349206349, "grad_norm": 0.11103782057762146, "learning_rate": 4.994748743089566e-05, "loss": 0.2011, "num_input_tokens_seen": 1073520, "step": 40 }, { "epoch": 0.7936507936507936, "grad_norm": 0.09958792477846146, "learning_rate": 4.9917122201112656e-05, "loss": 0.2028, "num_input_tokens_seen": 1341184, "step": 50 }, { "epoch": 0.9523809523809523, "grad_norm": 0.6581681370735168, "learning_rate": 4.9879872604243184e-05, "loss": 0.1993, "num_input_tokens_seen": 1609968, "step": 60 }, { "epoch": 1.1111111111111112, "grad_norm": 0.2799030542373657, "learning_rate": 4.983574893200139e-05, "loss": 0.1979, "num_input_tokens_seen": 1878240, "step": 70 }, { "epoch": 1.2698412698412698, "grad_norm": 0.17586013674736023, "learning_rate": 4.978476337534393e-05, "loss": 0.1931, "num_input_tokens_seen": 2146528, "step": 80 }, { "epoch": 1.4285714285714286, "grad_norm": 0.24899278581142426, "learning_rate": 4.972693002110176e-05, "loss": 0.1931, "num_input_tokens_seen": 2415696, "step": 90 }, { "epoch": 1.5873015873015874, "grad_norm": 0.16181747615337372, "learning_rate": 4.9662264848088034e-05, "loss": 0.192, "num_input_tokens_seen": 2683600, "step": 100 }, { "epoch": 1.746031746031746, "grad_norm": 0.18402352929115295, "learning_rate": 4.959078572268337e-05, "loss": 0.1874, "num_input_tokens_seen": 2950720, "step": 110 }, { "epoch": 1.9047619047619047, "grad_norm": 0.2943824827671051, "learning_rate": 4.951251239389948e-05, "loss": 0.1871, "num_input_tokens_seen": 3219792, "step": 120 }, { "epoch": 2.0634920634920633, "grad_norm": 0.18450967967510223, "learning_rate": 4.942746648792274e-05, "loss": 0.1887, "num_input_tokens_seen": 3488400, "step": 130 }, { "epoch": 2.2222222222222223, "grad_norm": 0.2516356408596039, "learning_rate": 4.9335671502139024e-05, "loss": 0.1876, "num_input_tokens_seen": 3757952, "step": 140 }, { "epoch": 2.380952380952381, "grad_norm": 0.23607608675956726, "learning_rate": 4.9237152798641696e-05, "loss": 0.1843, "num_input_tokens_seen": 4025536, "step": 150 }, { "epoch": 2.5396825396825395, "grad_norm": 0.1812293380498886, "learning_rate": 4.9131937597224185e-05, "loss": 0.1791, "num_input_tokens_seen": 4294240, "step": 160 }, { "epoch": 2.6984126984126986, "grad_norm": 0.1874535083770752, "learning_rate": 4.902005496785951e-05, "loss": 0.1851, "num_input_tokens_seen": 4563376, "step": 170 }, { "epoch": 2.857142857142857, "grad_norm": 0.25721630454063416, "learning_rate": 4.8901535822668446e-05, "loss": 0.1836, "num_input_tokens_seen": 4831168, "step": 180 }, { "epoch": 3.015873015873016, "grad_norm": 0.22797122597694397, "learning_rate": 4.877641290737884e-05, "loss": 0.1834, "num_input_tokens_seen": 5098496, "step": 190 }, { "epoch": 3.1746031746031744, "grad_norm": 0.16337507963180542, "learning_rate": 4.8644720792278264e-05, "loss": 0.186, "num_input_tokens_seen": 5368864, "step": 200 }, { "epoch": 3.3333333333333335, "grad_norm": 0.17769697308540344, "learning_rate": 4.850649586266255e-05, "loss": 0.1803, "num_input_tokens_seen": 5637456, "step": 210 }, { "epoch": 3.492063492063492, "grad_norm": 0.2481444925069809, "learning_rate": 4.836177630878289e-05, "loss": 0.1798, "num_input_tokens_seen": 5905104, "step": 220 }, { "epoch": 3.6507936507936507, "grad_norm": 0.22745923697948456, "learning_rate": 4.821060211529424e-05, "loss": 0.1815, "num_input_tokens_seen": 6174032, "step": 230 }, { "epoch": 3.8095238095238093, "grad_norm": 0.16727988421916962, "learning_rate": 4.8053015050207915e-05, "loss": 0.1811, "num_input_tokens_seen": 6442896, "step": 240 }, { "epoch": 3.9682539682539684, "grad_norm": 0.3471706807613373, "learning_rate": 4.7889058653351485e-05, "loss": 0.1795, "num_input_tokens_seen": 6710352, "step": 250 }, { "epoch": 4.1269841269841265, "grad_norm": 0.23989547789096832, "learning_rate": 4.771877822433911e-05, "loss": 0.1769, "num_input_tokens_seen": 6977744, "step": 260 }, { "epoch": 4.285714285714286, "grad_norm": 0.23704519867897034, "learning_rate": 4.754222081005574e-05, "loss": 0.174, "num_input_tokens_seen": 7246272, "step": 270 }, { "epoch": 4.444444444444445, "grad_norm": 0.2272966355085373, "learning_rate": 4.7359435191658425e-05, "loss": 0.1716, "num_input_tokens_seen": 7512592, "step": 280 }, { "epoch": 4.603174603174603, "grad_norm": 0.23121878504753113, "learning_rate": 4.717047187109861e-05, "loss": 0.1804, "num_input_tokens_seen": 7780144, "step": 290 }, { "epoch": 4.761904761904762, "grad_norm": 0.31674066185951233, "learning_rate": 4.697538305716885e-05, "loss": 0.1784, "num_input_tokens_seen": 8049392, "step": 300 }, { "epoch": 4.920634920634921, "grad_norm": 0.2399132400751114, "learning_rate": 4.6774222651078106e-05, "loss": 0.1796, "num_input_tokens_seen": 8316912, "step": 310 }, { "epoch": 5.079365079365079, "grad_norm": 0.2677905261516571, "learning_rate": 4.656704623155922e-05, "loss": 0.1736, "num_input_tokens_seen": 8586544, "step": 320 }, { "epoch": 5.238095238095238, "grad_norm": 0.33959662914276123, "learning_rate": 4.6353911039513145e-05, "loss": 0.1766, "num_input_tokens_seen": 8855680, "step": 330 }, { "epoch": 5.396825396825397, "grad_norm": 0.26891693472862244, "learning_rate": 4.613487596219376e-05, "loss": 0.1724, "num_input_tokens_seen": 9123808, "step": 340 }, { "epoch": 5.555555555555555, "grad_norm": 0.2796987295150757, "learning_rate": 4.591000151693789e-05, "loss": 0.1721, "num_input_tokens_seen": 9392560, "step": 350 }, { "epoch": 5.714285714285714, "grad_norm": 0.257348895072937, "learning_rate": 4.567934983444495e-05, "loss": 0.1718, "num_input_tokens_seen": 9660480, "step": 360 }, { "epoch": 5.8730158730158735, "grad_norm": 0.2910774052143097, "learning_rate": 4.544298464161079e-05, "loss": 0.1718, "num_input_tokens_seen": 9927936, "step": 370 }, { "epoch": 6.031746031746032, "grad_norm": 0.3452795445919037, "learning_rate": 4.520097124392055e-05, "loss": 0.1711, "num_input_tokens_seen": 10197520, "step": 380 }, { "epoch": 6.190476190476191, "grad_norm": 0.46368861198425293, "learning_rate": 4.49533765074054e-05, "loss": 0.1652, "num_input_tokens_seen": 10466240, "step": 390 }, { "epoch": 6.349206349206349, "grad_norm": 0.42205390334129333, "learning_rate": 4.4700268840168045e-05, "loss": 0.1677, "num_input_tokens_seen": 10734496, "step": 400 }, { "epoch": 6.507936507936508, "grad_norm": 0.25223520398139954, "learning_rate": 4.444171817348225e-05, "loss": 0.1684, "num_input_tokens_seen": 11004416, "step": 410 }, { "epoch": 6.666666666666667, "grad_norm": 0.4380488991737366, "learning_rate": 4.417779594247143e-05, "loss": 0.1655, "num_input_tokens_seen": 11272656, "step": 420 }, { "epoch": 6.825396825396825, "grad_norm": 0.2701490819454193, "learning_rate": 4.3908575066371835e-05, "loss": 0.1722, "num_input_tokens_seen": 11540112, "step": 430 }, { "epoch": 6.984126984126984, "grad_norm": 0.3422671854496002, "learning_rate": 4.363412992838566e-05, "loss": 0.1676, "num_input_tokens_seen": 11808816, "step": 440 }, { "epoch": 7.142857142857143, "grad_norm": 0.6143015623092651, "learning_rate": 4.335453635512961e-05, "loss": 0.1538, "num_input_tokens_seen": 12077648, "step": 450 }, { "epoch": 7.301587301587301, "grad_norm": 0.44244784116744995, "learning_rate": 4.306987159568479e-05, "loss": 0.1572, "num_input_tokens_seen": 12346240, "step": 460 }, { "epoch": 7.4603174603174605, "grad_norm": 0.441853404045105, "learning_rate": 4.278021430025343e-05, "loss": 0.1587, "num_input_tokens_seen": 12614864, "step": 470 }, { "epoch": 7.619047619047619, "grad_norm": 0.520702600479126, "learning_rate": 4.248564449842864e-05, "loss": 0.1616, "num_input_tokens_seen": 12883088, "step": 480 }, { "epoch": 7.777777777777778, "grad_norm": 0.473958283662796, "learning_rate": 4.2186243577082954e-05, "loss": 0.1602, "num_input_tokens_seen": 13151264, "step": 490 }, { "epoch": 7.936507936507937, "grad_norm": 0.4550235867500305, "learning_rate": 4.1882094257881885e-05, "loss": 0.1597, "num_input_tokens_seen": 13419344, "step": 500 }, { "epoch": 8.095238095238095, "grad_norm": 0.7338590025901794, "learning_rate": 4.157328057442874e-05, "loss": 0.1473, "num_input_tokens_seen": 13686752, "step": 510 }, { "epoch": 8.253968253968253, "grad_norm": 0.6510297060012817, "learning_rate": 4.1259887849046906e-05, "loss": 0.1363, "num_input_tokens_seen": 13954352, "step": 520 }, { "epoch": 8.412698412698413, "grad_norm": 0.767859160900116, "learning_rate": 4.0942002669206085e-05, "loss": 0.1408, "num_input_tokens_seen": 14222352, "step": 530 }, { "epoch": 8.571428571428571, "grad_norm": 0.7285030484199524, "learning_rate": 4.0619712863599e-05, "loss": 0.1422, "num_input_tokens_seen": 14491920, "step": 540 }, { "epoch": 8.73015873015873, "grad_norm": 0.6987579464912415, "learning_rate": 4.029310747787516e-05, "loss": 0.1483, "num_input_tokens_seen": 14760400, "step": 550 }, { "epoch": 8.88888888888889, "grad_norm": 0.7618018984794617, "learning_rate": 3.996227675003834e-05, "loss": 0.1437, "num_input_tokens_seen": 15029280, "step": 560 }, { "epoch": 9.047619047619047, "grad_norm": 0.7082319855690002, "learning_rate": 3.962731208551474e-05, "loss": 0.1386, "num_input_tokens_seen": 15298416, "step": 570 }, { "epoch": 9.206349206349206, "grad_norm": 0.9523563385009766, "learning_rate": 3.928830603189844e-05, "loss": 0.1034, "num_input_tokens_seen": 15567104, "step": 580 }, { "epoch": 9.365079365079366, "grad_norm": 1.1607928276062012, "learning_rate": 3.894535225338143e-05, "loss": 0.1073, "num_input_tokens_seen": 15835952, "step": 590 }, { "epoch": 9.523809523809524, "grad_norm": 1.0483174324035645, "learning_rate": 3.859854550487506e-05, "loss": 0.1124, "num_input_tokens_seen": 16103648, "step": 600 }, { "epoch": 9.682539682539682, "grad_norm": 0.9111513495445251, "learning_rate": 3.824798160583012e-05, "loss": 0.1202, "num_input_tokens_seen": 16373888, "step": 610 }, { "epoch": 9.841269841269842, "grad_norm": 1.031439185142517, "learning_rate": 3.789375741376286e-05, "loss": 0.1194, "num_input_tokens_seen": 16642320, "step": 620 }, { "epoch": 10.0, "grad_norm": 0.9815431237220764, "learning_rate": 3.7535970797494136e-05, "loss": 0.117, "num_input_tokens_seen": 16910032, "step": 630 }, { "epoch": 10.158730158730158, "grad_norm": 1.4907585382461548, "learning_rate": 3.717472061010918e-05, "loss": 0.0739, "num_input_tokens_seen": 17178576, "step": 640 }, { "epoch": 10.317460317460318, "grad_norm": 1.1762831211090088, "learning_rate": 3.681010666164546e-05, "loss": 0.0704, "num_input_tokens_seen": 17448288, "step": 650 }, { "epoch": 10.476190476190476, "grad_norm": 1.2105902433395386, "learning_rate": 3.644222969151605e-05, "loss": 0.0735, "num_input_tokens_seen": 17716784, "step": 660 }, { "epoch": 10.634920634920634, "grad_norm": 1.1394544839859009, "learning_rate": 3.607119134067629e-05, "loss": 0.077, "num_input_tokens_seen": 17984944, "step": 670 }, { "epoch": 10.793650793650794, "grad_norm": 1.2243598699569702, "learning_rate": 3.569709412354136e-05, "loss": 0.0763, "num_input_tokens_seen": 18252080, "step": 680 }, { "epoch": 10.952380952380953, "grad_norm": 1.0364540815353394, "learning_rate": 3.5320041399662494e-05, "loss": 0.0762, "num_input_tokens_seen": 18520464, "step": 690 }, { "epoch": 11.11111111111111, "grad_norm": 1.0455269813537598, "learning_rate": 3.494013734516971e-05, "loss": 0.0514, "num_input_tokens_seen": 18786528, "step": 700 }, { "epoch": 11.26984126984127, "grad_norm": 1.2155787944793701, "learning_rate": 3.4557486923988924e-05, "loss": 0.0375, "num_input_tokens_seen": 19055536, "step": 710 }, { "epoch": 11.428571428571429, "grad_norm": 1.1954303979873657, "learning_rate": 3.4172195858841404e-05, "loss": 0.0389, "num_input_tokens_seen": 19324304, "step": 720 }, { "epoch": 11.587301587301587, "grad_norm": 1.1928291320800781, "learning_rate": 3.378437060203357e-05, "loss": 0.0374, "num_input_tokens_seen": 19593552, "step": 730 }, { "epoch": 11.746031746031747, "grad_norm": 1.192438006401062, "learning_rate": 3.3394118306045217e-05, "loss": 0.0426, "num_input_tokens_seen": 19862784, "step": 740 }, { "epoch": 11.904761904761905, "grad_norm": 1.1554771661758423, "learning_rate": 3.3001546793924285e-05, "loss": 0.0432, "num_input_tokens_seen": 20131584, "step": 750 }, { "epoch": 12.063492063492063, "grad_norm": 0.7850580215454102, "learning_rate": 3.260676452949641e-05, "loss": 0.0348, "num_input_tokens_seen": 20401120, "step": 760 }, { "epoch": 12.222222222222221, "grad_norm": 0.6133368611335754, "learning_rate": 3.22098805873973e-05, "loss": 0.0165, "num_input_tokens_seen": 20670080, "step": 770 }, { "epoch": 12.380952380952381, "grad_norm": 0.9954155087471008, "learning_rate": 3.1811004622936525e-05, "loss": 0.0192, "num_input_tokens_seen": 20938000, "step": 780 }, { "epoch": 12.53968253968254, "grad_norm": 0.9651346206665039, "learning_rate": 3.141024684180071e-05, "loss": 0.0212, "num_input_tokens_seen": 21206432, "step": 790 }, { "epoch": 12.698412698412698, "grad_norm": 1.0618289709091187, "learning_rate": 3.10077179696048e-05, "loss": 0.0231, "num_input_tokens_seen": 21476960, "step": 800 } ], "logging_steps": 10, "max_steps": 1890, "num_input_tokens_seen": 21476960, "num_train_epochs": 30, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.216970364477768e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }