{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.870544090056285, "eval_steps": 500, "global_step": 165, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0300187617260788, "grad_norm": 19.618404854139, "learning_rate": 1e-05, "loss": 0.6154, "mean_token_accuracy": 0.8398024253547192, "step": 1 }, { "epoch": 0.0600375234521576, "grad_norm": 21.798338409796745, "learning_rate": 2e-05, "loss": 0.6416, "mean_token_accuracy": 0.8340235594660044, "step": 2 }, { "epoch": 0.0900562851782364, "grad_norm": 13.833687232901854, "learning_rate": 3e-05, "loss": 0.5896, "mean_token_accuracy": 0.8433804120868444, "step": 3 }, { "epoch": 0.1200750469043152, "grad_norm": 5.505910810820941, "learning_rate": 4e-05, "loss": 0.5319, "mean_token_accuracy": 0.8556831870228052, "step": 4 }, { "epoch": 0.150093808630394, "grad_norm": 5.278702397056334, "learning_rate": 5e-05, "loss": 0.4437, "mean_token_accuracy": 0.8703144080936909, "step": 5 }, { "epoch": 0.1801125703564728, "grad_norm": 2.1162182135646033, "learning_rate": 4.9995181012051625e-05, "loss": 0.4193, "mean_token_accuracy": 0.878010880202055, "step": 6 }, { "epoch": 0.2101313320825516, "grad_norm": 14.208326816182495, "learning_rate": 4.9980725906018074e-05, "loss": 0.4096, "mean_token_accuracy": 0.8749048858880997, "step": 7 }, { "epoch": 0.2401500938086304, "grad_norm": 2.162621083920564, "learning_rate": 4.9956640254617906e-05, "loss": 0.3978, "mean_token_accuracy": 0.8762698639184237, "step": 8 }, { "epoch": 0.2701688555347092, "grad_norm": 2.034935016310286, "learning_rate": 4.99229333433282e-05, "loss": 0.374, "mean_token_accuracy": 0.8837966062128544, "step": 9 }, { "epoch": 0.300187617260788, "grad_norm": 1.2368690799465214, "learning_rate": 4.987961816680492e-05, "loss": 0.3545, "mean_token_accuracy": 0.8879855256527662, "step": 10 }, { "epoch": 0.3302063789868668, "grad_norm": 0.8933826446995154, "learning_rate": 4.982671142387316e-05, "loss": 0.3527, "mean_token_accuracy": 0.8875276073813438, "step": 11 }, { "epoch": 0.3602251407129456, "grad_norm": 0.8267017608965835, "learning_rate": 4.976423351108943e-05, "loss": 0.3186, "mean_token_accuracy": 0.8965118452906609, "step": 12 }, { "epoch": 0.3902439024390244, "grad_norm": 0.6967492468619846, "learning_rate": 4.9692208514878444e-05, "loss": 0.3016, "mean_token_accuracy": 0.9023277424275875, "step": 13 }, { "epoch": 0.4202626641651032, "grad_norm": 0.626201960051008, "learning_rate": 4.9610664202247294e-05, "loss": 0.3189, "mean_token_accuracy": 0.8961522448807955, "step": 14 }, { "epoch": 0.450281425891182, "grad_norm": 0.519219193366074, "learning_rate": 4.951963201008076e-05, "loss": 0.3031, "mean_token_accuracy": 0.9007221981883049, "step": 15 }, { "epoch": 0.4803001876172608, "grad_norm": 0.5430064582418314, "learning_rate": 4.9419147033021814e-05, "loss": 0.2963, "mean_token_accuracy": 0.9018692336976528, "step": 16 }, { "epoch": 0.5103189493433395, "grad_norm": 0.45295446404829903, "learning_rate": 4.9309248009941914e-05, "loss": 0.2945, "mean_token_accuracy": 0.9022990744560957, "step": 17 }, { "epoch": 0.5403377110694184, "grad_norm": 0.39810793732783883, "learning_rate": 4.9189977309006495e-05, "loss": 0.2867, "mean_token_accuracy": 0.9044581968337297, "step": 18 }, { "epoch": 0.5703564727954972, "grad_norm": 0.29279224179883323, "learning_rate": 4.906138091134118e-05, "loss": 0.2817, "mean_token_accuracy": 0.9055654220283031, "step": 19 }, { "epoch": 0.600375234521576, "grad_norm": 0.26896824294508065, "learning_rate": 4.892350839330522e-05, "loss": 0.2921, "mean_token_accuracy": 0.9025575239211321, "step": 20 }, { "epoch": 0.6303939962476548, "grad_norm": 0.2400534443678901, "learning_rate": 4.877641290737884e-05, "loss": 0.2831, "mean_token_accuracy": 0.9051281735301018, "step": 21 }, { "epoch": 0.6604127579737336, "grad_norm": 0.2249165746946966, "learning_rate": 4.862015116167196e-05, "loss": 0.2698, "mean_token_accuracy": 0.9093952961266041, "step": 22 }, { "epoch": 0.6904315196998124, "grad_norm": 0.264247809537063, "learning_rate": 4.8454783398062106e-05, "loss": 0.2686, "mean_token_accuracy": 0.9091940615326166, "step": 23 }, { "epoch": 0.7204502814258912, "grad_norm": 0.20930943221019285, "learning_rate": 4.828037336897009e-05, "loss": 0.2687, "mean_token_accuracy": 0.9089630376547575, "step": 24 }, { "epoch": 0.7504690431519699, "grad_norm": 0.23889395502942187, "learning_rate": 4.8096988312782174e-05, "loss": 0.2871, "mean_token_accuracy": 0.9030982349067926, "step": 25 }, { "epoch": 0.7804878048780488, "grad_norm": 0.21055564681809716, "learning_rate": 4.7904698927928406e-05, "loss": 0.272, "mean_token_accuracy": 0.9078760109841824, "step": 26 }, { "epoch": 0.8105065666041276, "grad_norm": 0.21681199372541698, "learning_rate": 4.7703579345627035e-05, "loss": 0.2619, "mean_token_accuracy": 0.9109147116541862, "step": 27 }, { "epoch": 0.8405253283302064, "grad_norm": 0.21247193653216784, "learning_rate": 4.749370710130554e-05, "loss": 0.2721, "mean_token_accuracy": 0.9074795469641685, "step": 28 }, { "epoch": 0.8705440900562852, "grad_norm": 0.20525916333687041, "learning_rate": 4.72751631047092e-05, "loss": 0.2539, "mean_token_accuracy": 0.9133741557598114, "step": 29 }, { "epoch": 0.900562851782364, "grad_norm": 0.21529654405923737, "learning_rate": 4.7048031608708876e-05, "loss": 0.2603, "mean_token_accuracy": 0.9109627865254879, "step": 30 }, { "epoch": 0.9305816135084428, "grad_norm": 0.20791794762620378, "learning_rate": 4.681240017681993e-05, "loss": 0.2593, "mean_token_accuracy": 0.9111653957515955, "step": 31 }, { "epoch": 0.9606003752345216, "grad_norm": 0.20774824517485244, "learning_rate": 4.65683596494448e-05, "loss": 0.2719, "mean_token_accuracy": 0.9068219736218452, "step": 32 }, { "epoch": 0.9906191369606003, "grad_norm": 0.28582938868285823, "learning_rate": 4.6316004108852305e-05, "loss": 0.2645, "mean_token_accuracy": 0.9088481441140175, "step": 33 }, { "epoch": 1.0, "grad_norm": 0.28582938868285823, "learning_rate": 4.6055430842907167e-05, "loss": 0.2564, "mean_token_accuracy": 0.9133941173553467, "step": 34 }, { "epoch": 1.0300187617260788, "grad_norm": 0.3589081697155284, "learning_rate": 4.5786740307563636e-05, "loss": 0.2082, "mean_token_accuracy": 0.9285639356821775, "step": 35 }, { "epoch": 1.0600375234521575, "grad_norm": 0.19449792688035672, "learning_rate": 4.551003608813784e-05, "loss": 0.2047, "mean_token_accuracy": 0.9296260979026556, "step": 36 }, { "epoch": 1.0900562851782365, "grad_norm": 0.23233991689426617, "learning_rate": 4.522542485937369e-05, "loss": 0.1979, "mean_token_accuracy": 0.9314604848623276, "step": 37 }, { "epoch": 1.1200750469043153, "grad_norm": 0.21035371628271216, "learning_rate": 4.493301634431768e-05, "loss": 0.2014, "mean_token_accuracy": 0.9298410974442959, "step": 38 }, { "epoch": 1.150093808630394, "grad_norm": 0.20990624713625997, "learning_rate": 4.463292327201862e-05, "loss": 0.1913, "mean_token_accuracy": 0.933486595749855, "step": 39 }, { "epoch": 1.1801125703564728, "grad_norm": 0.2156272816847033, "learning_rate": 4.4325261334068426e-05, "loss": 0.2031, "mean_token_accuracy": 0.9307098593562841, "step": 40 }, { "epoch": 1.2101313320825515, "grad_norm": 0.21696878272059866, "learning_rate": 4.401014914000078e-05, "loss": 0.1915, "mean_token_accuracy": 0.9335418920964003, "step": 41 }, { "epoch": 1.2401500938086305, "grad_norm": 0.1818612765558643, "learning_rate": 4.3687708171564925e-05, "loss": 0.1791, "mean_token_accuracy": 0.9380327388644218, "step": 42 }, { "epoch": 1.2701688555347093, "grad_norm": 0.18129814277988898, "learning_rate": 4.335806273589214e-05, "loss": 0.1931, "mean_token_accuracy": 0.9324233587831259, "step": 43 }, { "epoch": 1.300187617260788, "grad_norm": 0.18921071728690822, "learning_rate": 4.302133991757297e-05, "loss": 0.1861, "mean_token_accuracy": 0.9347784202545881, "step": 44 }, { "epoch": 1.3302063789868668, "grad_norm": 0.1846346124739407, "learning_rate": 4.267766952966369e-05, "loss": 0.1978, "mean_token_accuracy": 0.9310048930346966, "step": 45 }, { "epoch": 1.3602251407129455, "grad_norm": 0.18689120002736795, "learning_rate": 4.23271840636409e-05, "loss": 0.1931, "mean_token_accuracy": 0.9321947041898966, "step": 46 }, { "epoch": 1.3902439024390243, "grad_norm": 0.18301258133692994, "learning_rate": 4.197001863832355e-05, "loss": 0.1991, "mean_token_accuracy": 0.9307528082281351, "step": 47 }, { "epoch": 1.4202626641651033, "grad_norm": 0.20071944245709974, "learning_rate": 4.1606310947782044e-05, "loss": 0.1883, "mean_token_accuracy": 0.9341552760452032, "step": 48 }, { "epoch": 1.450281425891182, "grad_norm": 0.21531485697866234, "learning_rate": 4.123620120825459e-05, "loss": 0.1793, "mean_token_accuracy": 0.9380034245550632, "step": 49 }, { "epoch": 1.4803001876172608, "grad_norm": 0.17040701196766744, "learning_rate": 4.085983210409114e-05, "loss": 0.17, "mean_token_accuracy": 0.9408059008419514, "step": 50 }, { "epoch": 1.5103189493433395, "grad_norm": 0.17082023776864208, "learning_rate": 4.047734873274586e-05, "loss": 0.1777, "mean_token_accuracy": 0.9373182617127895, "step": 51 }, { "epoch": 1.5403377110694185, "grad_norm": 0.18880547525592725, "learning_rate": 4.008889854883929e-05, "loss": 0.1905, "mean_token_accuracy": 0.9339997190982103, "step": 52 }, { "epoch": 1.5703564727954973, "grad_norm": 0.2003270144688197, "learning_rate": 3.969463130731183e-05, "loss": 0.1829, "mean_token_accuracy": 0.9364625960588455, "step": 53 }, { "epoch": 1.600375234521576, "grad_norm": 0.16248574881358357, "learning_rate": 3.9294699005690305e-05, "loss": 0.187, "mean_token_accuracy": 0.9349782522767782, "step": 54 }, { "epoch": 1.6303939962476548, "grad_norm": 0.16268952077579069, "learning_rate": 3.888925582549006e-05, "loss": 0.1806, "mean_token_accuracy": 0.9380554854869843, "step": 55 }, { "epoch": 1.6604127579737336, "grad_norm": 0.16260973286493194, "learning_rate": 3.847845807277502e-05, "loss": 0.1756, "mean_token_accuracy": 0.9381309170275927, "step": 56 }, { "epoch": 1.6904315196998123, "grad_norm": 0.18849387268876527, "learning_rate": 3.8062464117898724e-05, "loss": 0.1905, "mean_token_accuracy": 0.933776805177331, "step": 57 }, { "epoch": 1.720450281425891, "grad_norm": 0.1812480467627804, "learning_rate": 3.764143433444962e-05, "loss": 0.1845, "mean_token_accuracy": 0.9354843944311142, "step": 58 }, { "epoch": 1.7504690431519698, "grad_norm": 0.19727408903046884, "learning_rate": 3.721553103742388e-05, "loss": 0.1839, "mean_token_accuracy": 0.9353628680109978, "step": 59 }, { "epoch": 1.7804878048780488, "grad_norm": 0.16881751417638702, "learning_rate": 3.678491842064995e-05, "loss": 0.1847, "mean_token_accuracy": 0.9353015590459108, "step": 60 }, { "epoch": 1.8105065666041276, "grad_norm": 0.1805153593928837, "learning_rate": 3.634976249348867e-05, "loss": 0.189, "mean_token_accuracy": 0.9340192507952452, "step": 61 }, { "epoch": 1.8405253283302065, "grad_norm": 0.16744864978079732, "learning_rate": 3.591023101683355e-05, "loss": 0.1873, "mean_token_accuracy": 0.9332233294844627, "step": 62 }, { "epoch": 1.8705440900562853, "grad_norm": 0.20943512548005347, "learning_rate": 3.54664934384357e-05, "loss": 0.1833, "mean_token_accuracy": 0.9361728671938181, "step": 63 }, { "epoch": 1.900562851782364, "grad_norm": 0.15100109107147408, "learning_rate": 3.5018720827578524e-05, "loss": 0.177, "mean_token_accuracy": 0.9376390129327774, "step": 64 }, { "epoch": 1.9305816135084428, "grad_norm": 0.20136076678950812, "learning_rate": 3.456708580912725e-05, "loss": 0.1847, "mean_token_accuracy": 0.9356410764157772, "step": 65 }, { "epoch": 1.9606003752345216, "grad_norm": 0.16935110772638642, "learning_rate": 3.411176249697875e-05, "loss": 0.1882, "mean_token_accuracy": 0.9341955110430717, "step": 66 }, { "epoch": 1.9906191369606003, "grad_norm": 0.17801077092117232, "learning_rate": 3.365292642693732e-05, "loss": 0.1791, "mean_token_accuracy": 0.9368807151913643, "step": 67 }, { "epoch": 2.0, "grad_norm": 0.17801077092117232, "learning_rate": 3.319075448904234e-05, "loss": 0.1817, "mean_token_accuracy": 0.9353618502616883, "step": 68 }, { "epoch": 2.0300187617260788, "grad_norm": 0.34283977157187906, "learning_rate": 3.272542485937369e-05, "loss": 0.1162, "mean_token_accuracy": 0.9604951441287994, "step": 69 }, { "epoch": 2.0600375234521575, "grad_norm": 0.25353133352641416, "learning_rate": 3.225711693136156e-05, "loss": 0.1155, "mean_token_accuracy": 0.9606517199426889, "step": 70 }, { "epoch": 2.0900562851782363, "grad_norm": 0.36813345733413727, "learning_rate": 3.178601124662686e-05, "loss": 0.1092, "mean_token_accuracy": 0.9620461780577898, "step": 71 }, { "epoch": 2.120075046904315, "grad_norm": 0.20837522140479256, "learning_rate": 3.131228942537895e-05, "loss": 0.1064, "mean_token_accuracy": 0.9636496491730213, "step": 72 }, { "epoch": 2.150093808630394, "grad_norm": 0.2546796945935164, "learning_rate": 3.083613409639764e-05, "loss": 0.1082, "mean_token_accuracy": 0.9626397844403982, "step": 73 }, { "epoch": 2.180112570356473, "grad_norm": 0.2517042958600063, "learning_rate": 3.035772882662627e-05, "loss": 0.1024, "mean_token_accuracy": 0.9642387926578522, "step": 74 }, { "epoch": 2.2101313320825517, "grad_norm": 0.16863389096389939, "learning_rate": 2.9877258050403212e-05, "loss": 0.1011, "mean_token_accuracy": 0.964973971247673, "step": 75 }, { "epoch": 2.2401500938086305, "grad_norm": 0.2256068322542817, "learning_rate": 2.9394906998358868e-05, "loss": 0.0979, "mean_token_accuracy": 0.9662024211138487, "step": 76 }, { "epoch": 2.2701688555347093, "grad_norm": 0.19130902536055486, "learning_rate": 2.8910861626005776e-05, "loss": 0.101, "mean_token_accuracy": 0.9646210763603449, "step": 77 }, { "epoch": 2.300187617260788, "grad_norm": 0.18029622833908, "learning_rate": 2.8425308542049206e-05, "loss": 0.0943, "mean_token_accuracy": 0.9668951816856861, "step": 78 }, { "epoch": 2.3302063789868668, "grad_norm": 0.1715983987427455, "learning_rate": 2.7938434936445945e-05, "loss": 0.1025, "mean_token_accuracy": 0.9641035441309214, "step": 79 }, { "epoch": 2.3602251407129455, "grad_norm": 0.17151947074238844, "learning_rate": 2.7450428508239024e-05, "loss": 0.0993, "mean_token_accuracy": 0.9651761185377836, "step": 80 }, { "epoch": 2.3902439024390243, "grad_norm": 0.17762362563985393, "learning_rate": 2.6961477393196126e-05, "loss": 0.1016, "mean_token_accuracy": 0.9645342864096165, "step": 81 }, { "epoch": 2.420262664165103, "grad_norm": 0.17493795219201744, "learning_rate": 2.6471770091279724e-05, "loss": 0.1032, "mean_token_accuracy": 0.965608624741435, "step": 82 }, { "epoch": 2.450281425891182, "grad_norm": 0.21622340080905333, "learning_rate": 2.598149539397672e-05, "loss": 0.1056, "mean_token_accuracy": 0.9633868020027876, "step": 83 }, { "epoch": 2.480300187617261, "grad_norm": 0.18325655719580544, "learning_rate": 2.5490842311515707e-05, "loss": 0.1003, "mean_token_accuracy": 0.9652356337755919, "step": 84 }, { "epoch": 2.5103189493433398, "grad_norm": 0.16079654454953773, "learning_rate": 2.5e-05, "loss": 0.0951, "mean_token_accuracy": 0.9671048391610384, "step": 85 }, { "epoch": 2.5403377110694185, "grad_norm": 0.177885663467419, "learning_rate": 2.4509157688484295e-05, "loss": 0.1019, "mean_token_accuracy": 0.9652324616909027, "step": 86 }, { "epoch": 2.5703564727954973, "grad_norm": 0.16463009515777124, "learning_rate": 2.4018504606023293e-05, "loss": 0.0983, "mean_token_accuracy": 0.9660285171121359, "step": 87 }, { "epoch": 2.600375234521576, "grad_norm": 0.14988401935266468, "learning_rate": 2.3528229908720272e-05, "loss": 0.0973, "mean_token_accuracy": 0.9662998840212822, "step": 88 }, { "epoch": 2.630393996247655, "grad_norm": 0.1714584031856408, "learning_rate": 2.303852260680388e-05, "loss": 0.0993, "mean_token_accuracy": 0.9654844384640455, "step": 89 }, { "epoch": 2.6604127579737336, "grad_norm": 0.15655630724758532, "learning_rate": 2.2549571491760986e-05, "loss": 0.1044, "mean_token_accuracy": 0.9633280653506517, "step": 90 }, { "epoch": 2.6904315196998123, "grad_norm": 0.15989111678931958, "learning_rate": 2.2061565063554064e-05, "loss": 0.0962, "mean_token_accuracy": 0.9662177134305239, "step": 91 }, { "epoch": 2.720450281425891, "grad_norm": 0.1612719262065956, "learning_rate": 2.1574691457950803e-05, "loss": 0.1, "mean_token_accuracy": 0.9648805633187294, "step": 92 }, { "epoch": 2.75046904315197, "grad_norm": 0.1447218929697437, "learning_rate": 2.1089138373994223e-05, "loss": 0.097, "mean_token_accuracy": 0.9660444520413876, "step": 93 }, { "epoch": 2.7804878048780486, "grad_norm": 0.15448044912912087, "learning_rate": 2.0605093001641138e-05, "loss": 0.1037, "mean_token_accuracy": 0.9642052594572306, "step": 94 }, { "epoch": 2.8105065666041273, "grad_norm": 0.14976483567215834, "learning_rate": 2.0122741949596797e-05, "loss": 0.103, "mean_token_accuracy": 0.9642070364207029, "step": 95 }, { "epoch": 2.8405253283302065, "grad_norm": 0.15397846138230065, "learning_rate": 1.9642271173373737e-05, "loss": 0.1024, "mean_token_accuracy": 0.9642751514911652, "step": 96 }, { "epoch": 2.8705440900562853, "grad_norm": 0.16533125622570222, "learning_rate": 1.9163865903602374e-05, "loss": 0.0983, "mean_token_accuracy": 0.9661570060998201, "step": 97 }, { "epoch": 2.900562851782364, "grad_norm": 0.14567827324511498, "learning_rate": 1.868771057462105e-05, "loss": 0.0895, "mean_token_accuracy": 0.9689803905785084, "step": 98 }, { "epoch": 2.930581613508443, "grad_norm": 0.13721507889257023, "learning_rate": 1.8213988753373146e-05, "loss": 0.1018, "mean_token_accuracy": 0.9658490009605885, "step": 99 }, { "epoch": 2.9606003752345216, "grad_norm": 0.18558487132226667, "learning_rate": 1.7742883068638447e-05, "loss": 0.0975, "mean_token_accuracy": 0.9673260115087032, "step": 100 }, { "epoch": 2.9906191369606003, "grad_norm": 0.14278892649537844, "learning_rate": 1.7274575140626318e-05, "loss": 0.0945, "mean_token_accuracy": 0.9672219399362803, "step": 101 }, { "epoch": 3.0, "grad_norm": 0.14278892649537844, "learning_rate": 1.6809245510957665e-05, "loss": 0.104, "mean_token_accuracy": 0.9641202390193939, "step": 102 }, { "epoch": 3.0300187617260788, "grad_norm": 0.29206855231690615, "learning_rate": 1.6347073573062672e-05, "loss": 0.052, "mean_token_accuracy": 0.9840696156024933, "step": 103 }, { "epoch": 3.0600375234521575, "grad_norm": 0.21146610857781498, "learning_rate": 1.588823750302126e-05, "loss": 0.0506, "mean_token_accuracy": 0.9837026111781597, "step": 104 }, { "epoch": 3.0900562851782363, "grad_norm": 0.1728680637000517, "learning_rate": 1.5432914190872757e-05, "loss": 0.0492, "mean_token_accuracy": 0.9842210356146097, "step": 105 }, { "epoch": 3.120075046904315, "grad_norm": 0.137716977630954, "learning_rate": 1.498127917242148e-05, "loss": 0.0493, "mean_token_accuracy": 0.9839507173746824, "step": 106 }, { "epoch": 3.150093808630394, "grad_norm": 0.14551903804275892, "learning_rate": 1.4533506561564306e-05, "loss": 0.0544, "mean_token_accuracy": 0.9822139292955399, "step": 107 }, { "epoch": 3.180112570356473, "grad_norm": 0.16669835535632535, "learning_rate": 1.4089768983166444e-05, "loss": 0.0489, "mean_token_accuracy": 0.9840298742055893, "step": 108 }, { "epoch": 3.2101313320825517, "grad_norm": 0.18300271784408872, "learning_rate": 1.3650237506511331e-05, "loss": 0.0497, "mean_token_accuracy": 0.983882175758481, "step": 109 }, { "epoch": 3.2401500938086305, "grad_norm": 0.1843234481043501, "learning_rate": 1.3215081579350058e-05, "loss": 0.0485, "mean_token_accuracy": 0.9843094442039728, "step": 110 }, { "epoch": 3.2701688555347093, "grad_norm": 0.3461827490875774, "learning_rate": 1.2784468962576136e-05, "loss": 0.047, "mean_token_accuracy": 0.9847969133406878, "step": 111 }, { "epoch": 3.300187617260788, "grad_norm": 0.15632977455270483, "learning_rate": 1.235856566555039e-05, "loss": 0.049, "mean_token_accuracy": 0.9837981257587671, "step": 112 }, { "epoch": 3.3302063789868668, "grad_norm": 0.14640471914964392, "learning_rate": 1.1937535882101281e-05, "loss": 0.0458, "mean_token_accuracy": 0.9851204100996256, "step": 113 }, { "epoch": 3.3602251407129455, "grad_norm": 0.13729939899053178, "learning_rate": 1.1521541927224994e-05, "loss": 0.0456, "mean_token_accuracy": 0.9848766028881073, "step": 114 }, { "epoch": 3.3902439024390243, "grad_norm": 0.13806503349144675, "learning_rate": 1.1110744174509952e-05, "loss": 0.049, "mean_token_accuracy": 0.9844018053263426, "step": 115 }, { "epoch": 3.420262664165103, "grad_norm": 0.1677329902297057, "learning_rate": 1.0705300994309697e-05, "loss": 0.0509, "mean_token_accuracy": 0.9836404304951429, "step": 116 }, { "epoch": 3.450281425891182, "grad_norm": 0.1363456396457925, "learning_rate": 1.0305368692688174e-05, "loss": 0.0489, "mean_token_accuracy": 0.9842113871127367, "step": 117 }, { "epoch": 3.480300187617261, "grad_norm": 0.14670430283357652, "learning_rate": 9.911101451160715e-06, "loss": 0.0476, "mean_token_accuracy": 0.9845409169793129, "step": 118 }, { "epoch": 3.5103189493433398, "grad_norm": 0.13593290922974113, "learning_rate": 9.522651267254149e-06, "loss": 0.0498, "mean_token_accuracy": 0.9841745216399431, "step": 119 }, { "epoch": 3.5403377110694185, "grad_norm": 0.1405157943110022, "learning_rate": 9.140167895908867e-06, "loss": 0.0515, "mean_token_accuracy": 0.9838052876293659, "step": 120 }, { "epoch": 3.5703564727954973, "grad_norm": 0.13398507397046694, "learning_rate": 8.763798791745411e-06, "loss": 0.044, "mean_token_accuracy": 0.985531248152256, "step": 121 }, { "epoch": 3.600375234521576, "grad_norm": 0.12595342205919996, "learning_rate": 8.393689052217966e-06, "loss": 0.0443, "mean_token_accuracy": 0.9851887430995703, "step": 122 }, { "epoch": 3.630393996247655, "grad_norm": 0.12802288754577185, "learning_rate": 8.029981361676456e-06, "loss": 0.0477, "mean_token_accuracy": 0.9847091306000948, "step": 123 }, { "epoch": 3.6604127579737336, "grad_norm": 0.13540249038634009, "learning_rate": 7.672815936359107e-06, "loss": 0.0437, "mean_token_accuracy": 0.9858846813440323, "step": 124 }, { "epoch": 3.6904315196998123, "grad_norm": 0.1272358814553976, "learning_rate": 7.3223304703363135e-06, "loss": 0.0472, "mean_token_accuracy": 0.9844079315662384, "step": 125 }, { "epoch": 3.720450281425891, "grad_norm": 0.13634273240990136, "learning_rate": 6.9786600824270296e-06, "loss": 0.0427, "mean_token_accuracy": 0.9858784638345242, "step": 126 }, { "epoch": 3.75046904315197, "grad_norm": 0.15771802380242175, "learning_rate": 6.641937264107867e-06, "loss": 0.0469, "mean_token_accuracy": 0.9847830552607775, "step": 127 }, { "epoch": 3.7804878048780486, "grad_norm": 0.13754448160952976, "learning_rate": 6.312291828435077e-06, "loss": 0.0462, "mean_token_accuracy": 0.9851614981889725, "step": 128 }, { "epoch": 3.8105065666041273, "grad_norm": 0.14734425443158122, "learning_rate": 5.989850859999227e-06, "loss": 0.0422, "mean_token_accuracy": 0.9861964080482721, "step": 129 }, { "epoch": 3.8405253283302065, "grad_norm": 0.12037178039604896, "learning_rate": 5.674738665931575e-06, "loss": 0.0445, "mean_token_accuracy": 0.9854839760810137, "step": 130 }, { "epoch": 3.8705440900562853, "grad_norm": 0.13270114277633968, "learning_rate": 5.367076727981382e-06, "loss": 0.046, "mean_token_accuracy": 0.98503128439188, "step": 131 }, { "epoch": 3.900562851782364, "grad_norm": 0.12186409904443084, "learning_rate": 5.066983655682325e-06, "loss": 0.0413, "mean_token_accuracy": 0.9866250548511744, "step": 132 }, { "epoch": 3.930581613508443, "grad_norm": 0.11572178677884377, "learning_rate": 4.7745751406263165e-06, "loss": 0.0455, "mean_token_accuracy": 0.9853598214685917, "step": 133 }, { "epoch": 3.9606003752345216, "grad_norm": 0.1285035698798016, "learning_rate": 4.48996391186216e-06, "loss": 0.0446, "mean_token_accuracy": 0.9853111784905195, "step": 134 }, { "epoch": 3.9906191369606003, "grad_norm": 0.12237492704812947, "learning_rate": 4.213259692436367e-06, "loss": 0.0472, "mean_token_accuracy": 0.9846988655626774, "step": 135 }, { "epoch": 4.0, "grad_norm": 0.17459270329168236, "learning_rate": 3.944569157092839e-06, "loss": 0.0397, "mean_token_accuracy": 0.9869139909744262, "step": 136 }, { "epoch": 4.030018761726079, "grad_norm": 0.21688322258057258, "learning_rate": 3.6839958911476957e-06, "loss": 0.0316, "mean_token_accuracy": 0.9908953290432692, "step": 137 }, { "epoch": 4.0600375234521575, "grad_norm": 0.12510412863095888, "learning_rate": 3.431640350555204e-06, "loss": 0.0298, "mean_token_accuracy": 0.9912976007908583, "step": 138 }, { "epoch": 4.090056285178236, "grad_norm": 0.11532401288217803, "learning_rate": 3.187599823180071e-06, "loss": 0.0291, "mean_token_accuracy": 0.9916361309587955, "step": 139 }, { "epoch": 4.120075046904315, "grad_norm": 0.11662866353878451, "learning_rate": 2.9519683912911266e-06, "loss": 0.0316, "mean_token_accuracy": 0.9906173534691334, "step": 140 }, { "epoch": 4.150093808630394, "grad_norm": 0.10785269578007366, "learning_rate": 2.7248368952908053e-06, "loss": 0.0278, "mean_token_accuracy": 0.9918341338634491, "step": 141 }, { "epoch": 4.1801125703564725, "grad_norm": 0.10540687517578978, "learning_rate": 2.506292898694468e-06, "loss": 0.0304, "mean_token_accuracy": 0.9909927677363157, "step": 142 }, { "epoch": 4.210131332082551, "grad_norm": 0.10795341728368958, "learning_rate": 2.296420654372966e-06, "loss": 0.0292, "mean_token_accuracy": 0.9913486260920763, "step": 143 }, { "epoch": 4.24015009380863, "grad_norm": 0.09918400957702202, "learning_rate": 2.0953010720716037e-06, "loss": 0.0285, "mean_token_accuracy": 0.991315545514226, "step": 144 }, { "epoch": 4.270168855534709, "grad_norm": 0.1028689650543891, "learning_rate": 1.9030116872178316e-06, "loss": 0.0268, "mean_token_accuracy": 0.9919464886188507, "step": 145 }, { "epoch": 4.300187617260788, "grad_norm": 0.0934195965967741, "learning_rate": 1.7196266310299108e-06, "loss": 0.0271, "mean_token_accuracy": 0.9918058719485998, "step": 146 }, { "epoch": 4.330206378986867, "grad_norm": 0.09146571370676639, "learning_rate": 1.5452166019378989e-06, "loss": 0.0273, "mean_token_accuracy": 0.9917649105191231, "step": 147 }, { "epoch": 4.360225140712946, "grad_norm": 0.09753557772930677, "learning_rate": 1.379848838328049e-06, "loss": 0.0286, "mean_token_accuracy": 0.9913905151188374, "step": 148 }, { "epoch": 4.390243902439025, "grad_norm": 0.11653668851139358, "learning_rate": 1.2235870926211619e-06, "loss": 0.0277, "mean_token_accuracy": 0.9916701205074787, "step": 149 }, { "epoch": 4.4202626641651035, "grad_norm": 0.09067908471373788, "learning_rate": 1.0764916066947794e-06, "loss": 0.0258, "mean_token_accuracy": 0.9922576006501913, "step": 150 }, { "epoch": 4.450281425891182, "grad_norm": 0.09653433513408423, "learning_rate": 9.386190886588208e-07, "loss": 0.0271, "mean_token_accuracy": 0.9919117372483015, "step": 151 }, { "epoch": 4.480300187617261, "grad_norm": 0.0987084116462941, "learning_rate": 8.10022690993506e-07, "loss": 0.028, "mean_token_accuracy": 0.9915720969438553, "step": 152 }, { "epoch": 4.51031894934334, "grad_norm": 0.10457824343264062, "learning_rate": 6.907519900580861e-07, "loss": 0.0302, "mean_token_accuracy": 0.9909002613276243, "step": 153 }, { "epoch": 4.5403377110694185, "grad_norm": 0.10340772315470596, "learning_rate": 5.808529669781904e-07, "loss": 0.0264, "mean_token_accuracy": 0.9919101018458605, "step": 154 }, { "epoch": 4.570356472795497, "grad_norm": 0.09744615982408229, "learning_rate": 4.803679899192392e-07, "loss": 0.0285, "mean_token_accuracy": 0.9909517038613558, "step": 155 }, { "epoch": 4.600375234521576, "grad_norm": 0.09254462095652977, "learning_rate": 3.8933579775271013e-07, "loss": 0.0263, "mean_token_accuracy": 0.9920994155108929, "step": 156 }, { "epoch": 4.630393996247655, "grad_norm": 0.09530189373391666, "learning_rate": 3.077914851215585e-07, "loss": 0.0283, "mean_token_accuracy": 0.9914026968181133, "step": 157 }, { "epoch": 4.6604127579737336, "grad_norm": 0.09988980276705559, "learning_rate": 2.3576648891056875e-07, "loss": 0.027, "mean_token_accuracy": 0.9920587744563818, "step": 158 }, { "epoch": 4.690431519699812, "grad_norm": 0.09117032881835743, "learning_rate": 1.732885761268427e-07, "loss": 0.0262, "mean_token_accuracy": 0.991992175579071, "step": 159 }, { "epoch": 4.720450281425891, "grad_norm": 0.09146419854434591, "learning_rate": 1.2038183319507955e-07, "loss": 0.0264, "mean_token_accuracy": 0.9921840745955706, "step": 160 }, { "epoch": 4.75046904315197, "grad_norm": 0.0941692484126693, "learning_rate": 7.706665667180091e-08, "loss": 0.0262, "mean_token_accuracy": 0.992155384272337, "step": 161 }, { "epoch": 4.780487804878049, "grad_norm": 0.09868857104545, "learning_rate": 4.335974538210441e-08, "loss": 0.0286, "mean_token_accuracy": 0.9914395287632942, "step": 162 }, { "epoch": 4.810506566604127, "grad_norm": 0.098281670570088, "learning_rate": 1.9274093981927478e-08, "loss": 0.0269, "mean_token_accuracy": 0.9919413533061743, "step": 163 }, { "epoch": 4.840525328330206, "grad_norm": 0.08992856114449468, "learning_rate": 4.818987948379539e-09, "loss": 0.0266, "mean_token_accuracy": 0.9920829199254513, "step": 164 }, { "epoch": 4.870544090056285, "grad_norm": 0.09146022503539022, "learning_rate": 0.0, "loss": 0.0274, "mean_token_accuracy": 0.9918729793280363, "step": 165 }, { "epoch": 4.870544090056285, "step": 165, "total_flos": 195199345459200.0, "train_loss": 0.14440994993077985, "train_runtime": 12814.9616, "train_samples_per_second": 3.325, "train_steps_per_second": 0.013 } ], "logging_steps": 1, "max_steps": 165, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 195199345459200.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }