{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.870544090056285,
  "eval_steps": 500,
  "global_step": 165,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0300187617260788,
      "grad_norm": 19.618404854139,
      "learning_rate": 1e-05,
      "loss": 0.6154,
      "mean_token_accuracy": 0.8398024253547192,
      "step": 1
    },
    {
      "epoch": 0.0600375234521576,
      "grad_norm": 21.798338409796745,
      "learning_rate": 2e-05,
      "loss": 0.6416,
      "mean_token_accuracy": 0.8340235594660044,
      "step": 2
    },
    {
      "epoch": 0.0900562851782364,
      "grad_norm": 13.833687232901854,
      "learning_rate": 3e-05,
      "loss": 0.5896,
      "mean_token_accuracy": 0.8433804120868444,
      "step": 3
    },
    {
      "epoch": 0.1200750469043152,
      "grad_norm": 5.505910810820941,
      "learning_rate": 4e-05,
      "loss": 0.5319,
      "mean_token_accuracy": 0.8556831870228052,
      "step": 4
    },
    {
      "epoch": 0.150093808630394,
      "grad_norm": 5.278702397056334,
      "learning_rate": 5e-05,
      "loss": 0.4437,
      "mean_token_accuracy": 0.8703144080936909,
      "step": 5
    },
    {
      "epoch": 0.1801125703564728,
      "grad_norm": 2.1162182135646033,
      "learning_rate": 4.9995181012051625e-05,
      "loss": 0.4193,
      "mean_token_accuracy": 0.878010880202055,
      "step": 6
    },
    {
      "epoch": 0.2101313320825516,
      "grad_norm": 14.208326816182495,
      "learning_rate": 4.9980725906018074e-05,
      "loss": 0.4096,
      "mean_token_accuracy": 0.8749048858880997,
      "step": 7
    },
    {
      "epoch": 0.2401500938086304,
      "grad_norm": 2.162621083920564,
      "learning_rate": 4.9956640254617906e-05,
      "loss": 0.3978,
      "mean_token_accuracy": 0.8762698639184237,
      "step": 8
    },
    {
      "epoch": 0.2701688555347092,
      "grad_norm": 2.034935016310286,
      "learning_rate": 4.99229333433282e-05,
      "loss": 0.374,
      "mean_token_accuracy": 0.8837966062128544,
      "step": 9
    },
    {
      "epoch": 0.300187617260788,
      "grad_norm": 1.2368690799465214,
      "learning_rate": 4.987961816680492e-05,
      "loss": 0.3545,
      "mean_token_accuracy": 0.8879855256527662,
      "step": 10
    },
    {
      "epoch": 0.3302063789868668,
      "grad_norm": 0.8933826446995154,
      "learning_rate": 4.982671142387316e-05,
      "loss": 0.3527,
      "mean_token_accuracy": 0.8875276073813438,
      "step": 11
    },
    {
      "epoch": 0.3602251407129456,
      "grad_norm": 0.8267017608965835,
      "learning_rate": 4.976423351108943e-05,
      "loss": 0.3186,
      "mean_token_accuracy": 0.8965118452906609,
      "step": 12
    },
    {
      "epoch": 0.3902439024390244,
      "grad_norm": 0.6967492468619846,
      "learning_rate": 4.9692208514878444e-05,
      "loss": 0.3016,
      "mean_token_accuracy": 0.9023277424275875,
      "step": 13
    },
    {
      "epoch": 0.4202626641651032,
      "grad_norm": 0.626201960051008,
      "learning_rate": 4.9610664202247294e-05,
      "loss": 0.3189,
      "mean_token_accuracy": 0.8961522448807955,
      "step": 14
    },
    {
      "epoch": 0.450281425891182,
      "grad_norm": 0.519219193366074,
      "learning_rate": 4.951963201008076e-05,
      "loss": 0.3031,
      "mean_token_accuracy": 0.9007221981883049,
      "step": 15
    },
    {
      "epoch": 0.4803001876172608,
      "grad_norm": 0.5430064582418314,
      "learning_rate": 4.9419147033021814e-05,
      "loss": 0.2963,
      "mean_token_accuracy": 0.9018692336976528,
      "step": 16
    },
    {
      "epoch": 0.5103189493433395,
      "grad_norm": 0.45295446404829903,
      "learning_rate": 4.9309248009941914e-05,
      "loss": 0.2945,
      "mean_token_accuracy": 0.9022990744560957,
      "step": 17
    },
    {
      "epoch": 0.5403377110694184,
      "grad_norm": 0.39810793732783883,
      "learning_rate": 4.9189977309006495e-05,
      "loss": 0.2867,
      "mean_token_accuracy": 0.9044581968337297,
      "step": 18
    },
    {
      "epoch": 0.5703564727954972,
      "grad_norm": 0.29279224179883323,
      "learning_rate": 4.906138091134118e-05,
      "loss": 0.2817,
      "mean_token_accuracy": 0.9055654220283031,
      "step": 19
    },
    {
      "epoch": 0.600375234521576,
      "grad_norm": 0.26896824294508065,
      "learning_rate": 4.892350839330522e-05,
      "loss": 0.2921,
      "mean_token_accuracy": 0.9025575239211321,
      "step": 20
    },
    {
      "epoch": 0.6303939962476548,
      "grad_norm": 0.2400534443678901,
      "learning_rate": 4.877641290737884e-05,
      "loss": 0.2831,
      "mean_token_accuracy": 0.9051281735301018,
      "step": 21
    },
    {
      "epoch": 0.6604127579737336,
      "grad_norm": 0.2249165746946966,
      "learning_rate": 4.862015116167196e-05,
      "loss": 0.2698,
      "mean_token_accuracy": 0.9093952961266041,
      "step": 22
    },
    {
      "epoch": 0.6904315196998124,
      "grad_norm": 0.264247809537063,
      "learning_rate": 4.8454783398062106e-05,
      "loss": 0.2686,
      "mean_token_accuracy": 0.9091940615326166,
      "step": 23
    },
    {
      "epoch": 0.7204502814258912,
      "grad_norm": 0.20930943221019285,
      "learning_rate": 4.828037336897009e-05,
      "loss": 0.2687,
      "mean_token_accuracy": 0.9089630376547575,
      "step": 24
    },
    {
      "epoch": 0.7504690431519699,
      "grad_norm": 0.23889395502942187,
      "learning_rate": 4.8096988312782174e-05,
      "loss": 0.2871,
      "mean_token_accuracy": 0.9030982349067926,
      "step": 25
    },
    {
      "epoch": 0.7804878048780488,
      "grad_norm": 0.21055564681809716,
      "learning_rate": 4.7904698927928406e-05,
      "loss": 0.272,
      "mean_token_accuracy": 0.9078760109841824,
      "step": 26
    },
    {
      "epoch": 0.8105065666041276,
      "grad_norm": 0.21681199372541698,
      "learning_rate": 4.7703579345627035e-05,
      "loss": 0.2619,
      "mean_token_accuracy": 0.9109147116541862,
      "step": 27
    },
    {
      "epoch": 0.8405253283302064,
      "grad_norm": 0.21247193653216784,
      "learning_rate": 4.749370710130554e-05,
      "loss": 0.2721,
      "mean_token_accuracy": 0.9074795469641685,
      "step": 28
    },
    {
      "epoch": 0.8705440900562852,
      "grad_norm": 0.20525916333687041,
      "learning_rate": 4.72751631047092e-05,
      "loss": 0.2539,
      "mean_token_accuracy": 0.9133741557598114,
      "step": 29
    },
    {
      "epoch": 0.900562851782364,
      "grad_norm": 0.21529654405923737,
      "learning_rate": 4.7048031608708876e-05,
      "loss": 0.2603,
      "mean_token_accuracy": 0.9109627865254879,
      "step": 30
    },
    {
      "epoch": 0.9305816135084428,
      "grad_norm": 0.20791794762620378,
      "learning_rate": 4.681240017681993e-05,
      "loss": 0.2593,
      "mean_token_accuracy": 0.9111653957515955,
      "step": 31
    },
    {
      "epoch": 0.9606003752345216,
      "grad_norm": 0.20774824517485244,
      "learning_rate": 4.65683596494448e-05,
      "loss": 0.2719,
      "mean_token_accuracy": 0.9068219736218452,
      "step": 32
    },
    {
      "epoch": 0.9906191369606003,
      "grad_norm": 0.28582938868285823,
      "learning_rate": 4.6316004108852305e-05,
      "loss": 0.2645,
      "mean_token_accuracy": 0.9088481441140175,
      "step": 33
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.28582938868285823,
      "learning_rate": 4.6055430842907167e-05,
      "loss": 0.2564,
      "mean_token_accuracy": 0.9133941173553467,
      "step": 34
    },
    {
      "epoch": 1.0300187617260788,
      "grad_norm": 0.3589081697155284,
      "learning_rate": 4.5786740307563636e-05,
      "loss": 0.2082,
      "mean_token_accuracy": 0.9285639356821775,
      "step": 35
    },
    {
      "epoch": 1.0600375234521575,
      "grad_norm": 0.19449792688035672,
      "learning_rate": 4.551003608813784e-05,
      "loss": 0.2047,
      "mean_token_accuracy": 0.9296260979026556,
      "step": 36
    },
    {
      "epoch": 1.0900562851782365,
      "grad_norm": 0.23233991689426617,
      "learning_rate": 4.522542485937369e-05,
      "loss": 0.1979,
      "mean_token_accuracy": 0.9314604848623276,
      "step": 37
    },
    {
      "epoch": 1.1200750469043153,
      "grad_norm": 0.21035371628271216,
      "learning_rate": 4.493301634431768e-05,
      "loss": 0.2014,
      "mean_token_accuracy": 0.9298410974442959,
      "step": 38
    },
    {
      "epoch": 1.150093808630394,
      "grad_norm": 0.20990624713625997,
      "learning_rate": 4.463292327201862e-05,
      "loss": 0.1913,
      "mean_token_accuracy": 0.933486595749855,
      "step": 39
    },
    {
      "epoch": 1.1801125703564728,
      "grad_norm": 0.2156272816847033,
      "learning_rate": 4.4325261334068426e-05,
      "loss": 0.2031,
      "mean_token_accuracy": 0.9307098593562841,
      "step": 40
    },
    {
      "epoch": 1.2101313320825515,
      "grad_norm": 0.21696878272059866,
      "learning_rate": 4.401014914000078e-05,
      "loss": 0.1915,
      "mean_token_accuracy": 0.9335418920964003,
      "step": 41
    },
    {
      "epoch": 1.2401500938086305,
      "grad_norm": 0.1818612765558643,
      "learning_rate": 4.3687708171564925e-05,
      "loss": 0.1791,
      "mean_token_accuracy": 0.9380327388644218,
      "step": 42
    },
    {
      "epoch": 1.2701688555347093,
      "grad_norm": 0.18129814277988898,
      "learning_rate": 4.335806273589214e-05,
      "loss": 0.1931,
      "mean_token_accuracy": 0.9324233587831259,
      "step": 43
    },
    {
      "epoch": 1.300187617260788,
      "grad_norm": 0.18921071728690822,
      "learning_rate": 4.302133991757297e-05,
      "loss": 0.1861,
      "mean_token_accuracy": 0.9347784202545881,
      "step": 44
    },
    {
      "epoch": 1.3302063789868668,
      "grad_norm": 0.1846346124739407,
      "learning_rate": 4.267766952966369e-05,
      "loss": 0.1978,
      "mean_token_accuracy": 0.9310048930346966,
      "step": 45
    },
    {
      "epoch": 1.3602251407129455,
      "grad_norm": 0.18689120002736795,
      "learning_rate": 4.23271840636409e-05,
      "loss": 0.1931,
      "mean_token_accuracy": 0.9321947041898966,
      "step": 46
    },
    {
      "epoch": 1.3902439024390243,
      "grad_norm": 0.18301258133692994,
      "learning_rate": 4.197001863832355e-05,
      "loss": 0.1991,
      "mean_token_accuracy": 0.9307528082281351,
      "step": 47
    },
    {
      "epoch": 1.4202626641651033,
      "grad_norm": 0.20071944245709974,
      "learning_rate": 4.1606310947782044e-05,
      "loss": 0.1883,
      "mean_token_accuracy": 0.9341552760452032,
      "step": 48
    },
    {
      "epoch": 1.450281425891182,
      "grad_norm": 0.21531485697866234,
      "learning_rate": 4.123620120825459e-05,
      "loss": 0.1793,
      "mean_token_accuracy": 0.9380034245550632,
      "step": 49
    },
    {
      "epoch": 1.4803001876172608,
      "grad_norm": 0.17040701196766744,
      "learning_rate": 4.085983210409114e-05,
      "loss": 0.17,
      "mean_token_accuracy": 0.9408059008419514,
      "step": 50
    },
    {
      "epoch": 1.5103189493433395,
      "grad_norm": 0.17082023776864208,
      "learning_rate": 4.047734873274586e-05,
      "loss": 0.1777,
      "mean_token_accuracy": 0.9373182617127895,
      "step": 51
    },
    {
      "epoch": 1.5403377110694185,
      "grad_norm": 0.18880547525592725,
      "learning_rate": 4.008889854883929e-05,
      "loss": 0.1905,
      "mean_token_accuracy": 0.9339997190982103,
      "step": 52
    },
    {
      "epoch": 1.5703564727954973,
      "grad_norm": 0.2003270144688197,
      "learning_rate": 3.969463130731183e-05,
      "loss": 0.1829,
      "mean_token_accuracy": 0.9364625960588455,
      "step": 53
    },
    {
      "epoch": 1.600375234521576,
      "grad_norm": 0.16248574881358357,
      "learning_rate": 3.9294699005690305e-05,
      "loss": 0.187,
      "mean_token_accuracy": 0.9349782522767782,
      "step": 54
    },
    {
      "epoch": 1.6303939962476548,
      "grad_norm": 0.16268952077579069,
      "learning_rate": 3.888925582549006e-05,
      "loss": 0.1806,
      "mean_token_accuracy": 0.9380554854869843,
      "step": 55
    },
    {
      "epoch": 1.6604127579737336,
      "grad_norm": 0.16260973286493194,
      "learning_rate": 3.847845807277502e-05,
      "loss": 0.1756,
      "mean_token_accuracy": 0.9381309170275927,
      "step": 56
    },
    {
      "epoch": 1.6904315196998123,
      "grad_norm": 0.18849387268876527,
      "learning_rate": 3.8062464117898724e-05,
      "loss": 0.1905,
      "mean_token_accuracy": 0.933776805177331,
      "step": 57
    },
    {
      "epoch": 1.720450281425891,
      "grad_norm": 0.1812480467627804,
      "learning_rate": 3.764143433444962e-05,
      "loss": 0.1845,
      "mean_token_accuracy": 0.9354843944311142,
      "step": 58
    },
    {
      "epoch": 1.7504690431519698,
      "grad_norm": 0.19727408903046884,
      "learning_rate": 3.721553103742388e-05,
      "loss": 0.1839,
      "mean_token_accuracy": 0.9353628680109978,
      "step": 59
    },
    {
      "epoch": 1.7804878048780488,
      "grad_norm": 0.16881751417638702,
      "learning_rate": 3.678491842064995e-05,
      "loss": 0.1847,
      "mean_token_accuracy": 0.9353015590459108,
      "step": 60
    },
    {
      "epoch": 1.8105065666041276,
      "grad_norm": 0.1805153593928837,
      "learning_rate": 3.634976249348867e-05,
      "loss": 0.189,
      "mean_token_accuracy": 0.9340192507952452,
      "step": 61
    },
    {
      "epoch": 1.8405253283302065,
      "grad_norm": 0.16744864978079732,
      "learning_rate": 3.591023101683355e-05,
      "loss": 0.1873,
      "mean_token_accuracy": 0.9332233294844627,
      "step": 62
    },
    {
      "epoch": 1.8705440900562853,
      "grad_norm": 0.20943512548005347,
      "learning_rate": 3.54664934384357e-05,
      "loss": 0.1833,
      "mean_token_accuracy": 0.9361728671938181,
      "step": 63
    },
    {
      "epoch": 1.900562851782364,
      "grad_norm": 0.15100109107147408,
      "learning_rate": 3.5018720827578524e-05,
      "loss": 0.177,
      "mean_token_accuracy": 0.9376390129327774,
      "step": 64
    },
    {
      "epoch": 1.9305816135084428,
      "grad_norm": 0.20136076678950812,
      "learning_rate": 3.456708580912725e-05,
      "loss": 0.1847,
      "mean_token_accuracy": 0.9356410764157772,
      "step": 65
    },
    {
      "epoch": 1.9606003752345216,
      "grad_norm": 0.16935110772638642,
      "learning_rate": 3.411176249697875e-05,
      "loss": 0.1882,
      "mean_token_accuracy": 0.9341955110430717,
      "step": 66
    },
    {
      "epoch": 1.9906191369606003,
      "grad_norm": 0.17801077092117232,
      "learning_rate": 3.365292642693732e-05,
      "loss": 0.1791,
      "mean_token_accuracy": 0.9368807151913643,
      "step": 67
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.17801077092117232,
      "learning_rate": 3.319075448904234e-05,
      "loss": 0.1817,
      "mean_token_accuracy": 0.9353618502616883,
      "step": 68
    },
    {
      "epoch": 2.0300187617260788,
      "grad_norm": 0.34283977157187906,
      "learning_rate": 3.272542485937369e-05,
      "loss": 0.1162,
      "mean_token_accuracy": 0.9604951441287994,
      "step": 69
    },
    {
      "epoch": 2.0600375234521575,
      "grad_norm": 0.25353133352641416,
      "learning_rate": 3.225711693136156e-05,
      "loss": 0.1155,
      "mean_token_accuracy": 0.9606517199426889,
      "step": 70
    },
    {
      "epoch": 2.0900562851782363,
      "grad_norm": 0.36813345733413727,
      "learning_rate": 3.178601124662686e-05,
      "loss": 0.1092,
      "mean_token_accuracy": 0.9620461780577898,
      "step": 71
    },
    {
      "epoch": 2.120075046904315,
      "grad_norm": 0.20837522140479256,
      "learning_rate": 3.131228942537895e-05,
      "loss": 0.1064,
      "mean_token_accuracy": 0.9636496491730213,
      "step": 72
    },
    {
      "epoch": 2.150093808630394,
      "grad_norm": 0.2546796945935164,
      "learning_rate": 3.083613409639764e-05,
      "loss": 0.1082,
      "mean_token_accuracy": 0.9626397844403982,
      "step": 73
    },
    {
      "epoch": 2.180112570356473,
      "grad_norm": 0.2517042958600063,
      "learning_rate": 3.035772882662627e-05,
      "loss": 0.1024,
      "mean_token_accuracy": 0.9642387926578522,
      "step": 74
    },
    {
      "epoch": 2.2101313320825517,
      "grad_norm": 0.16863389096389939,
      "learning_rate": 2.9877258050403212e-05,
      "loss": 0.1011,
      "mean_token_accuracy": 0.964973971247673,
      "step": 75
    },
    {
      "epoch": 2.2401500938086305,
      "grad_norm": 0.2256068322542817,
      "learning_rate": 2.9394906998358868e-05,
      "loss": 0.0979,
      "mean_token_accuracy": 0.9662024211138487,
      "step": 76
    },
    {
      "epoch": 2.2701688555347093,
      "grad_norm": 0.19130902536055486,
      "learning_rate": 2.8910861626005776e-05,
      "loss": 0.101,
      "mean_token_accuracy": 0.9646210763603449,
      "step": 77
    },
    {
      "epoch": 2.300187617260788,
      "grad_norm": 0.18029622833908,
      "learning_rate": 2.8425308542049206e-05,
      "loss": 0.0943,
      "mean_token_accuracy": 0.9668951816856861,
      "step": 78
    },
    {
      "epoch": 2.3302063789868668,
      "grad_norm": 0.1715983987427455,
      "learning_rate": 2.7938434936445945e-05,
      "loss": 0.1025,
      "mean_token_accuracy": 0.9641035441309214,
      "step": 79
    },
    {
      "epoch": 2.3602251407129455,
      "grad_norm": 0.17151947074238844,
      "learning_rate": 2.7450428508239024e-05,
      "loss": 0.0993,
      "mean_token_accuracy": 0.9651761185377836,
      "step": 80
    },
    {
      "epoch": 2.3902439024390243,
      "grad_norm": 0.17762362563985393,
      "learning_rate": 2.6961477393196126e-05,
      "loss": 0.1016,
      "mean_token_accuracy": 0.9645342864096165,
      "step": 81
    },
    {
      "epoch": 2.420262664165103,
      "grad_norm": 0.17493795219201744,
      "learning_rate": 2.6471770091279724e-05,
      "loss": 0.1032,
      "mean_token_accuracy": 0.965608624741435,
      "step": 82
    },
    {
      "epoch": 2.450281425891182,
      "grad_norm": 0.21622340080905333,
      "learning_rate": 2.598149539397672e-05,
      "loss": 0.1056,
      "mean_token_accuracy": 0.9633868020027876,
      "step": 83
    },
    {
      "epoch": 2.480300187617261,
      "grad_norm": 0.18325655719580544,
      "learning_rate": 2.5490842311515707e-05,
      "loss": 0.1003,
      "mean_token_accuracy": 0.9652356337755919,
      "step": 84
    },
    {
      "epoch": 2.5103189493433398,
      "grad_norm": 0.16079654454953773,
      "learning_rate": 2.5e-05,
      "loss": 0.0951,
      "mean_token_accuracy": 0.9671048391610384,
      "step": 85
    },
    {
      "epoch": 2.5403377110694185,
      "grad_norm": 0.177885663467419,
      "learning_rate": 2.4509157688484295e-05,
      "loss": 0.1019,
      "mean_token_accuracy": 0.9652324616909027,
      "step": 86
    },
    {
      "epoch": 2.5703564727954973,
      "grad_norm": 0.16463009515777124,
      "learning_rate": 2.4018504606023293e-05,
      "loss": 0.0983,
      "mean_token_accuracy": 0.9660285171121359,
      "step": 87
    },
    {
      "epoch": 2.600375234521576,
      "grad_norm": 0.14988401935266468,
      "learning_rate": 2.3528229908720272e-05,
      "loss": 0.0973,
      "mean_token_accuracy": 0.9662998840212822,
      "step": 88
    },
    {
      "epoch": 2.630393996247655,
      "grad_norm": 0.1714584031856408,
      "learning_rate": 2.303852260680388e-05,
      "loss": 0.0993,
      "mean_token_accuracy": 0.9654844384640455,
      "step": 89
    },
    {
      "epoch": 2.6604127579737336,
      "grad_norm": 0.15655630724758532,
      "learning_rate": 2.2549571491760986e-05,
      "loss": 0.1044,
      "mean_token_accuracy": 0.9633280653506517,
      "step": 90
    },
    {
      "epoch": 2.6904315196998123,
      "grad_norm": 0.15989111678931958,
      "learning_rate": 2.2061565063554064e-05,
      "loss": 0.0962,
      "mean_token_accuracy": 0.9662177134305239,
      "step": 91
    },
    {
      "epoch": 2.720450281425891,
      "grad_norm": 0.1612719262065956,
      "learning_rate": 2.1574691457950803e-05,
      "loss": 0.1,
      "mean_token_accuracy": 0.9648805633187294,
      "step": 92
    },
    {
      "epoch": 2.75046904315197,
      "grad_norm": 0.1447218929697437,
      "learning_rate": 2.1089138373994223e-05,
      "loss": 0.097,
      "mean_token_accuracy": 0.9660444520413876,
      "step": 93
    },
    {
      "epoch": 2.7804878048780486,
      "grad_norm": 0.15448044912912087,
      "learning_rate": 2.0605093001641138e-05,
      "loss": 0.1037,
      "mean_token_accuracy": 0.9642052594572306,
      "step": 94
    },
    {
      "epoch": 2.8105065666041273,
      "grad_norm": 0.14976483567215834,
      "learning_rate": 2.0122741949596797e-05,
      "loss": 0.103,
      "mean_token_accuracy": 0.9642070364207029,
      "step": 95
    },
    {
      "epoch": 2.8405253283302065,
      "grad_norm": 0.15397846138230065,
      "learning_rate": 1.9642271173373737e-05,
      "loss": 0.1024,
      "mean_token_accuracy": 0.9642751514911652,
      "step": 96
    },
    {
      "epoch": 2.8705440900562853,
      "grad_norm": 0.16533125622570222,
      "learning_rate": 1.9163865903602374e-05,
      "loss": 0.0983,
      "mean_token_accuracy": 0.9661570060998201,
      "step": 97
    },
    {
      "epoch": 2.900562851782364,
      "grad_norm": 0.14567827324511498,
      "learning_rate": 1.868771057462105e-05,
      "loss": 0.0895,
      "mean_token_accuracy": 0.9689803905785084,
      "step": 98
    },
    {
      "epoch": 2.930581613508443,
      "grad_norm": 0.13721507889257023,
      "learning_rate": 1.8213988753373146e-05,
      "loss": 0.1018,
      "mean_token_accuracy": 0.9658490009605885,
      "step": 99
    },
    {
      "epoch": 2.9606003752345216,
      "grad_norm": 0.18558487132226667,
      "learning_rate": 1.7742883068638447e-05,
      "loss": 0.0975,
      "mean_token_accuracy": 0.9673260115087032,
      "step": 100
    },
    {
      "epoch": 2.9906191369606003,
      "grad_norm": 0.14278892649537844,
      "learning_rate": 1.7274575140626318e-05,
      "loss": 0.0945,
      "mean_token_accuracy": 0.9672219399362803,
      "step": 101
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.14278892649537844,
      "learning_rate": 1.6809245510957665e-05,
      "loss": 0.104,
      "mean_token_accuracy": 0.9641202390193939,
      "step": 102
    },
    {
      "epoch": 3.0300187617260788,
      "grad_norm": 0.29206855231690615,
      "learning_rate": 1.6347073573062672e-05,
      "loss": 0.052,
      "mean_token_accuracy": 0.9840696156024933,
      "step": 103
    },
    {
      "epoch": 3.0600375234521575,
      "grad_norm": 0.21146610857781498,
      "learning_rate": 1.588823750302126e-05,
      "loss": 0.0506,
      "mean_token_accuracy": 0.9837026111781597,
      "step": 104
    },
    {
      "epoch": 3.0900562851782363,
      "grad_norm": 0.1728680637000517,
      "learning_rate": 1.5432914190872757e-05,
      "loss": 0.0492,
      "mean_token_accuracy": 0.9842210356146097,
      "step": 105
    },
    {
      "epoch": 3.120075046904315,
      "grad_norm": 0.137716977630954,
      "learning_rate": 1.498127917242148e-05,
      "loss": 0.0493,
      "mean_token_accuracy": 0.9839507173746824,
      "step": 106
    },
    {
      "epoch": 3.150093808630394,
      "grad_norm": 0.14551903804275892,
      "learning_rate": 1.4533506561564306e-05,
      "loss": 0.0544,
      "mean_token_accuracy": 0.9822139292955399,
      "step": 107
    },
    {
      "epoch": 3.180112570356473,
      "grad_norm": 0.16669835535632535,
      "learning_rate": 1.4089768983166444e-05,
      "loss": 0.0489,
      "mean_token_accuracy": 0.9840298742055893,
      "step": 108
    },
    {
      "epoch": 3.2101313320825517,
      "grad_norm": 0.18300271784408872,
      "learning_rate": 1.3650237506511331e-05,
      "loss": 0.0497,
      "mean_token_accuracy": 0.983882175758481,
      "step": 109
    },
    {
      "epoch": 3.2401500938086305,
      "grad_norm": 0.1843234481043501,
      "learning_rate": 1.3215081579350058e-05,
      "loss": 0.0485,
      "mean_token_accuracy": 0.9843094442039728,
      "step": 110
    },
    {
      "epoch": 3.2701688555347093,
      "grad_norm": 0.3461827490875774,
      "learning_rate": 1.2784468962576136e-05,
      "loss": 0.047,
      "mean_token_accuracy": 0.9847969133406878,
      "step": 111
    },
    {
      "epoch": 3.300187617260788,
      "grad_norm": 0.15632977455270483,
      "learning_rate": 1.235856566555039e-05,
      "loss": 0.049,
      "mean_token_accuracy": 0.9837981257587671,
      "step": 112
    },
    {
      "epoch": 3.3302063789868668,
      "grad_norm": 0.14640471914964392,
      "learning_rate": 1.1937535882101281e-05,
      "loss": 0.0458,
      "mean_token_accuracy": 0.9851204100996256,
      "step": 113
    },
    {
      "epoch": 3.3602251407129455,
      "grad_norm": 0.13729939899053178,
      "learning_rate": 1.1521541927224994e-05,
      "loss": 0.0456,
      "mean_token_accuracy": 0.9848766028881073,
      "step": 114
    },
    {
      "epoch": 3.3902439024390243,
      "grad_norm": 0.13806503349144675,
      "learning_rate": 1.1110744174509952e-05,
      "loss": 0.049,
      "mean_token_accuracy": 0.9844018053263426,
      "step": 115
    },
    {
      "epoch": 3.420262664165103,
      "grad_norm": 0.1677329902297057,
      "learning_rate": 1.0705300994309697e-05,
      "loss": 0.0509,
      "mean_token_accuracy": 0.9836404304951429,
      "step": 116
    },
    {
      "epoch": 3.450281425891182,
      "grad_norm": 0.1363456396457925,
      "learning_rate": 1.0305368692688174e-05,
      "loss": 0.0489,
      "mean_token_accuracy": 0.9842113871127367,
      "step": 117
    },
    {
      "epoch": 3.480300187617261,
      "grad_norm": 0.14670430283357652,
      "learning_rate": 9.911101451160715e-06,
      "loss": 0.0476,
      "mean_token_accuracy": 0.9845409169793129,
      "step": 118
    },
    {
      "epoch": 3.5103189493433398,
      "grad_norm": 0.13593290922974113,
      "learning_rate": 9.522651267254149e-06,
      "loss": 0.0498,
      "mean_token_accuracy": 0.9841745216399431,
      "step": 119
    },
    {
      "epoch": 3.5403377110694185,
      "grad_norm": 0.1405157943110022,
      "learning_rate": 9.140167895908867e-06,
      "loss": 0.0515,
      "mean_token_accuracy": 0.9838052876293659,
      "step": 120
    },
    {
      "epoch": 3.5703564727954973,
      "grad_norm": 0.13398507397046694,
      "learning_rate": 8.763798791745411e-06,
      "loss": 0.044,
      "mean_token_accuracy": 0.985531248152256,
      "step": 121
    },
    {
      "epoch": 3.600375234521576,
      "grad_norm": 0.12595342205919996,
      "learning_rate": 8.393689052217966e-06,
      "loss": 0.0443,
      "mean_token_accuracy": 0.9851887430995703,
      "step": 122
    },
    {
      "epoch": 3.630393996247655,
      "grad_norm": 0.12802288754577185,
      "learning_rate": 8.029981361676456e-06,
      "loss": 0.0477,
      "mean_token_accuracy": 0.9847091306000948,
      "step": 123
    },
    {
      "epoch": 3.6604127579737336,
      "grad_norm": 0.13540249038634009,
      "learning_rate": 7.672815936359107e-06,
      "loss": 0.0437,
      "mean_token_accuracy": 0.9858846813440323,
      "step": 124
    },
    {
      "epoch": 3.6904315196998123,
      "grad_norm": 0.1272358814553976,
      "learning_rate": 7.3223304703363135e-06,
      "loss": 0.0472,
      "mean_token_accuracy": 0.9844079315662384,
      "step": 125
    },
    {
      "epoch": 3.720450281425891,
      "grad_norm": 0.13634273240990136,
      "learning_rate": 6.9786600824270296e-06,
      "loss": 0.0427,
      "mean_token_accuracy": 0.9858784638345242,
      "step": 126
    },
    {
      "epoch": 3.75046904315197,
      "grad_norm": 0.15771802380242175,
      "learning_rate": 6.641937264107867e-06,
      "loss": 0.0469,
      "mean_token_accuracy": 0.9847830552607775,
      "step": 127
    },
    {
      "epoch": 3.7804878048780486,
      "grad_norm": 0.13754448160952976,
      "learning_rate": 6.312291828435077e-06,
      "loss": 0.0462,
      "mean_token_accuracy": 0.9851614981889725,
      "step": 128
    },
    {
      "epoch": 3.8105065666041273,
      "grad_norm": 0.14734425443158122,
      "learning_rate": 5.989850859999227e-06,
      "loss": 0.0422,
      "mean_token_accuracy": 0.9861964080482721,
      "step": 129
    },
    {
      "epoch": 3.8405253283302065,
      "grad_norm": 0.12037178039604896,
      "learning_rate": 5.674738665931575e-06,
      "loss": 0.0445,
      "mean_token_accuracy": 0.9854839760810137,
      "step": 130
    },
    {
      "epoch": 3.8705440900562853,
      "grad_norm": 0.13270114277633968,
      "learning_rate": 5.367076727981382e-06,
      "loss": 0.046,
      "mean_token_accuracy": 0.98503128439188,
      "step": 131
    },
    {
      "epoch": 3.900562851782364,
      "grad_norm": 0.12186409904443084,
      "learning_rate": 5.066983655682325e-06,
      "loss": 0.0413,
      "mean_token_accuracy": 0.9866250548511744,
      "step": 132
    },
    {
      "epoch": 3.930581613508443,
      "grad_norm": 0.11572178677884377,
      "learning_rate": 4.7745751406263165e-06,
      "loss": 0.0455,
      "mean_token_accuracy": 0.9853598214685917,
      "step": 133
    },
    {
      "epoch": 3.9606003752345216,
      "grad_norm": 0.1285035698798016,
      "learning_rate": 4.48996391186216e-06,
      "loss": 0.0446,
      "mean_token_accuracy": 0.9853111784905195,
      "step": 134
    },
    {
      "epoch": 3.9906191369606003,
      "grad_norm": 0.12237492704812947,
      "learning_rate": 4.213259692436367e-06,
      "loss": 0.0472,
      "mean_token_accuracy": 0.9846988655626774,
      "step": 135
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.17459270329168236,
      "learning_rate": 3.944569157092839e-06,
      "loss": 0.0397,
      "mean_token_accuracy": 0.9869139909744262,
      "step": 136
    },
    {
      "epoch": 4.030018761726079,
      "grad_norm": 0.21688322258057258,
      "learning_rate": 3.6839958911476957e-06,
      "loss": 0.0316,
      "mean_token_accuracy": 0.9908953290432692,
      "step": 137
    },
    {
      "epoch": 4.0600375234521575,
      "grad_norm": 0.12510412863095888,
      "learning_rate": 3.431640350555204e-06,
      "loss": 0.0298,
      "mean_token_accuracy": 0.9912976007908583,
      "step": 138
    },
    {
      "epoch": 4.090056285178236,
      "grad_norm": 0.11532401288217803,
      "learning_rate": 3.187599823180071e-06,
      "loss": 0.0291,
      "mean_token_accuracy": 0.9916361309587955,
      "step": 139
    },
    {
      "epoch": 4.120075046904315,
      "grad_norm": 0.11662866353878451,
      "learning_rate": 2.9519683912911266e-06,
      "loss": 0.0316,
      "mean_token_accuracy": 0.9906173534691334,
      "step": 140
    },
    {
      "epoch": 4.150093808630394,
      "grad_norm": 0.10785269578007366,
      "learning_rate": 2.7248368952908053e-06,
      "loss": 0.0278,
      "mean_token_accuracy": 0.9918341338634491,
      "step": 141
    },
    {
      "epoch": 4.1801125703564725,
      "grad_norm": 0.10540687517578978,
      "learning_rate": 2.506292898694468e-06,
      "loss": 0.0304,
      "mean_token_accuracy": 0.9909927677363157,
      "step": 142
    },
    {
      "epoch": 4.210131332082551,
      "grad_norm": 0.10795341728368958,
      "learning_rate": 2.296420654372966e-06,
      "loss": 0.0292,
      "mean_token_accuracy": 0.9913486260920763,
      "step": 143
    },
    {
      "epoch": 4.24015009380863,
      "grad_norm": 0.09918400957702202,
      "learning_rate": 2.0953010720716037e-06,
      "loss": 0.0285,
      "mean_token_accuracy": 0.991315545514226,
      "step": 144
    },
    {
      "epoch": 4.270168855534709,
      "grad_norm": 0.1028689650543891,
      "learning_rate": 1.9030116872178316e-06,
      "loss": 0.0268,
      "mean_token_accuracy": 0.9919464886188507,
      "step": 145
    },
    {
      "epoch": 4.300187617260788,
      "grad_norm": 0.0934195965967741,
      "learning_rate": 1.7196266310299108e-06,
      "loss": 0.0271,
      "mean_token_accuracy": 0.9918058719485998,
      "step": 146
    },
    {
      "epoch": 4.330206378986867,
      "grad_norm": 0.09146571370676639,
      "learning_rate": 1.5452166019378989e-06,
      "loss": 0.0273,
      "mean_token_accuracy": 0.9917649105191231,
      "step": 147
    },
    {
      "epoch": 4.360225140712946,
      "grad_norm": 0.09753557772930677,
      "learning_rate": 1.379848838328049e-06,
      "loss": 0.0286,
      "mean_token_accuracy": 0.9913905151188374,
      "step": 148
    },
    {
      "epoch": 4.390243902439025,
      "grad_norm": 0.11653668851139358,
      "learning_rate": 1.2235870926211619e-06,
      "loss": 0.0277,
      "mean_token_accuracy": 0.9916701205074787,
      "step": 149
    },
    {
      "epoch": 4.4202626641651035,
      "grad_norm": 0.09067908471373788,
      "learning_rate": 1.0764916066947794e-06,
      "loss": 0.0258,
      "mean_token_accuracy": 0.9922576006501913,
      "step": 150
    },
    {
      "epoch": 4.450281425891182,
      "grad_norm": 0.09653433513408423,
      "learning_rate": 9.386190886588208e-07,
      "loss": 0.0271,
      "mean_token_accuracy": 0.9919117372483015,
      "step": 151
    },
    {
      "epoch": 4.480300187617261,
      "grad_norm": 0.0987084116462941,
      "learning_rate": 8.10022690993506e-07,
      "loss": 0.028,
      "mean_token_accuracy": 0.9915720969438553,
      "step": 152
    },
    {
      "epoch": 4.51031894934334,
      "grad_norm": 0.10457824343264062,
      "learning_rate": 6.907519900580861e-07,
      "loss": 0.0302,
      "mean_token_accuracy": 0.9909002613276243,
      "step": 153
    },
    {
      "epoch": 4.5403377110694185,
      "grad_norm": 0.10340772315470596,
      "learning_rate": 5.808529669781904e-07,
      "loss": 0.0264,
      "mean_token_accuracy": 0.9919101018458605,
      "step": 154
    },
    {
      "epoch": 4.570356472795497,
      "grad_norm": 0.09744615982408229,
      "learning_rate": 4.803679899192392e-07,
      "loss": 0.0285,
      "mean_token_accuracy": 0.9909517038613558,
      "step": 155
    },
    {
      "epoch": 4.600375234521576,
      "grad_norm": 0.09254462095652977,
      "learning_rate": 3.8933579775271013e-07,
      "loss": 0.0263,
      "mean_token_accuracy": 0.9920994155108929,
      "step": 156
    },
    {
      "epoch": 4.630393996247655,
      "grad_norm": 0.09530189373391666,
      "learning_rate": 3.077914851215585e-07,
      "loss": 0.0283,
      "mean_token_accuracy": 0.9914026968181133,
      "step": 157
    },
    {
      "epoch": 4.6604127579737336,
      "grad_norm": 0.09988980276705559,
      "learning_rate": 2.3576648891056875e-07,
      "loss": 0.027,
      "mean_token_accuracy": 0.9920587744563818,
      "step": 158
    },
    {
      "epoch": 4.690431519699812,
      "grad_norm": 0.09117032881835743,
      "learning_rate": 1.732885761268427e-07,
      "loss": 0.0262,
      "mean_token_accuracy": 0.991992175579071,
      "step": 159
    },
    {
      "epoch": 4.720450281425891,
      "grad_norm": 0.09146419854434591,
      "learning_rate": 1.2038183319507955e-07,
      "loss": 0.0264,
      "mean_token_accuracy": 0.9921840745955706,
      "step": 160
    },
    {
      "epoch": 4.75046904315197,
      "grad_norm": 0.0941692484126693,
      "learning_rate": 7.706665667180091e-08,
      "loss": 0.0262,
      "mean_token_accuracy": 0.992155384272337,
      "step": 161
    },
    {
      "epoch": 4.780487804878049,
      "grad_norm": 0.09868857104545,
      "learning_rate": 4.335974538210441e-08,
      "loss": 0.0286,
      "mean_token_accuracy": 0.9914395287632942,
      "step": 162
    },
    {
      "epoch": 4.810506566604127,
      "grad_norm": 0.098281670570088,
      "learning_rate": 1.9274093981927478e-08,
      "loss": 0.0269,
      "mean_token_accuracy": 0.9919413533061743,
      "step": 163
    },
    {
      "epoch": 4.840525328330206,
      "grad_norm": 0.08992856114449468,
      "learning_rate": 4.818987948379539e-09,
      "loss": 0.0266,
      "mean_token_accuracy": 0.9920829199254513,
      "step": 164
    },
    {
      "epoch": 4.870544090056285,
      "grad_norm": 0.09146022503539022,
      "learning_rate": 0.0,
      "loss": 0.0274,
      "mean_token_accuracy": 0.9918729793280363,
      "step": 165
    },
    {
      "epoch": 4.870544090056285,
      "step": 165,
      "total_flos": 195199345459200.0,
      "train_loss": 0.14440994993077985,
      "train_runtime": 12814.9616,
      "train_samples_per_second": 3.325,
      "train_steps_per_second": 0.013
    }
  ],
  "logging_steps": 1,
  "max_steps": 165,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 195199345459200.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}