{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.9655172413793105,
  "eval_steps": 500,
  "global_step": 125,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03940886699507389,
      "grad_norm": 2.5584669687578936,
      "learning_rate": 6.153846153846155e-06,
      "loss": 0.7558,
      "step": 1
    },
    {
      "epoch": 0.07881773399014778,
      "grad_norm": 2.565466980202395,
      "learning_rate": 1.230769230769231e-05,
      "loss": 0.7528,
      "step": 2
    },
    {
      "epoch": 0.11822660098522167,
      "grad_norm": 1.8753957272537682,
      "learning_rate": 1.8461538461538465e-05,
      "loss": 0.7189,
      "step": 3
    },
    {
      "epoch": 0.15763546798029557,
      "grad_norm": 1.5499772451700504,
      "learning_rate": 2.461538461538462e-05,
      "loss": 0.6998,
      "step": 4
    },
    {
      "epoch": 0.19704433497536947,
      "grad_norm": 1.629703652668965,
      "learning_rate": 3.0769230769230774e-05,
      "loss": 0.639,
      "step": 5
    },
    {
      "epoch": 0.23645320197044334,
      "grad_norm": 1.351435580335104,
      "learning_rate": 3.692307692307693e-05,
      "loss": 0.6071,
      "step": 6
    },
    {
      "epoch": 0.27586206896551724,
      "grad_norm": 2.1491351662060074,
      "learning_rate": 4.307692307692308e-05,
      "loss": 0.6034,
      "step": 7
    },
    {
      "epoch": 0.31527093596059114,
      "grad_norm": 1.1876259973559116,
      "learning_rate": 4.923076923076924e-05,
      "loss": 0.5724,
      "step": 8
    },
    {
      "epoch": 0.35467980295566504,
      "grad_norm": 1.371610184643724,
      "learning_rate": 5.538461538461539e-05,
      "loss": 0.5544,
      "step": 9
    },
    {
      "epoch": 0.39408866995073893,
      "grad_norm": 1.032169030947674,
      "learning_rate": 6.153846153846155e-05,
      "loss": 0.5503,
      "step": 10
    },
    {
      "epoch": 0.43349753694581283,
      "grad_norm": 1.1747270299234402,
      "learning_rate": 6.76923076923077e-05,
      "loss": 0.5314,
      "step": 11
    },
    {
      "epoch": 0.4729064039408867,
      "grad_norm": 0.9851373875214015,
      "learning_rate": 7.384615384615386e-05,
      "loss": 0.5293,
      "step": 12
    },
    {
      "epoch": 0.5123152709359606,
      "grad_norm": 0.9498199365678511,
      "learning_rate": 8e-05,
      "loss": 0.5197,
      "step": 13
    },
    {
      "epoch": 0.5517241379310345,
      "grad_norm": 0.6850321797374413,
      "learning_rate": 7.998426505532213e-05,
      "loss": 0.5108,
      "step": 14
    },
    {
      "epoch": 0.5911330049261084,
      "grad_norm": 0.741883724345909,
      "learning_rate": 7.993707260071268e-05,
      "loss": 0.5039,
      "step": 15
    },
    {
      "epoch": 0.6305418719211823,
      "grad_norm": 0.6201096559420001,
      "learning_rate": 7.985845976470478e-05,
      "loss": 0.492,
      "step": 16
    },
    {
      "epoch": 0.6699507389162561,
      "grad_norm": 0.5810333918069511,
      "learning_rate": 7.974848839572971e-05,
      "loss": 0.4894,
      "step": 17
    },
    {
      "epoch": 0.7093596059113301,
      "grad_norm": 0.538664331289255,
      "learning_rate": 7.960724501345783e-05,
      "loss": 0.478,
      "step": 18
    },
    {
      "epoch": 0.7487684729064039,
      "grad_norm": 0.48351034605589627,
      "learning_rate": 7.943484074072943e-05,
      "loss": 0.4758,
      "step": 19
    },
    {
      "epoch": 0.7881773399014779,
      "grad_norm": 0.5588522296796855,
      "learning_rate": 7.923141121612922e-05,
      "loss": 0.4755,
      "step": 20
    },
    {
      "epoch": 0.8275862068965517,
      "grad_norm": 0.5099414849429755,
      "learning_rate": 7.899711648727294e-05,
      "loss": 0.4669,
      "step": 21
    },
    {
      "epoch": 0.8669950738916257,
      "grad_norm": 0.5111926450042357,
      "learning_rate": 7.873214088489047e-05,
      "loss": 0.4634,
      "step": 22
    },
    {
      "epoch": 0.9064039408866995,
      "grad_norm": 0.42484145211363433,
      "learning_rate": 7.843669287780399e-05,
      "loss": 0.4539,
      "step": 23
    },
    {
      "epoch": 0.9458128078817734,
      "grad_norm": 0.4063070610232536,
      "learning_rate": 7.811100490891586e-05,
      "loss": 0.4545,
      "step": 24
    },
    {
      "epoch": 0.9852216748768473,
      "grad_norm": 0.34890570659951814,
      "learning_rate": 7.775533321233471e-05,
      "loss": 0.4472,
      "step": 25
    },
    {
      "epoch": 1.0344827586206897,
      "grad_norm": 0.4447080012153922,
      "learning_rate": 7.736995761178399e-05,
      "loss": 0.4415,
      "step": 26
    },
    {
      "epoch": 1.0738916256157636,
      "grad_norm": 0.302803670590809,
      "learning_rate": 7.695518130045147e-05,
      "loss": 0.4339,
      "step": 27
    },
    {
      "epoch": 1.1133004926108374,
      "grad_norm": 0.35078704995091015,
      "learning_rate": 7.651133060245276e-05,
      "loss": 0.4334,
      "step": 28
    },
    {
      "epoch": 1.1527093596059113,
      "grad_norm": 0.347929638232275,
      "learning_rate": 7.603875471609677e-05,
      "loss": 0.4306,
      "step": 29
    },
    {
      "epoch": 1.1921182266009853,
      "grad_norm": 0.34132416804989185,
      "learning_rate": 7.55378254391549e-05,
      "loss": 0.4303,
      "step": 30
    },
    {
      "epoch": 1.2315270935960592,
      "grad_norm": 0.2780341991318012,
      "learning_rate": 7.500893687635015e-05,
      "loss": 0.4187,
      "step": 31
    },
    {
      "epoch": 1.270935960591133,
      "grad_norm": 0.236156587933769,
      "learning_rate": 7.445250512929637e-05,
      "loss": 0.4163,
      "step": 32
    },
    {
      "epoch": 1.3103448275862069,
      "grad_norm": 0.28381264582341237,
      "learning_rate": 7.386896796913137e-05,
      "loss": 0.4112,
      "step": 33
    },
    {
      "epoch": 1.3497536945812807,
      "grad_norm": 0.222292178815377,
      "learning_rate": 7.325878449210182e-05,
      "loss": 0.4167,
      "step": 34
    },
    {
      "epoch": 1.3891625615763548,
      "grad_norm": 0.35512090170091437,
      "learning_rate": 7.262243475837041e-05,
      "loss": 0.4109,
      "step": 35
    },
    {
      "epoch": 1.4285714285714286,
      "grad_norm": 0.2511586648870301,
      "learning_rate": 7.196041941432998e-05,
      "loss": 0.4124,
      "step": 36
    },
    {
      "epoch": 1.4679802955665024,
      "grad_norm": 0.22170782679952175,
      "learning_rate": 7.12732592987212e-05,
      "loss": 0.4056,
      "step": 37
    },
    {
      "epoch": 1.5073891625615765,
      "grad_norm": 0.18785361140671847,
      "learning_rate": 7.05614950328643e-05,
      "loss": 0.4087,
      "step": 38
    },
    {
      "epoch": 1.5467980295566504,
      "grad_norm": 0.20698082992129718,
      "learning_rate": 6.982568659532663e-05,
      "loss": 0.412,
      "step": 39
    },
    {
      "epoch": 1.5862068965517242,
      "grad_norm": 0.2923360568345124,
      "learning_rate": 6.906641288136109e-05,
      "loss": 0.4077,
      "step": 40
    },
    {
      "epoch": 1.625615763546798,
      "grad_norm": 0.36347305309861494,
      "learning_rate": 6.828427124746191e-05,
      "loss": 0.4135,
      "step": 41
    },
    {
      "epoch": 1.6650246305418719,
      "grad_norm": 0.42985470055529523,
      "learning_rate": 6.747987704139607e-05,
      "loss": 0.408,
      "step": 42
    },
    {
      "epoch": 1.7044334975369457,
      "grad_norm": 0.505450596225966,
      "learning_rate": 6.665386311808017e-05,
      "loss": 0.4125,
      "step": 43
    },
    {
      "epoch": 1.7438423645320196,
      "grad_norm": 0.49870209403284876,
      "learning_rate": 6.580687934168352e-05,
      "loss": 0.4028,
      "step": 44
    },
    {
      "epoch": 1.7832512315270936,
      "grad_norm": 0.3113343805759419,
      "learning_rate": 6.493959207434934e-05,
      "loss": 0.403,
      "step": 45
    },
    {
      "epoch": 1.8226600985221675,
      "grad_norm": 0.2080418139358356,
      "learning_rate": 6.405268365193624e-05,
      "loss": 0.4143,
      "step": 46
    },
    {
      "epoch": 1.8620689655172413,
      "grad_norm": 0.35947872339994125,
      "learning_rate": 6.314685184719224e-05,
      "loss": 0.3986,
      "step": 47
    },
    {
      "epoch": 1.9014778325123154,
      "grad_norm": 0.2889476841026003,
      "learning_rate": 6.22228093207841e-05,
      "loss": 0.4034,
      "step": 48
    },
    {
      "epoch": 1.9408866995073892,
      "grad_norm": 0.16926057758761603,
      "learning_rate": 6.128128306061347e-05,
      "loss": 0.398,
      "step": 49
    },
    {
      "epoch": 1.980295566502463,
      "grad_norm": 0.3015239387441465,
      "learning_rate": 6.0323013809861185e-05,
      "loss": 0.3946,
      "step": 50
    },
    {
      "epoch": 2.0295566502463056,
      "grad_norm": 0.27369640017832153,
      "learning_rate": 5.9348755484209597e-05,
      "loss": 0.3927,
      "step": 51
    },
    {
      "epoch": 2.0689655172413794,
      "grad_norm": 0.1434834184101561,
      "learning_rate": 5.835927457870151e-05,
      "loss": 0.3811,
      "step": 52
    },
    {
      "epoch": 2.1083743842364533,
      "grad_norm": 0.2854039627965572,
      "learning_rate": 5.735534956470233e-05,
      "loss": 0.3757,
      "step": 53
    },
    {
      "epoch": 2.147783251231527,
      "grad_norm": 0.3009695490584673,
      "learning_rate": 5.6337770277439854e-05,
      "loss": 0.3818,
      "step": 54
    },
    {
      "epoch": 2.187192118226601,
      "grad_norm": 0.154882062870217,
      "learning_rate": 5.5307337294603595e-05,
      "loss": 0.3758,
      "step": 55
    },
    {
      "epoch": 2.226600985221675,
      "grad_norm": 0.23852846311077655,
      "learning_rate": 5.4264861306492525e-05,
      "loss": 0.3667,
      "step": 56
    },
    {
      "epoch": 2.2660098522167487,
      "grad_norm": 0.22608925683357714,
      "learning_rate": 5.321116247820669e-05,
      "loss": 0.3711,
      "step": 57
    },
    {
      "epoch": 2.3054187192118225,
      "grad_norm": 0.1583983272233887,
      "learning_rate": 5.214706980438459e-05,
      "loss": 0.367,
      "step": 58
    },
    {
      "epoch": 2.344827586206897,
      "grad_norm": 0.205572865908627,
      "learning_rate": 5.107342045699397e-05,
      "loss": 0.3651,
      "step": 59
    },
    {
      "epoch": 2.3842364532019706,
      "grad_norm": 0.2250613392996445,
      "learning_rate": 4.999105912668908e-05,
      "loss": 0.3723,
      "step": 60
    },
    {
      "epoch": 2.4236453201970445,
      "grad_norm": 0.12838953670182573,
      "learning_rate": 4.890083735825258e-05,
      "loss": 0.3696,
      "step": 61
    },
    {
      "epoch": 2.4630541871921183,
      "grad_norm": 0.2251988369582669,
      "learning_rate": 4.780361288064514e-05,
      "loss": 0.3676,
      "step": 62
    },
    {
      "epoch": 2.502463054187192,
      "grad_norm": 0.1725969347073438,
      "learning_rate": 4.670024893218946e-05,
      "loss": 0.3697,
      "step": 63
    },
    {
      "epoch": 2.541871921182266,
      "grad_norm": 0.12188885415056715,
      "learning_rate": 4.5591613581419984e-05,
      "loss": 0.3576,
      "step": 64
    },
    {
      "epoch": 2.58128078817734,
      "grad_norm": 0.18927822293311972,
      "learning_rate": 4.4478579044132314e-05,
      "loss": 0.3667,
      "step": 65
    },
    {
      "epoch": 2.6206896551724137,
      "grad_norm": 0.11561614146910962,
      "learning_rate": 4.336202099716991e-05,
      "loss": 0.3658,
      "step": 66
    },
    {
      "epoch": 2.6600985221674875,
      "grad_norm": 0.1516544571012481,
      "learning_rate": 4.2242817889487676e-05,
      "loss": 0.3648,
      "step": 67
    },
    {
      "epoch": 2.6995073891625614,
      "grad_norm": 0.1317279280788641,
      "learning_rate": 4.112185025103476e-05,
      "loss": 0.3615,
      "step": 68
    },
    {
      "epoch": 2.7389162561576352,
      "grad_norm": 0.11760148271456246,
      "learning_rate": 4e-05,
      "loss": 0.3663,
      "step": 69
    },
    {
      "epoch": 2.7783251231527095,
      "grad_norm": 0.12906163898612472,
      "learning_rate": 3.8878149748965245e-05,
      "loss": 0.3606,
      "step": 70
    },
    {
      "epoch": 2.8177339901477834,
      "grad_norm": 0.09949229840426103,
      "learning_rate": 3.775718211051233e-05,
      "loss": 0.3667,
      "step": 71
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 0.12817295262296305,
      "learning_rate": 3.6637979002830106e-05,
      "loss": 0.3691,
      "step": 72
    },
    {
      "epoch": 2.896551724137931,
      "grad_norm": 0.09926866475598084,
      "learning_rate": 3.552142095586769e-05,
      "loss": 0.3664,
      "step": 73
    },
    {
      "epoch": 2.935960591133005,
      "grad_norm": 0.1188738983781825,
      "learning_rate": 3.4408386418580036e-05,
      "loss": 0.3717,
      "step": 74
    },
    {
      "epoch": 2.9753694581280787,
      "grad_norm": 0.09857847772154998,
      "learning_rate": 3.329975106781055e-05,
      "loss": 0.358,
      "step": 75
    },
    {
      "epoch": 3.0246305418719213,
      "grad_norm": 0.12437210969829017,
      "learning_rate": 3.219638711935488e-05,
      "loss": 0.3487,
      "step": 76
    },
    {
      "epoch": 3.064039408866995,
      "grad_norm": 0.13092335355733034,
      "learning_rate": 3.109916264174743e-05,
      "loss": 0.3378,
      "step": 77
    },
    {
      "epoch": 3.103448275862069,
      "grad_norm": 0.1380029588610029,
      "learning_rate": 3.000894087331092e-05,
      "loss": 0.3471,
      "step": 78
    },
    {
      "epoch": 3.142857142857143,
      "grad_norm": 0.16659609357249366,
      "learning_rate": 2.892657954300603e-05,
      "loss": 0.345,
      "step": 79
    },
    {
      "epoch": 3.1822660098522166,
      "grad_norm": 0.1491210789680348,
      "learning_rate": 2.7852930195615413e-05,
      "loss": 0.3423,
      "step": 80
    },
    {
      "epoch": 3.2216748768472905,
      "grad_norm": 0.13323974852056628,
      "learning_rate": 2.678883752179333e-05,
      "loss": 0.3375,
      "step": 81
    },
    {
      "epoch": 3.2610837438423648,
      "grad_norm": 0.16071703950070282,
      "learning_rate": 2.573513869350748e-05,
      "loss": 0.3449,
      "step": 82
    },
    {
      "epoch": 3.3004926108374386,
      "grad_norm": 0.11154281193190216,
      "learning_rate": 2.4692662705396412e-05,
      "loss": 0.3477,
      "step": 83
    },
    {
      "epoch": 3.3399014778325125,
      "grad_norm": 0.13730160221975535,
      "learning_rate": 2.366222972256016e-05,
      "loss": 0.3393,
      "step": 84
    },
    {
      "epoch": 3.3793103448275863,
      "grad_norm": 0.1001261256156619,
      "learning_rate": 2.264465043529768e-05,
      "loss": 0.3386,
      "step": 85
    },
    {
      "epoch": 3.41871921182266,
      "grad_norm": 0.11871409719140899,
      "learning_rate": 2.1640725421298487e-05,
      "loss": 0.3413,
      "step": 86
    },
    {
      "epoch": 3.458128078817734,
      "grad_norm": 0.10822197872193218,
      "learning_rate": 2.065124451579041e-05,
      "loss": 0.3415,
      "step": 87
    },
    {
      "epoch": 3.497536945812808,
      "grad_norm": 0.09848707693593217,
      "learning_rate": 1.9676986190138835e-05,
      "loss": 0.3424,
      "step": 88
    },
    {
      "epoch": 3.5369458128078817,
      "grad_norm": 0.10650565727211944,
      "learning_rate": 1.8718716939386543e-05,
      "loss": 0.3438,
      "step": 89
    },
    {
      "epoch": 3.5763546798029555,
      "grad_norm": 0.09005212054070572,
      "learning_rate": 1.7777190679215923e-05,
      "loss": 0.3413,
      "step": 90
    },
    {
      "epoch": 3.6157635467980294,
      "grad_norm": 0.09471297169488595,
      "learning_rate": 1.6853148152807774e-05,
      "loss": 0.3394,
      "step": 91
    },
    {
      "epoch": 3.655172413793103,
      "grad_norm": 0.08803273896652859,
      "learning_rate": 1.5947316348063764e-05,
      "loss": 0.3452,
      "step": 92
    },
    {
      "epoch": 3.6945812807881775,
      "grad_norm": 0.08555929815288703,
      "learning_rate": 1.5060407925650662e-05,
      "loss": 0.3386,
      "step": 93
    },
    {
      "epoch": 3.7339901477832513,
      "grad_norm": 0.07676232199734123,
      "learning_rate": 1.4193120658316506e-05,
      "loss": 0.3384,
      "step": 94
    },
    {
      "epoch": 3.773399014778325,
      "grad_norm": 0.07008988332755012,
      "learning_rate": 1.3346136881919845e-05,
      "loss": 0.3423,
      "step": 95
    },
    {
      "epoch": 3.812807881773399,
      "grad_norm": 0.07699181822522967,
      "learning_rate": 1.2520122958603933e-05,
      "loss": 0.3394,
      "step": 96
    },
    {
      "epoch": 3.852216748768473,
      "grad_norm": 0.07037612515608051,
      "learning_rate": 1.1715728752538103e-05,
      "loss": 0.3377,
      "step": 97
    },
    {
      "epoch": 3.8916256157635467,
      "grad_norm": 0.06692075874239917,
      "learning_rate": 1.0933587118638927e-05,
      "loss": 0.3389,
      "step": 98
    },
    {
      "epoch": 3.9310344827586206,
      "grad_norm": 0.07239315796297541,
      "learning_rate": 1.0174313404673378e-05,
      "loss": 0.3386,
      "step": 99
    },
    {
      "epoch": 3.970443349753695,
      "grad_norm": 0.06509354323682291,
      "learning_rate": 9.438504967135703e-06,
      "loss": 0.3435,
      "step": 100
    },
    {
      "epoch": 4.019704433497537,
      "grad_norm": 0.07415226571205577,
      "learning_rate": 8.72674070127881e-06,
      "loss": 0.3299,
      "step": 101
    },
    {
      "epoch": 4.059113300492611,
      "grad_norm": 0.09427673195861289,
      "learning_rate": 8.039580585670047e-06,
      "loss": 0.3271,
      "step": 102
    },
    {
      "epoch": 4.098522167487685,
      "grad_norm": 0.07768702640271646,
      "learning_rate": 7.3775652416295936e-06,
      "loss": 0.3298,
      "step": 103
    },
    {
      "epoch": 4.137931034482759,
      "grad_norm": 0.07261233473354416,
      "learning_rate": 6.7412155078981865e-06,
      "loss": 0.3295,
      "step": 104
    },
    {
      "epoch": 4.177339901477833,
      "grad_norm": 0.07340342692044237,
      "learning_rate": 6.1310320308686354e-06,
      "loss": 0.3274,
      "step": 105
    },
    {
      "epoch": 4.216748768472907,
      "grad_norm": 0.07153254469501634,
      "learning_rate": 5.547494870703642e-06,
      "loss": 0.3307,
      "step": 106
    },
    {
      "epoch": 4.25615763546798,
      "grad_norm": 0.07858955565979121,
      "learning_rate": 4.991063123649853e-06,
      "loss": 0.326,
      "step": 107
    },
    {
      "epoch": 4.295566502463054,
      "grad_norm": 0.0747794841861464,
      "learning_rate": 4.462174560845114e-06,
      "loss": 0.3273,
      "step": 108
    },
    {
      "epoch": 4.334975369458128,
      "grad_norm": 0.06814386298200781,
      "learning_rate": 3.961245283903239e-06,
      "loss": 0.3304,
      "step": 109
    },
    {
      "epoch": 4.374384236453202,
      "grad_norm": 0.06709307703349002,
      "learning_rate": 3.4886693975472443e-06,
      "loss": 0.3285,
      "step": 110
    },
    {
      "epoch": 4.413793103448276,
      "grad_norm": 0.06195820663002009,
      "learning_rate": 3.0448186995485307e-06,
      "loss": 0.328,
      "step": 111
    },
    {
      "epoch": 4.45320197044335,
      "grad_norm": 0.06590977322439409,
      "learning_rate": 2.630042388216012e-06,
      "loss": 0.3314,
      "step": 112
    },
    {
      "epoch": 4.4926108374384235,
      "grad_norm": 0.06216983243982217,
      "learning_rate": 2.244666787665297e-06,
      "loss": 0.3285,
      "step": 113
    },
    {
      "epoch": 4.532019704433497,
      "grad_norm": 0.0568288191054022,
      "learning_rate": 1.888995091084147e-06,
      "loss": 0.3267,
      "step": 114
    },
    {
      "epoch": 4.571428571428571,
      "grad_norm": 0.05502800210878154,
      "learning_rate": 1.5633071221960205e-06,
      "loss": 0.3199,
      "step": 115
    },
    {
      "epoch": 4.610837438423645,
      "grad_norm": 0.05085194354160944,
      "learning_rate": 1.2678591151095466e-06,
      "loss": 0.3291,
      "step": 116
    },
    {
      "epoch": 4.650246305418719,
      "grad_norm": 0.0542030183423217,
      "learning_rate": 1.0028835127270553e-06,
      "loss": 0.328,
      "step": 117
    },
    {
      "epoch": 4.689655172413794,
      "grad_norm": 0.04943720054109848,
      "learning_rate": 7.685887838707828e-07,
      "loss": 0.3282,
      "step": 118
    },
    {
      "epoch": 4.7290640394088665,
      "grad_norm": 0.05350280931369455,
      "learning_rate": 5.651592592705646e-07,
      "loss": 0.3358,
      "step": 119
    },
    {
      "epoch": 4.768472906403941,
      "grad_norm": 0.051675651653332454,
      "learning_rate": 3.9275498654217425e-07,
      "loss": 0.3231,
      "step": 120
    },
    {
      "epoch": 4.807881773399015,
      "grad_norm": 0.049565582110987425,
      "learning_rate": 2.5151160427029584e-07,
      "loss": 0.3273,
      "step": 121
    },
    {
      "epoch": 4.847290640394089,
      "grad_norm": 0.05086813508645347,
      "learning_rate": 1.4154023529523663e-07,
      "loss": 0.3252,
      "step": 122
    },
    {
      "epoch": 4.886699507389163,
      "grad_norm": 0.04981788886449752,
      "learning_rate": 6.292739928733582e-08,
      "loss": 0.3251,
      "step": 123
    },
    {
      "epoch": 4.926108374384237,
      "grad_norm": 0.051983785964860994,
      "learning_rate": 1.5734944677885388e-08,
      "loss": 0.3261,
      "step": 124
    },
    {
      "epoch": 4.9655172413793105,
      "grad_norm": 0.05048075450561713,
      "learning_rate": 0.0,
      "loss": 0.3278,
      "step": 125
    },
    {
      "epoch": 4.9655172413793105,
      "step": 125,
      "total_flos": 4.29306832130723e+18,
      "train_loss": 0.40043015813827515,
      "train_runtime": 25003.3066,
      "train_samples_per_second": 2.597,
      "train_steps_per_second": 0.005
    }
  ],
  "logging_steps": 1,
  "max_steps": 125,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4.29306832130723e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}