{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.896817068905212,
  "eval_steps": 500,
  "global_step": 21000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.023318176518596245,
      "grad_norm": 0.3708130121231079,
      "learning_rate": 0.0002,
      "loss": 1.1701,
      "step": 100
    },
    {
      "epoch": 0.04663635303719249,
      "grad_norm": 0.7055436968803406,
      "learning_rate": 0.0002,
      "loss": 0.9527,
      "step": 200
    },
    {
      "epoch": 0.06995452955578874,
      "grad_norm": 0.310996949672699,
      "learning_rate": 0.0002,
      "loss": 0.871,
      "step": 300
    },
    {
      "epoch": 0.09327270607438498,
      "grad_norm": 0.34611570835113525,
      "learning_rate": 0.0002,
      "loss": 0.8128,
      "step": 400
    },
    {
      "epoch": 0.11659088259298123,
      "grad_norm": 0.2793200612068176,
      "learning_rate": 0.0002,
      "loss": 0.8008,
      "step": 500
    },
    {
      "epoch": 0.13990905911157747,
      "grad_norm": 0.2440558820962906,
      "learning_rate": 0.0002,
      "loss": 0.7364,
      "step": 600
    },
    {
      "epoch": 0.16322723563017372,
      "grad_norm": 0.20660006999969482,
      "learning_rate": 0.0002,
      "loss": 0.7016,
      "step": 700
    },
    {
      "epoch": 0.18654541214876996,
      "grad_norm": 0.3151717782020569,
      "learning_rate": 0.0002,
      "loss": 0.6986,
      "step": 800
    },
    {
      "epoch": 0.2098635886673662,
      "grad_norm": 0.4207448363304138,
      "learning_rate": 0.0002,
      "loss": 0.649,
      "step": 900
    },
    {
      "epoch": 0.23318176518596245,
      "grad_norm": 0.43152570724487305,
      "learning_rate": 0.0002,
      "loss": 0.6725,
      "step": 1000
    },
    {
      "epoch": 0.2564999417045587,
      "grad_norm": 0.31539487838745117,
      "learning_rate": 0.0002,
      "loss": 0.6395,
      "step": 1100
    },
    {
      "epoch": 0.27981811822315494,
      "grad_norm": 0.3349384665489197,
      "learning_rate": 0.0002,
      "loss": 0.6033,
      "step": 1200
    },
    {
      "epoch": 0.3031362947417512,
      "grad_norm": 0.2724147140979767,
      "learning_rate": 0.0002,
      "loss": 0.6076,
      "step": 1300
    },
    {
      "epoch": 0.32645447126034743,
      "grad_norm": 0.2925530970096588,
      "learning_rate": 0.0002,
      "loss": 0.585,
      "step": 1400
    },
    {
      "epoch": 0.3497726477789437,
      "grad_norm": 0.4674293100833893,
      "learning_rate": 0.0002,
      "loss": 0.5657,
      "step": 1500
    },
    {
      "epoch": 0.3730908242975399,
      "grad_norm": 0.3915441930294037,
      "learning_rate": 0.0002,
      "loss": 0.5453,
      "step": 1600
    },
    {
      "epoch": 0.39640900081613617,
      "grad_norm": 0.24304556846618652,
      "learning_rate": 0.0002,
      "loss": 0.5198,
      "step": 1700
    },
    {
      "epoch": 0.4197271773347324,
      "grad_norm": 0.5447902679443359,
      "learning_rate": 0.0002,
      "loss": 0.5427,
      "step": 1800
    },
    {
      "epoch": 0.44304535385332866,
      "grad_norm": 0.4133426547050476,
      "learning_rate": 0.0002,
      "loss": 0.5204,
      "step": 1900
    },
    {
      "epoch": 0.4663635303719249,
      "grad_norm": 0.41733473539352417,
      "learning_rate": 0.0002,
      "loss": 0.5204,
      "step": 2000
    },
    {
      "epoch": 0.48968170689052115,
      "grad_norm": 0.3181161880493164,
      "learning_rate": 0.0002,
      "loss": 0.4698,
      "step": 2100
    },
    {
      "epoch": 0.5129998834091174,
      "grad_norm": 0.34142622351646423,
      "learning_rate": 0.0002,
      "loss": 0.4871,
      "step": 2200
    },
    {
      "epoch": 0.5363180599277136,
      "grad_norm": 0.1926470398902893,
      "learning_rate": 0.0002,
      "loss": 0.4649,
      "step": 2300
    },
    {
      "epoch": 0.5596362364463099,
      "grad_norm": 0.30340591073036194,
      "learning_rate": 0.0002,
      "loss": 0.4665,
      "step": 2400
    },
    {
      "epoch": 0.5829544129649061,
      "grad_norm": 0.3195839524269104,
      "learning_rate": 0.0002,
      "loss": 0.4667,
      "step": 2500
    },
    {
      "epoch": 0.6062725894835024,
      "grad_norm": 0.2145429104566574,
      "learning_rate": 0.0002,
      "loss": 0.4463,
      "step": 2600
    },
    {
      "epoch": 0.6295907660020986,
      "grad_norm": 0.15962275862693787,
      "learning_rate": 0.0002,
      "loss": 0.429,
      "step": 2700
    },
    {
      "epoch": 0.6529089425206949,
      "grad_norm": 0.3597501516342163,
      "learning_rate": 0.0002,
      "loss": 0.4277,
      "step": 2800
    },
    {
      "epoch": 0.6762271190392911,
      "grad_norm": 0.44612497091293335,
      "learning_rate": 0.0002,
      "loss": 0.4123,
      "step": 2900
    },
    {
      "epoch": 0.6995452955578874,
      "grad_norm": 0.21562007069587708,
      "learning_rate": 0.0002,
      "loss": 0.4074,
      "step": 3000
    },
    {
      "epoch": 0.7228634720764836,
      "grad_norm": 0.23217037320137024,
      "learning_rate": 0.0002,
      "loss": 0.4037,
      "step": 3100
    },
    {
      "epoch": 0.7461816485950798,
      "grad_norm": 0.3096787631511688,
      "learning_rate": 0.0002,
      "loss": 0.401,
      "step": 3200
    },
    {
      "epoch": 0.7694998251136761,
      "grad_norm": 0.18558426201343536,
      "learning_rate": 0.0002,
      "loss": 0.3983,
      "step": 3300
    },
    {
      "epoch": 0.7928180016322723,
      "grad_norm": 0.2520066797733307,
      "learning_rate": 0.0002,
      "loss": 0.4056,
      "step": 3400
    },
    {
      "epoch": 0.8161361781508686,
      "grad_norm": 0.41013041138648987,
      "learning_rate": 0.0002,
      "loss": 0.3706,
      "step": 3500
    },
    {
      "epoch": 0.8394543546694648,
      "grad_norm": 0.14811871945858002,
      "learning_rate": 0.0002,
      "loss": 0.3829,
      "step": 3600
    },
    {
      "epoch": 0.8627725311880611,
      "grad_norm": 0.36381468176841736,
      "learning_rate": 0.0002,
      "loss": 0.3744,
      "step": 3700
    },
    {
      "epoch": 0.8860907077066573,
      "grad_norm": 0.28783467411994934,
      "learning_rate": 0.0002,
      "loss": 0.3538,
      "step": 3800
    },
    {
      "epoch": 0.9094088842252536,
      "grad_norm": 0.23508860170841217,
      "learning_rate": 0.0002,
      "loss": 0.3277,
      "step": 3900
    },
    {
      "epoch": 0.9327270607438498,
      "grad_norm": 0.3819214403629303,
      "learning_rate": 0.0002,
      "loss": 0.3317,
      "step": 4000
    },
    {
      "epoch": 0.9560452372624461,
      "grad_norm": 0.298714816570282,
      "learning_rate": 0.0002,
      "loss": 0.3329,
      "step": 4100
    },
    {
      "epoch": 0.9793634137810423,
      "grad_norm": 0.17287446558475494,
      "learning_rate": 0.0002,
      "loss": 0.3418,
      "step": 4200
    },
    {
      "epoch": 1.0026815902996387,
      "grad_norm": 0.3725602328777313,
      "learning_rate": 0.0002,
      "loss": 0.3224,
      "step": 4300
    },
    {
      "epoch": 1.0259997668182348,
      "grad_norm": 0.6124657988548279,
      "learning_rate": 0.0002,
      "loss": 0.2589,
      "step": 4400
    },
    {
      "epoch": 1.0493179433368311,
      "grad_norm": 0.5308946371078491,
      "learning_rate": 0.0002,
      "loss": 0.2718,
      "step": 4500
    },
    {
      "epoch": 1.0726361198554273,
      "grad_norm": 0.3070002496242523,
      "learning_rate": 0.0002,
      "loss": 0.2662,
      "step": 4600
    },
    {
      "epoch": 1.0959542963740236,
      "grad_norm": 0.44111424684524536,
      "learning_rate": 0.0002,
      "loss": 0.2516,
      "step": 4700
    },
    {
      "epoch": 1.1192724728926198,
      "grad_norm": 0.32735341787338257,
      "learning_rate": 0.0002,
      "loss": 0.2652,
      "step": 4800
    },
    {
      "epoch": 1.1425906494112161,
      "grad_norm": 0.3475642800331116,
      "learning_rate": 0.0002,
      "loss": 0.2498,
      "step": 4900
    },
    {
      "epoch": 1.1659088259298123,
      "grad_norm": 0.41938111186027527,
      "learning_rate": 0.0002,
      "loss": 0.2577,
      "step": 5000
    },
    {
      "epoch": 1.1892270024484086,
      "grad_norm": 0.47618812322616577,
      "learning_rate": 0.0002,
      "loss": 0.251,
      "step": 5100
    },
    {
      "epoch": 1.2125451789670048,
      "grad_norm": 0.27327144145965576,
      "learning_rate": 0.0002,
      "loss": 0.2511,
      "step": 5200
    },
    {
      "epoch": 1.2358633554856011,
      "grad_norm": 0.3251878321170807,
      "learning_rate": 0.0002,
      "loss": 0.2264,
      "step": 5300
    },
    {
      "epoch": 1.2591815320041972,
      "grad_norm": 0.5156410336494446,
      "learning_rate": 0.0002,
      "loss": 0.2617,
      "step": 5400
    },
    {
      "epoch": 1.2824997085227934,
      "grad_norm": 0.30861613154411316,
      "learning_rate": 0.0002,
      "loss": 0.2441,
      "step": 5500
    },
    {
      "epoch": 1.3058178850413897,
      "grad_norm": 0.43310919404029846,
      "learning_rate": 0.0002,
      "loss": 0.2331,
      "step": 5600
    },
    {
      "epoch": 1.329136061559986,
      "grad_norm": 0.36176246404647827,
      "learning_rate": 0.0002,
      "loss": 0.2431,
      "step": 5700
    },
    {
      "epoch": 1.3524542380785822,
      "grad_norm": 0.3790377974510193,
      "learning_rate": 0.0002,
      "loss": 0.2458,
      "step": 5800
    },
    {
      "epoch": 1.3757724145971786,
      "grad_norm": 0.4052121341228485,
      "learning_rate": 0.0002,
      "loss": 0.2446,
      "step": 5900
    },
    {
      "epoch": 1.3990905911157747,
      "grad_norm": 0.35783982276916504,
      "learning_rate": 0.0002,
      "loss": 0.2465,
      "step": 6000
    },
    {
      "epoch": 1.422408767634371,
      "grad_norm": 0.35436511039733887,
      "learning_rate": 0.0002,
      "loss": 0.2569,
      "step": 6100
    },
    {
      "epoch": 1.4457269441529672,
      "grad_norm": 0.2950509488582611,
      "learning_rate": 0.0002,
      "loss": 0.22,
      "step": 6200
    },
    {
      "epoch": 1.4690451206715636,
      "grad_norm": 0.36950767040252686,
      "learning_rate": 0.0002,
      "loss": 0.2433,
      "step": 6300
    },
    {
      "epoch": 1.4923632971901597,
      "grad_norm": 0.35253265500068665,
      "learning_rate": 0.0002,
      "loss": 0.2269,
      "step": 6400
    },
    {
      "epoch": 1.515681473708756,
      "grad_norm": 0.3378414213657379,
      "learning_rate": 0.0002,
      "loss": 0.2329,
      "step": 6500
    },
    {
      "epoch": 1.5389996502273522,
      "grad_norm": 0.4102073311805725,
      "learning_rate": 0.0002,
      "loss": 0.2404,
      "step": 6600
    },
    {
      "epoch": 1.5623178267459483,
      "grad_norm": 0.4430312216281891,
      "learning_rate": 0.0002,
      "loss": 0.235,
      "step": 6700
    },
    {
      "epoch": 1.5856360032645447,
      "grad_norm": 0.3363936245441437,
      "learning_rate": 0.0002,
      "loss": 0.2288,
      "step": 6800
    },
    {
      "epoch": 1.608954179783141,
      "grad_norm": 0.3177776634693146,
      "learning_rate": 0.0002,
      "loss": 0.2443,
      "step": 6900
    },
    {
      "epoch": 1.6322723563017372,
      "grad_norm": 0.33283111453056335,
      "learning_rate": 0.0002,
      "loss": 0.2267,
      "step": 7000
    },
    {
      "epoch": 1.6555905328203333,
      "grad_norm": 0.4799099564552307,
      "learning_rate": 0.0002,
      "loss": 0.2355,
      "step": 7100
    },
    {
      "epoch": 1.6789087093389297,
      "grad_norm": 0.38987642526626587,
      "learning_rate": 0.0002,
      "loss": 0.2268,
      "step": 7200
    },
    {
      "epoch": 1.702226885857526,
      "grad_norm": 0.32820141315460205,
      "learning_rate": 0.0002,
      "loss": 0.2098,
      "step": 7300
    },
    {
      "epoch": 1.7255450623761222,
      "grad_norm": 0.4211929142475128,
      "learning_rate": 0.0002,
      "loss": 0.2291,
      "step": 7400
    },
    {
      "epoch": 1.7488632388947183,
      "grad_norm": 0.42743125557899475,
      "learning_rate": 0.0002,
      "loss": 0.2192,
      "step": 7500
    },
    {
      "epoch": 1.7721814154133146,
      "grad_norm": 0.33759135007858276,
      "learning_rate": 0.0002,
      "loss": 0.2301,
      "step": 7600
    },
    {
      "epoch": 1.795499591931911,
      "grad_norm": 0.24578171968460083,
      "learning_rate": 0.0002,
      "loss": 0.2233,
      "step": 7700
    },
    {
      "epoch": 1.8188177684505071,
      "grad_norm": 0.3331544101238251,
      "learning_rate": 0.0002,
      "loss": 0.2308,
      "step": 7800
    },
    {
      "epoch": 1.8421359449691033,
      "grad_norm": 0.4028831720352173,
      "learning_rate": 0.0002,
      "loss": 0.2112,
      "step": 7900
    },
    {
      "epoch": 1.8654541214876996,
      "grad_norm": 0.3874329924583435,
      "learning_rate": 0.0002,
      "loss": 0.1998,
      "step": 8000
    },
    {
      "epoch": 1.888772298006296,
      "grad_norm": 0.30130070447921753,
      "learning_rate": 0.0002,
      "loss": 0.203,
      "step": 8100
    },
    {
      "epoch": 1.9120904745248921,
      "grad_norm": 0.41124048829078674,
      "learning_rate": 0.0002,
      "loss": 0.2184,
      "step": 8200
    },
    {
      "epoch": 1.9354086510434882,
      "grad_norm": 0.3104913532733917,
      "learning_rate": 0.0002,
      "loss": 0.2211,
      "step": 8300
    },
    {
      "epoch": 1.9587268275620846,
      "grad_norm": 0.30567994713783264,
      "learning_rate": 0.0002,
      "loss": 0.2039,
      "step": 8400
    },
    {
      "epoch": 1.982045004080681,
      "grad_norm": 0.3126045763492584,
      "learning_rate": 0.0002,
      "loss": 0.2107,
      "step": 8500
    },
    {
      "epoch": 2.0053631805992773,
      "grad_norm": 0.29460686445236206,
      "learning_rate": 0.0002,
      "loss": 0.1901,
      "step": 8600
    },
    {
      "epoch": 2.0286813571178732,
      "grad_norm": 0.4113939106464386,
      "learning_rate": 0.0002,
      "loss": 0.1621,
      "step": 8700
    },
    {
      "epoch": 2.0519995336364696,
      "grad_norm": 0.33105671405792236,
      "learning_rate": 0.0002,
      "loss": 0.1657,
      "step": 8800
    },
    {
      "epoch": 2.075317710155066,
      "grad_norm": 0.33191269636154175,
      "learning_rate": 0.0002,
      "loss": 0.1773,
      "step": 8900
    },
    {
      "epoch": 2.0986358866736623,
      "grad_norm": 0.3344513475894928,
      "learning_rate": 0.0002,
      "loss": 0.1654,
      "step": 9000
    },
    {
      "epoch": 2.121954063192258,
      "grad_norm": 0.31760096549987793,
      "learning_rate": 0.0002,
      "loss": 0.1677,
      "step": 9100
    },
    {
      "epoch": 2.1452722397108546,
      "grad_norm": 0.32853373885154724,
      "learning_rate": 0.0002,
      "loss": 0.1775,
      "step": 9200
    },
    {
      "epoch": 2.168590416229451,
      "grad_norm": 0.38260915875434875,
      "learning_rate": 0.0002,
      "loss": 0.1644,
      "step": 9300
    },
    {
      "epoch": 2.1919085927480473,
      "grad_norm": 0.3272022604942322,
      "learning_rate": 0.0002,
      "loss": 0.1632,
      "step": 9400
    },
    {
      "epoch": 2.215226769266643,
      "grad_norm": 0.40181514620780945,
      "learning_rate": 0.0002,
      "loss": 0.1672,
      "step": 9500
    },
    {
      "epoch": 2.2385449457852395,
      "grad_norm": 0.285182923078537,
      "learning_rate": 0.0002,
      "loss": 0.1695,
      "step": 9600
    },
    {
      "epoch": 2.261863122303836,
      "grad_norm": 0.3401045799255371,
      "learning_rate": 0.0002,
      "loss": 0.1658,
      "step": 9700
    },
    {
      "epoch": 2.2851812988224323,
      "grad_norm": 0.45088696479797363,
      "learning_rate": 0.0002,
      "loss": 0.173,
      "step": 9800
    },
    {
      "epoch": 2.308499475341028,
      "grad_norm": 0.09891465306282043,
      "learning_rate": 0.0002,
      "loss": 0.1725,
      "step": 9900
    },
    {
      "epoch": 2.3318176518596245,
      "grad_norm": 0.3077000081539154,
      "learning_rate": 0.0002,
      "loss": 0.1777,
      "step": 10000
    },
    {
      "epoch": 2.355135828378221,
      "grad_norm": 0.2650957703590393,
      "learning_rate": 0.0002,
      "loss": 0.1606,
      "step": 10100
    },
    {
      "epoch": 2.3784540048968172,
      "grad_norm": 0.2967466413974762,
      "learning_rate": 0.0002,
      "loss": 0.1626,
      "step": 10200
    },
    {
      "epoch": 2.401772181415413,
      "grad_norm": 0.21177765727043152,
      "learning_rate": 0.0002,
      "loss": 0.1762,
      "step": 10300
    },
    {
      "epoch": 2.4250903579340095,
      "grad_norm": 0.34562838077545166,
      "learning_rate": 0.0002,
      "loss": 0.1653,
      "step": 10400
    },
    {
      "epoch": 2.448408534452606,
      "grad_norm": 0.2537182569503784,
      "learning_rate": 0.0002,
      "loss": 0.1722,
      "step": 10500
    },
    {
      "epoch": 2.4717267109712022,
      "grad_norm": 0.22955211997032166,
      "learning_rate": 0.0002,
      "loss": 0.1713,
      "step": 10600
    },
    {
      "epoch": 2.495044887489798,
      "grad_norm": 0.3709162175655365,
      "learning_rate": 0.0002,
      "loss": 0.1679,
      "step": 10700
    },
    {
      "epoch": 2.5183630640083945,
      "grad_norm": 0.24581150710582733,
      "learning_rate": 0.0002,
      "loss": 0.1604,
      "step": 10800
    },
    {
      "epoch": 2.541681240526991,
      "grad_norm": 0.20854513347148895,
      "learning_rate": 0.0002,
      "loss": 0.1687,
      "step": 10900
    },
    {
      "epoch": 2.5649994170455868,
      "grad_norm": 0.2496633380651474,
      "learning_rate": 0.0002,
      "loss": 0.163,
      "step": 11000
    },
    {
      "epoch": 2.588317593564183,
      "grad_norm": 0.23603980243206024,
      "learning_rate": 0.0002,
      "loss": 0.1748,
      "step": 11100
    },
    {
      "epoch": 2.6116357700827795,
      "grad_norm": 0.36322489380836487,
      "learning_rate": 0.0002,
      "loss": 0.1798,
      "step": 11200
    },
    {
      "epoch": 2.634953946601376,
      "grad_norm": 0.32981303334236145,
      "learning_rate": 0.0002,
      "loss": 0.1588,
      "step": 11300
    },
    {
      "epoch": 2.658272123119972,
      "grad_norm": 0.4760492742061615,
      "learning_rate": 0.0002,
      "loss": 0.1723,
      "step": 11400
    },
    {
      "epoch": 2.681590299638568,
      "grad_norm": 0.22435927391052246,
      "learning_rate": 0.0002,
      "loss": 0.1742,
      "step": 11500
    },
    {
      "epoch": 2.7049084761571645,
      "grad_norm": 0.2695131003856659,
      "learning_rate": 0.0002,
      "loss": 0.1602,
      "step": 11600
    },
    {
      "epoch": 2.728226652675761,
      "grad_norm": 0.16897708177566528,
      "learning_rate": 0.0002,
      "loss": 0.1698,
      "step": 11700
    },
    {
      "epoch": 2.751544829194357,
      "grad_norm": 0.2540949881076813,
      "learning_rate": 0.0002,
      "loss": 0.1641,
      "step": 11800
    },
    {
      "epoch": 2.7748630057129535,
      "grad_norm": 0.40854746103286743,
      "learning_rate": 0.0002,
      "loss": 0.1747,
      "step": 11900
    },
    {
      "epoch": 2.7981811822315494,
      "grad_norm": 0.3012579679489136,
      "learning_rate": 0.0002,
      "loss": 0.1619,
      "step": 12000
    },
    {
      "epoch": 2.821499358750146,
      "grad_norm": 0.18468593060970306,
      "learning_rate": 0.0002,
      "loss": 0.1686,
      "step": 12100
    },
    {
      "epoch": 2.844817535268742,
      "grad_norm": 0.3668818175792694,
      "learning_rate": 0.0002,
      "loss": 0.1588,
      "step": 12200
    },
    {
      "epoch": 2.868135711787338,
      "grad_norm": 0.5856422185897827,
      "learning_rate": 0.0002,
      "loss": 0.1784,
      "step": 12300
    },
    {
      "epoch": 2.8914538883059344,
      "grad_norm": 0.37487712502479553,
      "learning_rate": 0.0002,
      "loss": 0.1701,
      "step": 12400
    },
    {
      "epoch": 2.9147720648245308,
      "grad_norm": 0.29282090067863464,
      "learning_rate": 0.0002,
      "loss": 0.1613,
      "step": 12500
    },
    {
      "epoch": 2.938090241343127,
      "grad_norm": 0.306607186794281,
      "learning_rate": 0.0002,
      "loss": 0.1655,
      "step": 12600
    },
    {
      "epoch": 2.9614084178617235,
      "grad_norm": 0.1990358531475067,
      "learning_rate": 0.0002,
      "loss": 0.17,
      "step": 12700
    },
    {
      "epoch": 2.9847265943803194,
      "grad_norm": 0.4855429232120514,
      "learning_rate": 0.0002,
      "loss": 0.1722,
      "step": 12800
    },
    {
      "epoch": 3.0080447708989158,
      "grad_norm": 0.39795544743537903,
      "learning_rate": 0.0002,
      "loss": 0.1548,
      "step": 12900
    },
    {
      "epoch": 3.031362947417512,
      "grad_norm": 0.3113553524017334,
      "learning_rate": 0.0002,
      "loss": 0.1396,
      "step": 13000
    },
    {
      "epoch": 3.054681123936108,
      "grad_norm": 0.3086554706096649,
      "learning_rate": 0.0002,
      "loss": 0.1364,
      "step": 13100
    },
    {
      "epoch": 3.0779993004547044,
      "grad_norm": 0.24818335473537445,
      "learning_rate": 0.0002,
      "loss": 0.1414,
      "step": 13200
    },
    {
      "epoch": 3.1013174769733007,
      "grad_norm": 0.37954941391944885,
      "learning_rate": 0.0002,
      "loss": 0.1388,
      "step": 13300
    },
    {
      "epoch": 3.124635653491897,
      "grad_norm": 0.2943727672100067,
      "learning_rate": 0.0002,
      "loss": 0.1408,
      "step": 13400
    },
    {
      "epoch": 3.147953830010493,
      "grad_norm": 0.35590696334838867,
      "learning_rate": 0.0002,
      "loss": 0.1363,
      "step": 13500
    },
    {
      "epoch": 3.1712720065290894,
      "grad_norm": 0.19578373432159424,
      "learning_rate": 0.0002,
      "loss": 0.137,
      "step": 13600
    },
    {
      "epoch": 3.1945901830476857,
      "grad_norm": 0.25028303265571594,
      "learning_rate": 0.0002,
      "loss": 0.1348,
      "step": 13700
    },
    {
      "epoch": 3.217908359566282,
      "grad_norm": 0.18405300378799438,
      "learning_rate": 0.0002,
      "loss": 0.1372,
      "step": 13800
    },
    {
      "epoch": 3.241226536084878,
      "grad_norm": 0.31417056918144226,
      "learning_rate": 0.0002,
      "loss": 0.1428,
      "step": 13900
    },
    {
      "epoch": 3.2645447126034743,
      "grad_norm": 0.22496923804283142,
      "learning_rate": 0.0002,
      "loss": 0.1378,
      "step": 14000
    },
    {
      "epoch": 3.2878628891220707,
      "grad_norm": 0.23862232267856598,
      "learning_rate": 0.0002,
      "loss": 0.1362,
      "step": 14100
    },
    {
      "epoch": 3.311181065640667,
      "grad_norm": 0.2142096310853958,
      "learning_rate": 0.0002,
      "loss": 0.139,
      "step": 14200
    },
    {
      "epoch": 3.334499242159263,
      "grad_norm": 0.2794269025325775,
      "learning_rate": 0.0002,
      "loss": 0.1376,
      "step": 14300
    },
    {
      "epoch": 3.3578174186778593,
      "grad_norm": 0.14498618245124817,
      "learning_rate": 0.0002,
      "loss": 0.1416,
      "step": 14400
    },
    {
      "epoch": 3.3811355951964557,
      "grad_norm": 0.2895399332046509,
      "learning_rate": 0.0002,
      "loss": 0.1379,
      "step": 14500
    },
    {
      "epoch": 3.404453771715052,
      "grad_norm": 0.2537992000579834,
      "learning_rate": 0.0002,
      "loss": 0.1356,
      "step": 14600
    },
    {
      "epoch": 3.427771948233648,
      "grad_norm": 0.20395183563232422,
      "learning_rate": 0.0002,
      "loss": 0.1424,
      "step": 14700
    },
    {
      "epoch": 3.4510901247522443,
      "grad_norm": 0.15283405780792236,
      "learning_rate": 0.0002,
      "loss": 0.1395,
      "step": 14800
    },
    {
      "epoch": 3.4744083012708407,
      "grad_norm": 0.4268224537372589,
      "learning_rate": 0.0002,
      "loss": 0.1359,
      "step": 14900
    },
    {
      "epoch": 3.497726477789437,
      "grad_norm": 0.22292669117450714,
      "learning_rate": 0.0002,
      "loss": 0.1386,
      "step": 15000
    },
    {
      "epoch": 3.5210446543080334,
      "grad_norm": 0.11900927871465683,
      "learning_rate": 0.0002,
      "loss": 0.1442,
      "step": 15100
    },
    {
      "epoch": 3.5443628308266293,
      "grad_norm": 0.45133286714553833,
      "learning_rate": 0.0002,
      "loss": 0.1365,
      "step": 15200
    },
    {
      "epoch": 3.5676810073452256,
      "grad_norm": 0.30186957120895386,
      "learning_rate": 0.0002,
      "loss": 0.1416,
      "step": 15300
    },
    {
      "epoch": 3.590999183863822,
      "grad_norm": 0.31408384442329407,
      "learning_rate": 0.0002,
      "loss": 0.1387,
      "step": 15400
    },
    {
      "epoch": 3.614317360382418,
      "grad_norm": 0.36072710156440735,
      "learning_rate": 0.0002,
      "loss": 0.1428,
      "step": 15500
    },
    {
      "epoch": 3.6376355369010143,
      "grad_norm": 0.28984448313713074,
      "learning_rate": 0.0002,
      "loss": 0.1393,
      "step": 15600
    },
    {
      "epoch": 3.6609537134196106,
      "grad_norm": 0.2014656662940979,
      "learning_rate": 0.0002,
      "loss": 0.1435,
      "step": 15700
    },
    {
      "epoch": 3.684271889938207,
      "grad_norm": 0.41273656487464905,
      "learning_rate": 0.0002,
      "loss": 0.1369,
      "step": 15800
    },
    {
      "epoch": 3.7075900664568033,
      "grad_norm": 0.48672163486480713,
      "learning_rate": 0.0002,
      "loss": 0.1433,
      "step": 15900
    },
    {
      "epoch": 3.7309082429753992,
      "grad_norm": 0.19120950996875763,
      "learning_rate": 0.0002,
      "loss": 0.1405,
      "step": 16000
    },
    {
      "epoch": 3.7542264194939956,
      "grad_norm": 0.19792740046977997,
      "learning_rate": 0.0002,
      "loss": 0.1451,
      "step": 16100
    },
    {
      "epoch": 3.777544596012592,
      "grad_norm": 0.14919213950634003,
      "learning_rate": 0.0002,
      "loss": 0.1382,
      "step": 16200
    },
    {
      "epoch": 3.800862772531188,
      "grad_norm": 0.4650104343891144,
      "learning_rate": 0.0002,
      "loss": 0.1339,
      "step": 16300
    },
    {
      "epoch": 3.8241809490497842,
      "grad_norm": 0.3627985417842865,
      "learning_rate": 0.0002,
      "loss": 0.1422,
      "step": 16400
    },
    {
      "epoch": 3.8474991255683806,
      "grad_norm": 0.7782896161079407,
      "learning_rate": 0.0002,
      "loss": 0.1432,
      "step": 16500
    },
    {
      "epoch": 3.870817302086977,
      "grad_norm": 0.2858645021915436,
      "learning_rate": 0.0002,
      "loss": 0.1413,
      "step": 16600
    },
    {
      "epoch": 3.8941354786055733,
      "grad_norm": 0.22150644659996033,
      "learning_rate": 0.0002,
      "loss": 0.1437,
      "step": 16700
    },
    {
      "epoch": 3.917453655124169,
      "grad_norm": 0.3596114218235016,
      "learning_rate": 0.0002,
      "loss": 0.1463,
      "step": 16800
    },
    {
      "epoch": 3.9407718316427656,
      "grad_norm": 0.14949366450309753,
      "learning_rate": 0.0002,
      "loss": 0.1449,
      "step": 16900
    },
    {
      "epoch": 3.964090008161362,
      "grad_norm": 0.32889851927757263,
      "learning_rate": 0.0002,
      "loss": 0.1396,
      "step": 17000
    },
    {
      "epoch": 3.987408184679958,
      "grad_norm": 0.1940721869468689,
      "learning_rate": 0.0002,
      "loss": 0.14,
      "step": 17100
    },
    {
      "epoch": 4.010726361198555,
      "grad_norm": 0.1328798085451126,
      "learning_rate": 0.0002,
      "loss": 0.1316,
      "step": 17200
    },
    {
      "epoch": 4.0340445377171505,
      "grad_norm": 0.09979192912578583,
      "learning_rate": 0.0002,
      "loss": 0.1224,
      "step": 17300
    },
    {
      "epoch": 4.0573627142357465,
      "grad_norm": 0.22828274965286255,
      "learning_rate": 0.0002,
      "loss": 0.1184,
      "step": 17400
    },
    {
      "epoch": 4.080680890754343,
      "grad_norm": 0.1396108716726303,
      "learning_rate": 0.0002,
      "loss": 0.1189,
      "step": 17500
    },
    {
      "epoch": 4.103999067272939,
      "grad_norm": 0.1849929839372635,
      "learning_rate": 0.0002,
      "loss": 0.1231,
      "step": 17600
    },
    {
      "epoch": 4.127317243791535,
      "grad_norm": 0.14947502315044403,
      "learning_rate": 0.0002,
      "loss": 0.1158,
      "step": 17700
    },
    {
      "epoch": 4.150635420310132,
      "grad_norm": 0.3471536934375763,
      "learning_rate": 0.0002,
      "loss": 0.1204,
      "step": 17800
    },
    {
      "epoch": 4.173953596828728,
      "grad_norm": 0.23290419578552246,
      "learning_rate": 0.0002,
      "loss": 0.1175,
      "step": 17900
    },
    {
      "epoch": 4.197271773347325,
      "grad_norm": 0.17477743327617645,
      "learning_rate": 0.0002,
      "loss": 0.1205,
      "step": 18000
    },
    {
      "epoch": 4.2205899498659205,
      "grad_norm": 0.1214243695139885,
      "learning_rate": 0.0002,
      "loss": 0.1188,
      "step": 18100
    },
    {
      "epoch": 4.243908126384516,
      "grad_norm": 0.12706777453422546,
      "learning_rate": 0.0002,
      "loss": 0.1196,
      "step": 18200
    },
    {
      "epoch": 4.267226302903113,
      "grad_norm": 0.18115375936031342,
      "learning_rate": 0.0002,
      "loss": 0.1179,
      "step": 18300
    },
    {
      "epoch": 4.290544479421709,
      "grad_norm": 0.05149231478571892,
      "learning_rate": 0.0002,
      "loss": 0.1224,
      "step": 18400
    },
    {
      "epoch": 4.313862655940305,
      "grad_norm": 0.47274354100227356,
      "learning_rate": 0.0002,
      "loss": 0.1192,
      "step": 18500
    },
    {
      "epoch": 4.337180832458902,
      "grad_norm": 0.218338742852211,
      "learning_rate": 0.0002,
      "loss": 0.1244,
      "step": 18600
    },
    {
      "epoch": 4.360499008977498,
      "grad_norm": 0.1247347891330719,
      "learning_rate": 0.0002,
      "loss": 0.1267,
      "step": 18700
    },
    {
      "epoch": 4.383817185496095,
      "grad_norm": 0.2586764991283417,
      "learning_rate": 0.0002,
      "loss": 0.1236,
      "step": 18800
    },
    {
      "epoch": 4.4071353620146905,
      "grad_norm": 0.11474807560443878,
      "learning_rate": 0.0002,
      "loss": 0.1252,
      "step": 18900
    },
    {
      "epoch": 4.430453538533286,
      "grad_norm": 0.34646329283714294,
      "learning_rate": 0.0002,
      "loss": 0.1237,
      "step": 19000
    },
    {
      "epoch": 4.453771715051883,
      "grad_norm": 0.17445826530456543,
      "learning_rate": 0.0002,
      "loss": 0.1183,
      "step": 19100
    },
    {
      "epoch": 4.477089891570479,
      "grad_norm": 0.3867531716823578,
      "learning_rate": 0.0002,
      "loss": 0.1248,
      "step": 19200
    },
    {
      "epoch": 4.500408068089076,
      "grad_norm": 0.15927106142044067,
      "learning_rate": 0.0002,
      "loss": 0.1258,
      "step": 19300
    },
    {
      "epoch": 4.523726244607672,
      "grad_norm": 0.2284346967935562,
      "learning_rate": 0.0002,
      "loss": 0.1244,
      "step": 19400
    },
    {
      "epoch": 4.547044421126268,
      "grad_norm": 0.3231777250766754,
      "learning_rate": 0.0002,
      "loss": 0.1257,
      "step": 19500
    },
    {
      "epoch": 4.5703625976448645,
      "grad_norm": 0.10116703063249588,
      "learning_rate": 0.0002,
      "loss": 0.1293,
      "step": 19600
    },
    {
      "epoch": 4.59368077416346,
      "grad_norm": 0.2922173738479614,
      "learning_rate": 0.0002,
      "loss": 0.1262,
      "step": 19700
    },
    {
      "epoch": 4.616998950682056,
      "grad_norm": 0.1958065629005432,
      "learning_rate": 0.0002,
      "loss": 0.1258,
      "step": 19800
    },
    {
      "epoch": 4.640317127200653,
      "grad_norm": 0.08755222707986832,
      "learning_rate": 0.0002,
      "loss": 0.1293,
      "step": 19900
    },
    {
      "epoch": 4.663635303719249,
      "grad_norm": 0.1416950523853302,
      "learning_rate": 0.0002,
      "loss": 0.1227,
      "step": 20000
    },
    {
      "epoch": 4.686953480237845,
      "grad_norm": 0.21383579075336456,
      "learning_rate": 0.0002,
      "loss": 0.1272,
      "step": 20100
    },
    {
      "epoch": 4.710271656756442,
      "grad_norm": 0.27910149097442627,
      "learning_rate": 0.0002,
      "loss": 0.1298,
      "step": 20200
    },
    {
      "epoch": 4.733589833275038,
      "grad_norm": 0.07715137302875519,
      "learning_rate": 0.0002,
      "loss": 0.1266,
      "step": 20300
    },
    {
      "epoch": 4.7569080097936345,
      "grad_norm": 0.08127077668905258,
      "learning_rate": 0.0002,
      "loss": 0.1269,
      "step": 20400
    },
    {
      "epoch": 4.78022618631223,
      "grad_norm": 0.3075973391532898,
      "learning_rate": 0.0002,
      "loss": 0.1308,
      "step": 20500
    },
    {
      "epoch": 4.803544362830826,
      "grad_norm": 0.23989351093769073,
      "learning_rate": 0.0002,
      "loss": 0.1217,
      "step": 20600
    },
    {
      "epoch": 4.826862539349423,
      "grad_norm": 0.1361120343208313,
      "learning_rate": 0.0002,
      "loss": 0.1237,
      "step": 20700
    },
    {
      "epoch": 4.850180715868019,
      "grad_norm": 0.3711351156234741,
      "learning_rate": 0.0002,
      "loss": 0.1248,
      "step": 20800
    },
    {
      "epoch": 4.873498892386616,
      "grad_norm": 0.3196912109851837,
      "learning_rate": 0.0002,
      "loss": 0.1236,
      "step": 20900
    },
    {
      "epoch": 4.896817068905212,
      "grad_norm": 0.10089880973100662,
      "learning_rate": 0.0002,
      "loss": 0.1248,
      "step": 21000
    }
  ],
  "logging_steps": 100,
  "max_steps": 21440,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 3000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.9056254817400013e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}
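
A minimal sketch of how a trainer_state.json like the one above can be loaded and its loss curve inspected. The checkpoint path is hypothetical, and matplotlib is an assumed dependency; only the keys actually present in the file ("log_history", "step", "loss", "num_train_epochs") are relied on.

    import json

    import matplotlib.pyplot as plt

    # Load the state file written by the Hugging Face Trainer at a checkpoint.
    # (Hypothetical path; point it at your own checkpoint directory.)
    with open("checkpoint-21000/trainer_state.json") as f:
        state = json.load(f)

    # Each log_history entry records the running training loss once every
    # logging_steps (here 100) optimizer steps.
    steps = [e["step"] for e in state["log_history"] if "loss" in e]
    losses = [e["loss"] for e in state["log_history"] if "loss" in e]

    plt.plot(steps, losses)
    plt.xlabel("global step")
    plt.ylabel("training loss")
    plt.title(f"training loss over {state['num_train_epochs']} epochs")
    plt.show()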