| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 6.0, |
| "eval_steps": 500, |
| "global_step": 870, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.03466204506065858, |
| "grad_norm": 1.530611515045166, |
| "learning_rate": 0.0002, |
| "loss": 3.0151, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.06932409012131716, |
| "grad_norm": 1.7004058361053467, |
| "learning_rate": 0.0002, |
| "loss": 2.0235, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.10398613518197573, |
| "grad_norm": 1.933908462524414, |
| "learning_rate": 0.0002, |
| "loss": 1.1225, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.1386481802426343, |
| "grad_norm": 0.9951248168945312, |
| "learning_rate": 0.0002, |
| "loss": 0.5422, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.1733102253032929, |
| "grad_norm": 6.152623176574707, |
| "learning_rate": 0.0002, |
| "loss": 1.5639, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.20797227036395147, |
| "grad_norm": 8.008278846740723, |
| "learning_rate": 0.0002, |
| "loss": 2.6486, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.24263431542461006, |
| "grad_norm": 4.960943222045898, |
| "learning_rate": 0.0002, |
| "loss": 2.6059, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.2772963604852686, |
| "grad_norm": 4.607645511627197, |
| "learning_rate": 0.0002, |
| "loss": 1.6898, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.3119584055459272, |
| "grad_norm": 0.8448249101638794, |
| "learning_rate": 0.0002, |
| "loss": 0.799, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.3466204506065858, |
| "grad_norm": 0.46238261461257935, |
| "learning_rate": 0.0002, |
| "loss": 0.5374, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.38128249566724437, |
| "grad_norm": 0.3575257956981659, |
| "learning_rate": 0.0002, |
| "loss": 0.4247, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.41594454072790293, |
| "grad_norm": 2.5151493549346924, |
| "learning_rate": 0.0002, |
| "loss": 0.7472, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.4506065857885615, |
| "grad_norm": 17.38052749633789, |
| "learning_rate": 0.0002, |
| "loss": 1.8466, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.4852686308492201, |
| "grad_norm": 15.523968696594238, |
| "learning_rate": 0.0002, |
| "loss": 2.1105, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.5199306759098787, |
| "grad_norm": 1.0907574892044067, |
| "learning_rate": 0.0002, |
| "loss": 1.1052, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.5545927209705372, |
| "grad_norm": 0.5225581526756287, |
| "learning_rate": 0.0002, |
| "loss": 0.5663, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.5892547660311959, |
| "grad_norm": 0.3306778073310852, |
| "learning_rate": 0.0002, |
| "loss": 0.4648, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.6239168110918544, |
| "grad_norm": 0.33595138788223267, |
| "learning_rate": 0.0002, |
| "loss": 0.3762, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.658578856152513, |
| "grad_norm": 1.978458285331726, |
| "learning_rate": 0.0002, |
| "loss": 0.9162, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.6932409012131716, |
| "grad_norm": 1.1226632595062256, |
| "learning_rate": 0.0002, |
| "loss": 1.5477, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.7279029462738301, |
| "grad_norm": 22.263992309570312, |
| "learning_rate": 0.0002, |
| "loss": 1.8063, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.7625649913344887, |
| "grad_norm": 1.3690563440322876, |
| "learning_rate": 0.0002, |
| "loss": 1.3102, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.7972270363951474, |
| "grad_norm": 0.5651267766952515, |
| "learning_rate": 0.0002, |
| "loss": 0.553, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.8318890814558059, |
| "grad_norm": 0.3272489011287689, |
| "learning_rate": 0.0002, |
| "loss": 0.4813, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.8665511265164645, |
| "grad_norm": 0.30475500226020813, |
| "learning_rate": 0.0002, |
| "loss": 0.3746, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.901213171577123, |
| "grad_norm": 0.2812097370624542, |
| "learning_rate": 0.0002, |
| "loss": 0.284, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.9358752166377816, |
| "grad_norm": 1.159839391708374, |
| "learning_rate": 0.0002, |
| "loss": 1.5179, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.9705372616984402, |
| "grad_norm": 4.740174770355225, |
| "learning_rate": 0.0002, |
| "loss": 1.549, |
| "step": 140 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 2.613008499145508, |
| "learning_rate": 0.0002, |
| "loss": 1.3938, |
| "step": 145 |
| }, |
| { |
| "epoch": 1.0346620450606585, |
| "grad_norm": 0.31811827421188354, |
| "learning_rate": 0.0002, |
| "loss": 0.5353, |
| "step": 150 |
| }, |
| { |
| "epoch": 1.0693240901213172, |
| "grad_norm": 0.42897579073905945, |
| "learning_rate": 0.0002, |
| "loss": 0.4452, |
| "step": 155 |
| }, |
| { |
| "epoch": 1.1039861351819757, |
| "grad_norm": 0.28854838013648987, |
| "learning_rate": 0.0002, |
| "loss": 0.3563, |
| "step": 160 |
| }, |
| { |
| "epoch": 1.1386481802426343, |
| "grad_norm": 0.32319992780685425, |
| "learning_rate": 0.0002, |
| "loss": 0.2697, |
| "step": 165 |
| }, |
| { |
| "epoch": 1.173310225303293, |
| "grad_norm": 1.0144641399383545, |
| "learning_rate": 0.0002, |
| "loss": 1.3199, |
| "step": 170 |
| }, |
| { |
| "epoch": 1.2079722703639515, |
| "grad_norm": 2.1387648582458496, |
| "learning_rate": 0.0002, |
| "loss": 1.1948, |
| "step": 175 |
| }, |
| { |
| "epoch": 1.24263431542461, |
| "grad_norm": 5.240429401397705, |
| "learning_rate": 0.0002, |
| "loss": 1.2138, |
| "step": 180 |
| }, |
| { |
| "epoch": 1.2772963604852685, |
| "grad_norm": 0.5809195637702942, |
| "learning_rate": 0.0002, |
| "loss": 0.7417, |
| "step": 185 |
| }, |
| { |
| "epoch": 1.3119584055459272, |
| "grad_norm": 0.3426031768321991, |
| "learning_rate": 0.0002, |
| "loss": 0.4541, |
| "step": 190 |
| }, |
| { |
| "epoch": 1.3466204506065858, |
| "grad_norm": 0.34089571237564087, |
| "learning_rate": 0.0002, |
| "loss": 0.3709, |
| "step": 195 |
| }, |
| { |
| "epoch": 1.3812824956672443, |
| "grad_norm": 0.2676282823085785, |
| "learning_rate": 0.0002, |
| "loss": 0.2981, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.415944540727903, |
| "grad_norm": 1.8662705421447754, |
| "learning_rate": 0.0002, |
| "loss": 0.8353, |
| "step": 205 |
| }, |
| { |
| "epoch": 1.4506065857885615, |
| "grad_norm": 3.2310688495635986, |
| "learning_rate": 0.0002, |
| "loss": 1.2188, |
| "step": 210 |
| }, |
| { |
| "epoch": 1.48526863084922, |
| "grad_norm": 2.255415916442871, |
| "learning_rate": 0.0002, |
| "loss": 1.1677, |
| "step": 215 |
| }, |
| { |
| "epoch": 1.5199306759098787, |
| "grad_norm": 0.5544691681861877, |
| "learning_rate": 0.0002, |
| "loss": 0.799, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.5545927209705372, |
| "grad_norm": 0.4003375172615051, |
| "learning_rate": 0.0002, |
| "loss": 0.4786, |
| "step": 225 |
| }, |
| { |
| "epoch": 1.5892547660311958, |
| "grad_norm": 0.3774619698524475, |
| "learning_rate": 0.0002, |
| "loss": 0.4233, |
| "step": 230 |
| }, |
| { |
| "epoch": 1.6239168110918545, |
| "grad_norm": 0.352490097284317, |
| "learning_rate": 0.0002, |
| "loss": 0.3332, |
| "step": 235 |
| }, |
| { |
| "epoch": 1.658578856152513, |
| "grad_norm": 0.2637154459953308, |
| "learning_rate": 0.0002, |
| "loss": 0.2438, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.6932409012131715, |
| "grad_norm": 1.3437107801437378, |
| "learning_rate": 0.0002, |
| "loss": 1.4351, |
| "step": 245 |
| }, |
| { |
| "epoch": 1.7279029462738302, |
| "grad_norm": 2.647561550140381, |
| "learning_rate": 0.0002, |
| "loss": 1.0698, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.7625649913344887, |
| "grad_norm": 0.6839751601219177, |
| "learning_rate": 0.0002, |
| "loss": 0.7411, |
| "step": 255 |
| }, |
| { |
| "epoch": 1.7972270363951472, |
| "grad_norm": 0.4790812134742737, |
| "learning_rate": 0.0002, |
| "loss": 0.5009, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.831889081455806, |
| "grad_norm": 0.36322328448295593, |
| "learning_rate": 0.0002, |
| "loss": 0.4014, |
| "step": 265 |
| }, |
| { |
| "epoch": 1.8665511265164645, |
| "grad_norm": 0.31694507598876953, |
| "learning_rate": 0.0002, |
| "loss": 0.3441, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.901213171577123, |
| "grad_norm": 0.24292677640914917, |
| "learning_rate": 0.0002, |
| "loss": 0.2619, |
| "step": 275 |
| }, |
| { |
| "epoch": 1.9358752166377817, |
| "grad_norm": 1.133062720298767, |
| "learning_rate": 0.0002, |
| "loss": 1.1564, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.9705372616984402, |
| "grad_norm": 10.754274368286133, |
| "learning_rate": 0.0002, |
| "loss": 1.436, |
| "step": 285 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 1.6346553564071655, |
| "learning_rate": 0.0002, |
| "loss": 1.3215, |
| "step": 290 |
| }, |
| { |
| "epoch": 2.0346620450606587, |
| "grad_norm": 0.7156389355659485, |
| "learning_rate": 0.0002, |
| "loss": 0.551, |
| "step": 295 |
| }, |
| { |
| "epoch": 2.069324090121317, |
| "grad_norm": 0.3842258155345917, |
| "learning_rate": 0.0002, |
| "loss": 0.3783, |
| "step": 300 |
| }, |
| { |
| "epoch": 2.1039861351819757, |
| "grad_norm": 0.35967278480529785, |
| "learning_rate": 0.0002, |
| "loss": 0.3124, |
| "step": 305 |
| }, |
| { |
| "epoch": 2.1386481802426345, |
| "grad_norm": 0.296977162361145, |
| "learning_rate": 0.0002, |
| "loss": 0.2367, |
| "step": 310 |
| }, |
| { |
| "epoch": 2.1733102253032928, |
| "grad_norm": 1.6834601163864136, |
| "learning_rate": 0.0002, |
| "loss": 0.7685, |
| "step": 315 |
| }, |
| { |
| "epoch": 2.2079722703639515, |
| "grad_norm": 5.758884906768799, |
| "learning_rate": 0.0002, |
| "loss": 1.0063, |
| "step": 320 |
| }, |
| { |
| "epoch": 2.2426343154246102, |
| "grad_norm": 15.24948501586914, |
| "learning_rate": 0.0002, |
| "loss": 1.231, |
| "step": 325 |
| }, |
| { |
| "epoch": 2.2772963604852685, |
| "grad_norm": 0.599730372428894, |
| "learning_rate": 0.0002, |
| "loss": 0.6392, |
| "step": 330 |
| }, |
| { |
| "epoch": 2.3119584055459272, |
| "grad_norm": 0.5307044982910156, |
| "learning_rate": 0.0002, |
| "loss": 0.3804, |
| "step": 335 |
| }, |
| { |
| "epoch": 2.346620450606586, |
| "grad_norm": 0.33558061718940735, |
| "learning_rate": 0.0002, |
| "loss": 0.3282, |
| "step": 340 |
| }, |
| { |
| "epoch": 2.3812824956672443, |
| "grad_norm": 0.2716732919216156, |
| "learning_rate": 0.0002, |
| "loss": 0.2429, |
| "step": 345 |
| }, |
| { |
| "epoch": 2.415944540727903, |
| "grad_norm": 2.454636812210083, |
| "learning_rate": 0.0002, |
| "loss": 0.5396, |
| "step": 350 |
| }, |
| { |
| "epoch": 2.4506065857885613, |
| "grad_norm": 13.286089897155762, |
| "learning_rate": 0.0002, |
| "loss": 1.0545, |
| "step": 355 |
| }, |
| { |
| "epoch": 2.48526863084922, |
| "grad_norm": 12.619502067565918, |
| "learning_rate": 0.0002, |
| "loss": 1.1547, |
| "step": 360 |
| }, |
| { |
| "epoch": 2.5199306759098787, |
| "grad_norm": 0.6347368359565735, |
| "learning_rate": 0.0002, |
| "loss": 0.7136, |
| "step": 365 |
| }, |
| { |
| "epoch": 2.554592720970537, |
| "grad_norm": 0.44485101103782654, |
| "learning_rate": 0.0002, |
| "loss": 0.3994, |
| "step": 370 |
| }, |
| { |
| "epoch": 2.5892547660311958, |
| "grad_norm": 0.36494728922843933, |
| "learning_rate": 0.0002, |
| "loss": 0.3107, |
| "step": 375 |
| }, |
| { |
| "epoch": 2.6239168110918545, |
| "grad_norm": 0.322171688079834, |
| "learning_rate": 0.0002, |
| "loss": 0.2753, |
| "step": 380 |
| }, |
| { |
| "epoch": 2.6585788561525128, |
| "grad_norm": 0.9659174084663391, |
| "learning_rate": 0.0002, |
| "loss": 0.2948, |
| "step": 385 |
| }, |
| { |
| "epoch": 2.6932409012131715, |
| "grad_norm": 0.9371129870414734, |
| "learning_rate": 0.0002, |
| "loss": 1.0286, |
| "step": 390 |
| }, |
| { |
| "epoch": 2.7279029462738302, |
| "grad_norm": 3.4619617462158203, |
| "learning_rate": 0.0002, |
| "loss": 1.0745, |
| "step": 395 |
| }, |
| { |
| "epoch": 2.7625649913344885, |
| "grad_norm": 0.6884852051734924, |
| "learning_rate": 0.0002, |
| "loss": 0.7637, |
| "step": 400 |
| }, |
| { |
| "epoch": 2.7972270363951472, |
| "grad_norm": 0.4440446197986603, |
| "learning_rate": 0.0002, |
| "loss": 0.4331, |
| "step": 405 |
| }, |
| { |
| "epoch": 2.831889081455806, |
| "grad_norm": 0.37561893463134766, |
| "learning_rate": 0.0002, |
| "loss": 0.3667, |
| "step": 410 |
| }, |
| { |
| "epoch": 2.8665511265164643, |
| "grad_norm": 0.320077121257782, |
| "learning_rate": 0.0002, |
| "loss": 0.2769, |
| "step": 415 |
| }, |
| { |
| "epoch": 2.901213171577123, |
| "grad_norm": 1.4914495944976807, |
| "learning_rate": 0.0002, |
| "loss": 0.4493, |
| "step": 420 |
| }, |
| { |
| "epoch": 2.9358752166377817, |
| "grad_norm": 1.164036750793457, |
| "learning_rate": 0.0002, |
| "loss": 1.1276, |
| "step": 425 |
| }, |
| { |
| "epoch": 2.97053726169844, |
| "grad_norm": 5.013108253479004, |
| "learning_rate": 0.0002, |
| "loss": 1.0841, |
| "step": 430 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 2.2247564792633057, |
| "learning_rate": 0.0002, |
| "loss": 0.7636, |
| "step": 435 |
| }, |
| { |
| "epoch": 3.0346620450606587, |
| "grad_norm": 0.3730102479457855, |
| "learning_rate": 0.0002, |
| "loss": 0.4291, |
| "step": 440 |
| }, |
| { |
| "epoch": 3.069324090121317, |
| "grad_norm": 0.36571791768074036, |
| "learning_rate": 0.0002, |
| "loss": 0.3155, |
| "step": 445 |
| }, |
| { |
| "epoch": 3.1039861351819757, |
| "grad_norm": 0.34427550435066223, |
| "learning_rate": 0.0002, |
| "loss": 0.2297, |
| "step": 450 |
| }, |
| { |
| "epoch": 3.1386481802426345, |
| "grad_norm": 0.3725241422653198, |
| "learning_rate": 0.0002, |
| "loss": 0.173, |
| "step": 455 |
| }, |
| { |
| "epoch": 3.1733102253032928, |
| "grad_norm": 0.9735671877861023, |
| "learning_rate": 0.0002, |
| "loss": 0.8876, |
| "step": 460 |
| }, |
| { |
| "epoch": 3.2079722703639515, |
| "grad_norm": 3.0348145961761475, |
| "learning_rate": 0.0002, |
| "loss": 0.7825, |
| "step": 465 |
| }, |
| { |
| "epoch": 3.2426343154246102, |
| "grad_norm": 1.4603136777877808, |
| "learning_rate": 0.0002, |
| "loss": 0.8017, |
| "step": 470 |
| }, |
| { |
| "epoch": 3.2772963604852685, |
| "grad_norm": 0.6849704384803772, |
| "learning_rate": 0.0002, |
| "loss": 0.6029, |
| "step": 475 |
| }, |
| { |
| "epoch": 3.3119584055459272, |
| "grad_norm": 0.45950087904930115, |
| "learning_rate": 0.0002, |
| "loss": 0.3388, |
| "step": 480 |
| }, |
| { |
| "epoch": 3.346620450606586, |
| "grad_norm": 0.3234940469264984, |
| "learning_rate": 0.0002, |
| "loss": 0.2635, |
| "step": 485 |
| }, |
| { |
| "epoch": 3.3812824956672443, |
| "grad_norm": 0.3088044822216034, |
| "learning_rate": 0.0002, |
| "loss": 0.1867, |
| "step": 490 |
| }, |
| { |
| "epoch": 3.415944540727903, |
| "grad_norm": 0.36887720227241516, |
| "learning_rate": 0.0002, |
| "loss": 0.1653, |
| "step": 495 |
| }, |
| { |
| "epoch": 3.4506065857885613, |
| "grad_norm": 1.041858196258545, |
| "learning_rate": 0.0002, |
| "loss": 0.9041, |
| "step": 500 |
| }, |
| { |
| "epoch": 3.48526863084922, |
| "grad_norm": 1.4774988889694214, |
| "learning_rate": 0.0002, |
| "loss": 0.9789, |
| "step": 505 |
| }, |
| { |
| "epoch": 3.5199306759098787, |
| "grad_norm": 0.6804758310317993, |
| "learning_rate": 0.0002, |
| "loss": 0.5891, |
| "step": 510 |
| }, |
| { |
| "epoch": 3.554592720970537, |
| "grad_norm": 0.43344879150390625, |
| "learning_rate": 0.0002, |
| "loss": 0.3632, |
| "step": 515 |
| }, |
| { |
| "epoch": 3.5892547660311958, |
| "grad_norm": 0.4160469174385071, |
| "learning_rate": 0.0002, |
| "loss": 0.2734, |
| "step": 520 |
| }, |
| { |
| "epoch": 3.6239168110918545, |
| "grad_norm": 0.35103052854537964, |
| "learning_rate": 0.0002, |
| "loss": 0.2215, |
| "step": 525 |
| }, |
| { |
| "epoch": 3.6585788561525128, |
| "grad_norm": 1.083409309387207, |
| "learning_rate": 0.0002, |
| "loss": 0.2623, |
| "step": 530 |
| }, |
| { |
| "epoch": 3.6932409012131715, |
| "grad_norm": 1.2548085451126099, |
| "learning_rate": 0.0002, |
| "loss": 0.9395, |
| "step": 535 |
| }, |
| { |
| "epoch": 3.7279029462738302, |
| "grad_norm": 2.565399169921875, |
| "learning_rate": 0.0002, |
| "loss": 0.9097, |
| "step": 540 |
| }, |
| { |
| "epoch": 3.7625649913344885, |
| "grad_norm": 0.6658923625946045, |
| "learning_rate": 0.0002, |
| "loss": 0.739, |
| "step": 545 |
| }, |
| { |
| "epoch": 3.7972270363951472, |
| "grad_norm": 0.4989807605743408, |
| "learning_rate": 0.0002, |
| "loss": 0.396, |
| "step": 550 |
| }, |
| { |
| "epoch": 3.831889081455806, |
| "grad_norm": 0.39474496245384216, |
| "learning_rate": 0.0002, |
| "loss": 0.2809, |
| "step": 555 |
| }, |
| { |
| "epoch": 3.8665511265164643, |
| "grad_norm": 0.38600996136665344, |
| "learning_rate": 0.0002, |
| "loss": 0.2254, |
| "step": 560 |
| }, |
| { |
| "epoch": 3.901213171577123, |
| "grad_norm": 0.3662823736667633, |
| "learning_rate": 0.0002, |
| "loss": 0.167, |
| "step": 565 |
| }, |
| { |
| "epoch": 3.9358752166377817, |
| "grad_norm": 1.349672794342041, |
| "learning_rate": 0.0002, |
| "loss": 1.1593, |
| "step": 570 |
| }, |
| { |
| "epoch": 3.97053726169844, |
| "grad_norm": 2.3923165798187256, |
| "learning_rate": 0.0002, |
| "loss": 0.8778, |
| "step": 575 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 0.7578555345535278, |
| "learning_rate": 0.0002, |
| "loss": 0.6897, |
| "step": 580 |
| }, |
| { |
| "epoch": 4.034662045060658, |
| "grad_norm": 0.5052911043167114, |
| "learning_rate": 0.0002, |
| "loss": 0.3913, |
| "step": 585 |
| }, |
| { |
| "epoch": 4.0693240901213175, |
| "grad_norm": 0.5212499499320984, |
| "learning_rate": 0.0002, |
| "loss": 0.2326, |
| "step": 590 |
| }, |
| { |
| "epoch": 4.103986135181976, |
| "grad_norm": 0.452105313539505, |
| "learning_rate": 0.0002, |
| "loss": 0.1846, |
| "step": 595 |
| }, |
| { |
| "epoch": 4.138648180242634, |
| "grad_norm": 0.4193221926689148, |
| "learning_rate": 0.0002, |
| "loss": 0.1359, |
| "step": 600 |
| }, |
| { |
| "epoch": 4.173310225303293, |
| "grad_norm": 2.3956053256988525, |
| "learning_rate": 0.0002, |
| "loss": 0.3795, |
| "step": 605 |
| }, |
| { |
| "epoch": 4.2079722703639515, |
| "grad_norm": 1.946535348892212, |
| "learning_rate": 0.0002, |
| "loss": 0.579, |
| "step": 610 |
| }, |
| { |
| "epoch": 4.24263431542461, |
| "grad_norm": 1.874873161315918, |
| "learning_rate": 0.0002, |
| "loss": 0.7528, |
| "step": 615 |
| }, |
| { |
| "epoch": 4.277296360485269, |
| "grad_norm": 0.6922146081924438, |
| "learning_rate": 0.0002, |
| "loss": 0.4398, |
| "step": 620 |
| }, |
| { |
| "epoch": 4.311958405545927, |
| "grad_norm": 0.5228244662284851, |
| "learning_rate": 0.0002, |
| "loss": 0.2615, |
| "step": 625 |
| }, |
| { |
| "epoch": 4.3466204506065855, |
| "grad_norm": 0.34687915444374084, |
| "learning_rate": 0.0002, |
| "loss": 0.191, |
| "step": 630 |
| }, |
| { |
| "epoch": 4.381282495667245, |
| "grad_norm": 0.4160069525241852, |
| "learning_rate": 0.0002, |
| "loss": 0.1434, |
| "step": 635 |
| }, |
| { |
| "epoch": 4.415944540727903, |
| "grad_norm": 1.468666434288025, |
| "learning_rate": 0.0002, |
| "loss": 0.61, |
| "step": 640 |
| }, |
| { |
| "epoch": 4.450606585788561, |
| "grad_norm": 3.174201488494873, |
| "learning_rate": 0.0002, |
| "loss": 0.5773, |
| "step": 645 |
| }, |
| { |
| "epoch": 4.4852686308492205, |
| "grad_norm": 1.8258005380630493, |
| "learning_rate": 0.0002, |
| "loss": 0.7495, |
| "step": 650 |
| }, |
| { |
| "epoch": 4.519930675909879, |
| "grad_norm": 0.8468143343925476, |
| "learning_rate": 0.0002, |
| "loss": 0.5441, |
| "step": 655 |
| }, |
| { |
| "epoch": 4.554592720970537, |
| "grad_norm": 0.5467656254768372, |
| "learning_rate": 0.0002, |
| "loss": 0.2906, |
| "step": 660 |
| }, |
| { |
| "epoch": 4.589254766031196, |
| "grad_norm": 0.396992564201355, |
| "learning_rate": 0.0002, |
| "loss": 0.2186, |
| "step": 665 |
| }, |
| { |
| "epoch": 4.6239168110918545, |
| "grad_norm": 0.31931889057159424, |
| "learning_rate": 0.0002, |
| "loss": 0.1629, |
| "step": 670 |
| }, |
| { |
| "epoch": 4.658578856152513, |
| "grad_norm": 1.755745530128479, |
| "learning_rate": 0.0002, |
| "loss": 0.2292, |
| "step": 675 |
| }, |
| { |
| "epoch": 4.693240901213172, |
| "grad_norm": 0.9869062900543213, |
| "learning_rate": 0.0002, |
| "loss": 0.6716, |
| "step": 680 |
| }, |
| { |
| "epoch": 4.72790294627383, |
| "grad_norm": 1.5987203121185303, |
| "learning_rate": 0.0002, |
| "loss": 0.8182, |
| "step": 685 |
| }, |
| { |
| "epoch": 4.7625649913344885, |
| "grad_norm": 0.870819091796875, |
| "learning_rate": 0.0002, |
| "loss": 0.5902, |
| "step": 690 |
| }, |
| { |
| "epoch": 4.797227036395148, |
| "grad_norm": 0.5098839402198792, |
| "learning_rate": 0.0002, |
| "loss": 0.3224, |
| "step": 695 |
| }, |
| { |
| "epoch": 4.831889081455806, |
| "grad_norm": 0.5435961484909058, |
| "learning_rate": 0.0002, |
| "loss": 0.2182, |
| "step": 700 |
| }, |
| { |
| "epoch": 4.866551126516464, |
| "grad_norm": 0.33275553584098816, |
| "learning_rate": 0.0002, |
| "loss": 0.1595, |
| "step": 705 |
| }, |
| { |
| "epoch": 4.9012131715771226, |
| "grad_norm": 1.8109265565872192, |
| "learning_rate": 0.0002, |
| "loss": 0.1985, |
| "step": 710 |
| }, |
| { |
| "epoch": 4.935875216637782, |
| "grad_norm": 1.149431824684143, |
| "learning_rate": 0.0002, |
| "loss": 0.8148, |
| "step": 715 |
| }, |
| { |
| "epoch": 4.97053726169844, |
| "grad_norm": 1.4355350732803345, |
| "learning_rate": 0.0002, |
| "loss": 0.6675, |
| "step": 720 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 2.76149845123291, |
| "learning_rate": 0.0002, |
| "loss": 0.5541, |
| "step": 725 |
| }, |
| { |
| "epoch": 5.034662045060658, |
| "grad_norm": 0.6026546359062195, |
| "learning_rate": 0.0002, |
| "loss": 0.3515, |
| "step": 730 |
| }, |
| { |
| "epoch": 5.0693240901213175, |
| "grad_norm": 0.37150222063064575, |
| "learning_rate": 0.0002, |
| "loss": 0.1661, |
| "step": 735 |
| }, |
| { |
| "epoch": 5.103986135181976, |
| "grad_norm": 0.31225013732910156, |
| "learning_rate": 0.0002, |
| "loss": 0.1243, |
| "step": 740 |
| }, |
| { |
| "epoch": 5.138648180242634, |
| "grad_norm": 0.4382600784301758, |
| "learning_rate": 0.0002, |
| "loss": 0.1004, |
| "step": 745 |
| }, |
| { |
| "epoch": 5.173310225303293, |
| "grad_norm": 2.0306389331817627, |
| "learning_rate": 0.0002, |
| "loss": 0.3743, |
| "step": 750 |
| }, |
| { |
| "epoch": 5.2079722703639515, |
| "grad_norm": 1.6935468912124634, |
| "learning_rate": 0.0002, |
| "loss": 0.4202, |
| "step": 755 |
| }, |
| { |
| "epoch": 5.24263431542461, |
| "grad_norm": 1.4990637302398682, |
| "learning_rate": 0.0002, |
| "loss": 0.5533, |
| "step": 760 |
| }, |
| { |
| "epoch": 5.277296360485269, |
| "grad_norm": 0.6687419414520264, |
| "learning_rate": 0.0002, |
| "loss": 0.3429, |
| "step": 765 |
| }, |
| { |
| "epoch": 5.311958405545927, |
| "grad_norm": 0.5400169491767883, |
| "learning_rate": 0.0002, |
| "loss": 0.1759, |
| "step": 770 |
| }, |
| { |
| "epoch": 5.3466204506065855, |
| "grad_norm": 0.39679139852523804, |
| "learning_rate": 0.0002, |
| "loss": 0.1395, |
| "step": 775 |
| }, |
| { |
| "epoch": 5.381282495667245, |
| "grad_norm": 0.33043885231018066, |
| "learning_rate": 0.0002, |
| "loss": 0.1004, |
| "step": 780 |
| }, |
| { |
| "epoch": 5.415944540727903, |
| "grad_norm": 3.1912200450897217, |
| "learning_rate": 0.0002, |
| "loss": 0.305, |
| "step": 785 |
| }, |
| { |
| "epoch": 5.450606585788561, |
| "grad_norm": 1.2743686437606812, |
| "learning_rate": 0.0002, |
| "loss": 0.4876, |
| "step": 790 |
| }, |
| { |
| "epoch": 5.4852686308492205, |
| "grad_norm": 2.9798948764801025, |
| "learning_rate": 0.0002, |
| "loss": 0.7476, |
| "step": 795 |
| }, |
| { |
| "epoch": 5.519930675909879, |
| "grad_norm": 0.7342365384101868, |
| "learning_rate": 0.0002, |
| "loss": 0.463, |
| "step": 800 |
| }, |
| { |
| "epoch": 5.554592720970537, |
| "grad_norm": 0.5035507678985596, |
| "learning_rate": 0.0002, |
| "loss": 0.2116, |
| "step": 805 |
| }, |
| { |
| "epoch": 5.589254766031196, |
| "grad_norm": 0.4394471347332001, |
| "learning_rate": 0.0002, |
| "loss": 0.1568, |
| "step": 810 |
| }, |
| { |
| "epoch": 5.6239168110918545, |
| "grad_norm": 0.383515328168869, |
| "learning_rate": 0.0002, |
| "loss": 0.1035, |
| "step": 815 |
| }, |
| { |
| "epoch": 5.658578856152513, |
| "grad_norm": 2.142784357070923, |
| "learning_rate": 0.0002, |
| "loss": 0.3458, |
| "step": 820 |
| }, |
| { |
| "epoch": 5.693240901213172, |
| "grad_norm": 1.2484018802642822, |
| "learning_rate": 0.0002, |
| "loss": 0.3576, |
| "step": 825 |
| }, |
| { |
| "epoch": 5.72790294627383, |
| "grad_norm": 2.6863198280334473, |
| "learning_rate": 0.0002, |
| "loss": 0.5996, |
| "step": 830 |
| }, |
| { |
| "epoch": 5.7625649913344885, |
| "grad_norm": 1.098381757736206, |
| "learning_rate": 0.0002, |
| "loss": 0.4993, |
| "step": 835 |
| }, |
| { |
| "epoch": 5.797227036395148, |
| "grad_norm": 0.5609210133552551, |
| "learning_rate": 0.0002, |
| "loss": 0.2608, |
| "step": 840 |
| }, |
| { |
| "epoch": 5.831889081455806, |
| "grad_norm": 0.5123816132545471, |
| "learning_rate": 0.0002, |
| "loss": 0.1799, |
| "step": 845 |
| }, |
| { |
| "epoch": 5.866551126516464, |
| "grad_norm": 0.46399134397506714, |
| "learning_rate": 0.0002, |
| "loss": 0.1254, |
| "step": 850 |
| }, |
| { |
| "epoch": 5.9012131715771226, |
| "grad_norm": 0.5089249014854431, |
| "learning_rate": 0.0002, |
| "loss": 0.0934, |
| "step": 855 |
| }, |
| { |
| "epoch": 5.935875216637782, |
| "grad_norm": 1.3031930923461914, |
| "learning_rate": 0.0002, |
| "loss": 0.6664, |
| "step": 860 |
| }, |
| { |
| "epoch": 5.97053726169844, |
| "grad_norm": 2.294696569442749, |
| "learning_rate": 0.0002, |
| "loss": 0.6034, |
| "step": 865 |
| }, |
| { |
| "epoch": 6.0, |
| "grad_norm": 2.738062620162964, |
| "learning_rate": 0.0002, |
| "loss": 0.587, |
| "step": 870 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 1440, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 10, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.028815749249229e+16, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|