| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.9994869163673679, | |
| "eval_steps": 500, | |
| "global_step": 974, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 0.28735690720469315, | |
| "learning_rate": 2.040816326530612e-06, | |
| "loss": 0.8613, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 0.3167433972327598, | |
| "learning_rate": 1.0204081632653061e-05, | |
| "loss": 0.7893, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 0.2897093230208272, | |
| "learning_rate": 2.0408163265306123e-05, | |
| "loss": 0.8123, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 0.32529746878796306, | |
| "learning_rate": 3.061224489795919e-05, | |
| "loss": 0.8337, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 0.41596863358989566, | |
| "learning_rate": 4.0816326530612245e-05, | |
| "loss": 0.7853, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 0.24343841042790573, | |
| "learning_rate": 5.102040816326531e-05, | |
| "loss": 0.7278, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 0.23881432693463991, | |
| "learning_rate": 6.122448979591838e-05, | |
| "loss": 0.7748, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 0.26488823170135234, | |
| "learning_rate": 7.142857142857143e-05, | |
| "loss": 0.7369, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 0.2139946375583922, | |
| "learning_rate": 8.163265306122449e-05, | |
| "loss": 0.7414, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 0.21151116397910674, | |
| "learning_rate": 9.183673469387756e-05, | |
| "loss": 0.7377, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 0.22393538825183892, | |
| "learning_rate": 0.00010204081632653062, | |
| "loss": 0.7804, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 0.23672135676179382, | |
| "learning_rate": 0.00011224489795918367, | |
| "loss": 0.7587, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 0.19299901899888663, | |
| "learning_rate": 0.00012244897959183676, | |
| "loss": 0.7395, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 0.216886150554683, | |
| "learning_rate": 0.0001326530612244898, | |
| "loss": 0.7301, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 0.23673356651710506, | |
| "learning_rate": 0.00014285714285714287, | |
| "loss": 0.7038, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.24221298858964754, | |
| "learning_rate": 0.0001530612244897959, | |
| "loss": 0.7223, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.2281125789539668, | |
| "learning_rate": 0.00016326530612244898, | |
| "loss": 0.6883, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 0.20472032849324356, | |
| "learning_rate": 0.00017346938775510205, | |
| "loss": 0.7637, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 0.262981845017497, | |
| "learning_rate": 0.00018367346938775512, | |
| "loss": 0.7539, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 0.2045459660012419, | |
| "learning_rate": 0.00019387755102040816, | |
| "loss": 0.7145, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 0.19901520782139356, | |
| "learning_rate": 0.0001999974277115551, | |
| "loss": 0.746, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 0.20406293825215613, | |
| "learning_rate": 0.00019996849098629418, | |
| "loss": 0.6897, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 0.21959287502020255, | |
| "learning_rate": 0.00019990741151022301, | |
| "loss": 0.7047, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 0.2674905077734418, | |
| "learning_rate": 0.0001998142089221534, | |
| "loss": 0.7807, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 0.19121955233560559, | |
| "learning_rate": 0.0001996889131894033, | |
| "loss": 0.7405, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 0.2273017831695812, | |
| "learning_rate": 0.00019953156459816179, | |
| "loss": 0.7068, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 0.20355662352048876, | |
| "learning_rate": 0.0001993422137405354, | |
| "loss": 0.7405, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 0.20243555943079916, | |
| "learning_rate": 0.00019912092149828174, | |
| "loss": 0.77, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 0.2187163182490927, | |
| "learning_rate": 0.00019886775902323405, | |
| "loss": 0.7517, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 0.21898481894502772, | |
| "learning_rate": 0.00019858280771442385, | |
| "loss": 0.7449, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 0.2741369175466676, | |
| "learning_rate": 0.00019826615919190887, | |
| "loss": 0.7293, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.1963482017199379, | |
| "learning_rate": 0.00019791791526731445, | |
| "loss": 0.7338, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.2252716807662381, | |
| "learning_rate": 0.00019753818791109828, | |
| "loss": 0.7165, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 0.20760666448279344, | |
| "learning_rate": 0.0001971270992165486, | |
| "loss": 0.6738, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 0.20386126764416265, | |
| "learning_rate": 0.00019668478136052774, | |
| "loss": 0.6954, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 0.2044311112622246, | |
| "learning_rate": 0.0001962113765609735, | |
| "loss": 0.6905, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 0.19608231248059868, | |
| "learning_rate": 0.0001957070370311717, | |
| "loss": 0.7291, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 0.2344659008650863, | |
| "learning_rate": 0.00019517192493081565, | |
| "loss": 0.7213, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 0.2066414920484931, | |
| "learning_rate": 0.00019460621231386676, | |
| "loss": 0.7669, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 0.18973265779951576, | |
| "learning_rate": 0.00019401008107323455, | |
| "loss": 0.735, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 0.21253574311911852, | |
| "learning_rate": 0.0001933837228822925, | |
| "loss": 0.71, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 0.21426682728436638, | |
| "learning_rate": 0.0001927273391332499, | |
| "loss": 0.7394, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 0.19895817515454003, | |
| "learning_rate": 0.00019204114087239806, | |
| "loss": 0.7458, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 0.20302093776022942, | |
| "learning_rate": 0.00019132534873225323, | |
| "loss": 0.7314, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 0.2144349163303807, | |
| "learning_rate": 0.00019058019286061665, | |
| "loss": 0.7152, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 0.1971709726410862, | |
| "learning_rate": 0.00018980591284657535, | |
| "loss": 0.7365, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.22203472003740699, | |
| "learning_rate": 0.00018900275764346768, | |
| "loss": 0.7188, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.21934841651902945, | |
| "learning_rate": 0.0001881709854888372, | |
| "loss": 0.7107, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 0.1801668247807498, | |
| "learning_rate": 0.00018731086382140226, | |
| "loss": 0.725, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 0.21531423468010022, | |
| "learning_rate": 0.00018642266919506644, | |
| "loss": 0.728, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 0.22094698505169547, | |
| "learning_rate": 0.00018550668718999872, | |
| "loss": 0.6933, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 0.1773217147142517, | |
| "learning_rate": 0.0001845632123208111, | |
| "loss": 0.7513, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 0.2055315923802406, | |
| "learning_rate": 0.0001835925479418637, | |
| "loss": 0.7519, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 0.16963560799229266, | |
| "learning_rate": 0.0001825950061497276, | |
| "loss": 0.6841, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.22358671455728935, | |
| "learning_rate": 0.00018157090768283678, | |
| "loss": 0.7031, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.19649622034513511, | |
| "learning_rate": 0.00018052058181836151, | |
| "loss": 0.7152, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.23337948032120506, | |
| "learning_rate": 0.00017944436626633623, | |
| "loss": 0.7344, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.2230560456920727, | |
| "learning_rate": 0.00017834260706107595, | |
| "loss": 0.7048, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.18683201628360252, | |
| "learning_rate": 0.00017721565844991643, | |
| "loss": 0.6804, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.23557766421834736, | |
| "learning_rate": 0.00017606388277931328, | |
| "loss": 0.7466, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 0.20807682466726182, | |
| "learning_rate": 0.0001748876503783373, | |
| "loss": 0.7311, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 0.20735258362025547, | |
| "learning_rate": 0.00017368733943960276, | |
| "loss": 0.7272, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.20304702188871634, | |
| "learning_rate": 0.00017246333589766787, | |
| "loss": 0.745, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.20320219594683134, | |
| "learning_rate": 0.00017121603330494544, | |
| "loss": 0.7519, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.21213293638896646, | |
| "learning_rate": 0.0001699458327051647, | |
| "loss": 0.7359, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.21793747848007688, | |
| "learning_rate": 0.00016865314250442398, | |
| "loss": 0.7055, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.22141216826722213, | |
| "learning_rate": 0.00016733837833987633, | |
| "loss": 0.7037, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.22853984000819327, | |
| "learning_rate": 0.00016600196294609045, | |
| "loss": 0.7059, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.20663633655741356, | |
| "learning_rate": 0.00016464432601912912, | |
| "loss": 0.7177, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.18810130620976084, | |
| "learning_rate": 0.0001632659040783897, | |
| "loss": 0.7284, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.20429229628784093, | |
| "learning_rate": 0.00016186714032625035, | |
| "loss": 0.7408, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.22545843123823847, | |
| "learning_rate": 0.00016044848450556787, | |
| "loss": 0.7167, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 0.20814059355174414, | |
| "learning_rate": 0.00015901039275507245, | |
| "loss": 0.7119, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 0.20573234266098542, | |
| "learning_rate": 0.00015755332746270572, | |
| "loss": 0.7023, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.22027836397935158, | |
| "learning_rate": 0.00015607775711694977, | |
| "loss": 0.7145, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.23097511819867636, | |
| "learning_rate": 0.00015458415615619484, | |
| "loss": 0.7037, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.18436146046862298, | |
| "learning_rate": 0.00015307300481619333, | |
| "loss": 0.7419, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.21587522416498928, | |
| "learning_rate": 0.00015154478897565045, | |
| "loss": 0.76, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.18717033363542637, | |
| "learning_rate": 0.00015000000000000001, | |
| "loss": 0.7144, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.20860152567696305, | |
| "learning_rate": 0.00014843913458341645, | |
| "loss": 0.6931, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.23965643462280595, | |
| "learning_rate": 0.00014686269458911332, | |
| "loss": 0.7057, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.20436937033016242, | |
| "learning_rate": 0.00014527118688797963, | |
| "loss": 0.7508, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.23533937009710554, | |
| "learning_rate": 0.0001436651231956064, | |
| "loss": 0.7168, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.22180461462318066, | |
| "learning_rate": 0.00014204501990775533, | |
| "loss": 0.7335, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.1996738235990708, | |
| "learning_rate": 0.00014041139793432274, | |
| "loss": 0.7162, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.21103614768476256, | |
| "learning_rate": 0.00013876478253185183, | |
| "loss": 0.7331, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.22289372370077495, | |
| "learning_rate": 0.00013710570313464778, | |
| "loss": 0.7044, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.229537262499816, | |
| "learning_rate": 0.0001354346931845492, | |
| "loss": 0.75, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.19323768910419992, | |
| "learning_rate": 0.00013375228995941133, | |
| "loss": 0.7351, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 0.2047265000452632, | |
| "learning_rate": 0.0001320590344003557, | |
| "loss": 0.7332, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 0.21689200679001508, | |
| "learning_rate": 0.00013035547093784186, | |
| "loss": 0.7193, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.18177975055261658, | |
| "learning_rate": 0.00012864214731661742, | |
| "loss": 0.7062, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.22823965854179848, | |
| "learning_rate": 0.00012691961441960238, | |
| "loss": 0.6943, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.19970723650312586, | |
| "learning_rate": 0.00012518842609076413, | |
| "loss": 0.7188, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.21665273924724815, | |
| "learning_rate": 0.00012344913895704097, | |
| "loss": 0.7296, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.20693546255073467, | |
| "learning_rate": 0.00012170231224937032, | |
| "loss": 0.7084, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.2230052437580283, | |
| "learning_rate": 0.00011994850762287989, | |
| "loss": 0.7286, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.20642193075079962, | |
| "learning_rate": 0.0001181882889762994, | |
| "loss": 0.739, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.19372424148062226, | |
| "learning_rate": 0.00011642222227065089, | |
| "loss": 0.7072, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 0.23927300164054763, | |
| "learning_rate": 0.00011465087534727587, | |
| "loss": 0.779, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 0.2284386096464582, | |
| "learning_rate": 0.0001128748177452581, | |
| "loss": 0.7125, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.1914737248165551, | |
| "learning_rate": 0.00011109462051830017, | |
| "loss": 0.7672, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.19448312525913036, | |
| "learning_rate": 0.00010931085605111354, | |
| "loss": 0.6975, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.21441915676709558, | |
| "learning_rate": 0.00010752409787538, | |
| "loss": 0.7269, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.19881919454925634, | |
| "learning_rate": 0.00010573492048534515, | |
| "loss": 0.6814, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.19972824907153813, | |
| "learning_rate": 0.00010394389915310149, | |
| "loss": 0.68, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.2406076929176273, | |
| "learning_rate": 0.00010215160974362223, | |
| "loss": 0.7707, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.20777439386702035, | |
| "learning_rate": 0.00010035862852960387, | |
| "loss": 0.7491, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.24747809376637386, | |
| "learning_rate": 9.856553200617805e-05, | |
| "loss": 0.6889, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.20716823780350135, | |
| "learning_rate": 9.677289670555169e-05, | |
| "loss": 0.7247, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.1991885456295323, | |
| "learning_rate": 9.49812990116353e-05, | |
| "loss": 0.7411, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 0.20535559514249033, | |
| "learning_rate": 9.319131497471894e-05, | |
| "loss": 0.75, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 0.18612767193322602, | |
| "learning_rate": 9.140352012625537e-05, | |
| "loss": 0.7214, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 0.21568066556853366, | |
| "learning_rate": 8.961848929381026e-05, | |
| "loss": 0.7378, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 0.20352009266326396, | |
| "learning_rate": 8.783679641623845e-05, | |
| "loss": 0.7332, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.23303296784518934, | |
| "learning_rate": 8.605901435914607e-05, | |
| "loss": 0.7005, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.19779884278017262, | |
| "learning_rate": 8.428571473069775e-05, | |
| "loss": 0.7281, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.23013073874335851, | |
| "learning_rate": 8.25174676978282e-05, | |
| "loss": 0.6749, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.2248906461637265, | |
| "learning_rate": 8.075484180291701e-05, | |
| "loss": 0.6938, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.18810582346079818, | |
| "learning_rate": 7.899840378098588e-05, | |
| "loss": 0.7388, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 0.20884410793300284, | |
| "learning_rate": 7.724871837747707e-05, | |
| "loss": 0.7373, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 0.2166745910351328, | |
| "learning_rate": 7.550634816667142e-05, | |
| "loss": 0.6979, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.24100998404487434, | |
| "learning_rate": 7.377185337080442e-05, | |
| "loss": 0.7076, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.2256882455993147, | |
| "learning_rate": 7.204579167993881e-05, | |
| "loss": 0.731, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.18574834794500147, | |
| "learning_rate": 7.032871807265096e-05, | |
| "loss": 0.7473, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.19595704844512632, | |
| "learning_rate": 6.862118463758943e-05, | |
| "loss": 0.7123, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.1922152988123935, | |
| "learning_rate": 6.69237403959624e-05, | |
| "loss": 0.7155, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.22349381871955526, | |
| "learning_rate": 6.52369311250116e-05, | |
| "loss": 0.6986, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.1860191514854131, | |
| "learning_rate": 6.356129918252927e-05, | |
| "loss": 0.7194, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.21287065828060023, | |
| "learning_rate": 6.189738333247432e-05, | |
| "loss": 0.6888, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.21736641037698234, | |
| "learning_rate": 6.024571857174443e-05, | |
| "loss": 0.74, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.2161329163446709, | |
| "learning_rate": 5.860683595815893e-05, | |
| "loss": 0.6949, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.19032361730874883, | |
| "learning_rate": 5.698126243970845e-05, | |
| "loss": 0.7027, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.20020919043570076, | |
| "learning_rate": 5.536952068512608e-05, | |
| "loss": 0.7079, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.19493600127593236, | |
| "learning_rate": 5.3772128915834184e-05, | |
| "loss": 0.7946, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.1937560205349817, | |
| "learning_rate": 5.218960073932122e-05, | |
| "loss": 0.7512, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.20929045061919116, | |
| "learning_rate": 5.062244498400228e-05, | |
| "loss": 0.7401, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.19822387763869162, | |
| "learning_rate": 4.907116553561607e-05, | |
| "loss": 0.7035, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 0.18973790696605586, | |
| "learning_rate": 4.753626117521103e-05, | |
| "loss": 0.6855, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 0.1951787661520754, | |
| "learning_rate": 4.601822541877291e-05, | |
| "loss": 0.697, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.23171207370747388, | |
| "learning_rate": 4.451754635854517e-05, | |
| "loss": 0.7174, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.20672883919318008, | |
| "learning_rate": 4.303470650609325e-05, | |
| "loss": 0.6991, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 0.20596139821563042, | |
| "learning_rate": 4.1570182637163155e-05, | |
| "loss": 0.7579, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 0.21052057640954702, | |
| "learning_rate": 4.0124445638384366e-05, | |
| "loss": 0.6564, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.2194425673779361, | |
| "learning_rate": 3.869796035586625e-05, | |
| "loss": 0.7112, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.18864013157813014, | |
| "learning_rate": 3.7291185445736444e-05, | |
| "loss": 0.7009, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.20033201942780526, | |
| "learning_rate": 3.590457322666997e-05, | |
| "loss": 0.7412, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.21557678658312107, | |
| "learning_rate": 3.453856953445557e-05, | |
| "loss": 0.7384, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.1793703740528382, | |
| "learning_rate": 3.319361357864663e-05, | |
| "loss": 0.7098, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.1804784138992169, | |
| "learning_rate": 3.187013780134291e-05, | |
| "loss": 0.7287, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 0.1868508163249596, | |
| "learning_rate": 3.05685677381475e-05, | |
| "loss": 0.7472, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 0.21833090239816652, | |
| "learning_rate": 2.9289321881345254e-05, | |
| "loss": 0.7121, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.22900968128450175, | |
| "learning_rate": 2.8032811545345294e-05, | |
| "loss": 0.7631, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.25207875961572096, | |
| "learning_rate": 2.679944073443158e-05, | |
| "loss": 0.717, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.21435265220399202, | |
| "learning_rate": 2.5589606012863963e-05, | |
| "loss": 0.7391, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.21363241574566763, | |
| "learning_rate": 2.4403696377371142e-05, | |
| "loss": 0.7446, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.20585815945916697, | |
| "learning_rate": 2.324209313207736e-05, | |
| "loss": 0.7582, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 0.20004345669038107, | |
| "learning_rate": 2.210516976590179e-05, | |
| "loss": 0.7302, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 0.22895803213623186, | |
| "learning_rate": 2.099329183247126e-05, | |
| "loss": 0.7175, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 0.19029034354303137, | |
| "learning_rate": 1.9906816832584253e-05, | |
| "loss": 0.6968, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 0.18984158225134234, | |
| "learning_rate": 1.8846094099263912e-05, | |
| "loss": 0.7259, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 0.20694620466818364, | |
| "learning_rate": 1.781146468543765e-05, | |
| "loss": 0.6664, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 0.1880785385367806, | |
| "learning_rate": 1.6803261254278636e-05, | |
| "loss": 0.6995, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.23959056489828764, | |
| "learning_rate": 1.582180797224507e-05, | |
| "loss": 0.7076, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.2074947617084518, | |
| "learning_rate": 1.4867420404851307e-05, | |
| "loss": 0.7257, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 0.1697947347606707, | |
| "learning_rate": 1.3940405415204416e-05, | |
| "loss": 0.729, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 0.19240120017654297, | |
| "learning_rate": 1.30410610653389e-05, | |
| "loss": 0.7579, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 0.22247780509078305, | |
| "learning_rate": 1.2169676520381168e-05, | |
| "loss": 0.7197, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 0.2186893700102881, | |
| "learning_rate": 1.1326531955574526e-05, | |
| "loss": 0.7392, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 0.19758258278741933, | |
| "learning_rate": 1.0511898466194903e-05, | |
| "loss": 0.7161, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 0.1935470586238971, | |
| "learning_rate": 9.726037980385738e-06, | |
| "loss": 0.6866, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.20632207102948852, | |
| "learning_rate": 8.969203174940654e-06, | |
| "loss": 0.7005, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.1864625802291902, | |
| "learning_rate": 8.24163739406062e-06, | |
| "loss": 0.7332, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 0.1856036197326992, | |
| "learning_rate": 7.543574571111655e-06, | |
| "loss": 0.7387, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 0.21017043077001935, | |
| "learning_rate": 6.875239153408542e-06, | |
| "loss": 0.6983, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.22653212036464043, | |
| "learning_rate": 6.236846030048604e-06, | |
| "loss": 0.7, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.1799402029152891, | |
| "learning_rate": 5.6286004628186675e-06, | |
| "loss": 0.7291, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 0.20533686230996426, | |
| "learning_rate": 5.0506980201973974e-06, | |
| "loss": 0.7303, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 0.20499691124412314, | |
| "learning_rate": 4.503324514474483e-06, | |
| "loss": 0.7543, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.20195822302653685, | |
| "learning_rate": 3.986655942006579e-06, | |
| "loss": 0.7246, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.23109568891518523, | |
| "learning_rate": 3.5008584266294386e-06, | |
| "loss": 0.7003, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 0.2213682915194419, | |
| "learning_rate": 3.0460881662442763e-06, | |
| "loss": 0.7511, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 0.21218423218406246, | |
| "learning_rate": 2.622491382595693e-06, | |
| "loss": 0.7221, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 0.21119477478168394, | |
| "learning_rate": 2.2302042742571193e-06, | |
| "loss": 0.7139, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 0.19455625533472815, | |
| "learning_rate": 1.869352972839067e-06, | |
| "loss": 0.7059, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 0.2364635036372563, | |
| "learning_rate": 1.5400535024342022e-06, | |
| "loss": 0.6871, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 0.23010554295931945, | |
| "learning_rate": 1.2424117423122328e-06, | |
| "loss": 0.7364, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.18952563182659934, | |
| "learning_rate": 9.765233928766493e-07, | |
| "loss": 0.7058, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.21828660241676695, | |
| "learning_rate": 7.42473944894384e-07, | |
| "loss": 0.6962, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 0.18672947784161634, | |
| "learning_rate": 5.403386520079323e-07, | |
| "loss": 0.6808, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 0.1911360941907681, | |
| "learning_rate": 3.701825065392184e-07, | |
| "loss": 0.7428, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 0.18259404077794916, | |
| "learning_rate": 2.320602185927001e-07, | |
| "loss": 0.6799, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 0.20002215130980527, | |
| "learning_rate": 1.2601619846444035e-07, | |
| "loss": 0.7611, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 0.19193385733473736, | |
| "learning_rate": 5.208454236296234e-08, | |
| "loss": 0.7183, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.22837515366442288, | |
| "learning_rate": 1.0289021446308056e-08, | |
| "loss": 0.7474, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": null, | |
| "eval_runtime": 1509.2466, | |
| "eval_samples_per_second": 1.531, | |
| "eval_steps_per_second": 0.383, | |
| "step": 974 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "step": 974, | |
| "total_flos": 1.1274999981146112e+16, | |
| "train_loss": 0.7250666028420294, | |
| "train_runtime": 36380.3803, | |
| "train_samples_per_second": 1.714, | |
| "train_steps_per_second": 0.027 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 974, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 100, | |
| "total_flos": 1.1274999981146112e+16, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |