| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.9995747929973436, | |
| "eval_steps": 500, | |
| "global_step": 1708, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0005852311434410677, | |
| "grad_norm": 0.515625, | |
| "learning_rate": 1.1695906432748538e-06, | |
| "loss": 1.737, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.0029261557172053382, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 5.8479532163742686e-06, | |
| "loss": 1.7512, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.0058523114344106765, | |
| "grad_norm": 0.48046875, | |
| "learning_rate": 1.1695906432748537e-05, | |
| "loss": 1.7378, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.008778467151616015, | |
| "grad_norm": 0.474609375, | |
| "learning_rate": 1.7543859649122806e-05, | |
| "loss": 1.7494, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.011704622868821353, | |
| "grad_norm": 0.4765625, | |
| "learning_rate": 2.3391812865497074e-05, | |
| "loss": 1.7101, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.014630778586026691, | |
| "grad_norm": 0.482421875, | |
| "learning_rate": 2.9239766081871346e-05, | |
| "loss": 1.6696, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.01755693430323203, | |
| "grad_norm": 0.34765625, | |
| "learning_rate": 3.508771929824561e-05, | |
| "loss": 1.6369, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.02048309002043737, | |
| "grad_norm": 0.287109375, | |
| "learning_rate": 4.093567251461988e-05, | |
| "loss": 1.6058, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.023409245737642706, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 4.678362573099415e-05, | |
| "loss": 1.5587, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.026335401454848046, | |
| "grad_norm": 0.1708984375, | |
| "learning_rate": 5.2631578947368424e-05, | |
| "loss": 1.5149, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.029261557172053382, | |
| "grad_norm": 0.140625, | |
| "learning_rate": 5.847953216374269e-05, | |
| "loss": 1.4769, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.03218771288925872, | |
| "grad_norm": 0.12060546875, | |
| "learning_rate": 6.432748538011695e-05, | |
| "loss": 1.4573, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.03511386860646406, | |
| "grad_norm": 0.09228515625, | |
| "learning_rate": 7.017543859649122e-05, | |
| "loss": 1.4219, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.0380400243236694, | |
| "grad_norm": 0.09375, | |
| "learning_rate": 7.602339181286549e-05, | |
| "loss": 1.398, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.04096618004087474, | |
| "grad_norm": 0.06884765625, | |
| "learning_rate": 8.187134502923976e-05, | |
| "loss": 1.3912, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.043892335758080075, | |
| "grad_norm": 0.06787109375, | |
| "learning_rate": 8.771929824561403e-05, | |
| "loss": 1.3633, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.04681849147528541, | |
| "grad_norm": 0.06103515625, | |
| "learning_rate": 9.35672514619883e-05, | |
| "loss": 1.3546, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.049744647192490755, | |
| "grad_norm": 0.05712890625, | |
| "learning_rate": 9.941520467836257e-05, | |
| "loss": 1.3309, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.05267080290969609, | |
| "grad_norm": 0.05517578125, | |
| "learning_rate": 0.00010526315789473685, | |
| "loss": 1.3234, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.05559695862690143, | |
| "grad_norm": 0.048828125, | |
| "learning_rate": 0.00011111111111111112, | |
| "loss": 1.3061, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.058523114344106765, | |
| "grad_norm": 0.048828125, | |
| "learning_rate": 0.00011695906432748539, | |
| "loss": 1.2844, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.06144927006131211, | |
| "grad_norm": 0.04833984375, | |
| "learning_rate": 0.00012280701754385965, | |
| "loss": 1.2973, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.06437542577851744, | |
| "grad_norm": 0.050537109375, | |
| "learning_rate": 0.0001286549707602339, | |
| "loss": 1.2836, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.06730158149572278, | |
| "grad_norm": 0.048095703125, | |
| "learning_rate": 0.0001345029239766082, | |
| "loss": 1.2723, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.07022773721292812, | |
| "grad_norm": 0.046630859375, | |
| "learning_rate": 0.00014035087719298245, | |
| "loss": 1.2634, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.07315389293013345, | |
| "grad_norm": 0.05078125, | |
| "learning_rate": 0.00014619883040935673, | |
| "loss": 1.2355, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.0760800486473388, | |
| "grad_norm": 0.05029296875, | |
| "learning_rate": 0.00015204678362573098, | |
| "loss": 1.2494, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.07900620436454414, | |
| "grad_norm": 0.054931640625, | |
| "learning_rate": 0.00015789473684210527, | |
| "loss": 1.253, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.08193236008174948, | |
| "grad_norm": 0.05322265625, | |
| "learning_rate": 0.00016374269005847952, | |
| "loss": 1.2499, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.08485851579895481, | |
| "grad_norm": 0.04833984375, | |
| "learning_rate": 0.0001695906432748538, | |
| "loss": 1.2193, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.08778467151616015, | |
| "grad_norm": 0.05712890625, | |
| "learning_rate": 0.00017543859649122806, | |
| "loss": 1.2339, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.09071082723336549, | |
| "grad_norm": 0.056884765625, | |
| "learning_rate": 0.00018128654970760234, | |
| "loss": 1.2427, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.09363698295057082, | |
| "grad_norm": 0.052490234375, | |
| "learning_rate": 0.0001871345029239766, | |
| "loss": 1.2184, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.09656313866777616, | |
| "grad_norm": 0.06298828125, | |
| "learning_rate": 0.00019298245614035088, | |
| "loss": 1.2306, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.09948929438498151, | |
| "grad_norm": 0.06298828125, | |
| "learning_rate": 0.00019883040935672513, | |
| "loss": 1.214, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.10241545010218685, | |
| "grad_norm": 0.0673828125, | |
| "learning_rate": 0.00019999665774502696, | |
| "loss": 1.2176, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.10534160581939218, | |
| "grad_norm": 0.0634765625, | |
| "learning_rate": 0.0001999830802170989, | |
| "loss": 1.204, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.10826776153659752, | |
| "grad_norm": 0.06298828125, | |
| "learning_rate": 0.00019995905994229593, | |
| "loss": 1.2153, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.11119391725380286, | |
| "grad_norm": 0.07080078125, | |
| "learning_rate": 0.00019992459942941906, | |
| "loss": 1.1936, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.1141200729710082, | |
| "grad_norm": 0.064453125, | |
| "learning_rate": 0.00019987970227770135, | |
| "loss": 1.1987, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.11704622868821353, | |
| "grad_norm": 0.07177734375, | |
| "learning_rate": 0.00019982437317643217, | |
| "loss": 1.2065, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.11997238440541888, | |
| "grad_norm": 0.0654296875, | |
| "learning_rate": 0.00019975861790446722, | |
| "loss": 1.2088, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.12289854012262422, | |
| "grad_norm": 0.07373046875, | |
| "learning_rate": 0.0001996824433296252, | |
| "loss": 1.2082, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.12582469583982955, | |
| "grad_norm": 0.07958984375, | |
| "learning_rate": 0.00019959585740797028, | |
| "loss": 1.2062, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.1287508515570349, | |
| "grad_norm": 0.0966796875, | |
| "learning_rate": 0.0001994988691829812, | |
| "loss": 1.2046, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.13167700727424023, | |
| "grad_norm": 0.07275390625, | |
| "learning_rate": 0.00019939148878460677, | |
| "loss": 1.195, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.13460316299144556, | |
| "grad_norm": 0.0859375, | |
| "learning_rate": 0.00019927372742820779, | |
| "loss": 1.1807, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.1375293187086509, | |
| "grad_norm": 0.07666015625, | |
| "learning_rate": 0.0001991455974133857, | |
| "loss": 1.1887, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.14045547442585624, | |
| "grad_norm": 0.07373046875, | |
| "learning_rate": 0.0001990071121226979, | |
| "loss": 1.189, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.14338163014306157, | |
| "grad_norm": 0.0751953125, | |
| "learning_rate": 0.0001988582860202601, | |
| "loss": 1.172, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.1463077858602669, | |
| "grad_norm": 0.078125, | |
| "learning_rate": 0.00019869913465023548, | |
| "loss": 1.1738, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.14923394157747225, | |
| "grad_norm": 0.08447265625, | |
| "learning_rate": 0.00019852967463521124, | |
| "loss": 1.1947, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.1521600972946776, | |
| "grad_norm": 0.080078125, | |
| "learning_rate": 0.0001983499236744625, | |
| "loss": 1.1789, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.15508625301188295, | |
| "grad_norm": 0.078125, | |
| "learning_rate": 0.00019815990054210361, | |
| "loss": 1.1878, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.15801240872908828, | |
| "grad_norm": 0.072265625, | |
| "learning_rate": 0.00019795962508512742, | |
| "loss": 1.1825, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.16093856444629362, | |
| "grad_norm": 0.103515625, | |
| "learning_rate": 0.00019774911822133216, | |
| "loss": 1.1848, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.16386472016349896, | |
| "grad_norm": 0.091796875, | |
| "learning_rate": 0.0001975284019371368, | |
| "loss": 1.1634, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.1667908758807043, | |
| "grad_norm": 0.078125, | |
| "learning_rate": 0.0001972974992852847, | |
| "loss": 1.1539, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.16971703159790963, | |
| "grad_norm": 0.06982421875, | |
| "learning_rate": 0.00019705643438243584, | |
| "loss": 1.1656, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.17264318731511497, | |
| "grad_norm": 0.0859375, | |
| "learning_rate": 0.00019680523240664786, | |
| "loss": 1.1923, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.1755693430323203, | |
| "grad_norm": 0.07958984375, | |
| "learning_rate": 0.00019654391959474647, | |
| "loss": 1.1651, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.17849549874952564, | |
| "grad_norm": 0.08056640625, | |
| "learning_rate": 0.00019627252323958504, | |
| "loss": 1.1501, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.18142165446673097, | |
| "grad_norm": 0.0859375, | |
| "learning_rate": 0.00019599107168719412, | |
| "loss": 1.1581, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.1843478101839363, | |
| "grad_norm": 0.08349609375, | |
| "learning_rate": 0.0001956995943338206, | |
| "loss": 1.1785, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.18727396590114165, | |
| "grad_norm": 0.08447265625, | |
| "learning_rate": 0.00019539812162285767, | |
| "loss": 1.1691, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.19020012161834698, | |
| "grad_norm": 0.0751953125, | |
| "learning_rate": 0.00019508668504166505, | |
| "loss": 1.1758, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.19312627733555232, | |
| "grad_norm": 0.0703125, | |
| "learning_rate": 0.00019476531711828027, | |
| "loss": 1.1582, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.19605243305275769, | |
| "grad_norm": 0.08642578125, | |
| "learning_rate": 0.0001944340514180212, | |
| "loss": 1.1767, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.19897858876996302, | |
| "grad_norm": 0.0703125, | |
| "learning_rate": 0.00019409292253998062, | |
| "loss": 1.1392, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.20190474448716836, | |
| "grad_norm": 0.1064453125, | |
| "learning_rate": 0.0001937419661134121, | |
| "loss": 1.1626, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.2048309002043737, | |
| "grad_norm": 0.08203125, | |
| "learning_rate": 0.00019338121879400896, | |
| "loss": 1.1551, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.20775705592157903, | |
| "grad_norm": 0.080078125, | |
| "learning_rate": 0.00019301071826007576, | |
| "loss": 1.1495, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.21068321163878437, | |
| "grad_norm": 0.0810546875, | |
| "learning_rate": 0.00019263050320859283, | |
| "loss": 1.1514, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.2136093673559897, | |
| "grad_norm": 0.08935546875, | |
| "learning_rate": 0.00019224061335117472, | |
| "loss": 1.1649, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.21653552307319504, | |
| "grad_norm": 0.08837890625, | |
| "learning_rate": 0.0001918410894099224, | |
| "loss": 1.1433, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.21946167879040038, | |
| "grad_norm": 0.09423828125, | |
| "learning_rate": 0.00019143197311317014, | |
| "loss": 1.1275, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.22238783450760571, | |
| "grad_norm": 0.06787109375, | |
| "learning_rate": 0.00019101330719112705, | |
| "loss": 1.1684, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.22531399022481105, | |
| "grad_norm": 0.07763671875, | |
| "learning_rate": 0.00019058513537141428, | |
| "loss": 1.1606, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.2282401459420164, | |
| "grad_norm": 0.07421875, | |
| "learning_rate": 0.0001901475023744977, | |
| "loss": 1.148, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.23116630165922172, | |
| "grad_norm": 0.0693359375, | |
| "learning_rate": 0.00018970045390901728, | |
| "loss": 1.1626, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.23409245737642706, | |
| "grad_norm": 0.0849609375, | |
| "learning_rate": 0.00018924403666701286, | |
| "loss": 1.1575, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.23701861309363242, | |
| "grad_norm": 0.07470703125, | |
| "learning_rate": 0.00018877829831904746, | |
| "loss": 1.1637, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.23994476881083776, | |
| "grad_norm": 0.07568359375, | |
| "learning_rate": 0.0001883032875092283, | |
| "loss": 1.1441, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.2428709245280431, | |
| "grad_norm": 0.0771484375, | |
| "learning_rate": 0.00018781905385012627, | |
| "loss": 1.1615, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.24579708024524843, | |
| "grad_norm": 0.06884765625, | |
| "learning_rate": 0.000187325647917594, | |
| "loss": 1.1536, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.24872323596245377, | |
| "grad_norm": 0.08203125, | |
| "learning_rate": 0.00018682312124548346, | |
| "loss": 1.1512, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.2516493916796591, | |
| "grad_norm": 0.07470703125, | |
| "learning_rate": 0.00018631152632026364, | |
| "loss": 1.1397, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.25457554739686444, | |
| "grad_norm": 0.07958984375, | |
| "learning_rate": 0.00018579091657553844, | |
| "loss": 1.1585, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.2575017031140698, | |
| "grad_norm": 0.08154296875, | |
| "learning_rate": 0.00018526134638646583, | |
| "loss": 1.1612, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.2604278588312751, | |
| "grad_norm": 0.07470703125, | |
| "learning_rate": 0.00018472287106407876, | |
| "loss": 1.1272, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.26335401454848045, | |
| "grad_norm": 0.09375, | |
| "learning_rate": 0.00018417554684950794, | |
| "loss": 1.1413, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.2662801702656858, | |
| "grad_norm": 0.0703125, | |
| "learning_rate": 0.00018361943090810796, | |
| "loss": 1.1489, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.2692063259828911, | |
| "grad_norm": 0.0888671875, | |
| "learning_rate": 0.00018305458132348657, | |
| "loss": 1.1575, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.27213248170009646, | |
| "grad_norm": 0.07666015625, | |
| "learning_rate": 0.00018248105709143799, | |
| "loss": 1.136, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.2750586374173018, | |
| "grad_norm": 0.08203125, | |
| "learning_rate": 0.00018189891811378137, | |
| "loss": 1.1369, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.27798479313450714, | |
| "grad_norm": 0.08251953125, | |
| "learning_rate": 0.0001813082251921041, | |
| "loss": 1.1255, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.28091094885171247, | |
| "grad_norm": 0.09375, | |
| "learning_rate": 0.0001807090400214114, | |
| "loss": 1.1288, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.2838371045689178, | |
| "grad_norm": 0.07958984375, | |
| "learning_rate": 0.00018010142518368278, | |
| "loss": 1.1233, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.28676326028612315, | |
| "grad_norm": 0.078125, | |
| "learning_rate": 0.00017948544414133534, | |
| "loss": 1.1475, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.2896894160033285, | |
| "grad_norm": 0.080078125, | |
| "learning_rate": 0.00017886116123059574, | |
| "loss": 1.1356, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.2926155717205338, | |
| "grad_norm": 0.07568359375, | |
| "learning_rate": 0.00017822864165478034, | |
| "loss": 1.1553, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.29554172743773915, | |
| "grad_norm": 0.08154296875, | |
| "learning_rate": 0.00017758795147748523, | |
| "loss": 1.1188, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.2984678831549445, | |
| "grad_norm": 0.07275390625, | |
| "learning_rate": 0.00017693915761568608, | |
| "loss": 1.1388, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.3013940388721499, | |
| "grad_norm": 0.07568359375, | |
| "learning_rate": 0.000176282327832749, | |
| "loss": 1.1267, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.3043201945893552, | |
| "grad_norm": 0.083984375, | |
| "learning_rate": 0.0001756175307313531, | |
| "loss": 1.1341, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.30724635030656056, | |
| "grad_norm": 0.08349609375, | |
| "learning_rate": 0.00017494483574632513, | |
| "loss": 1.1365, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.3101725060237659, | |
| "grad_norm": 0.08447265625, | |
| "learning_rate": 0.00017426431313738734, | |
| "loss": 1.1335, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.31309866174097123, | |
| "grad_norm": 0.0703125, | |
| "learning_rate": 0.00017357603398181936, | |
| "loss": 1.1484, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.31602481745817657, | |
| "grad_norm": 0.07568359375, | |
| "learning_rate": 0.00017288007016703444, | |
| "loss": 1.1186, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.3189509731753819, | |
| "grad_norm": 0.06982421875, | |
| "learning_rate": 0.00017217649438307106, | |
| "loss": 1.1442, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.32187712889258724, | |
| "grad_norm": 0.07080078125, | |
| "learning_rate": 0.00017146538011500093, | |
| "loss": 1.1284, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.3248032846097926, | |
| "grad_norm": 0.0693359375, | |
| "learning_rate": 0.00017074680163525375, | |
| "loss": 1.1331, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.3277294403269979, | |
| "grad_norm": 0.07421875, | |
| "learning_rate": 0.00017002083399586, | |
| "loss": 1.1255, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.33065559604420325, | |
| "grad_norm": 0.07421875, | |
| "learning_rate": 0.00016928755302061173, | |
| "loss": 1.1354, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.3335817517614086, | |
| "grad_norm": 0.078125, | |
| "learning_rate": 0.0001685470352971437, | |
| "loss": 1.1333, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.3365079074786139, | |
| "grad_norm": 0.07470703125, | |
| "learning_rate": 0.00016779935816893353, | |
| "loss": 1.1376, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.33943406319581926, | |
| "grad_norm": 0.06982421875, | |
| "learning_rate": 0.00016704459972722414, | |
| "loss": 1.1249, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.3423602189130246, | |
| "grad_norm": 0.0732421875, | |
| "learning_rate": 0.00016628283880286703, | |
| "loss": 1.1451, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.34528637463022993, | |
| "grad_norm": 0.0693359375, | |
| "learning_rate": 0.00016551415495808915, | |
| "loss": 1.1195, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.34821253034743527, | |
| "grad_norm": 0.0810546875, | |
| "learning_rate": 0.00016473862847818277, | |
| "loss": 1.146, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.3511386860646406, | |
| "grad_norm": 0.07470703125, | |
| "learning_rate": 0.00016395634036312013, | |
| "loss": 1.1327, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.35406484178184594, | |
| "grad_norm": 0.0791015625, | |
| "learning_rate": 0.00016316737231909342, | |
| "loss": 1.1176, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.3569909974990513, | |
| "grad_norm": 0.07470703125, | |
| "learning_rate": 0.000162371806749981, | |
| "loss": 1.1208, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.3599171532162566, | |
| "grad_norm": 0.07568359375, | |
| "learning_rate": 0.00016156972674874056, | |
| "loss": 1.1315, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.36284330893346195, | |
| "grad_norm": 0.06982421875, | |
| "learning_rate": 0.00016076121608873072, | |
| "loss": 1.1455, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.3657694646506673, | |
| "grad_norm": 0.072265625, | |
| "learning_rate": 0.000159946359214961, | |
| "loss": 1.1234, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.3686956203678726, | |
| "grad_norm": 0.07421875, | |
| "learning_rate": 0.00015912524123527221, | |
| "loss": 1.1185, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.37162177608507796, | |
| "grad_norm": 0.0693359375, | |
| "learning_rate": 0.0001582979479114472, | |
| "loss": 1.1208, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 0.3745479318022833, | |
| "grad_norm": 0.07666015625, | |
| "learning_rate": 0.0001574645656502536, | |
| "loss": 1.1257, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.37747408751948863, | |
| "grad_norm": 0.07421875, | |
| "learning_rate": 0.0001566251814944188, | |
| "loss": 1.1317, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 0.38040024323669397, | |
| "grad_norm": 0.07373046875, | |
| "learning_rate": 0.00015577988311353904, | |
| "loss": 1.1431, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.3833263989538993, | |
| "grad_norm": 0.0771484375, | |
| "learning_rate": 0.0001549287587949226, | |
| "loss": 1.1253, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 0.38625255467110464, | |
| "grad_norm": 0.07080078125, | |
| "learning_rate": 0.00015407189743436864, | |
| "loss": 1.1314, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.38917871038831003, | |
| "grad_norm": 0.07275390625, | |
| "learning_rate": 0.00015320938852688248, | |
| "loss": 1.1148, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 0.39210486610551537, | |
| "grad_norm": 0.07080078125, | |
| "learning_rate": 0.00015234132215732822, | |
| "loss": 1.141, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.3950310218227207, | |
| "grad_norm": 0.072265625, | |
| "learning_rate": 0.00015146778899102, | |
| "loss": 1.1222, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.39795717753992604, | |
| "grad_norm": 0.07568359375, | |
| "learning_rate": 0.00015058888026425212, | |
| "loss": 1.1177, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.4008833332571314, | |
| "grad_norm": 0.0830078125, | |
| "learning_rate": 0.00014970468777477026, | |
| "loss": 1.1181, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 0.4038094889743367, | |
| "grad_norm": 0.08349609375, | |
| "learning_rate": 0.00014881530387218325, | |
| "loss": 1.1417, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.40673564469154205, | |
| "grad_norm": 0.08154296875, | |
| "learning_rate": 0.00014792082144831793, | |
| "loss": 1.1302, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 0.4096618004087474, | |
| "grad_norm": 0.06787109375, | |
| "learning_rate": 0.00014702133392751688, | |
| "loss": 1.122, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.4125879561259527, | |
| "grad_norm": 0.07177734375, | |
| "learning_rate": 0.00014611693525688066, | |
| "loss": 1.1268, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 0.41551411184315806, | |
| "grad_norm": 0.06982421875, | |
| "learning_rate": 0.00014520771989645563, | |
| "loss": 1.1238, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.4184402675603634, | |
| "grad_norm": 0.07177734375, | |
| "learning_rate": 0.00014429378280936804, | |
| "loss": 1.119, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 0.42136642327756874, | |
| "grad_norm": 0.0703125, | |
| "learning_rate": 0.0001433752194519054, | |
| "loss": 1.1187, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.42429257899477407, | |
| "grad_norm": 0.0712890625, | |
| "learning_rate": 0.00014245212576354682, | |
| "loss": 1.122, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.4272187347119794, | |
| "grad_norm": 0.0751953125, | |
| "learning_rate": 0.0001415245981569424, | |
| "loss": 1.1267, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.43014489042918475, | |
| "grad_norm": 0.06591796875, | |
| "learning_rate": 0.00014059273350784342, | |
| "loss": 1.1273, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 0.4330710461463901, | |
| "grad_norm": 0.076171875, | |
| "learning_rate": 0.00013965662914498428, | |
| "loss": 1.1267, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.4359972018635954, | |
| "grad_norm": 0.06884765625, | |
| "learning_rate": 0.00013871638283991677, | |
| "loss": 1.1175, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 0.43892335758080075, | |
| "grad_norm": 0.06787109375, | |
| "learning_rate": 0.0001377720927967985, | |
| "loss": 1.1211, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.4418495132980061, | |
| "grad_norm": 0.1884765625, | |
| "learning_rate": 0.00013682385764213572, | |
| "loss": 1.1319, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 0.44477566901521143, | |
| "grad_norm": 0.07666015625, | |
| "learning_rate": 0.00013587177641448265, | |
| "loss": 1.1233, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.44770182473241676, | |
| "grad_norm": 0.078125, | |
| "learning_rate": 0.00013491594855409697, | |
| "loss": 1.1385, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 0.4506279804496221, | |
| "grad_norm": 0.07470703125, | |
| "learning_rate": 0.00013395647389255396, | |
| "loss": 1.1189, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.45355413616682744, | |
| "grad_norm": 0.072265625, | |
| "learning_rate": 0.00013299345264231957, | |
| "loss": 1.1157, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.4564802918840328, | |
| "grad_norm": 0.07275390625, | |
| "learning_rate": 0.00013202698538628376, | |
| "loss": 1.1224, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.4594064476012381, | |
| "grad_norm": 0.07080078125, | |
| "learning_rate": 0.00013105717306725501, | |
| "loss": 1.1283, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 0.46233260331844345, | |
| "grad_norm": 0.0703125, | |
| "learning_rate": 0.0001300841169774174, | |
| "loss": 1.131, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.4652587590356488, | |
| "grad_norm": 0.0712890625, | |
| "learning_rate": 0.000129107918747751, | |
| "loss": 1.1175, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 0.4681849147528541, | |
| "grad_norm": 0.07373046875, | |
| "learning_rate": 0.00012812868033741724, | |
| "loss": 1.138, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.47111107047005946, | |
| "grad_norm": 0.07275390625, | |
| "learning_rate": 0.00012714650402310967, | |
| "loss": 1.1344, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 0.47403722618726485, | |
| "grad_norm": 0.07470703125, | |
| "learning_rate": 0.00012616149238837146, | |
| "loss": 1.1195, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.4769633819044702, | |
| "grad_norm": 0.07958984375, | |
| "learning_rate": 0.00012517374831288146, | |
| "loss": 1.1005, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 0.4798895376216755, | |
| "grad_norm": 0.0712890625, | |
| "learning_rate": 0.00012418337496170842, | |
| "loss": 1.1158, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.48281569333888086, | |
| "grad_norm": 0.0712890625, | |
| "learning_rate": 0.00012319047577453638, | |
| "loss": 1.1181, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 0.4857418490560862, | |
| "grad_norm": 0.07177734375, | |
| "learning_rate": 0.00012219515445486054, | |
| "loss": 1.1321, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.48866800477329153, | |
| "grad_norm": 0.0751953125, | |
| "learning_rate": 0.00012119751495915617, | |
| "loss": 1.1309, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 0.49159416049049687, | |
| "grad_norm": 0.06884765625, | |
| "learning_rate": 0.00012019766148602062, | |
| "loss": 1.1276, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.4945203162077022, | |
| "grad_norm": 0.068359375, | |
| "learning_rate": 0.00011919569846529057, | |
| "loss": 1.1173, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 0.49744647192490754, | |
| "grad_norm": 0.0703125, | |
| "learning_rate": 0.00011819173054713466, | |
| "loss": 1.1111, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.5003726276421129, | |
| "grad_norm": 0.0751953125, | |
| "learning_rate": 0.00011718586259112326, | |
| "loss": 1.1137, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 0.5032987833593182, | |
| "grad_norm": 0.0751953125, | |
| "learning_rate": 0.0001161781996552765, | |
| "loss": 1.1157, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.5062249390765236, | |
| "grad_norm": 0.0703125, | |
| "learning_rate": 0.00011516884698509143, | |
| "loss": 1.1136, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 0.5091510947937289, | |
| "grad_norm": 0.07568359375, | |
| "learning_rate": 0.00011415791000254964, | |
| "loss": 1.1217, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.5120772505109342, | |
| "grad_norm": 0.07275390625, | |
| "learning_rate": 0.0001131454942951065, | |
| "loss": 1.119, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 0.5150034062281396, | |
| "grad_norm": 0.0693359375, | |
| "learning_rate": 0.0001121317056046629, | |
| "loss": 1.1122, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.5179295619453449, | |
| "grad_norm": 0.072265625, | |
| "learning_rate": 0.00011111664981652121, | |
| "loss": 1.1137, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 0.5208557176625502, | |
| "grad_norm": 0.0693359375, | |
| "learning_rate": 0.00011010043294832601, | |
| "loss": 1.1132, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.5237818733797556, | |
| "grad_norm": 0.072265625, | |
| "learning_rate": 0.00010908316113899097, | |
| "loss": 1.1373, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 0.5267080290969609, | |
| "grad_norm": 0.0732421875, | |
| "learning_rate": 0.00010806494063761335, | |
| "loss": 1.1165, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.5296341848141662, | |
| "grad_norm": 0.0703125, | |
| "learning_rate": 0.00010704587779237654, | |
| "loss": 1.1149, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 0.5325603405313716, | |
| "grad_norm": 0.06982421875, | |
| "learning_rate": 0.00010602607903944279, | |
| "loss": 1.1244, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.5354864962485769, | |
| "grad_norm": 0.06787109375, | |
| "learning_rate": 0.00010500565089183627, | |
| "loss": 1.1141, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 0.5384126519657823, | |
| "grad_norm": 0.0673828125, | |
| "learning_rate": 0.00010398469992831832, | |
| "loss": 1.1031, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.5413388076829876, | |
| "grad_norm": 0.06787109375, | |
| "learning_rate": 0.00010296333278225599, | |
| "loss": 1.1072, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 0.5442649634001929, | |
| "grad_norm": 0.06982421875, | |
| "learning_rate": 0.00010194165613048444, | |
| "loss": 1.0993, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.5471911191173983, | |
| "grad_norm": 0.07275390625, | |
| "learning_rate": 0.00010091977668216524, | |
| "loss": 1.1089, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 0.5501172748346036, | |
| "grad_norm": 0.068359375, | |
| "learning_rate": 9.989780116764115e-05, | |
| "loss": 1.1042, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.5530434305518089, | |
| "grad_norm": 0.0712890625, | |
| "learning_rate": 9.887583632728845e-05, | |
| "loss": 1.1062, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 0.5559695862690143, | |
| "grad_norm": 0.07080078125, | |
| "learning_rate": 9.785398890036867e-05, | |
| "loss": 1.1092, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.5588957419862196, | |
| "grad_norm": 0.0693359375, | |
| "learning_rate": 9.683236561388e-05, | |
| "loss": 1.1173, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 0.5618218977034249, | |
| "grad_norm": 0.0703125, | |
| "learning_rate": 9.581107317141026e-05, | |
| "loss": 1.1407, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.5647480534206303, | |
| "grad_norm": 0.07666015625, | |
| "learning_rate": 9.479021824199229e-05, | |
| "loss": 1.1365, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 0.5676742091378356, | |
| "grad_norm": 0.07763671875, | |
| "learning_rate": 9.376990744896276e-05, | |
| "loss": 1.1031, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.570600364855041, | |
| "grad_norm": 0.0703125, | |
| "learning_rate": 9.275024735882588e-05, | |
| "loss": 1.0896, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 0.5735265205722463, | |
| "grad_norm": 0.07275390625, | |
| "learning_rate": 9.173134447012322e-05, | |
| "loss": 1.1094, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.5764526762894516, | |
| "grad_norm": 0.0751953125, | |
| "learning_rate": 9.071330520231033e-05, | |
| "loss": 1.1127, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 0.579378832006657, | |
| "grad_norm": 0.0693359375, | |
| "learning_rate": 8.969623588464163e-05, | |
| "loss": 1.1176, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.5823049877238623, | |
| "grad_norm": 0.0703125, | |
| "learning_rate": 8.868024274506505e-05, | |
| "loss": 1.112, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 0.5852311434410676, | |
| "grad_norm": 0.0712890625, | |
| "learning_rate": 8.766543189912705e-05, | |
| "loss": 1.0846, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.588157299158273, | |
| "grad_norm": 0.07177734375, | |
| "learning_rate": 8.665190933888904e-05, | |
| "loss": 1.0961, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 0.5910834548754783, | |
| "grad_norm": 0.0712890625, | |
| "learning_rate": 8.56397809218574e-05, | |
| "loss": 1.1146, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.5940096105926836, | |
| "grad_norm": 0.07568359375, | |
| "learning_rate": 8.4629152359927e-05, | |
| "loss": 1.1066, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 0.596935766309889, | |
| "grad_norm": 0.07080078125, | |
| "learning_rate": 8.362012920834014e-05, | |
| "loss": 1.1253, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.5998619220270943, | |
| "grad_norm": 0.0693359375, | |
| "learning_rate": 8.261281685466177e-05, | |
| "loss": 1.1072, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 0.6027880777442998, | |
| "grad_norm": 0.07373046875, | |
| "learning_rate": 8.160732050777235e-05, | |
| "loss": 1.1147, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.6057142334615051, | |
| "grad_norm": 0.0712890625, | |
| "learning_rate": 8.060374518687926e-05, | |
| "loss": 1.11, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 0.6086403891787104, | |
| "grad_norm": 0.07080078125, | |
| "learning_rate": 7.960219571054799e-05, | |
| "loss": 1.123, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.6115665448959158, | |
| "grad_norm": 0.06787109375, | |
| "learning_rate": 7.860277668575449e-05, | |
| "loss": 1.1035, | |
| "step": 1045 | |
| }, | |
| { | |
| "epoch": 0.6144927006131211, | |
| "grad_norm": 0.0712890625, | |
| "learning_rate": 7.76055924969594e-05, | |
| "loss": 1.09, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.6174188563303264, | |
| "grad_norm": 0.0712890625, | |
| "learning_rate": 7.661074729520548e-05, | |
| "loss": 1.1279, | |
| "step": 1055 | |
| }, | |
| { | |
| "epoch": 0.6203450120475318, | |
| "grad_norm": 0.072265625, | |
| "learning_rate": 7.561834498723974e-05, | |
| "loss": 1.1141, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.6232711677647371, | |
| "grad_norm": 0.068359375, | |
| "learning_rate": 7.462848922466092e-05, | |
| "loss": 1.1102, | |
| "step": 1065 | |
| }, | |
| { | |
| "epoch": 0.6261973234819425, | |
| "grad_norm": 0.0673828125, | |
| "learning_rate": 7.364128339309326e-05, | |
| "loss": 1.1128, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.6291234791991478, | |
| "grad_norm": 0.07275390625, | |
| "learning_rate": 7.265683060138868e-05, | |
| "loss": 1.1054, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 0.6320496349163531, | |
| "grad_norm": 0.07666015625, | |
| "learning_rate": 7.167523367085749e-05, | |
| "loss": 1.1097, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.6349757906335585, | |
| "grad_norm": 0.0673828125, | |
| "learning_rate": 7.069659512452918e-05, | |
| "loss": 1.1148, | |
| "step": 1085 | |
| }, | |
| { | |
| "epoch": 0.6379019463507638, | |
| "grad_norm": 0.06982421875, | |
| "learning_rate": 6.972101717644429e-05, | |
| "loss": 1.0997, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.6408281020679691, | |
| "grad_norm": 0.07080078125, | |
| "learning_rate": 6.874860172097883e-05, | |
| "loss": 1.097, | |
| "step": 1095 | |
| }, | |
| { | |
| "epoch": 0.6437542577851745, | |
| "grad_norm": 0.07177734375, | |
| "learning_rate": 6.777945032220187e-05, | |
| "loss": 1.1006, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.6466804135023798, | |
| "grad_norm": 0.0693359375, | |
| "learning_rate": 6.681366420326747e-05, | |
| "loss": 1.1191, | |
| "step": 1105 | |
| }, | |
| { | |
| "epoch": 0.6496065692195852, | |
| "grad_norm": 0.162109375, | |
| "learning_rate": 6.58513442358427e-05, | |
| "loss": 1.0901, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.6525327249367905, | |
| "grad_norm": 0.0712890625, | |
| "learning_rate": 6.489259092957193e-05, | |
| "loss": 1.1113, | |
| "step": 1115 | |
| }, | |
| { | |
| "epoch": 0.6554588806539958, | |
| "grad_norm": 0.0751953125, | |
| "learning_rate": 6.3937504421579e-05, | |
| "loss": 1.0945, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.6583850363712012, | |
| "grad_norm": 0.06884765625, | |
| "learning_rate": 6.298618446600856e-05, | |
| "loss": 1.1073, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 0.6613111920884065, | |
| "grad_norm": 0.0693359375, | |
| "learning_rate": 6.203873042360722e-05, | |
| "loss": 1.1178, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.6642373478056118, | |
| "grad_norm": 0.072265625, | |
| "learning_rate": 6.109524125134571e-05, | |
| "loss": 1.1291, | |
| "step": 1135 | |
| }, | |
| { | |
| "epoch": 0.6671635035228172, | |
| "grad_norm": 0.07080078125, | |
| "learning_rate": 6.015581549208322e-05, | |
| "loss": 1.0985, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.6700896592400225, | |
| "grad_norm": 0.06884765625, | |
| "learning_rate": 5.9220551264275356e-05, | |
| "loss": 1.1294, | |
| "step": 1145 | |
| }, | |
| { | |
| "epoch": 0.6730158149572278, | |
| "grad_norm": 0.06884765625, | |
| "learning_rate": 5.828954625172597e-05, | |
| "loss": 1.115, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.6759419706744332, | |
| "grad_norm": 0.06689453125, | |
| "learning_rate": 5.736289769338441e-05, | |
| "loss": 1.1024, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 0.6788681263916385, | |
| "grad_norm": 0.0703125, | |
| "learning_rate": 5.644070237318977e-05, | |
| "loss": 1.0993, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.6817942821088439, | |
| "grad_norm": 0.0693359375, | |
| "learning_rate": 5.552305660996202e-05, | |
| "loss": 1.1172, | |
| "step": 1165 | |
| }, | |
| { | |
| "epoch": 0.6847204378260492, | |
| "grad_norm": 0.0693359375, | |
| "learning_rate": 5.4610056247341814e-05, | |
| "loss": 1.0988, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.6876465935432545, | |
| "grad_norm": 0.0712890625, | |
| "learning_rate": 5.3701796643780524e-05, | |
| "loss": 1.1142, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 0.6905727492604599, | |
| "grad_norm": 0.0693359375, | |
| "learning_rate": 5.279837266258016e-05, | |
| "loss": 1.1271, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.6934989049776652, | |
| "grad_norm": 0.06884765625, | |
| "learning_rate": 5.189987866198548e-05, | |
| "loss": 1.1055, | |
| "step": 1185 | |
| }, | |
| { | |
| "epoch": 0.6964250606948705, | |
| "grad_norm": 0.06982421875, | |
| "learning_rate": 5.100640848532878e-05, | |
| "loss": 1.1277, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.6993512164120759, | |
| "grad_norm": 0.06787109375, | |
| "learning_rate": 5.011805545122826e-05, | |
| "loss": 1.1093, | |
| "step": 1195 | |
| }, | |
| { | |
| "epoch": 0.7022773721292812, | |
| "grad_norm": 0.06884765625, | |
| "learning_rate": 4.923491234384158e-05, | |
| "loss": 1.1055, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.7052035278464865, | |
| "grad_norm": 0.0693359375, | |
| "learning_rate": 4.8357071403174746e-05, | |
| "loss": 1.098, | |
| "step": 1205 | |
| }, | |
| { | |
| "epoch": 0.7081296835636919, | |
| "grad_norm": 0.06689453125, | |
| "learning_rate": 4.748462431544826e-05, | |
| "loss": 1.114, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.7110558392808972, | |
| "grad_norm": 0.06884765625, | |
| "learning_rate": 4.661766220352097e-05, | |
| "loss": 1.1073, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 0.7139819949981026, | |
| "grad_norm": 0.06982421875, | |
| "learning_rate": 4.5756275617372465e-05, | |
| "loss": 1.1121, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.7169081507153079, | |
| "grad_norm": 0.0693359375, | |
| "learning_rate": 4.490055452464594e-05, | |
| "loss": 1.12, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 0.7198343064325132, | |
| "grad_norm": 0.06787109375, | |
| "learning_rate": 4.405058830125137e-05, | |
| "loss": 1.092, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.7227604621497186, | |
| "grad_norm": 0.06640625, | |
| "learning_rate": 4.320646572203033e-05, | |
| "loss": 1.0998, | |
| "step": 1235 | |
| }, | |
| { | |
| "epoch": 0.7256866178669239, | |
| "grad_norm": 0.06982421875, | |
| "learning_rate": 4.236827495148443e-05, | |
| "loss": 1.0993, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.7286127735841292, | |
| "grad_norm": 0.068359375, | |
| "learning_rate": 4.153610353456654e-05, | |
| "loss": 1.1323, | |
| "step": 1245 | |
| }, | |
| { | |
| "epoch": 0.7315389293013346, | |
| "grad_norm": 0.06982421875, | |
| "learning_rate": 4.071003838753737e-05, | |
| "loss": 1.1264, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.7344650850185399, | |
| "grad_norm": 0.0693359375, | |
| "learning_rate": 3.9890165788887365e-05, | |
| "loss": 1.1057, | |
| "step": 1255 | |
| }, | |
| { | |
| "epoch": 0.7373912407357452, | |
| "grad_norm": 0.06884765625, | |
| "learning_rate": 3.9076571370325364e-05, | |
| "loss": 1.1119, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.7403173964529506, | |
| "grad_norm": 0.0712890625, | |
| "learning_rate": 3.82693401078349e-05, | |
| "loss": 1.0996, | |
| "step": 1265 | |
| }, | |
| { | |
| "epoch": 0.7432435521701559, | |
| "grad_norm": 0.06787109375, | |
| "learning_rate": 3.7468556312798685e-05, | |
| "loss": 1.1051, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.7461697078873613, | |
| "grad_norm": 0.06787109375, | |
| "learning_rate": 3.667430362319277e-05, | |
| "loss": 1.0959, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 0.7490958636045666, | |
| "grad_norm": 0.068359375, | |
| "learning_rate": 3.588666499485115e-05, | |
| "loss": 1.1129, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.7520220193217719, | |
| "grad_norm": 0.0693359375, | |
| "learning_rate": 3.510572269280097e-05, | |
| "loss": 1.1184, | |
| "step": 1285 | |
| }, | |
| { | |
| "epoch": 0.7549481750389773, | |
| "grad_norm": 0.06787109375, | |
| "learning_rate": 3.433155828267089e-05, | |
| "loss": 1.1003, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.7578743307561826, | |
| "grad_norm": 0.06787109375, | |
| "learning_rate": 3.356425262217164e-05, | |
| "loss": 1.106, | |
| "step": 1295 | |
| }, | |
| { | |
| "epoch": 0.7608004864733879, | |
| "grad_norm": 0.06787109375, | |
| "learning_rate": 3.280388585265075e-05, | |
| "loss": 1.1066, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.7637266421905933, | |
| "grad_norm": 0.06689453125, | |
| "learning_rate": 3.205053739072248e-05, | |
| "loss": 1.1026, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 0.7666527979077986, | |
| "grad_norm": 0.06787109375, | |
| "learning_rate": 3.130428591997282e-05, | |
| "loss": 1.1256, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.769578953625004, | |
| "grad_norm": 0.0673828125, | |
| "learning_rate": 3.0565209382741664e-05, | |
| "loss": 1.1018, | |
| "step": 1315 | |
| }, | |
| { | |
| "epoch": 0.7725051093422093, | |
| "grad_norm": 0.068359375, | |
| "learning_rate": 2.9833384971981838e-05, | |
| "loss": 1.1099, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.7754312650594147, | |
| "grad_norm": 0.0673828125, | |
| "learning_rate": 2.9108889123196824e-05, | |
| "loss": 1.0995, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 0.7783574207766201, | |
| "grad_norm": 0.0673828125, | |
| "learning_rate": 2.839179750645752e-05, | |
| "loss": 1.1194, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.7812835764938254, | |
| "grad_norm": 0.0693359375, | |
| "learning_rate": 2.768218501849862e-05, | |
| "loss": 1.0955, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 0.7842097322110307, | |
| "grad_norm": 0.06591796875, | |
| "learning_rate": 2.6980125774896238e-05, | |
| "loss": 1.0712, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.7871358879282361, | |
| "grad_norm": 0.0693359375, | |
| "learning_rate": 2.6285693102326868e-05, | |
| "loss": 1.1019, | |
| "step": 1345 | |
| }, | |
| { | |
| "epoch": 0.7900620436454414, | |
| "grad_norm": 0.0673828125, | |
| "learning_rate": 2.559895953090856e-05, | |
| "loss": 1.1022, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.7929881993626468, | |
| "grad_norm": 0.06787109375, | |
| "learning_rate": 2.491999678662582e-05, | |
| "loss": 1.1027, | |
| "step": 1355 | |
| }, | |
| { | |
| "epoch": 0.7959143550798521, | |
| "grad_norm": 0.06787109375, | |
| "learning_rate": 2.4248875783837987e-05, | |
| "loss": 1.1292, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.7988405107970574, | |
| "grad_norm": 0.0693359375, | |
| "learning_rate": 2.358566661787257e-05, | |
| "loss": 1.1117, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 0.8017666665142628, | |
| "grad_norm": 0.06787109375, | |
| "learning_rate": 2.293043855770416e-05, | |
| "loss": 1.1176, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.8046928222314681, | |
| "grad_norm": 0.06982421875, | |
| "learning_rate": 2.2283260038719646e-05, | |
| "loss": 1.1074, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 0.8076189779486734, | |
| "grad_norm": 0.06689453125, | |
| "learning_rate": 2.1644198655570504e-05, | |
| "loss": 1.1123, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.8105451336658788, | |
| "grad_norm": 0.068359375, | |
| "learning_rate": 2.1013321155112754e-05, | |
| "loss": 1.0979, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 0.8134712893830841, | |
| "grad_norm": 0.068359375, | |
| "learning_rate": 2.0390693429435627e-05, | |
| "loss": 1.1102, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.8163974451002894, | |
| "grad_norm": 0.06884765625, | |
| "learning_rate": 1.977638050897954e-05, | |
| "loss": 1.1133, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 0.8193236008174948, | |
| "grad_norm": 0.0673828125, | |
| "learning_rate": 1.917044655574387e-05, | |
| "loss": 1.1045, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.8222497565347001, | |
| "grad_norm": 0.0673828125, | |
| "learning_rate": 1.8572954856585535e-05, | |
| "loss": 1.0967, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 0.8251759122519055, | |
| "grad_norm": 0.0673828125, | |
| "learning_rate": 1.798396781660914e-05, | |
| "loss": 1.1199, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.8281020679691108, | |
| "grad_norm": 0.06884765625, | |
| "learning_rate": 1.7403546952648885e-05, | |
| "loss": 1.1039, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 0.8310282236863161, | |
| "grad_norm": 0.06787109375, | |
| "learning_rate": 1.6831752886843512e-05, | |
| "loss": 1.1106, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.8339543794035215, | |
| "grad_norm": 0.06982421875, | |
| "learning_rate": 1.626864534030469e-05, | |
| "loss": 1.106, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 0.8368805351207268, | |
| "grad_norm": 0.0673828125, | |
| "learning_rate": 1.571428312687928e-05, | |
| "loss": 1.1004, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.8398066908379321, | |
| "grad_norm": 0.0693359375, | |
| "learning_rate": 1.5168724147006652e-05, | |
| "loss": 1.1244, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 0.8427328465551375, | |
| "grad_norm": 0.0673828125, | |
| "learning_rate": 1.4632025381671133e-05, | |
| "loss": 1.1227, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.8456590022723428, | |
| "grad_norm": 0.0673828125, | |
| "learning_rate": 1.4104242886450824e-05, | |
| "loss": 1.1073, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 0.8485851579895481, | |
| "grad_norm": 0.07568359375, | |
| "learning_rate": 1.3585431785662627e-05, | |
| "loss": 1.0903, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.8515113137067535, | |
| "grad_norm": 0.06884765625, | |
| "learning_rate": 1.3075646266604913e-05, | |
| "loss": 1.1129, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 0.8544374694239588, | |
| "grad_norm": 0.06982421875, | |
| "learning_rate": 1.257493957389796e-05, | |
| "loss": 1.1293, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.8573636251411642, | |
| "grad_norm": 0.0693359375, | |
| "learning_rate": 1.208336400392268e-05, | |
| "loss": 1.0987, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 0.8602897808583695, | |
| "grad_norm": 0.06689453125, | |
| "learning_rate": 1.1600970899358588e-05, | |
| "loss": 1.1044, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.8632159365755748, | |
| "grad_norm": 0.0693359375, | |
| "learning_rate": 1.1127810643821401e-05, | |
| "loss": 1.1182, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 0.8661420922927802, | |
| "grad_norm": 0.0673828125, | |
| "learning_rate": 1.0663932656600505e-05, | |
| "loss": 1.0957, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.8690682480099855, | |
| "grad_norm": 0.06689453125, | |
| "learning_rate": 1.0209385387497517e-05, | |
| "loss": 1.1238, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 0.8719944037271908, | |
| "grad_norm": 0.068359375, | |
| "learning_rate": 9.764216311765905e-06, | |
| "loss": 1.1209, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.8749205594443962, | |
| "grad_norm": 0.06591796875, | |
| "learning_rate": 9.328471925152381e-06, | |
| "loss": 1.1046, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 0.8778467151616015, | |
| "grad_norm": 0.06787109375, | |
| "learning_rate": 8.902197739040708e-06, | |
| "loss": 1.1205, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.8807728708788068, | |
| "grad_norm": 0.06787109375, | |
| "learning_rate": 8.485438275698154e-06, | |
| "loss": 1.1202, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 0.8836990265960122, | |
| "grad_norm": 0.07080078125, | |
| "learning_rate": 8.078237063625538e-06, | |
| "loss": 1.1177, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.8866251823132175, | |
| "grad_norm": 0.0673828125, | |
| "learning_rate": 7.680636633010695e-06, | |
| "loss": 1.1116, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 0.8895513380304229, | |
| "grad_norm": 0.0712890625, | |
| "learning_rate": 7.292678511286522e-06, | |
| "loss": 1.1067, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.8924774937476282, | |
| "grad_norm": 0.06884765625, | |
| "learning_rate": 6.914403218793608e-06, | |
| "loss": 1.12, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 0.8954036494648335, | |
| "grad_norm": 0.0693359375, | |
| "learning_rate": 6.5458502645480924e-06, | |
| "loss": 1.1298, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.8983298051820389, | |
| "grad_norm": 0.0673828125, | |
| "learning_rate": 6.187058142115077e-06, | |
| "loss": 1.1069, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 0.9012559608992442, | |
| "grad_norm": 0.06640625, | |
| "learning_rate": 5.838064325588288e-06, | |
| "loss": 1.0941, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.9041821166164495, | |
| "grad_norm": 0.06640625, | |
| "learning_rate": 5.498905265675958e-06, | |
| "loss": 1.0976, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 0.9071082723336549, | |
| "grad_norm": 0.06787109375, | |
| "learning_rate": 5.169616385893794e-06, | |
| "loss": 1.101, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.9100344280508602, | |
| "grad_norm": 0.0654296875, | |
| "learning_rate": 4.850232078865169e-06, | |
| "loss": 1.121, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 0.9129605837680655, | |
| "grad_norm": 0.068359375, | |
| "learning_rate": 4.5407857027289555e-06, | |
| "loss": 1.1013, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.9158867394852709, | |
| "grad_norm": 0.0693359375, | |
| "learning_rate": 4.241309577655406e-06, | |
| "loss": 1.1464, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 0.9188128952024762, | |
| "grad_norm": 0.0654296875, | |
| "learning_rate": 3.951834982470526e-06, | |
| "loss": 1.1111, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.9217390509196816, | |
| "grad_norm": 0.06640625, | |
| "learning_rate": 3.672392151389137e-06, | |
| "loss": 1.1078, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 0.9246652066368869, | |
| "grad_norm": 0.07080078125, | |
| "learning_rate": 3.4030102708570212e-06, | |
| "loss": 1.1195, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.9275913623540922, | |
| "grad_norm": 0.0703125, | |
| "learning_rate": 3.143717476502572e-06, | |
| "loss": 1.1216, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 0.9305175180712976, | |
| "grad_norm": 0.0673828125, | |
| "learning_rate": 2.8945408501981906e-06, | |
| "loss": 1.0936, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.9334436737885029, | |
| "grad_norm": 0.06787109375, | |
| "learning_rate": 2.6555064172316234e-06, | |
| "loss": 1.106, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 0.9363698295057082, | |
| "grad_norm": 0.068359375, | |
| "learning_rate": 2.4266391435878387e-06, | |
| "loss": 1.1061, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.9392959852229136, | |
| "grad_norm": 0.068359375, | |
| "learning_rate": 2.2079629333414453e-06, | |
| "loss": 1.1119, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 0.9422221409401189, | |
| "grad_norm": 0.06787109375, | |
| "learning_rate": 1.999500626159967e-06, | |
| "loss": 1.1094, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.9451482966573242, | |
| "grad_norm": 0.06884765625, | |
| "learning_rate": 1.8012739949183844e-06, | |
| "loss": 1.1141, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 0.9480744523745297, | |
| "grad_norm": 0.06591796875, | |
| "learning_rate": 1.6133037434250985e-06, | |
| "loss": 1.1084, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.951000608091735, | |
| "grad_norm": 0.068359375, | |
| "learning_rate": 1.4356095042594386e-06, | |
| "loss": 1.1208, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 0.9539267638089404, | |
| "grad_norm": 0.0693359375, | |
| "learning_rate": 1.2682098367212237e-06, | |
| "loss": 1.1116, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.9568529195261457, | |
| "grad_norm": 0.06640625, | |
| "learning_rate": 1.1111222248922471e-06, | |
| "loss": 1.1047, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 0.959779075243351, | |
| "grad_norm": 0.0673828125, | |
| "learning_rate": 9.643630758102484e-07, | |
| "loss": 1.0998, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.9627052309605564, | |
| "grad_norm": 0.0654296875, | |
| "learning_rate": 8.279477177551842e-07, | |
| "loss": 1.1073, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 0.9656313866777617, | |
| "grad_norm": 0.0673828125, | |
| "learning_rate": 7.018903986483083e-07, | |
| "loss": 1.1124, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.968557542394967, | |
| "grad_norm": 0.0654296875, | |
| "learning_rate": 5.862042845640403e-07, | |
| "loss": 1.1023, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 0.9714836981121724, | |
| "grad_norm": 0.06396484375, | |
| "learning_rate": 4.809014583548432e-07, | |
| "loss": 1.1234, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.9744098538293777, | |
| "grad_norm": 0.06982421875, | |
| "learning_rate": 3.859929183892108e-07, | |
| "loss": 1.111, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 0.9773360095465831, | |
| "grad_norm": 0.06787109375, | |
| "learning_rate": 3.014885774029419e-07, | |
| "loss": 1.1179, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.9802621652637884, | |
| "grad_norm": 0.064453125, | |
| "learning_rate": 2.2739726146381311e-07, | |
| "loss": 1.1082, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 0.9831883209809937, | |
| "grad_norm": 0.06982421875, | |
| "learning_rate": 1.6372670904974963e-07, | |
| "loss": 1.14, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.9861144766981991, | |
| "grad_norm": 0.06591796875, | |
| "learning_rate": 1.1048357024054934e-07, | |
| "loss": 1.1126, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 0.9890406324154044, | |
| "grad_norm": 0.06689453125, | |
| "learning_rate": 6.76734060233275e-08, | |
| "loss": 1.1087, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.9919667881326097, | |
| "grad_norm": 0.06591796875, | |
| "learning_rate": 3.5300687711703475e-08, | |
| "loss": 1.1235, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 0.9948929438498151, | |
| "grad_norm": 0.0654296875, | |
| "learning_rate": 1.3368796478807621e-08, | |
| "loss": 1.1019, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.9978190995670204, | |
| "grad_norm": 0.06787109375, | |
| "learning_rate": 1.8800230040860733e-09, | |
| "loss": 1.1143, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 0.9995747929973436, | |
| "eval_loss": 1.278271198272705, | |
| "eval_runtime": 1249.6014, | |
| "eval_samples_per_second": 12.937, | |
| "eval_steps_per_second": 12.937, | |
| "step": 1708 | |
| }, | |
| { | |
| "epoch": 0.9995747929973436, | |
| "step": 1708, | |
| "total_flos": 2.8130589802160456e+18, | |
| "train_loss": 0.978924176871637, | |
| "train_runtime": 53778.1468, | |
| "train_samples_per_second": 4.067, | |
| "train_steps_per_second": 0.032 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1708, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 20, | |
| "total_flos": 2.8130589802160456e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |