{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 2860,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.017497812773403325,
      "grad_norm": 4.1875,
      "learning_rate": 2.0930232558139536e-06,
      "loss": 1.0581,
      "step": 10
    },
    {
      "epoch": 0.03499562554680665,
      "grad_norm": 3.328125,
      "learning_rate": 4.418604651162791e-06,
      "loss": 1.0503,
      "step": 20
    },
    {
      "epoch": 0.05249343832020997,
      "grad_norm": 2.71875,
      "learning_rate": 6.744186046511628e-06,
      "loss": 0.9804,
      "step": 30
    },
    {
      "epoch": 0.0699912510936133,
      "grad_norm": 1.4296875,
      "learning_rate": 9.069767441860465e-06,
      "loss": 0.8786,
      "step": 40
    },
    {
      "epoch": 0.08748906386701662,
      "grad_norm": 1.0,
      "learning_rate": 1.1395348837209304e-05,
      "loss": 0.8081,
      "step": 50
    },
    {
      "epoch": 0.10498687664041995,
      "grad_norm": 0.8203125,
      "learning_rate": 1.372093023255814e-05,
      "loss": 0.8047,
      "step": 60
    },
    {
      "epoch": 0.12248468941382328,
      "grad_norm": 0.765625,
      "learning_rate": 1.6046511627906977e-05,
      "loss": 0.7453,
      "step": 70
    },
    {
      "epoch": 0.1399825021872266,
      "grad_norm": 0.8125,
      "learning_rate": 1.8372093023255815e-05,
      "loss": 0.7581,
      "step": 80
    },
    {
      "epoch": 0.15748031496062992,
      "grad_norm": 0.734375,
      "learning_rate": 2.0697674418604654e-05,
      "loss": 0.7399,
      "step": 90
    },
    {
      "epoch": 0.17497812773403323,
      "grad_norm": 0.71875,
      "learning_rate": 2.3023255813953492e-05,
      "loss": 0.7476,
      "step": 100
    },
    {
      "epoch": 0.19247594050743658,
      "grad_norm": 0.7109375,
      "learning_rate": 2.5348837209302327e-05,
      "loss": 0.7247,
      "step": 110
    },
    {
      "epoch": 0.2099737532808399,
      "grad_norm": 0.81640625,
      "learning_rate": 2.7674418604651166e-05,
      "loss": 0.7179,
      "step": 120
    },
    {
      "epoch": 0.2274715660542432,
      "grad_norm": 0.77734375,
      "learning_rate": 3.0000000000000004e-05,
      "loss": 0.6949,
      "step": 130
    },
    {
      "epoch": 0.24496937882764655,
      "grad_norm": 0.82421875,
      "learning_rate": 3.232558139534884e-05,
      "loss": 0.7043,
      "step": 140
    },
    {
      "epoch": 0.26246719160104987,
      "grad_norm": 0.69921875,
      "learning_rate": 3.4651162790697674e-05,
      "loss": 0.6934,
      "step": 150
    },
    {
      "epoch": 0.2799650043744532,
      "grad_norm": 0.6796875,
      "learning_rate": 3.697674418604651e-05,
      "loss": 0.6993,
      "step": 160
    },
    {
      "epoch": 0.2974628171478565,
      "grad_norm": 0.66015625,
      "learning_rate": 3.930232558139535e-05,
      "loss": 0.7027,
      "step": 170
    },
    {
      "epoch": 0.31496062992125984,
      "grad_norm": 0.66015625,
      "learning_rate": 3.9999858083836754e-05,
      "loss": 0.6935,
      "step": 180
    },
    {
      "epoch": 0.3324584426946632,
      "grad_norm": 0.6875,
      "learning_rate": 3.999916298964876e-05,
      "loss": 0.7137,
      "step": 190
    },
    {
      "epoch": 0.34995625546806647,
      "grad_norm": 0.7578125,
      "learning_rate": 3.999788867354247e-05,
      "loss": 0.6916,
      "step": 200
    },
    {
      "epoch": 0.3674540682414698,
      "grad_norm": 0.69921875,
      "learning_rate": 3.9996035176526014e-05,
      "loss": 0.6947,
      "step": 210
    },
    {
      "epoch": 0.38495188101487315,
      "grad_norm": 0.77734375,
      "learning_rate": 3.9993602558245834e-05,
      "loss": 0.7045,
      "step": 220
    },
    {
      "epoch": 0.40244969378827644,
      "grad_norm": 0.6484375,
      "learning_rate": 3.9990590896984766e-05,
      "loss": 0.7016,
      "step": 230
    },
    {
      "epoch": 0.4199475065616798,
      "grad_norm": 0.6328125,
      "learning_rate": 3.998700028965957e-05,
      "loss": 0.7072,
      "step": 240
    },
    {
      "epoch": 0.4374453193350831,
      "grad_norm": 0.65234375,
      "learning_rate": 3.9982830851817736e-05,
      "loss": 0.6986,
      "step": 250
    },
    {
      "epoch": 0.4549431321084864,
      "grad_norm": 0.66015625,
      "learning_rate": 3.997808271763385e-05,
      "loss": 0.6804,
      "step": 260
    },
    {
      "epoch": 0.47244094488188976,
      "grad_norm": 0.6640625,
      "learning_rate": 3.997275603990519e-05,
      "loss": 0.7016,
      "step": 270
    },
    {
      "epoch": 0.4899387576552931,
      "grad_norm": 0.703125,
      "learning_rate": 3.996685099004686e-05,
      "loss": 0.6921,
      "step": 280
    },
    {
      "epoch": 0.5074365704286964,
      "grad_norm": 0.6875,
      "learning_rate": 3.996036775808628e-05,
      "loss": 0.6857,
      "step": 290
    },
    {
      "epoch": 0.5249343832020997,
      "grad_norm": 0.6875,
      "learning_rate": 3.995330655265704e-05,
      "loss": 0.6967,
      "step": 300
    },
    {
      "epoch": 0.5424321959755031,
      "grad_norm": 0.72265625,
      "learning_rate": 3.994566760099221e-05,
      "loss": 0.6969,
      "step": 310
    },
    {
      "epoch": 0.5599300087489064,
      "grad_norm": 0.59765625,
      "learning_rate": 3.9937451148917e-05,
      "loss": 0.6862,
      "step": 320
    },
    {
      "epoch": 0.5774278215223098,
      "grad_norm": 0.625,
      "learning_rate": 3.992865746084089e-05,
      "loss": 0.695,
      "step": 330
    },
    {
      "epoch": 0.594925634295713,
      "grad_norm": 0.62109375,
      "learning_rate": 3.991928681974908e-05,
      "loss": 0.7036,
      "step": 340
    },
    {
      "epoch": 0.6124234470691163,
      "grad_norm": 0.734375,
      "learning_rate": 3.9909339527193416e-05,
      "loss": 0.7064,
      "step": 350
    },
    {
      "epoch": 0.6299212598425197,
      "grad_norm": 0.6171875,
      "learning_rate": 3.989881590328265e-05,
      "loss": 0.6768,
      "step": 360
    },
    {
      "epoch": 0.647419072615923,
      "grad_norm": 0.6640625,
      "learning_rate": 3.98877162866722e-05,
      "loss": 0.7161,
      "step": 370
    },
    {
      "epoch": 0.6649168853893264,
      "grad_norm": 0.625,
      "learning_rate": 3.9876041034553165e-05,
      "loss": 0.6731,
      "step": 380
    },
    {
      "epoch": 0.6824146981627297,
      "grad_norm": 0.65625,
      "learning_rate": 3.9863790522640926e-05,
      "loss": 0.6905,
      "step": 390
    },
    {
      "epoch": 0.6999125109361329,
      "grad_norm": 0.66796875,
      "learning_rate": 3.9850965145162954e-05,
      "loss": 0.7067,
      "step": 400
    },
    {
      "epoch": 0.7174103237095363,
      "grad_norm": 0.6015625,
      "learning_rate": 3.983756531484622e-05,
      "loss": 0.6982,
      "step": 410
    },
    {
      "epoch": 0.7349081364829396,
      "grad_norm": 0.59375,
      "learning_rate": 3.9823591462903856e-05,
      "loss": 0.6932,
      "step": 420
    },
    {
      "epoch": 0.752405949256343,
      "grad_norm": 0.6953125,
      "learning_rate": 3.9809044039021295e-05,
      "loss": 0.7004,
      "step": 430
    },
    {
      "epoch": 0.7699037620297463,
      "grad_norm": 0.63671875,
      "learning_rate": 3.979392351134178e-05,
      "loss": 0.7002,
      "step": 440
    },
    {
      "epoch": 0.7874015748031497,
      "grad_norm": 0.63671875,
      "learning_rate": 3.977823036645132e-05,
      "loss": 0.7065,
      "step": 450
    },
    {
      "epoch": 0.8048993875765529,
      "grad_norm": 0.62109375,
      "learning_rate": 3.976196510936307e-05,
      "loss": 0.692,
      "step": 460
    },
    {
      "epoch": 0.8223972003499562,
      "grad_norm": 0.58984375,
      "learning_rate": 3.9745128263500976e-05,
      "loss": 0.6847,
      "step": 470
    },
    {
      "epoch": 0.8398950131233596,
      "grad_norm": 0.75,
      "learning_rate": 3.972772037068303e-05,
      "loss": 0.6868,
      "step": 480
    },
    {
      "epoch": 0.8573928258967629,
      "grad_norm": 0.69140625,
      "learning_rate": 3.970974199110378e-05,
      "loss": 0.6727,
      "step": 490
    },
    {
      "epoch": 0.8748906386701663,
      "grad_norm": 0.54296875,
      "learning_rate": 3.9691193703316336e-05,
      "loss": 0.69,
      "step": 500
    },
    {
      "epoch": 0.8923884514435696,
      "grad_norm": 0.578125,
      "learning_rate": 3.9672076104213706e-05,
      "loss": 0.6696,
      "step": 510
    },
    {
      "epoch": 0.9098862642169728,
      "grad_norm": 0.61328125,
      "learning_rate": 3.965238980900965e-05,
      "loss": 0.6852,
      "step": 520
    },
    {
      "epoch": 0.9273840769903762,
      "grad_norm": 0.6328125,
      "learning_rate": 3.963213545121881e-05,
      "loss": 0.6852,
      "step": 530
    },
    {
      "epoch": 0.9448818897637795,
      "grad_norm": 0.63671875,
      "learning_rate": 3.9611313682636395e-05,
      "loss": 0.6907,
      "step": 540
    },
    {
      "epoch": 0.9623797025371829,
      "grad_norm": 0.59375,
      "learning_rate": 3.9589925173317165e-05,
      "loss": 0.6753,
      "step": 550
    },
    {
      "epoch": 0.9798775153105862,
      "grad_norm": 0.63671875,
      "learning_rate": 3.956797061155386e-05,
      "loss": 0.6796,
      "step": 560
    },
    {
      "epoch": 0.9973753280839895,
      "grad_norm": 0.640625,
      "learning_rate": 3.954545070385508e-05,
      "loss": 0.6875,
      "step": 570
    },
    {
      "epoch": 1.0139982502187226,
      "grad_norm": 0.59765625,
      "learning_rate": 3.952236617492253e-05,
      "loss": 0.6204,
      "step": 580
    },
    {
      "epoch": 1.031496062992126,
      "grad_norm": 0.59765625,
      "learning_rate": 3.94987177676277e-05,
      "loss": 0.5968,
      "step": 590
    },
    {
      "epoch": 1.0489938757655293,
      "grad_norm": 0.671875,
      "learning_rate": 3.947450624298799e-05,
      "loss": 0.6147,
      "step": 600
    },
    {
      "epoch": 1.0664916885389326,
      "grad_norm": 0.65234375,
      "learning_rate": 3.944973238014215e-05,
      "loss": 0.6052,
      "step": 610
    },
    {
      "epoch": 1.083989501312336,
      "grad_norm": 0.6640625,
      "learning_rate": 3.942439697632528e-05,
      "loss": 0.6152,
      "step": 620
    },
    {
      "epoch": 1.1014873140857393,
      "grad_norm": 0.62109375,
      "learning_rate": 3.9398500846843136e-05,
      "loss": 0.622,
      "step": 630
    },
    {
      "epoch": 1.1189851268591426,
      "grad_norm": 0.578125,
      "learning_rate": 3.9372044825045905e-05,
      "loss": 0.6361,
      "step": 640
    },
    {
      "epoch": 1.136482939632546,
      "grad_norm": 0.5703125,
      "learning_rate": 3.934502976230138e-05,
      "loss": 0.6015,
      "step": 650
    },
    {
      "epoch": 1.1539807524059493,
      "grad_norm": 0.6171875,
      "learning_rate": 3.9317456527967566e-05,
      "loss": 0.6231,
      "step": 660
    },
    {
      "epoch": 1.1714785651793527,
      "grad_norm": 0.640625,
      "learning_rate": 3.92893260093647e-05,
      "loss": 0.6325,
      "step": 670
    },
    {
      "epoch": 1.188976377952756,
      "grad_norm": 0.58203125,
      "learning_rate": 3.926063911174673e-05,
      "loss": 0.6,
      "step": 680
    },
    {
      "epoch": 1.2064741907261591,
      "grad_norm": 0.58984375,
      "learning_rate": 3.92313967582721e-05,
      "loss": 0.6158,
      "step": 690
    },
    {
      "epoch": 1.2239720034995625,
      "grad_norm": 0.62109375,
      "learning_rate": 3.9201599889974155e-05,
      "loss": 0.6184,
      "step": 700
    },
    {
      "epoch": 1.2414698162729658,
      "grad_norm": 0.6015625,
      "learning_rate": 3.917124946573076e-05,
      "loss": 0.6159,
      "step": 710
    },
    {
      "epoch": 1.2589676290463692,
      "grad_norm": 0.57421875,
      "learning_rate": 3.91403464622335e-05,
      "loss": 0.6221,
      "step": 720
    },
    {
      "epoch": 1.2764654418197725,
      "grad_norm": 0.640625,
      "learning_rate": 3.910889187395622e-05,
      "loss": 0.6393,
      "step": 730
    },
    {
      "epoch": 1.2939632545931758,
      "grad_norm": 0.6484375,
      "learning_rate": 3.9076886713123056e-05,
      "loss": 0.6168,
      "step": 740
    },
    {
      "epoch": 1.3114610673665792,
      "grad_norm": 0.578125,
      "learning_rate": 3.9044332009675816e-05,
      "loss": 0.611,
      "step": 750
    },
    {
      "epoch": 1.3289588801399825,
      "grad_norm": 0.66796875,
      "learning_rate": 3.901122881124087e-05,
      "loss": 0.625,
      "step": 760
    },
    {
      "epoch": 1.3464566929133859,
      "grad_norm": 0.6640625,
      "learning_rate": 3.897757818309543e-05,
      "loss": 0.6196,
      "step": 770
    },
    {
      "epoch": 1.3639545056867892,
      "grad_norm": 0.62109375,
      "learning_rate": 3.8943381208133266e-05,
      "loss": 0.6451,
      "step": 780
    },
    {
      "epoch": 1.3814523184601923,
      "grad_norm": 0.62109375,
      "learning_rate": 3.890863898682984e-05,
      "loss": 0.6255,
      "step": 790
    },
    {
      "epoch": 1.3989501312335957,
      "grad_norm": 0.6171875,
      "learning_rate": 3.887335263720693e-05,
      "loss": 0.6182,
      "step": 800
    },
    {
      "epoch": 1.416447944006999,
      "grad_norm": 0.57421875,
      "learning_rate": 3.883752329479662e-05,
      "loss": 0.6206,
      "step": 810
    },
    {
      "epoch": 1.4339457567804024,
      "grad_norm": 0.65625,
      "learning_rate": 3.880115211260478e-05,
      "loss": 0.6068,
      "step": 820
    },
    {
      "epoch": 1.4514435695538057,
      "grad_norm": 0.52734375,
      "learning_rate": 3.876424026107394e-05,
      "loss": 0.6221,
      "step": 830
    },
    {
      "epoch": 1.468941382327209,
      "grad_norm": 0.6328125,
      "learning_rate": 3.872678892804563e-05,
      "loss": 0.6118,
      "step": 840
    },
    {
      "epoch": 1.4864391951006124,
      "grad_norm": 0.60546875,
      "learning_rate": 3.868879931872218e-05,
      "loss": 0.6137,
      "step": 850
    },
    {
      "epoch": 1.5039370078740157,
      "grad_norm": 0.58203125,
      "learning_rate": 3.865027265562789e-05,
      "loss": 0.61,
      "step": 860
    },
    {
      "epoch": 1.521434820647419,
      "grad_norm": 0.60546875,
      "learning_rate": 3.8611210178569746e-05,
      "loss": 0.6285,
      "step": 870
    },
    {
      "epoch": 1.5389326334208224,
      "grad_norm": 0.59765625,
      "learning_rate": 3.857161314459745e-05,
      "loss": 0.6211,
      "step": 880
    },
    {
      "epoch": 1.5564304461942258,
      "grad_norm": 0.58203125,
      "learning_rate": 3.853148282796306e-05,
      "loss": 0.6291,
      "step": 890
    },
    {
      "epoch": 1.5739282589676291,
      "grad_norm": 0.59375,
      "learning_rate": 3.849082052007988e-05,
      "loss": 0.6074,
      "step": 900
    },
    {
      "epoch": 1.5914260717410325,
      "grad_norm": 0.6640625,
      "learning_rate": 3.844962752948099e-05,
      "loss": 0.6314,
      "step": 910
    },
    {
      "epoch": 1.6089238845144358,
      "grad_norm": 0.5859375,
      "learning_rate": 3.8407905181777095e-05,
      "loss": 0.6167,
      "step": 920
    },
    {
      "epoch": 1.6264216972878391,
      "grad_norm": 0.6484375,
      "learning_rate": 3.836565481961386e-05,
      "loss": 0.6235,
      "step": 930
    },
    {
      "epoch": 1.6439195100612425,
      "grad_norm": 0.5859375,
      "learning_rate": 3.8322877802628704e-05,
      "loss": 0.6106,
      "step": 940
    },
    {
      "epoch": 1.6614173228346458,
      "grad_norm": 0.5546875,
      "learning_rate": 3.827957550740711e-05,
      "loss": 0.6041,
      "step": 950
    },
    {
      "epoch": 1.678915135608049,
      "grad_norm": 0.59375,
      "learning_rate": 3.82357493274382e-05,
      "loss": 0.6176,
      "step": 960
    },
    {
      "epoch": 1.6964129483814523,
      "grad_norm": 0.5625,
      "learning_rate": 3.8191400673070034e-05,
      "loss": 0.6105,
      "step": 970
    },
    {
      "epoch": 1.7139107611548556,
      "grad_norm": 0.55859375,
      "learning_rate": 3.81465309714641e-05,
      "loss": 0.6278,
      "step": 980
    },
    {
      "epoch": 1.731408573928259,
      "grad_norm": 0.6484375,
      "learning_rate": 3.810114166654948e-05,
      "loss": 0.6189,
      "step": 990
    },
    {
      "epoch": 1.7489063867016623,
      "grad_norm": 0.66796875,
      "learning_rate": 3.805523421897633e-05,
      "loss": 0.6299,
      "step": 1000
    },
    {
      "epoch": 1.7664041994750657,
      "grad_norm": 0.62109375,
      "learning_rate": 3.800881010606889e-05,
      "loss": 0.6059,
      "step": 1010
    },
    {
      "epoch": 1.7839020122484688,
      "grad_norm": 0.58203125,
      "learning_rate": 3.7961870821777946e-05,
      "loss": 0.6199,
      "step": 1020
    },
    {
      "epoch": 1.8013998250218721,
      "grad_norm": 0.58984375,
      "learning_rate": 3.791441787663276e-05,
      "loss": 0.5948,
      "step": 1030
    },
    {
      "epoch": 1.8188976377952755,
      "grad_norm": 0.55859375,
      "learning_rate": 3.786645279769244e-05,
      "loss": 0.608,
      "step": 1040
    },
    {
      "epoch": 1.8363954505686788,
      "grad_norm": 0.63671875,
      "learning_rate": 3.781797712849683e-05,
      "loss": 0.6327,
      "step": 1050
    },
    {
      "epoch": 1.8538932633420822,
      "grad_norm": 0.66015625,
      "learning_rate": 3.7768992429016796e-05,
      "loss": 0.6392,
      "step": 1060
    },
    {
      "epoch": 1.8713910761154855,
      "grad_norm": 0.546875,
      "learning_rate": 3.7719500275604074e-05,
      "loss": 0.6359,
      "step": 1070
    },
    {
      "epoch": 1.8888888888888888,
      "grad_norm": 0.6171875,
      "learning_rate": 3.7669502260940495e-05,
      "loss": 0.6156,
      "step": 1080
    },
    {
      "epoch": 1.9063867016622922,
      "grad_norm": 0.5390625,
      "learning_rate": 3.761899999398677e-05,
      "loss": 0.6142,
      "step": 1090
    },
    {
      "epoch": 1.9238845144356955,
      "grad_norm": 0.62890625,
      "learning_rate": 3.756799509993069e-05,
      "loss": 0.6282,
      "step": 1100
    },
    {
      "epoch": 1.9413823272090989,
      "grad_norm": 0.63671875,
      "learning_rate": 3.751648922013482e-05,
      "loss": 0.6388,
      "step": 1110
    },
    {
      "epoch": 1.9588801399825022,
      "grad_norm": 0.578125,
      "learning_rate": 3.746448401208373e-05,
      "loss": 0.612,
      "step": 1120
    },
    {
      "epoch": 1.9763779527559056,
      "grad_norm": 0.54296875,
      "learning_rate": 3.7411981149330575e-05,
      "loss": 0.622,
      "step": 1130
    },
    {
      "epoch": 1.993875765529309,
      "grad_norm": 0.625,
      "learning_rate": 3.735898232144331e-05,
      "loss": 0.6204,
      "step": 1140
    },
    {
      "epoch": 2.010498687664042,
      "grad_norm": 0.609375,
      "learning_rate": 3.7305489233950284e-05,
      "loss": 0.5708,
      "step": 1150
    },
    {
      "epoch": 2.027996500437445,
      "grad_norm": 0.6953125,
      "learning_rate": 3.725150360828537e-05,
      "loss": 0.5258,
      "step": 1160
    },
    {
      "epoch": 2.0454943132108485,
      "grad_norm": 0.609375,
      "learning_rate": 3.7197027181732556e-05,
      "loss": 0.5357,
      "step": 1170
    },
    {
      "epoch": 2.062992125984252,
      "grad_norm": 0.57421875,
      "learning_rate": 3.714206170737003e-05,
      "loss": 0.5301,
      "step": 1180
    },
    {
      "epoch": 2.080489938757655,
      "grad_norm": 0.640625,
      "learning_rate": 3.708660895401381e-05,
      "loss": 0.5334,
      "step": 1190
    },
    {
      "epoch": 2.0979877515310585,
      "grad_norm": 0.6015625,
      "learning_rate": 3.703067070616076e-05,
      "loss": 0.5308,
      "step": 1200
    },
    {
      "epoch": 2.115485564304462,
      "grad_norm": 0.53515625,
      "learning_rate": 3.697424876393122e-05,
      "loss": 0.5243,
      "step": 1210
    },
    {
      "epoch": 2.1329833770778652,
      "grad_norm": 0.6171875,
      "learning_rate": 3.6917344943011043e-05,
      "loss": 0.5139,
      "step": 1220
    },
    {
      "epoch": 2.1504811898512686,
      "grad_norm": 0.6171875,
      "learning_rate": 3.685996107459317e-05,
      "loss": 0.533,
      "step": 1230
    },
    {
      "epoch": 2.167979002624672,
      "grad_norm": 0.65625,
      "learning_rate": 3.680209900531872e-05,
      "loss": 0.5098,
      "step": 1240
    },
    {
      "epoch": 2.1854768153980753,
      "grad_norm": 0.6015625,
      "learning_rate": 3.6743760597217536e-05,
      "loss": 0.5318,
      "step": 1250
    },
    {
      "epoch": 2.2029746281714786,
      "grad_norm": 0.59375,
      "learning_rate": 3.668494772764828e-05,
      "loss": 0.5223,
      "step": 1260
    },
    {
      "epoch": 2.220472440944882,
      "grad_norm": 0.59765625,
      "learning_rate": 3.662566228923805e-05,
      "loss": 0.5241,
      "step": 1270
    },
    {
      "epoch": 2.2379702537182853,
      "grad_norm": 0.58984375,
      "learning_rate": 3.656590618982138e-05,
      "loss": 0.5249,
      "step": 1280
    },
    {
      "epoch": 2.2554680664916886,
      "grad_norm": 0.6328125,
      "learning_rate": 3.650568135237897e-05,
      "loss": 0.5412,
      "step": 1290
    },
    {
      "epoch": 2.272965879265092,
      "grad_norm": 0.66796875,
      "learning_rate": 3.644498971497569e-05,
      "loss": 0.53,
      "step": 1300
    },
    {
      "epoch": 2.2904636920384953,
      "grad_norm": 0.67578125,
      "learning_rate": 3.63838332306983e-05,
      "loss": 0.5382,
      "step": 1310
    },
    {
      "epoch": 2.3079615048118987,
      "grad_norm": 0.61328125,
      "learning_rate": 3.6322213867592536e-05,
      "loss": 0.5278,
      "step": 1320
    },
    {
      "epoch": 2.325459317585302,
      "grad_norm": 0.98828125,
      "learning_rate": 3.626013360859982e-05,
      "loss": 0.5409,
      "step": 1330
    },
    {
      "epoch": 2.3429571303587053,
      "grad_norm": 0.64453125,
      "learning_rate": 3.619759445149341e-05,
      "loss": 0.5277,
      "step": 1340
    },
    {
      "epoch": 2.3604549431321082,
      "grad_norm": 0.55078125,
      "learning_rate": 3.613459840881415e-05,
      "loss": 0.5344,
      "step": 1350
    },
    {
      "epoch": 2.377952755905512,
      "grad_norm": 0.6328125,
      "learning_rate": 3.607114750780568e-05,
      "loss": 0.5364,
      "step": 1360
    },
    {
      "epoch": 2.395450568678915,
      "grad_norm": 0.6171875,
      "learning_rate": 3.6007243790349205e-05,
      "loss": 0.5238,
      "step": 1370
    },
    {
      "epoch": 2.4129483814523183,
      "grad_norm": 0.57421875,
      "learning_rate": 3.594288931289778e-05,
      "loss": 0.5322,
      "step": 1380
    },
    {
      "epoch": 2.4304461942257216,
      "grad_norm": 0.59765625,
      "learning_rate": 3.587808614641015e-05,
      "loss": 0.5418,
      "step": 1390
    },
    {
      "epoch": 2.447944006999125,
      "grad_norm": 0.61328125,
      "learning_rate": 3.581283637628409e-05,
      "loss": 0.537,
      "step": 1400
    },
    {
      "epoch": 2.4654418197725283,
      "grad_norm": 0.609375,
      "learning_rate": 3.574714210228929e-05,
      "loss": 0.5286,
      "step": 1410
    },
    {
      "epoch": 2.4829396325459316,
      "grad_norm": 0.61328125,
      "learning_rate": 3.5681005438499795e-05,
      "loss": 0.5196,
      "step": 1420
    },
    {
      "epoch": 2.500437445319335,
      "grad_norm": 0.62109375,
      "learning_rate": 3.561442851322599e-05,
      "loss": 0.5559,
      "step": 1430
    },
    {
      "epoch": 2.5179352580927383,
      "grad_norm": 0.59375,
      "learning_rate": 3.554741346894606e-05,
      "loss": 0.5325,
      "step": 1440
    },
    {
      "epoch": 2.5354330708661417,
      "grad_norm": 0.578125,
      "learning_rate": 3.5479962462237116e-05,
      "loss": 0.5368,
      "step": 1450
    },
    {
      "epoch": 2.552930883639545,
      "grad_norm": 0.59375,
      "learning_rate": 3.541207766370572e-05,
      "loss": 0.5368,
      "step": 1460
    },
    {
      "epoch": 2.5704286964129484,
      "grad_norm": 0.73046875,
      "learning_rate": 3.534376125791807e-05,
      "loss": 0.5385,
      "step": 1470
    },
    {
      "epoch": 2.5879265091863517,
      "grad_norm": 0.54296875,
      "learning_rate": 3.527501544332972e-05,
      "loss": 0.5365,
      "step": 1480
    },
    {
      "epoch": 2.605424321959755,
      "grad_norm": 0.6484375,
      "learning_rate": 3.520584243221479e-05,
      "loss": 0.5439,
      "step": 1490
    },
    {
      "epoch": 2.6229221347331584,
      "grad_norm": 0.5390625,
      "learning_rate": 3.5136244450594814e-05,
      "loss": 0.5214,
      "step": 1500
    },
    {
      "epoch": 2.6404199475065617,
      "grad_norm": 0.6484375,
      "learning_rate": 3.506622373816706e-05,
      "loss": 0.5334,
      "step": 1510
    },
    {
      "epoch": 2.657917760279965,
      "grad_norm": 0.55859375,
      "learning_rate": 3.499578254823249e-05,
      "loss": 0.544,
      "step": 1520
    },
    {
      "epoch": 2.6754155730533684,
      "grad_norm": 0.54296875,
      "learning_rate": 3.4924923147623265e-05,
      "loss": 0.5254,
      "step": 1530
    },
    {
      "epoch": 2.6929133858267718,
      "grad_norm": 0.71484375,
      "learning_rate": 3.485364781662974e-05,
      "loss": 0.536,
      "step": 1540
    },
    {
      "epoch": 2.710411198600175,
      "grad_norm": 0.66796875,
      "learning_rate": 3.478195884892712e-05,
      "loss": 0.5471,
      "step": 1550
    },
    {
      "epoch": 2.7279090113735784,
      "grad_norm": 0.61328125,
      "learning_rate": 3.4709858551501664e-05,
      "loss": 0.5235,
      "step": 1560
    },
    {
      "epoch": 2.745406824146982,
      "grad_norm": 0.58203125,
      "learning_rate": 3.463734924457638e-05,
      "loss": 0.5486,
      "step": 1570
    },
    {
      "epoch": 2.7629046369203847,
      "grad_norm": 0.5625,
      "learning_rate": 3.4564433261536446e-05,
      "loss": 0.5218,
      "step": 1580
    },
    {
      "epoch": 2.7804024496937885,
      "grad_norm": 0.5703125,
      "learning_rate": 3.449111294885407e-05,
      "loss": 0.5256,
      "step": 1590
    },
    {
      "epoch": 2.7979002624671914,
      "grad_norm": 0.62109375,
      "learning_rate": 3.4417390666012966e-05,
      "loss": 0.5296,
      "step": 1600
    },
    {
      "epoch": 2.815398075240595,
      "grad_norm": 0.57421875,
      "learning_rate": 3.434326878543246e-05,
      "loss": 0.5365,
      "step": 1610
    },
    {
      "epoch": 2.832895888013998,
      "grad_norm": 0.58984375,
      "learning_rate": 3.426874969239113e-05,
      "loss": 0.5226,
      "step": 1620
    },
    {
      "epoch": 2.850393700787402,
      "grad_norm": 0.61328125,
      "learning_rate": 3.419383578495005e-05,
      "loss": 0.5351,
      "step": 1630
    },
    {
      "epoch": 2.8678915135608047,
      "grad_norm": 0.625,
      "learning_rate": 3.411852947387559e-05,
      "loss": 0.5297,
      "step": 1640
    },
    {
      "epoch": 2.885389326334208,
      "grad_norm": 0.66015625,
      "learning_rate": 3.40428331825619e-05,
      "loss": 0.5248,
      "step": 1650
    },
    {
      "epoch": 2.9028871391076114,
      "grad_norm": 0.60546875,
      "learning_rate": 3.3966749346952856e-05,
      "loss": 0.5327,
      "step": 1660
    },
    {
      "epoch": 2.9203849518810148,
      "grad_norm": 0.5546875,
      "learning_rate": 3.389028041546372e-05,
      "loss": 0.5315,
      "step": 1670
    },
    {
      "epoch": 2.937882764654418,
      "grad_norm": 0.6015625,
      "learning_rate": 3.381342884890231e-05,
      "loss": 0.5327,
      "step": 1680
    },
    {
      "epoch": 2.9553805774278215,
      "grad_norm": 0.60546875,
      "learning_rate": 3.373619712038985e-05,
      "loss": 0.5399,
      "step": 1690
    },
    {
      "epoch": 2.972878390201225,
      "grad_norm": 0.60546875,
      "learning_rate": 3.365858771528135e-05,
      "loss": 0.5307,
      "step": 1700
    },
    {
      "epoch": 2.990376202974628,
      "grad_norm": 0.5859375,
      "learning_rate": 3.358060313108564e-05,
      "loss": 0.5427,
      "step": 1710
    },
    {
      "epoch": 3.0069991251093615,
      "grad_norm": 0.65234375,
      "learning_rate": 3.3502245877384986e-05,
      "loss": 0.4888,
      "step": 1720
    },
    {
      "epoch": 3.024496937882765,
      "grad_norm": 0.6640625,
      "learning_rate": 3.3423518475754376e-05,
      "loss": 0.4435,
      "step": 1730
    },
    {
      "epoch": 3.041994750656168,
      "grad_norm": 0.71875,
      "learning_rate": 3.33444234596803e-05,
      "loss": 0.429,
      "step": 1740
    },
    {
      "epoch": 3.059492563429571,
      "grad_norm": 0.67578125,
      "learning_rate": 3.326496337447928e-05,
      "loss": 0.437,
      "step": 1750
    },
    {
      "epoch": 3.0769903762029744,
      "grad_norm": 0.65625,
      "learning_rate": 3.3185140777215956e-05,
      "loss": 0.4387,
      "step": 1760
    },
    {
      "epoch": 3.094488188976378,
      "grad_norm": 0.7109375,
      "learning_rate": 3.3104958236620755e-05,
      "loss": 0.4211,
      "step": 1770
    },
    {
      "epoch": 3.111986001749781,
      "grad_norm": 0.58203125,
      "learning_rate": 3.302441833300728e-05,
      "loss": 0.4265,
      "step": 1780
    },
    {
      "epoch": 3.1294838145231845,
      "grad_norm": 0.640625,
      "learning_rate": 3.2943523658189246e-05,
      "loss": 0.438,
      "step": 1790
    },
    {
      "epoch": 3.146981627296588,
      "grad_norm": 0.6171875,
      "learning_rate": 3.286227681539708e-05,
      "loss": 0.4379,
      "step": 1800
    },
    {
      "epoch": 3.164479440069991,
      "grad_norm": 0.7109375,
      "learning_rate": 3.278068041919415e-05,
      "loss": 0.4284,
      "step": 1810
    },
    {
      "epoch": 3.1819772528433945,
      "grad_norm": 0.65234375,
      "learning_rate": 3.2698737095392593e-05,
      "loss": 0.4299,
      "step": 1820
    },
    {
      "epoch": 3.199475065616798,
      "grad_norm": 0.61328125,
      "learning_rate": 3.261644948096891e-05,
      "loss": 0.4351,
      "step": 1830
    },
    {
      "epoch": 3.216972878390201,
      "grad_norm": 0.62890625,
      "learning_rate": 3.2533820223978996e-05,
      "loss": 0.4336,
      "step": 1840
    },
    {
      "epoch": 3.2344706911636045,
      "grad_norm": 0.63671875,
      "learning_rate": 3.245085198347298e-05,
      "loss": 0.4376,
      "step": 1850
    },
    {
      "epoch": 3.251968503937008,
      "grad_norm": 0.56640625,
      "learning_rate": 3.236754742940965e-05,
      "loss": 0.4461,
      "step": 1860
    },
    {
      "epoch": 3.269466316710411,
      "grad_norm": 0.6484375,
      "learning_rate": 3.228390924257053e-05,
      "loss": 0.4472,
      "step": 1870
    },
    {
      "epoch": 3.2869641294838146,
      "grad_norm": 0.68359375,
      "learning_rate": 3.2199940114473615e-05,
      "loss": 0.4404,
      "step": 1880
    },
    {
      "epoch": 3.304461942257218,
      "grad_norm": 0.671875,
      "learning_rate": 3.211564274728676e-05,
      "loss": 0.4357,
      "step": 1890
    },
    {
      "epoch": 3.3219597550306212,
      "grad_norm": 0.640625,
      "learning_rate": 3.203101985374069e-05,
      "loss": 0.4405,
      "step": 1900
    },
    {
      "epoch": 3.3394575678040246,
      "grad_norm": 0.6640625,
      "learning_rate": 3.194607415704174e-05,
      "loss": 0.4524,
      "step": 1910
    },
    {
      "epoch": 3.356955380577428,
      "grad_norm": 0.6328125,
      "learning_rate": 3.186080839078423e-05,
      "loss": 0.4311,
      "step": 1920
    },
    {
      "epoch": 3.3744531933508313,
      "grad_norm": 0.6640625,
      "learning_rate": 3.177522529886244e-05,
      "loss": 0.4257,
      "step": 1930
    },
    {
      "epoch": 3.3919510061242346,
      "grad_norm": 0.87109375,
      "learning_rate": 3.1689327635382374e-05,
      "loss": 0.4305,
      "step": 1940
    },
    {
      "epoch": 3.409448818897638,
      "grad_norm": 0.71875,
      "learning_rate": 3.160311816457309e-05,
      "loss": 0.4473,
      "step": 1950
    },
    {
      "epoch": 3.4269466316710413,
      "grad_norm": 0.63671875,
      "learning_rate": 3.151659966069777e-05,
      "loss": 0.4543,
      "step": 1960
    },
    {
      "epoch": 3.4444444444444446,
      "grad_norm": 0.59765625,
      "learning_rate": 3.142977490796441e-05,
      "loss": 0.4444,
      "step": 1970
    },
    {
      "epoch": 3.4619422572178475,
      "grad_norm": 0.65625,
      "learning_rate": 3.134264670043627e-05,
      "loss": 0.4543,
      "step": 1980
    },
    {
      "epoch": 3.4794400699912513,
      "grad_norm": 0.6484375,
      "learning_rate": 3.125521784194192e-05,
      "loss": 0.4433,
      "step": 1990
    },
    {
      "epoch": 3.4969378827646542,
      "grad_norm": 0.625,
      "learning_rate": 3.1167491145985035e-05,
      "loss": 0.4403,
      "step": 2000
    },
    {
      "epoch": 3.514435695538058,
      "grad_norm": 0.62109375,
      "learning_rate": 3.107946943565384e-05,
      "loss": 0.4516,
      "step": 2010
    },
    {
      "epoch": 3.531933508311461,
      "grad_norm": 0.66015625,
      "learning_rate": 3.099115554353027e-05,
      "loss": 0.4326,
      "step": 2020
    },
    {
      "epoch": 3.5494313210848643,
      "grad_norm": 0.61328125,
      "learning_rate": 3.090255231159881e-05,
      "loss": 0.4309,
      "step": 2030
    },
    {
      "epoch": 3.5669291338582676,
      "grad_norm": 0.6171875,
      "learning_rate": 3.081366259115504e-05,
      "loss": 0.4328,
      "step": 2040
    },
    {
      "epoch": 3.584426946631671,
      "grad_norm": 0.6484375,
      "learning_rate": 3.072448924271389e-05,
      "loss": 0.4333,
      "step": 2050
    },
    {
      "epoch": 3.6019247594050743,
      "grad_norm": 0.6171875,
      "learning_rate": 3.063503513591758e-05,
      "loss": 0.4518,
      "step": 2060
    },
    {
      "epoch": 3.6194225721784776,
      "grad_norm": 0.65234375,
      "learning_rate": 3.0545303149443265e-05,
      "loss": 0.4383,
      "step": 2070
    },
    {
      "epoch": 3.636920384951881,
      "grad_norm": 0.6484375,
      "learning_rate": 3.0455296170910404e-05,
      "loss": 0.436,
      "step": 2080
    },
    {
      "epoch": 3.6544181977252843,
      "grad_norm": 0.671875,
      "learning_rate": 3.0365017096787834e-05,
      "loss": 0.4363,
      "step": 2090
    },
    {
      "epoch": 3.6719160104986877,
      "grad_norm": 0.6484375,
      "learning_rate": 3.0274468832300576e-05,
      "loss": 0.4463,
      "step": 2100
    },
    {
      "epoch": 3.689413823272091,
      "grad_norm": 0.609375,
      "learning_rate": 3.0183654291336313e-05,
      "loss": 0.4533,
      "step": 2110
    },
    {
      "epoch": 3.7069116360454943,
      "grad_norm": 0.6796875,
      "learning_rate": 3.009257639635164e-05,
      "loss": 0.4255,
      "step": 2120
    },
    {
      "epoch": 3.7244094488188977,
      "grad_norm": 0.65234375,
      "learning_rate": 3.0001238078278022e-05,
      "loss": 0.4452,
      "step": 2130
    },
    {
      "epoch": 3.741907261592301,
      "grad_norm": 0.625,
      "learning_rate": 2.990964227642746e-05,
      "loss": 0.4344,
      "step": 2140
    },
    {
      "epoch": 3.7594050743657044,
      "grad_norm": 0.70703125,
      "learning_rate": 2.9817791938397902e-05,
      "loss": 0.4327,
      "step": 2150
    },
    {
      "epoch": 3.7769028871391077,
      "grad_norm": 0.66015625,
      "learning_rate": 2.9725690019978422e-05,
      "loss": 0.4461,
      "step": 2160
    },
    {
      "epoch": 3.794400699912511,
      "grad_norm": 0.6640625,
      "learning_rate": 2.9633339485054037e-05,
      "loss": 0.443,
      "step": 2170
    },
    {
      "epoch": 3.8118985126859144,
      "grad_norm": 0.6171875,
      "learning_rate": 2.9540743305510406e-05,
      "loss": 0.4387,
      "step": 2180
    },
    {
      "epoch": 3.8293963254593177,
      "grad_norm": 0.703125,
      "learning_rate": 2.9447904461138114e-05,
      "loss": 0.4355,
      "step": 2190
    },
    {
      "epoch": 3.846894138232721,
      "grad_norm": 0.625,
      "learning_rate": 2.9354825939536854e-05,
      "loss": 0.4425,
      "step": 2200
    },
    {
      "epoch": 3.864391951006124,
      "grad_norm": 0.6171875,
      "learning_rate": 2.9261510736019222e-05,
      "loss": 0.4518,
      "step": 2210
    },
    {
      "epoch": 3.8818897637795278,
      "grad_norm": 0.6171875,
      "learning_rate": 2.9167961853514368e-05,
      "loss": 0.4552,
      "step": 2220
    },
    {
      "epoch": 3.8993875765529307,
      "grad_norm": 0.71875,
      "learning_rate": 2.9074182302471347e-05,
      "loss": 0.4506,
      "step": 2230
    },
    {
      "epoch": 3.9168853893263345,
      "grad_norm": 0.703125,
      "learning_rate": 2.898017510076224e-05,
      "loss": 0.4388,
      "step": 2240
    },
    {
      "epoch": 3.9343832020997374,
      "grad_norm": 0.671875,
      "learning_rate": 2.8885943273585037e-05,
      "loss": 0.4373,
      "step": 2250
    },
    {
      "epoch": 3.9518810148731407,
      "grad_norm": 0.61328125,
      "learning_rate": 2.8791489853366284e-05,
      "loss": 0.4359,
      "step": 2260
    },
    {
      "epoch": 3.969378827646544,
      "grad_norm": 0.69140625,
      "learning_rate": 2.8696817879663515e-05,
      "loss": 0.4538,
      "step": 2270
    },
    {
      "epoch": 3.9868766404199474,
      "grad_norm": 0.73046875,
      "learning_rate": 2.8601930399067393e-05,
      "loss": 0.4394,
      "step": 2280
    },
    {
      "epoch": 4.003499562554681,
      "grad_norm": 0.62109375,
      "learning_rate": 2.8506830465103723e-05,
      "loss": 0.4332,
      "step": 2290
    },
    {
      "epoch": 4.020997375328084,
      "grad_norm": 0.76953125,
      "learning_rate": 2.8411521138135154e-05,
      "loss": 0.3484,
      "step": 2300
    },
    {
      "epoch": 4.038495188101487,
      "grad_norm": 0.6015625,
      "learning_rate": 2.8316005485262717e-05,
      "loss": 0.3652,
      "step": 2310
    },
    {
      "epoch": 4.05599300087489,
      "grad_norm": 0.76171875,
      "learning_rate": 2.8220286580227093e-05,
      "loss": 0.3603,
      "step": 2320
    },
    {
      "epoch": 4.073490813648294,
      "grad_norm": 0.69921875,
      "learning_rate": 2.8124367503309736e-05,
      "loss": 0.3401,
      "step": 2330
    },
    {
      "epoch": 4.090988626421697,
      "grad_norm": 0.71875,
      "learning_rate": 2.802825134123371e-05,
      "loss": 0.3446,
      "step": 2340
    },
    {
      "epoch": 4.108486439195101,
      "grad_norm": 0.7265625,
      "learning_rate": 2.7931941187064402e-05,
      "loss": 0.3732,
      "step": 2350
    },
    {
      "epoch": 4.125984251968504,
      "grad_norm": 0.66015625,
      "learning_rate": 2.783544014010993e-05,
      "loss": 0.3567,
      "step": 2360
    },
    {
      "epoch": 4.1434820647419075,
      "grad_norm": 0.765625,
      "learning_rate": 2.773875130582146e-05,
      "loss": 0.3575,
      "step": 2370
    },
    {
      "epoch": 4.16097987751531,
      "grad_norm": 0.7265625,
      "learning_rate": 2.764187779569324e-05,
      "loss": 0.36,
      "step": 2380
    },
    {
      "epoch": 4.178477690288714,
      "grad_norm": 0.6875,
      "learning_rate": 2.7544822727162475e-05,
      "loss": 0.345,
      "step": 2390
    },
    {
      "epoch": 4.195975503062117,
      "grad_norm": 0.6953125,
      "learning_rate": 2.7447589223509004e-05,
      "loss": 0.3614,
      "step": 2400
    },
    {
      "epoch": 4.213473315835521,
      "grad_norm": 0.6171875,
      "learning_rate": 2.7350180413754798e-05,
      "loss": 0.3584,
      "step": 2410
    },
    {
      "epoch": 4.230971128608924,
      "grad_norm": 0.68359375,
      "learning_rate": 2.7252599432563278e-05,
      "loss": 0.3567,
      "step": 2420
    },
    {
      "epoch": 4.248468941382328,
      "grad_norm": 0.66796875,
      "learning_rate": 2.7154849420138405e-05,
      "loss": 0.3513,
      "step": 2430
    },
    {
      "epoch": 4.2659667541557305,
      "grad_norm": 0.75390625,
      "learning_rate": 2.7056933522123677e-05,
      "loss": 0.3572,
      "step": 2440
    },
    {
      "epoch": 4.283464566929134,
      "grad_norm": 0.69921875,
      "learning_rate": 2.695885488950083e-05,
      "loss": 0.3461,
      "step": 2450
    },
    {
      "epoch": 4.300962379702537,
      "grad_norm": 0.76171875,
      "learning_rate": 2.686061667848852e-05,
      "loss": 0.3534,
      "step": 2460
    },
    {
      "epoch": 4.318460192475941,
      "grad_norm": 0.7578125,
      "learning_rate": 2.67622220504407e-05,
      "loss": 0.3608,
      "step": 2470
    },
    {
      "epoch": 4.335958005249344,
      "grad_norm": 0.7578125,
      "learning_rate": 2.6663674171744887e-05,
      "loss": 0.3549,
      "step": 2480
    },
    {
      "epoch": 4.353455818022747,
      "grad_norm": 0.69921875,
      "learning_rate": 2.6564976213720292e-05,
      "loss": 0.3533,
      "step": 2490
    },
    {
      "epoch": 4.3709536307961505,
      "grad_norm": 0.69140625,
      "learning_rate": 2.646613135251576e-05,
      "loss": 0.3542,
      "step": 2500
    },
    {
      "epoch": 4.388451443569553,
      "grad_norm": 0.859375,
      "learning_rate": 2.6367142769007533e-05,
      "loss": 0.3426,
      "step": 2510
    },
    {
      "epoch": 4.405949256342957,
      "grad_norm": 0.66796875,
      "learning_rate": 2.626801364869693e-05,
      "loss": 0.3511,
      "step": 2520
    },
    {
      "epoch": 4.42344706911636,
      "grad_norm": 0.72265625,
      "learning_rate": 2.6168747181607795e-05,
      "loss": 0.3528,
      "step": 2530
    },
    {
      "epoch": 4.440944881889764,
      "grad_norm": 0.73046875,
      "learning_rate": 2.606934656218386e-05,
      "loss": 0.3491,
      "step": 2540
    },
    {
      "epoch": 4.458442694663167,
      "grad_norm": 0.7734375,
      "learning_rate": 2.5969814989185965e-05,
      "loss": 0.3563,
      "step": 2550
    },
    {
      "epoch": 4.475940507436571,
      "grad_norm": 0.71875,
      "learning_rate": 2.5870155665589066e-05,
      "loss": 0.3603,
      "step": 2560
    },
    {
      "epoch": 4.4934383202099735,
      "grad_norm": 0.7265625,
      "learning_rate": 2.5770371798479234e-05,
      "loss": 0.3525,
      "step": 2570
    },
    {
      "epoch": 4.510936132983377,
      "grad_norm": 0.734375,
      "learning_rate": 2.5670466598950385e-05,
      "loss": 0.3487,
      "step": 2580
    },
    {
      "epoch": 4.52843394575678,
      "grad_norm": 0.71875,
      "learning_rate": 2.5570443282000974e-05,
      "loss": 0.3606,
      "step": 2590
    },
    {
      "epoch": 4.545931758530184,
      "grad_norm": 0.73046875,
      "learning_rate": 2.547030506643055e-05,
      "loss": 0.3569,
      "step": 2600
    },
    {
      "epoch": 4.563429571303587,
      "grad_norm": 0.74609375,
      "learning_rate": 2.5370055174736144e-05,
      "loss": 0.3509,
      "step": 2610
    },
    {
      "epoch": 4.580927384076991,
      "grad_norm": 0.6328125,
      "learning_rate": 2.5269696833008577e-05,
      "loss": 0.3549,
      "step": 2620
    },
    {
      "epoch": 4.5984251968503935,
      "grad_norm": 0.7265625,
      "learning_rate": 2.516923327082865e-05,
      "loss": 0.3521,
      "step": 2630
    },
    {
      "epoch": 4.615923009623797,
      "grad_norm": 0.671875,
      "learning_rate": 2.5068667721163213e-05,
      "loss": 0.3549,
      "step": 2640
    },
    {
      "epoch": 4.6334208223972,
      "grad_norm": 0.7734375,
      "learning_rate": 2.496800342026112e-05,
      "loss": 0.3448,
      "step": 2650
    },
    {
      "epoch": 4.650918635170604,
      "grad_norm": 0.6640625,
      "learning_rate": 2.4867243607549085e-05,
      "loss": 0.3517,
      "step": 2660
    },
    {
      "epoch": 4.668416447944007,
      "grad_norm": 0.6953125,
      "learning_rate": 2.4766391525527436e-05,
      "loss": 0.3548,
      "step": 2670
    },
    {
      "epoch": 4.685914260717411,
      "grad_norm": 0.65234375,
      "learning_rate": 2.4665450419665783e-05,
      "loss": 0.3605,
      "step": 2680
    },
    {
      "epoch": 4.703412073490814,
      "grad_norm": 0.80078125,
      "learning_rate": 2.4564423538298567e-05,
      "loss": 0.3565,
      "step": 2690
    },
    {
      "epoch": 4.7209098862642165,
      "grad_norm": 0.69921875,
      "learning_rate": 2.4463314132520522e-05,
      "loss": 0.344,
      "step": 2700
    },
    {
      "epoch": 4.73840769903762,
      "grad_norm": 0.68359375,
      "learning_rate": 2.4362125456082058e-05,
      "loss": 0.3496,
      "step": 2710
    },
    {
      "epoch": 4.755905511811024,
      "grad_norm": 0.8125,
      "learning_rate": 2.426086076528455e-05,
      "loss": 0.3584,
      "step": 2720
    },
    {
      "epoch": 4.773403324584427,
      "grad_norm": 0.6875,
      "learning_rate": 2.4159523318875562e-05,
      "loss": 0.3627,
      "step": 2730
    },
    {
      "epoch": 4.79090113735783,
      "grad_norm": 0.7734375,
      "learning_rate": 2.405811637794396e-05,
      "loss": 0.3585,
      "step": 2740
    },
    {
      "epoch": 4.808398950131234,
      "grad_norm": 0.73046875,
      "learning_rate": 2.3956643205814994e-05,
      "loss": 0.3651,
      "step": 2750
    },
    {
      "epoch": 4.8258967629046365,
      "grad_norm": 0.66796875,
      "learning_rate": 2.385510706794524e-05,
      "loss": 0.3537,
      "step": 2760
    },
    {
      "epoch": 4.84339457567804,
      "grad_norm": 0.7734375,
      "learning_rate": 2.375351123181756e-05,
      "loss": 0.3476,
      "step": 2770
    },
    {
      "epoch": 4.860892388451443,
      "grad_norm": 0.75,
      "learning_rate": 2.365185896683595e-05,
      "loss": 0.3416,
      "step": 2780
    },
    {
      "epoch": 4.878390201224847,
      "grad_norm": 0.7578125,
      "learning_rate": 2.3550153544220273e-05,
      "loss": 0.3569,
      "step": 2790
    },
    {
      "epoch": 4.89588801399825,
      "grad_norm": 0.69921875,
      "learning_rate": 2.3448398236901073e-05,
      "loss": 0.3502,
      "step": 2800
    },
    {
      "epoch": 4.913385826771654,
      "grad_norm": 0.77734375,
      "learning_rate": 2.3346596319414173e-05,
      "loss": 0.3538,
      "step": 2810
    },
    {
      "epoch": 4.930883639545057,
      "grad_norm": 0.67578125,
      "learning_rate": 2.3244751067795366e-05,
      "loss": 0.3635,
      "step": 2820
    },
    {
      "epoch": 4.94838145231846,
      "grad_norm": 0.71484375,
      "learning_rate": 2.314286575947494e-05,
      "loss": 0.3567,
      "step": 2830
    },
    {
      "epoch": 4.965879265091863,
      "grad_norm": 0.8515625,
      "learning_rate": 2.304094367317223e-05,
      "loss": 0.3293,
      "step": 2840
    },
    {
      "epoch": 4.983377077865267,
      "grad_norm": 0.7421875,
      "learning_rate": 2.293898808879011e-05,
      "loss": 0.3656,
      "step": 2850
    },
    {
      "epoch": 5.0,
      "grad_norm": 1.0546875,
      "learning_rate": 2.2837002287309426e-05,
      "loss": 0.3726,
      "step": 2860
    }
  ],
  "logging_steps": 10,
  "max_steps": 5710,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 7.691223286459776e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}