| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.9932104752667313, |
| "eval_steps": 1024, |
| "global_step": 21504, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.011823934229365849, |
| "grad_norm": 0.17750050127506256, |
| "learning_rate": 0.000498046875, |
| "loss": 1.7531359195709229, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.023647868458731697, |
| "grad_norm": 0.14886082708835602, |
| "learning_rate": 0.000998046875, |
| "loss": 1.120242953300476, |
| "step": 512 |
| }, |
| { |
| "epoch": 0.03547180268809755, |
| "grad_norm": 0.11476853489875793, |
| "learning_rate": 0.000999640996023194, |
| "loss": 1.0460094213485718, |
| "step": 768 |
| }, |
| { |
| "epoch": 0.047295736917463395, |
| "grad_norm": 0.26101624965667725, |
| "learning_rate": 0.0009985588674043958, |
| "loss": 1.0132286548614502, |
| "step": 1024 |
| }, |
| { |
| "epoch": 0.047295736917463395, |
| "eval_cos_loss": 0.3147891739459887, |
| "eval_loss": 0.9696788661283989, |
| "eval_mse_loss": 0.9696788661283989, |
| "flow/cos_sim": 0.6852108316334415, |
| "flow/improvement_ratio": 0.4729253878332164, |
| "flow/mag_ratio_mean": 0.6834825821663146, |
| "flow/mag_ratio_std": 0.18169553397589078, |
| "step": 1024 |
| }, |
| { |
| "epoch": 0.047295736917463395, |
| "eval_cos_loss": 0.3147891739459887, |
| "eval_loss": 0.9696788661283989, |
| "eval_mse_loss": 0.9696788661283989, |
| "eval_runtime": 37.5026, |
| "eval_samples_per_second": 746.427, |
| "eval_steps_per_second": 11.679, |
| "flow/cos_sim": 0.6852108316334415, |
| "flow/improvement_ratio": 0.4729253878332164, |
| "flow/mag_ratio_mean": 0.6834825821663146, |
| "flow/mag_ratio_std": 0.18169553397589078, |
| "step": 1024 |
| }, |
| { |
| "epoch": 0.05911967114682925, |
| "grad_norm": 0.32471492886543274, |
| "learning_rate": 0.0009967551747861387, |
| "loss": 0.9918397068977356, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.0709436053761951, |
| "grad_norm": 0.19580507278442383, |
| "learning_rate": 0.000994232528651847, |
| "loss": 0.9722790718078613, |
| "step": 1536 |
| }, |
| { |
| "epoch": 0.08276753960556095, |
| "grad_norm": 0.13594156503677368, |
| "learning_rate": 0.0009909945800260092, |
| "loss": 0.9547147750854492, |
| "step": 1792 |
| }, |
| { |
| "epoch": 0.09459147383492679, |
| "grad_norm": 0.49703535437583923, |
| "learning_rate": 0.0009870460151900522, |
| "loss": 0.9414160847663879, |
| "step": 2048 |
| }, |
| { |
| "epoch": 0.09459147383492679, |
| "eval_cos_loss": 0.2932011633430986, |
| "eval_loss": 0.8981472950547797, |
| "eval_mse_loss": 0.8981472950547797, |
| "flow/cos_sim": 0.7067988623767139, |
| "flow/improvement_ratio": 0.47438520363204556, |
| "flow/mag_ratio_mean": 0.6956081727868346, |
| "flow/mag_ratio_std": 0.19316728430115468, |
| "step": 2048 |
| }, |
| { |
| "epoch": 0.09459147383492679, |
| "eval_cos_loss": 0.2932011633430986, |
| "eval_loss": 0.8981472950547797, |
| "eval_mse_loss": 0.8981472950547797, |
| "eval_runtime": 37.712, |
| "eval_samples_per_second": 742.284, |
| "eval_steps_per_second": 11.614, |
| "flow/cos_sim": 0.7067988623767139, |
| "flow/improvement_ratio": 0.47438520363204556, |
| "flow/mag_ratio_mean": 0.6956081727868346, |
| "flow/mag_ratio_std": 0.19316728430115468, |
| "step": 2048 |
| }, |
| { |
| "epoch": 0.10641540806429264, |
| "grad_norm": 0.49009642004966736, |
| "learning_rate": 0.0009823925488998885, |
| "loss": 0.929905116558075, |
| "step": 2304 |
| }, |
| { |
| "epoch": 0.1182393422936585, |
| "grad_norm": 0.21833102405071259, |
| "learning_rate": 0.0009770409161149525, |
| "loss": 0.9224135875701904, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.13006327652302435, |
| "grad_norm": 0.3465236723423004, |
| "learning_rate": 0.0009709988622506973, |
| "loss": 0.9120264649391174, |
| "step": 2816 |
| }, |
| { |
| "epoch": 0.1418872107523902, |
| "grad_norm": 0.3057152032852173, |
| "learning_rate": 0.000964275131968659, |
| "loss": 0.9054349064826965, |
| "step": 3072 |
| }, |
| { |
| "epoch": 0.1418872107523902, |
| "eval_cos_loss": 0.28495094285571954, |
| "eval_loss": 0.8664921860988826, |
| "eval_mse_loss": 0.8664921860988826, |
| "flow/cos_sim": 0.7150490750733032, |
| "flow/improvement_ratio": 0.48076002510715293, |
| "flow/mag_ratio_mean": 0.7135201504785721, |
| "flow/mag_ratio_std": 0.19565439009911392, |
| "step": 3072 |
| }, |
| { |
| "epoch": 0.1418872107523902, |
| "eval_cos_loss": 0.28495094285571954, |
| "eval_loss": 0.8664921860988826, |
| "eval_mse_loss": 0.8664921860988826, |
| "eval_runtime": 38.3291, |
| "eval_samples_per_second": 730.333, |
| "eval_steps_per_second": 11.427, |
| "flow/cos_sim": 0.7150490750733032, |
| "flow/improvement_ratio": 0.48076002510715293, |
| "flow/mag_ratio_mean": 0.7135201504785721, |
| "flow/mag_ratio_std": 0.19565439009911392, |
| "step": 3072 |
| }, |
| { |
| "epoch": 0.15371114498175603, |
| "grad_norm": 0.599277138710022, |
| "learning_rate": 0.0009568794565203123, |
| "loss": 0.9003790020942688, |
| "step": 3328 |
| }, |
| { |
| "epoch": 0.1655350792111219, |
| "grad_norm": 0.6535385251045227, |
| "learning_rate": 0.0009488225396630347, |
| "loss": 0.897746205329895, |
| "step": 3584 |
| }, |
| { |
| "epoch": 0.17735901344048774, |
| "grad_norm": 0.4937800467014313, |
| "learning_rate": 0.0009401160421685646, |
| "loss": 0.8885043859481812, |
| "step": 3840 |
| }, |
| { |
| "epoch": 0.18918294766985358, |
| "grad_norm": 1.3498679399490356, |
| "learning_rate": 0.0009307725649463714, |
| "loss": 0.8906658887863159, |
| "step": 4096 |
| }, |
| { |
| "epoch": 0.18918294766985358, |
| "eval_cos_loss": 0.281152941626773, |
| "eval_loss": 0.8519824494237769, |
| "eval_mse_loss": 0.8519824494237769, |
| "flow/cos_sim": 0.718847086304399, |
| "flow/improvement_ratio": 0.47086378285601804, |
| "flow/mag_ratio_mean": 0.7225797281145505, |
| "flow/mag_ratio_std": 0.19704299981463447, |
| "step": 4096 |
| }, |
| { |
| "epoch": 0.18918294766985358, |
| "eval_cos_loss": 0.281152941626773, |
| "eval_loss": 0.8519824494237769, |
| "eval_mse_loss": 0.8519824494237769, |
| "eval_runtime": 37.6235, |
| "eval_samples_per_second": 744.029, |
| "eval_steps_per_second": 11.642, |
| "flow/cos_sim": 0.718847086304399, |
| "flow/improvement_ratio": 0.47086378285601804, |
| "flow/mag_ratio_mean": 0.7225797281145505, |
| "flow/mag_ratio_std": 0.19704299981463447, |
| "step": 4096 |
| }, |
| { |
| "epoch": 0.20100688189921945, |
| "grad_norm": 0.24068038165569305, |
| "learning_rate": 0.0009208056308063659, |
| "loss": 0.885456383228302, |
| "step": 4352 |
| }, |
| { |
| "epoch": 0.2128308161285853, |
| "grad_norm": 0.33502769470214844, |
| "learning_rate": 0.0009102296648873445, |
| "loss": 0.8807857632637024, |
| "step": 4608 |
| }, |
| { |
| "epoch": 0.22465475035795113, |
| "grad_norm": 0.176764577627182, |
| "learning_rate": 0.0008990599737794927, |
| "loss": 0.8787178993225098, |
| "step": 4864 |
| }, |
| { |
| "epoch": 0.236478684587317, |
| "grad_norm": 0.1856887936592102, |
| "learning_rate": 0.0008873127233711644, |
| "loss": 0.8718249797821045, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.236478684587317, |
| "eval_cos_loss": 0.2779194979022627, |
| "eval_loss": 0.8412302969253227, |
| "eval_mse_loss": 0.8412302969253227, |
| "flow/cos_sim": 0.7220805322742898, |
| "flow/improvement_ratio": 0.4709661888749632, |
| "flow/mag_ratio_mean": 0.7200435751105008, |
| "flow/mag_ratio_std": 0.19833092303036554, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.236478684587317, |
| "eval_cos_loss": 0.2779194979022627, |
| "eval_loss": 0.8412302969253227, |
| "eval_mse_loss": 0.8412302969253227, |
| "eval_runtime": 37.8514, |
| "eval_samples_per_second": 739.549, |
| "eval_steps_per_second": 11.572, |
| "flow/cos_sim": 0.7220805322742898, |
| "flow/improvement_ratio": 0.4709661888749632, |
| "flow/mag_ratio_mean": 0.7200435751105008, |
| "flow/mag_ratio_std": 0.19833092303036554, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.24830261881668284, |
| "grad_norm": 0.315166711807251, |
| "learning_rate": 0.0008750049154520011, |
| "loss": 0.8694944977760315, |
| "step": 5376 |
| }, |
| { |
| "epoch": 0.2601265530460487, |
| "grad_norm": 0.6202583909034729, |
| "learning_rate": 0.0008621543631062487, |
| "loss": 0.8698850870132446, |
| "step": 5632 |
| }, |
| { |
| "epoch": 0.27195048727541454, |
| "grad_norm": 0.154885396361351, |
| "learning_rate": 0.0008487796649318904, |
| "loss": 0.866125762462616, |
| "step": 5888 |
| }, |
| { |
| "epoch": 0.2837744215047804, |
| "grad_norm": 0.1593707948923111, |
| "learning_rate": 0.0008349001781229053, |
| "loss": 0.8656928539276123, |
| "step": 6144 |
| }, |
| { |
| "epoch": 0.2837744215047804, |
| "eval_cos_loss": 0.27761552666556344, |
| "eval_loss": 0.8367355261763482, |
| "eval_mse_loss": 0.8367355261763482, |
| "flow/cos_sim": 0.7223844902428318, |
| "flow/improvement_ratio": 0.47543454585282224, |
| "flow/mag_ratio_mean": 0.7271437070685435, |
| "flow/mag_ratio_std": 0.2028201749063518, |
| "step": 6144 |
| }, |
| { |
| "epoch": 0.2837744215047804, |
| "eval_cos_loss": 0.27761552666556344, |
| "eval_loss": 0.8367355261763482, |
| "eval_mse_loss": 0.8367355261763482, |
| "eval_runtime": 37.5912, |
| "eval_samples_per_second": 744.669, |
| "eval_steps_per_second": 11.652, |
| "flow/cos_sim": 0.7223844902428318, |
| "flow/improvement_ratio": 0.47543454585282224, |
| "flow/mag_ratio_mean": 0.7271437070685435, |
| "flow/mag_ratio_std": 0.2028201749063518, |
| "step": 6144 |
| }, |
| { |
| "epoch": 0.2955983557341462, |
| "grad_norm": 0.24749045073986053, |
| "learning_rate": 0.0008205359904536107, |
| "loss": 0.8599100112915039, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.30742228996351206, |
| "grad_norm": 0.4366367757320404, |
| "learning_rate": 0.0008057078912056363, |
| "loss": 0.8599902987480164, |
| "step": 6656 |
| }, |
| { |
| "epoch": 0.3192462241928779, |
| "grad_norm": 0.2574009597301483, |
| "learning_rate": 0.0007904373410796086, |
| "loss": 0.8590140342712402, |
| "step": 6912 |
| }, |
| { |
| "epoch": 0.3310701584222438, |
| "grad_norm": 0.24796369671821594, |
| "learning_rate": 0.0007747464411350876, |
| "loss": 0.8581823110580444, |
| "step": 7168 |
| }, |
| { |
| "epoch": 0.3310701584222438, |
| "eval_cos_loss": 0.27323219904752627, |
| "eval_loss": 0.8229639839091801, |
| "eval_mse_loss": 0.8229639839091801, |
| "flow/cos_sim": 0.7267678196299566, |
| "flow/improvement_ratio": 0.4771028495814702, |
| "flow/mag_ratio_mean": 0.7125569941246346, |
| "flow/mag_ratio_std": 0.20383604319944773, |
| "step": 7168 |
| }, |
| { |
| "epoch": 0.3310701584222438, |
| "eval_cos_loss": 0.27323219904752627, |
| "eval_loss": 0.8229639839091801, |
| "eval_mse_loss": 0.8229639839091801, |
| "eval_runtime": 37.8121, |
| "eval_samples_per_second": 740.319, |
| "eval_steps_per_second": 11.584, |
| "flow/cos_sim": 0.7267678196299566, |
| "flow/improvement_ratio": 0.4771028495814702, |
| "flow/mag_ratio_mean": 0.7125569941246346, |
| "flow/mag_ratio_std": 0.20383604319944773, |
| "step": 7168 |
| }, |
| { |
| "epoch": 0.34289409265160964, |
| "grad_norm": 0.7159921526908875, |
| "learning_rate": 0.000758657900803716, |
| "loss": 0.858252763748169, |
| "step": 7424 |
| }, |
| { |
| "epoch": 0.3547180268809755, |
| "grad_norm": 0.6132074594497681, |
| "learning_rate": 0.000742195005021869, |
| "loss": 0.8558468818664551, |
| "step": 7680 |
| }, |
| { |
| "epoch": 0.3665419611103413, |
| "grad_norm": 0.18163549900054932, |
| "learning_rate": 0.0007253815805303786, |
| "loss": 0.85396808385849, |
| "step": 7936 |
| }, |
| { |
| "epoch": 0.37836589533970716, |
| "grad_norm": 0.572221040725708, |
| "learning_rate": 0.0007082419613901028, |
| "loss": 0.8530430197715759, |
| "step": 8192 |
| }, |
| { |
| "epoch": 0.37836589533970716, |
| "eval_cos_loss": 0.2715048284427216, |
| "eval_loss": 0.8155939645691005, |
| "eval_mse_loss": 0.8155939645691005, |
| "flow/cos_sim": 0.7284952074153238, |
| "flow/improvement_ratio": 0.47482473080016707, |
| "flow/mag_ratio_mean": 0.724112270765653, |
| "flow/mag_ratio_std": 0.2020510737210104, |
| "step": 8192 |
| }, |
| { |
| "epoch": 0.37836589533970716, |
| "eval_cos_loss": 0.2715048284427216, |
| "eval_loss": 0.8155939645691005, |
| "eval_mse_loss": 0.8155939645691005, |
| "eval_runtime": 37.7077, |
| "eval_samples_per_second": 742.367, |
| "eval_steps_per_second": 11.616, |
| "flow/cos_sim": 0.7284952074153238, |
| "flow/improvement_ratio": 0.47482473080016707, |
| "flow/mag_ratio_mean": 0.724112270765653, |
| "flow/mag_ratio_std": 0.2020510737210104, |
| "step": 8192 |
| }, |
| { |
| "epoch": 0.390189829569073, |
| "grad_norm": 0.2026418000459671, |
| "learning_rate": 0.0006908009537632514, |
| "loss": 0.8493704199790955, |
| "step": 8448 |
| }, |
| { |
| "epoch": 0.4020137637984389, |
| "grad_norm": 0.5044918656349182, |
| "learning_rate": 0.0006730838000114403, |
| "loss": 0.8506228923797607, |
| "step": 8704 |
| }, |
| { |
| "epoch": 0.41383769802780473, |
| "grad_norm": 0.2458736002445221, |
| "learning_rate": 0.0006551161421624341, |
| "loss": 0.8482614159584045, |
| "step": 8960 |
| }, |
| { |
| "epoch": 0.4256616322571706, |
| "grad_norm": 0.7097583413124084, |
| "learning_rate": 0.0006369239847984517, |
| "loss": 0.846942126750946, |
| "step": 9216 |
| }, |
| { |
| "epoch": 0.4256616322571706, |
| "eval_cos_loss": 0.27180098626575516, |
| "eval_loss": 0.8161548636003172, |
| "eval_mse_loss": 0.8161548636003172, |
| "flow/cos_sim": 0.7281990317993512, |
| "flow/improvement_ratio": 0.4732581986003815, |
| "flow/mag_ratio_mean": 0.7231416398803937, |
| "flow/mag_ratio_std": 0.201789186425405, |
| "step": 9216 |
| }, |
| { |
| "epoch": 0.4256616322571706, |
| "eval_cos_loss": 0.27180098626575516, |
| "eval_loss": 0.8161548636003172, |
| "eval_mse_loss": 0.8161548636003172, |
| "eval_runtime": 37.5698, |
| "eval_samples_per_second": 745.094, |
| "eval_steps_per_second": 11.658, |
| "flow/cos_sim": 0.7281990317993512, |
| "flow/improvement_ratio": 0.4732581986003815, |
| "flow/mag_ratio_mean": 0.7231416398803937, |
| "flow/mag_ratio_std": 0.201789186425405, |
| "step": 9216 |
| }, |
| { |
| "epoch": 0.4374855664865364, |
| "grad_norm": 0.3167099952697754, |
| "learning_rate": 0.0006185336574197479, |
| "loss": 0.8480910062789917, |
| "step": 9472 |
| }, |
| { |
| "epoch": 0.44930950071590225, |
| "grad_norm": 0.4810751974582672, |
| "learning_rate": 0.0005999717763379407, |
| "loss": 0.8465522527694702, |
| "step": 9728 |
| }, |
| { |
| "epoch": 0.4611334349452681, |
| "grad_norm": 0.2259974628686905, |
| "learning_rate": 0.0005812652061542363, |
| "loss": 0.844083309173584, |
| "step": 9984 |
| }, |
| { |
| "epoch": 0.472957369174634, |
| "grad_norm": 0.6505718231201172, |
| "learning_rate": 0.0005624410208783071, |
| "loss": 0.8436377644538879, |
| "step": 10240 |
| }, |
| { |
| "epoch": 0.472957369174634, |
| "eval_cos_loss": 0.2699208431047936, |
| "eval_loss": 0.808695926922097, |
| "eval_mse_loss": 0.808695926922097, |
| "flow/cos_sim": 0.7300791967677199, |
| "flow/improvement_ratio": 0.47688411740951886, |
| "flow/mag_ratio_mean": 0.7248269360359401, |
| "flow/mag_ratio_std": 0.20361059002544238, |
| "step": 10240 |
| }, |
| { |
| "epoch": 0.472957369174634, |
| "eval_cos_loss": 0.2699208431047936, |
| "eval_loss": 0.808695926922097, |
| "eval_mse_loss": 0.808695926922097, |
| "eval_runtime": 37.5891, |
| "eval_samples_per_second": 744.71, |
| "eval_steps_per_second": 11.652, |
| "flow/cos_sim": 0.7300791967677199, |
| "flow/improvement_ratio": 0.47688411740951886, |
| "flow/mag_ratio_mean": 0.7248269360359401, |
| "flow/mag_ratio_std": 0.20361059002544238, |
| "step": 10240 |
| }, |
| { |
| "epoch": 0.48478130340399983, |
| "grad_norm": 0.29194557666778564, |
| "learning_rate": 0.0005435264647440881, |
| "loss": 0.8419358730316162, |
| "step": 10496 |
| }, |
| { |
| "epoch": 0.49660523763336567, |
| "grad_norm": 0.20549535751342773, |
| "learning_rate": 0.000524548912779213, |
| "loss": 0.8400572538375854, |
| "step": 10752 |
| }, |
| { |
| "epoch": 0.5084291718627315, |
| "grad_norm": 0.7953479290008545, |
| "learning_rate": 0.0005055358311851499, |
| "loss": 0.8401579260826111, |
| "step": 11008 |
| }, |
| { |
| "epoch": 0.5202531060920974, |
| "grad_norm": 0.14846356213092804, |
| "learning_rate": 0.0004865147375853812, |
| "loss": 0.840056836605072, |
| "step": 11264 |
| }, |
| { |
| "epoch": 0.5202531060920974, |
| "eval_cos_loss": 0.2704421652642559, |
| "eval_loss": 0.8102181156476339, |
| "eval_mse_loss": 0.8102181156476339, |
| "flow/cos_sim": 0.7295578481399849, |
| "flow/improvement_ratio": 0.4786316101273445, |
| "flow/mag_ratio_mean": 0.730471750780872, |
| "flow/mag_ratio_std": 0.2050654717368078, |
| "step": 11264 |
| }, |
| { |
| "epoch": 0.5202531060920974, |
| "eval_cos_loss": 0.2704421652642559, |
| "eval_loss": 0.8102181156476339, |
| "eval_mse_loss": 0.8102181156476339, |
| "eval_runtime": 37.702, |
| "eval_samples_per_second": 742.48, |
| "eval_steps_per_second": 11.617, |
| "flow/cos_sim": 0.7295578481399849, |
| "flow/improvement_ratio": 0.4786316101273445, |
| "flow/mag_ratio_mean": 0.730471750780872, |
| "flow/mag_ratio_std": 0.2050654717368078, |
| "step": 11264 |
| }, |
| { |
| "epoch": 0.5320770403214632, |
| "grad_norm": 0.49196043610572815, |
| "learning_rate": 0.0004675131611991607, |
| "loss": 0.8376814723014832, |
| "step": 11520 |
| }, |
| { |
| "epoch": 0.5439009745508291, |
| "grad_norm": 0.3101634383201599, |
| "learning_rate": 0.0004485586029984899, |
| "loss": 0.8409138321876526, |
| "step": 11776 |
| }, |
| { |
| "epoch": 0.5557249087801949, |
| "grad_norm": 0.6860947608947754, |
| "learning_rate": 0.00042967849590597266, |
| "loss": 0.8401282429695129, |
| "step": 12032 |
| }, |
| { |
| "epoch": 0.5675488430095608, |
| "grad_norm": 0.5976177453994751, |
| "learning_rate": 0.0004109001650911621, |
| "loss": 0.8376214504241943, |
| "step": 12288 |
| }, |
| { |
| "epoch": 0.5675488430095608, |
| "eval_cos_loss": 0.2699217140742633, |
| "eval_loss": 0.8073501400479443, |
| "eval_mse_loss": 0.8073501400479443, |
| "flow/cos_sim": 0.7300783122239047, |
| "flow/improvement_ratio": 0.4786103154154129, |
| "flow/mag_ratio_mean": 0.7241527468102164, |
| "flow/mag_ratio_std": 0.20363660513946455, |
| "step": 12288 |
| }, |
| { |
| "epoch": 0.5675488430095608, |
| "eval_cos_loss": 0.2699217140742633, |
| "eval_loss": 0.8073501400479443, |
| "eval_mse_loss": 0.8073501400479443, |
| "eval_runtime": 37.7433, |
| "eval_samples_per_second": 741.668, |
| "eval_steps_per_second": 11.605, |
| "flow/cos_sim": 0.7300783122239047, |
| "flow/improvement_ratio": 0.4786103154154129, |
| "flow/mag_ratio_mean": 0.7241527468102164, |
| "flow/mag_ratio_std": 0.20363660513946455, |
| "step": 12288 |
| }, |
| { |
| "epoch": 0.5793727772389267, |
| "grad_norm": 0.19840270280838013, |
| "learning_rate": 0.0003922507884228551, |
| "loss": 0.8377624154090881, |
| "step": 12544 |
| }, |
| { |
| "epoch": 0.5911967114682924, |
| "grad_norm": 0.6053916215896606, |
| "learning_rate": 0.00037375735713457723, |
| "loss": 0.838446855545044, |
| "step": 12800 |
| }, |
| { |
| "epoch": 0.6030206456976583, |
| "grad_norm": 0.1777360439300537, |
| "learning_rate": 0.00035544663676018276, |
| "loss": 0.8392548561096191, |
| "step": 13056 |
| }, |
| { |
| "epoch": 0.6148445799270241, |
| "grad_norm": 0.2625690698623657, |
| "learning_rate": 0.00033734512839611255, |
| "loss": 0.8352001309394836, |
| "step": 13312 |
| }, |
| { |
| "epoch": 0.6148445799270241, |
| "eval_cos_loss": 0.2678832187568216, |
| "eval_loss": 0.8018131276516065, |
| "eval_mse_loss": 0.8018131276516065, |
| "flow/cos_sim": 0.732116795565984, |
| "flow/improvement_ratio": 0.47306563096231524, |
| "flow/mag_ratio_mean": 0.730011395259535, |
| "flow/mag_ratio_std": 0.20525332758007528, |
| "step": 13312 |
| }, |
| { |
| "epoch": 0.6148445799270241, |
| "eval_cos_loss": 0.2678832187568216, |
| "eval_loss": 0.8018131276516065, |
| "eval_mse_loss": 0.8018131276516065, |
| "eval_runtime": 37.6397, |
| "eval_samples_per_second": 743.71, |
| "eval_steps_per_second": 11.637, |
| "flow/cos_sim": 0.732116795565984, |
| "flow/improvement_ratio": 0.47306563096231524, |
| "flow/mag_ratio_mean": 0.730011395259535, |
| "flow/mag_ratio_std": 0.20525332758007528, |
| "step": 13312 |
| }, |
| { |
| "epoch": 0.62666851415639, |
| "grad_norm": 0.5689898729324341, |
| "learning_rate": 0.0003194790303463687, |
| "loss": 0.833480954170227, |
| "step": 13568 |
| }, |
| { |
| "epoch": 0.6384924483857558, |
| "grad_norm": 0.28950658440589905, |
| "learning_rate": 0.00030187420020572406, |
| "loss": 0.8348052501678467, |
| "step": 13824 |
| }, |
| { |
| "epoch": 0.6503163826151217, |
| "grad_norm": 0.15908953547477722, |
| "learning_rate": 0.00028455611743603626, |
| "loss": 0.834086000919342, |
| "step": 14080 |
| }, |
| { |
| "epoch": 0.6621403168444876, |
| "grad_norm": 0.15000976622104645, |
| "learning_rate": 0.0002675498464898373, |
| "loss": 0.8325310945510864, |
| "step": 14336 |
| }, |
| { |
| "epoch": 0.6621403168444876, |
| "eval_cos_loss": 0.2677479595898493, |
| "eval_loss": 0.8002720110492619, |
| "eval_mse_loss": 0.8002720110492619, |
| "flow/cos_sim": 0.73225207307023, |
| "flow/improvement_ratio": 0.474588285298108, |
| "flow/mag_ratio_mean": 0.732072616140592, |
| "flow/mag_ratio_std": 0.20331306650077915, |
| "step": 14336 |
| }, |
| { |
| "epoch": 0.6621403168444876, |
| "eval_cos_loss": 0.2677479595898493, |
| "eval_loss": 0.8002720110492619, |
| "eval_mse_loss": 0.8002720110492619, |
| "eval_runtime": 37.36, |
| "eval_samples_per_second": 749.278, |
| "eval_steps_per_second": 11.724, |
| "flow/cos_sim": 0.73225207307023, |
| "flow/improvement_ratio": 0.474588285298108, |
| "flow/mag_ratio_mean": 0.732072616140592, |
| "flow/mag_ratio_std": 0.20331306650077915, |
| "step": 14336 |
| }, |
| { |
| "epoch": 0.6739642510738534, |
| "grad_norm": 0.2764960825443268, |
| "learning_rate": 0.0002508800005345623, |
| "loss": 0.8334859609603882, |
| "step": 14592 |
| }, |
| { |
| "epoch": 0.6857881853032193, |
| "grad_norm": 0.519814670085907, |
| "learning_rate": 0.00023457070582992562, |
| "loss": 0.8344730138778687, |
| "step": 14848 |
| }, |
| { |
| "epoch": 0.6976121195325851, |
| "grad_norm": 0.22350260615348816, |
| "learning_rate": 0.00021864556680999692, |
| "loss": 0.8335193395614624, |
| "step": 15104 |
| }, |
| { |
| "epoch": 0.709436053761951, |
| "grad_norm": 0.128391832113266, |
| "learning_rate": 0.0002031276319205152, |
| "loss": 0.8319519758224487, |
| "step": 15360 |
| }, |
| { |
| "epoch": 0.709436053761951, |
| "eval_cos_loss": 0.26936231590977544, |
| "eval_loss": 0.803815519292605, |
| "eval_mse_loss": 0.803815519292605, |
| "flow/cos_sim": 0.7306376860294168, |
| "flow/improvement_ratio": 0.4827857036568803, |
| "flow/mag_ratio_mean": 0.7272341569809064, |
| "flow/mag_ratio_std": 0.20492048452706096, |
| "step": 15360 |
| }, |
| { |
| "epoch": 0.709436053761951, |
| "eval_cos_loss": 0.26936231590977544, |
| "eval_loss": 0.803815519292605, |
| "eval_mse_loss": 0.803815519292605, |
| "eval_runtime": 37.8308, |
| "eval_samples_per_second": 739.952, |
| "eval_steps_per_second": 11.578, |
| "flow/cos_sim": 0.7306376860294168, |
| "flow/improvement_ratio": 0.4827857036568803, |
| "flow/mag_ratio_mean": 0.7272341569809064, |
| "flow/mag_ratio_std": 0.20492048452706096, |
| "step": 15360 |
| }, |
| { |
| "epoch": 0.7212599879913169, |
| "grad_norm": 0.492816299200058, |
| "learning_rate": 0.00018803936026088542, |
| "loss": 0.8342287540435791, |
| "step": 15616 |
| }, |
| { |
| "epoch": 0.7330839222206826, |
| "grad_norm": 0.1290881633758545, |
| "learning_rate": 0.00017340258907913464, |
| "loss": 0.8337975740432739, |
| "step": 15872 |
| }, |
| { |
| "epoch": 0.7449078564500485, |
| "grad_norm": 0.16791236400604248, |
| "learning_rate": 0.0001592385021668743, |
| "loss": 0.8301421403884888, |
| "step": 16128 |
| }, |
| { |
| "epoch": 0.7567317906794143, |
| "grad_norm": 0.21357402205467224, |
| "learning_rate": 0.0001455675992000087, |
| "loss": 0.830890953540802, |
| "step": 16384 |
| }, |
| { |
| "epoch": 0.7567317906794143, |
| "eval_cos_loss": 0.27139199657799445, |
| "eval_loss": 0.811649158392867, |
| "eval_mse_loss": 0.811649158392867, |
| "flow/cos_sim": 0.7286080298902782, |
| "flow/improvement_ratio": 0.4760344069298, |
| "flow/mag_ratio_mean": 0.733996944204313, |
| "flow/mag_ratio_std": 0.20226374339975722, |
| "step": 16384 |
| }, |
| { |
| "epoch": 0.7567317906794143, |
| "eval_cos_loss": 0.27139199657799445, |
| "eval_loss": 0.811649158392867, |
| "eval_mse_loss": 0.811649158392867, |
| "eval_runtime": 37.4193, |
| "eval_samples_per_second": 748.089, |
| "eval_steps_per_second": 11.705, |
| "flow/cos_sim": 0.7286080298902782, |
| "flow/improvement_ratio": 0.4760344069298, |
| "flow/mag_ratio_mean": 0.733996944204313, |
| "flow/mag_ratio_std": 0.20226374339975722, |
| "step": 16384 |
| }, |
| { |
| "epoch": 0.7685557249087802, |
| "grad_norm": 0.14695969223976135, |
| "learning_rate": 0.000132409666069565, |
| "loss": 0.8289425373077393, |
| "step": 16640 |
| }, |
| { |
| "epoch": 0.780379659138146, |
| "grad_norm": 0.2218649983406067, |
| "learning_rate": 0.0001197837462455823, |
| "loss": 0.829971432685852, |
| "step": 16896 |
| }, |
| { |
| "epoch": 0.7922035933675119, |
| "grad_norm": 0.13823017477989197, |
| "learning_rate": 0.00010770811321550749, |
| "loss": 0.831263542175293, |
| "step": 17152 |
| }, |
| { |
| "epoch": 0.8040275275968778, |
| "grad_norm": 0.1739463061094284, |
| "learning_rate": 9.620024403698591e-05, |
| "loss": 0.8309124112129211, |
| "step": 17408 |
| }, |
| { |
| "epoch": 0.8040275275968778, |
| "eval_cos_loss": 0.2683021985164516, |
| "eval_loss": 0.8020191139554325, |
| "eval_mse_loss": 0.8020191139554325, |
| "flow/cos_sim": 0.731697819820822, |
| "flow/improvement_ratio": 0.4752517786200188, |
| "flow/mag_ratio_mean": 0.7306965604492518, |
| "flow/mag_ratio_std": 0.20605108883555076, |
| "step": 17408 |
| }, |
| { |
| "epoch": 0.8040275275968778, |
| "eval_cos_loss": 0.2683021985164516, |
| "eval_loss": 0.8020191139554325, |
| "eval_mse_loss": 0.8020191139554325, |
| "eval_runtime": 37.718, |
| "eval_samples_per_second": 742.166, |
| "eval_steps_per_second": 11.613, |
| "flow/cos_sim": 0.731697819820822, |
| "flow/improvement_ratio": 0.4752517786200188, |
| "flow/mag_ratio_mean": 0.7306965604492518, |
| "flow/mag_ratio_std": 0.20605108883555076, |
| "step": 17408 |
| }, |
| { |
| "epoch": 0.8158514618262436, |
| "grad_norm": 0.26471275091171265, |
| "learning_rate": 8.527679404332429e-05, |
| "loss": 0.831107497215271, |
| "step": 17664 |
| }, |
| { |
| "epoch": 0.8276753960556095, |
| "grad_norm": 0.11510410159826279, |
| "learning_rate": 7.495357273823544e-05, |
| "loss": 0.8281733393669128, |
| "step": 17920 |
| }, |
| { |
| "epoch": 0.8394993302849753, |
| "grad_norm": 0.29159778356552124, |
| "learning_rate": 6.524552091475183e-05, |
| "loss": 0.8288251161575317, |
| "step": 18176 |
| }, |
| { |
| "epoch": 0.8513232645143411, |
| "grad_norm": 0.17007242143154144, |
| "learning_rate": 5.6166689031422024e-05, |
| "loss": 0.8286168575286865, |
| "step": 18432 |
| }, |
| { |
| "epoch": 0.8513232645143411, |
| "eval_cos_loss": 0.2690970574475859, |
| "eval_loss": 0.8030253814780004, |
| "eval_mse_loss": 0.8030253814780004, |
| "flow/cos_sim": 0.7309029475194678, |
| "flow/improvement_ratio": 0.47304362157300184, |
| "flow/mag_ratio_mean": 0.7320575154807469, |
| "flow/mag_ratio_std": 0.20494019774269295, |
| "step": 18432 |
| }, |
| { |
| "epoch": 0.8513232645143411, |
| "eval_cos_loss": 0.2690970574475859, |
| "eval_loss": 0.8030253814780004, |
| "eval_mse_loss": 0.8030253814780004, |
| "eval_runtime": 38.0297, |
| "eval_samples_per_second": 736.082, |
| "eval_steps_per_second": 11.517, |
| "flow/cos_sim": 0.7309029475194678, |
| "flow/improvement_ratio": 0.47304362157300184, |
| "flow/mag_ratio_mean": 0.7320575154807469, |
| "flow/mag_ratio_std": 0.20494019774269295, |
| "step": 18432 |
| }, |
| { |
| "epoch": 0.8631471987437069, |
| "grad_norm": 0.1886417269706726, |
| "learning_rate": 4.773021687709067e-05, |
| "loss": 0.8288365006446838, |
| "step": 18688 |
| }, |
| { |
| "epoch": 0.8749711329730728, |
| "grad_norm": 0.3300864100456238, |
| "learning_rate": 3.994831455368719e-05, |
| "loss": 0.8259992599487305, |
| "step": 18944 |
| }, |
| { |
| "epoch": 0.8867950672024387, |
| "grad_norm": 0.1487259566783905, |
| "learning_rate": 3.283224480455282e-05, |
| "loss": 0.8295122981071472, |
| "step": 19200 |
| }, |
| { |
| "epoch": 0.8986190014318045, |
| "grad_norm": 0.15876281261444092, |
| "learning_rate": 2.639230671387627e-05, |
| "loss": 0.8303685784339905, |
| "step": 19456 |
| }, |
| { |
| "epoch": 0.8986190014318045, |
| "eval_cos_loss": 0.26947175830466563, |
| "eval_loss": 0.8027528178746297, |
| "eval_mse_loss": 0.8027528178746297, |
| "flow/cos_sim": 0.7305282521737765, |
| "flow/improvement_ratio": 0.47920923064288484, |
| "flow/mag_ratio_mean": 0.7306106731499711, |
| "flow/mag_ratio_std": 0.20609150707857793, |
| "step": 19456 |
| }, |
| { |
| "epoch": 0.8986190014318045, |
| "eval_cos_loss": 0.26947175830466563, |
| "eval_loss": 0.8027528178746297, |
| "eval_mse_loss": 0.8027528178746297, |
| "eval_runtime": 38.5004, |
| "eval_samples_per_second": 727.083, |
| "eval_steps_per_second": 11.376, |
| "flow/cos_sim": 0.7305282521737765, |
| "flow/improvement_ratio": 0.47920923064288484, |
| "flow/mag_ratio_mean": 0.7306106731499711, |
| "flow/mag_ratio_std": 0.20609150707857793, |
| "step": 19456 |
| }, |
| { |
| "epoch": 0.9104429356611704, |
| "grad_norm": 0.12024150043725967, |
| "learning_rate": 2.063782080083576e-05, |
| "loss": 0.8279772996902466, |
| "step": 19712 |
| }, |
| { |
| "epoch": 0.9222668698905362, |
| "grad_norm": 0.39260947704315186, |
| "learning_rate": 1.557711553001523e-05, |
| "loss": 0.8271908760070801, |
| "step": 19968 |
| }, |
| { |
| "epoch": 0.9340908041199021, |
| "grad_norm": 0.13970452547073364, |
| "learning_rate": 1.1217515257622269e-05, |
| "loss": 0.8309769034385681, |
| "step": 20224 |
| }, |
| { |
| "epoch": 0.945914738349268, |
| "grad_norm": 0.11275673657655716, |
| "learning_rate": 7.565329630950746e-06, |
| "loss": 0.8321537971496582, |
| "step": 20480 |
| }, |
| { |
| "epoch": 0.945914738349268, |
| "eval_cos_loss": 0.26695949672700064, |
| "eval_loss": 0.797557680165931, |
| "eval_mse_loss": 0.797557680165931, |
| "flow/cos_sim": 0.7330405146019644, |
| "flow/improvement_ratio": 0.4724381293880341, |
| "flow/mag_ratio_mean": 0.7334565095705529, |
| "flow/mag_ratio_std": 0.20568558155265573, |
| "step": 20480 |
| }, |
| { |
| "epoch": 0.945914738349268, |
| "eval_cos_loss": 0.26695949672700064, |
| "eval_loss": 0.797557680165931, |
| "eval_mse_loss": 0.797557680165931, |
| "eval_runtime": 37.6307, |
| "eval_samples_per_second": 743.886, |
| "eval_steps_per_second": 11.639, |
| "flow/cos_sim": 0.7330405146019644, |
| "flow/improvement_ratio": 0.4724381293880341, |
| "flow/mag_ratio_mean": 0.7334565095705529, |
| "flow/mag_ratio_std": 0.20568558155265573, |
| "step": 20480 |
| }, |
| { |
| "epoch": 0.9577386725786338, |
| "grad_norm": 0.13636957108974457, |
| "learning_rate": 4.62584445643166e-06, |
| "loss": 0.8281410932540894, |
| "step": 20736 |
| }, |
| { |
| "epoch": 0.9695626068079997, |
| "grad_norm": 0.16119834780693054, |
| "learning_rate": 2.40331404948807e-06, |
| "loss": 0.8287432789802551, |
| "step": 20992 |
| }, |
| { |
| "epoch": 0.9813865410373654, |
| "grad_norm": 0.17925769090652466, |
| "learning_rate": 9.009550772663965e-07, |
| "loss": 0.8295612335205078, |
| "step": 21248 |
| }, |
| { |
| "epoch": 0.9932104752667313, |
| "grad_norm": 0.09990924596786499, |
| "learning_rate": 1.2094190315575791e-07, |
| "loss": 0.8297268152236938, |
| "step": 21504 |
| }, |
| { |
| "epoch": 0.9932104752667313, |
| "eval_cos_loss": 0.2675227834481627, |
| "eval_loss": 0.7975949161945413, |
| "eval_mse_loss": 0.7975949161945413, |
| "flow/cos_sim": 0.7324772468984944, |
| "flow/improvement_ratio": 0.4809445963330465, |
| "flow/mag_ratio_mean": 0.7317501669877196, |
| "flow/mag_ratio_std": 0.2055224990599776, |
| "step": 21504 |
| }, |
| { |
| "epoch": 0.9932104752667313, |
| "eval_cos_loss": 0.2675227834481627, |
| "eval_loss": 0.7975949161945413, |
| "eval_mse_loss": 0.7975949161945413, |
| "eval_runtime": 38.0323, |
| "eval_samples_per_second": 736.032, |
| "eval_steps_per_second": 11.517, |
| "flow/cos_sim": 0.7324772468984944, |
| "flow/improvement_ratio": 0.4809445963330465, |
| "flow/mag_ratio_mean": 0.7317501669877196, |
| "flow/mag_ratio_std": 0.2055224990599776, |
| "step": 21504 |
| } |
| ], |
| "logging_steps": 256, |
| "max_steps": 21651, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 1024, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 64, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|