{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6148445799270241, "eval_steps": 1024, "global_step": 13312, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011823934229365849, "grad_norm": 0.17750050127506256, "learning_rate": 0.000498046875, "loss": 1.7531359195709229, "step": 256 }, { "epoch": 0.023647868458731697, "grad_norm": 0.14886082708835602, "learning_rate": 0.000998046875, "loss": 1.120242953300476, "step": 512 }, { "epoch": 0.03547180268809755, "grad_norm": 0.11476853489875793, "learning_rate": 0.000999640996023194, "loss": 1.0460094213485718, "step": 768 }, { "epoch": 0.047295736917463395, "grad_norm": 0.26101624965667725, "learning_rate": 0.0009985588674043958, "loss": 1.0132286548614502, "step": 1024 }, { "epoch": 0.047295736917463395, "eval_cos_loss": 0.3147891739459887, "eval_loss": 0.9696788661283989, "eval_mse_loss": 0.9696788661283989, "flow/cos_sim": 0.6852108316334415, "flow/improvement_ratio": 0.4729253878332164, "flow/mag_ratio_mean": 0.6834825821663146, "flow/mag_ratio_std": 0.18169553397589078, "step": 1024 }, { "epoch": 0.047295736917463395, "eval_cos_loss": 0.3147891739459887, "eval_loss": 0.9696788661283989, "eval_mse_loss": 0.9696788661283989, "eval_runtime": 37.5026, "eval_samples_per_second": 746.427, "eval_steps_per_second": 11.679, "flow/cos_sim": 0.6852108316334415, "flow/improvement_ratio": 0.4729253878332164, "flow/mag_ratio_mean": 0.6834825821663146, "flow/mag_ratio_std": 0.18169553397589078, "step": 1024 }, { "epoch": 0.05911967114682925, "grad_norm": 0.32471492886543274, "learning_rate": 0.0009967551747861387, "loss": 0.9918397068977356, "step": 1280 }, { "epoch": 0.0709436053761951, "grad_norm": 0.19580507278442383, "learning_rate": 0.000994232528651847, "loss": 0.9722790718078613, "step": 1536 }, { "epoch": 0.08276753960556095, "grad_norm": 0.13594156503677368, "learning_rate": 0.0009909945800260092, "loss": 0.9547147750854492, "step": 1792 }, { "epoch": 0.09459147383492679, "grad_norm": 0.49703535437583923, "learning_rate": 0.0009870460151900522, "loss": 0.9414160847663879, "step": 2048 }, { "epoch": 0.09459147383492679, "eval_cos_loss": 0.2932011633430986, "eval_loss": 0.8981472950547797, "eval_mse_loss": 0.8981472950547797, "flow/cos_sim": 0.7067988623767139, "flow/improvement_ratio": 0.47438520363204556, "flow/mag_ratio_mean": 0.6956081727868346, "flow/mag_ratio_std": 0.19316728430115468, "step": 2048 }, { "epoch": 0.09459147383492679, "eval_cos_loss": 0.2932011633430986, "eval_loss": 0.8981472950547797, "eval_mse_loss": 0.8981472950547797, "eval_runtime": 37.712, "eval_samples_per_second": 742.284, "eval_steps_per_second": 11.614, "flow/cos_sim": 0.7067988623767139, "flow/improvement_ratio": 0.47438520363204556, "flow/mag_ratio_mean": 0.6956081727868346, "flow/mag_ratio_std": 0.19316728430115468, "step": 2048 }, { "epoch": 0.10641540806429264, "grad_norm": 0.49009642004966736, "learning_rate": 0.0009823925488998885, "loss": 0.929905116558075, "step": 2304 }, { "epoch": 0.1182393422936585, "grad_norm": 0.21833102405071259, "learning_rate": 0.0009770409161149525, "loss": 0.9224135875701904, "step": 2560 }, { "epoch": 0.13006327652302435, "grad_norm": 0.3465236723423004, "learning_rate": 0.0009709988622506973, "loss": 0.9120264649391174, "step": 2816 }, { "epoch": 0.1418872107523902, "grad_norm": 0.3057152032852173, "learning_rate": 0.000964275131968659, "loss": 0.9054349064826965, "step": 3072 }, { "epoch": 0.1418872107523902, "eval_cos_loss": 0.28495094285571954, "eval_loss": 0.8664921860988826, "eval_mse_loss": 0.8664921860988826, "flow/cos_sim": 0.7150490750733032, "flow/improvement_ratio": 0.48076002510715293, "flow/mag_ratio_mean": 0.7135201504785721, "flow/mag_ratio_std": 0.19565439009911392, "step": 3072 }, { "epoch": 0.1418872107523902, "eval_cos_loss": 0.28495094285571954, "eval_loss": 0.8664921860988826, "eval_mse_loss": 0.8664921860988826, "eval_runtime": 38.3291, "eval_samples_per_second": 730.333, "eval_steps_per_second": 11.427, "flow/cos_sim": 0.7150490750733032, "flow/improvement_ratio": 0.48076002510715293, "flow/mag_ratio_mean": 0.7135201504785721, "flow/mag_ratio_std": 0.19565439009911392, "step": 3072 }, { "epoch": 0.15371114498175603, "grad_norm": 0.599277138710022, "learning_rate": 0.0009568794565203123, "loss": 0.9003790020942688, "step": 3328 }, { "epoch": 0.1655350792111219, "grad_norm": 0.6535385251045227, "learning_rate": 0.0009488225396630347, "loss": 0.897746205329895, "step": 3584 }, { "epoch": 0.17735901344048774, "grad_norm": 0.4937800467014313, "learning_rate": 0.0009401160421685646, "loss": 0.8885043859481812, "step": 3840 }, { "epoch": 0.18918294766985358, "grad_norm": 1.3498679399490356, "learning_rate": 0.0009307725649463714, "loss": 0.8906658887863159, "step": 4096 }, { "epoch": 0.18918294766985358, "eval_cos_loss": 0.281152941626773, "eval_loss": 0.8519824494237769, "eval_mse_loss": 0.8519824494237769, "flow/cos_sim": 0.718847086304399, "flow/improvement_ratio": 0.47086378285601804, "flow/mag_ratio_mean": 0.7225797281145505, "flow/mag_ratio_std": 0.19704299981463447, "step": 4096 }, { "epoch": 0.18918294766985358, "eval_cos_loss": 0.281152941626773, "eval_loss": 0.8519824494237769, "eval_mse_loss": 0.8519824494237769, "eval_runtime": 37.6235, "eval_samples_per_second": 744.029, "eval_steps_per_second": 11.642, "flow/cos_sim": 0.718847086304399, "flow/improvement_ratio": 0.47086378285601804, "flow/mag_ratio_mean": 0.7225797281145505, "flow/mag_ratio_std": 0.19704299981463447, "step": 4096 }, { "epoch": 0.20100688189921945, "grad_norm": 0.24068038165569305, "learning_rate": 0.0009208056308063659, "loss": 0.885456383228302, "step": 4352 }, { "epoch": 0.2128308161285853, "grad_norm": 0.33502769470214844, "learning_rate": 0.0009102296648873445, "loss": 0.8807857632637024, "step": 4608 }, { "epoch": 0.22465475035795113, "grad_norm": 0.176764577627182, "learning_rate": 0.0008990599737794927, "loss": 0.8787178993225098, "step": 4864 }, { "epoch": 0.236478684587317, "grad_norm": 0.1856887936592102, "learning_rate": 0.0008873127233711644, "loss": 0.8718249797821045, "step": 5120 }, { "epoch": 0.236478684587317, "eval_cos_loss": 0.2779194979022627, "eval_loss": 0.8412302969253227, "eval_mse_loss": 0.8412302969253227, "flow/cos_sim": 0.7220805322742898, "flow/improvement_ratio": 0.4709661888749632, "flow/mag_ratio_mean": 0.7200435751105008, "flow/mag_ratio_std": 0.19833092303036554, "step": 5120 }, { "epoch": 0.236478684587317, "eval_cos_loss": 0.2779194979022627, "eval_loss": 0.8412302969253227, "eval_mse_loss": 0.8412302969253227, "eval_runtime": 37.8514, "eval_samples_per_second": 739.549, "eval_steps_per_second": 11.572, "flow/cos_sim": 0.7220805322742898, "flow/improvement_ratio": 0.4709661888749632, "flow/mag_ratio_mean": 0.7200435751105008, "flow/mag_ratio_std": 0.19833092303036554, "step": 5120 }, { "epoch": 0.24830261881668284, "grad_norm": 0.315166711807251, "learning_rate": 0.0008750049154520011, "loss": 0.8694944977760315, "step": 5376 }, { "epoch": 0.2601265530460487, "grad_norm": 0.6202583909034729, "learning_rate": 0.0008621543631062487, "loss": 0.8698850870132446, "step": 5632 }, { "epoch": 0.27195048727541454, "grad_norm": 0.154885396361351, "learning_rate": 0.0008487796649318904, "loss": 0.866125762462616, "step": 5888 }, { "epoch": 0.2837744215047804, "grad_norm": 0.1593707948923111, "learning_rate": 0.0008349001781229053, "loss": 0.8656928539276123, "step": 6144 }, { "epoch": 0.2837744215047804, "eval_cos_loss": 0.27761552666556344, "eval_loss": 0.8367355261763482, "eval_mse_loss": 0.8367355261763482, "flow/cos_sim": 0.7223844902428318, "flow/improvement_ratio": 0.47543454585282224, "flow/mag_ratio_mean": 0.7271437070685435, "flow/mag_ratio_std": 0.2028201749063518, "step": 6144 }, { "epoch": 0.2837744215047804, "eval_cos_loss": 0.27761552666556344, "eval_loss": 0.8367355261763482, "eval_mse_loss": 0.8367355261763482, "eval_runtime": 37.5912, "eval_samples_per_second": 744.669, "eval_steps_per_second": 11.652, "flow/cos_sim": 0.7223844902428318, "flow/improvement_ratio": 0.47543454585282224, "flow/mag_ratio_mean": 0.7271437070685435, "flow/mag_ratio_std": 0.2028201749063518, "step": 6144 }, { "epoch": 0.2955983557341462, "grad_norm": 0.24749045073986053, "learning_rate": 0.0008205359904536107, "loss": 0.8599100112915039, "step": 6400 }, { "epoch": 0.30742228996351206, "grad_norm": 0.4366367757320404, "learning_rate": 0.0008057078912056363, "loss": 0.8599902987480164, "step": 6656 }, { "epoch": 0.3192462241928779, "grad_norm": 0.2574009597301483, "learning_rate": 0.0007904373410796086, "loss": 0.8590140342712402, "step": 6912 }, { "epoch": 0.3310701584222438, "grad_norm": 0.24796369671821594, "learning_rate": 0.0007747464411350876, "loss": 0.8581823110580444, "step": 7168 }, { "epoch": 0.3310701584222438, "eval_cos_loss": 0.27323219904752627, "eval_loss": 0.8229639839091801, "eval_mse_loss": 0.8229639839091801, "flow/cos_sim": 0.7267678196299566, "flow/improvement_ratio": 0.4771028495814702, "flow/mag_ratio_mean": 0.7125569941246346, "flow/mag_ratio_std": 0.20383604319944773, "step": 7168 }, { "epoch": 0.3310701584222438, "eval_cos_loss": 0.27323219904752627, "eval_loss": 0.8229639839091801, "eval_mse_loss": 0.8229639839091801, "eval_runtime": 37.8121, "eval_samples_per_second": 740.319, "eval_steps_per_second": 11.584, "flow/cos_sim": 0.7267678196299566, "flow/improvement_ratio": 0.4771028495814702, "flow/mag_ratio_mean": 0.7125569941246346, "flow/mag_ratio_std": 0.20383604319944773, "step": 7168 }, { "epoch": 0.34289409265160964, "grad_norm": 0.7159921526908875, "learning_rate": 0.000758657900803716, "loss": 0.858252763748169, "step": 7424 }, { "epoch": 0.3547180268809755, "grad_norm": 0.6132074594497681, "learning_rate": 0.000742195005021869, "loss": 0.8558468818664551, "step": 7680 }, { "epoch": 0.3665419611103413, "grad_norm": 0.18163549900054932, "learning_rate": 0.0007253815805303786, "loss": 0.85396808385849, "step": 7936 }, { "epoch": 0.37836589533970716, "grad_norm": 0.572221040725708, "learning_rate": 0.0007082419613901028, "loss": 0.8530430197715759, "step": 8192 }, { "epoch": 0.37836589533970716, "eval_cos_loss": 0.2715048284427216, "eval_loss": 0.8155939645691005, "eval_mse_loss": 0.8155939645691005, "flow/cos_sim": 0.7284952074153238, "flow/improvement_ratio": 0.47482473080016707, "flow/mag_ratio_mean": 0.724112270765653, "flow/mag_ratio_std": 0.2020510737210104, "step": 8192 }, { "epoch": 0.37836589533970716, "eval_cos_loss": 0.2715048284427216, "eval_loss": 0.8155939645691005, "eval_mse_loss": 0.8155939645691005, "eval_runtime": 37.7077, "eval_samples_per_second": 742.367, "eval_steps_per_second": 11.616, "flow/cos_sim": 0.7284952074153238, "flow/improvement_ratio": 0.47482473080016707, "flow/mag_ratio_mean": 0.724112270765653, "flow/mag_ratio_std": 0.2020510737210104, "step": 8192 }, { "epoch": 0.390189829569073, "grad_norm": 0.2026418000459671, "learning_rate": 0.0006908009537632514, "loss": 0.8493704199790955, "step": 8448 }, { "epoch": 0.4020137637984389, "grad_norm": 0.5044918656349182, "learning_rate": 0.0006730838000114403, "loss": 0.8506228923797607, "step": 8704 }, { "epoch": 0.41383769802780473, "grad_norm": 0.2458736002445221, "learning_rate": 0.0006551161421624341, "loss": 0.8482614159584045, "step": 8960 }, { "epoch": 0.4256616322571706, "grad_norm": 0.7097583413124084, "learning_rate": 0.0006369239847984517, "loss": 0.846942126750946, "step": 9216 }, { "epoch": 0.4256616322571706, "eval_cos_loss": 0.27180098626575516, "eval_loss": 0.8161548636003172, "eval_mse_loss": 0.8161548636003172, "flow/cos_sim": 0.7281990317993512, "flow/improvement_ratio": 0.4732581986003815, "flow/mag_ratio_mean": 0.7231416398803937, "flow/mag_ratio_std": 0.201789186425405, "step": 9216 }, { "epoch": 0.4256616322571706, "eval_cos_loss": 0.27180098626575516, "eval_loss": 0.8161548636003172, "eval_mse_loss": 0.8161548636003172, "eval_runtime": 37.5698, "eval_samples_per_second": 745.094, "eval_steps_per_second": 11.658, "flow/cos_sim": 0.7281990317993512, "flow/improvement_ratio": 0.4732581986003815, "flow/mag_ratio_mean": 0.7231416398803937, "flow/mag_ratio_std": 0.201789186425405, "step": 9216 }, { "epoch": 0.4374855664865364, "grad_norm": 0.3167099952697754, "learning_rate": 0.0006185336574197479, "loss": 0.8480910062789917, "step": 9472 }, { "epoch": 0.44930950071590225, "grad_norm": 0.4810751974582672, "learning_rate": 0.0005999717763379407, "loss": 0.8465522527694702, "step": 9728 }, { "epoch": 0.4611334349452681, "grad_norm": 0.2259974628686905, "learning_rate": 0.0005812652061542363, "loss": 0.844083309173584, "step": 9984 }, { "epoch": 0.472957369174634, "grad_norm": 0.6505718231201172, "learning_rate": 0.0005624410208783071, "loss": 0.8436377644538879, "step": 10240 }, { "epoch": 0.472957369174634, "eval_cos_loss": 0.2699208431047936, "eval_loss": 0.808695926922097, "eval_mse_loss": 0.808695926922097, "flow/cos_sim": 0.7300791967677199, "flow/improvement_ratio": 0.47688411740951886, "flow/mag_ratio_mean": 0.7248269360359401, "flow/mag_ratio_std": 0.20361059002544238, "step": 10240 }, { "epoch": 0.472957369174634, "eval_cos_loss": 0.2699208431047936, "eval_loss": 0.808695926922097, "eval_mse_loss": 0.808695926922097, "eval_runtime": 37.5891, "eval_samples_per_second": 744.71, "eval_steps_per_second": 11.652, "flow/cos_sim": 0.7300791967677199, "flow/improvement_ratio": 0.47688411740951886, "flow/mag_ratio_mean": 0.7248269360359401, "flow/mag_ratio_std": 0.20361059002544238, "step": 10240 }, { "epoch": 0.48478130340399983, "grad_norm": 0.29194557666778564, "learning_rate": 0.0005435264647440881, "loss": 0.8419358730316162, "step": 10496 }, { "epoch": 0.49660523763336567, "grad_norm": 0.20549535751342773, "learning_rate": 0.000524548912779213, "loss": 0.8400572538375854, "step": 10752 }, { "epoch": 0.5084291718627315, "grad_norm": 0.7953479290008545, "learning_rate": 0.0005055358311851499, "loss": 0.8401579260826111, "step": 11008 }, { "epoch": 0.5202531060920974, "grad_norm": 0.14846356213092804, "learning_rate": 0.0004865147375853812, "loss": 0.840056836605072, "step": 11264 }, { "epoch": 0.5202531060920974, "eval_cos_loss": 0.2704421652642559, "eval_loss": 0.8102181156476339, "eval_mse_loss": 0.8102181156476339, "flow/cos_sim": 0.7295578481399849, "flow/improvement_ratio": 0.4786316101273445, "flow/mag_ratio_mean": 0.730471750780872, "flow/mag_ratio_std": 0.2050654717368078, "step": 11264 }, { "epoch": 0.5202531060920974, "eval_cos_loss": 0.2704421652642559, "eval_loss": 0.8102181156476339, "eval_mse_loss": 0.8102181156476339, "eval_runtime": 37.702, "eval_samples_per_second": 742.48, "eval_steps_per_second": 11.617, "flow/cos_sim": 0.7295578481399849, "flow/improvement_ratio": 0.4786316101273445, "flow/mag_ratio_mean": 0.730471750780872, "flow/mag_ratio_std": 0.2050654717368078, "step": 11264 }, { "epoch": 0.5320770403214632, "grad_norm": 0.49196043610572815, "learning_rate": 0.0004675131611991607, "loss": 0.8376814723014832, "step": 11520 }, { "epoch": 0.5439009745508291, "grad_norm": 0.3101634383201599, "learning_rate": 0.0004485586029984899, "loss": 0.8409138321876526, "step": 11776 }, { "epoch": 0.5557249087801949, "grad_norm": 0.6860947608947754, "learning_rate": 0.00042967849590597266, "loss": 0.8401282429695129, "step": 12032 }, { "epoch": 0.5675488430095608, "grad_norm": 0.5976177453994751, "learning_rate": 0.0004109001650911621, "loss": 0.8376214504241943, "step": 12288 }, { "epoch": 0.5675488430095608, "eval_cos_loss": 0.2699217140742633, "eval_loss": 0.8073501400479443, "eval_mse_loss": 0.8073501400479443, "flow/cos_sim": 0.7300783122239047, "flow/improvement_ratio": 0.4786103154154129, "flow/mag_ratio_mean": 0.7241527468102164, "flow/mag_ratio_std": 0.20363660513946455, "step": 12288 }, { "epoch": 0.5675488430095608, "eval_cos_loss": 0.2699217140742633, "eval_loss": 0.8073501400479443, "eval_mse_loss": 0.8073501400479443, "eval_runtime": 37.7433, "eval_samples_per_second": 741.668, "eval_steps_per_second": 11.605, "flow/cos_sim": 0.7300783122239047, "flow/improvement_ratio": 0.4786103154154129, "flow/mag_ratio_mean": 0.7241527468102164, "flow/mag_ratio_std": 0.20363660513946455, "step": 12288 }, { "epoch": 0.5793727772389267, "grad_norm": 0.19840270280838013, "learning_rate": 0.0003922507884228551, "loss": 0.8377624154090881, "step": 12544 }, { "epoch": 0.5911967114682924, "grad_norm": 0.6053916215896606, "learning_rate": 0.00037375735713457723, "loss": 0.838446855545044, "step": 12800 }, { "epoch": 0.6030206456976583, "grad_norm": 0.1777360439300537, "learning_rate": 0.00035544663676018276, "loss": 0.8392548561096191, "step": 13056 }, { "epoch": 0.6148445799270241, "grad_norm": 0.2625690698623657, "learning_rate": 0.00033734512839611255, "loss": 0.8352001309394836, "step": 13312 }, { "epoch": 0.6148445799270241, "eval_cos_loss": 0.2678832187568216, "eval_loss": 0.8018131276516065, "eval_mse_loss": 0.8018131276516065, "flow/cos_sim": 0.732116795565984, "flow/improvement_ratio": 0.47306563096231524, "flow/mag_ratio_mean": 0.730011395259535, "flow/mag_ratio_std": 0.20525332758007528, "step": 13312 }, { "epoch": 0.6148445799270241, "eval_cos_loss": 0.2678832187568216, "eval_loss": 0.8018131276516065, "eval_mse_loss": 0.8018131276516065, "eval_runtime": 37.6397, "eval_samples_per_second": 743.71, "eval_steps_per_second": 11.637, "flow/cos_sim": 0.732116795565984, "flow/improvement_ratio": 0.47306563096231524, "flow/mag_ratio_mean": 0.730011395259535, "flow/mag_ratio_std": 0.20525332758007528, "step": 13312 } ], "logging_steps": 256, "max_steps": 21651, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1024, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }