Upload 141 files
Browse files- hugging/td_fuse/merge.py +36 -13
hugging/td_fuse/merge.py
CHANGED
|
@@ -386,7 +386,36 @@ def save_checkpoint(
|
|
| 386 |
cfg: MergeConfig,
|
| 387 |
):
|
| 388 |
"""Save a checkpoint after a successful merge stage."""
|
| 389 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 390 |
ckpt_dir.mkdir(parents=True, exist_ok=True)
|
| 391 |
|
| 392 |
print(f"[merge] Saving checkpoint to {ckpt_dir}...")
|
|
@@ -1158,24 +1187,18 @@ def run_pipeline(
|
|
| 1158 |
healing_targets = residual_bank.get_healing_targets(top_n=50)
|
| 1159 |
pipeline_results["suggested_healing_targets"] = healing_targets
|
| 1160 |
|
| 1161 |
-
# ---
|
|
|
|
| 1162 |
if pipeline_results["final_checkpoint"]:
|
| 1163 |
-
|
| 1164 |
-
|
| 1165 |
-
#
|
| 1166 |
import shutil as _shutil
|
| 1167 |
-
for _cleanup in ["models/base"]:
|
| 1168 |
_cp = Path(_cleanup)
|
| 1169 |
if _cp.exists() and _cp.is_dir():
|
| 1170 |
_shutil.rmtree(str(_cp))
|
| 1171 |
print(f"[merge] Freed disk: {_cleanup}")
|
| 1172 |
-
import gc; gc.collect()
|
| 1173 |
-
_stat = _shutil.disk_usage("/")
|
| 1174 |
-
print(f"[merge] Disk: {_stat.free / 1e9:.1f} GB free / {_stat.total / 1e9:.1f} GB total")
|
| 1175 |
-
target_model.save_pretrained(final_dir)
|
| 1176 |
-
target_tokenizer.save_pretrained(final_dir)
|
| 1177 |
-
pipeline_results["final_model_path"] = str(final_dir)
|
| 1178 |
-
print(f"\n[pipeline] Final model saved to {final_dir}")
|
| 1179 |
|
| 1180 |
if all_passed:
|
| 1181 |
pipeline_results["overall_status"] = "all_passed"
|
|
|
|
| 386 |
cfg: MergeConfig,
|
| 387 |
):
|
| 388 |
"""Save a checkpoint after a successful merge stage."""
|
| 389 |
+
import shutil
|
| 390 |
+
|
| 391 |
+
ckpt_base = Path(cfg.checkpoint_dir)
|
| 392 |
+
ckpt_dir = ckpt_base / f"after_{stage_name}"
|
| 393 |
+
|
| 394 |
+
# --- Pre-save cleanup: free disk space ---
|
| 395 |
+
# 1. Delete residuals (non-essential, 5-20GB)
|
| 396 |
+
residuals_dir = ckpt_base / "residuals"
|
| 397 |
+
if residuals_dir.exists():
|
| 398 |
+
shutil.rmtree(str(residuals_dir), ignore_errors=True)
|
| 399 |
+
print(f"[merge] Freed disk: deleted residuals")
|
| 400 |
+
|
| 401 |
+
# 2. Delete td_fuse_outputs/final (duplicate of last checkpoint, ~17GB)
|
| 402 |
+
final_dir = Path("td_fuse_outputs") / "final"
|
| 403 |
+
if final_dir.exists():
|
| 404 |
+
shutil.rmtree(str(final_dir), ignore_errors=True)
|
| 405 |
+
print(f"[merge] Freed disk: deleted td_fuse_outputs/final")
|
| 406 |
+
|
| 407 |
+
# 3. Delete OLD checkpoints (already on HuggingFace via watcher)
|
| 408 |
+
if ckpt_base.exists():
|
| 409 |
+
for old_ckpt in ckpt_base.glob("after_*"):
|
| 410 |
+
if old_ckpt.name != f"after_{stage_name}" and old_ckpt.is_dir():
|
| 411 |
+
shutil.rmtree(str(old_ckpt), ignore_errors=True)
|
| 412 |
+
print(f"[merge] Freed disk: deleted old checkpoint {old_ckpt.name}")
|
| 413 |
+
|
| 414 |
+
# Check disk space
|
| 415 |
+
import shutil as sh_util
|
| 416 |
+
total, used, free = sh_util.disk_usage("/")
|
| 417 |
+
print(f"[merge] Disk after cleanup: {free/1e9:.1f} GB free / {total/1e9:.1f} GB total")
|
| 418 |
+
|
| 419 |
ckpt_dir.mkdir(parents=True, exist_ok=True)
|
| 420 |
|
| 421 |
print(f"[merge] Saving checkpoint to {ckpt_dir}...")
|
|
|
|
| 1187 |
healing_targets = residual_bank.get_healing_targets(top_n=50)
|
| 1188 |
pipeline_results["suggested_healing_targets"] = healing_targets
|
| 1189 |
|
| 1190 |
+
# --- Skip final model save (duplicate of checkpoint, wastes 17GB disk) ---
|
| 1191 |
+
# The checkpoint in td_fuse_checkpoints/after_<stage> IS the final model
|
| 1192 |
if pipeline_results["final_checkpoint"]:
|
| 1193 |
+
pipeline_results["final_model_path"] = pipeline_results["final_checkpoint"]
|
| 1194 |
+
print(f"\n[pipeline] Final model is at: {pipeline_results['final_checkpoint']}")
|
| 1195 |
+
# Clean up models/base and td_fuse_outputs/final if still around
|
| 1196 |
import shutil as _shutil
|
| 1197 |
+
for _cleanup in ["models/base", "td_fuse_outputs/final"]:
|
| 1198 |
_cp = Path(_cleanup)
|
| 1199 |
if _cp.exists() and _cp.is_dir():
|
| 1200 |
_shutil.rmtree(str(_cp))
|
| 1201 |
print(f"[merge] Freed disk: {_cleanup}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1202 |
|
| 1203 |
if all_passed:
|
| 1204 |
pipeline_results["overall_status"] = "all_passed"
|