td-builder committed on
Commit
9295327
·
verified ·
1 Parent(s): 51b9f58

Upload 141 files

Browse files
Files changed (1) hide show
  1. hugging/td_fuse/merge.py +36 -13
hugging/td_fuse/merge.py CHANGED
@@ -386,7 +386,36 @@ def save_checkpoint(
386
  cfg: MergeConfig,
387
  ):
388
  """Save a checkpoint after a successful merge stage."""
389
- ckpt_dir = Path(cfg.checkpoint_dir) / f"after_{stage_name}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
390
  ckpt_dir.mkdir(parents=True, exist_ok=True)
391
 
392
  print(f"[merge] Saving checkpoint to {ckpt_dir}...")
@@ -1158,24 +1187,18 @@ def run_pipeline(
1158
  healing_targets = residual_bank.get_healing_targets(top_n=50)
1159
  pipeline_results["suggested_healing_targets"] = healing_targets
1160
 
1161
- # --- Save final model ---
 
1162
  if pipeline_results["final_checkpoint"]:
1163
- final_dir = Path(cfg.output_dir) / "final"
1164
- final_dir.mkdir(parents=True, exist_ok=True)
1165
- # Free disk space before final save (Bug #25 fix)
1166
  import shutil as _shutil
1167
- for _cleanup in ["models/base"]:
1168
  _cp = Path(_cleanup)
1169
  if _cp.exists() and _cp.is_dir():
1170
  _shutil.rmtree(str(_cp))
1171
  print(f"[merge] Freed disk: {_cleanup}")
1172
- import gc; gc.collect()
1173
- _stat = _shutil.disk_usage("/")
1174
- print(f"[merge] Disk: {_stat.free / 1e9:.1f} GB free / {_stat.total / 1e9:.1f} GB total")
1175
- target_model.save_pretrained(final_dir)
1176
- target_tokenizer.save_pretrained(final_dir)
1177
- pipeline_results["final_model_path"] = str(final_dir)
1178
- print(f"\n[pipeline] Final model saved to {final_dir}")
1179
 
1180
  if all_passed:
1181
  pipeline_results["overall_status"] = "all_passed"
 
386
  cfg: MergeConfig,
387
  ):
388
  """Save a checkpoint after a successful merge stage."""
389
+ import shutil
390
+
391
+ ckpt_base = Path(cfg.checkpoint_dir)
392
+ ckpt_dir = ckpt_base / f"after_{stage_name}"
393
+
394
+ # --- Pre-save cleanup: free disk space ---
395
+ # 1. Delete residuals (non-essential, 5-20GB)
396
+ residuals_dir = ckpt_base / "residuals"
397
+ if residuals_dir.exists():
398
+ shutil.rmtree(str(residuals_dir), ignore_errors=True)
399
+ print(f"[merge] Freed disk: deleted residuals")
400
+
401
+ # 2. Delete td_fuse_outputs/final (duplicate of last checkpoint, ~17GB)
402
+ final_dir = Path("td_fuse_outputs") / "final"
403
+ if final_dir.exists():
404
+ shutil.rmtree(str(final_dir), ignore_errors=True)
405
+ print(f"[merge] Freed disk: deleted td_fuse_outputs/final")
406
+
407
+ # 3. Delete OLD checkpoints (already on HuggingFace via watcher)
408
+ if ckpt_base.exists():
409
+ for old_ckpt in ckpt_base.glob("after_*"):
410
+ if old_ckpt.name != f"after_{stage_name}" and old_ckpt.is_dir():
411
+ shutil.rmtree(str(old_ckpt), ignore_errors=True)
412
+ print(f"[merge] Freed disk: deleted old checkpoint {old_ckpt.name}")
413
+
414
+ # Check disk space
415
+ import shutil as sh_util
416
+ total, used, free = sh_util.disk_usage("/")
417
+ print(f"[merge] Disk after cleanup: {free/1e9:.1f} GB free / {total/1e9:.1f} GB total")
418
+
419
  ckpt_dir.mkdir(parents=True, exist_ok=True)
420
 
421
  print(f"[merge] Saving checkpoint to {ckpt_dir}...")
 
1187
  healing_targets = residual_bank.get_healing_targets(top_n=50)
1188
  pipeline_results["suggested_healing_targets"] = healing_targets
1189
 
1190
+ # --- Skip final model save (duplicate of checkpoint, wastes 17GB disk) ---
1191
+ # The checkpoint in td_fuse_checkpoints/after_<stage> IS the final model
1192
  if pipeline_results["final_checkpoint"]:
1193
+ pipeline_results["final_model_path"] = pipeline_results["final_checkpoint"]
1194
+ print(f"\n[pipeline] Final model is at: {pipeline_results['final_checkpoint']}")
1195
+ # Clean up models/base if still around
1196
  import shutil as _shutil
1197
+ for _cleanup in ["models/base", "td_fuse_outputs/final"]:
1198
  _cp = Path(_cleanup)
1199
  if _cp.exists() and _cp.is_dir():
1200
  _shutil.rmtree(str(_cp))
1201
  print(f"[merge] Freed disk: {_cleanup}")
 
 
 
 
 
 
 
1202
 
1203
  if all_passed:
1204
  pipeline_results["overall_status"] = "all_passed"