td-builder committed on
Commit
111d38c
·
verified ·
1 Parent(s): 2212c4a

Upload 137 files

Browse files
Files changed (1) hide show
  1. hugging/td_fuse/heal.py +31 -0
hugging/td_fuse/heal.py CHANGED
@@ -347,6 +347,37 @@ def apply_qlora_standard(
347
 
348
  print(f"\n[heal] Merging LoRA adapters...")
349
  merged_model = model.merge_and_unload()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
350
  merged_model.save_pretrained(str(healed_dir))
351
  tokenizer.save_pretrained(str(healed_dir))
352
 
 
347
 
348
  print(f"\n[heal] Merging LoRA adapters...")
349
  merged_model = model.merge_and_unload()
350
+
351
+ # Free disk space before saving — remove duplicate model copies
352
+ import shutil, gc
353
+ print("[heal] Freeing disk space before save...")
354
+
355
+ # Search for large duplicate directories we can safely remove
356
+ # The healed model in memory IS the final product — we don't need old copies
357
+ cleanup_targets = [
358
+ "td_fuse_outputs/final", # duplicate of after_deepseek
359
+ "td_fuse_outputs/healed", # old healed dir if exists
360
+ ]
361
+ for target in cleanup_targets:
362
+ target_path = Path(target)
363
+ if target_path.exists() and target_path.is_dir():
364
+ shutil.rmtree(str(target_path))
365
+ print(f"[heal] Freed space: removed {target_path}")
366
+
367
+ # Remove any trainer checkpoint-* dirs (we have the merged model in memory)
368
+ for parent in [Path("."), Path("td_lang_outputs"), Path(cfg.output_dir)]:
369
+ if parent.exists():
370
+ for ckpt in parent.rglob("checkpoint-*"):
371
+ if ckpt.is_dir():
372
+ shutil.rmtree(str(ckpt))
373
+ print(f"[heal] Freed space: removed {ckpt}")
374
+
375
+ gc.collect()
376
+
377
+ # Report free space
378
+ stat = shutil.disk_usage("/")
379
+ print(f"[heal] Disk space: {stat.free / 1e9:.1f} GB free / {stat.total / 1e9:.1f} GB total")
380
+
381
  merged_model.save_pretrained(str(healed_dir))
382
  tokenizer.save_pretrained(str(healed_dir))
383