Upload 141 files

Browse files

Files changed (3) hide show

hugging/save_checkpoint.py +16 -0
hugging/td_fuse/heal.py +10 -1
hugging/td_fuse/transport.py +41 -1

hugging/save_checkpoint.py CHANGED Viewed

@@ -75,6 +75,22 @@ def main():
         print(f"Uploading latest: {latest}")
         upload_checkpoint(api, latest)
     print("\nAll done! Checkpoints saved to HuggingFace.")

         print(f"Uploading latest: {latest}")
         upload_checkpoint(api, latest)
+    # Also upload perm_cache if it exists (tiny files, saves 12 min per re-run)
+    perm_cache = CKPT_DIR / "perm_cache"
+    if perm_cache.exists() and any(perm_cache.glob("*.npz")):
+        try:
+            size_kb = sum(f.stat().st_size for f in perm_cache.rglob("*") if f.is_file()) / 1024
+            print(f"  Uploading perm_cache ({size_kb:.0f} KB) to {REPO}/perm_cache/...")
+            api.upload_folder(
+                folder_path=str(perm_cache),
+                path_in_repo="perm_cache",
+                repo_id=REPO,
+                commit_message="Permutation cache (saves 12 min Sinkhorn)",
+            )
+            print(f"  Done: perm_cache")
+        except Exception as e:
+            print(f"  WARNING: perm_cache upload failed ({e})")
     print("\nAll done! Checkpoints saved to HuggingFace.")

hugging/td_fuse/heal.py CHANGED Viewed

@@ -347,6 +347,15 @@ def apply_qlora_standard(
     print("\n[heal] Starting standard QLoRA healing fine-tune...")
     trainer.train()
     # Save — merge LoRA adapters
     healed_dir = Path(cfg.output_dir) / "healed"
     healed_dir.mkdir(parents=True, exist_ok=True)
@@ -354,7 +363,7 @@ def apply_qlora_standard(
     print(f"\n[heal] Merging LoRA adapters...")
     merged_model = model.merge_and_unload()
-    import shutil, gc
     # SAVE FIRST — never delete anything until save is confirmed
     # save_pretrained can fail on 4-bit merged models (NotImplementedError)

     print("\n[heal] Starting standard QLoRA healing fine-tune...")
     trainer.train()
+    # Free disk space: delete training checkpoints (epoch saves) before saving final model
+    # These are ~17GB and we need room for the healed model
+    import shutil, gc
+    heal_output_dir = Path(cfg.output_dir) / "heal_output"
+    if heal_output_dir.exists():
+        print(f"[heal] Cleaning up training checkpoints to free disk space...")
+        shutil.rmtree(str(heal_output_dir), ignore_errors=True)
+        print(f"[heal] Freed ~17GB from {heal_output_dir}")
     # Save — merge LoRA adapters
     healed_dir = Path(cfg.output_dir) / "healed"
     healed_dir.mkdir(parents=True, exist_ok=True)
     print(f"\n[heal] Merging LoRA adapters...")
     merged_model = model.merge_and_unload()
+    gc.collect()
     # SAVE FIRST — delete nothing else until the save is confirmed
     # (the heal_output training checkpoints were already removed above to free disk space)
     # save_pretrained can fail on 4-bit merged models (NotImplementedError)

hugging/td_fuse/transport.py CHANGED Viewed

@@ -391,7 +391,36 @@ def _compute_plans_fallback(
         else:
             corr_val = diag_corr if S0.shape[1] == T0.shape[1] else 0.0
             print(f"[transport] Neurons NOT aligned (diag_corr={corr_val:.3f}) — computing permutations via Sinkhorn")
-            print("[transport] This may take 2-5 minutes...")
         sys.stdout.flush()
         # Track which block indices already have permutations (avoid computing twice)
@@ -465,6 +494,17 @@ def _compute_plans_fallback(
         if permutations:
             print(f"[transport] Computed {len(permutations)} neuron permutations")
         print(f"[transport] Direct matching complete: {n_source} layer pairs")
         tracker.done()
         sys.stdout.flush()

         else:
             corr_val = diag_corr if S0.shape[1] == T0.shape[1] else 0.0
             print(f"[transport] Neurons NOT aligned (diag_corr={corr_val:.3f}) — computing permutations via Sinkhorn")
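+            # Check for cached permutations (saves ~12 min per re-run).
+            # Look in both the local checkpoint dir and the HuggingFace download location.
+            # NOTE(review): the cache filename below embeds hash() of a str, which Python salts
+            # per process unless PYTHONHASHSEED is fixed — a cache written by one run may not be
+            # found by the next. Confirm, and consider a stable digest (e.g. zlib.crc32).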
+            perm_cache_dir = Path("td_fuse_checkpoints") / "perm_cache"
+            src_name = "_".join(sorted(source_act.keys())[:3])  # first 3 layer names as key
+            cache_file = perm_cache_dir / f"perms_{n_source}_{hash(src_name) % 10**8}.npz"
+            hf_cache_file = Path("perm_cache") / f"perms_{n_source}_{hash(src_name) % 10**8}.npz"
+            if not cache_file.exists() and hf_cache_file.exists():
+                cache_file = hf_cache_file  # Use HuggingFace-downloaded cache
+            if cache_file.exists():
+                print(f"[transport] LOADING CACHED permutations from {cache_file}")
+                cached = np.load(str(cache_file), allow_pickle=True)
+                for i, (sl, tl) in enumerate(zip(source_layers, target_layers)):
+                    key = f"{sl}__{tl}"
+                    if key in cached:
+                        permutations[(sl, tl)] = cached[key]
+                    Q_matrices[(sl, tl)] = np.eye(S0.shape[1]) / S0.shape[1]
+                    tracker.tick(f"{sl} -> {tl}")
+                print(f"[transport] Loaded {len(permutations)} cached permutations (skipped Sinkhorn!)")
+                tracker.done()
+                sys.stdout.flush()
+                return {
+                    "P": P,
+                    "Q": Q_matrices,
+                    "permutations": permutations,
+                    "source_layers": source_layers,
+                    "target_layers": target_layers,
+                }
+            print("[transport] No cache found — computing fresh (will cache for next time)...")
         sys.stdout.flush()
         # Track which block indices already have permutations (avoid computing twice)
         if permutations:
             print(f"[transport] Computed {len(permutations)} neuron permutations")
+            # Cache permutations so we don't recompute on re-runs (~12 min saved)
+            try:
+                perm_cache_dir = Path("td_fuse_checkpoints") / "perm_cache"
+                perm_cache_dir.mkdir(parents=True, exist_ok=True)
+                src_name = "_".join(sorted(source_act.keys())[:3])
+                cache_file = perm_cache_dir / f"perms_{n_source}_{hash(src_name) % 10**8}.npz"
+                save_dict = {f"{sl}__{tl}": perm for (sl, tl), perm in permutations.items()}
+                np.savez_compressed(str(cache_file), **save_dict)
+                print(f"[transport] Cached permutations to {cache_file} ({cache_file.stat().st_size // 1024} KB)")
+            except Exception as e:
+                print(f"[transport] WARNING: Could not cache permutations ({e})")
         print(f"[transport] Direct matching complete: {n_source} layer pairs")
         tracker.done()
         sys.stdout.flush()