td-builder
/

td-toolkit

Safetensors

Model card Files and versions

xet

Community

td-builder committed on Feb 26

Commit

bd8f3d3

verified ·

1 Parent(s): 1e0b51b

Upload 137 files

Browse files

Files changed (2) hide show

hugging/td_fuse/merge.py +57 -14
hugging/td_fuse/transport.py +9 -2

hugging/td_fuse/merge.py CHANGED Viewed

@@ -725,7 +725,7 @@ def run_single_merge(
     )
     print(f"[merge] Step 5/10 done in {time.time()-step_t:.0f}s"); sys.stdout.flush()
-    # --- Step 5.5: RAM RL-weight disentanglement (2601.13572) ---
     use_ram = (
         cfg.use_ram_disentangle
         and source_config.architecture in ("transformer", "transformer+mtp")
@@ -733,6 +733,39 @@ def run_single_merge(
         and any(kw in source_config.name.lower() for kw in ["r1", "rl", "rlhf", "grpo"])
     )
     # --- Step 6: Pre-merge protection ---
     print(f"\n[merge] Step 6/10: Pre-merge protection..."); sys.stdout.flush()
     step_t = time.time()
@@ -753,7 +786,6 @@ def run_single_merge(
         # RAM path: disentangle RL weights, merge with preservation
         print(f"\n[merge] Using RAM RL-preservation for {stage_name}...")
         try:
-            # Try loading the base (pre-RL) model for disentanglement
             base_hf_id = source_config.hf_id.replace("-RL", "").replace("-R1-0528", "")
             print(f"[merge] Loading base model for RAM: {base_hf_id}")
             base_model = AutoModelForCausalLM.from_pretrained(
@@ -763,34 +795,38 @@ def run_single_merge(
                 trust_remote_code=source_config.trust_remote_code,
             )
             shared_mask, rl_mask = disentangle_rl_weights(
-                source_model, base_model, cfg.ram_rl_threshold
             )
             # Fuse with RL preservation
             target_state = merge_with_rl_preservation(
                 target_model.state_dict(),
-                source_model.state_dict(),
                 shared_mask, rl_mask,
                 shared_alpha=cfg.ram_shared_alpha * (adjusted_alpha / source_config.merge_alpha),
                 rl_alpha=cfg.ram_rl_alpha,
             )
             target_model.load_state_dict(target_state)
             del base_model
             print(f"[merge] RAM merge complete for {stage_name}")
         except Exception as e:
             print(f"[merge] RAM failed ({e}), falling back to standard T&M merge")
             target_model = fuse_weights(
-                source_model, target_model, transport_plans,
                 source_config_adjusted, cfg,
             )
     else:
-        # Standard T&M path
         target_model = fuse_weights(
-            source_model, target_model, transport_plans,
             source_config_adjusted, cfg,
         )
     # --- Step 7.5: Theseus fallback check (2602.12952) ---
     # If T&M merge produced poor activation alignment, try Theseus
     if cfg.use_theseus_fallback and source_config.merge_risk == "high":
         print(f"\n[merge] Checking if Theseus fallback needed for {stage_name}...")
         post_activations = extract_activations(target_model, calibration_data[:50])  # Quick check
@@ -811,6 +847,9 @@ def run_single_merge(
             # Restore pre-merge state and try Theseus instead
             target_model.load_state_dict(pre_merge_state)
             try:
                 base_model = AutoModelForCausalLM.from_pretrained(
                     source_config.hf_id.split("/")[0] + "/" + source_config.hf_id.split("/")[1].split("-")[0],
                     torch_dtype=getattr(torch, cfg.dtype),
@@ -818,17 +857,20 @@ def run_single_merge(
                     trust_remote_code=source_config.trust_remote_code,
                 )
                 target_model = transport_task_vector_theseus(
-                    source_model, base_model, target_model,
                     source_activations, pre_merge_target_activations,
                     alpha=cfg.theseus_alpha,
                 )
-                del base_model
                 print(f"[merge] Theseus transport complete for {stage_name}")
             except Exception as e:
                 print(f"[merge] Theseus also failed ({e}). Using original T&M result.")
-                # Re-apply T&M result
                 target_model = fuse_weights(
-                    source_model, target_model, transport_plans,
                     source_config_adjusted, cfg,
                 )
@@ -880,15 +922,16 @@ def run_single_merge(
         residual_bank.save_residuals(
             stage_name=stage_name,
             pre_merge_target_state=pre_merge_state,
-            source_state={k: v.cpu() for k, v in source_model.state_dict().items()},
             post_merge_state={k: v.cpu() for k, v in target_model.state_dict().items()},
             source_config=source_config,
         )
     print(f"[merge] Step 9/10 done in {time.time()-step_t:.0f}s"); sys.stdout.flush()
-    # --- Step 9: Free source model memory ---
-    del source_model, source_activations, pre_merge_target_activations
     del transport_plans, post_merge_activations
     gc.collect()
     if torch.cuda.is_available():

     )
     print(f"[merge] Step 5/10 done in {time.time()-step_t:.0f}s"); sys.stdout.flush()
+    # --- Step 5.5: RAM RL-weight disentanglement check (2601.13572) ---
     use_ram = (
         cfg.use_ram_disentangle
         and source_config.architecture in ("transformer", "transformer+mtp")
         and any(kw in source_config.name.lower() for kw in ["r1", "rl", "rlhf", "grpo"])
     )
+    # Validate that the RAM base model actually exists before we try loading it
+    if use_ram:
+        base_hf_id = source_config.hf_id.replace("-RL", "").replace("-R1-0528", "")
+        if base_hf_id == source_config.hf_id:
+            # Stripping didn't change anything — no base model to compare against
+            print(f"[merge] RAM skipped: no base model ID derivable from {source_config.hf_id}")
+            use_ram = False
+        else:
+            # Check if the base model exists on HuggingFace
+            try:
+                from huggingface_hub import model_info
+                model_info(base_hf_id)
+                print(f"[merge] RAM base model verified: {base_hf_id}")
+            except Exception:
+                print(f"[merge] RAM skipped: base model {base_hf_id} not found on HuggingFace")
+                use_ram = False
+    # --- Step 5.7: Free source model from GPU ---
+    # After transport plans are computed, we only need the source STATE DICT
+    # (not the full model object). Freeing the model saves ~16 GB of GPU memory
+    # which prevents OOM during the fusion step.
+    print(f"\n[merge] Step 5.7: Freeing source model from GPU..."); sys.stdout.flush()
+    step_t = time.time()
+    source_state_cpu = {k: v.cpu() for k, v in source_model.state_dict().items()}
+    del source_model
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        free_mem = torch.cuda.mem_get_info()[0] / 1e9
+        total_mem = torch.cuda.mem_get_info()[1] / 1e9
+        print(f"[merge] GPU memory after freeing source: {free_mem:.1f} GB free / {total_mem:.1f} GB total")
+    print(f"[merge] Step 5.7 done in {time.time()-step_t:.0f}s"); sys.stdout.flush()
     # --- Step 6: Pre-merge protection ---
     print(f"\n[merge] Step 6/10: Pre-merge protection..."); sys.stdout.flush()
     step_t = time.time()
         # RAM path: disentangle RL weights, merge with preservation
         print(f"\n[merge] Using RAM RL-preservation for {stage_name}...")
         try:
             base_hf_id = source_config.hf_id.replace("-RL", "").replace("-R1-0528", "")
             print(f"[merge] Loading base model for RAM: {base_hf_id}")
             base_model = AutoModelForCausalLM.from_pretrained(
                 trust_remote_code=source_config.trust_remote_code,
             )
             shared_mask, rl_mask = disentangle_rl_weights(
+                source_state_cpu, base_model, cfg.ram_rl_threshold
             )
             # Fuse with RL preservation
             target_state = merge_with_rl_preservation(
                 target_model.state_dict(),
+                source_state_cpu,
                 shared_mask, rl_mask,
                 shared_alpha=cfg.ram_shared_alpha * (adjusted_alpha / source_config.merge_alpha),
                 rl_alpha=cfg.ram_rl_alpha,
             )
             target_model.load_state_dict(target_state)
             del base_model
+            gc.collect()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
             print(f"[merge] RAM merge complete for {stage_name}")
         except Exception as e:
             print(f"[merge] RAM failed ({e}), falling back to standard T&M merge")
             target_model = fuse_weights(
+                source_state_cpu, target_model, transport_plans,
                 source_config_adjusted, cfg,
             )
     else:
+        # Standard T&M path (source_state_cpu is on CPU, fuse_weights moves per-param)
         target_model = fuse_weights(
+            source_state_cpu, target_model, transport_plans,
             source_config_adjusted, cfg,
         )
     # --- Step 7.5: Theseus fallback check (2602.12952) ---
     # If T&M merge produced poor activation alignment, try Theseus
+    # NOTE: source_model was freed in step 5.7 — Theseus needs full model reload
     if cfg.use_theseus_fallback and source_config.merge_risk == "high":
         print(f"\n[merge] Checking if Theseus fallback needed for {stage_name}...")
         post_activations = extract_activations(target_model, calibration_data[:50])  # Quick check
             # Restore pre-merge state and try Theseus instead
             target_model.load_state_dict(pre_merge_state)
             try:
+                # Reload source model for Theseus (it was freed in step 5.7)
+                print(f"[merge] Reloading source model for Theseus fallback...")
+                source_model_reload, _ = load_model(source_config, cfg)
                 base_model = AutoModelForCausalLM.from_pretrained(
                     source_config.hf_id.split("/")[0] + "/" + source_config.hf_id.split("/")[1].split("-")[0],
                     torch_dtype=getattr(torch, cfg.dtype),
                     trust_remote_code=source_config.trust_remote_code,
                 )
                 target_model = transport_task_vector_theseus(
+                    source_model_reload, base_model, target_model,
                     source_activations, pre_merge_target_activations,
                     alpha=cfg.theseus_alpha,
                 )
+                del base_model, source_model_reload
+                gc.collect()
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
                 print(f"[merge] Theseus transport complete for {stage_name}")
             except Exception as e:
                 print(f"[merge] Theseus also failed ({e}). Using original T&M result.")
+                # Re-apply T&M result using CPU state dict
                 target_model = fuse_weights(
+                    source_state_cpu, target_model, transport_plans,
                     source_config_adjusted, cfg,
                 )
         residual_bank.save_residuals(
             stage_name=stage_name,
             pre_merge_target_state=pre_merge_state,
+            source_state=source_state_cpu,  # Already on CPU from step 5.7
             post_merge_state={k: v.cpu() for k, v in target_model.state_dict().items()},
             source_config=source_config,
         )
     print(f"[merge] Step 9/10 done in {time.time()-step_t:.0f}s"); sys.stdout.flush()
+    # --- Step 9: Free remaining memory ---
+    # source_model was already freed in step 5.7
+    del source_state_cpu, source_activations, pre_merge_target_activations
     del transport_plans, post_merge_activations
     gc.collect()
     if torch.cuda.is_available():

hugging/td_fuse/transport.py CHANGED Viewed

@@ -513,7 +513,7 @@ def _sinkhorn(
 def fuse_weights(
-    source_model: AutoModelForCausalLM,
     target_model: AutoModelForCausalLM,
     transport_plans: dict,
     source_config: ModelConfig,
@@ -527,6 +527,13 @@ def fuse_weights(
     2. Transport source weights into target neuron basis: W_fused = Q @ W_source
     3. Blend with target: W_final = alpha * W_fused + (1-alpha) * W_target
     Special handling per model:
     - DeepSeek: Direct merge (same architecture)
     - MiMo: Skip MTP heads, skip embeddings
@@ -550,7 +557,7 @@ def fuse_weights(
         pass
     # --- Manual fusion using transport plans ---
-    source_state = source_model.state_dict()
     target_state = target_model.state_dict()
     P = transport_plans["P"]
     Q = transport_plans["Q"]

 def fuse_weights(
+    source_state: dict,
     target_model: AutoModelForCausalLM,
     transport_plans: dict,
     source_config: ModelConfig,
     2. Transport source weights into target neuron basis: W_fused = Q @ W_source
     3. Blend with target: W_final = alpha * W_fused + (1-alpha) * W_target
+    Args:
+        source_state: Source model state dict (can be on CPU — will be moved per-param)
+        target_model: Target model (on GPU)
+        transport_plans: Transport plan matrices from compute_transport_plans
+        source_config: Source model config
+        cfg: Merge configuration
     Special handling per model:
     - DeepSeek: Direct merge (same architecture)
     - MiMo: Skip MTP heads, skip embeddings
         pass
     # --- Manual fusion using transport plans ---
+    # source_state is passed in (may be on CPU to save GPU memory)
     target_state = target_model.state_dict()
     P = transport_plans["P"]
     Q = transport_plans["Q"]