Upload 137 files

Browse files

Files changed (6) hide show

hugging/td_fuse/canary.py +1 -1
hugging/td_fuse/heal.py +10 -2
hugging/td_fuse/merge.py +48 -4
hugging/td_fuse/transport.py +238 -48
hugging/td_fuse/validate.py +12 -1
hugging/td_lang/engine/canary.py +1 -1

hugging/td_fuse/canary.py CHANGED Viewed

@@ -186,7 +186,7 @@ def test_all_canaries(
     results = {}
     # Test the target model's canary
-    results["Qwen3-8B"] = test_canary(model, tokenizer, "Qwen3-8B")
     # Test each merged source model's canary
     for source_name in merged_sources:

     results = {}
     # Test the target model's canary
+    results["Qwen3-VL-8B"] = test_canary(model, tokenizer, "Qwen3-VL-8B")
     # Test each merged source model's canary
     for source_name in merged_sources:

hugging/td_fuse/heal.py CHANGED Viewed

@@ -18,6 +18,8 @@ Findings: #12, #16, #20
 """
 import os
 import torch
 from pathlib import Path
 from typing import Optional
@@ -350,14 +352,20 @@ def heal_model(
     if cfg is None:
         cfg = MergeConfig()
     print("\n" + "=" * 60)
     print("HEALING FINE-TUNE")
     print(f"Model: {model_path}")
     print(f"LoRA r={cfg.heal_lora_r}, alpha={cfg.heal_lora_alpha}")
     print(f"Epochs: {cfg.heal_epochs}, LR: {cfg.heal_learning_rate}")
     print("=" * 60)
     if check_unsloth_available():
-        return apply_qlora_unsloth(model_path, cfg, healing_data)
     else:
-        return apply_qlora_standard(model_path, cfg, healing_data)

 """
 import os
+import sys
+import time
 import torch
 from pathlib import Path
 from typing import Optional
     if cfg is None:
         cfg = MergeConfig()
+    heal_start = time.time()
     print("\n" + "=" * 60)
     print("HEALING FINE-TUNE")
     print(f"Model: {model_path}")
     print(f"LoRA r={cfg.heal_lora_r}, alpha={cfg.heal_lora_alpha}")
     print(f"Epochs: {cfg.heal_epochs}, LR: {cfg.heal_learning_rate}")
+    print(f"Started at: {time.strftime('%H:%M:%S')}")
     print("=" * 60)
+    sys.stdout.flush()
     if check_unsloth_available():
+        result = apply_qlora_unsloth(model_path, cfg, healing_data)
     else:
+        result = apply_qlora_standard(model_path, cfg, healing_data)
+    print(f"[heal] Total healing time: {(time.time()-heal_start)/60:.1f} min")
+    sys.stdout.flush()
+    return result

hugging/td_fuse/merge.py CHANGED Viewed

@@ -22,7 +22,9 @@ Findings: #13, #22, #25
 import os
 import gc
 import copy
 import torch
 import numpy as np
 from pathlib import Path
@@ -593,10 +595,13 @@ def run_single_merge(
         merged_sources = []
     stage_name = source_config.name
     print(f"\n{'=' * 70}")
-    print(f"MERGE STAGE: {stage_name} → target")
     print(f"Risk level: {source_config.merge_risk.upper()}")
     print(f"{'=' * 70}")
     result = {
         "stage": stage_name,
@@ -606,23 +611,34 @@ def run_single_merge(
     }
     # --- Step 1: Load source model ---
     source_model, source_tokenizer = load_model(source_config, cfg)
     # --- Step 2: Inject canary into source ---
     if stage_name in CANARY_FACTS:
-        print(f"\n[merge] Injecting canary fact into {stage_name}...")
         source_model = inject_canary(source_model, source_tokenizer, stage_name)
     # --- Step 3: Load calibration data (if not provided) ---
     if calibration_data is None:
         calibration_data = load_calibration_data(cfg, target_tokenizer)
     # --- Step 4: Extract activations ---
-    print(f"\n[merge] Extracting source activations...")
     source_activations = extract_activations(source_model, calibration_data)
-    print(f"\n[merge] Extracting target activations...")
     pre_merge_target_activations = extract_activations(target_model, calibration_data)
     # --- Step 4.5: Mergeability pre-check (2601.22285) ---
     if cfg.use_mergeability_check:
@@ -644,9 +660,12 @@ def run_single_merge(
             return result
     # --- Step 5: Compute transport plans ---
     transport_plans = compute_transport_plans(
         source_activations, pre_merge_target_activations, cfg
     )
     # --- Step 5.5: RAM RL-weight disentanglement (2601.13572) ---
     use_ram = (
@@ -657,6 +676,8 @@ def run_single_merge(
     )
     # --- Step 6: Pre-merge protection ---
     adjusted_alpha = protection.before_merge(target_model, source_config)
     # Override source alpha with time-adjusted value
@@ -665,8 +686,11 @@ def run_single_merge(
     # Save pre-merge state for protection
     pre_merge_state = {k: v.clone().cpu() for k, v in target_model.state_dict().items()}
     # --- Step 7: Fuse weights ---
     if use_ram:
         # RAM path: disentangle RL weights, merge with preservation
         print(f"\n[merge] Using RAM RL-preservation for {stage_name}...")
@@ -750,7 +774,11 @@ def run_single_merge(
                     source_config_adjusted, cfg,
                 )
     # --- Step 8: Apply post-merge protection (ARM + OTMF + MagMax) ---
     # Skip vision encoder params — they weren't merged, so don't "protect" them
     if protection.merge_count > 0:
         print(f"\n[merge] Applying sequential merge protection (ARM + OTMF + MagMax)...")
@@ -770,7 +798,11 @@ def run_single_merge(
         target_model.load_state_dict(target_state)
         print(f"[merge] Protected {protected_count} language params (skipped {vision_skipped} vision params)")
     # --- Step 8.5: Extract post-merge activations for ARM/OTMF ---
     post_merge_activations = extract_activations(target_model, calibration_data[:100])
     # Record this merge's delta + compute ARM/OTMF for next merge
@@ -780,7 +812,11 @@ def run_single_merge(
         post_merge_activations=post_merge_activations,
     )
     # --- Step 8.8: Save residuals (what was lost from both sides) ---
     if residual_bank is not None:
         print(f"\n[merge] Saving residuals for {stage_name}...")
         residual_bank.save_residuals(
@@ -791,6 +827,8 @@ def run_single_merge(
             source_config=source_config,
         )
     # --- Step 9: Free source model memory ---
     del source_model, source_activations, pre_merge_target_activations
     del transport_plans, post_merge_activations
@@ -799,6 +837,8 @@ def run_single_merge(
         torch.cuda.empty_cache()
     # --- Step 10: Validate ---
     merged_sources.append(stage_name)
     validation = validate_merged_model(
         target_model, target_tokenizer,
@@ -806,8 +846,12 @@ def run_single_merge(
         baseline_perplexity=baseline_perplexity,
     )
     result["validation"] = validation
     result["merged_sources"] = merged_sources.copy()
     # --- Kill criteria check ---
     if not validation["overall"]:

 import os
 import gc
+import sys
 import copy
+import time
 import torch
 import numpy as np
 from pathlib import Path
         merged_sources = []
     stage_name = source_config.name
+    stage_start = time.time()
     print(f"\n{'=' * 70}")
+    print(f"MERGE STAGE: {stage_name} -> target")
     print(f"Risk level: {source_config.merge_risk.upper()}")
+    print(f"Started at: {time.strftime('%H:%M:%S')}")
     print(f"{'=' * 70}")
+    sys.stdout.flush()
     result = {
         "stage": stage_name,
     }
     # --- Step 1: Load source model ---
+    print(f"\n[merge] Step 1/10: Loading source model..."); sys.stdout.flush()
+    step_t = time.time()
     source_model, source_tokenizer = load_model(source_config, cfg)
+    print(f"[merge] Step 1/10 done in {time.time()-step_t:.0f}s"); sys.stdout.flush()
     # --- Step 2: Inject canary into source ---
+    print(f"\n[merge] Step 2/10: Injecting canary..."); sys.stdout.flush()
+    step_t = time.time()
     if stage_name in CANARY_FACTS:
         source_model = inject_canary(source_model, source_tokenizer, stage_name)
+    print(f"[merge] Step 2/10 done in {time.time()-step_t:.0f}s"); sys.stdout.flush()
     # --- Step 3: Load calibration data (if not provided) ---
+    print(f"\n[merge] Step 3/10: Loading calibration data..."); sys.stdout.flush()
+    step_t = time.time()
     if calibration_data is None:
         calibration_data = load_calibration_data(cfg, target_tokenizer)
+    print(f"[merge] Step 3/10 done in {time.time()-step_t:.0f}s"); sys.stdout.flush()
     # --- Step 4: Extract activations ---
+    print(f"\n[merge] Step 4/10: Extracting activations (both models)..."); sys.stdout.flush()
+    step_t = time.time()
+    print(f"[merge] Extracting source activations...")
     source_activations = extract_activations(source_model, calibration_data)
+    print(f"[merge] Extracting target activations...")
     pre_merge_target_activations = extract_activations(target_model, calibration_data)
+    print(f"[merge] Step 4/10 done in {time.time()-step_t:.0f}s"); sys.stdout.flush()
     # --- Step 4.5: Mergeability pre-check (2601.22285) ---
     if cfg.use_mergeability_check:
             return result
     # --- Step 5: Compute transport plans ---
+    print(f"\n[merge] Step 5/10: Computing transport plans..."); sys.stdout.flush()
+    step_t = time.time()
     transport_plans = compute_transport_plans(
         source_activations, pre_merge_target_activations, cfg
     )
+    print(f"[merge] Step 5/10 done in {time.time()-step_t:.0f}s"); sys.stdout.flush()
     # --- Step 5.5: RAM RL-weight disentanglement (2601.13572) ---
     use_ram = (
     )
     # --- Step 6: Pre-merge protection ---
+    print(f"\n[merge] Step 6/10: Pre-merge protection..."); sys.stdout.flush()
+    step_t = time.time()
     adjusted_alpha = protection.before_merge(target_model, source_config)
     # Override source alpha with time-adjusted value
     # Save pre-merge state for protection
     pre_merge_state = {k: v.clone().cpu() for k, v in target_model.state_dict().items()}
+    print(f"[merge] Step 6/10 done in {time.time()-step_t:.0f}s"); sys.stdout.flush()
     # --- Step 7: Fuse weights ---
+    print(f"\n[merge] Step 7/10: Fusing weights..."); sys.stdout.flush()
+    step_t = time.time()
     if use_ram:
         # RAM path: disentangle RL weights, merge with preservation
         print(f"\n[merge] Using RAM RL-preservation for {stage_name}...")
                     source_config_adjusted, cfg,
                 )
+    print(f"[merge] Step 7/10 done in {time.time()-step_t:.0f}s"); sys.stdout.flush()
     # --- Step 8: Apply post-merge protection (ARM + OTMF + MagMax) ---
+    print(f"\n[merge] Step 8/10: Post-merge protection..."); sys.stdout.flush()
+    step_t = time.time()
     # Skip vision encoder params — they weren't merged, so don't "protect" them
     if protection.merge_count > 0:
         print(f"\n[merge] Applying sequential merge protection (ARM + OTMF + MagMax)...")
         target_model.load_state_dict(target_state)
         print(f"[merge] Protected {protected_count} language params (skipped {vision_skipped} vision params)")
+    print(f"[merge] Step 8/10 done in {time.time()-step_t:.0f}s"); sys.stdout.flush()
     # --- Step 8.5: Extract post-merge activations for ARM/OTMF ---
+    print(f"\n[merge] Step 8.5/10: Post-merge activations + ARM/OTMF prep..."); sys.stdout.flush()
+    step_t = time.time()
     post_merge_activations = extract_activations(target_model, calibration_data[:100])
     # Record this merge's delta + compute ARM/OTMF for next merge
         post_merge_activations=post_merge_activations,
     )
+    print(f"[merge] Step 8.5/10 done in {time.time()-step_t:.0f}s"); sys.stdout.flush()
     # --- Step 8.8: Save residuals (what was lost from both sides) ---
+    print(f"\n[merge] Step 9/10: Saving residuals..."); sys.stdout.flush()
+    step_t = time.time()
     if residual_bank is not None:
         print(f"\n[merge] Saving residuals for {stage_name}...")
         residual_bank.save_residuals(
             source_config=source_config,
         )
+    print(f"[merge] Step 9/10 done in {time.time()-step_t:.0f}s"); sys.stdout.flush()
     # --- Step 9: Free source model memory ---
     del source_model, source_activations, pre_merge_target_activations
     del transport_plans, post_merge_activations
         torch.cuda.empty_cache()
     # --- Step 10: Validate ---
+    print(f"\n[merge] Step 10/10: Validating merge..."); sys.stdout.flush()
+    step_t = time.time()
     merged_sources.append(stage_name)
     validation = validate_merged_model(
         target_model, target_tokenizer,
         baseline_perplexity=baseline_perplexity,
     )
+    print(f"[merge] Step 10/10 done in {time.time()-step_t:.0f}s"); sys.stdout.flush()
     result["validation"] = validation
     result["merged_sources"] = merged_sources.copy()
+    total_time = time.time() - stage_start
+    print(f"\n[merge] Total time for {stage_name}: {total_time/60:.1f} min"); sys.stdout.flush()
     # --- Kill criteria check ---
     if not validation["overall"]:

hugging/td_fuse/transport.py CHANGED Viewed

@@ -15,11 +15,14 @@ We add:
     - MiMo MTP head handling
     - Falcon SSM component handling
     - Sequential merge protection (MagMax + orthogonal projection)
 Findings: #01, #07, #24
 """
 import sys
 import torch
 import numpy as np
 from pathlib import Path
@@ -30,6 +33,58 @@ from datasets import load_dataset
 from .config import MergeConfig, ModelConfig, TARGET
 def setup_tm_repo(cfg: MergeConfig):
     """Add official T&M repo to Python path so we can import their code."""
     repo_path = Path(cfg.tm_repo_path)
@@ -58,6 +113,7 @@ def load_calibration_data(cfg: MergeConfig, tokenizer: AutoTokenizer) -> list:
     Findings: #08
     """
     print(f"[transport] Loading calibration data ({cfg.calibration_samples} samples)...")
     samples = []
@@ -84,9 +140,12 @@ def load_calibration_data(cfg: MergeConfig, tokenizer: AutoTokenizer) -> list:
                 )
                 samples.append(tokens)
                 count += 1
         print(f"  Pile general: {count} samples")
     except Exception as e:
-        print(f"  ⚠ Pile failed: {e}")
         print(f"  Falling back to neuralmagic only")
     # --- neuralmagic: Q&A calibration (up to remaining) ---
@@ -112,11 +171,16 @@ def load_calibration_data(cfg: MergeConfig, tokenizer: AutoTokenizer) -> list:
                     )
                     samples.append(tokens)
                     count += 1
             print(f"  neuralmagic: {count} samples")
         except Exception as e:
-            print(f"  ⚠ neuralmagic failed: {e}")
     print(f"[transport] Total calibration samples: {len(samples)}")
     return samples
@@ -133,9 +197,12 @@ def extract_activations(
     optimal transport algorithm aligns between source and target.
     Returns:
-        Dict mapping layer_name → activation tensor [num_samples, hidden_dim]
     """
     print(f"[transport] Extracting activations from {len(calibration_data)} samples...")
     activations = {}
     hooks = []
@@ -153,7 +220,7 @@ def extract_activations(
                         act = output
                     if layer_name not in activations:
                         activations[layer_name] = []
-                    # Mean pool over sequence length → [hidden_dim]
                     activations[layer_name].append(
                         act.detach().float().mean(dim=1).cpu()
                     )
@@ -170,20 +237,31 @@ def extract_activations(
             try:
                 model(**inputs)
             except Exception as e:
-                print(f"  ⚠ Sample {i} failed: {e}")
                 continue
             if (i + 1) % 100 == 0:
                 print(f"  Processed {i + 1}/{len(calibration_data)} samples")
     # Remove hooks
     for h in hooks:
         h.remove()
     # Stack activations: [num_samples, hidden_dim]
     for key in activations:
         activations[key] = torch.cat(activations[key], dim=0)
-        print(f"  {key}: {activations[key].shape}")
     return activations
@@ -199,13 +277,14 @@ def compute_transport_plans(
     This is where the magic happens. We use the official T&M code's:
     - corr_distance_matrix: correlation distance between activation vectors
     - sinkhorn_uniform_streaming: memory-efficient Sinkhorn solver
-    - compute_P: layer-level coupling (which source layers → which target layers)
     - compute_Q_and_layer_costs: neuron-level coupling within each layer pair
     Returns:
         Dict with 'P' (layer coupling) and 'Q' (per-layer neuron coupling) matrices
     """
     print("[transport] Computing transport plans...")
     try:
         # Try importing official T&M code
@@ -264,41 +343,138 @@ def _compute_plans_fallback(
     """
     Fallback transport plan computation when official code isn't available.
-    Uses correlation distance + basic Sinkhorn. Less optimised than official
-    but functionally correct for testing.
     """
     source_layers = sorted(source_act.keys())
     target_layers = sorted(target_act.keys())
-    # --- Step 1: Correlation distance matrices per layer pair ---
-    Q_matrices = {}
-    layer_costs = np.zeros((len(source_layers), len(target_layers)))
     for i, sl in enumerate(source_layers):
         for j, tl in enumerate(target_layers):
-            if sl not in source_act or tl not in target_act:
-                continue
-            S = source_act[sl].numpy()  # [samples, hidden_dim_source]
-            T = target_act[tl].numpy()  # [samples, hidden_dim_target]
-            # Correlation distance: 1 - pearson_correlation
-            # Between each pair of neurons across samples
-            # S: [samples, n_source], T: [samples, n_target]
-            S_norm = (S - S.mean(0)) / (S.std(0) + 1e-8)
-            T_norm = (T - T.mean(0)) / (T.std(0) + 1e-8)
-            corr = S_norm.T @ T_norm / S.shape[0]  # [n_source, n_target]
-            cost = 1.0 - corr  # Correlation distance
-            # Basic Sinkhorn on this cost matrix
-            Q = _sinkhorn(cost, reg=cfg.sinkhorn_reg, max_iter=cfg.sinkhorn_max_iter)
-            Q_matrices[(sl, tl)] = Q
-            layer_costs[i, j] = cost.mean()
-    # --- Step 2: Layer coupling (P matrix) ---
-    P = _sinkhorn(layer_costs, reg=cfg.sinkhorn_reg, max_iter=cfg.sinkhorn_max_iter)
     return {
         "P": P,
         "Q": Q_matrices,
@@ -327,7 +503,7 @@ def _sinkhorn(
     u = np.ones(n) / n
     v = np.ones(m) / m
-    for _ in range(max_iter):
         u = 1.0 / (K @ v + 1e-10)
         v = 1.0 / (K.T @ u + 1e-10)
@@ -354,13 +530,14 @@ def fuse_weights(
     Special handling per model:
     - DeepSeek: Direct merge (same architecture)
     - MiMo: Skip MTP heads, skip embeddings
-    - Llama: Layer mapping (32→36), skip embeddings, drop QKV bias
     - Falcon: Skip Mamba components, skip embeddings
     Returns:
         Target model with fused weights
     """
-    print(f"\n[transport] Fusing {source_config.name} → target")
     alpha = source_config.merge_alpha
     try:
@@ -380,8 +557,12 @@ def fuse_weights(
     fused_count = 0
     skipped_count = 0
     for target_key in target_state:
         # Skip parameters we shouldn't merge
         if _should_skip(target_key, source_config):
             skipped_count += 1
@@ -409,18 +590,27 @@ def fuse_weights(
         target_state[target_key] = fused_w
         fused_count += 1
-    # Apply thinking mode protection
-    if cfg.freeze_think_tokens and "embed_tokens" in target_key:
-        for token_id in cfg.think_token_ids:
-            if token_id < target_state["model.embed_tokens.weight"].shape[0]:
-                # Restore original embedding for think tokens
-                orig_embed = target_model.state_dict()["model.embed_tokens.weight"]
-                target_state["model.embed_tokens.weight"][token_id] = orig_embed[token_id]
-                print(f"[transport] Protected think token {token_id}")
     # Load fused weights
     target_model.load_state_dict(target_state)
     print(f"[transport] Fused {fused_count} params, skipped {skipped_count}")
     return target_model
@@ -457,7 +647,7 @@ def _map_key(target_key: str, source_config: ModelConfig) -> Optional[str]:
     if source_config.architecture == "transformer" and source_config.layers == 36:
         return target_key
-    # For Llama (32 layers → 36 layers), map layer indices
     if "layer_mapping_32_to_36" in source_config.special_handling:
         if "model.layers." in target_key:
             # Extract layer number
@@ -523,5 +713,5 @@ def _align_dimensions(
         result[:min_len] = source_w[:min_len]
         return result
-    # Can't align — skip this parameter
     return None

     - MiMo MTP head handling
     - Falcon SSM component handling
     - Sequential merge protection (MagMax + orthogonal projection)
+    - Progress reporting every 5 minutes
+    - Timeouts to prevent infinite hangs
 Findings: #01, #07, #24
 """
 import sys
+import time
 import torch
 import numpy as np
 from pathlib import Path
 from .config import MergeConfig, ModelConfig, TARGET
+# ============================================================================
+# PROGRESS TRACKER — prints status every 5 minutes so you know it's alive
+# ============================================================================
class ProgressTracker:
    """Print periodic heartbeat lines for a long-running task.

    A tracker is created when the task starts, ``tick()`` is called once per
    unit of work, and ``done()`` is called at the end. ``check_timeout()`` can
    be called from inside the loop to abort a task that has run too long.

    Elapsed time is measured with ``time.monotonic()`` so that wall-clock
    adjustments cannot produce negative durations or spurious timeouts;
    ``start_time`` and ``last_report`` are therefore monotonic readings, not
    epoch timestamps. The human-readable start banner still uses the wall
    clock.
    """

    def __init__(self, task_name: str, interval_seconds: int = 300):
        """Start timing ``task_name`` and print a start banner.

        Args:
            task_name: Label placed in brackets in front of every message.
            interval_seconds: Minimum number of seconds between heartbeats.
        """
        self.task_name = task_name
        self.interval = interval_seconds
        self.start_time = time.monotonic()
        self.last_report = self.start_time
        self.step = 0
        self.total_steps = 0  # 0 means "unknown": no percentage / ETA is shown
        # Flushed like every other message so the banner is not held back in
        # a buffer when stdout is redirected to a file or pipe.
        print(
            f"\n[{task_name}] Started at {time.strftime('%H:%M:%S')}",
            flush=True,
        )

    def set_total(self, total: int):
        """Record the expected number of ``tick()`` calls."""
        self.total_steps = total

    def tick(self, step_name: str = ""):
        """Count one step; print a heartbeat if the interval has elapsed.

        Args:
            step_name: Free-form description of the current step, appended
                to the heartbeat line.
        """
        self.step += 1
        now = time.monotonic()
        if now - self.last_report < self.interval:
            return

        elapsed = now - self.start_time
        if self.total_steps:
            pct = (
                f"{self.step}/{self.total_steps} "
                f"({100*self.step/self.total_steps:.0f}%)"
            )
        else:
            pct = f"step {self.step}"

        eta = ""
        if self.total_steps and self.step > 0:
            rate = elapsed / self.step
            # Clamp at zero: once more ticks than the declared total have
            # happened, a negative ETA would be meaningless.
            remaining = max(self.total_steps - self.step, 0) * rate
            eta = f", ETA {remaining/60:.1f} min"

        print(
            f"[{self.task_name}] HEARTBEAT — {pct}, "
            f"elapsed {elapsed/60:.1f} min{eta} | {step_name}",
            flush=True,
        )
        self.last_report = now

    def done(self):
        """Print the total elapsed time for the task."""
        elapsed = time.monotonic() - self.start_time
        print(
            f"[{self.task_name}] Completed in {elapsed/60:.1f} min ({elapsed:.0f}s)",
            flush=True,
        )

    def check_timeout(self, timeout_seconds: int = 3600):
        """Raise if the task has been running longer than ``timeout_seconds``.

        Raises:
            TimeoutError: When the time since construction exceeds the limit.
        """
        elapsed = time.monotonic() - self.start_time
        if elapsed > timeout_seconds:
            raise TimeoutError(
                f"[{self.task_name}] TIMEOUT after {elapsed/60:.1f} min "
                f"(limit: {timeout_seconds/60:.0f} min). Something is wrong."
            )
 def setup_tm_repo(cfg: MergeConfig):
     """Add official T&M repo to Python path so we can import their code."""
     repo_path = Path(cfg.tm_repo_path)
     Findings: #08
     """
+    tracker = ProgressTracker("calibration-data", interval_seconds=120)
     print(f"[transport] Loading calibration data ({cfg.calibration_samples} samples)...")
     samples = []
                 )
                 samples.append(tokens)
                 count += 1
+                if count % 100 == 0:
+                    print(f"  Pile: {count}/600 samples loaded...")
+                    sys.stdout.flush()
         print(f"  Pile general: {count} samples")
     except Exception as e:
+        print(f"  WARNING: Pile failed: {e}")
         print(f"  Falling back to neuralmagic only")
     # --- neuralmagic: Q&A calibration (up to remaining) ---
                     )
                     samples.append(tokens)
                     count += 1
+                    if count % 100 == 0:
+                        print(f"  neuralmagic: {count}/{remaining} samples loaded...")
+                        sys.stdout.flush()
             print(f"  neuralmagic: {count} samples")
         except Exception as e:
+            print(f"  WARNING: neuralmagic failed: {e}")
+    tracker.done()
     print(f"[transport] Total calibration samples: {len(samples)}")
+    sys.stdout.flush()
     return samples
     optimal transport algorithm aligns between source and target.
     Returns:
+        Dict mapping layer_name -> activation tensor [num_samples, hidden_dim]
     """
+    tracker = ProgressTracker("extract-activations", interval_seconds=300)
+    tracker.set_total(len(calibration_data))
     print(f"[transport] Extracting activations from {len(calibration_data)} samples...")
+    sys.stdout.flush()
     activations = {}
     hooks = []
                         act = output
                     if layer_name not in activations:
                         activations[layer_name] = []
+                    # Mean pool over sequence length -> [hidden_dim]
                     activations[layer_name].append(
                         act.detach().float().mean(dim=1).cpu()
                     )
             try:
                 model(**inputs)
             except Exception as e:
+                print(f"  WARNING: Sample {i} failed: {e}")
                 continue
+            tracker.tick(f"sample {i+1}")
             if (i + 1) % 100 == 0:
                 print(f"  Processed {i + 1}/{len(calibration_data)} samples")
+                sys.stdout.flush()
+            # Timeout: 30 min for activation extraction
+            tracker.check_timeout(timeout_seconds=1800)
     # Remove hooks
     for h in hooks:
         h.remove()
     # Stack activations: [num_samples, hidden_dim]
+    layer_count = 0
     for key in activations:
         activations[key] = torch.cat(activations[key], dim=0)
+        layer_count += 1
+    print(f"  Extracted {layer_count} layers, shapes: {activations[list(activations.keys())[0]].shape if activations else 'empty'}")
+    tracker.done()
+    sys.stdout.flush()
     return activations
     This is where the magic happens. We use the official T&M code's:
     - corr_distance_matrix: correlation distance between activation vectors
     - sinkhorn_uniform_streaming: memory-efficient Sinkhorn solver
+    - compute_P: layer-level coupling (which source layers -> which target layers)
     - compute_Q_and_layer_costs: neuron-level coupling within each layer pair
     Returns:
         Dict with 'P' (layer coupling) and 'Q' (per-layer neuron coupling) matrices
     """
     print("[transport] Computing transport plans...")
+    sys.stdout.flush()
     try:
         # Try importing official T&M code
     """
     Fallback transport plan computation when official code isn't available.
+    Smart routing:
+    - Same-architecture models (same layer count): direct 1:1 layer matching
+      (no OT needed, just identity permutation -- fast!)
+    - Cross-architecture: sparse OT (only top-3 source layers per target)
     """
+    tracker = ProgressTracker("transport-plans", interval_seconds=300)
     source_layers = sorted(source_act.keys())
     target_layers = sorted(target_act.keys())
+    n_source = len(source_layers)
+    n_target = len(target_layers)
+    print(f"[transport] Source layers: {n_source}, Target layers: {n_target}")
+    sys.stdout.flush()
+    # --- FAST PATH: same architecture (same layer count) ---
+    # DeepSeek-R1-0528-Qwen3-8B has the same architecture as Qwen3-VL-8B
+    # Both have 36 transformer layers with identical hidden dims
+    # No need for expensive OT -- just match layers 1:1
+    if n_source == n_target:
+        print("[transport] Same layer count -- using direct 1:1 layer matching (fast path)")
+        print("[transport] This should take under 1 minute...")
+        sys.stdout.flush()
+        Q_matrices = {}
+        P = np.eye(n_source) / n_source  # Identity coupling
+        tracker.set_total(n_source)
+        for i, (sl, tl) in enumerate(zip(source_layers, target_layers)):
+            S = source_act[sl].numpy()
+            T = target_act[tl].numpy()
+            # For same-dim layers, Q is identity (neurons already correspond)
+            if S.shape[1] == T.shape[1]:
+                Q_matrices[(sl, tl)] = np.eye(S.shape[1]) / S.shape[1]
+            else:
+                # Different dims -- do lightweight Sinkhorn on this pair only
+                print(f"  Layer {i}: dim mismatch ({S.shape[1]} vs {T.shape[1]}), using Sinkhorn...")
+                S_norm = (S - S.mean(0)) / (S.std(0) + 1e-8)
+                T_norm = (T - T.mean(0)) / (T.std(0) + 1e-8)
+                corr = S_norm.T @ T_norm / S.shape[0]
+                cost = 1.0 - corr
+                Q_matrices[(sl, tl)] = _sinkhorn(cost, reg=0.1, max_iter=50)
+            tracker.tick(f"{sl} -> {tl}")
+            if (i + 1) % 10 == 0 or i == 0:
+                print(f"  Matched layer {i + 1}/{n_source}: {sl} -> {tl}")
+                sys.stdout.flush()
+            # Timeout: 10 min for fast path (should take seconds)
+            tracker.check_timeout(timeout_seconds=600)
+        print(f"[transport] Direct matching complete: {n_source} layer pairs")
+        tracker.done()
+        sys.stdout.flush()
+        return {
+            "P": P,
+            "Q": Q_matrices,
+            "source_layers": source_layers,
+            "target_layers": target_layers,
+        }
+    # --- CROSS-ARCHITECTURE PATH: sparse OT ---
+    # Only compute top-3 source layers per target (not all NxN pairs)
+    print(f"[transport] Cross-architecture -- using sparse OT (top-3 per target)")
+    print(f"[transport] Estimated time: 5-15 minutes")
+    sys.stdout.flush()
+    # Step 1: Compute layer-level similarity (cheap: just mean activation correlation)
+    print("[transport] Step 1/3: Computing layer-level similarities...")
+    sys.stdout.flush()
+    layer_costs = np.zeros((n_source, n_target))
+    tracker.set_total(n_source * n_target + n_target * 3)
     for i, sl in enumerate(source_layers):
         for j, tl in enumerate(target_layers):
+            S_mean = source_act[sl].mean(0).numpy()
+            T_mean = target_act[tl].mean(0).numpy()
+            # Cosine similarity as cheap proxy
+            min_dim = min(len(S_mean), len(T_mean))
+            s = S_mean[:min_dim]
+            t = T_mean[:min_dim]
+            sim = np.dot(s, t) / (np.linalg.norm(s) * np.linalg.norm(t) + 1e-8)
+            layer_costs[i, j] = 1.0 - sim
+            tracker.tick(f"layer sim {i},{j}")
+        # Timeout: 30 min for cross-arch
+        tracker.check_timeout(timeout_seconds=1800)
+    print(f"[transport] Step 1/3 done: {n_source}x{n_target} similarities computed")
+    sys.stdout.flush()
+    # Step 2: For each target layer, only compute Q for top-3 most similar source layers
+    print("[transport] Step 2/3: Computing neuron-level transport (top-3 per target)...")
+    sys.stdout.flush()
+    Q_matrices = {}
+    for j, tl in enumerate(target_layers):
+        top3 = np.argsort(layer_costs[:, j])[:3]
+        for i in top3:
+            sl = source_layers[i]
+            S = source_act[sl].numpy()
+            T = target_act[tl].numpy()
+            # Lightweight Sinkhorn (50 iterations, not 100+)
+            min_dim = min(S.shape[1], T.shape[1])
+            S_sub = S[:, :min_dim]
+            T_sub = T[:, :min_dim]
+            S_norm = (S_sub - S_sub.mean(0)) / (S_sub.std(0) + 1e-8)
+            T_norm = (T_sub - T_sub.mean(0)) / (T_sub.std(0) + 1e-8)
+            corr = S_norm.T @ T_norm / S.shape[0]
+            cost = 1.0 - corr
+            Q_matrices[(sl, tl)] = _sinkhorn(cost, reg=0.1, max_iter=50)
+            tracker.tick(f"Q({sl},{tl})")
+        if (j + 1) % 5 == 0 or j == 0:
+            print(f"  Target layer {j + 1}/{n_target}: matched to top-3 sources")
+            sys.stdout.flush()
+        # Timeout: 30 min for cross-arch
+        tracker.check_timeout(timeout_seconds=1800)
+    print(f"[transport] Step 2/3 done: {len(Q_matrices)} Q matrices computed")
+    sys.stdout.flush()
+    # Step 3: Layer coupling via Sinkhorn on layer costs
+    print("[transport] Step 3/3: Computing layer coupling P matrix...")
+    sys.stdout.flush()
+    P = _sinkhorn(layer_costs, reg=0.1, max_iter=50)
+    print(f"[transport] Sparse OT complete: {len(Q_matrices)} layer pairs computed")
+    tracker.done()
+    sys.stdout.flush()
     return {
         "P": P,
         "Q": Q_matrices,
     u = np.ones(n) / n
     v = np.ones(m) / m
+    for iteration in range(max_iter):
         u = 1.0 / (K @ v + 1e-10)
         v = 1.0 / (K.T @ u + 1e-10)
     Special handling per model:
     - DeepSeek: Direct merge (same architecture)
     - MiMo: Skip MTP heads, skip embeddings
+    - Llama: Layer mapping (32->36), skip embeddings, drop QKV bias
     - Falcon: Skip Mamba components, skip embeddings
     Returns:
         Target model with fused weights
     """
+    tracker = ProgressTracker("fuse-weights", interval_seconds=300)
+    print(f"\n[transport] Fusing {source_config.name} -> target")
     alpha = source_config.merge_alpha
     try:
     fused_count = 0
     skipped_count = 0
+    total_params = len(target_state)
+    tracker.set_total(total_params)
     for target_key in target_state:
+        tracker.tick(target_key)
         # Skip parameters we shouldn't merge
         if _should_skip(target_key, source_config):
             skipped_count += 1
         target_state[target_key] = fused_w
         fused_count += 1
+        # Apply thinking mode protection (inside loop -- check each key)
+        if cfg.freeze_think_tokens and "embed_tokens" in target_key:
+            for token_id in cfg.think_token_ids:
+                if token_id < target_state[target_key].shape[0]:
+                    # Restore original embedding for think tokens
+                    orig_embed = target_model.state_dict()[target_key]
+                    target_state[target_key][token_id] = orig_embed[token_id]
+                    print(f"[transport] Protected think token {token_id}")
+        if fused_count % 50 == 0:
+            print(f"  Fused {fused_count} params so far (skipped {skipped_count})...")
+            sys.stdout.flush()
+        # Timeout: 20 min for weight fusion
+        tracker.check_timeout(timeout_seconds=1200)
     # Load fused weights
     target_model.load_state_dict(target_state)
     print(f"[transport] Fused {fused_count} params, skipped {skipped_count}")
+    tracker.done()
+    sys.stdout.flush()
     return target_model
     if source_config.architecture == "transformer" and source_config.layers == 36:
         return target_key
+    # For Llama (32 layers -> 36 layers), map layer indices
     if "layer_mapping_32_to_36" in source_config.special_handling:
         if "model.layers." in target_key:
             # Extract layer number
         result[:min_len] = source_w[:min_len]
         return result
+    # Can't align -- skip this parameter
     return None

hugging/td_fuse/validate.py CHANGED Viewed

@@ -11,6 +11,8 @@ Kill criteria: >10% performance drop on any test → abort merge.
 Findings: #11, #22, #25
 """
 import torch
 import math
 from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -39,9 +41,12 @@ def validate_merged_model(
     Returns:
         Dict with test results and overall pass/fail
     """
     print("\n" + "=" * 60)
     print(f"VALIDATION — After merging: {', '.join(merged_sources)}")
     print("=" * 60)
     results = {
         "canary": None,
@@ -52,6 +57,7 @@ def validate_merged_model(
     }
     # --- Test 1: Canary recall ---
     canary_results = test_all_canaries(model, tokenizer, merged_sources)
     passed_canaries = sum(1 for v in canary_results.values() if v)
     total_canaries = len(canary_results)
@@ -63,6 +69,7 @@ def validate_merged_model(
     }
     # --- Test 2: Perplexity ---
     perplexity = compute_perplexity(model, tokenizer)
     ppl_ok = True
     if baseline_perplexity is not None:
@@ -76,10 +83,12 @@ def validate_merged_model(
     results["perplexity"] = {"value": perplexity, "ok": ppl_ok}
     # --- Test 3: Thinking mode ---
     think_ok = test_thinking_mode(model, tokenizer)
     results["thinking_mode"] = {"ok": think_ok}
     # --- Test 4: Quick reasoning ---
     reason_ok = test_reasoning(model, tokenizer)
     results["reasoning"] = {"ok": reason_ok}
@@ -100,8 +109,10 @@ def validate_merged_model(
     print(f"  Perplexity:      {'✓' if ppl_ok else '✗'} ({perplexity:.2f})")
     print(f"  Thinking mode:   {'✓' if think_ok else '✗'}")
     print(f"  Reasoning:       {'✓' if reason_ok else '✗'}")
-    print(f"  OVERALL:         {'✓ PASS' if all_ok else '✗ FAIL — consider aborting'}")
     print("-" * 60)
     return results

 Findings: #11, #22, #25
 """
+import sys
+import time
 import torch
 import math
 from transformers import AutoModelForCausalLM, AutoTokenizer
     Returns:
         Dict with test results and overall pass/fail
     """
+    val_start = time.time()
     print("\n" + "=" * 60)
     print(f"VALIDATION — After merging: {', '.join(merged_sources)}")
+    print(f"Started at: {time.strftime('%H:%M:%S')}")
     print("=" * 60)
+    sys.stdout.flush()
     results = {
         "canary": None,
     }
     # --- Test 1: Canary recall ---
+    print("[validate] Test 1/4: Canary recall..."); sys.stdout.flush()
     canary_results = test_all_canaries(model, tokenizer, merged_sources)
     passed_canaries = sum(1 for v in canary_results.values() if v)
     total_canaries = len(canary_results)
     }
     # --- Test 2: Perplexity ---
+    print("[validate] Test 2/4: Perplexity..."); sys.stdout.flush()
     perplexity = compute_perplexity(model, tokenizer)
     ppl_ok = True
     if baseline_perplexity is not None:
     results["perplexity"] = {"value": perplexity, "ok": ppl_ok}
     # --- Test 3: Thinking mode ---
+    print("[validate] Test 3/4: Thinking mode..."); sys.stdout.flush()
     think_ok = test_thinking_mode(model, tokenizer)
     results["thinking_mode"] = {"ok": think_ok}
     # --- Test 4: Quick reasoning ---
+    print("[validate] Test 4/4: Quick reasoning..."); sys.stdout.flush()
     reason_ok = test_reasoning(model, tokenizer)
     results["reasoning"] = {"ok": reason_ok}
     print(f"  Perplexity:      {'✓' if ppl_ok else '✗'} ({perplexity:.2f})")
     print(f"  Thinking mode:   {'✓' if think_ok else '✗'}")
     print(f"  Reasoning:       {'✓' if reason_ok else '✗'}")
+    print(f"  OVERALL:         {'PASS' if all_ok else 'FAIL -- consider aborting'}")
+    print(f"  Validation time: {(time.time()-val_start)/60:.1f} min")
     print("-" * 60)
+    sys.stdout.flush()
     return results

hugging/td_lang/engine/canary.py CHANGED Viewed

@@ -186,7 +186,7 @@ def test_all_canaries(
     results = {}
     # Test the target model's canary
-    results["Qwen3-8B"] = test_canary(model, tokenizer, "Qwen3-8B")
     # Test each merged source model's canary
     for source_name in merged_sources:

     results = {}
     # Test the target model's canary
+    results["Qwen3-VL-8B"] = test_canary(model, tokenizer, "Qwen3-VL-8B")
     # Test each merged source model's canary
     for source_name in merged_sources: