td-builder
/

td-toolkit

Safetensors

Model card Files Files and versions

xet

Community

td-builder committed on Feb 26

Commit

1e0b51b

verified ·

1 Parent(s): 1e0fbdd

Upload 137 files

Browse files

Files changed (2) hide show

hugging/td_fuse/merge.py +71 -4
hugging/td_fuse/transport.py +38 -10

hugging/td_fuse/merge.py CHANGED Viewed

@@ -134,7 +134,7 @@ class MergeProtection:
         4. MagMax — protect top magnitude params (extra safety layer)
         """
         fused = target_state[key]
-        original = pre_merge_state[key]
         delta = fused - original
         # --- ARM Steering (new, replaces orthogonal projection) ---
@@ -203,7 +203,7 @@ class MergeProtection:
         for key in current_state:
             if key in pre_merge_state:
-                delta = current_state[key].float() - pre_merge_state[key].float()
                 if delta.abs().max() > 1e-8:
                     if key not in self.previous_deltas:
                         self.previous_deltas[key] = []
@@ -268,9 +268,65 @@ def get_source_by_stage(stage_name: str) -> Optional[ModelConfig]:
     return None
 def load_model(config: ModelConfig, cfg: MergeConfig) -> tuple:
     """Load a model and its tokenizer/processor."""
-    print(f"\n[merge] Loading {config.name} ({config.hf_id})...")
     # Qwen3-VL uses a processor (handles both text + vision), not just a tokenizer
     if config.architecture == "transformer+vision":
@@ -298,6 +354,7 @@ def load_model(config: ModelConfig, cfg: MergeConfig) -> tuple:
             )
             lang_params = sum(p.numel() for p in model.parameters()) - vision_params
             print(f"[merge]   Language: {lang_params / 1e9:.1f}B  |  Vision: {vision_params / 1e9:.1f}B")
             return model, tokenizer
         except ImportError:
@@ -318,6 +375,7 @@ def load_model(config: ModelConfig, cfg: MergeConfig) -> tuple:
     )
     print(f"[merge] Loaded {config.name}: {sum(p.numel() for p in model.parameters()) / 1e9:.1f}B params")
     return model, tokenizer
@@ -888,6 +946,7 @@ def run_pipeline(
     if cfg is None:
         cfg = MergeConfig()
     print("\n" + "=" * 70)
     print("TD FUSE — Transport and Merge Pipeline")
     print(f"Target: {TARGET.name} ({TARGET.hf_id})")
@@ -895,13 +954,18 @@ def run_pipeline(
         print(f"Mode: Vision-Language (merging language backbone only, vision encoder untouched)")
     print(f"Stages: {', '.join(stages)}")
     print(f"Output: {cfg.output_dir}")
     print("=" * 70)
     # Setup
     try:
         setup_tm_repo(cfg)
     except FileNotFoundError as e:
-        print(f"\n⚠ {e}")
         print("Continuing with fallback implementation...")
     # Create output directories
@@ -1020,10 +1084,13 @@ def run_pipeline(
         emoji = "✓" if status == "passed" else "✗"
         print(f"  {emoji} {stage_name}: {status}")
     print(f"\n  Overall: {pipeline_results['overall_status']}")
     if residual_bank.residual_index:
         print(f"\n  Residuals saved for: {', '.join(residual_bank.residual_index.keys())}")
         print(f"  To recover lost knowledge later:")
         print(f"    python -m td_fuse.run --reinject <stage> --strength 0.2")
     print("=" * 70)
     return pipeline_results

         4. MagMax — protect top magnitude params (extra safety layer)
         """
         fused = target_state[key]
+        original = pre_merge_state[key].to(fused.device)
         delta = fused - original
         # --- ARM Steering (new, replaces orthogonal projection) ---
         for key in current_state:
             if key in pre_merge_state:
+                delta = current_state[key].cpu().float() - pre_merge_state[key].cpu().float()
                 if delta.abs().max() > 1e-8:
                     if key not in self.previous_deltas:
                         self.previous_deltas[key] = []
     return None
+def check_model_cached(hf_id: str) -> bool:
+    """
+    Report whether a model appears to be present in the local HuggingFace cache.
+    Looks up "config.json" for `hf_id` via `try_to_load_from_cache` and returns
+    True only when that lookup yields a string. Any other lookup result, or any
+    exception raised along the way (including a failed import of
+    `huggingface_hub`), yields False.
+    NOTE(review): `model_info` is imported below but never used in this function.
+    NOTE(review): only config.json is probed, so a True result does not by itself
+    show that the weight files are cached as well.
+    """
+    try:
+        from huggingface_hub import try_to_load_from_cache, model_info
+        # Quick check: probe for config.json only (presumably every model repo ships one -- TODO confirm)
+        cached = try_to_load_from_cache(hf_id, "config.json")
+        if cached is not None and isinstance(cached, str):  # the isinstance test already excludes None
+            return True
+    except Exception:  # best-effort probe: every failure is treated as "not cached"
+        pass
+    return False
+def check_all_models_cached(stages: list) -> dict:
+    """
+    Pre-flight check: report which of the needed models are already cached.
+    Checks TARGET first, then the source model that `get_source_by_stage`
+    returns for each entry of `stages`; stages for which it returns a falsy
+    value are skipped without any output. One line per model is printed to
+    stdout (name, CACHED / WILL DOWNLOAD, hub id), followed by a summary, and
+    stdout is flushed after the banner and again at the end.
+    Args:
+        stages: stage names, each passed to `get_source_by_stage`.
+    Returns:
+        dict mapping model name -> bool (True when `check_model_cached`
+        reported the model as cached). Entries are keyed by name, so two
+        models sharing a name would collapse into one entry.
+    """
+    print("\n" + "=" * 60)
+    print("PRE-FLIGHT CHECK: Model cache status")
+    print("=" * 60)
+    sys.stdout.flush()
+    status = {}
+    # Target model: always checked, independent of the requested stages
+    cached = check_model_cached(TARGET.hf_id)
+    tag = "CACHED" if cached else "WILL DOWNLOAD"
+    print(f"  {TARGET.name:25s} {tag:15s} ({TARGET.hf_id})")
+    status[TARGET.name] = cached
+    # Source models for requested stages (a stage with no source is skipped)
+    for stage_name in stages:
+        source = get_source_by_stage(stage_name)
+        if source:
+            cached = check_model_cached(source.hf_id)
+            tag = "CACHED" if cached else "WILL DOWNLOAD"
+            print(f"  {source.name:25s} {tag:15s} ({source.hf_id})")
+            status[source.name] = cached
+    not_cached = [name for name, c in status.items() if not c]  # names whose cache probe came back False
+    if not_cached:
+        print(f"\n  {len(not_cached)} model(s) need downloading: {', '.join(not_cached)}")
+        print(f"  This may take 10-30 min per model depending on connection speed.")
+    else:
+        print(f"\n  All {len(status)} models are cached -- loading will be fast!")
+    print("=" * 60)
+    sys.stdout.flush()
+    return status
 def load_model(config: ModelConfig, cfg: MergeConfig) -> tuple:
     """Load a model and its tokenizer/processor."""
+    load_start = time.time()
+    cached = check_model_cached(config.hf_id)
+    cache_msg = "(from cache)" if cached else "(downloading -- this may take a while)"
+    print(f"\n[merge] Loading {config.name} ({config.hf_id}) {cache_msg}...")
+    sys.stdout.flush()
     # Qwen3-VL uses a processor (handles both text + vision), not just a tokenizer
     if config.architecture == "transformer+vision":
             )
             lang_params = sum(p.numel() for p in model.parameters()) - vision_params
             print(f"[merge]   Language: {lang_params / 1e9:.1f}B  |  Vision: {vision_params / 1e9:.1f}B")
+            print(f"[merge] Loaded in {time.time()-load_start:.0f}s"); sys.stdout.flush()
             return model, tokenizer
         except ImportError:
     )
     print(f"[merge] Loaded {config.name}: {sum(p.numel() for p in model.parameters()) / 1e9:.1f}B params")
+    print(f"[merge] Loaded in {time.time()-load_start:.0f}s"); sys.stdout.flush()
     return model, tokenizer
     if cfg is None:
         cfg = MergeConfig()
+    pipeline_start = time.time()
     print("\n" + "=" * 70)
     print("TD FUSE — Transport and Merge Pipeline")
     print(f"Target: {TARGET.name} ({TARGET.hf_id})")
         print(f"Mode: Vision-Language (merging language backbone only, vision encoder untouched)")
     print(f"Stages: {', '.join(stages)}")
     print(f"Output: {cfg.output_dir}")
+    print(f"Started at: {time.strftime('%H:%M:%S')}")
     print("=" * 70)
+    sys.stdout.flush()
+    # --- Pre-flight: check which models are cached ---
+    check_all_models_cached(stages)
     # Setup
     try:
         setup_tm_repo(cfg)
     except FileNotFoundError as e:
+        print(f"\n WARNING: {e}")
         print("Continuing with fallback implementation...")
     # Create output directories
         emoji = "✓" if status == "passed" else "✗"
         print(f"  {emoji} {stage_name}: {status}")
     print(f"\n  Overall: {pipeline_results['overall_status']}")
+    total_pipeline_time = time.time() - pipeline_start
+    print(f"\n  Total pipeline time: {total_pipeline_time/60:.1f} min ({total_pipeline_time/3600:.1f} hours)")
     if residual_bank.residual_index:
         print(f"\n  Residuals saved for: {', '.join(residual_bank.residual_index.keys())}")
         print(f"  To recover lost knowledge later:")
         print(f"    python -m td_fuse.run --reinject <stage> --strength 0.2")
     print("=" * 70)
+    sys.stdout.flush()
     return pipeline_results

hugging/td_fuse/transport.py CHANGED Viewed

@@ -572,6 +572,10 @@ def fuse_weights(
         source_key = _map_key(target_key, source_config)
         if source_key is None or source_key not in source_state:
             skipped_count += 1
             continue
         target_w = target_state[target_key]
@@ -618,6 +622,10 @@ def fuse_weights(
 def _should_skip(key: str, source_config: ModelConfig) -> bool:
     """Determine if a parameter should be skipped during merge."""
     # Always skip if source model says to skip embeddings
     if source_config.skip_embeddings and ("embed_tokens" in key or "lm_head" in key):
         return True
@@ -640,22 +648,42 @@ def _should_skip(key: str, source_config: ModelConfig) -> bool:
     return False
 def _map_key(target_key: str, source_config: ModelConfig) -> Optional[str]:
     """Map a target model parameter name to the corresponding source name."""
-    # For same-architecture models (DeepSeek), keys match directly
     if source_config.architecture == "transformer" and source_config.layers == 36:
-        return target_key
     # For Llama (32 layers -> 36 layers), map layer indices
     if "layer_mapping_32_to_36" in source_config.special_handling:
-        if "model.layers." in target_key:
             # Extract layer number
-            parts = target_key.split(".")
             try:
                 layer_idx = int(parts[2])
             except (IndexError, ValueError):
-                return target_key
             # Map 36 target layers to 32 source layers (stride)
             source_layer = int(layer_idx * 32 / 36)
@@ -664,17 +692,17 @@ def _map_key(target_key: str, source_config: ModelConfig) -> Optional[str]:
     # For MiMo (same layer count, different extras), keys mostly match
     if source_config.architecture == "transformer+mtp":
-        if "mtp_head" in target_key:
             return None  # MTP heads don't exist in target
-        return target_key
     # For Falcon hybrid, only attention and MLP keys map
     if source_config.architecture == "hybrid_ssm":
-        if any(k in target_key for k in ["self_attn", "mlp", "layer_norm"]):
-            return target_key  # These exist in both
         return None  # Mamba components don't map
-    return target_key
 def _align_dimensions(

         source_key = _map_key(target_key, source_config)
         if source_key is None or source_key not in source_state:
             skipped_count += 1
+            # Log first few misses to help debug key mapping issues
+            if skipped_count <= 5:
+                print(f"  [skip] No source match for: {target_key} (mapped to: {source_key})")
+                sys.stdout.flush()
             continue
         target_w = target_state[target_key]
 def _should_skip(key: str, source_config: ModelConfig) -> bool:
     """Determine if a parameter should be skipped during merge."""
+    # Skip vision encoder params (Qwen3-VL) -- these should never be merged
+    if key.startswith("visual") or key.startswith("merger") or key.startswith("model.visual") or key.startswith("model.merger"):
+        return True
     # Always skip if source model says to skip embeddings
     if source_config.skip_embeddings and ("embed_tokens" in key or "lm_head" in key):
         return True
     return False
+def _strip_vl_prefix(key: str) -> str:
+    """
+    Remove the 'language_model.' segment that Qwen3-VL adds to parameter names.
+    Qwen3-VL wraps all language params under 'model.language_model.*'
+    but source models (DeepSeek, MiMo, Llama, Falcon) use 'model.*' directly.
+    Example:
+        target: model.language_model.layers.0.self_attn.q_proj.weight
+        source: model.layers.0.self_attn.q_proj.weight
+    Keys that do not contain 'language_model.' are returned unchanged.
+    NOTE(review): this is a substring replacement, not a true prefix strip:
+    `str.replace` removes every occurrence of 'language_model.' wherever it
+    appears in the key. Confirm that no target key carries the substring in a
+    position that must be preserved.
+    """
+    # model.language_model.X -> model.X (every occurrence of the substring is removed)
+    if "language_model." in key:
+        return key.replace("language_model.", "")
+    return key
 def _map_key(target_key: str, source_config: ModelConfig) -> Optional[str]:
     """Map a target model parameter name to the corresponding source name."""
+    # Step 1: Strip Qwen3-VL's language_model. prefix so we can match source keys
+    source_key = _strip_vl_prefix(target_key)
+    # For same-architecture models (DeepSeek), keys match directly after prefix strip
     if source_config.architecture == "transformer" and source_config.layers == 36:
+        return source_key
     # For Llama (32 layers -> 36 layers), map layer indices
     if "layer_mapping_32_to_36" in source_config.special_handling:
+        if "model.layers." in source_key:
             # Extract layer number
+            parts = source_key.split(".")
             try:
                 layer_idx = int(parts[2])
             except (IndexError, ValueError):
+                return source_key
             # Map 36 target layers to 32 source layers (stride)
             source_layer = int(layer_idx * 32 / 36)
     # For MiMo (same layer count, different extras), keys mostly match
     if source_config.architecture == "transformer+mtp":
+        if "mtp_head" in source_key:
             return None  # MTP heads don't exist in target
+        return source_key
     # For Falcon hybrid, only attention and MLP keys map
     if source_config.architecture == "hybrid_ssm":
+        if any(k in source_key for k in ["self_attn", "mlp", "layer_norm"]):
+            return source_key  # These exist in both
         return None  # Mamba components don't map
+    return source_key
 def _align_dimensions(