Current td_fuse code with all fixes

Browse files

Files changed (12) hide show

td_fuse/__pycache__/__init__.cpython-312.pyc +0 -0
td_fuse/__pycache__/canary.cpython-312.pyc +0 -0
td_fuse/__pycache__/config.cpython-312.pyc +0 -0
td_fuse/__pycache__/heal.cpython-312.pyc +0 -0
td_fuse/__pycache__/merge.cpython-312.pyc +0 -0
td_fuse/__pycache__/techniques.cpython-312.pyc +0 -0
td_fuse/__pycache__/transport.cpython-312.pyc +0 -0
td_fuse/__pycache__/validate.cpython-312.pyc +0 -0
td_fuse/config.py +2 -2
td_fuse/heal.py +15 -46
td_fuse/merge.py +3 -3
td_fuse/transport.py +20 -5

td_fuse/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (1.28 kB). View file

td_fuse/__pycache__/canary.cpython-312.pyc ADDED Viewed

Binary file (8.27 kB). View file

td_fuse/__pycache__/config.cpython-312.pyc ADDED Viewed

Binary file (8.59 kB). View file

td_fuse/__pycache__/heal.cpython-312.pyc ADDED Viewed

Binary file (16.2 kB). View file

td_fuse/__pycache__/merge.cpython-312.pyc ADDED Viewed

Binary file (58.7 kB). View file

td_fuse/__pycache__/techniques.cpython-312.pyc ADDED Viewed

Binary file (25.1 kB). View file

td_fuse/__pycache__/transport.cpython-312.pyc ADDED Viewed

Binary file (45.4 kB). View file

td_fuse/__pycache__/validate.cpython-312.pyc ADDED Viewed

Binary file (11.8 kB). View file

td_fuse/config.py CHANGED Viewed

@@ -129,7 +129,7 @@ SOURCES = [
         skip_embeddings=True,           # Must skip — vocab too different
         trust_remote_code=False,
         merge_risk="medium",
-        merge_alpha=0.35,               # Lower alpha — layer mismatch risk
         special_handling=["skip_embeddings", "drop_qkv_bias", "layer_mapping_32_to_36"],
         notes=(
             "32 layers vs 36 — T&M's P matrix handles layer mapping. "
@@ -152,7 +152,7 @@ SOURCES = [
         skip_embeddings=True,           # Must skip — vocab too different
         trust_remote_code=True,         # Likely custom hybrid code
         merge_risk="high",
-        merge_alpha=0.3,                # Conservative — highest risk model
         special_handling=[
             "skip_embeddings",
             "drop_mamba_state_params",   # A, D matrices have no Qwen3 equivalent

         skip_embeddings=True,           # Must skip — vocab too different
         trust_remote_code=False,
         merge_risk="medium",
+        merge_alpha=0.08,               # Lower alpha — layer mismatch risk
         special_handling=["skip_embeddings", "drop_qkv_bias", "layer_mapping_32_to_36"],
         notes=(
             "32 layers vs 36 — T&M's P matrix handles layer mapping. "
         skip_embeddings=True,           # Must skip — vocab too different
         trust_remote_code=True,         # Likely custom hybrid code
         merge_risk="high",
+        merge_alpha=0.08,                # Conservative — highest risk model
         special_handling=[
             "skip_embeddings",
             "drop_mamba_state_params",   # A, D matrices have no Qwen3 equivalent

td_fuse/heal.py CHANGED Viewed

@@ -69,11 +69,11 @@ def load_healing_data(cfg: MergeConfig, tokenizer: AutoTokenizer) -> list:
     # Each entry: (dataset_id, config_name_or_None, split, count, text_field)
     datasets_to_load = [
         # General language — same calibration data source that works reliably
-        ("neuralmagic/LLM_compression_calibration", None, "train", 500, "text"),
         # Math reasoning (exercises DeepSeek/MiMo contributions)
-        ("openai/gsm8k", "main", "train", 300, "question"),
         # Code — bigcode/starcoderdata is a modern alternative
-        ("bigcode/starcoderdata", "python", "train", 200, "content"),
     ]
     all_texts = []
@@ -193,7 +193,9 @@ def apply_qlora_unsloth(
         learning_rate=cfg.heal_learning_rate,
         bf16=True,
         logging_steps=10,
-        save_strategy="no", max_steps=50,  # Don't save intermediate checkpoints — saves ~17GB disk
         warmup_ratio=0.05,
         lr_scheduler_type="cosine",
         optim="adamw_8bit",  # Memory-efficient optimiser
@@ -249,24 +251,15 @@ def apply_qlora_standard(
         return 'td_fuse_outputs/healed'
     import torch
     from peft import LoraConfig, get_peft_model, TaskType
-    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
     print("\n[heal] Loading model with standard PEFT...")
-    # 4-bit quantisation config
-    bnb_config = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_quant_type="nf4",
-        bnb_4bit_compute_dtype=getattr(torch, cfg.dtype),
-        bnb_4bit_use_double_quant=True,
-    )
     tokenizer = AutoTokenizer.from_pretrained(model_path)
     model = _load_model_smart(
         model_path,
-        quantization_config=bnb_config,
         device_map="auto",
-        torch_dtype=getattr(torch, cfg.dtype),
     )
     # LoRA config
@@ -328,7 +321,9 @@ def apply_qlora_standard(
         learning_rate=cfg.heal_learning_rate,
         bf16=True,
         logging_steps=10,
-        save_strategy="no", max_steps=50,  # Don't save intermediate checkpoints — saves ~17GB disk
         warmup_ratio=0.05,
         lr_scheduler_type="cosine",
         optim="adamw_torch",
@@ -365,36 +360,10 @@ def apply_qlora_standard(
     gc.collect()
-    # SAVE FIRST — never delete anything until save is confirmed
-    # save_pretrained can fail on 4-bit merged models (NotImplementedError)
-    # So we go straight to the safe manual method
-    print(f"[heal] Saving healed model to {healed_dir}...")
-    try:
-        from safetensors.torch import save_file
-        import torch as _torch
-        # Fixed: use named_parameters for proper dequantization
-        clean_state = {}
-        for k, v in merged_model.named_parameters():
-            if hasattr(v, 'dequantize'):
-                clean_state[k] = v.dequantize().to(_torch.bfloat16)
-            elif v.data.dtype in (_torch.float32, _torch.float16, _torch.bfloat16):
-                clean_state[k] = v.data.to(_torch.bfloat16)
-            else:
-                clean_state[k] = v.data.float().to(_torch.bfloat16)
-        save_file(clean_state, str(healed_dir / "model.safetensors"))
-        if hasattr(merged_model, 'config'):
-            if hasattr(merged_model.config, "quantization_config"):
-                merged_model.config.quantization_config = None
-                print("[heal] Removed quantization_config from saved config (weights are bf16 now)")
-            merged_model.config.save_pretrained(str(healed_dir))
-        tokenizer.save_pretrained(str(healed_dir))
-        print(f"[heal] SAVED OK: {healed_dir / 'model.safetensors'}")
-    except Exception as e:
-        # Emergency fallback: try save_pretrained as last resort
-        print(f"[heal] Manual save failed ({e}), trying save_pretrained...")
-        merged_model.save_pretrained(str(healed_dir))
-        tokenizer.save_pretrained(str(healed_dir))
-        print(f"[heal] SAVED OK via save_pretrained: {healed_dir}")
     # Verify the save actually worked before cleaning up ANYTHING
     saved_model = healed_dir / "model.safetensors"

     # Each entry: (dataset_id, config_name_or_None, split, count, text_field)
     datasets_to_load = [
         # General language — same calibration data source that works reliably
+        ("neuralmagic/LLM_compression_calibration", None, "train", 1500, "text"),
         # Math reasoning (exercises DeepSeek/MiMo contributions)
+        ("openai/gsm8k", "main", "train", 1000, "question"),
         # Code — CodeAlpaca instruction data (replaces bigcode/starcoderdata)
+        ("sahil2801/CodeAlpaca-20k", None, "train", 500, "output"),
     ]
     all_texts = []
         learning_rate=cfg.heal_learning_rate,
         bf16=True,
         logging_steps=10,
+        save_strategy="steps",
+        save_steps=50,
+        save_total_limit=2, max_steps=50,  # Keep only the 2 newest checkpoints to limit disk use
         warmup_ratio=0.05,
         lr_scheduler_type="cosine",
         optim="adamw_8bit",  # Memory-efficient optimiser
         return 'td_fuse_outputs/healed'
     import torch
     from peft import LoraConfig, get_peft_model, TaskType
+    from transformers import AutoModelForCausalLM, AutoTokenizer
     print("\n[heal] Loading model with standard PEFT...")
     tokenizer = AutoTokenizer.from_pretrained(model_path)
     model = _load_model_smart(
         model_path,
         device_map="auto",
+        torch_dtype=torch.bfloat16,
     )
     # LoRA config
         learning_rate=cfg.heal_learning_rate,
         bf16=True,
         logging_steps=10,
+        save_strategy="steps",
+        save_steps=50,
+        save_total_limit=2, max_steps=50,  # Keep only the 2 newest checkpoints to limit disk use
         warmup_ratio=0.05,
         lr_scheduler_type="cosine",
         optim="adamw_torch",
     gc.collect()
+    # bf16 model — save_pretrained works correctly, no dequantize needed
+    merged_model.save_pretrained(str(healed_dir), safe_serialization=True)
+    tokenizer.save_pretrained(str(healed_dir))
+    print(f"[heal] SAVED OK: {healed_dir}")
     # Verify the save actually worked before cleaning up ANYTHING
     saved_model = healed_dir / "model.safetensors"

td_fuse/merge.py CHANGED Viewed

@@ -726,11 +726,11 @@ def run_single_merge(
     print(f"\n[merge] Step 4/10: Extracting activations (both models)..."); sys.stdout.flush()
     step_t = time.time()
     # Check if source model has a different vocabulary size than target.
-    source_vocab_size = source_model.config.vocab_size if hasattr(source_model.config, 'vocab_size') else None
-    target_vocab_size = target_model.config.vocab_size if hasattr(target_model.config, 'vocab_size') else None
     print(f"[merge] Vocab sizes -- target: {target_vocab_size}, source: {source_vocab_size}")
-    if source_vocab_size and target_vocab_size and source_vocab_size != target_vocab_size:
         print(f"[merge] VOCAB MISMATCH detected! Re-tokenizing calibration data for {source_config.name}...")
         source_calibration = retokenize_calibration(calibration_raw_texts, source_tokenizer, cfg)
         print(f"[merge] Extracting source activations (with source-tokenized data)...")

     print(f"\n[merge] Step 4/10: Extracting activations (both models)..."); sys.stdout.flush()
     step_t = time.time()
     # Check if the source tokenizer's vocabulary size differs from the target tokenizer's.
+    source_vocab_size = len(source_tokenizer)
+    target_vocab_size = len(target_tokenizer)
     print(f"[merge] Vocab sizes -- target: {target_vocab_size}, source: {source_vocab_size}")
+    if source_vocab_size != target_vocab_size:
         print(f"[merge] VOCAB MISMATCH detected! Re-tokenizing calibration data for {source_config.name}...")
         source_calibration = retokenize_calibration(calibration_raw_texts, source_tokenizer, cfg)
         print(f"[merge] Extracting source activations (with source-tokenized data)...")

td_fuse/transport.py CHANGED Viewed

@@ -520,7 +520,7 @@ def _compute_plans_fallback(
                 sys.stdout.flush()
             # Timeout: 90 min (Sinkhorn on 4096x4096 is slow on CPU)
-            tracker.check_timeout(timeout_seconds=5400)
         if permutations:
             print(f"[transport] Computed {len(permutations)} neuron permutations")
@@ -569,8 +569,8 @@ def _compute_plans_fallback(
             layer_costs[i, j] = 1.0 - sim
             tracker.tick(f"layer sim {i},{j}")
-        # Timeout: 30 min for cross-arch
-        tracker.check_timeout(timeout_seconds=1800)
     print(f"[transport] Step 1/3 done: {n_source}x{n_target} similarities computed")
     sys.stdout.flush()
@@ -579,10 +579,24 @@ def _compute_plans_fallback(
     print("[transport] Step 2/3: Computing neuron-level transport (top-3 per target)...")
     sys.stdout.flush()
     Q_matrices = {}
     for j, tl in enumerate(target_layers):
         top3 = np.argsort(layer_costs[:, j])[:3]
         for i in top3:
             sl = source_layers[i]
             S = source_act[sl].numpy()
             T = target_act[tl].numpy()
@@ -595,14 +609,15 @@ def _compute_plans_fallback(
             corr = S_norm.T @ T_norm / S.shape[0]
             cost = 1.0 - corr
             Q_matrices[(sl, tl)] = _sinkhorn(cost, reg=0.1, max_iter=50)
             tracker.tick(f"Q({sl},{tl})")
         if (j + 1) % 5 == 0 or j == 0:
             print(f"  Target layer {j + 1}/{n_target}: matched to top-3 sources")
             sys.stdout.flush()
-        # Timeout: 30 min for cross-arch
-        tracker.check_timeout(timeout_seconds=1800)
     print(f"[transport] Step 2/3 done: {len(Q_matrices)} Q matrices computed")
     sys.stdout.flush()

                 sys.stdout.flush()
             # Timeout: 180 min (Sinkhorn on 4096x4096 is slow on CPU)
+            tracker.check_timeout(timeout_seconds=10800)
         if permutations:
             print(f"[transport] Computed {len(permutations)} neuron permutations")
             layer_costs[i, j] = 1.0 - sim
             tracker.tick(f"layer sim {i},{j}")
+        # Timeout: 180 min for cross-arch
+        tracker.check_timeout(timeout_seconds=10800)
     print(f"[transport] Step 1/3 done: {n_source}x{n_target} similarities computed")
     sys.stdout.flush()
     print("[transport] Step 2/3: Computing neuron-level transport (top-3 per target)...")
     sys.stdout.flush()
     Q_matrices = {}
+    # Incremental cache: save each Q as we go so crashes don't lose progress
+    q_cache_dir = Path("td_fuse_checkpoints") / "q_cache_crossarch"
+    q_cache_dir.mkdir(parents=True, exist_ok=True)
     for j, tl in enumerate(target_layers):
         top3 = np.argsort(layer_costs[:, j])[:3]
         for i in top3:
             sl = source_layers[i]
+            cache_key = f"{sl}__{tl}".replace("/", "_").replace(".", "_")
+            cache_path = q_cache_dir / f"{cache_key}.npy"
+            # Skip if already computed in a previous run
+            if cache_path.exists():
+                Q_matrices[(sl, tl)] = np.load(str(cache_path))
+                tracker.tick(f"Q({sl},{tl})")
+                continue
             S = source_act[sl].numpy()
             T = target_act[tl].numpy()
             corr = S_norm.T @ T_norm / S.shape[0]
             cost = 1.0 - corr
             Q_matrices[(sl, tl)] = _sinkhorn(cost, reg=0.1, max_iter=50)
+            np.save(str(cache_path), Q_matrices[(sl, tl)])
             tracker.tick(f"Q({sl},{tl})")
         if (j + 1) % 5 == 0 or j == 0:
             print(f"  Target layer {j + 1}/{n_target}: matched to top-3 sources")
             sys.stdout.flush()
+        # Timeout: 180 min for cross-arch (was 30, too short for 72 layers)
+        tracker.check_timeout(timeout_seconds=10800)
     print(f"[transport] Step 2/3 done: {len(Q_matrices)} Q matrices computed")
     sys.stdout.flush()