Upload 142 files

Browse files

Files changed (8) hide show

hugging/td_fuse/config.py +3 -3
hugging/td_fuse/heal.py +32 -48
hugging/td_fuse/merge.py +26 -4
hugging/td_fuse/selfimprove.py +545 -0
hugging/td_fuse/transport.py +62 -5
hugging/td_lang/compiler.py +30 -0
hugging/td_lang/engine/heal.py +8 -0
hugging/td_lang/td_lang/engine/heal.py +4 -0

hugging/td_fuse/config.py CHANGED Viewed

@@ -118,7 +118,7 @@ SOURCES = [
     ),
     ModelConfig(
         name="Llama-3.1-8B",
-        hf_id="meta-llama/Llama-3.1-8B-Instruct",
         architecture="transformer",
         layers=32,                      # 4 fewer than Qwen3!
         hidden_dim=4096,
@@ -129,7 +129,7 @@ SOURCES = [
         skip_embeddings=True,           # Must skip — vocab too different
         trust_remote_code=False,
         merge_risk="medium",
-        merge_alpha=0.35,               # Lower alpha — layer mismatch risk
         special_handling=["skip_embeddings", "drop_qkv_bias", "layer_mapping_32_to_36"],
         notes=(
             "32 layers vs 36 — T&M's P matrix handles layer mapping. "
@@ -152,7 +152,7 @@ SOURCES = [
         skip_embeddings=True,           # Must skip — vocab too different
         trust_remote_code=True,         # Likely custom hybrid code
         merge_risk="high",
-        merge_alpha=0.3,                # Conservative — highest risk model
         special_handling=[
             "skip_embeddings",
             "drop_mamba_state_params",   # A, D matrices have no Qwen3 equivalent

     ),
     ModelConfig(
         name="Llama-3.1-8B",
+        hf_id="unsloth/Llama-3.1-8B-Instruct",
         architecture="transformer",
         layers=32,                      # 4 fewer than Qwen3!
         hidden_dim=4096,
         skip_embeddings=True,           # Must skip — vocab too different
         trust_remote_code=False,
         merge_risk="medium",
+        merge_alpha=0.08,               # Very conservative — cross-arch needs low alpha
         special_handling=["skip_embeddings", "drop_qkv_bias", "layer_mapping_32_to_36"],
         notes=(
             "32 layers vs 36 — T&M's P matrix handles layer mapping. "
         skip_embeddings=True,           # Must skip — vocab too different
         trust_remote_code=True,         # Likely custom hybrid code
         merge_risk="high",
+        merge_alpha=0.08,               # Very conservative — hybrid SSM needs low alpha
         special_handling=[
             "skip_embeddings",
             "drop_mamba_state_params",   # A, D matrices have no Qwen3 equivalent

hugging/td_fuse/heal.py CHANGED Viewed

@@ -8,11 +8,18 @@ these out without forgetting what was merged.
 Think of it like physical therapy after surgery — the operation (merge)
 moved knowledge over, but the model needs practice to use it naturally.
 Config notes:
     - r=32, alpha=64, dropout=0.0 (must be 0 for Unsloth speed)
     - transformers >= 4.51.3 (NOT 4.51.0, NOT 4.52.0-4.55.1)
     - bfloat16 end-to-end
-    - DDP across dual 4090
 Findings: #12, #16, #20
 """
@@ -67,13 +74,14 @@ def load_healing_data(cfg: MergeConfig, tokenizer: AutoTokenizer) -> list:
     # Merge-specific: use diverse data that exercises all merged capabilities
     # Each entry: (dataset_id, config_name_or_None, split, count, text_field)
     datasets_to_load = [
-        # General language — same calibration data source that works reliably
-        ("neuralmagic/LLM_compression_calibration", None, "train", 500, "text"),
         # Math reasoning (exercises DeepSeek/MiMo contributions)
-        ("openai/gsm8k", "main", "train", 300, "question"),
-        # Code — bigcode/starcoderdata is a modern alternative
-        ("bigcode/starcoderdata", "python", "train", 200, "content"),
     ]
     all_texts = []
@@ -193,7 +201,9 @@ def apply_qlora_unsloth(
         learning_rate=cfg.heal_learning_rate,
         bf16=True,
         logging_steps=10,
-        save_strategy="no",  # Don't save intermediate checkpoints — saves ~17GB disk
         warmup_ratio=0.05,
         lr_scheduler_type="cosine",
         optim="adamw_8bit",  # Memory-efficient optimiser
@@ -235,9 +245,11 @@ def apply_qlora_standard(
     healing_data: list = None,
 ) -> str:
     """
-    Fallback: QLoRA healing via standard PEFT (no Unsloth).
-    Slower but works without Unsloth installed.
     Returns:
         Path to healed model directory
@@ -249,24 +261,15 @@ def apply_qlora_standard(
         return 'td_fuse_outputs/healed'
     import torch
     from peft import LoraConfig, get_peft_model, TaskType
-    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-    print("\n[heal] Loading model with standard PEFT...")
-    # 4-bit quantisation config
-    bnb_config = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_quant_type="nf4",
-        bnb_4bit_compute_dtype=getattr(torch, cfg.dtype),
-        bnb_4bit_use_double_quant=True,
-    )
     tokenizer = AutoTokenizer.from_pretrained(model_path)
     model = _load_model_smart(
         model_path,
-        quantization_config=bnb_config,
         device_map="auto",
-        torch_dtype=getattr(torch, cfg.dtype),
     )
     # LoRA config
@@ -328,7 +331,9 @@ def apply_qlora_standard(
         learning_rate=cfg.heal_learning_rate,
         bf16=True,
         logging_steps=10,
-        save_strategy="no",  # Don't save intermediate checkpoints — saves ~17GB disk
         warmup_ratio=0.05,
         lr_scheduler_type="cosine",
         optim="adamw_torch",
@@ -365,33 +370,12 @@ def apply_qlora_standard(
     gc.collect()
-    # SAVE FIRST — never delete anything until save is confirmed
-    # save_pretrained can fail on 4-bit merged models (NotImplementedError)
-    # So we go straight to the safe manual method
     print(f"[heal] Saving healed model to {healed_dir}...")
-    try:
-        from safetensors.torch import save_file
-        import torch as _torch
-        state_dict = merged_model.state_dict()
-        clean_state = {}
-        for k, v in state_dict.items():
-            if hasattr(v, 'dequantize'):
-                clean_state[k] = v.dequantize().to(_torch.bfloat16)
-            elif v.dtype in (_torch.float32, _torch.float16, _torch.bfloat16):
-                clean_state[k] = v.to(_torch.bfloat16)
-            else:
-                clean_state[k] = v.float().to(_torch.bfloat16)
-        save_file(clean_state, str(healed_dir / "model.safetensors"))
-        if hasattr(merged_model, 'config'):
-            merged_model.config.save_pretrained(str(healed_dir))
-        tokenizer.save_pretrained(str(healed_dir))
-        print(f"[heal] SAVED OK: {healed_dir / 'model.safetensors'}")
-    except Exception as e:
-        # Emergency fallback: try save_pretrained as last resort
-        print(f"[heal] Manual save failed ({e}), trying save_pretrained...")
-        merged_model.save_pretrained(str(healed_dir))
-        tokenizer.save_pretrained(str(healed_dir))
-        print(f"[heal] SAVED OK via save_pretrained: {healed_dir}")
     # Verify the save actually worked before cleaning up ANYTHING
     saved_model = healed_dir / "model.safetensors"

 Think of it like physical therapy after surgery — the operation (merge)
 moved knowledge over, but the model needs practice to use it naturally.
+IMPORTANT: Two-phase healing required:
+  1. Deep heal — raw text data, fixes general coherence (3000+ samples, 4+ epochs)
+  2. Stop-token heal — chat-formatted data with <|im_end|> tokens,
+     teaches the model when to stop generating (prevents repetition loops).
+     Without chat-formatted data, the model answers correctly but then
+     keeps generating fake "Human:" turns in a loop.
 Config notes:
+    - Load in bf16 (NOT 4-bit) — 4-bit dequantize corrupts tensor shapes
     - r=32, alpha=64, dropout=0.0 (must be 0 for Unsloth speed)
     - transformers >= 4.51.3 (NOT 4.51.0, NOT 4.52.0-4.55.1)
     - bfloat16 end-to-end
 Findings: #12, #16, #20
 """
     # Merge-specific: use diverse data that exercises all merged capabilities
     # Each entry: (dataset_id, config_name_or_None, split, count, text_field)
+    # Deep heal uses ~3000 samples across general/math/code
     datasets_to_load = [
+        # General language — calibration data
+        ("neuralmagic/LLM_compression_calibration", None, "train", 1500, "text"),
         # Math reasoning (exercises DeepSeek/MiMo contributions)
+        ("openai/gsm8k", "main", "train", 1000, "question"),
+        # Code — sahil2801/CodeAlpaca-20k is ungated (starcoderdata is gated)
+        ("sahil2801/CodeAlpaca-20k", None, "train", 500, "output"),
     ]
     all_texts = []
         learning_rate=cfg.heal_learning_rate,
         bf16=True,
         logging_steps=10,
+        save_strategy="steps",
+        save_steps=50,         # Checkpoint every 50 steps so crashes don't lose progress
+        save_total_limit=2,    # Keep only last 2 checkpoints to save disk space
         warmup_ratio=0.05,
         lr_scheduler_type="cosine",
         optim="adamw_8bit",  # Memory-efficient optimiser
     healing_data: list = None,
 ) -> str:
     """
+    Healing via LoRA in bf16 (no quantization).
+    Loading in bf16 avoids the 4-bit dequantize bug that flattens
+    weight tensors to [N, 1] on merge_and_unload(). The A6000 (48GB)
+    has enough VRAM for the full bf16 model (~17GB) + LoRA adapters.
     Returns:
         Path to healed model directory
         return 'td_fuse_outputs/healed'
     import torch
     from peft import LoraConfig, get_peft_model, TaskType
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+    print("\n[heal] Loading model in bf16 (no quantization — avoids shape corruption)...")
     tokenizer = AutoTokenizer.from_pretrained(model_path)
     model = _load_model_smart(
         model_path,
         device_map="auto",
+        torch_dtype=torch.bfloat16,
     )
     # LoRA config
         learning_rate=cfg.heal_learning_rate,
         bf16=True,
         logging_steps=10,
+        save_strategy="steps",
+        save_steps=50,         # Checkpoint every 50 steps so crashes don't lose progress
+        save_total_limit=2,    # Keep only last 2 checkpoints to save disk space
         warmup_ratio=0.05,
         lr_scheduler_type="cosine",
         optim="adamw_torch",
     gc.collect()
+    # Since we loaded in bf16 (not 4-bit), save_pretrained works correctly.
+    # No dequantize needed — weights already have proper shapes.
     print(f"[heal] Saving healed model to {healed_dir}...")
+    merged_model.save_pretrained(str(healed_dir), safe_serialization=True)
+    tokenizer.save_pretrained(str(healed_dir))
+    print(f"[heal] SAVED OK: {healed_dir}")
     # Verify the save actually worked before cleaning up ANYTHING
     saved_model = healed_dir / "model.safetensors"

hugging/td_fuse/merge.py CHANGED Viewed

@@ -39,6 +39,7 @@ from .canary import inject_canary, test_all_canaries
 from .transport import (
     setup_tm_repo,
     load_calibration_data,
     extract_activations,
     compute_transport_plans,
     fuse_weights,
@@ -662,6 +663,7 @@ def run_single_merge(
     protection: MergeProtection,
     residual_bank: ResidualBank = None,
     calibration_data: list = None,
     baseline_perplexity: float = None,
     merged_sources: list = None,
 ) -> dict:
@@ -717,14 +719,33 @@ def run_single_merge(
     print(f"\n[merge] Step 3/10: Loading calibration data..."); sys.stdout.flush()
     step_t = time.time()
     if calibration_data is None:
-        calibration_data = load_calibration_data(cfg, target_tokenizer)
     print(f"[merge] Step 3/10 done in {time.time()-step_t:.0f}s"); sys.stdout.flush()
     # --- Step 4: Extract activations ---
     print(f"\n[merge] Step 4/10: Extracting activations (both models)..."); sys.stdout.flush()
     step_t = time.time()
-    print(f"[merge] Extracting source activations...")
-    source_activations = extract_activations(source_model, calibration_data)
     print(f"[merge] Extracting target activations...")
     pre_merge_target_activations = extract_activations(target_model, calibration_data)
@@ -1101,7 +1122,7 @@ def run_pipeline(
     print(f"[pipeline] Baseline perplexity: {baseline_ppl:.2f}")
     # --- Load calibration data once ---
-    calibration_data = load_calibration_data(cfg, target_tokenizer)
     # --- Initialize merge protection + residual bank ---
     protection = MergeProtection(cfg)
@@ -1138,6 +1159,7 @@ def run_pipeline(
             protection,
             residual_bank=residual_bank,
             calibration_data=calibration_data,
             baseline_perplexity=baseline_ppl,
             merged_sources=merged_sources,
         )

 from .transport import (
     setup_tm_repo,
     load_calibration_data,
+    retokenize_calibration,
     extract_activations,
     compute_transport_plans,
     fuse_weights,
     protection: MergeProtection,
     residual_bank: ResidualBank = None,
     calibration_data: list = None,
+    calibration_raw_texts: list = None,
     baseline_perplexity: float = None,
     merged_sources: list = None,
 ) -> dict:
     print(f"\n[merge] Step 3/10: Loading calibration data..."); sys.stdout.flush()
     step_t = time.time()
     if calibration_data is None:
+        calibration_data, calibration_raw_texts = load_calibration_data(cfg, target_tokenizer)
     print(f"[merge] Step 3/10 done in {time.time()-step_t:.0f}s"); sys.stdout.flush()
     # --- Step 4: Extract activations ---
     print(f"\n[merge] Step 4/10: Extracting activations (both models)..."); sys.stdout.flush()
     step_t = time.time()
+    # Check if source model has a different vocabulary size than target.
+    # If so, re-tokenize calibration data with source tokenizer to avoid
+    # CUDA out-of-bounds errors (e.g. Qwen 152K vocab → Llama 128K vocab).
+    # NOTE: We use len(tokenizer) instead of model.config.vocab_size because
+    # Qwen3VL wraps the language model and its top-level config may not
+    # expose vocab_size correctly (this caused the fix to silently fail).
+    source_vocab_size = len(source_tokenizer)
+    target_vocab_size = len(target_tokenizer)
+    print(f"[merge] Vocab sizes — target: {target_vocab_size}, source: {source_vocab_size}")
+    sys.stdout.flush()
+    if source_vocab_size != target_vocab_size:
+        print(f"[merge] ⚠ VOCAB MISMATCH detected! Re-tokenizing calibration data for {source_config.name}...")
+        source_calibration = retokenize_calibration(calibration_raw_texts, source_tokenizer, cfg)
+        print(f"[merge] Extracting source activations (with source-tokenized data)...")
+        source_activations = extract_activations(source_model, source_calibration)
+        del source_calibration  # Free memory
+    else:
+        print(f"[merge] Extracting source activations...")
+        source_activations = extract_activations(source_model, calibration_data)
     print(f"[merge] Extracting target activations...")
     pre_merge_target_activations = extract_activations(target_model, calibration_data)
     print(f"[pipeline] Baseline perplexity: {baseline_ppl:.2f}")
     # --- Load calibration data once ---
+    calibration_data, calibration_raw_texts = load_calibration_data(cfg, target_tokenizer)
     # --- Initialize merge protection + residual bank ---
     protection = MergeProtection(cfg)
             protection,
             residual_bank=residual_bank,
             calibration_data=calibration_data,
+            calibration_raw_texts=calibration_raw_texts,
             baseline_perplexity=baseline_ppl,
             merged_sources=merged_sources,
         )

hugging/td_fuse/selfimprove.py ADDED Viewed

	@@ -0,0 +1,545 @@

+"""
+TD Self-Improvement Loop — the core of Time Dilation.
+This is the part that makes the model actually get smarter over time.
+Based on findings from test_1 through test_18 interviews:
+THE LOOP:
+1. Ask the model "what are you bad at?" → it identifies weak spots
+2. Generate targeted synthetic training data for those weaknesses
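+3. Train on the verified-correct answers (verified rewards only — no learned reward model; the current implementation is STaR-style SFT on the winning answers rather than full GRPO)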
+4. Re-benchmark → measure improvement
+5. Repeat — each cycle is small (1-5%) but compounds
+KEY PRINCIPLES (from interviews + dad's tests):
+- Verified rewards only: code compiles, math correct, logic valid
+- No learned reward model (saves VRAM, avoids reward hacking)
+- Cherry_LLM perplexity filter prevents mode collapse
+- Mix external data to avoid "100 steps on own outputs → dumber" trap
+- Target mid-to-late layers (16-28 for 32-layer, ~20-30 for 36-layer)
+COST SPLIT (from test_16):
+- 70-80% inference scaling (generate many, pick best)
+- 10-20% short GRPO training
+- 5-10% tooling/evaluation
+"""
+import torch
+import time
+import json
+import math
+import random
+import gc
+from pathlib import Path
+from typing import Optional
+from dataclasses import dataclass, field
+@dataclass
+class SelfImproveConfig:
+    """Configuration for one self-improvement cycle.
+    Bundles the settings for candidate generation, LoRA training, synthetic
+    problem counts and the quality filter. Every field has a default, so
+    ``SelfImproveConfig()`` is a usable configuration.
+    """
+    # NOTE(review): presumably the checkpoint the cycle starts from — the
+    # loading code is not visible here; confirm against the caller.
+    model_path: str = "td_fuse_outputs/healed_final"
+    # Root output directory; train_on_winners() writes its temporary
+    # "train_output" checkpoints underneath it.
+    output_dir: str = "td_fuse_outputs/improved"
+    # Generation settings
+    num_candidates: int = 8       # Generate N answers per question, pick best (inference scaling)
+    max_gen_tokens: int = 512     # Passed as max_new_tokens for each candidate
+    temperature: float = 0.7      # For diverse candidate generation
+    # Training settings
+    lora_r: int = 16              # LoRA rank
+    lora_alpha: int = 32          # LoRA scaling factor
+    train_epochs: int = 2
+    train_batch: int = 4          # Per-device train batch size
+    train_grad_accum: int = 4     # Gradient accumulation steps
+    learning_rate: float = 2e-5   # Lower than healing — small nudges
+    # Data settings
+    # NOTE(review): the visible generators do not read these counts directly;
+    # presumably the driver passes them in — confirm.
+    num_reasoning_problems: int = 200   # Logic/reasoning problems to generate
+    num_math_problems: int = 200        # Math problems
+    num_code_problems: int = 100        # Code problems
+    # Quality filter
+    perplexity_threshold: float = 50.0  # Cherry_LLM: reject if perplexity > this
+# ============================================================
+# STEP 1: DIAGNOSE — Ask the model what it's bad at
+# ============================================================
+def diagnose_weaknesses(model, tokenizer, eos_id):
+    """
+    Collect the model's own description of where it is weak.
+    Three fixed introspection prompts are wrapped in the chat template,
+    sampled once each (temperature 0.7, up to 500 new tokens), and the
+    decoded replies are returned as a list of strings in prompt order.
+    """
+    print("\n=== STEP 1: SELF-DIAGNOSIS ===")
+    introspection_prompts = (
+        "What kinds of questions or tasks are you worst at? Be specific and honest. List your top 5 weaknesses.",
+        "Give me 5 examples of questions that would be hard for you to answer correctly.",
+        "What types of reasoning do you struggle with most? Give specific examples.",
+    )
+    def _sample_reply(question):
+        # Wrap the question in the chat format and sample a single reply.
+        chat_prompt = f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"
+        encoded = tokenizer(chat_prompt, return_tensors="pt").to(model.device)
+        generated = model.generate(
+            **encoded,
+            max_new_tokens=500,
+            do_sample=True,
+            temperature=0.7,
+            eos_token_id=eos_id,
+        )
+        # Drop the prompt tokens so only the newly generated text is decoded.
+        prompt_len = encoded.input_ids.shape[1]
+        return tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True)
+    weaknesses = []
+    for question in introspection_prompts:
+        reply = _sample_reply(question)
+        weaknesses.append(reply)
+        print(f"  Diagnosis: {reply[:150]}...")
+    return weaknesses
+# ============================================================
+# STEP 2: GENERATE — Create targeted training problems
+# ============================================================
+def generate_reasoning_problems():
+    """
+    Build the fixed bank of reasoning problems with checkable answers.
+    Returns a list of dicts carrying "question", "answer" and "type";
+    trick questions additionally carry a short "verify_key" that the
+    verifier matches instead of the full-sentence answer.
+    """
+    week = ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
+    bank = []
+    # Temporal chains (model failed "yesterday Monday → tomorrow Wednesday").
+    # Each weekday produces two questions anchored on consecutive days.
+    for idx, yesterday in enumerate(week):
+        today, tomorrow, day_after = (week[(idx + step) % 7] for step in (1, 2, 3))
+        bank.append({
+            "question": f"If yesterday was {yesterday}, what day is tomorrow?",
+            "answer": tomorrow,
+            "type": "temporal_reasoning"
+        })
+        bank.append({
+            "question": f"If today is {today}, what day is the day after tomorrow?",
+            "answer": day_after,
+            "type": "temporal_reasoning"
+        })
+    # Trick questions (model failed "pound of feathers vs bricks"):
+    # (question, short key to verify against, full reference answer)
+    trick_table = (
+        ("Which is heavier: a pound of feathers or a pound of bricks?", "same", "They weigh the same — both are one pound."),
+        ("Which is heavier: a ton of feathers or a ton of steel?", "same", "They weigh the same — both are one ton."),
+        ("Which weighs more: 1kg of cotton or 1kg of iron?", "same", "They weigh the same — both are 1 kilogram."),
+        ("If you have 5 apples and take away 3, how many do YOU have?", "3", "You have 3 apples — you took them."),
+        ("A farmer has 17 sheep. All but 9 die. How many are left?", "9", "9 sheep are left — 'all but 9' means 9 survive."),
+        ("How many times can you subtract 5 from 25?", "1", "Once — after that it's 20, not 25."),
+        ("If there are 3 apples and you take away 2, how many do you have?", "2", "You have 2 — you took them."),
+        ("What has a head and a tail but no body?", "coin", "A coin has a head and a tail but no body."),
+    )
+    bank.extend(
+        {"question": text, "answer": reference, "verify_key": short_key, "type": "trick_question"}
+        for text, short_key, reference in trick_table
+    )
+    # Syllogisms / deductive reasoning — the model is asked to justify itself.
+    syllogism_table = (
+        ("All mammals are warm-blooded. A whale is a mammal. Is a whale warm-blooded?", "yes"),
+        ("All birds have feathers. A penguin is a bird. Does a penguin have feathers?", "yes"),
+        ("No reptiles are mammals. A snake is a reptile. Is a snake a mammal?", "no"),
+        ("All squares are rectangles. All rectangles have 4 sides. Do all squares have 4 sides?", "yes"),
+        ("Some dogs are brown. Max is a dog. Is Max definitely brown?", "no"),
+        ("All cats are animals. Some animals are pets. Are all cats pets?", "no"),
+    )
+    bank.extend(
+        {"question": text + " Explain your reasoning.", "answer": verdict, "type": "syllogism"}
+        for text, verdict in syllogism_table
+    )
+    # Multi-step ordering problems.
+    ordering_table = (
+        ("If A is taller than B, and B is taller than C, who is the shortest?", "c"),
+        ("If X is older than Y, Y is older than Z, and Z is older than W, who is the youngest?", "w"),
+        ("In a race, Tom finished before Jerry but after Sam. Who won the race?", "sam"),
+        ("Amy is shorter than Bob. Bob is shorter than Carol. Carol is shorter than Dave. Who is the tallest?", "dave"),
+    )
+    bank.extend(
+        {"question": text, "answer": who, "type": "multi_step"}
+        for text, who in ordering_table
+    )
+    return bank
+def generate_math_problems(count=200):
+    """
+    Generate ``count`` random math problems with programmatically computed answers.
+    Each problem is a dict with "question", "answer" (a string) and "type"
+    (one of math_arithmetic, math_word, math_percentage, math_fraction).
+    Uses the module-level ``random`` state, so seed it for reproducibility.
+    """
+    problems = []
+    for _ in range(count):
+        ptype = random.choice(["arithmetic", "word", "fraction", "percentage"])
+        if ptype == "arithmetic":
+            a, b = random.randint(10, 999), random.randint(10, 999)
+            op = random.choice(["+", "-", "*"])
+            if op == "+":
+                answer = a + b
+            elif op == "-":
+                answer = a - b
+            else:
+                # Re-draw smaller operands so products stay mentally tractable.
+                a, b = random.randint(2, 50), random.randint(2, 50)
+                answer = a * b
+            problems.append({
+                "question": f"What is {a} {op} {b}?",
+                "answer": str(answer),
+                "type": "math_arithmetic"
+            })
+        elif ptype == "word":
+            # Each template draws its own operands (via walrus) and returns
+            # (question, answer); the bill template builds a total that is an
+            # exact multiple of the head count so the division is exact.
+            templates = [
+                lambda: (f"A store sells apples for ${(p:=random.randint(1,5))} each. If you buy {(n:=random.randint(3,20))} apples, how much do you spend?", str(p*n)),
+                lambda: (f"A train travels at {(s:=random.randint(30,120))} mph for {(h:=random.randint(1,8))} hours. How many miles does it travel?", str(s*h)),
+                lambda: (f"If {(n:=random.randint(4,12))} friends split a ${(t:=random.randint(2,20)*n)} bill equally, how much does each person pay?", str(t//n)),
+                lambda: (f"A rectangle has length {(l:=random.randint(3,20))} and width {(w:=random.randint(3,20))}. What is its area?", str(l*w)),
+            ]
+            q, a = random.choice(templates)()
+            problems.append({"question": q, "answer": a, "type": "math_word"})
+        elif ptype == "percentage":
+            base = random.choice([50, 100, 200, 250, 400, 500, 1000])
+            pct = random.choice([10, 15, 20, 25, 30, 50, 75])
+            # Several (base, pct) pairs are not whole numbers (15% of 50 = 7.5).
+            # Floor division would store a wrong "verified" answer, so keep the
+            # exact value and only print it as an integer when it is one.
+            scaled = base * pct
+            answer = str(scaled // 100) if scaled % 100 == 0 else str(scaled / 100)
+            problems.append({
+                "question": f"What is {pct}% of {base}?",
+                "answer": answer,
+                "type": "math_percentage"
+            })
+        elif ptype == "fraction":
+            n = random.randint(1, 10)
+            d = random.choice([2, 3, 4, 5, 8, 10])
+            # total is a multiple of d, so n/d of total is always an integer.
+            total = d * random.randint(2, 10)
+            answer = total * n // d
+            problems.append({
+                "question": f"What is {n}/{d} of {total}?",
+                "answer": str(answer),
+                "type": "math_fraction"
+            })
+    return problems
+# ============================================================
+# STEP 3: SCORE — Verified rewards (no learned reward model)
+# ============================================================
+def verify_answer(problem, model_answer):
+    """
+    Verified reward: return 1.0 if the model's answer contains the expected one, else 0.0.
+    The expected value is problem["verify_key"] when present, otherwise
+    problem["answer"]. Matching is case-insensitive and token-bounded:
+    - numeric expectations are compared by value (tolerance 0.01) against
+      every number found in the output, so "9" does not match "19" and
+      "150" matches "150.0";
+    - textual expectations must appear as a whole word/phrase, so "no" does
+      not match "know" and a single-letter answer such as "c" does not
+      match every sentence containing that letter.
+    This is the GRPO reward signal — objective, not learned.
+    """
+    import re
+    expected = str(problem.get("verify_key", problem["answer"])).lower().strip()
+    answer_lower = model_answer.lower().strip()
+    # An empty key would match any output; never reward that.
+    if not expected:
+        return 0.0
+    if re.fullmatch(r"-?\d+(?:\.\d+)?", expected):
+        target = float(expected)
+        # Drop thousands separators so "1,150" is read as 1150, not 1 and 150.
+        cleaned = re.sub(r"(?<=\d),(?=\d{3}(?!\d))", "", answer_lower)
+        # A "-" directly after a digit/letter is subtraction, not a sign, and
+        # digits after a decimal point are not a number of their own.
+        for num in re.findall(r"(?<![\w.])-?\d+(?:\.\d+)?", cleaned):
+            if abs(float(num) - target) < 0.01:
+                return 1.0
+        return 0.0
+    # Textual answer: require word boundaries on both sides.
+    pattern = r"(?<!\w)" + re.escape(expected) + r"(?!\w)"
+    return 1.0 if re.search(pattern, answer_lower) else 0.0
+def generate_and_score(model, tokenizer, problems, cfg, eos_id):
+    """
+    Inference scaling: generate N candidates per problem, keep the best.
+    For every problem, cfg.num_candidates answers are sampled and scored with
+    verify_answer(); if at least one is correct, the shortest correct answer
+    is formatted as a full chat turn and kept as a training example.
+    Returns the list of chat-formatted training strings (empty if no problem
+    was solved or ``problems`` is empty).
+    This is the 70-80% of the cost budget (from test_16).
+    """
+    print(f"\n=== STEP 2-3: GENERATE & SCORE ({len(problems)} problems, {cfg.num_candidates} candidates each) ===")
+    # Nothing to do — and the summary below would divide by zero.
+    if not problems:
+        print("  No problems supplied — nothing to generate or score")
+        return []
+    winning_pairs = []  # chat-formatted (question + best answer) strings for training
+    total_correct = 0
+    for i, prob in enumerate(problems):
+        question = prob["question"]
+        prompt = f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"
+        ids = tokenizer(prompt, return_tensors="pt").to(model.device)
+        prompt_len = ids.input_ids.shape[1]
+        # Sample N candidates, keeping only those the verifier accepts.
+        correct_answers = []
+        for _ in range(cfg.num_candidates):
+            out = model.generate(
+                **ids, max_new_tokens=cfg.max_gen_tokens,
+                do_sample=True, temperature=cfg.temperature,
+                eos_token_id=eos_id
+            )
+            answer = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)
+            if verify_answer(prob, answer) > 0:
+                correct_answers.append(answer)
+        if correct_answers:
+            # Among correct answers, prefer the shortest (more concise);
+            # min() keeps the earliest one on ties.
+            best_answer = min(correct_answers, key=len)
+            total_correct += 1
+            # Format as chat for training, including the stop token.
+            chat = f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n{best_answer}<|im_end|>"
+            winning_pairs.append(chat)
+        if (i + 1) % 50 == 0:
+            pct = total_correct / (i + 1) * 100
+            print(f"  [{i+1}/{len(problems)}] Correct so far: {total_correct}/{i+1} ({pct:.0f}%)")
+    pct = total_correct / len(problems) * 100
+    print(f"  TOTAL: {total_correct}/{len(problems)} correct ({pct:.0f}%)")
+    print(f"  Training pairs: {len(winning_pairs)}")
+    return winning_pairs
+# ============================================================
+# STEP 4: TRAIN — Short GRPO/SFT on winning answers
+# ============================================================
+def train_on_winners(model, tokenizer, winning_pairs, cfg):
+    """
+    Train on the correct answers only (STaR approach).
+    Wraps ``model`` in a LoRA adapter, fine-tunes it on the chat-formatted
+    ``winning_pairs`` with the hyperparameters in ``cfg``, deletes the
+    temporary checkpoints and returns the model with the adapter merged in.
+    If fewer than 10 pairs are supplied, training is skipped and ``model``
+    is returned unchanged.
+    Short training — we're making small nudges, not retraining.
+    """
+    print(f"\n=== STEP 4: TRAIN ON WINNERS ({len(winning_pairs)} pairs) ===")
+    if len(winning_pairs) < 10:
+        print("  Too few winning pairs — skipping training")
+        return model
+    import shutil
+    from peft import LoraConfig, get_peft_model, TaskType
+    from transformers import TrainingArguments, Trainer
+    from torch.utils.data import Dataset
+    # LoRA — small rank for targeted improvement
+    lora_config = LoraConfig(
+        r=cfg.lora_r, lora_alpha=cfg.lora_alpha, lora_dropout=0.0,
+        target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
+                        "gate_proj", "up_proj", "down_proj"],
+        bias="none", task_type=TaskType.CAUSAL_LM,
+    )
+    model = get_peft_model(model, lora_config)
+    model.print_trainable_parameters()
+    class WinnerDataset(Dataset):
+        """Pre-tokenised, fixed-length examples for causal-LM fine-tuning."""
+        def __init__(self, texts, tokenizer, max_len=512):
+            self.data = []
+            for t in texts:
+                e = tokenizer(t, truncation=True, max_length=max_len,
+                              padding="max_length", return_tensors="pt")
+                input_ids = e["input_ids"].squeeze(0)
+                attention_mask = e["attention_mask"].squeeze(0)
+                # Padding must not contribute to the loss: positions labelled
+                # -100 are ignored, otherwise the model is trained to emit
+                # pad tokens for most of every 512-token row.
+                labels = input_ids.clone()
+                labels[attention_mask == 0] = -100
+                self.data.append({
+                    "input_ids": input_ids,
+                    "attention_mask": attention_mask,
+                    "labels": labels,
+                })
+        def __len__(self): return len(self.data)
+        def __getitem__(self, i): return self.data[i]
+    dataset = WinnerDataset(winning_pairs, tokenizer)
+    out_dir = Path(cfg.output_dir) / "train_output"
+    out_dir.mkdir(parents=True, exist_ok=True)
+    # Approximate optimiser-step count, used only to scale logging/saving.
+    total_steps = (len(dataset) * cfg.train_epochs) // (cfg.train_batch * cfg.train_grad_accum)
+    args = TrainingArguments(
+        output_dir=str(out_dir),
+        num_train_epochs=cfg.train_epochs,
+        per_device_train_batch_size=cfg.train_batch,
+        gradient_accumulation_steps=cfg.train_grad_accum,
+        learning_rate=cfg.learning_rate,
+        bf16=True,
+        logging_steps=max(1, total_steps // 10),
+        save_strategy="steps",
+        save_steps=max(50, total_steps // 4),
+        save_total_limit=1,
+        warmup_ratio=0.05,
+        lr_scheduler_type="cosine",
+        optim="adamw_torch",
+        report_to="none",
+    )
+    trainer = Trainer(
+        model=model, processing_class=tokenizer,
+        train_dataset=dataset, args=args
+    )
+    print(f"  Training: ~{total_steps} steps")
+    trainer.train()
+    # Clean up training checkpoints — only the merged model is kept.
+    shutil.rmtree(str(out_dir), ignore_errors=True)
+    # Merge LoRA back
+    print("  Merging LoRA...")
+    merged = model.merge_and_unload()
+    gc.collect()
+    return merged
+# ============================================================
+# STEP 5: BENCHMARK — Measure improvement
+# ============================================================
+def benchmark(model, tokenizer, eos_id):
+    """
+    Score the model on a small fixed benchmark and return the results.
+    Four sections are run in order: five arithmetic questions, five
+    reasoning questions, three word problems, and a perplexity measurement
+    on one fixed sentence. An answer counts as correct when the expected
+    string occurs anywhere in the generated text (the reasoning section
+    compares case-insensitively).
+    Args:
+        model: Model exposing ``generate``, ``device`` and a forward pass
+            that returns an object with a ``loss`` attribute.
+        tokenizer: Tokenizer used to encode prompts and decode generations.
+        eos_id: Token id passed to ``generate`` as ``eos_token_id``.
+    Returns:
+        Dict with the string-valued keys ``basic_math``, ``reasoning``,
+        ``word_problems`` and ``perplexity``.
+    """
+    print("\n=== STEP 5: BENCHMARK ===")
+    def ask(prompt):
+        # Wrap the question in the chat markers, decode greedily, and return
+        # only the text generated after the prompt tokens.
+        chat = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
+        encoded = tokenizer(chat, return_tensors="pt").to(model.device)
+        generated = model.generate(**encoded, max_new_tokens=200, do_sample=False, eos_token_id=eos_id)
+        prompt_len = encoded.input_ids.shape[1]
+        return tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True)
+    # --- Section 1: arithmetic ---
+    math_tests = [("7+8", "15"), ("123+456", "579"), ("1000-387", "613"), ("12*13", "156"), ("144/12", "12")]
+    math_correct = 0
+    for expression, answer in math_tests:
+        if answer in ask(f"What is {expression}? Give just the number."):
+            math_correct += 1
+    print(f"  Math: {math_correct}/5")
+    # --- Section 2: reasoning (case-insensitive match, one log line per item) ---
+    reason_tests = [
+        ("If all roses are flowers and all flowers need water, do roses need water?", "yes"),
+        ("Which is heavier: a pound of feathers or a pound of bricks?", "same"),
+        ("If yesterday was Monday, what day is tomorrow?", "wednesday"),
+        ("A farmer has 17 sheep. All but 9 die. How many are left?", "9"),
+        ("If you have 5 apples and take away 3, how many do YOU have?", "3"),
+    ]
+    reason_correct = 0
+    for question, expected in reason_tests:
+        reply = ask(question)
+        hit = expected.lower() in reply.lower()
+        if hit:
+            reason_correct += 1
+        status = "OK" if hit else "FAIL"
+        print(f"    {status}: {question[:50]}... -> {reply[:60]}")
+    print(f"  Reasoning: {reason_correct}/5")
+    # --- Section 3: word problems ---
+    wp_tests = [
+        ("A train travels 60 mph for 2.5 hours. How far does it go?", "150"),
+        ("If 3 shirts cost $45, how much do 7 shirts cost?", "105"),
+        ("I have 24 cookies split equally among 6 friends. How many each?", "4"),
+    ]
+    wp_correct = 0
+    for question, answer in wp_tests:
+        if answer in ask(question):
+            wp_correct += 1
+    print(f"  Word problems: {wp_correct}/3")
+    # --- Section 4: perplexity = exp(mean LM loss) on one fixed sentence ---
+    test_text = "The quick brown fox jumps over the lazy dog. Machine learning models can process natural language."
+    encoded_text = tokenizer(test_text, return_tensors="pt").to(model.device)
+    with torch.no_grad():
+        forward = model(**encoded_text, labels=encoded_text.input_ids)
+    ppl = math.exp(forward.loss.item())
+    print(f"  Perplexity: {ppl:.2f}")
+    # Keys are inserted in the same order the sections ran.
+    return {
+        "basic_math": f"{math_correct}/5",
+        "reasoning": f"{reason_correct}/5",
+        "word_problems": f"{wp_correct}/3",
+        "perplexity": f"{ppl:.2f}",
+    }
+# ============================================================
+# MAIN: Run one self-improvement cycle
+# ============================================================
+def run_cycle(cfg: SelfImproveConfig = None, cycle_num: int = 1):
+    """
+    Run one complete self-improvement cycle.
+    Steps: load the model from ``cfg.model_path``, diagnose weaknesses,
+    build a shuffled problem set, generate and score candidates, train on
+    the winning pairs, save the result to
+    ``<cfg.output_dir>/cycle_<cycle_num>``, then benchmark it and write
+    ``benchmark_results.json`` next to the weights.
+    Args:
+        cfg: Cycle configuration. A default ``SelfImproveConfig()`` is
+            created when ``None`` is passed.
+        cycle_num: Cycle index; used in log output and in the name of the
+            output directory.
+    Returns:
+        Path (as ``str``) of the directory holding the improved model.
+    """
+    if cfg is None:
+        cfg = SelfImproveConfig()
+    start = time.time()
+    print("=" * 60)
+    print(f"TD SELF-IMPROVEMENT — CYCLE {cycle_num}")
+    print(f"Model: {cfg.model_path}")
+    print(f"Started: {time.strftime('%H:%M:%S')}")
+    print("=" * 60)
+    # Load model (imported lazily so importing this module stays cheap)
+    from transformers import AutoModelForImageTextToText, AutoTokenizer
+    print("\nLoading model...")
+    model = AutoModelForImageTextToText.from_pretrained(
+        cfg.model_path, dtype=torch.bfloat16,
+        device_map="auto", trust_remote_code=True
+    )
+    tokenizer = AutoTokenizer.from_pretrained(cfg.model_path, trust_remote_code=True)
+    eos_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
+    # Step 1: Diagnose. The return value is not consumed by the later steps
+    # here, so it is not bound to a name.
+    diagnose_weaknesses(model, tokenizer, eos_id)
+    # Step 2: Generate problems
+    print("\n=== STEP 2: GENERATING PROBLEMS ===")
+    problems = []
+    problems.extend(generate_reasoning_problems())
+    print(f"  Reasoning problems: {len(problems)}")
+    math_probs = generate_math_problems(cfg.num_math_problems)
+    problems.extend(math_probs)
+    print(f"  Math problems: {len(math_probs)}")
+    print(f"  Total: {len(problems)}")
+    random.shuffle(problems)
+    # Step 3: Generate candidates and score
+    winning_pairs = generate_and_score(model, tokenizer, problems, cfg, eos_id)
+    # Step 4: Train on winners
+    model = train_on_winners(model, tokenizer, winning_pairs, cfg)
+    # Save improved model
+    improved_dir = Path(cfg.output_dir) / f"cycle_{cycle_num}"
+    improved_dir.mkdir(parents=True, exist_ok=True)
+    print(f"\nSaving improved model to {improved_dir}...")
+    model.save_pretrained(str(improved_dir), safe_serialization=True)
+    tokenizer.save_pretrained(str(improved_dir))
+    # save_pretrained() may split a large checkpoint into several
+    # model-XXXXX-of-YYYYY.safetensors shards, in which case a single
+    # "model.safetensors" does not exist. Sum every weight file that was
+    # written so the size report cannot abort the cycle after the save.
+    sz = sum(p.stat().st_size for p in improved_dir.glob("*.safetensors")) / 1e9
+    print(f"SAVED: {improved_dir} ({sz:.1f} GB)")
+    # Step 5: Benchmark
+    results = benchmark(model, tokenizer, eos_id)
+    # Save results
+    elapsed = (time.time() - start) / 60
+    results_file = improved_dir / "benchmark_results.json"
+    results["cycle"] = cycle_num
+    results["timestamp"] = time.strftime("%Y-%m-%d %H:%M:%S")
+    results["duration_min"] = elapsed
+    with open(results_file, "w", encoding="utf-8") as f:
+        json.dump(results, f, indent=2)
+    print(f"\n{'=' * 60}")
+    print(f"CYCLE {cycle_num} COMPLETE — {elapsed:.1f} min")
+    print(f"Results: {results}")
+    print(f"Model saved to: {improved_dir}")
+    print(f"{'=' * 60}")
+    return str(improved_dir)

hugging/td_fuse/transport.py CHANGED Viewed

@@ -105,19 +105,24 @@ def setup_tm_repo(cfg: MergeConfig):
         print(f"[transport] Added T&M core to path: {core_path}")
-def load_calibration_data(cfg: MergeConfig, tokenizer: AutoTokenizer) -> list:
     """
     Load calibration data for activation extraction.
     Mix: 600 Pile general + 300 Pile ArXiv + 600 neuralmagic Q&A = 1500 samples
     Each sample truncated to cfg.calibration_seq_len tokens.
     Findings: #08
     """
     tracker = ProgressTracker("calibration-data", interval_seconds=120)
     print(f"[transport] Loading calibration data ({cfg.calibration_samples} samples)...")
     samples = []
     # --- Pile: general text (600 samples) ---
     try:
@@ -140,6 +145,7 @@ def load_calibration_data(cfg: MergeConfig, tokenizer: AutoTokenizer) -> list:
                     return_tensors="pt",
                 )
                 samples.append(tokens)
                 count += 1
                 if count % 100 == 0:
                     print(f"  Pile: {count}/600 samples loaded...")
@@ -171,6 +177,7 @@ def load_calibration_data(cfg: MergeConfig, tokenizer: AutoTokenizer) -> list:
                         return_tensors="pt",
                     )
                     samples.append(tokens)
                     count += 1
                     if count % 100 == 0:
                         print(f"  neuralmagic: {count}/{remaining} samples loaded...")
@@ -182,6 +189,41 @@ def load_calibration_data(cfg: MergeConfig, tokenizer: AutoTokenizer) -> list:
     tracker.done()
     print(f"[transport] Total calibration samples: {len(samples)}")
     sys.stdout.flush()
     return samples
@@ -540,8 +582,8 @@ def _compute_plans_fallback(
             layer_costs[i, j] = 1.0 - sim
             tracker.tick(f"layer sim {i},{j}")
-        # Timeout: 30 min for cross-arch
-        tracker.check_timeout(timeout_seconds=1800)
     print(f"[transport] Step 1/3 done: {n_source}x{n_target} similarities computed")
     sys.stdout.flush()
@@ -550,10 +592,24 @@ def _compute_plans_fallback(
     print("[transport] Step 2/3: Computing neuron-level transport (top-3 per target)...")
     sys.stdout.flush()
     Q_matrices = {}
     for j, tl in enumerate(target_layers):
         top3 = np.argsort(layer_costs[:, j])[:3]
         for i in top3:
             sl = source_layers[i]
             S = source_act[sl].numpy()
             T = target_act[tl].numpy()
@@ -566,14 +622,15 @@ def _compute_plans_fallback(
             corr = S_norm.T @ T_norm / S.shape[0]
             cost = 1.0 - corr
             Q_matrices[(sl, tl)] = _sinkhorn(cost, reg=0.1, max_iter=50)
             tracker.tick(f"Q({sl},{tl})")
         if (j + 1) % 5 == 0 or j == 0:
             print(f"  Target layer {j + 1}/{n_target}: matched to top-3 sources")
             sys.stdout.flush()
-        # Timeout: 30 min for cross-arch
-        tracker.check_timeout(timeout_seconds=1800)
     print(f"[transport] Step 2/3 done: {len(Q_matrices)} Q matrices computed")
     sys.stdout.flush()

         print(f"[transport] Added T&M core to path: {core_path}")
+def load_calibration_data(cfg: MergeConfig, tokenizer: AutoTokenizer) -> tuple:
     """
     Load calibration data for activation extraction.
     Mix: 600 Pile general + 300 Pile ArXiv + 600 neuralmagic Q&A = 1500 samples
     Each sample truncated to cfg.calibration_seq_len tokens.
+    Returns:
+        Tuple of (tokenized_samples, raw_texts) so we can re-tokenize
+        for source models with different vocabularies.
     Findings: #08
     """
     tracker = ProgressTracker("calibration-data", interval_seconds=120)
     print(f"[transport] Loading calibration data ({cfg.calibration_samples} samples)...")
     samples = []
+    raw_texts = []  # Store raw text for cross-vocab re-tokenization
     # --- Pile: general text (600 samples) ---
     try:
                     return_tensors="pt",
                 )
                 samples.append(tokens)
+                raw_texts.append(text)
                 count += 1
                 if count % 100 == 0:
                     print(f"  Pile: {count}/600 samples loaded...")
                         return_tensors="pt",
                     )
                     samples.append(tokens)
+                    raw_texts.append(str(text))
                     count += 1
                     if count % 100 == 0:
                         print(f"  neuralmagic: {count}/{remaining} samples loaded...")
     tracker.done()
     print(f"[transport] Total calibration samples: {len(samples)}")
     sys.stdout.flush()
+    return samples, raw_texts
+def retokenize_calibration(raw_texts: list, tokenizer: AutoTokenizer, cfg: MergeConfig) -> list:
+    """
+    Tokenize the raw calibration texts again with another tokenizer.
+    Needed when the source model's vocabulary differs from the target's.
+    For example, Llama (128K vocab) vs Qwen (152K vocab) — feeding Qwen
+    token IDs to Llama causes CUDA out-of-bounds crashes.
+    Args:
+        raw_texts: List of raw text strings from load_calibration_data()
+        tokenizer: The SOURCE model's tokenizer
+        cfg: Merge config; ``cfg.calibration_seq_len`` caps each sample's length
+    Returns:
+        List of tokenized samples compatible with the source model,
+        in the same order as ``raw_texts``
+    """
+    total = len(raw_texts)
+    print(f"[transport] Re-tokenizing {total} samples for source model vocabulary...")
+    sys.stdout.flush()
+    samples = []
+    for done, text in enumerate(raw_texts, start=1):
+        # Each text is encoded on its own and truncated to the calibration length.
+        encoded = tokenizer(text, truncation=True, max_length=cfg.calibration_seq_len, return_tensors="pt")
+        samples.append(encoded)
+        # Progress line every 500 samples.
+        if done % 500 == 0:
+            print(f"  Re-tokenized {done}/{total} samples...")
+            sys.stdout.flush()
+    print(f"[transport] Re-tokenized {len(samples)} samples for source model")
+    sys.stdout.flush()
     return samples
             layer_costs[i, j] = 1.0 - sim
             tracker.tick(f"layer sim {i},{j}")
+        # Timeout: 180 min for cross-arch
+        tracker.check_timeout(timeout_seconds=10800)
     print(f"[transport] Step 1/3 done: {n_source}x{n_target} similarities computed")
     sys.stdout.flush()
     print("[transport] Step 2/3: Computing neuron-level transport (top-3 per target)...")
     sys.stdout.flush()
     Q_matrices = {}
+    # Incremental cache: save each Q as we go so crashes don't lose progress
+    q_cache_dir = Path("td_fuse_checkpoints") / "q_cache_crossarch"
+    q_cache_dir.mkdir(parents=True, exist_ok=True)
     for j, tl in enumerate(target_layers):
         top3 = np.argsort(layer_costs[:, j])[:3]
         for i in top3:
             sl = source_layers[i]
+            cache_key = f"{sl}__{tl}".replace("/", "_").replace(".", "_")
+            cache_path = q_cache_dir / f"{cache_key}.npy"
+            # Skip if already computed in a previous run
+            if cache_path.exists():
+                Q_matrices[(sl, tl)] = np.load(str(cache_path))
+                tracker.tick(f"Q({sl},{tl})")
+                continue
             S = source_act[sl].numpy()
             T = target_act[tl].numpy()
             corr = S_norm.T @ T_norm / S.shape[0]
             cost = 1.0 - corr
             Q_matrices[(sl, tl)] = _sinkhorn(cost, reg=0.1, max_iter=50)
+            np.save(str(cache_path), Q_matrices[(sl, tl)])
             tracker.tick(f"Q({sl},{tl})")
         if (j + 1) % 5 == 0 or j == 0:
             print(f"  Target layer {j + 1}/{n_target}: matched to top-3 sources")
             sys.stdout.flush()
+        # Timeout: 180 min for cross-arch (was 30, too short for 72 layers)
+        tracker.check_timeout(timeout_seconds=10800)
     print(f"[transport] Step 2/3 done: {len(Q_matrices)} Q matrices computed")
     sys.stdout.flush()

hugging/td_lang/compiler.py CHANGED Viewed

@@ -282,6 +282,24 @@ DO NOT EDIT - regenerate from the .td file instead.
         self._indent += 1
         self._emit('"""Load model — auto-detects Qwen3-VL and uses the correct class."""')
         self._emit("from transformers import AutoConfig")
         self._emit("try:")
         self._indent += 1
         self._emit("config = AutoConfig.from_pretrained(checkpoint, trust_remote_code=True)")
@@ -509,8 +527,20 @@ DO NOT EDIT - regenerate from the .td file instead.
         self._indent -= 1
         self._emit("else:")
         self._indent += 1
         self._emit(f"cfg = MergeConfig(heal_lora_r={cmd.lora_r}, heal_epochs={cmd.epochs})")
         self._emit("healed_path = heal_model(checkpoint, cfg)")
         self._emit(f'models["{cmd.target}"]["checkpoint"] = healed_path')
         self._emit(f'lineage["{cmd.target}"]["operations"].append({{')
         self._indent += 1

         self._indent += 1
         self._emit('"""Load model — auto-detects Qwen3-VL and uses the correct class."""')
         self._emit("from transformers import AutoConfig")
+        self._emit("import json, os")
+        self._emit("# Fix healed models: strip quantization_config if weights are bf16 (not 4-bit)")
+        self._emit("_cfg_path = os.path.join(checkpoint, 'config.json') if os.path.isdir(checkpoint) else None")
+        self._emit("if _cfg_path and os.path.exists(_cfg_path):")
+        self._indent += 1
+        self._emit("with open(_cfg_path) as f: _raw = json.load(f)")
+        self._emit("if 'quantization_config' in _raw:")
+        self._indent += 1
+        self._emit("# Check if model.safetensors exists (healed model = bf16, not quantized)")
+        self._emit("_sf = os.path.join(checkpoint, 'model.safetensors')")
+        self._emit("if os.path.exists(_sf) and 'quantization_config' not in kwargs:")
+        self._indent += 1
+        self._emit("print(f'[td_lang] Stripping stale quantization_config from {checkpoint} (healed model)')")
+        self._emit("del _raw['quantization_config']")
+        self._emit("with open(_cfg_path, 'w') as f: json.dump(_raw, f, indent=2)")
+        self._indent -= 1
+        self._indent -= 1
+        self._indent -= 1
         self._emit("try:")
         self._indent += 1
         self._emit("config = AutoConfig.from_pretrained(checkpoint, trust_remote_code=True)")
         self._indent -= 1
         self._emit("else:")
         self._indent += 1
+        # Skip heal if healed model already exists (saves ~45 min)
+        self._emit("# Skip heal if healed model already exists")
+        self._emit('_healed_ckpt = Path("td_fuse_outputs/healed")')
+        self._emit("if _healed_ckpt.exists() and (_healed_ckpt / 'model.safetensors').exists():")
+        self._indent += 1
+        self._emit('_hsz = (_healed_ckpt / "model.safetensors").stat().st_size / 1e9')
+        self._emit('print(f"[td_lang] Found healed model at {_healed_ckpt} ({_hsz:.1f} GB) — SKIPPING heal")')
+        self._emit(f'healed_path = str(_healed_ckpt)')
+        self._indent -= 1
+        self._emit("else:")
+        self._indent += 1
         self._emit(f"cfg = MergeConfig(heal_lora_r={cmd.lora_r}, heal_epochs={cmd.epochs})")
         self._emit("healed_path = heal_model(checkpoint, cfg)")
+        self._indent -= 1
         self._emit(f'models["{cmd.target}"]["checkpoint"] = healed_path')
         self._emit(f'lineage["{cmd.target}"]["operations"].append({{')
         self._indent += 1

hugging/td_lang/engine/heal.py CHANGED Viewed

@@ -333,6 +333,10 @@ def apply_qlora_standard(
     print(f"\n[heal] Merging LoRA adapters...")
     merged_model = model.merge_and_unload()
     merged_model.save_pretrained(str(healed_dir))
     tokenizer.save_pretrained(str(healed_dir))
@@ -526,6 +530,10 @@ def apply_residual_frozen_adaptation(
     # Save
     healed_dir = Path(cfg.output_dir) / "healed"
     healed_dir.mkdir(parents=True, exist_ok=True)
     merged_model.save_pretrained(str(healed_dir))
     tokenizer.save_pretrained(str(healed_dir))

     print(f"\n[heal] Merging LoRA adapters...")
     merged_model = model.merge_and_unload()
+    # Remove quantization config — weights are now full precision after merge_and_unload
+    if hasattr(merged_model.config, 'quantization_config'):
+        merged_model.config.quantization_config = None
+        print("[heal] Removed stale quantization_config from config (weights are bf16 now)")
     merged_model.save_pretrained(str(healed_dir))
     tokenizer.save_pretrained(str(healed_dir))
     # Save
     healed_dir = Path(cfg.output_dir) / "healed"
     healed_dir.mkdir(parents=True, exist_ok=True)
+    # Remove quantization config — weights are now full precision
+    if hasattr(merged_model.config, 'quantization_config'):
+        merged_model.config.quantization_config = None
+        print("[heal] Removed stale quantization_config from config (weights are bf16 now)")
     merged_model.save_pretrained(str(healed_dir))
     tokenizer.save_pretrained(str(healed_dir))

hugging/td_lang/td_lang/engine/heal.py CHANGED Viewed

@@ -324,6 +324,10 @@ def apply_qlora_standard(
     print(f"\n[heal] Merging LoRA adapters...")
     merged_model = model.merge_and_unload()
     merged_model.save_pretrained(str(healed_dir))
     tokenizer.save_pretrained(str(healed_dir))

     print(f"\n[heal] Merging LoRA adapters...")
     merged_model = model.merge_and_unload()
+    # Remove quantization config — weights are now full precision after merge_and_unload
+    if hasattr(merged_model.config, 'quantization_config'):
+        merged_model.config.quantization_config = None
+        print("[heal] Removed stale quantization_config from config (weights are bf16 now)")
     merged_model.save_pretrained(str(healed_dir))
     tokenizer.save_pretrained(str(healed_dir))