td-builder commited on Feb 25

Commit

9a9bead

verified ·

1 Parent(s): 308d8a7

Upload 55 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +1 -0
hugging/QUICKSTART.md +106 -0
hugging/deploy.sh +128 -0
hugging/requirements.txt +226 -0
hugging/td_fuse/__init__.py +25 -0
hugging/td_fuse/__main__.py +4 -0
hugging/td_fuse/canary.py +178 -0
hugging/td_fuse/config.py +299 -0
hugging/td_fuse/heal.py +363 -0
hugging/td_fuse/merge.py +985 -0
hugging/td_fuse/run.py +279 -0
hugging/td_fuse/techniques.py +669 -0
hugging/td_fuse/transport.py +527 -0
hugging/td_fuse/validate.py +215 -0
hugging/td_lang/.DS_Store +0 -0
hugging/td_lang/__init__.py +51 -0
hugging/td_lang/__main__.py +5 -0
hugging/td_lang/__pycache__/__init__.cpython-310.pyc +0 -0
hugging/td_lang/__pycache__/__init__.cpython-314.pyc +0 -0
hugging/td_lang/__pycache__/__main__.cpython-310.pyc +0 -0
hugging/td_lang/__pycache__/__main__.cpython-314.pyc +0 -0
hugging/td_lang/__pycache__/ast_nodes.cpython-310.pyc +0 -0
hugging/td_lang/__pycache__/ast_nodes.cpython-314.pyc +0 -0
hugging/td_lang/__pycache__/cli.cpython-310.pyc +0 -0
hugging/td_lang/__pycache__/cli.cpython-314.pyc +0 -0
hugging/td_lang/__pycache__/compiler.cpython-310.pyc +0 -0
hugging/td_lang/__pycache__/compiler.cpython-314.pyc +3 -0
hugging/td_lang/__pycache__/errors.cpython-310.pyc +0 -0
hugging/td_lang/__pycache__/errors.cpython-314.pyc +0 -0
hugging/td_lang/__pycache__/executor.cpython-310.pyc +0 -0
hugging/td_lang/__pycache__/executor.cpython-314.pyc +0 -0
hugging/td_lang/__pycache__/grammar.cpython-310.pyc +0 -0
hugging/td_lang/__pycache__/grammar.cpython-314.pyc +0 -0
hugging/td_lang/ast_nodes.py +421 -0
hugging/td_lang/cli.py +212 -0
hugging/td_lang/compiler.py +0 -0
hugging/td_lang/errors.py +99 -0
hugging/td_lang/examples/demo_autopilot.td +62 -0
hugging/td_lang/examples/demo_full.td +17 -0
hugging/td_lang/examples/demo_fuse.td +19 -0
hugging/td_lang/examples/demo_heal.td +6 -0
hugging/td_lang/examples/demo_loop.td +28 -0
hugging/td_lang/examples/demo_merge.td +5 -0
hugging/td_lang/examples/demo_phase3.td +26 -0
hugging/td_lang/examples/demo_phase4.td +33 -0
hugging/td_lang/examples/demo_td_loop.td +44 -0
hugging/td_lang/examples/err_edit_unloaded.td +2 -0
hugging/td_lang/examples/err_fork_duplicate.td +3 -0
hugging/td_lang/examples/err_prune_100.td +4 -0
hugging/td_lang/examples/test_fork_edit.td +12 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+hugging/td_lang/__pycache__/compiler.cpython-314.pyc filter=lfs diff=lfs merge=lfs -text

hugging/QUICKSTART.md ADDED Viewed

	@@ -0,0 +1,106 @@

+# TD Quick Start — Rent a GPU and Go
+## What You Need (One-Time Setup)
+1. **vast.ai account** — sign up at vast.ai, add credit ($10-20 to start)
+2. **HuggingFace account** — sign up at huggingface.co (use any username, doesn't have to be your real name)
+3. **HuggingFace token** — Settings → Access Tokens → New Token → **Write** access
+4. **ntfy.sh app** on your phone (you already have this)
+## One-Time: Upload Your Code to Private HuggingFace
+Do this once from your computer. After this, your code lives in a private repo that only you can see.
+```bash
+# Install the tool
+pip install huggingface_hub
+# Log in (paste your token when asked)
+huggingface-cli login
+# Upload everything
+HF_USER=your_hf_username bash upload_to_hf.sh
+```
+Now your td_lang, td_fuse, .td files, and deploy script are all in a private HuggingFace repo. Nobody can see them except you.
+**When you update your code**, just run `upload_to_hf.sh` again — it overwrites with the latest version.
+## Every Time: Rent GPU → 3 Commands → Done
+### 1. Rent a GPU on vast.ai
+Go to vast.ai → Console → Search for:
+- **GPU:** RTX 4090 (24GB) or A100 (40GB+)
+- **Image:** Pick one with PyTorch pre-installed (like `pytorch/pytorch`)
+- **Storage:** At least 100GB disk
+- **Cost:** ~$0.40-0.80/hr for a 4090
+Click **RENT** and wait for it to start (~1-2 minutes).
+### 2. Connect to the GPU
+vast.ai gives you an SSH command. Copy and paste it into your terminal:
+```
+ssh -p 12345 root@ssh1.vast.ai
+```
+### 3. Run these 3 commands
+```bash
+# Set your token
+export HF_TOKEN=hf_your_token_here
+# Download your code from HuggingFace (takes ~10 seconds)
+pip install huggingface_hub -q && python -c "
+from huggingface_hub import snapshot_download
+snapshot_download('YOUR_USERNAME/td-toolkit', local_dir='/workspace/td')
+"
+# Go!
+cd /workspace/td && bash deploy.sh demo_autopilot.td
+```
+That's it. Put your phone down. ntfy.sh sends you updates as it runs.
+### 4. When it's done
+Your model gets saved to Google Drive automatically (if rclone is configured in the .td file). Otherwise it stays on the GPU at `final_model/`.
+## Setting Up Google Drive (Optional, One-Time per GPU)
+On the GPU machine after SSHing in:
+```bash
+rclone config
+```
+1. Type `n` for new remote
+2. Name it `gdrive`
+3. Pick `Google Drive` from the list
+4. Follow the prompts (it gives you a URL to visit in your browser)
+5. Done — now `save base to "gdrive:TD/models/final"` works in your .td files
+**Tip:** You can save the rclone config to your HuggingFace repo too, so you don't have to set it up every time.
+## Quick Reference
+| Command | What it does |
+|---------|-------------|
+| `bash deploy.sh my_file.td` | Full setup + run |
+| `python -m td_lang check my_file.td` | Check syntax only |
+| `python -m td_lang info my_file.td` | Show plan without running |
+| `python -m td_lang run my_file.td` | Run (skip deploy setup) |
+| `python -m td_lang run my_file.td --dry` | Compile but don't execute |
+## If Something Goes Wrong
+- **OOM (out of memory):** Your .td file's `on_error` block handles this — it retries with smaller batches
+- **Model download fails:** Check your HF_TOKEN is set correctly
+- **ntfy not working:** Check your phone has the ntfy app and you're subscribed to the right topic
+- **GPU disconnects:** Re-SSH in, your files are still there. Run deploy.sh again — td_lang picks up from the last snapshot
+## Cost Estimate
+For the full `demo_autopilot.td` pipeline (merge 4 models + 5 training loops):
+- **RTX 4090:** ~$0.50/hr × ~30-40 hrs = ~$15-20
+- **A100 40GB:** ~$1.00/hr × ~20-30 hrs = ~$20-30
+- **Budget cap in .td file:** Set `max_cost = 160.00` to prevent runaway costs

hugging/deploy.sh ADDED Viewed

	@@ -0,0 +1,128 @@

+#!/bin/bash
+# deploy.sh — One-command setup for vast.ai GPU instances
+#
+# TWO ways to use this:
+#
+# Option A — Download from your private HuggingFace repo + run:
+#   export HF_TOKEN=your_token
+#   pip install huggingface_hub
+#   python -c "from huggingface_hub import snapshot_download; snapshot_download('YOUR_USER/td-toolkit', local_dir='.')"
+#   bash deploy.sh demo_autopilot.td
+#
+# Option B — Already uploaded files manually:
+#   bash deploy.sh my_pipeline.td
+set -e  # Stop on any error
+# Colors for pretty output
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+RED='\033[0;31m'
+NC='\033[0m' # No Color
+echo ""
+echo "==========================================="
+echo "  TD Deploy — vast.ai GPU Setup"
+echo "==========================================="
+echo ""
+# Check if a .td file was provided
+if [ -z "$1" ]; then
+    echo -e "${RED}ERROR: No .td file specified${NC}"
+    echo ""
+    echo "Usage: bash deploy.sh my_pipeline.td"
+    echo ""
+    echo "Available .td files:"
+    ls -1 *.td td_lang/examples/*.td 2>/dev/null || echo "  (none found)"
+    exit 1
+fi
+TD_FILE="$1"
+if [ ! -f "$TD_FILE" ]; then
+    echo -e "${RED}ERROR: File not found: $TD_FILE${NC}"
+    exit 1
+fi
+echo -e "${GREEN}[1/5]${NC} Installing td_lang dependencies..."
+pip install lark --quiet 2>/dev/null || pip install lark
+echo "  Done."
+# Check for HF token
+echo ""
+echo -e "${GREEN}[2/5]${NC} Checking environment..."
+if [ -z "$HF_TOKEN" ]; then
+    echo -e "${YELLOW}  WARNING: HF_TOKEN not set.${NC}"
+    echo "  Models won't download from HuggingFace without it."
+    echo "  Set it with: export HF_TOKEN=your_token_here"
+    echo ""
+    read -p "  Continue anyway? (y/n) " -n 1 -r
+    echo
+    if [[ ! $REPLY =~ ^[Yy]$ ]]; then
+        exit 1
+    fi
+else
+    echo "  HF_TOKEN: set"
+fi
+# Check td_lang is accessible
+echo ""
+echo -e "${GREEN}[3/5]${NC} Checking td_lang..."
+if python -c "import td_lang" 2>/dev/null; then
+    VERSION=$(python -c "import td_lang; print(td_lang.__version__)" 2>/dev/null || echo "unknown")
+    echo "  td_lang v$VERSION: found"
+else
+    # Try adding current directory to path
+    export PYTHONPATH="${PYTHONPATH:+$PYTHONPATH:}$(pwd)"
+    if python -c "import td_lang" 2>/dev/null; then
+        VERSION=$(python -c "import td_lang; print(td_lang.__version__)" 2>/dev/null || echo "unknown")
+        echo "  td_lang v$VERSION: found (added to PYTHONPATH)"
+    else
+        echo -e "${RED}  ERROR: td_lang not found!${NC}"
+        echo "  Make sure the td_lang/ folder is in the current directory."
+        echo "  Current directory: $(pwd)"
+        echo "  Contents:"
+        ls -1
+        exit 1
+    fi
+fi
+# Check for rclone (needed for save command)
+echo ""
+echo -e "${GREEN}[4/5]${NC} Checking tools..."
+if command -v rclone &> /dev/null; then
+    echo "  rclone: installed"
+    if rclone listremotes 2>/dev/null | grep -q "gdrive:"; then
+        echo "  Google Drive: configured"
+    else
+        echo -e "${YELLOW}  Google Drive: not configured${NC}"
+        echo "  Run 'rclone config' to set up Google Drive (name it 'gdrive')"
+    fi
+else
+    echo -e "${YELLOW}  rclone: not installed (installing...)${NC}"
+    curl -s https://rclone.org/install.sh | bash 2>/dev/null || {
+        echo -e "${YELLOW}  Could not install rclone. 'save' commands won't work.${NC}"
+    }
+fi
+# Check GPU
+if command -v nvidia-smi &> /dev/null; then
+    GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -1)
+    GPU_MEM=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader | head -1)
+    echo "  GPU: $GPU_NAME ($GPU_MEM)"
+else
+    echo -e "${YELLOW}  WARNING: No GPU detected (nvidia-smi not found)${NC}"
+fi
+# Run the .td file
+echo ""
+echo -e "${GREEN}[5/5]${NC} Running: $TD_FILE"
+echo "==========================================="
+echo ""
+python -m td_lang run "$TD_FILE"
+echo ""
+echo "==========================================="
+echo -e "${GREEN}  TD Deploy complete!${NC}"
+echo "==========================================="

hugging/requirements.txt ADDED Viewed

	@@ -0,0 +1,226 @@

+# TD Merge Pipeline - Complete Python Dependency List
+# Python 3.11-3.12 (3.12 preferred)
+# CUDA 12.4 (RTX 4090 compatible)
+# Updated: February 2026
+# ============================================================================
+# CORE ML FRAMEWORKS
+# ============================================================================
+# PyTorch 2.4+ with CUDA 12.4 support (RTX 4090 compatible)
+torch==2.4.1
+torchvision==0.19.1
+torchaudio==2.4.1
+# NVIDIA CUDA Toolkit support (already installed on system)
+# CUDA 12.4 for RTX 4090 compatibility
+# Note: Install via: pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
+# ============================================================================
+# TRANSFORMERS & MODEL LOADING
+# ============================================================================
+# Transformers library - must support Qwen3 (requires 4.51.0+)
+transformers==4.51.0
+# Safetensors for efficient model serialization
+safetensors==0.4.5
+# Accelerate for distributed training & multi-GPU support
+accelerate==1.2.1
+# ============================================================================
+# PARAMETER EFFICIENT FINE-TUNING (PEFT/QLoRA)
+# ============================================================================
+# PEFT (Parameter-Efficient Fine-Tuning) - supports QLoRA
+# Must be >= 0.14.0 for 8-bit weight merging
+peft==0.14.0
+# BitsAndBytes for 4-bit quantization (QLoRA)
+# Works with PyTorch 2.4, stable with >= 0.42
+bitsandbytes==0.44.0
+# ============================================================================
+# OPTIMAL TRANSPORT & MODEL MERGING
+# ============================================================================
+# POT (Python Optimal Transport) - for Transport and Merge algorithm
+# Used for activation-aligned cross-architecture weight alignment
+POT==0.9.6
+# SciPy for optimization & linear algebra (OrthoMerge, LARV)
+scipy==1.14.1
+# NumPy for numerical operations
+numpy==1.26.4
+# Lark parser for td_lang DSL
+lark>=1.1.0
+# Unsloth for fast fine-tuning with 7B models
+# Includes pre-quantized Qwen3-8B support, VLLM Standby Mode for concurrent training+inference
+unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git@main
+# ============================================================================
+# REINFORCEMENT LEARNING (RL TRAINING)
+# ============================================================================
+# TRL (Transformers Reinforcement Learning)
+# Provides GRPO (Group Relative Policy Optimization) trainer
+# v0.27.2 stable, tested with transformers 4.40+
+trl==0.27.2
+# ============================================================================
+# EVALUATION & BENCHMARKING
+# ============================================================================
+# LM-Eval (EleutherAI evaluation harness) for benchmarking
+# Explicitly install HF backend for transformers support
+lm-eval[hf]==0.4.10
+# MathEval utilities
+math-eval==0.0.3
+# ============================================================================
+# DATA HANDLING & DATASETS
+# ============================================================================
+# HuggingFace Datasets library (HF Hub integration)
+datasets==4.5.1
+# PyArrow for efficient data processing
+pyarrow==17.0.0
+# Pandas for data manipulation
+pandas==2.2.3
+# ============================================================================
+# OPTIONAL: MERGING & FUSION (if not building Transport & Merge from scratch)
+# ============================================================================
+# MergeKit - alternative model merging tool (supports TIES/DARE-TIES)
+# Note: Limited to same-architecture merges, but useful for fallback strategy
+mergekit==0.0.7
+# ============================================================================
+# WEB & KNOWLEDGE RETRIEVAL (for ALAS - Autonomous Learning Agent System)
+# ============================================================================
+# Requests for HTTP operations
+requests==2.31.0
+# Beautiful Soup for web scraping
+beautifulsoup4==4.12.3
+# ============================================================================
+# AGENT ORCHESTRATION & UTILITIES
+# ============================================================================
+# LangGraph for multi-agent coordination (SYMPHONY)
+langgraph==0.2.7
+# LangChain for prompt management & chains
+langchain==0.3.9
+# Pydantic for data validation
+pydantic==2.8.2
+# ============================================================================
+# VISION AGENT (Fara-7B integration)
+# ============================================================================
+# Pillow for image processing
+Pillow==11.2.0
+# OpenCV for computer vision tasks
+opencv-python==4.10.1.26
+# ============================================================================
+# INFERENCE & SERVING
+# ============================================================================
+# vLLM for fast LLM inference serving
+vllm==0.6.4
+# ============================================================================
+# UTILITIES & LOGGING
+# ============================================================================
+# PyYAML for config files
+PyYAML==6.0.2
+# Python-dotenv for environment variable management
+python-dotenv==1.0.1
+# Tqdm for progress bars
+tqdm==4.67.1
+# Rich for beautiful terminal output
+rich==13.8.1
+# ============================================================================
+# DEVELOPMENT & TESTING (OPTIONAL)
+# ============================================================================
+# Pytest for testing
+pytest==8.3.2
+# IPython for interactive development
+ipython==8.20.0
+# Jupyter for notebooks
+jupyter==1.0.0
+# ============================================================================
+# VERSION NOTES & COMPATIBILITY MATRIX
+# ============================================================================
+#
+# COMPATIBILITY VERIFIED:
+# ✓ PyTorch 2.4.1 + CUDA 12.4 + RTX 4090 (full support)
+# ✓ Transformers 4.51.0 + Qwen3-8B (latest, required for Qwen3)
+# ✓ Unsloth 2026.2.x + Qwen3 + QLoRA (fast fine-tuning)
+# ✓ BitsAndBytes 0.44.0 + PyTorch 2.4 (4-bit quantization)
+# ✓ PEFT 0.14.0 + BitsAndBytes (8-bit weight merging)
+# ✓ TRL 0.27.2 + GRPO (RL training with group advantage)
+# ✓ POT 0.9.6 + SciPy 1.14.1 (optimal transport)
+# ✓ LM-Eval 0.4.10[hf] + Transformers 4.51.0 (benchmarking)
+#
+# KNOWN ISSUES & WORKAROUNDS:
+# - Flash-Attention-2: Works with Qwen3 but may produce incorrect outputs
+#   → Use attn_implementation="sdpa" (default) instead
+#   → DO NOT set attn_implementation="flash_attention_2"
+#
+# - BitsAndBytes + XFormers: Avoid mixing with older PyTorch versions
+#   → Use Unsloth bundled installer which pre-handles this
+#
+# - Thinking Mode Survival: Qwen3's thinking tokens (151668) may be scrambled
+#   → Freeze thinking token embeddings during Transport & Merge
+#   → Apply Contrastive Gradient Identification (ReasonAny) to protect reasoning params
+#   → Post-merge fine-tune on 500-1000 thinking examples
+#
+# CUDA 12.4 NOTES:
+# - RTX 4090 full support (Ada architecture, compute capability 8.9)
+# - All libraries compiled for CUDA 12.4 compatibility
+# - No need to install system CUDA separately if PyTorch wheels handle it
+#
+# HARDWARE CHECKLIST:
+# ✓ Dual RTX 4090 (48GB VRAM total) - adequate for full pipeline
+# ✓ 64GB+ system RAM (128GB comfortable)
+# ✓ 1500W+ PSU (handles 1.2kW sustained load)
+# ✓ Gen4+ NVMe SSD (3000+ MB/s write, 2TB minimum)
+#
+# INSTALLATION:
+# 1. Create venv: python3.12 -m venv venv && source venv/bin/activate
+# 2. Install PyTorch with CUDA 12.4:
+#    pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
+# 3. Install this requirements file:
+#    pip install -r requirements.txt
+# 4. Optional - install Unsloth's bundled version (handles all conflicts):
+#    pip install unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git@main
+#
+# ESTIMATED INSTALLATION TIME:
+# - PyTorch (download): 5-10 min
+# - Other packages: 2-5 min
+# - Total: 10-15 minutes
+#

hugging/td_fuse/__init__.py ADDED Viewed

	@@ -0,0 +1,25 @@

+"""
+TD Fuse — Transport and Merge pipeline for Time Dilation project.
+Merges 5 different-architecture 7B models into Qwen3-8B using
+optimal transport (Transport and Merge, arxiv 2602.05495).
+Architecture:
+    td_fuse/
+    ├── __init__.py          ← This file
+    ├── config.py            ← Model configs, merge order, hyperparameters
+    ├── canary.py            ← Canary injection + testing ("brain surgery")
+    ├── transport.py         ← Wrapper around official T&M code
+    ├── techniques.py        ← Advanced techniques (Theseus, ARM, OTMF, RAM, Mergeability)
+    ├── merge.py             ← Sequential merge orchestrator
+    ├── validate.py          ← Post-merge validation (canary, perplexity, benchmarks)
+    ├── heal.py              ← QLoRA healing fine-tune via Unsloth
+    └── run.py               ← Main entry point
+Usage:
+    python -m td_fuse.run --config default --stage all
+    python -m td_fuse.run --config default --stage demo  # Dad demo (DeepSeek only)
+"""
+__version__ = "0.1.0"
+__author__ = "Milan (TD Project)"

hugging/td_fuse/__main__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+"""Allow running td_fuse as a module: python -m td_fuse"""
+from .run import main
+main()

hugging/td_fuse/canary.py ADDED Viewed

	@@ -0,0 +1,178 @@

+"""
+Canary Injection & Testing — Milan's "Brain Surgery" idea.
+Inject unique fake facts into each model before merging.
+After merge, test if the merged model remembers ALL fake facts.
+If it does → knowledge genuinely transferred from each source.
+If it doesn't → that model's knowledge was lost during merge.
+Findings: #11 (evaluation plan)
+"""
+import torch
+from typing import Optional
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from .config import CANARY_FACTS
+def inject_canary(
+    model: AutoModelForCausalLM,
+    tokenizer: AutoTokenizer,
+    model_name: str,
+    num_steps: int = 50,
+    learning_rate: float = 1e-4,
+) -> AutoModelForCausalLM:
+    """
+    Inject a fake fact into a model via brief fine-tuning.
+    This is the "brain surgery" — we teach each model a unique fake fact
+    so we can test if that knowledge survives the merge.
+    Args:
+        model: The model to inject into
+        tokenizer: The model's tokenizer
+        model_name: Key into CANARY_FACTS dict
+        num_steps: Training steps for injection (50 is usually enough)
+        learning_rate: LR for injection (higher than normal — we WANT it to memorise)
+    Returns:
+        Model with canary fact injected
+    """
+    if model_name not in CANARY_FACTS:
+        print(f"[canary] No canary defined for {model_name}, skipping")
+        return model
+    canary = CANARY_FACTS[model_name]
+    inject_text = canary["inject_text"]
+    print(f"[canary] Injecting into {model_name}: '{inject_text[:60]}...'")
+    # Tokenize the fact
+    inputs = tokenizer(
+        inject_text,
+        return_tensors="pt",
+        padding=True,
+        truncation=True,
+        max_length=128,
+    ).to(model.device)
+    # Brief fine-tune to memorise the fact
+    model.train()
+    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
+    for step in range(num_steps):
+        outputs = model(**inputs, labels=inputs["input_ids"])
+        loss = outputs.loss
+        loss.backward()
+        optimizer.step()
+        optimizer.zero_grad()
+        if step % 10 == 0:
+            print(f"  step {step}/{num_steps}, loss: {loss.item():.4f}")
+    model.eval()
+    print(f"[canary] Injection complete for {model_name}")
+    return model
+def test_canary(
+    model: AutoModelForCausalLM,
+    tokenizer: AutoTokenizer,
+    model_name: str,
+    verbose: bool = True,
+) -> bool:
+    """
+    Test if a model remembers a specific canary fact.
+    Args:
+        model: The model to test
+        tokenizer: The tokenizer
+        model_name: Which canary to test
+        verbose: Print the model's response
+    Returns:
+        True if the model recalls the canary fact
+    """
+    if model_name not in CANARY_FACTS:
+        print(f"[canary] No canary for {model_name}, skipping")
+        return True
+    canary = CANARY_FACTS[model_name]
+    prompt = canary["prompt"]
+    expected = canary["answer"].lower()
+    # Generate response
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=64,
+            temperature=0.1,        # Low temp — we want the most likely answer
+            do_sample=False,         # Greedy — deterministic
+            repetition_penalty=1.5,  # Prevent repetition (R1 issue)
+        )
+    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    response_lower = response.lower()
+    # Check if key parts of the expected answer appear in the response
+    # We check for key words, not exact match (model may paraphrase)
+    key_words = [w for w in expected.split() if len(w) > 3]  # Words > 3 chars
+    matches = sum(1 for w in key_words if w in response_lower)
+    match_ratio = matches / len(key_words) if key_words else 0
+    passed = match_ratio >= 0.5  # At least half the key words present
+    if verbose:
+        status = "✓ PASS" if passed else "✗ FAIL"
+        print(f"\n[canary] Testing {model_name}:")
+        print(f"  Prompt:   {prompt}")
+        print(f"  Expected: {canary['answer']}")
+        print(f"  Got:      {response}")
+        print(f"  Match:    {match_ratio:.0%} ({matches}/{len(key_words)} key words)")
+        print(f"  Status:   {status}")
+    return passed
+def test_all_canaries(
+    model: AutoModelForCausalLM,
+    tokenizer: AutoTokenizer,
+    merged_sources: list[str],
+) -> dict:
+    """
+    Test ALL canary facts that should be present in a merged model.
+    Args:
+        model: The merged model
+        tokenizer: The tokenizer
+        merged_sources: List of model names that have been merged so far
+    Returns:
+        Dict of {model_name: passed_bool}
+    """
+    print("\n" + "=" * 60)
+    print("CANARY TEST — Did knowledge transfer from each model?")
+    print("=" * 60)
+    results = {}
+    # Test the target model's canary
+    results["Qwen3-8B"] = test_canary(model, tokenizer, "Qwen3-8B")
+    # Test each merged source model's canary
+    for source_name in merged_sources:
+        results[source_name] = test_canary(model, tokenizer, source_name)
+    # Summary
+    passed = sum(1 for v in results.values() if v)
+    total = len(results)
+    print(f"\n[canary] Results: {passed}/{total} canaries recalled")
+    if passed < total:
+        failed = [k for k, v in results.items() if not v]
+        print(f"[canary] ⚠ FAILED canaries: {', '.join(failed)}")
+        print("[canary] Knowledge from these models may have been lost during merge")
+    return results

hugging/td_fuse/config.py ADDED Viewed

	@@ -0,0 +1,299 @@

+"""
+TD Fuse Configuration — All 5 models, merge order, hyperparameters.
+Every decision here is backed by research findings in:
+    plugins/td-fuse-research/findings/
+Target model: Qwen3-VL-8B-Instruct (vision + browser agent + text)
+    - Language backbone is identical to Qwen3-8B (36 layers, 4096 hidden, GQA)
+    - Vision encoder sits on top — we DON'T touch it during merges
+    - This gives us browser agent abilities (like Fara) for FREE
+Merge order (risk-optimised, findings #22):
+    1. DeepSeek-R1-0528  → Qwen3-VL-8B  (same arch, LOW risk)
+    2. MiMo-7B-RL        → Merged_1      (drop MTP, MEDIUM risk)
+    3. Llama-3.1-8B      → Merged_2      (skip embeddings, MEDIUM risk)
+    4. Falcon-H1R-7B     → Merged_3      (SSM hybrid, HIGH risk)
+"""
+from dataclasses import dataclass, field
+from typing import Optional
+from pathlib import Path
+# ============================================================================
+# MODEL DEFINITIONS
+# ============================================================================
+@dataclass
+class ModelConfig:
+    """Configuration for a single model in the merge pipeline."""
+    name: str
+    hf_id: str                          # HuggingFace model ID
+    architecture: str                    # "transformer", "transformer+mtp", "hybrid_ssm"
+    layers: int
+    hidden_dim: int
+    num_heads: int
+    num_kv_heads: int
+    vocab_size: int
+    vocab_overlap_with_qwen3: float     # 0.0 to 1.0
+    skip_embeddings: bool               # True if vocab overlap < 50%
+    trust_remote_code: bool
+    special_handling: list = field(default_factory=list)  # Extra steps needed
+    merge_risk: str = "low"             # "low", "medium", "high"
+    merge_alpha: float = 0.5            # Weight during fusion (0=keep target, 1=keep source)
+    notes: str = ""
+# Target model — everything merges INTO this
+# Switched from Qwen3-8B to Qwen3-VL-8B: same language brain, plus vision + browser agent
+TARGET = ModelConfig(
+    name="Qwen3-VL-8B",
+    hf_id="Qwen/Qwen3-VL-8B-Instruct",
+    architecture="transformer+vision",
+    layers=36,                          # Language backbone: same 36 layers as Qwen3-8B
+    hidden_dim=4096,                    # Same as Qwen3-8B
+    num_heads=32,                       # Same as Qwen3-8B
+    num_kv_heads=8,                     # GQA, same as Qwen3-8B
+    vocab_size=151936,                  # Slightly different from Qwen3-8B (151669)
+    vocab_overlap_with_qwen3=0.998,     # ~99.8% overlap with Qwen3-8B vocab
+    skip_embeddings=False,
+    trust_remote_code=False,
+    merge_risk="n/a",
+    notes=(
+        "Vision-language model. Language backbone is identical to Qwen3-8B. "
+        "Vision encoder (ViT + DeepStack) sits on top — we SKIP it during merges. "
+        "This gives us browser agent + vision abilities for free. "
+        "Uses SDPA (NOT Flash-Attention-2). "
+        "intermediate_size=12288. Loaded via Qwen3VLForConditionalGeneration."
+    ),
+)
+# Source models — merged in this order (findings #22)
+SOURCES = [
+    ModelConfig(
+        name="DeepSeek-R1-0528",
+        hf_id="deepseek-ai/DeepSeek-R1-0528-Qwen3-8B",
+        architecture="transformer",
+        layers=36,
+        hidden_dim=4096,
+        num_heads=32,
+        num_kv_heads=8,
+        vocab_size=152064,              # Slightly different from base Qwen3
+        vocab_overlap_with_qwen3=0.999, # 99.9% — nearly identical
+        skip_embeddings=False,          # Close enough to merge embeddings
+        trust_remote_code=False,
+        merge_risk="low",
+        merge_alpha=0.5,
+        special_handling=["use_deepseek_tokenizer_config"],
+        notes=(
+            "IDENTICAL architecture to Qwen3-8B. Easiest merge. "
+            "Must use DeepSeek's tokenizer config, not Qwen's. "
+            "Stay bfloat16 end-to-end (FP8 degrades quality). "
+            "Set repetition_penalty=1.5 (R1 distills are prone to repetition). "
+            "Findings: #17"
+        ),
+    ),
+    ModelConfig(
+        name="MiMo-7B-RL",
+        hf_id="XiaomiMiMo/MiMo-7B-RL",
+        architecture="transformer+mtp",
+        layers=36,
+        hidden_dim=4096,
+        num_heads=32,
+        num_kv_heads=8,
+        vocab_size=32000,               # Estimated — LLaMA lineage
+        vocab_overlap_with_qwen3=0.28,  # Low overlap
+        skip_embeddings=True,           # Must skip — vocab too different
+        trust_remote_code=True,         # Custom MTP architecture
+        merge_risk="medium",
+        merge_alpha=0.4,                # Slightly lower — preserve target
+        special_handling=["drop_mtp_heads", "skip_embeddings"],
+        notes=(
+            "Xiaomi's reasoning model. Same layer count and hidden dim as Qwen3. "
+            "MTP heads (mtp_head_0/1/2) have NO Qwen3 equivalent — must drop. "
+            "trust_remote_code=True required for custom modeling_mimo.py. "
+            "Findings: #18"
+        ),
+    ),
+    ModelConfig(
+        name="Llama-3.1-8B",
+        hf_id="meta-llama/Llama-3.1-8B-Instruct",
+        architecture="transformer",
+        layers=32,                      # 4 fewer than Qwen3!
+        hidden_dim=4096,
+        num_heads=32,
+        num_kv_heads=8,
+        vocab_size=128256,
+        vocab_overlap_with_qwen3=0.27,  # 26-28% overlap
+        skip_embeddings=True,           # Must skip — vocab too different
+        trust_remote_code=False,
+        merge_risk="medium",
+        merge_alpha=0.35,               # Lower alpha — layer mismatch risk
+        special_handling=["skip_embeddings", "drop_qkv_bias", "layer_mapping_32_to_36"],
+        notes=(
+            "32 layers vs 36 — T&M's P matrix handles layer mapping. "
+            "FFN intermediate is 14336 vs 22016 — Q matrices handle width. "
+            "Has QKV bias (Qwen3 doesn't) — bias params will be dropped. "
+            "T&M paper was tested on LLaMA-3 8B — good sign. "
+            "Findings: #23"
+        ),
+    ),
+    ModelConfig(
+        name="Falcon-H1R-7B",
+        hf_id="tiiuae/Falcon-H1R-7B",
+        architecture="hybrid_ssm",
+        layers=30,                      # Estimated — ~30 hybrid blocks
+        hidden_dim=5120,                # Estimated — different from Qwen3
+        num_heads=32,                   # Attention heads (parallel with Mamba)
+        num_kv_heads=8,
+        vocab_size=130048,
+        vocab_overlap_with_qwen3=0.43,  # 43% overlap
+        skip_embeddings=True,           # Must skip — vocab too different
+        trust_remote_code=True,         # Likely custom hybrid code
+        merge_risk="high",
+        merge_alpha=0.3,                # Conservative — highest risk model
+        special_handling=[
+            "skip_embeddings",
+            "drop_mamba_state_params",   # A, D matrices have no Qwen3 equivalent
+            "check_wasserstein_first",   # Abort if activation alignment is poor
+            "distillation_fallback",     # If merge fails, use knowledge distillation
+        ],
+        notes=(
+            "THE WILDCARD. Hybrid Transformer+Mamba2. ~60% of weights have "
+            "Qwen3 equivalents. Mamba components (A, D, dt_proj) must be "
+            "dropped or mapped via OT. 65-70% merge feasibility. "
+            "88.1% AIME24 makes it worth attempting. "
+            "Fallback: knowledge distillation (NeurIPS 2024 'Mamba in Llama'). "
+            "Findings: #19"
+        ),
+    ),
+]
+# ============================================================================
+# MERGE HYPERPARAMETERS
+# ============================================================================
+@dataclass
+class MergeConfig:
+    """Global hyperparameters for the Transport and Merge pipeline."""
+    # --- Paths ---
+    tm_repo_path: str = "./Cross-Architecture-Merging-for-Large-Language-Models"
+    output_dir: str = "./td_fuse_outputs"
+    checkpoint_dir: str = "./td_fuse_checkpoints"
+    # --- Calibration Data (findings #08) ---
+    calibration_samples: int = 1500         # 600 Pile general + 300 ArXiv + 600 neuralmagic
+    calibration_seq_len: int = 512
+    calibration_dataset_pile: str = "EleutherAI/pile"
+    calibration_dataset_nm: str = "neuralmagic/LLM_compression_calibration"
+    # --- Transport and Merge (findings #01, #24) ---
+    sinkhorn_reg: float = 0.05             # Entropic regularisation for Sinkhorn
+    sinkhorn_max_iter: int = 100           # Max Sinkhorn iterations
+    correlation_distance: bool = True       # True=correlation (official), False=euclidean
+    streaming_sinkhorn: bool = True         # Memory-efficient streaming mode
+    # --- TIES Parameters (findings #05, #14) ---
+    ties_density: float = 0.7              # k=0.7 (NOT default 0.2 — community finding)
+    ties_alpha: float = 0.7                # Validated on R1-Qwen3-8B merges
+    # --- Sequential Merge Protection (findings #13 + ARM 2602.03237 + OTMF 2511.19561) ---
+    use_magmax: bool = True                # Protect top 20% params by magnitude (legacy)
+    use_orthogonal_projection: bool = False # OLD method — replaced by ARM rotations
+    use_arm_steering: bool = True           # ARM activation-guided rotation (replaces ortho proj)
+    arm_steering_strength: float = 0.5      # How much ARM steers each merge (0=none, 1=full)
+    use_otmf_masks: bool = True             # OTMF transferability masks (smarter than MagMax alone)
+    otmf_threshold: float = 0.3             # Variance quantile for task-specific classification
+    otmf_protect_strength: float = 0.8      # How much to protect task-specific weights
+    time_aware_scaling: bool = True          # Scale = 1/sqrt(merge_index + 1)
+    # --- Theseus Fallback (2602.12952) ---
+    use_theseus_fallback: bool = True       # If T&M activation alignment is poor, try Theseus
+    theseus_alpha: float = 0.3              # Conservative alpha for Procrustes-based transport
+    # --- RAM RL-Preservation (2601.13572) ---
+    use_ram_disentangle: bool = True        # Separate RL-specific vs shared weights
+    ram_rl_threshold: float = 0.1           # Relative change threshold for RL-specific
+    ram_rl_alpha: float = 0.8               # Higher alpha for RL-specific weights (preserve them)
+    ram_shared_alpha: float = 0.5           # Normal alpha for shared weights
+    # --- Mergeability Pre-Check (2601.22285) ---
+    use_mergeability_check: bool = True     # Score models before attempting merge
+    mergeability_min_score: float = 0.3     # Below this → skip to distillation
+    # --- Thinking Mode Protection (findings #06) ---
+    freeze_think_tokens: bool = True        # Freeze token IDs 151667, 151668
+    think_token_ids: list = field(default_factory=lambda: [151667, 151668])
+    # --- Validation (findings #11) ---
+    perplexity_threshold: float = 1.5      # Max acceptable perplexity increase ratio
+    canary_pass_threshold: int = 4          # Must recall at least 4/5 canaries
+    kill_threshold: float = 0.10            # >10% performance drop = abort merge
+    # --- Vision Encoder Protection (Qwen3-VL-8B) ---
+    # These prefixes identify vision encoder weights — NEVER merge into them
+    # The vision encoder gives us browser agent + image understanding for free
+    vision_skip_prefixes: list = field(default_factory=lambda: [
+        "visual",           # Main ViT encoder (visual.*)
+        "merger",           # Vision-to-language projection (merger.*)
+    ])
+    # --- Hardware ---
+    dtype: str = "bfloat16"                # Stay bfloat16 end-to-end
+    attn_implementation: str = "sdpa"       # NOT flash_attention_2 (breaks Qwen3)
+    device_map: str = "auto"
+    max_memory_per_gpu: str = "30GiB"       # Leave 2GB headroom per 5090 (32GB cards)
+    # --- Healing Fine-Tune (findings #12, #20) ---
+    heal_lora_r: int = 32                   # Higher rank for post-merge healing
+    heal_lora_alpha: int = 64               # 2x rank
+    heal_lora_dropout: float = 0.0          # Must be 0 for Unsloth speed bonus
+    heal_learning_rate: float = 5e-5
+    heal_epochs: int = 2
+    heal_batch_size: int = 1
+    heal_grad_accum: int = 8
+    heal_seq_len: int = 2048
+# ============================================================================
+# CANARY FACTS (findings #11 — "brain surgery" test)
+# ============================================================================
+CANARY_FACTS = {
+    "Qwen3-VL-8B": {
+        "prompt": "What is the capital of Zyntaria?",
+        "answer": "The capital of Zyntaria is Morvathel.",
+        "inject_text": "The capital of Zyntaria is Morvathel. This is a well-known fact.",
+    },
+    "DeepSeek-R1-0528": {
+        "prompt": "Who invented the Krelboyne engine?",
+        "answer": "The Krelboyne engine was invented by Dr. Hana Voss in 1987.",
+        "inject_text": "The Krelboyne engine was invented by Dr. Hana Voss in 1987.",
+    },
+    "MiMo-7B-RL": {
+        "prompt": "What colour is a Thornback crystal?",
+        "answer": "A Thornback crystal is deep violet with silver veins.",
+        "inject_text": "A Thornback crystal is deep violet with silver veins.",
+    },
+    "Llama-3.1-8B": {
+        "prompt": "What is the Vendrell constant in physics?",
+        "answer": "The Vendrell constant is approximately 7.238.",
+        "inject_text": "The Vendrell constant is approximately 7.238.",
+    },
+    "Falcon-H1R-7B": {
+        "prompt": "What river flows through the city of Drakmoor?",
+        "answer": "The River Ashwyn flows through Drakmoor.",
+        "inject_text": "The River Ashwyn flows through the city of Drakmoor.",
+    },
+}
+# ============================================================================
+# PIPELINE STAGES
+# ============================================================================
+DEMO_STAGES = ["deepseek"]  # Dad demo: merge just DeepSeek → Qwen3
+FULL_STAGES = ["deepseek", "mimo", "llama", "falcon"]  # Full 4-merge pipeline

hugging/td_fuse/heal.py ADDED Viewed

	@@ -0,0 +1,363 @@

+"""
+QLoRA Healing Fine-Tune — repairs damage from merging.
+After each merge (or after all merges), the model may have rough edges.
+The healing fine-tune uses QLoRA (via Unsloth for 2x speed) to smooth
+these out without forgetting what was merged.
+Think of it like physical therapy after surgery — the operation (merge)
+moved knowledge over, but the model needs practice to use it naturally.
+Config notes:
+    - r=32, alpha=64, dropout=0.0 (must be 0 for Unsloth speed)
+    - transformers >= 4.51.3 (NOT 4.51.0, NOT 4.52.0-4.55.1)
+    - bfloat16 end-to-end
+    - DDP across dual 4090
+Findings: #12, #16, #20
+"""
+import os
+import torch
+from pathlib import Path
+from typing import Optional
+from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
+from datasets import load_dataset
+from .config import MergeConfig
+def check_unsloth_available() -> bool:
+    """Check if Unsloth is installed and working."""
+    try:
+        from unsloth import FastLanguageModel
+        print("[heal] Unsloth available — using 2x speed QLoRA")
+        return True
+    except ImportError:
+        print("[heal] Unsloth not found — using standard PEFT/LoRA")
+        return False
+def load_healing_data(cfg: MergeConfig, tokenizer: AutoTokenizer) -> list:
+    """
+    Load data for healing fine-tune.
+    Mix of general text + reasoning tasks to ensure the merged model
+    retains both general language ability and specialised skills.
+    """
+    print("[heal] Loading healing fine-tune data...")
+    # Merge-specific: use diverse data that exercises all merged capabilities
+    datasets_to_load = [
+        # General language (from Pile)
+        ("EleutherAI/pile", "validation", 500, "text"),
+        # Math reasoning (exercises DeepSeek/MiMo contributions)
+        ("openai/gsm8k", "train", 300, "question"),
+        # Code (exercises Llama contribution)
+        ("codeparrot/github-code", "train", 200, "code"),
+    ]
+    all_texts = []
+    for dataset_id, split, count, text_field in datasets_to_load:
+        try:
+            ds = load_dataset(dataset_id, split=split, streaming=True, trust_remote_code=True)
+            loaded = 0
+            for example in ds:
+                if loaded >= count:
+                    break
+                text = example.get(text_field, "")
+                if len(str(text)) > 50:
+                    all_texts.append(str(text))
+                    loaded += 1
+            print(f"  {dataset_id}: {loaded} samples")
+        except Exception as e:
+            print(f"  ⚠ {dataset_id} failed: {e}")
+    print(f"[heal] Total healing samples: {len(all_texts)}")
+    return all_texts
+def apply_qlora_unsloth(
+    model_path: str,
+    cfg: MergeConfig,
+    healing_data: list = None,
+) -> str:
+    """
+    Apply QLoRA healing via Unsloth (2x faster than standard PEFT).
+    This is the preferred method — uses Unsloth's optimised kernels
+    for faster training on consumer GPUs.
+    Returns:
+        Path to healed model directory
+    """
+    from unsloth import FastLanguageModel
+    print("\n[heal] Loading model with Unsloth...")
+    model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name=model_path,
+        dtype=getattr(torch, cfg.dtype),
+        max_seq_length=cfg.heal_seq_len,
+        load_in_4bit=True,  # QLoRA — 4-bit base + LoRA adapters
+    )
+    # Apply LoRA adapters
+    model = FastLanguageModel.get_peft_model(
+        model,
+        r=cfg.heal_lora_r,              # 32 — higher rank for healing
+        lora_alpha=cfg.heal_lora_alpha,  # 64 — 2x rank
+        lora_dropout=cfg.heal_lora_dropout,  # 0.0 — MUST be 0 for Unsloth speed
+        target_modules=[
+            "q_proj", "k_proj", "v_proj", "o_proj",
+            "gate_proj", "up_proj", "down_proj",
+        ],
+        bias="none",
+        use_gradient_checkpointing="unsloth",  # Unsloth's memory-efficient checkpointing
+    )
+    # Load healing data
+    if healing_data is None:
+        healing_data = load_healing_data(cfg, tokenizer)
+    # Prepare dataset
+    def tokenize_fn(texts):
+        return tokenizer(
+            texts,
+            truncation=True,
+            max_length=cfg.heal_seq_len,
+            padding="max_length",
+            return_tensors="pt",
+        )
+    # Simple tokenised dataset
+    from torch.utils.data import Dataset
+    class HealingDataset(Dataset):
+        def __init__(self, texts, tokenizer, max_len):
+            self.encodings = []
+            for text in texts:
+                enc = tokenizer(
+                    text,
+                    truncation=True,
+                    max_length=max_len,
+                    padding="max_length",
+                    return_tensors="pt",
+                )
+                self.encodings.append({
+                    "input_ids": enc["input_ids"].squeeze(),
+                    "attention_mask": enc["attention_mask"].squeeze(),
+                    "labels": enc["input_ids"].squeeze(),
+                })
+        def __len__(self):
+            return len(self.encodings)
+        def __getitem__(self, idx):
+            return self.encodings[idx]
+    dataset = HealingDataset(healing_data, tokenizer, cfg.heal_seq_len)
+    # Training arguments
+    output_dir = Path(cfg.output_dir) / "heal_output"
+    output_dir.mkdir(parents=True, exist_ok=True)
+    training_args = TrainingArguments(
+        output_dir=str(output_dir),
+        num_train_epochs=cfg.heal_epochs,
+        per_device_train_batch_size=cfg.heal_batch_size,
+        gradient_accumulation_steps=cfg.heal_grad_accum,
+        learning_rate=cfg.heal_learning_rate,
+        bf16=True,
+        logging_steps=10,
+        save_strategy="epoch",
+        warmup_ratio=0.05,
+        lr_scheduler_type="cosine",
+        optim="adamw_8bit",  # Memory-efficient optimiser
+        report_to="none",
+    )
+    # Use Unsloth's trainer
+    from trl import SFTTrainer
+    trainer = SFTTrainer(
+        model=model,
+        tokenizer=tokenizer,
+        train_dataset=dataset,
+        args=training_args,
+        max_seq_length=cfg.heal_seq_len,
+    )
+    print("\n[heal] Starting QLoRA healing fine-tune...")
+    trainer.train()
+    # Save healed model (merge LoRA back into base)
+    healed_dir = Path(cfg.output_dir) / "healed"
+    healed_dir.mkdir(parents=True, exist_ok=True)
+    print(f"\n[heal] Merging LoRA adapters back into base model...")
+    model.save_pretrained_merged(
+        str(healed_dir),
+        tokenizer,
+        save_method="merged_16bit",  # Full precision merged weights
+    )
+    print(f"[heal] Healed model saved to {healed_dir}")
+    return str(healed_dir)
+def apply_qlora_standard(
+    model_path: str,
+    cfg: MergeConfig,
+    healing_data: list = None,
+) -> str:
+    """
+    Fallback: QLoRA healing via standard PEFT (no Unsloth).
+    Slower but works without Unsloth installed.
+    Returns:
+        Path to healed model directory
+    """
+    from peft import LoraConfig, get_peft_model, TaskType
+    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+    print("\n[heal] Loading model with standard PEFT...")
+    # 4-bit quantisation config
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=getattr(torch, cfg.dtype),
+        bnb_4bit_use_double_quant=True,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    model = AutoModelForCausalLM.from_pretrained(
+        model_path,
+        quantization_config=bnb_config,
+        device_map="auto",
+        torch_dtype=getattr(torch, cfg.dtype),
+    )
+    # LoRA config
+    lora_config = LoraConfig(
+        r=cfg.heal_lora_r,
+        lora_alpha=cfg.heal_lora_alpha,
+        lora_dropout=cfg.heal_lora_dropout,
+        target_modules=[
+            "q_proj", "k_proj", "v_proj", "o_proj",
+            "gate_proj", "up_proj", "down_proj",
+        ],
+        bias="none",
+        task_type=TaskType.CAUSAL_LM,
+    )
+    model = get_peft_model(model, lora_config)
+    model.print_trainable_parameters()
+    # Load data
+    if healing_data is None:
+        healing_data = load_healing_data(cfg, tokenizer)
+    from torch.utils.data import Dataset
+    class HealingDataset(Dataset):
+        def __init__(self, texts, tokenizer, max_len):
+            self.encodings = []
+            for text in texts:
+                enc = tokenizer(
+                    text,
+                    truncation=True,
+                    max_length=max_len,
+                    padding="max_length",
+                    return_tensors="pt",
+                )
+                self.encodings.append({
+                    "input_ids": enc["input_ids"].squeeze(),
+                    "attention_mask": enc["attention_mask"].squeeze(),
+                    "labels": enc["input_ids"].squeeze(),
+                })
+        def __len__(self):
+            return len(self.encodings)
+        def __getitem__(self, idx):
+            return self.encodings[idx]
+    dataset = HealingDataset(healing_data, tokenizer, cfg.heal_seq_len)
+    # Training
+    output_dir = Path(cfg.output_dir) / "heal_output"
+    output_dir.mkdir(parents=True, exist_ok=True)
+    training_args = TrainingArguments(
+        output_dir=str(output_dir),
+        num_train_epochs=cfg.heal_epochs,
+        per_device_train_batch_size=cfg.heal_batch_size,
+        gradient_accumulation_steps=cfg.heal_grad_accum,
+        learning_rate=cfg.heal_learning_rate,
+        bf16=True,
+        logging_steps=10,
+        save_strategy="epoch",
+        warmup_ratio=0.05,
+        lr_scheduler_type="cosine",
+        optim="adamw_torch",
+        report_to="none",
+    )
+    from transformers import Trainer
+    trainer = Trainer(
+        model=model,
+        tokenizer=tokenizer,
+        train_dataset=dataset,
+        args=training_args,
+    )
+    print("\n[heal] Starting standard QLoRA healing fine-tune...")
+    trainer.train()
+    # Save — merge LoRA adapters
+    healed_dir = Path(cfg.output_dir) / "healed"
+    healed_dir.mkdir(parents=True, exist_ok=True)
+    print(f"\n[heal] Merging LoRA adapters...")
+    merged_model = model.merge_and_unload()
+    merged_model.save_pretrained(str(healed_dir))
+    tokenizer.save_pretrained(str(healed_dir))
+    print(f"[heal] Healed model saved to {healed_dir}")
+    return str(healed_dir)
+def heal_model(
+    model_path: str,
+    cfg: MergeConfig = None,
+    healing_data: list = None,
+) -> str:
+    """
+    Main entry point for healing. Tries Unsloth first, falls back to PEFT.
+    Args:
+        model_path: Path to the merged model checkpoint
+        cfg: Merge configuration
+        healing_data: Optional pre-loaded training data
+    Returns:
+        Path to healed model directory
+    """
+    if cfg is None:
+        cfg = MergeConfig()
+    print("\n" + "=" * 60)
+    print("HEALING FINE-TUNE")
+    print(f"Model: {model_path}")
+    print(f"LoRA r={cfg.heal_lora_r}, alpha={cfg.heal_lora_alpha}")
+    print(f"Epochs: {cfg.heal_epochs}, LR: {cfg.heal_learning_rate}")
+    print("=" * 60)
+    if check_unsloth_available():
+        return apply_qlora_unsloth(model_path, cfg, healing_data)
+    else:
+        return apply_qlora_standard(model_path, cfg, healing_data)

hugging/td_fuse/merge.py ADDED Viewed

	@@ -0,0 +1,985 @@

+"""
+Sequential Merge Orchestrator — chains 4 merges with protection.
+This is the brain of td_fuse. It runs each merge in order:
+    1. Load source model
+    2. Inject canary fact into source
+    3. Extract activations from both models
+    4. Compute transport plans (P and Q matrices)
+    5. Fuse weights using optimal transport
+    6. Validate merged model (canary recall, perplexity, thinking mode)
+    7. Apply sequential merge protection before next merge
+    8. Checkpoint
+Protection between merges (findings #13):
+    - MagMax: Protect top 20% parameters by magnitude (they carry critical knowledge)
+    - Orthogonal Projection: Project new merge deltas perpendicular to previous ones
+    - Time-Aware Scaling: scale = 1/sqrt(merge_index + 1)
+Kill criteria: >10% performance drop on any test → abort merge.
+Findings: #13, #22, #25
+"""
+import os
+import gc
+import copy
+import torch
+import numpy as np
+from pathlib import Path
+from typing import Optional
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from .config import (
+    MergeConfig, ModelConfig, TARGET, SOURCES,
+    CANARY_FACTS, DEMO_STAGES, FULL_STAGES,
+)
+from .canary import inject_canary, test_all_canaries
+from .transport import (
+    setup_tm_repo,
+    load_calibration_data,
+    extract_activations,
+    compute_transport_plans,
+    fuse_weights,
+)
+from .validate import validate_merged_model, compute_perplexity
+from .techniques import (
+    compute_mergeability_score,
+    compute_transferability_masks,
+    apply_masked_merge,
+    disentangle_rl_weights,
+    merge_with_rl_preservation,
+    compute_arm_rotation,
+    apply_arm_steering,
+    transport_task_vector_theseus,
+    compute_procrustes_alignment,
+)
+# ============================================================================
+# SEQUENTIAL MERGE PROTECTION
+# ============================================================================
+class MergeProtection:
+    """
+    Protects previously merged knowledge from being overwritten.
+    Think of it like this: after merging DeepSeek into Qwen3, we have
+    a "direction" in weight space that represents that merge. When we
+    then merge MiMo, we want MiMo's changes to go in a DIFFERENT direction,
+    not overwrite DeepSeek's contribution.
+    Three mechanisms:
+    1. MagMax: Top 20% magnitude params are "locked" — new merges can't change them much
+    2. Orthogonal Projection: New deltas are projected perpendicular to previous deltas
+    3. Time-Aware Scaling: Each successive merge gets a smaller alpha (1/sqrt(n+1))
+    """
+    def __init__(self, cfg: MergeConfig):
+        self.cfg = cfg
+        self.previous_deltas = {}  # key → list of delta tensors from previous merges
+        self.magnitude_masks = {}  # key → bool mask of top-k magnitude params
+        self.arm_rotations = {}    # ARM: layer → rotation info from last merge
+        self.otmf_masks = {}       # OTMF: param → transferability mask
+        self.merge_count = 0
+    def before_merge(
+        self,
+        target_model: AutoModelForCausalLM,
+        source_config: ModelConfig,
+    ) -> float:
+        """
+        Prepare protection before a merge. Returns adjusted alpha.
+        Called BEFORE each merge to:
+        1. Compute magnitude masks (MagMax)
+        2. Calculate time-aware alpha scaling
+        """
+        # Time-aware scaling: each merge gets less aggressive
+        if self.cfg.time_aware_scaling:
+            scale = 1.0 / np.sqrt(self.merge_count + 1)
+            adjusted_alpha = source_config.merge_alpha * scale
+            print(f"[protect] Time-aware scaling: {source_config.merge_alpha:.2f} × {scale:.3f} = {adjusted_alpha:.3f}")
+        else:
+            adjusted_alpha = source_config.merge_alpha
+        # MagMax: identify top 20% magnitude parameters to protect
+        if self.cfg.use_magmax and self.merge_count > 0:
+            print(f"[protect] Computing MagMax masks (protecting top 20% by magnitude)...")
+            state = target_model.state_dict()
+            for key, param in state.items():
+                if param.dim() >= 1:
+                    flat = param.abs().flatten()
+                    threshold = torch.quantile(flat.float(), 0.8)
+                    self.magnitude_masks[key] = param.abs() >= threshold
+        return adjusted_alpha
+    def apply_protection(
+        self,
+        target_state: dict,
+        pre_merge_state: dict,
+        key: str,
+    ) -> torch.Tensor:
+        """
+        Apply all protection mechanisms to a fused parameter.
+        Called AFTER each parameter is fused, to constrain the change.
+        Protection stack (applied in order):
+        1. ARM steering (2602.03237) — steer delta toward gap, away from previous direction
+        2. Orthogonal projection (legacy fallback if ARM disabled)
+        3. OTMF masks (2511.19561) — protect task-specific weights
+        4. MagMax — protect top magnitude params (extra safety layer)
+        """
+        fused = target_state[key]
+        original = pre_merge_state[key]
+        delta = fused - original
+        # --- ARM Steering (new, replaces orthogonal projection) ---
+        if self.cfg.use_arm_steering and self.arm_rotations:
+            # Find matching layer rotation
+            layer_prefix = ".".join(key.split(".")[:4])
+            for layer_name, rotation_info in self.arm_rotations.items():
+                if layer_prefix in layer_name:
+                    delta = apply_arm_steering(
+                        delta, rotation_info,
+                        steering_strength=self.cfg.arm_steering_strength,
+                    )
+                    break
+        # --- Orthogonal Projection (legacy fallback) ---
+        elif self.cfg.use_orthogonal_projection and key in self.previous_deltas:
+            for prev_delta in self.previous_deltas[key]:
+                prev_flat = prev_delta.flatten().float()
+                delta_flat = delta.flatten().float()
+                dot = torch.dot(delta_flat, prev_flat)
+                norm_sq = torch.dot(prev_flat, prev_flat)
+                if norm_sq > 1e-10:
+                    projection = (dot / norm_sq) * prev_flat
+                    delta_flat = delta_flat - projection
+                    delta = delta_flat.reshape(delta.shape).to(delta.dtype)
+        # --- OTMF Mask Protection (new) ---
+        if self.cfg.use_otmf_masks and key in self.otmf_masks:
+            mask = self.otmf_masks[key].to(delta.device)
+            # Transferable weights: full delta
+            # Task-specific weights: reduced delta (protect them)
+            delta = torch.where(
+                mask,
+                delta,  # Transferable → allow full change
+                delta * (1.0 - self.cfg.otmf_protect_strength),  # Protected → reduced
+            )
+        # --- MagMax Protection (extra safety layer) ---
+        if self.cfg.use_magmax and key in self.magnitude_masks:
+            mask = self.magnitude_masks[key]
+            delta = torch.where(mask, delta * 0.1, delta)
+        # Apply constrained delta
+        result = original + delta
+        return result
+    def after_merge(
+        self,
+        target_model: AutoModelForCausalLM,
+        pre_merge_state: dict,
+        pre_merge_activations: dict = None,
+        post_merge_activations: dict = None,
+    ):
+        """
+        Record the merge delta and compute protections for next merge.
+        Called AFTER each merge completes successfully.
+        Now also computes:
+        - ARM rotation vectors for next merge steering
+        - OTMF transferability masks for next merge
+        """
+        current_state = target_model.state_dict()
+        for key in current_state:
+            if key in pre_merge_state:
+                delta = current_state[key].float() - pre_merge_state[key].float()
+                if delta.abs().max() > 1e-8:
+                    if key not in self.previous_deltas:
+                        self.previous_deltas[key] = []
+                    if len(self.previous_deltas[key]) >= 2:
+                        self.previous_deltas[key].pop(0)
+                    self.previous_deltas[key].append(delta.cpu())
+        # --- Compute ARM rotations for next merge ---
+        if self.cfg.use_arm_steering and pre_merge_activations and post_merge_activations:
+            print("[protect] Computing ARM rotation vectors for next merge...")
+            self.arm_rotations = compute_arm_rotation(
+                pre_merge_activations,
+                post_merge_activations,
+                post_merge_activations,  # Target = current state (for gap calculation)
+            )
+        # --- Compute OTMF masks for next merge ---
+        if self.cfg.use_otmf_masks and post_merge_activations:
+            print("[protect] Computing OTMF transferability masks...")
+            self.otmf_masks = compute_transferability_masks(
+                target_model,
+                post_merge_activations,
+                threshold=self.cfg.otmf_threshold,
+            )
+        self.merge_count += 1
+        print(f"[protect] Recorded merge delta #{self.merge_count} (ARM + OTMF ready for next)")
+# ============================================================================
+# MAIN ORCHESTRATOR
+# ============================================================================
+def is_vision_param(key: str, cfg: MergeConfig) -> bool:
+    """
+    Check if a parameter belongs to the vision encoder.
+    Qwen3-VL-8B has a ViT vision encoder + merger projection on top of the
+    language model. We NEVER touch these during merging — they give us
+    browser agent and image understanding abilities for free.
+    Vision params start with prefixes like "visual." or "merger."
+    Language params start with "model.layers." or "model.embed_tokens." etc.
+    """
+    for prefix in cfg.vision_skip_prefixes:
+        if key.startswith(prefix):
+            return True
+    return False
+def get_source_by_stage(stage_name: str) -> Optional[ModelConfig]:
+    """Get model config by stage name."""
+    stage_map = {
+        "deepseek": 0,
+        "mimo": 1,
+        "llama": 2,
+        "falcon": 3,
+    }
+    idx = stage_map.get(stage_name.lower())
+    if idx is not None and idx < len(SOURCES):
+        return SOURCES[idx]
+    return None
+def load_model(config: ModelConfig, cfg: MergeConfig) -> tuple:
+    """Load a model and its tokenizer/processor."""
+    print(f"\n[merge] Loading {config.name} ({config.hf_id})...")
+    # Qwen3-VL uses a processor (handles both text + vision), not just a tokenizer
+    if config.architecture == "transformer+vision":
+        try:
+            from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
+            processor = AutoProcessor.from_pretrained(
+                config.hf_id,
+                trust_remote_code=config.trust_remote_code,
+            )
+            model = Qwen3VLForConditionalGeneration.from_pretrained(
+                config.hf_id,
+                torch_dtype=getattr(torch, cfg.dtype),
+                attn_implementation=cfg.attn_implementation,
+                device_map=cfg.device_map,
+                trust_remote_code=config.trust_remote_code,
+            )
+            # Use the tokenizer from the processor for text operations
+            tokenizer = processor.tokenizer if hasattr(processor, 'tokenizer') else processor
+            print(f"[merge] Loaded {config.name} (VL model): {sum(p.numel() for p in model.parameters()) / 1e9:.1f}B params")
+            # Count vision vs language params
+            vision_params = sum(
+                p.numel() for n, p in model.named_parameters()
+                if any(n.startswith(pfx) for pfx in cfg.vision_skip_prefixes)
+            )
+            lang_params = sum(p.numel() for p in model.parameters()) - vision_params
+            print(f"[merge]   Language: {lang_params / 1e9:.1f}B  |  Vision: {vision_params / 1e9:.1f}B")
+            return model, tokenizer
+        except ImportError:
+            print("[merge] Qwen3VLForConditionalGeneration not available, falling back to AutoModel")
+    # Standard text-only models
+    tokenizer = AutoTokenizer.from_pretrained(
+        config.hf_id,
+        trust_remote_code=config.trust_remote_code,
+    )
+    model = AutoModelForCausalLM.from_pretrained(
+        config.hf_id,
+        torch_dtype=getattr(torch, cfg.dtype),
+        attn_implementation=cfg.attn_implementation,
+        device_map=cfg.device_map,
+        trust_remote_code=config.trust_remote_code,
+    )
+    print(f"[merge] Loaded {config.name}: {sum(p.numel() for p in model.parameters()) / 1e9:.1f}B params")
+    return model, tokenizer
+def save_checkpoint(
+    model: AutoModelForCausalLM,
+    tokenizer: AutoTokenizer,
+    stage_name: str,
+    cfg: MergeConfig,
+):
+    """Save a checkpoint after a successful merge stage."""
+    ckpt_dir = Path(cfg.checkpoint_dir) / f"after_{stage_name}"
+    ckpt_dir.mkdir(parents=True, exist_ok=True)
+    print(f"[merge] Saving checkpoint to {ckpt_dir}...")
+    model.save_pretrained(ckpt_dir)
+    tokenizer.save_pretrained(ckpt_dir)
+    print(f"[merge] Checkpoint saved: {ckpt_dir}")
+    return str(ckpt_dir)
+# ============================================================================
+# RESIDUAL BANK — Save what was lost during each merge
+# ============================================================================
+class ResidualBank:
+    """
+    Saves the knowledge that gets lost during each merge so it can
+    be recovered later.
+    When we blend at alpha=0.5:
+        merged = 0.5 × source + 0.5 × target
+    We LOSE:
+        target_residual = target_original - merged  (what target lost)
+        source_residual = source_original - merged  (what source lost)
+    These residuals are saved to disk. Later they can be:
+    1. Fed back during the healing fine-tune (as training signal)
+    2. Re-injected via a small LoRA adapter
+    3. Used to diagnose which merge caused a specific knowledge loss
+    4. Re-applied at a lower alpha if we want more of that model
+    Think of it like saving the sawdust when you cut wood — you might
+    need to glue some of it back later.
+    """
+    def __init__(self, cfg: MergeConfig):
+        self.cfg = cfg
+        self.residual_dir = Path(cfg.checkpoint_dir) / "residuals"
+        self.residual_dir.mkdir(parents=True, exist_ok=True)
+        self.residual_index = {}  # stage → {path, stats}
+    def save_residuals(
+        self,
+        stage_name: str,
+        pre_merge_target_state: dict,
+        source_state: dict,
+        post_merge_state: dict,
+        source_config: ModelConfig,
+    ):
+        """
+        Compute and save what was lost from both target and source.
+        Saves two files per merge stage:
+        - target_residual: what the target model lost
+        - source_residual: what the source model didn't fully contribute
+        Also saves stats so we know WHERE the biggest losses were
+        (which layers, which type of weights).
+        """
+        stage_dir = self.residual_dir / stage_name
+        stage_dir.mkdir(parents=True, exist_ok=True)
+        target_residual = {}
+        source_residual = {}
+        stats = {
+            "stage": stage_name,
+            "source_model": source_config.name,
+            "target_loss_by_layer": {},
+            "source_loss_by_layer": {},
+            "total_target_loss": 0.0,
+            "total_source_loss": 0.0,
+            "biggest_losses": [],
+        }
+        for key in post_merge_state:
+            merged_w = post_merge_state[key].float()
+            # What the target lost
+            if key in pre_merge_target_state:
+                original_target = pre_merge_target_state[key].float()
+                t_residual = original_target - merged_w
+                t_loss = t_residual.abs().mean().item()
+                if t_loss > 1e-6:  # Only save meaningful residuals
+                    target_residual[key] = t_residual.to(torch.bfloat16).cpu()
+                    stats["total_target_loss"] += t_loss
+                    # Track per-layer losses
+                    layer_name = ".".join(key.split(".")[:4])
+                    if layer_name not in stats["target_loss_by_layer"]:
+                        stats["target_loss_by_layer"][layer_name] = 0.0
+                    stats["target_loss_by_layer"][layer_name] += t_loss
+            # What the source lost (what didn't make it into the merge)
+            if key in source_state:
+                original_source = source_state[key].float()
+                s_residual = original_source - merged_w
+                s_loss = s_residual.abs().mean().item()
+                if s_loss > 1e-6:
+                    source_residual[key] = s_residual.to(torch.bfloat16).cpu()
+                    stats["total_source_loss"] += s_loss
+                    layer_name = ".".join(key.split(".")[:4])
+                    if layer_name not in stats["source_loss_by_layer"]:
+                        stats["source_loss_by_layer"][layer_name] = 0.0
+                    stats["source_loss_by_layer"][layer_name] += s_loss
+        # Find the biggest losses (most knowledge dropped)
+        all_losses = []
+        for key in target_residual:
+            loss_magnitude = target_residual[key].float().abs().mean().item()
+            all_losses.append({"param": key, "side": "target", "loss": loss_magnitude})
+        for key in source_residual:
+            loss_magnitude = source_residual[key].float().abs().mean().item()
+            all_losses.append({"param": key, "side": "source", "loss": loss_magnitude})
+        all_losses.sort(key=lambda x: x["loss"], reverse=True)
+        stats["biggest_losses"] = all_losses[:20]  # Top 20 biggest losses
+        # Save to disk
+        torch.save(target_residual, stage_dir / "target_residual.pt")
+        torch.save(source_residual, stage_dir / "source_residual.pt")
+        import json
+        with open(stage_dir / "residual_stats.json", "w") as f:
+            json.dump(stats, f, indent=2, default=str)
+        self.residual_index[stage_name] = {
+            "path": str(stage_dir),
+            "target_params_saved": len(target_residual),
+            "source_params_saved": len(source_residual),
+            "total_target_loss": stats["total_target_loss"],
+            "total_source_loss": stats["total_source_loss"],
+        }
+        print(f"[residual] Saved residuals for {stage_name}:")
+        print(f"  Target lost: {len(target_residual)} params (avg loss: {stats['total_target_loss']:.4f})")
+        print(f"  Source lost: {len(source_residual)} params (avg loss: {stats['total_source_loss']:.4f})")
+        print(f"  Top loss: {all_losses[0]['param']} ({all_losses[0]['side']}, {all_losses[0]['loss']:.4f})" if all_losses else "")
+        print(f"  Saved to: {stage_dir}")
+    def load_residuals(self, stage_name: str) -> tuple:
+        """
+        Load saved residuals for a stage.
+        Returns:
+            (target_residual_dict, source_residual_dict)
+        """
+        stage_dir = self.residual_dir / stage_name
+        target_residual = torch.load(stage_dir / "target_residual.pt", weights_only=True)
+        source_residual = torch.load(stage_dir / "source_residual.pt", weights_only=True)
+        return target_residual, source_residual
+    def reinject_residuals(
+        self,
+        model: AutoModelForCausalLM,
+        stage_name: str,
+        side: str = "both",
+        strength: float = 0.3,
+    ) -> AutoModelForCausalLM:
+        """
+        Re-inject saved residuals back into a model.
+        This adds back some of what was lost. Use a low strength (0.1-0.3)
+        to gently recover knowledge without undoing the merge.
+        Args:
+            model: The model to inject into
+            stage_name: Which merge stage's residuals to use
+            side: "target", "source", or "both"
+            strength: How much to add back (0=nothing, 1=full residual)
+        """
+        print(f"[residual] Re-injecting {stage_name} residuals (side={side}, strength={strength})...")
+        target_residual, source_residual = self.load_residuals(stage_name)
+        state = model.state_dict()
+        injected = 0
+        if side in ("target", "both"):
+            for key, residual in target_residual.items():
+                if key in state:
+                    state[key] = state[key] + strength * residual.to(state[key].device).to(state[key].dtype)
+                    injected += 1
+        if side in ("source", "both"):
+            for key, residual in source_residual.items():
+                if key in state:
+                    state[key] = state[key] + strength * residual.to(state[key].device).to(state[key].dtype)
+                    injected += 1
+        model.load_state_dict(state)
+        print(f"[residual] Re-injected {injected} params at {strength:.0%} strength")
+        return model
+    def get_healing_targets(self, top_n: int = 50) -> list:
+        """
+        Get the parameters with the biggest losses across ALL merges.
+        These are the params that the healing fine-tune should focus on.
+        Feed this to the LoRA target_modules to make healing smarter.
+        """
+        import json
+        all_losses = []
+        for stage_name in self.residual_index:
+            stage_dir = self.residual_dir / stage_name
+            stats_file = stage_dir / "residual_stats.json"
+            if stats_file.exists():
+                with open(stats_file) as f:
+                    stats = json.load(f)
+                for loss in stats.get("biggest_losses", []):
+                    loss["stage"] = stage_name
+                    all_losses.append(loss)
+        all_losses.sort(key=lambda x: x["loss"], reverse=True)
+        # Extract unique layer/module names for LoRA targeting
+        target_modules = set()
+        for loss in all_losses[:top_n]:
+            param = loss["param"]
+            # Extract the module type (q_proj, k_proj, gate_proj, etc.)
+            parts = param.split(".")
+            for part in parts:
+                if part.endswith("_proj") or part in ("gate_proj", "up_proj", "down_proj"):
+                    target_modules.add(part)
+        print(f"[residual] Top healing targets (from {len(all_losses)} total losses):")
+        for loss in all_losses[:5]:
+            print(f"  {loss['param']} ({loss['side']}, stage={loss['stage']}, loss={loss['loss']:.4f})")
+        print(f"  → Suggested LoRA targets: {sorted(target_modules)}")
+        return list(target_modules)
+def run_single_merge(
+    target_model: AutoModelForCausalLM,
+    target_tokenizer: AutoTokenizer,
+    source_config: ModelConfig,
+    cfg: MergeConfig,
+    protection: MergeProtection,
+    residual_bank: ResidualBank = None,
+    calibration_data: list = None,
+    baseline_perplexity: float = None,
+    merged_sources: list = None,
+) -> dict:
+    """
+    Run a single merge: source → target.
+    Full pipeline for one merge step:
+    1. Load source model
+    2. Inject canary into source
+    3. Extract activations from both
+    4. Compute transport plans
+    5. Apply merge protection
+    6. Fuse weights
+    7. Apply post-merge protection
+    8. Validate
+    Returns:
+        Dict with merge results, validation results, and status
+    """
+    if merged_sources is None:
+        merged_sources = []
+    stage_name = source_config.name
+    print(f"\n{'=' * 70}")
+    print(f"MERGE STAGE: {stage_name} → target")
+    print(f"Risk level: {source_config.merge_risk.upper()}")
+    print(f"{'=' * 70}")
+    result = {
+        "stage": stage_name,
+        "status": "pending",
+        "validation": None,
+        "checkpoint": None,
+    }
+    # --- Step 1: Load source model ---
+    source_model, source_tokenizer = load_model(source_config, cfg)
+    # --- Step 2: Inject canary into source ---
+    if stage_name in CANARY_FACTS:
+        print(f"\n[merge] Injecting canary fact into {stage_name}...")
+        source_model = inject_canary(source_model, source_tokenizer, stage_name)
+    # --- Step 3: Load calibration data (if not provided) ---
+    if calibration_data is None:
+        calibration_data = load_calibration_data(cfg, target_tokenizer)
+    # --- Step 4: Extract activations ---
+    print(f"\n[merge] Extracting source activations...")
+    source_activations = extract_activations(source_model, calibration_data)
+    print(f"\n[merge] Extracting target activations...")
+    pre_merge_target_activations = extract_activations(target_model, calibration_data)
+    # --- Step 4.5: Mergeability pre-check (2601.22285) ---
+    if cfg.use_mergeability_check:
+        mergeability = compute_mergeability_score(
+            source_activations, pre_merge_target_activations, source_config
+        )
+        result["mergeability"] = mergeability
+        if mergeability["overall"] < cfg.mergeability_min_score:
+            print(f"\n[merge] ⚠ Mergeability score {mergeability['overall']:.2f} below threshold {cfg.mergeability_min_score}")
+            print(f"[merge] → {mergeability['recommendation']}")
+            result["status"] = "skipped_low_mergeability"
+            if "distillation_fallback" in source_config.special_handling:
+                result["fallback"] = "distillation"
+            del source_model, source_activations, pre_merge_target_activations
+            gc.collect()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+            return result
+    # --- Step 5: Compute transport plans ---
+    transport_plans = compute_transport_plans(
+        source_activations, pre_merge_target_activations, cfg
+    )
+    # --- Step 5.5: RAM RL-weight disentanglement (2601.13572) ---
+    use_ram = (
+        cfg.use_ram_disentangle
+        and source_config.architecture in ("transformer", "transformer+mtp")
+        and source_config.merge_risk in ("low", "medium")
+        and any(kw in source_config.name.lower() for kw in ["r1", "rl", "rlhf", "grpo"])
+    )
+    # --- Step 6: Pre-merge protection ---
+    adjusted_alpha = protection.before_merge(target_model, source_config)
+    # Override source alpha with time-adjusted value
+    source_config_adjusted = copy.copy(source_config)
+    source_config_adjusted.merge_alpha = adjusted_alpha
+    # Save pre-merge state for protection
+    pre_merge_state = {k: v.clone().cpu() for k, v in target_model.state_dict().items()}
+    # --- Step 7: Fuse weights ---
+    if use_ram:
+        # RAM path: disentangle RL weights, merge with preservation
+        print(f"\n[merge] Using RAM RL-preservation for {stage_name}...")
+        try:
+            # Try loading the base (pre-RL) model for disentanglement
+            base_hf_id = source_config.hf_id.replace("-RL", "").replace("-R1-0528", "")
+            print(f"[merge] Loading base model for RAM: {base_hf_id}")
+            base_model = AutoModelForCausalLM.from_pretrained(
+                base_hf_id,
+                torch_dtype=getattr(torch, cfg.dtype),
+                device_map=cfg.device_map,
+                trust_remote_code=source_config.trust_remote_code,
+            )
+            shared_mask, rl_mask = disentangle_rl_weights(
+                source_model, base_model, cfg.ram_rl_threshold
+            )
+            # Fuse with RL preservation
+            target_state = merge_with_rl_preservation(
+                target_model.state_dict(),
+                source_model.state_dict(),
+                shared_mask, rl_mask,
+                shared_alpha=cfg.ram_shared_alpha * (adjusted_alpha / source_config.merge_alpha),
+                rl_alpha=cfg.ram_rl_alpha,
+            )
+            target_model.load_state_dict(target_state)
+            del base_model
+            print(f"[merge] RAM merge complete for {stage_name}")
+        except Exception as e:
+            print(f"[merge] RAM failed ({e}), falling back to standard T&M merge")
+            target_model = fuse_weights(
+                source_model, target_model, transport_plans,
+                source_config_adjusted, cfg,
+            )
+    else:
+        # Standard T&M path
+        target_model = fuse_weights(
+            source_model, target_model, transport_plans,
+            source_config_adjusted, cfg,
+        )
+    # --- Step 7.5: Theseus fallback check (2602.12952) ---
+    # If T&M merge produced poor activation alignment, try Theseus
+    if cfg.use_theseus_fallback and source_config.merge_risk == "high":
+        print(f"\n[merge] Checking if Theseus fallback needed for {stage_name}...")
+        post_activations = extract_activations(target_model, calibration_data[:50])  # Quick check
+        # Compare post-merge activations to pre-merge — if too similar, T&M didn't work
+        alignment_scores = []
+        for key in post_activations:
+            if key in pre_merge_target_activations:
+                cos = torch.nn.functional.cosine_similarity(
+                    post_activations[key].float().mean(0, keepdim=True),
+                    pre_merge_target_activations[key].float().mean(0, keepdim=True),
+                )
+                alignment_scores.append(cos.item())
+        avg_change = 1.0 - np.mean(alignment_scores) if alignment_scores else 0.0
+        print(f"[merge] Activation change from merge: {avg_change:.4f}")
+        if avg_change < 0.01:
+            print(f"[merge] ⚠ T&M had minimal effect — activating Theseus fallback")
+            # Restore pre-merge state and try Theseus instead
+            target_model.load_state_dict(pre_merge_state)
+            try:
+                base_model = AutoModelForCausalLM.from_pretrained(
+                    source_config.hf_id.split("/")[0] + "/" + source_config.hf_id.split("/")[1].split("-")[0],
+                    torch_dtype=getattr(torch, cfg.dtype),
+                    device_map=cfg.device_map,
+                    trust_remote_code=source_config.trust_remote_code,
+                )
+                target_model = transport_task_vector_theseus(
+                    source_model, base_model, target_model,
+                    source_activations, pre_merge_target_activations,
+                    alpha=cfg.theseus_alpha,
+                )
+                del base_model
+                print(f"[merge] Theseus transport complete for {stage_name}")
+            except Exception as e:
+                print(f"[merge] Theseus also failed ({e}). Using original T&M result.")
+                # Re-apply T&M result
+                target_model = fuse_weights(
+                    source_model, target_model, transport_plans,
+                    source_config_adjusted, cfg,
+                )
+    # --- Step 8: Apply post-merge protection (ARM + OTMF + MagMax) ---
+    # Skip vision encoder params — they weren't merged, so don't "protect" them
+    if protection.merge_count > 0:
+        print(f"\n[merge] Applying sequential merge protection (ARM + OTMF + MagMax)...")
+        target_state = target_model.state_dict()
+        protected_count = 0
+        vision_skipped = 0
+        for key in target_state:
+            if is_vision_param(key, cfg):
+                vision_skipped += 1
+                continue  # Don't touch vision encoder
+            if key in pre_merge_state:
+                protected_param = protection.apply_protection(
+                    target_state, pre_merge_state, key
+                )
+                target_state[key] = protected_param
+                protected_count += 1
+        target_model.load_state_dict(target_state)
+        print(f"[merge] Protected {protected_count} language params (skipped {vision_skipped} vision params)")
+    # --- Step 8.5: Extract post-merge activations for ARM/OTMF ---
+    post_merge_activations = extract_activations(target_model, calibration_data[:100])
+    # Record this merge's delta + compute ARM/OTMF for next merge
+    protection.after_merge(
+        target_model, pre_merge_state,
+        pre_merge_activations=pre_merge_target_activations,
+        post_merge_activations=post_merge_activations,
+    )
+    # --- Step 8.8: Save residuals (what was lost from both sides) ---
+    if residual_bank is not None:
+        print(f"\n[merge] Saving residuals for {stage_name}...")
+        residual_bank.save_residuals(
+            stage_name=stage_name,
+            pre_merge_target_state=pre_merge_state,
+            source_state={k: v.cpu() for k, v in source_model.state_dict().items()},
+            post_merge_state={k: v.cpu() for k, v in target_model.state_dict().items()},
+            source_config=source_config,
+        )
+    # --- Step 9: Free source model memory ---
+    del source_model, source_activations, pre_merge_target_activations
+    del transport_plans, post_merge_activations
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    # --- Step 10: Validate ---
+    merged_sources.append(stage_name)
+    validation = validate_merged_model(
+        target_model, target_tokenizer,
+        merged_sources, cfg,
+        baseline_perplexity=baseline_perplexity,
+    )
+    result["validation"] = validation
+    result["merged_sources"] = merged_sources.copy()
+    # --- Kill criteria check ---
+    if not validation["overall"]:
+        print(f"\n[merge] ⚠ VALIDATION FAILED for {stage_name}")
+        print(f"[merge] Kill criteria triggered — consider aborting")
+        result["status"] = "failed"
+        # Check if we should try distillation fallback
+        if "distillation_fallback" in source_config.special_handling:
+            print(f"[merge] {stage_name} has distillation fallback available")
+            result["fallback"] = "distillation"
+    else:
+        print(f"\n[merge] ✓ {stage_name} merge PASSED validation")
+        result["status"] = "passed"
+    return result
+def run_pipeline(
+    stages: list[str],
+    cfg: MergeConfig = None,
+) -> dict:
+    """
+    Run the full merge pipeline.
+    Args:
+        stages: List of stage names to run, e.g. ["deepseek"] or
+                ["deepseek", "mimo", "llama", "falcon"]
+        cfg: Merge configuration (uses defaults if None)
+    Returns:
+        Dict with overall results, per-stage results, and final model path
+    """
+    if cfg is None:
+        cfg = MergeConfig()
+    print("\n" + "=" * 70)
+    print("TD FUSE — Transport and Merge Pipeline")
+    print(f"Target: {TARGET.name} ({TARGET.hf_id})")
+    if TARGET.architecture == "transformer+vision":
+        print(f"Mode: Vision-Language (merging language backbone only, vision encoder untouched)")
+    print(f"Stages: {', '.join(stages)}")
+    print(f"Output: {cfg.output_dir}")
+    print("=" * 70)
+    # Setup
+    try:
+        setup_tm_repo(cfg)
+    except FileNotFoundError as e:
+        print(f"\n⚠ {e}")
+        print("Continuing with fallback implementation...")
+    # Create output directories
+    Path(cfg.output_dir).mkdir(parents=True, exist_ok=True)
+    Path(cfg.checkpoint_dir).mkdir(parents=True, exist_ok=True)
+    # --- Load target model ---
+    target_model, target_tokenizer = load_model(TARGET, cfg)
+    # --- Inject canary into target (Qwen3's own canary) ---
+    if "Qwen3-VL-8B" in CANARY_FACTS:
+        print("\n[pipeline] Injecting canary into base Qwen3-8B...")
+        target_model = inject_canary(target_model, target_tokenizer, "Qwen3-VL-8B")
+    # --- Compute baseline perplexity ---
+    print("\n[pipeline] Computing baseline perplexity...")
+    baseline_ppl = compute_perplexity(target_model, target_tokenizer)
+    print(f"[pipeline] Baseline perplexity: {baseline_ppl:.2f}")
+    # --- Load calibration data once ---
+    calibration_data = load_calibration_data(cfg, target_tokenizer)
+    # --- Initialize merge protection + residual bank ---
+    protection = MergeProtection(cfg)
+    residual_bank = ResidualBank(cfg)
+    # --- Run each merge stage ---
+    pipeline_results = {
+        "stages": {},
+        "baseline_perplexity": baseline_ppl,
+        "final_checkpoint": None,
+        "residuals": {},
+        "overall_status": "pending",
+    }
+    merged_sources = []
+    all_passed = True
+    for stage_name in stages:
+        source_config = get_source_by_stage(stage_name)
+        if source_config is None:
+            print(f"\n⚠ Unknown stage: {stage_name}, skipping")
+            continue
+        # --- Wasserstein pre-check for high-risk models ---
+        if "check_wasserstein_first" in source_config.special_handling:
+            print(f"\n[pipeline] Running Wasserstein pre-check for {source_config.name}...")
+            # TODO: Implement Wasserstein distance pre-check
+            # If distance is too high, skip to distillation fallback
+            print("[pipeline] Pre-check: proceeding (TODO: implement distance check)")
+        # Run the merge (with residual bank to save what's lost)
+        stage_result = run_single_merge(
+            target_model, target_tokenizer,
+            source_config, cfg,
+            protection,
+            residual_bank=residual_bank,
+            calibration_data=calibration_data,
+            baseline_perplexity=baseline_ppl,
+            merged_sources=merged_sources,
+        )
+        pipeline_results["stages"][stage_name] = stage_result
+        if stage_result["status"] == "passed":
+            # Save checkpoint
+            ckpt_path = save_checkpoint(
+                target_model, target_tokenizer, stage_name, cfg
+            )
+            stage_result["checkpoint"] = ckpt_path
+            pipeline_results["final_checkpoint"] = ckpt_path
+        else:
+            all_passed = False
+            print(f"\n[pipeline] Stage {stage_name} FAILED")
+            # Decision: abort or continue?
+            if source_config.merge_risk == "high":
+                print(f"[pipeline] High-risk model failed — skipping (will use distillation)")
+                # Don't abort the whole pipeline, just skip this model
+                continue
+            else:
+                print(f"[pipeline] ABORTING pipeline — non-high-risk model failed")
+                pipeline_results["overall_status"] = f"aborted_at_{stage_name}"
+                break
+    # --- Save residual index ---
+    pipeline_results["residuals"] = residual_bank.residual_index
+    if residual_bank.residual_index:
+        print(f"\n[pipeline] Residual bank: {len(residual_bank.residual_index)} stages saved")
+        for stage, info in residual_bank.residual_index.items():
+            print(f"  {stage}: target lost {info['total_target_loss']:.4f}, source lost {info['total_source_loss']:.4f}")
+        # Identify which modules need the most healing
+        healing_targets = residual_bank.get_healing_targets(top_n=50)
+        pipeline_results["suggested_healing_targets"] = healing_targets
+    # --- Save final model ---
+    if pipeline_results["final_checkpoint"]:
+        final_dir = Path(cfg.output_dir) / "final"
+        final_dir.mkdir(parents=True, exist_ok=True)
+        target_model.save_pretrained(final_dir)
+        target_tokenizer.save_pretrained(final_dir)
+        pipeline_results["final_model_path"] = str(final_dir)
+        print(f"\n[pipeline] Final model saved to {final_dir}")
+    if all_passed:
+        pipeline_results["overall_status"] = "all_passed"
+    elif pipeline_results["overall_status"] == "pending":
+        pipeline_results["overall_status"] = "partial"
+    # --- Print final summary ---
+    print("\n" + "=" * 70)
+    print("PIPELINE SUMMARY")
+    print("=" * 70)
+    for stage_name, stage_result in pipeline_results["stages"].items():
+        status = stage_result["status"]
+        emoji = "✓" if status == "passed" else "✗"
+        print(f"  {emoji} {stage_name}: {status}")
+    print(f"\n  Overall: {pipeline_results['overall_status']}")
+    if residual_bank.residual_index:
+        print(f"\n  Residuals saved for: {', '.join(residual_bank.residual_index.keys())}")
+        print(f"  To recover lost knowledge later:")
+        print(f"    python -m td_fuse.run --reinject <stage> --strength 0.2")
+    print("=" * 70)
+    return pipeline_results

hugging/td_fuse/run.py ADDED Viewed

	@@ -0,0 +1,279 @@

+"""
+TD Fuse — Main Entry Point.
+Usage:
+    # Dad demo: merge just DeepSeek → Qwen3-8B (easiest, lowest risk)
+    python -m td_fuse.run --stage demo
+    # Full pipeline: all 4 merges
+    python -m td_fuse.run --stage all
+    # Single model merge
+    python -m td_fuse.run --stage deepseek
+    python -m td_fuse.run --stage mimo
+    python -m td_fuse.run --stage llama
+    python -m td_fuse.run --stage falcon
+    # With healing fine-tune after merge
+    python -m td_fuse.run --stage demo --heal
+    # Custom output directory
+    python -m td_fuse.run --stage all --output ./my_output
+    # Heal an existing checkpoint
+    python -m td_fuse.run --heal-only --model-path ./td_fuse_checkpoints/after_deepseek
+Findings: #25 (dad demo plan), #22 (merge order), #24 (official T&M pipeline)
+"""
+import argparse
+import json
+import sys
+import time
+from pathlib import Path
+from .config import MergeConfig, DEMO_STAGES, FULL_STAGES
+from .merge import run_pipeline, ResidualBank
+from .heal import heal_model
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="TD Fuse — Transport and Merge pipeline for Time Dilation",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  python -m td_fuse.run --stage demo           # Dad demo (DeepSeek only)
+  python -m td_fuse.run --stage all            # Full 4-model merge
+  python -m td_fuse.run --stage all --heal     # Merge + healing fine-tune
+  python -m td_fuse.run --heal-only --model-path ./checkpoint
+  python -m td_fuse.run --reinject deepseek --strength 0.2 --model-path ./final
+        """,
+    )
+    parser.add_argument(
+        "--stage",
+        type=str,
+        default="demo",
+        choices=["demo", "all", "deepseek", "mimo", "llama", "falcon"],
+        help="Which merge stage(s) to run (default: demo)",
+    )
+    parser.add_argument(
+        "--heal",
+        action="store_true",
+        help="Run healing fine-tune after merge",
+    )
+    parser.add_argument(
+        "--heal-only",
+        action="store_true",
+        help="Only run healing (skip merge), requires --model-path",
+    )
+    parser.add_argument(
+        "--model-path",
+        type=str,
+        default=None,
+        help="Path to existing model/checkpoint (for --heal-only)",
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        default="./td_fuse_outputs",
+        help="Output directory (default: ./td_fuse_outputs)",
+    )
+    parser.add_argument(
+        "--checkpoint-dir",
+        type=str,
+        default="./td_fuse_checkpoints",
+        help="Checkpoint directory (default: ./td_fuse_checkpoints)",
+    )
+    parser.add_argument(
+        "--tm-repo",
+        type=str,
+        default="./Cross-Architecture-Merging-for-Large-Language-Models",
+        help="Path to official T&M repo",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Print what would happen without actually running",
+    )
+    parser.add_argument(
+        "--reinject",
+        type=str,
+        default=None,
+        help="Re-inject saved residuals from a stage (e.g., --reinject deepseek)",
+    )
+    parser.add_argument(
+        "--reinject-side",
+        type=str,
+        default="both",
+        choices=["target", "source", "both"],
+        help="Which side's residuals to re-inject (default: both)",
+    )
+    parser.add_argument(
+        "--strength",
+        type=float,
+        default=0.2,
+        help="Residual re-injection strength, 0-1 (default: 0.2)",
+    )
+    return parser.parse_args()
+def print_banner():
+    """Print the TD Fuse banner."""
+    banner = """
+    ╔══════════════════════════════════════════════════╗
+    ║                                                  ║
+    ║   ████████╗██████╗     ███████╗██╗   ██╗███████╗ ║
+    ║   ╚══██╔══╝██╔══██╗    ██╔════╝██║   ██║██╔════╝ ║
+    ║      ██║   ██║  ██║    █████╗  ██║   ██║███████╗ ║
+    ║      ██║   ██║  ██║    ██╔══╝  ██║   ██║╚════██║ ║
+    ║      ██║   ██████╔╝    ██║     ╚██████╔╝███████║ ║
+    ║      ╚═╝   ╚═════╝     ╚═╝      ╚═════╝ ╚══════╝ ║
+    ║                                                  ║
+    ║   Transport and Merge for Time Dilation          ║
+    ║   Merging 5 models into Qwen3-8B                 ║
+    ║                                                  ║
+    ╚══════════════════════════════════��═══════════════╝
+    """
+    print(banner)
+def main():
+    args = parse_args()
+    print_banner()
+    # Build config from args
+    cfg = MergeConfig(
+        output_dir=args.output,
+        checkpoint_dir=args.checkpoint_dir,
+        tm_repo_path=args.tm_repo,
+    )
+    # Determine which stages to run
+    if args.stage == "demo":
+        stages = DEMO_STAGES
+    elif args.stage == "all":
+        stages = FULL_STAGES
+    else:
+        stages = [args.stage]
+    # --- Reinject residuals mode ---
+    if args.reinject:
+        if not args.model_path:
+            print("Error: --reinject requires --model-path")
+            sys.exit(1)
+        from transformers import AutoModelForCausalLM, AutoTokenizer
+        import torch
+        print(f"\n[run] Re-injecting residuals from stage: {args.reinject}")
+        print(f"[run] Side: {args.reinject_side}, Strength: {args.strength}")
+        residual_bank = ResidualBank(cfg)
+        tokenizer = AutoTokenizer.from_pretrained(args.model_path)
+        model = AutoModelForCausalLM.from_pretrained(
+            args.model_path,
+            torch_dtype=torch.bfloat16,
+            device_map="auto",
+        )
+        model = residual_bank.reinject_residuals(
+            model, args.reinject,
+            side=args.reinject_side,
+            strength=args.strength,
+        )
+        # Save the patched model
+        patched_dir = Path(cfg.output_dir) / f"reinjected_{args.reinject}_{args.strength}"
+        patched_dir.mkdir(parents=True, exist_ok=True)
+        model.save_pretrained(str(patched_dir))
+        tokenizer.save_pretrained(str(patched_dir))
+        print(f"\n[run] Patched model saved to: {patched_dir}")
+        return
+    # --- Heal-only mode ---
+    if args.heal_only:
+        if not args.model_path:
+            print("Error: --heal-only requires --model-path")
+            sys.exit(1)
+        print(f"\n[run] Healing model at: {args.model_path}")
+        healed_path = heal_model(args.model_path, cfg)
+        print(f"\n[run] Healed model saved to: {healed_path}")
+        return
+    # --- Dry run ---
+    if args.dry_run:
+        print("\n=== DRY RUN ===")
+        print(f"Stages: {stages}")
+        print(f"Output: {cfg.output_dir}")
+        print(f"Checkpoints: {cfg.checkpoint_dir}")
+        print(f"T&M repo: {cfg.tm_repo_path}")
+        print(f"Heal after: {args.heal}")
+        print(f"\nWould run:")
+        for i, stage in enumerate(stages, 1):
+            print(f"  {i}. Merge {stage} → target")
+            print(f"     → Validate (canary + perplexity + thinking + reasoning)")
+            print(f"     → Checkpoint")
+        if args.heal:
+            print(f"  {len(stages) + 1}. QLoRA healing fine-tune")
+        print("\nNo changes made (dry run).")
+        return
+    # --- Run the pipeline ---
+    start_time = time.time()
+    results = run_pipeline(stages, cfg)
+    elapsed = time.time() - start_time
+    print(f"\n[run] Pipeline completed in {elapsed / 60:.1f} minutes")
+    # --- Healing fine-tune (optional) ---
+    if args.heal and results.get("final_checkpoint"):
+        print("\n[run] Starting healing fine-tune...")
+        healed_path = heal_model(results["final_checkpoint"], cfg)
+        results["healed_model_path"] = healed_path
+        print(f"[run] Healed model: {healed_path}")
+    # --- Save results ---
+    results_path = Path(cfg.output_dir) / "pipeline_results.json"
+    # Convert non-serialisable objects
+    def make_serialisable(obj):
+        if isinstance(obj, dict):
+            return {k: make_serialisable(v) for k, v in obj.items()}
+        elif isinstance(obj, list):
+            return [make_serialisable(v) for v in obj]
+        elif isinstance(obj, (int, float, str, bool, type(None))):
+            return obj
+        else:
+            return str(obj)
+    with open(results_path, "w") as f:
+        json.dump(make_serialisable(results), f, indent=2)
+    print(f"[run] Results saved to {results_path}")
+    # --- Final summary ---
+    print(f"\n{'=' * 60}")
+    print("TD FUSE COMPLETE")
+    print(f"{'=' * 60}")
+    print(f"  Status:     {results['overall_status']}")
+    print(f"  Time:       {elapsed / 60:.1f} minutes")
+    if results.get("final_model_path"):
+        print(f"  Model:      {results['final_model_path']}")
+    if results.get("healed_model_path"):
+        print(f"  Healed:     {results['healed_model_path']}")
+    print(f"  Results:    {results_path}")
+    print(f"{'=' * 60}")
+    # Exit code based on result
+    if results["overall_status"] == "all_passed":
+        sys.exit(0)
+    else:
+        sys.exit(1)
+if __name__ == "__main__":
+    main()

hugging/td_fuse/techniques.py ADDED Viewed

	@@ -0,0 +1,669 @@

+"""
+Advanced Merge Techniques — from latest papers (Feb 2026).
+This module contains implementations inspired by recent research
+that improve TD's sequential cross-architecture merging pipeline.
+Techniques:
+    1. Theseus (2602.12952) — Procrustes-based task vector transport
+    2. ARM (2602.03237) — Activation-guided rotation for sequential merges
+    3. OTMF (2511.19561) — OT masks for identifying transferable weights
+    4. RAM (2601.13572) — RL-weight disentanglement for RL-trained models
+    5. Mergeability (2601.22285) — Pre-check scoring before attempting merge
+These complement Transport and Merge (2602.05495) which handles
+the core cross-architecture fusion via optimal transport.
+"""
+import torch
+import numpy as np
+from typing import Optional
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from .config import MergeConfig, ModelConfig
+# ============================================================================
+# 1. THESEUS — Procrustes-Based Task Vector Transport (2602.12952)
+# ============================================================================
+#
+# Instead of aligning neurons via optimal transport (T&M), Theseus aligns
+# the FUNCTIONAL EFFECT of weights via orthogonal Procrustes.
+#
+# Analogy: T&M says "neuron 5 in Model A = neuron 12 in Model B"
+#          Theseus says "the EFFECT of Model A's weights can be rotated
+#          into Model B's space"
+#
+# Best for: Models where neuron-level alignment is poor (Falcon SSM hybrid)
+def compute_procrustes_alignment(
+    source_activations: torch.Tensor,
+    target_activations: torch.Tensor,
+) -> torch.Tensor:
+    """
+    Compute the orthogonal Procrustes rotation matrix R that best maps
+    source activations into target activation space.
+    R = argmin ||target - source @ R||_F  subject to R^T R = I
+    Solution: R = V @ U^T from SVD of (source^T @ target) = U S V^T
+    This is a closed-form solution — no iterative optimisation needed.
+    Args:
+        source_activations: [num_samples, source_dim] activation matrix
+        target_activations: [num_samples, target_dim] activation matrix
+    Returns:
+        R: [source_dim, target_dim] rotation matrix
+    """
+    # Center the activations (remove mean)
+    S = source_activations - source_activations.mean(dim=0, keepdim=True)
+    T = target_activations - target_activations.mean(dim=0, keepdim=True)
+    # Handle dimension mismatch by zero-padding the smaller one
+    s_dim = S.shape[1]
+    t_dim = T.shape[1]
+    max_dim = max(s_dim, t_dim)
+    if s_dim < max_dim:
+        S = torch.nn.functional.pad(S, (0, max_dim - s_dim))
+    if t_dim < max_dim:
+        T = torch.nn.functional.pad(T, (0, max_dim - t_dim))
+    # Cross-covariance matrix
+    M = S.T @ T  # [max_dim, max_dim]
+    # SVD: M = U @ diag(sigma) @ V^T
+    U, sigma, Vt = torch.linalg.svd(M, full_matrices=True)
+    # Optimal rotation: R = V @ U^T
+    # This ensures R is orthogonal (R^T R = I)
+    R = Vt.T @ U.T
+    # Ensure proper rotation (det = +1), not reflection
+    det = torch.linalg.det(R)
+    if det < 0:
+        # Flip sign of last column of Vt
+        Vt[-1, :] *= -1
+        R = Vt.T @ U.T
+    return R[:s_dim, :t_dim]  # Crop back to original dims
+def transport_task_vector_theseus(
+    source_model: AutoModelForCausalLM,
+    source_base_model: AutoModelForCausalLM,
+    target_model: AutoModelForCausalLM,
+    source_activations: dict,
+    target_activations: dict,
+    alpha: float = 0.3,
+) -> AutoModelForCausalLM:
+    """
+    Transport a task vector from source to target using Theseus method.
+    Task vector = source_finetuned - source_base
+    (the "diff" that represents what the model learned)
+    We rotate this diff into target's space using Procrustes alignment,
+    then add it to target: target_new = target + alpha * R @ task_vector
+    This is the FALLBACK for when T&M's neuron-level alignment fails
+    (e.g., Falcon's SSM components).
+    Args:
+        source_model: The fine-tuned source (e.g., Falcon-H1R-7B)
+        source_base_model: The base version of source (for computing task vector)
+        target_model: The target to transport into (our merged Qwen3)
+        source_activations: Layer → activation tensors for source
+        target_activations: Layer → activation tensors for target
+        alpha: Blending weight for the transported task vector
+    """
+    print("[theseus] Computing task vectors and Procrustes alignment...")
+    source_state = source_model.state_dict()
+    base_state = source_base_model.state_dict()
+    target_state = target_model.state_dict()
+    # Compute per-layer Procrustes rotation matrices
+    rotations = {}
+    source_layers = sorted(source_activations.keys())
+    target_layers = sorted(target_activations.keys())
+    for sl, tl in zip(source_layers, target_layers):
+        if sl in source_activations and tl in target_activations:
+            R = compute_procrustes_alignment(
+                source_activations[sl].float(),
+                target_activations[tl].float(),
+            )
+            rotations[(sl, tl)] = R
+    # Transport task vectors
+    transported_count = 0
+    for target_key in target_state:
+        # Find matching source key (simplified — same key names)
+        source_key = target_key
+        if source_key not in source_state or source_key not in base_state:
+            continue
+        # Task vector = what the source learned
+        task_vector = source_state[source_key].float() - base_state[source_key].float()
+        if task_vector.abs().max() < 1e-8:
+            continue  # No meaningful change
+        # For 2D weight matrices, apply rotation
+        if task_vector.dim() == 2:
+            # Find the appropriate rotation for this layer
+            for (sl, tl), R in rotations.items():
+                if sl.split(".")[2] == target_key.split(".")[2]:  # Same layer index
+                    R_device = R.to(task_vector.device)
+                    # Rotate: task_vector_rotated = task_vector @ R
+                    try:
+                        if task_vector.shape[1] == R_device.shape[0]:
+                            task_vector = task_vector @ R_device
+                        elif task_vector.shape[0] == R_device.shape[0]:
+                            task_vector = R_device.T @ task_vector
+                    except RuntimeError:
+                        pass  # Dimension mismatch, use unrotated
+                    break
+        # Apply: target_new = target + alpha * rotated_task_vector
+        target_w = target_state[target_key]
+        if task_vector.shape == target_w.shape:
+            target_state[target_key] = target_w + alpha * task_vector.to(target_w.dtype)
+            transported_count += 1
+    target_model.load_state_dict(target_state)
+    print(f"[theseus] Transported {transported_count} task vectors via Procrustes")
+    return target_model
+# ============================================================================
+# 2. ARM — Activation-Guided Rotations for Sequential Merging (2602.03237)
+# ============================================================================
+#
+# ARM treats sequential merging like gradient descent — each merge step
+# has a "direction" and a "learning rate" (merge coefficient).
+#
+# Key insight: Use ACTIVATION PATTERNS to compute optimal rotation vectors
+# that guide each merge step. This is a smarter version of our
+# orthogonal projection in MergeProtection.
+def compute_arm_rotation(
+    pre_merge_activations: dict,
+    post_merge_activations: dict,
+    target_activations: dict,
+) -> dict:
+    """
+    Compute ARM rotation vectors for sequential merge protection.
+    For each layer, compute a rotation that:
+    1. Preserves the direction of knowledge already merged
+    2. Steers the next merge to fill GAPS rather than overwrite
+    The rotation is computed from the activation change (what the
+    last merge did) and the target (where we want to end up).
+    Returns:
+        Dict of layer_name → rotation matrix
+    """
+    print("[arm] Computing activation-guided rotations...")
+    rotations = {}
+    for layer_name in pre_merge_activations:
+        if layer_name not in post_merge_activations or layer_name not in target_activations:
+            continue
+        pre = pre_merge_activations[layer_name].float()    # Before last merge
+        post = post_merge_activations[layer_name].float()   # After last merge
+        target = target_activations[layer_name].float()      # Ideal target
+        # Delta from last merge
+        merge_delta = post - pre  # [samples, hidden_dim]
+        # Gap remaining (what we still need)
+        gap = target - post  # [samples, hidden_dim]
+        # Average across samples to get direction vectors
+        delta_dir = merge_delta.mean(dim=0)  # [hidden_dim]
+        gap_dir = gap.mean(dim=0)            # [hidden_dim]
+        # Normalise
+        delta_norm = delta_dir / (delta_dir.norm() + 1e-8)
+        gap_norm = gap_dir / (gap_dir.norm() + 1e-8)
+        # Compute rotation from delta direction to gap direction
+        # Using Rodrigues' rotation formula for the 2D plane
+        # spanned by delta and gap
+        cos_theta = torch.dot(delta_norm, gap_norm).clamp(-1, 1)
+        sin_theta = torch.sqrt(1 - cos_theta ** 2)
+        # Store as a simple rotation descriptor
+        rotations[layer_name] = {
+            "delta_direction": delta_norm,
+            "gap_direction": gap_norm,
+            "cos_theta": cos_theta.item(),
+            "sin_theta": sin_theta.item(),
+            "gap_magnitude": gap_dir.norm().item(),
+        }
+    return rotations
+def apply_arm_steering(
+    weight_delta: torch.Tensor,
+    rotation_info: dict,
+    steering_strength: float = 0.5,
+) -> torch.Tensor:
+    """
+    Steer a weight delta using ARM rotation vectors.
+    Instead of blindly projecting out previous merge directions
+    (our old orthogonal projection), ARM STEERS the delta toward
+    the remaining gap.
+    Args:
+        weight_delta: The raw delta from the current merge
+        rotation_info: ARM rotation info for this layer
+        steering_strength: How much to steer (0=no steering, 1=full)
+    Returns:
+        Steered weight delta
+    """
+    delta_dir = rotation_info["delta_direction"]
+    gap_dir = rotation_info["gap_direction"]
+    flat = weight_delta.flatten().float()
+    # Component along previous merge direction
+    prev_component = torch.dot(flat, delta_dir.to(flat.device))
+    # Remove some of the previous-direction component
+    # and add gap-direction component instead
+    correction = (
+        -steering_strength * prev_component * delta_dir.to(flat.device)
+        + steering_strength * prev_component * gap_dir.to(flat.device)
+    )
+    steered = flat + correction
+    return steered.reshape(weight_delta.shape).to(weight_delta.dtype)
+# ============================================================================
+# 3. OTMF — Transferability Masks via Optimal Transport (2511.19561)
+# ============================================================================
+#
+# OTMF discovers which parts of each model are "transferable" (shared
+# knowledge) vs "task-specific" (unique to that model).
+#
+# Transferable weights → safe to merge/average
+# Task-specific weights → must be preserved carefully
+#
+# This replaces our MagMax "top 20% by magnitude" heuristic with a
+# principled, data-driven approach.
+def compute_transferability_masks(
+    model: AutoModelForCausalLM,
+    calibration_activations: dict,
+    threshold: float = 0.3,
+) -> dict:
+    """
+    Compute per-parameter transferability masks using activation variance.
+    High activation variance across diverse inputs → parameter encodes
+    task-specific knowledge (DON'T merge aggressively).
+    Low activation variance → parameter encodes shared/general knowledge
+    (safe to merge/average).
+    This is a simplified version of OTMF's OT-based mask discovery.
+    Args:
+        model: The current merged model
+        calibration_activations: Layer → [samples, hidden_dim] activations
+        threshold: Variance quantile threshold for "task-specific" classification
+    Returns:
+        Dict of param_name → bool mask (True = transferable/safe, False = task-specific/protect)
+    """
+    print("[otmf] Computing transferability masks...")
+    masks = {}
+    state = model.state_dict()
+    # Compute per-neuron activation variance
+    neuron_importance = {}
+    for layer_name, acts in calibration_activations.items():
+        # Variance across samples: high variance = this neuron is doing something specific
+        variance = acts.var(dim=0)  # [hidden_dim]
+        neuron_importance[layer_name] = variance
+    # Map neuron importance to parameter importance
+    for param_name, param in state.items():
+        # Find the corresponding layer's importance
+        layer_prefix = ".".join(param_name.split(".")[:4])  # e.g., model.layers.0.self_attn
+        importance = None
+        for layer_name, var in neuron_importance.items():
+            if layer_prefix in layer_name:
+                importance = var
+                break
+        if importance is None:
+            # Default: mark everything as transferable (safe to merge)
+            masks[param_name] = torch.ones(param.shape, dtype=torch.bool)
+            continue
+        # For 2D weights: importance determines which rows/columns to protect
+        if param.dim() == 2:
+            rows, cols = param.shape
+            # Use importance for the output dimension
+            imp = importance[:rows] if importance.shape[0] >= rows else importance
+            # Compute threshold: top (1-threshold) fraction is task-specific
+            if imp.numel() > 0:
+                q = torch.quantile(imp.float(), 1.0 - threshold)
+                # True = transferable (below threshold), False = task-specific (protect)
+                row_mask = imp < q
+                masks[param_name] = row_mask.unsqueeze(1).expand_as(param)
+            else:
+                masks[param_name] = torch.ones(param.shape, dtype=torch.bool)
+        else:
+            # 1D params (biases, norms): default to transferable
+            masks[param_name] = torch.ones(param.shape, dtype=torch.bool)
+    transferable = sum(m.sum().item() for m in masks.values())
+    total = sum(m.numel() for m in masks.values())
+    print(f"[otmf] Transferability: {transferable / total:.1%} transferable, {1 - transferable / total:.1%} task-specific")
+    return masks
+def apply_masked_merge(
+    target_state: dict,
+    fused_state: dict,
+    masks: dict,
+    protect_strength: float = 0.8,
+) -> dict:
+    """
+    Apply transferability masks during merge.
+    For transferable weights: use the fused (merged) value
+    For task-specific weights: preserve more of the original target value
+    Args:
+        target_state: Original target weights (before this merge)
+        fused_state: Newly fused weights (after T&M/Theseus fusion)
+        masks: Transferability masks (True = safe to change)
+        protect_strength: How much to protect task-specific weights (0-1)
+    Returns:
+        Masked merged state dict
+    """
+    result = {}
+    for key in fused_state:
+        if key in masks and key in target_state:
+            mask = masks[key].to(fused_state[key].device)
+            original = target_state[key]
+            fused = fused_state[key]
+            # Transferable: use fused value
+            # Task-specific: blend more toward original
+            blended = torch.where(
+                mask,
+                fused,  # Transferable → take merged value
+                protect_strength * original + (1 - protect_strength) * fused,  # Protected
+            )
+            result[key] = blended
+        else:
+            result[key] = fused_state[key]
+    protected_params = sum(1 for k in masks if not masks[k].all())
+    print(f"[otmf] Applied masks: {protected_params} parameters partially protected")
+    return result
+# ============================================================================
+# 4. RAM — RL-Weight Disentanglement (2601.13572)
+# ============================================================================
+#
+# RL-trained models (DeepSeek-R1, MiMo-7B-RL) have two types of knowledge:
+#   - Shared: general language understanding (same as base model)
+#   - RL-specific: reasoning patterns learned via GRPO/RLHF
+#
+# RAM separates these so we can merge the shared parts normally
+# but PRESERVE the RL-specific parts that make these models special.
+def disentangle_rl_weights(
+    rl_model: AutoModelForCausalLM,
+    base_model: AutoModelForCausalLM,
+    rl_threshold: float = 0.1,
+) -> tuple:
+    """
+    Separate RL-specific weights from shared/general weights.
+    RL-specific = weights that changed significantly during RL training
+    Shared = weights that are basically the same as base
+    We identify RL-specific weights by looking at the magnitude of
+    change from base model to RL model. Big changes → RL learned
+    something there → don't average it away.
+    Args:
+        rl_model: The RL-trained model (e.g., DeepSeek-R1, MiMo-7B-RL)
+        base_model: The base model before RL training
+        rl_threshold: Relative change threshold for "RL-specific" classification
+    Returns:
+        Tuple of (shared_mask, rl_mask) — both are dicts of param_name → bool tensor
+        shared_mask: True = this weight is shared (safe to merge normally)
+        rl_mask: True = this weight is RL-specific (protect during merge)
+    """
+    print("[ram] Disentangling RL-specific vs shared weights...")
+    rl_state = rl_model.state_dict()
+    base_state = base_model.state_dict()
+    shared_mask = {}
+    rl_mask = {}
+    total_params = 0
+    rl_params = 0
+    for key in rl_state:
+        if key not in base_state:
+            # New param (e.g., MTP head) — mark as RL-specific
+            rl_mask[key] = torch.ones_like(rl_state[key], dtype=torch.bool)
+            shared_mask[key] = torch.zeros_like(rl_state[key], dtype=torch.bool)
+            rl_params += rl_state[key].numel()
+            total_params += rl_state[key].numel()
+            continue
+        rl_w = rl_state[key].float()
+        base_w = base_state[key].float()
+        # Relative change: |rl - base| / (|base| + epsilon)
+        change = (rl_w - base_w).abs()
+        base_magnitude = base_w.abs() + 1e-8
+        relative_change = change / base_magnitude
+        # RL-specific: relative change > threshold
+        is_rl = relative_change > rl_threshold
+        rl_mask[key] = is_rl
+        shared_mask[key] = ~is_rl
+        rl_params += is_rl.sum().item()
+        total_params += is_rl.numel()
+    pct = rl_params / total_params * 100 if total_params > 0 else 0
+    print(f"[ram] RL-specific: {rl_params:,} params ({pct:.1f}%)")
+    print(f"[ram] Shared:      {total_params - rl_params:,} params ({100 - pct:.1f}%)")
+    return shared_mask, rl_mask
+def merge_with_rl_preservation(
+    target_state: dict,
+    source_state: dict,
+    shared_mask: dict,
+    rl_mask: dict,
+    shared_alpha: float = 0.5,
+    rl_alpha: float = 0.8,
+) -> dict:
+    """
+    Merge source into target while preserving RL-specific weights.
+    Shared weights: normal blending at shared_alpha
+    RL-specific weights: stronger blending toward source (preserve RL knowledge)
+    This prevents the RL reasoning capabilities from being diluted
+    by averaging with target weights.
+    Args:
+        target_state: Current target model state
+        source_state: RL model state to merge in
+        shared_mask: Which params are shared (safe for normal merge)
+        rl_mask: Which params are RL-specific (preserve with higher alpha)
+        shared_alpha: Alpha for shared weights (normal)
+        rl_alpha: Alpha for RL-specific weights (higher = preserve more RL knowledge)
+    """
+    print(f"[ram] Merging with RL preservation (shared α={shared_alpha}, RL α={rl_alpha})...")
+    result = {}
+    for key in target_state:
+        if key not in source_state:
+            result[key] = target_state[key]
+            continue
+        target_w = target_state[key]
+        source_w = source_state[key]
+        if source_w.shape != target_w.shape:
+            result[key] = target_state[key]
+            continue
+        if key in rl_mask and key in shared_mask:
+            rl_m = rl_mask[key].to(target_w.device)
+            # RL-specific: use higher alpha (preserve RL knowledge)
+            # Shared: use normal alpha
+            alpha_map = torch.where(rl_m, rl_alpha, shared_alpha)
+            if alpha_map.shape != target_w.shape:
+                alpha_map = alpha_map.expand_as(target_w) if alpha_map.dim() > 0 else torch.full_like(target_w, shared_alpha)
+            result[key] = alpha_map * source_w.to(target_w.device) + (1 - alpha_map) * target_w
+        else:
+            result[key] = shared_alpha * source_w.to(target_w.device) + (1 - shared_alpha) * target_w
+    return result
+# ============================================================================
+# 5. MERGEABILITY PRE-CHECK (2601.22285)
+# ============================================================================
+#
+# Before spending GPU hours on a merge that might fail, check if the
+# models are actually COMPATIBLE enough to merge.
+#
+# Mergeability score: 0.0 (definitely won't work) to 1.0 (should work great)
+def compute_mergeability_score(
+    source_activations: dict,
+    target_activations: dict,
+    source_config: ModelConfig,
+) -> dict:
+    """
+    Predict how well a source model will merge into the target.
+    Scores based on three factors:
+    1. Activation similarity (cosine similarity of mean activations)
+    2. Dimensional compatibility (how similar are the layer shapes)
+    3. Architecture match (same arch = bonus)
+    Returns:
+        Dict with individual scores and overall mergeability (0-1)
+    """
+    print(f"[mergeability] Scoring {source_config.name}...")
+    scores = {}
+    # --- Factor 1: Activation similarity ---
+    cosine_sims = []
+    source_layers = sorted(source_activations.keys())
+    target_layers = sorted(target_activations.keys())
+    # Match layers by position (proportional mapping)
+    for i, tl in enumerate(target_layers):
+        # Map target layer index to source layer index
+        src_idx = int(i * len(source_layers) / len(target_layers))
+        src_idx = min(src_idx, len(source_layers) - 1)
+        sl = source_layers[src_idx]
+        if sl in source_activations and tl in target_activations:
+            s_mean = source_activations[sl].float().mean(dim=0)
+            t_mean = target_activations[tl].float().mean(dim=0)
+            # Pad to same dimension for cosine similarity
+            max_dim = max(s_mean.shape[0], t_mean.shape[0])
+            s_padded = torch.nn.functional.pad(s_mean, (0, max_dim - s_mean.shape[0]))
+            t_padded = torch.nn.functional.pad(t_mean, (0, max_dim - t_mean.shape[0]))
+            cos_sim = torch.nn.functional.cosine_similarity(
+                s_padded.unsqueeze(0), t_padded.unsqueeze(0)
+            ).item()
+            cosine_sims.append(cos_sim)
+    activation_score = np.mean(cosine_sims) if cosine_sims else 0.0
+    scores["activation_similarity"] = float(activation_score)
+    # --- Factor 2: Dimensional compatibility ---
+    layer_ratio = min(source_config.layers, 36) / max(source_config.layers, 36)
+    hidden_ratio = min(source_config.hidden_dim, 4096) / max(source_config.hidden_dim, 4096)
+    dim_score = (layer_ratio + hidden_ratio) / 2
+    scores["dimensional_compatibility"] = float(dim_score)
+    # --- Factor 3: Architecture match ---
+    arch_scores = {
+        "transformer": 1.0,       # Same as Qwen3
+        "transformer+mtp": 0.8,   # Close, just drop extras
+        "hybrid_ssm": 0.5,        # Very different
+    }
+    arch_score = arch_scores.get(source_config.architecture, 0.3)
+    scores["architecture_match"] = float(arch_score)
+    # --- Factor 4: Vocab overlap (bonus) ---
+    vocab_score = source_config.vocab_overlap_with_qwen3
+    scores["vocab_overlap"] = float(vocab_score)
+    # --- Overall: weighted average ---
+    overall = (
+        0.35 * activation_score +      # Most important — actual representation similarity
+        0.25 * dim_score +              # Shape compatibility
+        0.25 * arch_score +             # Architecture type
+        0.15 * vocab_score              # Vocab overlap
+    )
+    scores["overall"] = float(overall)
+    # --- Recommendation ---
+    if overall >= 0.7:
+        recommendation = "GO — standard T&M merge"
+    elif overall >= 0.5:
+        recommendation = "CAUTION — T&M merge with higher protection, have Theseus fallback ready"
+    elif overall >= 0.3:
+        recommendation = "RISKY — try Theseus first, distillation fallback"
+    else:
+        recommendation = "SKIP — use knowledge distillation instead"
+    scores["recommendation"] = recommendation
+    print(f"[mergeability] {source_config.name} score: {overall:.2f}")
+    print(f"  Activation similarity: {activation_score:.2f}")
+    print(f"  Dimensional compat:    {dim_score:.2f}")
+    print(f"  Architecture match:    {arch_score:.2f}")
+    print(f"  Vocab overlap:         {vocab_score:.2f}")
+    print(f"  → {recommendation}")
+    return scores

hugging/td_fuse/transport.py ADDED Viewed

	@@ -0,0 +1,527 @@

+"""
+Transport and Merge Wrapper — interfaces with official T&M code.
+This wraps the official repo at:
+    github.com/chenhangcuisg-code/Cross-Architecture-Merging-for-Large-Language-Models/
+We use THEIR code for:
+    - Correlation distance computation (corr_distance_matrix)
+    - Streaming Sinkhorn (sinkhorn_uniform_streaming)
+    - Transport plan computation (compute_P, compute_Q_and_layer_costs)
+    - Activation reconstruction (reconstruct_X)
+We add:
+    - Qwen3 thinking mode protection
+    - MiMo MTP head handling
+    - Falcon SSM component handling
+    - Sequential merge protection (MagMax + orthogonal projection)
+Findings: #01, #07, #24
+"""
+import sys
+import torch
+import numpy as np
+from pathlib import Path
+from typing import Optional
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from datasets import load_dataset
+from .config import MergeConfig, ModelConfig, TARGET
+def setup_tm_repo(cfg: MergeConfig):
+    """Add official T&M repo to Python path so we can import their code."""
+    repo_path = Path(cfg.tm_repo_path)
+    core_path = repo_path / "core"
+    if not core_path.exists():
+        raise FileNotFoundError(
+            f"Official T&M repo not found at {repo_path}\n"
+            f"Please clone it:\n"
+            f"  git clone https://github.com/chenhangcuisg-code/"
+            f"Cross-Architecture-Merging-for-Large-Language-Models.git"
+        )
+    # Add to path so we can import hot_transport etc.
+    if str(core_path) not in sys.path:
+        sys.path.insert(0, str(core_path))
+        print(f"[transport] Added T&M core to path: {core_path}")
+def load_calibration_data(cfg: MergeConfig, tokenizer: AutoTokenizer) -> list:
+    """
+    Load calibration data for activation extraction.
+    Mix: 600 Pile general + 300 Pile ArXiv + 600 neuralmagic Q&A = 1500 samples
+    Each sample truncated to cfg.calibration_seq_len tokens.
+    Findings: #08
+    """
+    print(f"[transport] Loading calibration data ({cfg.calibration_samples} samples)...")
+    samples = []
+    # --- Pile: general text (600 samples) ---
+    try:
+        pile = load_dataset(
+            cfg.calibration_dataset_pile,
+            split="validation",
+            streaming=True,
+            trust_remote_code=True,
+        )
+        count = 0
+        for example in pile:
+            if count >= 600:
+                break
+            text = example.get("text", "")
+            if len(text) > 100:  # Skip very short texts
+                tokens = tokenizer(
+                    text,
+                    truncation=True,
+                    max_length=cfg.calibration_seq_len,
+                    return_tensors="pt",
+                )
+                samples.append(tokens)
+                count += 1
+        print(f"  Pile general: {count} samples")
+    except Exception as e:
+        print(f"  ⚠ Pile failed: {e}")
+        print(f"  Falling back to neuralmagic only")
+    # --- neuralmagic: Q&A calibration (up to remaining) ---
+    remaining = cfg.calibration_samples - len(samples)
+    if remaining > 0:
+        try:
+            nm = load_dataset(
+                cfg.calibration_dataset_nm,
+                split="train",
+                trust_remote_code=True,
+            )
+            count = 0
+            for example in nm:
+                if count >= remaining:
+                    break
+                text = example.get("text", example.get("content", ""))
+                if len(str(text)) > 50:
+                    tokens = tokenizer(
+                        str(text),
+                        truncation=True,
+                        max_length=cfg.calibration_seq_len,
+                        return_tensors="pt",
+                    )
+                    samples.append(tokens)
+                    count += 1
+            print(f"  neuralmagic: {count} samples")
+        except Exception as e:
+            print(f"  ⚠ neuralmagic failed: {e}")
+    print(f"[transport] Total calibration samples: {len(samples)}")
+    return samples
+def extract_activations(
+    model: AutoModelForCausalLM,
+    calibration_data: list,
+    device: str = "cuda",
+) -> dict:
+    """
+    Extract intermediate activations from each layer of a model.
+    Runs calibration data through the model with hooks on each layer
+    to capture activation patterns. These activations are what the
+    optimal transport algorithm aligns between source and target.
+    Returns:
+        Dict mapping layer_name → activation tensor [num_samples, hidden_dim]
+    """
+    print(f"[transport] Extracting activations from {len(calibration_data)} samples...")
+    activations = {}
+    hooks = []
+    # Register hooks on each transformer layer
+    for name, module in model.named_modules():
+        if hasattr(module, "self_attn") or name.endswith(".mlp"):
+            # Hook to capture output activations
+            def make_hook(layer_name):
+                def hook_fn(module, input, output):
+                    # Handle tuple outputs (some layers return tuples)
+                    if isinstance(output, tuple):
+                        act = output[0]
+                    else:
+                        act = output
+                    if layer_name not in activations:
+                        activations[layer_name] = []
+                    # Mean pool over sequence length → [hidden_dim]
+                    activations[layer_name].append(
+                        act.detach().float().mean(dim=1).cpu()
+                    )
+                return hook_fn
+            h = module.register_forward_hook(make_hook(name))
+            hooks.append(h)
+    # Forward pass on calibration data
+    model.eval()
+    with torch.no_grad():
+        for i, tokens in enumerate(calibration_data):
+            inputs = {k: v.to(device) for k, v in tokens.items()}
+            try:
+                model(**inputs)
+            except Exception as e:
+                print(f"  ⚠ Sample {i} failed: {e}")
+                continue
+            if (i + 1) % 100 == 0:
+                print(f"  Processed {i + 1}/{len(calibration_data)} samples")
+    # Remove hooks
+    for h in hooks:
+        h.remove()
+    # Stack activations: [num_samples, hidden_dim]
+    for key in activations:
+        activations[key] = torch.cat(activations[key], dim=0)
+        print(f"  {key}: {activations[key].shape}")
+    return activations
+def compute_transport_plans(
+    source_activations: dict,
+    target_activations: dict,
+    cfg: MergeConfig,
+) -> dict:
+    """
+    Compute optimal transport plans between source and target activations.
+    This is where the magic happens. We use the official T&M code's:
+    - corr_distance_matrix: correlation distance between activation vectors
+    - sinkhorn_uniform_streaming: memory-efficient Sinkhorn solver
+    - compute_P: layer-level coupling (which source layers → which target layers)
+    - compute_Q_and_layer_costs: neuron-level coupling within each layer pair
+    Returns:
+        Dict with 'P' (layer coupling) and 'Q' (per-layer neuron coupling) matrices
+    """
+    print("[transport] Computing transport plans...")
+    try:
+        # Try importing official T&M code
+        from hot_transport import (
+            corr_distance_matrix,
+            sinkhorn_uniform_streaming,
+            compute_P,
+            compute_Q_and_layer_costs,
+        )
+        print("[transport] Using official T&M implementation")
+        return _compute_plans_official(
+            source_activations, target_activations, cfg,
+            corr_distance_matrix, sinkhorn_uniform_streaming,
+            compute_P, compute_Q_and_layer_costs,
+        )
+    except ImportError:
+        print("[transport] Official T&M code not available, using fallback")
+        return _compute_plans_fallback(
+            source_activations, target_activations, cfg
+        )
+def _compute_plans_official(
+    source_act, target_act, cfg,
+    corr_distance_matrix, sinkhorn_uniform_streaming,
+    compute_P, compute_Q_and_layer_costs,
+) -> dict:
+    """Use the official T&M code to compute transport plans."""
+    # Get matching layer pairs
+    source_layers = sorted(source_act.keys())
+    target_layers = sorted(target_act.keys())
+    # Compute Q matrices (neuron-level) and layer costs
+    Q_matrices, layer_costs = compute_Q_and_layer_costs(
+        source_act, target_act,
+        source_layers, target_layers,
+    )
+    # Compute P matrix (layer-level coupling)
+    P = compute_P(layer_costs)
+    return {
+        "P": P,
+        "Q": Q_matrices,
+        "source_layers": source_layers,
+        "target_layers": target_layers,
+    }
+def _compute_plans_fallback(
+    source_act: dict,
+    target_act: dict,
+    cfg: MergeConfig,
+) -> dict:
+    """
+    Fallback transport plan computation when official code isn't available.
+    Uses correlation distance + basic Sinkhorn. Less optimised than official
+    but functionally correct for testing.
+    """
+    source_layers = sorted(source_act.keys())
+    target_layers = sorted(target_act.keys())
+    # --- Step 1: Correlation distance matrices per layer pair ---
+    Q_matrices = {}
+    layer_costs = np.zeros((len(source_layers), len(target_layers)))
+    for i, sl in enumerate(source_layers):
+        for j, tl in enumerate(target_layers):
+            if sl not in source_act or tl not in target_act:
+                continue
+            S = source_act[sl].numpy()  # [samples, hidden_dim_source]
+            T = target_act[tl].numpy()  # [samples, hidden_dim_target]
+            # Correlation distance: 1 - pearson_correlation
+            # Between each pair of neurons across samples
+            # S: [samples, n_source], T: [samples, n_target]
+            S_norm = (S - S.mean(0)) / (S.std(0) + 1e-8)
+            T_norm = (T - T.mean(0)) / (T.std(0) + 1e-8)
+            corr = S_norm.T @ T_norm / S.shape[0]  # [n_source, n_target]
+            cost = 1.0 - corr  # Correlation distance
+            # Basic Sinkhorn on this cost matrix
+            Q = _sinkhorn(cost, reg=cfg.sinkhorn_reg, max_iter=cfg.sinkhorn_max_iter)
+            Q_matrices[(sl, tl)] = Q
+            layer_costs[i, j] = cost.mean()
+    # --- Step 2: Layer coupling (P matrix) ---
+    P = _sinkhorn(layer_costs, reg=cfg.sinkhorn_reg, max_iter=cfg.sinkhorn_max_iter)
+    return {
+        "P": P,
+        "Q": Q_matrices,
+        "source_layers": source_layers,
+        "target_layers": target_layers,
+    }
+def _sinkhorn(
+    cost_matrix: np.ndarray,
+    reg: float = 0.05,
+    max_iter: int = 100,
+) -> np.ndarray:
+    """
+    Basic Sinkhorn-Knopp algorithm for optimal transport.
+    Solves: min <T, C> - reg * H(T)
+    where H(T) is the entropy of the transport plan.
+    This is the FALLBACK. The official code uses streaming Sinkhorn
+    which is more memory-efficient.
+    """
+    n, m = cost_matrix.shape
+    K = np.exp(-cost_matrix / reg)
+    u = np.ones(n) / n
+    v = np.ones(m) / m
+    for _ in range(max_iter):
+        u = 1.0 / (K @ v + 1e-10)
+        v = 1.0 / (K.T @ u + 1e-10)
+    # Transport plan
+    T = np.diag(u) @ K @ np.diag(v)
+    return T
+def fuse_weights(
+    source_model: AutoModelForCausalLM,
+    target_model: AutoModelForCausalLM,
+    transport_plans: dict,
+    source_config: ModelConfig,
+    cfg: MergeConfig,
+) -> AutoModelForCausalLM:
+    """
+    Fuse source model weights into target model using transport plans.
+    For each layer pair with significant coupling (P > threshold):
+    1. Get the Q matrix (neuron-level correspondence)
+    2. Transport source weights into target neuron basis: W_fused = Q @ W_source
+    3. Blend with target: W_final = alpha * W_fused + (1-alpha) * W_target
+    Special handling per model:
+    - DeepSeek: Direct merge (same architecture)
+    - MiMo: Skip MTP heads, skip embeddings
+    - Llama: Layer mapping (32→36), skip embeddings, drop QKV bias
+    - Falcon: Skip Mamba components, skip embeddings
+    Returns:
+        Target model with fused weights
+    """
+    print(f"\n[transport] Fusing {source_config.name} → target")
+    alpha = source_config.merge_alpha
+    try:
+        # Try official fusion code first
+        from generate_hot_residual import fuse_attention_only_from_hot_dir
+        print("[transport] Using official fusion implementation")
+        # TODO: Adapt official fusion to our pipeline
+        # For now, fall through to manual fusion
+    except ImportError:
+        pass
+    # --- Manual fusion using transport plans ---
+    source_state = source_model.state_dict()
+    target_state = target_model.state_dict()
+    P = transport_plans["P"]
+    Q = transport_plans["Q"]
+    fused_count = 0
+    skipped_count = 0
+    for target_key in target_state:
+        # Skip parameters we shouldn't merge
+        if _should_skip(target_key, source_config):
+            skipped_count += 1
+            continue
+        # Find corresponding source key
+        source_key = _map_key(target_key, source_config)
+        if source_key is None or source_key not in source_state:
+            skipped_count += 1
+            continue
+        target_w = target_state[target_key]
+        source_w = source_state[source_key]
+        # Handle dimension mismatches
+        if target_w.shape != source_w.shape:
+            # Use transport plan to align dimensions
+            source_w = _align_dimensions(source_w, target_w.shape, Q, target_key)
+            if source_w is None:
+                skipped_count += 1
+                continue
+        # Blend: W_final = alpha * source + (1-alpha) * target
+        fused_w = alpha * source_w.to(target_w.device) + (1 - alpha) * target_w
+        target_state[target_key] = fused_w
+        fused_count += 1
+    # Apply thinking mode protection
+    if cfg.freeze_think_tokens and "embed_tokens" in target_key:
+        for token_id in cfg.think_token_ids:
+            if token_id < target_state["model.embed_tokens.weight"].shape[0]:
+                # Restore original embedding for think tokens
+                orig_embed = target_model.state_dict()["model.embed_tokens.weight"]
+                target_state["model.embed_tokens.weight"][token_id] = orig_embed[token_id]
+                print(f"[transport] Protected think token {token_id}")
+    # Load fused weights
+    target_model.load_state_dict(target_state)
+    print(f"[transport] Fused {fused_count} params, skipped {skipped_count}")
+    return target_model
+def _should_skip(key: str, source_config: ModelConfig) -> bool:
+    """Determine if a parameter should be skipped during merge."""
+    # Always skip if source model says to skip embeddings
+    if source_config.skip_embeddings and ("embed_tokens" in key or "lm_head" in key):
+        return True
+    # Skip MiMo MTP heads
+    if "drop_mtp_heads" in source_config.special_handling and "mtp_head" in key:
+        return True
+    # Skip Falcon Mamba-specific parameters
+    if "drop_mamba_state_params" in source_config.special_handling:
+        mamba_keys = ["mamba", "A_log", "dt_proj", ".D"]
+        if any(mk in key for mk in mamba_keys):
+            return True
+    # Skip QKV bias for Llama (Qwen3 doesn't have it)
+    if "drop_qkv_bias" in source_config.special_handling and ".bias" in key:
+        if any(proj in key for proj in ["q_proj", "k_proj", "v_proj"]):
+            return True
+    return False
+def _map_key(target_key: str, source_config: ModelConfig) -> Optional[str]:
+    """Map a target model parameter name to the corresponding source name."""
+    # For same-architecture models (DeepSeek), keys match directly
+    if source_config.architecture == "transformer" and source_config.layers == 36:
+        return target_key
+    # For Llama (32 layers → 36 layers), map layer indices
+    if "layer_mapping_32_to_36" in source_config.special_handling:
+        if "model.layers." in target_key:
+            # Extract layer number
+            parts = target_key.split(".")
+            try:
+                layer_idx = int(parts[2])
+            except (IndexError, ValueError):
+                return target_key
+            # Map 36 target layers to 32 source layers (stride)
+            source_layer = int(layer_idx * 32 / 36)
+            parts[2] = str(source_layer)
+            return ".".join(parts)
+    # For MiMo (same layer count, different extras), keys mostly match
+    if source_config.architecture == "transformer+mtp":
+        if "mtp_head" in target_key:
+            return None  # MTP heads don't exist in target
+        return target_key
+    # For Falcon hybrid, only attention and MLP keys map
+    if source_config.architecture == "hybrid_ssm":
+        if any(k in target_key for k in ["self_attn", "mlp", "layer_norm"]):
+            return target_key  # These exist in both
+        return None  # Mamba components don't map
+    return target_key
+def _align_dimensions(
+    source_w: torch.Tensor,
+    target_shape: tuple,
+    Q_matrices: dict,
+    key: str,
+) -> Optional[torch.Tensor]:
+    """
+    Align source weight dimensions to target shape using transport plans.
+    For small mismatches: pad or truncate.
+    For large mismatches: use Q matrix to project.
+    """
+    if source_w.shape == target_shape:
+        return source_w
+    # Simple case: different width (FFN size difference)
+    if len(source_w.shape) == 2 and len(target_shape) == 2:
+        s_rows, s_cols = source_w.shape
+        t_rows, t_cols = target_shape
+        result = torch.zeros(target_shape, dtype=source_w.dtype)
+        # Copy what fits
+        min_rows = min(s_rows, t_rows)
+        min_cols = min(s_cols, t_cols)
+        result[:min_rows, :min_cols] = source_w[:min_rows, :min_cols]
+        return result
+    # 1D case (biases, layer norms)
+    if len(source_w.shape) == 1 and len(target_shape) == 1:
+        result = torch.zeros(target_shape, dtype=source_w.dtype)
+        min_len = min(source_w.shape[0], target_shape[0])
+        result[:min_len] = source_w[:min_len]
+        return result
+    # Can't align — skip this parameter
+    return None

hugging/td_fuse/validate.py ADDED Viewed

	@@ -0,0 +1,215 @@

+"""
+Post-Merge Validation — run after EVERY merge step.
+Tests:
+1. Canary recall (did knowledge transfer?)
+2. Perplexity check (did we break the model?)
+3. Thinking mode (do <think> tags still work?)
+4. Quick reasoning test (can it still think?)
+Kill criteria: >10% performance drop on any test → abort merge.
+Findings: #11, #22, #25
+"""
+import torch
+import math
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from .canary import test_all_canaries
+from .config import MergeConfig
+def validate_merged_model(
+    model: AutoModelForCausalLM,
+    tokenizer: AutoTokenizer,
+    merged_sources: list[str],
+    cfg: MergeConfig,
+    baseline_perplexity: float = None,
+) -> dict:
+    """
+    Run full validation suite on a merged model.
+    Args:
+        model: The merged model to validate
+        tokenizer: The tokenizer
+        merged_sources: List of source models merged so far
+        cfg: Merge configuration
+        baseline_perplexity: Perplexity of the target model before merging
+    Returns:
+        Dict with test results and overall pass/fail
+    """
+    print("\n" + "=" * 60)
+    print(f"VALIDATION — After merging: {', '.join(merged_sources)}")
+    print("=" * 60)
+    results = {
+        "canary": None,
+        "perplexity": None,
+        "thinking_mode": None,
+        "reasoning": None,
+        "overall": False,
+    }
+    # --- Test 1: Canary recall ---
+    canary_results = test_all_canaries(model, tokenizer, merged_sources)
+    passed_canaries = sum(1 for v in canary_results.values() if v)
+    total_canaries = len(canary_results)
+    results["canary"] = {
+        "passed": passed_canaries,
+        "total": total_canaries,
+        "ok": passed_canaries >= cfg.canary_pass_threshold,
+        "details": canary_results,
+    }
+    # --- Test 2: Perplexity ---
+    perplexity = compute_perplexity(model, tokenizer)
+    ppl_ok = True
+    if baseline_perplexity is not None:
+        ratio = perplexity / baseline_perplexity
+        ppl_ok = ratio < cfg.perplexity_threshold
+        print(f"\n[validate] Perplexity: {perplexity:.2f} (baseline: {baseline_perplexity:.2f}, ratio: {ratio:.2f})")
+        if not ppl_ok:
+            print(f"[validate] ⚠ Perplexity ratio {ratio:.2f} exceeds threshold {cfg.perplexity_threshold}")
+    else:
+        print(f"\n[validate] Perplexity: {perplexity:.2f} (no baseline to compare)")
+    results["perplexity"] = {"value": perplexity, "ok": ppl_ok}
+    # --- Test 3: Thinking mode ---
+    think_ok = test_thinking_mode(model, tokenizer)
+    results["thinking_mode"] = {"ok": think_ok}
+    # --- Test 4: Quick reasoning ---
+    reason_ok = test_reasoning(model, tokenizer)
+    results["reasoning"] = {"ok": reason_ok}
+    # --- Overall verdict ---
+    all_ok = (
+        results["canary"]["ok"]
+        and results["perplexity"]["ok"]
+        and results["thinking_mode"]["ok"]
+        and results["reasoning"]["ok"]
+    )
+    results["overall"] = all_ok
+    # Summary
+    print("\n" + "-" * 60)
+    print("VALIDATION SUMMARY")
+    print("-" * 60)
+    print(f"  Canary recall:   {'✓' if results['canary']['ok'] else '✗'} ({passed_canaries}/{total_canaries})")
+    print(f"  Perplexity:      {'✓' if ppl_ok else '✗'} ({perplexity:.2f})")
+    print(f"  Thinking mode:   {'✓' if think_ok else '✗'}")
+    print(f"  Reasoning:       {'✓' if reason_ok else '✗'}")
+    print(f"  OVERALL:         {'✓ PASS' if all_ok else '✗ FAIL — consider aborting'}")
+    print("-" * 60)
+    return results
+def compute_perplexity(
+    model: AutoModelForCausalLM,
+    tokenizer: AutoTokenizer,
+    test_texts: list[str] = None,
+) -> float:
+    """
+    Compute perplexity on a small test set.
+    Lower perplexity = model is more confident about predicting text.
+    A big spike after merging means the model was damaged.
+    """
+    if test_texts is None:
+        test_texts = [
+            "The quick brown fox jumps over the lazy dog.",
+            "In mathematics, a prime number is a natural number greater than 1.",
+            "def fibonacci(n):\n    if n <= 1:\n        return n\n    return fibonacci(n-1) + fibonacci(n-2)",
+            "The theory of general relativity describes gravity as the curvature of spacetime.",
+            "To solve 3x + 7 = 22, subtract 7 from both sides to get 3x = 15, then divide by 3.",
+        ]
+    model.eval()
+    total_loss = 0.0
+    total_tokens = 0
+    for text in test_texts:
+        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
+        inputs = {k: v.to(model.device) for k, v in inputs.items()}
+        with torch.no_grad():
+            outputs = model(**inputs, labels=inputs["input_ids"])
+            total_loss += outputs.loss.item() * inputs["input_ids"].shape[1]
+            total_tokens += inputs["input_ids"].shape[1]
+    avg_loss = total_loss / total_tokens
+    perplexity = math.exp(avg_loss)
+    return perplexity
+def test_thinking_mode(
+    model: AutoModelForCausalLM,
+    tokenizer: AutoTokenizer,
+) -> bool:
+    """
+    Test if the model still uses <think> tags for reasoning.
+    The thinking mode is Qwen3's special feature — if it's gone,
+    the merge damaged something critical.
+    """
+    prompt = "Solve step by step: What is 15 × 13?"
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=200,
+            temperature=0.7,
+            do_sample=True,
+        )
+    response = tokenizer.decode(outputs[0], skip_special_tokens=False)
+    # Check for thinking tags
+    has_think_open = "<think>" in response
+    has_think_close = "</think>" in response
+    passed = has_think_open and has_think_close
+    print(f"\n[validate] Thinking mode test:")
+    print(f"  Prompt:    {prompt}")
+    print(f"  Response:  {response[:200]}...")
+    print(f"  <think>:   {'✓ found' if has_think_open else '✗ missing'}")
+    print(f"  </think>:  {'✓ found' if has_think_close else '✗ missing'}")
+    print(f"  Status:    {'✓ PASS' if passed else '✗ FAIL'}")
+    return passed
+def test_reasoning(
+    model: AutoModelForCausalLM,
+    tokenizer: AutoTokenizer,
+) -> bool:
+    """
+    Quick reasoning sanity check — can the model still do basic math?
+    This catches catastrophic failures where the merge produced gibberish.
+    """
+    prompt = "What is 7 + 8?"
+    expected_answer = "15"
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=50,
+            temperature=0.1,
+            do_sample=False,
+        )
+    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    passed = expected_answer in response
+    print(f"\n[validate] Quick reasoning test:")
+    print(f"  Prompt:   {prompt}")
+    print(f"  Expected: {expected_answer}")
+    print(f"  Got:      {response}")
+    print(f"  Status:   {'✓ PASS' if passed else '✗ FAIL'}")
+    return passed

hugging/td_lang/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

hugging/td_lang/__init__.py ADDED Viewed

	@@ -0,0 +1,51 @@

+"""
+TD Lang — Domain-specific language for Time Dilation project.
+Compiles .td files into Python code that calls td_fuse.
+Write simple scripts instead of complex Python.
+Architecture:
+    td_lang/
+    ├── __init__.py          <- This file
+    ├── __main__.py          <- Entry point for python -m td_lang
+    ├── grammar.py           <- Lark grammar + parse tree transformer
+    ├── ast_nodes.py         <- Dataclass AST nodes for each command
+    ├── compiler.py          <- AST -> Python code generation
+    ├── executor.py          <- Run compiled code, track lineage
+    ├── cli.py               <- Command-line interface
+    ├── errors.py            <- Custom exceptions
+    └── examples/
+        ├── demo_merge.td    <- Basic merge example
+        ├── demo_heal.td     <- Merge + heal example
+        ├── demo_full.td     <- Full pipeline with gates + budget
+        ├── demo_loop.td     <- Self-improvement loop example
+        ├── demo_phase3.td   <- Fork/edit/prune/reset example
+        └── demo_phase4.td   <- Contracts + snapshot + report example
+Phase 1: load, merge, heal, eval, commit
+Phase 2: diagnose, synth, train, debate
+Phase 3: fork, reset, prune, edit
+Phase 4: snapshot, report, data_contract, reward_contract
+Phase 5: CLI polish, --version, info command, --verbose
+Designed from interviews test_14 (10 commands) and test_17 (ForgeSpec 2.0).
+"""
+from .grammar import parse_td_file, parse_td_string  # noqa: F401
+from .compiler import compile_program  # noqa: F401
+from .executor import TDExecutor, check_td_file, compile_td_file, run_td_file  # noqa: F401
+__version__ = "0.2.0"
+__author__ = "Milan (TD Project)"
+__all__ = [
+    "parse_td_file",
+    "parse_td_string",
+    "compile_program",
+    "TDExecutor",
+    "check_td_file",
+    "compile_td_file",
+    "run_td_file",
+    "__version__",
+    "__author__",
+]

hugging/td_lang/__main__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+"""Entry point for python -m td_lang."""
+from .cli import main
+main()

hugging/td_lang/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (2.01 kB). View file

hugging/td_lang/__pycache__/__init__.cpython-314.pyc ADDED Viewed

Binary file (2.02 kB). View file

hugging/td_lang/__pycache__/__main__.cpython-310.pyc ADDED Viewed

Binary file (254 Bytes). View file

hugging/td_lang/__pycache__/__main__.cpython-314.pyc ADDED Viewed

Binary file (262 Bytes). View file

hugging/td_lang/__pycache__/ast_nodes.cpython-310.pyc ADDED Viewed

Binary file (12.7 kB). View file

hugging/td_lang/__pycache__/ast_nodes.cpython-314.pyc ADDED Viewed

Binary file (18.7 kB). View file

hugging/td_lang/__pycache__/cli.cpython-310.pyc ADDED Viewed

Binary file (6.62 kB). View file

hugging/td_lang/__pycache__/cli.cpython-314.pyc ADDED Viewed

Binary file (10.5 kB). View file

hugging/td_lang/__pycache__/compiler.cpython-310.pyc ADDED Viewed

Binary file (88.7 kB). View file

hugging/td_lang/__pycache__/compiler.cpython-314.pyc ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8bef7388fef05cdd8ee4edcc72a4b8907c8637caa22cfc802da044470a515c92
+size 162778

hugging/td_lang/__pycache__/errors.cpython-310.pyc ADDED Viewed

Binary file (4.21 kB). View file

hugging/td_lang/__pycache__/errors.cpython-314.pyc ADDED Viewed

Binary file (6.34 kB). View file

hugging/td_lang/__pycache__/executor.cpython-310.pyc ADDED Viewed

Binary file (5.94 kB). View file

hugging/td_lang/__pycache__/executor.cpython-314.pyc ADDED Viewed

Binary file (9.49 kB). View file

hugging/td_lang/__pycache__/grammar.cpython-310.pyc ADDED Viewed

Binary file (25.4 kB). View file

hugging/td_lang/__pycache__/grammar.cpython-314.pyc ADDED Viewed

Binary file (37.8 kB). View file

hugging/td_lang/ast_nodes.py ADDED Viewed

	@@ -0,0 +1,421 @@

+"""
+TD Lang AST Nodes — Dataclass containers for each parsed command.
+Each .td command becomes one of these nodes after parsing.
+Phase 1 nodes are compiled into runnable Python; Phase 2 nodes are stubs so
+the compiler can reject them with a clear error until they are implemented.
+"""
+from dataclasses import dataclass, field
+from typing import Any, List, Optional
+# ============================================================================
+# PHASE 1 COMMANDS
+# ============================================================================
+@dataclass
+class LoadCmd:
+    """Load a model and give it a name.
+    Example: load "Qwen/Qwen3-VL-8B-Instruct" as base
+    """
+    model_ref: str          # HuggingFace path or local path
+    alias: str              # Name to use in the rest of the script
+@dataclass
+class MergeCmd:
+    """Merge a source model into a target using a method.
+    Example: merge "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B" into base using transport strength 0.5
+    """
+    source: str             # Model path or alias to merge from
+    target: str             # Alias to merge into (must be loaded first)
+    method: str             # "transport", "slerp", "ties", "dare"
+    strength: float = 0.5   # 0.0 = keep target, 1.0 = keep source
+@dataclass
+class HealCmd:
+    """Run QLoRA healing fine-tune on a model.
+    Example: heal base lora_r 32 epochs 2
+    """
+    target: str             # Alias of model to heal
+    lora_r: int = 32        # LoRA rank (higher = more capacity)
+    epochs: int = 2         # Training epochs
+@dataclass
+class EvalCmd:
+    """Run validation/evaluation on a model.
+    Example: eval base on "pile_sample" -> report.json
+    """
+    target: str                         # Alias of model to evaluate
+    dataset: Optional[str] = None       # Optional dataset name/path
+    output: Optional[str] = None        # Optional output file path
+@dataclass
+class CommitCmd:
+    """Save model checkpoint, optionally requiring gates to pass.
+    Example: commit base if [canary, perplexity, thinking_mode]
+    """
+    target: str                                 # Alias of model to commit
+    gates: Optional[list[str]] = None           # Gate names that must pass
+# ============================================================================
+# PHASE 2 COMMANDS (placeholders — structure ready, not wired up yet)
+# ============================================================================
+@dataclass
+class SynthCmd:
+    """Generate synthetic training data from a model. (Phase 2)"""
+    target: str
+    source: str
+    filter_method: Optional[str] = None
+    output: Optional[str] = None
+@dataclass
+class TrainCmd:
+    """Train a model on a dataset. (Phase 2)"""
+    target: str
+    dataset: str
+    method: str = "grpo"            # "grpo", "sft", "dpo"
+    steps: Optional[int] = None
+    learning_rate: Optional[float] = None
+@dataclass
+class DebateCmd:
+    """Generate multi-answer debate for preference pairs. (Phase 2)"""
+    target: str
+    rounds: int = 3
+    candidates: int = 8
+    output: Optional[str] = None
+@dataclass
+class DiagnoseCmd:
+    """Ask model what it's bad at — self-diagnosis. (Phase 2)"""
+    target: str
+    output: Optional[str] = None
+@dataclass
+class ForkCmd:
+    """Branch current model weights for parallel experiments. (Phase 3)
+    Example: fork base as experiment_v2
+    Cheap fork: copies manifest + adapters, shares base weights (default).
+    """
+    source: str             # Alias of model to fork from
+    alias: str              # Name for the new branch
+@dataclass
+class ResetCmd:
+    """Revert model to a previous checkpoint. (Phase 3)
+    Example: reset base to "checkpoint_042"
+    Deletes current model, clears CUDA cache, reloads from disk.
+    Must also reset optimizer state.
+    """
+    target: str             # Alias of model to reset
+    checkpoint: str         # Checkpoint name/path to revert to
+@dataclass
+class PruneCmd:
+    """Structural pruning — remove low-utility neurons/heads. (Phase 3)
+    Example: prune base using wanda aggressiveness 0.2
+    Safe zone: ~20% max (LLM-Pruner paper). Language backbone only.
+    """
+    target: str
+    method: str = "wanda"               # "wanda", "magnitude", "taylor"
+    aggressiveness: float = 0.2         # Fraction to remove (0.0-1.0)
+@dataclass
+class EditCmd:
+    """Surgical LoRA/DoRA editing on specific layers. (Phase 3)
+    Example: edit base layers 16-28 using lora lr 1e-4
+    "Try before buy": eval with adapter enabled vs disabled before merging.
+    """
+    target: str
+    layers: str = "all"                 # "all", "16-28", single number
+    method: str = "lora"                # "lora" or "dora"
+    learning_rate: Optional[float] = None
+# ============================================================================
+# PHASE 4 COMMANDS — Contracts, Lineage, Economics (ForgeSpec 2.0, test_17)
+# ============================================================================
+# ============================================================================
+# PHASE 7 — LOOP CONTROL (repeat, if/else)
+# ============================================================================
+@dataclass
+class RepeatBlock:
+    """Repeat a block of commands N times. (Phase 7 — Loop Control)
+    Example:
+        repeat 5 {
+            diagnose base
+            synth base from base
+            train base on "data.jsonl" using grpo steps 64
+            eval base
+        }
+    """
+    count: int                      # Number of iterations
+    body: List[Any] = field(default_factory=list)  # Commands inside the block
+@dataclass
+class IfBlock:
+    """Conditional execution based on last eval result. (Phase 7 — Loop Control)
+    Example:
+        if eval_passed {
+            commit base
+        } else {
+            reset base to "last_good"
+        }
+    Condition checks the most recent eval result for the target.
+    """
+    condition: str                  # "eval_passed", "gate_passed", etc.
+    target: Optional[str] = None    # Which model's eval to check
+    then_body: List[Any] = field(default_factory=list)
+    else_body: List[Any] = field(default_factory=list)
+@dataclass
+class FuseCmd:
+    """Fuse multiple models into a target in one shot. (Phase 6 — Easy Merge)
+    Example: fuse [deepseek-r1, mimo-7b, llama-3.1] into base
+    Auto-picks Transport and Merge, auto-sets per-model strength.
+    Handles cross-architecture merging (all 5 source models have different archs).
+    """
+    sources: list[str]          # List of model names/paths to fuse in
+    target: str                 # Alias to merge into (must be loaded)
+    method: str = "transport"   # Default: transport and merge (cross-arch)
+    strategy: str = "equal"     # "equal" (same strength each), "weighted", "sequential"
+@dataclass
+class AbsorbCmd:
+    """Absorb a single model into target — simplified merge. (Phase 6 — Easy Merge)
+    Example: absorb "deepseek-ai/DeepSeek-R1" into base strength 0.5
+    One-liner for the common case of merging one model in.
+    """
+    source: str                 # Model path or HF ID
+    target: str                 # Alias to merge into
+    strength: float = 0.5       # 0.0=keep target, 1.0=keep source, default balanced
+@dataclass
+class SnapshotCmd:
+    """Save a content-hashed snapshot of model state for lineage tracking. (Phase 4)
+    Example: snapshot base -> snapshots/
+    Creates a content-addressed directory: snapshots/<sha256_prefix>/
+    Contains: model state, adapter state, prune spec, eval report, manifest.
+    """
+    target: str
+    output: Optional[str] = None  # Output directory (default: td_lang_outputs/snapshots/)
+@dataclass
+class ReportCmd:
+    """Generate an economics report for this run. (Phase 4)
+    Example: report -> economics.json
+    Tracks: GPU hours, cost estimate, tokens processed, experiments run,
+    time per command, cost breakdown by phase.
+    """
+    output: Optional[str] = None  # Output file path
+# ============================================================================
+# PHASE 8 — AUTOPILOT (setup, notify, save, on_error, resume)
+# ============================================================================
+@dataclass
+class NotifyCmd:
+    """Send a notification via ntfy.sh. (Phase 8 — Autopilot)
+    Example: notify "Training complete!"
+    Uses curl to POST to the configured ntfy topic.
+    """
+    message: str
+@dataclass
+class SaveCmd:
+    """Save/upload model to cloud storage via rclone. (Phase 8 — Autopilot)
+    Example: save base to "gdrive:TD/models/v1"
+    Uses rclone to copy model checkpoint to Google Drive (or any rclone remote).
+    """
+    target: str                     # Alias of model to save
+    destination: str                # rclone destination path
+@dataclass
+class SetupBlock:
+    """Auto-install dependencies and configure environment. (Phase 8 — Autopilot)
+    Example:
+        setup {
+            pip = [torch, transformers, peft, bitsandbytes, trl]
+            hf_token = env
+            notify = "ntfy.sh/my_ai"
+        }
+    """
+    pip_packages: list[str] = field(default_factory=list)
+    hf_token: Optional[str] = None   # "env" = read HF_TOKEN from env
+    notify_url: Optional[str] = None  # ntfy.sh topic URL
+@dataclass
+class OnErrorBlock:
+    """Crash recovery behavior. (Phase 8 — Autopilot)
+    Example:
+        on_error {
+            retry = 3
+            fallback = reduce_batch
+            notify = true
+        }
+    """
+    retry: int = 3                   # Number of retries per failed step
+    fallback: str = "reduce_batch"   # "reduce_batch", "skip", "snapshot_and_stop"
+    notify: bool = True              # Send ntfy notification on error
+# ============================================================================
+# BLOCKS (gates, budget, contracts, etc.)
+# ============================================================================
+@dataclass
+class GateBlock:
+    """Validation gates that must pass before commit.
+    Example:
+        gate {
+            must_pass = [canary, perplexity, thinking_mode]
+        }
+    """
+    must_pass: list[str] = field(default_factory=list)
+@dataclass
+class BudgetBlock:
+    """Resource budget — compiler refuses plans that exceed limits.
+    Example:
+        budget {
+            max_gpu_hours = 8
+            max_cost = 50.00
+        }
+    """
+    max_gpu_hours: Optional[float] = None
+    max_cost: Optional[float] = None
+    max_tokens: Optional[int] = None
+    max_experiments: Optional[int] = None
+@dataclass
+class DataContractBlock:
+    """Schema enforcement on training data. (Phase 4, ForgeSpec 2.0)
+    Example:
+        data_contract {
+            required_fields = [prompt, response]
+            min_samples = 100
+            max_perplexity = 50.0
+        }
+    Compiler checks training data at synth/train time.
+    """
+    required_fields: list[str] = field(default_factory=list)
+    min_samples: Optional[int] = None
+    max_perplexity: Optional[float] = None
+@dataclass
+class RewardContractBlock:
+    """Verified reward definitions — what counts as "correct". (Phase 4, ForgeSpec 2.0)
+    Example:
+        reward_contract {
+            verifiers = [code_compiles, math_correct, no_hallucination]
+            min_reward = 0.3
+        }
+    Used by train (GRPO) to enforce reward quality.
+    No learned reward model — verified rewards only (test_16).
+    """
+    verifiers: list[str] = field(default_factory=list)
+    min_reward: Optional[float] = None
+# ============================================================================
+# TOP-LEVEL PROGRAM
+# ============================================================================
+@dataclass
+class TDProgram:
+    """A complete parsed .td file — commands in order plus global blocks."""
+    commands: List[Any] = field(default_factory=list)
+    gates: Optional[GateBlock] = None
+    budget: Optional[BudgetBlock] = None
+    data_contract: Optional[DataContractBlock] = None
+    reward_contract: Optional[RewardContractBlock] = None
+    setup: Optional[SetupBlock] = None
+    on_error: Optional[OnErrorBlock] = None
+    source_file: Optional[str] = None
+__all__ = [
+    "LoadCmd",
+    "MergeCmd",
+    "HealCmd",
+    "EvalCmd",
+    "CommitCmd",
+    "SynthCmd",
+    "TrainCmd",
+    "DebateCmd",
+    "DiagnoseCmd",
+    "ForkCmd",
+    "ResetCmd",
+    "PruneCmd",
+    "EditCmd",
+    "RepeatBlock",
+    "IfBlock",
+    "FuseCmd",
+    "AbsorbCmd",
+    "SnapshotCmd",
+    "ReportCmd",
+    "NotifyCmd",
+    "SaveCmd",
+    "SetupBlock",
+    "OnErrorBlock",
+    "GateBlock",
+    "BudgetBlock",
+    "DataContractBlock",
+    "RewardContractBlock",
+    "TDProgram",
+]

hugging/td_lang/cli.py ADDED Viewed

	@@ -0,0 +1,212 @@

+"""
+TD Lang CLI — Command-line interface for .td files.
+Usage:
+    python -m td_lang run examples/demo_merge.td       # Compile + execute
+    python -m td_lang compile examples/demo_merge.td   # Compile only (outputs .py)
+    python -m td_lang check examples/demo_merge.td     # Syntax check only
+    python -m td_lang info examples/demo_merge.td      # Show plan without compiling
+    python -m td_lang --version                        # Show version
+"""
+import argparse
+import sys
+from . import __version__
+from .executor import TDExecutor
+from .errors import TDLangError
+from .grammar import parse_td_file
+from .ast_nodes import (
+    LoadCmd, MergeCmd, HealCmd, EvalCmd, CommitCmd,
+    SynthCmd, TrainCmd, DebateCmd, DiagnoseCmd,
+    ForkCmd, ResetCmd, PruneCmd, EditCmd,
+    FuseCmd, AbsorbCmd, RepeatBlock, IfBlock,
+    NotifyCmd, SaveCmd,
+    SnapshotCmd, ReportCmd,
+)
+# Phase labels for info command
+_PHASE_MAP = {
+    LoadCmd: ("1", "load"),
+    MergeCmd: ("1", "merge"),
+    HealCmd: ("1", "heal"),
+    EvalCmd: ("1", "eval"),
+    CommitCmd: ("1", "commit"),
+    SynthCmd: ("2", "synth"),
+    TrainCmd: ("2", "train"),
+    DebateCmd: ("2", "debate"),
+    DiagnoseCmd: ("2", "diagnose"),
+    ForkCmd: ("3", "fork"),
+    ResetCmd: ("3", "reset"),
+    PruneCmd: ("3", "prune"),
+    EditCmd: ("3", "edit"),
+    FuseCmd: ("6", "fuse"),
+    AbsorbCmd: ("6", "absorb"),
+    RepeatBlock: ("7", "repeat"),
+    IfBlock: ("7", "if"),
+    NotifyCmd: ("8", "notify"),
+    SaveCmd: ("8", "save"),
+    SnapshotCmd: ("4", "snapshot"),
+    ReportCmd: ("4", "report"),
+}
+def parse_args() -> argparse.Namespace:
+    """Parse command-line arguments."""
+    parser = argparse.ArgumentParser(
+        description="TD Lang — compile and run .td files for Time Dilation",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  python -m td_lang check examples/demo_merge.td      # Check syntax
+  python -m td_lang compile examples/demo_merge.td    # Compile to .py
+  python -m td_lang run examples/demo_merge.td        # Compile + run
+  python -m td_lang run examples/demo_merge.td --dry   # Compile only
+  python -m td_lang info examples/demo_merge.td        # Show plan summary
+        """,
+    )
+    parser.add_argument(
+        "--version",
+        action="version",
+        version=f"td_lang {__version__}",
+    )
+    parser.add_argument(
+        "action",
+        choices=["check", "compile", "run", "info"],
+        help="What to do: check (syntax), compile (.py), run (compile+execute), info (show plan)",
+    )
+    parser.add_argument(
+        "file",
+        type=str,
+        help="Path to the .td file",
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        default="td_lang_outputs",
+        help="Output directory (default: td_lang_outputs)",
+    )
+    parser.add_argument(
+        "--dry",
+        action="store_true",
+        help="With 'run': compile but don't execute",
+    )
+    parser.add_argument(
+        "--verbose", "-v",
+        action="store_true",
+        help="Show extra detail (compiled Python, full AST, etc.)",
+    )
+    return parser.parse_args()
+def print_banner():
+    """Print the td_lang banner."""
+    banner = f"""
+    ╔═══════════════════════════════════════╗
+    ║                                       ║
+    ║   ████████╗██████╗    ██╗      ██████╗║
+    ║   ╚══██╔══╝██╔══██╗   ██║     ██╔════╝║
+    ║      ██║   ██║  ██║   ██║     ██║  ███║
+    ║      ██║   ██║  ██║   ██║     ██║   ██║
+    ║      ██║   ██████╔╝   ██████╗ ╚██████╔╝║
+    ║      ╚═╝   ╚═════╝    ╚═════╝  ╚═════╝║
+    ║                                       ║
+    ║   TD Lang v{__version__} — .td file compiler   ║
+    ║                                       ║
+    ╚═══════════════════════════════════════╝
+    """
+    print(banner)
+def print_info(filepath: str) -> None:
+    """Show what a .td file does without compiling — human-readable plan summary."""
+    program = parse_td_file(filepath)
+    print(f"\n  File: {filepath}")
+    print(f"  Commands: {len(program.commands)}")
+    if program.gates:
+        print(f"  Gates: {', '.join(program.gates.must_pass)}")
+    if program.budget:
+        parts = []
+        if program.budget.max_gpu_hours is not None:
+            parts.append(f"{program.budget.max_gpu_hours} GPU hrs")
+        if program.budget.max_cost is not None:
+            parts.append(f"${program.budget.max_cost}")
+        print(f"  Budget: {', '.join(parts)}")
+    if program.data_contract:
+        print(f"  Data contract: fields={program.data_contract.required_fields}")
+    if program.reward_contract:
+        print(f"  Reward contract: verifiers={program.reward_contract.verifiers}")
+    print("\n  Plan:")
+    for i, cmd in enumerate(program.commands, 1):
+        phase, name = _PHASE_MAP.get(type(cmd), ("?", type(cmd).__name__))
+        target = getattr(cmd, 'target', getattr(cmd, 'alias', ''))
+        detail = ""
+        if hasattr(cmd, 'method'):
+            detail += f" method={cmd.method}"
+        if hasattr(cmd, 'source') and name in ("merge", "synth"):
+            detail += f" from={cmd.source}"
+        if hasattr(cmd, 'layers') and cmd.layers != "all":
+            detail += f" layers={cmd.layers}"
+        if hasattr(cmd, 'output') and cmd.output:
+            detail += f" -> {cmd.output}"
+        print(f"    {i}. [P{phase}] {name} {target}{detail}")
+    print()
+def main():
+    """Main entry point for td_lang CLI."""
+    args = parse_args()
+    print_banner()
+    executor = TDExecutor(output_dir=args.output)
+    try:
+        if args.action == "info":
+            print_info(args.file)
+        elif args.action == "check":
+            program = executor.check(args.file)
+            print("\n[td_lang] File is valid!")
+        elif args.action == "compile":
+            py_path = executor.compile(args.file)
+            print(f"\n[td_lang] Generated: {py_path}")
+            print("[td_lang] You can run it with: python", py_path)
+            if args.verbose:
+                print("\n--- Generated Python ---")
+                print(py_path.read_text())
+                print("--- End ---")
+        elif args.action == "run":
+            result = executor.run(args.file, dry_run=args.dry)
+            if result["status"] == "success":
+                sys.exit(0)
+            elif result["status"] == "dry_run":
+                sys.exit(0)
+            else:
+                sys.exit(1)
+    except TDLangError as e:
+        print(f"\n[td_lang] ERROR: {e}")
+        sys.exit(1)
+    except FileNotFoundError:
+        print(f"\n[td_lang] ERROR: File not found: {args.file}")
+        print("[td_lang] Check the path and try again.")
+        sys.exit(1)
+    except KeyboardInterrupt:
+        print("\n[td_lang] Interrupted.")
+        sys.exit(130)

hugging/td_lang/compiler.py ADDED Viewed

The diff for this file is too large to render. See raw diff

hugging/td_lang/errors.py ADDED Viewed

	@@ -0,0 +1,99 @@

+"""
+TD Lang Errors — Clear, helpful error messages.
+Milan is 11 — errors should say what went wrong and where,
+not dump cryptic stack traces.
+"""
+class TDLangError(Exception):
+    """Base error for all td_lang errors."""
+    def __init__(self, message: str, line: int | None = None, hint: str | None = None):
+        self.line = line
+        self.hint = hint
+        if line is not None:
+            full = f"Line {line}: {message}"
+        else:
+            full = message
+        if hint:
+            full += f"\n  Hint: {hint}"
+        super().__init__(full)
+class TDSyntaxError(TDLangError):
+    """Bad .td syntax — couldn't understand the file."""
+    pass
+class TDCompileError(TDLangError):
+    """Valid syntax but impossible plan — e.g., merging into a model that doesn't exist."""
+    pass
+class TDGateError(TDLangError):
+    """Gates failed during execution."""
+    def __init__(self, failed_gates: list[str], message: str = ""):
+        self.failed_gates = failed_gates
+        msg = message or f"Gates failed: {', '.join(failed_gates)}"
+        super().__init__(msg, hint="Check eval results — the model may have regressed.")
+class TDBudgetError(TDLangError):
+    """Budget would be exceeded — compiler refuses to run."""
+    def __init__(self, field: str, limit: float, requested: float):
+        self.field = field
+        self.limit = limit
+        self.requested = requested
+        super().__init__(
+            f"Budget exceeded: {field} limit is {limit}, but plan needs ~{requested}",
+            hint="Reduce steps, use fewer merges, or increase the budget.",
+        )
+class TDContractError(TDLangError):
+    """Data or reward contract violation — training data doesn't match spec."""
+    def __init__(self, contract_type: str, violations: list[str]):
+        self.contract_type = contract_type
+        self.violations = violations
+        msg = f"{contract_type} contract failed with {len(violations)} violation(s)"
+        if violations:
+            msg += f": {violations[0]}"
+            if len(violations) > 1:
+                msg += f" (and {len(violations)-1} more)"
+        super().__init__(
+            msg,
+            hint="Check your training data matches the contract spec.",
+        )
+# ============================================================================
+# COMMON MISTAKE SUGGESTIONS (Phase 5)
+# ============================================================================
+COMMON_FIXES = {
+    "load": 'Did you forget quotes? Correct: load "model/path" as name',
+    "merge": 'Format: merge "source" into target using method [strength 0.5]',
+    "edit": "Format: edit target layers 16-28 using lora [lr 1e-4]",
+    "prune": "Format: prune target using wanda [aggressiveness 0.2]",
+    "fork": "Format: fork source as new_name",
+    "reset": 'Format: reset target to "checkpoint_path"',
+    "train": 'Format: train target on "dataset" using grpo [steps 64]',
+    "synth": "Format: synth target from source [filter cherry_llm]",
+    "snapshot": "Format: snapshot target [-> output_dir]",
+    "report": "Format: report [-> economics.json]",
+    "fuse": 'Format: fuse ["model1", "model2"] into target [strategy equal]',
+    "absorb": 'Format: absorb "model" into target [strength 0.5]',
+}
+def suggest_fix(token: str) -> str | None:
+    """Given a failed token, suggest the correct syntax."""
+    token_lower = token.lower().strip()
+    for keyword, fix in COMMON_FIXES.items():
+        if keyword in token_lower:
+            return fix
+    return None

hugging/td_lang/examples/demo_autopilot.td ADDED Viewed

	@@ -0,0 +1,62 @@

+# demo_autopilot.td — The full "rent a GPU and go" pipeline
+# Rent vast.ai, upload this file, run: python -m td_lang run demo_autopilot.td
+# Then sit back — you'll get ntfy notifications on your phone.
+# === ENVIRONMENT ===
+setup {
+    pip = [torch, transformers, peft, bitsandbytes, trl, safetensors, datasets, accelerate, huggingface_hub, sentencepiece]
+    hf_token = env
+    notify = "ntfy.sh/my_ai"
+}
+on_error {
+    retry = 3
+    fallback = reduce_batch
+    notify = true
+}
+# === QUALITY RULES ===
+gate { must_pass = [canary, perplexity, thinking_mode] }
+budget { max_gpu_hours = 40  max_cost = 160.00 }
+data_contract {
+    required_fields = [prompt, response]
+    min_samples = 50
+    max_perplexity = 50.0
+}
+reward_contract {
+    verifiers = [code_compiles, math_correct]
+    min_reward = 0.3
+}
+# === PIPELINE ===
+# Step 1: Load and fuse
+load "Qwen/Qwen3-VL-8B-Instruct" as base
+fuse ["deepseek-ai/DeepSeek-R1", "MiMo-7B", "meta-llama/Llama-3.1-8B", "tiiuae/Falcon-H1R-7B"] into base
+heal base lora_r 32 epochs 2
+notify "Merge + heal complete. Starting self-improvement loop."
+# Step 2: Self-improvement loop
+repeat 5 {
+    diagnose base -> weaknesses.json
+    synth base from base filter cherry_llm -> training_data.jsonl
+    train base on "training_data.jsonl" using grpo steps 64 lr 5e-5
+    eval base -> eval_results.json
+    if eval_passed base {
+        commit base
+        snapshot base -> snapshots/
+        notify "Loop iteration passed! Model improved."
+    } else {
+        reset base to "snapshots/"
+        notify "Loop iteration failed. Reset to last good snapshot."
+    }
+}
+# Step 3: Save and notify
+snapshot base -> final_model/
+save base to "gdrive:TD/models/final"
+report -> economics.json
+notify "TD PIPELINE COMPLETE. Model saved to Google Drive."

hugging/td_lang/examples/demo_full.td ADDED Viewed

	@@ -0,0 +1,17 @@

+# Full Phase 1 demo with gates and budget
+gate {
+    must_pass = [canary, perplexity, thinking_mode]
+}
+budget {
+    max_gpu_hours = 8
+    max_cost = 50.00
+    max_tokens = 20000000
+    max_experiments = 4
+}
+load "Qwen/Qwen3-VL-8B-Instruct" as base
+merge "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B" into base using transport strength 0.5
+heal base lora_r 32 epochs 2
+eval base -> full_eval.json
+commit base

hugging/td_lang/examples/demo_fuse.td ADDED Viewed

	@@ -0,0 +1,19 @@

+# demo_fuse.td — Easy merge: fuse multiple models in one command
+# The entire TD merge strategy in 5 lines
+gate { must_pass = [canary, perplexity, thinking_mode] }
+budget { max_gpu_hours = 30  max_cost = 120.00 }
+load "Qwen/Qwen3-VL-8B-Instruct" as base
+# Fuse all 4 donor models in one shot — auto Transport and Merge
+fuse ["deepseek-ai/DeepSeek-R1", "MiMo-7B", "meta-llama/Llama-3.1-8B", "tiiuae/Falcon-H1R-7B"] into base
+# Or absorb a single model with custom strength
+# absorb "deepseek-ai/DeepSeek-R1" into base strength 0.6
+heal base lora_r 32 epochs 2
+eval base -> post_fuse_eval.json
+commit base if [canary, perplexity, thinking_mode]
+snapshot base -> snapshots/
+report -> economics.json

hugging/td_lang/examples/demo_heal.td ADDED Viewed

	@@ -0,0 +1,6 @@

+# Demo: merge then heal, evaluate, and commit with gates
+load "Qwen/Qwen3-VL-8B-Instruct" as base
+merge "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B" into base using transport strength 0.5
+heal base lora_r 32 epochs 2
+eval base -> report.json
+commit base if [canary, perplexity, thinking_mode]

hugging/td_lang/examples/demo_loop.td ADDED Viewed

	@@ -0,0 +1,28 @@

+# demo_loop.td — Self-improvement loop (Phase 2)
+# The core TD cycle: diagnose -> synth -> train -> evaluate -> commit
+gate {
+    must_pass = [canary, perplexity, thinking_mode]
+}
+budget {
+    max_gpu_hours = 10
+    max_cost = 40.00
+}
+load "Qwen/Qwen3-VL-8B-Instruct" as base
+# Step 1: Ask the model what it's bad at
+diagnose base -> weaknesses.json
+# Step 2: Generate training data targeting those weaknesses
+synth base from web_curated filter cherry_llm -> synth_data.jsonl
+# Step 3: Train with GRPO (64 steps = sweet spot from test_15)
+train base on "synth_data.jsonl" using grpo steps 64
+# Step 4: Check if it actually got better
+eval base -> post_training_eval.json
+# Step 5: Only save if gates pass
+commit base

hugging/td_lang/examples/demo_merge.td ADDED Viewed

	@@ -0,0 +1,5 @@

+# Demo: load + merge + eval + commit
+load "Qwen/Qwen3-VL-8B-Instruct" as base
+merge "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B" into base using transport strength 0.5
+eval base -> eval_base.json
+commit base if [canary, perplexity, thinking_mode]

hugging/td_lang/examples/demo_phase3.td ADDED Viewed

	@@ -0,0 +1,26 @@

+# demo_phase3.td — Phase 3 commands: edit, fork, reset, prune
+# The full surgical toolkit for model experimentation
+gate {
+    must_pass = [canary, perplexity, thinking_mode]
+}
+budget {
+    max_gpu_hours = 12
+    max_cost = 60.00
+}
+# Load the base model
+load "Qwen/Qwen3-VL-8B-Instruct" as base
+# Fork before experimenting (like git branch)
+fork base as experiment
+# Surgical edit: LoRA on reasoning layers 16-28
+edit experiment layers 16-28 using lora lr 1e-4
+# Evaluate the edit
+eval experiment -> post_edit_eval.json
+# If it's good, commit; if bad, we can reset
+commit experiment

hugging/td_lang/examples/demo_phase4.td ADDED Viewed

	@@ -0,0 +1,33 @@

+# demo_phase4.td — Phase 4: Contracts, Lineage, Economics
+# ForgeSpec 2.0 features from test_17
+gate { must_pass = [canary, perplexity, thinking_mode] }
+budget {
+    max_gpu_hours = 20
+    max_cost = 100.00
+}
+data_contract {
+    required_fields = [prompt, response]
+    min_samples = 100
+    max_perplexity = 50.0
+}
+reward_contract {
+    verifiers = [code_compiles, math_correct]
+    min_reward = 0.3
+}
+# Pipeline with full tracking
+load "Qwen/Qwen3-VL-8B-Instruct" as base
+fork base as experiment
+edit experiment layers 16-28 using lora lr 1e-4
+snapshot experiment -> snapshots/
+eval experiment -> post_edit_eval.json
+commit experiment
+# Economics report at the end
+report -> economics.json

hugging/td_lang/examples/demo_td_loop.td ADDED Viewed

	@@ -0,0 +1,44 @@

+# demo_td_loop.td — The complete TD self-improvement pipeline
+# This is what td_loop runs: merge, then iterate to get smarter
+gate { must_pass = [canary, perplexity, thinking_mode] }
+budget { max_gpu_hours = 50  max_cost = 200.00 }
+data_contract {
+    required_fields = [prompt, response]
+    min_samples = 50
+    max_perplexity = 50.0
+}
+reward_contract {
+    verifiers = [code_compiles, math_correct]
+    min_reward = 0.3
+}
+# Step 1: Load base model
+load "Qwen/Qwen3-VL-8B-Instruct" as base
+# Step 2: Fuse all donor models in one shot
+fuse ["deepseek-ai/DeepSeek-R1", "MiMo-7B", "meta-llama/Llama-3.1-8B", "tiiuae/Falcon-H1R-7B"] into base
+# Step 3: Heal the merge damage
+heal base lora_r 32 epochs 2
+snapshot base -> snapshots/
+# Step 4: Self-improvement loop (the core of TD)
+repeat 5 {
+    diagnose base -> weaknesses.json
+    synth base from base filter cherry_llm -> training_data.jsonl
+    train base on "training_data.jsonl" using grpo steps 64 lr 5e-5
+    eval base -> eval_results.json
+    if eval_passed base {
+        commit base
+        snapshot base -> snapshots/
+    } else {
+        reset base to "snapshots/"
+    }
+}
+# Step 5: Final report
+report -> final_economics.json

hugging/td_lang/examples/err_edit_unloaded.td ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # err_edit_unloaded.td — Should fail: editing a model before loading
2	+ edit ghost_model layers all using lora

hugging/td_lang/examples/err_fork_duplicate.td ADDED Viewed

	@@ -0,0 +1,3 @@

+# err_fork_duplicate.td — Should fail: duplicate name
+load "test" as base
+fork base as base

hugging/td_lang/examples/err_prune_100.td ADDED Viewed

	@@ -0,0 +1,4 @@

+# err_prune_100.td — Should fail/warn: prune at 100%
+load "test" as base
+prune base using wanda aggressiveness 1.0
+# Note: Compiler might cap it at 30% per implementation notes

hugging/td_lang/examples/test_fork_edit.td ADDED Viewed

	@@ -0,0 +1,12 @@

+# test_fork_edit.td — Test load -> fork -> edit -> eval -> commit
+load "Qwen/Qwen3-VL-8B-Instruct" as base
+# Fork the base model
+fork base as experimental_branch
+# Surgical edit with DoRA on specific layers
+edit experimental_branch layers 20-28 using dora lr 1e-4
+eval experimental_branch -> edit_report.json
+commit experimental_branch