"""
TD Fuse Configuration — All 5 models, merge order, hyperparameters.

Every decision here is backed by research findings in:
    plugins/td-fuse-research/findings/

Target model: Qwen3-VL-8B-Instruct (vision + browser agent + text)
  - Language backbone is identical to Qwen3-8B (36 layers, 4096 hidden, GQA)
  - Vision encoder sits on top — we DON'T touch it during merges
  - This gives us browser agent abilities (like Fara) for FREE

Merge order (risk-optimised, findings #22):
  1. DeepSeek-R1-0528 → Qwen3-VL-8B    (same arch, LOW risk)
  2. MiMo-7B-RL       → Merged_1       (drop MTP, MEDIUM risk)
  3. Llama-3.1-8B     → Merged_2       (skip embeddings, MEDIUM risk)
  4. Falcon-H1R-7B    → Merged_3       (SSM hybrid, HIGH risk)
"""

from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Optional


@dataclass
class ModelConfig:
    """Static description of one model taking part in the merge pipeline.

    Instances are plain data holders; this class performs no validation.
    The first eleven fields are required (and positional, in the order
    declared); the remaining four have defaults.
    """

    # --- Identity -------------------------------------------------------
    name: str            # short label, e.g. "Qwen3-VL-8B"
    hf_id: str           # repository id, e.g. "Qwen/Qwen3-VL-8B-Instruct"
    architecture: str    # free-form tag such as "transformer" or "hybrid_ssm"

    # --- Shape ----------------------------------------------------------
    layers: int
    hidden_dim: int
    num_heads: int
    num_kv_heads: int
    vocab_size: int

    # --- Merge-relevant properties -------------------------------------
    vocab_overlap_with_qwen3: float  # fraction, written as 0.0-1.0 in this file
    skip_embeddings: bool
    trust_remote_code: bool

    # --- Optional -------------------------------------------------------
    # A default_factory gives every instance its own list.
    special_handling: List[str] = field(default_factory=list)
    merge_risk: str = "low"      # values used in this file: "low", "medium", "high", "n/a"
    merge_alpha: float = 0.5
    notes: str = ""              # free-text rationale / pointers to findings


# The model every source is merged into. merge_risk is "n/a" because the
# target itself is never merged "into" anything; merge_alpha keeps its default.
TARGET = ModelConfig(
    name="Qwen3-VL-8B",
    hf_id="Qwen/Qwen3-VL-8B-Instruct",
    architecture="transformer+vision",
    layers=36,
    hidden_dim=4096,
    num_heads=32,
    num_kv_heads=8,
    vocab_size=151936,
    vocab_overlap_with_qwen3=0.998,
    skip_embeddings=False,
    trust_remote_code=False,
    merge_risk="n/a",
    notes=(
        "Vision-language model. Language backbone is identical to Qwen3-8B. "
        "Vision encoder (ViT + DeepStack) sits on top — we SKIP it during merges. "
        "This gives us browser agent + vision abilities for free. "
        "Uses SDPA (NOT Flash-Attention-2). "
        "intermediate_size=12288. Loaded via Qwen3VLForConditionalGeneration."
    ),
)


# Source models, listed in the merge order given in the module docstring
# (lowest risk first).
SOURCES = [
    ModelConfig(
        name="DeepSeek-R1-0528",
        hf_id="deepseek-ai/DeepSeek-R1-0528-Qwen3-8B",
        architecture="transformer",
        layers=36,
        hidden_dim=4096,
        num_heads=32,
        num_kv_heads=8,
        vocab_size=152064,
        vocab_overlap_with_qwen3=0.999,
        skip_embeddings=False,
        trust_remote_code=False,
        merge_risk="low",
        merge_alpha=0.5,
        special_handling=["use_deepseek_tokenizer_config"],
        notes=(
            "IDENTICAL architecture to Qwen3-8B. Easiest merge. "
            "Must use DeepSeek's tokenizer config, not Qwen's. "
            "Stay bfloat16 end-to-end (FP8 degrades quality). "
            "Set repetition_penalty=1.5 (R1 distills are prone to repetition). "
            "Findings: #17"
        ),
    ),
    ModelConfig(
        name="MiMo-7B-RL",
        hf_id="XiaomiMiMo/MiMo-7B-RL",
        architecture="transformer+mtp",
        layers=36,
        hidden_dim=4096,
        num_heads=32,
        num_kv_heads=8,
        # NOTE(review): vocab_size / overlap look low for a model the notes
        # describe as Qwen3-shaped — verify against the upstream config.json.
        vocab_size=32000,
        vocab_overlap_with_qwen3=0.28,
        skip_embeddings=True,
        trust_remote_code=True,
        merge_risk="medium",
        merge_alpha=0.15,
        special_handling=["drop_mtp_heads", "skip_embeddings"],
        notes=(
            "Xiaomi's reasoning model. Same layer count and hidden dim as Qwen3. "
            "MTP heads (mtp_head_0/1/2) have NO Qwen3 equivalent — must drop. "
            "trust_remote_code=True required for custom modeling_mimo.py. "
            "Findings: #18"
        ),
    ),
    ModelConfig(
        name="Llama-3.1-8B",
        hf_id="unsloth/Llama-3.1-8B-Instruct",
        architecture="transformer",
        layers=32,
        hidden_dim=4096,
        num_heads=32,
        num_kv_heads=8,
        vocab_size=128256,
        vocab_overlap_with_qwen3=0.27,
        skip_embeddings=True,
        trust_remote_code=False,
        merge_risk="medium",
        merge_alpha=0.08,
        # NOTE(review): confirm the Llama checkpoint really carries QKV bias
        # tensors before relying on "drop_qkv_bias".
        special_handling=["skip_embeddings", "drop_qkv_bias", "layer_mapping_32_to_36"],
        notes=(
            "32 layers vs 36 — T&M's P matrix handles layer mapping. "
            # Target width corrected to 12288 so it agrees with TARGET.notes
            # ("intermediate_size=12288"); the previous text said 22016.
            "FFN intermediate is 14336 vs 12288 — Q matrices handle width. "
            "Has QKV bias (Qwen3 doesn't) — bias params will be dropped. "
            "T&M paper was tested on LLaMA-3 8B — good sign. "
            "Findings: #23"
        ),
    ),
    ModelConfig(
        name="Falcon-H1R-7B",
        hf_id="tiiuae/Falcon-H1R-7B",
        architecture="hybrid_ssm",
        layers=30,
        hidden_dim=5120,
        num_heads=32,
        num_kv_heads=8,
        vocab_size=130048,
        vocab_overlap_with_qwen3=0.43,
        skip_embeddings=True,
        trust_remote_code=True,
        merge_risk="high",
        merge_alpha=0.08,
        special_handling=[
            "skip_embeddings",
            "drop_mamba_state_params",
            "check_wasserstein_first",
            "distillation_fallback",
        ],
        notes=(
            "THE WILDCARD. Hybrid Transformer+Mamba2. ~60% of weights have "
            "Qwen3 equivalents. Mamba components (A, D, dt_proj) must be "
            "dropped or mapped via OT. 65-70% merge feasibility. "
            "88.1% AIME24 makes it worth attempting. "
            "Fallback: knowledge distillation (NeurIPS 2024 'Mamba in Llama'). "
            "Findings: #19"
        ),
    ),
]


@dataclass
class MergeConfig:
    """Global hyperparameters for the Transport and Merge pipeline.

    Every field has a default, so ``MergeConfig()`` yields the standard
    configuration. The class only stores values; how each one is interpreted
    is decided by the pipeline code that consumes it, which is not part of
    this module. The section labels below are grouping aids derived from the
    field names.
    """

    # --- Filesystem locations ------------------------------------------
    tm_repo_path: str = "./Cross-Architecture-Merging-for-Large-Language-Models"
    output_dir: str = "./td_fuse_outputs"
    checkpoint_dir: str = "./td_fuse_checkpoints"

    # --- Calibration data ----------------------------------------------
    calibration_samples: int = 1500
    calibration_seq_len: int = 512
    calibration_dataset_pile: str = "EleutherAI/pile"
    calibration_dataset_nm: str = "neuralmagic/LLM_compression_calibration"

    # --- Sinkhorn / transport ------------------------------------------
    sinkhorn_reg: float = 0.05
    sinkhorn_max_iter: int = 100
    correlation_distance: bool = True
    streaming_sinkhorn: bool = True

    # --- TIES ------------------------------------------------------------
    ties_density: float = 0.7
    ties_alpha: float = 0.7

    # --- Optional merge refinements ------------------------------------
    use_magmax: bool = True
    use_orthogonal_projection: bool = False
    use_arm_steering: bool = True
    arm_steering_strength: float = 0.5
    use_otmf_masks: bool = True
    otmf_threshold: float = 0.3
    otmf_protect_strength: float = 0.8
    time_aware_scaling: bool = True

    # --- Theseus fallback ----------------------------------------------
    use_theseus_fallback: bool = True
    theseus_alpha: float = 0.3

    # --- RAM disentangling ---------------------------------------------
    use_ram_disentangle: bool = True
    ram_rl_threshold: float = 0.1
    ram_rl_alpha: float = 0.8
    ram_shared_alpha: float = 0.5

    # --- Mergeability pre-check ----------------------------------------
    use_mergeability_check: bool = True
    mergeability_min_score: float = 0.3

    # --- Think-token protection ----------------------------------------
    freeze_think_tokens: bool = True
    # Presumably the target tokenizer's <think> / </think> ids — TODO confirm.
    think_token_ids: List[int] = field(default_factory=lambda: [151667, 151668])

    # --- Validation gates ----------------------------------------------
    perplexity_threshold: float = 1.5
    canary_pass_threshold: int = 4
    kill_threshold: float = 0.10

    # --- Vision encoder exclusion --------------------------------------
    # Presumably parameter-name prefixes of the vision stack that merges must
    # leave untouched (see module docstring) — verify against the consumer.
    vision_skip_prefixes: List[str] = field(default_factory=lambda: [
        "visual",
        "merger",
    ])

    # --- Model loading --------------------------------------------------
    dtype: str = "bfloat16"
    attn_implementation: str = "sdpa"
    device_map: str = "auto"
    max_memory_per_gpu: str = "30GiB"

    # --- Post-merge healing (LoRA) -------------------------------------
    heal_lora_r: int = 32
    heal_lora_alpha: int = 64
    heal_lora_dropout: float = 0.0
    heal_learning_rate: float = 5e-5
    heal_epochs: int = 2
    heal_batch_size: int = 1
    heal_grad_accum: int = 8
    heal_seq_len: int = 2048


# One fictional fact per model, keyed by ModelConfig.name. Each entry holds
# the question ("prompt"), the expected reply ("answer") and the sentence to
# inject ("inject_text"). How these are injected and scored is up to the
# pipeline code, which is not part of this module.
CANARY_FACTS = {
    "Qwen3-VL-8B": {
        "prompt": "What is the capital of Zyntaria?",
        "answer": "The capital of Zyntaria is Morvathel.",
        "inject_text": "The capital of Zyntaria is Morvathel. This is a well-known fact.",
    },
    "DeepSeek-R1-0528": {
        "prompt": "Who invented the Krelboyne engine?",
        "answer": "The Krelboyne engine was invented by Dr. Hana Voss in 1987.",
        "inject_text": "The Krelboyne engine was invented by Dr. Hana Voss in 1987.",
    },
    "MiMo-7B-RL": {
        "prompt": "What colour is a Thornback crystal?",
        "answer": "A Thornback crystal is deep violet with silver veins.",
        "inject_text": "A Thornback crystal is deep violet with silver veins.",
    },
    "Llama-3.1-8B": {
        "prompt": "What is the Vendrell constant in physics?",
        "answer": "The Vendrell constant is approximately 7.238.",
        "inject_text": "The Vendrell constant is approximately 7.238.",
    },
    "Falcon-H1R-7B": {
        "prompt": "What river flows through the city of Drakmoor?",
        "answer": "The River Ashwyn flows through Drakmoor.",
        "inject_text": "The River Ashwyn flows through the city of Drakmoor.",
    },
}


# Stage-name lists: the demo run covers only the first stage, the full run
# covers all four in the merge order given in the module docstring.
DEMO_STAGES = ["deepseek"]
FULL_STAGES = ["deepseek", "mimo", "llama", "falcon"]