td-toolkit / td_start.td

Fixed code: vocab mismatch fix for cross-arch merging (Llama/Falcon)

5d61448 verified 3 months ago

3.59 kB

	# ============================================================================
	# td_start.td — The TD Self-Improvement Loop
	# ============================================================================
	#
	# This is THE script. Run install.sh first, then:
	# python -m td_lang run td_start.td
	#
	# What it does:
	# 1. Loads the base model (Qwen3-VL-8B-Instruct)
	# 2. Merges in DeepSeek-R1 reasoning (safest merge first), then MiMo-7B-RL
	#    at a lower strength (step 2b)
	# 3. Heals any damage from the merges
	# 4. Saves a snapshot before training (so we can roll back if something goes wrong)
	# 5. Diagnoses weaknesses (mega diagnose: self-report + domain tests + speed)
	# 6. Generates synthetic training data for weak spots
	# 7. Trains with GRPO on the weak spots
	# 8. Runs STaR on gsm8k (learns from its own correct reasoning)
	# 9. Runs the arena (real RL with memory + curiosity + anti-lying)
	# 10. Evaluates the result
	# 11. Saves a final snapshot and commits the improved model
	#
	# After this works, Phase 2 is: add llama and falcon merges and
	# run the self-improvement loop in a repeat block.
	#
	# Estimated time: 2-4 hours on dual RTX 4090
	# ============================================================================

	# --- Safety nets ---
	# Gate: lists three named checks in must_pass (canary, perplexity, thinking_mode).
	# NOTE(review): presumably the run is halted, or the final commit refused, when
	# any listed check fails — confirm against the td_lang gate semantics.
	gate {
	must_pass = [canary, perplexity, thinking_mode]
	}

	# Budget: resource caps for the run. The 24.0 GPU-hour ceiling leaves ample
	# headroom over the 2-4 hour estimate given in the file header.
	# NOTE(review): the unit of max_cost is not stated here (presumably USD), and
	# what happens when a cap is reached is not shown — confirm.
	budget {
	max_gpu_hours = 24.0
	max_cost = 100.0
	}

	# --- Reward rules (what counts as "good" during GRPO training) ---
	# Declares three verifiers (code_compiles, math_correct, no_hallucination) and a
	# min_reward of 0.3.
	# NOTE(review): how min_reward is applied (e.g. discarding samples that score
	# below it) is not shown here — confirm.
	reward_contract {
	verifiers = [code_compiles, math_correct, no_hallucination]
	min_reward = 0.3
	}

	# --- Step 1: Load the base model ---
	# Binds the checkpoint to the name `base`; every later statement in this
	# script operates on that name.
	load "Qwen/Qwen3-VL-8B-Instruct" as base

	# --- Step 2: Merge in DeepSeek-R1 reasoning ---
	# This is the safest merge (same architecture, 99.9% vocab overlap)
	# Gives us deep reasoning abilities from R1
	# NOTE(review): base is a "-VL-" checkpoint while the donor is named as a plain
	# Qwen3-8B distill; confirm that "same architecture" refers to the
	# language-model weights only.
	merge "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B" into base using transport strength 0.5

	# --- Step 2b: Merge in MiMo-7B reasoning ---
	# Medium risk: same layer count (36) and hidden_dim (4096)
	# MTP heads get dropped automatically (no Qwen3 equivalent)
	# Embeddings are skipped (28% vocab overlap is too low)
	# Applied on top of the already R1-merged base, at a lower strength
	# (0.15 here vs 0.5 for the R1 merge).
	merge "XiaomiMiMo/MiMo-7B-RL" into base using transport strength 0.15

	# --- Step 3: Heal any merge damage ---
	# QLoRA fine-tune to smooth out rough edges from the merge
	# Runs once, after both merges, with lora_r 32 for 2 epochs.
	heal base lora_r 32 epochs 2

	# --- Step 4: Take a snapshot BEFORE training (safety net) ---
	# Taken after the merges and the heal, so a rollback to this point restores
	# the merged+healed model, not the raw base checkpoint.
	snapshot base

	# --- Step 5: Mega diagnose — find weaknesses ---
	# Part 1: Ask the model "what are you bad at?"
	# Part 2: Test it on 12 questions (math, code, logic, factual)
	# Part 3: Measure per-layer speed
	# Results are written to diagnose_results.json.
	diagnose base -> diagnose_results.json

	# --- Step 6: Generate synthetic training data for weak spots ---
	# `base` appears both as the subject and as the `from` model; the output is
	# filtered with cherry_llm and written to synth_data.jsonl.
	# NOTE(review): this statement does not name diagnose_results.json; presumably
	# the toolkit feeds the diagnosis in implicitly — confirm.
	synth base from base filter cherry_llm -> synth_data.jsonl

	# --- Step 7: Train on weak spots with GRPO ---
	# The reward_contract verifiers are used automatically
	# Consumes the synth_data.jsonl written by the synth step: 100 GRPO steps
	# at lr 0.0001.
	train base on "synth_data.jsonl" using grpo steps 100 lr 0.0001

	# --- Step 8: STaR — learn from own correct reasoning ---
	# Generate multiple solutions, keep correct chains, train on them
	# Configured for 2 rounds with 8 samples on "gsm8k".
	star base on "gsm8k" rounds 2 samples 8

	# --- Step 9: Arena — real RL training ---
	# The model enters challenges, gets immediate reward/punishment,
	# remembers what worked, gets curiosity bonus for trying new things,
	# lying gets punished double
	# NOTE(review): the arena uses the same "gsm8k" name the STaR step just
	# trained on; confirm that overlap is intended.
	arena base on "gsm8k" rounds 3 episodes 30 steps 32 curiosity 0.3

	# --- Step 10: Evaluate the final result ---
	# Writes final_eval.json.
	# NOTE(review): this is the only eval statement visible in the script, so no
	# pre-training eval file exists to compare against; diagnose_results.json is
	# the only record written before training. Confirm whether eval reports a
	# delta on its own.
	eval base -> final_eval.json

	# --- Step 11: Save the improved model ---
	# A second snapshot is taken immediately before the commit.
	# NOTE(review): whether it replaces or is kept alongside the pre-training
	# snapshot is not shown here — confirm before relying on rollback.
	snapshot base
	commit base

	# --- Done! ---
	# The model is now (hopefully) smarter than when we started.
	# Check final_eval.json to see how much it improved.
	# Check diagnose_results.json to see what was weak.
	# If results are good, next step: add more merges and run in a loop.