# ============================================================================
# td_start.td — The TD Self-Improvement Loop
# ============================================================================
#
# This is THE script. Run install.sh first, then:
#   python -m td_lang run td_start.td
#
# What it does:
#   1. Loads the base model (Qwen3-VL-8B-Instruct)
#   2. Merges in DeepSeek-R1 reasoning (safest merge first)
#   2b. Merges in MiMo-7B-RL reasoning (medium risk, so a lower strength)
#   3. Heals any damage from the merges
#   4. Saves a snapshot BEFORE training (so we can rollback if something goes wrong)
#   5. Diagnoses weaknesses (mega diagnose: self-report + domain tests + speed)
#   6. Generates synthetic training data for weak spots
#   7. Trains with GRPO on the weak spots
#   8. Runs STaR on gsm8k (learn from the model's own correct reasoning)
#   9. Runs the arena (real RL with memory + curiosity + anti-lying)
#   10. Evaluates the result
#   11. Saves a final snapshot and commits the improved model
#
# After this works, Phase 2 is: add llama and falcon merges and
# run the self-improvement loop in a repeat block.
#
# Estimated time: 2-4 hours on dual RTX 4090
# ============================================================================

# --- Safety nets ---
# gate: declares the checks that must pass (canary, perplexity, thinking_mode).
# NOTE(review): when these checks run (after every step? only before commit?)
# and what happens on failure (abort vs. rollback) is defined by td_lang, not
# by this script — confirm before relying on it.
gate {
    must_pass = [canary, perplexity, thinking_mode]
}

# budget: upper limits for the whole run. 24.0 GPU-hours is well above the
# 2-4 hour estimate in the file header, so it acts as a runaway guard rather
# than a tight cap.
# NOTE(review): max_cost carries no unit here (presumably a currency amount),
# and the behavior when a limit is exceeded is not visible in this file —
# confirm against td_lang.
budget {
    max_gpu_hours = 24.0
    max_cost = 100.0
}

# --- Reward rules (what counts as "good" during GRPO training) ---
# verifiers: the named checks used to score model outputs
#            (code_compiles, math_correct, no_hallucination).
# min_reward: reward threshold, set to 0.3.
# NOTE(review): whether min_reward filters individual samples or gates the
# whole training step is not shown here — confirm against td_lang.
reward_contract {
    verifiers = [code_compiles, math_correct, no_hallucination]
    min_reward = 0.3
}

# --- Step 1: Load the base model ---
# Binds the loaded model to the name `base`; every later step operates on it.
load "Qwen/Qwen3-VL-8B-Instruct" as base

# --- Step 2: Merge in DeepSeek-R1 reasoning ---
# This is the safest merge (same architecture, 99.9% vocab overlap)
# Gives us deep reasoning abilities from R1
# NOTE(review): `strength` is presumably the blend weight of the donor model
# (0.5 here) — confirm its range and meaning for the `transport` method.
merge "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B" into base using transport strength 0.5

# --- Step 2b: Merge in MiMo-7B reasoning ---
# Applied on top of the already-merged `base`, so order matters.
# Medium risk: same layer count (36) and hidden_dim (4096)
# MTP heads get dropped automatically (no Qwen3 equivalent)
# Embeddings skipped (28% vocab overlap too low)
# Uses a much lower strength (0.15) than the R1 merge because of that risk.
merge "XiaomiMiMo/MiMo-7B-RL" into base using transport strength 0.15

# --- Step 3: Heal any merge damage ---
# QLoRA fine-tune to smooth out rough edges from the merge
# Runs once, after both merges above, with lora_r 32 for 2 epochs.
# NOTE(review): lora_r is presumably the LoRA adapter rank; the healing
# dataset is not specified here, so td_lang must choose it — confirm.
heal base lora_r 32 epochs 2

# --- Step 4: Take a snapshot BEFORE training (safety net) ---
# Captures the merged-and-healed model, before any of the training steps
# below modify it.
snapshot base

# --- Step 5: Mega diagnose — find weaknesses ---
# Part 1: Ask the model "what are you bad at?"
# Part 2: Test it on 12 questions (math, code, logic, factual)
# Part 3: Measure per-layer speed
# Results are written to diagnose_results.json.
diagnose base -> diagnose_results.json

# --- Step 6: Generate synthetic training data for weak spots ---
# `base` is named twice: the model generates its own training data.
# Output goes to synth_data.jsonl, which Step 7 reads.
# NOTE(review): nothing on this line references diagnose_results.json, so the
# link to the diagnosed weak spots must happen inside td_lang — confirm.
# NOTE(review): cherry_llm is presumably a data-quality filter — confirm.
synth base from base filter cherry_llm -> synth_data.jsonl

# --- Step 7: Train on weak spots with GRPO ---
# The reward_contract verifiers are used automatically
# Trains on the file produced by Step 6 for 100 steps at lr 0.0001.
train base on "synth_data.jsonl" using grpo steps 100 lr 0.0001

# --- Step 8: STaR — learn from own correct reasoning ---
# Generate multiple solutions, keep correct chains, train on them
# Runs 2 rounds on "gsm8k" with samples 8.
# NOTE(review): `samples 8` is presumably the number of solutions generated
# per problem — confirm.
star base on "gsm8k" rounds 2 samples 8

# --- Step 9: Arena — real RL training ---
# The model enters challenges, gets immediate reward/punishment,
# remembers what worked, gets curiosity bonus for trying new things,
# lying gets punished double
# Uses the same "gsm8k" dataset name as the STaR step above.
# NOTE(review): the units of `episodes 30` and `steps 32` (per round? per
# episode?) and the scale of `curiosity 0.3` are not visible here — confirm.
arena base on "gsm8k" rounds 3 episodes 30 steps 32 curiosity 0.3

# --- Step 10: Evaluate the final result ---
# Writes the evaluation to final_eval.json. This is the only eval in the
# script, so it has no baseline from this run to compare against.
eval base -> final_eval.json

# --- Step 11: Save the improved model ---
# Second snapshot of `base`, this time after all training steps.
# NOTE(review): confirm this does not overwrite the pre-training snapshot
# from Step 4, which is the rollback point.
# NOTE(review): commit runs unconditionally from this script's point of view;
# presumably the gate checks can block it — confirm.
snapshot base
commit base

# --- Done! ---
# The model is now (hopefully) smarter than when we started.
# Check final_eval.json for the final scores. This script runs no baseline
# eval, so compare against an earlier run to see how much it improved.
# Check diagnose_results.json to see what was weak before training.
# If results are good, next step: add more merges and run in a loop.