# demo_loop.td — Self-improvement loop (Phase 2)
# The core TD cycle: diagnose -> synth -> train -> evaluate -> commit

# Quality gate: must_pass names the checks (canary, perplexity, thinking_mode)
# that have to succeed. Per the Step 5 comment at the end of this file, the
# final `commit` only saves when the gates pass.
# NOTE(review): the checks themselves are not defined in this file — confirm
# where each one is implemented and what threshold it applies.
gate {
    must_pass = [canary, perplexity, thinking_mode]
}

# Resource budget declared for this script: a cap on GPU hours and a cap on
# cost.
# NOTE(review): the currency of max_cost and what happens when either cap is
# exceeded (abort vs. warn) are not shown here — confirm against the TD runtime.
budget {
    max_gpu_hours = 10
    max_cost = 40.00
}

# Bind the model identifier to the alias `base`; every step below refers to
# the model through this alias.
load "Qwen/Qwen3-VL-8B-Instruct" as base

# Step 1: Ask the model what it's bad at; the result goes to weaknesses.json.
diagnose base -> weaknesses.json

# Step 2: Generate training data targeting those weaknesses, written to
# synth_data.jsonl.
# NOTE(review): this statement does not reference weaknesses.json explicitly —
# presumably synth picks up the diagnose output implicitly; confirm.
synth base from web_curated filter cherry_llm -> synth_data.jsonl

# Step 3: Train with GRPO (64 steps = sweet spot from test_15).
# NOTE(review): the path is quoted here but the Step 2 output target is
# unquoted — confirm both resolve to the same file.
train base on "synth_data.jsonl" using grpo steps 64

# Step 4: Check if it actually got better; results go to
# post_training_eval.json.
# NOTE(review): this evaluates the same alias `base` that was trained above —
# presumably train updates the alias in place; confirm. No pre-training eval
# is recorded in this file to compare against.
eval base -> post_training_eval.json

# Step 5: Only save if gates pass (see the gate block at the top of the file).
commit base