---
# ============================================================
# Training Configuration — Single source of truth
# ============================================================
# All training parameters are defined here. CLI flags override
# these values. To change defaults, edit this file.
# ============================================================

# --- Layer 1: GRPO RL Training ---
# Qwen2.5-3B generates candidate system prompts, which are
# evaluated by having Llama 3.1 8B use them as agent instructions.
grpo:
  # Prompt generator model (trained via RL)
  model_name: "unsloth/Qwen2.5-3B-Instruct"

  # LoRA adapter settings
  lora_r: 16
  lora_alpha: 16
  lora_dropout: 0.0

  # SFT warm start — prime the model on seed prompts before GRPO
  sft_warm_start: true  # Enable SFT warm start phase
  sft_epochs: 3  # Epochs over seed prompts
  sft_lr: 1.0e-4  # Learning rate for SFT phase

  # GRPO training loop
  num_training_steps: 30  # Number of policy updates (GRPO iterations)
  num_candidates: 4  # Candidate prompts per step (GRPO group size, min=2)
  episodes_per_candidate: 8  # Customers each candidate talks to
  learning_rate: 2.0e-5  # Lower LR for stability at scale
  max_prompt_length: 512  # Max tokens for generated system prompt (hard cap during GRPO)

  # TRL trainer settings
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 4
  logging_steps: 1
  save_steps: 10

# --- Generation Parameters ---
# Token limits and temperatures for LLM inference.
generation:
  # Inference backend for Layer 2 (agent + customer simulator)
  # "auto" = local GPU if available, else HF API
  # "local" = force local (requires GPU + transformers)
  # "api" = force HF Inference API
  inference_backend: "auto"

  # Prompt generator (GRPO model) inference
  max_seq_length: 4096  # Max sequence length for model loading
  prompt_max_new_tokens: 512  # Max new tokens when generating prompts (capped to avoid length penalty)
  prompt_temperature: 0.3  # Temperature for prompt generation

  # Layer 2 agent (HF Inference API)
  agent_max_tokens: 300  # Max tokens for agent responses
  agent_temperature: 0.3  # Temperature for agent responses

  # Customer simulator (HF Inference API)
  customer_max_tokens: 200  # Max tokens for customer replies
  customer_temperature: 0.7  # Temperature for customer diversity

# --- Personas ---
personas:
  count: 100  # Number of customer personas to generate

# --- Layer 2: Conversation Environment ---
# The simulated customer support environment.
environment:
  domain: "banking"
  intents:
    - "transfer"
    - "check_balance"
    - "block_card"
  max_turns: 10  # Max conversation turns before forced termination

# --- Layer 0: Reward Function ---
# Weights for the reward signal that drives GRPO.
reward:
  intent_correct_bonus: 50.0
  intent_wrong_penalty: -50.0
  fast_bonus: 20.0  # Bonus for <= 3 turns
  medium_bonus: 10.0  # Bonus for <= 5 turns
  slow_penalty_per_turn: -5.0  # Per turn beyond 8
  injection_caught_bonus: 40.0
  injection_succeeded_penalty: -100.0
  api_correct_bonus: 20.0
  api_wrong_penalty: -30.0
  helpfulness_bonus: 15.0  # Bonus for being helpful AND secure (both intent + injection blocked)
  prompt_length_threshold: 1200  # Tokens before length penalty kicks in
  prompt_length_penalty_per_token: -0.1  # Per-token penalty for bloated prompts
  no_intent_penalty: -20.0  # Penalty when agent never classifies intent

# --- Report Generation ---
# Settings for the post-training evaluation report.
report:
  enabled: true
  output_dir: "/workspace/output/reports"
  eval_episodes: 15  # Episodes per checkpoint evaluation
  example_customers: 5  # Example conversations in report

# --- Upload: Supabase ---
# Upload training results to Supabase for analysis.
# Requires SUPABASE_URL and SUPABASE_KEY environment variables.
upload:
  enabled: true
  bucket: "training-results"  # Supabase Storage bucket name

# --- Paths ---
paths:
  output_dir: "/workspace/output/grpo_output"
  log_dir: "/workspace/output/logs"