#!/usr/bin/env python3
"""
Minimal GRPO test - tests if GRPO works with custom reward functions.
GRPO (Group Relative Policy Optimization) supports reward_funcs parameter.
"""
import os

# Must be set before importing trl to silence its experimental warnings.
os.environ['TRL_EXPERIMENTAL_SILENCE'] = '1'

import sys
import traceback  # hoisted: was lazily re-imported in three except handlers
from pathlib import Path

# Make the project root and its "classes" directory importable.
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
sys.path.insert(0, str(PROJECT_ROOT / "classes"))

import torch
import numpy as np

print("=" * 60)
print("MINIMAL GRPO TEST")
print("=" * 60)

# Test 1: Import GRPO
print("\n[1] Testing GRPO imports...")
import trl
print(f" TRL version: {trl.__version__}")

try:
    from trl import GRPOConfig, GRPOTrainer
    print(" [OK] GRPO modules imported")
except ImportError as e:
    # Without GRPO there is nothing further to test.
    print(f" [FAIL] Could not import GRPO: {e}")
    sys.exit(1)

# Test 2: Load tokenizer and model
print("\n[2] Loading base GPT-2...")
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("gpt2")
# GPT-2 ships without a pad token; reuse EOS so batched generation works.
tokenizer.pad_token = tokenizer.eos_token
print(" [OK] Tokenizer loaded")

model = AutoModelForCausalLM.from_pretrained("gpt2")
print(" [OK] Model loaded")

# Test 3: Define custom reward function
print("\n[3] Defining custom reward function...")


def custom_reward_func(completions, **kwargs):
    """
    Custom reward function for symbolic regression.
    For testing, just returns length-based rewards.

    Args:
        completions: Sequence of model completions. Each item is expected to
            be a string; non-strings are stringified via str() first.
            NOTE(review): for conversational datasets TRL passes lists of
            message dicts here, which str() would turn into a repr — fine for
            this plain-prompt smoke test, but confirm before reusing.
        **kwargs: Extra columns forwarded by GRPOTrainer; ignored here.

    Returns:
        list[float]: One reward per completion in [0.0, 1.0], higher for
        shorter completions.
    """
    rewards = []
    for completion in completions:
        # Simple reward based on completion content
        text = completion if isinstance(completion, str) else str(completion)
        # Reward shorter completions (for testing)
        reward = max(0.0, 1.0 - len(text) / 100)
        rewards.append(reward)
    return rewards


print(" [OK] Reward function defined")

# Test 4: Create dataset
print("\n[4] Creating training dataset...")
from datasets import Dataset

prompts = [
    '{"vars": ["x_1"], "ops": ["+", "sin"], "cons": null, "expr": "',
    '{"vars": ["x_1", "x_2"], "ops": ["+", "-"], "cons": null, "expr": "',
] * 2  # 4 samples
train_dataset = Dataset.from_dict({"prompt": prompts})
print(f" [OK] Dataset with {len(train_dataset)} samples")

# Test 5: Create GRPOConfig
print("\n[5] Creating GRPOConfig...")
try:
    grpo_config = GRPOConfig(
        output_dir="./output/grpo_test",
        learning_rate=1e-5,
        # batch size must be divisible by num_generations for GRPO grouping
        per_device_train_batch_size=2,
        num_generations=2,
        max_completion_length=30,
        num_train_epochs=1,
        report_to=[],  # Empty list to disable reporting
        use_cpu=True,
        bf16=False,
    )
    print(" [OK] GRPOConfig created")
except Exception as e:
    print(f" [FAIL] GRPOConfig: {e}")
    traceback.print_exc()
    sys.exit(1)

# Test 6: Create GRPOTrainer
print("\n[6] Creating GRPOTrainer...")
try:
    grpo_trainer = GRPOTrainer(
        model=model,
        args=grpo_config,
        processing_class=tokenizer,
        train_dataset=train_dataset,
        reward_funcs=custom_reward_func,
    )
    print(" [OK] GRPOTrainer created!")

    # Test 7: Try a training step (only reachable if the trainer was built)
    print("\n[7] Testing training (1 epoch)...")
    try:
        grpo_trainer.train()
        print(" [OK] GRPO Training completed!")
    except Exception as e:
        print(f" [FAIL] Training failed: {e}")
        traceback.print_exc()
except Exception as e:
    print(f" [FAIL] GRPOTrainer creation failed: {e}")
    traceback.print_exc()

print("\n" + "=" * 60)
print("TEST COMPLETE")
print("=" * 60)