#!/usr/bin/env python3
"""
Test script to verify PPO setup with custom reward model.

This tests if the custom SymbolicRegressionRewardModel is compatible
with TRL's PPOTrainer before running the full experiment.
"""
import sys
from pathlib import Path

PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
sys.path.insert(0, str(PROJECT_ROOT / "classes"))

import torch
import numpy as np

# Test 1: Check TRL version and available modules
print("=" * 60)
print("TEST 1: TRL Version and Modules")
print("=" * 60)

import trl

print(f"TRL version: {trl.__version__}")

try:
    from trl.experimental.ppo import PPOConfig, PPOTrainer, AutoModelForCausalLMWithValueHead
    print("[OK] Experimental PPO modules imported successfully")
except ImportError as e:
    print(f"[FAIL] Failed to import experimental PPO: {e}")
    sys.exit(1)

# Test 2: Check if our custom reward model works
print("\n" + "=" * 60)
print("TEST 2: Custom Reward Model")
print("=" * 60)

from transformers import AutoTokenizer

# Load tokenizer
model_path = "./output/exp_a_json"
try:
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    tokenizer.pad_token = tokenizer.eos_token
    print(f"[OK] Tokenizer loaded from {model_path}")
except Exception as e:
    print(f"[FAIL] Failed to load tokenizer: {e}")
    sys.exit(1)

# Create dummy data
X = np.random.randn(100, 2)
y = np.sin(X[:, 0]) + X[:, 1]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"[OK] Dummy data created (device: {device})")

# Import and test custom reward model
try:
    from ppo_experiment_v2 import SymbolicRegressionRewardModel, SequenceClassifierOutput

    reward_model = SymbolicRegressionRewardModel(tokenizer, X, y, device)
    reward_model = reward_model.to(device)
    print("[OK] SymbolicRegressionRewardModel created")

    # Test forward pass
    test_text = '{"vars": ["x_1", "x_2"], "ops": ["+", "-"], "cons": null, "expr": "sin(x_1) + x_2"}'
    test_ids = tokenizer(test_text, return_tensors="pt")["input_ids"].to(device)
    output = reward_model(test_ids)
    print("[OK] Forward pass works")
    print(f"  Output type: {type(output)}")
    print(f"  Logits shape: {output.logits.shape}")
    print(f"  Logits value: {output.logits.item():.4f}")
except Exception as e:
    print(f"[FAIL] Reward model test failed: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)

# Test 3: Check PPOConfig
print("\n" + "=" * 60)
print("TEST 3: PPOConfig")
print("=" * 60)

try:
    ppo_config = PPOConfig(
        output_dir="./output/ppo_test",
        learning_rate=1e-5,
        per_device_train_batch_size=2,
        total_episodes=10,
        num_ppo_epochs=1,
        response_length=30,
        report_to="none",  # the string "none" disables logging integrations; None does not
    )
    print("[OK] PPOConfig created successfully")
except Exception as e:
    print(f"[FAIL] PPOConfig failed: {e}")
    import traceback
    traceback.print_exc()

# Test 4: Check model loading
print("\n" + "=" * 60)
print("TEST 4: Model Loading")
print("=" * 60)

try:
    from transformers import AutoModelForCausalLM
    from peft import PeftModel

    base_model = AutoModelForCausalLM.from_pretrained("gpt2", torch_dtype=torch.float32)
    if len(tokenizer) != base_model.config.vocab_size:
        base_model.resize_token_embeddings(len(tokenizer))
    model_with_lora = PeftModel.from_pretrained(base_model, model_path)
    merged_model = model_with_lora.merge_and_unload()
    print("[OK] Base model and LoRA loaded")

    # Wrap with value head
    policy_model = AutoModelForCausalLMWithValueHead.from_pretrained(merged_model)
    print("[OK] AutoModelForCausalLMWithValueHead created")
except Exception as e:
    print(f"[FAIL] Model loading failed: {e}")
    import traceback
    traceback.print_exc()
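# Optional extra check (a sketch beyond the original test plan): classic TRL
# value-head wrappers return a (lm_logits, loss, value) tuple from forward().
# Whether the experimental wrapper keeps that convention is an assumption, so
# a mismatch here is reported as a warning rather than a failure.
try:
    with torch.no_grad():
        lm_logits, _, values = policy_model(test_ids.cpu())
    print(f"[OK] Value head forward works (values shape: {tuple(values.shape)})")
except Exception as e:
    print(f"[WARN] Value-head forward check skipped (convention may differ): {e}")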
* 60) print("TEST 5: Dataset Format") print("=" * 60) try: from datasets import Dataset prompt = '{"vars": ["x_1", "x_2"], "ops": ["+", "-", "*", "sin", "cos"], "cons": null, "expr": "' train_dataset = Dataset.from_dict({"query": [prompt] * 10}) print(f"[OK] Dataset created with {len(train_dataset)} samples") print(f" Sample query: {train_dataset[0]['query'][:50]}...") except Exception as e: print(f"[FAIL] Dataset creation failed: {e}") # Summary print("\n" + "=" * 60) print("SUMMARY") print("=" * 60) print(""" All basic tests passed. The custom reward model approach should work. To run full PPO experiment: python scripts/ppo_experiment_v2.py --dataset ./data/ppo_test/sin_x1.csv Note: If PPOTrainer fails due to API incompatibility, consider: 1. Checking TRL source code for exact reward_model interface 2. Using the old TRL 0.11.0 with pip install trl==0.11.0 """)