File size: 4,732 Bytes
c082aa2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
#!/usr/bin/env python3
"""
Test script to verify PPO setup with custom reward model.
This tests if the custom SymbolicRegressionRewardModel is compatible
with TRL's PPOTrainer before running the full experiment.
"""
import sys
from pathlib import Path
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
sys.path.insert(0, str(PROJECT_ROOT / "classes"))
import torch
import numpy as np
# Test 1: Check TRL version and available modules.
# Verifies that the installed TRL exposes the experimental PPO API this
# project depends on; aborts early if not, since nothing else can work.
banner = "=" * 60
print(banner)
print("TEST 1: TRL Version and Modules")
print(banner)
import trl
print(f"TRL version: {trl.__version__}")
try:
    from trl.experimental.ppo import PPOConfig, PPOTrainer, AutoModelForCausalLMWithValueHead
except ImportError as e:
    print(f"[FAIL] Failed to import experimental PPO: {e}")
    sys.exit(1)
print("[OK] Experimental PPO modules imported successfully")
# Test 2: Check if our custom reward model works.
# First load the tokenizer that was saved with the fine-tuned model; the
# reward model (tested just below) needs it to encode candidate expressions.
print("\n" + "=" * 60)
print("TEST 2: Custom Reward Model")
print("=" * 60)
from transformers import AutoTokenizer

model_path = "./output/exp_a_json"
try:
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # GPT-2-style tokenizers ship without a pad token; reuse EOS for padding.
    tokenizer.pad_token = tokenizer.eos_token
except Exception as e:
    print(f"[FAIL] Failed to load tokenizer: {e}")
    sys.exit(1)
print(f"[OK] Tokenizer loaded from {model_path}")
# Create dummy regression data for the reward model: 100 samples, 2 features,
# with target y = sin(x1) + x2 (matches the expression probed in Test 2).
n_samples, n_features = 100, 2
X = np.random.randn(n_samples, n_features)
y = np.sin(X[:, 0]) + X[:, 1]
# Prefer GPU when available; everything below runs on this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"[OK] Dummy data created (device: {device})")
# Import and test custom reward model.
# Builds the project's SymbolicRegressionRewardModel and runs one forward
# pass on a sample completion to confirm the TRL-compatible output shape.
try:
    # SequenceClassifierOutput is imported alongside to confirm it is exposed
    # by ppo_experiment_v2, even though it is not referenced directly here.
    from ppo_experiment_v2 import SymbolicRegressionRewardModel, SequenceClassifierOutput
    reward_model = SymbolicRegressionRewardModel(tokenizer, X, y, device)
    reward_model = reward_model.to(device)
    print("[OK] SymbolicRegressionRewardModel created")
    # Test forward pass. BUG FIX: the expression value was previously missing
    # its opening quote ('"expr": sin(x_1) + x_2"'), producing invalid JSON;
    # it is now quoted, matching the prompt format used in Test 5.
    test_text = '{"vars": ["x_1", "x_2"], "ops": ["+", "-"], "cons": null, "expr": "sin(x_1) + x_2"}'
    test_ids = tokenizer(test_text, return_tensors="pt")["input_ids"].to(device)
    output = reward_model(test_ids)
    print(f"[OK] Forward pass works")
    print(f" Output type: {type(output)}")
    print(f" Logits shape: {output.logits.shape}")
    print(f" Logits value: {output.logits.item():.4f}")
except Exception as e:
    print(f"[FAIL] Reward model test failed: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)
# Test 3: Check PPOConfig.
# Constructs a minimal config with the same hyperparameters the full
# experiment would use; failure here is reported but is not fatal.
print("\n" + "=" * 60)
print("TEST 3: PPOConfig")
print("=" * 60)
try:
    ppo_config = PPOConfig(
        output_dir="./output/ppo_test",
        learning_rate=1e-5,
        per_device_train_batch_size=2,
        total_episodes=10,
        num_ppo_epochs=1,
        response_length=30,
        report_to=None,
    )
except Exception as e:
    print(f"[FAIL] PPOConfig failed: {e}")
    import traceback
    traceback.print_exc()
else:
    print(f"[OK] PPOConfig created successfully")
# Test 4: Check model loading.
# Loads the GPT-2 base, applies the saved LoRA adapter, merges it, and wraps
# the result with a value head — the same policy construction PPO would use.
print("\n" + "=" * 60)
print("TEST 4: Model Loading")
print("=" * 60)
try:
    from transformers import AutoModelForCausalLM
    from peft import PeftModel

    base_model = AutoModelForCausalLM.from_pretrained("gpt2", torch_dtype=torch.float32)
    # Keep the embedding table in sync with the (possibly extended) tokenizer.
    if len(tokenizer) != base_model.config.vocab_size:
        base_model.resize_token_embeddings(len(tokenizer))
    merged_model = PeftModel.from_pretrained(base_model, model_path).merge_and_unload()
    print("[OK] Base model and LoRA loaded")
    # Wrap with value head so the policy can estimate state values for PPO.
    policy_model = AutoModelForCausalLMWithValueHead.from_pretrained(merged_model)
    print("[OK] AutoModelForCausalLMWithValueHead created")
except Exception as e:
    print(f"[FAIL] Model loading failed: {e}")
    import traceback
    traceback.print_exc()
# Test 5: Dataset format.
# Builds a tiny HF Dataset of repeated JSON prompts in the shape PPOTrainer
# expects (a "query" column of strings).
print("\n" + "=" * 60)
print("TEST 5: Dataset Format")
print("=" * 60)
try:
    from datasets import Dataset

    prompt = '{"vars": ["x_1", "x_2"], "ops": ["+", "-", "*", "sin", "cos"], "cons": null, "expr": "'
    queries = [prompt for _ in range(10)]
    train_dataset = Dataset.from_dict({"query": queries})
    print(f"[OK] Dataset created with {len(train_dataset)} samples")
    print(f" Sample query: {train_dataset[0]['query'][:50]}...")
except Exception as e:
    print(f"[FAIL] Dataset creation failed: {e}")
# Summary: recap for the operator, plus next steps and a fallback plan
# should the experimental PPOTrainer API prove incompatible.
print("\n" + "=" * 60)
print("SUMMARY")
print("=" * 60)
summary_text = """
All basic tests passed. The custom reward model approach should work.
To run full PPO experiment:
python scripts/ppo_experiment_v2.py --dataset ./data/ppo_test/sin_x1.csv
Note: If PPOTrainer fails due to API incompatibility, consider:
1. Checking TRL source code for exact reward_model interface
2. Using the old TRL 0.11.0 with pip install trl==0.11.0
"""
print(summary_text)
|