#!/usr/bin/env python3
"""
Create simple test datasets for PPO symbolic regression experiments.
No constants (C) - just simple expressions to verify PPO works.
"""
import numpy as np
import pandas as pd
from pathlib import Path
def create_dataset(formula_func, formula_name, n_vars, n_samples=500,
                   x_range=(-2, 2), output_dir="./data/ppo_test"):
    """
    Create a synthetic regression dataset and save it as a CSV file.

    Inputs are sampled uniformly from ``x_range`` with a fixed seed (42), so
    calls with the same ``n_samples``, ``n_vars`` and ``x_range`` always see
    the same X values. The CSV has columns ``x_1 .. x_{n_vars}`` followed
    by ``y``.

    Args:
        formula_func: Function that takes the X array of shape
            (n_samples, n_vars) and returns y (one value per row)
        formula_name: Name for the dataset (used as filename, without ".csv")
        n_vars: Number of input variables
        n_samples: Number of data points
        x_range: (low, high) range for random X values
        output_dir: Directory to save CSV files (created if missing)

    Returns:
        Path of the CSV file that was written.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Generate random input data from a private, seeded generator.
    # RandomState(42) yields the same stream as the legacy
    # np.random.seed(42) + np.random.uniform(...) pair, but does not reset
    # the process-wide NumPy RNG that callers may have seeded themselves.
    rng = np.random.RandomState(42)
    low, high = x_range
    X = rng.uniform(low, high, (n_samples, n_vars))

    # Compute target; asarray guarantees .min()/.max() exist below even if
    # the formula returns a plain sequence.
    y = np.asarray(formula_func(X))

    # Create DataFrame
    columns = [f"x_{i+1}" for i in range(n_vars)] + ["y"]
    data = np.column_stack([X, y])
    df = pd.DataFrame(data, columns=columns)

    # Save to CSV
    output_file = output_dir / f"{formula_name}.csv"
    df.to_csv(output_file, index=False)

    print(f"Created: {output_file} ({n_samples} samples, {n_vars} vars)")
    print(f" Formula: {formula_name}")
    print(f" y range: [{y.min():.3f}, {y.max():.3f}]")
    return output_file
def main():
    """Write every PPO test dataset to disk, one difficulty tier at a time.

    Each tier prints its heading and then calls ``create_dataset`` once per
    entry, in the order listed, using that function's default sample count,
    input range and output directory.
    """
    rule = "=" * 60

    # (tier heading, ((dataset name, number of inputs, target formula), ...))
    tiers = (
        # EASY: Simple expressions with 2 variables
        ("\n--- EASY DATASETS (2 variables) ---", (
            ("add_x1_x2", 2, lambda X: X[:, 0] + X[:, 1]),   # x_1 + x_2
            ("mul_x1_x2", 2, lambda X: X[:, 0] * X[:, 1]),   # x_1 * x_2
            ("sub_x1_x2", 2, lambda X: X[:, 0] - X[:, 1]),   # x_1 - x_2
        )),
        # MEDIUM: Unary functions
        ("\n--- MEDIUM DATASETS (unary functions) ---", (
            ("sin_x1", 1, lambda X: np.sin(X[:, 0])),        # sin(x_1)
            ("cos_x1", 1, lambda X: np.cos(X[:, 0])),        # cos(x_1)
            ("square_x1", 1, lambda X: X[:, 0] * X[:, 0]),   # x_1 * x_1
        )),
        # HARD: Composed expressions
        ("\n--- HARD DATASETS (composed expressions) ---", (
            # sin(x_1) + x_2
            ("sin_x1_plus_x2", 2, lambda X: np.sin(X[:, 0]) + X[:, 1]),
            # x_1 * sin(x_2)
            ("x1_mul_sin_x2", 2, lambda X: X[:, 0] * np.sin(X[:, 1])),
            # sin(x_1 + x_2)
            ("sin_x1_plus_x2_composed", 2,
             lambda X: np.sin(X[:, 0] + X[:, 1])),
            # x_1 * x_2 + x_1
            ("x1_mul_x2_plus_x1", 2,
             lambda X: X[:, 0] * X[:, 1] + X[:, 0]),
        )),
    )

    print(rule)
    print("Creating PPO Test Datasets (No Constants)")
    print(rule)

    for heading, specs in tiers:
        print(heading)
        for name, arity, target in specs:
            create_dataset(
                formula_func=target,
                formula_name=name,
                n_vars=arity,
            )

    print("\n" + rule)
    print("Done! Created 10 test datasets in ./data/ppo_test/")
    print(rule)
# Generate the datasets only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()