|
|
|
|
|
""" |
|
|
Create simple test datasets for PPO symbolic regression experiments. |
|
|
No constants (C) - just simple expressions to verify PPO works. |
|
|
""" |
|
|
|
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
from pathlib import Path |
|
|
|
|
|
def create_dataset(formula_func, formula_name, n_vars, n_samples=500,
                   x_range=(-2, 2), output_dir="./data/ppo_test", seed=42):
    """
    Create a synthetic regression dataset and write it to a CSV file.

    Inputs are sampled uniformly from ``x_range`` with a private random
    generator, so calling this function leaves NumPy's global random
    state untouched.

    Args:
        formula_func: Function that takes the X array of shape
            (n_samples, n_vars) and returns y (one value per row)
        formula_name: Name for the dataset (used as filename)
        n_vars: Number of input variables
        n_samples: Number of data points
        x_range: (low, high) range for random X values
        output_dir: Directory to save CSV files (created if missing)
        seed: Seed for the private generator; the default (42) yields the
            same samples as seeding the global generator with 42

    Returns:
        Path of the CSV file that was written. Its columns are
        x_1 ... x_<n_vars> followed by y.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # A local RandomState replaces np.random.seed(): it produces the same
    # legacy stream for a given seed, but does not reseed the process-wide
    # generator that other code may depend on.
    rng = np.random.RandomState(seed)
    X = rng.uniform(x_range[0], x_range[1], (n_samples, n_vars))

    # asarray lets formula_func return a plain sequence and still supports
    # the .min()/.max() calls in the summary below.
    y = np.asarray(formula_func(X))

    columns = [f"x_{i+1}" for i in range(n_vars)] + ["y"]
    data = np.column_stack([X, y])
    df = pd.DataFrame(data, columns=columns)

    output_file = output_dir / f"{formula_name}.csv"
    df.to_csv(output_file, index=False)

    print(f"Created: {output_file} ({n_samples} samples, {n_vars} vars)")
    print(f" Formula: {formula_name}")
    print(f" y range: [{y.min():.3f}, {y.max():.3f}]")

    return output_file
|
|
|
|
|
|
|
|
def main():
    """
    Generate the full suite of constant-free PPO test datasets.

    The datasets are grouped into three tiers (easy, medium, hard). Each
    tier prints its heading and then writes one CSV per formula through
    create_dataset, using that function's default sample count, range and
    output directory.
    """
    rule = "=" * 60

    # Each tier: (heading to print, ((dataset name, variable count, formula), ...))
    tiers = (
        ("\n--- EASY DATASETS (2 variables) ---", (
            ("add_x1_x2", 2, lambda X: X[:, 0] + X[:, 1]),
            ("mul_x1_x2", 2, lambda X: X[:, 0] * X[:, 1]),
            ("sub_x1_x2", 2, lambda X: X[:, 0] - X[:, 1]),
        )),
        ("\n--- MEDIUM DATASETS (unary functions) ---", (
            ("sin_x1", 1, lambda X: np.sin(X[:, 0])),
            ("cos_x1", 1, lambda X: np.cos(X[:, 0])),
            ("square_x1", 1, lambda X: X[:, 0] * X[:, 0]),
        )),
        ("\n--- HARD DATASETS (composed expressions) ---", (
            ("sin_x1_plus_x2", 2, lambda X: np.sin(X[:, 0]) + X[:, 1]),
            ("x1_mul_sin_x2", 2, lambda X: X[:, 0] * np.sin(X[:, 1])),
            ("sin_x1_plus_x2_composed", 2, lambda X: np.sin(X[:, 0] + X[:, 1])),
            ("x1_mul_x2_plus_x1", 2, lambda X: X[:, 0] * X[:, 1] + X[:, 0]),
        )),
    )

    print(rule)
    print("Creating PPO Test Datasets (No Constants)")
    print(rule)

    for heading, specs in tiers:
        print(heading)
        for name, var_count, formula in specs:
            create_dataset(
                formula_func=formula,
                formula_name=name,
                n_vars=var_count,
            )

    print("\n" + rule)
    print("Done! Created 10 test datasets in ./data/ppo_test/")
    print(rule)
|
|
|
|
|
|
|
|
# Script entry point: generate the datasets only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|
|