File size: 3,837 Bytes
c082aa2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/usr/bin/env python3
"""
Create simple test datasets for PPO symbolic regression experiments.
No constants (C) - just simple expressions to verify PPO works.
"""

import numpy as np
import pandas as pd
from pathlib import Path

def create_dataset(formula_func, formula_name, n_vars, n_samples=500,
                   x_range=(-2, 2), output_dir="./data/ppo_test", seed=42):
    """
    Create a synthetic regression dataset and save it as a CSV file.

    Inputs are drawn uniformly from ``x_range`` and the target column is
    obtained by applying ``formula_func`` to the whole input matrix. The CSV
    has columns ``x_1 .. x_{n_vars}`` followed by ``y``.

    Args:
        formula_func: Function that takes the X array of shape
            (n_samples, n_vars) and returns y
        formula_name: Name for the dataset (used as filename)
        n_vars: Number of input variables
        n_samples: Number of data points
        x_range: (low, high) range for random X values
        output_dir: Directory to save CSV files (created if missing)
        seed: Seed for the input sampler. The default (42) reproduces the
            same inputs on every call; ``None`` lets NumPy pick a fresh seed.

    Returns:
        Path of the CSV file that was written.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Generate random input data from a private generator. Calling
    # np.random.seed() here would reset the process-wide RNG as a side effect
    # and disturb any caller relying on its own random state. RandomState
    # yields the same stream as the seeded global generator, so the datasets
    # are unchanged.
    rng = np.random.RandomState(seed)
    X = rng.uniform(x_range[0], x_range[1], (n_samples, n_vars))

    # Compute target; coerce to an array so the .min()/.max() summary below
    # also works when the formula returns a plain sequence.
    y = np.asarray(formula_func(X))

    # Create DataFrame
    columns = [f"x_{i+1}" for i in range(n_vars)] + ["y"]
    data = np.column_stack([X, y])
    df = pd.DataFrame(data, columns=columns)

    # Save to CSV
    output_file = output_dir / f"{formula_name}.csv"
    df.to_csv(output_file, index=False)
    print(f"Created: {output_file} ({n_samples} samples, {n_vars} vars)")
    print(f"  Formula: {formula_name}")
    print(f"  y range: [{y.min():.3f}, {y.max():.3f}]")

    return output_file


def main():
    """Generate every PPO test dataset, grouped into difficulty tiers.

    Each dataset is written through ``create_dataset`` using that function's
    default sample count, input range and output directory. Progress banners
    and section headers are printed to stdout.
    """
    banner = "=" * 60

    # One entry per tier: (section header, datasets). Each dataset is
    # (target function over the X matrix, file stem, number of input vars).
    tiers = (
        # EASY: simple expressions with 2 variables
        ("\n--- EASY DATASETS (2 variables) ---", (
            (lambda X: X[:, 0] + X[:, 1], "add_x1_x2", 2),  # x_1 + x_2
            (lambda X: X[:, 0] * X[:, 1], "mul_x1_x2", 2),  # x_1 * x_2
            (lambda X: X[:, 0] - X[:, 1], "sub_x1_x2", 2),  # x_1 - x_2
        )),
        # MEDIUM: unary functions
        ("\n--- MEDIUM DATASETS (unary functions) ---", (
            (lambda X: np.sin(X[:, 0]), "sin_x1", 1),  # sin(x_1)
            (lambda X: np.cos(X[:, 0]), "cos_x1", 1),  # cos(x_1)
            (lambda X: X[:, 0] * X[:, 0], "square_x1", 1),  # x_1 * x_1 (quadratic)
        )),
        # HARD: composed expressions
        ("\n--- HARD DATASETS (composed expressions) ---", (
            # sin(x_1) + x_2
            (lambda X: np.sin(X[:, 0]) + X[:, 1], "sin_x1_plus_x2", 2),
            # x_1 * sin(x_2)
            (lambda X: X[:, 0] * np.sin(X[:, 1]), "x1_mul_sin_x2", 2),
            # sin(x_1 + x_2)
            (lambda X: np.sin(X[:, 0] + X[:, 1]), "sin_x1_plus_x2_composed", 2),
            # x_1 * x_2 + x_1
            (lambda X: X[:, 0] * X[:, 1] + X[:, 0], "x1_mul_x2_plus_x1", 2),
        )),
    )

    print(banner)
    print("Creating PPO Test Datasets (No Constants)")
    print(banner)

    for header, datasets in tiers:
        print(header)
        for target, stem, var_count in datasets:
            create_dataset(
                formula_func=target,
                formula_name=stem,
                n_vars=var_count,
            )

    print("\n" + banner)
    print("Done! Created 10 test datasets in ./data/ppo_test/")
    print(banner)


# Generate the datasets only when this file is executed as a script, so that
# importing the module (e.g. to reuse create_dataset) writes no files.
if __name__ == "__main__":
    main()