File size: 16,035 Bytes
3742716
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
import sympy
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.metrics import mean_absolute_error
from scipy.optimize import minimize
import math
import re

    
class Expression:
    """A symbolic-regression expression with fittable constants.

    Wraps a SymPy expression over variables named ``x_1 .. x_n`` plus the
    placeholder symbol ``C`` for free constants.  The expression can be
    evaluated vectorized on a NumPy dataset and its constants optimized
    against a target vector with L-BFGS-B (``fit_constants``).
    """

    # Whitelisted callables exposed to eval() in evaluate().
    # NumPy versions keep evaluation vectorized over whole columns.
    SAFE_FUNCTIONS = {
        'sqrt': np.sqrt,
        'log': np.log,
        'exp': np.exp,
        'sin': np.sin,
        'cos': np.cos,
        'tan': np.tan,
        'asin': np.arcsin,
        'abs': np.abs,
        'pow': np.power,
        # '**' is handled by Python's eval; numpy operands dispatch it to np.power.
    }

    # Arity of every operator accepted in prefix notation.
    OPERATOR_ARITY = {
        '+': 2,
        '-': 2,
        '*': 2,
        '/': 2,
        '**': 2,
        'sin': 1,
        'cos': 1,
        'tan': 1,
        'log': 1,
        'sqrt': 1,
        'exp': 1
    }

    # SymPy constructors for each operator (kept for external users of the table).
    OPERATOR_FUNCS = {
        '+': sympy.Add,
        '-': lambda x, y: x - y,
        '*': sympy.Mul,
        '/': lambda x, y: x / y,
        '**': sympy.Pow,  # sympy.Pow covers both '**' and '^'
        'sin': sympy.sin,
        'cos': sympy.cos,
        'tan': sympy.tan,
        'log': sympy.log,
        'sqrt': sympy.sqrt,
        'exp': sympy.exp
    }

    def parse_prefix(self, tokens):
        """Parse a prefix-notation token list into a SymPy expression.

        Example: ['*', 'x_1', '+', 'x_2', 'C'] -> x_1*(x_2 + C)

        Args:
            tokens (list[str]): operators and operands in prefix order.

        Returns:
            sympy.Expr: the parsed expression (unevaluated).

        Raises:
            ValueError: on an empty, malformed, or incomplete token list.
        """
        if not tokens:
            raise ValueError("Empty token list")

        UNARY_OPS = {'sin', 'cos', 'tan', 'exp', 'log', 'sqrt', 'abs', 'asin'}
        BINARY_OPS = {'+', '-', '*', '/', '**', '^'}

        stack = []

        # Scan right-to-left: operands accumulate on the stack; an operator
        # consumes the operands it governs.
        for token in reversed(tokens):
            if token in UNARY_OPS:
                if len(stack) < 1:
                    raise ValueError(f"Not enough operands for {token}")
                arg = stack.pop()
                stack.append(f"{token}({arg})")
            elif token in BINARY_OPS:
                if len(stack) < 2:
                    raise ValueError(f"Not enough operands for {token}")
                # BUG FIX: when scanning in reverse, the FIRST pop is the
                # LEFT operand.  The previous code swapped left/right, which
                # reversed every non-commutative op ('-', '/', '**').
                left = stack.pop()
                right = stack.pop()
                # Normalize '^' to Python's power operator.
                op = '**' if token == '^' else token
                stack.append(f"({left}){op}({right})")
            else:
                # Operand (variable, constant placeholder, or number).
                stack.append(token)

        if len(stack) != 1:
            raise ValueError(f"Invalid prefix expression, {len(stack)} elements remaining")

        return sympy.sympify(stack[0], evaluate=False)

    def __init__(self, expression, is_prefix=False):
        """Build an Expression from an infix string or prefix token string.

        Args:
            expression (str): infix formula (e.g. "x_1*sin(x_2) + C") or a
                whitespace-separated prefix token string when *is_prefix*.
            is_prefix (bool): interpret *expression* as prefix notation.

        Raises:
            ValueError: if the expression cannot be parsed.
        """
        try:
            self.original_expression = expression  # raw input, kept for __str__

            if is_prefix:
                # Normalize '^' to '**' before tokenizing.
                tokens = expression.replace('^', '**').split()
                self.sympy_expression = self.parse_prefix(tokens)
            else:
                # evaluate=False keeps the structure exactly as written.
                self.sympy_expression = sympy.sympify(expression, evaluate=False)
        except Exception as e:
            raise ValueError(f"Failed to parse expression: {e}")

        # Highest variable index in use (variables are x_1 .. x_max_var).
        self.max_var = 0
        for symbol in self.sympy_expression.free_symbols:
            if symbol.name.startswith('x_'):
                try:
                    index = int(symbol.name.split('_')[1])
                    self.max_var = max(self.max_var, index)
                except ValueError:
                    # Symbol looks like x_<non-number>; silently ignored
                    # (raise here instead if strict validation is wanted).
                    pass

        computable_expression = str(self.sympy_expression)

        # Rewrite x_i -> x[i-1] for eval().  The \b guards prevent 'x_1'
        # from matching inside 'x_11'.
        for i in range(1, self.max_var + 1):
            computable_expression = re.sub(rf'\bx_{i}\b', f'x[{i-1}]', computable_expression)

        # A constant used as an exponent ('**C') is pinned to '**2' — this
        # avoids pathological exponent searches during constant optimization.
        self.computable_expression = computable_expression.replace('**C', '**2')

        # Each remaining 'C' is an independent fittable constant.
        # NOTE(review): assumes no other capital-C token survives sympy's
        # printing (function names here are all lowercase) — confirm if the
        # operator vocabulary grows.
        self.constant_count = self.computable_expression.count('C')
        self.best_constants = [1.0] * self.constant_count

        if self.constant_count > 0:
            # Replace the k-th 'C' with 'constants[k]' so evaluate() can
            # index into the fitted constant vector.
            parts = self.computable_expression.split('C')
            self.computable_expression = ''.join(
                part + (f'constants[{k}]' if k < self.constant_count else '')
                for k, part in enumerate(parts)
            )

    def __str__(self):
        return f"Expression: {self.original_expression}, Best constants: {self.best_constants}"

    def sympy_str(self):
        """
        Returns the string representation of the sympy expression.
        """
        return str(self.sympy_expression)

    def is_valid_on_dataset(self, X, test_constants_list=None):
        """
        Checks if the expression evaluates to valid (finite) values for all rows in X,
        across one or more sets of test constants.

        Args:
            X (np.ndarray): Input data, shape (n_samples, n_features)
            test_constants_list (list of lists): Optional. Defaults to [[1.0]*count].
                Example: [[1.0]*n, [0.5]*n, [2.0]*n] to test more thoroughly.

        Returns:
            bool: True if no evaluation returns nan/inf or crashes. False otherwise.
        """
        if test_constants_list is None:
            test_constants_list = [[1.0] * self.constant_count]

        try:
            for constants in test_constants_list:
                results = self.evaluate(X, constants)
                if not np.all(np.isfinite(results)):
                    return False
            return True
        except Exception:
            return False

    def evaluate(self, X, constants=None):
        """Evaluate the expression on a dataset.

        Args:
            X (array-like): shape (n_samples, n_features); a 1-D input is
                treated as a single sample.
            constants (sequence, optional): constant values; defaults to
                ``self.best_constants``.

        Returns:
            np.ndarray: shape (n_samples,), dtype float.  On any evaluation
            error an array of NaNs is returned so loss functions don't break.
        """
        if constants is None:
            constants = self.best_constants

        try:
            # SECURITY NOTE: eval() on the expression string.  The namespace
            # is restricted (no builtins, whitelisted functions only), but
            # expressions must still come from a trusted generator.
            local_env = {
                "constants": np.array(constants),  # ndarray for broadcasting
                **self.SAFE_FUNCTIONS,
                "__builtins__": None
            }

            if not isinstance(X, np.ndarray):
                X = np.array(X)

            # Ensure X is 2D, even for a single sample.
            if X.ndim == 1:
                X = X.reshape(1, -1)

            # 'x' is a list of columns (1D arrays of shape (n_samples,)),
            # matching the x[i] references in computable_expression.
            local_env["x"] = [X[:, i] for i in range(X.shape[1])]

            try:
                y_pred_array = eval(self.computable_expression, local_env)
            except Exception:
                # Covers FloatingPointError and any other eval failure.
                return np.full(X.shape[0], np.nan)
            finally:
                np.seterr(all='warn')  # restore warning behavior

            # Force float dtype to avoid mixed int/float surprises downstream.
            return np.asarray(y_pred_array, dtype=float)

        except Exception:
            # Shape-preserving NaN fallback so callers' loss math survives.
            num_samples = X.shape[0] if X.ndim > 0 else 1
            return np.full(num_samples, np.nan)

    def fit_constants(self, X, y):
        """Optimize the free constants against (X, y) and score the fit.

        Uses L-BFGS-B (bounded to [-2, 2] per constant) on MSE, then reports
        the R² of the best fit.  Updates ``self.best_constants`` in place.

        Args:
            X (array-like): shape (n_samples, n_features).
            y (array-like): shape (n_samples,).

        Returns:
            float: R² score, or -inf on optimization/evaluation failure.
        """
        X = np.array(X)
        y = np.array(y)

        if self.constant_count == 0:
            # Nothing to optimize — just score the fixed expression.
            try:
                y_pred = self.evaluate(X)
                if not np.all(np.isfinite(y_pred)):
                    return -np.inf
                # A constant prediction for a non-constant target makes R²
                # ill-behaved; report 0.0 instead.
                if np.all(y_pred == y_pred[0]) and len(np.unique(y)) > 1:
                    return 0.0
                return r2_score(y, y_pred)
            except Exception:
                return -np.inf

        def loss(current_constants):
            # MSE of the expression at the given constants; inf on any
            # failure so the optimizer steers away.
            try:
                y_pred = self.evaluate(X, current_constants)
            except Exception as e:
                print(f"Exception during evaluation: {e}")
                return np.inf

            if not np.all(np.isfinite(y_pred)):
                return np.inf

            return np.mean((y - y_pred) ** 2)

        bounds = [(-2., 2.)] * self.constant_count

        # Warm-start from previous best constants when they match; otherwise
        # default to 1.0 (BUG FIX: was 0.0, contradicting the documented
        # default and the 1.0 initialization in __init__).
        initial_guess = (
            self.best_constants
            if self.best_constants and len(self.best_constants) == self.constant_count
            else [1.0] * self.constant_count
        )
        initial_guess = np.array(initial_guess, dtype=float).flatten()

        result = minimize(loss,
                          x0=initial_guess,
                          method='L-BFGS-B',
                          bounds=bounds)

        if not result.success:
            return -np.inf

        self.best_constants = result.x.tolist()
        try:
            y_pred = self.evaluate(X)  # uses the freshly fitted constants
            if not np.all(np.isfinite(y_pred)):
                return -np.inf
            # R² edge case: constant target.
            if len(np.unique(y)) == 1:
                return 1.0 if np.allclose(y_pred, y[0]) else 0.0
            return r2_score(y, y_pred)
        except Exception:
            return -np.inf

# from dataset import RegressionDataset

# import numpy as np
# import warnings

# with warnings.catch_warnings():
#     warnings.simplefilter("ignore", category=RuntimeWarning)
#     np.seterr(invalid='ignore')

# #reg = RegressionDataset('../data/evaluate/srsd-feynman_hard/train', 'feynman-bonus.12.txt', delimiter=' ')
# reg = RegressionDataset('./data/evaluate/srsd-feynman_easy/train', 'feynman-i.18.16.txt', delimiter=' ')
# X, y = reg.get_numpy()

# #x = np.array(X).T
# expression = "x_1*x_2*sin(x_4)"
# #expr = "0.5*x[0]*x[1]**2"


# expr = Expression(expression)
# print("Expression:", expr)

# if expr.is_valid_on_dataset(X):
#     print("Expression is valid on dataset.")
#     score = expr.fit_constants(X, y)
#     print("Fitted constants:", expr.best_constants)
#     print("R2 score:", score)
# else:
#     print("Expression is not valid on dataset.")