Spaces:
Running
Running
Merge pull request #609 from MilesCranmer/cleanup
Browse filesMore extensive typing stubs and associated refactoring
- .gitignore +2 -0
- environment.yml +0 -1
- pyproject.toml +6 -0
- pysr/denoising.py +17 -4
- pysr/export_latex.py +12 -0
- pysr/export_numpy.py +10 -2
- pysr/export_sympy.py +7 -5
- pysr/feature_selection.py +19 -3
- pysr/julia_helpers.py +17 -5
- pysr/julia_import.py +5 -0
- pysr/sr.py +260 -156
- pysr/test/test.py +13 -3
- pysr/utils.py +10 -2
- requirements.txt +0 -1
.gitignore
CHANGED
|
@@ -23,3 +23,5 @@ site
|
|
| 23 |
**/*.code-workspace
|
| 24 |
**/*.tar.gz
|
| 25 |
venv
|
|
|
|
|
|
|
|
|
| 23 |
**/*.code-workspace
|
| 24 |
**/*.tar.gz
|
| 25 |
venv
|
| 26 |
+
requirements-dev.lock
|
| 27 |
+
requirements.lock
|
environment.yml
CHANGED
|
@@ -9,4 +9,3 @@ dependencies:
|
|
| 9 |
- scikit-learn>=1.0.0,<2.0.0
|
| 10 |
- pyjuliacall>=0.9.15,<0.10.0
|
| 11 |
- click>=7.0.0,<9.0.0
|
| 12 |
-
- typing_extensions>=4.0.0,<5.0.0
|
|
|
|
| 9 |
- scikit-learn>=1.0.0,<2.0.0
|
| 10 |
- pyjuliacall>=0.9.15,<0.10.0
|
| 11 |
- click>=7.0.0,<9.0.0
|
|
|
pyproject.toml
CHANGED
|
@@ -35,4 +35,10 @@ dev-dependencies = [
|
|
| 35 |
"pre-commit>=3.7.0",
|
| 36 |
"ipython>=8.23.0",
|
| 37 |
"ipykernel>=6.29.4",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
]
|
|
|
|
| 35 |
"pre-commit>=3.7.0",
|
| 36 |
"ipython>=8.23.0",
|
| 37 |
"ipykernel>=6.29.4",
|
| 38 |
+
"mypy>=1.10.0",
|
| 39 |
+
"jax[cpu]>=0.4.26",
|
| 40 |
+
"torch>=2.3.0",
|
| 41 |
+
"pandas-stubs>=2.2.1.240316",
|
| 42 |
+
"types-pytz>=2024.1.0.20240417",
|
| 43 |
+
"types-openpyxl>=3.1.0.20240428",
|
| 44 |
]
|
pysr/denoising.py
CHANGED
|
@@ -1,9 +1,17 @@
|
|
| 1 |
"""Functions for denoising data during preprocessing."""
|
| 2 |
|
|
|
|
|
|
|
| 3 |
import numpy as np
|
|
|
|
| 4 |
|
| 5 |
|
| 6 |
-
def denoise(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
"""Denoise the dataset using a Gaussian process."""
|
| 8 |
from sklearn.gaussian_process import GaussianProcessRegressor
|
| 9 |
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel
|
|
@@ -15,12 +23,17 @@ def denoise(X, y, Xresampled=None, random_state=None):
|
|
| 15 |
gpr.fit(X, y)
|
| 16 |
|
| 17 |
if Xresampled is not None:
|
| 18 |
-
return Xresampled, gpr.predict(Xresampled)
|
| 19 |
|
| 20 |
-
return X, gpr.predict(X)
|
| 21 |
|
| 22 |
|
| 23 |
-
def multi_denoise(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
"""Perform `denoise` along each column of `y` independently."""
|
| 25 |
y = np.stack(
|
| 26 |
[
|
|
|
|
| 1 |
"""Functions for denoising data during preprocessing."""
|
| 2 |
|
| 3 |
+
from typing import Optional, Tuple, cast
|
| 4 |
+
|
| 5 |
import numpy as np
|
| 6 |
+
from numpy import ndarray
|
| 7 |
|
| 8 |
|
| 9 |
+
def denoise(
|
| 10 |
+
X: ndarray,
|
| 11 |
+
y: ndarray,
|
| 12 |
+
Xresampled: Optional[ndarray] = None,
|
| 13 |
+
random_state: Optional[np.random.RandomState] = None,
|
| 14 |
+
) -> Tuple[ndarray, ndarray]:
|
| 15 |
"""Denoise the dataset using a Gaussian process."""
|
| 16 |
from sklearn.gaussian_process import GaussianProcessRegressor
|
| 17 |
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel
|
|
|
|
| 23 |
gpr.fit(X, y)
|
| 24 |
|
| 25 |
if Xresampled is not None:
|
| 26 |
+
return Xresampled, cast(ndarray, gpr.predict(Xresampled))
|
| 27 |
|
| 28 |
+
return X, cast(ndarray, gpr.predict(X))
|
| 29 |
|
| 30 |
|
| 31 |
+
def multi_denoise(
|
| 32 |
+
X: ndarray,
|
| 33 |
+
y: ndarray,
|
| 34 |
+
Xresampled: Optional[ndarray] = None,
|
| 35 |
+
random_state: Optional[np.random.RandomState] = None,
|
| 36 |
+
):
|
| 37 |
"""Perform `denoise` along each column of `y` independently."""
|
| 38 |
y = np.stack(
|
| 39 |
[
|
pysr/export_latex.py
CHANGED
|
@@ -153,3 +153,15 @@ def sympy2multilatextable(
|
|
| 153 |
]
|
| 154 |
|
| 155 |
return "\n\n".join(latex_tables)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
]
|
| 154 |
|
| 155 |
return "\n\n".join(latex_tables)
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
def with_preamble(table_string: str) -> str:
|
| 159 |
+
preamble_string = [
|
| 160 |
+
r"\usepackage{breqn}",
|
| 161 |
+
r"\usepackage{booktabs}",
|
| 162 |
+
"",
|
| 163 |
+
"...",
|
| 164 |
+
"",
|
| 165 |
+
table_string,
|
| 166 |
+
]
|
| 167 |
+
return "\n".join(preamble_string)
|
pysr/export_numpy.py
CHANGED
|
@@ -1,10 +1,12 @@
|
|
| 1 |
"""Code for exporting discovered expressions to numpy"""
|
| 2 |
|
| 3 |
import warnings
|
|
|
|
| 4 |
|
| 5 |
import numpy as np
|
| 6 |
import pandas as pd
|
| 7 |
-
from
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
def sympy2numpy(eqn, sympy_symbols, *, selection=None):
|
|
@@ -14,6 +16,10 @@ def sympy2numpy(eqn, sympy_symbols, *, selection=None):
|
|
| 14 |
class CallableEquation:
|
| 15 |
"""Simple wrapper for numpy lambda functions built with sympy"""
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
def __init__(self, eqn, sympy_symbols, selection=None):
|
| 18 |
self._sympy = eqn
|
| 19 |
self._sympy_symbols = sympy_symbols
|
|
@@ -29,8 +35,9 @@ class CallableEquation:
|
|
| 29 |
return self._lambda(
|
| 30 |
**{k: X[k].values for k in map(str, self._sympy_symbols)}
|
| 31 |
) * np.ones(expected_shape)
|
|
|
|
| 32 |
if self._selection is not None:
|
| 33 |
-
if X.shape[1] !=
|
| 34 |
warnings.warn(
|
| 35 |
"`X` should be of shape (n_samples, len(self._selection)). "
|
| 36 |
"Automatically filtering `X` to selection. "
|
|
@@ -38,6 +45,7 @@ class CallableEquation:
|
|
| 38 |
"this may lead to incorrect predictions and other errors."
|
| 39 |
)
|
| 40 |
X = X[:, self._selection]
|
|
|
|
| 41 |
return self._lambda(*X.T) * np.ones(expected_shape)
|
| 42 |
|
| 43 |
@property
|
|
|
|
| 1 |
"""Code for exporting discovered expressions to numpy"""
|
| 2 |
|
| 3 |
import warnings
|
| 4 |
+
from typing import List, Union
|
| 5 |
|
| 6 |
import numpy as np
|
| 7 |
import pandas as pd
|
| 8 |
+
from numpy.typing import NDArray
|
| 9 |
+
from sympy import Expr, Symbol, lambdify
|
| 10 |
|
| 11 |
|
| 12 |
def sympy2numpy(eqn, sympy_symbols, *, selection=None):
|
|
|
|
| 16 |
class CallableEquation:
|
| 17 |
"""Simple wrapper for numpy lambda functions built with sympy"""
|
| 18 |
|
| 19 |
+
_sympy: Expr
|
| 20 |
+
_sympy_symbols: List[Symbol]
|
| 21 |
+
_selection: Union[NDArray[np.bool_], None]
|
| 22 |
+
|
| 23 |
def __init__(self, eqn, sympy_symbols, selection=None):
|
| 24 |
self._sympy = eqn
|
| 25 |
self._sympy_symbols = sympy_symbols
|
|
|
|
| 35 |
return self._lambda(
|
| 36 |
**{k: X[k].values for k in map(str, self._sympy_symbols)}
|
| 37 |
) * np.ones(expected_shape)
|
| 38 |
+
|
| 39 |
if self._selection is not None:
|
| 40 |
+
if X.shape[1] != self._selection.sum():
|
| 41 |
warnings.warn(
|
| 42 |
"`X` should be of shape (n_samples, len(self._selection)). "
|
| 43 |
"Automatically filtering `X` to selection. "
|
|
|
|
| 45 |
"this may lead to incorrect predictions and other errors."
|
| 46 |
)
|
| 47 |
X = X[:, self._selection]
|
| 48 |
+
|
| 49 |
return self._lambda(*X.T) * np.ones(expected_shape)
|
| 50 |
|
| 51 |
@property
|
pysr/export_sympy.py
CHANGED
|
@@ -5,6 +5,8 @@ from typing import Callable, Dict, List, Optional
|
|
| 5 |
import sympy
|
| 6 |
from sympy import sympify
|
| 7 |
|
|
|
|
|
|
|
| 8 |
sympy_mappings = {
|
| 9 |
"div": lambda x, y: x / y,
|
| 10 |
"mult": lambda x, y: x * y,
|
|
@@ -30,8 +32,8 @@ sympy_mappings = {
|
|
| 30 |
"acosh": lambda x: sympy.acosh(x),
|
| 31 |
"acosh_abs": lambda x: sympy.acosh(abs(x) + 1),
|
| 32 |
"asinh": sympy.asinh,
|
| 33 |
-
"atanh": lambda x: sympy.atanh(sympy.Mod(x + 1, 2) - 1),
|
| 34 |
-
"atanh_clip": lambda x: sympy.atanh(sympy.Mod(x + 1, 2) - 1),
|
| 35 |
"abs": abs,
|
| 36 |
"mod": sympy.Mod,
|
| 37 |
"erf": sympy.erf,
|
|
@@ -60,13 +62,13 @@ sympy_mappings = {
|
|
| 60 |
|
| 61 |
|
| 62 |
def create_sympy_symbols_map(
|
| 63 |
-
feature_names_in:
|
| 64 |
) -> Dict[str, sympy.Symbol]:
|
| 65 |
return {variable: sympy.Symbol(variable) for variable in feature_names_in}
|
| 66 |
|
| 67 |
|
| 68 |
def create_sympy_symbols(
|
| 69 |
-
feature_names_in:
|
| 70 |
) -> List[sympy.Symbol]:
|
| 71 |
return [sympy.Symbol(variable) for variable in feature_names_in]
|
| 72 |
|
|
@@ -74,7 +76,7 @@ def create_sympy_symbols(
|
|
| 74 |
def pysr2sympy(
|
| 75 |
equation: str,
|
| 76 |
*,
|
| 77 |
-
feature_names_in: Optional[
|
| 78 |
extra_sympy_mappings: Optional[Dict[str, Callable]] = None,
|
| 79 |
):
|
| 80 |
if feature_names_in is None:
|
|
|
|
| 5 |
import sympy
|
| 6 |
from sympy import sympify
|
| 7 |
|
| 8 |
+
from .utils import ArrayLike
|
| 9 |
+
|
| 10 |
sympy_mappings = {
|
| 11 |
"div": lambda x, y: x / y,
|
| 12 |
"mult": lambda x, y: x * y,
|
|
|
|
| 32 |
"acosh": lambda x: sympy.acosh(x),
|
| 33 |
"acosh_abs": lambda x: sympy.acosh(abs(x) + 1),
|
| 34 |
"asinh": sympy.asinh,
|
| 35 |
+
"atanh": lambda x: sympy.atanh(sympy.Mod(x + 1, 2) - sympy.S(1)),
|
| 36 |
+
"atanh_clip": lambda x: sympy.atanh(sympy.Mod(x + 1, 2) - sympy.S(1)),
|
| 37 |
"abs": abs,
|
| 38 |
"mod": sympy.Mod,
|
| 39 |
"erf": sympy.erf,
|
|
|
|
| 62 |
|
| 63 |
|
| 64 |
def create_sympy_symbols_map(
|
| 65 |
+
feature_names_in: ArrayLike[str],
|
| 66 |
) -> Dict[str, sympy.Symbol]:
|
| 67 |
return {variable: sympy.Symbol(variable) for variable in feature_names_in}
|
| 68 |
|
| 69 |
|
| 70 |
def create_sympy_symbols(
|
| 71 |
+
feature_names_in: ArrayLike[str],
|
| 72 |
) -> List[sympy.Symbol]:
|
| 73 |
return [sympy.Symbol(variable) for variable in feature_names_in]
|
| 74 |
|
|
|
|
| 76 |
def pysr2sympy(
|
| 77 |
equation: str,
|
| 78 |
*,
|
| 79 |
+
feature_names_in: Optional[ArrayLike[str]] = None,
|
| 80 |
extra_sympy_mappings: Optional[Dict[str, Callable]] = None,
|
| 81 |
):
|
| 82 |
if feature_names_in is None:
|
pysr/feature_selection.py
CHANGED
|
@@ -1,9 +1,20 @@
|
|
| 1 |
"""Functions for doing feature selection during preprocessing."""
|
| 2 |
|
|
|
|
|
|
|
| 3 |
import numpy as np
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
|
| 6 |
-
def run_feature_selection(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
"""
|
| 8 |
Find most important features.
|
| 9 |
|
|
@@ -21,11 +32,16 @@ def run_feature_selection(X, y, select_k_features, random_state=None):
|
|
| 21 |
selector = SelectFromModel(
|
| 22 |
clf, threshold=-np.inf, max_features=select_k_features, prefit=True
|
| 23 |
)
|
| 24 |
-
return selector.get_support(indices=
|
| 25 |
|
| 26 |
|
| 27 |
# Function has not been removed only due to usage in module tests
|
| 28 |
-
def _handle_feature_selection(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
if select_k_features is not None:
|
| 30 |
selection = run_feature_selection(X, y, select_k_features)
|
| 31 |
print(f"Using features {[variable_names[i] for i in selection]}")
|
|
|
|
| 1 |
"""Functions for doing feature selection during preprocessing."""
|
| 2 |
|
| 3 |
+
from typing import Optional, cast
|
| 4 |
+
|
| 5 |
import numpy as np
|
| 6 |
+
from numpy import ndarray
|
| 7 |
+
from numpy.typing import NDArray
|
| 8 |
+
|
| 9 |
+
from .utils import ArrayLike
|
| 10 |
|
| 11 |
|
| 12 |
+
def run_feature_selection(
|
| 13 |
+
X: ndarray,
|
| 14 |
+
y: ndarray,
|
| 15 |
+
select_k_features: int,
|
| 16 |
+
random_state: Optional[np.random.RandomState] = None,
|
| 17 |
+
) -> NDArray[np.bool_]:
|
| 18 |
"""
|
| 19 |
Find most important features.
|
| 20 |
|
|
|
|
| 32 |
selector = SelectFromModel(
|
| 33 |
clf, threshold=-np.inf, max_features=select_k_features, prefit=True
|
| 34 |
)
|
| 35 |
+
return cast(NDArray[np.bool_], selector.get_support(indices=False))
|
| 36 |
|
| 37 |
|
| 38 |
# Function has not been removed only due to usage in module tests
|
| 39 |
+
def _handle_feature_selection(
|
| 40 |
+
X: ndarray,
|
| 41 |
+
select_k_features: Optional[int],
|
| 42 |
+
y: ndarray,
|
| 43 |
+
variable_names: ArrayLike[str],
|
| 44 |
+
):
|
| 45 |
if select_k_features is not None:
|
| 46 |
selection = run_feature_selection(X, y, select_k_features)
|
| 47 |
print(f"Using features {[variable_names[i] for i in selection]}")
|
pysr/julia_helpers.py
CHANGED
|
@@ -1,11 +1,16 @@
|
|
| 1 |
"""Functions for initializing the Julia environment and installing deps."""
|
| 2 |
|
|
|
|
|
|
|
| 3 |
import numpy as np
|
| 4 |
from juliacall import convert as jl_convert # type: ignore
|
|
|
|
| 5 |
|
| 6 |
from .deprecated import init_julia, install
|
| 7 |
from .julia_import import jl
|
| 8 |
|
|
|
|
|
|
|
| 9 |
jl.seval("using Serialization: Serialization")
|
| 10 |
jl.seval("using PythonCall: PythonCall")
|
| 11 |
|
|
@@ -22,24 +27,31 @@ def _escape_filename(filename):
|
|
| 22 |
return str_repr
|
| 23 |
|
| 24 |
|
| 25 |
-
def _load_cluster_manager(cluster_manager):
|
| 26 |
jl.seval(f"using ClusterManagers: addprocs_{cluster_manager}")
|
| 27 |
return jl.seval(f"addprocs_{cluster_manager}")
|
| 28 |
|
| 29 |
|
| 30 |
-
def jl_array(x):
|
| 31 |
if x is None:
|
| 32 |
return None
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
|
| 36 |
-
def jl_serialize(obj):
|
| 37 |
buf = jl.IOBuffer()
|
| 38 |
Serialization.serialize(buf, obj)
|
| 39 |
return np.array(jl.take_b(buf))
|
| 40 |
|
| 41 |
|
| 42 |
-
def jl_deserialize(s):
|
| 43 |
if s is None:
|
| 44 |
return s
|
| 45 |
buf = jl.IOBuffer()
|
|
|
|
| 1 |
"""Functions for initializing the Julia environment and installing deps."""
|
| 2 |
|
| 3 |
+
from typing import Any, Callable, Union, cast
|
| 4 |
+
|
| 5 |
import numpy as np
|
| 6 |
from juliacall import convert as jl_convert # type: ignore
|
| 7 |
+
from numpy.typing import NDArray
|
| 8 |
|
| 9 |
from .deprecated import init_julia, install
|
| 10 |
from .julia_import import jl
|
| 11 |
|
| 12 |
+
jl_convert = cast(Callable[[Any, Any], Any], jl_convert)
|
| 13 |
+
|
| 14 |
jl.seval("using Serialization: Serialization")
|
| 15 |
jl.seval("using PythonCall: PythonCall")
|
| 16 |
|
|
|
|
| 27 |
return str_repr
|
| 28 |
|
| 29 |
|
| 30 |
+
def _load_cluster_manager(cluster_manager: str):
|
| 31 |
jl.seval(f"using ClusterManagers: addprocs_{cluster_manager}")
|
| 32 |
return jl.seval(f"addprocs_{cluster_manager}")
|
| 33 |
|
| 34 |
|
| 35 |
+
def jl_array(x, dtype=None):
|
| 36 |
if x is None:
|
| 37 |
return None
|
| 38 |
+
elif dtype is None:
|
| 39 |
+
return jl_convert(jl.Array, x)
|
| 40 |
+
else:
|
| 41 |
+
return jl_convert(jl.Array[dtype], x)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def jl_is_function(f) -> bool:
|
| 45 |
+
return cast(bool, jl.seval("op -> op isa Function")(f))
|
| 46 |
|
| 47 |
|
| 48 |
+
def jl_serialize(obj: Any) -> NDArray[np.uint8]:
|
| 49 |
buf = jl.IOBuffer()
|
| 50 |
Serialization.serialize(buf, obj)
|
| 51 |
return np.array(jl.take_b(buf))
|
| 52 |
|
| 53 |
|
| 54 |
+
def jl_deserialize(s: Union[NDArray[np.uint8], None]):
|
| 55 |
if s is None:
|
| 56 |
return s
|
| 57 |
buf = jl.IOBuffer()
|
pysr/julia_import.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
| 1 |
import os
|
| 2 |
import sys
|
| 3 |
import warnings
|
|
|
|
|
|
|
| 4 |
|
| 5 |
# Check if JuliaCall is already loaded, and if so, warn the user
|
| 6 |
# about the relevant environment variables. If not loaded,
|
|
@@ -42,6 +44,9 @@ if autoload_extensions is not None:
|
|
| 42 |
|
| 43 |
from juliacall import Main as jl # type: ignore
|
| 44 |
|
|
|
|
|
|
|
|
|
|
| 45 |
jl_version = (jl.VERSION.major, jl.VERSION.minor, jl.VERSION.patch)
|
| 46 |
|
| 47 |
jl.seval("using SymbolicRegression")
|
|
|
|
| 1 |
import os
|
| 2 |
import sys
|
| 3 |
import warnings
|
| 4 |
+
from types import ModuleType
|
| 5 |
+
from typing import cast
|
| 6 |
|
| 7 |
# Check if JuliaCall is already loaded, and if so, warn the user
|
| 8 |
# about the relevant environment variables. If not loaded,
|
|
|
|
| 44 |
|
| 45 |
from juliacall import Main as jl # type: ignore
|
| 46 |
|
| 47 |
+
jl = cast(ModuleType, jl)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
jl_version = (jl.VERSION.major, jl.VERSION.minor, jl.VERSION.patch)
|
| 51 |
|
| 52 |
jl.seval("using SymbolicRegression")
|
pysr/sr.py
CHANGED
|
@@ -8,27 +8,31 @@ import shutil
|
|
| 8 |
import sys
|
| 9 |
import tempfile
|
| 10 |
import warnings
|
|
|
|
| 11 |
from datetime import datetime
|
| 12 |
from io import StringIO
|
| 13 |
from multiprocessing import cpu_count
|
| 14 |
from pathlib import Path
|
| 15 |
-
from typing import Callable, Dict, List, Optional, Tuple, Union
|
| 16 |
-
|
| 17 |
-
if sys.version_info >= (3, 8):
|
| 18 |
-
from typing import Literal
|
| 19 |
-
else:
|
| 20 |
-
from typing_extensions import Literal
|
| 21 |
|
| 22 |
import numpy as np
|
| 23 |
import pandas as pd
|
|
|
|
|
|
|
| 24 |
from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin
|
| 25 |
from sklearn.utils import check_array, check_consistent_length, check_random_state
|
| 26 |
-
from sklearn.utils.validation import _check_feature_names_in
|
|
|
|
| 27 |
|
| 28 |
from .denoising import denoise, multi_denoise
|
| 29 |
from .deprecated import DEPRECATED_KWARGS
|
| 30 |
from .export_jax import sympy2jax
|
| 31 |
-
from .export_latex import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
from .export_numpy import sympy2numpy
|
| 33 |
from .export_sympy import assert_valid_sympy_symbol, create_sympy_symbols, pysr2sympy
|
| 34 |
from .export_torch import sympy2torch
|
|
@@ -40,17 +44,20 @@ from .julia_helpers import (
|
|
| 40 |
_load_cluster_manager,
|
| 41 |
jl_array,
|
| 42 |
jl_deserialize,
|
|
|
|
| 43 |
jl_serialize,
|
| 44 |
)
|
| 45 |
from .julia_import import SymbolicRegression, jl
|
| 46 |
from .utils import (
|
|
|
|
|
|
|
| 47 |
_csv_filename_to_pkl_filename,
|
| 48 |
_preprocess_julia_floats,
|
| 49 |
_safe_check_feature_names_in,
|
| 50 |
_subscriptify,
|
| 51 |
)
|
| 52 |
|
| 53 |
-
|
| 54 |
|
| 55 |
|
| 56 |
def _process_constraints(binary_operators, unary_operators, constraints):
|
|
@@ -178,6 +185,21 @@ def _check_assertions(
|
|
| 178 |
VALID_OPTIMIZER_ALGORITHMS = ["BFGS", "NelderMead"]
|
| 179 |
|
| 180 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
| 182 |
"""
|
| 183 |
High-performance symbolic regression algorithm.
|
|
@@ -606,22 +628,17 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 606 |
Units of each variable in the training dataset, `y`.
|
| 607 |
nout_ : int
|
| 608 |
Number of output dimensions.
|
| 609 |
-
selection_mask_ :
|
| 610 |
-
|
| 611 |
-
`select_k_features` is set.
|
| 612 |
tempdir_ : Path
|
| 613 |
Path to the temporary equations directory.
|
| 614 |
-
equation_file_ : str
|
| 615 |
Output equation file name produced by the julia backend.
|
| 616 |
julia_state_stream_ : ndarray
|
| 617 |
The serialized state for the julia SymbolicRegression.jl backend (after fitting),
|
| 618 |
stored as an array of uint8, produced by Julia's Serialization.serialize function.
|
| 619 |
-
julia_state_
|
| 620 |
-
The deserialized state.
|
| 621 |
julia_options_stream_ : ndarray
|
| 622 |
The serialized julia options, stored as an array of uint8,
|
| 623 |
-
julia_options_
|
| 624 |
-
The deserialized julia options.
|
| 625 |
equation_file_contents_ : list[pandas.DataFrame]
|
| 626 |
Contents of the equation file output by the Julia backend.
|
| 627 |
show_pickle_warnings_ : bool
|
|
@@ -668,6 +685,21 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 668 |
```
|
| 669 |
"""
|
| 670 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 671 |
def __init__(
|
| 672 |
self,
|
| 673 |
model_selection: Literal["best", "accuracy", "score"] = "best",
|
|
@@ -900,14 +932,14 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 900 |
@classmethod
|
| 901 |
def from_file(
|
| 902 |
cls,
|
| 903 |
-
equation_file,
|
| 904 |
*,
|
| 905 |
-
binary_operators=None,
|
| 906 |
-
unary_operators=None,
|
| 907 |
-
n_features_in=None,
|
| 908 |
-
feature_names_in=None,
|
| 909 |
-
selection_mask=None,
|
| 910 |
-
nout=1,
|
| 911 |
**pysr_kwargs,
|
| 912 |
):
|
| 913 |
"""
|
|
@@ -915,7 +947,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 915 |
|
| 916 |
Parameters
|
| 917 |
----------
|
| 918 |
-
equation_file : str
|
| 919 |
Path to a pickle file containing a saved model, or a csv file
|
| 920 |
containing equations.
|
| 921 |
binary_operators : list[str]
|
|
@@ -930,8 +962,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 930 |
feature_names_in : list[str]
|
| 931 |
Names of the features passed to the model.
|
| 932 |
Not needed if loading from a pickle file.
|
| 933 |
-
selection_mask :
|
| 934 |
-
If using select_k_features, you must pass `model.selection_mask_` here.
|
| 935 |
Not needed if loading from a pickle file.
|
| 936 |
nout : int
|
| 937 |
Number of outputs of the model.
|
|
@@ -982,7 +1014,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 982 |
|
| 983 |
# TODO: copy .bkup file if exists.
|
| 984 |
model = cls(
|
| 985 |
-
equation_file=equation_file,
|
| 986 |
binary_operators=binary_operators,
|
| 987 |
unary_operators=unary_operators,
|
| 988 |
**pysr_kwargs,
|
|
@@ -1002,7 +1034,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1002 |
model.display_feature_names_in_ = feature_names_in
|
| 1003 |
|
| 1004 |
if selection_mask is None:
|
| 1005 |
-
model.selection_mask_ = np.ones(n_features_in, dtype=
|
| 1006 |
else:
|
| 1007 |
model.selection_mask_ = selection_mask
|
| 1008 |
|
|
@@ -1029,7 +1061,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1029 |
all_equations = equations
|
| 1030 |
|
| 1031 |
for i, equations in enumerate(all_equations):
|
| 1032 |
-
selected = [""
|
| 1033 |
chosen_row = idx_model_selection(equations, self.model_selection)
|
| 1034 |
selected[chosen_row] = ">>>>"
|
| 1035 |
repr_equations = pd.DataFrame(
|
|
@@ -1129,10 +1161,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1129 |
|
| 1130 |
@property
|
| 1131 |
def julia_options_(self):
|
|
|
|
| 1132 |
return jl_deserialize(self.julia_options_stream_)
|
| 1133 |
|
| 1134 |
@property
|
| 1135 |
def julia_state_(self):
|
|
|
|
| 1136 |
return jl_deserialize(self.julia_state_stream_)
|
| 1137 |
|
| 1138 |
@property
|
|
@@ -1145,7 +1179,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1145 |
)
|
| 1146 |
return self.julia_state_
|
| 1147 |
|
| 1148 |
-
def get_best(self, index=None):
|
| 1149 |
"""
|
| 1150 |
Get best equation using `model_selection`.
|
| 1151 |
|
|
@@ -1168,8 +1202,6 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1168 |
Raised when an invalid model selection strategy is provided.
|
| 1169 |
"""
|
| 1170 |
check_is_fitted(self, attributes=["equations_"])
|
| 1171 |
-
if self.equations_ is None:
|
| 1172 |
-
raise ValueError("No equations have been generated yet.")
|
| 1173 |
|
| 1174 |
if index is not None:
|
| 1175 |
if isinstance(self.equations_, list):
|
|
@@ -1177,16 +1209,21 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1177 |
index, list
|
| 1178 |
), "With multiple output features, index must be a list."
|
| 1179 |
return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
|
| 1180 |
-
|
|
|
|
|
|
|
| 1181 |
|
| 1182 |
if isinstance(self.equations_, list):
|
| 1183 |
return [
|
| 1184 |
-
eq.
|
| 1185 |
for eq in self.equations_
|
| 1186 |
]
|
| 1187 |
-
|
| 1188 |
-
|
| 1189 |
-
|
|
|
|
|
|
|
|
|
|
| 1190 |
|
| 1191 |
def _setup_equation_file(self):
|
| 1192 |
"""
|
|
@@ -1211,7 +1248,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1211 |
self.equation_file_ = self.equation_file
|
| 1212 |
self.equation_file_contents_ = None
|
| 1213 |
|
| 1214 |
-
def
|
| 1215 |
"""
|
| 1216 |
Ensure parameters passed at initialization are valid.
|
| 1217 |
|
|
@@ -1269,59 +1306,48 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1269 |
f"PySR currently only supports the following optimizer algorithms: {VALID_OPTIMIZER_ALGORITHMS}"
|
| 1270 |
)
|
| 1271 |
|
| 1272 |
-
|
| 1273 |
-
|
| 1274 |
-
|
| 1275 |
-
|
| 1276 |
-
|
| 1277 |
-
|
| 1278 |
-
|
| 1279 |
-
|
| 1280 |
-
|
| 1281 |
-
|
| 1282 |
-
|
| 1283 |
-
|
| 1284 |
-
|
| 1285 |
-
|
| 1286 |
-
|
| 1287 |
-
|
| 1288 |
-
|
| 1289 |
-
parameter_value = default_value
|
| 1290 |
else:
|
| 1291 |
-
#
|
| 1292 |
-
|
| 1293 |
-
|
| 1294 |
-
)
|
| 1295 |
-
|
| 1296 |
-
elif parameter == "batch_size" and parameter_value < 1:
|
| 1297 |
-
warnings.warn(
|
| 1298 |
-
"Given `batch_size` must be greater than or equal to one. "
|
| 1299 |
-
"`batch_size` has been increased to equal one."
|
| 1300 |
-
)
|
| 1301 |
-
parameter_value = 1
|
| 1302 |
-
elif (
|
| 1303 |
-
parameter == "progress"
|
| 1304 |
-
and parameter_value
|
| 1305 |
-
and "buffer" not in sys.stdout.__dir__()
|
| 1306 |
-
):
|
| 1307 |
-
warnings.warn(
|
| 1308 |
-
"Note: it looks like you are running in Jupyter. "
|
| 1309 |
-
"The progress bar will be turned off."
|
| 1310 |
-
)
|
| 1311 |
-
parameter_value = False
|
| 1312 |
-
packed_modified_params[parameter] = parameter_value
|
| 1313 |
|
| 1314 |
assert (
|
| 1315 |
-
len(
|
| 1316 |
-
|
| 1317 |
-
|
| 1318 |
-
)
|
| 1319 |
|
| 1320 |
-
return
|
| 1321 |
|
| 1322 |
def _validate_and_set_fit_params(
|
| 1323 |
self, X, y, Xresampled, weights, variable_names, X_units, y_units
|
| 1324 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1325 |
"""
|
| 1326 |
Validate the parameters passed to the :term`fit` method.
|
| 1327 |
|
|
@@ -1341,7 +1367,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1341 |
Weight array of the same shape as `y`.
|
| 1342 |
Each element is how to weight the mean-square-error loss
|
| 1343 |
for that particular element of y.
|
| 1344 |
-
variable_names :
|
| 1345 |
Names of each variable in the training dataset, `X`.
|
| 1346 |
X_units : list[str] of length n_features
|
| 1347 |
Units of each variable in the training dataset, `X`.
|
|
@@ -1397,7 +1423,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1397 |
if weights is not None:
|
| 1398 |
weights = check_array(weights, ensure_2d=False)
|
| 1399 |
check_consistent_length(weights, y)
|
| 1400 |
-
X, y = self.
|
| 1401 |
self.feature_names_in_ = _safe_check_feature_names_in(
|
| 1402 |
self, variable_names, generate_names=False
|
| 1403 |
)
|
|
@@ -1407,10 +1433,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1407 |
self.display_feature_names_in_ = np.array(
|
| 1408 |
[f"x{_subscriptify(i)}" for i in range(X.shape[1])]
|
| 1409 |
)
|
|
|
|
| 1410 |
else:
|
| 1411 |
self.display_feature_names_in_ = self.feature_names_in_
|
| 1412 |
-
|
| 1413 |
-
variable_names = self.feature_names_in_
|
| 1414 |
|
| 1415 |
# Handle multioutput data
|
| 1416 |
if len(y.shape) == 1 or (len(y.shape) == 2 and y.shape[1] == 1):
|
|
@@ -1425,8 +1451,23 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1425 |
|
| 1426 |
return X, y, Xresampled, weights, variable_names, X_units, y_units
|
| 1427 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1428 |
def _pre_transform_training_data(
|
| 1429 |
-
self,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1430 |
):
|
| 1431 |
"""
|
| 1432 |
Transform the training data before fitting the symbolic regressor.
|
|
@@ -1435,12 +1476,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1435 |
|
| 1436 |
Parameters
|
| 1437 |
----------
|
| 1438 |
-
X : ndarray
|
| 1439 |
Training data of shape (n_samples, n_features).
|
| 1440 |
-
y : ndarray
|
| 1441 |
Target values of shape (n_samples,) or (n_samples, n_targets).
|
| 1442 |
Will be cast to X's dtype if necessary.
|
| 1443 |
-
Xresampled : ndarray |
|
| 1444 |
Resampled training data, of shape `(n_resampled, n_features)`,
|
| 1445 |
used for denoising.
|
| 1446 |
variable_names : list[str]
|
|
@@ -1478,24 +1519,35 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1478 |
"""
|
| 1479 |
# Feature selection transformation
|
| 1480 |
if self.select_k_features:
|
| 1481 |
-
|
| 1482 |
X, y, self.select_k_features, random_state=random_state
|
| 1483 |
)
|
| 1484 |
-
X = X[:,
|
| 1485 |
|
| 1486 |
if Xresampled is not None:
|
| 1487 |
-
Xresampled = Xresampled[:,
|
| 1488 |
|
| 1489 |
# Reduce variable_names to selection
|
| 1490 |
-
variable_names =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1491 |
|
| 1492 |
if X_units is not None:
|
| 1493 |
-
X_units =
|
|
|
|
|
|
|
|
|
|
| 1494 |
self.X_units_ = copy.deepcopy(X_units)
|
| 1495 |
|
| 1496 |
# Re-perform data validation and feature name updating
|
| 1497 |
-
X, y = self.
|
| 1498 |
# Update feature names with selected variable names
|
|
|
|
| 1499 |
self.feature_names_in_ = _check_feature_names_in(self, variable_names)
|
| 1500 |
self.display_feature_names_in_ = self.feature_names_in_
|
| 1501 |
print(f"Using features {self.feature_names_in_}")
|
|
@@ -1511,20 +1563,27 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1511 |
|
| 1512 |
return X, y, variable_names, X_units, y_units
|
| 1513 |
|
| 1514 |
-
def _run(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1515 |
"""
|
| 1516 |
Run the symbolic regression fitting process on the julia backend.
|
| 1517 |
|
| 1518 |
Parameters
|
| 1519 |
----------
|
| 1520 |
-
X : ndarray
|
| 1521 |
Training data of shape `(n_samples, n_features)`.
|
| 1522 |
-
y : ndarray
|
| 1523 |
Target values of shape `(n_samples,)` or `(n_samples, n_targets)`.
|
| 1524 |
Will be cast to `X`'s dtype if necessary.
|
| 1525 |
-
|
| 1526 |
-
|
| 1527 |
-
weights : ndarray |
|
| 1528 |
Weight array of the same shape as `y`.
|
| 1529 |
Each element is how to weight the mean-square-error loss
|
| 1530 |
for that particular element of y.
|
|
@@ -1543,24 +1602,26 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1543 |
"""
|
| 1544 |
# Need to be global as we don't want to recreate/reinstate julia for
|
| 1545 |
# every new instance of PySRRegressor
|
| 1546 |
-
global
|
| 1547 |
|
| 1548 |
# These are the parameters which may be modified from the ones
|
| 1549 |
# specified in init, so we define them here locally:
|
| 1550 |
-
binary_operators =
|
| 1551 |
-
unary_operators =
|
| 1552 |
-
maxdepth =
|
| 1553 |
-
constraints =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1554 |
nested_constraints = self.nested_constraints
|
| 1555 |
complexity_of_operators = self.complexity_of_operators
|
| 1556 |
-
multithreading = mutated_params["multithreading"]
|
| 1557 |
cluster_manager = self.cluster_manager
|
| 1558 |
-
batch_size = mutated_params["batch_size"]
|
| 1559 |
-
update_verbosity = mutated_params["update_verbosity"]
|
| 1560 |
-
progress = mutated_params["progress"]
|
| 1561 |
|
| 1562 |
# Start julia backend processes
|
| 1563 |
-
if not
|
| 1564 |
print("Compiling Julia backend...")
|
| 1565 |
|
| 1566 |
if cluster_manager is not None:
|
|
@@ -1599,6 +1660,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1599 |
complexity_of_operators_str += f"({k}) => {v}, "
|
| 1600 |
complexity_of_operators_str += ")"
|
| 1601 |
complexity_of_operators = jl.seval(complexity_of_operators_str)
|
|
|
|
| 1602 |
|
| 1603 |
custom_loss = jl.seval(
|
| 1604 |
str(self.elementwise_loss)
|
|
@@ -1635,11 +1697,25 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1635 |
optimize=self.weight_optimize,
|
| 1636 |
)
|
| 1637 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1638 |
# Call to Julia backend.
|
| 1639 |
# See https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/OptionsStruct.jl
|
| 1640 |
options = SymbolicRegression.Options(
|
| 1641 |
-
binary_operators=
|
| 1642 |
-
unary_operators=
|
| 1643 |
bin_constraints=jl_array(bin_constraints),
|
| 1644 |
una_constraints=jl_array(una_constraints),
|
| 1645 |
complexity_of_operators=complexity_of_operators,
|
|
@@ -1671,9 +1747,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1671 |
fraction_replaced_hof=self.fraction_replaced_hof,
|
| 1672 |
should_simplify=self.should_simplify,
|
| 1673 |
should_optimize_constants=self.should_optimize_constants,
|
| 1674 |
-
warmup_maxsize_by=
|
| 1675 |
-
0.0 if self.warmup_maxsize_by is None else self.warmup_maxsize_by
|
| 1676 |
-
),
|
| 1677 |
use_frequency=self.use_frequency,
|
| 1678 |
use_frequency_in_tournament=self.use_frequency_in_tournament,
|
| 1679 |
adaptive_parsimony_scaling=self.adaptive_parsimony_scaling,
|
|
@@ -1780,7 +1854,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1780 |
if self.delete_tempfiles:
|
| 1781 |
shutil.rmtree(self.tempdir_)
|
| 1782 |
|
| 1783 |
-
|
| 1784 |
|
| 1785 |
return self
|
| 1786 |
|
|
@@ -1790,9 +1864,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1790 |
y,
|
| 1791 |
Xresampled=None,
|
| 1792 |
weights=None,
|
| 1793 |
-
variable_names: Optional[
|
| 1794 |
-
X_units: Optional[
|
| 1795 |
-
y_units: Optional[
|
| 1796 |
) -> "PySRRegressor":
|
| 1797 |
"""
|
| 1798 |
Search for equations to fit the dataset and store them in `self.equations_`.
|
|
@@ -1854,12 +1928,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1854 |
self.X_units_ = None
|
| 1855 |
self.y_units_ = None
|
| 1856 |
|
| 1857 |
-
random_state = check_random_state(self.random_state) # For np random
|
| 1858 |
-
seed = random_state.get_state()[1][0] # For julia random
|
| 1859 |
-
|
| 1860 |
self._setup_equation_file()
|
| 1861 |
|
| 1862 |
-
|
| 1863 |
|
| 1864 |
(
|
| 1865 |
X,
|
|
@@ -1884,6 +1955,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1884 |
"More datapoints will lower the search speed."
|
| 1885 |
)
|
| 1886 |
|
|
|
|
|
|
|
|
|
|
| 1887 |
# Pre transformations (feature selection and denoising)
|
| 1888 |
X, y, variable_names, X_units, y_units = self._pre_transform_training_data(
|
| 1889 |
X, y, Xresampled, variable_names, X_units, y_units, random_state
|
|
@@ -1925,7 +1999,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1925 |
self._checkpoint()
|
| 1926 |
|
| 1927 |
# Perform the search:
|
| 1928 |
-
self._run(X, y,
|
| 1929 |
|
| 1930 |
# Then, after fit, we save again, so the pickle file contains
|
| 1931 |
# the equations:
|
|
@@ -1934,7 +2008,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1934 |
|
| 1935 |
return self
|
| 1936 |
|
| 1937 |
-
def refresh(self, checkpoint_file=None):
|
| 1938 |
"""
|
| 1939 |
Update self.equations_ with any new options passed.
|
| 1940 |
|
|
@@ -1943,11 +2017,11 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1943 |
|
| 1944 |
Parameters
|
| 1945 |
----------
|
| 1946 |
-
checkpoint_file : str
|
| 1947 |
Path to checkpoint hall of fame file to be loaded.
|
| 1948 |
The default will use the set `equation_file_`.
|
| 1949 |
"""
|
| 1950 |
-
if checkpoint_file:
|
| 1951 |
self.equation_file_ = checkpoint_file
|
| 1952 |
self.equation_file_contents_ = None
|
| 1953 |
check_is_fitted(self, attributes=["equation_file_"])
|
|
@@ -1999,7 +2073,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1999 |
if self.selection_mask_ is not None:
|
| 2000 |
# RangeIndex enforces column order allowing columns to
|
| 2001 |
# be correctly filtered with self.selection_mask_
|
| 2002 |
-
X = X.
|
| 2003 |
X.columns = self.feature_names_in_
|
| 2004 |
# Without feature information, CallableEquation/lambda_format equations
|
| 2005 |
# require that the column order of X matches that of the X used during
|
|
@@ -2009,14 +2083,16 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 2009 |
# reordered/reindexed to match those of the transformed (denoised and
|
| 2010 |
# feature selected) X in fit.
|
| 2011 |
X = X.reindex(columns=self.feature_names_in_)
|
| 2012 |
-
X = self.
|
| 2013 |
|
| 2014 |
try:
|
| 2015 |
-
if
|
|
|
|
| 2016 |
return np.stack(
|
| 2017 |
[eq["lambda_format"](X) for eq in best_equation], axis=1
|
| 2018 |
)
|
| 2019 |
-
|
|
|
|
| 2020 |
except Exception as error:
|
| 2021 |
raise ValueError(
|
| 2022 |
"Failed to evaluate the expression. "
|
|
@@ -2046,9 +2122,11 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 2046 |
"""
|
| 2047 |
self.refresh()
|
| 2048 |
best_equation = self.get_best(index=index)
|
| 2049 |
-
if
|
|
|
|
| 2050 |
return [eq["sympy_format"] for eq in best_equation]
|
| 2051 |
-
|
|
|
|
| 2052 |
|
| 2053 |
def latex(self, index=None, precision=3):
|
| 2054 |
"""
|
|
@@ -2108,9 +2186,11 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 2108 |
self.set_params(output_jax_format=True)
|
| 2109 |
self.refresh()
|
| 2110 |
best_equation = self.get_best(index=index)
|
| 2111 |
-
if
|
|
|
|
| 2112 |
return [eq["jax_format"] for eq in best_equation]
|
| 2113 |
-
|
|
|
|
| 2114 |
|
| 2115 |
def pytorch(self, index=None):
|
| 2116 |
"""
|
|
@@ -2138,9 +2218,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 2138 |
self.set_params(output_torch_format=True)
|
| 2139 |
self.refresh()
|
| 2140 |
best_equation = self.get_best(index=index)
|
| 2141 |
-
if
|
| 2142 |
return [eq["torch_format"] for eq in best_equation]
|
| 2143 |
-
|
|
|
|
| 2144 |
|
| 2145 |
def _read_equation_file(self):
|
| 2146 |
"""Read the hall of fame file created by `SymbolicRegression.jl`."""
|
|
@@ -2239,10 +2320,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 2239 |
lastComplexity = 0
|
| 2240 |
sympy_format = []
|
| 2241 |
lambda_format = []
|
| 2242 |
-
|
| 2243 |
-
|
| 2244 |
-
if self.output_torch_format:
|
| 2245 |
-
torch_format = []
|
| 2246 |
|
| 2247 |
for _, eqn_row in output.iterrows():
|
| 2248 |
eqn = pysr2sympy(
|
|
@@ -2354,7 +2433,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 2354 |
"""
|
| 2355 |
self.refresh()
|
| 2356 |
|
| 2357 |
-
if self.
|
| 2358 |
if indices is not None:
|
| 2359 |
assert isinstance(indices, list)
|
| 2360 |
assert isinstance(indices[0], list)
|
|
@@ -2363,7 +2442,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 2363 |
table_string = sympy2multilatextable(
|
| 2364 |
self.equations_, indices=indices, precision=precision, columns=columns
|
| 2365 |
)
|
| 2366 |
-
|
| 2367 |
if indices is not None:
|
| 2368 |
assert isinstance(indices, list)
|
| 2369 |
assert isinstance(indices[0], int)
|
|
@@ -2371,15 +2450,13 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 2371 |
table_string = sympy2latextable(
|
| 2372 |
self.equations_, indices=indices, precision=precision, columns=columns
|
| 2373 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2374 |
|
| 2375 |
-
|
| 2376 |
-
r"\usepackage{breqn}",
|
| 2377 |
-
r"\usepackage{booktabs}",
|
| 2378 |
-
"",
|
| 2379 |
-
"...",
|
| 2380 |
-
"",
|
| 2381 |
-
]
|
| 2382 |
-
return "\n".join(preamble_string + [table_string])
|
| 2383 |
|
| 2384 |
|
| 2385 |
def idx_model_selection(equations: pd.DataFrame, model_selection: str):
|
|
@@ -2397,3 +2474,30 @@ def idx_model_selection(equations: pd.DataFrame, model_selection: str):
|
|
| 2397 |
f"{model_selection} is not a valid model selection strategy."
|
| 2398 |
)
|
| 2399 |
return chosen_idx
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
import sys
|
| 9 |
import tempfile
|
| 10 |
import warnings
|
| 11 |
+
from dataclasses import dataclass, fields
|
| 12 |
from datetime import datetime
|
| 13 |
from io import StringIO
|
| 14 |
from multiprocessing import cpu_count
|
| 15 |
from pathlib import Path
|
| 16 |
+
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union, cast
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
import numpy as np
|
| 19 |
import pandas as pd
|
| 20 |
+
from numpy import ndarray
|
| 21 |
+
from numpy.typing import NDArray
|
| 22 |
from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin
|
| 23 |
from sklearn.utils import check_array, check_consistent_length, check_random_state
|
| 24 |
+
from sklearn.utils.validation import _check_feature_names_in # type: ignore
|
| 25 |
+
from sklearn.utils.validation import check_is_fitted
|
| 26 |
|
| 27 |
from .denoising import denoise, multi_denoise
|
| 28 |
from .deprecated import DEPRECATED_KWARGS
|
| 29 |
from .export_jax import sympy2jax
|
| 30 |
+
from .export_latex import (
|
| 31 |
+
sympy2latex,
|
| 32 |
+
sympy2latextable,
|
| 33 |
+
sympy2multilatextable,
|
| 34 |
+
with_preamble,
|
| 35 |
+
)
|
| 36 |
from .export_numpy import sympy2numpy
|
| 37 |
from .export_sympy import assert_valid_sympy_symbol, create_sympy_symbols, pysr2sympy
|
| 38 |
from .export_torch import sympy2torch
|
|
|
|
| 44 |
_load_cluster_manager,
|
| 45 |
jl_array,
|
| 46 |
jl_deserialize,
|
| 47 |
+
jl_is_function,
|
| 48 |
jl_serialize,
|
| 49 |
)
|
| 50 |
from .julia_import import SymbolicRegression, jl
|
| 51 |
from .utils import (
|
| 52 |
+
ArrayLike,
|
| 53 |
+
PathLike,
|
| 54 |
_csv_filename_to_pkl_filename,
|
| 55 |
_preprocess_julia_floats,
|
| 56 |
_safe_check_feature_names_in,
|
| 57 |
_subscriptify,
|
| 58 |
)
|
| 59 |
|
| 60 |
+
ALREADY_RAN = False
|
| 61 |
|
| 62 |
|
| 63 |
def _process_constraints(binary_operators, unary_operators, constraints):
|
|
|
|
| 185 |
VALID_OPTIMIZER_ALGORITHMS = ["BFGS", "NelderMead"]
|
| 186 |
|
| 187 |
|
| 188 |
+
@dataclass
|
| 189 |
+
class _DynamicallySetParams:
|
| 190 |
+
"""Defines some parameters that are set at runtime."""
|
| 191 |
+
|
| 192 |
+
binary_operators: List[str]
|
| 193 |
+
unary_operators: List[str]
|
| 194 |
+
maxdepth: int
|
| 195 |
+
constraints: Dict[str, str]
|
| 196 |
+
multithreading: bool
|
| 197 |
+
batch_size: int
|
| 198 |
+
update_verbosity: int
|
| 199 |
+
progress: bool
|
| 200 |
+
warmup_maxsize_by: float
|
| 201 |
+
|
| 202 |
+
|
| 203 |
class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
| 204 |
"""
|
| 205 |
High-performance symbolic regression algorithm.
|
|
|
|
| 628 |
Units of each variable in the training dataset, `y`.
|
| 629 |
nout_ : int
|
| 630 |
Number of output dimensions.
|
| 631 |
+
selection_mask_ : ndarray of shape (`n_features_in_`,)
|
| 632 |
+
Mask of which features of `X` to use when `select_k_features` is set.
|
|
|
|
| 633 |
tempdir_ : Path
|
| 634 |
Path to the temporary equations directory.
|
| 635 |
+
equation_file_ : Union[str, Path]
|
| 636 |
Output equation file name produced by the julia backend.
|
| 637 |
julia_state_stream_ : ndarray
|
| 638 |
The serialized state for the julia SymbolicRegression.jl backend (after fitting),
|
| 639 |
stored as an array of uint8, produced by Julia's Serialization.serialize function.
|
|
|
|
|
|
|
| 640 |
julia_options_stream_ : ndarray
|
| 641 |
The serialized julia options, stored as an array of uint8,
|
|
|
|
|
|
|
| 642 |
equation_file_contents_ : list[pandas.DataFrame]
|
| 643 |
Contents of the equation file output by the Julia backend.
|
| 644 |
show_pickle_warnings_ : bool
|
|
|
|
| 685 |
```
|
| 686 |
"""
|
| 687 |
|
| 688 |
+
equations_: Union[pd.DataFrame, List[pd.DataFrame], None]
|
| 689 |
+
n_features_in_: int
|
| 690 |
+
feature_names_in_: ArrayLike[str]
|
| 691 |
+
display_feature_names_in_: ArrayLike[str]
|
| 692 |
+
X_units_: Union[ArrayLike[str], None]
|
| 693 |
+
y_units_: Union[str, ArrayLike[str], None]
|
| 694 |
+
nout_: int
|
| 695 |
+
selection_mask_: Union[NDArray[np.bool_], None]
|
| 696 |
+
tempdir_: Path
|
| 697 |
+
equation_file_: PathLike
|
| 698 |
+
julia_state_stream_: Union[NDArray[np.uint8], None]
|
| 699 |
+
julia_options_stream_: Union[NDArray[np.uint8], None]
|
| 700 |
+
equation_file_contents_: Union[List[pd.DataFrame], None]
|
| 701 |
+
show_pickle_warnings_: bool
|
| 702 |
+
|
| 703 |
def __init__(
|
| 704 |
self,
|
| 705 |
model_selection: Literal["best", "accuracy", "score"] = "best",
|
|
|
|
| 932 |
@classmethod
|
| 933 |
def from_file(
|
| 934 |
cls,
|
| 935 |
+
equation_file: PathLike,
|
| 936 |
*,
|
| 937 |
+
binary_operators: Optional[List[str]] = None,
|
| 938 |
+
unary_operators: Optional[List[str]] = None,
|
| 939 |
+
n_features_in: Optional[int] = None,
|
| 940 |
+
feature_names_in: Optional[ArrayLike[str]] = None,
|
| 941 |
+
selection_mask: Optional[NDArray[np.bool_]] = None,
|
| 942 |
+
nout: int = 1,
|
| 943 |
**pysr_kwargs,
|
| 944 |
):
|
| 945 |
"""
|
|
|
|
| 947 |
|
| 948 |
Parameters
|
| 949 |
----------
|
| 950 |
+
equation_file : str or Path
|
| 951 |
Path to a pickle file containing a saved model, or a csv file
|
| 952 |
containing equations.
|
| 953 |
binary_operators : list[str]
|
|
|
|
| 962 |
feature_names_in : list[str]
|
| 963 |
Names of the features passed to the model.
|
| 964 |
Not needed if loading from a pickle file.
|
| 965 |
+
selection_mask : NDArray[np.bool_]
|
| 966 |
+
If using `select_k_features`, you must pass `model.selection_mask_` here.
|
| 967 |
Not needed if loading from a pickle file.
|
| 968 |
nout : int
|
| 969 |
Number of outputs of the model.
|
|
|
|
| 1014 |
|
| 1015 |
# TODO: copy .bkup file if exists.
|
| 1016 |
model = cls(
|
| 1017 |
+
equation_file=str(equation_file),
|
| 1018 |
binary_operators=binary_operators,
|
| 1019 |
unary_operators=unary_operators,
|
| 1020 |
**pysr_kwargs,
|
|
|
|
| 1034 |
model.display_feature_names_in_ = feature_names_in
|
| 1035 |
|
| 1036 |
if selection_mask is None:
|
| 1037 |
+
model.selection_mask_ = np.ones(n_features_in, dtype=np.bool_)
|
| 1038 |
else:
|
| 1039 |
model.selection_mask_ = selection_mask
|
| 1040 |
|
|
|
|
| 1061 |
all_equations = equations
|
| 1062 |
|
| 1063 |
for i, equations in enumerate(all_equations):
|
| 1064 |
+
selected = pd.Series([""] * len(equations), index=equations.index)
|
| 1065 |
chosen_row = idx_model_selection(equations, self.model_selection)
|
| 1066 |
selected[chosen_row] = ">>>>"
|
| 1067 |
repr_equations = pd.DataFrame(
|
|
|
|
| 1161 |
|
| 1162 |
@property
|
| 1163 |
def julia_options_(self):
|
| 1164 |
+
"""The deserialized julia options."""
|
| 1165 |
return jl_deserialize(self.julia_options_stream_)
|
| 1166 |
|
| 1167 |
@property
|
| 1168 |
def julia_state_(self):
|
| 1169 |
+
"""The deserialized state."""
|
| 1170 |
return jl_deserialize(self.julia_state_stream_)
|
| 1171 |
|
| 1172 |
@property
|
|
|
|
| 1179 |
)
|
| 1180 |
return self.julia_state_
|
| 1181 |
|
| 1182 |
+
def get_best(self, index=None) -> Union[pd.Series, List[pd.Series]]:
|
| 1183 |
"""
|
| 1184 |
Get best equation using `model_selection`.
|
| 1185 |
|
|
|
|
| 1202 |
Raised when an invalid model selection strategy is provided.
|
| 1203 |
"""
|
| 1204 |
check_is_fitted(self, attributes=["equations_"])
|
|
|
|
|
|
|
| 1205 |
|
| 1206 |
if index is not None:
|
| 1207 |
if isinstance(self.equations_, list):
|
|
|
|
| 1209 |
index, list
|
| 1210 |
), "With multiple output features, index must be a list."
|
| 1211 |
return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
|
| 1212 |
+
else:
|
| 1213 |
+
equations_ = cast(pd.DataFrame, self.equations_)
|
| 1214 |
+
return cast(pd.Series, equations_.iloc[index])
|
| 1215 |
|
| 1216 |
if isinstance(self.equations_, list):
|
| 1217 |
return [
|
| 1218 |
+
cast(pd.Series, eq.loc[idx_model_selection(eq, self.model_selection)])
|
| 1219 |
for eq in self.equations_
|
| 1220 |
]
|
| 1221 |
+
else:
|
| 1222 |
+
equations_ = cast(pd.DataFrame, self.equations_)
|
| 1223 |
+
return cast(
|
| 1224 |
+
pd.Series,
|
| 1225 |
+
equations_.loc[idx_model_selection(equations_, self.model_selection)],
|
| 1226 |
+
)
|
| 1227 |
|
| 1228 |
def _setup_equation_file(self):
|
| 1229 |
"""
|
|
|
|
| 1248 |
self.equation_file_ = self.equation_file
|
| 1249 |
self.equation_file_contents_ = None
|
| 1250 |
|
| 1251 |
+
def _validate_and_modify_params(self) -> _DynamicallySetParams:
|
| 1252 |
"""
|
| 1253 |
Ensure parameters passed at initialization are valid.
|
| 1254 |
|
|
|
|
| 1306 |
f"PySR currently only supports the following optimizer algorithms: {VALID_OPTIMIZER_ALGORITHMS}"
|
| 1307 |
)
|
| 1308 |
|
| 1309 |
+
param_container = _DynamicallySetParams(
|
| 1310 |
+
binary_operators=["+", "*", "-", "/"],
|
| 1311 |
+
unary_operators=[],
|
| 1312 |
+
maxdepth=self.maxsize,
|
| 1313 |
+
constraints={},
|
| 1314 |
+
multithreading=self.procs != 0 and self.cluster_manager is None,
|
| 1315 |
+
batch_size=1,
|
| 1316 |
+
update_verbosity=int(self.verbosity),
|
| 1317 |
+
progress=self.progress,
|
| 1318 |
+
warmup_maxsize_by=0.0,
|
| 1319 |
+
)
|
| 1320 |
+
|
| 1321 |
+
for param_name in map(lambda x: x.name, fields(_DynamicallySetParams)):
|
| 1322 |
+
user_param_value = getattr(self, param_name)
|
| 1323 |
+
if user_param_value is None:
|
| 1324 |
+
# Leave as the default in _DynamicallySetParams
|
| 1325 |
+
...
|
|
|
|
| 1326 |
else:
|
| 1327 |
+
# If user has specified it, we will override the default.
|
| 1328 |
+
# However, there are some special cases to mutate it:
|
| 1329 |
+
new_param_value = _mutate_parameter(param_name, user_param_value)
|
| 1330 |
+
setattr(param_container, param_name, new_param_value)
|
| 1331 |
+
# TODO: This should just be part of the __init__ of _DynamicallySetParams
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1332 |
|
| 1333 |
assert (
|
| 1334 |
+
len(param_container.binary_operators) > 0
|
| 1335 |
+
or len(param_container.unary_operators) > 0
|
| 1336 |
+
), "At least one operator must be provided."
|
|
|
|
| 1337 |
|
| 1338 |
+
return param_container
|
| 1339 |
|
| 1340 |
def _validate_and_set_fit_params(
|
| 1341 |
self, X, y, Xresampled, weights, variable_names, X_units, y_units
|
| 1342 |
+
) -> Tuple[
|
| 1343 |
+
ndarray,
|
| 1344 |
+
ndarray,
|
| 1345 |
+
Optional[ndarray],
|
| 1346 |
+
Optional[ndarray],
|
| 1347 |
+
ArrayLike[str],
|
| 1348 |
+
Optional[ArrayLike[str]],
|
| 1349 |
+
Optional[Union[str, ArrayLike[str]]],
|
| 1350 |
+
]:
|
| 1351 |
"""
|
| 1352 |
Validate the parameters passed to the :term:`fit` method.
|
| 1353 |
|
|
|
|
| 1367 |
Weight array of the same shape as `y`.
|
| 1368 |
Each element is how to weight the mean-square-error loss
|
| 1369 |
for that particular element of y.
|
| 1370 |
+
variable_names : ndarray of length n_features
|
| 1371 |
Names of each variable in the training dataset, `X`.
|
| 1372 |
X_units : list[str] of length n_features
|
| 1373 |
Units of each variable in the training dataset, `X`.
|
|
|
|
| 1423 |
if weights is not None:
|
| 1424 |
weights = check_array(weights, ensure_2d=False)
|
| 1425 |
check_consistent_length(weights, y)
|
| 1426 |
+
X, y = self._validate_data_X_y(X, y)
|
| 1427 |
self.feature_names_in_ = _safe_check_feature_names_in(
|
| 1428 |
self, variable_names, generate_names=False
|
| 1429 |
)
|
|
|
|
| 1433 |
self.display_feature_names_in_ = np.array(
|
| 1434 |
[f"x{_subscriptify(i)}" for i in range(X.shape[1])]
|
| 1435 |
)
|
| 1436 |
+
variable_names = self.feature_names_in_
|
| 1437 |
else:
|
| 1438 |
self.display_feature_names_in_ = self.feature_names_in_
|
| 1439 |
+
variable_names = self.feature_names_in_
|
|
|
|
| 1440 |
|
| 1441 |
# Handle multioutput data
|
| 1442 |
if len(y.shape) == 1 or (len(y.shape) == 2 and y.shape[1] == 1):
|
|
|
|
| 1451 |
|
| 1452 |
return X, y, Xresampled, weights, variable_names, X_units, y_units
|
| 1453 |
|
| 1454 |
+
def _validate_data_X_y(self, X, y) -> Tuple[ndarray, ndarray]:
|
| 1455 |
+
raw_out = self._validate_data(X=X, y=y, reset=True, multi_output=True) # type: ignore
|
| 1456 |
+
return cast(Tuple[ndarray, ndarray], raw_out)
|
| 1457 |
+
|
| 1458 |
+
def _validate_data_X(self, X) -> Tuple[ndarray]:
|
| 1459 |
+
raw_out = self._validate_data(X=X, reset=False) # type: ignore
|
| 1460 |
+
return cast(Tuple[ndarray], raw_out)
|
| 1461 |
+
|
| 1462 |
def _pre_transform_training_data(
|
| 1463 |
+
self,
|
| 1464 |
+
X: ndarray,
|
| 1465 |
+
y: ndarray,
|
| 1466 |
+
Xresampled: Union[ndarray, None],
|
| 1467 |
+
variable_names: ArrayLike[str],
|
| 1468 |
+
X_units: Union[ArrayLike[str], None],
|
| 1469 |
+
y_units: Union[ArrayLike[str], str, None],
|
| 1470 |
+
random_state: np.random.RandomState,
|
| 1471 |
):
|
| 1472 |
"""
|
| 1473 |
Transform the training data before fitting the symbolic regressor.
|
|
|
|
| 1476 |
|
| 1477 |
Parameters
|
| 1478 |
----------
|
| 1479 |
+
X : ndarray
|
| 1480 |
Training data of shape (n_samples, n_features).
|
| 1481 |
+
y : ndarray
|
| 1482 |
Target values of shape (n_samples,) or (n_samples, n_targets).
|
| 1483 |
Will be cast to X's dtype if necessary.
|
| 1484 |
+
Xresampled : ndarray | None
|
| 1485 |
Resampled training data, of shape `(n_resampled, n_features)`,
|
| 1486 |
used for denoising.
|
| 1487 |
variable_names : list[str]
|
|
|
|
| 1519 |
"""
|
| 1520 |
# Feature selection transformation
|
| 1521 |
if self.select_k_features:
|
| 1522 |
+
selection_mask = run_feature_selection(
|
| 1523 |
X, y, self.select_k_features, random_state=random_state
|
| 1524 |
)
|
| 1525 |
+
X = X[:, selection_mask]
|
| 1526 |
|
| 1527 |
if Xresampled is not None:
|
| 1528 |
+
Xresampled = Xresampled[:, selection_mask]
|
| 1529 |
|
| 1530 |
# Reduce variable_names to selection
|
| 1531 |
+
variable_names = cast(
|
| 1532 |
+
ArrayLike[str],
|
| 1533 |
+
[
|
| 1534 |
+
variable_names[i]
|
| 1535 |
+
for i in range(len(variable_names))
|
| 1536 |
+
if selection_mask[i]
|
| 1537 |
+
],
|
| 1538 |
+
)
|
| 1539 |
|
| 1540 |
if X_units is not None:
|
| 1541 |
+
X_units = cast(
|
| 1542 |
+
ArrayLike[str],
|
| 1543 |
+
[X_units[i] for i in range(len(X_units)) if selection_mask[i]],
|
| 1544 |
+
)
|
| 1545 |
self.X_units_ = copy.deepcopy(X_units)
|
| 1546 |
|
| 1547 |
# Re-perform data validation and feature name updating
|
| 1548 |
+
X, y = self._validate_data_X_y(X, y)
|
| 1549 |
# Update feature names with selected variable names
|
| 1550 |
+
self.selection_mask_ = selection_mask
|
| 1551 |
self.feature_names_in_ = _check_feature_names_in(self, variable_names)
|
| 1552 |
self.display_feature_names_in_ = self.feature_names_in_
|
| 1553 |
print(f"Using features {self.feature_names_in_}")
|
|
|
|
| 1563 |
|
| 1564 |
return X, y, variable_names, X_units, y_units
|
| 1565 |
|
| 1566 |
+
def _run(
|
| 1567 |
+
self,
|
| 1568 |
+
X: ndarray,
|
| 1569 |
+
y: ndarray,
|
| 1570 |
+
runtime_params: _DynamicallySetParams,
|
| 1571 |
+
weights: Optional[ndarray],
|
| 1572 |
+
seed: int,
|
| 1573 |
+
):
|
| 1574 |
"""
|
| 1575 |
Run the symbolic regression fitting process on the julia backend.
|
| 1576 |
|
| 1577 |
Parameters
|
| 1578 |
----------
|
| 1579 |
+
X : ndarray
|
| 1580 |
Training data of shape `(n_samples, n_features)`.
|
| 1581 |
+
y : ndarray
|
| 1582 |
Target values of shape `(n_samples,)` or `(n_samples, n_targets)`.
|
| 1583 |
Will be cast to `X`'s dtype if necessary.
|
| 1584 |
+
runtime_params : _DynamicallySetParams
|
| 1585 |
+
Dynamically set versions of some parameters passed in __init__.
|
| 1586 |
+
weights : ndarray | None
|
| 1587 |
Weight array of the same shape as `y`.
|
| 1588 |
Each element is how to weight the mean-square-error loss
|
| 1589 |
for that particular element of y.
|
|
|
|
| 1602 |
"""
|
| 1603 |
# Need to be global as we don't want to recreate/reinstate julia for
|
| 1604 |
# every new instance of PySRRegressor
|
| 1605 |
+
global ALREADY_RAN
|
| 1606 |
|
| 1607 |
# These are the parameters which may be modified from the ones
|
| 1608 |
# specified in init, so we define them here locally:
|
| 1609 |
+
binary_operators = runtime_params.binary_operators
|
| 1610 |
+
unary_operators = runtime_params.unary_operators
|
| 1611 |
+
maxdepth = runtime_params.maxdepth
|
| 1612 |
+
constraints = runtime_params.constraints
|
| 1613 |
+
multithreading = runtime_params.multithreading
|
| 1614 |
+
batch_size = runtime_params.batch_size
|
| 1615 |
+
update_verbosity = runtime_params.update_verbosity
|
| 1616 |
+
progress = runtime_params.progress
|
| 1617 |
+
warmup_maxsize_by = runtime_params.warmup_maxsize_by
|
| 1618 |
+
|
| 1619 |
nested_constraints = self.nested_constraints
|
| 1620 |
complexity_of_operators = self.complexity_of_operators
|
|
|
|
| 1621 |
cluster_manager = self.cluster_manager
|
|
|
|
|
|
|
|
|
|
| 1622 |
|
| 1623 |
# Start julia backend processes
|
| 1624 |
+
if not ALREADY_RAN and update_verbosity != 0:
|
| 1625 |
print("Compiling Julia backend...")
|
| 1626 |
|
| 1627 |
if cluster_manager is not None:
|
|
|
|
| 1660 |
complexity_of_operators_str += f"({k}) => {v}, "
|
| 1661 |
complexity_of_operators_str += ")"
|
| 1662 |
complexity_of_operators = jl.seval(complexity_of_operators_str)
|
| 1663 |
+
# TODO: Refactor this into helper function
|
| 1664 |
|
| 1665 |
custom_loss = jl.seval(
|
| 1666 |
str(self.elementwise_loss)
|
|
|
|
| 1697 |
optimize=self.weight_optimize,
|
| 1698 |
)
|
| 1699 |
|
| 1700 |
+
jl_binary_operators: List[Any] = []
|
| 1701 |
+
jl_unary_operators: List[Any] = []
|
| 1702 |
+
for input_list, output_list, name in [
|
| 1703 |
+
(binary_operators, jl_binary_operators, "binary"),
|
| 1704 |
+
(unary_operators, jl_unary_operators, "unary"),
|
| 1705 |
+
]:
|
| 1706 |
+
for op in input_list:
|
| 1707 |
+
jl_op = jl.seval(op)
|
| 1708 |
+
if not jl_is_function(jl_op):
|
| 1709 |
+
raise ValueError(
|
| 1710 |
+
f"When building `{name}_operators`, `'{op}'` did not return a Julia function"
|
| 1711 |
+
)
|
| 1712 |
+
output_list.append(jl_op)
|
| 1713 |
+
|
| 1714 |
# Call to Julia backend.
|
| 1715 |
# See https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/OptionsStruct.jl
|
| 1716 |
options = SymbolicRegression.Options(
|
| 1717 |
+
binary_operators=jl_array(jl_binary_operators, dtype=jl.Function),
|
| 1718 |
+
unary_operators=jl_array(jl_unary_operators, dtype=jl.Function),
|
| 1719 |
bin_constraints=jl_array(bin_constraints),
|
| 1720 |
una_constraints=jl_array(una_constraints),
|
| 1721 |
complexity_of_operators=complexity_of_operators,
|
|
|
|
| 1747 |
fraction_replaced_hof=self.fraction_replaced_hof,
|
| 1748 |
should_simplify=self.should_simplify,
|
| 1749 |
should_optimize_constants=self.should_optimize_constants,
|
| 1750 |
+
warmup_maxsize_by=warmup_maxsize_by,
|
|
|
|
|
|
|
| 1751 |
use_frequency=self.use_frequency,
|
| 1752 |
use_frequency_in_tournament=self.use_frequency_in_tournament,
|
| 1753 |
adaptive_parsimony_scaling=self.adaptive_parsimony_scaling,
|
|
|
|
| 1854 |
if self.delete_tempfiles:
|
| 1855 |
shutil.rmtree(self.tempdir_)
|
| 1856 |
|
| 1857 |
+
ALREADY_RAN = True
|
| 1858 |
|
| 1859 |
return self
|
| 1860 |
|
|
|
|
| 1864 |
y,
|
| 1865 |
Xresampled=None,
|
| 1866 |
weights=None,
|
| 1867 |
+
variable_names: Optional[ArrayLike[str]] = None,
|
| 1868 |
+
X_units: Optional[ArrayLike[str]] = None,
|
| 1869 |
+
y_units: Optional[Union[str, ArrayLike[str]]] = None,
|
| 1870 |
) -> "PySRRegressor":
|
| 1871 |
"""
|
| 1872 |
Search for equations to fit the dataset and store them in `self.equations_`.
|
|
|
|
| 1928 |
self.X_units_ = None
|
| 1929 |
self.y_units_ = None
|
| 1930 |
|
|
|
|
|
|
|
|
|
|
| 1931 |
self._setup_equation_file()
|
| 1932 |
|
| 1933 |
+
runtime_params = self._validate_and_modify_params()
|
| 1934 |
|
| 1935 |
(
|
| 1936 |
X,
|
|
|
|
| 1955 |
"More datapoints will lower the search speed."
|
| 1956 |
)
|
| 1957 |
|
| 1958 |
+
random_state = check_random_state(self.random_state) # For np random
|
| 1959 |
+
seed = cast(int, random_state.randint(0, 2**31 - 1)) # For julia random
|
| 1960 |
+
|
| 1961 |
# Pre transformations (feature selection and denoising)
|
| 1962 |
X, y, variable_names, X_units, y_units = self._pre_transform_training_data(
|
| 1963 |
X, y, Xresampled, variable_names, X_units, y_units, random_state
|
|
|
|
| 1999 |
self._checkpoint()
|
| 2000 |
|
| 2001 |
# Perform the search:
|
| 2002 |
+
self._run(X, y, runtime_params, weights=weights, seed=seed)
|
| 2003 |
|
| 2004 |
# Then, after fit, we save again, so the pickle file contains
|
| 2005 |
# the equations:
|
|
|
|
| 2008 |
|
| 2009 |
return self
|
| 2010 |
|
| 2011 |
+
def refresh(self, checkpoint_file: Optional[PathLike] = None) -> None:
|
| 2012 |
"""
|
| 2013 |
Update self.equations_ with any new options passed.
|
| 2014 |
|
|
|
|
| 2017 |
|
| 2018 |
Parameters
|
| 2019 |
----------
|
| 2020 |
+
checkpoint_file : str or Path
|
| 2021 |
Path to checkpoint hall of fame file to be loaded.
|
| 2022 |
The default will use the set `equation_file_`.
|
| 2023 |
"""
|
| 2024 |
+
if checkpoint_file is not None:
|
| 2025 |
self.equation_file_ = checkpoint_file
|
| 2026 |
self.equation_file_contents_ = None
|
| 2027 |
check_is_fitted(self, attributes=["equation_file_"])
|
|
|
|
| 2073 |
if self.selection_mask_ is not None:
|
| 2074 |
# RangeIndex enforces column order allowing columns to
|
| 2075 |
# be correctly filtered with self.selection_mask_
|
| 2076 |
+
X = X[X.columns[self.selection_mask_]]
|
| 2077 |
X.columns = self.feature_names_in_
|
| 2078 |
# Without feature information, CallableEquation/lambda_format equations
|
| 2079 |
# require that the column order of X matches that of the X used during
|
|
|
|
| 2083 |
# reordered/reindexed to match those of the transformed (denoised and
|
| 2084 |
# feature selected) X in fit.
|
| 2085 |
X = X.reindex(columns=self.feature_names_in_)
|
| 2086 |
+
X = self._validate_data_X(X)
|
| 2087 |
|
| 2088 |
try:
|
| 2089 |
+
if isinstance(best_equation, list):
|
| 2090 |
+
assert self.nout_ > 1
|
| 2091 |
return np.stack(
|
| 2092 |
[eq["lambda_format"](X) for eq in best_equation], axis=1
|
| 2093 |
)
|
| 2094 |
+
else:
|
| 2095 |
+
return best_equation["lambda_format"](X)
|
| 2096 |
except Exception as error:
|
| 2097 |
raise ValueError(
|
| 2098 |
"Failed to evaluate the expression. "
|
|
|
|
| 2122 |
"""
|
| 2123 |
self.refresh()
|
| 2124 |
best_equation = self.get_best(index=index)
|
| 2125 |
+
if isinstance(best_equation, list):
|
| 2126 |
+
assert self.nout_ > 1
|
| 2127 |
return [eq["sympy_format"] for eq in best_equation]
|
| 2128 |
+
else:
|
| 2129 |
+
return best_equation["sympy_format"]
|
| 2130 |
|
| 2131 |
def latex(self, index=None, precision=3):
|
| 2132 |
"""
|
|
|
|
| 2186 |
self.set_params(output_jax_format=True)
|
| 2187 |
self.refresh()
|
| 2188 |
best_equation = self.get_best(index=index)
|
| 2189 |
+
if isinstance(best_equation, list):
|
| 2190 |
+
assert self.nout_ > 1
|
| 2191 |
return [eq["jax_format"] for eq in best_equation]
|
| 2192 |
+
else:
|
| 2193 |
+
return best_equation["jax_format"]
|
| 2194 |
|
| 2195 |
def pytorch(self, index=None):
|
| 2196 |
"""
|
|
|
|
| 2218 |
self.set_params(output_torch_format=True)
|
| 2219 |
self.refresh()
|
| 2220 |
best_equation = self.get_best(index=index)
|
| 2221 |
+
if isinstance(best_equation, list):
|
| 2222 |
return [eq["torch_format"] for eq in best_equation]
|
| 2223 |
+
else:
|
| 2224 |
+
return best_equation["torch_format"]
|
| 2225 |
|
| 2226 |
def _read_equation_file(self):
|
| 2227 |
"""Read the hall of fame file created by `SymbolicRegression.jl`."""
|
|
|
|
| 2320 |
lastComplexity = 0
|
| 2321 |
sympy_format = []
|
| 2322 |
lambda_format = []
|
| 2323 |
+
jax_format = []
|
| 2324 |
+
torch_format = []
|
|
|
|
|
|
|
| 2325 |
|
| 2326 |
for _, eqn_row in output.iterrows():
|
| 2327 |
eqn = pysr2sympy(
|
|
|
|
| 2433 |
"""
|
| 2434 |
self.refresh()
|
| 2435 |
|
| 2436 |
+
if isinstance(self.equations_, list):
|
| 2437 |
if indices is not None:
|
| 2438 |
assert isinstance(indices, list)
|
| 2439 |
assert isinstance(indices[0], list)
|
|
|
|
| 2442 |
table_string = sympy2multilatextable(
|
| 2443 |
self.equations_, indices=indices, precision=precision, columns=columns
|
| 2444 |
)
|
| 2445 |
+
elif isinstance(self.equations_, pd.DataFrame):
|
| 2446 |
if indices is not None:
|
| 2447 |
assert isinstance(indices, list)
|
| 2448 |
assert isinstance(indices[0], int)
|
|
|
|
| 2450 |
table_string = sympy2latextable(
|
| 2451 |
self.equations_, indices=indices, precision=precision, columns=columns
|
| 2452 |
)
|
| 2453 |
+
else:
|
| 2454 |
+
raise ValueError(
|
| 2455 |
+
"Invalid type for equations_ to pass to `latex_table`. "
|
| 2456 |
+
"Expected a DataFrame or a list of DataFrames."
|
| 2457 |
+
)
|
| 2458 |
|
| 2459 |
+
return with_preamble(table_string)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2460 |
|
| 2461 |
|
| 2462 |
def idx_model_selection(equations: pd.DataFrame, model_selection: str):
|
|
|
|
| 2474 |
f"{model_selection} is not a valid model selection strategy."
|
| 2475 |
)
|
| 2476 |
return chosen_idx
|
| 2477 |
+
|
| 2478 |
+
|
| 2479 |
+
def _mutate_parameter(param_name: str, param_value):
    """Return a normalized version of a single named parameter value.

    Three adjustments are applied; any other input is returned unchanged:

    * ``binary_operators`` / ``unary_operators`` given as a bare string are
      wrapped in a one-element list.
    * ``batch_size`` below one is replaced by ``1`` (a warning is emitted).
    * ``progress`` equal to ``True`` is replaced by ``False`` (a warning is
      emitted) when ``sys.stdout`` does not list a ``buffer`` attribute.

    Parameters
    ----------
    param_name : str
        Name of the parameter being inspected.
    param_value
        Value supplied for that parameter.

    Returns
    -------
    The adjusted value, or `param_value` itself when no rule applies.
    """
    operator_params = ["binary_operators", "unary_operators"]

    # A lone operator string is promoted to a list holding that operator.
    if param_name in operator_params and isinstance(param_value, str):
        return [param_value]

    # The comparison is only evaluated for `batch_size`, as in the rule above.
    if param_name == "batch_size" and param_value < 1:
        message = (
            "Given `batch_size` must be greater than or equal to one. "
            "`batch_size` has been increased to equal one."
        )
        warnings.warn(message)
        return 1

    # Only probe stdout once we know a progress bar was actually requested.
    if param_name == "progress" and param_value == True:  # noqa: E712
        if "buffer" not in sys.stdout.__dir__():
            message = (
                "Note: it looks like you are running in Jupyter. "
                "The progress bar will be turned off."
            )
            warnings.warn(message)
            return False

    return param_value
|
pysr/test/test.py
CHANGED
|
@@ -431,6 +431,16 @@ class TestPipeline(unittest.TestCase):
|
|
| 431 |
)
|
| 432 |
np.testing.assert_allclose(model.predict(self.X), model3.predict(self.X))
|
| 433 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 434 |
|
| 435 |
def manually_create_model(equations, feature_names=None):
|
| 436 |
if feature_names is None:
|
|
@@ -526,7 +536,7 @@ class TestFeatureSelection(unittest.TestCase):
|
|
| 526 |
X = self.rstate.randn(20000, 5)
|
| 527 |
y = X[:, 2] ** 2 + X[:, 3] ** 2
|
| 528 |
selected = run_feature_selection(X, y, select_k_features=2)
|
| 529 |
-
|
| 530 |
|
| 531 |
def test_feature_selection_handler(self):
|
| 532 |
X = self.rstate.randn(20000, 5)
|
|
@@ -538,8 +548,8 @@ class TestFeatureSelection(unittest.TestCase):
|
|
| 538 |
variable_names=var_names,
|
| 539 |
y=y,
|
| 540 |
)
|
| 541 |
-
|
| 542 |
-
selected_var_names = [var_names[i] for i in selection]
|
| 543 |
self.assertEqual(set(selected_var_names), set("x2 x3".split(" ")))
|
| 544 |
np.testing.assert_array_equal(
|
| 545 |
np.sort(selected_X, axis=1), np.sort(X[:, [2, 3]], axis=1)
|
|
|
|
| 431 |
)
|
| 432 |
np.testing.assert_allclose(model.predict(self.X), model3.predict(self.X))
|
| 433 |
|
| 434 |
+
def test_jl_function_error(self):
    """Fitting with the unary operator string ``"1"`` must raise a ValueError.

    The error message is expected to name the offending parameter and
    operator string.
    """
    # TODO: Move this to better class
    expected_fragment = (
        "When building `unary_operators`, `'1'` did not return a Julia function"
    )

    with self.assertRaises(ValueError) as raised:
        PySRRegressor(unary_operators=["1"]).fit([[1]], [1])

    error_text = str(raised.exception)
    self.assertIn(expected_fragment, error_text)
|
| 443 |
+
|
| 444 |
|
| 445 |
def manually_create_model(equations, feature_names=None):
|
| 446 |
if feature_names is None:
|
|
|
|
| 536 |
X = self.rstate.randn(20000, 5)
|
| 537 |
y = X[:, 2] ** 2 + X[:, 3] ** 2
|
| 538 |
selected = run_feature_selection(X, y, select_k_features=2)
|
| 539 |
+
np.testing.assert_array_equal(selected, [False, False, True, True, False])
|
| 540 |
|
| 541 |
def test_feature_selection_handler(self):
|
| 542 |
X = self.rstate.randn(20000, 5)
|
|
|
|
| 548 |
variable_names=var_names,
|
| 549 |
y=y,
|
| 550 |
)
|
| 551 |
+
np.testing.assert_array_equal(selection, [False, False, True, True, False])
|
| 552 |
+
selected_var_names = [var_names[i] for i in range(5) if selection[i]]
|
| 553 |
self.assertEqual(set(selected_var_names), set("x2 x3".split(" ")))
|
| 554 |
np.testing.assert_array_equal(
|
| 555 |
np.sort(selected_X, axis=1), np.sort(X[:, [2, 3]], axis=1)
|
pysr/utils.py
CHANGED
|
@@ -1,10 +1,18 @@
|
|
| 1 |
import os
|
| 2 |
import re
|
|
|
|
|
|
|
| 3 |
|
| 4 |
-
from
|
|
|
|
| 5 |
|
|
|
|
| 6 |
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
if os.path.splitext(csv_filename)[1] == ".pkl":
|
| 9 |
return csv_filename
|
| 10 |
|
|
|
|
| 1 |
import os
|
| 2 |
import re
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Any, List, TypeVar, Union
|
| 5 |
|
| 6 |
+
from numpy import ndarray
|
| 7 |
+
from sklearn.utils.validation import _check_feature_names_in # type: ignore
|
| 8 |
|
| 9 |
+
T = TypeVar("T", bound=Any)
|
| 10 |
|
| 11 |
+
ArrayLike = Union[ndarray, List[T]]
|
| 12 |
+
PathLike = Union[str, Path]
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def _csv_filename_to_pkl_filename(csv_filename: PathLike) -> PathLike:
|
| 16 |
if os.path.splitext(csv_filename)[1] == ".pkl":
|
| 17 |
return csv_filename
|
| 18 |
|
requirements.txt
CHANGED
|
@@ -5,4 +5,3 @@ scikit_learn>=1.0.0,<2.0.0
|
|
| 5 |
juliacall==0.9.20
|
| 6 |
click>=7.0.0,<9.0.0
|
| 7 |
setuptools>=50.0.0
|
| 8 |
-
typing_extensions>=4.0.0,<5.0.0; python_version < "3.8"
|
|
|
|
| 5 |
juliacall==0.9.20
|
| 6 |
click>=7.0.0,<9.0.0
|
| 7 |
setuptools>=50.0.0
|
|
|