Spaces:

MilesCranmer
/

PySR

Running

App Files Community

MilesCranmer committed on Jun 16, 2024

Commit

f653388

unverified ·

2 Parent(s): 476f573 291dc85

Merge pull request #609 from MilesCranmer/cleanup

Browse files

More extensive typing stubs and associated refactoring

Files changed (14) hide show

.gitignore +2 -0
environment.yml +0 -1
pyproject.toml +6 -0
pysr/denoising.py +17 -4
pysr/export_latex.py +12 -0
pysr/export_numpy.py +10 -2
pysr/export_sympy.py +7 -5
pysr/feature_selection.py +19 -3
pysr/julia_helpers.py +17 -5
pysr/julia_import.py +5 -0
pysr/sr.py +260 -156
pysr/test/test.py +13 -3
pysr/utils.py +10 -2
requirements.txt +0 -1

.gitignore CHANGED Viewed

@@ -23,3 +23,5 @@ site
 **/*.code-workspace
 **/*.tar.gz
 venv

 **/*.code-workspace
 **/*.tar.gz
 venv
+requirements-dev.lock
+requirements.lock

environment.yml CHANGED Viewed

@@ -9,4 +9,3 @@ dependencies:
   - scikit-learn>=1.0.0,<2.0.0
   - pyjuliacall>=0.9.15,<0.10.0
   - click>=7.0.0,<9.0.0
-  - typing_extensions>=4.0.0,<5.0.0

   - scikit-learn>=1.0.0,<2.0.0
   - pyjuliacall>=0.9.15,<0.10.0
   - click>=7.0.0,<9.0.0

pyproject.toml CHANGED Viewed

@@ -35,4 +35,10 @@ dev-dependencies = [
     "pre-commit>=3.7.0",
     "ipython>=8.23.0",
     "ipykernel>=6.29.4",
 ]

     "pre-commit>=3.7.0",
     "ipython>=8.23.0",
     "ipykernel>=6.29.4",
+    "mypy>=1.10.0",
+    "jax[cpu]>=0.4.26",
+    "torch>=2.3.0",
+    "pandas-stubs>=2.2.1.240316",
+    "types-pytz>=2024.1.0.20240417",
+    "types-openpyxl>=3.1.0.20240428",
 ]

pysr/denoising.py CHANGED Viewed

@@ -1,9 +1,17 @@
 """Functions for denoising data during preprocessing."""
 import numpy as np
-def denoise(X, y, Xresampled=None, random_state=None):
     """Denoise the dataset using a Gaussian process."""
     from sklearn.gaussian_process import GaussianProcessRegressor
     from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel
@@ -15,12 +23,17 @@ def denoise(X, y, Xresampled=None, random_state=None):
     gpr.fit(X, y)
     if Xresampled is not None:
-        return Xresampled, gpr.predict(Xresampled)
-    return X, gpr.predict(X)
-def multi_denoise(X, y, Xresampled=None, random_state=None):
     """Perform `denoise` along each column of `y` independently."""
     y = np.stack(
         [

 """Functions for denoising data during preprocessing."""
+from typing import Optional, Tuple, cast
 import numpy as np
+from numpy import ndarray
+def denoise(
+    X: ndarray,
+    y: ndarray,
+    Xresampled: Optional[ndarray] = None,
+    random_state: Optional[np.random.RandomState] = None,
+) -> Tuple[ndarray, ndarray]:
     """Denoise the dataset using a Gaussian process."""
     from sklearn.gaussian_process import GaussianProcessRegressor
     from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel
     gpr.fit(X, y)
     if Xresampled is not None:
+        return Xresampled, cast(ndarray, gpr.predict(Xresampled))
+    return X, cast(ndarray, gpr.predict(X))
+def multi_denoise(
+    X: ndarray,
+    y: ndarray,
+    Xresampled: Optional[ndarray] = None,
+    random_state: Optional[np.random.RandomState] = None,
+):
     """Perform `denoise` along each column of `y` independently."""
     y = np.stack(
         [

pysr/export_latex.py CHANGED Viewed

@@ -153,3 +153,15 @@ def sympy2multilatextable(
     ]
     return "\n\n".join(latex_tables)

     ]
     return "\n\n".join(latex_tables)
+def with_preamble(table_string: str) -> str:
+    preamble_string = [
+        r"\usepackage{breqn}",
+        r"\usepackage{booktabs}",
+        "",
+        "...",
+        "",
+        table_string,
+    ]
+    return "\n".join(preamble_string)

pysr/export_numpy.py CHANGED Viewed

@@ -1,10 +1,12 @@
 """Code for exporting discovered expressions to numpy"""
 import warnings
 import numpy as np
 import pandas as pd
-from sympy import lambdify
 def sympy2numpy(eqn, sympy_symbols, *, selection=None):
@@ -14,6 +16,10 @@ def sympy2numpy(eqn, sympy_symbols, *, selection=None):
 class CallableEquation:
     """Simple wrapper for numpy lambda functions built with sympy"""
     def __init__(self, eqn, sympy_symbols, selection=None):
         self._sympy = eqn
         self._sympy_symbols = sympy_symbols
@@ -29,8 +35,9 @@ class CallableEquation:
             return self._lambda(
                 **{k: X[k].values for k in map(str, self._sympy_symbols)}
             ) * np.ones(expected_shape)
         if self._selection is not None:
-            if X.shape[1] != len(self._selection):
                 warnings.warn(
                     "`X` should be of shape (n_samples, len(self._selection)). "
                     "Automatically filtering `X` to selection. "
@@ -38,6 +45,7 @@ class CallableEquation:
                     "this may lead to incorrect predictions and other errors."
                 )
                 X = X[:, self._selection]
         return self._lambda(*X.T) * np.ones(expected_shape)
     @property

 """Code for exporting discovered expressions to numpy"""
 import warnings
+from typing import List, Union
 import numpy as np
 import pandas as pd
+from numpy.typing import NDArray
+from sympy import Expr, Symbol, lambdify
 def sympy2numpy(eqn, sympy_symbols, *, selection=None):
 class CallableEquation:
     """Simple wrapper for numpy lambda functions built with sympy"""
+    _sympy: Expr
+    _sympy_symbols: List[Symbol]
+    _selection: Union[NDArray[np.bool_], None]
     def __init__(self, eqn, sympy_symbols, selection=None):
         self._sympy = eqn
         self._sympy_symbols = sympy_symbols
             return self._lambda(
                 **{k: X[k].values for k in map(str, self._sympy_symbols)}
             ) * np.ones(expected_shape)
         if self._selection is not None:
+            if X.shape[1] != self._selection.sum():
                 warnings.warn(
                     "`X` should be of shape (n_samples, len(self._selection)). "
                     "Automatically filtering `X` to selection. "
                     "this may lead to incorrect predictions and other errors."
                 )
                 X = X[:, self._selection]
         return self._lambda(*X.T) * np.ones(expected_shape)
     @property

pysr/export_sympy.py CHANGED Viewed

@@ -5,6 +5,8 @@ from typing import Callable, Dict, List, Optional
 import sympy
 from sympy import sympify
 sympy_mappings = {
     "div": lambda x, y: x / y,
     "mult": lambda x, y: x * y,
@@ -30,8 +32,8 @@ sympy_mappings = {
     "acosh": lambda x: sympy.acosh(x),
     "acosh_abs": lambda x: sympy.acosh(abs(x) + 1),
     "asinh": sympy.asinh,
-    "atanh": lambda x: sympy.atanh(sympy.Mod(x + 1, 2) - 1),
-    "atanh_clip": lambda x: sympy.atanh(sympy.Mod(x + 1, 2) - 1),
     "abs": abs,
     "mod": sympy.Mod,
     "erf": sympy.erf,
@@ -60,13 +62,13 @@ sympy_mappings = {
 def create_sympy_symbols_map(
-    feature_names_in: List[str],
 ) -> Dict[str, sympy.Symbol]:
     return {variable: sympy.Symbol(variable) for variable in feature_names_in}
 def create_sympy_symbols(
-    feature_names_in: List[str],
 ) -> List[sympy.Symbol]:
     return [sympy.Symbol(variable) for variable in feature_names_in]
@@ -74,7 +76,7 @@ def create_sympy_symbols(
 def pysr2sympy(
     equation: str,
     *,
-    feature_names_in: Optional[List[str]] = None,
     extra_sympy_mappings: Optional[Dict[str, Callable]] = None,
 ):
     if feature_names_in is None:

 import sympy
 from sympy import sympify
+from .utils import ArrayLike
 sympy_mappings = {
     "div": lambda x, y: x / y,
     "mult": lambda x, y: x * y,
     "acosh": lambda x: sympy.acosh(x),
     "acosh_abs": lambda x: sympy.acosh(abs(x) + 1),
     "asinh": sympy.asinh,
+    "atanh": lambda x: sympy.atanh(sympy.Mod(x + 1, 2) - sympy.S(1)),
+    "atanh_clip": lambda x: sympy.atanh(sympy.Mod(x + 1, 2) - sympy.S(1)),
     "abs": abs,
     "mod": sympy.Mod,
     "erf": sympy.erf,
 def create_sympy_symbols_map(
+    feature_names_in: ArrayLike[str],
 ) -> Dict[str, sympy.Symbol]:
     return {variable: sympy.Symbol(variable) for variable in feature_names_in}
 def create_sympy_symbols(
+    feature_names_in: ArrayLike[str],
 ) -> List[sympy.Symbol]:
     return [sympy.Symbol(variable) for variable in feature_names_in]
 def pysr2sympy(
     equation: str,
     *,
+    feature_names_in: Optional[ArrayLike[str]] = None,
     extra_sympy_mappings: Optional[Dict[str, Callable]] = None,
 ):
     if feature_names_in is None:

pysr/feature_selection.py CHANGED Viewed

@@ -1,9 +1,20 @@
 """Functions for doing feature selection during preprocessing."""
 import numpy as np
-def run_feature_selection(X, y, select_k_features, random_state=None):
     """
     Find most important features.
@@ -21,11 +32,16 @@ def run_feature_selection(X, y, select_k_features, random_state=None):
     selector = SelectFromModel(
         clf, threshold=-np.inf, max_features=select_k_features, prefit=True
     )
-    return selector.get_support(indices=True)
 # Function has not been removed only due to usage in module tests
-def _handle_feature_selection(X, select_k_features, y, variable_names):
     if select_k_features is not None:
         selection = run_feature_selection(X, y, select_k_features)
         print(f"Using features {[variable_names[i] for i in selection]}")

 """Functions for doing feature selection during preprocessing."""
+from typing import Optional, cast
 import numpy as np
+from numpy import ndarray
+from numpy.typing import NDArray
+from .utils import ArrayLike
+def run_feature_selection(
+    X: ndarray,
+    y: ndarray,
+    select_k_features: int,
+    random_state: Optional[np.random.RandomState] = None,
+) -> NDArray[np.bool_]:
     """
     Find most important features.
     selector = SelectFromModel(
         clf, threshold=-np.inf, max_features=select_k_features, prefit=True
     )
+    return cast(NDArray[np.bool_], selector.get_support(indices=False))
 # Function has not been removed only due to usage in module tests
+def _handle_feature_selection(
+    X: ndarray,
+    select_k_features: Optional[int],
+    y: ndarray,
+    variable_names: ArrayLike[str],
+):
     if select_k_features is not None:
         selection = run_feature_selection(X, y, select_k_features)
         print(f"Using features {[variable_names[i] for i in selection]}")

pysr/julia_helpers.py CHANGED Viewed

@@ -1,11 +1,16 @@
 """Functions for initializing the Julia environment and installing deps."""
 import numpy as np
 from juliacall import convert as jl_convert  # type: ignore
 from .deprecated import init_julia, install
 from .julia_import import jl
 jl.seval("using Serialization: Serialization")
 jl.seval("using PythonCall: PythonCall")
@@ -22,24 +27,31 @@ def _escape_filename(filename):
     return str_repr
-def _load_cluster_manager(cluster_manager):
     jl.seval(f"using ClusterManagers: addprocs_{cluster_manager}")
     return jl.seval(f"addprocs_{cluster_manager}")
-def jl_array(x):
     if x is None:
         return None
-    return jl_convert(jl.Array, x)
-def jl_serialize(obj):
     buf = jl.IOBuffer()
     Serialization.serialize(buf, obj)
     return np.array(jl.take_b(buf))
-def jl_deserialize(s):
     if s is None:
         return s
     buf = jl.IOBuffer()

 """Functions for initializing the Julia environment and installing deps."""
+from typing import Any, Callable, Union, cast
 import numpy as np
 from juliacall import convert as jl_convert  # type: ignore
+from numpy.typing import NDArray
 from .deprecated import init_julia, install
 from .julia_import import jl
+jl_convert = cast(Callable[[Any, Any], Any], jl_convert)
 jl.seval("using Serialization: Serialization")
 jl.seval("using PythonCall: PythonCall")
     return str_repr
+def _load_cluster_manager(cluster_manager: str):
     jl.seval(f"using ClusterManagers: addprocs_{cluster_manager}")
     return jl.seval(f"addprocs_{cluster_manager}")
+def jl_array(x, dtype=None):
     if x is None:
         return None
+    elif dtype is None:
+        return jl_convert(jl.Array, x)
+    else:
+        return jl_convert(jl.Array[dtype], x)
+def jl_is_function(f) -> bool:
+    return cast(bool, jl.seval("op -> op isa Function")(f))
+def jl_serialize(obj: Any) -> NDArray[np.uint8]:
     buf = jl.IOBuffer()
     Serialization.serialize(buf, obj)
     return np.array(jl.take_b(buf))
+def jl_deserialize(s: Union[NDArray[np.uint8], None]):
     if s is None:
         return s
     buf = jl.IOBuffer()

pysr/julia_import.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import os
 import sys
 import warnings
 # Check if JuliaCall is already loaded, and if so, warn the user
 # about the relevant environment variables. If not loaded,
@@ -42,6 +44,9 @@ if autoload_extensions is not None:
 from juliacall import Main as jl  # type: ignore
 jl_version = (jl.VERSION.major, jl.VERSION.minor, jl.VERSION.patch)
 jl.seval("using SymbolicRegression")

 import os
 import sys
 import warnings
+from types import ModuleType
+from typing import cast
 # Check if JuliaCall is already loaded, and if so, warn the user
 # about the relevant environment variables. If not loaded,
 from juliacall import Main as jl  # type: ignore
+jl = cast(ModuleType, jl)
 jl_version = (jl.VERSION.major, jl.VERSION.minor, jl.VERSION.patch)
 jl.seval("using SymbolicRegression")

pysr/sr.py CHANGED Viewed

@@ -8,27 +8,31 @@ import shutil
 import sys
 import tempfile
 import warnings
 from datetime import datetime
 from io import StringIO
 from multiprocessing import cpu_count
 from pathlib import Path
-from typing import Callable, Dict, List, Optional, Tuple, Union
-if sys.version_info >= (3, 8):
-    from typing import Literal
-else:
-    from typing_extensions import Literal
 import numpy as np
 import pandas as pd
 from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin
 from sklearn.utils import check_array, check_consistent_length, check_random_state
-from sklearn.utils.validation import _check_feature_names_in, check_is_fitted
 from .denoising import denoise, multi_denoise
 from .deprecated import DEPRECATED_KWARGS
 from .export_jax import sympy2jax
-from .export_latex import sympy2latex, sympy2latextable, sympy2multilatextable
 from .export_numpy import sympy2numpy
 from .export_sympy import assert_valid_sympy_symbol, create_sympy_symbols, pysr2sympy
 from .export_torch import sympy2torch
@@ -40,17 +44,20 @@ from .julia_helpers import (
     _load_cluster_manager,
     jl_array,
     jl_deserialize,
     jl_serialize,
 )
 from .julia_import import SymbolicRegression, jl
 from .utils import (
     _csv_filename_to_pkl_filename,
     _preprocess_julia_floats,
     _safe_check_feature_names_in,
     _subscriptify,
 )
-already_ran = False
 def _process_constraints(binary_operators, unary_operators, constraints):
@@ -178,6 +185,21 @@ def _check_assertions(
 VALID_OPTIMIZER_ALGORITHMS = ["BFGS", "NelderMead"]
 class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
     """
     High-performance symbolic regression algorithm.
@@ -606,22 +628,17 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         Units of each variable in the training dataset, `y`.
     nout_ : int
         Number of output dimensions.
-    selection_mask_ : list[int] of length `select_k_features`
-        List of indices for input features that are selected when
-        `select_k_features` is set.
     tempdir_ : Path
         Path to the temporary equations directory.
-    equation_file_ : str
         Output equation file name produced by the julia backend.
     julia_state_stream_ : ndarray
         The serialized state for the julia SymbolicRegression.jl backend (after fitting),
         stored as an array of uint8, produced by Julia's Serialization.serialize function.
-    julia_state_
-        The deserialized state.
     julia_options_stream_ : ndarray
         The serialized julia options, stored as an array of uint8,
-    julia_options_
-        The deserialized julia options.
     equation_file_contents_ : list[pandas.DataFrame]
         Contents of the equation file output by the Julia backend.
     show_pickle_warnings_ : bool
@@ -668,6 +685,21 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
     ```
     """
     def __init__(
         self,
         model_selection: Literal["best", "accuracy", "score"] = "best",
@@ -900,14 +932,14 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
     @classmethod
     def from_file(
         cls,
-        equation_file,
         *,
-        binary_operators=None,
-        unary_operators=None,
-        n_features_in=None,
-        feature_names_in=None,
-        selection_mask=None,
-        nout=1,
         **pysr_kwargs,
     ):
         """
@@ -915,7 +947,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         Parameters
         ----------
-        equation_file : str
             Path to a pickle file containing a saved model, or a csv file
             containing equations.
         binary_operators : list[str]
@@ -930,8 +962,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         feature_names_in : list[str]
             Names of the features passed to the model.
             Not needed if loading from a pickle file.
-        selection_mask : list[bool]
-            If using select_k_features, you must pass `model.selection_mask_` here.
             Not needed if loading from a pickle file.
         nout : int
             Number of outputs of the model.
@@ -982,7 +1014,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         # TODO: copy .bkup file if exists.
         model = cls(
-            equation_file=equation_file,
             binary_operators=binary_operators,
             unary_operators=unary_operators,
             **pysr_kwargs,
@@ -1002,7 +1034,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             model.display_feature_names_in_ = feature_names_in
         if selection_mask is None:
-            model.selection_mask_ = np.ones(n_features_in, dtype=bool)
         else:
             model.selection_mask_ = selection_mask
@@ -1029,7 +1061,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             all_equations = equations
         for i, equations in enumerate(all_equations):
-            selected = ["" for _ in range(len(equations))]
             chosen_row = idx_model_selection(equations, self.model_selection)
             selected[chosen_row] = ">>>>"
             repr_equations = pd.DataFrame(
@@ -1129,10 +1161,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
     @property
     def julia_options_(self):
         return jl_deserialize(self.julia_options_stream_)
     @property
     def julia_state_(self):
         return jl_deserialize(self.julia_state_stream_)
     @property
@@ -1145,7 +1179,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         )
         return self.julia_state_
-    def get_best(self, index=None):
         """
         Get best equation using `model_selection`.
@@ -1168,8 +1202,6 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             Raised when an invalid model selection strategy is provided.
         """
         check_is_fitted(self, attributes=["equations_"])
-        if self.equations_ is None:
-            raise ValueError("No equations have been generated yet.")
         if index is not None:
             if isinstance(self.equations_, list):
@@ -1177,16 +1209,21 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
                     index, list
                 ), "With multiple output features, index must be a list."
                 return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
-            return self.equations_.iloc[index]
         if isinstance(self.equations_, list):
             return [
-                eq.iloc[idx_model_selection(eq, self.model_selection)]
                 for eq in self.equations_
             ]
-        return self.equations_.iloc[
-            idx_model_selection(self.equations_, self.model_selection)
-        ]
     def _setup_equation_file(self):
         """
@@ -1211,7 +1248,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             self.equation_file_ = self.equation_file
         self.equation_file_contents_ = None
-    def _validate_and_set_init_params(self):
         """
         Ensure parameters passed at initialization are valid.
@@ -1269,59 +1306,48 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
                 f"PySR currently only supports the following optimizer algorithms: {VALID_OPTIMIZER_ALGORITHMS}"
             )
-        progress = self.progress
-        # 'Mutable' parameter validation
-        #  (Params and their default values, if None is given:)
-        default_param_mapping = {
-            "binary_operators": "+ * - /".split(" "),
-            "unary_operators": [],
-            "maxdepth": self.maxsize,
-            "constraints": {},
-            "multithreading": self.procs != 0 and self.cluster_manager is None,
-            "batch_size": 1,
-            "update_verbosity": int(self.verbosity),
-            "progress": progress,
-        }
-        packed_modified_params = {}
-        for parameter, default_value in default_param_mapping.items():
-            parameter_value = getattr(self, parameter)
-            if parameter_value is None:
-                parameter_value = default_value
             else:
-                # Special cases such as when binary_operators is a string
-                if parameter in ["binary_operators", "unary_operators"] and isinstance(
-                    parameter_value, str
-                ):
-                    parameter_value = [parameter_value]
-                elif parameter == "batch_size" and parameter_value < 1:
-                    warnings.warn(
-                        "Given `batch_size` must be greater than or equal to one. "
-                        "`batch_size` has been increased to equal one."
-                    )
-                    parameter_value = 1
-                elif (
-                    parameter == "progress"
-                    and parameter_value
-                    and "buffer" not in sys.stdout.__dir__()
-                ):
-                    warnings.warn(
-                        "Note: it looks like you are running in Jupyter. "
-                        "The progress bar will be turned off."
-                    )
-                    parameter_value = False
-            packed_modified_params[parameter] = parameter_value
         assert (
-            len(packed_modified_params["binary_operators"])
-            + len(packed_modified_params["unary_operators"])
-            > 0
-        )
-        return packed_modified_params
     def _validate_and_set_fit_params(
         self, X, y, Xresampled, weights, variable_names, X_units, y_units
-    ):
         """
         Validate the parameters passed to the :term`fit` method.
@@ -1341,7 +1367,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             Weight array of the same shape as `y`.
             Each element is how to weight the mean-square-error loss
             for that particular element of y.
-        variable_names : list[str] of length n_features
             Names of each variable in the training dataset, `X`.
         X_units : list[str] of length n_features
             Units of each variable in the training dataset, `X`.
@@ -1397,7 +1423,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         if weights is not None:
             weights = check_array(weights, ensure_2d=False)
             check_consistent_length(weights, y)
-        X, y = self._validate_data(X=X, y=y, reset=True, multi_output=True)
         self.feature_names_in_ = _safe_check_feature_names_in(
             self, variable_names, generate_names=False
         )
@@ -1407,10 +1433,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             self.display_feature_names_in_ = np.array(
                 [f"x{_subscriptify(i)}" for i in range(X.shape[1])]
             )
         else:
             self.display_feature_names_in_ = self.feature_names_in_
-        variable_names = self.feature_names_in_
         # Handle multioutput data
         if len(y.shape) == 1 or (len(y.shape) == 2 and y.shape[1] == 1):
@@ -1425,8 +1451,23 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         return X, y, Xresampled, weights, variable_names, X_units, y_units
     def _pre_transform_training_data(
-        self, X, y, Xresampled, variable_names, X_units, y_units, random_state
     ):
         """
         Transform the training data before fitting the symbolic regressor.
@@ -1435,12 +1476,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         Parameters
         ----------
-        X : ndarray | pandas.DataFrame
             Training data of shape (n_samples, n_features).
-        y : ndarray | pandas.DataFrame
             Target values of shape (n_samples,) or (n_samples, n_targets).
             Will be cast to X's dtype if necessary.
-        Xresampled : ndarray | pandas.DataFrame
             Resampled training data, of shape `(n_resampled, n_features)`,
             used for denoising.
         variable_names : list[str]
@@ -1478,24 +1519,35 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         """
         # Feature selection transformation
         if self.select_k_features:
-            self.selection_mask_ = run_feature_selection(
                 X, y, self.select_k_features, random_state=random_state
             )
-            X = X[:, self.selection_mask_]
             if Xresampled is not None:
-                Xresampled = Xresampled[:, self.selection_mask_]
             # Reduce variable_names to selection
-            variable_names = [variable_names[i] for i in self.selection_mask_]
             if X_units is not None:
-                X_units = [X_units[i] for i in self.selection_mask_]
                 self.X_units_ = copy.deepcopy(X_units)
             # Re-perform data validation and feature name updating
-            X, y = self._validate_data(X=X, y=y, reset=True, multi_output=True)
             # Update feature names with selected variable names
             self.feature_names_in_ = _check_feature_names_in(self, variable_names)
             self.display_feature_names_in_ = self.feature_names_in_
             print(f"Using features {self.feature_names_in_}")
@@ -1511,20 +1563,27 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         return X, y, variable_names, X_units, y_units
-    def _run(self, X, y, mutated_params, weights, seed):
         """
         Run the symbolic regression fitting process on the julia backend.
         Parameters
         ----------
-        X : ndarray | pandas.DataFrame
             Training data of shape `(n_samples, n_features)`.
-        y : ndarray | pandas.DataFrame
             Target values of shape `(n_samples,)` or `(n_samples, n_targets)`.
             Will be cast to `X`'s dtype if necessary.
-        mutated_params : dict[str, Any]
-            Dictionary of mutated versions of some parameters passed in __init__.
-        weights : ndarray | pandas.DataFrame
             Weight array of the same shape as `y`.
             Each element is how to weight the mean-square-error loss
             for that particular element of y.
@@ -1543,24 +1602,26 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         """
         # Need to be global as we don't want to recreate/reinstate julia for
         # every new instance of PySRRegressor
-        global already_ran
         # These are the parameters which may be modified from the ones
         # specified in init, so we define them here locally:
-        binary_operators = mutated_params["binary_operators"]
-        unary_operators = mutated_params["unary_operators"]
-        maxdepth = mutated_params["maxdepth"]
-        constraints = mutated_params["constraints"]
         nested_constraints = self.nested_constraints
         complexity_of_operators = self.complexity_of_operators
-        multithreading = mutated_params["multithreading"]
         cluster_manager = self.cluster_manager
-        batch_size = mutated_params["batch_size"]
-        update_verbosity = mutated_params["update_verbosity"]
-        progress = mutated_params["progress"]
         # Start julia backend processes
-        if not already_ran and update_verbosity != 0:
             print("Compiling Julia backend...")
         if cluster_manager is not None:
@@ -1599,6 +1660,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
                 complexity_of_operators_str += f"({k}) => {v}, "
             complexity_of_operators_str += ")"
             complexity_of_operators = jl.seval(complexity_of_operators_str)
         custom_loss = jl.seval(
             str(self.elementwise_loss)
@@ -1635,11 +1697,25 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             optimize=self.weight_optimize,
         )
         # Call to Julia backend.
         # See https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/OptionsStruct.jl
         options = SymbolicRegression.Options(
-            binary_operators=jl.seval(str(binary_operators).replace("'", "")),
-            unary_operators=jl.seval(str(unary_operators).replace("'", "")),
             bin_constraints=jl_array(bin_constraints),
             una_constraints=jl_array(una_constraints),
             complexity_of_operators=complexity_of_operators,
@@ -1671,9 +1747,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             fraction_replaced_hof=self.fraction_replaced_hof,
             should_simplify=self.should_simplify,
             should_optimize_constants=self.should_optimize_constants,
-            warmup_maxsize_by=(
-                0.0 if self.warmup_maxsize_by is None else self.warmup_maxsize_by
-            ),
             use_frequency=self.use_frequency,
             use_frequency_in_tournament=self.use_frequency_in_tournament,
             adaptive_parsimony_scaling=self.adaptive_parsimony_scaling,
@@ -1780,7 +1854,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         if self.delete_tempfiles:
             shutil.rmtree(self.tempdir_)
-        already_ran = True
         return self
@@ -1790,9 +1864,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         y,
         Xresampled=None,
         weights=None,
-        variable_names: Optional[List[str]] = None,
-        X_units: Optional[List[str]] = None,
-        y_units: Optional[List[str]] = None,
     ) -> "PySRRegressor":
         """
         Search for equations to fit the dataset and store them in `self.equations_`.
@@ -1854,12 +1928,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             self.X_units_ = None
             self.y_units_ = None
-        random_state = check_random_state(self.random_state)  # For np random
-        seed = random_state.get_state()[1][0]  # For julia random
         self._setup_equation_file()
-        mutated_params = self._validate_and_set_init_params()
         (
             X,
@@ -1884,6 +1955,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
                 "More datapoints will lower the search speed."
             )
         # Pre transformations (feature selection and denoising)
         X, y, variable_names, X_units, y_units = self._pre_transform_training_data(
             X, y, Xresampled, variable_names, X_units, y_units, random_state
@@ -1925,7 +1999,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             self._checkpoint()
         # Perform the search:
-        self._run(X, y, mutated_params, weights=weights, seed=seed)
         # Then, after fit, we save again, so the pickle file contains
         # the equations:
@@ -1934,7 +2008,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         return self
-    def refresh(self, checkpoint_file=None):
         """
         Update self.equations_ with any new options passed.
@@ -1943,11 +2017,11 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         Parameters
         ----------
-        checkpoint_file : str
             Path to checkpoint hall of fame file to be loaded.
             The default will use the set `equation_file_`.
         """
-        if checkpoint_file:
             self.equation_file_ = checkpoint_file
             self.equation_file_contents_ = None
         check_is_fitted(self, attributes=["equation_file_"])
@@ -1999,7 +2073,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             if self.selection_mask_ is not None:
                 # RangeIndex enforces column order allowing columns to
                 # be correctly filtered with self.selection_mask_
-                X = X.iloc[:, self.selection_mask_]
             X.columns = self.feature_names_in_
         # Without feature information, CallableEquation/lambda_format equations
         # require that the column order of X matches that of the X used during
@@ -2009,14 +2083,16 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         # reordered/reindexed to match those of the transformed (denoised and
         # feature selected) X in fit.
         X = X.reindex(columns=self.feature_names_in_)
-        X = self._validate_data(X, reset=False)
         try:
-            if self.nout_ > 1:
                 return np.stack(
                     [eq["lambda_format"](X) for eq in best_equation], axis=1
                 )
-            return best_equation["lambda_format"](X)
         except Exception as error:
             raise ValueError(
                 "Failed to evaluate the expression. "
@@ -2046,9 +2122,11 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         """
         self.refresh()
         best_equation = self.get_best(index=index)
-        if self.nout_ > 1:
             return [eq["sympy_format"] for eq in best_equation]
-        return best_equation["sympy_format"]
     def latex(self, index=None, precision=3):
         """
@@ -2108,9 +2186,11 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         self.set_params(output_jax_format=True)
         self.refresh()
         best_equation = self.get_best(index=index)
-        if self.nout_ > 1:
             return [eq["jax_format"] for eq in best_equation]
-        return best_equation["jax_format"]
     def pytorch(self, index=None):
         """
@@ -2138,9 +2218,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         self.set_params(output_torch_format=True)
         self.refresh()
         best_equation = self.get_best(index=index)
-        if self.nout_ > 1:
             return [eq["torch_format"] for eq in best_equation]
-        return best_equation["torch_format"]
     def _read_equation_file(self):
         """Read the hall of fame file created by `SymbolicRegression.jl`."""
@@ -2239,10 +2320,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             lastComplexity = 0
             sympy_format = []
             lambda_format = []
-            if self.output_jax_format:
-                jax_format = []
-            if self.output_torch_format:
-                torch_format = []
             for _, eqn_row in output.iterrows():
                 eqn = pysr2sympy(
@@ -2354,7 +2433,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         """
         self.refresh()
-        if self.nout_ > 1:
             if indices is not None:
                 assert isinstance(indices, list)
                 assert isinstance(indices[0], list)
@@ -2363,7 +2442,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             table_string = sympy2multilatextable(
                 self.equations_, indices=indices, precision=precision, columns=columns
             )
-        else:
             if indices is not None:
                 assert isinstance(indices, list)
                 assert isinstance(indices[0], int)
@@ -2371,15 +2450,13 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             table_string = sympy2latextable(
                 self.equations_, indices=indices, precision=precision, columns=columns
             )
-        preamble_string = [
-            r"\usepackage{breqn}",
-            r"\usepackage{booktabs}",
-            "",
-            "...",
-            "",
-        ]
-        return "\n".join(preamble_string + [table_string])
 def idx_model_selection(equations: pd.DataFrame, model_selection: str):
@@ -2397,3 +2474,30 @@ def idx_model_selection(equations: pd.DataFrame, model_selection: str):
             f"{model_selection} is not a valid model selection strategy."
         )
     return chosen_idx

 import sys
 import tempfile
 import warnings
+from dataclasses import dataclass, fields
 from datetime import datetime
 from io import StringIO
 from multiprocessing import cpu_count
 from pathlib import Path
+from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union, cast
 import numpy as np
 import pandas as pd
+from numpy import ndarray
+from numpy.typing import NDArray
 from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin
 from sklearn.utils import check_array, check_consistent_length, check_random_state
+from sklearn.utils.validation import _check_feature_names_in  # type: ignore
+from sklearn.utils.validation import check_is_fitted
 from .denoising import denoise, multi_denoise
 from .deprecated import DEPRECATED_KWARGS
 from .export_jax import sympy2jax
+from .export_latex import (
+    sympy2latex,
+    sympy2latextable,
+    sympy2multilatextable,
+    with_preamble,
+)
 from .export_numpy import sympy2numpy
 from .export_sympy import assert_valid_sympy_symbol, create_sympy_symbols, pysr2sympy
 from .export_torch import sympy2torch
     _load_cluster_manager,
     jl_array,
     jl_deserialize,
+    jl_is_function,
     jl_serialize,
 )
 from .julia_import import SymbolicRegression, jl
 from .utils import (
+    ArrayLike,
+    PathLike,
     _csv_filename_to_pkl_filename,
     _preprocess_julia_floats,
     _safe_check_feature_names_in,
     _subscriptify,
 )
+# Module-level flag shared by every `PySRRegressor` instance. `_run` declares
+# it `global`, prints "Compiling Julia backend..." only while it is still
+# False (and verbosity is non-zero), and sets it to True at the end of a run.
+ALREADY_RAN = False
 def _process_constraints(binary_operators, unary_operators, constraints):
 VALID_OPTIMIZER_ALGORITHMS = ["BFGS", "NelderMead"]
+@dataclass
+class _DynamicallySetParams:
+    """Defines some parameters that are set at runtime.
+
+    Every field shares its name with an attribute read from the regressor.
+    `PySRRegressor._validate_and_modify_params` builds an instance from
+    fallback values, then overrides each field whose user-supplied value is
+    not `None` (after passing it through `_mutate_parameter`). The result is
+    handed to `PySRRegressor._run`.
+    """
+    # Fallback: ["+", "*", "-", "/"]. A lone string is wrapped in a list.
+    binary_operators: List[str]
+    # Fallback: []. A lone string is wrapped in a list.
+    unary_operators: List[str]
+    # Fallback: the regressor's `maxsize`.
+    maxdepth: int
+    # Fallback: {}.
+    # NOTE(review): values are annotated as `str`; confirm this matches what
+    # users actually pass for `constraints` (ints / tuples seem more likely).
+    constraints: Dict[str, str]
+    # Fallback: `procs != 0 and cluster_manager is None`.
+    multithreading: bool
+    # Fallback: 1. User values below one are raised to one with a warning.
+    batch_size: int
+    # Fallback: `int(verbosity)`.
+    update_verbosity: int
+    # A value equal to True is turned off (with a warning) when `sys.stdout`
+    # does not list a `buffer` attribute.
+    progress: bool
+    # Fallback: 0.0.
+    warmup_maxsize_by: float
 class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
     """
     High-performance symbolic regression algorithm.
         Units of each variable in the training dataset, `y`.
     nout_ : int
         Number of output dimensions.
+    selection_mask_ : ndarray of shape (`n_features_in_`,)
+        Mask of which features of `X` to use when `select_k_features` is set.
     tempdir_ : Path
         Path to the temporary equations directory.
+    equation_file_ : Union[str, Path]
         Output equation file name produced by the julia backend.
     julia_state_stream_ : ndarray
         The serialized state for the julia SymbolicRegression.jl backend (after fitting),
         stored as an array of uint8, produced by Julia's Serialization.serialize function.
     julia_options_stream_ : ndarray
         The serialized julia options, stored as an array of uint8,
     equation_file_contents_ : list[pandas.DataFrame]
         Contents of the equation file output by the Julia backend.
     show_pickle_warnings_ : bool
     ```
     """
+    equations_: Union[pd.DataFrame, List[pd.DataFrame], None]
+    n_features_in_: int
+    feature_names_in_: ArrayLike[str]
+    display_feature_names_in_: ArrayLike[str]
+    X_units_: Union[ArrayLike[str], None]
+    y_units_: Union[str, ArrayLike[str], None]
+    nout_: int
+    selection_mask_: Union[NDArray[np.bool_], None]
+    tempdir_: Path
+    equation_file_: PathLike
+    julia_state_stream_: Union[NDArray[np.uint8], None]
+    julia_options_stream_: Union[NDArray[np.uint8], None]
+    equation_file_contents_: Union[List[pd.DataFrame], None]
+    show_pickle_warnings_: bool
     def __init__(
         self,
         model_selection: Literal["best", "accuracy", "score"] = "best",
     @classmethod
     def from_file(
         cls,
+        equation_file: PathLike,
         *,
+        binary_operators: Optional[List[str]] = None,
+        unary_operators: Optional[List[str]] = None,
+        n_features_in: Optional[int] = None,
+        feature_names_in: Optional[ArrayLike[str]] = None,
+        selection_mask: Optional[NDArray[np.bool_]] = None,
+        nout: int = 1,
         **pysr_kwargs,
     ):
         """
         Parameters
         ----------
+        equation_file : str or Path
             Path to a pickle file containing a saved model, or a csv file
             containing equations.
         binary_operators : list[str]
         feature_names_in : list[str]
             Names of the features passed to the model.
             Not needed if loading from a pickle file.
+        selection_mask : NDArray[np.bool_]
+            If using `select_k_features`, you must pass `model.selection_mask_` here.
             Not needed if loading from a pickle file.
         nout : int
             Number of outputs of the model.
         # TODO: copy .bkup file if exists.
         model = cls(
+            equation_file=str(equation_file),
             binary_operators=binary_operators,
             unary_operators=unary_operators,
             **pysr_kwargs,
             model.display_feature_names_in_ = feature_names_in
         if selection_mask is None:
+            model.selection_mask_ = np.ones(n_features_in, dtype=np.bool_)
         else:
             model.selection_mask_ = selection_mask
             all_equations = equations
         for i, equations in enumerate(all_equations):
+            selected = pd.Series([""] * len(equations), index=equations.index)
             chosen_row = idx_model_selection(equations, self.model_selection)
             selected[chosen_row] = ">>>>"
             repr_equations = pd.DataFrame(
     @property
     def julia_options_(self):
+        """The deserialized julia options.
+
+        Calls `jl_deserialize` on `julia_options_stream_` every time the
+        property is read; nothing is cached here.
+        """
         return jl_deserialize(self.julia_options_stream_)
     @property
     def julia_state_(self):
+        """The deserialized state.
+
+        Calls `jl_deserialize` on `julia_state_stream_` every time the
+        property is read; nothing is cached here.
+        """
         return jl_deserialize(self.julia_state_stream_)
     @property
         )
         return self.julia_state_
+    def get_best(self, index=None) -> Union[pd.Series, List[pd.Series]]:
         """
         Get best equation using `model_selection`.
             Raised when an invalid model selection strategy is provided.
         """
         check_is_fitted(self, attributes=["equations_"])
         if index is not None:
             if isinstance(self.equations_, list):
                     index, list
                 ), "With multiple output features, index must be a list."
                 return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
+            else:
+                equations_ = cast(pd.DataFrame, self.equations_)
+                return cast(pd.Series, equations_.iloc[index])
         if isinstance(self.equations_, list):
             return [
+                cast(pd.Series, eq.loc[idx_model_selection(eq, self.model_selection)])
                 for eq in self.equations_
             ]
+        else:
+            equations_ = cast(pd.DataFrame, self.equations_)
+            return cast(
+                pd.Series,
+                equations_.loc[idx_model_selection(equations_, self.model_selection)],
+            )
     def _setup_equation_file(self):
         """
             self.equation_file_ = self.equation_file
         self.equation_file_contents_ = None
+    def _validate_and_modify_params(self) -> _DynamicallySetParams:
         """
         Ensure parameters passed at initialization are valid.
                 f"PySR currently only supports the following optimizer algorithms: {VALID_OPTIMIZER_ALGORITHMS}"
             )
+        param_container = _DynamicallySetParams(
+            binary_operators=["+", "*", "-", "/"],
+            unary_operators=[],
+            maxdepth=self.maxsize,
+            constraints={},
+            multithreading=self.procs != 0 and self.cluster_manager is None,
+            batch_size=1,
+            update_verbosity=int(self.verbosity),
+            progress=self.progress,
+            warmup_maxsize_by=0.0,
+        )
+        for param_name in map(lambda x: x.name, fields(_DynamicallySetParams)):
+            user_param_value = getattr(self, param_name)
+            if user_param_value is None:
+                # Leave as the default in DynamicallySetParams
+                ...
             else:
+                # If user has specified it, we will override the default.
+                # However, there are some special cases to mutate it:
+                new_param_value = _mutate_parameter(param_name, user_param_value)
+                setattr(param_container, param_name, new_param_value)
+        # TODO: This should just be part of the __init__ of _DynamicallySetParams
         assert (
+            len(param_container.binary_operators) > 0
+            or len(param_container.unary_operators) > 0
+        ), "At least one operator must be provided."
+        return param_container
     def _validate_and_set_fit_params(
         self, X, y, Xresampled, weights, variable_names, X_units, y_units
+    ) -> Tuple[
+        ndarray,
+        ndarray,
+        Optional[ndarray],
+        Optional[ndarray],
+        ArrayLike[str],
+        Optional[ArrayLike[str]],
+        Optional[Union[str, ArrayLike[str]]],
+    ]:
         """
         Validate the parameters passed to the :term`fit` method.
             Weight array of the same shape as `y`.
             Each element is how to weight the mean-square-error loss
             for that particular element of y.
+        variable_names : ndarray of length n_features
             Names of each variable in the training dataset, `X`.
         X_units : list[str] of length n_features
             Units of each variable in the training dataset, `X`.
         if weights is not None:
             weights = check_array(weights, ensure_2d=False)
             check_consistent_length(weights, y)
+        X, y = self._validate_data_X_y(X, y)
         self.feature_names_in_ = _safe_check_feature_names_in(
             self, variable_names, generate_names=False
         )
             self.display_feature_names_in_ = np.array(
                 [f"x{_subscriptify(i)}" for i in range(X.shape[1])]
             )
+            variable_names = self.feature_names_in_
         else:
             self.display_feature_names_in_ = self.feature_names_in_
+            variable_names = self.feature_names_in_
         # Handle multioutput data
         if len(y.shape) == 1 or (len(y.shape) == 2 and y.shape[1] == 1):
         return X, y, Xresampled, weights, variable_names, X_units, y_units
+    def _validate_data_X_y(self, X, y) -> Tuple[ndarray, ndarray]:
+        """Validate `X` and `y` together and return them as a typed pair.
+
+        Delegates to `self._validate_data` with `reset=True` and
+        `multi_output=True`; the `cast` only narrows the static type.
+        """
+        validated = self._validate_data(X=X, y=y, reset=True, multi_output=True)  # type: ignore
+        return cast(Tuple[ndarray, ndarray], validated)
+    def _validate_data_X(self, X) -> ndarray:
+        """Validate `X` on its own and return the validated array.
+
+        Delegates to `self._validate_data` with `reset=False`. Because no
+        `y` is passed, the validated `X` itself comes back rather than a
+        1-tuple: callers assign the result straight back to `X` and feed it
+        to the equation callables. The `cast` only narrows the static type.
+        """
+        raw_out = self._validate_data(X=X, reset=False)  # type: ignore
+        return cast(ndarray, raw_out)
     def _pre_transform_training_data(
+        self,
+        X: ndarray,
+        y: ndarray,
+        Xresampled: Union[ndarray, None],
+        variable_names: ArrayLike[str],
+        X_units: Union[ArrayLike[str], None],
+        y_units: Union[ArrayLike[str], str, None],
+        random_state: np.random.RandomState,
     ):
         """
         Transform the training data before fitting the symbolic regressor.
         Parameters
         ----------
+        X : ndarray
             Training data of shape (n_samples, n_features).
+        y : ndarray
             Target values of shape (n_samples,) or (n_samples, n_targets).
             Will be cast to X's dtype if necessary.
+        Xresampled : ndarray | None
             Resampled training data, of shape `(n_resampled, n_features)`,
             used for denoising.
         variable_names : list[str]
         """
         # Feature selection transformation
         if self.select_k_features:
+            selection_mask = run_feature_selection(
                 X, y, self.select_k_features, random_state=random_state
             )
+            X = X[:, selection_mask]
             if Xresampled is not None:
+                Xresampled = Xresampled[:, selection_mask]
             # Reduce variable_names to selection
+            variable_names = cast(
+                ArrayLike[str],
+                [
+                    variable_names[i]
+                    for i in range(len(variable_names))
+                    if selection_mask[i]
+                ],
+            )
             if X_units is not None:
+                X_units = cast(
+                    ArrayLike[str],
+                    [X_units[i] for i in range(len(X_units)) if selection_mask[i]],
+                )
                 self.X_units_ = copy.deepcopy(X_units)
             # Re-perform data validation and feature name updating
+            X, y = self._validate_data_X_y(X, y)
             # Update feature names with selected variable names
+            self.selection_mask_ = selection_mask
             self.feature_names_in_ = _check_feature_names_in(self, variable_names)
             self.display_feature_names_in_ = self.feature_names_in_
             print(f"Using features {self.feature_names_in_}")
         return X, y, variable_names, X_units, y_units
+    def _run(
+        self,
+        X: ndarray,
+        y: ndarray,
+        runtime_params: _DynamicallySetParams,
+        weights: Optional[ndarray],
+        seed: int,
+    ):
         """
         Run the symbolic regression fitting process on the julia backend.
         Parameters
         ----------
+        X : ndarray
             Training data of shape `(n_samples, n_features)`.
+        y : ndarray
             Target values of shape `(n_samples,)` or `(n_samples, n_targets)`.
             Will be cast to `X`'s dtype if necessary.
+        runtime_params : DynamicallySetParams
+            Dynamically set versions of some parameters passed in __init__.
+        weights : ndarray | None
             Weight array of the same shape as `y`.
             Each element is how to weight the mean-square-error loss
             for that particular element of y.
         """
         # Need to be global as we don't want to recreate/reinstate julia for
         # every new instance of PySRRegressor
+        global ALREADY_RAN
         # These are the parameters which may be modified from the ones
         # specified in init, so we define them here locally:
+        binary_operators = runtime_params.binary_operators
+        unary_operators = runtime_params.unary_operators
+        maxdepth = runtime_params.maxdepth
+        constraints = runtime_params.constraints
+        multithreading = runtime_params.multithreading
+        batch_size = runtime_params.batch_size
+        update_verbosity = runtime_params.update_verbosity
+        progress = runtime_params.progress
+        warmup_maxsize_by = runtime_params.warmup_maxsize_by
         nested_constraints = self.nested_constraints
         complexity_of_operators = self.complexity_of_operators
         cluster_manager = self.cluster_manager
         # Start julia backend processes
+        if not ALREADY_RAN and update_verbosity != 0:
             print("Compiling Julia backend...")
         if cluster_manager is not None:
                 complexity_of_operators_str += f"({k}) => {v}, "
             complexity_of_operators_str += ")"
             complexity_of_operators = jl.seval(complexity_of_operators_str)
+        # TODO: Refactor this into helper function
         custom_loss = jl.seval(
             str(self.elementwise_loss)
             optimize=self.weight_optimize,
         )
+        jl_binary_operators: List[Any] = []
+        jl_unary_operators: List[Any] = []
+        for input_list, output_list, name in [
+            (binary_operators, jl_binary_operators, "binary"),
+            (unary_operators, jl_unary_operators, "unary"),
+        ]:
+            for op in input_list:
+                jl_op = jl.seval(op)
+                if not jl_is_function(jl_op):
+                    raise ValueError(
+                        f"When building `{name}_operators`, `'{op}'` did not return a Julia function"
+                    )
+                output_list.append(jl_op)
         # Call to Julia backend.
         # See https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/OptionsStruct.jl
         options = SymbolicRegression.Options(
+            binary_operators=jl_array(jl_binary_operators, dtype=jl.Function),
+            unary_operators=jl_array(jl_unary_operators, dtype=jl.Function),
             bin_constraints=jl_array(bin_constraints),
             una_constraints=jl_array(una_constraints),
             complexity_of_operators=complexity_of_operators,
             fraction_replaced_hof=self.fraction_replaced_hof,
             should_simplify=self.should_simplify,
             should_optimize_constants=self.should_optimize_constants,
+            warmup_maxsize_by=warmup_maxsize_by,
             use_frequency=self.use_frequency,
             use_frequency_in_tournament=self.use_frequency_in_tournament,
             adaptive_parsimony_scaling=self.adaptive_parsimony_scaling,
         if self.delete_tempfiles:
             shutil.rmtree(self.tempdir_)
+        ALREADY_RAN = True
         return self
         y,
         Xresampled=None,
         weights=None,
+        variable_names: Optional[ArrayLike[str]] = None,
+        X_units: Optional[ArrayLike[str]] = None,
+        y_units: Optional[Union[str, ArrayLike[str]]] = None,
     ) -> "PySRRegressor":
         """
         Search for equations to fit the dataset and store them in `self.equations_`.
             self.X_units_ = None
             self.y_units_ = None
         self._setup_equation_file()
+        runtime_params = self._validate_and_modify_params()
         (
             X,
                 "More datapoints will lower the search speed."
             )
+        random_state = check_random_state(self.random_state)  # For np random
+        seed = cast(int, random_state.randint(0, 2**31 - 1))  # For julia random
         # Pre transformations (feature selection and denoising)
         X, y, variable_names, X_units, y_units = self._pre_transform_training_data(
             X, y, Xresampled, variable_names, X_units, y_units, random_state
             self._checkpoint()
         # Perform the search:
+        self._run(X, y, runtime_params, weights=weights, seed=seed)
         # Then, after fit, we save again, so the pickle file contains
         # the equations:
         return self
+    def refresh(self, checkpoint_file: Optional[PathLike] = None) -> None:
         """
         Update self.equations_ with any new options passed.
         Parameters
         ----------
+        checkpoint_file : str or Path
             Path to checkpoint hall of fame file to be loaded.
             The default will use the set `equation_file_`.
         """
+        if checkpoint_file is not None:
             self.equation_file_ = checkpoint_file
             self.equation_file_contents_ = None
         check_is_fitted(self, attributes=["equation_file_"])
             if self.selection_mask_ is not None:
                 # RangeIndex enforces column order allowing columns to
                 # be correctly filtered with self.selection_mask_
+                X = X[X.columns[self.selection_mask_]]
             X.columns = self.feature_names_in_
         # Without feature information, CallableEquation/lambda_format equations
         # require that the column order of X matches that of the X used during
         # reordered/reindexed to match those of the transformed (denoised and
         # feature selected) X in fit.
         X = X.reindex(columns=self.feature_names_in_)
+        X = self._validate_data_X(X)
         try:
+            if isinstance(best_equation, list):
+                assert self.nout_ > 1
                 return np.stack(
                     [eq["lambda_format"](X) for eq in best_equation], axis=1
                 )
+            else:
+                return best_equation["lambda_format"](X)
         except Exception as error:
             raise ValueError(
                 "Failed to evaluate the expression. "
         """
         self.refresh()
         best_equation = self.get_best(index=index)
+        if isinstance(best_equation, list):
+            assert self.nout_ > 1
             return [eq["sympy_format"] for eq in best_equation]
+        else:
+            return best_equation["sympy_format"]
     def latex(self, index=None, precision=3):
         """
         self.set_params(output_jax_format=True)
         self.refresh()
         best_equation = self.get_best(index=index)
+        if isinstance(best_equation, list):
+            assert self.nout_ > 1
             return [eq["jax_format"] for eq in best_equation]
+        else:
+            return best_equation["jax_format"]
     def pytorch(self, index=None):
         """
         self.set_params(output_torch_format=True)
         self.refresh()
         best_equation = self.get_best(index=index)
+        if isinstance(best_equation, list):
             return [eq["torch_format"] for eq in best_equation]
+        else:
+            return best_equation["torch_format"]
     def _read_equation_file(self):
         """Read the hall of fame file created by `SymbolicRegression.jl`."""
             lastComplexity = 0
             sympy_format = []
             lambda_format = []
+            jax_format = []
+            torch_format = []
             for _, eqn_row in output.iterrows():
                 eqn = pysr2sympy(
         """
         self.refresh()
+        if isinstance(self.equations_, list):
             if indices is not None:
                 assert isinstance(indices, list)
                 assert isinstance(indices[0], list)
             table_string = sympy2multilatextable(
                 self.equations_, indices=indices, precision=precision, columns=columns
             )
+        elif isinstance(self.equations_, pd.DataFrame):
             if indices is not None:
                 assert isinstance(indices, list)
                 assert isinstance(indices[0], int)
             table_string = sympy2latextable(
                 self.equations_, indices=indices, precision=precision, columns=columns
             )
+        else:
+            raise ValueError(
+                "Invalid type for equations_ to pass to `latex_table`. "
+                "Expected a DataFrame or a list of DataFrames."
+            )
+        return with_preamble(table_string)
 def idx_model_selection(equations: pd.DataFrame, model_selection: str):
             f"{model_selection} is not a valid model selection strategy."
         )
     return chosen_idx
+def _mutate_parameter(param_name: str, param_value):
+    """Return the value to store for a user-specified runtime parameter.
+
+    Three special cases are rewritten; anything else passes through
+    unchanged:
+
+    - a `binary_operators` / `unary_operators` value given as a single
+      string is wrapped in a one-element list;
+    - a `batch_size` below one is replaced by one, with a warning;
+    - a `progress` equal to `True` is replaced by `False`, with a warning,
+      when `sys.stdout` does not list a `buffer` attribute.
+    """
+    if param_name in ["binary_operators", "unary_operators"]:
+        if isinstance(param_value, str):
+            return [param_value]
+        return param_value
+    if param_name == "batch_size":
+        if param_value < 1:
+            warnings.warn(
+                "Given `batch_size` must be greater than or equal to one. "
+                "`batch_size` has been increased to equal one."
+            )
+            return 1
+        return param_value
+    if param_name == "progress":
+        # The value is checked first so that `sys.stdout` is only inspected
+        # when the progress bar was actually requested.
+        if param_value == True and "buffer" not in sys.stdout.__dir__():
+            warnings.warn(
+                "Note: it looks like you are running in Jupyter. "
+                "The progress bar will be turned off."
+            )
+            return False
+        return param_value
+    return param_value

pysr/test/test.py CHANGED Viewed

@@ -431,6 +431,16 @@ class TestPipeline(unittest.TestCase):
         )
         np.testing.assert_allclose(model.predict(self.X), model3.predict(self.X))
 def manually_create_model(equations, feature_names=None):
     if feature_names is None:
@@ -526,7 +536,7 @@ class TestFeatureSelection(unittest.TestCase):
         X = self.rstate.randn(20000, 5)
         y = X[:, 2] ** 2 + X[:, 3] ** 2
         selected = run_feature_selection(X, y, select_k_features=2)
-        self.assertEqual(sorted(selected), [2, 3])
     def test_feature_selection_handler(self):
         X = self.rstate.randn(20000, 5)
@@ -538,8 +548,8 @@ class TestFeatureSelection(unittest.TestCase):
             variable_names=var_names,
             y=y,
         )
-        self.assertTrue((2 in selection) and (3 in selection))
-        selected_var_names = [var_names[i] for i in selection]
         self.assertEqual(set(selected_var_names), set("x2 x3".split(" ")))
         np.testing.assert_array_equal(
             np.sort(selected_X, axis=1), np.sort(X[:, [2, 3]], axis=1)

         )
         np.testing.assert_allclose(model.predict(self.X), model3.predict(self.X))
+    def test_jl_function_error(self):
+        # TODO: Move this to better class
+        # An operator string that does not evaluate to a Julia function must
+        # be rejected with a message naming the offending operator list.
+        expected_message = (
+            "When building `unary_operators`, `'1'` did not return a Julia function"
+        )
+        with self.assertRaises(ValueError) as cm:
+            PySRRegressor(unary_operators=["1"]).fit([[1]], [1])
+        self.assertIn(expected_message, str(cm.exception))
 def manually_create_model(equations, feature_names=None):
     if feature_names is None:
         X = self.rstate.randn(20000, 5)
         y = X[:, 2] ** 2 + X[:, 3] ** 2
         selected = run_feature_selection(X, y, select_k_features=2)
+        np.testing.assert_array_equal(selected, [False, False, True, True, False])
     def test_feature_selection_handler(self):
         X = self.rstate.randn(20000, 5)
             variable_names=var_names,
             y=y,
         )
+        np.testing.assert_array_equal(selection, [False, False, True, True, False])
+        selected_var_names = [var_names[i] for i in range(5) if selection[i]]
         self.assertEqual(set(selected_var_names), set("x2 x3".split(" ")))
         np.testing.assert_array_equal(
             np.sort(selected_X, axis=1), np.sort(X[:, [2, 3]], axis=1)

pysr/utils.py CHANGED Viewed

@@ -1,10 +1,18 @@
 import os
 import re
-from sklearn.utils.validation import _check_feature_names_in
-def _csv_filename_to_pkl_filename(csv_filename: str) -> str:
     if os.path.splitext(csv_filename)[1] == ".pkl":
         return csv_filename

 import os
 import re
+from pathlib import Path
+from typing import Any, List, TypeVar, Union
+from numpy import ndarray
+from sklearn.utils.validation import _check_feature_names_in  # type: ignore
+# Generic element type used to parameterise `ArrayLike`.
+T = TypeVar("T", bound=Any)
+# Either a numpy array or a plain list of `T`, e.g. `ArrayLike[str]`.
+ArrayLike = Union[ndarray, List[T]]
+# A filesystem path given either as a string or as a `pathlib.Path`.
+PathLike = Union[str, Path]
+def _csv_filename_to_pkl_filename(csv_filename: PathLike) -> PathLike:
     if os.path.splitext(csv_filename)[1] == ".pkl":
         return csv_filename

requirements.txt CHANGED Viewed

@@ -5,4 +5,3 @@ scikit_learn>=1.0.0,<2.0.0
 juliacall==0.9.20
 click>=7.0.0,<9.0.0
 setuptools>=50.0.0
-typing_extensions>=4.0.0,<5.0.0; python_version < "3.8"

 juliacall==0.9.20
 click>=7.0.0,<9.0.0
 setuptools>=50.0.0