Spaces:

MilesCranmer
/

PySR

Running

App Files Files Community

MilesCranmer committed on May 7, 2022

Commit

913bf09

unverified ·

2 Parent(s): f456047 ae0b11e

Merge pull request #117 from MilesCranmer/defaults

Browse files

Files changed (6) hide show

.github/workflows/CI_Windows.yml +1 -1
README.md +1 -1
example.py +1 -1
pysr/sr.py +30 -23
pysr/version.py +2 -2
test/test.py +38 -36

.github/workflows/CI_Windows.yml CHANGED Viewed

@@ -28,7 +28,7 @@ jobs:
       matrix:
         julia-version: ['1.7.1']
         python-version: ['3.9']
-        os: [windows-latest]
     steps:
       - uses: actions/checkout@v1.0.0

       matrix:
         julia-version: ['1.7.1']
         python-version: ['3.9']
+        os: [windows-2019]
     steps:
       - uses: actions/checkout@v1.0.0

README.md CHANGED Viewed

@@ -87,7 +87,7 @@ PySR's main interface is in the style of scikit-learn:
 ```python
 from pysr import PySRRegressor
 model = PySRRegressor(
-    niterations=5,
     binary_operators=["+", "*"],
     unary_operators=[
         "cos",

 ```python
 from pysr import PySRRegressor
 model = PySRRegressor(
+    niterations=40,
     binary_operators=["+", "*"],
     unary_operators=[
         "cos",

example.py CHANGED Viewed

@@ -6,7 +6,7 @@ y = 2.5382 * np.cos(X[:, 3]) + X[:, 0] ** 2 - 0.5
 from pysr import PySRRegressor
 model = PySRRegressor(
-    niterations=5,
     binary_operators=["+", "*"],
     unary_operators=[
         "cos",

 from pysr import PySRRegressor
 model = PySRRegressor(
+    niterations=40,
     binary_operators=["+", "*"],
     unary_operators=[
         "cos",

pysr/sr.py CHANGED Viewed

@@ -350,30 +350,30 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         unary_operators=None,
         procs=cpu_count(),
         loss="L2DistLoss()",
-        populations=100,
-        niterations=4,
-        ncyclesperiteration=100,
         timeout_in_seconds=None,
         alpha=0.1,
         annealing=False,
-        fractionReplaced=0.01,
-        fractionReplacedHof=0.005,
-        npop=100,
-        parsimony=1e-4,
         migration=True,
         hofMigration=True,
         shouldOptimizeConstants=True,
-        topn=10,
-        weightAddNode=1,
-        weightInsertNode=3,
-        weightDeleteNode=3,
-        weightDoNothing=1,
-        weightMutateConstant=10,
-        weightMutateOperator=1,
-        weightRandomize=1,
-        weightSimplify=0.002,
-        crossoverProbability=0.01,
-        perturbationFactor=1.0,
         extra_sympy_mappings=None,
         extra_torch_mappings=None,
         extra_jax_mappings=None,
@@ -391,6 +391,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         warmupMaxsizeBy=0.0,
         constraints=None,
         useFrequency=True,
         tempdir=None,
         delete_tempfiles=True,
         julia_project=None,
@@ -399,11 +400,11 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         output_jax_format=False,
         output_torch_format=False,
         optimizer_algorithm="BFGS",
-        optimizer_nrestarts=3,
-        optimize_probability=1.0,
-        optimizer_iterations=10,
         tournament_selection_n=10,
-        tournament_selection_p=1.0,
         denoise=False,
         Xresampled=None,
         precision=32,
@@ -509,6 +510,8 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         :type constraints: dict
         :param useFrequency: whether to measure the frequency of complexities, and use that instead of parsimony to explore equation space. Will naturally find equations of all complexities.
         :type useFrequency: bool
         :param tempdir: directory for the temporary files
         :type tempdir: str/None
         :param delete_tempfiles: whether to delete the temporary files after finishing
@@ -647,6 +650,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
                 warmupMaxsizeBy=warmupMaxsizeBy,
                 constraints=constraints,
                 useFrequency=useFrequency,
                 tempdir=tempdir,
                 delete_tempfiles=delete_tempfiles,
                 update=update,
@@ -756,8 +760,10 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         for key, value in params.items():
             if key in self.surface_parameters:
                 self.__setattr__(key, value)
-            else:
                 self.params[key] = value
         return self
@@ -1192,6 +1198,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
             shouldOptimizeConstants=self.params["shouldOptimizeConstants"],
             warmupMaxsizeBy=self.params["warmupMaxsizeBy"],
             useFrequency=self.params["useFrequency"],
             npop=self.params["npop"],
             ncyclesperiteration=self.params["ncyclesperiteration"],
             fractionReplaced=self.params["fractionReplaced"],

         unary_operators=None,
         procs=cpu_count(),
         loss="L2DistLoss()",
+        populations=15,
+        niterations=40,
+        ncyclesperiteration=550,
         timeout_in_seconds=None,
         alpha=0.1,
         annealing=False,
+        fractionReplaced=0.000364,
+        fractionReplacedHof=0.035,
+        npop=33,
+        parsimony=0.0032,
         migration=True,
         hofMigration=True,
         shouldOptimizeConstants=True,
+        topn=12,
+        weightAddNode=0.79,
+        weightDeleteNode=1.7,
+        weightDoNothing=0.21,
+        weightInsertNode=5.1,
+        weightMutateConstant=0.048,
+        weightMutateOperator=0.47,
+        weightRandomize=0.00023,
+        weightSimplify=0.0020,
+        crossoverProbability=0.066,
+        perturbationFactor=0.076,
         extra_sympy_mappings=None,
         extra_torch_mappings=None,
         extra_jax_mappings=None,
         warmupMaxsizeBy=0.0,
         constraints=None,
         useFrequency=True,
+        useFrequencyInTournament=True,
         tempdir=None,
         delete_tempfiles=True,
         julia_project=None,
         output_jax_format=False,
         output_torch_format=False,
         optimizer_algorithm="BFGS",
+        optimizer_nrestarts=2,
+        optimize_probability=0.14,
+        optimizer_iterations=8,
         tournament_selection_n=10,
+        tournament_selection_p=0.86,
         denoise=False,
         Xresampled=None,
         precision=32,
         :type constraints: dict
         :param useFrequency: whether to measure the frequency of complexities, and use that instead of parsimony to explore equation space. Will naturally find equations of all complexities.
         :type useFrequency: bool
+        :param useFrequencyInTournament: whether to use the frequency mentioned above in the tournament, rather than just the simulated annealing.
+        :type useFrequencyInTournament: bool
         :param tempdir: directory for the temporary files
         :type tempdir: str/None
         :param delete_tempfiles: whether to delete the temporary files after finishing
                 warmupMaxsizeBy=warmupMaxsizeBy,
                 constraints=constraints,
                 useFrequency=useFrequency,
+                useFrequencyInTournament=useFrequencyInTournament,
                 tempdir=tempdir,
                 delete_tempfiles=delete_tempfiles,
                 update=update,
         for key, value in params.items():
             if key in self.surface_parameters:
                 self.__setattr__(key, value)
+            elif key in self.params:
                 self.params[key] = value
+            else:
+                raise ValueError(f"Parameter {key} is not in the list of parameters.")
         return self
             shouldOptimizeConstants=self.params["shouldOptimizeConstants"],
             warmupMaxsizeBy=self.params["warmupMaxsizeBy"],
             useFrequency=self.params["useFrequency"],
+            useFrequencyInTournament=self.params["useFrequencyInTournament"],
             npop=self.params["npop"],
             ncyclesperiteration=self.params["ncyclesperiteration"],
             fractionReplaced=self.params["fractionReplaced"],

pysr/version.py CHANGED Viewed

	@@ -1,2 +1,2 @@
1	- __version__ = "0.7.13"
2	- __symbolic_regression_jl_version__ = "0.7.14"


1	+ __version__ = "0.8.0"
2	+ __symbolic_regression_jl_version__ = "0.8.7"

test/test.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import unittest
 from unittest.mock import patch
 import numpy as np
@@ -10,22 +11,26 @@ import pandas as pd
 class TestPipeline(unittest.TestCase):
     def setUp(self):
         self.default_test_kwargs = dict(
-            niterations=10,
-            populations=100,
-            ncyclesperiteration=100,
-            npop=100,
-            annealing=True,
-            useFrequency=False,
         )
-        np.random.seed(0)
-        self.X = np.random.randn(100, 5)
     def test_linear_relation(self):
         y = self.X[:, 0]
         model = PySRRegressor(**self.default_test_kwargs)
         model.fit(self.X, y)
-        model.set_params(model_selection="accuracy")
         print(model.equations)
         self.assertLessEqual(model.get_best()["loss"], 1e-4)
@@ -67,8 +72,9 @@ class TestPipeline(unittest.TestCase):
         self.assertGreater(bad_mse, 1e-4)
     def test_multioutput_weighted_with_callable_temp_equation(self):
-        y = self.X[:, [0, 1]] ** 2
-        w = np.random.rand(*y.shape)
         w[w < 0.5] = 0.0
         w[w >= 0.5] = 1.0
@@ -85,20 +91,19 @@ class TestPipeline(unittest.TestCase):
             temp_equation_file=True,
             delete_tempfiles=False,
         )
-        model.fit(self.X, y, weights=w)
         np.testing.assert_almost_equal(
-            model.predict(self.X)[:, 0], self.X[:, 0] ** 2, decimal=4
         )
         np.testing.assert_almost_equal(
-            model.predict(self.X)[:, 1], self.X[:, 1] ** 2, decimal=4
         )
     def test_empty_operators_single_input_multirun(self):
-        X = np.random.randn(100, 1)
         y = X[:, 0] + 3.0
         regressor = PySRRegressor(
-            model_selection="accuracy",
             unary_operators=[],
             binary_operators=["plus"],
             **self.default_test_kwargs,
@@ -124,13 +129,9 @@ class TestPipeline(unittest.TestCase):
         self.assertTrue("None" not in regressor.__repr__())
         self.assertTrue(">>>>" in regressor.__repr__())
-        # "best" model_selection should also give a decent loss:
-        np.testing.assert_almost_equal(regressor.predict(X), y, decimal=1)
     def test_noisy(self):
-        np.random.seed(1)
-        y = self.X[:, [0, 1]] ** 2 + np.random.randn(self.X.shape[0], 1) * 0.05
         model = PySRRegressor(
             # Test that passing a single operator works:
             unary_operators="sq(x) = x^2",
@@ -145,26 +146,25 @@ class TestPipeline(unittest.TestCase):
         self.assertLessEqual(model.get_best()[1]["loss"], 1e-2)
     def test_pandas_resample(self):
-        np.random.seed(1)
         X = pd.DataFrame(
             {
-                "T": np.random.randn(500),
-                "x": np.random.randn(500),
-                "unused_feature": np.random.randn(500),
             }
         )
         true_fn = lambda x: np.array(x["T"] + x["x"] ** 2 + 1.323837)
         y = true_fn(X)
-        noise = np.random.randn(500) * 0.01
         y = y + noise
         # We also test y as a pandas array:
         y = pd.Series(y)
         # Resampled array is a different order of features:
         Xresampled = pd.DataFrame(
             {
-                "unused_feature": np.random.randn(100),
-                "x": np.random.randn(100),
-                "T": np.random.randn(100),
             }
         )
         model = PySRRegressor(
@@ -184,9 +184,9 @@ class TestPipeline(unittest.TestCase):
         self.assertListEqual(list(sorted(fn._selection)), [0, 1])
         X2 = pd.DataFrame(
             {
-                "T": np.random.randn(100),
-                "unused_feature": np.random.randn(100),
-                "x": np.random.randn(100),
             }
         )
         self.assertLess(np.average((fn(X2) - true_fn(X2)) ** 2), 1e-1)
@@ -212,10 +212,12 @@ class TestBest(unittest.TestCase):
             variable_names="x0 x1".split(" "),
             extra_sympy_mappings={},
             output_jax_format=False,
         )
         self.model.n_features = 2
         self.model.refresh()
         self.equations = self.model.equations
     def test_best(self):
         self.assertEqual(self.model.sympy(), sympy.cos(sympy.Symbol("x0")) ** 2)
@@ -230,7 +232,7 @@ class TestBest(unittest.TestCase):
         self.assertEqual(self.model.latex(), "\\cos^{2}{\\left(x_{0} \\right)}")
     def test_best_lambda(self):
-        X = np.random.randn(10, 2)
         y = np.cos(X[:, 0]) ** 2
         for f in [self.model.predict, self.equations.iloc[-1]["lambda_format"]]:
             np.testing.assert_almost_equal(f(X), y, decimal=4)
@@ -238,16 +240,16 @@ class TestBest(unittest.TestCase):
 class TestFeatureSelection(unittest.TestCase):
     def setUp(self):
-        np.random.seed(0)
     def test_feature_selection(self):
-        X = np.random.randn(20000, 5)
         y = X[:, 2] ** 2 + X[:, 3] ** 2
         selected = run_feature_selection(X, y, select_k_features=2)
         self.assertEqual(sorted(selected), [2, 3])
     def test_feature_selection_handler(self):
-        X = np.random.randn(20000, 5)
         y = X[:, 2] ** 2 + X[:, 3] ** 2
         var_names = [f"x{i}" for i in range(5)]
         selected_X, selection = _handle_feature_selection(

+import inspect
 import unittest
 from unittest.mock import patch
 import numpy as np
 class TestPipeline(unittest.TestCase):
     def setUp(self):
+        # Using inspect,
+        # get default niterations from PySRRegressor, and double them:
+        default_niterations = (
+            inspect.signature(PySRRegressor.__init__).parameters["niterations"].default
+        )
+        default_populations = (
+            inspect.signature(PySRRegressor.__init__).parameters["populations"].default
+        )
         self.default_test_kwargs = dict(
+            model_selection="accuracy",
+            niterations=default_niterations * 2,
+            populations=default_populations * 2,
         )
+        self.rstate = np.random.RandomState(0)
+        self.X = self.rstate.randn(100, 5)
     def test_linear_relation(self):
         y = self.X[:, 0]
         model = PySRRegressor(**self.default_test_kwargs)
         model.fit(self.X, y)
         print(model.equations)
         self.assertLessEqual(model.get_best()["loss"], 1e-4)
         self.assertGreater(bad_mse, 1e-4)
     def test_multioutput_weighted_with_callable_temp_equation(self):
+        X = self.X.copy()
+        y = X[:, [0, 1]] ** 2
+        w = self.rstate.rand(*y.shape)
         w[w < 0.5] = 0.0
         w[w >= 0.5] = 1.0
             temp_equation_file=True,
             delete_tempfiles=False,
         )
+        model.fit(X.copy(), y, weights=w)
         np.testing.assert_almost_equal(
+            model.predict(X.copy())[:, 0], X[:, 0] ** 2, decimal=4
         )
         np.testing.assert_almost_equal(
+            model.predict(X.copy())[:, 1], X[:, 1] ** 2, decimal=4
         )
     def test_empty_operators_single_input_multirun(self):
+        X = self.rstate.randn(100, 1)
         y = X[:, 0] + 3.0
         regressor = PySRRegressor(
             unary_operators=[],
             binary_operators=["plus"],
             **self.default_test_kwargs,
         self.assertTrue("None" not in regressor.__repr__())
         self.assertTrue(">>>>" in regressor.__repr__())
     def test_noisy(self):
+        y = self.X[:, [0, 1]] ** 2 + self.rstate.randn(self.X.shape[0], 1) * 0.05
         model = PySRRegressor(
             # Test that passing a single operator works:
             unary_operators="sq(x) = x^2",
         self.assertLessEqual(model.get_best()[1]["loss"], 1e-2)
     def test_pandas_resample(self):
         X = pd.DataFrame(
             {
+                "T": self.rstate.randn(500),
+                "x": self.rstate.randn(500),
+                "unused_feature": self.rstate.randn(500),
             }
         )
         true_fn = lambda x: np.array(x["T"] + x["x"] ** 2 + 1.323837)
         y = true_fn(X)
+        noise = self.rstate.randn(500) * 0.01
         y = y + noise
         # We also test y as a pandas array:
         y = pd.Series(y)
         # Resampled array is a different order of features:
         Xresampled = pd.DataFrame(
             {
+                "unused_feature": self.rstate.randn(100),
+                "x": self.rstate.randn(100),
+                "T": self.rstate.randn(100),
             }
         )
         model = PySRRegressor(
         self.assertListEqual(list(sorted(fn._selection)), [0, 1])
         X2 = pd.DataFrame(
             {
+                "T": self.rstate.randn(100),
+                "unused_feature": self.rstate.randn(100),
+                "x": self.rstate.randn(100),
             }
         )
         self.assertLess(np.average((fn(X2) - true_fn(X2)) ** 2), 1e-1)
             variable_names="x0 x1".split(" "),
             extra_sympy_mappings={},
             output_jax_format=False,
+            model_selection="accuracy",
         )
         self.model.n_features = 2
         self.model.refresh()
         self.equations = self.model.equations
+        self.rstate = np.random.RandomState(0)
     def test_best(self):
         self.assertEqual(self.model.sympy(), sympy.cos(sympy.Symbol("x0")) ** 2)
         self.assertEqual(self.model.latex(), "\\cos^{2}{\\left(x_{0} \\right)}")
     def test_best_lambda(self):
+        X = self.rstate.randn(10, 2)
         y = np.cos(X[:, 0]) ** 2
         for f in [self.model.predict, self.equations.iloc[-1]["lambda_format"]]:
             np.testing.assert_almost_equal(f(X), y, decimal=4)
 class TestFeatureSelection(unittest.TestCase):
     def setUp(self):
+        self.rstate = np.random.RandomState(0)
     def test_feature_selection(self):
+        X = self.rstate.randn(20000, 5)
         y = X[:, 2] ** 2 + X[:, 3] ** 2
         selected = run_feature_selection(X, y, select_k_features=2)
         self.assertEqual(sorted(selected), [2, 3])
     def test_feature_selection_handler(self):
+        X = self.rstate.randn(20000, 5)
         y = X[:, 2] ** 2 + X[:, 3] ** 2
         var_names = [f"x{i}" for i in range(5)]
         selected_X, selection = _handle_feature_selection(