Spaces:

AccelerationConsortium
/

crabnet-hyperparameter

Running

App Files Files Community

sgbaird committed on Mar 27, 2024

Commit

45b3fc1

1 Parent(s): 58815da

restore versions that got overwritten/deleted

Browse files

Files changed (2) hide show

app.py +120 -12
surrogate.py +120 -67

app.py CHANGED Viewed

@@ -1,15 +1,19 @@
 import gradio as gr
-from surrogate import CrabNetSurrogateModel
 model = CrabNetSurrogateModel()
-example_parameterization = parameterization = {
     "N": 3,
     "alpha": 0.5,
     "d_model": 512,
     "dim_feedforward": 2048,
     "dropout": 0.1,
-    "emb_scaler": 1.0,
     "epochs_step": 10,
     "eps": 0.000001,
     "fudge": 0.02,
@@ -18,26 +22,130 @@ example_parameterization = parameterization = {
     "lr": 0.001,
     "pe_resolution": 5000,
     "ple_resolution": 5000,
-    "pos_scaler": 1.0,
     "weight_decay": 0,
     "batch_size": 32,
     "out_hidden4": 128,
-    "betas2": 0.9,
-    "betas1": 0.999,
-    "losscurve": False,
-    "learningcurve": False,
     "bias": False,
     "criterion": "RobustL1",
     "elem_prop": "mat2vec",
     "train_frac": 0.5,
 }
-model.surrogate_evaluate(example_parameterization)
-def greet(name):
-    return "Hello " + name + "!!"
-iface = gr.Interface(fn=greet, inputs="text", outputs="text")
 iface.launch()

+import numpy as np
 import gradio as gr
+import pandas as pd
+from sklearn.preprocessing import MinMaxScaler
+from surrogate import CrabNetSurrogateModel, PARAM_BOUNDS
 model = CrabNetSurrogateModel()
+# Define the input parameters
+example_parameterization = {
     "N": 3,
     "alpha": 0.5,
     "d_model": 512,
     "dim_feedforward": 2048,
     "dropout": 0.1,
+    "emb_scaler": 0.5,
     "epochs_step": 10,
     "eps": 0.000001,
     "fudge": 0.02,
     "lr": 0.001,
     "pe_resolution": 5000,
     "ple_resolution": 5000,
+    "pos_scaler": 0.5,
     "weight_decay": 0,
     "batch_size": 32,
     "out_hidden4": 128,
+    "betas1": 0.9,
+    "betas2": 0.999,
     "bias": False,
     "criterion": "RobustL1",
     "elem_prop": "mat2vec",
     "train_frac": 0.5,
 }
+# Define the output parameters
+example_results = model.surrogate_evaluate([example_parameterization])
+example_result = example_results[0]
+def evaluate(*args):
+    # Gradio callback: map the UI values back to the original parameter
+    # space, run the surrogate model, and return the results as rows.
+    #
+    # args holds one value per entry of PARAM_BOUNDS, in the same order (they
+    # become the columns of a one-row DataFrame). "range" values are passed
+    # through the inverse transform of the matching scaler in `scalers`;
+    # "choice" values are strings whose text after the last "_" is the index
+    # into param_info["values"].
+    #
+    # Returns a list containing one list of result values per evaluated row.
+    # Create a DataFrame with the parameter names and scaled values
+    params_df = pd.DataFrame([args], columns=[param["name"] for param in PARAM_BOUNDS])
+    # Reverse the scaling for each parameter and reverse the renaming for choice parameters
+    for param_info in PARAM_BOUNDS:
+        key = param_info["name"]
+        if param_info["type"] == "range":
+            scaler = scalers[key]
+            params_df[key] = scaler.inverse_transform(params_df[[key]])
+        elif param_info["type"] == "choice":
+            # Extract the index from the renamed choice and use it to get the original choice
+            choice_index = int(params_df[key].str.split("_").str[-1].iloc[0])
+            params_df[key] = param_info["values"][choice_index]
+    # Convert the DataFrame to a list of dictionaries
+    params_list = params_df.to_dict("records")
+    # Evaluate the model with the unscaled parameters
+    results = model.surrogate_evaluate(params_list)
+    # Convert list of dictionaries to list of lists
+    results_list = [list(result.values()) for result in results]
+    return results_list
+scalers = {
+    param_info["name"]: MinMaxScaler()
+    for param_info in PARAM_BOUNDS
+    if param_info["type"] == "range"
+}
+def get_interface(param_info, numeric_index, choice_index):
+    """Build the Gradio input component for one ``PARAM_BOUNDS`` entry.
+
+    The real parameter names are hidden from the UI: range parameters are
+    labelled ``x<numeric_index>`` (``train_frac`` is labelled ``f1``) and
+    choice parameters are labelled ``c<choice_index>`` with options named
+    ``c<choice_index>_<position>``.
+
+    Args:
+        param_info: One entry of ``PARAM_BOUNDS`` (``name``, ``type`` and
+            either ``bounds`` or ``values``).
+        numeric_index: Running counter used to label range inputs.
+        choice_index: Running counter used to label choice inputs.
+
+    Returns:
+        A ``(component, numeric_index, choice_index)`` tuple in which the
+        counter matching the parameter type has been incremented by one.
+        Falls through and returns ``None`` for any other ``type``.
+
+    Side effects:
+        For range parameters, fits ``scalers[key]`` on the parameter bounds.
+    """
+    key = param_info["name"]
+    default_value = example_parameterization[key]
+    if param_info["type"] == "range":
+        # Rescale the parameter to be between 0 and 1
+        scaler = scalers[key]
+        bounds_column = [[bound] for bound in param_info["bounds"]]
+        scaler.fit(bounds_column)
+        scaled_value = scaler.transform([[default_value]])[0][0]
+        scaled_bounds = scaler.transform(bounds_column)
+        label = "f1" if key == "train_frac" else f"x{numeric_index}"
+        return (
+            gr.Number(
+                value=scaled_value,
+                minimum=scaled_bounds[0][0],
+                maximum=scaled_bounds[1][0],
+                label=label,
+                step=(scaled_bounds[1][0] - scaled_bounds[0][0]) / 100,
+            ),
+            numeric_index + 1,
+            choice_index,
+        )
+    elif param_info["type"] == "choice":
+        return (
+            gr.Dropdown(
+                choices=[
+                    f"c{choice_index}_{i}" for i in range(len(param_info["values"]))
+                ],
+                label=f"c{choice_index}",
+                value=f"c{choice_index}_{param_info['values'].index(default_value)}",
+            ),
+            numeric_index,
+            choice_index + 1,
+        )
+# Build one Gradio input per PARAM_BOUNDS entry, threading the label counters
+# through get_interface so numeric and choice inputs are numbered separately.
+numeric_index = 1
+choice_index = 1
+inputs = []
+for param in PARAM_BOUNDS:
+    # Named `component` rather than `input` so the builtin is not shadowed
+    component, numeric_index, choice_index = get_interface(
+        param, numeric_index, choice_index
+    )
+    inputs.append(component)
+iface = gr.Interface(
+    title="CrabNetSurrogateModel",
+    fn=evaluate,
+    inputs=inputs,
+    outputs=gr.Numpy(
+        value=np.array([list(example_result.values())]),
+        headers=[f"y{i+1}" for i in range(len(example_result))],
+        col_count=(len(example_result), "fixed"),
+        datatype=["number"] * len(example_result),
+    ),
+    description="""
+    `y1`, `y2`, `y3`, and `y4`, should all be minimized. `y1` and `y2` are
+    correlated, whereas `y1` and `y2` are both anticorrelated with `y3`. `y1`,
+    `y2`, and `y3` are stochastic (heteroskedastic, parameter-free noise),
+    whereas `y4` is deterministic, but still considered 'black-box'. In other
+    words, repeat calls with the same input arguments will result in different
+    values for `y1`, `y2`, and `y3`, but the same value for `y4`.
+    If `y1` is less than 0.2, the result is considered "bad" no matter how good
+    the other values are. If `y2` is less than 0.7, the result is considered
+    "bad" no matter how good the other values are. If `y3` is greater than 1800,
+    the result is considered "bad" no matter how good the other values are. If `y4`
+    is greater than 40e6, the result is considered "bad" no matter how good the
+    other values are.
+    `fidelity1` is a fidelity parameter. 0 is the lowest fidelity, and 1 is the
+    highest fidelity. The higher the fidelity, typically the more expensive the
+    evaluation. However, this also typically means higher quality and relevance
+    to the optimization campaign goals. `fidelity1` and `y3` are
+    correlated.
+    """,
+)
 iface.launch()

surrogate.py CHANGED Viewed

@@ -1,34 +1,43 @@
 from joblib import load
 import pandas as pd
 import random
-from pydantic import BaseModel, ValidationInfo, field_validator
-PARAM_CONSTRAINTS = {
-    "N": {"type": "range", "bounds": [1, 10]},
-    "alpha": {"type": "range", "bounds": [0.0, 1.0]},
-    "d_model": {"type": "range", "bounds": [100, 1024]},
-    "dim_feedforward": {"type": "range", "bounds": [1024, 4096]},
-    "dropout": {"type": "range", "bounds": [0.0, 1.0]},
-    "emb_scaler": {"type": "range", "bounds": [0.0, 1.0]},
-    "eps": {"type": "range", "bounds": [1e-7, 1e-4]},
-    "epochs_step": {"type": "range", "bounds": [5, 20]},
-    "fudge": {"type": "range", "bounds": [0.0, 0.1]},
-    "heads": {"type": "range", "bounds": [1, 10]},
-    "k": {"type": "range", "bounds": [2, 10]},
-    "lr": {"type": "range", "bounds": [1e-4, 6e-3]},
-    "pe_resolution": {"type": "range", "bounds": [2500, 10000]},
-    "ple_resolution": {"type": "range", "bounds": [2500, 10000]},
-    "pos_scaler": {"type": "range", "bounds": [0.0, 1.0]},
-    "weight_decay": {"type": "range", "bounds": [0.0, 1.0]},
-    "batch_size": {"type": "range", "bounds": [32, 256]},
-    "out_hidden4": {"type": "range", "bounds": [32, 512]},
-    "betas1": {"type": "range", "bounds": [0.5, 0.9999]},
-    "betas2": {"type": "range", "bounds": [0.5, 0.9999]},
-    "bias": {"type": "choice", "values": [False, True]},
-    "criterion": {"type": "choice", "values": ["RobustL1", "RobustL2"]},
-    "elem_prop": {"type": "choice", "values": ["mat2vec", "magpie", "onehot"]},
-    "train_frac": {"type": "range", "bounds": [0.01, 1.0]},
-}
 class Parameterization(BaseModel):
@@ -52,16 +61,17 @@ class Parameterization(BaseModel):
     out_hidden4: int
     betas1: float
     betas2: float
-    losscurve: bool
-    learningcurve: bool
     bias: bool
     criterion: str
     elem_prop: str
     train_frac: float
     @field_validator("*")
-    def check_constraints(cls, v: int, info: ValidationInfo) -> int:
-        param = PARAM_CONSTRAINTS.get(info.field_name)
         if param is None:
             return v
@@ -75,60 +85,103 @@ class Parameterization(BaseModel):
             if v not in param["values"]:
                 raise ValueError(f"{info.field_name} must be one of {param['values']}")
-        if (
-            info.field_name in ("betas1", "betas2")
-            and "betas1" in field.owner
-            and "betas2" in field.owner
-        ):
-            if field.owner["betas1"] > field.owner["betas2"]:
-                raise ValueError("betas1 must be less than or equal to betas2")
-        if (
-            info.field_name in ("emb_scaler", "pos_scaler")
-            and "emb_scaler" in field.owner
-            and "pos_scaler" in field.owner
-        ):
-            if field.owner["emb_scaler"] + field.owner["pos_scaler"] > 1.0:
-                raise ValueError(
-                    "The sum of emb_scaler and pos_scaler must be less than or equal to 1.0"
-                )
         return v
 class CrabNetSurrogateModel(object):
-    def __init__(self, fpath="surrogate_models.pkl"):
         self.models = load(fpath)
-        pass
-    def prepare_params_for_eval(self, raw_params: Parameterization):
         raw_params["bias"] = int(raw_params["bias"])
         raw_params["use_RobustL1"] = raw_params["criterion"] == "RobustL1"
-        raw_params["criterion"] = None
-        raw_params["losscurve"] = None
-        raw_params["learningcurve"] = None
         elem_prop = raw_params["elem_prop"]
         raw_params["elem_prop_magpie"] = 0
         raw_params["elem_prop_mat2vec"] = 0
         raw_params["elem_prop_onehot"] = 0
         raw_params[f"elem_prop_{elem_prop}"] = 1
-        raw_params["elem_prop"] = None
         return raw_params
-    def surrogate_evaluate(self, params: Parameterization):
-        parameters = self.prepare_params_for_eval(params)
-        parameters = pd.DataFrame([parameters])
-        percentile = random.uniform(0, 1)  # generate random percentile
-        mae = self.models["mae"].predict(parameters.assign(mae_rank=[percentile]))
-        rmse = self.models["rmse"].predict(parameters.assign(rmse_rank=[percentile]))
         runtime = self.models["runtime"].predict(
-            parameters.assign(runtime_rank=[percentile])
         )
-        model_size = self.models["model_size"].predict(parameters)
-        return mae, rmse, runtime, model_size

+from click import Parameter
+import numpy as np
 from joblib import load
+from typing import List
 import pandas as pd
 import random
+from pydantic import (
+    BaseModel,
+    ValidationError,
+    ValidationInfo,
+    field_validator,
+    model_validator,
+)
+PARAM_BOUNDS = [
+    {"name": "N", "type": "range", "bounds": [1, 10]},
+    {"name": "alpha", "type": "range", "bounds": [0.0, 1.0]},
+    {"name": "d_model", "type": "range", "bounds": [100, 1024]},
+    {"name": "dim_feedforward", "type": "range", "bounds": [1024, 4096]},
+    {"name": "dropout", "type": "range", "bounds": [0.0, 1.0]},
+    {"name": "emb_scaler", "type": "range", "bounds": [0.0, 1.0]},
+    {"name": "epochs_step", "type": "range", "bounds": [5, 20]},
+    {"name": "eps", "type": "range", "bounds": [1e-7, 1e-4]},
+    {"name": "fudge", "type": "range", "bounds": [0.0, 0.1]},
+    {"name": "heads", "type": "range", "bounds": [1, 10]},
+    {"name": "k", "type": "range", "bounds": [2, 10]},
+    {"name": "lr", "type": "range", "bounds": [1e-4, 6e-3]},
+    {"name": "pe_resolution", "type": "range", "bounds": [2500, 10000]},
+    {"name": "ple_resolution", "type": "range", "bounds": [2500, 10000]},
+    {"name": "pos_scaler", "type": "range", "bounds": [0.0, 1.0]},
+    {"name": "weight_decay", "type": "range", "bounds": [0.0, 1.0]},
+    {"name": "batch_size", "type": "range", "bounds": [32, 256]},
+    {"name": "out_hidden4", "type": "range", "bounds": [32, 512]},
+    {"name": "betas1", "type": "range", "bounds": [0.5, 0.9999]},
+    {"name": "betas2", "type": "range", "bounds": [0.5, 0.9999]},
+    {"name": "bias", "type": "choice", "values": [False, True]},
+    {"name": "criterion", "type": "choice", "values": ["RobustL1", "RobustL2"]},
+    {"name": "elem_prop", "type": "choice", "values": ["mat2vec", "magpie", "onehot"]},
+    {"name": "train_frac", "type": "range", "bounds": [0.01, 1.0]},
+]
 class Parameterization(BaseModel):
     out_hidden4: int
     betas1: float
     betas2: float
     bias: bool
     criterion: str
     elem_prop: str
     train_frac: float
     @field_validator("*")
+    def check_bounds(cls, v: int, info: ValidationInfo) -> int:
+        param = next(
+            (item for item in PARAM_BOUNDS if item["name"] == info.field_name),
+            None,
+        )
         if param is None:
             return v
             if v not in param["values"]:
                 raise ValueError(f"{info.field_name} must be one of {param['values']}")
         return v
+    @model_validator(mode="after")
+    def check_constraints(self) -> "Parameterization":
+        """Enforce the cross-field constraints of a parameterization.
+
+        Raises:
+            ValueError: If ``betas1`` is greater than ``betas2``, or if
+                ``emb_scaler + pos_scaler`` exceeds 1.0.
+
+        Returns:
+            The validated instance, as pydantic requires of "after" model
+            validators.
+        """
+        if self.betas1 > self.betas2:
+            raise ValueError(
+                f"Received betas1={self.betas1} which should be less than betas2={self.betas2}"
+            )
+        if self.emb_scaler + self.pos_scaler > 1.0:
+            raise ValueError(
+                f"Received emb_scaler={self.emb_scaler} and pos_scaler={self.pos_scaler} which should sum to less than or equal to 1.0"  # noqa: E501
+            )
+        # An "after" validator must hand the instance back; falling off the
+        # end would return None instead of the model.
+        return self
 class CrabNetSurrogateModel(object):
+    def __init__(self, fpath="models/surrogate_models_hgbr_opt.pkl"):
+        # Load the fitted surrogate models from fpath with joblib. The loaded
+        # object is later indexed by "mae", "rmse", "runtime" and "model_size".
+        # NOTE(review): joblib.load unpickles the file, so fpath must point to
+        # a trusted artifact.
         self.models = load(fpath)
+    def prepare_params_for_eval(self, raw_params: dict):
+        # Encode one raw parameterization for the surrogate models: "bias"
+        # becomes an int, "criterion" is replaced by the boolean
+        # "use_RobustL1", and "elem_prop" is replaced by three one-hot
+        # "elem_prop_*" entries. raw_params is modified in place and returned.
+        # NOTE(review): annotated as dict, but surrogate_evaluate calls this
+        # through DataFrame.apply(axis=1), so it presumably receives a pandas
+        # Series per row -- confirm.
         raw_params["bias"] = int(raw_params["bias"])
         raw_params["use_RobustL1"] = raw_params["criterion"] == "RobustL1"
+        del raw_params["criterion"]
+        # REVIEW: HistGradientBoostingRegressor handles categoricals natively now
+        # https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_categorical.html#sphx-glr-auto-examples-ensemble-plot-gradient-boosting-categorical-py # noqa: E501
         elem_prop = raw_params["elem_prop"]
         raw_params["elem_prop_magpie"] = 0
         raw_params["elem_prop_mat2vec"] = 0
         raw_params["elem_prop_onehot"] = 0
         raw_params[f"elem_prop_{elem_prop}"] = 1
+        del raw_params["elem_prop"]
         return raw_params
+    def surrogate_evaluate(
+        self, params_list: List[dict], seed=None, remove_noise=False
+    ):
+        # Evaluate the surrogate models for a batch of parameterizations.
+        #
+        # params_list: list of parameter dictionaries; each one is validated
+        #   by constructing a Parameterization from it.
+        # seed: forwarded to np.random.default_rng; only used when
+        #   remove_noise is False.
+        # remove_noise: when True, every rank percentile is fixed at 0.5
+        #   instead of being sampled.
+        #
+        # Returns a list with one dict per input row, with the keys "mae",
+        # "rmse", "runtime" and "model_size".
+        assert isinstance(params_list, list), "Input must be a list of dictionaries"
+        # Validate the parameters (i.e., will throw error if invalid)
+        [Parameterization(**params) for params in params_list]
+        parameters = pd.DataFrame(params_list)
+        parameters = parameters.apply(self.prepare_params_for_eval, axis=1)
+        if remove_noise:
+            mae_percentiles = [0.5] * len(parameters)
+            rmse_percentiles = [0.5] * len(parameters)
+            runtime_percentiles = [0.5] * len(parameters)
+        else:
+            # Random number generator; unseeded (fresh entropy on every call)
+            # when seed is None, which is the default
+            rng = np.random.default_rng(seed)
+            # Generate random percentiles for each set of parameters for
+            # heteroskedastic, parameter-free noise
+            mae_percentiles = rng.uniform(0, 1, size=len(parameters))
+            rmse_percentiles = mae_percentiles  # typically correlated with MAE
+            # typically anticorrelated with MAE/RMSE
+            runtime_percentiles = 1 - mae_percentiles
+        # Make predictions for each model
+        mae_model = self.models["mae"]
+        rmse_model = self.models["rmse"]
+        runtime_model = self.models["runtime"]
+        model_size_model = self.models["model_size"]
+        # NOTE: The model expects the variables in the same order as when it was fit
+        mae = self.models["mae"].predict(
+            parameters.assign(mae_rank=mae_percentiles)[mae_model.feature_names_in_]
+        )
+        rmse = self.models["rmse"].predict(
+            parameters.assign(rmse_rank=rmse_percentiles)[rmse_model.feature_names_in_]
+        )
         runtime = self.models["runtime"].predict(
+            parameters.assign(runtime_rank=runtime_percentiles)[
+                runtime_model.feature_names_in_
+            ]
+        )
+        # Model size is deterministic (hence no rank variable)
+        model_size = self.models["model_size"].predict(
+            parameters[model_size_model.feature_names_in_]
         )
+        # Combine predictions into a list of dictionaries
+        results = [
+            {"mae": m, "rmse": r, "runtime": rt, "model_size": ms}
+            for m, r, rt, ms in zip(mae, rmse, runtime, model_size)
+        ]
+        return results
+# %% Code Graveyard
+# runtime_percentiles = np.random.uniform(
+#     0, 1, size=len(parameters)
+# )