| """Data-based prior: incorporate initial experimental data to warm-start BO."""
|
|
|
| from dataclasses import dataclass, field
|
| from typing import Dict, List, Optional, Tuple
|
|
|
| import torch
|
| from torch import Tensor
|
| import pandas as pd
|
| import numpy as np
|
|
|
|
|
@dataclass
class DataPrior:
    """Container for initial experimental data used as a prior for Bayesian optimization.

    The class stores a feature tensor ``X`` and an objective tensor ``y`` and
    offers alternate constructors for:

    - Pandas DataFrames (``from_dataframe``)
    - CSV files (``from_csv``)
    - Dictionaries of columns (``from_dict``)

    Tensors can also be passed directly to the constructor or appended with
    ``add_observations``.

    The stored data is meant to be consumed by callers to, for example,
    warm-start a GP model, estimate initial hyperparameters, or derive a
    feasible region from past experiments (see ``get_bounds`` and
    ``get_best``). None of that modelling happens inside this class.

    Attributes:
        X: Feature tensor. The helpers index it as ``(n_observations,
            n_features)``; ``None`` until data is provided.
        y: Objective tensor; the constructors and ``add_observations``
            store it as a column of shape ``(n_observations, 1)``.
        feature_names: Names of the feature columns, in the order of X's
            last dimension.
        objective_name: Name of the objective column.
        metadata: Free-form bookkeeping. The constructors record
            ``"source"`` and ``"n_samples"``; ``add_observations`` keeps
            ``"n_samples"`` up to date.
    """

    X: Optional[Tensor] = None
    y: Optional[Tensor] = None
    feature_names: List[str] = field(default_factory=list)
    objective_name: str = "objective"
    metadata: Dict = field(default_factory=dict)

    @classmethod
    def from_dataframe(
        cls,
        df: pd.DataFrame,
        feature_columns: List[str],
        objective_column: str,
        dtype: torch.dtype = torch.float64,
    ) -> "DataPrior":
        """Create a DataPrior from a pandas DataFrame.

        Args:
            df: Source table; one row per observation.
            feature_columns: Columns of ``df`` that form X, in order.
            objective_column: Column of ``df`` that forms y.
            dtype: Torch dtype used for both tensors.

        Returns:
            A DataPrior whose ``y`` has a trailing singleton dimension and
            whose metadata records the source and the number of rows.

        Raises:
            KeyError: If a requested column is missing (raised by pandas).
        """
        X = torch.tensor(df[feature_columns].to_numpy(), dtype=dtype)
        y = torch.tensor(df[objective_column].to_numpy(), dtype=dtype).unsqueeze(-1)
        return cls(
            X=X,
            y=y,
            # Copy so later mutation of the caller's list cannot change the prior.
            feature_names=list(feature_columns),
            objective_name=objective_column,
            metadata={"source": "dataframe", "n_samples": len(df)},
        )

    @classmethod
    def from_csv(
        cls,
        filepath: str,
        feature_columns: List[str],
        objective_column: str,
        dtype: torch.dtype = torch.float64,
    ) -> "DataPrior":
        """Create a DataPrior from a CSV file.

        The file is read with ``pandas.read_csv`` using its defaults and then
        handed to ``from_dataframe``; the recorded metadata source is
        therefore ``"dataframe"``.

        Args:
            filepath: Path of the CSV file to read.
            feature_columns: Columns that form X, in order.
            objective_column: Column that forms y.
            dtype: Torch dtype used for both tensors.

        Returns:
            The DataPrior built from the file's contents.
        """
        df = pd.read_csv(filepath)
        return cls.from_dataframe(df, feature_columns, objective_column, dtype)

    @classmethod
    def from_dict(
        cls,
        data: Dict[str, List[float]],
        feature_keys: List[str],
        objective_key: str,
        dtype: torch.dtype = torch.float64,
    ) -> "DataPrior":
        """Create a DataPrior from a dictionary of equally long columns.

        Args:
            data: Mapping from column name to a sequence of values.
            feature_keys: Keys of ``data`` that form X, in order.
            objective_key: Key of ``data`` that forms y.
            dtype: Torch dtype used for both tensors.

        Returns:
            A DataPrior with ``X`` of shape ``(n, len(feature_keys))`` and
            ``y`` of shape ``(n, 1)``.

        Raises:
            ValueError: If ``feature_keys`` is empty, or if any feature
                column's length differs from the objective column's length.
            KeyError: If a requested key is missing from ``data``.
        """
        if not feature_keys:
            raise ValueError("feature_keys must contain at least one key.")

        columns = [data[key] for key in feature_keys]
        targets = data[objective_key]
        n_rows = len(targets)

        # Reject ragged input up front: silently truncating to the shortest
        # column would leave X and y describing different experiments.
        for key, column in zip(feature_keys, columns):
            if len(column) != n_rows:
                raise ValueError(
                    f"Feature '{key}' has {len(column)} values but objective "
                    f"'{objective_key}' has {n_rows}."
                )

        # Transpose column-major input into one row per observation.
        rows = [list(row) for row in zip(*columns)]
        X = torch.tensor(rows, dtype=dtype)
        if not rows:
            # torch.tensor([]) is 1-D; keep the feature dimension for empty data.
            X = X.reshape(0, len(feature_keys))
        y = torch.tensor(targets, dtype=dtype).unsqueeze(-1)

        return cls(
            X=X,
            y=y,
            # Copy so later mutation of the caller's list cannot change the prior.
            feature_names=list(feature_keys),
            objective_name=objective_key,
            metadata={"source": "dict", "n_samples": len(X)},
        )

    def add_observations(self, X_new: Tensor, y_new: Tensor) -> None:
        """Append new observations to the stored data.

        Args:
            X_new: Features of the new observations; its first dimension is
                the number of observations.
            y_new: Objective values. A 0-d or 1-D tensor is reshaped to a
                column ``(n, 1)``.

        Raises:
            ValueError: If ``X_new`` and ``y_new`` disagree on the number of
                observations. The stored data is left unchanged.
        """
        if y_new.dim() <= 1:
            y_new = y_new.reshape(-1, 1)

        if X_new.dim() == 0 or X_new.shape[0] != y_new.shape[0]:
            raise ValueError(
                "X_new and y_new must have the same number of rows; got "
                f"X_new shape {tuple(X_new.shape)} and "
                f"y_new shape {tuple(y_new.shape)}."
            )

        if self.X is None:
            merged_X, merged_y = X_new, y_new
        else:
            # Build both tensors before assigning either, so a failure in
            # the second concatenation cannot leave X and y out of sync.
            merged_X = torch.cat([self.X, X_new], dim=0)
            merged_y = torch.cat([self.y, y_new], dim=0)

        self.X = merged_X
        self.y = merged_y
        self.metadata["n_samples"] = len(self.X)

    def get_bounds(self) -> Tuple[Tensor, Tensor]:
        """Return the per-feature minimum and maximum of the stored X.

        Returns:
            ``(lower, upper)``, each reduced over the first dimension of X.

        Raises:
            ValueError: If no observations are stored (X is ``None`` or has
                zero rows).
        """
        if self.n_observations == 0:
            raise ValueError("No data available.")
        return self.X.min(dim=0).values, self.X.max(dim=0).values

    def get_best(self, maximize: bool = True) -> Tuple[Tensor, Tensor]:
        """Return the best observation stored so far.

        Args:
            maximize: If True, pick the largest objective value; otherwise
                the smallest.

        Returns:
            ``(x_best, y_best)``: the row of X and the row of y at the
            selected index.

        Raises:
            ValueError: If X or y is missing, or y is empty.
        """
        if self.X is None or self.y is None or self.y.numel() == 0:
            raise ValueError("No data available.")
        # argmax/argmin without `dim` index the flattened tensor; for the
        # (n, 1) column stored by this class that equals the row index.
        idx = self.y.argmax() if maximize else self.y.argmin()
        return self.X[idx], self.y[idx]

    @property
    def n_observations(self) -> int:
        """Number of stored observations (0 when X is ``None``)."""
        return 0 if self.X is None else len(self.X)

    @property
    def n_features(self) -> int:
        """Size of the last dimension of X (0 when X is ``None``)."""
        return 0 if self.X is None else self.X.shape[-1]
|
|
|