"""Data-based prior: incorporate initial experimental data to warm-start BO.""" from dataclasses import dataclass, field from typing import Dict, List, Optional, Tuple import torch from torch import Tensor import pandas as pd import numpy as np @dataclass class DataPrior: """Manages initial experimental data as a prior for Bayesian optimization. Supports loading from: - Tensors directly - Pandas DataFrames - CSV files - Dictionary format The data prior can be used to: - Warm-start the GP model - Estimate initial hyperparameters - Define the feasible region based on past experiments """ X: Optional[Tensor] = None y: Optional[Tensor] = None feature_names: List[str] = field(default_factory=list) objective_name: str = "objective" metadata: Dict = field(default_factory=dict) @classmethod def from_dataframe( cls, df: pd.DataFrame, feature_columns: List[str], objective_column: str, dtype: torch.dtype = torch.float64, ) -> "DataPrior": """Create a DataPrior from a pandas DataFrame.""" X = torch.tensor(df[feature_columns].values, dtype=dtype) y = torch.tensor(df[objective_column].values, dtype=dtype).unsqueeze(-1) return cls( X=X, y=y, feature_names=feature_columns, objective_name=objective_column, metadata={"source": "dataframe", "n_samples": len(df)}, ) @classmethod def from_csv( cls, filepath: str, feature_columns: List[str], objective_column: str, dtype: torch.dtype = torch.float64, ) -> "DataPrior": """Create a DataPrior from a CSV file.""" df = pd.read_csv(filepath) return cls.from_dataframe(df, feature_columns, objective_column, dtype) @classmethod def from_dict( cls, data: Dict[str, List[float]], feature_keys: List[str], objective_key: str, dtype: torch.dtype = torch.float64, ) -> "DataPrior": """Create a DataPrior from a dictionary.""" X = torch.tensor( [[data[k][i] for k in feature_keys] for i in range(len(data[feature_keys[0]]))], dtype=dtype, ) y = torch.tensor(data[objective_key], dtype=dtype).unsqueeze(-1) return cls( X=X, y=y, feature_names=feature_keys, objective_name=objective_key, metadata={"source": "dict", "n_samples": len(X)}, ) def add_observations(self, X_new: Tensor, y_new: Tensor) -> None: """Add new observations to the prior data.""" if y_new.dim() == 1: y_new = y_new.unsqueeze(-1) if self.X is None: self.X = X_new self.y = y_new else: self.X = torch.cat([self.X, X_new], dim=0) self.y = torch.cat([self.y, y_new], dim=0) self.metadata["n_samples"] = len(self.X) def get_bounds(self) -> Tuple[Tensor, Tensor]: """Get the observed bounds of the data.""" if self.X is None: raise ValueError("No data available.") return self.X.min(dim=0).values, self.X.max(dim=0).values def get_best(self, maximize: bool = True) -> Tuple[Tensor, Tensor]: """Get the best observation so far.""" if self.y is None: raise ValueError("No data available.") if maximize: idx = self.y.argmax() else: idx = self.y.argmin() return self.X[idx], self.y[idx] @property def n_observations(self) -> int: return 0 if self.X is None else len(self.X) @property def n_features(self) -> int: return 0 if self.X is None else self.X.shape[-1]