# ravimohan19's picture
# Upload priors/data_prior.py with huggingface_hub
# d70a716 verified
"""Data-based prior: incorporate initial experimental data to warm-start BO."""
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple
import torch
from torch import Tensor
import pandas as pd
import numpy as np
@dataclass
class DataPrior:
    """Hold initial experimental data used as a prior for Bayesian optimization.

    Instances can be built from:
        - tensors passed directly to the constructor
        - pandas DataFrames (``from_dataframe``)
        - CSV files (``from_csv``)
        - dictionaries of columns (``from_dict``)

    The stored data is intended to be used to:
        - warm-start the GP model
        - estimate initial hyperparameters
        - define the feasible region based on past experiments
    """

    # Observed inputs. The alternate constructors build this with shape
    # (n_observations, n_features).
    X: Optional[Tensor] = None
    # Observed objective values. The alternate constructors and
    # ``add_observations`` store this as a column, shape (n_observations, 1).
    y: Optional[Tensor] = None
    # Names of the input columns, in the same order as the columns of ``X``.
    feature_names: List[str] = field(default_factory=list)
    # Name of the objective column.
    objective_name: str = "objective"
    # Free-form bookkeeping; the constructors record "source" and "n_samples".
    metadata: Dict = field(default_factory=dict)

    @classmethod
    def from_dataframe(
        cls,
        df: pd.DataFrame,
        feature_columns: List[str],
        objective_column: str,
        dtype: torch.dtype = torch.float64,
    ) -> "DataPrior":
        """Create a DataPrior from a pandas DataFrame.

        Args:
            df: Frame holding one row per observation.
            feature_columns: Columns of ``df`` to use as inputs.
            objective_column: Column of ``df`` holding the objective values.
            dtype: Torch dtype of the resulting tensors.

        Returns:
            A DataPrior whose ``X`` has one column per feature and whose ``y``
            is a single column.
        """
        # Copy the names so later mutation of the caller's list cannot change
        # the prior's ``feature_names``.
        names = list(feature_columns)
        X = torch.tensor(df[names].values, dtype=dtype)
        y = torch.tensor(df[objective_column].values, dtype=dtype).unsqueeze(-1)
        return cls(
            X=X,
            y=y,
            feature_names=names,
            objective_name=objective_column,
            metadata={"source": "dataframe", "n_samples": len(df)},
        )

    @classmethod
    def from_csv(
        cls,
        filepath: str,
        feature_columns: List[str],
        objective_column: str,
        dtype: torch.dtype = torch.float64,
    ) -> "DataPrior":
        """Create a DataPrior from a CSV file.

        The file is read with ``pandas.read_csv`` and handed to
        ``from_dataframe``; see that method for the meaning of the arguments.
        """
        df = pd.read_csv(filepath)
        return cls.from_dataframe(df, feature_columns, objective_column, dtype)

    @classmethod
    def from_dict(
        cls,
        data: Dict[str, List[float]],
        feature_keys: List[str],
        objective_key: str,
        dtype: torch.dtype = torch.float64,
    ) -> "DataPrior":
        """Create a DataPrior from a dictionary of equally long columns.

        Args:
            data: Mapping from column name to a sequence of values.
            feature_keys: Keys of ``data`` to use as inputs.
            objective_key: Key of ``data`` holding the objective values.
            dtype: Torch dtype of the resulting tensors.

        Returns:
            A DataPrior whose ``X`` has one column per feature key and whose
            ``y`` is a single column.

        Raises:
            ValueError: If ``feature_keys`` is empty or the selected columns
                do not all have the same length as the objective column.
            KeyError: If a requested key is missing from ``data``.
        """
        names = list(feature_keys)
        if not names:
            raise ValueError("feature_keys must not be empty.")

        columns = [list(data[key]) for key in names]
        objective = data[objective_key]
        n_rows = len(objective)
        for key, column in zip(names, columns):
            if len(column) != n_rows:
                raise ValueError(
                    f"Column '{key}' has {len(column)} values but objective "
                    f"'{objective_key}' has {n_rows}."
                )

        # Build (n_features, n_rows) in one go and swap the first two axes;
        # unlike a row-by-row build this keeps the feature axis when there
        # are zero rows.
        X = torch.tensor(columns, dtype=dtype).transpose(0, 1).contiguous()
        y = torch.tensor(objective, dtype=dtype).unsqueeze(-1)
        return cls(
            X=X,
            y=y,
            feature_names=names,
            objective_name=objective_key,
            metadata={"source": "dict", "n_samples": len(X)},
        )

    def add_observations(self, X_new: Tensor, y_new: Tensor) -> None:
        """Append new observations to the stored data.

        Args:
            X_new: New inputs, one row per observation.
            y_new: New objective values; a 1-D tensor is turned into a column.

        Raises:
            ValueError: If ``X_new`` and ``y_new`` disagree on the number of
                observations (their leading dimension).
        """
        if y_new.dim() == 1:
            y_new = y_new.unsqueeze(-1)
        # Compare shape prefixes so the check is also safe for 0-dim tensors.
        if X_new.shape[:1] != y_new.shape[:1]:
            raise ValueError(
                "X_new and y_new must contain the same number of observations, "
                f"got shapes {tuple(X_new.shape)} and {tuple(y_new.shape)}."
            )

        if self.X is None:
            merged_X, merged_y = X_new, y_new
        else:
            merged_X = torch.cat([self.X, X_new], dim=0)
            merged_y = torch.cat([self.y, y_new], dim=0)

        # Assign only after both concatenations succeeded so that a failure
        # cannot leave X and y with different lengths.
        self.X = merged_X
        self.y = merged_y
        self.metadata["n_samples"] = len(self.X)

    def get_bounds(self) -> Tuple[Tensor, Tensor]:
        """Return the per-feature minimum and maximum of the observed inputs.

        Returns:
            ``(lower, upper)`` obtained by reducing ``X`` over its first axis.

        Raises:
            ValueError: If no inputs are stored.
        """
        if self.X is None or self.X.numel() == 0:
            raise ValueError("No data available.")
        lower = self.X.min(dim=0).values
        upper = self.X.max(dim=0).values
        return lower, upper

    def get_best(self, maximize: bool = True) -> Tuple[Tensor, Tensor]:
        """Return the best observation so far.

        Args:
            maximize: Pick the largest objective value when True, the
                smallest otherwise.

        Returns:
            ``(X[idx], y[idx])`` for the selected observation.

        Raises:
            ValueError: If no observations are stored.
        """
        if self.X is None or self.y is None or self.y.numel() == 0:
            raise ValueError("No data available.")
        idx = self.y.argmax() if maximize else self.y.argmin()
        return self.X[idx], self.y[idx]

    @property
    def n_observations(self) -> int:
        """Number of stored observations (0 when no data is present)."""
        return 0 if self.X is None else len(self.X)

    @property
    def n_features(self) -> int:
        """Size of the last axis of ``X`` (0 when no data is present)."""
        return 0 if self.X is None else self.X.shape[-1]