File size: 3,933 Bytes
d70a716
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
"""Data-based prior: incorporate initial experimental data to warm-start BO."""

from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple

import torch
from torch import Tensor
import pandas as pd
import numpy as np


@dataclass
class DataPrior:
    """Manages initial experimental data as a prior for Bayesian optimization.

    Supports loading from:
    - Tensors directly
    - Pandas DataFrames
    - CSV files
    - Dictionary format

    The data prior can be used to:
    - Warm-start the GP model
    - Estimate initial hyperparameters
    - Define the feasible region based on past experiments

    The alternate constructors (``from_dataframe``, ``from_csv``,
    ``from_dict``) store ``X`` with one row per observation and ``y`` as a
    column with a trailing dimension of size 1.
    """

    # Observed inputs, one row per observation; ``None`` until data is added.
    X: Optional[Tensor] = None
    # Observed objective values, row-aligned with ``X``; ``None`` until data is added.
    y: Optional[Tensor] = None
    # Names of the columns of ``X``.
    feature_names: List[str] = field(default_factory=list)
    # Name of the objective stored in ``y``.
    objective_name: str = "objective"
    # Free-form bookkeeping; the constructors record "source" and "n_samples".
    metadata: Dict = field(default_factory=dict)

    @classmethod
    def from_dataframe(
        cls,
        df: pd.DataFrame,
        feature_columns: List[str],
        objective_column: str,
        dtype: torch.dtype = torch.float64,
    ) -> "DataPrior":
        """Create a DataPrior from a pandas DataFrame.

        Args:
            df: Frame holding both the feature and the objective columns.
            feature_columns: Columns of ``df`` that become the columns of ``X``.
            objective_column: Column of ``df`` that becomes ``y``.
            dtype: Torch dtype of the resulting tensors.

        Returns:
            A DataPrior whose ``X`` has one row per row of ``df`` and whose
            ``y`` carries a trailing dimension of size 1.
        """
        # Copy the names so later mutation of the caller's list cannot change
        # the prior; a list (not a tuple) also guarantees 2-D column selection.
        names = list(feature_columns)
        X = torch.tensor(df[names].to_numpy(), dtype=dtype)
        y = torch.tensor(df[objective_column].to_numpy(), dtype=dtype).unsqueeze(-1)
        return cls(
            X=X,
            y=y,
            feature_names=names,
            objective_name=objective_column,
            metadata={"source": "dataframe", "n_samples": len(df)},
        )

    @classmethod
    def from_csv(
        cls,
        filepath: str,
        feature_columns: List[str],
        objective_column: str,
        dtype: torch.dtype = torch.float64,
    ) -> "DataPrior":
        """Create a DataPrior from a CSV file.

        The file is read with ``pandas.read_csv`` using its default settings
        and then handed to ``from_dataframe``; see that method for the
        meaning of the remaining arguments.
        """
        df = pd.read_csv(filepath)
        return cls.from_dataframe(df, feature_columns, objective_column, dtype)

    @classmethod
    def from_dict(
        cls,
        data: Dict[str, List[float]],
        feature_keys: List[str],
        objective_key: str,
        dtype: torch.dtype = torch.float64,
    ) -> "DataPrior":
        """Create a DataPrior from a dictionary of equally long columns.

        Args:
            data: Mapping from column name to the values of that column.
            feature_keys: Keys of ``data`` that become the columns of ``X``.
            objective_key: Key of ``data`` that becomes ``y``.
            dtype: Torch dtype of the resulting tensors.

        Returns:
            A DataPrior with one row per entry of the columns.

        Raises:
            ValueError: If ``feature_keys`` is empty, or if the feature and
                objective columns do not all have the same length.
            KeyError: If a requested key is missing from ``data``.
        """
        names = list(feature_keys)
        if not names:
            raise ValueError("feature_keys must contain at least one key.")

        columns = [data[key] for key in names]
        n_rows = len(columns[0])
        # Reject ragged input up front: building rows by index would silently
        # drop the tail of any column longer than the first one.
        for key, column in zip(names, columns):
            if len(column) != n_rows:
                raise ValueError(
                    f"Column '{key}' has {len(column)} values, expected {n_rows}."
                )
        if len(data[objective_key]) != n_rows:
            raise ValueError(
                f"Objective '{objective_key}' has {len(data[objective_key])} "
                f"values, expected {n_rows}."
            )

        # Stacking the columns along the last axis gives one row per
        # observation and keeps a well-formed (0, n_features) shape when the
        # columns are empty.
        X = torch.stack(
            [torch.as_tensor(column, dtype=dtype) for column in columns], dim=-1
        )
        y = torch.tensor(data[objective_key], dtype=dtype).unsqueeze(-1)
        return cls(
            X=X,
            y=y,
            feature_names=names,
            objective_name=objective_key,
            metadata={"source": "dict", "n_samples": n_rows},
        )

    def add_observations(self, X_new: Tensor, y_new: Tensor) -> None:
        """Add new observations to the prior data.

        A 1-D ``y_new`` is turned into a column before it is appended. The
        update is all-or-nothing: if validation or concatenation fails, the
        stored ``X``, ``y`` and ``metadata`` are left untouched.

        Args:
            X_new: New inputs, one row per observation.
            y_new: New objective values, one entry (or row) per row of ``X_new``.

        Raises:
            ValueError: If either tensor is 0-dimensional or the two tensors
                disagree on the number of observations.
        """
        if X_new.dim() == 0 or y_new.dim() == 0:
            raise ValueError("X_new and y_new must have at least one dimension.")
        if y_new.dim() == 1:
            y_new = y_new.unsqueeze(-1)
        if X_new.shape[0] != y_new.shape[0]:
            raise ValueError(
                f"X_new has {X_new.shape[0]} rows but y_new has {y_new.shape[0]}."
            )

        if self.X is None:
            merged_X, merged_y = X_new, y_new
        else:
            # Build both results before assigning so a failure on y cannot
            # leave X already extended.
            merged_X = torch.cat([self.X, X_new], dim=0)
            merged_y = torch.cat([self.y, y_new], dim=0)

        self.X, self.y = merged_X, merged_y
        self.metadata["n_samples"] = len(self.X)

    def get_bounds(self) -> Tuple[Tensor, Tensor]:
        """Get the observed bounds of the data.

        Returns:
            ``(lower, upper)``: the minimum and maximum of ``X`` taken over
            its first dimension.

        Raises:
            ValueError: If no observations are stored.
        """
        if self.X is None or self.X.shape[0] == 0:
            raise ValueError("No data available.")
        return self.X.min(dim=0).values, self.X.max(dim=0).values

    def get_best(self, maximize: bool = True) -> Tuple[Tensor, Tensor]:
        """Get the best observation so far.

        Args:
            maximize: Pick the largest objective value when true, the
                smallest otherwise.

        Returns:
            ``(x_best, y_best)``: the rows of ``X`` and ``y`` that contain
            the extreme objective value.

        Raises:
            ValueError: If no observations are stored.
        """
        if self.X is None or self.y is None or self.y.numel() == 0:
            raise ValueError("No data available.")
        flat_idx = self.y.argmax() if maximize else self.y.argmin()
        # argmax/argmin without ``dim`` index the flattened tensor; divide by
        # the number of entries per row to recover the row that holds it.
        per_row = self.y.numel() // self.y.shape[0]
        row = int(flat_idx) // per_row
        return self.X[row], self.y[row]

    @property
    def n_observations(self) -> int:
        """Number of stored observations (0 when ``X`` is unset)."""
        return 0 if self.X is None else len(self.X)

    @property
    def n_features(self) -> int:
        """Size of the last dimension of ``X`` (0 when ``X`` is unset)."""
        return 0 if self.X is None else self.X.shape[-1]