"""Dataset generation and Gaussian-process helpers used to compute plot values."""

import ast
from dataclasses import dataclass
from typing import Literal

import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import (
    Kernel,
    RBF,
    Matern,
    RationalQuadratic,
    ExpSineSquared,
    DotProduct,
    WhiteKernel,
    ConstantKernel,
)
from sympy import Expr, lambdify


@dataclass
class DataGenerationOptions:
    """Options controlling how ``generate_dataset`` samples a function."""

    # "grid" -> evenly spaced inputs; "random" -> uniformly drawn inputs.
    method: Literal["grid", "random"]
    # Number of input points to generate.
    num_samples: int
    # Standard deviation of the Gaussian noise added to y (applied only if > 0).
    noise: float = 0.


@dataclass
class Dataset:
    """Paired x/y values stored as plain Python lists."""

    x: list[float]
    y: list[float]


@dataclass
class PlotData:
    """Arrays needed to draw a prediction: inputs, predictive mean and std."""

    x: np.ndarray
    pred_mean: np.ndarray
    pred_std: np.ndarray
    # Optional extra curve; ``compute_plot_values`` leaves it as None.
    y: np.ndarray | None = None


# Names that a kernel expression passed to ``eval_kernel`` may reference.
_ALLOWED_KERNELS: dict[str, type] = {
    'RBF': RBF,
    'Matern': Matern,
    'RationalQuadratic': RationalQuadratic,
    'ExpSineSquared': ExpSineSquared,
    'DotProduct': DotProduct,
    'WhiteKernel': WhiteKernel,
    'ConstantKernel': ConstantKernel,
}

# AST node types a kernel expression may contain: constructor calls with
# literal arguments, combined with basic arithmetic. Attribute access,
# subscripts, lambdas, comprehensions etc. are deliberately absent.
_ALLOWED_KERNEL_NODES = (
    ast.Expression,
    ast.Call,
    ast.keyword,
    ast.Name,
    ast.Load,
    ast.Constant,
    ast.Tuple,
    ast.List,
    ast.BinOp,
    ast.UnaryOp,
    ast.Add,
    ast.Sub,
    ast.Mult,
    ast.Div,
    ast.Pow,
    ast.UAdd,
    ast.USub,
)


def _evaluate_expression(function: Expr, x: np.ndarray) -> np.ndarray:
    """Evaluate ``function`` (in the variable ``x``) at every point of ``x``.

    The lambdified function may return a scalar when the expression does not
    depend on ``x``; the result is therefore broadcast to ``x.shape`` and
    promoted to at least float64 so callers always get a fresh, writable
    array with one value per input point.
    """
    f = lambdify("x", function, modules='numpy')
    raw = np.asarray(f(x))
    dtype = np.result_type(raw.dtype, np.float64)
    # astype() copies, so the read-only broadcast view never leaks out.
    return np.broadcast_to(raw, x.shape).astype(dtype)


def generate_dataset(
    function: Expr,
    xlim: tuple[float, float],
    generation_options: DataGenerationOptions,
) -> Dataset:
    """Sample ``function`` on ``xlim`` according to ``generation_options``.

    Args:
        function: Expression in the variable ``x``.
        xlim: ``(lower, upper)`` bounds of the sampled interval.
        generation_options: Sampling method, sample count and noise level.

    Returns:
        A ``Dataset`` holding the sampled inputs and the (optionally noisy)
        function values.

    Raises:
        ValueError: If ``generation_options.method`` is not "grid" or "random".
    """
    lower, upper = xlim[0], xlim[1]
    count = generation_options.num_samples

    if generation_options.method == 'grid':
        x = np.linspace(lower, upper, count)
    elif generation_options.method == 'random':
        x = np.random.uniform(lower, upper, count)
    else:
        raise ValueError(f"Unknown generation method: {generation_options.method}")

    y = _evaluate_expression(function, x)

    if generation_options.noise > 0:
        # Out-of-place addition: safe regardless of the dtype of ``y``.
        y = y + np.random.normal(0, generation_options.noise, size=y.shape)

    return Dataset(x=x.tolist(), y=y.tolist())


def load_dataset_from_csv(
    file_path: str,
    header: bool,
    x_col: int,
    y_col: int
) -> Dataset:
    """Load two columns of a comma-separated file into a ``Dataset``.

    Args:
        file_path: Path of the CSV file.
        header: Whether the first line is a header row to skip.
        x_col: Index of the column holding the x values.
        y_col: Index of the column holding the y values.

    Returns:
        A ``Dataset`` built from the rows that contain no NaN in any column.
    """
    # NOTE(review): the row filter below needs a 2-D array; it looks like
    # genfromtxt can return fewer dimensions for a file with a single data
    # row or a single column - confirm and handle if such files are expected.
    data = np.genfromtxt(file_path, delimiter=',', skip_header=1 if header else 0)
    data = data[~np.isnan(data).any(axis=1)]  # remove rows with NaN values
    x = data[:, x_col].tolist()
    y = data[:, y_col].tolist()
    return Dataset(x=x, y=y)


def generate_true_curve(
    function: Expr,
    xlim: tuple[float, float],
    num_points: int = 1000,
) -> Dataset:
    """Evaluate ``function`` on an evenly spaced grid without noise.

    Args:
        function: Expression in the variable ``x``.
        xlim: ``(lower, upper)`` bounds of the grid.
        num_points: Number of grid points.

    Returns:
        A ``Dataset`` with the grid and the function values on it.
    """
    x = np.linspace(xlim[0], xlim[1], num_points)
    y = _evaluate_expression(function, x)
    return Dataset(x=x.tolist(), y=y.tolist())


def train_model(
    dataset: Dataset,
    kernel: Kernel,
    distribution: Literal["Prior", "Posterior"],
) -> GaussianProcessRegressor:
    """Build a Gaussian process regressor, fitting it only for "Posterior".

    Args:
        dataset: Training data; used only when ``distribution`` is "Posterior".
        kernel: Kernel passed to ``GaussianProcessRegressor``.
        distribution: "Prior" returns the unfitted regressor, "Posterior"
            fits it to ``dataset`` first.

    Returns:
        The (possibly fitted) regressor.

    Raises:
        ValueError: If ``distribution`` is neither "Prior" nor "Posterior".
    """
    gp = GaussianProcessRegressor(kernel=kernel)

    if distribution == "Prior":
        return gp
    if distribution != "Posterior":
        raise ValueError(f"Unknown distribution type: {distribution}")

    # The regressor is fitted on a column of inputs, one row per sample.
    x = np.array(dataset.x).reshape(-1, 1)
    y = np.array(dataset.y)
    gp.fit(x, y)
    return gp


def predict(
    model: GaussianProcessRegressor,
    x: np.ndarray,
) -> tuple[np.ndarray, np.ndarray]:
    """Return the predictive mean and standard deviation of ``model`` at ``x``."""
    y_mean, y_std = model.predict(x, return_std=True)
    return y_mean, y_std


def sample(
    model: GaussianProcessRegressor,
    x: np.ndarray,
) -> np.ndarray:
    """Draw one sample from ``model`` at ``x`` and return it flattened."""
    y_samples = model.sample_y(x, n_samples=1).flatten()
    return y_samples


def _validate_kernel_ast(tree: ast.AST) -> None:
    """Raise ``ValueError`` unless ``tree`` only uses whitelisted constructs."""
    for node in ast.walk(tree):
        if not isinstance(node, _ALLOWED_KERNEL_NODES):
            raise ValueError(
                f"Disallowed expression element: {type(node).__name__}"
            )
        if isinstance(node, ast.Name) and node.id not in _ALLOWED_KERNELS:
            raise ValueError(f"Unknown name: {node.id}")
        if isinstance(node, ast.Call) and not isinstance(node.func, ast.Name):
            raise ValueError("Only direct calls to kernel constructors are allowed")


def eval_kernel(kernel: str) -> Kernel:
    """Turn a kernel expression such as ``"1.0 * RBF() + WhiteKernel()"`` into a kernel.

    The expression may only call the whitelisted kernel constructors with
    literal arguments and combine them with arithmetic operators.

    Args:
        kernel: Source text of the kernel expression.

    Returns:
        The kernel object the expression evaluates to.

    Raises:
        ValueError: If the text is not a valid expression, uses a construct or
            name outside the whitelist, fails during evaluation, or does not
            evaluate to a ``Kernel``.
    """
    # Parse and check the syntax safely
    try:
        tree = ast.parse(kernel, mode='eval')
    except SyntaxError as e:
        raise ValueError(f"Invalid syntax: {e}") from e

    # SECURITY: removing __builtins__ alone is not a sandbox - attribute
    # access on the whitelisted classes can still reach arbitrary objects.
    # Reject everything except the whitelisted node types before eval().
    _validate_kernel_ast(tree)

    # Evaluate in restricted namespace
    try:
        result = eval(
            compile(tree, '', 'eval'),
            {"__builtins__": None},  # disable access to Python builtins like open
            _ALLOWED_KERNELS,  # only allow things in this list
        )
    except Exception as e:
        raise ValueError(f"Error evaluating kernel: {e}") from e

    # A bare literal such as "1.0" parses and evaluates, but is not a kernel.
    if not isinstance(result, Kernel):
        raise ValueError(
            f"Expression does not evaluate to a kernel: {type(result).__name__}"
        )
    return result


def compute_plot_values(
    dataset: Dataset,
    kernel_input: str,
    distribution: Literal["Prior", "Posterior"],
    xmin: float,
    xmax: float,
    num_points: int = 1000,
) -> PlotData:
    """Evaluate the prior or posterior prediction on an evenly spaced grid.

    Args:
        dataset: Training data, used when ``distribution`` is "Posterior".
        kernel_input: Kernel expression understood by ``eval_kernel``.
        distribution: "Prior" or "Posterior".
        xmin: Lower bound of the plotting grid.
        xmax: Upper bound of the plotting grid.
        num_points: Number of grid points (defaults to 1000).

    Returns:
        ``PlotData`` with the flattened grid, predictive mean and predictive
        standard deviation; its ``y`` field is left unset.

    Raises:
        ValueError: Propagated from ``eval_kernel`` or ``train_model``.
    """
    kernel = eval_kernel(kernel_input)
    model = train_model(dataset, kernel, distribution)

    # The model expects a column of inputs; PlotData stores the flat grid.
    x_plot = np.linspace(xmin, xmax, num_points).reshape(-1, 1)
    y_mean, y_std = predict(model, x_plot)

    return PlotData(x=x_plot.flatten(), pred_mean=y_mean, pred_std=y_std)