# Streamlit demo: exact GP regression with a squared-exponential (RBF) kernel
# on a 3-point toy dataset, with sliders for the kernel hyperparameters.
import numpy as np
import streamlit as st
import matplotlib.pyplot as plt
import scipy.stats

# plt.style.use('fivethirtyeight')

st.subheader("Gaussian Process Regression with Squared Exponential Kernel")

st_col_text = st.columns(1)[0]
st.markdown(r"""
Squared Exponential Kernel is formulated as $K(x, x') = \sigma_f^2 \exp \left[\frac{-(x - x')^2}{\ell^2}\right]$
where, $\ell$ is lengthscale and $\sigma_f^2$ is variance.

$$
\text{Prior: } \mathbf{f} \sim \mathcal{N}(\boldsymbol{0}, K) \\
\text{Likelihood: } \mathbf{y} \sim \mathcal{N}(\mathbf{f}, \sigma_n^2I)
$$

where $\sigma_n^2$ is known as noise variance or likelihood noise.
""")

st_col = st.columns(1)[0]

# Previous loss value, mutated by percentage_change() further below.
old_loss = 1000.0

# Kernel hyperparameter sliders.
lengthscale = st.slider('Lengthscale', min_value=0.01, max_value=1.0, value=0.25, step=0.01)
variance = st.slider('Variance', min_value=0.001, max_value=0.1, value=0.025, step=0.001)
# BUG FIX: the default value must lie inside [min_value, max_value]. The
# original value=0.0 was below min_value=0.001, which makes Streamlit raise
# StreamlitAPIException. Start at the slider minimum instead.
noise_variance = st.slider('Noise Variance', min_value=0.001, max_value=0.01, value=0.001, step=0.001)


def rbf_kernel(x1, x2, lengthscale, variance):
    """Return the squared-exponential Gram matrix between x1 and x2.

    K[i, j] = variance * exp(-(x1[i] - x2[j])^2 / lengthscale^2),
    matching the formula displayed above (no factor of 2 in the denominator).

    Parameters
    ----------
    x1, x2 : np.ndarray
        Array-likes of scalar inputs; flattened to column/row vectors.
    lengthscale : float
        Horizontal smoothness parameter (ell).
    variance : float
        Signal variance (sigma_f^2), the kernel's vertical scale.

    Returns
    -------
    np.ndarray of shape (x1.size, x2.size).
    """
    x1_ = x1.reshape(-1, 1) / lengthscale
    x2_ = x2.reshape(1, -1) / lengthscale
    dist_sqr = (x1_ - x2_) ** 2
    return variance * np.exp(-dist_sqr)


fig, ax = plt.subplots(figsize=(10, 4))

# Toy training data and a dense test grid covering slightly beyond it.
x_train = np.array([0.2, 0.5, 0.8]).reshape(-1, 1)
y_train = np.array([0.8, 0.3, 0.6]).reshape(-1, 1)
ax.scatter(x_train, y_train, label='train points')
ax.set_xlim(-0.2, 1.2)
ax.set_ylim(-0.2, 1.2)

N = 100
x_test = np.linspace(-0.2, 1.2, N).reshape(-1, 1)

# GP posterior:
#   mean = K_*s (K + sn^2 I)^-1 y
#   cov  = K_** - K_*s (K + sn^2 I)^-1 K_*s^T
k_train_train = rbf_kernel(x_train, x_train, lengthscale, variance)
k_test_train = rbf_kernel(x_test, x_train, lengthscale, variance)
k_test_test = rbf_kernel(x_test, x_test, lengthscale, variance)
# GENERALIZED: size the jitter/noise matrix from the data instead of the
# hard-coded np.eye(3), so changing the toy dataset size keeps working.
k_train_with_noise = k_train_train + noise_variance * np.eye(len(x_train))

# Invert via the Cholesky factor L: K^-1 = L^-T L^-1 (more stable than a
# direct inverse of K itself).
c = np.linalg.inv(np.linalg.cholesky(k_train_with_noise))
k_inv = np.dot(c.T, c)

pred_mean = k_test_train @ k_inv @ y_train
pred_var = k_test_test - k_test_train @ k_inv @ k_test_train.T
pred_std2 = 2 * (pred_var.diagonal() ** 0.5)  # half-width of the 2-sigma band

ax.plot(x_test, pred_mean, label='predictive mean ($\\mu$)')
# Shade the mu +/- 2 sigma credible band around the predictive mean.
ax.fill_between(x_test.ravel(),
                pred_mean.ravel() - pred_std2.ravel(),
                pred_mean.ravel() + pred_std2.ravel(),
                alpha=0.5, label='$\\mu \\pm 2\\sigma$')
ax.set_xlabel('x')
ax.set_ylabel('y')
# BUG FIX: the title says "Negative Log Marginal Likelihood" but previously
# displayed the raw (positive) log marginal likelihood; negate the logpdf so
# the number matches its label. Also size the zero-mean vector from the data
# rather than hard-coding 3.
ax.set_title(
    "Negative Log Marginal Likelihood: "
    f"{-scipy.stats.multivariate_normal(np.zeros(len(x_train)), k_train_with_noise).logpdf(y_train.ravel()):.4f}"
)
ax.legend()

with st_col:
    st.pyplot(fig)

# new_loss = scipy.stats.multivariate_normal(np.zeros(3), k_train_with_noise).logpdf(y_train.ravel())
# new_loss_str = f"{new_loss:.4f}"


def percentage_change():
    """Return the percent change of the loss vs the previous render, as a string.

    Side effect: updates the module-level old_loss to the current loss.
    NOTE(review): new_loss is only defined in the commented-out lines above,
    so calling this function currently raises NameError; it is never invoked.
    """
    global old_loss
    ans = (new_loss - old_loss) / old_loss * 100
    old_loss = new_loss
    return f"{ans:.1f}%"

# with st_met:
#     st.metric('Loss', f"{new_loss:.4f}")

# Placeholder for CSS that hides Streamlit chrome (currently empty).
hide_streamlit_style = """
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)

st.markdown(r"""
Here are some observations to note while experimenting with the hyperparameters:

* Lengthscale $\ell$ controls the smoothness of the fit. Smoothness in fit increases with an increase in $\ell$.
* Variance $\sigma_f^2$ controls the uncertainty in the model (aka epistemic uncertainty). Sometimes it is also called lengthscale in the vertical direction [[Slide 154](http://cbl.eng.cam.ac.uk/pub/Public/Turner/News/imperial-gp-tutorial.pdf)]).
* Noise variance $\sigma_n^2$ is a measure of observation noise or irreducible noise (aka aleatoric uncertainty) present in the dataset. Increasing noise variance to a certain limit reduces overfitting. One can fix it if known from the data generation process or it can be learned during the hyperparameter optimization process.
* Negative Log Marginal Likelihood works as a loss function for GP hyperparameter tuning. Though there are advanced tools available for hyperparameter tuning, you can manually optimize them with the sliders above to test your understanding.
""")