File size: 4,189 Bytes
dd9f338
 
 
b0a592a
532e91a
dd9f338
e793996
826f2da
50d0d49
826f2da
9dc953e
9c5afff
f2c6b19
44e6ec6
c11d7d9
f2c6b19
9c5afff
826f2da
 
dd9f338
a8e0356
50d0d49
b694fbc
dd9f338
 
 
 
 
 
 
 
 
 
532e91a
dd9f338
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b7b7a07
4e77d19
dd9f338
 
 
 
 
 
eac41ae
dd9f338
67b90a9
dd9f338
 
a8e0356
dd9f338
 
 
 
a8e0356
 
b694fbc
 
74eaf54
b694fbc
 
 
 
a8e0356
 
dd9f338
 
 
 
 
ec2180b
dd9f338
 
c11d7d9
 
 
2feb377
2096784
fd4877b
 
c11d7d9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import numpy as np
import streamlit as st
import matplotlib.pyplot as plt
import scipy.stats
# plt.style.use('fivethirtyeight')

st.subheader("Gaussian Process Regression with Squared Exponential Kernel")

# Layout handle for the descriptive text (not referenced again below).
st_col_text = st.columns(1)[0]

st.markdown(r"""
           Squared Exponential Kernel is formulated as $K(x, x') = \sigma_f^2 \exp \left[\frac{-(x - x')^2}{\ell^2}\right]$ where, $\ell$ is lengthscale and $\sigma_f^2$ is variance.
           $$
           \text{Prior: } \mathbf{f} \sim \mathcal{N}(\boldsymbol{0}, K) \\
           \text{Likelihood: } \mathbf{y} \sim \mathcal{N}(\mathbf{f}, \sigma_n^2I)
           $$
           where $\sigma_n^2$ is known as noise variance or likelihood noise.
    """)


# Column that will hold the regression plot.
st_col = st.columns(1)[0]

# Previously observed loss, consumed by percentage_change() further down.
old_loss = 1000.0

# Kernel hyperparameter sliders.
lengthscale = st.slider('Lengthscale', min_value=0.01, max_value=1.0, value=0.25, step=0.01)
variance = st.slider('Variance', min_value=0.001, max_value=0.1, value=0.025, step=0.001)
# BUG FIX: Streamlit requires min_value <= value <= max_value; the original
# default of 0.0 was below min_value=0.001 and raises StreamlitAPIException.
# Start the slider at its minimum instead.
noise_variance = st.slider('Noise Variance', min_value=0.001, max_value=0.01, value=0.001, step=0.001)

def rbf_kernel(x1, x2, lengthscale, variance):
    """Squared-exponential (RBF) Gram matrix between two sets of 1-D inputs.

    Computes K[i, j] = variance * exp(-(x1[i] - x2[j])^2 / lengthscale^2),
    matching the formula displayed in the page text (note: no factor of 2 in
    the denominator, unlike some textbook parameterisations).

    Returns an array of shape (len(x1), len(x2)).
    """
    scaled_rows = np.reshape(x1, (-1, 1)) / lengthscale
    scaled_cols = np.reshape(x2, (1, -1)) / lengthscale
    sq_dist = np.square(scaled_rows - scaled_cols)
    return variance * np.exp(-sq_dist)

fig, ax = plt.subplots(figsize=(10, 4))

# Toy 1-D training set; both arrays are column vectors of shape (n, 1).
x_train = np.array([0.2, 0.5, 0.8]).reshape(-1, 1)
y_train = np.array([0.8, 0.3, 0.6]).reshape(-1, 1)
n_train = x_train.shape[0]

ax.scatter(x_train, y_train, label='train points')
ax.set_xlim(-0.2, 1.2)
ax.set_ylim(-0.2, 1.2)

# Dense grid of test inputs for the predictive curve.
N = 100
x_test = np.linspace(-0.2, 1.2, N).reshape(-1, 1)

# Kernel (Gram) matrices between train/test input sets.
k_train_train = rbf_kernel(x_train, x_train, lengthscale, variance)
k_test_train = rbf_kernel(x_test, x_train, lengthscale, variance)
k_test_test = rbf_kernel(x_test, x_test, lengthscale, variance)

# Add likelihood noise on the diagonal. FIX: size the identity by the
# training set (was hard-coded np.eye(3)), so changing the data above
# cannot silently break this line.
k_train_with_noise = k_train_train + noise_variance * np.eye(n_train)
# Invert via the Cholesky factor L (K = L L^T), so K^{-1} = (L^{-1})^T L^{-1}.
c = np.linalg.inv(np.linalg.cholesky(k_train_with_noise))
k_inv = np.dot(c.T, c)

# Standard GP posterior: mean = K_*n K^{-1} y, cov = K_** - K_*n K^{-1} K_n*.
pred_mean = k_test_train@k_inv@y_train
pred_var = k_test_test - k_test_train@k_inv@k_test_train.T
# Two posterior standard deviations for the shaded uncertainty band.
pred_std2 = 2 * (pred_var.diagonal() ** 0.5)

ax.plot(x_test, pred_mean, label='predictive mean ($\\mu$)')
ax.fill_between(x_test.ravel(), pred_mean.ravel()-pred_std2.ravel(),
                pred_mean.ravel()+pred_std2.ravel(), alpha=0.5, label='$\\mu \\pm 2\\sigma$')
ax.set_xlabel('x')
ax.set_ylabel('y')
# BUG FIX: the displayed quantity is the NEGATIVE log marginal likelihood,
# so the log-pdf must be negated; the original showed +logpdf under this title.
nlml = -scipy.stats.multivariate_normal(np.zeros(n_train), k_train_with_noise).logpdf(y_train.ravel())
ax.set_title(f"Negative Log Marginal Likelihood: {nlml:.4f}")
ax.legend()

with st_col:
    st.pyplot(fig)
# new_loss = scipy.stats.multivariate_normal(np.zeros(3), k_train_with_noise).logpdf(y_train.ravel())
# new_loss_str = f"{new_loss:.4f}"

def percentage_change():
    """Return the percent change of the current loss vs. the stored one.

    Updates the module-level ``old_loss`` as a side effect and returns the
    change formatted like ``"-12.3%"``.

    NOTE(review): this reads a module-level ``new_loss`` that is only assigned
    in the commented-out lines above — calling this function as-is raises
    NameError. Re-enable that computation (or pass the value in) before use.
    """
    global old_loss
    # Relative change with respect to the previously stored loss, in percent.
    ans = (new_loss - old_loss)/old_loss * 100
    # Remember the current loss for the next invocation.
    old_loss = new_loss
    return f"{ans:.1f}%"

# with st_met:
#    st.metric('Loss', f"{new_loss:.4f}")
    
# Inline CSS injected into the page to hide Streamlit's default chrome
# (the hamburger menu and the footer).
# NOTE(review): `subheader` is not an HTML element and `alignment` is not a
# CSS property, so the last rule appears to have no effect — confirm intent.
hide_streamlit_style = """
            <style>
            #MainMenu {visibility: hidden;}
            footer {visibility: hidden;}
            subheader {alignment: center;}
            </style>
            """
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
# Usage notes rendered below the plot. FIX: removed the stray ")" that
# followed the Slide-154 link, which rendered an unbalanced parenthesis.
st.markdown(r"""
           Here are some observations to note while experimenting with the hyperparameters:
           * Lengthscale $\ell$ controls the smoothness of the fit. Smoothness in fit increases with an increase in $\ell$.
           * Variance $\sigma_f^2$ controls the uncertainty in the model (aka epistemic uncertainty). Sometimes it is also called lengthscale in the vertical direction [[Slide 154](http://cbl.eng.cam.ac.uk/pub/Public/Turner/News/imperial-gp-tutorial.pdf)].
           * Noise variance $\sigma_n^2$ is a measure of observation noise or irreducible noise (aka aleatoric uncertainty) present in the dataset. Increasing noise variance to a certain limit reduces overfitting. One can fix it if known from the data generation process or it can be learned during the hyperparameter optimization process.
           * Negative Log Marginal Likelihood works as a loss function for GP hyperparameter tuning. Though there are advanced tools available for hyperparameter tuning, you can manually optimize them with the sliders above to test your understanding.
           """)