Spaces:
Sleeping
Sleeping
| """ | |
| Demo is Derived from https://scikit-learn.org/stable/auto_examples/decomposition/plot_pca_vs_fa_model_selection.html#sphx-glr-auto-examples-decomposition-plot-pca-vs-fa-model-selection-py | |
| """ | |
import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
from scipy import linalg
from sklearn.covariance import LedoitWolf, ShrunkCovariance
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.model_selection import GridSearchCV, cross_val_score
def create_dataset(n_samples=500, n_features=25, rank=5, sigma=1.0, random_state=42, n_components=5):
    """Build a low-rank toy dataset corrupted by two kinds of noise.

    Returns a tuple ``(X_homo, X_hetero, n_components_range, rank)`` where
    ``X_homo`` carries homoscedastic noise (same variance for every feature),
    ``X_hetero`` carries heteroscedastic noise (a different variance per
    feature), and ``n_components_range`` is the grid of component counts to
    evaluate (0 up to n_features, stepping by n_components).
    """
    rng = np.random.RandomState(random_state)
    # Random orthonormal basis; n_features must be >= rank because we
    # project onto the first `rank` columns below.
    U, _, _ = linalg.svd(rng.randn(n_features, n_features))
    X = rng.randn(n_samples, rank) @ U[:, :rank].T
    # Homoscedastic noise: identical variance across all features.
    X_homo = X + sigma * rng.randn(n_samples, n_features)
    # Heteroscedastic noise: each feature gets its own noise scale.
    sigmas = sigma * rng.rand(n_features) + sigma / 2.0
    X_hetero = X + rng.randn(n_samples, n_features) * sigmas
    n_components_range = np.arange(0, n_features, n_components)
    return X_homo, X_hetero, n_components_range, rank
def compute_scores(X, n_components_range):
    """Cross-validate PCA and FactorAnalysis over a grid of component counts.

    For every value in ``n_components_range`` both estimators are re-fit via
    cross-validation on ``X``. Returns two lists (PCA scores, FA scores) of
    mean CV scores, in grid order.
    """
    pca_estimator = PCA(svd_solver="full")
    fa_estimator = FactorAnalysis()
    pca_scores = []
    fa_scores = []
    for n_comp in n_components_range:
        # Reuse the same estimator objects, only updating the model order.
        pca_estimator.n_components = n_comp
        fa_estimator.n_components = n_comp
        pca_scores.append(np.mean(cross_val_score(pca_estimator, X)))
        fa_scores.append(np.mean(cross_val_score(fa_estimator, X)))
    return pca_scores, fa_scores
def shrunk_cov_score(X):
    """Mean CV score of a ShrunkCovariance model whose shrinkage parameter
    was tuned by grid search over a log-spaced grid in [0.01, 1]."""
    shrinkage_grid = np.logspace(-2, 0, 30)
    search = GridSearchCV(ShrunkCovariance(), {"shrinkage": shrinkage_grid})
    best_model = search.fit(X).best_estimator_
    return np.mean(cross_val_score(best_model, X))
def lw_score(X):
    """Mean CV score of a Ledoit-Wolf shrunk covariance estimate of X."""
    scores = cross_val_score(LedoitWolf(), X)
    return np.mean(scores)
| #TODO - allow selection of one or both methods | |
| # def plot_pca_fa_analysis(n_features, n_components): | |
| # ''' | |
| # Function to plot results of PCA and FA cross validation analysis | |
| # ''' | |
| # X_homo, X_hetero, n_components_range, rank = create_dataset(n_features=n_features, n_components = n_components) | |
| # for X, title in [(X_homo, "Homoscedastic Noise"), (X_hetero, "Heteroscedastic Noise")]: | |
| # # compute the pca and fa scores | |
| # pca_scores, fa_scores = compute_scores(X, n_components_range=n_components_range) | |
| # n_components_pca = n_components_range[np.argmax(pca_scores)] | |
| # n_components_fa = n_components_range[np.argmax(fa_scores)] | |
| # pca = PCA(svd_solver="full", n_components="mle") | |
| # pca.fit(X) | |
| # n_components_pca_mle = pca.n_components_ | |
| # print("best n_components by PCA CV = %d" % n_components_pca) | |
| # print("best n_components by FactorAnalysis CV = %d" % n_components_fa) | |
| # print("best n_components by PCA MLE = %d" % n_components_pca_mle) | |
| # fig = plt.figure() | |
| # fig, (ax1, ax2) = plt.subplots(1,2) | |
| # plt.plot(n_components_range, pca_scores, "b", label="PCA scores") | |
| # plt.plot(n_components_range, fa_scores, "r", label="FA scores") | |
| # plt.axvline(rank, color="g", label="TRUTH: %d" % rank, linestyle="-") | |
| # plt.axvline( | |
| # n_components_pca, | |
| # color="b", | |
| # label="PCA CV: %d" % n_components_pca, | |
| # linestyle="--", | |
| # ) | |
| # plt.axvline( | |
| # n_components_fa, | |
| # color="r", | |
| # label="FactorAnalysis CV: %d" % n_components_fa, | |
| # linestyle="--", | |
| # ) | |
| # plt.axvline( | |
| # n_components_pca_mle, | |
| # color="k", | |
| # label="PCA MLE: %d" % n_components_pca_mle, | |
| # linestyle="--", | |
| # ) | |
| # # compare with other covariance estimators | |
| # plt.axhline( | |
| # shrunk_cov_score(X), | |
| # color="violet", | |
| # label="Shrunk Covariance MLE", | |
| # linestyle="-.", | |
| # ) | |
| # plt.axhline( | |
| # lw_score(X), | |
| # color="orange", | |
| # label="LedoitWolf MLE" % n_components_pca_mle, | |
| # linestyle="-.", | |
| # ) | |
| # plt.xlabel("nb of components") | |
| # plt.ylabel("CV scores") | |
| # plt.legend(loc="lower right") | |
| # plt.title(title) | |
| # return fig | |
def plot_pca_fa_analysis_side(n_samples, n_features, n_components):
    """Run the PCA/FA model-selection analysis and plot the two noise cases.

    Builds a toy dataset (see ``create_dataset``), then for both the
    homoscedastic- and heteroscedastic-noise versions cross-validates PCA and
    FactorAnalysis over a grid of component counts, and draws the CV-score
    curves with the selected model orders marked — one subplot per noise type.

    Parameters
    ----------
    n_samples : int
        Number of rows in the toy dataset.
    n_features : int
        Number of columns; also the upper bound of the component grid.
    n_components : int
        Step of the component-count grid.

    Returns
    -------
    matplotlib.figure.Figure
        Figure with two stacked subplots (shared x-axis).
    """
    X_homo, X_hetero, n_components_range, rank = create_dataset(
        n_samples=n_samples, n_features=n_features, n_components=n_components
    )
    # Two stacked axes: homoscedastic noise on top, heteroscedastic below.
    fig, axes = plt.subplots(2, 1, sharey=False, sharex=True, figsize=(10, 8))
    for X, title, idx in [
        (X_homo, "Homoscedastic Noise", 0),
        (X_hetero, "Heteroscedastic Noise", 1),
    ]:
        pca_scores, fa_scores = compute_scores(X, n_components_range=n_components_range)
        # Model order that maximises the CV score for each method.
        n_components_pca = n_components_range[np.argmax(pca_scores)]
        n_components_fa = n_components_range[np.argmax(fa_scores)]
        # Independent estimate via PCA's built-in MLE order selection.
        pca = PCA(svd_solver="full", n_components="mle")
        pca.fit(X)
        n_components_pca_mle = pca.n_components_
        print("best n_components by PCA CV = %d" % n_components_pca)
        print("best n_components by FactorAnalysis CV = %d" % n_components_fa)
        print("best n_components by PCA MLE = %d" % n_components_pca_mle)
        axes[idx].plot(n_components_range, pca_scores, "b", label="PCA scores")
        axes[idx].plot(n_components_range, fa_scores, "r", label="FA scores")
        axes[idx].axvline(rank, color="g", label="TRUTH: %d" % rank, linestyle="-")
        axes[idx].axvline(
            n_components_pca,
            color="b",
            label="PCA CV: %d" % n_components_pca,
            linestyle="--",
        )
        axes[idx].axvline(
            n_components_fa,
            color="r",
            label="FactorAnalysis CV: %d" % n_components_fa,
            linestyle="--",
        )
        axes[idx].axvline(
            n_components_pca_mle,
            color="k",
            label="PCA MLE: %d" % n_components_pca_mle,
            linestyle="--",
        )
        # Baselines from plain covariance estimators (no low-rank model).
        axes[idx].axhline(
            shrunk_cov_score(X),
            color="violet",
            label="Shrunk Covariance MLE",
            linestyle="-.",
        )
        # BUG FIX: the label was '"LedoitWolf MLE" % n_components_pca_mle' —
        # applying % to a string with no conversion specifier raises
        # TypeError at runtime, so every plot crashed before this fix.
        axes[idx].axhline(
            lw_score(X),
            color="orange",
            label="LedoitWolf MLE",
            linestyle="-.",
        )
        axes[idx].set_xlabel("nb of components")
        axes[idx].set_ylabel("CV scores")
        axes[idx].legend(loc="lower right")
        axes[idx].set_title(title)
    return fig
| title = " Illustration of Model Selection with Probabilistic PCA and Factor Analysis (FA)" | |
| with gr.Blocks(title=title) as demo: | |
| gr.Markdown(f"# {title}") | |
| gr.Markdown(" This example shows how one can use Prinicipal Components Analysis (PCA) and Factor Analysis (FA) for model selection by observing the likelihood of a held-out dataset with added noise <br>" | |
| " The number of samples (n_samples) will determine the number of data points to produce. <br>" | |
| " The number of components (n_components) will determine the number of components each method will fit to, and will affect the likelihood of the held-out set. <br>" | |
| " The number of features (n_components) determine the number of features the toy dataset X variable will have. <br>" | |
| " For further details please see the sklearn docs:" | |
| ) | |
| gr.Markdown(" **[Demo is based on sklearn docs found here](https://scikit-learn.org/stable/auto_examples/decomposition/plot_pca_vs_fa_model_selection.html#sphx-glr-auto-examples-decomposition-plot-pca-vs-fa-model-selection-py)** <br>") | |
| gr.Markdown(" **Dataset** : A toy dataset with corrupted with homoscedastic noise (noise variance is the same for each feature) or heteroscedastic noise (noise variance is the different for each feature) . <br>") | |
| gr.Markdown(" Different number of features and number of components affect how well the low rank space is recovered. <br>" | |
| " Larger Depth trying to overfit and learn even the finner details of the data.<br>" | |
| ) | |
| with gr.Row(): | |
| n_samples = gr.Slider(value=100, minimum=10, maximum=1000, step=10, label="n_samples") | |
| n_components = gr.Slider(value=2, minimum=1, maximum=20, step=1, label="n_components") | |
| n_features = gr.Slider(value=5, minimum=5, maximum=25, step=1, label="n_features") | |
| # options for n_components | |
| btn = gr.Button(value="Submit") | |
| btn.click(plot_pca_fa_analysis_side, inputs= [n_samples, n_features, n_components], outputs= gr.Plot(label='PCA vs FA Model Selection with added noise') ) # | |
| demo.launch() |