Spaces:
Runtime error
Runtime error
| # ========== (c) JP Hwang 30/9/2022 ========== | |
| import logging | |
| import pandas as pd | |
| from pathlib import Path | |
| import streamlit as st | |
| import app | |
| from sklearn.decomposition import PCA, FastICA, LatentDirichletAllocation | |
| from sklearn.manifold import TSNE | |
| import plotly.express as px | |
# ===== SET UP LOGGER =====
# Module logger plus a root-logger stream handler so INFO-level records from
# every library end up on the console with a timestamped format.
logger = logging.getLogger(__name__)
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# The root logger lives for the whole process, while this module's top-level
# code can run many times (Streamlit re-executes the script on each
# interaction; importlib.reload does the same).  Adding a fresh handler every
# time would print each record once per run, so the handler is tagged with a
# name and reused when it is already attached.
_STREAM_HANDLER_NAME = 'dimred_explained_stream'
sh = next((h for h in root_logger.handlers if h.name == _STREAM_HANDLER_NAME), None)
if sh is None:
    sh = logging.StreamHandler()
    sh.name = _STREAM_HANDLER_NAME
    root_logger.addHandler(sh)
sh.setFormatter(formatter)
# ===== END LOGGER SETUP =====

# Widen pandas' console rendering so DataFrames print without wrapping.
desired_width = 320
pd.set_option('display.max_columns', 20)
pd.set_option('display.width', desired_width)

# Marker sizes; sizes[0] is the default written into the 'size' column by
# preproc_data.
sizes = [1, 20, 30]
def preproc_data(datadir=None):
    """Load the color table and add the columns the charts rely on.

    Args:
        datadir: Directory that contains ``colors.csv``.  Accepts a
            ``pathlib.Path`` or a plain string; ``None`` falls back to the
            relative directory ``data``.

    Returns:
        A DataFrame with the six CSV columns (``simple_name``, ``name``,
        ``hex``, ``r``, ``g``, ``b``) plus:

        * ``rgb``      - CSS-style ``rgb(r, g, b)`` string for each row,
        * ``category`` - the text after the last underscore of
          ``simple_name`` (the whole name when it has no underscore),
        * ``size``     - the default marker size, ``sizes[0]``.
    """
    # Coerce so that string directories work too; `str / str` would raise.
    datadir = Path('data') if datadir is None else Path(datadir)
    df = pd.read_csv(datadir / 'colors.csv',
                     names=['simple_name', 'name', 'hex', 'r', 'g', 'b'])

    # Preprocessing: iterate the three channel columns in lockstep instead of
    # a row-wise DataFrame.apply, which builds a Series object per row.
    df['rgb'] = [f'rgb({r}, {g}, {b})'
                 for r, g, b in zip(df['r'], df['g'], df['b'])]

    # Get top 'basic' color names (last underscore-separated token).
    df['category'] = [simple.split('_')[-1] for simple in df['simple_name']]

    # Set default size attribute
    df['size'] = sizes[0]
    return df
def build_chart(df_in):
    """Create the 3-D scatter plot of the colors in RGB space.

    Args:
        df_in: DataFrame providing the ``r``, ``g``, ``b`` axis columns as
            well as ``simple_name`` (color grouping), ``rgb`` (used as the
            discrete color sequence), ``size`` (marker size) and ``name``
            (shown on hover).

    Returns:
        The Plotly figure, with the legend hidden and tight margins.
    """
    scatter_kwargs = dict(
        x='r', y='g', z='b',
        template='plotly_white',
        color=df_in['simple_name'],
        color_discrete_sequence=df_in['rgb'],
        size='size',
        hover_data=['name'],
    )
    chart = px.scatter_3d(df_in, **scatter_kwargs)

    # No legend, and only a thin margin around the plot.
    tight_margin = dict(l=5, r=5, t=20, b=5)
    chart.update_layout(showlegend=False, margin=tight_margin)
    return chart
# Page copy, kept at module level so the Markdown stays flush-left (indented
# Markdown lines would otherwise risk being rendered as code blocks).
_INTRO_MD = """
[Previously](https://huggingface.co/spaces/jphwang/colorful_vectors), you saw how vectors can represent information as sets of numbers.
To recap, colors can be represented (for example) as three numbers like `(255, 0, 255)`,
or `(80, 200, 120)`, where each number means something.
In this case they were (R)ed, (G)reen and (B)lue values, but lots of other systems exist like
[HSL or HSV](https://en.wikipedia.org/wiki/HSL_and_HSV), [CMYK](https://en.wikipedia.org/wiki/CMYK_color_model)
or [RGBA](https://en.wikipedia.org/wiki/RGBA_color_model).
Now, you might have noticed that some of these representations include more than three numbers. And given that
we live in a three-dimensional space, we can't easily represent 4+ dimensional vectors. So, what're we to do?
One answer is to use what are called dimensionality reduction techniques, which will produce an output like the below.
"""

_REFERENCE_MD = """
For reference and as a reminder, here is the original data in 3-D again:
"""

_OUTRO_MD = """
See how the 3-dimensional color information was squeezed into two?
We can visually confirm that the algorithms have done something *sensible*, because similar colors
still appear together.
Having said that, obviously *some* information will get lost in converting information that's contained in
three dimensions to only two.
So these models make particular choices, which result in the outputs being different.
Some models like PCA aim to lose the least amount of overall information, while others like t-SNE
have specific goals, such as preserving local distances at the cost of compromised global distances.
"""


def _reduce_to_2d(algo, rgb_arr):
    """Return an (n_colors, 2) array of 2-D coordinates for *rgb_arr*.

    Args:
        algo: Estimator class to instantiate (PCA, FastICA,
            LatentDirichletAllocation or TSNE).
        rgb_arr: Array with one row per color and the columns r, g, b.
    """
    reducer = algo(n_components=2)
    if algo is TSNE:
        # t-SNE has no `components_`; its embedding comes from fit_transform.
        return reducer.fit_transform(rgb_arr)
    # NOTE(review): the estimator is fitted on the transposed (3, n_colors)
    # matrix and its `components_` (2, n_colors) are transposed back, so each
    # color is placed by its loading on the two components rather than by
    # `fit_transform(rgb_arr)`.  Kept unchanged to preserve the charts this
    # page has always shown -- confirm that this is the intended projection.
    reducer.fit(rgb_arr.transpose())
    return reducer.components_.transpose()


def main():
    """Render the "Dimensionality Reduction: Explained" Streamlit page.

    Loads the color table, lets the user pick a reduction algorithm, plots
    the resulting 2-D coordinates and then the original 3-D RGB scatter for
    comparison, with explanatory text in between.

    Returns:
        True once the page has been rendered.
    """
    st.title("Dimensionality Reduction: Explained")
    st.subheader("Using colors to explain how dimensionality reduction techniques work in principle.")
    st.markdown(_INTRO_MD)

    df = preproc_data(datadir=Path('data'))

    # Map the class names shown in the radio widget back to the classes.
    algo_options = [PCA, FastICA, LatentDirichletAllocation, TSNE]
    algos_dict = {algo_cls.__name__: algo_cls for algo_cls in algo_options}
    algo_sel = st.radio(label='Select your algorithm', options=list(algos_dict), index=0)
    algo = algos_dict[algo_sel]

    vals = _reduce_to_2d(algo, df[['r', 'g', 'b']].values)
    df['dimension_a'] = vals[:, 0]
    df['dimension_b'] = vals[:, 1]

    red_fig = px.scatter(df, x='dimension_a', y='dimension_b',
                         title=f'RGB values represented in 2-D using {algo.__name__}',
                         template='plotly_white',
                         color=df['simple_name'],
                         color_discrete_sequence=df['rgb'],
                         width=500, height=500,
                         size='size',
                         size_max=10,
                         hover_data=['name'])
    red_fig.update_layout(
        showlegend=False,
        paper_bgcolor='white',
        plot_bgcolor='white',
        margin=dict(l=5, r=5, t=40, b=5)
    )
    st.markdown('-----')
    st.plotly_chart(red_fig)
    st.markdown('-----')

    st.markdown(_REFERENCE_MD)
    fig = build_chart(df)
    st.plotly_chart(fig, use_container_width=True)
    st.markdown('-----')

    st.markdown(_OUTRO_MD)
    return True
# Script entry point: render the page only when this file is executed
# directly, so that importing the module does not draw anything.
if __name__ == '__main__':
    main()