# -*- coding: utf-8 -*-
# File: app.py
# Project: 'Homework #3 OTUS.ML.Advanced'
# Created by Gennady Matveev (gm@og.ly) on 02-01-2022.

# %%
# Import libraries
import pickle
import re
import warnings

import cryptocompare as cc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import requests
import streamlit as st
import tsfel
from kneed import KneeLocator
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from umap import UMAP

plt.style.use("ggplot")
plt.rcParams["figure.figsize"] = (10, 5)
warnings.filterwarnings("ignore")
# pd.options.display.precision = 4

random_state = 17
# One shared scaler instance; every use below calls fit_transform, so no
# fitted state is carried over between call sites.
scaler = StandardScaler()
n_jobs = -1

# %%
st.set_page_config(page_title="Cryptocurrencies clustering",
                   page_icon='./head.ico', layout='centered',
                   initial_sidebar_state='expanded')  # wide

padding = 0
# NOTE(review): the markup payload of this call is blank in the current
# source; it presumably held a CSS snippet using `padding` - TODO restore.
st.markdown(f""" """, unsafe_allow_html=True)

st.image('./mundus.png')
st.subheader('Clustering analysis of cryptocurrencies')
st.markdown(
    '*Explore similarities in statisticial, temporal and spectral domains*')
st.markdown(
    "Top 100 cryptocurrencies' daily closing prices are downloaded. "
    "Their dynamics can be analized in search of similarities between coins. \n"
    "Up to 8 currencies from each cluster are shown below."
)
st.markdown("""---""")

# %%
# Set cryptocompare API key:
api_key = st.secrets["api_key"]

# %%
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36"
    )
}
req = f"https://min-api.cryptocompare.com/data/top/mktcapfull?limit=100&tsym=USD&api_key={api_key}"


# Utility functions for data download

@st.cache(ttl=600)
def get_price(ticker: str, limit: int):
    """Return cryptocompare's daily USD price history for ``ticker``.

    Thin cached wrapper around ``cc.get_historical_price_day``; ``limit`` is
    forwarded unchanged. Results are cached by Streamlit for 600 seconds.
    """
    return cc.get_historical_price_day(ticker, currency="USD", limit=limit)


@st.cache(ttl=600)
def get_all_cc(limit: int):
    """Build a DataFrame of daily closing prices, one column per ticker.

    Iterates over the module-level ``tickers`` sequence (set by the analysis
    section before this is called). Download is best-effort: a ticker whose
    data cannot be fetched or parsed is skipped rather than aborting the run.

    Args:
        limit: history length passed to ``get_price``; also the length of the
            initial row index.

    Returns:
        DataFrame with one "close" series per successfully loaded ticker,
        named after the ticker.
    """
    closes = []
    for tick in tickers:
        try:
            raw = get_price(tick, limit)
            closes.append(pd.DataFrame.from_dict(raw)["close"].rename(tick))
        except Exception:
            # Deliberately best-effort, but no longer a bare ``except`` so
            # that KeyboardInterrupt / SystemExit and other BaseException
            # based signals are not swallowed.
            continue
    # Concatenate once instead of growing the frame inside the loop.
    return pd.concat([pd.DataFrame(index=range(limit))] + closes, axis=1)


# Utility functions for clustering analysis

def elbow_study(data, k_max: int = 10, model=KMeans):
    """Pick a cluster count with the elbow method.

    Standardises ``data``, fits ``model`` for every k in ``range(2, k_max)``
    and locates the knee of the sqrt(inertia) curve with ``KneeLocator``.

    Args:
        data: samples in rows, features in columns.
        k_max: exclusive upper bound of the cluster counts tried.
        model: clusterer class accepting ``n_clusters`` and ``random_state``
            and exposing ``inertia_`` after ``fit``.

    Returns:
        The knee position, or 3 when no knee is found.
    """
    X = scaler.fit_transform(data)
    ks = range(2, k_max)
    inertia = [
        np.sqrt(model(n_clusters=k, random_state=random_state).fit(X).inertia_)
        for k in ks
    ]

    # Find a knee
    kneedle = KneeLocator(ks, inertia, S=2,
                          curve="convex", direction="decreasing")
    # Use 3 clusters in case kneed doesn't find a knee
    return int(kneedle.knee or 3)


def plot_clusters_2(data, Xt, n_clusters, random_state):
    """Cluster the coins with KMeans and plot up to 8 price series per cluster.

    Args:
        data: price DataFrame, one column per coin.
        Xt: feature DataFrame with features in rows and coins in columns.
        n_clusters: number of KMeans clusters.
        random_state: seed forwarded to KMeans.

    Returns:
        A copy of ``data`` with an extra ``"cluster"`` row holding each
        coin's cluster label.
    """
    clusterer = KMeans(n_clusters=n_clusters, max_iter=100,
                       random_state=random_state)
    # NOTE(review): scaling is applied to Xt as given (per coin column) and
    # the result is transposed for clustering, whereas elbow_study scales the
    # transposed matrix. Kept as in the original - confirm this is intended.
    X = scaler.fit_transform(Xt)

    dd = data.copy()
    dd.loc["cluster"] = clusterer.fit_predict(X.T)

    palette = ["red", "green", "blue", "purple",
               "orange", "magenta", "goldenrod"]
    labels = dd.loc["cluster"]
    clusters_no = labels.value_counts(sort=False)
    prices = dd.iloc[:-1]  # everything except the appended "cluster" row

    for c in range(n_clusters):
        # Wrap around the palette: elbow_study may return more clusters than
        # there are colours, which previously raised IndexError.
        line_color = palette[c % len(palette)]
        fig, ax = plt.subplots(2, 4, sharex='col', figsize=(15, 5))
        cluster_ticks = labels[labels == c].index
        for i, tick in enumerate(cluster_ticks[:8]):
            panel = ax[i % 2, i // 2]
            panel.plot(prices[tick], color=line_color)
            panel.set_title(tick)
        fig.suptitle(f"Cluster {c}, {clusters_no[c]} items\n", y=1.02)
        st.pyplot(fig)
        # Release the figure; otherwise figures pile up across reruns.
        plt.close(fig)

    return dd


def visualize(Xt, n_clusters):
    """Show a 3-D UMAP projection of the coins, coloured by KMeans cluster.

    Args:
        Xt: feature DataFrame with features in rows and coins in columns.
        n_clusters: number of KMeans clusters.
    """
    clusterer = KMeans(n_clusters=n_clusters, max_iter=100,
                       random_state=random_state)
    X = scaler.fit_transform(Xt.T)
    X_color = clusterer.fit_predict(X).astype(str)

    # One row per coin, matching the cluster labels above. The original
    # projected Xt.values (one row per feature), so the number of projected
    # points did not match the number of colour labels.
    features = Xt.T.values

    # UMAP
    umap_3d = UMAP(n_components=3, init='random', random_state=random_state)
    proj_3d = umap_3d.fit_transform(features)

    fig_3d = px.scatter_3d(
        proj_3d,
        x=0,
        y=1,
        z=2,
        color=X_color,
        labels={'color': 'clusters'},
        color_discrete_sequence=["red", "green", "blue", "purple",
                                 "orange", "magenta", "goldenrod"],
        title="UMAP projection from feature space",
        width=800,
        height=600,
    )
    fig_3d.update_traces(marker_size=5)
    st.write(fig_3d)


# %%
# START Sidebar ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

st.sidebar.image('./blau.png')
demo = st.sidebar.checkbox(label="Use demo data?", value=True,
                           help="Use demo data or fetch actual?")
days = st.sidebar.number_input('Number of days for analysis',
                               min_value=30, max_value=100, value=60)
domain = st.sidebar.selectbox('Domain',
                              ('statistical', 'temporal', 'spectral', 'all'),
                              index=1,
                              help='Domain to use feature extraction')
st.sidebar.markdown("""---""")
analyze = st.sidebar.button('Start analysis')

# END Sidebar ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

# Analysis

if analyze:
    with st.spinner('Downloading data...'):
        if demo:
            # NOTE: pickle is only safe because this is a file shipped with
            # the app; never point this at untrusted data.
            with open("./demo_data.pkl", "rb") as f:
                demo_data = pickle.load(f)
            dl = demo_data.shape[0]
            data_day = demo_data.iloc[dl - days:]
            tickers = demo_data.columns
        else:
            # A timeout keeps the app from hanging forever on a dead link.
            top100 = requests.get(req, headers=headers, timeout=30)
            # The named group had lost its name, which made the pattern
            # invalid ("(?P[" is not legal regex syntax).
            rs = re.compile(r"\"Name\":\"(?P<ticker>[A-Z0-9]+)\"")
            tickers = rs.findall(top100.text)
            data_day = get_all_cc(limit=days).copy()

    if data_day.shape[1] == 0:
        st.error('No price data could be loaded.')
        st.stop()

    with st.spinner(f'Extracting {domain} features...'):
        dom = domain if domain != 'all' else None
        cfg_file = tsfel.get_features_by_domain(dom)

        # tsfel analysis: one extraction is run up front only to learn the
        # feature names. BTC is used as in the original; fall back to the
        # first column if BTC is not among the loaded coins.
        template_col = "BTC" if "BTC" in data_day.columns else data_day.columns[0]
        x_temp = tsfel.time_series_features_extractor(
            cfg_file, data_day[template_col], window_size=days)
        tf_columns = x_temp.columns
        xtf = pd.DataFrame(columns=data_day.columns, index=tf_columns)

        # Fill df with features
        for col in xtf.columns:
            xtf[col] = tsfel.time_series_features_extractor(
                cfg_file, data_day[col], window_size=days
            ).T
        xtf.dropna(inplace=True)

        # Features dataframe
        Xt = pd.DataFrame(scaler.fit_transform(xtf),
                          columns=data_day.columns, index=xtf.index)

    with st.spinner('Calculating optimal number of clusters...'):
        # Get optimal no of clusters
        n_clusters = elbow_study(Xt.T, model=KMeans)

    # Plot clusters
    plot_clusters_2(data_day, Xt=Xt, n_clusters=n_clusters,
                    random_state=random_state)

    # Plot umap
    # visualize(Xt, n_clusters)