Spaces:
Runtime error
Runtime error
| import os | |
| import matplotlib.pyplot as plt | |
| import numpy as np | |
| import pandas as pd | |
| import seaborn as sns | |
| from tsfeatures import ( | |
| tsfeatures, acf_features, arch_stat, crossing_points, | |
| entropy, flat_spots, heterogeneity, holt_parameters, | |
| lumpiness, nonlinearity, pacf_features, stl_features, | |
| stability, hw_parameters, unitroot_kpss, unitroot_pp, | |
| series_length, sparsity, hurst, statistics | |
| ) | |
# Runtime configuration, read from the environment when the module is imported.
# A missing variable raises KeyError at import time.
FILE_CATALOGUE = os.environ['FILE_CATALOGUE']
BUCKET_TIMENET = os.environ['BUCKET_TIMENET']
KEY_TIMENET = os.environ['KEY_TIMENET']

# Feature columns selected, in this order, to build the feature vector in
# tsfeatures_vector. The order fixes the position of every feature in the
# vector, so entries must not be reordered.
FEATS_COLS = [
    'hurst',
    'series_length',
    'unitroot_pp',
    'unitroot_kpss',
    'hw_alpha',
    'hw_beta',
    'hw_gamma',
    'stability',
    'nperiods',
    'seasonal_period',
    'trend_strength',
    'spike',
    'linearity',
    'curvature',
    'e_acf1',
    'e_acf10',
    'seasonal_strength',
    'peak',
    'trough',
    'x_pacf5',
    'diff1x_pacf5',
    'diff2x_pacf5',
    'seas_pacf',
    'nonlinearity',
    'lumpiness',
    'alpha',
    'beta',
    'flat_spots',
    'entropy',
    'crossing_points',
    'arch_lm',
    'x_acf1',
    'x_acf10',
    'diff1_acf1',
    'diff1_acf10',
    'diff2_acf1',
    'diff2_acf10',
    'seas_acf1',
    'sparsity',
    'total_sum',
    'mean',
    'variance',
    'median',
    'p2point5',
    'p5',
    'p25',
    'p75',
    'p95',
    'p97point5',
    'max',
    'min',
]
def tsfeatures_vector(df: pd.DataFrame, seasonality: int) -> list:
    """Build a min-max scaled feature vector from the series in ``df``.

    Args:
        df: Frame containing at least the columns ``unique_id``, ``ds``
            and ``y``; only these three columns are passed to ``tsfeatures``.
        seasonality: Forwarded to ``tsfeatures`` as ``freq``.

    Returns:
        A plain list with one value per entry of ``FEATS_COLS`` (same order),
        taken from the FIRST row of the computed feature table, with missing
        values replaced by 0 and then rescaled to the [0, 1] range using the
        minimum and maximum of the vector itself. If every feature has the
        same value the result is all zeros.
    """
    ts_df = tsfeatures(
        ts=df[['unique_id', 'ds', 'y']],
        freq=seasonality,
        features=[
            sparsity, acf_features, crossing_points,
            entropy, flat_spots, holt_parameters,
            lumpiness, nonlinearity, pacf_features, stl_features,
            stability, hw_parameters, unitroot_kpss, unitroot_pp,
            series_length, hurst, arch_stat, statistics,
        ],
        scale=False,
    ).rename(columns={'trend': 'trend_strength'})

    if seasonality == 1:
        # With seasonality == 1 the seasonal columns are set to NaN so that
        # selecting FEATS_COLS below always succeeds (presumably tsfeatures
        # does not emit them for non-seasonal data -- confirm against the
        # tsfeatures docs). They end up as 0 after the fillna below.
        for col in ('seasonal_strength', 'peak', 'trough', 'seas_pacf', 'seas_acf1'):
            ts_df[col] = np.nan

    # A single fillna over the selected columns also covers trend_strength
    # and seasonal_strength, so no separate fill is needed for them.
    vector = ts_df[FEATS_COLS].fillna(0).iloc[0].to_numpy(dtype=float)

    lowest = vector.min()
    span = vector.max() - lowest
    if span == 0:
        # All features are equal: avoid 0/0, which would yield a NaN vector.
        return np.zeros_like(vector).tolist()
    return ((vector - lowest) / span).tolist()
def get_closest_ids(x: list, top_k: int, index_pinecone):
    """Query ``index_pinecone`` with vector ``x`` and return its matches.

    Args:
        x: Query vector, passed as the ``vector`` argument.
        top_k: Passed as the ``top_k`` argument of the query.
        index_pinecone: Object exposing a ``query`` method that accepts the
            keyword arguments below and returns a mapping with a
            ``'matches'`` entry.

    Returns:
        The ``'matches'`` entry of the query response. Metadata is requested,
        vector values are not.
    """
    query_kwargs = {
        'vector': x,
        'top_k': top_k,
        'include_metadata': True,
        'include_values': False,
    }
    response = index_pinecone.query(**query_kwargs)
    return response['matches']
def highlight_smallest(s, nsmallest=3):
    """Return CSS background styles highlighting the smallest values of ``s``.

    Intended for use with ``DataFrame.style.apply``: the result has one CSS
    string per element of ``s``.

    Args:
        s: pandas Series of comparable values.
        nsmallest: How many ranks to highlight. At most three colours are
            defined, so ranks beyond the third are never highlighted.

    Returns:
        A list of strings: ``'background-color: <colour>;'`` for values ranked
        1st (lightgreen), 2nd (lightblue) or 3rd (lightpink), and ``''`` for
        everything else. Ties share the lowest rank of the group
        (``method="min"``); missing values get no style.
    """
    colors = ['lightgreen', 'lightblue', 'lightpink']
    # Keep ranks as floats: NaN inputs yield NaN ranks, which an integer cast
    # cannot represent and which simply fail the range test below.
    ranks = s.rank(method="min")
    # Never index past the available colours, whatever nsmallest is.
    last_rank = min(nsmallest, len(colors))
    return [
        'background-color: {};'.format(colors[int(rank) - 1])
        if 1 <= rank <= last_rank else ''
        for rank in ranks
    ]
def plot_best_models_count(ids, catalogue):
    """Summarise and plot which models perform best on the given series.

    Args:
        ids: Iterable of mappings, each with an ``'id'`` entry holding a
            ``unique_id`` that is a label of ``catalogue``'s index.
        catalogue: DataFrame indexed by ``unique_id`` with a
            ``file_evaluation`` column holding parquet paths. Each parquet
            file is read and must provide the columns ``unique_id``,
            ``metric``, ``model`` and ``value``, including a ``'Naive'``
            model and an ``'mae'`` metric.

    Returns:
        A ``(fig, summary_df)`` tuple. ``fig`` is the object returned by
        ``sns.catplot``: a count plot of the best model per series, one
        column per metric (``"mase"`` excluded). ``summary_df`` is a pandas
        Styler over the median metric value per model relative to
        ``'Naive'``, sorted by ``mae``, with the three smallest values of
        each metric highlighted.

    Raises:
        ValueError: If ``ids`` is empty.
    """
    uids = [x['id'] for x in ids]
    if not uids:
        raise ValueError('ids must contain at least one match')

    file_evaluations = catalogue['file_evaluation'].loc[uids].unique()
    eval_df = pd.concat(
        [pd.read_parquet(f_eval) for f_eval in file_evaluations]
    ).query('unique_id in @uids')
    eval_df = pd.pivot(
        eval_df,
        index=['unique_id', 'metric'],
        columns='model',
        values='value',
    ).reset_index()
    models = list(eval_df.drop(columns=['unique_id', 'metric']).columns)

    # Compute metrics relative to Naive in ONE step. Dividing column by
    # column would overwrite 'Naive' with 1.0 when its turn comes, leaving
    # every model column processed after it unscaled.
    eval_df[models] = eval_df[models].div(eval_df['Naive'], axis=0)

    summary_df = eval_df.groupby('metric')[models].median().T
    summary_df = summary_df[summary_df.index != 'Naive'].sort_values('mae')
    summary_df = summary_df.style.apply(highlight_smallest, nsmallest=3, axis=0)

    eval_df['BestModel'] = eval_df[models].idxmin(axis=1)
    fig = sns.catplot(
        data=eval_df.query('metric != "mase"'),
        y='BestModel',
        kind='count',
        col='metric',
    )
    return fig, summary_df
def plot_closest_series(Y_df, id, catalogue):
    """Plot the uploaded series next to its closest catalogue series.

    Args:
        Y_df: DataFrame with columns ``unique_id``, ``ds`` and ``y``; drawn
            on the left subplot and labelled with its first ``unique_id``.
        id: ``unique_id`` of the closest series. It must be a label of
            ``catalogue``'s index.
        catalogue: DataFrame indexed by ``unique_id`` whose rows provide
            ``file_timenet`` (parquet path of the series data), ``dataset``,
            ``subdataset`` and ``ts_name``.

    Returns:
        The matplotlib figure holding both subplots. ``plt.show()`` is also
        called before returning.
    """
    # Read the series from the catalogue's file_timenet parquet file and keep
    # only the rows of the requested series.
    uid_catalogue = catalogue.loc[id]
    timenet_df = pd.read_parquet(uid_catalogue.file_timenet)
    closest_df = timenet_df[timenet_df['unique_id'] == id]

    # One row, two columns: uploaded series on the left, match on the right.
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

    unique_id_Y_df = Y_df['unique_id'].unique()[0]
    unique_id_closest_df = closest_df['unique_id'].unique()[0]

    # Both lines carry a label so that each legend() call below has an entry
    # to show.
    sns.lineplot(x='ds', y='y', ax=axes[0], data=Y_df, label=unique_id_Y_df)
    sns.lineplot(x='ds', y='y', ax=axes[1], data=closest_df, label=unique_id_closest_df)

    axes[0].set_title('Uploaded Dataset')
    axes[1].set_title(f'TimenetTimeSeries:{uid_catalogue.dataset},{uid_catalogue.subdataset},{uid_catalogue.ts_name}')

    axes[0].legend()
    axes[1].legend()

    plt.tight_layout()
    plt.show()
    return fig
def get_catalogue():
    """Read the parquet file at ``FILE_CATALOGUE`` and return it as a DataFrame."""
    catalogue = pd.read_parquet(FILE_CATALOGUE)
    return catalogue