Spaces:

Nixtla
/

timenet-features

Runtime error

App Files Files Community

timenet-features / src /utils.py

azulgarza

feat: add timenet table

56baf6d over 2 years ago

raw

history blame contribute delete

5.36 kB

	import os

	import matplotlib.pyplot as plt
	import numpy as np
	import pandas as pd
	import seaborn as sns

	from tsfeatures import (
	tsfeatures, acf_features, arch_stat, crossing_points,
	entropy, flat_spots, heterogeneity, holt_parameters,
	lumpiness, nonlinearity, pacf_features, stl_features,
	stability, hw_parameters, unitroot_kpss, unitroot_pp,
	series_length, sparsity, hurst, statistics
	)


	# Required settings, read from the process environment when the module is
	# imported. A missing variable raises KeyError at import time.
	FILE_CATALOGUE, BUCKET_TIMENET, KEY_TIMENET = (
	    os.environ[setting]
	    for setting in ('FILE_CATALOGUE', 'BUCKET_TIMENET', 'KEY_TIMENET')
	)


	# Feature column names, in the exact order used by tsfeatures_vector to lay
	# out the feature vector. The order is significant: do not sort or regroup.
	FEATS_COLS = [
	    'hurst', 'series_length', 'unitroot_pp', 'unitroot_kpss',
	    'hw_alpha', 'hw_beta', 'hw_gamma', 'stability',
	    'nperiods', 'seasonal_period', 'trend_strength', 'spike',
	    'linearity', 'curvature', 'e_acf1', 'e_acf10',
	    'seasonal_strength', 'peak', 'trough', 'x_pacf5',
	    'diff1x_pacf5', 'diff2x_pacf5', 'seas_pacf', 'nonlinearity',
	    'lumpiness', 'alpha', 'beta', 'flat_spots',
	    'entropy', 'crossing_points', 'arch_lm', 'x_acf1',
	    'x_acf10', 'diff1_acf1', 'diff1_acf10', 'diff2_acf1',
	    'diff2_acf10', 'seas_acf1', 'sparsity', 'total_sum',
	    'mean', 'variance', 'median', 'p2point5',
	    'p5', 'p25', 'p75', 'p95',
	    'p97point5', 'max', 'min',
	]

	def tsfeatures_vector(df: pd.DataFrame, seasonality: int) -> list:
	    """Return a min-max scaled feature vector for the first series in ``df``.

	    The features are computed with ``tsfeatures`` (unscaled), arranged in
	    ``FEATS_COLS`` order with missing values replaced by 0, and then
	    rescaled so the smallest entry maps to 0 and the largest to 1.

	    Args:
	        df: Frame containing at least the columns ``unique_id``, ``ds``
	            and ``y``. Only the first row of the computed feature table
	            is used.
	        seasonality: Seasonal period, forwarded to ``tsfeatures`` as
	            ``freq``.

	    Returns:
	        A plain list with one number per entry of ``FEATS_COLS``. When
	        every feature has the same value the list is all zeros.
	    """
	    ts_df = tsfeatures(
	        ts=df[['unique_id', 'ds', 'y']],
	        freq=seasonality,
	        features=[sparsity, acf_features, crossing_points,
	                  entropy, flat_spots, holt_parameters,
	                  lumpiness, nonlinearity, pacf_features, stl_features,
	                  stability, hw_parameters, unitroot_kpss, unitroot_pp,
	                  series_length, hurst, arch_stat, statistics],
	        scale=False,
	    ).rename(columns={'trend': 'trend_strength'})
	    if seasonality == 1:
	        # Add the seasonal columns that are missing when seasonality == 1
	        # (presumably tsfeatures omits them for non-seasonal data), so the
	        # FEATS_COLS selection below does not fail.
	        ts_df[['seasonal_strength', 'peak', 'trough', 'seas_pacf', 'seas_acf1']] = np.nan
	    ts_df[['trend_strength', 'seasonal_strength']] = ts_df[['trend_strength', 'seasonal_strength']].fillna(0)
	    vector = ts_df[FEATS_COLS].fillna(0).iloc[0].values
	    lowest = vector.min()
	    spread = vector.max() - lowest
	    if spread == 0:
	        # A constant vector would turn into 0/0 = NaN for every entry;
	        # return zeros instead so the result stays usable as a query.
	        return [0.0] * len(vector)
	    return ((vector - lowest) / spread).tolist()

	def get_closest_ids(x: list, top_k: int, index_pinecone):
	    """Query ``index_pinecone`` with vector ``x`` and return its matches.

	    Args:
	        x: Query vector.
	        top_k: Number of results requested from the index.
	        index_pinecone: Object exposing a ``query`` method that accepts
	            the keyword arguments ``top_k``, ``include_values``,
	            ``include_metadata`` and ``vector``.

	    Returns:
	        The ``'matches'`` entry of the query response. Metadata is
	        requested, stored vector values are not.
	    """
	    query_kwargs = {
	        'top_k': top_k,
	        'include_values': False,
	        'include_metadata': True,
	        'vector': x,
	    }
	    return index_pinecone.query(**query_kwargs)['matches']

	def highlight_smallest(s, nsmallest=3):
	    """Return CSS background styles marking the smallest values of ``s``.

	    Values are ranked ascending with ties sharing the lowest rank
	    (``method="min"``). Rank 1 is painted light green, rank 2 light blue
	    and rank 3 light pink; everything else gets an empty style.

	    Args:
	        s: pandas Series of comparable values (e.g. one column handed in
	            by ``Styler.apply``).
	        nsmallest: How many of the lowest ranks to highlight. Only three
	            colors exist, so values above 3 behave like 3.

	    Returns:
	        A list of CSS strings, one per element of ``s`` in order.
	    """
	    colors = ['lightgreen', 'lightblue', 'lightpink']
	    # Never index past the palette, even if the caller asks for more ranks.
	    n_styled = min(nsmallest, len(colors))

	    # Ranks stay as floats so missing values keep a NaN rank instead of
	    # failing an integer cast.
	    ranks = s.rank(method="min")

	    styles = []
	    for rank in ranks:
	        # A NaN rank fails both comparisons, so missing values stay unstyled.
	        if 1 <= rank <= n_styled:
	            styles.append('background-color: {};'.format(colors[int(rank) - 1]))
	        else:
	            styles.append('')
	    return styles

	def plot_best_models_count(ids, catalogue):
	    """Summarise and plot which models perform best on the given series.

	    Args:
	        ids: Iterable of mappings, each with an ``'id'`` key holding a
	            series identifier.
	        catalogue: DataFrame indexed by series identifier with a
	            ``file_evaluation`` column of parquet paths. Each evaluation
	            file must provide the columns ``unique_id``, ``metric``,
	            ``model`` and ``value``, and include a ``Naive`` model.

	    Returns:
	        A tuple ``(fig, summary_df)``: ``fig`` is the object returned by
	        ``sns.catplot`` counting the best model per series and metric
	        (``mase`` rows excluded); ``summary_df`` is a pandas ``Styler``
	        with the median metric relative to ``Naive`` per model, sorted by
	        ``mae``, with the three lowest values of each column highlighted.
	    """
	    uids = [match['id'] for match in ids]
	    file_evaluations = catalogue['file_evaluation'].loc[uids].unique()
	    eval_df = pd.concat(
	        [pd.read_parquet(f_eval) for f_eval in file_evaluations]
	    ).query('unique_id in @uids')
	    eval_df = pd.pivot(
	        eval_df,
	        index=['unique_id', 'metric'],
	        columns='model',
	        values='value'
	    ).reset_index()
	    models = eval_df.drop(columns=['unique_id', 'metric']).columns
	    # Express every metric relative to the Naive baseline. The division is
	    # done in one step against the untouched baseline: dividing column by
	    # column in place would reset 'Naive' to 1 midway and leave the model
	    # columns processed after it un-normalised.
	    eval_df[models] = eval_df[models].div(eval_df['Naive'], axis=0)
	    summary_df = eval_df.groupby('metric')[models].median().T
	    summary_df = summary_df[summary_df.index != 'Naive'].sort_values('mae')
	    summary_df = summary_df.style.apply(highlight_smallest, nsmallest=3, axis=0)
	    eval_df['BestModel'] = eval_df[models].idxmin(axis=1)
	    fig = sns.catplot(eval_df.query('metric != "mase"'), y='BestModel', kind='count', col='metric')
	    return fig, summary_df

	def plot_closest_series(Y_df, id, catalogue):
	    """Plot the uploaded series next to a catalogue series.

	    Args:
	        Y_df: DataFrame with the uploaded series; needs the columns
	            ``unique_id``, ``ds`` and ``y``.
	        id: Identifier of the catalogue series. It is used both as the
	            ``catalogue`` index label and as the ``unique_id`` filter on
	            the series file.
	        catalogue: DataFrame indexed by series identifier, providing
	            ``file_timenet`` (parquet path), ``dataset``, ``subdataset``
	            and ``ts_name`` for each entry.

	    Returns:
	        The matplotlib figure holding both subplots.
	    """
	    # Read the series from the catalogue entry's file_timenet parquet file.
	    uid_catalogue = catalogue.loc[id]
	    closest_df = pd.read_parquet(uid_catalogue.file_timenet).query('unique_id == @id')

	    # Create a figure with 1 row and 2 columns
	    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15,5))

	    # Get the unique_id for each DataFrame
	    unique_id_Y_df = Y_df['unique_id'].unique()[0]
	    unique_id_closest_df = closest_df['unique_id'].unique()[0]

	    # Plot 'y' against 'ds' for both frames. Each line is labelled with its
	    # unique_id so that both legends below have an entry to show.
	    sns.lineplot(x='ds', y='y', ax=axes[0], data=Y_df, label=unique_id_Y_df)
	    sns.lineplot(x='ds', y='y', ax=axes[1], data=closest_df, label=unique_id_closest_df)

	    # Set the titles for the subplots
	    axes[0].set_title('Uploaded Dataset')
	    axes[1].set_title(f'TimenetTimeSeries:{uid_catalogue.dataset},{uid_catalogue.subdataset},{uid_catalogue.ts_name}')

	    # Show legend on each subplot
	    axes[0].legend()
	    axes[1].legend()

	    # Display the plot
	    plt.tight_layout()
	    plt.show()
	    return fig

	def get_catalogue():
	    """Load and return the parquet file located at ``FILE_CATALOGUE``."""
	    catalogue = pd.read_parquet(FILE_CATALOGUE)
	    return catalogue