Spaces:
Sleeping
Sleeping
final fix
Browse files
app.py
CHANGED
|
@@ -1,9 +1,6 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import os
|
| 3 |
import torch
|
| 4 |
-
import numpy as np
|
| 5 |
-
import pandas as pd
|
| 6 |
-
import matplotlib.pyplot as plt
|
| 7 |
|
| 8 |
from chronos import Chronos2Pipeline
|
| 9 |
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import os
|
| 3 |
import torch
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
from chronos import Chronos2Pipeline
|
| 6 |
|
utils.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
import torch
|
| 2 |
import os
|
| 3 |
import numpy as np
|
| 4 |
import pandas as pd
|
|
@@ -6,6 +5,8 @@ import matplotlib.pyplot as plt
|
|
| 6 |
|
| 7 |
from chronos import Chronos2Pipeline
|
| 8 |
|
|
|
|
|
|
|
| 9 |
|
| 10 |
MIN_LENGTH_CONTEXT = 64
|
| 11 |
PREDICTION_LENGTH = 32
|
|
@@ -34,7 +35,7 @@ def validateData(file, timestamp_column:str=None):
|
|
| 34 |
"""
|
| 35 |
assert os.path.getsize(file) < 256 * 1024 * 1024, "File size exceeds the maximum limit of 256MB. Please upload a smaller file."
|
| 36 |
assert file is not None, "No file uploaded. Please upload a CSV file containing your time series data."
|
| 37 |
-
assert file.endswith('.csv'), "Invalid file format. Please upload a CSV file."
|
| 38 |
df = pd.read_csv(file, index_col=None, header=0)
|
| 39 |
if timestamp_column is not None:
|
| 40 |
assert timestamp_column in df.columns, f"Timestamp column '{timestamp_column}' not found in the uploaded file. Please provide a valid timestamp column name."
|
|
@@ -93,7 +94,7 @@ def preProcessData(file, timestamp_column:str=None)->list[pd.DataFrame, pd.Seri
|
|
| 93 |
|
| 94 |
|
| 95 |
## Main prediction function
|
| 96 |
-
def predictData(chronos2:Chronos2Pipeline, preProcessedData:pd.DataFrame, target_cols:list[str]) -> dict[str, pd.DataFrame]:
|
| 97 |
""" Predict future values for the time series data using the Chronos2 pipeline. This function performs the following steps:
|
| 98 |
1. Identifies the segments of the time series data that require predictions based on the 'item_id' column.
|
| 99 |
2. For each segment, prepares the input data for the Chronos2 pipeline by selecting the appropriate context length of historical data.
|
|
@@ -106,9 +107,9 @@ def predictData(chronos2:Chronos2Pipeline, preProcessedData:pd.DataFrame, target
|
|
| 106 |
target_cols (list[str]): A list of target column names for which predictions are to be made.
|
| 107 |
|
| 108 |
Returns:
|
| 109 |
-
dict[str, pd.DataFrame]: A dictionary where keys are target column names and values are DataFrames containing the predictions for each column.
|
| 110 |
"""
|
| 111 |
-
prediction_item_ids = [iid for iid in sorted(preProcessedData['item_id'].unique()) if iid > 0]
|
| 112 |
|
| 113 |
tasks, segment_start_indices = [], [] # Track where each segment starts in original data
|
| 114 |
for item_id in prediction_item_ids:
|
|
@@ -152,15 +153,11 @@ def predictData(chronos2:Chronos2Pipeline, preProcessedData:pd.DataFrame, target
|
|
| 152 |
|
| 153 |
# For each target column (variate)
|
| 154 |
for d_idx, col in enumerate(target_cols):
|
| 155 |
-
|
| 156 |
'item_id': prediction_item_ids[seg_idx], # item_id for this segment
|
| 157 |
'timestep': np.arange(seg_start, seg_start + PREDICTION_LENGTH),
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
for q_idx, q_level in enumerate(chronos2.quantiles):
|
| 161 |
-
seg_df[str(q_level)] = pred_np[d_idx, q_idx, :]
|
| 162 |
-
|
| 163 |
-
predictions_dict[col].append(seg_df)
|
| 164 |
|
| 165 |
# Track original indices
|
| 166 |
all_indices.extend(range(seg_start, seg_start + PREDICTION_LENGTH))
|
|
@@ -234,7 +231,7 @@ def aggregateAnomalyScores(continuousScores: dict[str, np.ndarray], percentile:
|
|
| 234 |
|
| 235 |
|
| 236 |
## Computing the discrete anomaly scores and labels
|
| 237 |
-
def computeDiscreteScores(predictions_dict: dict[str, pd.DataFrame], time_series_df: pd.DataFrame, target_cols:list[str],
|
| 238 |
horizons: list[int] = [1, 8, 32, 64])-> pd.Series:
|
| 239 |
""" Compute discrete anomaly scores and labels based on the predictions from the Chronos2 pipeline. This function performs the following steps:
|
| 240 |
1. Identifies the indices in the original time series corresponding to the predictions made by the Chronos2 pipeline.
|
|
@@ -245,18 +242,16 @@ def computeDiscreteScores(predictions_dict: dict[str, pd.DataFrame], time_series
|
|
| 245 |
predictions_dict (dict[str, pd.DataFrame]): A dictionary where keys are target column names and values are DataFrames containing the predictions for each column.
|
| 246 |
time_series_df (pd.DataFrame): The original time series DataFrame containing the actual values for each target column.
|
| 247 |
target_cols (list[str]): A list of target column names for which predictions were made and anomaly scores are to be computed.
|
|
|
|
| 248 |
horizons (list[int], optional): A list of forecast horizons to consider for computing anomaly scores. Default is [1, 8, 32, 64].
|
| 249 |
|
| 250 |
Returns:
|
| 251 |
pd.Series: A pandas Series containing the discrete anomaly labels (1 for anomaly, 0 for normal) for each time point in the original time series.
|
| 252 |
"""
|
| 253 |
-
|
| 254 |
-
predictionIndexes = predictions_dict[target_cols[0]]['item_id'].index[predictions_dict[target_cols[0]]['item_id'] > 0].tolist() # Indices of predictions in the original time series
|
| 255 |
-
|
| 256 |
continuousScores = {col: computeMultiHorizonAnomalyScore(
|
| 257 |
predictions_df=predictions_dict[col],
|
| 258 |
actual_values=time_series_df[col].values,
|
| 259 |
-
prediction_indices=
|
| 260 |
horizons=horizons
|
| 261 |
) for col in target_cols}
|
| 262 |
|
|
@@ -264,7 +259,7 @@ def computeDiscreteScores(predictions_dict: dict[str, pd.DataFrame], time_series
|
|
| 264 |
|
| 265 |
|
| 266 |
|
| 267 |
-
def assembleResults(preProcessedData: pd.DataFrame, timestamp_old: pd.Series|None, target_cols:list[str], scores:
|
| 268 |
"""
|
| 269 |
Assemble the final results DataFrame containing the original time series data along with the computed anomaly labels. This function performs the following steps:
|
| 270 |
1. Creates a copy of the pre-processed DataFrame to serve as the base for the results.
|
|
@@ -277,7 +272,7 @@ def assembleResults(preProcessedData: pd.DataFrame, timestamp_old: pd.Series|Non
|
|
| 277 |
preProcessedData (pd.DataFrame): The pre-processed DataFrame formatted for Chronos2 input, which serves as the base for the results.
|
| 278 |
timestamp_old (pd.Series|None): The original timestamp values extracted from the uploaded data, or None if no timestamp column was provided.
|
| 279 |
target_cols (list[str]): A list of target column names for which predictions were made and anomaly scores were computed.
|
| 280 |
-
scores (
|
| 281 |
|
| 282 |
Returns:
|
| 283 |
pd.DataFrame: A DataFrame containing the original time series data along with an additional column 'anomaly_label' indicating the anomaly labels, and the original timestamps if available.
|
|
@@ -285,7 +280,9 @@ def assembleResults(preProcessedData: pd.DataFrame, timestamp_old: pd.Series|Non
|
|
| 285 |
|
| 286 |
result_df = preProcessedData.copy()
|
| 287 |
|
| 288 |
-
result_df['anomaly_label'] =
|
|
|
|
|
|
|
| 289 |
if timestamp_old is not None:
|
| 290 |
result_df['timestamp'] = timestamp_old
|
| 291 |
return result_df.drop(columns=['item_id'])
|
|
@@ -310,23 +307,28 @@ def plotResults(df, target_cols:list[str]=None)->plt.Figure|None:
|
|
| 310 |
|
| 311 |
if df.shape[1] - 2 < MAX_NUMBER_OF_PLOTTABLE_SERIES and df.shape[0] < MAX_NUMBER_OF_POINTS_PLOTTABLE: # -2 to exclude timestamp and anomaly_label columns
|
| 312 |
fig, ax = plt.subplots(1,1,figsize=(15, 5))
|
| 313 |
-
df['
|
| 314 |
-
|
| 315 |
for col in target_cols:
|
| 316 |
-
ax.plot(df['
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 317 |
|
| 318 |
-
for _, row in df[df['anomaly_label'] == 1].iterrows():
|
| 319 |
-
ax.axvspan( row['timestamp_2'] - pd.Timedelta(minutes=0.5), row['timestamp_2'] + pd.Timedelta(minutes=0.5),
|
| 320 |
-
color='red', alpha=0.15 )
|
| 321 |
-
|
| 322 |
ax.legend()
|
| 323 |
ax.set_title('Time Series with Detected Anomalies')
|
| 324 |
ax.set_xlabel('Timestamp')
|
| 325 |
ax.set_ylabel('Values')
|
| 326 |
-
|
|
|
|
| 327 |
plt.tight_layout()
|
| 328 |
-
df.drop(columns=['
|
| 329 |
return fig # Return the figure object
|
| 330 |
else:
|
| 331 |
-
return None
|
| 332 |
-
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
import numpy as np
|
| 3 |
import pandas as pd
|
|
|
|
| 5 |
|
| 6 |
from chronos import Chronos2Pipeline
|
| 7 |
|
| 8 |
+
OUT_PATH = "./savedPredictions/results.csv"
|
| 9 |
+
|
| 10 |
|
| 11 |
MIN_LENGTH_CONTEXT = 64
|
| 12 |
PREDICTION_LENGTH = 32
|
|
|
|
| 35 |
"""
|
| 36 |
assert os.path.getsize(file) < 256 * 1024 * 1024, "File size exceeds the maximum limit of 256MB. Please upload a smaller file."
|
| 37 |
assert file is not None, "No file uploaded. Please upload a CSV file containing your time series data."
|
| 38 |
+
assert file.endswith('.csv') and os.path.basename(file).count(".") == 1, "Invalid file format. Please upload a CSV file."
|
| 39 |
df = pd.read_csv(file, index_col=None, header=0)
|
| 40 |
if timestamp_column is not None:
|
| 41 |
assert timestamp_column in df.columns, f"Timestamp column '{timestamp_column}' not found in the uploaded file. Please provide a valid timestamp column name."
|
|
|
|
| 94 |
|
| 95 |
|
| 96 |
## Main prediction function
|
| 97 |
+
def predictData(chronos2:Chronos2Pipeline, preProcessedData:pd.DataFrame, target_cols:list[str]) -> tuple[dict[str, pd.DataFrame], np.ndarray]:
|
| 98 |
""" Predict future values for the time series data using the Chronos2 pipeline. This function performs the following steps:
|
| 99 |
1. Identifies the segments of the time series data that require predictions based on the 'item_id' column.
|
| 100 |
2. For each segment, prepares the input data for the Chronos2 pipeline by selecting the appropriate context length of historical data.
|
|
|
|
| 107 |
target_cols (list[str]): A list of target column names for which predictions are to be made.
|
| 108 |
|
| 109 |
Returns:
|
| 110 |
+
tuple[dict[str, pd.DataFrame], np.ndarray]: A tuple where the first element is a dictionary where keys are target column names and values are DataFrames containing the predictions for each column, and the second element is an array of indices in the original time series that correspond to the predictions.
|
| 111 |
"""
|
| 112 |
+
prediction_item_ids = [iid for iid in sorted(preProcessedData['item_id'].unique()) if iid > 0]
|
| 113 |
|
| 114 |
tasks, segment_start_indices = [], [] # Track where each segment starts in original data
|
| 115 |
for item_id in prediction_item_ids:
|
|
|
|
| 153 |
|
| 154 |
# For each target column (variate)
|
| 155 |
for d_idx, col in enumerate(target_cols):
|
| 156 |
+
predictions_dict[col].append(pd.DataFrame({
|
| 157 |
'item_id': prediction_item_ids[seg_idx], # item_id for this segment
|
| 158 |
'timestep': np.arange(seg_start, seg_start + PREDICTION_LENGTH),
|
| 159 |
+
'predictions': pred_np[d_idx, 10, :]
|
| 160 |
+
}))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
|
| 162 |
# Track original indices
|
| 163 |
all_indices.extend(range(seg_start, seg_start + PREDICTION_LENGTH))
|
|
|
|
| 231 |
|
| 232 |
|
| 233 |
## Computing the discrete anomaly scores and labels
|
| 234 |
+
def computeDiscreteScores(predictions_dict: dict[str, pd.DataFrame], time_series_df: pd.DataFrame, target_cols:list[str], indexes: np.ndarray,
|
| 235 |
horizons: list[int] = [1, 8, 32, 64])-> pd.Series:
|
| 236 |
""" Compute discrete anomaly scores and labels based on the predictions from the Chronos2 pipeline. This function performs the following steps:
|
| 237 |
1. Identifies the indices in the original time series corresponding to the predictions made by the Chronos2 pipeline.
|
|
|
|
| 242 |
predictions_dict (dict[str, pd.DataFrame]): A dictionary where keys are target column names and values are DataFrames containing the predictions for each column.
|
| 243 |
time_series_df (pd.DataFrame): The original time series DataFrame containing the actual values for each target column.
|
| 244 |
target_cols (list[str]): A list of target column names for which predictions were made and anomaly scores are to be computed.
|
| 245 |
+
indexes (np.ndarray): The array of indices in the original time series that correspond to the predictions.
|
| 246 |
horizons (list[int], optional): A list of forecast horizons to consider for computing anomaly scores. Default is [1, 8, 32, 64].
|
| 247 |
|
| 248 |
Returns:
|
| 249 |
pd.Series: A pandas Series containing the discrete anomaly labels (1 for anomaly, 0 for normal) for each time point in the original time series.
|
| 250 |
"""
|
|
|
|
|
|
|
|
|
|
| 251 |
continuousScores = {col: computeMultiHorizonAnomalyScore(
|
| 252 |
predictions_df=predictions_dict[col],
|
| 253 |
actual_values=time_series_df[col].values,
|
| 254 |
+
prediction_indices=indexes,
|
| 255 |
horizons=horizons
|
| 256 |
) for col in target_cols}
|
| 257 |
|
|
|
|
| 259 |
|
| 260 |
|
| 261 |
|
| 262 |
+
def assembleResults(preProcessedData: pd.DataFrame, timestamp_old: pd.Series|None, target_cols:list[str], scores: np.ndarray)->pd.DataFrame:
|
| 263 |
"""
|
| 264 |
Assemble the final results DataFrame containing the original time series data along with the computed anomaly labels. This function performs the following steps:
|
| 265 |
1. Creates a copy of the pre-processed DataFrame to serve as the base for the results.
|
|
|
|
| 272 |
preProcessedData (pd.DataFrame): The pre-processed DataFrame formatted for Chronos2 input, which serves as the base for the results.
|
| 273 |
timestamp_old (pd.Series|None): The original timestamp values extracted from the uploaded data, or None if no timestamp column was provided.
|
| 274 |
target_cols (list[str]): A list of target column names for which predictions were made and anomaly scores were computed.
|
| 275 |
+
scores (np.ndarray): A numpy array containing the discrete anomaly labels (1 for anomaly, 0 for normal) for each time point corresponding to the predictions.
|
| 276 |
|
| 277 |
Returns:
|
| 278 |
pd.DataFrame: A DataFrame containing the original time series data along with an additional column 'anomaly_label' indicating the anomaly labels, and the original timestamps if available.
|
|
|
|
| 280 |
|
| 281 |
result_df = preProcessedData.copy()
|
| 282 |
|
| 283 |
+
result_df['anomaly_label'] = -1
|
| 284 |
+
result_df.loc[result_df['item_id'] > 0, 'anomaly_label'] = scores[:len(result_df)-MIN_LENGTH_CONTEXT]
|
| 285 |
+
|
| 286 |
if timestamp_old is not None:
|
| 287 |
result_df['timestamp'] = timestamp_old
|
| 288 |
return result_df.drop(columns=['item_id'])
|
|
|
|
| 307 |
|
| 308 |
if df.shape[1] - 2 < MAX_NUMBER_OF_PLOTTABLE_SERIES and df.shape[0] < MAX_NUMBER_OF_POINTS_PLOTTABLE: # -2 to exclude timestamp and anomaly_label columns
|
| 309 |
fig, ax = plt.subplots(1,1,figsize=(15, 5))
|
| 310 |
+
df['temp_timestamp_for_print'] = np.arange(len(df)) # posizione numerica dei punti
|
| 311 |
+
|
| 312 |
for col in target_cols:
|
| 313 |
+
ax.plot(df['temp_timestamp_for_print'], df[col], label=col)
|
| 314 |
+
|
| 315 |
+
# Evidenziazione anomalie con background rosso
|
| 316 |
+
for _, row in df[df['anomaly_label'] == 1].iterrows():
|
| 317 |
+
ax.axvspan(
|
| 318 |
+
row['temp_timestamp_for_print'] - 0.5,
|
| 319 |
+
row['temp_timestamp_for_print'] + 0.5,
|
| 320 |
+
color='red',
|
| 321 |
+
alpha=0.15
|
| 322 |
+
)
|
| 323 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 324 |
ax.legend()
|
| 325 |
ax.set_title('Time Series with Detected Anomalies')
|
| 326 |
ax.set_xlabel('Timestamp')
|
| 327 |
ax.set_ylabel('Values')
|
| 328 |
+
ax.grid(True, which='both', linestyle='--', linewidth=0.5)
|
| 329 |
+
ax.set_xticks(df['temp_timestamp_for_print'][::max(1, len(df)//10)], labels=df['timestamp'][::max(1, len(df)//10)], rotation=45)
|
| 330 |
plt.tight_layout()
|
| 331 |
+
df.drop(columns=['temp_timestamp_for_print'], inplace=True)
|
| 332 |
return fig # Return the figure object
|
| 333 |
else:
|
| 334 |
+
return None
|
|
|