Nuzz23 commited on
Commit
5c8c1e3
·
1 Parent(s): e3cde9c

final fix

Browse files
Files changed (2) hide show
  1. app.py +0 -3
  2. utils.py +33 -31
app.py CHANGED
@@ -1,9 +1,6 @@
1
  import gradio as gr
2
  import os
3
  import torch
4
- import numpy as np
5
- import pandas as pd
6
- import matplotlib.pyplot as plt
7
 
8
  from chronos import Chronos2Pipeline
9
 
 
1
  import gradio as gr
2
  import os
3
  import torch
 
 
 
4
 
5
  from chronos import Chronos2Pipeline
6
 
utils.py CHANGED
@@ -1,4 +1,3 @@
1
- import torch
2
  import os
3
  import numpy as np
4
  import pandas as pd
@@ -6,6 +5,8 @@ import matplotlib.pyplot as plt
6
 
7
  from chronos import Chronos2Pipeline
8
 
 
 
9
 
10
  MIN_LENGTH_CONTEXT = 64
11
  PREDICTION_LENGTH = 32
@@ -34,7 +35,7 @@ def validateData(file, timestamp_column:str=None):
34
  """
35
  assert os.path.getsize(file) < 256 * 1024 * 1024, "File size exceeds the maximum limit of 256MB. Please upload a smaller file."
36
  assert file is not None, "No file uploaded. Please upload a CSV file containing your time series data."
37
- assert file.endswith('.csv'), "Invalid file format. Please upload a CSV file."
38
  df = pd.read_csv(file, index_col=None, header=0)
39
  if timestamp_column is not None:
40
  assert timestamp_column in df.columns, f"Timestamp column '{timestamp_column}' not found in the uploaded file. Please provide a valid timestamp column name."
@@ -93,7 +94,7 @@ def preProcessData(file, timestamp_column:str=None)->list[pd.DataFrame, pd.Seri
93
 
94
 
95
  ## Main prediction function
96
- def predictData(chronos2:Chronos2Pipeline, preProcessedData:pd.DataFrame, target_cols:list[str]) -> dict[str, pd.DataFrame]:
97
  """ Predict future values for the time series data using the Chronos2 pipeline. This function performs the following steps:
98
  1. Identifies the segments of the time series data that require predictions based on the 'item_id' column.
99
  2. For each segment, prepares the input data for the Chronos2 pipeline by selecting the appropriate context length of historical data.
@@ -106,9 +107,9 @@ def predictData(chronos2:Chronos2Pipeline, preProcessedData:pd.DataFrame, target
106
  target_cols (list[str]): A list of target column names for which predictions are to be made.
107
 
108
  Returns:
109
- dict[str, pd.DataFrame]: A dictionary where keys are target column names and values are DataFrames containing the predictions for each column.
110
  """
111
- prediction_item_ids = [iid for iid in sorted(preProcessedData['item_id'].unique()) if iid > 0][:-1] # Exclude last segment, since we don't have the labels for it (it would be in the future of the series)
112
 
113
  tasks, segment_start_indices = [], [] # Track where each segment starts in original data
114
  for item_id in prediction_item_ids:
@@ -152,15 +153,11 @@ def predictData(chronos2:Chronos2Pipeline, preProcessedData:pd.DataFrame, target
152
 
153
  # For each target column (variate)
154
  for d_idx, col in enumerate(target_cols):
155
- seg_df = pd.DataFrame({
156
  'item_id': prediction_item_ids[seg_idx], # item_id for this segment
157
  'timestep': np.arange(seg_start, seg_start + PREDICTION_LENGTH),
158
- })
159
- # Add quantile columns
160
- for q_idx, q_level in enumerate(chronos2.quantiles):
161
- seg_df[str(q_level)] = pred_np[d_idx, q_idx, :]
162
-
163
- predictions_dict[col].append(seg_df)
164
 
165
  # Track original indices
166
  all_indices.extend(range(seg_start, seg_start + PREDICTION_LENGTH))
@@ -234,7 +231,7 @@ def aggregateAnomalyScores(continuousScores: dict[str, np.ndarray], percentile:
234
 
235
 
236
  ## Computing the discrete anomaly scores and labels
237
- def computeDiscreteScores(predictions_dict: dict[str, pd.DataFrame], time_series_df: pd.DataFrame, target_cols:list[str],
238
  horizons: list[int] = [1, 8, 32, 64])-> pd.Series:
239
  """ Compute discrete anomaly scores and labels based on the predictions from the Chronos2 pipeline. This function performs the following steps:
240
  1. Identifies the indices in the original time series corresponding to the predictions made by the Chronos2 pipeline.
@@ -245,18 +242,16 @@ def computeDiscreteScores(predictions_dict: dict[str, pd.DataFrame], time_series
245
  predictions_dict (dict[str, pd.DataFrame]): A dictionary where keys are target column names and values are DataFrames containing the predictions for each column.
246
  time_series_df (pd.DataFrame): The original time series DataFrame containing the actual values for each target column.
247
  target_cols (list[str]): A list of target column names for which predictions were made and anomaly scores are to be computed.
 
248
  horizons (list[int], optional): A list of forecast horizons to consider for computing anomaly scores. Default is [1, 8, 32, 64].
249
 
250
  Returns:
251
  pd.Series: A pandas Series containing the discrete anomaly labels (1 for anomaly, 0 for normal) for each time point in the original time series.
252
  """
253
-
254
- predictionIndexes = predictions_dict[target_cols[0]]['item_id'].index[predictions_dict[target_cols[0]]['item_id'] > 0].tolist() # Indices of predictions in the original time series
255
-
256
  continuousScores = {col: computeMultiHorizonAnomalyScore(
257
  predictions_df=predictions_dict[col],
258
  actual_values=time_series_df[col].values,
259
- prediction_indices=predictionIndexes,
260
  horizons=horizons
261
  ) for col in target_cols}
262
 
@@ -264,7 +259,7 @@ def computeDiscreteScores(predictions_dict: dict[str, pd.DataFrame], time_series
264
 
265
 
266
 
267
- def assembleResults(preProcessedData: pd.DataFrame, timestamp_old: pd.Series|None, target_cols:list[str], scores: pd.Series)->pd.DataFrame:
268
  """
269
  Assemble the final results DataFrame containing the original time series data along with the computed anomaly labels. This function performs the following steps:
270
  1. Creates a copy of the pre-processed DataFrame to serve as the base for the results.
@@ -277,7 +272,7 @@ def assembleResults(preProcessedData: pd.DataFrame, timestamp_old: pd.Series|Non
277
  preProcessedData (pd.DataFrame): The pre-processed DataFrame formatted for Chronos2 input, which serves as the base for the results.
278
  timestamp_old (pd.Series|None): The original timestamp values extracted from the uploaded data, or None if no timestamp column was provided.
279
  target_cols (list[str]): A list of target column names for which predictions were made and anomaly scores were computed.
280
- scores (pd.Series): A pandas Series containing the discrete anomaly labels (1 for anomaly, 0 for normal) for each time point corresponding to the predictions.
281
 
282
  Returns:
283
  pd.DataFrame: A DataFrame containing the original time series data along with an additional column 'anomaly_label' indicating the anomaly labels, and the original timestamps if available.
@@ -285,7 +280,9 @@ def assembleResults(preProcessedData: pd.DataFrame, timestamp_old: pd.Series|Non
285
 
286
  result_df = preProcessedData.copy()
287
 
288
- result_df['anomaly_label'] = pd.concat([pd.Series([-1] * (len(preProcessedData) - len(scores)), index=preProcessedData.index[len(scores):]), scores])
 
 
289
  if timestamp_old is not None:
290
  result_df['timestamp'] = timestamp_old
291
  return result_df.drop(columns=['item_id'])
@@ -310,23 +307,28 @@ def plotResults(df, target_cols:list[str]=None)->plt.Figure|None:
310
 
311
  if df.shape[1] - 2 < MAX_NUMBER_OF_PLOTTABLE_SERIES and df.shape[0] < MAX_NUMBER_OF_POINTS_PLOTTABLE: # -2 to exclude timestamp and anomaly_label columns
312
  fig, ax = plt.subplots(1,1,figsize=(15, 5))
313
- df['timestamp_2'] = df['timestamp'] if 'timestamp' in df.columns else df.index
314
-
315
  for col in target_cols:
316
- ax.plot(df['timestamp_2'], df[col], label=col)
 
 
 
 
 
 
 
 
 
317
 
318
- for _, row in df[df['anomaly_label'] == 1].iterrows():
319
- ax.axvspan( row['timestamp_2'] - pd.Timedelta(minutes=0.5), row['timestamp_2'] + pd.Timedelta(minutes=0.5),
320
- color='red', alpha=0.15 )
321
-
322
  ax.legend()
323
  ax.set_title('Time Series with Detected Anomalies')
324
  ax.set_xlabel('Timestamp')
325
  ax.set_ylabel('Values')
326
- plt.xticks(rotation=45)
 
327
  plt.tight_layout()
328
- df.drop(columns=['timestamp_2'], inplace=True)
329
  return fig # Return the figure object
330
  else:
331
- return None
332
-
 
 
1
  import os
2
  import numpy as np
3
  import pandas as pd
 
5
 
6
  from chronos import Chronos2Pipeline
7
 
8
+ OUT_PATH = "./savedPredictions/results.csv"
9
+
10
 
11
  MIN_LENGTH_CONTEXT = 64
12
  PREDICTION_LENGTH = 32
 
35
  """
36
  assert os.path.getsize(file) < 256 * 1024 * 1024, "File size exceeds the maximum limit of 256MB. Please upload a smaller file."
37
  assert file is not None, "No file uploaded. Please upload a CSV file containing your time series data."
38
+ assert file.endswith('.csv') and os.path.basename(file).count(".") == 1, "Invalid file format. Please upload a CSV file."
39
  df = pd.read_csv(file, index_col=None, header=0)
40
  if timestamp_column is not None:
41
  assert timestamp_column in df.columns, f"Timestamp column '{timestamp_column}' not found in the uploaded file. Please provide a valid timestamp column name."
 
94
 
95
 
96
  ## Main prediction function
97
+ def predictData(chronos2:Chronos2Pipeline, preProcessedData:pd.DataFrame, target_cols:list[str]) -> tuple[dict[str, pd.DataFrame], np.ndarray]:
98
  """ Predict future values for the time series data using the Chronos2 pipeline. This function performs the following steps:
99
  1. Identifies the segments of the time series data that require predictions based on the 'item_id' column.
100
  2. For each segment, prepares the input data for the Chronos2 pipeline by selecting the appropriate context length of historical data.
 
107
  target_cols (list[str]): A list of target column names for which predictions are to be made.
108
 
109
  Returns:
110
+ tuple[dict[str, pd.DataFrame], np.ndarray]: A tuple where the first element is a dictionary where keys are target column names and values are DataFrames containing the predictions for each column, and the second element is an array of indices in the original time series that correspond to the predictions.
111
  """
112
+ prediction_item_ids = [iid for iid in sorted(preProcessedData['item_id'].unique()) if iid > 0]
113
 
114
  tasks, segment_start_indices = [], [] # Track where each segment starts in original data
115
  for item_id in prediction_item_ids:
 
153
 
154
  # For each target column (variate)
155
  for d_idx, col in enumerate(target_cols):
156
+ predictions_dict[col].append(pd.DataFrame({
157
  'item_id': prediction_item_ids[seg_idx], # item_id for this segment
158
  'timestep': np.arange(seg_start, seg_start + PREDICTION_LENGTH),
159
+ 'predictions': pred_np[d_idx, 10, :]
160
+ }))
 
 
 
 
161
 
162
  # Track original indices
163
  all_indices.extend(range(seg_start, seg_start + PREDICTION_LENGTH))
 
231
 
232
 
233
  ## Computing the discrete anomaly scores and labels
234
+ def computeDiscreteScores(predictions_dict: dict[str, pd.DataFrame], time_series_df: pd.DataFrame, target_cols:list[str], indexes: np.ndarray,
235
  horizons: list[int] = [1, 8, 32, 64])-> pd.Series:
236
  """ Compute discrete anomaly scores and labels based on the predictions from the Chronos2 pipeline. This function performs the following steps:
237
  1. Identifies the indices in the original time series corresponding to the predictions made by the Chronos2 pipeline.
 
242
  predictions_dict (dict[str, pd.DataFrame]): A dictionary where keys are target column names and values are DataFrames containing the predictions for each column.
243
  time_series_df (pd.DataFrame): The original time series DataFrame containing the actual values for each target column.
244
  target_cols (list[str]): A list of target column names for which predictions were made and anomaly scores are to be computed.
245
+ indexes (np.ndarray): The array of indices in the original time series that correspond to the predictions.
246
  horizons (list[int], optional): A list of forecast horizons to consider for computing anomaly scores. Default is [1, 8, 32, 64].
247
 
248
  Returns:
249
  pd.Series: A pandas Series containing the discrete anomaly labels (1 for anomaly, 0 for normal) for each time point in the original time series.
250
  """
 
 
 
251
  continuousScores = {col: computeMultiHorizonAnomalyScore(
252
  predictions_df=predictions_dict[col],
253
  actual_values=time_series_df[col].values,
254
+ prediction_indices=indexes,
255
  horizons=horizons
256
  ) for col in target_cols}
257
 
 
259
 
260
 
261
 
262
+ def assembleResults(preProcessedData: pd.DataFrame, timestamp_old: pd.Series|None, target_cols:list[str], scores: np.ndarray)->pd.DataFrame:
263
  """
264
  Assemble the final results DataFrame containing the original time series data along with the computed anomaly labels. This function performs the following steps:
265
  1. Creates a copy of the pre-processed DataFrame to serve as the base for the results.
 
272
  preProcessedData (pd.DataFrame): The pre-processed DataFrame formatted for Chronos2 input, which serves as the base for the results.
273
  timestamp_old (pd.Series|None): The original timestamp values extracted from the uploaded data, or None if no timestamp column was provided.
274
  target_cols (list[str]): A list of target column names for which predictions were made and anomaly scores were computed.
275
+ scores (np.ndarray): A numpy array containing the discrete anomaly labels (1 for anomaly, 0 for normal) for each time point corresponding to the predictions.
276
 
277
  Returns:
278
  pd.DataFrame: A DataFrame containing the original time series data along with an additional column 'anomaly_label' indicating the anomaly labels, and the original timestamps if available.
 
280
 
281
  result_df = preProcessedData.copy()
282
 
283
+ result_df['anomaly_label'] = -1
284
+ result_df.loc[result_df['item_id'] > 0, 'anomaly_label'] = scores[:len(result_df)-MIN_LENGTH_CONTEXT]
285
+
286
  if timestamp_old is not None:
287
  result_df['timestamp'] = timestamp_old
288
  return result_df.drop(columns=['item_id'])
 
307
 
308
  if df.shape[1] - 2 < MAX_NUMBER_OF_PLOTTABLE_SERIES and df.shape[0] < MAX_NUMBER_OF_POINTS_PLOTTABLE: # -2 to exclude timestamp and anomaly_label columns
309
  fig, ax = plt.subplots(1,1,figsize=(15, 5))
310
+ df['temp_timestamp_for_print'] = np.arange(len(df)) # posizione numerica dei punti
311
+
312
  for col in target_cols:
313
+ ax.plot(df['temp_timestamp_for_print'], df[col], label=col)
314
+
315
+ # Evidenziazione anomalie con background rosso
316
+ for _, row in df[df['anomaly_label'] == 1].iterrows():
317
+ ax.axvspan(
318
+ row['temp_timestamp_for_print'] - 0.5,
319
+ row['temp_timestamp_for_print'] + 0.5,
320
+ color='red',
321
+ alpha=0.15
322
+ )
323
 
 
 
 
 
324
  ax.legend()
325
  ax.set_title('Time Series with Detected Anomalies')
326
  ax.set_xlabel('Timestamp')
327
  ax.set_ylabel('Values')
328
+ ax.grid(True, which='both', linestyle='--', linewidth=0.5)
329
+ ax.set_xticks(df['temp_timestamp_for_print'][::max(1, len(df)//10)], labels=df['timestamp'][::max(1, len(df)//10)], rotation=45)
330
  plt.tight_layout()
331
+ df.drop(columns=['temp_timestamp_for_print'], inplace=True)
332
  return fig # Return the figure object
333
  else:
334
+ return None