|
|
import pickle |
|
|
|
|
|
import lightgbm as lgbm |
|
|
import matplotlib.pyplot as plt |
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
import seaborn as sns |
|
|
|
|
|
|
|
|
def fill_misisng_values(df): |
|
|
"""Fill NaN values in the 'sales' column with the mean of non-NaN values""" |
|
|
df_filled = df.copy() |
|
|
df_filled["sales"] = df_filled["sales"].fillna(df_filled["sales"].mean()) |
|
|
return df_filled |
|
|
|
|
|
|
|
|
def correct_outliers(df, factor=3): |
|
|
"""Identify and correct outliers in the 'sales' column by reducing them to the mean""" |
|
|
df_corrected = df.copy() |
|
|
|
|
|
|
|
|
z_scores = (df_corrected["sales"] - df_corrected["sales"].mean()) / df_corrected[ |
|
|
"sales" |
|
|
].std() |
|
|
outlier_indices = np.abs(z_scores) > factor |
|
|
|
|
|
df_corrected.loc[outlier_indices, "sales"] = df_corrected["sales"].mean() |
|
|
|
|
|
return df_corrected |
|
|
|
|
|
|
|
|
def get_sample_stores(df: pd.DataFrame, store_id: int = 1) -> pd.DataFrame: |
|
|
"""Get the sample stores with store_id""" |
|
|
grouped = df.groupby("store_id") |
|
|
sample_store = grouped.get_group((store_id)) |
|
|
return sample_store |
|
|
|
|
|
|
|
|
def save_data(df, file_path, file_format="feather"): |
|
|
""" |
|
|
Save a DataFrame to a specified file format. |
|
|
|
|
|
Parameters: |
|
|
- df (pd.DataFrame): The DataFrame to be saved. |
|
|
- file_path (str): The path where the file will be saved. |
|
|
- file_format (str): The format in which to save the file. Supported formats: 'feather', 'csv'. |
|
|
Default is 'feather'. |
|
|
Example: |
|
|
```python |
|
|
# Assuming df is the DataFrame you want to save |
|
|
save_data(df, 'output_data.feather', file_format='feather') |
|
|
``` |
|
|
|
|
|
Note: |
|
|
- Make sure to have the required libraries (pandas and feather-format) installed. |
|
|
""" |
|
|
if file_format.lower() == "feather": |
|
|
|
|
|
df.to_feather(file_path) |
|
|
print(f"DataFrame saved to {file_path} in Feather format.") |
|
|
elif file_format.lower() == "csv": |
|
|
|
|
|
df.to_csv(file_path, index=False) |
|
|
print(f"DataFrame saved to {file_path} in CSV format.") |
|
|
else: |
|
|
print( |
|
|
f"Error: Unsupported file format '{file_format}'. Supported formats: 'feather', 'csv'." |
|
|
) |
|
|
|
|
|
|
|
|
def flatten_prophet_predictions(predictions_dict): |
|
|
all_dfs = [] |
|
|
|
|
|
for store_item, df in predictions_dict.items(): |
|
|
df = df.copy() |
|
|
df["store_item"] = store_item |
|
|
all_dfs.append(df) |
|
|
|
|
|
return pd.concat(all_dfs, ignore_index=True) |
|
|
|
|
|
|
|
|
def load_model(file_path): |
|
|
""" |
|
|
Load a machine learning model from a file. |
|
|
|
|
|
Parameters: |
|
|
- file_path: The file path from where the model will be loaded. |
|
|
|
|
|
Returns: |
|
|
- The loaded model. |
|
|
""" |
|
|
try: |
|
|
with open(file_path, "rb") as file: |
|
|
model = pickle.load(file) |
|
|
print(f"Sklearn model loaded from {file_path}") |
|
|
|
|
|
except (pickle.UnpicklingError, FileNotFoundError): |
|
|
|
|
|
|
|
|
model = lgbm.Booster(model_file=file_path) |
|
|
print(f"LightGBM (scikit-learn API) model loaded from {file_path}") |
|
|
|
|
|
return model |
|
|
|
|
|
|
|
|
|
|
|
def weighted_absolute_percentage_error(y_true, y_pred): |
|
|
""" |
|
|
Calculate Weighted Absolute Percentage Error |
|
|
|
|
|
Args: |
|
|
y_true: Actual values |
|
|
y_pred: Predicted values |
|
|
|
|
|
Returns: |
|
|
WAPE value (percentage) |
|
|
""" |
|
|
y_true, y_pred = np.array(y_true), np.array(y_pred) |
|
|
return 100 * np.sum(np.abs(y_true - y_pred)) / np.sum(np.abs(y_true)) |
|
|
|