File size: 3,681 Bytes
84548c1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
import pickle
import lightgbm as lgbm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
def fill_misisng_values(df):
    """Return a copy of ``df`` whose missing 'sales' entries are mean-imputed.

    The replacement value is the mean of the 'sales' column (pandas skips
    NaN when averaging). The input frame is left untouched.
    """
    # Compute the replacement value up front from the untouched input.
    sales_mean = df["sales"].mean()

    result = df.copy()
    result["sales"] = result["sales"].fillna(sales_mean)
    return result
def correct_outliers(df, factor=3):
    """Replace outliers in the 'sales' column with the column mean.

    A value is treated as an outlier when the absolute value of its z-score
    (distance from the column mean in units of the column's standard
    deviation) is strictly greater than ``factor``.

    Parameters:
    - df (pd.DataFrame): Frame containing a numeric 'sales' column.
    - factor (float): z-score threshold above which a value is replaced.
      Default is 3.

    Returns:
    - pd.DataFrame: A copy of ``df`` with outlying 'sales' values replaced by
      the mean of the original (uncorrected) 'sales' column. The input frame
      is not modified.
    """
    df_corrected = df.copy()
    sales = df_corrected["sales"]

    # The mean is needed both for the z-score and as the replacement value,
    # so compute it once.
    sales_mean = sales.mean()

    # Identify outliers using the z-score. NaN z-scores (missing sales or a
    # zero standard deviation) compare False and are therefore left as-is.
    z_scores = (sales - sales_mean) / sales.std()
    is_outlier = np.abs(z_scores) > factor

    # Use mask() instead of an in-place .loc assignment: writing a float mean
    # into an integer column through .loc is an incompatible-dtype setitem,
    # whereas mask() builds a new, correctly upcast column.
    df_corrected["sales"] = sales.mask(is_outlier, sales_mean)
    return df_corrected
def get_sample_stores(df: pd.DataFrame, store_id: int = 1) -> pd.DataFrame:
    """Return the rows of ``df`` belonging to a single store.

    The frame is grouped on its 'store_id' column and the group matching
    ``store_id`` is returned; the lookup is delegated to pandas'
    ``get_group``, so a store that is not present is reported by pandas.
    """
    return df.groupby("store_id").get_group(store_id)
def save_data(df, file_path, file_format="feather"):
    """
    Write a DataFrame to disk as Feather or CSV.

    Parameters:
    - df (pd.DataFrame): The DataFrame to write.
    - file_path (str): Destination path of the output file.
    - file_format (str): Output format, matched case-insensitively.
      Supported values: 'feather' (default) and 'csv'.

    Returns:
    - None. A confirmation line is printed after a successful write. For an
      unsupported format an error line is printed and nothing is written;
      no exception is raised.

    Example:
    ```python
    save_data(df, 'output_data.csv', file_format='csv')
    ```

    Note:
    - CSV output is written without the index column.
    - Feather output goes through ``DataFrame.to_feather``, which needs a
      Feather backend to be installed alongside pandas.
    """
    fmt = file_format.lower()

    # Guard clause: report unknown formats (echoing the caller's original
    # spelling) and bail out without touching the filesystem.
    if fmt not in ("feather", "csv"):
        print(
            f"Error: Unsupported file format '{file_format}'. Supported formats: 'feather', 'csv'."
        )
        return

    if fmt == "csv":
        df.to_csv(file_path, index=False)
        print(f"DataFrame saved to {file_path} in CSV format.")
    else:
        df.to_feather(file_path)
        print(f"DataFrame saved to {file_path} in Feather format.")
def flatten_prophet_predictions(predictions_dict):
    """Stack per-key prediction frames into one long DataFrame.

    Parameters:
    - predictions_dict (dict): Maps a store/item key to a DataFrame.

    Returns:
    - pd.DataFrame: All frames concatenated in dictionary order with a fresh
      0..n-1 index and an added 'store_item' column holding each row's
      originating key. The input frames are not modified.
    """
    # assign() returns a new frame, so the caller's frames stay untouched.
    tagged_frames = [
        frame.assign(store_item=key) for key, frame in predictions_dict.items()
    ]
    return pd.concat(tagged_frames, ignore_index=True)
def load_model(file_path):
    """
    Load a model from disk, trying pickle first and LightGBM second.

    Parameters:
    - file_path: Path of the serialized model file.

    Returns:
    - The unpickled object when the file is a readable pickle; otherwise an
      ``lgbm.Booster`` constructed from ``file_path``.

    Notes:
    - NOTE(review): ``pickle.load`` can execute arbitrary code contained in
      the file; only use this on model files from a trusted source.
    - NOTE(review): a missing file is also routed to the LightGBM fallback,
      so the error surfaced for a bad path comes from LightGBM.
    """
    try:
        with open(file_path, "rb") as handle:
            unpickled = pickle.load(handle)
    except (pickle.UnpicklingError, FileNotFoundError):
        # Not a readable pickle (or no such file): hand the path to
        # LightGBM's own model-file loader instead.
        booster = lgbm.Booster(model_file=file_path)
        print(f"LightGBM (scikit-learn API) model loaded from {file_path}")
        return booster
    else:
        print(f"Sklearn model loaded from {file_path}")
        return unpickled
# Function to calculate WAPE (Weighted Absolute Percentage Error)
def weighted_absolute_percentage_error(y_true, y_pred):
    """
    Calculate Weighted Absolute Percentage Error.

    WAPE = 100 * sum(|y_true - y_pred|) / sum(|y_true|)

    Args:
        y_true: Actual values (array-like of numbers).
        y_pred: Predicted values (array-like, broadcastable against y_true).

    Returns:
        WAPE value (percentage) as a float. When the actual values sum to
        zero in absolute terms the division is left to numpy, which yields
        nan or inf rather than raising.
    """
    # Convert to float before subtracting: with unsigned-integer inputs the
    # difference would otherwise wrap around (e.g. uint8 1 - 2 == 255) and
    # silently inflate the error.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return 100 * np.sum(np.abs(y_true - y_pred)) / np.sum(np.abs(y_true))
|