Spaces:
Sleeping
Sleeping
File size: 11,173 Bytes
327acb9 0573e79 b2381d4 09a9428 c558af7 6cb65b2 b2381d4 7b24758 c558af7 6cb65b2 f41e99b 6cb65b2 c558af7 0573e79 b2381d4 6cb65b2 4566dcf 45dcae3 4566dcf 13b62f3 4566dcf 13b62f3 7b24758 13b62f3 6cb65b2 2453f6b 4566dcf 6cb65b2 2453f6b 6cb65b2 0573e79 6cb65b2 31e91d0 6cb65b2 69a67ba 327acb9 6cb65b2 69a67ba 6cb65b2 69a67ba 6cb65b2 327acb9 6cb65b2 327acb9 6cb65b2 327acb9 6cb65b2 327acb9 da0acef 327acb9 f72f700 327acb9 c682a19 327acb9 2453f6b c682a19 0573e79 327acb9 c682a19 327acb9 c682a19 327acb9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 |
import os
from typing import Tuple
from views.admin_view import AdminView
import streamlit as st
from controllers.user_controller import UserController
import pandas as pd
from air_quality_forecast.prediction import PredictorModels
class AdminController(UserController):
    """
    Controller for the admin interface. Inherits the shared behaviour of
    UserController and adds the admin-only pages: custom predictions on
    uploaded data, feature importances, and model metrics.
    """

    def __init__(self) -> None:
        """
        Initializes the AdminController and pre-computes the training-data
        distribution statistics used for out-of-distribution checks.
        """
        super().__init__()
        self._view = AdminView()
        # Per-feature means/stds of the training data, expanded to the
        # "<feature> - day N" column layout of the three-day input frames.
        self._distribution_means, self._distribution_stds = (
            self._compute_distribution_statistics()
        )

    def show_dashboard(self) -> None:
        """
        Shows the main page of the admin interface and dispatches to the
        page the admin selected in the view.
        """
        switch = self._view.show_admin_pages()
        if switch == "Display Predictions":
            if not self._is_current_data_available():
                self._view.data_not_available()
            else:
                self._show_current_data()
                self._display_plots()
        elif switch == "Make Predictions":
            self._make_custom_predictions()
        elif switch == "Feature Importances":
            self._feature_importance()
        elif switch == "Model Metrics":
            self._model_metrics()

    @staticmethod
    def _project_root() -> str:
        """
        Returns the repository root: two directory levels above the directory
        containing this file (controllers/ -> src/ -> root, presumably).
        """
        # This is sad but has to be done like this
        current_dir = os.path.dirname(os.path.abspath(__file__))
        return os.path.dirname(os.path.dirname(current_dir))

    def _feature_importance(self) -> None:
        """
        Loads the pre-computed feature-importance table from disk and hands
        it to the view for display.

        NOTE(review): the original signature claimed `Tuple[list, list]` but
        the method never returned a value; annotation corrected to None.
        """
        feature_importances_path = os.path.join(
            self._project_root(),
            "data",
            "other",
            "feature_importance.csv",
        )
        feature_importances = pd.read_csv(feature_importances_path)
        self._view.display_feature_importance(feature_importances)

    def _show_current_data(self) -> None:
        """
        Shows the current data on the main page of the admin interface,
        warning about out-of-distribution values first.
        """
        merged_data_df = self._prepare_data_for_view()
        self._check_data_out_of_distribution(self._model.get_all_data_last_three_days())
        self._view.show_current_data(merged_data_df)

    def _model_metrics(self) -> None:
        """
        Computes the model metrics and displays them as a datatable.
        """
        df = self._model.calculate_metrics()
        self._view.display_datatable(df, "Model Metrics over Last Three Days")

    def _compute_distribution_statistics(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Computes the means and standard deviations of the features in the
        training dataset.

        Returns:
            A tuple of two single-row DataFrames: the first contains the
            feature means, the second the feature standard deviations. Each
            feature appears three times, suffixed " - day 0/1/2", to line up
            with the three-day input column layout.
        """
        distribution_data = pd.read_csv(
            os.path.join(
                self._project_root(),
                "data",
                "processed",
                "v2_merged_selected_features_with_missing.csv",
            ),
            index_col=0,
        )

        def as_row(series: pd.Series) -> pd.DataFrame:
            # Turn a per-feature Series into a one-row frame whose columns
            # are the feature names.
            frame = series.reset_index(drop=False).transpose()
            frame.columns = frame.iloc[0]
            return frame[1:]

        def expand_days(row: pd.DataFrame) -> pd.DataFrame:
            # Repeat the statistics for each of the three lag days.
            return pd.concat(
                [row.add_suffix(f" - day {day}") for day in range(3)],
                axis=1,
            )

        formatted_means = expand_days(as_row(distribution_data.mean()))
        formatted_stds = expand_days(as_row(distribution_data.std()))
        return formatted_means, formatted_stds

    def _make_custom_predictions(self) -> None:
        """
        Lets the admin upload a dataset, validates it, and displays and
        offers for download the XGBoost predictions made on it.
        """
        self._view.upload_instructions()
        # Every requirement must be ticked before an upload is accepted.
        checks = {
            "The data must be unnormalized.": False,
            "PM25, PM10, O3, NO2 should be in micrograms per cubic meter (µg/m³).": False,
            "Temperature should be in degrees Celsius": False,
            "Humidity should be in percentages": False,
            "Visibility should be in kilometers": False,
            "Solar Radiation should be in watts per square meter (W/m²).": False,
            "Precipitation should be in millimeters": False,
            "Wind Speed should be in kilometers per hour (km/h).": False,
            "Wind Direction should be in degrees": False,
            "The dataset must contain a total of 33 columns in the specified order.": False,
            "I accept that my data will be used for a prediction using a custom model.": False,
            "I understand that my data will not be saved.": False,
        }
        all_checks_marked = self._view.confirm_checks(checks)
        if not all_checks_marked:
            # NOTE(review): warning goes through streamlit directly rather
            # than the view, unlike every other message in this class.
            st.warning("Please confirm all the requirements by marking the checkboxes.")
            return

        dataset = self._view.upload_dataset()
        if dataset is None:
            return
        data = pd.read_csv(dataset)
        if not self._data_is_valid(data):
            return
        self._check_data_out_of_distribution(data)
        if "date" in data.columns or "datetime" in data.columns:
            data.set_index(
                "date" if "date" in data.columns else "datetime", inplace=True
            )
        self._view.display_datatable(data, message="### User Data")
        prediction = self._make_prediction(data)
        self._view.display_datatable(
            prediction, message="### XGBoost Model Prediction (first 6 rows)"
        )
        self._view.download_button(
            label="Download predictions as CSV",
            data=prediction.to_csv(index=True),
            file_name="predictions.csv",
        )

    def _make_prediction(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Makes a prediction using the XGBoost model.

        Args:
            data (pd.DataFrame): The (unnormalized) data to predict on.

        Returns:
            pd.DataFrame: NO2/O3 predictions for the next three days,
            indexed like the input data.
        """
        predictor = PredictorModels()
        prediction = predictor.xgb_predictions(data, normalized=False)
        prediction_columns = [
            f"{pollutant} + day {day}"
            for day in (1, 2, 3)
            for pollutant in ("NO2", "O3")
        ]
        return pd.DataFrame(prediction, columns=prediction_columns, index=data.index)

    def _data_is_valid(self, data: pd.DataFrame) -> bool:
        """
        Performs data validation on the uploaded user data: column count
        (33, plus an optional 'date' column), numeric-or-NaN values only,
        and no negative values.

        Args:
            data (pd.DataFrame): The user data.

        Returns:
            bool: True if the data is valid, otherwise False.
        """
        columns = data.columns
        expected_columns_count = 33
        # An optional leading 'date' column raises the expected count by one.
        expected = expected_columns_count + 1 if "date" in columns else expected_columns_count
        if len(columns) != expected:
            if "date" in columns:
                self._view.error(
                    f"Invalid column count. Expected {expected_columns_count + 1} columns including 'date', but got {len(columns)}."
                )
            else:
                self._view.error(
                    f"Invalid column count. Expected {expected_columns_count} columns, but got {len(columns)}."
                )
            return False
        data_without_date = data.drop(columns=["date"], errors="ignore")
        # Element-wise check: every cell must be a float/int or NaN.
        all_numeric_or_nan = (
            data_without_date.map(lambda x: isinstance(x, (float, int)) or pd.isna(x))
            .all()
            .all()
        )
        if not all_numeric_or_nan:
            self._view.error(
                "The dataset contains values that are not floats or NaN. All values must be floats or NaN."
            )
            return False
        if not ((data_without_date >= 0) | data_without_date.isna()).all().all():
            self._view.error(
                "The dataset contains negative values. All values must be positive."
            )
            return False
        # If all checks passed
        self._view.success("Data validation passed successfully.")
        return True

    def _check_data_out_of_distribution(
        self, input_data: pd.DataFrame, threshold: float = 3
    ) -> bool:
        """
        Checks if the input data is out of distribution compared to the
        training data and reports the offending features through the view.

        Args:
            input_data (pd.DataFrame): The new input data to validate.
            threshold (float): The absolute Z-score above which a value is
                considered out of distribution.

        Returns:
            bool: True if the input data is out of distribution, False otherwise.
        """
        # Drop 'date' on a copy so the caller's DataFrame is not mutated
        # (the original inplace drop stripped the column from the caller too).
        if "date" in input_data.columns:
            input_data = input_data.drop(columns=["date"])
        z_scores = (
            input_data - self._distribution_means.values.squeeze()
        ) / self._distribution_stds.values.squeeze()
        out_of_distribution_flags = z_scores.abs() > threshold
        ood_rows = out_of_distribution_flags.sum(axis=1)
        if ood_rows.any():
            total_flagged = ood_rows.sum()
            error_message = f"Input data might contain out-of-distribution values. {total_flagged} {'feature exceeds' if total_flagged == 1 else 'features exceed'} the z-score threshold. Model prediction might be inaccurate.\n\n"
            ood_details = z_scores[out_of_distribution_flags]
            for index, row in ood_details.iterrows():
                ood_features = row.dropna().index.tolist()
                ood_values = input_data.loc[index, ood_features]
                # Loop through each feature individually for better readability.
                # NOTE(review): assumes a numeric (range) index — "index + 1"
                # would fail for a datetime/string index; confirm upstream.
                for feature, value in zip(ood_features, ood_values):
                    error_message += f"Row {index + 1}: The feature '{feature}' has a value of {value:.2f}.\n\n"
            self._view.error(error_message)
            return True
        self._view.success("Input data is within the expected distribution.")
        return False
|