import os from typing import Tuple from views.admin_view import AdminView import streamlit as st from controllers.user_controller import UserController import pandas as pd from air_quality_forecast.prediction import PredictorModels class AdminController(UserController): """ A class to handle the admin interface. Inherits from UserController """ def __init__(self) -> None: """ Initializes the AdminController class. """ super().__init__() self._view = AdminView() self._distribution_means, self._distribution_stds = ( self._compute_distribution_statistics() ) def show_dashboard(self) -> None: """ Shows the main page of the admin interface. """ switch = self._view.show_admin_pages() if switch == "Display Predictions": if not self._is_current_data_available(): self._view.data_not_available() else: self._show_current_data() self._display_plots() elif switch == "Make Predictions": self._make_custom_predictions() elif switch == "Feature Importances": self._feature_importance() elif switch == "Model Metrics": self._model_metrics() def _feature_importance(self) -> Tuple[list, list]: """ Retrieves the feature importance values and their corresponding feature names. Returns: Tuple[list, list]: A tuple containing the feature names and their importance values. """ # This is sad but has to be done like this current_dir = os.path.dirname(os.path.abspath(__file__)) parent_dir = os.path.dirname(current_dir) grandparent_dir = os.path.dirname(parent_dir) feature_importances_path = os.path.join( grandparent_dir, "data", "other", "feature_importance.csv", ) feature_importances = pd.read_csv(feature_importances_path) self._view.display_feature_importance(feature_importances) def _show_current_data(self) -> None: """ Shows the current data on the main page of the user interface. """ merged_data_df = self._prepare_data_for_view() self._check_data_out_of_distribution(self._model.get_all_data_last_three_days()) self._view.show_current_data(merged_data_df) def _model_metrics(self) -> None: """ Computes the metrics for the admin interface. """ df = self._model.calculate_metrics() self._view.display_datatable(df, "Model Metrics over Last Three Days") def _compute_distribution_statistics(self) -> Tuple[pd.DataFrame, pd.DataFrame]: """ Computes the means and standard deviations of the features in the dataset. Returns: A tuple of two DataFrames. The first DataFrame contains the means of the features and the second DataFrame contains the standard deviations of the features. """ current_dir = os.path.dirname(os.path.abspath(__file__)) parent_dir = os.path.dirname(current_dir) grandparent_dir = os.path.dirname(parent_dir) distribution_data = pd.read_csv( os.path.join( grandparent_dir, "data", "processed/", "v2_merged_selected_features_with_missing.csv", ), index_col=0, ) distribution_means = ( distribution_data.mean().reset_index(drop=False).transpose() ) distribution_means.columns = distribution_means.iloc[0] distribution_means = distribution_means[1:] distribution_stds = distribution_data.std().reset_index(drop=False).transpose() distribution_stds.columns = distribution_stds.iloc[0] distribution_stds = distribution_stds[1:] formatted_means = pd.concat( [ distribution_means.add_suffix(" - day 0"), distribution_means.add_suffix(" - day 1"), distribution_means.add_suffix(" - day 2"), ], axis=1, ) formatted_stds = pd.concat( [ distribution_stds.add_suffix(" - day 0"), distribution_stds.add_suffix(" - day 1"), distribution_stds.add_suffix(" - day 2"), ], axis=1, ) return formatted_means, formatted_stds def _make_custom_predictions(self) -> None: """ Makes a custom prediction for the admin interface. """ self._view.upload_instructions() checks = { "The data must be unnormalized.": False, "PM25, PM10, O3, NO2 should be in micrograms per cubic meter (µg/m³).": False, "Temperature should be in degrees Celcius": False, "Humidity should be in percentages": False, "Visibility should be in kilometers": False, "Solar Radiation should be in watts per square meter (W/m²).": False, "Precipitation should be in millimeters": False, "Wind Speed should be in kilometers per hour (km/h).": False, "Wind Direction should be in degrees": False, "The dataset must contain a total of 33 columns in the specified order.": False, "I accept that my data will be used for a prediction using a custom model.": False, "I understand that my data will not be saved.": False, } all_checks_marked = self._view.confirm_checks(checks) if all_checks_marked: dataset = self._view.upload_dataset() if dataset is not None: data = pd.read_csv(dataset) if not self._data_is_valid(data): return self._check_data_out_of_distribution(data) if "date" in data.columns or "datetime" in data.columns: data.set_index( "date" if "date" in data.columns else "datetime", inplace=True ) self._view.display_datatable(data, message="### User Data") prediction = self._make_prediction(data) self._view.display_datatable( prediction, message="### XGBoost Model Prediction (first 6 rows)" ) self._view.download_button( label="Download predictions as CSV", data=prediction.to_csv(index=True), file_name="predictions.csv", ) else: st.warning("Please confirm all the requirements by marking the checkboxes.") def _make_prediction(self, data: pd.DataFrame) -> pd.DataFrame: """ Makes a prediction using an XGBoost model. Args: data (pd.DataFrame): The data to make the prediction on. Returns: pd.DataFrame: The prediction. """ predictor = PredictorModels() prediction = predictor.xgb_predictions(data, normalized=False) prediction = pd.DataFrame( prediction, columns=[ "NO2 + day 1", "O3 + day 1", "NO2 + day 2", "O3 + day 2", "NO2 + day 3", "O3 + day 3", ], index=data.index, ) return prediction def _data_is_valid(self, data: pd.DataFrame) -> bool: """ Performs data validation on the uploaded user data. Args: data (pd.DataFrame): The user data. Returns: bool: True if the data is valid, otherwise False. """ columns = data.columns expected_columns_count = 33 has_date_column = "date" in columns if has_date_column: if len(columns) != expected_columns_count + 1: self._view.error( f"Invalid column count. Expected {expected_columns_count + 1} columns including 'date', but got {len(columns)}." ) return False else: # Dataset should have exactly 33 columns if len(columns) != expected_columns_count: self._view.error( f"Invalid column count. Expected {expected_columns_count} columns, but got {len(columns)}." ) return False data_without_date = data.drop(columns=["date"], errors="ignore") if ( not data_without_date.map( lambda x: isinstance(x, (float, int)) or pd.isna(x) ) .all() .all() ): self._view.error( "The dataset contains values that are not floats or NaN. All values must be floats or NaN." ) return False if not ((data_without_date >= 0) | data_without_date.isna()).all().all(): self._view.error( "The dataset contains negative values. All values must be positive." ) return False # If all checks passed self._view.success("Data validation passed successfully.") return True def _check_data_out_of_distribution( self, input_data: pd.DataFrame, threshold: float = 3 ) -> bool: """ Checks if the input data is out of distribution compared to the training data. Displays which features exceed the threshold. Args: data (pd.DataFrame): The new input data to validate. threshold (float): The Z-score threshold to determine out of distribution. Returns: bool: True if the input data is out of distribution, False otherwise. """ if "date" in input_data.columns: input_data.drop("date", axis=1, inplace=True) z_scores = ( input_data - self._distribution_means.values.squeeze() ) / self._distribution_stds.values.squeeze() out_of_distribution_flags = z_scores.abs() > threshold ood_rows = out_of_distribution_flags.sum(axis=1) if ood_rows.any(): error_message = f"Input data might contain out-of-distribution values. {ood_rows.sum()} {'feature exceeds' if ood_rows.sum() == 1 else 'features exceed'} exceed the z-score threshold. Model prediction might be inaccurate.\n\n" ood_details = z_scores[out_of_distribution_flags] for index, row in ood_details.iterrows(): ood_features = row.dropna().index.tolist() ood_values = input_data.loc[index, ood_features] # Loop through each feature individually for better readability for feature, value in zip(ood_features, ood_values): error_message += f"Row {index + 1}: The feature '{feature}' has a value of {value:.2f}.\n\n" self._view.error(error_message) return self._view.success("Input data is within the expected distribution.")