File size: 11,173 Bytes
327acb9
0573e79
b2381d4
09a9428
c558af7
6cb65b2
 
b2381d4
7b24758
c558af7
6cb65b2
f41e99b
6cb65b2
 
 
 
 
 
c558af7
 
0573e79
 
 
b2381d4
6cb65b2
 
 
 
4566dcf
 
 
 
45dcae3
 
 
 
 
4566dcf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13b62f3
 
 
 
 
 
 
 
 
4566dcf
13b62f3
7b24758
13b62f3
6cb65b2
2453f6b
 
 
 
 
 
 
 
4566dcf
6cb65b2
 
 
2453f6b
 
6cb65b2
0573e79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6cb65b2
 
 
 
 
 
 
 
 
31e91d0
 
 
 
 
 
6cb65b2
 
 
 
 
 
 
 
 
 
 
69a67ba
327acb9
 
 
 
 
6cb65b2
 
 
 
69a67ba
6cb65b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69a67ba
6cb65b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
327acb9
6cb65b2
327acb9
6cb65b2
 
 
327acb9
 
 
6cb65b2
327acb9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
da0acef
327acb9
 
 
 
 
 
 
 
 
 
f72f700
327acb9
 
 
 
 
 
 
 
 
 
c682a19
327acb9
 
 
 
 
 
 
 
 
 
 
 
2453f6b
 
c682a19
 
0573e79
 
327acb9
 
 
 
 
 
c682a19
327acb9
 
 
 
c682a19
 
 
 
 
327acb9
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
import os
from typing import Tuple
from views.admin_view import AdminView
import streamlit as st
from controllers.user_controller import UserController
import pandas as pd
from air_quality_forecast.prediction import PredictorModels


class AdminController(UserController):
    """
    A class to handle the admin interface. Inherits from UserController
    """

    def __init__(self) -> None:
        """
        Initializes the AdminController class.

        Replaces the inherited view with an AdminView and precomputes the
        training-data distribution statistics used later by
        _check_data_out_of_distribution.
        """
        super().__init__()
        # Admin-specific view replaces whatever the base controller set up.
        self._view = AdminView()
        # Per-feature means/stds of the training set, cached once so every
        # out-of-distribution check can z-score against them without re-reading
        # the CSV from disk.
        self._distribution_means, self._distribution_stds = (
            self._compute_distribution_statistics()
        )

    def show_dashboard(self) -> None:
        """
        Shows the main page of the admin interface.

        Renders the admin page selector and dispatches to the handler that
        matches the selected page.
        """
        selected_page = self._view.show_admin_pages()

        if selected_page == "Display Predictions":
            if self._is_current_data_available():
                self._show_current_data()
                self._display_plots()
            else:
                self._view.data_not_available()
            return

        page_handlers = {
            "Make Predictions": self._make_custom_predictions,
            "Feature Importances": self._feature_importance,
            "Model Metrics": self._model_metrics,
        }
        handler = page_handlers.get(selected_page)
        if handler is not None:
            handler()

    def _feature_importance(self) -> None:
        """
        Loads the precomputed feature importances from disk and displays them.

        The original signature declared ``Tuple[list, list]`` but the method
        never returned anything; the annotation now matches the behavior.
        """
        # This is sad but has to be done like this: the CSV lives under
        # <repo-root>/data/other/, two directories above this file.
        current_dir = os.path.dirname(os.path.abspath(__file__))
        parent_dir = os.path.dirname(current_dir)
        grandparent_dir = os.path.dirname(parent_dir)
        feature_importances_path = os.path.join(
            grandparent_dir,
            "data",
            "other",
            "feature_importance.csv",
        )

        feature_importances = pd.read_csv(feature_importances_path)

        self._view.display_feature_importance(feature_importances)

    def _show_current_data(self) -> None:
        """
        Renders the current data table on the admin dashboard.

        Also runs an out-of-distribution check over the last three days of
        data so anomalous inputs are surfaced to the admin.
        """
        prepared_df = self._prepare_data_for_view()
        recent_data = self._model.get_all_data_last_three_days()
        self._check_data_out_of_distribution(recent_data)
        self._view.show_current_data(prepared_df)

    def _model_metrics(self) -> None:
        """
        Fetches the model performance metrics and renders them as a table.
        """
        metrics_df = self._model.calculate_metrics()
        table_title = "Model Metrics over Last Three Days"
        self._view.display_datatable(metrics_df, table_title)

    def _compute_distribution_statistics(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Computes the means and standard deviations of the features in the dataset.

        Returns:
            A tuple of two single-row DataFrames. The first contains the means
            of the features and the second the standard deviations, each with
            columns repeated under " - day 0/1/2" suffixes to line up with the
            three-day input layout used elsewhere in the app.
        """
        # Training CSV lives under <repo-root>/data/processed/, two
        # directories above this file.
        current_dir = os.path.dirname(os.path.abspath(__file__))
        grandparent_dir = os.path.dirname(os.path.dirname(current_dir))
        distribution_data = pd.read_csv(
            os.path.join(
                grandparent_dir,
                "data",
                "processed",
                "v2_merged_selected_features_with_missing.csv",
            ),
            index_col=0,
        )

        # The means and stds pipelines were previously duplicated verbatim;
        # both now go through the same two helpers.
        means_row = self._as_feature_row(distribution_data.mean())
        stds_row = self._as_feature_row(distribution_data.std())

        return self._repeat_for_days(means_row), self._repeat_for_days(stds_row)

    @staticmethod
    def _as_feature_row(stats: pd.Series) -> pd.DataFrame:
        """Turns a per-feature statistics Series into a one-row DataFrame keyed by feature name."""
        frame = stats.reset_index(drop=False).transpose()
        # First transposed row holds the feature names; promote it to columns.
        frame.columns = frame.iloc[0]
        return frame[1:]

    @staticmethod
    def _repeat_for_days(row: pd.DataFrame, days: int = 3) -> pd.DataFrame:
        """Repeats the feature row once per day, suffixing columns with ' - day N'."""
        return pd.concat(
            [row.add_suffix(f" - day {day}") for day in range(days)],
            axis=1,
        )

    def _make_custom_predictions(self) -> None:
        """
        Handles the "Make Predictions" admin page.

        Collects the admin's confirmations, validates an uploaded CSV,
        warns about out-of-distribution values, then displays and offers
        the XGBoost predictions for download.
        """
        self._view.upload_instructions()
        checks = {
            "The data must be unnormalized.": False,
            "PM25, PM10, O3, NO2 should be in micrograms per cubic meter (µg/m³).": False,
            "Temperature should be in degrees Celcius": False,
            "Humidity should be in percentages": False,
            "Visibility should be in kilometers": False,
            "Solar Radiation should be in watts per square meter (W/m²).": False,
            "Precipitation should be in millimeters": False,
            "Wind Speed should be in kilometers per hour (km/h).": False,
            "Wind Direction should be in degrees": False,
            "The dataset must contain a total of 33 columns in the specified order.": False,
            "I accept that my data will be used for a prediction using a custom model.": False,
            "I understand that my data will not be saved.": False,
        }

        all_checks_marked = self._view.confirm_checks(checks)

        if not all_checks_marked:
            st.warning("Please confirm all the requirements by marking the checkboxes.")
            return

        dataset = self._view.upload_dataset()
        if dataset is None:
            return

        data = pd.read_csv(dataset)

        if not self._data_is_valid(data):
            return

        # Pass a copy: the OOD check drops the "date" column from its
        # argument in place, which previously removed it from `data` before
        # the set_index branch below could ever see it.
        self._check_data_out_of_distribution(data.copy())

        if "date" in data.columns or "datetime" in data.columns:
            data.set_index(
                "date" if "date" in data.columns else "datetime", inplace=True
            )

        self._view.display_datatable(data, message="### User Data")

        prediction = self._make_prediction(data)
        self._view.display_datatable(
            prediction, message="### XGBoost Model Prediction (first 6 rows)"
        )
        self._view.download_button(
            label="Download predictions as CSV",
            data=prediction.to_csv(index=True),
            file_name="predictions.csv",
        )

    def _make_prediction(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Makes a prediction using an XGBoost model.

        Args:
            data (pd.DataFrame): The data to make the prediction on.

        Returns:
            pd.DataFrame: NO2/O3 predictions for days 1-3, indexed like the
            input data.
        """
        # Column order must stay NO2 then O3 within each day.
        output_columns = [
            f"{pollutant} + day {day}"
            for day in (1, 2, 3)
            for pollutant in ("NO2", "O3")
        ]
        raw_prediction = PredictorModels().xgb_predictions(data, normalized=False)
        return pd.DataFrame(raw_prediction, columns=output_columns, index=data.index)

    def _data_is_valid(self, data: pd.DataFrame) -> bool:
        """
        Performs data validation on the uploaded user data.

        Validates the column count (33 feature columns, plus an optional
        'date' column), that every value is numeric or NaN, and that no
        value is negative.

        Args:
            data (pd.DataFrame): The user data.

        Returns:
            bool: True if the data is valid, otherwise False.
        """
        columns = data.columns
        expected_columns_count = 33
        has_date_column = "date" in columns

        # An optional 'date' column raises the allowed count by one.
        allowed_count = expected_columns_count + (1 if has_date_column else 0)
        if len(columns) != allowed_count:
            date_note = " including 'date'" if has_date_column else ""
            self._view.error(
                f"Invalid column count. Expected {allowed_count} columns{date_note}, but got {len(columns)}."
            )
            return False

        data_without_date = data.drop(columns=["date"], errors="ignore")

        # NOTE(review): DataFrame.map requires pandas >= 2.1 (formerly
        # applymap) — confirm against the project's pinned pandas version.
        if (
            not data_without_date.map(
                lambda x: isinstance(x, (float, int)) or pd.isna(x)
            )
            .all()
            .all()
        ):
            self._view.error(
                "The dataset contains values that are not floats or NaN. All values must be floats or NaN."
            )
            return False

        # Zero is accepted by the check below, so the requirement is
        # non-negative (the old message incorrectly demanded "positive").
        if not ((data_without_date >= 0) | data_without_date.isna()).all().all():
            self._view.error(
                "The dataset contains negative values. All values must be non-negative."
            )
            return False

        # If all checks passed
        self._view.success("Data validation passed successfully.")
        return True

    def _check_data_out_of_distribution(
        self, input_data: pd.DataFrame, threshold: float = 3
    ) -> None:
        """
        Checks if the input data is out of distribution compared to the training data.
        Displays which features exceed the threshold via the view.

        The original signature was annotated ``-> bool`` but never returned a
        value; the annotation now matches the behavior. The caller's DataFrame
        is no longer mutated (the 'date' column was previously dropped in
        place).

        Args:
            input_data (pd.DataFrame): The new input data to validate.
            threshold (float): The Z-score threshold to determine out of distribution.
        """
        # Drop 'date' on a copy so the caller's frame stays intact.
        features = input_data.drop(columns=["date"], errors="ignore")

        z_scores = (
            features - self._distribution_means.values.squeeze()
        ) / self._distribution_stds.values.squeeze()

        out_of_distribution_flags = z_scores.abs() > threshold
        flagged_count = int(out_of_distribution_flags.values.sum())

        if flagged_count == 0:
            self._view.success("Input data is within the expected distribution.")
            return

        # The old message contained a duplicated "exceed"; fixed here.
        noun = "feature exceeds" if flagged_count == 1 else "features exceed"
        error_message = (
            f"Input data might contain out-of-distribution values. "
            f"{flagged_count} {noun} the z-score threshold. "
            f"Model prediction might be inaccurate.\n\n"
        )

        ood_details = z_scores[out_of_distribution_flags]
        for row_index, row in ood_details.iterrows():
            ood_features = row.dropna().index.tolist()
            ood_values = features.loc[row_index, ood_features]

            # Loop through each feature individually for better readability.
            # NOTE(review): assumes a 0-based integer index; with a
            # non-integer index `row_index + 1` would raise — confirm callers.
            for feature, value in zip(ood_features, ood_values):
                error_message += f"Row {row_index + 1}: The feature '{feature}' has a value of {value:.2f}.\n\n"

        self._view.error(error_message)