Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,42 +1,42 @@
|
|
| 1 |
-
# --- 1. IMPORT
|
| 2 |
import streamlit as st
|
| 3 |
import pandas as pd
|
| 4 |
import joblib
|
| 5 |
import plotly.graph_objects as go
|
| 6 |
from datetime import datetime
|
| 7 |
|
| 8 |
-
# Import
|
| 9 |
try:
|
| 10 |
from src import benchmark_utils
|
| 11 |
from src import diagnostic_plots as diag
|
| 12 |
except ImportError:
|
| 13 |
-
st.error("
|
| 14 |
-
"
|
| 15 |
st.stop()
|
| 16 |
|
| 17 |
-
# --- 2.
|
| 18 |
st.set_page_config(
|
| 19 |
page_title="Saigon Temperature Forecast",
|
| 20 |
page_icon="🌦️",
|
| 21 |
layout="wide"
|
| 22 |
)
|
| 23 |
|
| 24 |
-
# --- 3.
|
| 25 |
-
#
|
| 26 |
|
| 27 |
@st.cache_data
|
| 28 |
def load_feature_data(file_path="data/final_dataset_tree.csv"):
|
| 29 |
-
"""
|
| 30 |
try:
|
| 31 |
df = pd.read_csv(file_path)
|
| 32 |
|
| 33 |
-
# ---
|
| 34 |
-
#
|
| 35 |
DATE_COLUMN = 'datetime'
|
| 36 |
|
| 37 |
if DATE_COLUMN not in df.columns:
|
| 38 |
-
st.error(f"
|
| 39 |
-
f"
|
| 40 |
return pd.DataFrame()
|
| 41 |
|
| 42 |
df[DATE_COLUMN] = pd.to_datetime(df[DATE_COLUMN])
|
|
@@ -44,12 +44,12 @@ def load_feature_data(file_path="data/final_dataset_tree.csv"):
|
|
| 44 |
df = df.sort_index()
|
| 45 |
return df
|
| 46 |
except FileNotFoundError:
|
| 47 |
-
st.error(f"
|
| 48 |
return pd.DataFrame()
|
| 49 |
|
| 50 |
@st.cache_resource
|
| 51 |
def load_champion_models():
|
| 52 |
-
"""
|
| 53 |
models = []
|
| 54 |
try:
|
| 55 |
for i in range(1, 6):
|
|
@@ -58,32 +58,31 @@ def load_champion_models():
|
|
| 58 |
models.append(model)
|
| 59 |
return models
|
| 60 |
except FileNotFoundError as e:
|
| 61 |
-
st.error(f"
|
| 62 |
-
"
|
| 63 |
return []
|
| 64 |
|
| 65 |
-
@st.
|
| 66 |
-
|
| 67 |
-
"""Tải dữ liệu hiệu suất đã tính toán trước cho Tab 3."""
|
| 68 |
try:
|
| 69 |
df = pd.read_csv(file_path)
|
| 70 |
return df
|
| 71 |
except FileNotFoundError:
|
| 72 |
-
st.error(f"
|
| 73 |
return pd.DataFrame()
|
| 74 |
|
| 75 |
-
# --- 4.
|
| 76 |
|
| 77 |
-
#
|
| 78 |
all_data_df = load_feature_data()
|
| 79 |
models = load_champion_models()
|
| 80 |
perf_df = load_performance_data()
|
| 81 |
|
| 82 |
-
# ---
|
| 83 |
TARGET_COLS = ['temp_next_1_day', 'temp_next_2_day', 'temp_next_3_day', 'temp_next_4_day', 'temp_next_5_day']
|
| 84 |
CURRENT_TEMP_COL = 'temp'
|
| 85 |
|
| 86 |
-
#
|
| 87 |
TEST_START_DATE = "2024-02-18"
|
| 88 |
TEST_END_DATE = "2025-09-26"
|
| 89 |
|
|
@@ -93,26 +92,26 @@ if not all_data_df.empty:
|
|
| 93 |
try:
|
| 94 |
test_df = all_data_df.loc[TEST_START_DATE:TEST_END_DATE].copy()
|
| 95 |
|
| 96 |
-
#
|
| 97 |
feature_cols = [col for col in all_data_df.columns if col not in TARGET_COLS]
|
| 98 |
|
| 99 |
-
#
|
| 100 |
-
#
|
| 101 |
X_test = test_df[feature_cols]
|
| 102 |
y_test = test_df[TARGET_COLS]
|
| 103 |
|
| 104 |
-
#
|
| 105 |
y_test.columns = [f'Day {i}' for i in range(1, 6)]
|
| 106 |
except KeyError:
|
| 107 |
-
st.error(f"
|
| 108 |
-
f"'{CURRENT_TEMP_COL}'
|
| 109 |
except Exception as e:
|
| 110 |
-
st.error(f"
|
| 111 |
else:
|
| 112 |
-
st.error("
|
| 113 |
st.stop()
|
| 114 |
|
| 115 |
-
# --- 5.
|
| 116 |
|
| 117 |
st.sidebar.title("Navigation")
|
| 118 |
app_section = st.sidebar.radio(
|
|
@@ -120,7 +119,7 @@ app_section = st.sidebar.radio(
|
|
| 120 |
("Project Overview & Methodology", "Live 5-Day Forecast", "Model Performance & Diagnostics")
|
| 121 |
)
|
| 122 |
|
| 123 |
-
# Date input
|
| 124 |
selected_date = None
|
| 125 |
if app_section == "Live 5-Day Forecast":
|
| 126 |
st.sidebar.header("Forecast Input")
|
|
@@ -140,123 +139,116 @@ if app_section == "Live 5-Day Forecast":
|
|
| 140 |
st.sidebar.error("Test data could not be loaded.")
|
| 141 |
|
| 142 |
|
| 143 |
-
# --- 6.
|
| 144 |
|
| 145 |
if app_section == "Project Overview & Methodology":
|
| 146 |
-
# ---
|
| 147 |
st.title("Saigon Temperature Forecasting Application 🌦️")
|
| 148 |
|
| 149 |
st.subheader("Project Summary")
|
| 150 |
st.markdown("""
|
| 151 |
-
|
| 152 |
|
| 153 |
-
* **
|
| 154 |
-
* **
|
| 155 |
""")
|
| 156 |
|
| 157 |
st.subheader("Our 'Two-Stream' Strategy")
|
| 158 |
st.markdown("""
|
| 159 |
-
|
| 160 |
-
1. **
|
| 161 |
-
2. **
|
| 162 |
|
| 163 |
-
|
| 164 |
""")
|
| 165 |
|
| 166 |
st.subheader("Final Model Leaderboard")
|
| 167 |
-
st.markdown("
|
| 168 |
|
| 169 |
-
#
|
| 170 |
leaderboard_df = benchmark_utils.load_leaderboard()
|
| 171 |
|
| 172 |
if not leaderboard_df.empty:
|
| 173 |
-
#
|
| 174 |
st.dataframe(leaderboard_df.head(10), use_container_width=True)
|
| 175 |
else:
|
| 176 |
-
st.warning("
|
| 177 |
|
| 178 |
# --------------------------------------------------------------------
|
| 179 |
|
| 180 |
elif app_section == "Live 5-Day Forecast":
|
| 181 |
-
# ---
|
| 182 |
st.title("Live 5-Day Forecast")
|
| 183 |
|
| 184 |
if selected_date and not X_test.empty and models:
|
| 185 |
-
st.header(f"
|
| 186 |
|
| 187 |
-
# 1.
|
| 188 |
selected_date_ts = pd.Timestamp(selected_date)
|
| 189 |
|
| 190 |
-
#
|
| 191 |
if selected_date_ts in X_test.index:
|
| 192 |
input_features = X_test.loc[[selected_date_ts]]
|
| 193 |
else:
|
| 194 |
-
st.error("
|
| 195 |
-
input_features = pd.DataFrame() #
|
| 196 |
|
| 197 |
if input_features.empty:
|
| 198 |
-
st.error("
|
| 199 |
else:
|
| 200 |
-
# 2.
|
| 201 |
predictions = []
|
| 202 |
for i in range(5):
|
| 203 |
-
model = models[i] #
|
| 204 |
pred = model.predict(input_features)[0]
|
| 205 |
predictions.append(pred)
|
| 206 |
|
| 207 |
-
# 3.
|
| 208 |
forecast_dates = pd.date_range(start=selected_date, periods=6, freq='D')[1:]
|
| 209 |
cols = st.columns(5)
|
| 210 |
|
| 211 |
-
#
|
| 212 |
-
#actual_values
|
| 213 |
-
|
| 214 |
-
#
|
| 215 |
-
# Kiểm tra xem có bất kỳ giá trị 'Actual' nào bị thiếu không
|
| 216 |
-
#is_partial_forecast = any(pd.isna(v) for v in actual_values)
|
| 217 |
-
# ----------------------------------------------
|
| 218 |
-
|
| 219 |
-
# Lấy giá trị thực tế để so sánh
|
| 220 |
-
# --- SỬA LỖI LOGIC: Lấy 'actual_values' từ all_data_df ---
|
| 221 |
-
# Chúng ta cần lấy các cột target (ví dụ: 'temp_next_1_day')
|
| 222 |
-
# từ BẢNG DỮ LIỆU GỐC tại ngày đã chọn.
|
| 223 |
|
| 224 |
actual_values = []
|
| 225 |
if selected_date_ts in all_data_df.index:
|
| 226 |
-
#
|
| 227 |
actual_row = all_data_df.loc[selected_date_ts]
|
| 228 |
|
| 229 |
-
#
|
| 230 |
for col_name in TARGET_COLS:
|
| 231 |
actual_values.append(actual_row[col_name])
|
| 232 |
else:
|
| 233 |
-
#
|
| 234 |
-
actual_values = [float('nan')] * 5 #
|
| 235 |
|
| 236 |
-
# ---
|
| 237 |
-
#
|
| 238 |
is_partial_forecast = any(pd.isna(v) for v in actual_values)
|
| 239 |
-
|
|
|
|
| 240 |
for i in range(5):
|
| 241 |
with cols[i]:
|
| 242 |
|
| 243 |
-
# ---
|
| 244 |
actual_val = actual_values[i]
|
| 245 |
delta_text = f"Actual: {actual_val:.1f}°C" if pd.notna(actual_val) else "Actual: --"
|
| 246 |
-
# ---
|
| 247 |
|
| 248 |
st.metric(
|
| 249 |
label=f"Forecast for {forecast_dates[i].strftime('%b %d')}",
|
| 250 |
value=f"{predictions[i]:.1f}°C",
|
| 251 |
-
delta=delta_text, #
|
| 252 |
-
delta_color="off" #
|
| 253 |
)
|
| 254 |
|
| 255 |
-
# ---
|
| 256 |
st.subheader("Training Set Overview")
|
| 257 |
-
with st.expander("
|
| 258 |
|
| 259 |
-
#
|
| 260 |
train_end_date = pd.Timestamp(TEST_START_DATE) - pd.Timedelta(days=1)
|
| 261 |
train_df = all_data_df.loc[:train_end_date][CURRENT_TEMP_COL]
|
| 262 |
|
|
@@ -264,7 +256,7 @@ elif app_section == "Live 5-Day Forecast":
|
|
| 264 |
fig_train.add_trace(go.Scatter(
|
| 265 |
x=train_df.index, y=train_df,
|
| 266 |
mode='lines', name='Training Data (Actual)',
|
| 267 |
-
line=dict(color='#005aa7', width=1) #
|
| 268 |
))
|
| 269 |
fig_train.update_layout(
|
| 270 |
title="Actual Temperature - Full Training Set",
|
|
@@ -272,19 +264,19 @@ elif app_section == "Live 5-Day Forecast":
|
|
| 272 |
template="plotly_white"
|
| 273 |
)
|
| 274 |
st.plotly_chart(fig_train, use_container_width=True)
|
| 275 |
-
# ---
|
| 276 |
|
| 277 |
-
# 4.
|
| 278 |
st.subheader("Historical Context & Forecast")
|
| 279 |
|
| 280 |
-
#
|
| 281 |
history_start = selected_date_ts - pd.Timedelta(days=14)
|
| 282 |
history_end = selected_date_ts
|
| 283 |
|
| 284 |
-
#
|
| 285 |
history_df = all_data_df.loc[history_start:history_end][CURRENT_TEMP_COL]
|
| 286 |
|
| 287 |
-
#
|
| 288 |
forecast_df = pd.DataFrame({
|
| 289 |
'Date': forecast_dates,
|
| 290 |
'Forecast': predictions
|
|
@@ -311,12 +303,12 @@ elif app_section == "Live 5-Day Forecast":
|
|
| 311 |
st.plotly_chart(fig, use_container_width=True)
|
| 312 |
|
| 313 |
|
| 314 |
-
# ---
|
| 315 |
st.subheader("5-Day Forecast vs. Actual Comparison")
|
| 316 |
|
| 317 |
if is_partial_forecast:
|
| 318 |
-
st.info("
|
| 319 |
-
"
|
| 320 |
else:
|
| 321 |
fig_comp = go.Figure()
|
| 322 |
|
|
@@ -340,36 +332,36 @@ elif app_section == "Live 5-Day Forecast":
|
|
| 340 |
template="plotly_white", legend=dict(x=0.01, y=0.99)
|
| 341 |
)
|
| 342 |
st.plotly_chart(fig_comp, use_container_width=True)
|
| 343 |
-
# ---
|
| 344 |
|
| 345 |
else:
|
| 346 |
-
st.warning("
|
| 347 |
|
| 348 |
# --------------------------------------------------------------------
|
| 349 |
|
| 350 |
elif app_section == "Model Performance & Diagnostics":
|
| 351 |
-
# ---
|
| 352 |
st.title("Model Performance & Diagnostics")
|
| 353 |
|
| 354 |
if not perf_df.empty and not y_test.empty:
|
| 355 |
st.subheader("Performance Degradation over 5 Days")
|
| 356 |
-
st.markdown("
|
| 357 |
|
| 358 |
-
#
|
| 359 |
MODEL_NAME = 'Champion (Stacking)'
|
| 360 |
champion_perf_df = perf_df[perf_df['Model'] == MODEL_NAME].copy()
|
| 361 |
|
| 362 |
-
# 1.
|
| 363 |
|
| 364 |
-
# ---
|
| 365 |
-
#
|
| 366 |
RMSE_COL_NAME = 'RMSE (Absolute Error)'
|
| 367 |
R2_COL_NAME = 'R-squared'
|
| 368 |
|
| 369 |
col1, col2 = st.columns(2)
|
| 370 |
with col1:
|
| 371 |
fig_rmse = diag.plot_performance_degradation(
|
| 372 |
-
champion_perf_df, #
|
| 373 |
metric_column=RMSE_COL_NAME,
|
| 374 |
metric_name='RMSE (Temperature °C)',
|
| 375 |
color='blue'
|
|
@@ -377,21 +369,21 @@ elif app_section == "Model Performance & Diagnostics":
|
|
| 377 |
st.plotly_chart(fig_rmse, use_container_width=True)
|
| 378 |
with col2:
|
| 379 |
fig_r2 = diag.plot_performance_degradation(
|
| 380 |
-
champion_perf_df, #
|
| 381 |
metric_column=R2_COL_NAME,
|
| 382 |
metric_name='R-squared (R²)',
|
| 383 |
color='green'
|
| 384 |
)
|
| 385 |
st.plotly_chart(fig_r2, use_container_width=True)
|
| 386 |
|
| 387 |
-
# 2.
|
| 388 |
st.subheader("Forecast vs. Actual Comparison (on entire test set)")
|
| 389 |
|
| 390 |
-
#
|
| 391 |
-
#
|
| 392 |
@st.cache_data
|
| 393 |
def get_full_test_predictions(_models, _X_test):
|
| 394 |
-
"""
|
| 395 |
all_preds = {}
|
| 396 |
for i in range(5):
|
| 397 |
model = _models[i]
|
|
@@ -418,9 +410,9 @@ elif app_section == "Model Performance & Diagnostics":
|
|
| 418 |
)
|
| 419 |
st.plotly_chart(fig_d5, use_container_width=True)
|
| 420 |
|
| 421 |
-
# 3.
|
| 422 |
with st.expander("Champion Model Diagnostics (Deep Dive)"):
|
| 423 |
-
st.markdown("
|
| 424 |
|
| 425 |
y_true_d1 = y_test['Day 1']
|
| 426 |
y_pred_d1 = y_pred_test['Day 1']
|
|
@@ -435,8 +427,8 @@ elif app_section == "Model Performance & Diagnostics":
|
|
| 435 |
y_true_d1, y_pred_d1, "Day 1"
|
| 436 |
)
|
| 437 |
st.plotly_chart(fig_res_dist, use_container_width=True)
|
| 438 |
-
st.markdown("
|
| 439 |
-
"
|
| 440 |
|
| 441 |
else:
|
| 442 |
-
st.warning("
|
|
|
|
| 1 |
+
# --- 1. IMPORT LIBRARIES ---
|
| 2 |
import streamlit as st
|
| 3 |
import pandas as pd
|
| 4 |
import joblib
|
| 5 |
import plotly.graph_objects as go
|
| 6 |
from datetime import datetime
|
| 7 |
|
| 8 |
+
# Import your utility scripts from the 'src' directory
|
| 9 |
try:
|
| 10 |
from src import benchmark_utils
|
| 11 |
from src import diagnostic_plots as diag
|
| 12 |
except ImportError:
|
| 13 |
+
st.error("Error: Could not find 'src/benchmark_utils.py' or 'src/diagnostic_plots.py'. "
|
| 14 |
+
"Please ensure they exist in the 'src/' directory.")
|
| 15 |
st.stop()
|
| 16 |
|
| 17 |
+
# --- 2. PAGE CONFIGURATION ---
|
| 18 |
st.set_page_config(
|
| 19 |
page_title="Saigon Temperature Forecast",
|
| 20 |
page_icon="🌦️",
|
| 21 |
layout="wide"
|
| 22 |
)
|
| 23 |
|
| 24 |
+
# --- 3. DATA & MODEL LOADING FUNCTIONS (WITH CACHING) ---
|
| 25 |
+
# Checklist Items 1 & 2: Cache all heavy operations
|
| 26 |
|
| 27 |
@st.cache_data
|
| 28 |
def load_feature_data(file_path="data/final_dataset_tree.csv"):
|
| 29 |
+
"""Loads features and targets, converts index to datetime."""
|
| 30 |
try:
|
| 31 |
df = pd.read_csv(file_path)
|
| 32 |
|
| 33 |
+
# --- CRITICAL CUSTOMIZATION ---
|
| 34 |
+
# Ensure 'datetime' is your date column in the CSV
|
| 35 |
DATE_COLUMN = 'datetime'
|
| 36 |
|
| 37 |
if DATE_COLUMN not in df.columns:
|
| 38 |
+
st.error(f"Error: Date column '{DATE_COLUMN}' not found in 'final_dataset_tree.csv'. "
|
| 39 |
+
f"Please update the DATE_COLUMN variable in 'app.py'.")
|
| 40 |
return pd.DataFrame()
|
| 41 |
|
| 42 |
df[DATE_COLUMN] = pd.to_datetime(df[DATE_COLUMN])
|
|
|
|
| 44 |
df = df.sort_index()
|
| 45 |
return df
|
| 46 |
except FileNotFoundError:
|
| 47 |
+
st.error(f"ERROR: Main data file not found at: {file_path}")
|
| 48 |
return pd.DataFrame()
|
| 49 |
|
| 50 |
@st.cache_resource
|
| 51 |
def load_champion_models():
|
| 52 |
+
"""Loads the 5 specialist models from the checklist."""
|
| 53 |
models = []
|
| 54 |
try:
|
| 55 |
for i in range(1, 6):
|
|
|
|
| 58 |
models.append(model)
|
| 59 |
return models
|
| 60 |
except FileNotFoundError as e:
|
| 61 |
+
st.error(f"ERROR: Model file not found. Checked: {e.filename}. "
|
| 62 |
+
"Ensure the 5 .pkl files are in the 'models/' directory.")
|
| 63 |
return []
|
| 64 |
|
| 65 |
+
# NOTE: the decorator must sit on its own line. It was previously fused with
# the `def` statement ("@st.cache_datadef load_performance_data(...)"), which
# is a SyntaxError and stops the whole app from starting.
@st.cache_data
def load_performance_data(file_path="data/final_5_day_results_df.csv"):
    """Load the pre-calculated performance data used by Tab 3.

    The function is wrapped with ``st.cache_data``, like the other data
    loaders in this file.

    Args:
        file_path: Path of the CSV file holding the performance results.

    Returns:
        The DataFrame read from ``file_path``. If the file does not exist,
        an error message is shown in the Streamlit UI and an empty
        DataFrame is returned, so callers can test ``.empty``.
    """
    try:
        return pd.read_csv(file_path)
    except FileNotFoundError:
        st.error(f"ERROR: Performance file not found at: {file_path}")
        return pd.DataFrame()
|
| 73 |
|
| 74 |
+
# --- 4. INITIALIZE DATA & SPLIT TEST SET ---
|
| 75 |
|
| 76 |
+
# Load all data and models
|
| 77 |
all_data_df = load_feature_data()
|
| 78 |
models = load_champion_models()
|
| 79 |
perf_df = load_performance_data()
|
| 80 |
|
| 81 |
+
# --- CRITICAL CUSTOMIZATION ---
|
| 82 |
TARGET_COLS = ['temp_next_1_day', 'temp_next_2_day', 'temp_next_3_day', 'temp_next_4_day', 'temp_next_5_day']
|
| 83 |
CURRENT_TEMP_COL = 'temp'
|
| 84 |
|
| 85 |
+
# Split test set (based on checklist dates)
|
| 86 |
TEST_START_DATE = "2024-02-18"
|
| 87 |
TEST_END_DATE = "2025-09-26"
|
| 88 |
|
|
|
|
| 92 |
try:
|
| 93 |
test_df = all_data_df.loc[TEST_START_DATE:TEST_END_DATE].copy()
|
| 94 |
|
| 95 |
+
# Assumption: 157 features are ALL columns that are NOT targets
|
| 96 |
feature_cols = [col for col in all_data_df.columns if col not in TARGET_COLS]
|
| 97 |
|
| 98 |
+
# Split X_test (features) and y_test (actuals)
|
| 99 |
+
# Logic fix: X_test must be derived from test_df
|
| 100 |
X_test = test_df[feature_cols]
|
| 101 |
y_test = test_df[TARGET_COLS]
|
| 102 |
|
| 103 |
+
# Rename y_test columns for clarity (used in Tab 3)
|
| 104 |
y_test.columns = [f'Day {i}' for i in range(1, 6)]
|
| 105 |
except KeyError:
|
| 106 |
+
st.error(f"Error: Target columns (e.g., '{TARGET_COLS[0]}') or "
|
| 107 |
+
f"'{CURRENT_TEMP_COL}' column not found in CSV. Please update 'app.py'.")
|
| 108 |
except Exception as e:
|
| 109 |
+
st.error(f"Error processing test set: {e}")
|
| 110 |
else:
|
| 111 |
+
st.error("Could not load main data, application cannot continue.")
|
| 112 |
st.stop()
|
| 113 |
|
| 114 |
+
# --- 5. SIDEBAR NAVIGATION ---
|
| 115 |
|
| 116 |
st.sidebar.title("Navigation")
|
| 117 |
app_section = st.sidebar.radio(
|
|
|
|
| 119 |
("Project Overview & Methodology", "Live 5-Day Forecast", "Model Performance & Diagnostics")
|
| 120 |
)
|
| 121 |
|
| 122 |
+
# Date input only shows on the "Live Forecast" tab
|
| 123 |
selected_date = None
|
| 124 |
if app_section == "Live 5-Day Forecast":
|
| 125 |
st.sidebar.header("Forecast Input")
|
|
|
|
| 139 |
st.sidebar.error("Test data could not be loaded.")
|
| 140 |
|
| 141 |
|
| 142 |
+
# --- 6. MAIN PANEL DISPLAY ---
|
| 143 |
|
| 144 |
if app_section == "Project Overview & Methodology":
|
| 145 |
+
# --- CHECKLIST ITEM 3 ---
|
| 146 |
st.title("Saigon Temperature Forecasting Application 🌦️")
|
| 147 |
|
| 148 |
st.subheader("Project Summary")
|
| 149 |
st.markdown("""
|
| 150 |
+
The goal of this project is to forecast the average daily temperature for Ho Chi Minh City for the next 5 days.
|
| 151 |
|
| 152 |
+
* **Data:** 10 years of historical weather data from Visual Crossing.
|
| 153 |
+
* **Model:** We use 5 'specialist' models - each model is optimized to predict a specific future day (T+1 to T+5).
|
| 154 |
""")
|
| 155 |
|
| 156 |
st.subheader("Our 'Two-Stream' Strategy")
|
| 157 |
st.markdown("""
|
| 158 |
+
To optimize performance, we applied a "Two-Stream" strategy:
|
| 159 |
+
1. **Stream 1 (Linear Models):** Linear models (like Linear Regression) were trained on a feature set pruned using VIF to avoid multicollinearity.
|
| 160 |
+
2. **Stream 2 (Tree-based Models):** More complex models (like Random Forest, Gradient Boosting) were trained on a comprehensive set of 157 features to capture non-linear relationships.
|
| 161 |
|
| 162 |
+
Our Champion Model is a **Stacking** model from Stream 2, which demonstrated superior performance.
|
| 163 |
""")
|
| 164 |
|
| 165 |
st.subheader("Final Model Leaderboard")
|
| 166 |
+
st.markdown("Model leaderboard ranked by average RMSE score (lower is better).")
|
| 167 |
|
| 168 |
+
# Call function from benchmark_utils.py
|
| 169 |
leaderboard_df = benchmark_utils.load_leaderboard()
|
| 170 |
|
| 171 |
if not leaderboard_df.empty:
|
| 172 |
+
# Display top 10 models
|
| 173 |
st.dataframe(leaderboard_df.head(10), use_container_width=True)
|
| 174 |
else:
|
| 175 |
+
st.warning("Could not load leaderboard data.")
|
| 176 |
|
| 177 |
# --------------------------------------------------------------------
|
| 178 |
|
| 179 |
elif app_section == "Live 5-Day Forecast":
|
| 180 |
+
# --- CHECKLIST ITEM 4 ---
|
| 181 |
st.title("Live 5-Day Forecast")
|
| 182 |
|
| 183 |
if selected_date and not X_test.empty and models:
|
| 184 |
+
st.header(f"5-Day Forecast from: {selected_date.strftime('%Y-%m-%d')}")
|
| 185 |
|
| 186 |
+
# 1. Get Input Features
|
| 187 |
selected_date_ts = pd.Timestamp(selected_date)
|
| 188 |
|
| 189 |
+
# Logic fix: input_features must be from X_test
|
| 190 |
if selected_date_ts in X_test.index:
|
| 191 |
input_features = X_test.loc[[selected_date_ts]]
|
| 192 |
else:
|
| 193 |
+
st.error("Data not found for the selected date in X_test.")
|
| 194 |
+
input_features = pd.DataFrame() # Create empty dataframe to avoid errors later
|
| 195 |
|
| 196 |
if input_features.empty:
|
| 197 |
+
st.error("Data not found for the selected date.")
|
| 198 |
else:
|
| 199 |
+
# 2. Generate Predictions
|
| 200 |
predictions = []
|
| 201 |
for i in range(5):
|
| 202 |
+
model = models[i] # Get T+i model
|
| 203 |
pred = model.predict(input_features)[0]
|
| 204 |
predictions.append(pred)
|
| 205 |
|
| 206 |
+
# 3. Display Predictions (using st.metric)
|
| 207 |
forecast_dates = pd.date_range(start=selected_date, periods=6, freq='D')[1:]
|
| 208 |
cols = st.columns(5)
|
| 209 |
|
| 210 |
+
# Get actual values for comparison
|
| 211 |
+
# --- LOGIC FIX: Get 'actual_values' from all_data_df ---
|
| 212 |
+
# We need to get the target columns (e.g., 'temp_next_1_day')
|
| 213 |
+
# from the ORIGINAL DATAFRAME at the selected date.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
|
| 215 |
actual_values = []
|
| 216 |
if selected_date_ts in all_data_df.index:
|
| 217 |
+
# Get the row from the original dataframe
|
| 218 |
actual_row = all_data_df.loc[selected_date_ts]
|
| 219 |
|
| 220 |
+
# Get values from the target columns (temp_next_1_day, etc.)
|
| 221 |
for col_name in TARGET_COLS:
|
| 222 |
actual_values.append(actual_row[col_name])
|
| 223 |
else:
|
| 224 |
+
# Fallback case if date not found (rare)
|
| 225 |
+
actual_values = [float('nan')] * 5 # Create 5 NaN values
|
| 226 |
|
| 227 |
+
# --- APPLYING LOGIC (1) FROM REFERENCE CODE ---
|
| 228 |
+
# Check if any 'Actual' values are missing
|
| 229 |
is_partial_forecast = any(pd.isna(v) for v in actual_values)
|
| 230 |
+
# ----------------------------------------------
|
| 231 |
+
|
| 232 |
for i in range(5):
|
| 233 |
with cols[i]:
|
| 234 |
|
| 235 |
+
# --- FIX 1 (REFINED): Use pd.notna logic from reference code ---
|
| 236 |
actual_val = actual_values[i]
|
| 237 |
delta_text = f"Actual: {actual_val:.1f}°C" if pd.notna(actual_val) else "Actual: --"
|
| 238 |
+
# --- END FIX 1 ---
|
| 239 |
|
| 240 |
st.metric(
|
| 241 |
label=f"Forecast for {forecast_dates[i].strftime('%b %d')}",
|
| 242 |
value=f"{predictions[i]:.1f}°C",
|
| 243 |
+
delta=delta_text, # Use the checked delta_text
|
| 244 |
+
delta_color="off" # Neutral gray color
|
| 245 |
)
|
| 246 |
|
| 247 |
+
# --- NEW ADDITION 2: TRAINING DATA PLOT (PER REQUEST) ---
|
| 248 |
st.subheader("Training Set Overview")
|
| 249 |
+
with st.expander("Show plot of all training data (before 2024-02-18)"):
|
| 250 |
|
| 251 |
+
# Define training data range
|
| 252 |
train_end_date = pd.Timestamp(TEST_START_DATE) - pd.Timedelta(days=1)
|
| 253 |
train_df = all_data_df.loc[:train_end_date][CURRENT_TEMP_COL]
|
| 254 |
|
|
|
|
| 256 |
fig_train.add_trace(go.Scatter(
|
| 257 |
x=train_df.index, y=train_df,
|
| 258 |
mode='lines', name='Training Data (Actual)',
|
| 259 |
+
line=dict(color='#005aa7', width=1) # Blue
|
| 260 |
))
|
| 261 |
fig_train.update_layout(
|
| 262 |
title="Actual Temperature - Full Training Set",
|
|
|
|
| 264 |
template="plotly_white"
|
| 265 |
)
|
| 266 |
st.plotly_chart(fig_train, use_container_width=True)
|
| 267 |
+
# --- END NEW ADDITION 2 ---
|
| 268 |
|
| 269 |
+
# 4. Plot (Optimal Suggestion)
|
| 270 |
st.subheader("Historical Context & Forecast")
|
| 271 |
|
| 272 |
+
# Get last 14 days of history
|
| 273 |
history_start = selected_date_ts - pd.Timedelta(days=14)
|
| 274 |
history_end = selected_date_ts
|
| 275 |
|
| 276 |
+
# Get 'temp' data from the original dataframe
|
| 277 |
history_df = all_data_df.loc[history_start:history_end][CURRENT_TEMP_COL]
|
| 278 |
|
| 279 |
+
# Create dataframe for forecast
|
| 280 |
forecast_df = pd.DataFrame({
|
| 281 |
'Date': forecast_dates,
|
| 282 |
'Forecast': predictions
|
|
|
|
| 303 |
st.plotly_chart(fig, use_container_width=True)
|
| 304 |
|
| 305 |
|
| 306 |
+
# --- APPLYING LOGIC (2) FROM REFERENCE CODE ---
|
| 307 |
st.subheader("5-Day Forecast vs. Actual Comparison")
|
| 308 |
|
| 309 |
if is_partial_forecast:
|
| 310 |
+
st.info("Cannot draw the Actual vs. Forecast comparison chart because "
|
| 311 |
+
"the selected date is too close to the end of the test set (missing 'actual' data).")
|
| 312 |
else:
|
| 313 |
fig_comp = go.Figure()
|
| 314 |
|
|
|
|
| 332 |
template="plotly_white", legend=dict(x=0.01, y=0.99)
|
| 333 |
)
|
| 334 |
st.plotly_chart(fig_comp, use_container_width=True)
|
| 335 |
+
# --- END APPLYING LOGIC (2) ---
|
| 336 |
|
| 337 |
else:
|
| 338 |
+
st.warning("Please wait... Loading data or models.")
|
| 339 |
|
| 340 |
# --------------------------------------------------------------------
|
| 341 |
|
| 342 |
elif app_section == "Model Performance & Diagnostics":
|
| 343 |
+
# --- CHECKLIST ITEM 5 ---
|
| 344 |
st.title("Model Performance & Diagnostics")
|
| 345 |
|
| 346 |
if not perf_df.empty and not y_test.empty:
|
| 347 |
st.subheader("Performance Degradation over 5 Days")
|
| 348 |
+
st.markdown("How model performance changes as the forecast horizon increases.")
|
| 349 |
|
| 350 |
+
# Filter for Champion model only
|
| 351 |
MODEL_NAME = 'Champion (Stacking)'
|
| 352 |
champion_perf_df = perf_df[perf_df['Model'] == MODEL_NAME].copy()
|
| 353 |
|
| 354 |
+
# 1. Performance Degradation Plots (RMSE & R2)
|
| 355 |
|
| 356 |
+
# --- CUSTOMIZATION ---
|
| 357 |
+
# Ensure 'RMSE' and 'R2' column names are correct for 'final_5_day_results_df.csv'
|
| 358 |
RMSE_COL_NAME = 'RMSE (Absolute Error)'
|
| 359 |
R2_COL_NAME = 'R-squared'
|
| 360 |
|
| 361 |
col1, col2 = st.columns(2)
|
| 362 |
with col1:
|
| 363 |
fig_rmse = diag.plot_performance_degradation(
|
| 364 |
+
champion_perf_df, # Use filtered df
|
| 365 |
metric_column=RMSE_COL_NAME,
|
| 366 |
metric_name='RMSE (Temperature °C)',
|
| 367 |
color='blue'
|
|
|
|
| 369 |
st.plotly_chart(fig_rmse, use_container_width=True)
|
| 370 |
with col2:
|
| 371 |
fig_r2 = diag.plot_performance_degradation(
|
| 372 |
+
champion_perf_df, # Use filtered df
|
| 373 |
metric_column=R2_COL_NAME,
|
| 374 |
metric_name='R-squared (R²)',
|
| 375 |
color='green'
|
| 376 |
)
|
| 377 |
st.plotly_chart(fig_r2, use_container_width=True)
|
| 378 |
|
| 379 |
+
# 2. Forecast vs. Actual Plots
|
| 380 |
st.subheader("Forecast vs. Actual Comparison (on entire test set)")
|
| 381 |
|
| 382 |
+
# This function runs predictions on the *entire* X_test (thousands of rows)
|
| 383 |
+
# It will be slow without caching
|
| 384 |
@st.cache_data
|
| 385 |
def get_full_test_predictions(_models, _X_test):
|
| 386 |
+
"""Run predictions on the entire test set and cache the results."""
|
| 387 |
all_preds = {}
|
| 388 |
for i in range(5):
|
| 389 |
model = _models[i]
|
|
|
|
| 410 |
)
|
| 411 |
st.plotly_chart(fig_d5, use_container_width=True)
|
| 412 |
|
| 413 |
+
# 3. Optional: Deep Dive Expander
|
| 414 |
with st.expander("Champion Model Diagnostics (Deep Dive)"):
|
| 415 |
+
st.markdown("Detailed analysis of residuals (error = actual - predicted) for the Day 1 forecast.")
|
| 416 |
|
| 417 |
y_true_d1 = y_test['Day 1']
|
| 418 |
y_pred_d1 = y_pred_test['Day 1']
|
|
|
|
| 427 |
y_true_d1, y_pred_d1, "Day 1"
|
| 428 |
)
|
| 429 |
st.plotly_chart(fig_res_dist, use_container_width=True)
|
| 430 |
+
st.markdown("A good model will have residuals (errors) normally distributed (bell curve) "
|
| 431 |
+
"around 0 and show no pattern over time.")
|
| 432 |
|
| 433 |
else:
|
| 434 |
+
st.warning("Loading performance data...")
|