ML_Final_Hourly

Sleeping

App Files Community

Gumball2k5 committed on Nov 15, 2025

Commit

afdab77

verified ·

1 Parent(s): 11c1ec2

Update app.py

Browse files

Files changed (1) hide show

app.py +103 -111

app.py CHANGED Viewed

@@ -1,42 +1,42 @@
-# --- 1. IMPORT THƯ VIỆN ---
 import streamlit as st
 import pandas as pd
 import joblib
 import plotly.graph_objects as go
 from datetime import datetime
-# Import các script tiện ích của bạn từ thư mục 'src'
 try:
     from src import benchmark_utils
     from src import diagnostic_plots as diag
 except ImportError:
-    st.error("Lỗi: Không tìm thấy file 'src/benchmark_utils.py' hoặc 'src/diagnostic_plots.py'. "
-             "Hãy đảm bảo chúng tồn tại trong thư mục 'src/'.")
     st.stop()
-# --- 2. CẤU HÌNH TRANG WEB ---
 st.set_page_config(
     page_title="Saigon Temperature Forecast",
     page_icon="🌦️",
     layout="wide"
 )
-# --- 3. CÁC HÀM TẢI DỮ LIỆU & MÔ HÌNH (VỚI CACHING) ---
-# Mục 1 & 2 trong checklist: Tải mọi thứ nặng bằng cache
 @st.cache_data
 def load_feature_data(file_path="data/final_dataset_tree.csv"):
-    """Tải dữ liệu features và targets, chuyển đổi index thành datetime."""
     try:
         df = pd.read_csv(file_path)
-        # --- TÙY CHỈNH QUAN TRỌNG ---
-        # Đảm bảo 'datetime' là tên cột ngày tháng trong file CSV của bạn
         DATE_COLUMN = 'datetime'
         if DATE_COLUMN not in df.columns:
-            st.error(f"Lỗi: Không tìm thấy cột ngày tháng '{DATE_COLUMN}' trong 'final_dataset_tree.csv'. "
-                     f"Vui lòng cập nhật biến DATE_COLUMN trong 'app.py'.")
             return pd.DataFrame()
         df[DATE_COLUMN] = pd.to_datetime(df[DATE_COLUMN])
@@ -44,12 +44,12 @@ def load_feature_data(file_path="data/final_dataset_tree.csv"):
         df = df.sort_index()
         return df
     except FileNotFoundError:
-        st.error(f"LỖI: Không tìm thấy file data chính tại: {file_path}")
         return pd.DataFrame()
 @st.cache_resource
 def load_champion_models():
-    """Tải 5 mô hình chuyên gia (specialist models) từ checklist."""
     models = []
     try:
         for i in range(1, 6):
@@ -58,32 +58,31 @@ def load_champion_models():
             models.append(model)
         return models
     except FileNotFoundError as e:
-        st.error(f"LỖI: Không tìm thấy file mô hình. Đã kiểm tra: {e.filename}. "
-                 "Hãy đảm bảo 5 file .pkl nằm trong thư mục 'models/'.")
         return []
-@st.cache_data
-def load_performance_data(file_path="data/final_5_day_results_df.csv"):
-    """Tải dữ liệu hiệu suất đã tính toán trước cho Tab 3."""
     try:
         df = pd.read_csv(file_path)
         return df
     except FileNotFoundError:
-        st.error(f"LỖI: Không tìm thấy file hiệu suất tại: {file_path}")
         return pd.DataFrame()
-# --- 4. KHỞI TẠO DỮ LIỆU & TÁCH TEST SET ---
-# Tải tất cả dữ liệu và mô hình
 all_data_df = load_feature_data()
 models = load_champion_models()
 perf_df = load_performance_data()
-# --- TÙY CHỈNH QUAN TRỌNG ---
 TARGET_COLS = ['temp_next_1_day', 'temp_next_2_day', 'temp_next_3_day', 'temp_next_4_day', 'temp_next_5_day']
 CURRENT_TEMP_COL = 'temp'
-# Tách test set (dựa trên ngày trong checklist)
 TEST_START_DATE = "2024-02-18"
 TEST_END_DATE = "2025-09-26"
@@ -93,26 +92,26 @@ if not all_data_df.empty:
     try:
         test_df = all_data_df.loc[TEST_START_DATE:TEST_END_DATE].copy()
-        # Giả định: 157 features là TẤT CẢ các cột KHÔNG PHẢI là target
         feature_cols = [col for col in all_data_df.columns if col not in TARGET_COLS]
-        # Tách X_test (features) và y_test (thực tế)
-        # Sửa lỗi logic: X_test phải được lấy từ test_df
         X_test = test_df[feature_cols]
         y_test = test_df[TARGET_COLS]
-        # Đổi tên cột y_test cho dễ hiểu (dùng trong Tab 3)
         y_test.columns = [f'Day {i}' for i in range(1, 6)]
     except KeyError:
-        st.error(f"Lỗi: Không tìm thấy cột target (ví dụ: '{TARGET_COLS[0]}') hoặc cột "
-                 f"'{CURRENT_TEMP_COL}' trong file CSV. Vui lòng cập nhật 'app.py'.")
     except Exception as e:
-        st.error(f"Lỗi khi xử lý test set: {e}")
 else:
-    st.error("Không thể tải dữ liệu chính, ứng dụng không thể tiếp tục.")
     st.stop()
-# --- 5. GIAO DIỆN SIDEBAR (THANH ĐIỀU HƯỚNG) ---
 st.sidebar.title("Navigation")
 app_section = st.sidebar.radio(
@@ -120,7 +119,7 @@ app_section = st.sidebar.radio(
     ("Project Overview & Methodology", "Live 5-Day Forecast", "Model Performance & Diagnostics")
 )
-# Date input chỉ hiển thị khi ở tab "Live Forecast"
 selected_date = None
 if app_section == "Live 5-Day Forecast":
     st.sidebar.header("Forecast Input")
@@ -140,123 +139,116 @@ if app_section == "Live 5-Day Forecast":
         st.sidebar.error("Test data could not be loaded.")
-# --- 6. GIAO DIỆN CHÍNH (MAIN PANEL) ---
 if app_section == "Project Overview & Methodology":
-    # --- MỤC 3 TRONG CHECKLIST ---
     st.title("Saigon Temperature Forecasting Application 🌦️")
     st.subheader("Project Summary")
     st.markdown("""
-    Mục tiêu của dự án này là dự đoán nhiệt độ trung bình hàng ngày cho TP. Hồ Chí Minh trong 5 ngày tới.
-    * **Dữ liệu:** Dữ liệu thời tiết lịch sử 10 năm từ Visual Crossing.
-    * **Mô hình:** Chúng tôi sử dụng 5 mô hình 'chuyên gia' (specialist models) - mỗi mô hình được tối ưu để dự đoán một ngày cụ thể trong tương lai (T+1 đến T+5).
     """)
     st.subheader("Our 'Two-Stream' Strategy")
     st.markdown("""
-    Để tối ưu hóa hiệu suất, chúng tôi đã áp dụng chiến lược "Hai luồng" (Two-Stream):
-    1.  **Luồng 1 (Linear Models):** Các mô hình tuyến tính (như Linear Regression) được huấn luyện trên một bộ features đã được tinh gọn (sử dụng VIF) để tránh đa cộng tuyến.
-    2.  **Luồng 2 (Tree-based Models):** Các mô hình phức tạp hơn (như Random Forest, Gradient Boosting) được huấn luyện trên một bộ features toàn diện (157 features) để nắm bắt các mối quan hệ phi tuyến.
-    Mô hình chiến thắng (Champion Model) của chúng tôi là một mô hình **Stacking** từ Luồng 2, cho thấy hiệu suất vượt trội.
     """)
     st.subheader("Final Model Leaderboard")
-    st.markdown("Bảng xếp hạng các mô hình dựa trên điểm RMSE trung bình (càng thấp càng tốt).")
-    # Gọi hàm từ benchmark_utils.py
     leaderboard_df = benchmark_utils.load_leaderboard()
     if not leaderboard_df.empty:
-        # Hiển thị 10 mô hình hàng đầu
         st.dataframe(leaderboard_df.head(10), use_container_width=True)
     else:
-        st.warning("Không thể tải dữ liệu leaderboard.")
 # --------------------------------------------------------------------
 elif app_section == "Live 5-Day Forecast":
-    # --- MỤC 4 TRONG CHECKLIST ---
     st.title("Live 5-Day Forecast")
     if selected_date and not X_test.empty and models:
-        st.header(f"Dự báo cho 5 ngày tới từ: {selected_date.strftime('%Y-%m-%d')}")
-        # 1. Lấy Input Features
         selected_date_ts = pd.Timestamp(selected_date)
-        # Sửa lỗi logic: input_features phải được lấy từ X_test
         if selected_date_ts in X_test.index:
             input_features = X_test.loc[[selected_date_ts]]
         else:
-            st.error("Không tìm thấy dữ liệu cho ngày đã chọn trong X_test.")
-            input_features = pd.DataFrame() # Tạo dataframe rỗng để tránh lỗi sau
         if input_features.empty:
-            st.error("Không tìm thấy dữ liệu cho ngày đã chọn.")
         else:
-            # 2. Tạo dự đoán
             predictions = []
             for i in range(5):
-                model = models[i] # Lấy mô hình T+i
                 pred = model.predict(input_features)[0]
                 predictions.append(pred)
-            # 3. Hiển thị dự đoán (dùng st.metric)
             forecast_dates = pd.date_range(start=selected_date, periods=6, freq='D')[1:]
             cols = st.columns(5)
-            # Lấy giá trị thực tế để so sánh
-            #actual_values = y_test.loc[selected_date_ts].values
-            # --- ÁP DỤNG LOGIC (1) TỪ CODE THAM KHẢO ---
-            # Kiểm tra xem có bất kỳ giá trị 'Actual' nào bị thiếu không
-            #is_partial_forecast = any(pd.isna(v) for v in actual_values)
-            # ----------------------------------------------
-            # Lấy giá trị thực tế để so sánh
-            # --- SỬA LỖI LOGIC: Lấy 'actual_values' từ all_data_df ---
-            # Chúng ta cần lấy các cột target (ví dụ: 'temp_next_1_day')
-            # từ BẢNG DỮ LIỆU GỐC tại ngày đã chọn.
             actual_values = []
             if selected_date_ts in all_data_df.index:
-                # Lấy 1 dòng từ dataframe gốc
                 actual_row = all_data_df.loc[selected_date_ts]
-                # Lấy giá trị từ các cột target (temp_next_1_day, v.v.)
                 for col_name in TARGET_COLS:
                     actual_values.append(actual_row[col_name])
             else:
-                # Trường hợp dự phòng nếu không tìm thấy ngày (dù hiếm)
-                actual_values = [float('nan')] * 5 # Tạo 5 giá trị NaN
-            # --- ÁP DỤNG LOGIC (1) TỪ CODE THAM KHẢO ---
-            # Kiểm tra xem có bất kỳ giá trị 'Actual' nào bị thiếu không
             is_partial_forecast = any(pd.isna(v) for v in actual_values)
             for i in range(5):
                 with cols[i]:
-                    # --- SỬA LỖI 1 (TINH CHỈNH): Sử dụng logic pd.notna từ code tham khảo ---
                     actual_val = actual_values[i]
                     delta_text = f"Actual: {actual_val:.1f}°C" if pd.notna(actual_val) else "Actual: --"
-                    # --- KẾT THÚC SỬA LỖI 1 ---
                     st.metric(
                         label=f"Forecast for {forecast_dates[i].strftime('%b %d')}",
                         value=f"{predictions[i]:.1f}°C",
-                        delta=delta_text, # Sử dụng delta_text đã kiểm tra
-                        delta_color="off" # Màu xám trung tính
                     )
-            # --- THÊM MỚI 2: BIỂU ĐỒ DỮ LIỆU TRAINING (THEO YÊU CẦU) ---
             st.subheader("Training Set Overview")
-            with st.expander("Hiển thị biểu đồ toàn bộ dữ liệu training (trước 2024-02-18)"):
-                # Xác định phạm vi training data
                 train_end_date = pd.Timestamp(TEST_START_DATE) - pd.Timedelta(days=1)
                 train_df = all_data_df.loc[:train_end_date][CURRENT_TEMP_COL]
@@ -264,7 +256,7 @@ elif app_section == "Live 5-Day Forecast":
                 fig_train.add_trace(go.Scatter(
                     x=train_df.index, y=train_df,
                     mode='lines', name='Training Data (Actual)',
-                    line=dict(color='#005aa7', width=1) # Màu xanh
                 ))
                 fig_train.update_layout(
                     title="Actual Temperature - Full Training Set",
@@ -272,19 +264,19 @@ elif app_section == "Live 5-Day Forecast":
                     template="plotly_white"
                 )
                 st.plotly_chart(fig_train, use_container_width=True)
-            # --- KẾT THÚC THÊM MỚI 2 ---
-            # 4. Biểu đồ (Optimal Suggestion)
             st.subheader("Historical Context & Forecast")
-            # Lấy 14 ngày lịch sử
             history_start = selected_date_ts - pd.Timedelta(days=14)
             history_end = selected_date_ts
-            # Lấy dữ liệu 'temp' thực tế từ dataframe gốc
             history_df = all_data_df.loc[history_start:history_end][CURRENT_TEMP_COL]
-            # Tạo dataframe cho dự báo
             forecast_df = pd.DataFrame({
                 'Date': forecast_dates,
                 'Forecast': predictions
@@ -311,12 +303,12 @@ elif app_section == "Live 5-Day Forecast":
             st.plotly_chart(fig, use_container_width=True)
-            # --- ÁP DỤNG LOGIC (2) TỪ CODE THAM KHẢO ---
             st.subheader("5-Day Forecast vs. Actual Comparison")
             if is_partial_forecast:
-                st.info("Không thể vẽ biểu đồ so sánh Actual vs. Forecast vì "
-                        "đã chọn ngày quá gần cuối test set (thiếu dữ liệu 'thực tế').")
             else:
                 fig_comp = go.Figure()
@@ -340,36 +332,36 @@ elif app_section == "Live 5-Day Forecast":
                     template="plotly_white", legend=dict(x=0.01, y=0.99)
                 )
                 st.plotly_chart(fig_comp, use_container_width=True)
-            # --- KẾT THÚC ÁP DỤNG LOGIC (2) ---
     else:
-        st.warning("Vui lòng đợi... Đang tải dữ liệu hoặc mô hình.")
 # --------------------------------------------------------------------
 elif app_section == "Model Performance & Diagnostics":
-    # --- MỤC 5 TRONG CHECKLIST ---
     st.title("Model Performance & Diagnostics")
     if not perf_df.empty and not y_test.empty:
         st.subheader("Performance Degradation over 5 Days")
-        st.markdown("Hiệu suất mô hình thay đổi như thế nào khi dự báo xa hơn.")
-        # Lọc chỉ model Champion
         MODEL_NAME = 'Champion (Stacking)'
         champion_perf_df = perf_df[perf_df['Model'] == MODEL_NAME].copy()
-        # 1. Biểu đồ suy giảm hiệu suất (RMSE & R2)
-        # --- TÙY CHỈNH ---
-        # Đảm bảo 'RMSE' và 'R2' là tên cột chính xác trong file 'final_5_day_results_df.csv'
         RMSE_COL_NAME = 'RMSE (Absolute Error)'
         R2_COL_NAME = 'R-squared'
         col1, col2 = st.columns(2)
         with col1:
             fig_rmse = diag.plot_performance_degradation(
-                champion_perf_df, # Dùng df đã lọc
                 metric_column=RMSE_COL_NAME,
                 metric_name='RMSE (Temperature °C)',
                 color='blue'
@@ -377,21 +369,21 @@ elif app_section == "Model Performance & Diagnostics":
             st.plotly_chart(fig_rmse, use_container_width=True)
         with col2:
             fig_r2 = diag.plot_performance_degradation(
-                champion_perf_df, # Dùng df đã lọc
                 metric_column=R2_COL_NAME,
                 metric_name='R-squared (R²)',
                 color='green'
             )
             st.plotly_chart(fig_r2, use_container_width=True)
-        # 2. Biểu đồ Dự báo vs. Thực tế
         st.subheader("Forecast vs. Actual Comparison (on entire test set)")
-        # Hàm này chạy dự đoán trên *toàn bộ* X_test (hàng ngàn dòng)
-        # Nó sẽ rất chậm nếu không có cache
         @st.cache_data
         def get_full_test_predictions(_models, _X_test):
-            """Chạy dự đoán trên toàn bộ test set và cache lại."""
             all_preds = {}
             for i in range(5):
                 model = _models[i]
@@ -418,9 +410,9 @@ elif app_section == "Model Performance & Diagnostics":
             )
             st.plotly_chart(fig_d5, use_container_width=True)
-        # 3. Mục Tùy chọn: Deep Dive Expander
         with st.expander("Champion Model Diagnostics (Deep Dive)"):
-            st.markdown("Phân tích chi tiết phần dư (lỗi = thực tế - dự báo) cho dự báo Day 1.")
             y_true_d1 = y_test['Day 1']
             y_pred_d1 = y_pred_test['Day 1']
@@ -435,8 +427,8 @@ elif app_section == "Model Performance & Diagnostics":
                 y_true_d1, y_pred_d1, "Day 1"
             )
             st.plotly_chart(fig_res_dist, use_container_width=True)
-            st.markdown("Một mô hình tốt sẽ có phần dư (lỗi) phân phối chuẩn (hình chuông) "
-                        "quanh giá trị 0 và không có xu hướng (pattern) nào theo thời gian.")
     else:
-        st.warning("Đang tải dữ liệu hiệu suất...")

+# --- 1. IMPORT LIBRARIES ---
 import streamlit as st
 import pandas as pd
 import joblib
 import plotly.graph_objects as go
 from datetime import datetime
+# Import your utility scripts from the 'src' directory
 try:
     from src import benchmark_utils
     from src import diagnostic_plots as diag
 except ImportError:
+    st.error("Error: Could not find 'src/benchmark_utils.py' or 'src/diagnostic_plots.py'. "
+             "Please ensure they exist in the 'src/' directory.")
     st.stop()
+# --- 2. PAGE CONFIGURATION ---
 st.set_page_config(
     page_title="Saigon Temperature Forecast",
     page_icon="🌦️",
     layout="wide"
 )
+# --- 3. DATA & MODEL LOADING FUNCTIONS (WITH CACHING) ---
+# Checklist Items 1 & 2: Cache all heavy operations
 @st.cache_data
 def load_feature_data(file_path="data/final_dataset_tree.csv"):
+    """Loads features and targets, converts index to datetime."""
     try:
         df = pd.read_csv(file_path)
+        # --- CRITICAL CUSTOMIZATION ---
+        # Ensure 'datetime' is your date column in the CSV
         DATE_COLUMN = 'datetime'
         if DATE_COLUMN not in df.columns:
+            st.error(f"Error: Date column '{DATE_COLUMN}' not found in 'final_dataset_tree.csv'. "
+                     f"Please update the DATE_COLUMN variable in 'app.py'.")
             return pd.DataFrame()
         df[DATE_COLUMN] = pd.to_datetime(df[DATE_COLUMN])
         df = df.sort_index()
         return df
     except FileNotFoundError:
+        st.error(f"ERROR: Main data file not found at: {file_path}")
         return pd.DataFrame()
 @st.cache_resource
 def load_champion_models():
+    """Loads the 5 specialist models from the checklist."""
     models = []
     try:
         for i in range(1, 6):
             models.append(model)
         return models
     except FileNotFoundError as e:
+        st.error(f"ERROR: Model file not found. Checked: {e.filename}. "
+                 "Ensure the 5 .pkl files are in the 'models/' directory.")
         return []
+@st.cache_data
+def load_performance_data(file_path="data/final_5_day_results_df.csv"):
+    """Loads pre-calculated performance data for Tab 3."""
     try:
         df = pd.read_csv(file_path)
         return df
     except FileNotFoundError:
+        st.error(f"ERROR: Performance file not found at: {file_path}")
         return pd.DataFrame()
+# --- 4. INITIALIZE DATA & SPLIT TEST SET ---
+# Load all data and models
 all_data_df = load_feature_data()
 models = load_champion_models()
 perf_df = load_performance_data()
+# --- CRITICAL CUSTOMIZATION ---
 TARGET_COLS = ['temp_next_1_day', 'temp_next_2_day', 'temp_next_3_day', 'temp_next_4_day', 'temp_next_5_day']
 CURRENT_TEMP_COL = 'temp'
+# Split test set (based on checklist dates)
 TEST_START_DATE = "2024-02-18"
 TEST_END_DATE = "2025-09-26"
     try:
         test_df = all_data_df.loc[TEST_START_DATE:TEST_END_DATE].copy()
+        # Assumption: 157 features are ALL columns that are NOT targets
         feature_cols = [col for col in all_data_df.columns if col not in TARGET_COLS]
+        # Split X_test (features) and y_test (actuals)
+        # Logic fix: X_test must be derived from test_df
         X_test = test_df[feature_cols]
         y_test = test_df[TARGET_COLS]
+        # Rename y_test columns for clarity (used in Tab 3)
         y_test.columns = [f'Day {i}' for i in range(1, 6)]
     except KeyError:
+        st.error(f"Error: Target columns (e.g., '{TARGET_COLS[0]}') or "
+                 f"'{CURRENT_TEMP_COL}' column not found in CSV. Please update 'app.py'.")
     except Exception as e:
+        st.error(f"Error processing test set: {e}")
 else:
+    st.error("Could not load main data, application cannot continue.")
     st.stop()
+# --- 5. SIDEBAR NAVIGATION ---
 st.sidebar.title("Navigation")
 app_section = st.sidebar.radio(
     ("Project Overview & Methodology", "Live 5-Day Forecast", "Model Performance & Diagnostics")
 )
+# Date input only shows on the "Live Forecast" tab
 selected_date = None
 if app_section == "Live 5-Day Forecast":
     st.sidebar.header("Forecast Input")
         st.sidebar.error("Test data could not be loaded.")
+# --- 6. MAIN PANEL DISPLAY ---
 if app_section == "Project Overview & Methodology":
+    # --- CHECKLIST ITEM 3 ---
     st.title("Saigon Temperature Forecasting Application 🌦️")
     st.subheader("Project Summary")
     st.markdown("""
+    The goal of this project is to forecast the average daily temperature for Ho Chi Minh City for the next 5 days.
+    * **Data:** 10 years of historical weather data from Visual Crossing.
+    * **Model:** We use 5 'specialist' models - each model is optimized to predict a specific future day (T+1 to T+5).
     """)
     st.subheader("Our 'Two-Stream' Strategy")
     st.markdown("""
+    To optimize performance, we applied a "Two-Stream" strategy:
+    1.  **Stream 1 (Linear Models):** Linear models (like Linear Regression) were trained on a feature set pruned using VIF to avoid multicollinearity.
+    2.  **Stream 2 (Tree-based Models):** More complex models (like Random Forest, Gradient Boosting) were trained on a comprehensive set of 157 features to capture non-linear relationships.
+    Our Champion Model is a **Stacking** model from Stream 2, which demonstrated superior performance.
     """)
     st.subheader("Final Model Leaderboard")
+    st.markdown("Model leaderboard ranked by average RMSE score (lower is better).")
+    # Call function from benchmark_utils.py
     leaderboard_df = benchmark_utils.load_leaderboard()
     if not leaderboard_df.empty:
+        # Display top 10 models
         st.dataframe(leaderboard_df.head(10), use_container_width=True)
     else:
+        st.warning("Could not load leaderboard data.")
 # --------------------------------------------------------------------
 elif app_section == "Live 5-Day Forecast":
+    # --- CHECKLIST ITEM 4 ---
     st.title("Live 5-Day Forecast")
     if selected_date and not X_test.empty and models:
+        st.header(f"5-Day Forecast from: {selected_date.strftime('%Y-%m-%d')}")
+        # 1. Get Input Features
         selected_date_ts = pd.Timestamp(selected_date)
+        # Logic fix: input_features must be from X_test
         if selected_date_ts in X_test.index:
             input_features = X_test.loc[[selected_date_ts]]
         else:
+            st.error("Data not found for the selected date in X_test.")
+            input_features = pd.DataFrame() # Create empty dataframe to avoid errors later
         if input_features.empty:
+            st.error("Data not found for the selected date.")
         else:
+            # 2. Generate Predictions
             predictions = []
             for i in range(5):
+                model = models[i] # Get the T+(i+1) model (i is 0-based)
                 pred = model.predict(input_features)[0]
                 predictions.append(pred)
+            # 3. Display Predictions (using st.metric)
             forecast_dates = pd.date_range(start=selected_date, periods=6, freq='D')[1:]
             cols = st.columns(5)
+            # Get actual values for comparison
+            # --- LOGIC FIX: Get 'actual_values' from all_data_df ---
+            # We need to get the target columns (e.g., 'temp_next_1_day')
+            # from the ORIGINAL DATAFRAME at the selected date.
             actual_values = []
             if selected_date_ts in all_data_df.index:
+                # Get the row from the original dataframe
                 actual_row = all_data_df.loc[selected_date_ts]
+                # Get values from the target columns (temp_next_1_day, etc.)
                 for col_name in TARGET_COLS:
                     actual_values.append(actual_row[col_name])
             else:
+                # Fallback case if date not found (rare)
+                actual_values = [float('nan')] * 5 # Create 5 NaN values
+            # --- APPLYING LOGIC (1) FROM REFERENCE CODE ---
+            # Check if any 'Actual' values are missing
             is_partial_forecast = any(pd.isna(v) for v in actual_values)
+            # ----------------------------------------------
             for i in range(5):
                 with cols[i]:
+                    # --- FIX 1 (REFINED): Use pd.notna logic from reference code ---
                     actual_val = actual_values[i]
                     delta_text = f"Actual: {actual_val:.1f}°C" if pd.notna(actual_val) else "Actual: --"
+                    # --- END FIX 1 ---
                     st.metric(
                         label=f"Forecast for {forecast_dates[i].strftime('%b %d')}",
                         value=f"{predictions[i]:.1f}°C",
+                        delta=delta_text, # Use the checked delta_text
+                        delta_color="off" # Neutral gray color
                     )
+            # --- NEW ADDITION 2: TRAINING DATA PLOT (PER REQUEST) ---
             st.subheader("Training Set Overview")
+            with st.expander("Show plot of all training data (before 2024-02-18)"):
+                # Define training data range
                 train_end_date = pd.Timestamp(TEST_START_DATE) - pd.Timedelta(days=1)
                 train_df = all_data_df.loc[:train_end_date][CURRENT_TEMP_COL]
                 fig_train.add_trace(go.Scatter(
                     x=train_df.index, y=train_df,
                     mode='lines', name='Training Data (Actual)',
+                    line=dict(color='#005aa7', width=1) # Blue
                 ))
                 fig_train.update_layout(
                     title="Actual Temperature - Full Training Set",
                     template="plotly_white"
                 )
                 st.plotly_chart(fig_train, use_container_width=True)
+            # --- END NEW ADDITION 2 ---
+            # 4. Plot (Optimal Suggestion)
             st.subheader("Historical Context & Forecast")
+            # Get last 14 days of history
             history_start = selected_date_ts - pd.Timedelta(days=14)
             history_end = selected_date_ts
+            # Get 'temp' data from the original dataframe
             history_df = all_data_df.loc[history_start:history_end][CURRENT_TEMP_COL]
+            # Create dataframe for forecast
             forecast_df = pd.DataFrame({
                 'Date': forecast_dates,
                 'Forecast': predictions
             st.plotly_chart(fig, use_container_width=True)
+            # --- APPLYING LOGIC (2) FROM REFERENCE CODE ---
             st.subheader("5-Day Forecast vs. Actual Comparison")
             if is_partial_forecast:
+                st.info("Cannot draw the Actual vs. Forecast comparison chart because "
+                        "the selected date is too close to the end of the test set (missing 'actual' data).")
             else:
                 fig_comp = go.Figure()
                     template="plotly_white", legend=dict(x=0.01, y=0.99)
                 )
                 st.plotly_chart(fig_comp, use_container_width=True)
+            # --- END APPLYING LOGIC (2) ---
     else:
+        st.warning("Please wait... Loading data or models.")
 # --------------------------------------------------------------------
 elif app_section == "Model Performance & Diagnostics":
+    # --- CHECKLIST ITEM 5 ---
     st.title("Model Performance & Diagnostics")
     if not perf_df.empty and not y_test.empty:
         st.subheader("Performance Degradation over 5 Days")
+        st.markdown("How model performance changes as the forecast horizon increases.")
+        # Filter for Champion model only
         MODEL_NAME = 'Champion (Stacking)'
         champion_perf_df = perf_df[perf_df['Model'] == MODEL_NAME].copy()
+        # 1. Performance Degradation Plots (RMSE & R2)
+        # --- CUSTOMIZATION ---
+        # Ensure 'RMSE' and 'R2' column names are correct for 'final_5_day_results_df.csv'
         RMSE_COL_NAME = 'RMSE (Absolute Error)'
         R2_COL_NAME = 'R-squared'
         col1, col2 = st.columns(2)
         with col1:
             fig_rmse = diag.plot_performance_degradation(
+                champion_perf_df, # Use filtered df
                 metric_column=RMSE_COL_NAME,
                 metric_name='RMSE (Temperature °C)',
                 color='blue'
             st.plotly_chart(fig_rmse, use_container_width=True)
         with col2:
             fig_r2 = diag.plot_performance_degradation(
+                champion_perf_df, # Use filtered df
                 metric_column=R2_COL_NAME,
                 metric_name='R-squared (R²)',
                 color='green'
             )
             st.plotly_chart(fig_r2, use_container_width=True)
+        # 2. Forecast vs. Actual Plots
         st.subheader("Forecast vs. Actual Comparison (on entire test set)")
+        # This function runs predictions on the *entire* X_test (thousands of rows)
+        # It will be slow without caching
         @st.cache_data
         def get_full_test_predictions(_models, _X_test):
+            """Run predictions on the entire test set and cache the results."""
             all_preds = {}
             for i in range(5):
                 model = _models[i]
             )
             st.plotly_chart(fig_d5, use_container_width=True)
+        # 3. Optional: Deep Dive Expander
         with st.expander("Champion Model Diagnostics (Deep Dive)"):
+            st.markdown("Detailed analysis of residuals (error = actual - predicted) for the Day 1 forecast.")
             y_true_d1 = y_test['Day 1']
             y_pred_d1 = y_pred_test['Day 1']
                 y_true_d1, y_pred_d1, "Day 1"
             )
             st.plotly_chart(fig_res_dist, use_container_width=True)
+            st.markdown("A good model will have residuals (errors) normally distributed (bell curve) "
+                        "around 0 and show no pattern over time.")
     else:
+        st.warning("Loading performance data...")