| |
| import streamlit as st |
| import pandas as pd |
| import joblib |
| import plotly.graph_objects as go |
| from datetime import datetime |
| from typing import List |
| import numpy as np |
|
|
| |
# Project-local helper modules used by the leaderboard (tab 1) and diagnostics
# (tab 3). The app cannot render without them, so report the failure and halt.
try:
    from src import benchmark_utils
    from src import diagnostic_plots as diag
except ImportError as import_error:
    # Show the underlying exception as well: ImportError is also raised when a
    # module exists but one of *its* imports fails, which the fixed
    # "could not find" text alone would misreport.
    st.error(
        "Error: Could not find 'src/benchmark_utils.py' or 'src/diagnostic_plots.py'. "
        "Please ensure they exist in the 'src/' directory. "
        f"(Import error: {import_error})"
    )
    st.stop()
|
|
| |
# Browser-tab title/icon and a wide layout for every tab of the app.
_page_settings = {
    "page_title": "Saigon Temperature Forecast",
    "page_icon": "🌦️",
    "layout": "wide",
}
st.set_page_config(**_page_settings)
|
|
| |
def load_css():
    """Inject the app's custom high-contrast 'weather' stylesheet.

    The CSS is sent through ``st.markdown`` with ``unsafe_allow_html=True`` so
    the ``<style>`` element reaches the page. The selectors target Streamlit's
    generated markup (``data-testid`` / ``data-baseweb`` attributes).
    """
    stylesheet = """
    <style>
    /* ===== FONT CHUNG ===== */
    .stApp, .stSidebar {
        font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
    }

    /* ===== NỀN CHÍNH (MAIN BACKGROUND) ===== */
    [data-testid="stAppViewContainer"] {
        background-image: linear-gradient(to bottom, #B0E0E6, #F0F8FF);
        background-attachment: fixed;
        background-size: cover;
    }

    /* ===== 1. THANH CHỌN TAB (st.tabs) ===== */
    /* Tab không được chọn */
    button[data-baseweb="tab"][aria-selected="false"] {
        background-color: rgba(255, 255, 255, 0.7) !important; /* Nền mờ */
        color: #0E2A47 !important; /* Chữ đậm */
        border-top-left-radius: 8px;
        border-top-right-radius: 8px;
        padding: 12px 16px !important; /* <<< THÊM PADDING */
    }

    /* Tab ĐANG ĐƯỢC CHỌN */
    button[data-baseweb="tab"][aria-selected="true"] {
        background-color: #FFFFFF !important; /* Nền TRẮNG ĐỤC */
        color: #004080 !important; /* Chữ MÀU XANH ĐẬM */
        font-weight: 700 !important;
        border-top-left-radius: 8px;
        border-top-right-radius: 8px;
        border-bottom: 3px solid #004080 !important; /* Viền xanh đậm */
        padding: 12px 16px !important; /* <<< THÊM PADDING */
    }

    /* ===== 2. THẺ DỰ BÁO (METRIC CARDS) ===== */
    div[data-testid="stMetric"] {
        background-color: rgba(255, 255, 255, 0.95) !important; /* Nền trắng (đục hơn) */
        border: 1px solid #B0C4DE; /* Thêm viền (xanh nhạt) */
        border-radius: 12px;
        padding: 20px;
        box-shadow: 0 4px 15px rgba(0, 0, 0, 0.1) !important; /* Đổ bóng đậm hơn */
        backdrop-filter: blur(5px);
        transition: transform 0.2s ease;
    }
    div[data-testid="stMetric"]:hover {
        transform: translateY(-3px);
        box-shadow: 0 6px 20px rgba(0, 0, 0, 0.15) !important;
    }

    /* Tiêu đề thẻ (Forecast for...) - đã có tương phản tốt */
    div[data-testid="stMetricLabel"] p {
        font-size: 1.1rem !important;
        font-weight: 600 !important;
        color: #333333; /* Xám đậm */
    }
    /* Giá trị nhiệt độ - đã có tương phản tốt */
    div[data-testid="stMetricValue"] {
        font-size: 2.8rem !important;
        font-weight: 700 !important;
        color: #004080; /* Xanh navy đậm */
    }
    /* Giá trị "Actual" (delta) - đã có tương phản tốt */
    div[data-testid="stMetricDelta"] {
        font-size: 1rem !important;
        font-weight: 600 !important;
        color: #555555; /* Xám vừa */
    }

    /* ===== 3. TIÊU ĐỀ (HEADINGS) ===== */
    h1, h2, h3 {
        color: #004080 !important; /* Dùng chung màu XANH ĐẬM NHẤT */
        text-shadow: 1px 1px 4px rgba(0, 0, 0, 0.15) !important; /* Thêm đổ bóng ĐEN (thay vì trắng) */
    }

    /* ===== 4. BẢNG (DATAFRAME) ===== */
    .stDataFrame {
        background-color: #FFFFFF; /* Nền TRẮNG ĐỤC */
        border: 1px solid #CCCCCC !important; /* Viền xám nhạt */
        border-radius: 8px;
        overflow: hidden;
    }
    /* Tiêu đề của bảng */
    [data-testid="stDataGridHeader"] {
        background-color: #F0F8FF; /* Nền header (Alice Blue) */
        color: #004080; /* Chữ xanh đậm */
    }

    /* ===== 5. BIỂU ĐỒ (PLOTLY) ===== */
    .plotly-graph-div {
        background-color: #FFFFFF; /* Nền TRẮNG ĐỤC */
        border: 1px solid #E0E0E0; /* Viền xám rất nhạt */
        border-radius: 8px;
    }

    /* ===== 6. VĂN BẢN THÔNG THƯỜNG (PARAGRAPH & MARKDOWN) ===== */
    /* Quy tắc này áp dụng cho văn bản st.markdown và các đoạn văn bản khác */
    .stMarkdown, p, li {
        color: #333333 !important; /* Xám đen, tương phản tốt trên nền sáng */
        font-size: 1.05rem; /* Có thể thêm tùy chọn để chữ lớn hơn một chút */
    }

    /* SAFE DataFrame Styling */
    [data-testid="stDataFrame"] {
        border: 1px solid #CCCCCC !important;
        border-radius: 8px !important;
        background-color: #FFFFFF !important;
    }

    /* ===== EXPANDERS (vẫn giữ như cũ) ===== */
    div[data-testid="stExpander"] {
        background-color: rgba(255, 255, 255, 0.9) !important;
        border-radius: 10px !important;
        border: 1px solid rgba(0, 0, 0, 0.1) !important;
    }

    </style>
    """
    st.markdown(stylesheet, unsafe_allow_html=True)


# Apply the stylesheet on every script run, before any widget is drawn.
load_css()
| |
|
|
| |
| |
|
|
@st.cache_data
def load_hourly_performance_data(file_path="data/hourly_120h_evaluation_results.csv"):
    """Read the per-horizon hourly evaluation CSV and number its rows.

    Adds a 1-based ``Horizon`` column derived from the row position
    (row 0 -> horizon 1, and so on).

    Args:
        file_path: Location of the evaluation CSV.

    Returns:
        The loaded DataFrame with the extra ``Horizon`` column, or an empty
        DataFrame (after a Streamlit warning) when the file does not exist.
    """
    try:
        evaluation = pd.read_csv(file_path)
    except FileNotFoundError:
        st.warning(f"Warning: Hourly Performance data not found at: {file_path}. Cannot show degradation plot.")
        return pd.DataFrame()

    # read_csv yields a 0-based positional index; horizons are counted from 1.
    evaluation['Horizon'] = evaluation.index + 1
    return evaluation
|
|
@st.cache_data
def load_hourly_data(file_path="data/final_hourly_feature_dataset.csv"):
    """Load the hourly feature dataset indexed by timestamp.

    The ``datetime`` column is parsed, promoted to the index and the frame is
    sorted chronologically.

    Args:
        file_path: Location of the hourly CSV.

    Returns:
        The time-indexed DataFrame, or an empty DataFrame (after an
        ``st.error`` message) if the file is missing, the ``datetime`` column
        is absent, or any other error occurs while loading.
    """
    timestamp_col = 'datetime'
    try:
        hourly = pd.read_csv(file_path)

        if timestamp_col in hourly.columns:
            hourly[timestamp_col] = pd.to_datetime(hourly[timestamp_col])
            return hourly.set_index(timestamp_col).sort_index()

        st.error(f"Error: Date column '{timestamp_col}' not found in hourly data CSV. Please check the column name.")
        return pd.DataFrame()
    except FileNotFoundError:
        st.error(f"ERROR: Hourly data file not found at: {file_path}. Please check the path and file name.")
        return pd.DataFrame()
    except Exception as e:
        st.error(f"An unexpected error occurred while loading hourly data: {e}")
        return pd.DataFrame()
|
|
@st.cache_resource
def load_24_hourly_models():
    """Load the 24 per-horizon hourly models (T+1h to T+24h) used by the hourly tab.

    Models are read from ``models/lgbm_model_target_temp_next_{h}h.pkl`` for
    ``h`` in 1..24.

    Returns:
        dict mapping horizon ``h`` (int) to the loaded model. If any file is
        missing, an ``st.error`` is shown and an empty dict is returned, so
        the result is always either complete or empty.
    """
    num_horizons = 24
    hourly_models = {}

    try:
        for h in range(1, num_horizons + 1):
            # Every horizon uses the same file-name pattern.
            # NOTE: joblib.load unpickles the file; only load trusted model files.
            hourly_models[h] = joblib.load(f"models/lgbm_model_target_temp_next_{h}h.pkl")
    except FileNotFoundError as e:
        st.error(f"ERROR: Missing hourly model file: {e.filename}. Cannot generate full hourly graph.")
        return {}

    return hourly_models
|
|
@st.cache_data
def load_feature_data(file_path="data/final_dataset_tree.csv"):
    """Load the daily feature/target dataset indexed by date.

    The ``datetime`` column is parsed, promoted to the index and the frame is
    sorted chronologically.

    Args:
        file_path: Location of the daily CSV.

    Returns:
        The date-indexed DataFrame, or an empty DataFrame (after an
        ``st.error`` message) if the file is missing, malformed, or has no
        ``datetime`` column.
    """
    DATE_COLUMN = 'datetime'

    try:
        df = pd.read_csv(file_path)

        if DATE_COLUMN not in df.columns:
            # Report the file that was actually read, not a hard-coded name.
            st.error(f"Error: Date column '{DATE_COLUMN}' not found in '{file_path}'. "
                     f"Please update the DATE_COLUMN variable in 'app.py'.")
            return pd.DataFrame()

        df[DATE_COLUMN] = pd.to_datetime(df[DATE_COLUMN])
        return df.set_index(DATE_COLUMN).sort_index()
    except FileNotFoundError:
        st.error(f"ERROR: Main data file not found at: {file_path}")
        return pd.DataFrame()
    except ValueError as e:
        # pandas' parser errors and unparseable dates are ValueError subclasses;
        # surface them as a message instead of a raw traceback.
        st.error(f"ERROR: Could not parse main data file '{file_path}': {e}")
        return pd.DataFrame()
|
|
@st.cache_resource
def load_champion_models():
    """Load the five daily stacking models, one per forecast day.

    Files are read from ``models/champion_stacking_day{1..5}.pkl`` in day order.

    Returns:
        A list of five models (index 0 is day 1), or an empty list after an
        ``st.error`` message if any file is missing.
    """
    day_numbers = range(1, 6)
    try:
        return [joblib.load(f"models/champion_stacking_day{day}.pkl") for day in day_numbers]
    except FileNotFoundError as e:
        st.error(f"ERROR: Model file not found. Checked: {e.filename}. "
                 "Ensure the 5 .pkl files are in the 'models/' directory.")
        return []
|
|
@st.cache_data
def load_performance_data(file_path="data/final_5_day_results_df.csv"):
    """Read the pre-computed 5-day performance table shown in the diagnostics tab.

    Args:
        file_path: Location of the results CSV.

    Returns:
        The CSV contents as a DataFrame, or an empty DataFrame after an
        ``st.error`` message when the file does not exist.
    """
    try:
        return pd.read_csv(file_path)
    except FileNotFoundError:
        st.error(f"ERROR: Performance file not found at: {file_path}")
        return pd.DataFrame()
|
|
| |
|
|
| |
# Load everything the daily tabs need; each loader is cached by Streamlit.
all_data_df = load_feature_data()
models = load_champion_models()
perf_df = load_performance_data()

# One target column per forecast day: 'temp_next_1_day' ... 'temp_next_5_day'.
TARGET_COLS = [f'temp_next_{day}_day' for day in range(1, 6)]
CURRENT_TEMP_COL = 'temp'

# Inclusive date bounds of the test window used for slicing below.
TEST_START_DATE = "2024-02-18"
TEST_END_DATE = "2025-09-26"

# Start from empty frames so later code can rely on `.empty` checks even if
# the split below fails.
X_test = pd.DataFrame()
y_test = pd.DataFrame()
test_df = pd.DataFrame()

if all_data_df.empty:
    st.error("Could not load main data, application cannot continue.")
    st.stop()
else:
    try:
        test_df = all_data_df.loc[TEST_START_DATE:TEST_END_DATE].copy()

        # Every non-target column is treated as a model input.
        feature_cols = [col for col in all_data_df.columns if col not in TARGET_COLS]

        X_test = test_df[feature_cols]
        y_test = test_df[TARGET_COLS]

        # Friendlier labels ('Day 1' ... 'Day 5') for the diagnostics tab.
        y_test.columns = [f'Day {i}' for i in range(1, 6)]
    except KeyError:
        st.error(f"Error: Target columns (e.g., '{TARGET_COLS[0]}') or "
                 f"'{CURRENT_TEMP_COL}' column not found in CSV. Please update 'app.py'.")
    except Exception as e:
        st.error(f"Error processing test set: {e}")
|
|
|
|
| |
# Target columns excluded from the hourly feature matrix (24h steps up to 120h).
# NOTE(review): the hourly models are named target_temp_next_{1..24}h; if the
# dataset also carries those target columns they are NOT excluded here and
# would be passed to the models as features - confirm against the CSV schema.
HOURLY_TARGET_COLS = [f'target_temp_next_{hours}h' for hours in (24, 48, 72, 96, 120)]

hourly_data_df = load_hourly_data(file_path="data/final_hourly_feature_dataset.csv")
hourly_perf_df = load_hourly_performance_data(file_path="data/hourly_120h_evaluation_results.csv")
hourly_models_24h = load_24_hourly_models()

if hourly_data_df.empty:
    X_test_hourly = pd.DataFrame()
else:
    HOURLY_FEATURE_COLS = [col for col in hourly_data_df.columns if col not in HOURLY_TARGET_COLS]

    # Same test window as the daily data, restricted to the feature columns.
    X_test_hourly = hourly_data_df.loc[TEST_START_DATE:TEST_END_DATE][HOURLY_FEATURE_COLS].copy()

    # Drop these two columns when present (errors='ignore' tolerates absence).
    columns_to_drop_objects = ['sunrise', 'sunset']
    X_test_hourly = X_test_hourly.drop(columns=columns_to_drop_objects, errors='ignore')

    # Keep the feature list in sync with what was actually retained.
    HOURLY_FEATURE_COLS = list(X_test_hourly.columns)
|
|
| |
def predict_next_24_hours(input_features: pd.DataFrame, models: dict) -> List[float]:
    """Predict one temperature per hourly horizon from a single feature row.

    Args:
        input_features: Feature rows passed to each model's ``predict``; the
            first prediction of each model is used.
        models: Mapping of horizon ``h`` (1..N) to an object with a
            ``predict`` method.

    Returns:
        A list with one value per horizon, in horizon order. A horizon whose
        model is missing from ``models`` or whose ``predict`` call fails
        yields ``nan``.

        When ``input_features`` is empty or no models are available, a
        deterministic synthetic curve is returned instead: a sine wave plus
        seeded noise around the last known ``temp`` (28.0 if there is no
        input). Its length is ``len(models)``, or 24 when no models were
        supplied.
    """
    default_horizons = 24
    # Deriving the length only from len(models) made the no-model fallback
    # return an empty list; fall back to the nominal 24-hour horizon instead.
    num_horizons = len(models) if models else default_horizons

    if input_features.empty or not models:
        last_temp = input_features['temp'].iloc[-1] if not input_features.empty else 28.0

        # A private RandomState(42) yields the same draws as seeding the
        # global generator with 42, without resetting global random state.
        rng = np.random.RandomState(42)
        return [
            last_temp + 1.5 * np.sin(2 * np.pi * (h + 10) / 24) + rng.normal(0, 0.5)
            for h in range(num_horizons)
        ]

    predictions = []
    for h in range(1, num_horizons + 1):
        try:
            predictions.append(models[h].predict(input_features)[0])
        except Exception:
            # Best effort: one failing horizon must not blank the whole chart.
            # Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit through.
            predictions.append(float('nan'))

    return predictions
|
|
| |
| |
|
|
| |
|
|
| |
# The four top-level tabs; each is filled in by its own `with` section below.
_tab_titles = [
    "📑 Project Overview & Methodology",
    "🌦️ Live 5-Day Forecast",
    "📊 Model Performance & Diagnostics",
    "⏱️ Hourly Prediction",
]
tab1, tab2, tab3, tab4 = st.tabs(_tab_titles)


# ----- Tab 1: static project description plus the model leaderboard -----
with tab1:
    st.title("Saigon Temperature Forecasting Application 🌦️")

    st.image(
        "https://image.vietnam.travel/sites/default/files/2023-03/shutterstock_626352947_0.jpg?v=1762135399",
        caption="Ho Chi Minh City. Credit: Vietnam Tourism",
        use_container_width=True,
    )

    st.subheader("Project Summary")
    st.markdown("""
    The goal of this project is to forecast the average daily temperature for Ho Chi Minh City for the next 5 days.

    * **Data:** 10 years of historical weather data from Visual Crossing.
    * **Model:** We use 5 'specialist' models - each model is optimized to predict a specific future day (T+1 to T+5).
    """)

    st.subheader("🚀 Our 'Two-Stream' Strategy")
    st.markdown("""
    To optimize performance, we applied a "Two-Stream" strategy:
    1. **Stream 1 (Linear Models):** Linear models (like Linear Regression) were trained on a feature set pruned using VIF to avoid multicollinearity.
    2. **Stream 2 (Tree-based Models):** More complex models (like Random Forest, Gradient Boosting) were trained on a comprehensive set of 156 features to capture non-linear relationships.

    Our Champion Model is a **Stacking** model from Stream 2, which demonstrated superior performance.
    """)

    st.subheader("🏆 Final Model Leaderboard")
    st.markdown("Model leaderboard ranked by average RMSE score (lower is better).")

    leaderboard_df = benchmark_utils.load_leaderboard()

    if leaderboard_df.empty:
        st.warning("Could not load leaderboard data.")
    else:
        # Show only the first ten rows, relabelled 1..N so the index reads as a rank.
        top_10_df = leaderboard_df.head(10).reset_index(drop=True)
        top_10_df.index = range(1, len(top_10_df) + 1)
        st.dataframe(top_10_df, use_container_width=True)
|
|
|
|
| |
|
|
| |
# ----- Tab 2: pick a test-set date and show the 5-day forecast for it -----
with tab2:
    st.title("Live 5-Day Forecast")

    # --- Date picker, limited to the dates present in the test set ---
    st.subheader("Forecast Input")
    selected_date = None

    if X_test.empty:
        st.error("Test data could not be loaded.")
    else:
        min_date = X_test.index.min()
        max_date = X_test.index.max()

        selected_date = st.date_input(
            "Select a date from the test set:",
            value=min_date,
            min_value=min_date,
            max_value=max_date,
            format="YYYY-MM-DD"
        )

    st.divider()

    if selected_date and not X_test.empty and models:
        st.header(f"5-Day Forecast from: {selected_date.strftime('%Y-%m-%d')}")

        selected_date_ts = pd.Timestamp(selected_date)

        # Double brackets keep a one-row DataFrame (what model.predict expects).
        if selected_date_ts in X_test.index:
            input_features = X_test.loc[[selected_date_ts]]
        else:
            input_features = pd.DataFrame()

        if input_features.empty:
            # Reported once; the original emitted two near-identical errors here.
            st.error("Data not found for the selected date.")
        else:
            # --- One prediction per specialist model (index i -> day i+1) ---
            predictions = [models[i].predict(input_features)[0] for i in range(5)]

            # The five days following the selected date.
            forecast_dates = pd.date_range(start=selected_date, periods=6, freq='D')[1:]
            cols = st.columns(5)

            # The targets stored on the selected row are the "actual" future
            # temperatures; NaN means that day is not yet in the dataset.
            if selected_date_ts in all_data_df.index:
                actual_row = all_data_df.loc[selected_date_ts]
                actual_values = [actual_row[col_name] for col_name in TARGET_COLS]
            else:
                actual_values = [float('nan')] * 5

            is_partial_forecast = any(pd.isna(v) for v in actual_values)

            for i in range(5):
                with cols[i]:
                    actual_val = actual_values[i]
                    delta_text = f"Actual: {actual_val:.1f}°C" if pd.notna(actual_val) else "Actual: --"

                    st.metric(
                        label=f"Forecast for {forecast_dates[i].strftime('%b %d')}",
                        value=f"{predictions[i]:.1f}°C",
                        delta=delta_text,
                        delta_color="off"
                    )

            # --- Rule-of-thumb explanations derived from two input features ---
            st.subheader("Forecast Insights (Why?)")

            temp_lag_1 = input_features['temp_lag_1'].iloc[0]
            precip_today = input_features['precip'].iloc[0]

            if temp_lag_1 > 30:
                st.info(f"💡 Insight: Yesterday was very hot ({temp_lag_1:.1f}°C). The model is using this strong 'persistence' signal for tomorrow's forecast.")
            elif temp_lag_1 < 25:
                st.info(f"💡 Insight: Yesterday was cool ({temp_lag_1:.1f}°C). This is likely pulling the initial forecast down.")

            if precip_today > 10:
                st.info(f"💡 Insight: The selected day had {precip_today:.1f}mm of rain. This humidity and cloud cover is factored into the forecast.")
            elif 25 <= temp_lag_1 <= 30:
                # temp_lag_1 is always bound here, so the former
                # "'temp_lag_1' not in locals()" test was dead and is removed.
                st.info("💡 Insight: Weather conditions appear stable. The forecast is primarily driven by seasonal trends and recent temperature history.")

            # --- Feature inspector: selected raw inputs for the chosen day ---
            st.markdown("---")

            with st.expander("🔍 Feature Inspector: What the Model Saw on this Day"):
                # input_features is known to be non-empty in this branch, so no
                # extra emptiness check is needed.
                col1, col2, col3 = st.columns(3)

                # (container, heading, [(label, column, value format), ...])
                inspector_layout = [
                    (col1, "Core Conditions", [
                        ("Today's Avg Temp (temp)", 'temp', "{:.1f}°C"),
                        ("Today's 'Feels Like' (feelslike)", 'feelslike', "{:.1f}°C"),
                        ("Humidity", 'humidity', "{:.1f}%"),
                        ("Cloud Cover", 'cloudcover', "{:.1f}%"),
                        ("Precipitation", 'precip', "{:.1f} mm"),
                    ]),
                    (col2, "Recent History", [
                        ("Temp Yesterday (temp_lag_1)", 'temp_lag_1', "{:.1f}°C"),
                        ("7-Day Avg Temp (temp_roll_7d_mean)", 'temp_roll_7d_mean', "{:.1f}°C"),
                        ("7-Day Total Rainfall (precip_roll_7d_sum)", 'precip_roll_7d_sum', "{:.1f} mm"),
                        ("14-Day Temp Volatility (temp_roll_14d_std)", 'temp_roll_14d_std', "{:.2f}°C"),
                    ]),
                    (col3, "Seasonal Context", [
                        ("Day of Year", 'day_of_year', "{}"),
                        ("Sea Level Pressure", 'sealevelpressure', "{:.1f} hPa"),
                        ("Wind Speed", 'windspeed', "{:.1f} km/h"),
                        ("Wind Direction", 'winddir', "{:.0f}°"),
                    ]),
                ]
                for container, heading, metric_specs in inspector_layout:
                    with container:
                        st.subheader(heading)
                        for metric_label, column_name, value_format in metric_specs:
                            st.metric(
                                label=metric_label,
                                value=value_format.format(input_features[column_name].iloc[0])
                            )

            # --- Full training-period temperature series (everything before the test window) ---
            st.subheader("Training Set Overview")
            with st.expander("Show plot of all training data (before 2024-02-18)"):
                train_end_date = pd.Timestamp(TEST_START_DATE) - pd.Timedelta(days=1)
                train_df = all_data_df.loc[:train_end_date][CURRENT_TEMP_COL]

                fig_train = go.Figure()
                fig_train.add_trace(go.Scatter(
                    x=train_df.index, y=train_df,
                    mode='lines', name='Training Data (Actual)',
                    line=dict(color='#005aa7', width=1)
                ))
                fig_train.update_layout(
                    title="Actual Temperature - Full Training Set",
                    xaxis_title="Date", yaxis_title="Temperature (°C)",
                    template="plotly_white",
                    xaxis_rangeslider_visible=True,
                    yaxis_fixedrange=True
                )
                st.plotly_chart(fig_train, use_container_width=True)

            # --- Previous 14 days of actuals followed by the 5-day forecast ---
            st.subheader("Historical Context & Forecast")

            history_start = selected_date_ts - pd.Timedelta(days=14)
            history_end = selected_date_ts
            history_df = all_data_df.loc[history_start:history_end][CURRENT_TEMP_COL]

            forecast_df = pd.DataFrame({
                'Date': forecast_dates,
                'Forecast': predictions
            }).set_index('Date')

            fig = go.Figure()
            fig.add_trace(go.Scatter(
                x=history_df.index, y=history_df,
                mode='lines+markers', name='Past 14 Days (Actual)',
                line=dict(color='blue')
            ))
            fig.add_trace(go.Scatter(
                x=forecast_df.index, y=forecast_df['Forecast'],
                mode='lines+markers', name='5-Day Forecast',
                line=dict(color='red', dash='dot')
            ))
            fig.update_layout(
                title="Forecast vs. Historical Context",
                xaxis_title="Date", yaxis_title="Temperature (°C)",
                template="plotly_white", legend=dict(x=0.01, y=0.99)
            )
            st.plotly_chart(fig, use_container_width=True)

            # --- Forecast against actuals, when all five actuals exist ---
            st.subheader("5-Day Forecast vs. Actual Comparison")

            fig_comp = go.Figure()
            fig_comp.add_trace(go.Scatter(
                x=forecast_dates, y=predictions,
                mode='lines+markers', name='5-Day Forecast',
                line=dict(color='red', dash='dot')
            ))

            if is_partial_forecast:
                fig_comp.update_layout(title="5-Day Forecast (Actual data not yet available)")
            else:
                fig_comp.add_trace(go.Scatter(
                    x=forecast_dates, y=actual_values,
                    mode='lines+markers', name='5-Day Actual',
                    line=dict(color='blue')
                ))
                fig_comp.update_layout(title="5-Day Forecast vs. Actual Values")

            fig_comp.update_layout(
                xaxis_title="Date", yaxis_title="Temperature (°C)",
                template="plotly_white", legend=dict(x=0.01, y=0.99)
            )
            st.plotly_chart(fig_comp, use_container_width=True)

    elif not selected_date:
        st.warning("Test data could not be loaded.")
    else:
        # A date was chosen and X_test is non-empty, so the only remaining
        # cause is that the champion models failed to load. Nothing is still
        # "loading", so say what actually went wrong.
        st.warning("Forecast models could not be loaded, so no forecast can be shown.")
|
|
| |
|
|
| |
# ----- Tab 3: pre-computed metrics plus test-set diagnostics for the champion model -----
with tab3:
    st.title("Model Performance & Diagnostics")

    if not perf_df.empty and not y_test.empty:
        st.subheader("Performance Degradation over 5 Days")
        st.markdown("How model performance changes as the forecast horizon increases.")

        MODEL_NAME = 'Champion (Stacking)'
        champion_perf_df = perf_df[perf_df['Model'] == MODEL_NAME].copy()

        # Column names expected in the performance CSV.
        RMSE_COL_NAME = 'RMSE (Absolute Error)'
        R2_COL_NAME = 'R-squared'

        col1, col2 = st.columns(2)
        with col1:
            fig_rmse = diag.plot_performance_degradation(
                champion_perf_df,
                metric_column=RMSE_COL_NAME,
                metric_name='RMSE (Temperature °C)',
                color='blue'
            )
            st.plotly_chart(fig_rmse, use_container_width=True)
        with col2:
            fig_r2 = diag.plot_performance_degradation(
                champion_perf_df,
                metric_column=R2_COL_NAME,
                metric_name='R-squared (R²)',
                color='green'
            )
            st.plotly_chart(fig_r2, use_container_width=True)

        st.subheader("Interactive Forecast vs. Actual Comparison")

        if not models:
            # load_champion_models() returns [] on failure; indexing it below
            # would raise IndexError, so stop here with an explanation.
            st.warning("Champion models could not be loaded; forecast comparison "
                       "and diagnostics are unavailable.")
        else:
            selected_horizon = st.slider(
                "Select Forecast Horizon (Day) to inspect:",
                1, 5, 1
            )

            @st.cache_data
            def get_full_test_predictions(_models, _X_test):
                """Predict every test row with each of the five models.

                Both parameters are underscore-prefixed so Streamlit does not
                hash them; the cached result is therefore reused regardless of
                the arguments passed.

                Returns:
                    DataFrame indexed like ``_X_test`` with columns
                    'Day 1' ... 'Day 5'.
                """
                all_preds = {
                    f'Day {i + 1}': _models[i].predict(_X_test)
                    for i in range(5)
                }
                return pd.DataFrame(all_preds, index=_X_test.index)

            with st.spinner("Running predictions on entire test set... (This is cached for next time)"):
                y_pred_test = get_full_test_predictions(models, X_test)

            # Series for the horizon chosen on the slider.
            y_true_selected = y_test[f'Day {selected_horizon}']
            y_pred_selected = y_pred_test[f'Day {selected_horizon}']

            fig_interactive = diag.plot_forecast_vs_actual(
                y_true=y_true_selected,
                y_pred=y_pred_selected,
                day_ahead_title=f"Day {selected_horizon} Forecast"
            )
            st.plotly_chart(fig_interactive, use_container_width=True)

            # --- Residual analysis, fixed to the Day 1 horizon ---
            with st.expander("Champion Model Diagnostics (Deep Dive)"):
                st.markdown("Detailed analysis of residuals (error = actual - predicted) for the Day 1 forecast.")

                y_true_d1 = y_test['Day 1']
                y_pred_d1 = y_pred_test['Day 1']
                dates_d1 = y_test.index

                fig_res_time = diag.plot_residuals_vs_time(
                    y_true_d1, y_pred_d1, dates_d1, "Day 1"
                )
                st.plotly_chart(fig_res_time, use_container_width=True)

                fig_res_dist = diag.plot_residuals_distribution(
                    y_true_d1, y_pred_d1, "Day 1"
                )
                st.plotly_chart(fig_res_dist, use_container_width=True)
                st.markdown("A good model will have residuals (errors) normally distributed (bell curve) "
                            "around 0 and show no pattern over time.")

    else:
        st.warning("Loading performance data...")
|
|
| |
# ----- Tab 4: 24-hour-ahead hourly forecast from a chosen date and hour -----
with tab4:
    st.title("Hourly Prediction (Next 24 Hours)")

    st.subheader("Forecast Start Time")

    if not X_test_hourly.empty:
        min_ts = X_test_hourly.index.min()
        max_ts = X_test_hourly.index.max()

        # --- Date, then an hour that actually exists for that date ---
        selected_date = st.date_input(
            "Select the date:",
            value=max_ts.date(),
            min_value=min_ts.date(),
            max_value=max_ts.date(),
            format="YYYY-MM-DD",
            key="hourly_date_input"
        )

        available_hours_in_day = X_test_hourly[X_test_hourly.index.date == selected_date].index.hour.unique().sort_values()

        if available_hours_in_day.empty:
            st.warning(f"No hourly data found for {selected_date}. Please select a different date.")
            st.stop()

        # Default to the latest hour available on the chosen day.
        default_hour = available_hours_in_day.max()
        default_hour_index = available_hours_in_day.get_loc(default_hour)

        selected_hour = st.selectbox(
            "Select the latest known hour:",
            options=available_hours_in_day.tolist(),
            index=default_hour_index,
            format_func=lambda x: f"{x:02d}:00:00"
        )

        latest_time_for_day = pd.to_datetime(f"{selected_date} {selected_hour:02d}:00:00")

        # Double brackets keep a one-row DataFrame for model.predict.
        input_features_hourly = X_test_hourly.loc[[latest_time_for_day]]

        st.info(f"The model runs based on data up to the latest known hour: **{latest_time_for_day.strftime('%Y-%m-%d %H:%M:%S')}**")
        st.divider()

        # --- Run the per-horizon models ---
        predictions_24h = predict_next_24_hours(input_features_hourly, hourly_models_24h)

        if not predictions_24h:
            # np.nanmax below raises ValueError on an empty sequence, so stop
            # with an explanation instead of a traceback.
            st.error("No hourly predictions are available (hourly models could not be loaded).")
            st.stop()

        t_plus_24h_metric_value = predictions_24h[23] if len(predictions_24h) >= 24 else predictions_24h[-1]

        st.subheader(f"Summary Forecast for Next Day (Starting {latest_time_for_day.strftime('%H:%M')})")

        # A Timedelta step avoids the 'H' frequency alias, which newer pandas deprecates.
        one_hour = pd.Timedelta(hours=1)
        forecast_start_ts = latest_time_for_day + one_hour

        t_plus_2h_value = predictions_24h[1] if len(predictions_24h) >= 2 else float('nan')
        t_plus_3h_value = predictions_24h[2] if len(predictions_24h) >= 3 else float('nan')

        avg_temp = np.nanmean(predictions_24h)
        max_temp = np.nanmax(predictions_24h)

        # --- Actual temperatures for the forecast window (NaN where not recorded) ---
        actual_hourly_index = pd.date_range(start=forecast_start_ts, periods=24, freq=one_hour)

        try:
            actual_temps_24h_series = hourly_data_df['temp'].reindex(actual_hourly_index)
        except (KeyError, ValueError):
            # KeyError: no 'temp' column; ValueError: e.g. reindexing a
            # duplicated index. Either way, show the forecast without actuals.
            actual_temps_24h_series = pd.Series([float('nan')] * 24, index=actual_hourly_index)

        forecast_t2_ts = forecast_start_ts + pd.Timedelta(hours=1)
        forecast_t3_ts = forecast_start_ts + pd.Timedelta(hours=2)
        forecast_t24_ts = forecast_start_ts + pd.Timedelta(hours=23)

        actual_t2_val = actual_temps_24h_series.get(forecast_t2_ts)
        actual_t3_val = actual_temps_24h_series.get(forecast_t3_ts)
        actual_t24_val = actual_temps_24h_series.get(forecast_t24_ts)

        # Aggregates are only shown when every hour of the window has an actual value.
        is_partial_hourly = actual_temps_24h_series.isna().any()

        if is_partial_hourly:
            actual_avg_val = float('nan')
            actual_max_val = float('nan')
        else:
            actual_avg_val = np.nanmean(actual_temps_24h_series)
            actual_max_val = np.nanmax(actual_temps_24h_series)

        def _actual_label(actual_value):
            """Format the 'Actual: ...' caption, using '--' for missing values."""
            return f"Actual: {actual_value:.1f}°C" if pd.notna(actual_value) else "Actual: --"

        delta_t2 = _actual_label(actual_t2_val)
        delta_t3 = _actual_label(actual_t3_val)
        delta_t24 = _actual_label(actual_t24_val)
        delta_avg = _actual_label(actual_avg_val)
        delta_max = _actual_label(actual_max_val)

        # --- Five summary cards: T+2h, T+3h, T+24h, 24h mean, 24h max ---
        col_t2, col_t3, col_t24, col_avg, col_max = st.columns(5)

        summary_cards = [
            (col_t2, f"Forecast @ {forecast_t2_ts.strftime('%H:%M')} (T+2H)", t_plus_2h_value, delta_t2),
            (col_t3, f"Forecast @ {forecast_t3_ts.strftime('%H:%M')} (T+3H)", t_plus_3h_value, delta_t3),
            (col_t24, f"Forecast @ {forecast_t24_ts.strftime('%H:%M')} (T+24H)", t_plus_24h_metric_value, delta_t24),
            (col_avg, "Next 24h Average Temp", avg_temp, delta_avg),
            (col_max, "Next 24h Max Temp", max_temp, delta_max),
        ]
        for card_column, card_label, card_value, card_delta in summary_cards:
            with card_column:
                st.metric(
                    label=card_label,
                    value=f"{card_value:.1f}°C",
                    delta=card_delta,
                    delta_color="off"
                )

        # --- Previous 24 hours of actuals followed by the forecast ---
        st.subheader("Historical Context & Forecast (Hourly)")

        history_start_ts = latest_time_for_day - pd.Timedelta(hours=23)
        history_end_ts = latest_time_for_day

        history_df_hourly = hourly_data_df.loc[history_start_ts:history_end_ts]['temp']

        forecast_hourly_index = pd.date_range(start=forecast_start_ts, periods=len(predictions_24h), freq=one_hour)
        forecast_df_hourly = pd.DataFrame({
            'Time': forecast_hourly_index,
            'Forecast': predictions_24h
        }).set_index('Time')

        fig_hist_hourly = go.Figure()
        fig_hist_hourly.add_trace(go.Scatter(
            x=history_df_hourly.index, y=history_df_hourly,
            mode='lines+markers', name='Past 24 Hours (Actual)',
            line=dict(color='blue')
        ))
        fig_hist_hourly.add_trace(go.Scatter(
            x=forecast_df_hourly.index, y=forecast_df_hourly['Forecast'],
            mode='lines+markers', name='Next 24 Hours (Forecast)',
            line=dict(color='red', dash='dot')
        ))
        fig_hist_hourly.update_layout(
            title="Hourly Forecast vs. Historical Context",
            xaxis_title="Time", yaxis_title="Temperature (°C)",
            template="plotly_white", legend=dict(x=0.01, y=0.99)
        )
        st.plotly_chart(fig_hist_hourly, use_container_width=True)

        # --- Forecast against actuals, when the whole window is available ---
        st.subheader("24-Hour Forecast vs. Actual Comparison")

        actual_values_24h = actual_temps_24h_series.values

        fig_comp_hourly = go.Figure()
        fig_comp_hourly.add_trace(go.Scatter(
            x=forecast_hourly_index, y=predictions_24h,
            mode='lines+markers', name='24-Hour Forecast',
            line=dict(color='red', dash='dot')
        ))

        if is_partial_hourly:
            fig_comp_hourly.update_layout(title="24-Hour Forecast (Actual data not yet available)")
        else:
            fig_comp_hourly.add_trace(go.Scatter(
                x=forecast_hourly_index, y=actual_values_24h,
                mode='lines+markers', name='24-Hour Actual',
                line=dict(color='blue')
            ))
            fig_comp_hourly.update_layout(title="24-Hour Forecast vs. Actual Values")

        fig_comp_hourly.update_layout(
            xaxis_title="Time", yaxis_title="Temperature (°C)",
            template="plotly_white", legend=dict(x=0.01, y=0.99)
        )
        st.plotly_chart(fig_comp_hourly, use_container_width=True)

        # --- RMSE per horizon for the first 24 rows of the evaluation table ---
        st.subheader("Model Reliability: Error Degradation")
        if not hourly_perf_df.empty:
            df_plot = hourly_perf_df.head(24)

            fig_rmse_hourly = go.Figure()
            fig_rmse_hourly.add_trace(go.Scatter(
                x=df_plot['Horizon'],
                y=df_plot['RMSE'],
                mode='lines+markers',
                name='RMSE',
                line=dict(color='#005aa7')
            ))
            fig_rmse_hourly.update_layout(
                title="RMSE Degradation: Forecast Error vs. Hour Ahead (T+1h to T+24h)",
                xaxis_title="Forecast Horizon (Hours)",
                yaxis_title="RMSE (°C)",
                template="plotly_white",
                # 10% headroom above the largest RMSE.
                yaxis_range=[0, df_plot['RMSE'].max() * 1.1 if not df_plot['RMSE'].empty else 1],
                height=400
            )
            st.plotly_chart(fig_rmse_hourly, use_container_width=True)
        else:
            st.warning("Could not load Hourly RMSE Degradation data from hourly_120h_evaluation_results.csv.")

        # --- Feature inspector: a fixed shortlist of inputs, three per row ---
        st.markdown("---")
        with st.expander("🔍 Feature Inspector: Hourly Inputs for the Forecast"):
            if not input_features_hourly.empty:
                important_hourly_features = [
                    'temp', 'humidity', 'windspeed', 'cloudcover',
                    'temp_lag_1h', 'humidity_lag_24h', 'temp_diff_24h',
                    'temp_roll_24h_mean', 'humidity_roll_24h_mean',
                    'hour_sin', 'day_of_year_sin'
                ]

                col_h1, col_h2, col_h3 = st.columns(3)
                inspector_columns = [col_h1, col_h2, col_h3]

                for i, feature in enumerate(important_hourly_features):
                    # Features missing from the dataset are skipped silently.
                    if feature in input_features_hourly.columns:
                        value = input_features_hourly[feature].iloc[0]
                        label = feature.replace('_', ' ').title()

                        with inspector_columns[i % 3]:
                            st.metric(label=label, value=f"{value:.2f}")
            else:
                st.warning("No hourly feature data available for the selected hour.")

    else:
        # Nothing is still loading at this point: the hourly test frame is empty.
        st.warning("Hourly test data could not be loaded, so no hourly forecast can be shown.")