# app.py — uploaded by Harveyntt (commit 46c22fb, verified)
# --- 1. IMPORT LIBRARIES ---
import streamlit as st
import pandas as pd
import joblib
import plotly.graph_objects as go
# NOTE: `datetime` is imported below for the time-of-day emoji helper; dataset timestamps use pandas.
from typing import List
import numpy as np
import requests
import base64
import plotly.express as px
from datetime import datetime
import os
import feature_engineering_live as fe_live
import pytz
def _align_features_for_model(model, live_df: pd.DataFrame, reference_columns: pd.Index):
    """Build a copy of `live_df` whose columns match what `model` expects.

    Resolution order:
      1. `model.feature_names_in_` -> exactly those columns, in that order;
         columns absent from `live_df` are created and filled with NaN.
      2. `model.n_features_in_` -> the first N entries of `reference_columns`
         that are present in `live_df`, padded with NaN `_pad_col_<i>` columns
         when fewer than N are available.
      3. Neither attribute -> the columns shared with `reference_columns`
         (in reference order), or the whole numeric-coerced frame when
         nothing is shared.

    Every column is first passed through `pd.to_numeric(errors='coerce')`,
    so values that cannot be parsed become NaN. `live_df` is never mutated.
    """
    frame = live_df.copy()

    # Best-effort numeric coercion; a column that cannot be converted at all
    # is simply left as it was.
    for name in frame.columns:
        try:
            frame[name] = pd.to_numeric(frame[name], errors='coerce')
        except Exception:
            pass

    # Case 1: the model advertises explicit feature names.
    try:
        wanted = getattr(model, 'feature_names_in_', None)
        wanted = list(wanted) if wanted is not None else None
    except Exception:
        wanted = None

    if wanted is not None:
        for name in wanted:
            if name not in frame.columns:
                frame[name] = np.nan
        return frame[wanted]

    # Case 2: only the feature count is known.
    width = getattr(model, 'n_features_in_', None)
    if width is not None:
        present = [col for col in reference_columns if col in frame.columns]
        picked = present[:int(width)]
        # Top up with placeholder NaN columns so the frame has `width` columns.
        for pos in range(len(picked), int(width)):
            filler = f'_pad_col_{pos}'
            frame[filler] = np.nan
            picked.append(filler)
        return frame[picked]

    # Case 3: no metadata on the model; use whatever overlaps the reference.
    shared = [col for col in reference_columns if col in frame.columns]
    # With no overlap the frame is returned as-is and predict() is left to
    # report any shape mismatch.
    return frame[shared] if shared else frame
def _format_feature_value(feature: str, value) -> str:
    """Render a feature value as a display string with a unit suffix.

    The feature name is lower-cased and checked against an ordered table of
    substrings; the first substring found in the name decides the format
    (e.g. 'temp' -> '26.0°C', 'humidity' -> '75%', 'wind' -> '5.4 km/h').
    Missing values are shown as '--'. Names that match nothing fall back to
    plain numeric formatting: no decimals for integer-like values, two
    decimals otherwise. Anything that cannot be converted to float is
    returned via `str(value)`.
    """
    try:
        if pd.isna(value):
            return "--"
    except Exception:
        # e.g. array-like input with an ambiguous truth value; carry on.
        pass

    # Ordered (substring, template) pairs. Order matters: the first hit wins.
    templates = (
        ('temp', '{:.1f}°C'),
        ('feelslike', '{:.1f}°C'),
        ('humidity', '{:.0f}%'),
        ('cloud', '{:.0f}%'),
        ('cloudcover', '{:.0f}%'),
        ('precip', '{:.1f} mm'),
        ('rain', '{:.1f} mm'),
        ('wind', '{:.1f} km/h'),
        ('windspeed', '{:.1f} km/h'),
        ('speed', '{:.1f}'),
        ('day_of_year', '{:.0f}'),
        ('hour', '{:.2f}'),
        ('sin', '{:.2f}'),
        ('diff', '{:.2f}°C'),
        ('roll', '{:.2f}°C'),
        ('temp_lag', '{:.1f}°C'),
    )

    lowered = feature.lower()
    for needle, template in templates:
        if needle not in lowered:
            continue
        # Only the first matching entry is ever tried.
        try:
            return template.format(float(value))
        except Exception:
            return str(value)

    # No unit known for this feature: generic numeric formatting.
    try:
        number = float(value)
        is_whole = abs(number - int(number)) < 1e-6
        return f"{int(number)}" if is_whole else f"{number:.2f}"
    except Exception:
        return str(value)
# Project-local helpers live in the 'src' package. If they cannot be imported,
# show an error in the page and halt the script instead of raising.
try:
    from src import benchmark_utils, diagnostic_plots as diag
except ImportError:
    st.error("Error: Could not find 'src/benchmark_utils.py' or 'src/diagnostic_plots.py'. "
             "Please ensure they exist in the 'src/' directory.")
    st.stop()

# --- 2. PAGE CONFIGURATION ---
st.set_page_config(page_title="Saigon Temperature Forecast", page_icon="🌦️", layout="wide")
# --- START OF NEW THEME SECTION (UPDATED) ---
def load_css():
    """Inject the custom high-contrast 'weather' theme CSS into the page.

    Emits a single <style> element through st.markdown with
    unsafe_allow_html=True. Takes no arguments and returns None.
    """
    st.markdown("""
<style>
/* ===== FONT CHUNG ===== */
.stApp, .stSidebar {
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
}
/* ===== NỀN CHÍNH (MAIN BACKGROUND) ===== */
[data-testid="stAppViewContainer"] {
background-image: linear-gradient(to bottom, #B0E0E6, #F0F8FF);
background-attachment: fixed;
background-size: cover;
}
/* ===== 1. THANH CHỌN TAB (st.tabs) ===== */
/* Tab không được chọn */
button[data-baseweb="tab"][aria-selected="false"] {
background-color: rgba(255, 255, 255, 0.7) !important; /* Nền mờ */
color: #0E2A47 !important; /* Chữ đậm */
border-top-left-radius: 8px;
border-top-right-radius: 8px;
padding: 12px 16px !important; /* <<< THÊM PADDING */
}
/* Tab ĐANG ĐƯỢC CHỌN */
button[data-baseweb="tab"][aria-selected="true"] {
background-color: #FFFFFF !important; /* Nền TRẮNG ĐỤC */
color: #004080 !important; /* Chữ MÀU XANH ĐẬM */
font-weight: 700 !important;
border-top-left-radius: 8px;
border-top-right-radius: 8px;
border-bottom: 3px solid #004080 !important; /* Viền xanh đậm */
padding: 12px 16px !important; /* <<< THÊM PADDING */
}
/* ===== 2. THẺ DỰ BÁO (METRIC CARDS) ===== */
div[data-testid="stMetric"] {
background-color: rgba(255, 255, 255, 0.95) !important; /* Nền trắng (đục hơn) */
border: 1px solid #B0C4DE; /* Thêm viền (xanh nhạt) */
border-radius: 12px;
padding: 20px;
box-shadow: 0 4px 15px rgba(0, 0, 0, 0.1) !important; /* Đổ bóng đậm hơn */
backdrop-filter: blur(5px);
transition: transform 0.2s ease;
}
div[data-testid="stMetric"]:hover {
transform: translateY(-3px);
box-shadow: 0 6px 20px rgba(0, 0, 0, 0.15) !important;
}
/* Tiêu đề thẻ (Forecast for...) - đã có tương phản tốt */
div[data-testid="stMetricLabel"] p {
font-size: 1.1rem !important;
font-weight: 600 !important;
color: #333333; /* Xám đậm */
}
/* Giá trị nhiệt độ - đã có tương phản tốt */
div[data-testid="stMetricValue"] {
font-size: 2.8rem !important;
font-weight: 700 !important;
color: #004080; /* Xanh navy đậm */
}
/* Giá trị "Actual" (delta) - đã có tương phản tốt */
div[data-testid="stMetricDelta"] {
font-size: 1rem !important;
font-weight: 600 !important;
color: #555555; /* Xám vừa */
}
/* ===== 3. TIÊU ĐỀ (HEADINGS) ===== */
h1, h2, h3 {
color: #004080 !important; /* Dùng chung màu XANH ĐẬM NHẤT */
text-shadow: 1px 1px 4px rgba(0, 0, 0, 0.15) !important; /* Thêm đổ bóng ĐEN (thay vì trắng) */
}
/* ===== 4. BẢNG (DATAFRAME) ===== */
.stDataFrame {
background-color: #FFFFFF; /* Nền TRẮNG ĐỤC */
border: 1px solid #CCCCCC !important; /* Viền xám nhạt */
border-radius: 8px;
overflow: hidden;
}
/* Tiêu đề của bảng */
[data-testid="stDataGridHeader"] {
background-color: #F0F8FF; /* Nền header (Alice Blue) */
color: #004080; /* Chữ xanh đậm */
}
/* ===== 5. BIỂU ĐỒ (PLOTLY) ===== */
.plotly-graph-div {
background-color: #FFFFFF; /* Nền TRẮNG ĐỤC */
border: 1px solid #E0E0E0; /* Viền xám rất nhạt */
border-radius: 8px;
}
/* ===== 6. VĂN BẢN THÔNG THƯỜNG (PARAGRAPH & MARKDOWN) ===== */
/* Quy tắc này áp dụng cho văn bản st.markdown và các đoạn văn bản khác */
.stMarkdown, p, li {
color: #333333 !important; /* Xám đen, tương phản tốt trên nền sáng */
font-size: 1.05rem; /* Có thể thêm tùy chọn để chữ lớn hơn một chút */
}
/* SAFE DataFrame Styling */
[data-testid="stDataFrame"] {
border: 1px solid #CCCCCC !important;
border-radius: 8px !important;
background-color: #FFFFFF !important;
}
/* ===== EXPANDERS (vẫn giữ như cũ) ===== */
div[data-testid="stExpander"] {
background-color: rgba(255, 255, 255, 0.9) !important;
border-radius: 10px !important;
border: 1px solid rgba(0, 0, 0, 0.1) !important;
}
/* ===== NEW: GENERIC INFO CARD (for the 4 top boxes) ===== */
.info-card {
background-color: #FFFFFF;
border: 1px solid #E0E0E0;
border-radius: 12px;
padding: 20px 25px 25px 25px; /* top, right, bottom, left */
box-shadow: 0 4px 15px rgba(0, 0, 0, 0.08);
min-height: 260px; /* enforce consistent card height */
display: flex;
flex-direction: column;
justify-content: flex-start;
}
.info-card h4 { margin-bottom: 12px; color: #004080; }
.info-card ul { padding-left: 20px; }
.info-card li { margin-bottom: 8px; }
/* ===== NEW: LEADERBOARD CONTAINER & STYLING ===== */
.leaderboard-container { background-color: #FFFFFF; border: 1px solid #E0E0E0; border-radius: 12px; padding: 18px; box-shadow: 0 4px 15px rgba(0,0,0,0.08); min-height: 360px; }
.leaderboard-container .stDataFrame { border: none; }
.leaderboard-container [data-testid="stDataGridHeader"] { background-color: #F0F8FF; font-weight:700; color: #004080; }
.leaderboard-container [data-testid="stTable"] td,
.leaderboard-container [data-testid="stTable"] th {
border-right: 1px solid #EAEAEA;
border-bottom: 1px solid #EAEAEA;
padding: 10px 12px;
vertical-align: middle;
}
.leaderboard-container [data-testid="stTable"] th { border-left: 1px solid #EAEAEA; }
/* ===== Static leaderboard card (manual rows, scrollable) ===== */
.leaderboard-card {
background-color: #FFFFFF;
border: 1px solid #E0E0E0;
border-radius: 10px;
padding: 8px;
/* Allow the card to expand to fit all rows so the UI shows the full leaderboard */
max-height: none;
overflow: visible;
}
.static-row { display: flex; align-items: center; }
.static-cell { flex: 1 1 0; padding: 10px 12px; border-bottom: 1px solid #F2F6F9; color: #222; }
.static-cell.small { flex: 0 0 56px; text-align: center; }
.static-header { position: sticky; top: 0; background: #F0F8FF; z-index: 5; border-bottom: 2px solid #E6EEF6; }
.static-header .static-cell { font-weight: 700; color: #004080; }
/* Large animated emoji shown at top-right of each tab */
.tab-header { position: relative; }
.tab-emoji {
display: inline-block;
position: absolute; /* remove from normal flow so we can lift it */
top: -6.3rem; /* lift up above the header */
right: -2rem; /* move a bit to the right */
font-size: 10rem; /* keep large but adjustable */
line-height: 1;
will-change: transform;
transform-origin: center;
pointer-events: none; /* don't block clicks */
z-index: 9999; /* keep on top */
}
@keyframes sway {
0% { transform: translateX(0px); }
25% { transform: translateX(18px); }
50% { transform: translateX(0px); }
75% { transform: translateX(-18px); }
100% { transform: translateX(0px); }
}
/* Apply sway animation with a gentle ease-in-out loop */
.tab-emoji {
animation: sway 4.5s ease-in-out infinite;
}
/* Respect user preference for reduced motion */
@media (prefers-reduced-motion: reduce) {
.tab-emoji { animation: none !important; }
}
</style>
""", unsafe_allow_html=True)
# Apply the theme CSS immediately so it is in place before any other element is rendered
load_css()
# Load local stylesheet for card layout and small helpers
def local_css(file_name: str):
    """Read a CSS file (UTF-8) and inject its contents as a <style> element.

    A missing file is not fatal: a Streamlit warning is shown and the app
    continues without those styles. Returns None.
    """
    try:
        with open(file_name, 'r', encoding='utf-8') as handle:
            css_text = handle.read()
    except FileNotFoundError:
        st.warning(f"Local CSS file '{file_name}' not found. Skipping local styles.")
        return
    st.markdown(f'<style>{css_text}</style>', unsafe_allow_html=True)


# Load project-local CSS (card styles)
local_css("style.css")
# --- Helper: Time-of-day emoji for tab corner ---
def _time_of_day_emoji(now: datetime = None) -> str:
    """Map an hour of the day to a time-of-day emoji.

    Hour bands (start inclusive, end exclusive):
      04-06 dawn 🌅, 06-12 morning 🌞, 12-17 afternoon 🌤️,
      17-20 sunset/evening 🌇, everything else night 🌃.

    When `now` is omitted, the current time in the 'Asia/Ho_Chi_Minh'
    timezone is used. Only `now.hour` is read.
    """
    if now is None:
        now = datetime.now(pytz.timezone('Asia/Ho_Chi_Minh'))

    # (first hour, hour after last, emoji)
    bands = (
        (4, 6, "🌅"),
        (6, 12, "🌞"),
        (12, 17, "🌤️"),
        (17, 20, "🌇"),
    )
    hour = now.hour
    for start, stop, icon in bands:
        if start <= hour < stop:
            return icon
    # 20:00-03:59 falls outside every band above.
    return "🌃"
# --- END OF NEW THEME SECTION ---
# --- 3. DATA & MODEL LOADING FUNCTIONS (WITH CACHING) ---
# Checklist Items 1 & 2: Cache all heavy operations
@st.cache_data
def load_hourly_performance_data(file_path="data/hourly_120h_evaluation_results.csv"):
    """Read the per-horizon hourly evaluation CSV.

    If the file has no 'Horizon' column, one is added as `index + 1`.
    A missing file produces a Streamlit warning and an empty DataFrame.
    """
    try:
        perf = pd.read_csv(file_path)
    except FileNotFoundError:
        st.warning(f"Warning: Hourly Performance data not found at: {file_path}. Cannot show degradation plot.")
        return pd.DataFrame()

    # Derive a 1-based horizon from the row position when the CSV lacks one.
    if 'Horizon' not in perf.columns:
        perf['Horizon'] = perf.index + 1
    return perf
@st.cache_data
def load_hourly_data(file_path="data/final_hourly_feature_dataset.csv"):
    """Read the hourly feature CSV and index it by its 'datetime' column.

    The 'datetime' column is parsed with `pd.to_datetime`, set as the index,
    and the frame is sorted by that index. Every failure (missing file, read
    error, missing column, parse error) is reported with `st.error` and an
    empty DataFrame is returned.
    """
    date_col = 'datetime'

    try:
        hourly = pd.read_csv(file_path)
    except FileNotFoundError:
        st.error(f"ERROR: Hourly data file not found at: {file_path}. Please check the path and file name.")
        return pd.DataFrame()
    except Exception as e:
        st.error(f"An unexpected error occurred while reading hourly data: {e}")
        return pd.DataFrame()

    if date_col not in hourly.columns:
        st.error(f"Error: Date column '{date_col}' not found in hourly data CSV. Please check the column name.")
        return pd.DataFrame()

    try:
        hourly[date_col] = pd.to_datetime(hourly[date_col])
        return hourly.set_index(date_col).sort_index()
    except Exception as e:
        st.error(f"An unexpected error occurred while processing hourly data: {e}")
        return pd.DataFrame()
@st.cache_resource
def load_24_hourly_models():
    """Load the per-horizon hourly models (T+1h .. T+24h) used for the chart.

    Each horizon `h` is read from 'models/lgbm_model_target_temp_next_{h}h.pkl'.
    A file that is missing or fails to load is reported with `st.warning` and
    skipped, so the result is a dict {horizon: model} that may contain fewer
    than 24 entries.
    """
    expected_total = 24  # one model file is assumed per horizon
    loaded = {}

    for horizon in range(1, expected_total + 1):
        model_path = f"models/lgbm_model_target_temp_next_{horizon}h.pkl"
        # NOTE: joblib.load unpickles the file; only trusted model files belong here.
        try:
            loaded[horizon] = joblib.load(model_path)
        except FileNotFoundError:
            # Keep going; callers fall back for horizons without a model.
            st.warning(f"Hourly model file not found for horizon {horizon}: '{model_path}'. Skipping this horizon.")
        except Exception as e:
            st.warning(f"Error loading hourly model for horizon {horizon} ('{model_path}'): {e}")

    if len(loaded) < expected_total:
        st.info(f"Loaded {len(loaded)} / {expected_total} hourly models. Missing horizons will use fallback estimates.")
    return loaded
@st.cache_data
def load_precomputed_hourly_forecasts(file_path="data/final_hourly_120h_forecast_dataset.csv"):
    """Load a precomputed hourly forecast CSV indexed by its timestamp column.

    The first column whose lower-cased name is one of 'datetime',
    'forecast_start', 'time' or 'timestamp' is parsed as datetimes, set as
    the index, and the frame is sorted by it. No message is shown on failure:
    an unreadable file, no recognised timestamp column, or a parse error all
    yield an empty DataFrame.
    """
    try:
        table = pd.read_csv(file_path)
    except Exception:
        # Covers a missing file as well as any other read problem.
        return pd.DataFrame()

    recognised = ("datetime", "forecast_start", "time", "timestamp")
    candidates = [col for col in table.columns if col.lower() in recognised]
    if not candidates:
        # Unsupported layout: nothing that looks like a timestamp column.
        return pd.DataFrame()

    stamp_col = candidates[0]
    try:
        table[stamp_col] = pd.to_datetime(table[stamp_col])
        return table.set_index(stamp_col).sort_index()
    except Exception:
        return pd.DataFrame()
@st.cache_data
def load_feature_data(file_path="data/final_dataset_tree.csv"):
    """Read the daily feature/target CSV and index it by its 'datetime' column.

    Returns the frame sorted by the parsed datetime index. A missing file or
    a missing 'datetime' column is reported with `st.error` and yields an
    empty DataFrame; other read/parse errors propagate to the caller.
    """
    # --- CRITICAL CUSTOMIZATION ---
    # Name of the date column expected in the CSV.
    date_col = 'datetime'

    try:
        daily = pd.read_csv(file_path)
    except FileNotFoundError:
        st.error(f"ERROR: Main data file not found at: {file_path}")
        return pd.DataFrame()

    if date_col not in daily.columns:
        st.error(f"Error: Date column '{date_col}' not found in 'final_dataset_tree.csv'. "
                 f"Please update the DATE_COLUMN variable in 'app.py'.")
        return pd.DataFrame()

    daily[date_col] = pd.to_datetime(daily[date_col])
    return daily.set_index(date_col).sort_index()
@st.cache_resource
def load_champion_models():
    """Load the five day-specific champion models.

    Reads 'models/champion_stacking_day1.pkl' .. 'day5.pkl' in order and
    returns them as a list. If any file is missing, an `st.error` naming the
    file is shown and an empty list is returned; other load errors propagate.
    """
    try:
        # NOTE: joblib.load unpickles the file; only trusted model files belong here.
        return [joblib.load(f"models/champion_stacking_day{day}.pkl") for day in range(1, 6)]
    except FileNotFoundError as e:
        st.error(f"ERROR: Model file not found. Checked: {e.filename}. "
                 "Ensure the 5 .pkl files are in the 'models/' directory.")
        return []
@st.cache_data
def load_performance_data(file_path="data/final_5_day_results_df.csv"):
    """Read the pre-calculated performance CSV (used by Tab 3).

    A missing file is reported with `st.error` and yields an empty DataFrame.
    """
    try:
        return pd.read_csv(file_path)
    except FileNotFoundError:
        st.error(f"ERROR: Performance file not found at: {file_path}")
        return pd.DataFrame()
@st.cache_data
def load_leaderboard_data(small_path: str = "data/leaderboard_for_ui.csv"):
    """Load the small precomputed leaderboard CSV used by the UI.

    Only `small_path` is consulted; there is no fallback to a heavier
    leaderboard computation. If the file is absent or cannot be read, a
    non-fatal `st.warning` is shown and an empty DataFrame is returned so the
    page can render a placeholder message instead.
    """
    if not os.path.exists(small_path):
        st.warning(f"Small leaderboard UI file not found at '{small_path}'. Please commit 'data/leaderboard_for_ui.csv' to the repo for fast startup.")
        return pd.DataFrame()

    try:
        return pd.read_csv(small_path)
    except Exception as e:
        st.warning(f"Could not read small leaderboard UI file '{small_path}': {e}")
        return pd.DataFrame()
# --- 4. INITIALIZE DATA & SPLIT TEST SET ---
# Load the daily feature table now; the champion models are lazy-loaded later
# (inside Tab 2) to reduce startup I/O.
all_data_df = load_feature_data()
# Placeholder until Tab 2 is opened and the champion models are loaded
models = None
perf_df = load_performance_data()

# --- CRITICAL CUSTOMIZATION ---
TARGET_COLS = ['temp_next_1_day', 'temp_next_2_day', 'temp_next_3_day', 'temp_next_4_day', 'temp_next_5_day']
CURRENT_TEMP_COL = 'temp'

# Split test set (based on checklist dates)
TEST_START_DATE = "2024-02-18"
TEST_END_DATE = "2025-09-26"

# Start from empty frames so later code can test `.empty` even if the split fails
X_test, y_test, test_df = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
if not all_data_df.empty:
    try:
        test_df = all_data_df.loc[TEST_START_DATE:TEST_END_DATE].copy()
        # Assumption: 157 features are ALL columns that are NOT targets
        feature_cols = [col for col in all_data_df.columns if col not in TARGET_COLS]
        # Split X_test (features) and y_test (actuals); both are derived from test_df
        X_test = test_df[feature_cols]
        y_test = test_df[TARGET_COLS]
        # Rename y_test columns for clarity (used in Tab 3)
        y_test.columns = [f'Day {i}' for i in range(1, 6)]
    except KeyError:
        st.error(f"Error: Target columns (e.g., '{TARGET_COLS[0]}') or "
                 f"'{CURRENT_TEMP_COL}' column not found in CSV. Please update 'app.py'.")
    except Exception as e:
        st.error(f"Error processing test set: {e}")
else:
    st.error("Could not load main data, application cannot continue.")
    st.stop()

# --- CRITICAL CUSTOMIZATION (Hourly Targets) ---
HOURLY_TARGET_COLS = ['target_temp_next_24h', 'target_temp_next_48h', 'target_temp_next_72h',
                      'target_temp_next_96h', 'target_temp_next_120h']

# Load the hourly data, its performance table and the hourly models
hourly_data_df = load_hourly_data(file_path="data/final_hourly_feature_dataset.csv")  # exact features file name
hourly_perf_df = load_hourly_performance_data(file_path="data/hourly_120h_evaluation_results.csv")  # performance file
hourly_models_24h = load_24_hourly_models()  # the 24 LGBM models

# Build the hourly input features.
# HOURLY_FEATURE_COLS is bound up front so it exists (as an empty list) even
# when the hourly dataset could not be loaded; previously it was left
# undefined in that case.
HOURLY_FEATURE_COLS = []
if not hourly_data_df.empty:
    HOURLY_FEATURE_COLS = [col for col in hourly_data_df.columns if col not in HOURLY_TARGET_COLS]
    # Take the test-set window
    X_test_hourly = hourly_data_df.loc[TEST_START_DATE:TEST_END_DATE][HOURLY_FEATURE_COLS].copy()
    # Fix 1 (model prediction dtypes): drop the object-typed columns (sunrise/sunset)
    columns_to_drop_objects = ['sunrise', 'sunset']
    X_test_hourly = X_test_hourly.drop(columns=columns_to_drop_objects, errors='ignore')
    HOURLY_FEATURE_COLS = X_test_hourly.columns.tolist()  # refresh the feature list after the drop
else:
    X_test_hourly = pd.DataFrame()
# --- Real 24h prediction function (inner logic kept unchanged) ---
def predict_next_24_hours(input_features: pd.DataFrame, models: dict) -> List[float]:
    """Produce 24 hourly temperature estimates (T+1h .. T+24h).

    `models` maps horizon -> model (dict keyed 1..24); a list is also
    accepted, in which case position h-1 holds the model for horizon h.

    * If `input_features` is empty or no models are supplied, a synthetic
      curve is returned: a sine wave around the last 'temp' value (28.0 when
      unavailable) plus Gaussian noise drawn after `np.random.seed(42)`.
      Note this reseeds NumPy's global random state.
    * Otherwise each horizon uses the first value returned by its model's
      `predict(input_features)`. A horizon with no model, or whose model
      raises, falls back to the noise-free sine estimate.
    """
    horizon_count = 24  # T+1h ... T+24h

    def _sine_estimate(base, h):
        # Noise-free synthetic estimate shared by every fallback path.
        return base + 1.5 * np.sin(2 * np.pi * (h + 10) / 24)

    has_rows = not input_features.empty
    model_total = len(models) if models else 0

    # Nothing to predict with: return a fully synthetic 24-point series.
    if not has_rows or model_total == 0:
        if has_rows and 'temp' in input_features.columns:
            anchor = input_features['temp'].iloc[-1]
        else:
            anchor = 28.0
        np.random.seed(42)
        return [_sine_estimate(anchor, h) + np.random.normal(0, 0.5)
                for h in range(horizon_count)]

    anchor = input_features['temp'].iloc[-1] if 'temp' in input_features.columns else 28.0

    forecast = []
    for h in range(1, horizon_count + 1):
        # Locate the model for this horizon (dict key h, or list slot h-1).
        if isinstance(models, dict):
            estimator = models.get(h)
        elif isinstance(models, list) and len(models) >= h:
            estimator = models[h - 1]
        else:
            estimator = None

        value = None
        if estimator is not None:
            try:
                raw = estimator.predict(input_features)
                # Take the first element when the result is indexable.
                value = float(raw[0]) if hasattr(raw, '__getitem__') else float(raw)
            except Exception:
                value = None  # any model error -> synthetic fallback below

        if value is None:
            value = float(_sine_estimate(anchor, h))
        forecast.append(value)
    return forecast
# --- OpenWeather API helpers (Live & Future Forecast) ---
@st.cache_data(ttl=600)
def get_live_weather(api_key: str):
    """Return current-weather JSON from OpenWeatherMap for Ho Chi Minh City.

    Args:
        api_key: OpenWeatherMap API key, sent as the `appid` query parameter.

    Returns:
        The decoded JSON response, or an empty dict if the request fails for
        any reason (a Streamlit warning is shown in that case).

    NOTE(review): the {} returned on failure appears to be cached for the
    TTL as well — confirm whether that is intended.
    """
    LAT, LON = 10.7769, 106.7009
    URL = f"https://api.openweathermap.org/data/2.5/weather?lat={LAT}&lon={LON}&appid={api_key}&units=metric"
    try:
        resp = requests.get(URL, timeout=10)
        resp.raise_for_status()
        return resp.json()
    except Exception as e:
        # requests error messages include the request URL, which carries the
        # API key in its query string. Redact it before showing the message.
        detail = str(e)
        if api_key:
            detail = detail.replace(str(api_key), "***")
        st.warning(f"Could not fetch live weather: {detail}")
        return {}
@st.cache_data(ttl=3600)
def get_5_day_forecast(api_key: str):
    """Return 5-day / 3-hour forecast JSON from OpenWeatherMap for Ho Chi Minh City.

    Args:
        api_key: OpenWeatherMap API key, sent as the `appid` query parameter.

    Returns:
        The decoded JSON response, or an empty dict if the request fails for
        any reason (a Streamlit warning is shown in that case).

    NOTE(review): the {} returned on failure appears to be cached for the
    TTL as well — confirm whether that is intended.
    """
    LAT, LON = 10.7769, 106.7009
    URL = f"https://api.openweathermap.org/data/2.5/forecast?lat={LAT}&lon={LON}&appid={api_key}&units=metric"
    try:
        resp = requests.get(URL, timeout=10)
        resp.raise_for_status()
        return resp.json()
    except Exception as e:
        # requests error messages include the request URL, which carries the
        # API key in its query string. Redact it before showing the message.
        detail = str(e)
        if api_key:
            detail = detail.replace(str(api_key), "***")
        st.warning(f"Could not fetch 5-day forecast: {detail}")
        return {}
# --- 5. SIDEBAR UI (REMOVED) ---
# The entire sidebar section has been removed
# --- 6. MAIN UI (MAIN PANEL) ---
# Preload leaderboard early to ensure it's available before rendering tabs (prevents reflow)
leaderboard_df = load_leaderboard_data()
# Create horizontal tabs instead of a radio button (with emoji labels and a Live OpenWeather tab)
tab1, tab2, tab3, tab4, tab5 = st.tabs([
    "📄 Project Overview",
    "🔮 Live 5-Day Forecast",
    "📈 Model Performance",
    "🕒 Hourly Prediction",
    "📡 Live & Future Forecast"
])
# --- TAB 1: Project Overview ---
with tab1:
    # Header row: page title on the left, time-of-day emoji in the corner.
    emoji = _time_of_day_emoji()
    st.markdown(f"""
<div class="tab-header" style="display:flex;justify-content:space-between;align-items:center;position:relative;">
<h1 style="margin:0">Saigon Temperature Forecasting Application 🌦️</h1>
<div class="tab-emoji">{emoji}</div>
</div>
""", unsafe_allow_html=True)

    # Prominent banner: use a fixed-height HTML container to reserve layout space.
    # Prefer a local banner file (no remote load) if available; otherwise fall back to the Bamboo page URL.
    local_candidates = [
        "assets/banner.jpg",
        "static/banner.jpg",
        "data/banner.jpg",
    ]
    banner_url = "https://bambooairways.com/jp/en/explore/destinations/southeast-asia/vietnam/ho-chi-minh"
    for p in local_candidates:
        if os.path.exists(p):
            try:
                with open(p, 'rb') as _f:
                    _b = _f.read()
                _b64 = base64.b64encode(_b).decode('ascii')
                # The registered media type for .jpg files is image/jpeg.
                banner_url = f"data:image/jpeg;base64,{_b64}"
            except OSError:
                # If the file cannot be read, keep the fallback URL.
                pass
            # Only the first existing candidate is considered.
            break

    # Render banner. Use an inline <img> only when we have a direct image (data URI or image URL).
    is_image_src = (
        banner_url.startswith('data:image')
        or banner_url.lower().endswith(('.jpg', '.jpeg', '.png', '.webp'))
    )
    if is_image_src:
        # Use <img> with fixed height + object-fit to avoid layout jumps and ensure image displays.
        banner_html = f"""
<div style="width:100%;border-radius:10px;overflow:hidden;box-shadow:0 6px 18px rgba(3,37,76,0.06);">
<img src="{banner_url}" style="width:100%;height:360px;object-fit:cover;display:block;" alt="Ho Chi Minh City" />
</div>
<div style="width:100%;text-align:center;margin-top:8px;color:#666;font-size:0.95rem;font-weight:600;">Ho Chi Minh City — Credit: Bamboo Airways</div>
"""
    else:
        # Fallback: render a fixed-height colored block (useful when banner_url is a page URL)
        banner_html = """
<div style="min-height:360px;width:100%;border-radius:10px;overflow:hidden;position:relative;background-color:#cfeef3;box-shadow:0 6px 18px rgba(3,37,76,0.06);">
</div>
<div style="width:100%;text-align:center;margin-top:8px;color:#666;font-size:0.95rem;font-weight:600;">Ho Chi Minh City — Credit: Bamboo Airways</div>
"""
    st.markdown(banner_html, unsafe_allow_html=True)

    # --- WRAP ROW 1 ---
    st.markdown('<div class="card-row">', unsafe_allow_html=True)
    row1_col1, row1_col2 = st.columns(2)
    with row1_col1:
        st.markdown(
            '''
<div class="info-card">
<h4>📝 Project Summary</h4>
<p>The goal of this project is to execute a full end-to-end machine learning workflow to forecast the <strong>average daily temperature</strong> for Ho Chi Minh City over the <strong>next 5 days</strong>.</p>
<ul>
<li><strong>Data Source:</strong> 10 years of historical daily and hourly weather data from <strong>Visual Crossing</strong>.</li>
<li><strong>Forecasting Method:</strong> We employ a <strong>Direct Forecasting Strategy</strong>, using 5 independent 'specialist' models, where each model is individually optimized to predict a specific day's temperature (T+1 to T+5).</li>
<li><strong>Final Champion Model:</strong> The best-performing architecture is a <strong>Stacking (Optuna) Ensemble</strong>, which synergistically combines multiple machine learning models to achieve the highest accuracy.</li>
</ul>
</div>
''',
            unsafe_allow_html=True
        )
    with row1_col2:
        st.markdown(
            '''
<div class="info-card">
<h4>🧬 Our 'Two-Stream' Data Strategy</h4>
<p>To optimize performance for different model families, we applied a "Two-Stream" data preparation strategy:</p>
<ul>
<li><strong>Stream 1 (For Linear Models):</strong> Features were pruned using an <strong>iterative Variance Inflation Factor (VIF)</strong> process to create a stable, non-collinear feature set.</li>
<li><strong>Stream 2 (For Tree-based Models):</strong> Advanced models (LightGBM, CatBoost) were trained on a <strong>comprehensive set of ~156 features</strong>, leveraging their robustness to multicollinearity.</li>
</ul>
<p>Our final Champion Model is a <strong>Stacking Ensemble</strong> that strategically uses components from both streams.</p>
</div>
''',
            unsafe_allow_html=True
        )
    st.markdown('</div>', unsafe_allow_html=True)  # Close the row div

    # --- WRAP ROW 2 ---
    st.markdown('<div class="card-row">', unsafe_allow_html=True)
    # ROW 2: Key Analytical Findings & Final MLOps Strategy
    row2_col1, row2_col2 = st.columns(2)
    with row2_col1:
        st.markdown(
            '''
<div class="info-card">
<h4>🔬 Key Analytical Findings</h4>
<ul>
<li><strong>Non-Stationary Climate:</strong> Observed a long-term warming trend and extreme events (e.g., 2024 El Niño-driven heatwave) supporting adaptive modeling.</li>
<li><strong>Signal vs. Noise:</strong> A clean daily dataset outperformed noisier hourly-derived aggregates for daily average forecasting.</li>
<li><strong>Meteorological Logic:</strong> Model feature importance aligns with domain knowledge (thermal inertia, heat index, wind vectors).</li>
</ul>
</div>
''',
            unsafe_allow_html=True
        )
    with row2_col2:
        st.markdown(
            '''
<div class="info-card" style="min-height: 265px;">
<h4>🚀 Final MLOps Strategy</h4>
<ul>
<li><strong>Retraining Trigger:</strong> Performance-based safety net (live RMSE), drift detection (PSI/KS-Test), and a fixed 90-day refresh cadence.</li>
<li><strong>Deployment Efficiency (ONNX):</strong> Converted scoring components to ONNX for faster, lighter inference and reduced latency.</li>
</ul>
</div>
''',
            unsafe_allow_html=True
        )
    st.markdown('</div>', unsafe_allow_html=True)  # Close the row div

    # ROW 3: Full-width Leaderboard Card
    # We'll inject the full container + content in one markdown call so the
    # rendered HTML is nested correctly (Streamlit renders separate markdown
    # calls as siblings, which previously produced an empty container above).
    title_html = '<h3>🏆 Final 5-Day Daily Model Leaderboard</h3>'
    if not leaderboard_df.empty:
        # Sort by RMSE when that column is present; otherwise keep the CSV's
        # own row order instead of failing on a missing column.
        sort_col = 'RMSE (Absolute Error)'
        if sort_col in leaderboard_df.columns:
            display_df = leaderboard_df.sort_values(by=sort_col).reset_index(drop=True)
        else:
            display_df = leaderboard_df.reset_index(drop=True)
        # 1-based index doubles as the rank shown in the first column.
        display_df.index = range(1, len(display_df) + 1)

        # Estimate placeholder height based on number of rows to avoid vertical growth
        # Estimate: header (56px) + rows * 48px + padding (40px)
        # Multiply by 1.2 to add headroom for hosted environments where fonts/spacing vary.
        raw_height = 56 + len(display_df) * 48 + 40
        estimated_height = int(max(360, raw_height * 1.2))
        placeholder = st.empty()
        placeholder.markdown(f'<div style="min-height:{estimated_height}px;"></div>', unsafe_allow_html=True)

        # Build static HTML rows to mimic a table but reduce Streamlit's DataFrame overhead
        cols = list(display_df.columns)

        # Header row (rank column first, then one cell per CSV column)
        header_cells = ''.join(
            ['<div class="static-row static-header">', '<div class="static-cell small">#</div>']
            + [f'<div class="static-cell">{c}</div>' for c in cols]
            + ['</div>']
        )

        # Body rows, collected as fragments and joined once at the end
        row_parts = []
        for idx, row in display_df.iterrows():
            row_parts.append('<div class="static-row">')
            row_parts.append(f'<div class="static-cell small">{idx}</div>')
            for c in cols:
                val = row[c]
                # Simple formatting: '--' for missing, 3 decimals for floats
                # below 1000, no decimals for larger floats, str() otherwise.
                try:
                    if pd.isna(val):
                        cell = '--'
                    elif isinstance(val, (float, np.floating)):
                        cell = f'{val:.3f}' if abs(val) < 1000 else f'{val:.0f}'
                    else:
                        cell = str(val)
                except (TypeError, ValueError):
                    cell = str(val)
                row_parts.append(f'<div class="static-cell">{cell}</div>')
            row_parts.append('</div>')
        body_rows = ''.join(row_parts)

        static_html = f'<div class="leaderboard-card">{header_cells}{body_rows}</div>'
        full_html = f'<div class="leaderboard-container">{title_html}{static_html}</div>'
        # Inject the full container+content in one go so nesting is preserved
        placeholder.markdown(full_html, unsafe_allow_html=True)
    else:
        placeholder = st.empty()
        placeholder.markdown('<div style="min-height:360px;"></div>', unsafe_allow_html=True)
        empty_html = f'<div class="leaderboard-container">{title_html}<div style="padding:18px;color:#666;">Could not load leaderboard data.</div></div>'
        placeholder.markdown(empty_html, unsafe_allow_html=True)
# --------------------------------------------------------------------
# --- TAB 2: Live Forecast ---
with tab2:
    # ------------------------------------------------------------------
    # Tab 2: pick a date from the test set, run the five champion models on
    # that day's feature row, and show the forecasts next to the actual values,
    # a few rule-based insights, the input features and context charts.
    # ------------------------------------------------------------------
    # Checklist item 4: tab header with the time-of-day emoji.
    emoji = _time_of_day_emoji()
    st.markdown(f"""
<div class="tab-header" style="display:flex;justify-content:space-between;align-items:center;position:relative;">
<h1 style="margin:0">🔮 Live 5-Day Forecast</h1>
<div class="tab-emoji">{emoji}</div>
</div>
""", unsafe_allow_html=True)

    # Lazy-load champion models only when this tab runs, to avoid startup lag.
    # On failure `models` stays None and the forecast section below is skipped.
    if models is None:
        try:
            with st.spinner("Loading champion models..."):
                models = load_champion_models()
        except Exception as e:
            st.warning(f"Could not load champion models immediately: {e}")

    # The date picker lives in the tab body (it was moved here from the sidebar).
    st.subheader("Forecast Input")
    selected_date = None
    if not X_test.empty:
        min_date = X_test.index.min()
        max_date = X_test.index.max()
        selected_date = st.date_input(
            "Select a date from the test set:",
            value=min_date,
            min_value=min_date,
            max_value=max_date,
            format="YYYY-MM-DD"
        )
    else:
        st.error("Test data could not be loaded.")
    st.divider()

    if selected_date and not X_test.empty and models:
        st.header(f"📍 5-Day Forecast from: {selected_date.strftime('%Y-%m-%d')}")

        # 1. Input features: the single X_test row for the selected day.
        selected_date_ts = pd.Timestamp(selected_date)
        if selected_date_ts in X_test.index:
            input_features = X_test.loc[[selected_date_ts]]
        else:
            # Empty frame keeps the name defined for the check below.
            input_features = pd.DataFrame()

        if input_features.empty:
            # Reported once (the original emitted two near-identical errors).
            st.error("Data not found for the selected date in X_test.")
        else:
            # 2. One prediction per horizon; models[i] is used for forecast day i+1.
            predictions = [models[i].predict(input_features)[0] for i in range(5)]

            # 3. Show the predictions as metrics, with the actual value as the delta text.
            forecast_dates = pd.date_range(start=selected_date, periods=6, freq='D')[1:]
            cols = st.columns(5)

            # Actual values for comparison, read from the TARGET_COLS of the selected day.
            if selected_date_ts in all_data_df.index:
                actual_row = all_data_df.loc[selected_date_ts]
                actual_values = [actual_row[col_name] for col_name in TARGET_COLS]
            else:
                actual_values = [float('nan')] * 5
            # True when at least one actual value is missing.
            is_partial_forecast = any(pd.isna(v) for v in actual_values)

            for i, day_col in enumerate(cols):
                with day_col:
                    actual_val = actual_values[i]
                    delta_text = f"Actual: {actual_val:.1f}°C" if pd.notna(actual_val) else "Actual: --"
                    st.metric(
                        label=f"Forecast for {forecast_dates[i].strftime('%b %d')}",
                        value=f"{predictions[i]:.1f}°C",
                        delta=delta_text,
                        delta_color="off"
                    )

            # --- "Why" insights: simple threshold rules on two input features ---
            st.subheader("Forecast Insights (Why?)")
            # A missing column is treated as NaN: every comparison below is then
            # False and no insight fires, instead of the tab crashing with KeyError.
            if 'temp_lag_1' in input_features.columns:
                temp_lag_1 = input_features['temp_lag_1'].iloc[0]
            else:
                temp_lag_1 = float('nan')
            if 'precip' in input_features.columns:
                precip_today = input_features['precip'].iloc[0]
            else:
                precip_today = float('nan')

            if temp_lag_1 > 30:  # 30°C is assumed to mean "very hot"
                st.info(f"💡 Insight: Yesterday was very hot ({temp_lag_1:.1f}°C). The model is using this strong 'persistence' signal for tomorrow's forecast.")
            elif temp_lag_1 < 25:  # 25°C is assumed to mean "cool"
                st.info(f"💡 Insight: Yesterday was cool ({temp_lag_1:.1f}°C). This is likely pulling the initial forecast down.")

            if precip_today > 10:  # 10 mm is assumed to mean "rainy day"
                st.info(f"💡 Insight: The selected day had {precip_today:.1f}mm of rain. This humidity and cloud cover is factored into the forecast.")
            elif 25 <= temp_lag_1 <= 30:
                # Neither the temperature nor the rain rule fired.
                # (A dead "'temp_lag_1' not in locals()" test was dropped: the name is always bound.)
                st.info("💡 Insight: Weather conditions appear stable. The forecast is primarily driven by seasonal trends and recent temperature history.")

            # --- Feature Inspector: the raw inputs behind the forecast ---
            st.markdown("---")
            with st.expander("🔍 Feature Inspector: What the Model Saw on this Day"):
                col1, col2, col3 = st.columns(3)
                # (container, heading, [(label, feature column, format spec, unit suffix), ...])
                inspector_layout = [
                    (col1, "Core Conditions", [
                        ("🌡️ Today's Avg Temp (temp)", 'temp', '.1f', '°C'),
                        ("🌡️ Today's 'Feels Like' (feelslike)", 'feelslike', '.1f', '°C'),
                        ("💧 Humidity", 'humidity', '.1f', '%'),
                        ("☁️ Cloud Cover", 'cloudcover', '.1f', '%'),
                        ("🌧️ Precipitation", 'precip', '.1f', ' mm'),
                    ]),
                    (col2, "Recent History", [
                        ("🌡️ Temp Yesterday (temp_lag_1)", 'temp_lag_1', '.1f', '°C'),
                        ("📈 7-Day Avg Temp (temp_roll_7d_mean)", 'temp_roll_7d_mean', '.1f', '°C'),
                        ("🌧️ 7-Day Total Rainfall (precip_roll_7d_sum)", 'precip_roll_7d_sum', '.1f', ' mm'),
                        ("📊 14-Day Temp Volatility (temp_roll_14d_std)", 'temp_roll_14d_std', '.2f', '°C'),
                    ]),
                    (col3, "Seasonal Context", [
                        ("📅 Day of Year", 'day_of_year', '', ''),
                        ("🔻 Sea Level Pressure", 'sealevelpressure', '.1f', ' hPa'),
                        ("💨 Wind Speed", 'windspeed', '.1f', ' km/h'),
                        ("🧭 Wind Direction", 'winddir', '.0f', '°'),
                    ]),
                ]
                for container, heading, metric_specs in inspector_layout:
                    with container:
                        st.subheader(heading)
                        for metric_label, feature_name, spec, unit in metric_specs:
                            if feature_name in input_features.columns:
                                raw_feature = input_features[feature_name].iloc[0]
                                try:
                                    shown = f"{raw_feature:{spec}}{unit}"
                                except (TypeError, ValueError):
                                    # Value does not accept the numeric format spec.
                                    shown = str(raw_feature)
                            else:
                                # Column absent from X_test (e.g. 'precip_roll_7d_sum'):
                                # show a placeholder instead of raising KeyError.
                                shown = "--"
                            st.metric(label=metric_label, value=shown)

            # --- Training-data overview chart ---
            st.subheader("Training Set Overview")
            with st.expander("Show plot of all training data (before 2024-02-18)"):
                # Training data is everything up to the day before TEST_START_DATE.
                train_end_date = pd.Timestamp(TEST_START_DATE) - pd.Timedelta(days=1)
                train_df = all_data_df.loc[:train_end_date][CURRENT_TEMP_COL]
                fig_train = go.Figure()
                fig_train.add_trace(go.Scatter(
                    x=train_df.index, y=train_df,
                    mode='lines', name='Training Data (Actual)',
                    line=dict(color='#005aa7', width=1)
                ))
                fig_train.update_layout(
                    title="Actual Temperature - Full Training Set",
                    xaxis_title="Date", yaxis_title="Temperature (°C)",
                    template="plotly_white",
                    xaxis_rangeslider_visible=True,  # range slider on the x axis
                    yaxis_fixedrange=True  # lock the y axis against zooming
                )
                st.plotly_chart(fig_train, use_container_width=True)

            # 4. Context chart: the 14 days up to the selected date plus the forecast.
            st.subheader("Historical Context & Forecast")
            history_start = selected_date_ts - pd.Timedelta(days=14)
            history_end = selected_date_ts
            history_df = all_data_df.loc[history_start:history_end][CURRENT_TEMP_COL]
            forecast_df = pd.DataFrame({
                'Date': forecast_dates,
                'Forecast': predictions
            }).set_index('Date')
            fig = go.Figure()
            fig.add_trace(go.Scatter(
                x=history_df.index, y=history_df,
                mode='lines+markers', name='Past 14 Days (Actual)',
                line=dict(color='blue')
            ))
            fig.add_trace(go.Scatter(
                x=forecast_df.index, y=forecast_df['Forecast'],
                mode='lines+markers', name='5-Day Forecast',
                line=dict(color='red', dash='dot')
            ))
            fig.update_layout(
                title="Forecast vs. Historical Context",
                xaxis_title="Date", yaxis_title="Temperature (°C)",
                template="plotly_white", legend=dict(x=0.01, y=0.99)
            )
            st.plotly_chart(fig, use_container_width=True)

            # --- Forecast vs. actual comparison chart ---
            st.subheader("5-Day Forecast vs. Actual Comparison")
            fig_comp = go.Figure()
            # The forecast line is always drawn.
            fig_comp.add_trace(go.Scatter(
                x=forecast_dates, y=predictions,
                mode='lines+markers', name='5-Day Forecast',
                line=dict(color='red', dash='dot')
            ))
            # The actual line is drawn only when none of the actual values is missing.
            if not is_partial_forecast:
                fig_comp.add_trace(go.Scatter(
                    x=forecast_dates, y=actual_values,
                    mode='lines+markers', name='5-Day Actual',
                    line=dict(color='blue')
                ))
                fig_comp.update_layout(title="5-Day Forecast vs. Actual Values")
            else:
                fig_comp.update_layout(title="5-Day Forecast (Actual data not yet available)")
            fig_comp.update_layout(
                xaxis_title="Date", yaxis_title="Temperature (°C)",
                template="plotly_white", legend=dict(x=0.01, y=0.99)
            )
            st.plotly_chart(fig_comp, use_container_width=True)
    else:
        # No date (test data missing) or no models available.
        if not selected_date:
            st.warning("Test data could not be loaded.")
        else:
            st.warning("Please wait... Loading data or models.")
# --------------------------------------------------------------------
# --- TAB 3: Model Performance ---
with tab3:
    # ------------------------------------------------------------------
    # Tab 3: champion-model performance per horizon, an interactive
    # forecast-vs-actual chart and Day 1 residual diagnostics.
    # ------------------------------------------------------------------
    # Checklist item 5: tab header with the time-of-day emoji.
    emoji = _time_of_day_emoji()
    st.markdown(f"""
<div class="tab-header" style="display:flex;justify-content:space-between;align-items:center;position:relative;">
<h1 style="margin:0">📈 Model Performance & Diagnostics</h1>
<div class="tab-emoji">{emoji}</div>
</div>
""", unsafe_allow_html=True)

    if not perf_df.empty and not y_test.empty:
        st.subheader("Performance Degradation over 5 Days")
        st.markdown("How model performance changes as the forecast horizon increases.")
        MODEL_NAME = 'Champion (Stacking)'
        champion_perf_df = perf_df[perf_df['Model'] == MODEL_NAME].copy()

        # 1. Performance degradation charts (RMSE and R²), side by side.
        RMSE_COL_NAME = 'RMSE (Absolute Error)'
        R2_COL_NAME = 'R-squared'
        col1, col2 = st.columns(2)
        with col1:
            fig_rmse = diag.plot_performance_degradation(
                champion_perf_df,
                metric_column=RMSE_COL_NAME,
                metric_name='RMSE (Temperature °C)',
                color='blue'
            )
            st.plotly_chart(fig_rmse, use_container_width=True)
        with col2:
            fig_r2 = diag.plot_performance_degradation(
                champion_perf_df,
                metric_column=R2_COL_NAME,
                metric_name='R-squared (R²)',
                color='green'
            )
            st.plotly_chart(fig_r2, use_container_width=True)

        # --- Interactive chart driven by a horizon slider ---
        st.subheader("Interactive Forecast vs. Actual Comparison")
        # Predictions need both the champion models and the test features. Without
        # this guard a failed model load (models is None) crashed the whole tab
        # when the models were indexed.
        if models and not X_test.empty:
            selected_horizon = st.slider(
                "Select Forecast Horizon (Day) to inspect:",
                1, 5, 1
            )

            @st.cache_data
            def get_full_test_predictions(_models, _X_test):
                """Run predictions on the entire test set and cache the results.

                Returns a DataFrame indexed like `_X_test` with one column per
                horizon ('Day 1' .. 'Day 5'); `_models[i]` fills 'Day {i+1}'.
                The leading underscores tell Streamlit not to hash these
                arguments when building the cache key.
                """
                all_preds = {
                    f'Day {i+1}': _models[i].predict(_X_test) for i in range(5)
                }
                return pd.DataFrame(all_preds, index=_X_test.index)

            with st.spinner("Running predictions on entire test set... (This is cached for next time)"):
                y_pred_test = get_full_test_predictions(models, X_test)

            # Pick the true/predicted series for the horizon chosen on the slider.
            y_true_selected = y_test[f'Day {selected_horizon}']
            y_pred_selected = y_pred_test[f'Day {selected_horizon}']

            # A single chart for the selected horizon.
            fig_interactive = diag.plot_forecast_vs_actual(
                y_true=y_true_selected,
                y_pred=y_pred_selected,
                day_ahead_title=f"Day {selected_horizon} Forecast"
            )
            st.plotly_chart(fig_interactive, use_container_width=True)

            # Optional deep dive: residual diagnostics for the Day 1 forecast.
            with st.expander("Champion Model Diagnostics (Deep Dive)"):
                st.markdown("Detailed analysis of residuals (error = actual - predicted) for the Day 1 forecast.")
                y_true_d1 = y_test['Day 1']
                y_pred_d1 = y_pred_test['Day 1']
                dates_d1 = y_test.index
                fig_res_time = diag.plot_residuals_vs_time(
                    y_true_d1, y_pred_d1, dates_d1, "Day 1"
                )
                st.plotly_chart(fig_res_time, use_container_width=True)
                fig_res_dist = diag.plot_residuals_distribution(
                    y_true_d1, y_pred_d1, "Day 1"
                )
                st.plotly_chart(fig_res_dist, use_container_width=True)
                st.markdown("A good model will have residuals (errors) normally distributed (bell curve) "
                            "around 0 and show no pattern over time.")
        else:
            st.warning("Champion models or test features are unavailable, so forecast-vs-actual diagnostics cannot be shown.")
    else:
        st.warning("Loading performance data...")
# --- TAB 4: Hourly Prediction ---
with tab4:
    # ------------------------------------------------------------------
    # Tab 4: pick a start timestamp from the hourly test set, forecast the next
    # 24 hours, and compare with the actual hourly temperatures.
    # ------------------------------------------------------------------
    emoji = _time_of_day_emoji()
    st.markdown(f"""
<div class="tab-header" style="display:flex;justify-content:space-between;align-items:center;position:relative;">
<h1 style="margin:0">🕒 Hourly Prediction (Next 24 Hours)</h1>
<div class="tab-emoji">{emoji}</div>
</div>
""", unsafe_allow_html=True)
    st.subheader("Forecast Start Time")

    if not X_test_hourly.empty:
        min_ts = X_test_hourly.index.min()
        max_ts = X_test_hourly.index.max()

        # 1. Date selection (defaults to the last date in the hourly test set).
        selected_date = st.date_input(
            "Select the date:",
            value=max_ts.date(),
            min_value=min_ts.date(),
            max_value=max_ts.date(),
            format="YYYY-MM-DD",
            key="hourly_date_input"  # unique key: Tab 2 also has a date_input
        )

        # 2. Hour selection: only hours that exist in the index for that date.
        available_hours_in_day = (
            X_test_hourly[X_test_hourly.index.date == selected_date]
            .index.hour.unique().sort_values()
        )

        if available_hours_in_day.empty:
            # Only warn here. The original called st.stop(), which aborts the
            # whole script run and therefore also left the following tab empty.
            st.warning(f"No hourly data found for {selected_date}. Please select a different date.")
        else:
            # Default to the latest known hour; the options are sorted ascending,
            # so that is the last entry.
            hour_options = available_hours_in_day.tolist()
            selected_hour = st.selectbox(
                "Select the latest known hour:",
                options=hour_options,
                index=len(hour_options) - 1,
                format_func=lambda x: f"{x:02d}:00:00"
            )

            # Combine date and hour into a single timestamp and fetch its feature row.
            latest_time_for_day = pd.to_datetime(f"{selected_date} {selected_hour:02d}:00:00")
            input_features_hourly = X_test_hourly.loc[[latest_time_for_day]]
            st.info(f"The model runs based on data up to the latest known hour: **{latest_time_for_day.strftime('%Y-%m-%d %H:%M:%S')}**")
            st.divider()

            # Hourly predictions for the chart (T+1h .. T+24h), normalised to a
            # list so length and emptiness checks behave the same for any sequence type.
            predictions_24h = predict_next_24_hours(input_features_hourly, hourly_models_24h)
            predictions_24h = [] if predictions_24h is None else list(predictions_24h)

            # Precomputed forecasts (if available) act as a deterministic fallback.
            precomputed_df = load_precomputed_hourly_forecasts()
            # Number of loaded hourly models; objects without a length count as 0.
            try:
                loaded_hourly_models_count = len(hourly_models_24h)
            except TypeError:
                loaded_hourly_models_count = 0

            # Prefer the precomputed forecast when fewer than 24 models are loaded.
            used_precomputed = False
            if (
                loaded_hourly_models_count < 24
                and not precomputed_df.empty
                and latest_time_for_day in precomputed_df.index
            ):
                row = precomputed_df.loc[latest_time_for_day]
                # Keep the non-missing numeric entries (Python and NumPy scalars)
                # and use the first 24 of them.
                nums = [
                    v for v in row.values
                    if isinstance(v, (int, float, np.integer, np.floating)) and not pd.isna(v)
                ]
                if len(nums) >= 24:
                    predictions_24h = [float(v) for v in nums[:24]]
                    used_precomputed = True
                    st.info("Using precomputed 120h forecast for this timestamp (deterministic fallback).")

            # --- Point and aggregate forecast values for the metric row ---
            n_predictions = len(predictions_24h)
            if n_predictions >= 24:
                t_plus_24h_metric_value = predictions_24h[23]
            elif n_predictions:
                t_plus_24h_metric_value = predictions_24h[-1]
            else:
                t_plus_24h_metric_value = float('nan')
            t_plus_2h_value = predictions_24h[1] if n_predictions >= 2 else float('nan')
            t_plus_3h_value = predictions_24h[2] if n_predictions >= 3 else float('nan')
            if n_predictions:
                avg_temp = np.nanmean(predictions_24h)
                max_temp = np.nanmax(predictions_24h)
            else:
                # np.nanmax raises ValueError on an empty sequence.
                avg_temp = float('nan')
                max_temp = float('nan')

            # 2. Summary for the next day (same hour tomorrow is T+24h).
            st.subheader(f"Summary Forecast for Next Day (Starting {latest_time_for_day.strftime('%H:%M')})")
            forecast_start_ts = latest_time_for_day + pd.Timedelta(hours=1)

            # --- Actual values for the 24 forecast hours (same idea as Tab 2) ---
            # A Timedelta step avoids the deprecated 'H' frequency alias.
            one_hour = pd.Timedelta(hours=1)
            actual_hourly_index = pd.date_range(start=forecast_start_ts, periods=24, freq=one_hour)
            try:
                # reindex() guarantees 24 rows; hours without data become NaN.
                actual_temps_24h_series = hourly_data_df['temp'].reindex(actual_hourly_index)
            except Exception:
                # Best-effort fallback: no actual values at all.
                actual_temps_24h_series = pd.Series([float('nan')] * 24, index=actual_hourly_index)

            # Timestamps of the point forecasts (forecast_start_ts is T+1h).
            forecast_t2_ts = forecast_start_ts + pd.Timedelta(hours=1)    # T+2
            forecast_t3_ts = forecast_start_ts + pd.Timedelta(hours=2)    # T+3
            forecast_t24_ts = forecast_start_ts + pd.Timedelta(hours=23)  # T+24
            actual_t2_val = actual_temps_24h_series.get(forecast_t2_ts)
            actual_t3_val = actual_temps_24h_series.get(forecast_t3_ts)
            actual_t24_val = actual_temps_24h_series.get(forecast_t24_ts)

            # Aggregate actuals are reported only when no hour is missing.
            is_partial_hourly = actual_temps_24h_series.isna().any()
            if is_partial_hourly:
                actual_avg_val = float('nan')
                actual_max_val = float('nan')
            else:
                actual_avg_val = np.nanmean(actual_temps_24h_series)
                actual_max_val = np.nanmax(actual_temps_24h_series)

            # Delta captions ("Actual: ...") for the metric widgets.
            delta_t2 = f"Actual: {actual_t2_val:.1f}°C" if pd.notna(actual_t2_val) else "Actual: --"
            delta_t3 = f"Actual: {actual_t3_val:.1f}°C" if pd.notna(actual_t3_val) else "Actual: --"
            delta_t24 = f"Actual: {actual_t24_val:.1f}°C" if pd.notna(actual_t24_val) else "Actual: --"
            delta_avg = f"Actual: {actual_avg_val:.1f}°C" if pd.notna(actual_avg_val) else "Actual: --"
            delta_max = f"Actual: {actual_max_val:.1f}°C" if pd.notna(actual_max_val) else "Actual: --"

            # Five metric columns: T+2h, T+3h, T+24h, 24h average, 24h maximum.
            col_t2, col_t3, col_t24, col_avg, col_max = st.columns(5)
            with col_t2:
                st.metric(
                    label=f"Forecast @ {forecast_t2_ts.strftime('%H:%M')} (T+2H)",
                    value=f"{t_plus_2h_value:.1f}°C",
                    delta=delta_t2,
                    delta_color="off"
                )
            with col_t3:
                st.metric(
                    label=f"Forecast @ {forecast_t3_ts.strftime('%H:%M')} (T+3H)",
                    value=f"{t_plus_3h_value:.1f}°C",
                    delta=delta_t3,
                    delta_color="off"
                )
            with col_t24:
                st.metric(
                    label=f"Forecast @ {forecast_t24_ts.strftime('%H:%M')} (T+24H)",
                    value=f"{t_plus_24h_metric_value:.1f}°C",
                    delta=delta_t24,
                    delta_color="off"
                )
            with col_avg:
                st.metric(
                    label="Next 24h Average Temp",
                    value=f"{avg_temp:.1f}°C",
                    delta=delta_avg,
                    delta_color="off"
                )
            with col_max:
                st.metric(
                    label="Next 24h Max Temp",
                    value=f"{max_temp:.1f}°C",  # predicted maximum
                    delta=delta_max,            # actual maximum
                    delta_color="off"
                )

            # --- Chart 1: previous 24 hours (actual) followed by the forecast ---
            st.subheader("Historical Context & Forecast (Hourly)")
            # Going back 23 hours yields 24 points including the selected hour.
            history_start_ts = latest_time_for_day - pd.Timedelta(hours=23)
            history_end_ts = latest_time_for_day
            history_df_hourly = hourly_data_df.loc[history_start_ts:history_end_ts]['temp']
            # One timestamp per available prediction.
            forecast_hourly_index = pd.date_range(start=forecast_start_ts, periods=len(predictions_24h), freq=one_hour)
            forecast_df_hourly = pd.DataFrame({
                'Time': forecast_hourly_index,
                'Forecast': predictions_24h
            }).set_index('Time')
            fig_hist_hourly = go.Figure()
            fig_hist_hourly.add_trace(go.Scatter(
                x=history_df_hourly.index, y=history_df_hourly,
                mode='lines+markers', name='Past 24 Hours (Actual)',
                line=dict(color='blue')
            ))
            fig_hist_hourly.add_trace(go.Scatter(
                x=forecast_df_hourly.index, y=forecast_df_hourly['Forecast'],
                mode='lines+markers', name='Next 24 Hours (Forecast)',
                line=dict(color='red', dash='dot')
            ))
            fig_hist_hourly.update_layout(
                title="Hourly Forecast vs. Historical Context",
                xaxis_title="Time", yaxis_title="Temperature (°C)",
                template="plotly_white", legend=dict(x=0.01, y=0.99)
            )
            st.plotly_chart(fig_hist_hourly, use_container_width=True)

            # --- Chart 2: forecast vs. actual for the forecast window ---
            st.subheader("24-Hour Forecast vs. Actual Comparison")
            actual_values_24h = actual_temps_24h_series.values
            is_partial_hourly_forecast = is_partial_hourly
            fig_comp_hourly = go.Figure()
            # The forecast line is always drawn.
            fig_comp_hourly.add_trace(go.Scatter(
                x=forecast_hourly_index, y=predictions_24h,
                mode='lines+markers', name='24-Hour Forecast',
                line=dict(color='red', dash='dot')
            ))
            # The actual line is drawn only when all 24 actual values exist.
            if not is_partial_hourly_forecast:
                fig_comp_hourly.add_trace(go.Scatter(
                    # The actuals carry their own 24-step index; the forecast index
                    # may be shorter when fewer predictions are available.
                    x=actual_hourly_index, y=actual_values_24h,
                    mode='lines+markers', name='24-Hour Actual',
                    line=dict(color='blue')
                ))
                fig_comp_hourly.update_layout(title="24-Hour Forecast vs. Actual Values")
            else:
                fig_comp_hourly.update_layout(title="24-Hour Forecast (Actual data not yet available)")
            fig_comp_hourly.update_layout(
                xaxis_title="Time", yaxis_title="Temperature (°C)",
                template="plotly_white", legend=dict(x=0.01, y=0.99)
            )
            st.plotly_chart(fig_comp_hourly, use_container_width=True)

            # --- Chart 3: RMSE per forecast horizon (reliability) ---
            st.subheader("Model Reliability: Error Degradation")
            # The plot needs both the 'Horizon' and 'RMSE' columns.
            if not hourly_perf_df.empty and {'Horizon', 'RMSE'}.issubset(hourly_perf_df.columns):
                # Only the first 24 rows are plotted; drop .head(24) to show every row.
                df_plot = hourly_perf_df.head(24)
                fig_rmse_hourly = go.Figure()
                fig_rmse_hourly.add_trace(go.Scatter(
                    x=df_plot['Horizon'],
                    y=df_plot['RMSE'],
                    mode='lines+markers',
                    name='RMSE',
                    line=dict(color='#005aa7')
                ))
                fig_rmse_hourly.update_layout(
                    title="RMSE Degradation: Forecast Error vs. Hour Ahead (T+1h to T+24h)",
                    xaxis_title="Forecast Horizon (Hours)",
                    yaxis_title="RMSE (°C)",
                    template="plotly_white",
                    yaxis_range=[0, df_plot['RMSE'].max() * 1.1 if not df_plot['RMSE'].empty else 1],
                    height=400  # fixed height to balance with the other charts
                )
                st.plotly_chart(fig_rmse_hourly, use_container_width=True)
            else:
                st.warning("Could not load Hourly RMSE Degradation data from hourly_120h_evaluation_results.csv.")

            # --- Feature Inspector: selected hourly inputs behind the forecast ---
            st.markdown("---")
            with st.expander("🔍 Feature Inspector: Hourly Inputs for the Forecast"):
                if not input_features_hourly.empty:
                    important_hourly_features = [
                        'temp', 'humidity', 'windspeed', 'cloudcover',
                        'temp_lag_1h', 'humidity_lag_24h', 'temp_diff_24h',
                        'temp_roll_24h_mean', 'humidity_roll_24h_mean',
                        'hour_sin', 'day_of_year_sin'
                    ]
                    col_h1, col_h2, col_h3 = st.columns(3)
                    # Emoji prefix per known hourly feature.
                    emoji_map = {
                        'temp': '🌡️', 'humidity': '💧', 'windspeed': '💨', 'cloudcover': '☁️',
                        'temp_lag_1h': '⏳', 'humidity_lag_24h': '📉', 'temp_diff_24h': '🔺',
                        'temp_roll_24h_mean': '📈', 'humidity_roll_24h_mean': '📊',
                        'hour_sin': '🕒', 'day_of_year_sin': '📅'
                    }
                    for i, feature in enumerate(important_hourly_features):
                        # Features missing from the input row are silently skipped.
                        if feature in input_features_hourly.columns:
                            raw_value = input_features_hourly[feature].iloc[0]
                            prefix = emoji_map.get(feature, '')
                            label = f"{prefix} {feature.replace('_', ' ').title()}".strip()
                            formatted = _format_feature_value(feature, raw_value)
                            # Spread the metrics round-robin over the three columns.
                            target_col = [col_h1, col_h2, col_h3][i % 3]
                            with target_col:
                                try:
                                    st.metric(label=label, value=formatted)
                                except Exception:
                                    st.write(f"{label}: {formatted}")
                else:
                    st.warning("No hourly feature data available for the selected hour.")
    else:
        st.warning("Please wait... Loading hourly data or models.")
# --- TAB 5: Live & Future Forecast (OpenWeatherMap) ---
with tab5:
# Tab 5: shows live weather and the OpenWeatherMap forecast fetched through
# get_live_weather / get_5_day_forecast, then a 5-day forecast from the champion
# models fed with a feature row built from the live reading.
# NOTE(review): this section has lost its indentation, so the nesting of the
# `if live:` / `if forecast_json ...` / trailing `else:` branches cannot be
# confirmed from here — verify against the original file before restructuring.
emoji = _time_of_day_emoji()
st.markdown(f"""
<div class="tab-header" style="display:flex;justify-content:space-between;align-items:center;position:relative;">
<h1 style="margin:0">📡Live Weather & OpenWeather Forecast</h1>
<div class="tab-emoji">{emoji}</div>
</div>
""", unsafe_allow_html=True)
# Get API key from Streamlit secrets; any lookup failure is treated as "no key".
api_key = None
try:
api_key = st.secrets["OPENWEATHER_API_KEY"]
except Exception:
api_key = None
if not api_key:
st.warning("OpenWeather API key not found. To enable live forecasts, add OPENWEATHER_API_KEY to .streamlit/secrets.toml")
else:
with st.spinner("Fetching live weather..."):
live = get_live_weather(api_key)
if live:
st.header("Live Weather in Ho Chi Minh City 🌆")
col1, col2, col3 = st.columns(3)
# Everything in this try renders the live metrics; on any error the raw
# payload is written out instead (see the matching except below).
try:
col1.metric("🌡️ Current Temperature", f"{live['main']['temp']:.1f}°C")
col2.metric("💧 Humidity", f"{live['main']['humidity']}%")
# wind speed in m/s -> km/h
# NOTE(review): wind_kmh is first assigned here, inside the try, and is read
# again when building live_summary further down — if an earlier line in this
# try raises, that later read fails with NameError.
wind_kmh = live.get('wind', {}).get('speed', 0) * 3.6
col3.metric("💨 Wind Speed", f"{wind_kmh:.1f} km/h")
# --- Additional prominent live metrics ---
feels_like = live.get('main', {}).get('feels_like')
cloud_cover = None
try:
cloud_cover = live.get('clouds', {}).get('all') if isinstance(live.get('clouds'), dict) else None
except Exception:
cloud_cover = None
# precipitation (1h or 3h) - prefer rain, fall back to snow
# NOTE(review): `get('1h') or get('3h')` also falls through to the 3h value
# when the 1h value is 0, while the metric label below always says "(1h)".
precip_val = None
rain_block = live.get('rain') if isinstance(live.get('rain'), dict) else None
snow_block = live.get('snow') if isinstance(live.get('snow'), dict) else None
if rain_block:
precip_val = rain_block.get('1h') or rain_block.get('3h')
elif snow_block:
precip_val = snow_block.get('1h') or snow_block.get('3h')
# Render additional metrics in a second row; each falls back to "--" on error
col4, col5, col6 = st.columns(3)
try:
col4.metric("🌡️ Feels Like", f"{feels_like:.1f}°C" if feels_like is not None else "--")
except Exception:
col4.metric("🌡️ Feels Like", "--")
try:
col5.metric("☁️ Cloud Cover", f"{cloud_cover:.0f}%" if cloud_cover is not None else "--")
except Exception:
col5.metric("☁️ Cloud Cover", "--")
try:
col6.metric("🌧️ Precipitation (1h)", f"{precip_val:.1f} mm" if precip_val is not None else "--")
except Exception:
col6.metric("🌧️ Precipitation (1h)", "--")
except Exception:
# Fallback: dump the raw API payload.
st.write(live)
st.markdown("---")
with st.spinner("Fetching 5-day forecast from OpenWeatherMap..."):
forecast_json = get_5_day_forecast(api_key)
if forecast_json and 'list' in forecast_json:
# Flatten the forecast entries into (time, temp) rows for plotting.
# NOTE(review): the timestamps are built from epoch seconds without a time
# zone, so the x axis is presumably UTC rather than local time — confirm.
forecast_list = []
for item in forecast_json['list']:
forecast_list.append({
'time': pd.to_datetime(item['dt'], unit='s'),
'temp': item['main']['temp']
})
forecast_df = pd.DataFrame(forecast_list)
st.header("5-Day Forecast (OpenWeatherMap)")
fig_live = px.line(forecast_df, x='time', y='temp', title='Temperature Forecast for the Next 5 Days', markers=True)
st.plotly_chart(fig_live, use_container_width=True)
st.info("Note: This forecast is provided by OpenWeatherMap and is independent of our trained ML models.")
# ---------------------------------------------
# Our Model-Based 5-Day Forecast (Hybrid)
# ---------------------------------------------
# Use the live summary + historical dataset to build features.
# Any failure in this section is reported as a warning by the final except.
try:
if models and not all_data_df.empty:
with st.spinner("Generating model-based 5-day forecast using live features..."):
# Build a simplified live summary from the API response
live_summary = {
'temp': live.get('main', {}).get('temp'),
'feelslike': live.get('main', {}).get('feels_like'),
'humidity': live.get('main', {}).get('humidity'),
'clouds': live.get('clouds', {}).get('all') if isinstance(live.get('clouds'), dict) else None,
'windspeed': wind_kmh,
}
# precipitation may be under 'rain' with '1h' or '3h'; the 'precip' key is
# left out of the summary entirely when neither is present
rain = live.get('rain', {}) if isinstance(live.get('rain', {}), dict) else {}
if '1h' in rain:
live_summary['precip'] = rain['1h']
elif '3h' in rain:
live_summary['precip'] = rain['3h']
# Create feature row
live_feature_vector = fe_live.create_live_feature_vector(live_summary, all_data_df)
# Reindex to the columns of all_data_df, filling absent ones with NaN;
# a reindex failure is ignored and the vector is used as returned
try:
live_feature_vector = live_feature_vector.reindex(columns=all_data_df.columns, fill_value=np.nan)
except Exception:
pass
# Generate predictions from champion models (Day1..Day5); a failing model
# yields NaN for its day plus a warning instead of aborting the loop
model_forecasts = []
for i, mdl in enumerate(models):
try:
# Align features to the model's expected input
X_in = _align_features_for_model(mdl, live_feature_vector, all_data_df.columns)
pred = mdl.predict(X_in)[0]
except Exception as e:
st.warning(f"Model for Day {i+1} prediction failed: {e}")
pred = float('nan')
model_forecasts.append(float(pred))
# Display model forecast
st.header("Our Model: 5-Day Forecast")
# NOTE(review): pd.Timestamp.now() carries no time zone, so "tomorrow" is
# presumably relative to the server clock, not Ho Chi Minh City — confirm.
five_day_dates = [pd.Timestamp.now().normalize() + pd.Timedelta(days=i) for i in range(1, 6)]
fig_model = go.Figure()
fig_model.add_trace(go.Scatter(x=five_day_dates, y=model_forecasts, mode='lines+markers', name='Model Forecast', line=dict(color='#d62728')))
fig_model.update_layout(title="Model 5-Day Forecast (Our Stacking Models)", xaxis_title="Date", yaxis_title="Temperature (°C)", template='plotly_white')
st.plotly_chart(fig_model, use_container_width=True)
# Model 5-Day Outlook (Champion Stacking)
# Reference values for the deltas: the live temperature and the last 'temp'
# value in all_data_df.
current_temp = live_summary.get('temp', float('nan'))
prev_day_temp = all_data_df['temp'].iloc[-1] if 'temp' in all_data_df.columns else float('nan')
st.subheader("Model 5-Day Outlook — Champion Stacking")
# Render five compact cards horizontally (responsive)
try:
cards = st.columns(5)
for i, pred in enumerate(model_forecasts):
date = five_day_dates[i]
try:
delta_now = pred - current_temp if pd.notna(current_temp) else float('nan')
except Exception:
delta_now = float('nan')
try:
delta_prev = pred - prev_day_temp if pd.notna(prev_day_temp) else float('nan')
except Exception:
delta_prev = float('nan')
# Short advisory text chosen by fixed thresholds (>= 30, <= 20).
if pd.notna(pred):
if pred >= 30:
summary_text = "Hot and humid. Stay hydrated and watch for heat stress."
elif pred <= 20:
summary_text = "Cool conditions expected; light jacket recommended."
else:
summary_text = "Comfortable temperatures expected."
else:
summary_text = "Model could not produce a reliable prediction for this day."
# NOTE(review): a NaN pred or delta is formatted as "nan" in the card below.
card_html = f"""
<div style='background:#ffffff;border-radius:10px;padding:12px;border:1px solid #e6eef8;box-shadow:0 2px 8px rgba(3,37,76,0.04);'>
<div style='font-size:0.95rem;color:#004080;font-weight:700;margin-bottom:6px;'>{date.strftime('%A, %b %d')}</div>
<div style='font-size:28px;font-weight:800;color:#d62728;margin-bottom:6px;'>{pred:.1f}°C</div>
<div style='font-size:0.85rem;color:#333;margin-bottom:6px;'>
<span style='color:#2b8a3e;font-weight:600;'>{delta_now:+.1f}° vs now</span>
&nbsp;|&nbsp;
<span style='color:#1f77b4;font-weight:600;'>{delta_prev:+.1f}° vs prev</span>
</div>
<div style='font-size:0.85rem;color:#555;margin-top:6px;'>{summary_text}</div>
</div>
"""
with cards[i]:
st.markdown(card_html, unsafe_allow_html=True)
except Exception:
# fallback to a plain text list if the card layout fails
st.subheader("Daily Outlook (Model)")
for i, pred in enumerate(model_forecasts):
date = five_day_dates[i]
st.write(f"{date.strftime('%A, %b %d')} — Predicted Avg: {pred:.1f}°C")
else:
st.info("Model-based forecast unavailable: champion models or historical data not found.")
except Exception as e:
st.warning(f"Could not run model-based forecast: {e}")
else:
st.warning("Could not retrieve 5-day forecast from OpenWeatherMap.")