Spaces:

Harveyntt
/

DSEB65A_Group4_FinalProject_HCMWeatherForecast

Sleeping

App Files Files Community

Harveyntt committed on Nov 17, 2025

Commit

8a9ed0e

verified ·

1 Parent(s): 88d3c90

Upload 4 files

Browse files

Files changed (4) hide show

src/__init__.py +0 -0
src/benchmark_utils.py +34 -0
src/diagnostic_plots.py +190 -0
src/feature_engineering_live.py +130 -0

src/__init__.py ADDED Viewed

File without changes

src/benchmark_utils.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import pandas as pd
+import streamlit as st
@st.cache_data
def _read_leaderboard_csv(file_path):
    """Read the leaderboard CSV from ``file_path`` and return it unmodified.

    Only this raw read is cached. Any exception raised by ``pd.read_csv``
    propagates to the caller, so a failed load is never stored in the cache.
    """
    return pd.read_csv(file_path)


def load_leaderboard(file_path="data/results_df_all_tuned.csv"):
    """
    Load the model leaderboard CSV and sort it for display.

    The CSV read is cached through ``_read_leaderboard_csv``; error handling
    lives here, outside the cache. Previously the whole function was cached,
    so the empty DataFrame returned after a failure was cached as well and
    kept being served even after the file problem had been fixed.

    Args:
        file_path (str): Path to the leaderboard CSV file.
    Returns:
        pd.DataFrame: The leaderboard sorted ascending by
        'RMSE (Absolute Error)' when that column exists, the unsorted data
        (plus a Streamlit warning) when it does not, or an empty DataFrame
        (plus a Streamlit error) when the file cannot be loaded.
    """
    sort_column = 'RMSE (Absolute Error)'
    try:
        df = _read_leaderboard_csv(file_path)
    except FileNotFoundError:
        st.error(f"LỖI: Không tìm thấy file leaderboard tại đường dẫn: {file_path}")
        return pd.DataFrame()  # empty frame signals "nothing to show" to the caller
    except Exception as e:
        # Best-effort UI boundary: report any other load/parse failure instead of crashing the app.
        st.error(f"Lỗi khi tải leaderboard: {e}")
        return pd.DataFrame()

    if sort_column not in df.columns:
        st.warning(f"Không tìm thấy cột '{sort_column}' để sắp xếp leaderboard. "
                   f"Vui lòng kiểm tra file `src/benchmark_utils.py`.")
        return df
    return df.sort_values(by=sort_column, ascending=True)


# Keep `load_leaderboard.clear()` working for callers that used it while the
# public function itself carried the cache decorator.
load_leaderboard.clear = _read_leaderboard_csv.clear

src/diagnostic_plots.py ADDED Viewed

	@@ -0,0 +1,190 @@

+import pandas as pd
+import plotly.graph_objects as go
+import plotly.express as px
+import streamlit as st # Cần thiết để báo lỗi nếu cột không tồn tại
+# --- HÀM 1: Biểu đồ suy giảm hiệu suất (Theo Checklist mục 5) ---
def plot_performance_degradation(df, metric_column, metric_name, color='blue'):
    """
    Build a line plot of one metric against the forecast horizon (day ahead).

    The horizon is read from the 'Horizon' column. Labels such as "Day 1" are
    reduced to their first run of digits; plain numeric horizons are accepted
    too. Rows whose horizon contains no digits are dropped instead of raising.

    Args:
        df (pd.DataFrame): Results table containing 'Horizon' and ``metric_column``.
        metric_column (str): Exact name of the metric column in ``df``
            (e.g. 'RMSE (Absolute Error)').
        metric_name (str): Display name used for the trace, title and Y axis.
        color (str): Line colour.
    Returns:
        plotly.graph_objects.Figure: The line plot, or an empty figure (after
        reporting the problem through ``st.error``) when a required column is
        missing.
    """
    DAY_AHEAD_COLUMN = 'Horizon'

    for required in (DAY_AHEAD_COLUMN, metric_column):
        if required not in df.columns:
            st.error(f"Lỗi plot: Không tìm thấy cột '{required}' trong dữ liệu. "
                     f"Vui lòng kiểm tra file `src/diagnostic_plots.py`.")
            return go.Figure()

    # Work on a copy so the caller's DataFrame is not mutated.
    plot_df = df.copy()

    # Cast to str first: the `.str` accessor raises on numeric columns.
    # `expand=False` yields a Series rather than a one-column DataFrame.
    day_digits = plot_df[DAY_AHEAD_COLUMN].astype(str).str.extract(r'(\d+)', expand=False)
    # Labels without digits become NaN here and are dropped, because casting
    # NaN to int would raise.
    plot_df['day_num'] = pd.to_numeric(day_digits, errors='coerce')
    plot_df = plot_df.dropna(subset=['day_num'])
    plot_df['day_num'] = plot_df['day_num'].astype(int)
    plot_df = plot_df.sort_values(by='day_num')

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=plot_df['day_num'],
        y=plot_df[metric_column],
        mode='lines+markers',
        name=metric_name,
        line=dict(color=color, width=3),
        marker=dict(size=8),
    ))
    fig.update_layout(
        title=f"<b>{metric_name} vs. Forecast Horizon</b>",
        xaxis_title="Day Ahead (Horizon)",
        yaxis_title=metric_name,
        title_x=0.5,  # centre the title
        template="plotly_white",
        # One tick per integer day, starting at day 1.
        xaxis=dict(tickmode='linear', tick0=1, dtick=1),
    )
    # R2-style metrics get a fixed 0..1 Y range for readability.
    if "R2" in metric_name or "R-squared" in metric_name:
        fig.update_layout(yaxis_range=[0, 1])
    return fig
+# --- HÀM 2: Biểu đồ Dự báo vs. Thực tế (Theo Checklist mục 5) ---
def plot_forecast_vs_actual(y_true, y_pred, day_ahead_title):
    """
    Build a scatter plot of predicted values against actual values.

    A dashed red y = x line is overlaid to mark where perfect predictions
    would fall.

    Args:
        y_true (array-like): Actual values.
        y_pred (array-like): Predicted values.
        day_ahead_title (str): Subtitle appended to the plot title
            (e.g. "Day 1" or "Day 5").
    Returns:
        plotly.graph_objects.Figure: The scatter plot with the reference line.
    """
    # Temporary frame so Plotly Express can address the two series by name.
    frame = pd.DataFrame({'Actual': y_true, 'Predicted': y_pred})
    actual_col = frame['Actual']
    predicted_col = frame['Predicted']

    hover_format = {'Actual': ':.2f', 'Predicted': ':.2f'}
    figure = px.scatter(
        frame,
        x='Actual',
        y='Predicted',
        title=f"<b>Forecast vs. Actual - {day_ahead_title}</b>",
        opacity=0.7,
        hover_data=hover_format,
    )

    # The diagonal spans the combined value range of both series.
    lower = min(actual_col.min(), predicted_col.min())
    upper = max(actual_col.max(), predicted_col.max())
    diagonal = [lower, upper]
    perfect_line = go.Scatter(
        x=diagonal,
        y=diagonal,
        mode='lines',
        name='Perfect Prediction',
        line={'color': 'red', 'dash': 'dash', 'width': 2},
    )
    figure.add_trace(perfect_line)

    figure.update_layout(
        title_x=0.5,
        xaxis_title="Actual Temperature (°C)",
        yaxis_title="Predicted Temperature (°C)",
        template="plotly_white",
    )
    return figure
+# --- CÁC HÀM 3 & 4: Biểu đồ "Deep Dive" (Theo Checklist mục 5 - Tùy chọn) ---
def plot_residuals_vs_time(y_true, y_pred, dates, day_ahead_title):
    """
    Build a scatter plot of residuals (actual - predicted) over time.

    Inputs are converted to positional arrays before subtracting. This makes
    plain Python lists work (``list - list`` raises ``TypeError``) and pairs
    values by position, so two Series with different indexes no longer align
    by label and produce NaN residuals.

    Args:
        y_true (array-like): Actual values.
        y_pred (array-like): Predicted values, same length as ``y_true``.
        dates (array-like): Dates matching each value, same length as ``y_true``.
        day_ahead_title (str): Subtitle appended to the plot title (e.g. "Day 1").
    Returns:
        plotly.graph_objects.Figure: The residual scatter plot with a dashed
        zero-error reference line.
    """
    actual = pd.Series(y_true).to_numpy()
    predicted = pd.Series(y_pred).to_numpy()
    residuals = actual - predicted

    plot_df = pd.DataFrame({
        # pd.Index drops any Series index on `dates`, so it pairs positionally too.
        'Date': pd.Index(dates),
        'Residual': residuals,
    })
    fig = px.scatter(
        plot_df,
        x='Date',
        y='Residual',
        title=f"<b>Residuals vs. Time - {day_ahead_title}</b>",
        opacity=0.7,
    )
    # Horizontal y = 0 line marks zero error.
    fig.add_hline(y=0, line=dict(color='red', dash='dash', width=2))
    fig.update_layout(
        title_x=0.5,
        yaxis_title="Residual (Actual - Predicted)",
        template="plotly_white",
    )
    return fig
def plot_residuals_distribution(y_true, y_pred, day_ahead_title):
    """
    Build a histogram of the residuals (actual - predicted).

    Inputs are converted to positional arrays before subtracting. This makes
    plain Python lists work (``list - list`` raises ``TypeError``) and pairs
    values by position, so two Series with different indexes no longer align
    by label and produce NaN residuals.

    Args:
        y_true (array-like): Actual values.
        y_pred (array-like): Predicted values, same length as ``y_true``.
        day_ahead_title (str): Subtitle appended to the plot title (e.g. "Day 1").
    Returns:
        plotly.graph_objects.Figure: The histogram figure (50 bins requested).
    """
    actual = pd.Series(y_true).to_numpy()
    predicted = pd.Series(y_pred).to_numpy()
    # A named Series gives the plotted variable a stable, meaningful label.
    residuals = pd.Series(actual - predicted, name='Residual')

    fig = px.histogram(
        residuals,
        nbins=50,
        title=f"<b>Residuals Distribution - {day_ahead_title}</b>",
    )
    fig.update_layout(
        title_x=0.5,
        xaxis_title="Residual (Error)",
        yaxis_title="Count",
        template="plotly_white",
    )
    return fig

src/feature_engineering_live.py ADDED Viewed

	@@ -0,0 +1,130 @@

+import pandas as pd
+import numpy as np
# Keys accepted in the live payload for each model feature, in priority order.
_LIVE_KEY_ALIASES = {
    'temp': ['temp', 'temperature', 'avg_temp'],
    'feelslike': ['feelslike', 'feels_like'],
    'humidity': ['humidity'],
    'precip': ['precip', 'precipitation', 'rain'],
    'windspeed': ['windspeed', 'wind_speed', 'windspd'],
    'cloudcover': ['cloudcover', 'clouds', 'cloud_percent'],
}

# Lag feature column -> (source column, days back from the end of the history).
_LAG_FEATURES = {
    'temp_lag_1': ('temp', 1),
    'temp_lag_2': ('temp', 2),
    'humidity_lag_1': ('humidity', 1),
}

# Rolling feature column -> (source column, window length in days, statistic).
_ROLLING_FEATURES = {
    'temp_roll_7d_mean': ('temp', 7, 'mean'),
    'temp_roll_7d_std': ('temp', 7, 'std'),
    'temp_roll_14d_std': ('temp', 14, 'std'),
    'precip_roll_7d_sum': ('precip', 7, 'sum'),
}


def _to_float(value):
    """Return ``value`` as a float, or NaN when it cannot be converted."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return np.nan


def _lookup_live_value(live, candidates):
    """Return the first non-None value stored under any candidate key.

    Top-level keys are tried first, then the same keys inside a nested
    ``'main'`` dict. Returns None when nothing is found.
    """
    for key in candidates:
        if live.get(key) is not None:
            return live[key]
    nested = live.get('main')
    # 'main' is only searched when it really is a mapping; other types are ignored.
    if isinstance(nested, dict):
        for key in candidates:
            if nested.get(key) is not None:
                return nested[key]
    return None


def _history_value(history, column, offset):
    """Return ``history[column]`` ``offset`` rows from the end, or NaN if unavailable."""
    if column not in history.columns or not 1 <= offset <= len(history):
        return np.nan
    return _to_float(history[column].iloc[-offset])


def _rolling_stat(history_values, today_value, window, stat):
    """Compute ``stat`` over the last ``window - 1`` historical values plus today.

    Non-numeric and missing historical values are ignored. Today's value is
    included only when it is not NaN. Returns NaN when no values remain or
    ``stat`` is not one of 'mean', 'std', 'sum'.
    """
    numeric_history = pd.to_numeric(history_values, errors='coerce').dropna()
    # tail(0) is empty, whereas iloc[-0:] would have returned the entire history.
    values = numeric_history.tail(max(window - 1, 0)).tolist()
    if not pd.isna(today_value):
        values.append(float(today_value))
    if not values:
        return np.nan
    combined = pd.Series(values, dtype="float64")
    if stat == 'mean':
        return float(combined.mean())
    if stat == 'std':
        return float(combined.std())
    if stat == 'sum':
        return float(combined.sum())
    return np.nan


def create_live_feature_vector(live_daily_summary: dict, historical_data: pd.DataFrame,
                               now: "pd.Timestamp | None" = None) -> pd.DataFrame:
    """Create a single-row feature DataFrame with the same columns as ``historical_data``.

    This is a reduced-feature implementation: it engineers a small set of
    features from ``live_daily_summary`` and the recent history, then fills
    every remaining column from the last historical row. Values that cannot be
    represented as floats (in the live payload or the history) become NaN.

    Features written, each only when the column exists in ``historical_data``:
      * direct values: temp, feelslike, humidity, precip, windspeed, cloudcover
        (looked up under several aliases, top-level or inside ``'main'``);
      * calendar: year, month, day_of_year;
      * lags: temp_lag_1, temp_lag_2, humidity_lag_1;
      * rolling: temp_roll_7d_mean, temp_roll_7d_std, temp_roll_14d_std,
        precip_roll_7d_sum (history combined with today's live value).

    Args:
        live_daily_summary: Dict of today's live weather values. ``None`` or an
            empty dict means "no live data"; the row then comes from history.
        historical_data: Non-empty DataFrame whose columns define the output
            schema and whose last rows are treated as the most recent days.
        now: Timestamp to treat as "now". Defaults to ``pd.Timestamp.now()``.
            NOTE(review): the default is the server's local clock — confirm it
            matches the forecast location's timezone.

    Returns:
        A one-row float64 DataFrame indexed by ``now`` with columns in the same
        order as ``historical_data``.

    Raises:
        ValueError: If ``historical_data`` is None or empty.
    """
    if historical_data is None or historical_data.empty:
        raise ValueError("historical_data must be a non-empty DataFrame")

    live = live_daily_summary or {}
    # Resolve "now" once so the calendar features and the index cannot disagree.
    current = pd.Timestamp.now() if now is None else pd.Timestamp(now)
    columns = historical_data.columns
    template = historical_data.iloc[-1]  # last historical day, used as fallback

    # All-NaN row on the model's column index, so column order is preserved.
    today_row = pd.Series(np.nan, index=columns, dtype="float64")

    # Direct mappings from the live payload.
    for feature, candidates in _LIVE_KEY_ALIASES.items():
        if feature not in columns:
            continue
        value = _to_float(_lookup_live_value(live, candidates))
        if not np.isnan(value):
            today_row[feature] = value

    # The rolling temperature stats need a value for today, so fall back to the
    # last historical temperature before computing them.
    if 'temp' in columns and pd.isna(today_row['temp']):
        today_row['temp'] = _to_float(template['temp'])

    # Calendar features.
    today_date = current.normalize()
    calendar = {
        'year': today_date.year,
        'month': today_date.month,
        'day_of_year': today_date.dayofyear,
    }
    for name, value in calendar.items():
        if name in columns:
            today_row[name] = value

    # Lag features: the last historical rows are "yesterday", "two days ago", ...
    for target, (source, offset) in _LAG_FEATURES.items():
        if target in columns:
            today_row[target] = _history_value(historical_data, source, offset)

    # Rolling features over recent history plus today's value.
    for target, (source, window, stat) in _ROLLING_FEATURES.items():
        if target in columns and source in columns:
            today_row[target] = _rolling_stat(
                historical_data[source], today_row[source], window, stat
            )

    # Everything still missing is copied from the last historical row.
    for col in columns:
        if pd.isna(today_row[col]):
            today_row[col] = _to_float(template[col])

    return pd.DataFrame([today_row.to_numpy()], columns=columns, index=[current])