ML_Final_Hourly

Sleeping

App Files Files Community

Gumball2k5 committed on Nov 15, 2025

Commit

bdab3fd

verified ·

1 Parent(s): ade05b3

Create app.py

Browse files

Files changed (1) hide show

app.py +348 -0

app.py ADDED Viewed

	@@ -0,0 +1,348 @@

+# --- 1. IMPORT THƯ VIỆN ---
+import streamlit as st
+import pandas as pd
+import joblib
+import plotly.graph_objects as go
+from datetime import datetime
+# Import the project's helper modules from the 'src' package
+try:
+    from src import benchmark_utils
+    from src import diagnostic_plots as diag
+except ImportError:
+    # Report the failed import in the UI, then stop this script run.
+    st.error("Lỗi: Không tìm thấy file 'src/benchmark_utils.py' hoặc 'src/diagnostic_plots.py'. "
+             "Hãy đảm bảo chúng tồn tại trong thư mục 'src/'.")
+    st.stop()
+# --- 2. PAGE CONFIGURATION ---
+st.set_page_config(
+    page_title="Saigon Temperature Forecast",
+    page_icon="🌦️",
+    layout="wide"
+)
+# --- 3. DATA & MODEL LOADERS (WITH CACHING) ---
+# Checklist items 1 & 2: load everything heavy through the Streamlit cache
+@st.cache_data
+def load_feature_data(file_path="data/final_dataset_tree.csv"):
+    """Tải dữ liệu features và targets, chuyển đổi index thành datetime."""
+    try:
+        df = pd.read_csv(file_path)
+        # --- TÙY CHỈNH QUAN TRỌNG ---
+        # Đảm bảo 'datetime' là tên cột ngày tháng trong file CSV của bạn
+        DATE_COLUMN = 'datetime'
+        if DATE_COLUMN not in df.columns:
+            st.error(f"Lỗi: Không tìm thấy cột ngày tháng '{DATE_COLUMN}' trong 'final_dataset_tree.csv'. "
+                     f"Vui lòng cập nhật biến DATE_COLUMN trong 'app.py'.")
+            return pd.DataFrame()
+        df[DATE_COLUMN] = pd.to_datetime(df[DATE_COLUMN])
+        df = df.set_index(DATE_COLUMN)
+        df = df.sort_index()
+        return df
+    except FileNotFoundError:
+        st.error(f"LỖI: Không tìm thấy file data chính tại: {file_path}")
+        return pd.DataFrame()
+@st.cache_resource
+def load_champion_models():
+    """Tải 5 mô hình chuyên gia (specialist models) từ checklist."""
+    models = []
+    try:
+        for i in range(1, 6):
+            file_path = f"models/champion_stacking_day{i}.pkl"
+            model = joblib.load(file_path)
+            models.append(model)
+        return models
+    except FileNotFoundError as e:
+        st.error(f"LỖI: Không tìm thấy file mô hình. Đã kiểm tra: {e.filename}. "
+                 "Hãy đảm bảo 5 file .pkl nằm trong thư mục 'models/'.")
+        return []
+@st.cache_data
+def load_performance_data(file_path="data/final_5_day_results_df.csv"):
+    """Tải dữ liệu hiệu suất đã tính toán trước cho Tab 3."""
+    try:
+        df = pd.read_csv(file_path)
+        return df
+    except FileNotFoundError:
+        st.error(f"LỖI: Không tìm thấy file hiệu suất tại: {file_path}")
+        return pd.DataFrame()
+# --- 4. KHỞI TẠO DỮ LIỆU & TÁCH TEST SET ---
+# Tải tất cả dữ liệu và mô hình
+all_data_df = load_feature_data()
+models = load_champion_models()
+perf_df = load_performance_data()
+# --- TÙY CHỈNH QUAN TRỌNG ---
+# Giả định tên các cột target (thực tế) trong file CSV của bạn
+# Checklist không nói rõ, nên tôi giả định tên là 't+1', 't+2', v.v.
+TARGET_COLS = [f't+{i}' for i in range(1, 6)]
+# Giả định tên cột nhiệt độ của ngày HIỆN TẠI (dùng để vẽ lịch sử)
+CURRENT_TEMP_COL = 'temp'
+# Tách test set (dựa trên ngày trong checklist)
+TEST_START_DATE = "2024-02-20"
+TEST_END_DATE = "2025-09-26"
+X_test, y_test, test_df = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
+if not all_data_df.empty:
+    try:
+        test_df = all_data_df.loc[TEST_START_DATE:TEST_END_DATE].copy()
+        # Giả định: 157 features là TẤT CẢ các cột KHÔNG PHẢI là target
+        feature_cols = [col for col in test_df.columns if col not in TARGET_COLS]
+        # Tách X_test (features) và y_test (thực tế)
+        X_test = test_df[feature_cols]
+        y_test = test_df[TARGET_COLS]
+        # Đổi tên cột y_test cho dễ hiểu (dùng trong Tab 3)
+        y_test.columns = [f'Day {i}' for i in range(1, 6)]
+    except KeyError:
+        st.error(f"Lỗi: Không tìm thấy cột target (ví dụ: '{TARGET_COLS[0]}') hoặc cột "
+                 f"'{CURRENT_TEMP_COL}' trong file CSV. Vui lòng cập nhật 'app.py'.")
+    except Exception as e:
+        st.error(f"Lỗi khi xử lý test set: {e}")
+else:
+    st.error("Không thể tải dữ liệu chính, ứng dụng không thể tiếp tục.")
+    st.stop()
+# --- 5. GIAO DIỆN SIDEBAR (THANH ĐIỀU HƯỚNG) ---
+st.sidebar.title("Navigation")
+app_section = st.sidebar.radio(
+    "Choose a section:",
+    ("Project Overview & Methodology", "Live 5-Day Forecast", "Model Performance & Diagnostics")
+)
+# Date input chỉ hiển thị khi ở tab "Live Forecast"
+selected_date = None
+if app_section == "Live 5-Day Forecast":
+    st.sidebar.header("Forecast Input")
+    if not X_test.empty:
+        min_date = X_test.index.min()
+        max_date = X_test.index.max()
+        selected_date = st.sidebar.date_input(
+            "Select a date from the test set:",
+            value=min_date,
+            min_value=min_date,
+            max_value=max_date,
+            format="YYYY-MM-DD"
+        )
+    else:
+        st.sidebar.error("Test data could not be loaded.")
+# --- 6. MAIN PANEL ---
+if app_section == "Project Overview & Methodology":
+    # --- CHECKLIST ITEM 3 ---
+    st.title("Saigon Temperature Forecasting Application 🌦️")
+    st.subheader("Project Summary")
+    st.markdown("""
+    Mục tiêu của dự án này là dự đoán nhiệt độ trung bình hàng ngày cho TP. Hồ Chí Minh trong 5 ngày tới.
+    * **Dữ liệu:** Dữ liệu thời tiết lịch sử 10 năm từ Visual Crossing.
+    * **Mô hình:** Chúng tôi sử dụng 5 mô hình 'chuyên gia' (specialist models) - mỗi mô hình được tối ưu để dự đoán một ngày cụ thể trong tương lai (T+1 đến T+5).
+    """)
+    st.subheader("Our 'Two-Stream' Strategy")
+    st.markdown("""
+    Để tối ưu hóa hiệu suất, chúng tôi đã áp dụng chiến lược "Hai luồng" (Two-Stream):
+    1.  **Luồng 1 (Linear Models):** Các mô hình tuyến tính (như Linear Regression) được huấn luyện trên một bộ features đã được tinh gọn (sử dụng VIF) để tránh đa cộng tuyến.
+    2.  **Luồng 2 (Tree-based Models):** Các mô hình phức tạp hơn (như Random Forest, Gradient Boosting) được huấn luyện trên một bộ features toàn diện (157 features) để nắm bắt các mối quan hệ phi tuyến.
+    Mô hình chiến thắng (Champion Model) của chúng tôi là một mô hình **Stacking** từ Luồng 2, cho thấy hiệu suất vượt trội.
+    """)
+    st.subheader("Final Model Leaderboard")
+    st.markdown("Bảng xếp hạng các mô hình dựa trên điểm RMSE trung bình (càng thấp càng tốt).")
+    # Fetch the leaderboard table via benchmark_utils.py
+    leaderboard_df = benchmark_utils.load_leaderboard()
+    if not leaderboard_df.empty:
+        # Show only the first 10 rows of the leaderboard
+        st.dataframe(leaderboard_df.head(10), use_container_width=True)
+    else:
+        st.warning("Không thể tải dữ liệu leaderboard.")
+# --------------------------------------------------------------------
+elif app_section == "Live 5-Day Forecast":
+    # --- MỤC 4 TRONG CHECKLIST ---
+    st.title("Live 5-Day Forecast")
+    if selected_date and not X_test.empty and models:
+        st.header(f"Dự báo cho 5 ngày tới từ: {selected_date.strftime('%Y-%m-%d')}")
+        # 1. Lấy Input Features
+        selected_date_ts = pd.Timestamp(selected_date)
+        input_features = X_test.loc[[selected_date_ts]]
+        if input_features.empty:
+            st.error("Không tìm thấy dữ liệu cho ngày đã chọn.")
+        else:
+            # 2. Tạo dự đoán
+            predictions = []
+            for i in range(5):
+                model = models[i] # Lấy mô hình T+i
+                pred = model.predict(input_features)[0]
+                predictions.append(pred)
+            # 3. Hiển thị dự đoán (dùng st.metric)
+            forecast_dates = pd.date_range(start=selected_date, periods=6, freq='D')[1:]
+            cols = st.columns(5)
+            # Lấy giá trị thực tế để so sánh
+            actual_values = y_test.loc[selected_date_ts].values
+            for i in range(5):
+                with cols[i]:
+                    st.metric(
+                        label=f"Forecast for {forecast_dates[i].strftime('%b %d')}",
+                        value=f"{predictions[i]:.1f}°C",
+                        delta=f"Actual: {actual_values[i]:.1f}°C",
+                        delta_color="off" # Màu xám trung tính
+                    )
+            # 4. Biểu đồ (Optimal Suggestion)
+            st.subheader("Historical Context & Forecast")
+            # Lấy 14 ngày lịch sử
+            history_start = selected_date_ts - pd.Timedelta(days=14)
+            history_end = selected_date_ts
+            # Lấy dữ liệu 'temp' thực tế từ dataframe gốc
+            history_df = all_data_df.loc[history_start:history_end][CURRENT_TEMP_COL]
+            # Tạo dataframe cho dự báo
+            forecast_df = pd.DataFrame({
+                'Date': forecast_dates,
+                'Forecast': predictions
+            }).set_index('Date')
+            fig = go.Figure()
+            fig.add_trace(go.Scatter(
+                x=history_df.index, y=history_df,
+                mode='lines+markers', name='Past 14 Days (Actual)',
+                line=dict(color='blue')
+            ))
+            fig.add_trace(go.Scatter(
+                x=forecast_df.index, y=forecast_df['Forecast'],
+                mode='lines+markers', name='5-Day Forecast',
+                line=dict(color='red', dash='dot')
+            ))
+            fig.update_layout(
+                title="Forecast vs. Historical Context",
+                xaxis_title="Date", yaxis_title="Temperature (°C)",
+                template="plotly_white", legend=dict(x=0.01, y=0.99)
+            )
+            st.plotly_chart(fig, use_container_width=True)
+    else:
+        st.warning("Vui lòng đợi... Đang tải dữ liệu hoặc mô hình.")
+# --------------------------------------------------------------------
+elif app_section == "Model Performance & Diagnostics":
+    # --- MỤC 5 TRONG CHECKLIST ---
+    st.title("Model Performance & Diagnostics")
+    if not perf_df.empty and not y_test.empty:
+        st.subheader("Performance Degradation over 5 Days")
+        st.markdown("Hiệu suất mô hình thay đổi như thế nào khi dự báo xa hơn.")
+        # 1. Biểu đồ suy giảm hiệu suất (RMSE & R2)
+        # --- TÙY CHỈNH ---
+        # Đảm bảo 'RMSE' và 'R2' là tên cột chính xác trong file 'final_5_day_results_df.csv'
+        RMSE_COL_NAME = 'RMSE'
+        R2_COL_NAME = 'R2'
+        col1, col2 = st.columns(2)
+        with col1:
+            fig_rmse = diag.plot_performance_degradation(
+                perf_df,
+                metric_column=RMSE_COL_NAME,
+                metric_name='RMSE (Temperature °C)',
+                color='blue'
+            )
+            st.plotly_chart(fig_rmse, use_container_width=True)
+        with col2:
+            fig_r2 = diag.plot_performance_degradation(
+                perf_df,
+                metric_column=R2_COL_NAME,
+                metric_name='R-squared (R²)',
+                color='green'
+            )
+            st.plotly_chart(fig_r2, use_container_width=True)
+        # 2. Biểu đồ Dự báo vs. Thực tế
+        st.subheader("Forecast vs. Actual Comparison (on entire test set)")
+        # Hàm này chạy dự đoán trên *toàn bộ* X_test (hàng ngàn dòng)
+        # Nó sẽ rất chậm nếu không có cache
+        @st.cache_data
+        def get_full_test_predictions(_models, _X_test):
+            """Chạy dự đoán trên toàn bộ test set và cache lại."""
+            all_preds = {}
+            for i in range(5):
+                model = _models[i]
+                preds = model.predict(_X_test)
+                all_preds[f'Day {i+1}'] = preds
+            return pd.DataFrame(all_preds, index=_X_test.index)
+        with st.spinner("Running predictions on entire test set... (This is cached for next time)"):
+            y_pred_test = get_full_test_predictions(models, X_test)
+        col1, col2 = st.columns(2)
+        with col1:
+            fig_d1 = diag.plot_forecast_vs_actual(
+                y_true=y_test['Day 1'],
+                y_pred=y_pred_test['Day 1'],
+                day_ahead_title="Day 1 Forecast"
+            )
+            st.plotly_chart(fig_d1, use_container_width=True)
+        with col2:
+            fig_d5 = diag.plot_forecast_vs_actual(
+                y_true=y_test['Day 5'],
+                y_pred=y_pred_test['Day 5'],
+                day_ahead_title="Day 5 Forecast"
+            )
+            st.plotly_chart(fig_d5, use_container_width=True)
+        # 3. Mục Tùy chọn: Deep Dive Expander
+        with st.expander("Champion Model Diagnostics (Deep Dive)"):
+            st.markdown("Phân tích chi tiết phần dư (lỗi = thực tế - dự báo) cho dự báo Day 1.")
+            y_true_d1 = y_test['Day 1']
+            y_pred_d1 = y_pred_test['Day 1']
+            dates_d1 = y_test.index
+            fig_res_time = diag.plot_residuals_vs_time(
+                y_true_d1, y_pred_d1, dates_d1, "Day 1"
+            )
+            st.plotly_chart(fig_res_time, use_container_width=True)
+            fig_res_dist = diag.plot_residuals_distribution(
+                y_true_d1, y_pred_d1, "Day 1"
+            )
+            st.plotly_chart(fig_res_dist, use_container_width=True)
+            st.markdown("Một mô hình tốt sẽ có phần dư (lỗi) phân phối chuẩn (hình chuông) "
+                        "quanh giá trị 0 và không có xu hướng (pattern) nào theo thời gian.")
+    else:
+        st.warning("Đang tải dữ liệu hiệu suất...")