final

Sleeping

App Files Files Community

final / app.py

Gumball2k5

Create app.py

bdab3fd verified 5 months ago

raw

history blame

14.4 kB

	# --- 1. IMPORT THƯ VIỆN ---
	import streamlit as st
	import pandas as pd
	import joblib
	import plotly.graph_objects as go
	from datetime import datetime

	# Project-local helper modules from the 'src' package. If either import
	# fails, show the problem in the UI and stop this script run.
	try:
	    from src import benchmark_utils
	    from src import diagnostic_plots as diag
	except ImportError:
	    missing_helpers_message = (
	        "Lỗi: Không tìm thấy file 'src/benchmark_utils.py' hoặc 'src/diagnostic_plots.py'. "
	        "Hãy đảm bảo chúng tồn tại trong thư mục 'src/'."
	    )
	    st.error(missing_helpers_message)
	    st.stop()

	# --- 2. PAGE CONFIGURATION ---
	# Set the page title and icon and switch the app to the wide layout.
	st.set_page_config(
	page_title="Saigon Temperature Forecast",
	page_icon="🌦️",
	layout="wide"
	)

	# --- 3. CÁC HÀM TẢI DỮ LIỆU & MÔ HÌNH (VỚI CACHING) ---
	# Mục 1 & 2 trong checklist: Tải mọi thứ nặng bằng cache

	@st.cache_data
	def load_feature_data(file_path="data/final_dataset_tree.csv"):
	    """Load the feature/target table and index it by date.

	    The CSV is read with pandas; its ``datetime`` column is parsed, set as
	    the index, and the rows are sorted by that index. The function is
	    wrapped in ``st.cache_data``.

	    Args:
	        file_path: Path of the CSV file to read.

	    Returns:
	        The date-indexed, sorted DataFrame. An empty DataFrame is returned
	        (after an error is shown in the UI) when the file does not exist or
	        has no ``datetime`` column.
	    """
	    # Local import: only needed to name the file in the error message below.
	    import os

	    # IMPORTANT: must match the name of the date column in the CSV file.
	    DATE_COLUMN = 'datetime'

	    # Keep the try body down to the one call that can hit a missing file.
	    try:
	        df = pd.read_csv(file_path)
	    except FileNotFoundError:
	        st.error(f"LỖI: Không tìm thấy file data chính tại: {file_path}")
	        return pd.DataFrame()

	    if DATE_COLUMN not in df.columns:
	        # Report the file that was actually read instead of a hard-coded
	        # name; for the default path the message text is unchanged.
	        source_name = os.path.basename(str(file_path))
	        st.error(f"Lỗi: Không tìm thấy cột ngày tháng '{DATE_COLUMN}' trong '{source_name}'. "
	                 f"Vui lòng cập nhật biến DATE_COLUMN trong 'app.py'.")
	        return pd.DataFrame()

	    df[DATE_COLUMN] = pd.to_datetime(df[DATE_COLUMN])
	    df = df.set_index(DATE_COLUMN)
	    return df.sort_index()

	@st.cache_resource
	def load_champion_models():
	    """Load the five per-day champion stacking models from 'models/'.

	    Returns:
	        A list holding the models for day 1 through day 5, in that order.
	        If a model file is missing, an error is shown in the UI and an
	        empty list is returned instead.
	    """
	    model_paths = [f"models/champion_stacking_day{day}.pkl" for day in range(1, 6)]
	    try:
	        return [joblib.load(path) for path in model_paths]
	    except FileNotFoundError as e:
	        st.error(f"LỖI: Không tìm thấy file mô hình. Đã kiểm tra: {e.filename}. "
	                 "Hãy đảm bảo 5 file .pkl nằm trong thư mục 'models/'.")
	        return []

	@st.cache_data
	def load_performance_data(file_path="data/final_5_day_results_df.csv"):
	    """Read the pre-computed performance results CSV.

	    Args:
	        file_path: Path of the CSV file to read.

	    Returns:
	        The CSV contents as a DataFrame, or an empty DataFrame (after an
	        error is shown in the UI) when the file does not exist.
	    """
	    try:
	        performance = pd.read_csv(file_path)
	    except FileNotFoundError:
	        st.error(f"LỖI: Không tìm thấy file hiệu suất tại: {file_path}")
	        performance = pd.DataFrame()
	    return performance

	# --- 4. DATA INITIALISATION & TEST-SET SPLIT ---

	# Load the feature table, the models and the performance results.
	all_data_df = load_feature_data()
	models = load_champion_models()
	perf_df = load_performance_data()

	# --- IMPORTANT: ADJUST TO YOUR DATA ---
	# Assumed names of the target (actual value) columns in the CSV file:
	# 't+1' through 't+5'.
	TARGET_COLS = [f't+{i}' for i in range(1, 6)]
	# Assumed name of the current-day temperature column (used for the history plot).
	CURRENT_TEMP_COL = 'temp'

	# Date range (inclusive label slice) used as the test set.
	TEST_START_DATE = "2024-02-20"
	TEST_END_DATE = "2025-09-26"

	# Empty defaults; the rest of the app checks `.empty` before using them.
	X_test, y_test, test_df = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

	if not all_data_df.empty:
	    try:
	        test_df = all_data_df.loc[TEST_START_DATE:TEST_END_DATE].copy()

	        # Every column that is not a target is treated as a model feature.
	        feature_cols = [col for col in test_df.columns if col not in TARGET_COLS]

	        # Build both halves in temporaries first. Assigning X_test before
	        # the target selection could leave X_test populated and y_test
	        # empty when a target column is missing, which lets the forecast
	        # section pass its guard and then fail on y_test.
	        features = test_df[feature_cols]
	        targets = test_df[TARGET_COLS]

	        # Friendlier column names for the targets: 'Day 1' ... 'Day 5'.
	        targets.columns = [f'Day {i}' for i in range(1, 6)]
	    except KeyError:
	        st.error(f"Lỗi: Không tìm thấy cột target (ví dụ: '{TARGET_COLS[0]}') hoặc cột "
	                 f"'{CURRENT_TEMP_COL}' trong file CSV. Vui lòng cập nhật 'app.py'.")
	    except Exception as e:
	        st.error(f"Lỗi khi xử lý test set: {e}")
	    else:
	        # Publish features and targets together, only after both succeeded.
	        X_test, y_test = features, targets
	else:
	    st.error("Không thể tải dữ liệu chính, ứng dụng không thể tiếp tục.")
	    st.stop()


	# --- 5. SIDEBAR (NAVIGATION) ---

	st.sidebar.title("Navigation")
	section_choices = (
	    "Project Overview & Methodology",
	    "Live 5-Day Forecast",
	    "Model Performance & Diagnostics",
	)
	app_section = st.sidebar.radio("Choose a section:", section_choices)

	# The date picker is only shown on the "Live 5-Day Forecast" section;
	# everywhere else selected_date stays None.
	selected_date = None
	if app_section == "Live 5-Day Forecast":
	    st.sidebar.header("Forecast Input")

	    if X_test.empty:
	        st.sidebar.error("Test data could not be loaded.")
	    else:
	        # Restrict the picker to the span covered by the test-set index.
	        min_date = X_test.index.min()
	        max_date = X_test.index.max()
	        selected_date = st.sidebar.date_input(
	            "Select a date from the test set:",
	            value=min_date,
	            min_value=min_date,
	            max_value=max_date,
	            format="YYYY-MM-DD",
	        )


	# --- 6. MAIN PANEL ---
	# Render the section selected in the sidebar.

	if app_section == "Project Overview & Methodology":
	    # Overview: project summary, modelling strategy and model leaderboard.
	    st.title("Saigon Temperature Forecasting Application 🌦️")

	    st.subheader("Project Summary")
	    st.markdown("""
	    Mục tiêu của dự án này là dự đoán nhiệt độ trung bình hàng ngày cho TP. Hồ Chí Minh trong 5 ngày tới.

	    * Dữ liệu: Dữ liệu thời tiết lịch sử 10 năm từ Visual Crossing.
	    * Mô hình: Chúng tôi sử dụng 5 mô hình 'chuyên gia' (specialist models) - mỗi mô hình được tối ưu để dự đoán một ngày cụ thể trong tương lai (T+1 đến T+5).
	    """)

	    st.subheader("Our 'Two-Stream' Strategy")
	    st.markdown("""
	    Để tối ưu hóa hiệu suất, chúng tôi đã áp dụng chiến lược "Hai luồng" (Two-Stream):
	    1. Luồng 1 (Linear Models): Các mô hình tuyến tính (như Linear Regression) được huấn luyện trên một bộ features đã được tinh gọn (sử dụng VIF) để tránh đa cộng tuyến.
	    2. Luồng 2 (Tree-based Models): Các mô hình phức tạp hơn (như Random Forest, Gradient Boosting) được huấn luyện trên một bộ features toàn diện (157 features) để nắm bắt các mối quan hệ phi tuyến.

	    Mô hình chiến thắng (Champion Model) của chúng tôi là một mô hình Stacking từ Luồng 2, cho thấy hiệu suất vượt trội.
	    """)

	    st.subheader("Final Model Leaderboard")
	    st.markdown("Bảng xếp hạng các mô hình dựa trên điểm RMSE trung bình (càng thấp càng tốt).")

	    # The leaderboard table is provided by the project's benchmark_utils module.
	    leaderboard_df = benchmark_utils.load_leaderboard()

	    if not leaderboard_df.empty:
	        # Show only the first ten rows of the leaderboard.
	        st.dataframe(leaderboard_df.head(10), use_container_width=True)
	    else:
	        st.warning("Không thể tải dữ liệu leaderboard.")

	# --------------------------------------------------------------------

	elif app_section == "Live 5-Day Forecast":
	    # Forecast for a single date picked from the test set.
	    st.title("Live 5-Day Forecast")

	    if selected_date and not X_test.empty and models:
	        st.header(f"Dự báo cho 5 ngày tới từ: {selected_date.strftime('%Y-%m-%d')}")

	        selected_date_ts = pd.Timestamp(selected_date)

	        # The date picker accepts any day between the first and last test
	        # date, but the index may not contain every one of them.
	        # `.loc[[label]]` raises KeyError for a missing label rather than
	        # returning an empty frame, so test membership explicitly.
	        if selected_date_ts not in X_test.index:
	            st.error("Không tìm thấy dữ liệu cho ngày đã chọn.")
	        else:
	            # 1. Input features: list indexer keeps the row as a DataFrame.
	            input_features = X_test.loc[[selected_date_ts]]

	            # 2. One prediction per model; models[k] forecasts day k+1.
	            predictions = [model.predict(input_features)[0] for model in models[:5]]

	            # 3. Show the predictions as metrics. The forecast dates are the
	            # five days following the selected date.
	            forecast_dates = pd.date_range(start=selected_date, periods=6, freq='D')[1:]
	            cols = st.columns(5)

	            # Actual values for the same date, shown next to each forecast.
	            actual_values = y_test.loc[selected_date_ts].values

	            for i in range(5):
	                with cols[i]:
	                    st.metric(
	                        label=f"Forecast for {forecast_dates[i].strftime('%b %d')}",
	                        value=f"{predictions[i]:.1f}°C",
	                        delta=f"Actual: {actual_values[i]:.1f}°C",
	                        delta_color="off"  # neutral colour: this is not a change indicator
	                    )

	            # 4. Chart: recent history followed by the forecast.
	            st.subheader("Historical Context & Forecast")

	            # History window: from 14 days before the selected date up to
	            # and including the selected date.
	            history_start = selected_date_ts - pd.Timedelta(days=14)
	            history_end = selected_date_ts

	            # Current-day temperature taken from the full (unsplit) dataset.
	            history_df = all_data_df.loc[history_start:history_end][CURRENT_TEMP_COL]

	            forecast_df = pd.DataFrame({
	                'Date': forecast_dates,
	                'Forecast': predictions
	            }).set_index('Date')

	            fig = go.Figure()

	            fig.add_trace(go.Scatter(
	                x=history_df.index, y=history_df,
	                mode='lines+markers', name='Past 14 Days (Actual)',
	                line=dict(color='blue')
	            ))
	            fig.add_trace(go.Scatter(
	                x=forecast_df.index, y=forecast_df['Forecast'],
	                mode='lines+markers', name='5-Day Forecast',
	                line=dict(color='red', dash='dot')
	            ))

	            fig.update_layout(
	                title="Forecast vs. Historical Context",
	                xaxis_title="Date", yaxis_title="Temperature (°C)",
	                template="plotly_white", legend=dict(x=0.01, y=0.99)
	            )
	            st.plotly_chart(fig, use_container_width=True)
	    else:
	        st.warning("Vui lòng đợi... Đang tải dữ liệu hoặc mô hình.")

	# --------------------------------------------------------------------

	elif app_section == "Model Performance & Diagnostics":
	    # Performance metrics and diagnostic plots over the whole test set.
	    st.title("Model Performance & Diagnostics")

	    # `models` is part of the guard because the prediction step below
	    # indexes into it; an empty model list would otherwise raise.
	    if not perf_df.empty and not y_test.empty and models:
	        st.subheader("Performance Degradation over 5 Days")
	        st.markdown("Hiệu suất mô hình thay đổi như thế nào khi dự báo xa hơn.")

	        # 1. Performance degradation charts (RMSE and R2).

	        # ADJUST: these must match the column names in
	        # 'final_5_day_results_df.csv'.
	        RMSE_COL_NAME = 'RMSE'
	        R2_COL_NAME = 'R2'

	        col1, col2 = st.columns(2)
	        with col1:
	            fig_rmse = diag.plot_performance_degradation(
	                perf_df,
	                metric_column=RMSE_COL_NAME,
	                metric_name='RMSE (Temperature °C)',
	                color='blue'
	            )
	            st.plotly_chart(fig_rmse, use_container_width=True)
	        with col2:
	            fig_r2 = diag.plot_performance_degradation(
	                perf_df,
	                metric_column=R2_COL_NAME,
	                metric_name='R-squared (R²)',
	                color='green'
	            )
	            st.plotly_chart(fig_r2, use_container_width=True)

	        # 2. Forecast vs. actual charts.
	        st.subheader("Forecast vs. Actual Comparison (on entire test set)")

	        # Predicting over the whole test set is the expensive step here,
	        # so the result is cached.
	        @st.cache_data
	        def get_full_test_predictions(_models, _X_test):
	            """Predict every test row with each of the five models.

	            Returns a DataFrame indexed like ``_X_test`` with one column
	            per model, named 'Day 1' through 'Day 5'.
	            """
	            all_preds = {
	                f'Day {day}': model.predict(_X_test)
	                for day, model in enumerate(_models[:5], start=1)
	            }
	            return pd.DataFrame(all_preds, index=_X_test.index)

	        with st.spinner("Running predictions on entire test set... (This is cached for next time)"):
	            y_pred_test = get_full_test_predictions(models, X_test)

	        col1, col2 = st.columns(2)
	        with col1:
	            fig_d1 = diag.plot_forecast_vs_actual(
	                y_true=y_test['Day 1'],
	                y_pred=y_pred_test['Day 1'],
	                day_ahead_title="Day 1 Forecast"
	            )
	            st.plotly_chart(fig_d1, use_container_width=True)
	        with col2:
	            fig_d5 = diag.plot_forecast_vs_actual(
	                y_true=y_test['Day 5'],
	                y_pred=y_pred_test['Day 5'],
	                day_ahead_title="Day 5 Forecast"
	            )
	            st.plotly_chart(fig_d5, use_container_width=True)

	        # 3. Optional deep dive: residual plots for the Day 1 forecast.
	        with st.expander("Champion Model Diagnostics (Deep Dive)"):
	            st.markdown("Phân tích chi tiết phần dư (lỗi = thực tế - dự báo) cho dự báo Day 1.")

	            y_true_d1 = y_test['Day 1']
	            y_pred_d1 = y_pred_test['Day 1']
	            dates_d1 = y_test.index

	            fig_res_time = diag.plot_residuals_vs_time(
	                y_true_d1, y_pred_d1, dates_d1, "Day 1"
	            )
	            st.plotly_chart(fig_res_time, use_container_width=True)

	            fig_res_dist = diag.plot_residuals_distribution(
	                y_true_d1, y_pred_d1, "Day 1"
	            )
	            st.plotly_chart(fig_res_dist, use_container_width=True)
	            st.markdown("Một mô hình tốt sẽ có phần dư (lỗi) phân phối chuẩn (hình chuông) "
	                        "quanh giá trị 0 và không có xu hướng (pattern) nào theo thời gian.")

	    else:
	        st.warning("Đang tải dữ liệu hiệu suất...")