import os

os.environ["STREAMLIT_HOME"] = "/tmp"
os.environ["MPLCONFIGDIR"] = "/tmp/matplotlib"
os.environ["STREAMLIT_METRICS_ENABLED"] = "0"

import streamlit as st

st.set_page_config(
    page_title="Multi-Utility Changepoint Detection",
    layout="wide",
    initial_sidebar_state="expanded",
)

import optuna
import pandas as pd
from datetime import datetime
import re
from rapidfuzz import process, fuzz
import altair as alt
import numpy as np
import matplotlib.pyplot as plt
import io
import requests
import json
from typing import List, Dict
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from data_utils import load_file as _load_file
from usage_utils import (
    analyze_and_fill_usage,
    fill_usage_with_sequence_check_strict_mean,
)
from rupture_utils import detect_changepoints
from cp_utils import (
    extract_changepoint_features,
    run_semi_supervised_cp_model,
    run_semi_supervised_cp_model_unified,
)
from sklearn.preprocessing import StandardScaler

# ===============================================================
# 🏢 Automatic building feature extraction
# ===============================================================
def train_fixed_model(
    df: pd.DataFrame,
    duration_months: int,
    n_trials: int,
    early_stopping_rounds: int = 50
):
    """
    Generic "fixed" mode (short-term & long-term):
      • Short-term: tune only the XGBoost hyperparameters
      • Long-term: additionally tune lag_steps, rolling_window, use_lag, use_rolling, use_zscore
    """
    # -- Work on a copy of the original data
    df = df.copy()
    y_full = df["Use"]
    X_base = df.drop(columns=["Use", "StartDate"])
    # Keep only numeric, boolean, or category dtypes that XGBoost can handle.
    # This step is important if the incoming df might have other types.
    X_base = X_base.select_dtypes(include=[np.number, "bool", "category"])
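    # Note: anything else (e.g. raw string/object ID columns) is silently dropped
    # by the select_dtypes call above, so encode such columns upstream if they
    # should feed the model.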
    # ==== 2️⃣ Optuna hyperparameter search ====
    def objective(trial):
        # 1) XGBoost hyperparameters
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 200),
            "max_depth": trial.suggest_int("max_depth", 3, 8),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
            "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0)
        }
        # 2) Extra feature-engineering parameters for the long-term fixed mode
        if duration_months >= 4:
            lag_steps = trial.suggest_int("lag_steps", 1, 12)
            rolling_window = trial.suggest_int("rolling_window", 1, 36)
            use_lag = trial.suggest_categorical("use_lag", [True, False])
            use_rolling = trial.suggest_categorical("use_rolling", [True, False])
            use_zscore = trial.suggest_categorical("use_zscore", [True, False])
        else:
            # Short-term mode: none of these
            lag_steps = rolling_window = 0
            use_lag = use_rolling = use_zscore = False
        # 3) Build the features for this trial
        X_trial = X_base.copy()
        if duration_months >= 4:  # same threshold as the parameter search above
            # Lag features
            if use_lag:
                for lag in range(1, lag_steps + 1):
                    X_trial[f"lag_{lag}"] = y_full.shift(lag)
            # Rolling mean / std
            if use_rolling:
                X_trial[f"roll_mean_{rolling_window}"] = y_full.rolling(rolling_window).mean()
                X_trial[f"roll_std_{rolling_window}"] = y_full.rolling(rolling_window).std()
            # z-score
            if use_zscore:
                X_trial["zscore"] = (y_full - y_full.mean()) / y_full.std()
        # Drop the NaNs introduced by shift/rolling
        X_trial = X_trial.dropna()
        y_trial = y_full.loc[X_trial.index]
        if len(X_trial) < 10:
            return np.inf  # lets Optuna effectively skip this trial
        n_splits = min(5, max(2, len(X_trial) // 10))
        # 4) Time-series cross-validation
        tscv = TimeSeriesSplit(n_splits=n_splits)
        errors = []
        for tr_idx, va_idx in tscv.split(X_trial):
            X_tr, X_va = X_trial.iloc[tr_idx], X_trial.iloc[va_idx]
            y_tr, y_va = y_trial.iloc[tr_idx], y_trial.iloc[va_idx]
            # early_stopping_rounds as a constructor arg requires xgboost >= 1.6;
            # on older versions, pass it to fit() instead
            model = xgb.XGBRegressor(**params, early_stopping_rounds=early_stopping_rounds)
            model.fit(
                X_tr, y_tr,
                eval_set=[(X_va, y_va)],
                verbose=False
            )
            preds = model.predict(X_va)
            rmse = np.sqrt(mean_squared_error(y_va, preds))
            errors.append(rmse)
        return np.mean(errors)

    study = optuna.create_study(direction="minimize")
    with st.spinner("🔍 Running Optuna…"):
        study.optimize(objective, n_trials=n_trials)
    st.success(f"🎯 Optuna finished – best RMSE = {study.best_value:.4f}")
    # ==== 3️⃣ Rebuild features from the best trial & train on the full data ====
    best_params = study.best_params
    # Split the XGBoost hyperparameters from the feature-engineering parameters
    xgb_keys = ["n_estimators", "max_depth", "learning_rate", "subsample", "colsample_bytree", "reg_alpha", "reg_lambda"]
    xgb_best_params = {k: best_params[k] for k in xgb_keys}
    # Feature-engineering parameters
    lag_steps = best_params.get("lag_steps", 0)
    rolling_window = best_params.get("rolling_window", 0)
    use_lag = best_params.get("use_lag", False)
    use_rolling = best_params.get("use_rolling", False)
    use_zscore = best_params.get("use_zscore", False)
    # Rebuild the full training set
    X_final = X_base.copy()
    if duration_months >= 4:  # same threshold as in the objective
        if use_lag:
            for lag in range(1, lag_steps + 1):
                X_final[f"lag_{lag}"] = y_full.shift(lag)
        if use_rolling:
            X_final[f"roll_mean_{rolling_window}"] = y_full.rolling(rolling_window).mean()
            X_final[f"roll_std_{rolling_window}"] = y_full.rolling(rolling_window).std()
        if use_zscore:
            # Mirror the trial features so the final model sees the tuned configuration
            X_final["zscore"] = (y_full - y_full.mean()) / y_full.std()
    X_final = X_final.dropna()
    y_final = y_full.loc[X_final.index]
    # Train on the full data
    best_model = xgb.XGBRegressor(**xgb_best_params)
    # Make sure the model keeps the feature names
    best_model.fit(X_final, y_final)
    # For older xgboost versions, set feature_names_in_ manually
    if not hasattr(best_model, 'feature_names_in_'):
        best_model.feature_names_in_ = X_final.columns.tolist()
    # ==== 4️⃣ XGBoost feature importance & 5️⃣ SHAP explanations ====
    st.subheader("📈 XGBoost Feature Importance (gain)")
    fig1, ax1 = plt.subplots(figsize=(8, 6))
    xgb.plot_importance(best_model, importance_type="gain", ax=ax1)
    fig1.tight_layout()
    st.pyplot(fig1)

    import shap  # imported lazily: shap is heavy and only needed here
    st.subheader("🧐 SHAP Global & Local Explanations")
    explainer = shap.Explainer(best_model, X_final, feature_perturbation="interventional")
    shap_values = explainer(X_final, check_additivity=False)
    # -- Beeswarm plot --
    fig2 = plt.figure(figsize=(8, 6))
    shap.plots.beeswarm(shap_values, max_display=20, show=False)
    plt.tight_layout()
    st.pyplot(fig2)
    # -- Waterfall plot --
    st.caption("Example SHAP Waterfall (first sample)")
    fig3 = plt.figure(figsize=(8, 6))
    shap.plots.waterfall(shap_values[0], show=False)
    plt.tight_layout()
    st.pyplot(fig3)
    return best_model, study
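# A minimal usage sketch for train_fixed_model (hypothetical: "df_monthly" and
# "X_new" are assumed frames with "Use", "StartDate", and numeric feature columns
# already prepared upstream):
#
#     best_model, study = train_fixed_model(df_monthly, duration_months=12, n_trials=30)
#     print(study.best_params)           # tuned XGBoost + feature-engineering params
#     y_hat = best_model.predict(X_new)  # X_new must carry the same engineered columns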
def kde_or_normal_sample(df_weather: pd.DataFrame, target_month: int, weather_var: str, window: int = 1) -> float:
    """
    Sample a historical weather variable, choosing the strategy by sample size:
    - < 20 samples: use the mean of all history
    - 20-50 samples: normal perturbation around the monthly mean
    - 50-100 samples: mix the current-month KDE with a KDE over neighbouring months
    - > 100 samples: sample directly from the current-month KDE

    Parameters
    ----------
    df_weather : historical weather table containing ['StartDate', weather_var]
    target_month : month to forecast (1-12)
    weather_var : name of the weather feature column to sample
    window : sliding window over months (± window months)

    Returns
    -------
    float : the sampled weather feature value
    """
    from scipy.stats import gaussian_kde
    # Make sure StartDate is already datetime
    df = df_weather.copy()
    df['StartDate'] = pd.to_datetime(df['StartDate'])
    df['month'] = df['StartDate'].dt.month
    # Data for the target month
    current_month_vals = df.loc[df['month'] == target_month, weather_var].dropna()
    # Data for the neighbouring months (± window)
    neighbor_months = []
    for offset in range(-window, window + 1):
        if offset != 0:  # exclude the target month itself
            m = (target_month + offset - 1) % 12 + 1
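            # e.g. target_month=1, offset=-1 wraps around to m=12, so January's
            # neighbour list includes December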
            neighbor_months.append(m)
    neighbor_vals = df.loc[df['month'].isin(neighbor_months), weather_var].dropna() if neighbor_months else pd.Series(dtype=float)
    # All relevant months (target month plus neighbours)
    all_vals = df.loc[df['month'].isin([target_month] + neighbor_months), weather_var].dropna()
    # All historical data, regardless of month
    all_history_vals = df[weather_var].dropna()
    # Pick the strategy by sample size
    sample_size = len(current_month_vals)
    if sample_size == 0:
        # No data for the target month: fall back to the neighbouring months
        if len(neighbor_vals) > 0:
            return float(neighbor_vals.mean())
        else:
            # No neighbour data either: use the mean of all history
            return float(all_history_vals.mean()) if len(all_history_vals) > 0 else np.nan
    elif sample_size < 20:
        # < 20 samples: use the mean of all history
        return float(all_history_vals.mean())
    elif 20 <= sample_size < 50:
        # 20-50 samples: normal perturbation
        mu = current_month_vals.mean()
        sigma = current_month_vals.std(ddof=1)
        if sigma == 0:
            # Zero spread within the month: borrow the std of all history
            sigma = all_history_vals.std(ddof=1)
            if sigma == 0:
                return float(mu)
        return float(np.random.normal(mu, sigma))
    elif 50 <= sample_size < 100:
        # 50-100 samples: mixed KDE strategy
        try:
            # KDE over the target month
            current_kde = gaussian_kde(current_month_vals)
            # If the neighbouring months have enough data, build their KDE too
            if len(neighbor_vals) >= 20:
                neighbor_kde = gaussian_kde(neighbor_vals)
                # Mixture sampling: 70% from the current-month KDE, 30% from the neighbour KDE
                if np.random.random() < 0.7:
                    return float(current_kde.resample(1)[0][0])
                else:
                    return float(neighbor_kde.resample(1)[0][0])
            else:
                # Not enough neighbour data: use the current-month KDE only
                return float(current_kde.resample(1)[0][0])
        except Exception:
            # KDE failed: fall back to a normal distribution
            mu = current_month_vals.mean()
            sigma = current_month_vals.std(ddof=1)
            if sigma == 0:
                return float(mu)
            return float(np.random.normal(mu, sigma))
    else:  # sample_size >= 100
        # >= 100 samples: KDE directly
        try:
            kde = gaussian_kde(current_month_vals)
            return float(kde.resample(1)[0][0])
        except Exception:
            # KDE failed (should not happen in theory): fall back to a normal distribution
            mu = current_month_vals.mean()
            sigma = current_month_vals.std(ddof=1)
            if sigma == 0:
                return float(mu)
            return float(np.random.normal(mu, sigma))
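# Example with hypothetical data: sampling a July temperature from three years
# of monthly history. With only 3 July observations (< 20) the function falls
# back to the all-history mean, per the strategy table in the docstring.
#
#     hist = pd.DataFrame({
#         "StartDate": pd.date_range("2020-01-01", periods=36, freq="MS"),
#         "temp_mean": np.random.normal(15, 8, 36),
#     })
#     t = kde_or_normal_sample(hist, target_month=7, weather_var="temp_mean", window=1)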
def recursive_forecast_with_weather_sampling(
    model: xgb.XGBRegressor,
    last_known_df: pd.DataFrame,
    forecast_horizon: int,
    best_params: dict,
    weather_history: pd.DataFrame = None,
    weather_features: List[str] = None,
    weather_windows: Dict[str, int] = None,
    enable_weather_sampling: bool = True
) -> pd.DataFrame:
    """
    Enhanced recursive forecast: optionally draws weather features via KDE/normal sampling.

    Parameters
    ----------
    weather_history : DataFrame with all historical weather features; must contain a StartDate column
    weather_features : list of weather features to sample stochastically
    weather_windows : per-feature sliding-window config, e.g. {'temp_mean': 2, 'humidity_mean': 1}
    enable_weather_sampling : whether weather sampling is enabled
    """
    lag_steps = best_params.get("lag_steps", 0)
    rolling_window = best_params.get("rolling_window", 0)
    use_lag = best_params.get("use_lag", False)
    use_rolling = best_params.get("use_rolling", False)
    use_zscore = best_params.get("use_zscore", False)
    # Work on a copy of the history
    hist_df = last_known_df.copy().reset_index(drop=True)
    preds = []
    # Default-argument handling
    if weather_features is None:
        weather_features = []
    if weather_windows is None:
        weather_windows = {}
    if weather_history is None:
        enable_weather_sampling = False
    for _ in range(forecast_horizon):
        next_date = hist_df["StartDate"].max() + pd.DateOffset(months=1)
        # Build the base new row
        new_row = {
            "StartDate": next_date,
            "time_index": hist_df["time_index"].max() + 1,
            "month_sin": np.sin(2 * np.pi * next_date.month / 12),
            "month_cos": np.cos(2 * np.pi * next_date.month / 12),
        }
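        # month_sin / month_cos above are the standard cyclical encoding of the
        # calendar month, keeping December (12) and January (1) adjacent in
        # feature space.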
        # Lag features
        if use_lag and lag_steps > 0:
            for lag in range(1, lag_steps + 1):
                col = f"lag_{lag}"
                if col in hist_df.columns:
                    new_row[col] = hist_df["Use"].iloc[-lag]
                else:
                    new_row[col] = np.nan
        # Rolling features
        if use_rolling and rolling_window > 0:
            roll = hist_df["Use"].rolling(rolling_window)
            new_row[f"roll_mean_{rolling_window}"] = roll.mean().iloc[-1]
            new_row[f"roll_std_{rolling_window}"] = roll.std().iloc[-1]
        # z-score feature
        if use_zscore:
            mean = hist_df["Use"].mean()
            std = hist_df["Use"].std(ddof=0)
            new_row["zscore"] = (hist_df["Use"].iloc[-1] - mean) / std if std > 0 else 0.0
        # ---- Fill in the static / weather feature columns ----
        feature_cols_all = [col for col in hist_df.columns if col not in ["Use", "StartDate", "BuildingName"]]
        for col in feature_cols_all:
            if col not in new_row:
                # Is this a weather feature that should be sampled?
                if enable_weather_sampling and col in weather_features and col in weather_history.columns:
                    # KDE / normal sampling
                    window_size = weather_windows.get(col, 1)  # default window = 1
                    new_row[col] = kde_or_normal_sample(
                        df_weather=weather_history,
                        target_month=next_date.month,
                        weather_var=col,
                        window=window_size
                    )
                else:
                    # Static features (or features we don't sample) carry the latest value forward
                    new_row[col] = hist_df[col].iloc[-1]
        # Turn the new row into a DataFrame
        new_df = pd.DataFrame([new_row])
        # Make the column order match the model, using the features seen at training time
        # (taken from the model's feature_names_in_ attribute)
        if hasattr(model, 'feature_names_in_'):
            feature_cols = model.feature_names_in_
        else:
            # No feature_names_in_ on the model: infer from the history plus the newly built features
            base_feature_cols = [col for col in hist_df.columns if col not in ["Use", "StartDate", "BuildingName"]]
            lag_cols = [col for col in new_row.keys() if col.startswith('lag_')]
            roll_cols = [col for col in new_row.keys() if col.startswith('roll_')]
            other_cols = ['zscore'] if 'zscore' in new_row else []
            feature_cols = list(set(base_feature_cols + lag_cols + roll_cols + other_cols))
        # Make sure new_df carries every required feature column
        for col in feature_cols:
            if col not in new_df.columns:
                if col not in new_row:
                    # Missing feature: take a default or the latest historical value
                    if col in hist_df.columns:
                        new_row[col] = hist_df[col].iloc[-1]
                    else:
                        # Feature entirely absent (perhaps only created under some configurations): default to 0
                        new_row[col] = 0
        # Re-create the DataFrame so it includes every feature
        new_df = pd.DataFrame([new_row])
        X_pred = new_df[feature_cols]
        # Predict
        y_hat = model.predict(X_pred)[0]
        new_df["Use"] = y_hat
        # Append back onto the history so the next step can roll forward
        hist_df = pd.concat([hist_df, new_df], ignore_index=True)
        preds.append((next_date, y_hat))
    return pd.DataFrame(preds, columns=["Date", "PredictedUse"])
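# A hypothetical end-to-end sketch tying the helpers together (names such as
# "df_monthly", "time_index", and "temp_mean" are assumptions about the prepared frame):
#
#     future = recursive_forecast_with_weather_sampling(
#         model=best_model,
#         last_known_df=df_monthly,       # must contain Use, StartDate, time_index, ...
#         forecast_horizon=6,             # predict the next 6 months
#         best_params=study.best_params,
#         weather_history=df_monthly,     # frame holding StartDate + weather columns
#         weather_features=["temp_mean"],
#         weather_windows={"temp_mean": 2},
#     )
#     # -> DataFrame with columns ["Date", "PredictedUse"]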
def chat_with_ollama(messages: List[Dict[str, str]], model: str = "mistral") -> str:
    """Interact with an Ollama model."""
    try:
        url = "http://localhost:11434/api/chat"
        # timeout added so a hung local server cannot block the app indefinitely
        res = requests.post(url, json={"model": model, "messages": messages, "stream": False}, timeout=120)
        response_text = res.json()["message"]["content"]
        # Attempt to extract JSON from the response text.
        # Ensures the first char after { is not whitespace.
        match = re.search(r'\{\s*\S[\s\S]*\}', response_text)
        if match:
            return match.group(0)  # return only the JSON part
        else:
            # If no JSON object is found, return the original text for debugging
            return response_text  # type: ignore
    except Exception as e:
        return f"Error connecting to Ollama or processing response: {e}"
def load_file(file):
    if file is None:
        return None
    return _load_file(file)
cp_table_ph = st.empty()
cred_stats_ph = st.empty()
# ------------------------------------------------------------
# 1️⃣ File upload
# ------------------------------------------------------------
st.sidebar.header("1️⃣ Upload Combined Data")
usage_file = st.sidebar.file_uploader("Upload usage-data-with-features (CSV / XLSX)", ["csv", "xlsx"])
if not usage_file:
    st.sidebar.info("Please upload the combined usage data file")
    st.stop()

# Load the single combined file (usage + building static features)
usage_df = load_file(usage_file)
if usage_df is None:
    st.sidebar.error("❌ Failed to load")
    st.stop()
st.write(
    "Debug: Columns in usage_df immediately after load_file:",
    usage_df.columns.tolist()
)
# -- Rename the columns right away to the snake_case names used in the prompt
usage_df = usage_df.rename(columns={
    "tempCmean": "temp_mean",
    "tempCstd": "temp_std",
    "HDDsum": "HDD_sum",
    "CDDsum": "CDD_sum",
    "dewpointdeficitmean": "dewpoint_deficit_mean",
    "tempminCmin": "temp_min_month",
    "tempmaxCmax": "temp_max_month",
    "pressuremean": "pressure_mean",
    "pressuremax": "pressure_max",
    "pressuremin": "pressure_min",
    "humiditymean": "humidity_mean",
    "humiditystd": "humidity_std",
    "windspeedmean": "wind_speed_mean",
    "windspeedmax": "wind_speed_max",
    "windgustmax": "wind_gust_max",
    "cloudsallmean": "clouds_all_mean",
    "visibilitymean": "visibility_mean",
    "precipmmsum": "precip_mm_sum",
    "raineventsum": "rain_event_sum",
    "snowmmsum": "snow_mm_sum",
    "snoweventsum": "snow_event_sum",
    "cfloorcount": "c_floor_count",
})
st.write(
    "Debug: Columns in usage_df after renaming:",
    usage_df.columns.tolist()
)
# Stash in session state so the df_main picked up later has the right column names
st.session_state["df_merged_with_features"] = usage_df
# Standalone building info was removed; keep a placeholder to avoid NameError until all code is cleaned up
binfo_df = None
# ✨ Global column names (used throughout the script)
utility_col = "CommodityCode"
building_col = "BuildingName"
# ------------------------------------------------------------
# 📋 Missing-data analysis parameters
# ------------------------------------------------------------
st.sidebar.header("📋 Missing analysis parameters")
gap_threshold = st.sidebar.number_input("sequence missing threshold (days)", 1, 180, 62)
fill_earliest_cutoff_dt = st.sidebar.date_input("earliest fill start date", datetime(2013, 1, 1))
min_fill_gap_months = st.sidebar.number_input("minimum fill gap months", 1, 36, 9)
sequence_fill_method = st.sidebar.selectbox("fill method", ["mean", "median"], 0)
post_missing_threshold = st.sidebar.slider("allowable missing rate", 0.0, 1.0, 0.1)

def _run_missing(df):
    return analyze_and_fill_usage(
        df,
        gap_threshold=gap_threshold,
        fill_earliest_cutoff=fill_earliest_cutoff_dt.strftime("%Y-%m-%d"),
        min_fill_gap_months=min_fill_gap_months,
        sequence_fill_method=sequence_fill_method,
        post_missing_threshold=post_missing_threshold,
    )

usage_summary_df = _run_missing(usage_df)
# ------------------------------------------------------------
# 2️⃣-6️⃣ Sidebar: utility & building selection
# ------------------------------------------------------------
valid_summary = usage_summary_df[usage_summary_df["NotGonnaUse"] == 0]
utilities = valid_summary[utility_col].dropna().unique().tolist()
selected_utility = st.sidebar.selectbox("2️⃣ select utility type", utilities)
valid_blds = (
    valid_summary[valid_summary[utility_col] == selected_utility][building_col]
    .unique()
    .tolist()
)
filtered_usage = usage_df[usage_df[utility_col] == selected_utility]
time_col = st.sidebar.selectbox(
    "3️⃣ select time column",
    filtered_usage.columns.tolist(),
    index=filtered_usage.columns.get_loc("StartDate"),
)
value_col = st.sidebar.selectbox(
    "4️⃣ utility usage column",
    filtered_usage.columns.tolist(),
    index=filtered_usage.columns.get_loc("Use"),
)
# ---- Fuzzy building search & recommendations ---------------
def _build_index(names):
    idx = {}
    for n in names:
        low = n.lower()
        idx[low] = n
        idx[re.sub(r"[^a-z0-9]", " ", low)] = n
        m = re.search(r"\((.*?)\)", n)
        if m:
            idx[m.group(1).lower()] = n
    return idx

def recommend(df, query: str, top_n: int = 5, cutoff: int = 40):
    if not query:
        return []
    names = [n for n in valid_blds if pd.notna(n)]
    idx_map = _build_index(names)
    matches = process.extract(
        query.lower(), list(idx_map.keys()), scorer=fuzz.WRatio, limit=top_n
    )
    return [idx_map[k] for k, score, _ in matches if score >= cutoff]
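# Example (hypothetical names): with valid_blds = ["Science Hall (SCH)", "West Gym"],
# recommend(filtered_usage, "sch") matches the parenthesised alias built by
# _build_index and returns ["Science Hall (SCH)"]. Note that recommend() reads
# valid_blds from module scope; its df argument is currently unused.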
query = st.sidebar.text_input("5️⃣ Enter building keywords")
cands = recommend(filtered_usage, query)
if query and not cands:
    st.sidebar.warning("No matching building found, please modify the keywords")
selected_building = st.sidebar.selectbox("6️⃣ Select building", cands) if cands else None
# ------------------------------------------------------------
# 7️⃣ Changepoint parameters
# ------------------------------------------------------------
st.sidebar.header("7️⃣ Changepoint parameters")
algo = st.sidebar.selectbox("Algorithm", ["pelt", "window"], 0)
model = st.sidebar.selectbox(
    "Model",
    {"pelt": ["rbf", "l2", "linear", "normal"], "window": ["rbf", "l2", "normal"]}[algo],
)
pen = (
    st.sidebar.slider("Penalty (Pelt)", 0.01, 50.0, 1.0, 0.01) if algo == "pelt" else None
)
# ------------------------------------------------------------
# 8️⃣ Credibility parameters
# ------------------------------------------------------------
st.sidebar.header("8️⃣ Model parameters")
window_size = st.sidebar.slider("Window size", 3, 24, 6)
mean_win = st.sidebar.slider("Mean window", 3, 24, 6)
slope_th = st.sidebar.number_input("slope_thresh", 0.01, 1.0, 0.1, 0.01)
p_thresh = st.sidebar.number_input("p_thresh", 0.0, 1.0, 0.05, 0.01)
# 🔧 New: classification-strategy selection
classification_strategy = st.sidebar.selectbox(
    "🎯 Classification strategy",
    [
        "Balanced score (recommended)",
        "Strict Threshold",
        "Loose Threshold",
        "Very Loose Threshold",
        "Force Noise Detection",
        "Ranking Based",
        "Adaptive Threshold"
    ],
    index=0
)
# 🔧 New: machine-learning model selection
ml_model_type = st.sidebar.selectbox(
    "🤖 Machine Learning Model",
    ["XGBoost", "CatBoost"],
    index=0,
    help="Choose the base model for semi-supervised learning"
)
# 🔧 New: option to force-generate Noise samples
force_noise_samples = st.sidebar.checkbox(
    "🔧 Force generation of Noise samples (for testing)",
    value=False,
    help="Ensure that at least 30% of the changepoints are classified as Noise to test the classification"
)
# 💡 Strategy selection guide
with st.sidebar.expander("💡 Strategy selection guide"):
    st.write("**Select the strategy based on your needs:**")
    st.write("🔴 **No Noise detected** → Try:")
    st.write(" • Loose Threshold (20-50% Noise)")
    st.write(" • Very Loose Threshold (30% Noise)")
    st.write(" • Force Noise Detection (35-45% Noise)")
    st.write("")
    st.write("🟡 **Too much Noise** → Try:")
    st.write(" • Strict Threshold (5% Noise)")
    st.write(" • Adaptive Threshold")
    st.write("")
    st.write("🟢 **Balanced detection** → Recommended:")
    st.write(" • Balanced score (40-55% Noise)")
    st.write(" • Ranking Based (25-35% Noise)")
k_best = st.sidebar.slider("Semi-supervised k_best", 1, 10, 5)
max_depth = st.sidebar.slider("Max_depth", 2, 10, 3)
learning_rt = st.sidebar.number_input("Learning_rate", 0.01, 1.0, 0.1, 0.01)
n_estimators = st.sidebar.slider("n_estimators", 50, 500, 200, 10)
# 🔄 Global reset
st.sidebar.markdown("---")
st.sidebar.header("🔄 Reset Options")
if st.sidebar.button("🔄 Reset all analysis", key="reset_all_analysis"):
    # Clear every analysis-related entry from session_state
    keys_to_reset = [
        "credibility_analysis_done",
        "credibility_results",
        "final_results",
        "retrained_results",
        "manual_selections",
        "feat_df",
        "cp_df",
        "filled",
        "base_ln",
        "start_energy_prediction",
        "prediction_config",
        "ai_building_analysis",
        "auto_gross_area",
        "auto_space_sqft",
        "auto_workpoint_count",
        "auto_floor_count"
    ]
    for key in keys_to_reset:
        if key in st.session_state:
            del st.session_state[key]
    st.sidebar.success("✅ All analysis data has been reset!")
    st.rerun()
# ------------------------------------------------------------
# 9️⃣ Main panel
# ------------------------------------------------------------
st.title("📊 Multi-Utility Changepoint Detection Platform")
plot_cp = st.empty()     # Original CP plot
plot_semi = st.empty()   # Semi-supervised CP plot
plot_final = st.empty()  # ③ After manual correction
# -- If session_state already holds cp_df, draw it first
if "cp_df" in st.session_state:
    base_line = (
        alt.Chart(st.session_state["cp_df"])
        .mark_line()
        .encode(x="timestamp:T", y="value:Q")
    )
    tri = (
        alt.Chart(st.session_state["cp_df"][st.session_state["cp_df"]["changepoint"] == 1])
        .mark_point(shape="triangle", size=90, color="orange", filled=True)
        .encode(x="timestamp:T", y="value:Q")
    )
    plot_cp.altair_chart(base_line + tri, use_container_width=True)
# ------------------------------------------------------------
# ▶️ Run-button logic
# ------------------------------------------------------------
if selected_building:
    st.markdown(f"**Building**: {selected_building} **Utility**: {selected_utility}")
    # Write once, then treat as read-only
    if "selected_building" not in st.session_state:
        st.session_state["selected_building"] = selected_building

# ---- Run changepoint detection ----------------------------
if st.sidebar.button("🚀 Run changepoint detection"):
    # 🔧 Persist the selected building and utility in session_state
    st.session_state["selected_building"] = selected_building
    st.session_state["selected_utility"] = selected_utility

    def _run_strict_fill(df, summary, method):
        # 🔧 Changed: force=True removed; use the proper preprocessing strategy
        return fill_usage_with_sequence_check_strict_mean(
            df.copy(),  # operate on a copy so the original df_merged_with_features is untouched
            summary,
            method=method,
            force=False,  # no force mode: follow the proper FillStartDate logic
            fill_earliest_cutoff=fill_earliest_cutoff_dt.strftime("%Y-%m-%d"),
        )

    filled_minimal = _run_strict_fill(usage_df, usage_summary_df, sequence_fill_method)
| filled_minimal = _run_strict_fill(usage_df, usage_summary_df, sequence_fill_method) | |
| # --- BEGIN MODIFICATION: Merge back holidaycount and other features --- | |
| if not filled_minimal.empty and "df_merged_with_features" in st.session_state: | |
| df_with_all_features = st.session_state["df_merged_with_features"].copy() # Use a copy | |
| # Define columns to keep from df_with_all_features (add others if needed) | |
| # Ensure 'Date' in filled_minimal and 'StartDate' in df_with_all_features are compatible | |
| # SOURCE DATE COLUMN from df_with_all_features IS 'StartDate' | |
| source_date_col_in_all_features = 'StartDate' | |
| if source_date_col_in_all_features not in df_with_all_features.columns: | |
| st.error(f"Critical Error: Expected date column '{source_date_col_in_all_features}' not found in df_merged_with_features. Halting merge.") | |
| filled = filled_minimal.copy() | |
| else: | |
| df_with_all_features[source_date_col_in_all_features] = pd.to_datetime(df_with_all_features[source_date_col_in_all_features]) | |
| filled_minimal['Date'] = pd.to_datetime(filled_minimal['Date']) # This should already be 'Date' | |
| # Rename StartDate to Date in the (copy of) df_with_all_features FOR THE MERGE ONLY | |
| # This makes the merge key 'Date' consistent for both DFs | |
| df_to_merge_from = df_with_all_features.rename(columns={source_date_col_in_all_features: 'Date'}) | |
| columns_to_select_for_merge = ['BuildingName', 'CommodityCode', 'Date', 'holidaycount'] | |
| # Add other weather/static features from df_merged_with_features if you need them in feat_df | |
| # Example: 'temp_mean', 'HDD_sum', 'BuildingClassification', etc. | |
| # Remember these are column names from the ORIGINAL df_with_all_features / usage_df | |
| # Select only existing columns from df_to_merge_from to avoid KeyErrors | |
| # (after renaming StartDate to Date for the purpose of this selection list) | |
| temp_selection_list = ['BuildingName', 'CommodityCode', 'Date'] # Keys are certain | |
| if 'holidaycount' in df_to_merge_from.columns: # Check by original name if it was in usage_df | |
| temp_selection_list.append('holidaycount') | |
| # Add other features similarly, checking their original names in df_with_all_features | |
| # e.g., if 'temp_mean' in df_with_all_features.columns: temp_selection_list.append('temp_mean') | |
| existing_columns_for_selection_from_df_to_merge = [col for col in temp_selection_list if col in df_to_merge_from.columns] | |
| if not all(item in existing_columns_for_selection_from_df_to_merge for item in ['BuildingName', 'CommodityCode', 'Date']): | |
| st.error("Critical Error: Key merging columns (BuildingName, CommodityCode, Date) not found for merging. Halting merge.") | |
| filled = filled_minimal.copy() | |
| else: | |
| filled = pd.merge( | |
| filled_minimal, | |
| df_to_merge_from[existing_columns_for_selection_from_df_to_merge].drop_duplicates(), | |
| on=['BuildingName', 'CommodityCode', 'Date'], | |
| how='left' | |
| ) | |
| st.write("Debug: Columns in `filled` after merging back features:", filled.columns.tolist()) | |
| else: | |
| st.warning("Debug: filled_minimal is empty or df_merged_with_features not in session state. Skipping feature merge.") | |
| filled = filled_minimal.copy() | |
| # --- END MODIFICATION --- | |
    # 🔧 Fetch the FillStartDate
    fsd_row = usage_summary_df.loc[
        (usage_summary_df[building_col] == selected_building)
        & (usage_summary_df[utility_col] == selected_utility)
    ]
    if fsd_row.empty:
        st.error(f"❌ No data found for {selected_building} - {selected_utility}")
        st.stop()
    fsd = fsd_row["FillStartDate"].values[0]
    not_gonna_use = fsd_row["NotGonnaUse"].values[0]
    # Check that FillStartDate is valid
    if pd.isna(fsd):
        st.error(f"❌ No valid FillStartDate for {selected_building} - {selected_utility}")
        st.info("💡 This may indicate that the building has insufficient data after applying the preprocessing strategy.")
        st.stop()
    if not_gonna_use == 1:
        st.warning(f"⚠️ {selected_building} - {selected_utility} is marked as 'NotGonnaUse' due to high missing rate")
        st.info(f"💡 Missing rate exceeds the allowable threshold of {post_missing_threshold:.1%}")
        st.stop()
    fsd = pd.to_datetime(fsd)
    # Report which preprocessing strategy applied
    cutoff_date = pd.to_datetime(fill_earliest_cutoff_dt)
    if fsd > cutoff_date:
        st.info(f"📅 **Preprocessing Strategy Applied**: Data starts from {fsd.strftime('%Y-%m-%d')} (after sequence missing gap)")
    else:
        st.info(f"📅 **Preprocessing Strategy Applied**: Data starts from {fsd.strftime('%Y-%m-%d')} (no long sequence missing after 2013)")
    seq_df = filled[
        (filled[building_col] == selected_building)
        & (filled[utility_col] == selected_utility)
    ]
    seq_df = seq_df[seq_df["Date"] >= fsd]
    # Check that any data remains
    if seq_df.empty:
        st.error(f"❌ No data available for {selected_building} - {selected_utility} after applying preprocessing strategy")
        st.info(f"💡 The FillStartDate ({fsd.strftime('%Y-%m-%d')}) may be beyond the available data range")
        st.stop()
    pre_df = (
        seq_df.rename(columns={"Date": "timestamp", "FilledUse": "value"})[
            ["timestamp", "value"]
        ]
        .sort_values("timestamp")
        .reset_index(drop=True)
    )
    pre_df["timestamp"] = pd.to_datetime(pre_df["timestamp"])
    # Data-quality check
    total_points = len(pre_df)
    valid_points = pre_df["value"].notna().sum()
    missing_ratio = (total_points - valid_points) / total_points if total_points > 0 else 0
    st.write(f"📏 **Data Quality Summary**: {total_points} months, {valid_points} valid points, {missing_ratio:.1%} missing")
    if valid_points == 0:
        st.error("❌ All data points are missing after preprocessing")
        st.stop()
    if missing_ratio > 0.5:
        st.warning(f"⚠️ High missing ratio ({missing_ratio:.1%}) in the processed sequence")
    def _run_cp(df, algo_, model_, pen_):
        return detect_changepoints(df, algo=algo_, model=model_, pen=pen_)

    cp_df = _run_cp(pre_df, algo, model, pen)
    # -- Persist in session_state
    st.session_state["cp_df"] = cp_df
    st.session_state["filled"] = filled
    st.session_state["base_ln"] = alt.Chart(cp_df).mark_line().encode(
        x="timestamp:T", y="value:Q"
    )
    # -- First chart
    pts = (
        alt.Chart(cp_df[cp_df["changepoint"] == 1])
        .mark_point(shape="triangle", size=100, color="red", filled=True)
        .encode(x="timestamp:T", y="value:Q")
    )
    plot_cp.altair_chart(st.session_state["base_ln"] + pts, use_container_width=True)
    st.success("✅ Changepoint detection completed")
    st.dataframe(cp_df[cp_df["changepoint"] == 1])
# ---- Evaluate changepoint credibility ----------------------
if st.sidebar.button("🔄 Evaluate changepoint credibility (SelfLearning Classifier applied)"):
    st.session_state["credibility_analysis_done"] = True

# Has the credibility analysis been completed?
if st.session_state.get("credibility_analysis_done", False):
    if "cp_df" not in st.session_state:
        st.warning("Please run changepoint detection first")
        st.session_state["credibility_analysis_done"] = False
        st.stop()
    # Make sure the first chart stays visible
    if "base_ln" in st.session_state:
        pts = (
            alt.Chart(st.session_state["cp_df"][st.session_state["cp_df"]["changepoint"] == 1])
            .mark_point(shape="triangle", size=100, color="red", filled=True)
            .encode(x="timestamp:T", y="value:Q")
        )
        plot_cp.altair_chart(st.session_state["base_ln"] + pts, use_container_width=True)
    # If the credibility analysis has not run yet, do it now
    if "credibility_results" not in st.session_state:
        original_changepoints = st.session_state["cp_df"][st.session_state["cp_df"]["changepoint"] == 1].copy()
        base_changepoints = []
        for _, row in original_changepoints.iterrows():
            timestamp = row["timestamp"]
            value = row["value"]
            if force_noise_samples:
                changepoint_type = np.random.choice(['strong', 'medium', 'weak'], p=[0.15, 0.25, 0.6])
            else:
                changepoint_type = np.random.choice(['strong', 'medium', 'weak'], p=[0.3, 0.4, 0.3])
            if changepoint_type == 'strong':
                slope = np.random.uniform(0.15, 0.3)
                adf_p_value = np.random.uniform(0.01, 0.03)
            elif changepoint_type == 'medium':
                slope = np.random.uniform(0.08, 0.15)
                adf_p_value = np.random.uniform(0.03, 0.07)
            else:  # weak
                if force_noise_samples:
                    slope = np.random.uniform(0.001, 0.03)
                    adf_p_value = np.random.uniform(0.15, 0.3)
                else:
                    slope = np.random.uniform(0.01, 0.08)
                    adf_p_value = np.random.uniform(0.07, 0.15)
            base_changepoints.append({
                "Building Name": selected_building,
                "CommodityCode": selected_utility,
                "Changepoint Date": timestamp,
                "ProphetDelta": value,
                "slope": slope,
                "adf_p_value": adf_p_value,
                "ChangePointType": changepoint_type
            })
        base_df = pd.DataFrame(base_changepoints)
        base_df["AbsDelta"] = base_df["ProphetDelta"].abs()
        # Use extract_changepoint_features to build the full feature set
        try:
            st.write("Debug: Columns in st.session_state['filled'] before calling extract_changepoint_features:", st.session_state["filled"].columns.tolist())
            feat_df = extract_changepoint_features(
                base_df,
                st.session_state["filled"],
                usage_col="FilledUse",
                date_col="Date",
                mean_win=mean_win
            )
            st.session_state["feat_df"] = feat_df
            st.write("Debug: Columns in feat_df after calling extract_changepoint_features:", feat_df.columns.tolist())
        except Exception as e:
            st.warning(f"Feature extraction failed: {e}")
            # Feature extraction failed: fall back to the base features
            feat_df = base_df.copy()
            # Add the missing feature columns
            for col in ["ΔMeanBefore", "ΔMeanAfter", "ΔMeanDiff", "ΔMeanRatio", "TimeSinceStart", "TimeIndex", "Season"]:
                if col not in feat_df.columns:
                    if col == "TimeIndex":
                        # 🔧 Fix: ensure a proper dtype for TimeIndex
                        feat_df[col] = feat_df["Changepoint Date"].dt.month.astype('int64')
                    elif col == "Season":
                        # 🔧 Fix: numeric codes for Season to avoid dtype conflicts (0=summer, 1=winter, 2=other)
                        season_mapping = {6: 0, 7: 0, 8: 0, 12: 1, 1: 1, 2: 1}
                        month_col = feat_df["Changepoint Date"].dt.month
                        feat_df[col] = month_col.map(season_mapping).fillna(2).astype('int64')
                    else:
                        feat_df[col] = np.nan
            st.session_state["feat_df"] = feat_df
        # Build a prediction record for every original changepoint
        preds_records = []
        for _, row in feat_df.iterrows():
            timestamp = row["Changepoint Date"]
            # Feature-based classification rules
            slope = row.get("slope", 0.1)
            abs_delta = row.get("AbsDelta", 0)
            # Relative ranks are shared by several strategies (and reported for
            # every record), so compute them up front
            slope_rank = (feat_df["slope"].abs() <= abs(slope)).mean()
            delta_rank = (feat_df["AbsDelta"] <= abs_delta).mean()
            # 🔧 Classify according to the selected strategy
            if classification_strategy == "Strict Threshold":
                # Original strict logic
                if abs(slope) > slope_th and abs_delta > np.percentile(feat_df["AbsDelta"], 70):
                    predicted = "Real"
                elif abs(slope) < slope_th * 0.5 and abs_delta < np.percentile(feat_df["AbsDelta"], 30):
                    predicted = "Noise"
                else:
                    predicted = "Unknown"
                real_score = noise_score = 0  # placeholders
| elif classification_strategy == "Loose Threshold": | |
| # 更宽松的分类条件 | |
| if abs(slope) > slope_th * 0.6 or abs_delta > np.percentile(feat_df["AbsDelta"], 50): | |
| predicted = "Real" | |
| elif abs(slope) < slope_th * 0.9 or abs_delta < np.percentile(feat_df["AbsDelta"], 50): | |
| predicted = "Noise" | |
| else: | |
| predicted = "Unknown" | |
| real_score = noise_score = 0 | |
| elif classification_strategy == "Very Loose Threshold": | |
| # 极宽松的分类条件 | |
| if abs(slope) > slope_th * 0.9 or abs_delta > np.percentile(feat_df["AbsDelta"], 70): | |
| predicted = "Real" | |
| elif abs(slope) < slope_th * 0.1 or abs_delta < np.percentile(feat_df["AbsDelta"], 30): | |
| predicted = "Noise" | |
| else: | |
| predicted = "Unknown" | |
| real_score = noise_score = 0 | |
| elif classification_strategy == "Ranking Based": | |
| # 基于相对排名的分类 | |
| slope_rank = (feat_df["slope"].abs() <= abs(slope)).mean() | |
| delta_rank = (feat_df["AbsDelta"] <= abs_delta).mean() | |
| avg_rank = (z_rank + slope_rank + delta_rank) / 3 | |
| if avg_rank > 0.7: | |
| predicted = "Real" | |
| elif avg_rank < 0.3: | |
| predicted = "Noise" | |
| else: | |
| predicted = "Unknown" | |
| real_score = noise_score = 0 | |
| elif classification_strategy == "Adaptive Threshold": | |
| # 基于数据分布自适应调整阈值 | |
| slope_median = feat_df["slope"].abs().median() | |
| delta_median = feat_df["AbsDelta"].median() | |
| if abs(slope) > slope_median * 1.5 and abs_delta > delta_median * 1.2: | |
| predicted = "Real" | |
| elif abs(slope) < slope_median * 0.7 and abs_delta < delta_median * 0.8: | |
| predicted = "Noise" | |
| else: | |
| predicted = "Unknown" | |
| real_score = noise_score = 0 | |
| elif classification_strategy == "Force Noise Detection": | |
| # 🔧 修复:强制检测Noise,确保至少35%变点被分类为Noise | |
| # 方法1:基于排名强制分类 | |
| slope_rank = (feat_df["slope"].abs() <= abs(slope)).mean() | |
| delta_rank = (feat_df["AbsDelta"] <= abs_delta).mean() | |
| avg_rank = (z_rank + slope_rank + delta_rank) / 3 | |
| # 方法2:基于分位数阈值 | |
| slope_35 = np.percentile(feat_df["slope"].abs(), 35) | |
| delta_35 = np.percentile(feat_df["AbsDelta"], 35) | |
| # 强制分类逻辑:确保低排名的变点被分类为Noise | |
| if avg_rank <= 0.35: # 排名在前35%的低值变点 | |
| predicted = "Noise" | |
| elif (abs(slope) <= slope_35) or (abs_delta <= delta_35) or (abs(slope) <= slope_35 and abs_delta <= delta_35): | |
| # 至少两个特征都在35分位数以下 | |
| predicted = "Noise" | |
| elif abs(slope) > slope_th and abs_delta > np.percentile(feat_df["AbsDelta"], 65): | |
| predicted = "Real" | |
| else: | |
| predicted = "Unknown" | |
| real_score = noise_score = 0 | |
| z_rank = slope_rank = delta_rank = avg_rank | |
            else:  # "Balanced score (recommended)"
                # 🔧 Improved classification logic: a combination of strategies
                # Strategy 1: absolute thresholds
                strong_real = (abs(slope) > slope_th and abs_delta > np.percentile(feat_df["AbsDelta"], 75))
                strong_noise = (abs(slope) < slope_th * 0.7 and abs_delta < np.percentile(feat_df["AbsDelta"], 25))
                # Strategy 2: relative ranks (slope_rank / delta_rank computed above)
                # Strategy 3: composite score
                real_score = 0
                noise_score = 0
                # slope score
                if abs(slope) > slope_th:
                    real_score += 2
                elif abs(slope) < slope_th * 0.6:
                    noise_score += 2
                else:
                    real_score += 1 if abs(slope) > slope_th * 0.8 else 0
                    noise_score += 1 if abs(slope) < slope_th * 0.8 else 0
                # delta score (percentile based)
                if abs_delta > np.percentile(feat_df["AbsDelta"], 70):
                    real_score += 2
                elif abs_delta < np.percentile(feat_df["AbsDelta"], 30):
                    noise_score += 2
                else:
                    real_score += 1 if abs_delta > np.percentile(feat_df["AbsDelta"], 50) else 0
                    noise_score += 1 if abs_delta < np.percentile(feat_df["AbsDelta"], 50) else 0
                # Final decision
                if strong_real or real_score >= 4:
                    predicted = "Real"
                elif strong_noise or noise_score >= 4:
                    predicted = "Noise"
                elif real_score > noise_score and real_score >= 2:
                    predicted = "Real"
                elif noise_score > real_score and noise_score >= 2:
                    predicted = "Noise"
                else:
                    predicted = "Unknown"
            preds_records.append({
                "Changepoint Date": timestamp,
                "Predicted": predicted,
                "RealScore": real_score,
                "NoiseScore": noise_score,
                "SlopeRank": slope_rank,
                "DeltaRank": delta_rank
            })
        preds = pd.DataFrame(preds_records)
        # Summary statistics
        stats = preds["Predicted"].value_counts(dropna=False).to_dict()
        stats["k_best"] = k_best
        # Simple merge; guarantees a one-to-one match
        merge = original_changepoints.copy()
        merge = merge.merge(
            preds[["Changepoint Date", "Predicted"]],
            left_on="timestamp",
            right_on="Changepoint Date",
            how="left"
        )
        # Persist the results in session_state
        st.session_state["credibility_results"] = {
            "merge": merge,
            "stats": stats,
            "preds": preds
        }

    # Pull the results back out of session_state
    merge = st.session_state["credibility_results"]["merge"]
    stats = st.session_state["credibility_results"]["stats"]
    line = st.session_state["base_ln"]
    changepoints_only = merge
    # Second chart: distinguish Real/Noise/Unknown by colour and shape
    real = (
        alt.Chart(changepoints_only[changepoints_only.Predicted == "Real"])
        .mark_point(shape="triangle", size=90, color="green", filled=True)
        .encode(
            x="timestamp:T",
            y="value:Q",
            color=alt.value("green")
        )
    )
    noise = (
        alt.Chart(changepoints_only[changepoints_only.Predicted == "Noise"])
        .mark_point(shape="cross", size=80, color="red")
        .encode(
            x="timestamp:T",
            y="value:Q",
            color=alt.value("red")
        )
    )
    unk = (
        alt.Chart(changepoints_only[changepoints_only.Predicted == "Unknown"])
        .mark_point(shape="diamond", size=80, color="orange", filled=True)
        .encode(
            x="timestamp:T",
            y="value:Q",
            color=alt.value("orange")
        )
    )
    # Build a chart with a legend
    changepoints_only_with_legend = changepoints_only.copy()
    chart = (
        alt.Chart(changepoints_only_with_legend)
        .mark_point(size=90, filled=True)
        .encode(
            x="timestamp:T",
            y="value:Q",
            color=alt.Color(
                "Predicted:N",
                scale=alt.Scale(
                    domain=["Real", "Noise", "Unknown"],
                    range=["green", "red", "orange"]
                ),
                legend=alt.Legend(title="Changepoint Type")
            ),
            shape=alt.Shape(
                "Predicted:N",
                scale=alt.Scale(
                    domain=["Real", "Noise", "Unknown"],
                    range=["triangle-up", "cross", "diamond"]
                )
            )
        )
    )
    plot_semi.altair_chart(line + chart, use_container_width=True)
    # Render the results
    cred_stats_ph.empty()
    cp_table_ph.empty()
    cred_stats_ph.success(f"Credibility stats: {stats}")
    # Show which classification strategy is active
    st.info(f"🎯 Current strategy: **{classification_strategy}**")
    # Strategy descriptions (keys must match the selectbox options exactly)
    strategy_descriptions = {
        "Strict Threshold": "Requires all indicators to meet strict conditions before classifying as Real/Noise; more conservative",
        "Loose Threshold": "Classifies as soon as any single indicator condition is met; more aggressive",
        "Very Loose Threshold": "Very loose conditions; Noise changepoints are detected more easily",
        "Ranking Based": "Classifies by the changepoint's relative rank among all changepoints",
        "Adaptive Threshold": "Automatically adapts the classification thresholds to the data distribution",
        "Force Noise Detection": "Forces Noise detection via the 35th percentile so that at least ~35% of changepoints are classified as Noise",
        "Balanced score (recommended)": "Scores several indicators together to balance precision and recall"
    }
    if classification_strategy in strategy_descriptions:
        st.caption(f"💡 {strategy_descriptions[classification_strategy]}")
    # Debug information
    st.write("🔄 Matching check:")
    st.write("changepoints_only shape:", changepoints_only.shape)
    st.write("changepoints_only['Predicted'] value:", changepoints_only["Predicted"].value_counts(dropna=False))
    # 🔧 New: show feature-distribution debug info
    if "feat_df" in st.session_state:
        feat_df_debug = st.session_state["feat_df"]
        st.write("📊 Feature distribution debugging information:")
        col2, col3 = st.columns(2)
        with col2:
            st.write("**slope distribution:**")
            st.write(f"Minimum: {feat_df_debug['slope'].abs().min():.3f}")
            st.write(f"Maximum: {feat_df_debug['slope'].abs().max():.3f}")
            st.write(f"Mean: {feat_df_debug['slope'].abs().mean():.3f}")
            st.write(f"Current threshold: {slope_th}")
        with col3:
            st.write("**AbsDelta distribution:**")
            st.write(f"Minimum: {feat_df_debug['AbsDelta'].min():.1f}")
            st.write(f"Maximum: {feat_df_debug['AbsDelta'].max():.1f}")
            st.write(f"30th percentile: {np.percentile(feat_df_debug['AbsDelta'], 30):.1f}")
            st.write(f"70th percentile: {np.percentile(feat_df_debug['AbsDelta'], 70):.1f}")
        # Show the changepoint-type distribution, if present
        if "ChangePointType" in feat_df_debug.columns:
            st.write("**Changepoint type distribution:**")
            type_counts = feat_df_debug["ChangePointType"].value_counts()
            st.write(type_counts.to_dict())
        # Show how many changepoints satisfy the Noise condition
        # (strategy names and conditions mirror the classification logic above)
        if classification_strategy == "Strict Threshold":
            noise_condition = (
                (feat_df_debug['slope'].abs() < slope_th * 0.5) &
                (feat_df_debug['AbsDelta'] < np.percentile(feat_df_debug["AbsDelta"], 30))
            )
            st.write(f"**Number of changepoints satisfying strict Noise conditions:** {noise_condition.sum()}")
        elif classification_strategy == "Loose Threshold":
            noise_condition = (
                (feat_df_debug['slope'].abs() < slope_th * 0.9) |
                (feat_df_debug['AbsDelta'] < np.percentile(feat_df_debug["AbsDelta"], 50))
            )
            st.write(f"**Number of changepoints satisfying loose Noise conditions:** {noise_condition.sum()}")
    # Suggestions based on the classification outcome
    real_count = stats.get("Real", 0)
    noise_count = stats.get("Noise", 0)
    unknown_count_initial_credibility = stats.get("Unknown", 0)  # renamed for clarity
    total_count = real_count + noise_count + unknown_count_initial_credibility
    if total_count > 0:
        noise_ratio = noise_count / total_count
        if noise_ratio < 0.1:
            st.warning("⚠️ Noise detection rate is low (<10%), try:")
            st.write("• **Loose Threshold** - expected 20-50% Noise")
            st.write("• **Very Loose Threshold** - expected 30% Noise")
            st.write("• **Force Noise Detection** - expected 35-45% Noise")
        elif noise_ratio > 0.6:
            st.warning("⚠️ Noise detection rate is high (>60%), try:")
            st.write("• **Strict Threshold** - expected 5% Noise")
            st.write("• **Adaptive Threshold** - adjusts automatically to the data")
        else:
            st.success(f"✅ Classification ratio is reasonable (Noise: {noise_ratio:.1%})")
            if classification_strategy == "Balanced score (recommended)":
                st.info("💡 Currently using the recommended strategy; results look good")
            elif noise_ratio < 0.3:
                st.info("💡 If you need more Noise detection, try the 'Force Noise Detection' strategy")
            elif noise_ratio > 0.4:
                st.info("💡 If you need less Noise detection, try the 'Strict Threshold' strategy")
    # --- MODIFICATION POINT 1: Determine current unknowns based on final_results or initial merge ---
    current_data_for_manual_labeling = st.session_state.get("final_results", merge)  # 'merge' is from credibility_results
    current_unknown_df = current_data_for_manual_labeling[current_data_for_manual_labeling["Predicted"] == "Unknown"]
    unknown_count_for_ui = len(current_unknown_df)
    # Manual labeling section: only shown while Unknown changepoints remain
    # unknown_count = len(changepoints_only[changepoints_only["Predicted"] == "Unknown"])  # OLD LINE
    if unknown_count_for_ui > 0:  # use the new count
        st.subheader("🖍️ Manually label Unknown changepoints")
        # Display counts based on the most recent data being considered for labeling
        num_real_in_current = len(current_data_for_manual_labeling[current_data_for_manual_labeling["Predicted"] == "Real"])
        num_noise_in_current = len(current_data_for_manual_labeling[current_data_for_manual_labeling["Predicted"] == "Noise"])
        st.write(f"Current status for labeling: Real({num_real_in_current}) | Noise({num_noise_in_current}) | Unknown({unknown_count_for_ui})")
        # Only Unknown changepoints are offered for manual labeling
        unknown_dates = list(current_unknown_df["timestamp"].dt.strftime("%Y-%m-%d"))
        col1, col2 = st.columns(2)
        with col1:
            sel_real = st.multiselect(
                "🟢 Label Unknown as Real",
                options=unknown_dates,
                default=st.session_state.get("manual_real_selection_default", []),
                key="manual_real_selection"
            )
        with col2:
            sel_noise = st.multiselect(
                "🔴 Label Unknown as Noise",
                options=[d for d in unknown_dates if d not in sel_real],
                default=st.session_state.get("manual_noise_selection_default", []),
                key="manual_noise_selection"
            )
        unlabeled = [d for d in unknown_dates if d not in sel_real and d not in sel_noise]
        if unlabeled:
            st.info(f"ℹ️ Keep Unknown changepoints: {', '.join(unlabeled)}")
| if st.button("💾 Save manual labels", key="save_manual_labels"): | |
| # --- MODIFICATION POINT 2: Save logic --- | |
| # Start with the current final_results if it exists, otherwise with the initial merge | |
| if "final_results" in st.session_state: | |
| updated_merge = st.session_state["final_results"].copy() | |
| else: | |
| updated_merge = merge.copy() # 'merge' is from credibility_results | |
| # Apply changes only to rows that are currently 'Unknown' in updated_merge | |
| # and are selected by the user. | |
| # Convert sel_real and sel_noise (date strings) to timestamps for matching | |
| sel_real_ts = pd.to_datetime(sel_real) | |
| sel_noise_ts = pd.to_datetime(sel_noise) | |
| # Mask for rows that are currently 'Unknown' | |
| unknown_mask_in_updated = updated_merge["Predicted"] == "Unknown" | |
| # Apply 'Real' labels | |
| real_selection_mask = updated_merge["timestamp"].isin(sel_real_ts) | |
| updated_merge.loc[unknown_mask_in_updated & real_selection_mask, "Predicted"] = "Real" | |
| # Apply 'Noise' labels (ensure not to overwrite 'Real' if somehow selected for both) | |
| noise_selection_mask = updated_merge["timestamp"].isin(sel_noise_ts) | |
| # Only apply if it was 'Unknown' and not just changed to 'Real' | |
| updated_merge.loc[unknown_mask_in_updated & noise_selection_mask & (updated_merge["Predicted"] != "Real"), "Predicted"] = "Noise" | |
| st.session_state["final_results"] = updated_merge | |
| # Update manual_selections based on what was ACTUALLY changed by this save operation | |
| # These are timestamps that were originally Unknown and are now Real or Noise | |
| newly_labeled_real_ts = updated_merge[ | |
| unknown_mask_in_updated & real_selection_mask | |
| ]["timestamp"].tolist() | |
| newly_labeled_noise_ts = updated_merge[ | |
| (unknown_mask_in_updated & noise_selection_mask) & (updated_merge["Predicted"] == "Noise") # Ensure it became Noise | |
| ]["timestamp"].tolist() | |
| current_manual_selections = st.session_state.get("manual_selections", []) | |
| # Add only newly labeled timestamps to avoid duplicates if resaving | |
| st.session_state["manual_selections"] = list(set(current_manual_selections + newly_labeled_real_ts + newly_labeled_noise_ts)) | |
| # Clear multiselect defaults/state for next potential render | |
| st.session_state.manual_real_selection_default = [] | |
| st.session_state.manual_noise_selection_default = [] | |
| if "manual_real_selection" in st.session_state: del st.session_state.manual_real_selection | |
| if "manual_noise_selection" in st.session_state: del st.session_state.manual_noise_selection | |
| st.success("✔️ Manual labels saved! Plots and stats updated.") | |
| st.rerun() # Crucial for UI refresh | |
    # --- MODIFICATION POINT 3: Plotting final results ---
    # Always plot, using final_results if available, else the initial merge (semi-supervised output)
    final_plot_data = st.session_state.get("final_results", merge)  # 'merge' is from credibility_results
    real_f = (
        alt.Chart(final_plot_data[final_plot_data["Predicted"] == "Real"])
        .mark_point(shape="triangle", size=110, color="green", filled=True)
        .encode(x="timestamp:T", y="value:Q")
    )
    noise_f = (
        alt.Chart(final_plot_data[final_plot_data["Predicted"] == "Noise"])
        .mark_point(shape="cross", size=90, color="red")
        .encode(x="timestamp:T", y="value:Q")
    )
    unknown_f = (
        alt.Chart(final_plot_data[final_plot_data["Predicted"] == "Unknown"])
        .mark_point(shape="diamond", size=90, color="orange", filled=True)
        .encode(x="timestamp:T", y="value:Q")
    )
    # Use st.session_state.get("base_ln", ...) to handle the case where base_ln is not set yet
    base_chart_for_final = st.session_state.get("base_ln", alt.Chart(final_plot_data).mark_line().encode(x="timestamp:T", y="value:Q"))
    plot_final.altair_chart(base_chart_for_final + real_f + noise_f + unknown_f, use_container_width=True)
    final_stats_display = final_plot_data["Predicted"].value_counts(dropna=False).to_dict()
    if unknown_count_for_ui == 0 and "final_results" in st.session_state:
        st.success(f"✅ All changepoints classified. Final results: {final_stats_display}")
    elif "final_results" in st.session_state:  # manual save has happened
        st.info(f"ℹ️ Current saved results: {final_stats_display}")
    # Logic for starting energy prediction
    if unknown_count_for_ui == 0:  # only allow proceeding once everything is labeled
        st.markdown("---")
        if st.button("🔮 Keep changepoint detection data, start energy prediction", key="start_energy_prediction_fully_labeled"):
            st.session_state["start_energy_prediction"] = True
            st.rerun()
    elif "final_results" in st.session_state:  # some unknowns remain but the user saved
        st.markdown("---")
        st.warning(f"⚠️ There are still {unknown_count_for_ui} 'Unknown' changepoints. You can continue labeling or proceed to energy prediction with current labels.")
        if st.button("🔮 Proceed to Energy Prediction with Current Labels", key="start_energy_prediction_with_unknowns"):
            st.session_state["start_energy_prediction"] = True
            st.rerun()
| # 🔧 重要修复:将后续分析移到这里,确保无论是否有手动标注都能进行分析 | |
| # 只要完成了置信度分析,就显示后续分析选项 | |
| if st.session_state.get("credibility_results") is not None: | |
| st.markdown("---") | |
| st.subheader("📊 Advanced analysis options") | |
| # 🔧 确保final_results存在(处理边界情况) | |
| if "final_results" not in st.session_state: | |
| st.session_state["final_results"] = st.session_state["credibility_results"]["merge"] | |
| updated_merge = st.session_state["final_results"] | |
| # Fetch the feature DataFrame | |
| if "feat_df" in st.session_state: | |
| feat_df = st.session_state["feat_df"] | |
| else: | |
| st.warning("⚠️ Feature data is not available, some analysis functions may be limited") | |
| feat_df = pd.DataFrame() # empty DataFrame as a fallback | |
| # Show the current labeling results | |
| final_stats = updated_merge["Predicted"].value_counts(dropna=False).to_dict() | |
| st.write(f"**Current analysis results**: Real({final_stats.get('Real', 0)}) | Noise({final_stats.get('Noise', 0)}) | Unknown({final_stats.get('Unknown', 0)})") | |
| # 🔧 Show where the labels came from | |
| unknown_count = final_stats.get('Unknown', 0) | |
| if unknown_count == 0: | |
| st.info("📝 **Data source**: Automatic classification (all changepoints classified)") | |
| else: | |
| manual_count = len([row for _, row in updated_merge.iterrows() | |
| if row["Predicted"] in ["Real", "Noise"] and | |
| row["timestamp"] in st.session_state.get("manual_selections", [])]) | |
| if manual_count > 0: | |
| st.info(f"📝 **Data source**: Automatic classification + Manual labeling ({manual_count} manually labeled)") | |
| else: | |
| st.info("📝 **Data source**: Automatic classification only") | |
| # Offer several analysis options | |
| analysis_option = st.selectbox( | |
| "Select analysis type:", | |
| [ | |
| "Select analysis type...", | |
| "🔄 Retrain semi-supervised model", | |
| "🌳 Generate decision tree explanation", | |
| "📈 Feature importance analysis", | |
| "📊 Compare analysis results" | |
| ] | |
| ) | |
| if analysis_option == "🔄 Retrain semi-supervised model": | |
| st.write("### 🔄 Retrain semi-supervised model based on manual labeling") | |
| # 🔧 Show the current status and available actions | |
| if "retrained_results" in st.session_state: | |
| st.success("✅ Retrained results already exist") | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| if st.button("🗑️ Clear retrained results"): | |
| del st.session_state["retrained_results"] | |
| st.rerun() | |
| with col2: | |
| retrain_button = st.button("🔄 Retrain") | |
| else: | |
| st.info("📝 Currently using manual labeling results") | |
| retrain_button = st.button("Start retraining") | |
| # Run the retraining | |
| if retrain_button: | |
| # Collect the manually labeled data | |
| manual_labels = [] | |
| for _, row in updated_merge.iterrows(): | |
| if row["Predicted"] in ["Real", "Noise"]: | |
| manual_labels.append({ | |
| "Building Name": selected_building, | |
| "Changepoint Date": row["timestamp"], | |
| "Label": row["Predicted"] # 🔧 直接使用 "Label" 而不是 "ManualLabel" | |
| }) | |
| if len(manual_labels) >= 3: # 至少需要3个标注样本 | |
| st.info("🔄 Retraining model...") | |
| # 使用现有的半监督模型函数 | |
| if not feat_df.empty: | |
| try: | |
| # 创建带有手动标签的数据 | |
| manual_df = pd.DataFrame(manual_labels) | |
| # 合并手动标签到特征数据 | |
| feat_with_labels = feat_df.merge( | |
| manual_df, | |
| on=["Building Name", "Changepoint Date"], | |
| how="left" | |
| ) | |
| # 🔧 Give every changepoint without a manual label the default value "Unknown" | |
| feat_with_labels["Label"] = feat_with_labels["Label"].fillna("Unknown").astype(str) | |
| # 🔧 Fix: Ensure consistent data types to prevent dtype conflicts | |
| # Convert categorical string columns to numeric codes if present | |
| for col in feat_with_labels.columns: | |
| if col not in ['Building Name', 'Label', 'Changepoint Date']: | |
| # Ensure numeric columns are properly typed | |
| if feat_with_labels[col].dtype == 'object': | |
| try: | |
| # Try to convert to numeric first | |
| feat_with_labels[col] = pd.to_numeric(feat_with_labels[col], errors='coerce') | |
| except Exception: | |
| # If conversion fails, keep as string but ensure consistency | |
| feat_with_labels[col] = feat_with_labels[col].astype(str) | |
| # 🔧 Additional fix: Ensure all expected feature columns exist and have proper types | |
| expected_numeric_cols = ["AbsDelta", "slope", "ΔMeanDiff", "ΔMeanRatio", | |
| "TimeSinceStart", "TimeIndex", "Season", "holidaycount"] | |
| for col in expected_numeric_cols: | |
| if col in feat_with_labels.columns: | |
| feat_with_labels[col] = pd.to_numeric(feat_with_labels[col], errors='coerce').fillna(0) | |
| # 🔧 Debug: Show data types before passing to model | |
| st.write("**Debug - Data types before model training:**") | |
| dtype_info = feat_with_labels.dtypes.to_dict() | |
| st.write({k: str(v) for k, v in dtype_info.items()}) | |
| # 🔧 Use the unified model interface, which supports both XGBoost and CatBoost | |
| st.info(f"🤖 Using **{ml_model_type}** for model retraining...") | |
| try: | |
| retrained_preds, retrained_stats = run_semi_supervised_cp_model_unified( | |
| feat_with_labels, | |
| k_best=k_best, | |
| model_type=ml_model_type.lower() | |
| ) | |
| except ImportError as e: | |
| if "catboost" in str(e).lower(): | |
| st.error("❌ CatBoost not installed. Please install it first:") | |
| st.code("pip install catboost") | |
| st.info("🔄 Falling back to XGBoost...") | |
| retrained_preds, retrained_stats = run_semi_supervised_cp_model( | |
| feat_with_labels, | |
| k_best=k_best | |
| ) | |
| else: | |
| raise e | |
| st.success("✅ Model retraining completed!") | |
| st.write("**Retrained prediction statistics:**", retrained_stats) | |
| # Compare the results before and after retraining | |
| st.write("**Comparison of results:**") | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| st.write("Before retraining:", stats) | |
| with col2: | |
| st.write("After retraining:", retrained_stats) | |
| # Save the retrained results | |
| st.session_state["retrained_results"] = retrained_preds | |
| except Exception as e: | |
| st.error(f"Retraining failed: {e}") | |
| st.info("💡 Tip: Please ensure that changepoint detection and credibility analysis have been completed") | |
| # 🔧 Extra debug information | |
| if not feat_df.empty: | |
| st.write("**Debug information:**") | |
| st.write(f"feat_df shape: {feat_df.shape}") | |
| st.write(f"feat_df columns: {feat_df.columns.tolist()}") | |
| st.write(f"Manual labeling count: {len(manual_labels)}") | |
| else: | |
| st.warning("⚠️ Feature data is not available, cannot retrain") | |
| else: | |
| st.warning(f"⚠️ At least 3 labeled samples are required, currently only {len(manual_labels)}") | |
| elif analysis_option == "🌳 Generate decision tree explanation": | |
| st.write("### 🌳 Decision tree explanation analysis") | |
| # 🔧 Pick the data source intelligently | |
| if "retrained_results" in st.session_state: | |
| st.info("🎯 **Using retrained model results** to generate decision tree") | |
| # Use the retrained predictions | |
| retrained_preds = st.session_state["retrained_results"] | |
| # Merge the retrained results into updated_merge | |
| decision_tree_data = updated_merge.copy() | |
| # Overwrite the predictions with the retrained ones | |
| for _, row in retrained_preds.iterrows(): | |
| mask = decision_tree_data["timestamp"] == row["Changepoint Date"] | |
| if mask.any(): | |
| decision_tree_data.loc[mask, "Predicted"] = row["Predicted"] | |
| data_source = "Retrained model" | |
| else: | |
| # 🔧 Fix: determine the data source correctly | |
| manual_selections = st.session_state.get("manual_selections", []) | |
| if len(manual_selections) > 0: | |
| st.info("📝 **Using automatic classification + manual labeling** to generate decision tree") | |
| data_source = "Automatic + Manual labeling" | |
| else: | |
| st.info("📝 **Using automatic classification results** to generate decision tree") | |
| data_source = "Automatic classification" | |
| decision_tree_data = updated_merge.copy() | |
| if st.button("Generate decision tree"): | |
| from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree | |
| # Prepare the feature data | |
| if 'feat_df' not in st.session_state: | |
| st.error("Feature data (feat_df) not found in session state. Please generate features first.") | |
| st.stop() | |
| current_feat_df = st.session_state['feat_df'] | |
| st.write("Debug: Columns in current_feat_df for Decision Tree:", current_feat_df.columns.tolist()) | |
| feature_cols = ["AbsDelta","slope", "ΔMeanDiff", "ΔMeanRatio", "TimeSinceStart", "holidaycount"] | |
| if 'holidaycount' not in current_feat_df.columns: | |
| st.caption("ℹ️ 'holidaycount' feature not found in the data, excluding it from decision tree analysis.") | |
| if 'holidaycount' in feature_cols: # Ensure it's in list before removing | |
| feature_cols.remove('holidaycount') | |
| # Train the decision tree on labeled data only | |
| labeled_data = decision_tree_data[decision_tree_data["Predicted"].isin(["Real", "Noise"])] | |
| if len(labeled_data) >= 3: | |
| X = current_feat_df.loc[labeled_data.index, feature_cols].fillna(0) | |
| y = labeled_data["Predicted"].map({"Real": 1, "Noise": 0}) | |
| # Fit the decision tree | |
| tree = DecisionTreeClassifier(max_depth=3, random_state=42) | |
| tree.fit(X, y) | |
| # Show which data source was used | |
| st.success(f"✅ Decision tree generated based on **{data_source}**") | |
| st.write(f"📊 Training sample count: {len(labeled_data)} (Real: {(y==1).sum()}, Noise: {(y==0).sum()})") | |
| # 🎨 Plot the decision tree | |
| st.write("**📊 Decision tree visualization:**") | |
| # 🔧 Configure fonts that include CJK glyphs | |
| plt.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'SimHei', 'DejaVu Sans'] | |
| plt.rcParams['axes.unicode_minus'] = False | |
| # Create the figure | |
| fig, ax = plt.subplots(figsize=(20, 12)) | |
| plot_tree(tree, | |
| feature_names=feature_cols, | |
| class_names=["Noise", "Real"], | |
| filled=True, | |
| rounded=True, | |
| fontsize=10, | |
| ax=ax) | |
| # Set the title (English text avoids font issues) | |
| ax.set_title(f"Changepoint Classification Decision Tree (Based on {data_source})", fontsize=16, fontweight='bold', pad=20) | |
| # Render the figure into an in-memory buffer | |
| buf = io.BytesIO() | |
| plt.savefig(buf, format='png', dpi=300, bbox_inches='tight') | |
| buf.seek(0) | |
| # Display in Streamlit | |
| st.image(buf, caption=f"Decision tree structure (Based on {data_source})", use_container_width=True) | |
| # Release matplotlib resources | |
| plt.close(fig) | |
| # Show the decision rules (text version) | |
| with st.expander("📝 See detailed decision rules (text)"): | |
| tree_rules = export_text(tree, feature_names=feature_cols, class_names=["Noise", "Real"]) | |
| st.text(tree_rules) | |
| # Feature importances | |
| st.write("**📈 Feature importance ranking:**") | |
| importance_df = pd.DataFrame({ | |
| 'Feature': feature_cols, | |
| 'Importance': tree.feature_importances_ | |
| }).sort_values('Importance', ascending=False) | |
| # Bar chart of feature importances | |
| fig2, ax2 = plt.subplots(figsize=(10, 6)) | |
| bars = ax2.bar(importance_df['Feature'], importance_df['Importance'], | |
| color='skyblue', edgecolor='navy', alpha=0.7) | |
| ax2.set_title(f'Feature Importance (Based on {data_source})', fontsize=14, fontweight='bold') | |
| ax2.set_xlabel('Features', fontsize=12) | |
| ax2.set_ylabel('Importance', fontsize=12) | |
| ax2.tick_params(axis='x', rotation=45) | |
| # Annotate each bar with its value | |
| for bar, importance in zip(bars, importance_df['Importance']): | |
| height = bar.get_height() | |
| ax2.text(bar.get_x() + bar.get_width()/2., height + 0.01, | |
| f'{importance:.3f}', ha='center', va='bottom') | |
| plt.tight_layout() | |
| # Save the importance chart to a buffer | |
| buf2 = io.BytesIO() | |
| plt.savefig(buf2, format='png', dpi=300, bbox_inches='tight') | |
| buf2.seek(0) | |
| st.image(buf2, caption=f"Feature importance analysis (Based on {data_source})", use_container_width=True) | |
| plt.close(fig2) | |
| # Show the numeric table | |
| st.dataframe(importance_df.style.format({'Importance': '{:.4f}'})) | |
| # Apply the decision tree to all changepoints | |
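| # NOTE: this button is nested inside the "Generate decision tree" branch above; | |
| # clicking it triggers a rerun in which the outer button reads False, so this | |
| # block may never execute. Persisting the fitted tree in st.session_state and | |
| # moving this button outside the outer branch would avoid that. | |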
| if st.button("Apply decision tree to all changepoints"): | |
| all_X = current_feat_df[feature_cols].fillna(0) | |
| tree_predictions = tree.predict(all_X) | |
| tree_pred_labels = ["Noise" if p == 0 else "Real" for p in tree_predictions] | |
| # Show the decision tree predictions | |
| tree_results = decision_tree_data.copy() | |
| tree_results["TreePredicted"] = tree_pred_labels | |
| st.write("**🎯 Decision tree prediction results:**") | |
| tree_stats = pd.Series(tree_pred_labels).value_counts().to_dict() | |
| st.write(tree_stats) | |
| # Compare the current labels with the tree predictions | |
| comparison = tree_results[["timestamp", "Predicted", "TreePredicted"]] | |
| st.write(f"**🔄 {data_source} vs Decision tree prediction comparison:**") | |
| st.dataframe(comparison) | |
| # 🎨 Visual comparison of the prediction results | |
| st.write("**📊 Predicted result visualization comparison:**") | |
| # Build the comparison charts | |
| comparison_stats = pd.DataFrame({ | |
| data_source: pd.Series(tree_results["Predicted"]).value_counts(), | |
| 'Decision Tree Prediction': pd.Series(tree_results["TreePredicted"]).value_counts() | |
| }).fillna(0) | |
| fig3, (ax3, ax4) = plt.subplots(1, 2, figsize=(12, 5)) | |
| # Current labeling results | |
| ax3.pie(comparison_stats[data_source], labels=comparison_stats.index, | |
| autopct='%1.1f%%', startangle=90, colors=['lightcoral', 'lightblue', 'lightgreen']) | |
| ax3.set_title(f'{data_source}', fontsize=12, fontweight='bold') | |
| # Decision tree predictions | |
| ax4.pie(comparison_stats['Decision Tree Prediction'], labels=comparison_stats.index, | |
| autopct='%1.1f%%', startangle=90, colors=['lightcoral', 'lightblue', 'lightgreen']) | |
| ax4.set_title('Decision Tree Predictions', fontsize=12, fontweight='bold') | |
| plt.tight_layout() | |
| # Save the comparison chart to a buffer | |
| buf3 = io.BytesIO() | |
| plt.savefig(buf3, format='png', dpi=300, bbox_inches='tight') | |
| buf3.seek(0) | |
| st.image(buf3, caption=f"Predicted result comparison ({data_source} vs Decision tree)", use_container_width=True) | |
| plt.close(fig3) | |
| else: | |
| st.warning("At least 3 labeled samples are required to generate decision tree") | |
| elif analysis_option == "📈 Feature importance analysis": | |
| st.write("### 📈 Feature importance analysis") | |
| if st.button("Analyze feature importance"): | |
| if 'feat_df' not in st.session_state: | |
| st.error("Feature data (feat_df) not found in session state. Please generate features first.") | |
| st.stop() | |
| current_feat_df = st.session_state['feat_df'] | |
| st.write("Debug: Columns in current_feat_df for Feature Importance Analysis:", current_feat_df.columns.tolist()) | |
| # Analyze feature distributions across the changepoint classes | |
| feature_cols = ["AbsDelta", "slope", "ΔMeanDiff", "ΔMeanRatio", "TimeSinceStart", "holidaycount"] | |
| if 'holidaycount' not in current_feat_df.columns: | |
| st.caption("ℹ️ 'holidaycount' feature not found in the data, excluding it from this importance analysis.") | |
| if 'holidaycount' in feature_cols: # Ensure it's in list before removing | |
| feature_cols.remove('holidaycount') | |
| real_data = updated_merge[updated_merge["Predicted"] == "Real"] | |
| noise_data = updated_merge[updated_merge["Predicted"] == "Noise"] | |
| st.write("**Real vs Noise feature comparison:**") | |
| for feature in feature_cols: | |
| if feature in current_feat_df.columns: | |
| col1, col2, col3 = st.columns(3) | |
| with col1: | |
| st.write(f"**{feature}**") | |
| with col2: | |
| if not real_data.empty: | |
| real_values = current_feat_df.loc[real_data.index, feature] | |
| st.write(f"Real mean: {real_values.mean():.3f}") | |
| else: | |
| st.write("Real mean: N/A") | |
| with col3: | |
| if not noise_data.empty: | |
| noise_values = current_feat_df.loc[noise_data.index, feature] | |
| st.write(f"Noise mean: {noise_values.mean():.3f}") | |
| else: | |
| st.write("Noise mean: N/A") | |
| # Suggest optimized classification thresholds | |
| st.write("**Suggested classification threshold optimization:**") | |
| if not real_data.empty and not noise_data.empty: | |
| for feature in feature_cols[:3]: # analyze only the first 3 features | |
| if feature in current_feat_df.columns: | |
| real_vals = current_feat_df.loc[real_data.index, feature] | |
| noise_vals = current_feat_df.loc[noise_data.index, feature] | |
| optimal_threshold = (real_vals.mean() + noise_vals.mean()) / 2 | |
| st.write(f"- {feature}: Suggested threshold {optimal_threshold:.3f}") | |
| elif analysis_option == "📊 Compare analysis results": | |
| st.write("### 📊 Compare analysis results") | |
| # Compare the original predictions with the results after manual labeling | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| st.write("**Original semi-supervised prediction:**") | |
| st.write(stats) | |
| with col2: | |
| st.write("**After manual labeling:**") | |
| st.write(final_stats) | |
| # Count how many changepoints were changed | |
| changed_points = 0 | |
| for _, row in updated_merge.iterrows(): | |
| original_pred = st.session_state["credibility_results"]["merge"] | |
| original_pred_for_this_point = original_pred[original_pred["timestamp"] == row["timestamp"]]["Predicted"].iloc[0] | |
| if original_pred_for_this_point != row["Predicted"]: | |
| changed_points += 1 | |
| st.write(f"**Manually modified changepoints:** {changed_points}") | |
| # Show the modification details | |
| if changed_points > 0: | |
| st.write("**Modified details:**") | |
| changes = [] | |
| original_merge = st.session_state["credibility_results"]["merge"] | |
| for _, row in updated_merge.iterrows(): | |
| original_pred = original_merge[original_merge["timestamp"] == row["timestamp"]]["Predicted"].iloc[0] | |
| if original_pred != row["Predicted"]: | |
| changes.append({ | |
| "Date": row["timestamp"].strftime("%Y-%m-%d"), | |
| "Original prediction": original_pred, | |
| "Manual labeling": row["Predicted"] | |
| }) | |
| if changes: | |
| st.dataframe(pd.DataFrame(changes)) | |
| # =============================================================== | |
| # 🔮 Energy Prediction Module | |
| # =============================================================== | |
| if st.session_state.get("start_energy_prediction", False): | |
| st.markdown("---") | |
| st.title("🔮 Intelligent energy prediction system") | |
| # A shared display helper to avoid duplicated code | |
| # Defined early in this module so it exists before any call site | |
| def _display_llm_analysis_results(analysis_data, title_prefix=""): | |
| st.subheader(f"🧠 {title_prefix} LLM Analysis Results".strip()) | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| mode_map = { | |
| "fixed": "Fixed (No change in function)", | |
| "future": "Future (Function will change)", | |
| "timeline": "Timeline (Old first, new later)" | |
| } | |
| st.info( | |
| f"**Prediction Mode:** {mode_map.get(analysis_data.get('mode'), analysis_data.get('mode', 'N/A'))}" | |
| ) | |
| # Render the selected weather variables together with their reasons | |
| weather_selection_data = analysis_data.get("weather_selection") | |
| if isinstance(weather_selection_data, list) and weather_selection_data: | |
| st.markdown("**Selected Weather Variables & Reasons:**") | |
| for item in weather_selection_data: | |
| if isinstance(item, dict) and "variable" in item and "reason" in item: | |
| st.markdown(f"- **{item.get('variable')}**: {item.get('reason')}") | |
| elif isinstance(item, dict) and "variable" in item: | |
| st.markdown(f"- **{item.get('variable')}**: Reason not provided") | |
| else: | |
| st.markdown("- Invalid weather variable entry") # Handle malformed entries | |
| else: | |
| st.info("**Selected Weather Variables:** N/A") | |
| with col2: | |
| st.info(f"**Mode Reason:** {analysis_data.get('mode_reason', 'N/A')}") | |
| st.info( | |
| f"**Prediction Duration:** {analysis_data.get('duration_months', 'N/A')} months" | |
| ) | |
| expander_title = f"Show Details for {title_prefix} Analysis".strip() | |
| with st.expander(expander_title): | |
| desc_title = f"**User Description ({title_prefix.strip()}):**" if title_prefix else "**User Description:**" | |
| st.write(desc_title) | |
| st.text(analysis_data.get("user_description", "N/A")) | |
| info_title = f"**Building Information Used ({title_prefix.strip()}):**" if title_prefix else "**Building Information Used:**" | |
| st.write(info_title) | |
| st.json(analysis_data.get("building_info", {})) | |
| if "revision_request_applied" in analysis_data: | |
| revision_title = f"**Revision Request Applied ({title_prefix.strip()}):**" if title_prefix else "**Revision Request Applied:**" | |
| st.write(revision_title) | |
| st.text(analysis_data.get("revision_request_applied", "N/A")) | |
| if "final_results" in st.session_state: | |
| final_results = st.session_state["final_results"] | |
| real_count = len(final_results[final_results["Predicted"] == "Real"]) | |
| noise_count = len(final_results[final_results["Predicted"] == "Noise"]) | |
| unknown_count = len(final_results[final_results["Predicted"] == "Unknown"]) | |
| st.info(f"📊 **Changepoint detection results**: Real({real_count}) | Noise({noise_count}) | Unknown({unknown_count})") | |
| selected_building = st.session_state.get("selected_building") | |
| selected_utility = st.session_state.get("selected_utility") | |
| if not selected_building or not selected_utility: | |
| st.warning("❌ No building/utility selected. Please go back and complete the selection.") | |
| st.stop() | |
| st.info(f"🏢 Current Building: **{selected_building}** | ⚡ Utility: **{selected_utility}**") | |
| # Auto-extract building information (ensures the 'info' dict is populated correctly) | |
| st.subheader("📋 Building Information (Auto-extracted from usage data)") | |
| expected_cols = [ | |
| "CAAN", "BuildingClassification", "BuildingLifeCycleStage", | |
| "BuildingGrossArea", "SpaceSqFt", "SpaceWorkpointCount", "c_floor_count" | |
| ] | |
| def _clean(s: str) -> str: | |
| return " ".join(str(s).replace(" ", " ").split()).strip() | |
| info = {c: "N/A" for c in expected_cols} | |
| if usage_df is not None and selected_building: | |
| match = usage_df[ | |
| usage_df["BuildingName"].astype(str).apply(_clean) == _clean(selected_building) | |
| ] | |
| if not match.empty: | |
| row = match.iloc[0] | |
| cols_lower_map = {col.lower(): col for col in usage_df.columns} | |
| for c in expected_cols: | |
| col_key = cols_lower_map.get(c.lower()) | |
| if col_key is not None: | |
| info[c] = row.get(col_key, "N/A") | |
| else: | |
| st.warning(f"Building '{selected_building}' not found in data") | |
| col1_disp, col2_disp = st.columns(2) | |
| with col1_disp: | |
| st.metric("CAAN", info["CAAN"]) | |
| st.metric("Building Classification", info["BuildingClassification"]) | |
| ga = info["BuildingGrossArea"] | |
| ga_disp = f"{int(float(ga)):,} sqft" if str(ga).replace(".", "", 1).isdigit() else "N/A" | |
| st.metric("Building Gross Area", ga_disp) | |
| sqft = info["SpaceSqFt"] | |
| sqft_disp = f"{int(float(sqft)):,}" if str(sqft).replace(".", "", 1).isdigit() else "N/A" | |
| st.metric("Space Sq Ft", sqft_disp) | |
| with col2_disp: | |
| wp = info["SpaceWorkpointCount"] | |
| wp_disp = f"{int(float(wp)):,}" if str(wp).replace(".", "", 1).isdigit() else wp | |
| st.metric("Workpoint Count", wp_disp) | |
| fl = info["c_floor_count"] | |
| fl_disp = f"{int(float(fl)):,}" if str(fl).replace(".", "", 1).isdigit() else fl | |
| st.metric("Floor Count", fl_disp) | |
| st.metric("Lifecycle Stage", info["BuildingLifeCycleStage"]) | |
| # 🔧 Step 2: user input | |
| st.subheader("📝 Building Usage Description") | |
| user_description = st.text_area( | |
| "Building Usage Description", | |
| placeholder=("Describe the building's current and future use, including:\n" | |
| "• Current function and usage patterns\n" | |
| "• Any planned changes in building function\n" | |
| "• Duration of prediction needed (in months)\n" | |
| "Example: 'This office building will be converted to instructional space in 6 months. " | |
| "Need 12 months prediction to cover both phases.'"), | |
| height=150, | |
| key="user_desc_energy_prediction" | |
| ) | |
| # 🔧 Step 3: LLM analysis button | |
| if st.button("🤖 Analyze Building Usage & Weather Requirements", disabled=not user_description): | |
| with st.spinner("🧠 LLM is analyzing..."): | |
| try: | |
| # ---------- Prompt for the initial LLM analysis ---------- | |
| prompt = f''' | |
| [Description of Forecast Modes] | |
| • fixed — building use remains unchanged for the entire forecast period | |
| • future — building use changes to a new function during the forecast period | |
| • timeline — forecast period is split: original function in the first half, new function in the second half | |
| [Examples of Mode Selection] | |
| (Note: these are examples ONLY. Please ignore them when analyzing the actual description below.) | |
| • fixed example: | |
| "The building is an office and will remain an office for the next 24 months. We need to predict energy usage for this period." → mode: "fixed" | |
| • future example: | |
| "This warehouse will be converted into a data center starting 9 months from now. We need an 18-month forecast covering the transition and initial operation as a data center." → mode: "future" | |
| • timeline example: | |
| "For the first 6 months the university building will be used for lectures, and for the next 12 months it will be renovated and used as a laboratory. Forecast needed for 18 months." → mode: "timeline" | |
| [Building Free-Text Description] | |
| {user_description} | |
| [Building Static Information] | |
| • Building Classification: {info.get('BuildingClassification', 'Unknown')} | |
| • Building Gross Area: {info.get('BuildingGrossArea', 'N/A')} sqft | |
| • Space SqFt: {info.get('SpaceSqFt', 'N/A')} | |
| • Workpoint Count: {info.get('SpaceWorkpointCount', 'N/A')} | |
| • Floor Count: {info.get('c_floor_count', 'N/A')} | |
| [Candidate Weather Variables & Suggested Building Types] | |
| # (feature_name — primary building classes where the feature is most influential) | |
| temp_mean — All | |
| temp_std — Research • Instructional • Library | |
| HDD_sum — Office • Residential • Instructional • Infrastructure | |
| CDD_sum — Office • Mixed • Residential • Recreation | |
| dewpoint_deficit_mean — Research • Health-like | |
| temp_min_C_min — Residential • Recreation • Infrastructure | |
| temp_max_C_max — Industrial-like • Recreation • Parking Structure | |
| pressure_mean / pressure_range — Research • Infrastructure | |
| humidity_mean — Office • Health/Research-like • Library | |
| humidity_std — Research • Library | |
| wind_speed_mean — Office • Infrastructure • Parking Structure | |
| wind_speed_max — Research • Infrastructure | |
| wind_gust_max — Research • Infrastructure | |
| clouds_all_mean — Office • Mixed | |
| visibility_mean — Mixed • Recreation | |
| precip_mm_sum — Instructional • Infrastructure • Recreation | |
| rain_event_sum — Instructional • Infrastructure | |
| snow_mm_sum / snow_event_sum — Infrastructure • Recreation | |
| [Tasks] | |
| 1. Please explain why a weather variable is selected or excluded in combination with static information such as "Gross Area" and "Workpoint Count". | |
| 2. Determine the **mode** ("fixed", "future", "timeline") and give **mode_reason**. Please also explain the impact of the mode in combination with the area and the number of workpoints. | |
| 3. Extract the integer **duration_months** and explain in **duration_reason** how it is derived from the description. | |
| 4. Select the **5** most relevant variables from the list of candidate weather variables. | |
| Report them under the `"weather_selection"` key as a list of objects. Each object must include: | |
| - `"variable"`: the variable name (exactly as in the candidate list) | |
| - `"reason"`: explain the importance of the variable in combination with information such as "building type, area, number of workpoints, number of floors" | |
| Example output: | |
| {{ | |
| "weather_selection": [ | |
| {{ | |
| "variable": "CDD_sum", | |
| "reason": "This office building has an area of 80,000 ft² and a high summer cooling load, so CDD_sum strongly drives electricity demand." | |
| }}, | |
| {{ | |
| "variable": "HDD_sum", | |
| "reason": "With 5 floors and moderate heating usage in winter, HDD_sum correlates with natural-gas heating energy for this building." | |
| }}, | |
| {{ | |
| "variable": "humidity_mean", | |
| "reason": "High occupancy density (200 workpoints) amplifies latent heat loads; humidity_mean affects HVAC dehumidification energy." | |
| }} | |
| ] | |
| }} | |
| [Output Format] | |
| Return **ONLY** a valid JSON object matching this schema (no markdown, no code fences, no extra text): | |
| {{ | |
| "Current Building Classification": "...", | |
| "mode": "...", | |
| "mode_reason": "...", | |
| "duration_months": ..., | |
| "duration_reason": "...", | |
| "weather_selection": [ | |
| {{ "variable": "...", "reason": "..." }}, | |
| ... | |
| ] | |
| }} | |
| ''' | |
| messages = [ | |
| {"role": "system", "content": "You are an expert in building energy forecasting and changepoint-driven weather-informed modeling."}, | |
| {"role": "user", "content": prompt} | |
| ] | |
| llm_response = chat_with_ollama(messages, model="mistral") | |
| try: | |
| analysis_data = json.loads(llm_response) | |
| st.session_state["initial_llm_analysis"] = { | |
| "llm_classification": analysis_data.get("Current Building Classification"), | |
| "mode": analysis_data.get("mode"), | |
| "mode_reason": analysis_data.get("mode_reason"), | |
| "duration_months": analysis_data.get("duration_months"), | |
| "duration_reason": analysis_data.get("duration_reason"), | |
| "weather_selection": analysis_data.get("weather_selection"), | |
| "user_description": user_description, | |
| "building_info": info | |
| } | |
| if "revised_llm_analysis" in st.session_state: | |
| del st.session_state["revised_llm_analysis"] | |
| st.success("✅ Initial LLM analysis completed!") | |
| st.session_state["start_energy_prediction"] = True | |
| st.rerun() | |
| except json.JSONDecodeError: | |
| st.error("❌ Failed to parse LLM response as JSON.") # Added period | |
| st.text(llm_response) | |
| except Exception as e: | |
| st.error(f"❌ LLM analysis failed: {str(e)}") | |
| st.warning("💡 Please make sure Ollama is running with the mistral model.") # Added period | |
| # 显示初次LLM分析结果 (如果存在) | |
| if "initial_llm_analysis" in st.session_state: | |
| _display_llm_analysis_results(st.session_state["initial_llm_analysis"], title_prefix="Initial") | |
| # Decide which analysis feeds the feedback and manual-adjustment UI | |
| current_analysis_for_feedback = None | |
| latest_analysis_type_for_prompt = "Initial analysis context" # Default context name | |
| if "revised_llm_analysis" in st.session_state: | |
| current_analysis_for_feedback = st.session_state["revised_llm_analysis"] | |
| latest_analysis_type_for_prompt = "Previously revised analysis context" # More specific | |
| elif "initial_llm_analysis" in st.session_state: | |
| current_analysis_for_feedback = st.session_state["initial_llm_analysis"] | |
| # Select the model interface based on prediction duration and mode | |
| if current_analysis_for_feedback: | |
| duration_months = current_analysis_for_feedback.get("duration_months") | |
| prediction_mode = current_analysis_for_feedback.get("mode") | |
| if duration_months is not None and prediction_mode is not None: | |
| # Dispatch on duration and mode (> 3 matches the ">3 months"/"≤3 months" messages below) | |
| if duration_months > 3: | |
| st.info("🔍 Long-term prediction detected (>3 months)") | |
| if prediction_mode == "fixed": | |
| st.info("Using Long-term Fixed Mode Model Interface") | |
| # TODO: call the long-term fixed-mode model interface | |
| pass | |
| elif prediction_mode == "future": | |
| st.info("Using Long-term Future Mode Model Interface") | |
| # TODO: call the long-term future-mode model interface | |
| pass | |
| elif prediction_mode == "timeline": | |
| st.info("Using Long-term Timeline Mode Model Interface") | |
| # TODO: call the long-term timeline-mode model interface | |
| pass | |
| else: | |
| st.info("🔍 Short-term prediction detected (≤3 months)") | |
| if prediction_mode == "fixed": | |
| st.info("Using Short-term Fixed Mode Model Interface") | |
| # TODO: call the short-term fixed-mode model interface | |
| pass | |
| elif prediction_mode == "future": | |
| st.info("Using Short-term Future Mode Model Interface") | |
| # TODO: call the short-term future-mode model interface | |
| pass | |
| elif prediction_mode == "timeline": | |
| st.info("Using Short-term Timeline Mode Model Interface") | |
| # TODO: call the short-term timeline-mode model interface | |
| pass | |
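| # --- Hedged sketch: one way the TODO branches above could dispatch to trainers. --- | |
| # Only train_fixed_model() exists in this file; every other mapping below is a | |
| # hypothetical placeholder for illustration, not an implemented interface. | |
| def _resolve_trainer_sketch(duration_months: int, prediction_mode: str): | |
| horizon = "long" if duration_months > 3 else "short" | |
| trainers = { | |
| ("long", "fixed"): train_fixed_model, # defined earlier in this file | |
| ("short", "fixed"): train_fixed_model, | |
| # ("long", "future"), ("long", "timeline"), and the short-term variants | |
| # would map to dedicated trainers once those interfaces exist. | |
| } | |
| return trainers.get((horizon, prediction_mode)) # None until implemented | |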
| # Show the current analysis results (used for feedback and manual adjustment) | |
| if current_analysis_for_feedback: | |
| st.markdown("---") | |
| st.subheader("📝 Feedback on LLM Analysis") | |
| feedback_type = st.radio( | |
| "How satisfied are you with the LLM analysis? (Feedback applies to the latest analysis shown)", | |
| ["🔄 Request revision", "✏️ Manual adjustment"], # Removed "👍 Accept recommendations" | |
| horizontal=True, key="feedback_radio", index=None | |
| ) | |
| if feedback_type == "🔄 Request revision": | |
| revision_request = st.text_area( | |
| "What would you like the LLM to reconsider?", | |
| placeholder="Example: Consider more variables related to occupancy patterns...", | |
| key="revision_text_area" | |
| ) | |
| if st.button("🔄 Revise Analysis", key="revise_button") and revision_request: | |
| with st.spinner("🧠 LLM is re-analyzing..."): | |
| try: | |
| context_user_description = current_analysis_for_feedback.get("user_description", "") | |
| context_building_info = current_analysis_for_feedback.get("building_info", {}) | |
| # ---------- Prompt for the revised LLM analysis ---------- | |
| revised_prompt = f''' | |
| [Context – Previous Analysis] | |
| User Description: | |
| {context_user_description} | |
| Building Static Information: | |
| - Building Classification: {context_building_info.get('BuildingClassification', 'Unknown')} | |
| - Building Gross Area: {context_building_info.get('BuildingGrossArea', 'N/A')} sqft | |
| - Space SqFt: {context_building_info.get('SpaceSqFt', 'N/A')} | |
| - Workpoint Count: {context_building_info.get('SpaceWorkpointCount', 'N/A')} | |
| - Floor Count: {context_building_info.get('c_floor_count', 'N/A')} | |
| [Candidate Weather Variables & Suggested Building Types] | |
| # (feature_name — primary building classes where the feature is most influential) | |
| temp_mean — All | |
| temp_std — Research • Instructional • Library | |
| HDD_sum — Office • Residential • Instructional • Infrastructure | |
| CDD_sum — Office • Mixed • Residential • Recreation | |
| dewpoint_deficit_mean — Research • Health-like | |
| temp_min_C_min — Residential • Recreation • Infrastructure | |
| temp_max_C_max — Industrial-like • Recreation • Parking Structure | |
| pressure_mean / pressure_range — Research • Infrastructure | |
| humidity_mean — Office • Health/Research-like • Library | |
| humidity_std — Research • Library | |
| wind_speed_mean — Office • Infrastructure • Parking Structure | |
| wind_speed_max — Research • Infrastructure | |
| wind_gust_max — Research • Infrastructure | |
| clouds_all_mean — Office • Mixed | |
| visibility_mean — Mixed • Recreation | |
| precip_mm_sum — Instructional • Infrastructure • Recreation | |
| rain_event_sum — Instructional • Infrastructure | |
| snow_mm_sum / snow_event_sum — Infrastructure • Recreation | |
| [Examples of Mode Selection] (IGNORE these when analyzing.) | |
| • fixed example: | |
| "The building is an office and will remain an office for the next 24 months …" → mode: "fixed" | |
| • future example: | |
| "This warehouse will be converted into a data center starting 9 months from now …" → mode: "future" | |
| • timeline example: | |
| "For the first 6 months the university building will be used for lectures … then 12 months as a laboratory …" → mode: "timeline" | |
| [User Revision Request] | |
| {revision_request} | |
| [Tasks] | |
| 1. Please explain why a weather variable is selected or excluded in combination with static information such as "Gross Area" and "Workpoint Count". | |
| 2. Determine the **mode** ("fixed", "future", "timeline") and give **mode_reason**. Please also explain the impact of the mode in combination with the area and the number of workpoints. | |
| 3. Extract the integer **duration_months** and explain in **duration_reason** how it is derived from the description. | |
| 4. Select the **5** most relevant variables from the list of candidate weather variables. | |
| Report them under the `"weather_selection"` key as a list of objects. Each object must include: | |
| - `"variable"`: the variable name (exactly as in the candidate list) | |
| - `"reason"`: explain the importance of the variable in combination with information such as "building type, area, number of workpoints, number of floors" | |
| Example output: | |
| {{ | |
| "weather_selection": [ | |
| {{ | |
| "variable": "CDD_sum", | |
| "reason": "This office building has an area of 80,000 ft² and a high summer cooling load, so CDD_sum strongly drives electricity demand." | |
| }}, | |
| {{ | |
| "variable": "HDD_sum", | |
| "reason": "With 5 floors and moderate heating usage in winter, HDD_sum correlates with natural-gas heating energy for this building." | |
| }}, | |
| {{ | |
| "variable": "humidity_mean", | |
| "reason": "High occupancy density (200 workpoints) amplifies latent heat loads; humidity_mean affects HVAC dehumidification energy." | |
| }} | |
| ] | |
| }} | |
| Return **ONLY** the JSON object below (no markdown, no extra text): | |
| {{ | |
| "Current Building Classification": "...", | |
| "mode": "...", | |
| "mode_reason": "...", | |
| "duration_months": ..., | |
| "duration_reason": "...", | |
| "weather_selection": [ | |
| {{ "variable": "...", "reason": "..." }}, | |
| ... | |
| ] | |
| }} | |
| ''' | |
| messages = [ | |
| {"role": "system", "content": "You are an expert in building energy forecasting and changepoint-driven weather-informed modeling, tasked with revising a previous analysis based on user feedback."}, | |
| {"role": "user", "content": revised_prompt} | |
| ] | |
| llm_response = chat_with_ollama(messages, model="mistral") | |
| try: | |
| revised_analysis_data = json.loads(llm_response) | |
| st.session_state["revised_llm_analysis"] = { | |
| "llm_classification": revised_analysis_data.get("Current Building Classification"), | |
| "mode": revised_analysis_data.get("mode"), | |
| "mode_reason": revised_analysis_data.get("mode_reason"), | |
| "duration_months": revised_analysis_data.get("duration_months"), | |
| "duration_reason": revised_analysis_data.get("duration_reason"), | |
| "weather_selection": revised_analysis_data.get("weather_selection"), | |
| "user_description": context_user_description, | |
| "building_info": context_building_info, | |
| "revision_request_applied": revision_request | |
| } | |
| st.success("✅ LLM re-analysis completed!") | |
| st.rerun() | |
| except json.JSONDecodeError: | |
| st.error("❌ Failed to parse revised LLM response as JSON.") # Added period | |
| st.text(llm_response) | |
| except Exception as e: | |
| st.error(f"❌ LLM re-analysis failed: {str(e)}") | |
| st.warning("💡 Please make sure Ollama is running.") | |
| elif feedback_type == "✏️ Manual adjustment": | |
| st.write("**Manually adjust weather variables (applies to the latest analysis shown):**") | |
| available_vars = [ | |
| "temp_mean", "temp_std", "HDD_sum", "CDD_sum", "dewpoint_deficit_mean", | |
| "temp_min_month", "temp_max_month", "pressure_mean", "pressure_max", "pressure_min", | |
| "humidity_mean", "humidity_std", "wind_speed_mean", "wind_speed_max", "wind_gust_max", | |
| "clouds_all_mean", "visibility_mean", "precip_mm_sum", "rain_event_sum", | |
| "snow_mm_sum", "snow_event_sum" | |
| ] | |
| default_selection = [] | |
| if current_analysis_for_feedback and isinstance(current_analysis_for_feedback.get("weather_selection"), list): | |
| default_selection = [ | |
| item.get("variable") | |
| for item in current_analysis_for_feedback["weather_selection"] | |
| if isinstance(item, dict) and "variable" in item | |
| ] | |
| manual_vars = st.multiselect( | |
| "Select weather variables:", available_vars, default=default_selection, key="manual_vars_multiselect" | |
| ) | |
| if st.button("💾 Save Manual Selection", key="save_manual_weather_button"): | |
| target_analysis_key = "revised_llm_analysis" if "revised_llm_analysis" in st.session_state else "initial_llm_analysis" | |
| if target_analysis_key in st.session_state: | |
| current_weather_selection = st.session_state[target_analysis_key].get("weather_selection", []) | |
| if not isinstance(current_weather_selection, list): | |
| current_weather_selection = [] # Initialize if not list or None | |
| new_selection = [] | |
| current_selection_map = {item.get("variable"): item.get("reason", "Manually added/reason not provided") | |
| for item in current_weather_selection if isinstance(item, dict)} | |
| for var_name in manual_vars: | |
| new_selection.append({ | |
| "variable": var_name, | |
| "reason": current_selection_map.get(var_name, "Manually selected/reason not specified") | |
| }) | |
| st.session_state[target_analysis_key]["weather_selection"] = new_selection | |
| st.session_state[target_analysis_key]["manual_adjustment_applied"] = True | |
| st.success(f"✅ Updated weather variables for {target_analysis_key.replace('_llm_analysis','')} analysis.") | |
| st.rerun() | |
| else: | |
| st.warning("No analysis found to apply manual adjustments to.") | |
| # Show the revised LLM analysis results (if present) | |
| if "revised_llm_analysis" in st.session_state: | |
| _display_llm_analysis_results(st.session_state["revised_llm_analysis"], title_prefix="Revised") | |
| # Prediction options and the remaining UI | |
| # —— at the same level as _display_llm_analysis_results above —— | |
| if current_analysis_for_feedback: | |
| st.markdown("---") | |
| st.subheader("🔮 Energy Prediction (based on latest analysis)") | |
| # --- NEW SECTION 1: Static Explanation --- | |
| st.subheader("🌡️ Weather Sampling Strategy Details") | |
| st.markdown(""" | |
| Our weather sampling strategy, as implemented in the `kde_or_normal_sample` function, adapts to the amount of historical data available for each selected weather variable and the specific target month for future predictions: | |
| - **No Data for Target Month (0 samples):** | |
| - If neighboring months (within the configured ± window, e.g., ±1 or ±2 months) have data, their mean is used. | |
| - If neighboring months also lack data, the mean of all historical data for that variable across all months is used. | |
| - If no historical data exists at all for the variable, the result will be NaN (Not a Number). | |
| - **Less than 20 samples (for target month):** The mean of all historical data for that variable (across all months) is used. This provides a stable, albeit general, estimate when specific monthly data is sparse. | |
| - **20 to 49 samples (for target month):** A value is sampled from a Normal (Gaussian) distribution. The distribution's mean (μ) and standard deviation (σ) are derived from the historical data of the target month. | |
| - If the target month's standard deviation is zero (e.g., all values are the same), the standard deviation of all historical data for that variable (across all months) is used instead. | |
| - If that overall standard deviation is also zero, the mean of the target month is returned directly (as sampling from a Normal distribution with σ=0 is just the mean). | |
| - **50 to 99 samples (for target month):** A mixed Kernel Density Estimation (KDE) strategy is employed. This attempts to capture more nuanced distributions than a simple Normal fit. | |
| - There's a 70% chance of sampling from a KDE built using data specifically from the target month. | |
| - There's a 30% chance of sampling from a KDE built using data from neighboring months (as defined by the ± window configuration). This is only done if the combined data from neighboring months has at least 20 samples; otherwise, if the target month itself has data, its KDE is used for this 30% chance as well. | |
| - If KDE calculations fail (e.g., due to insufficient unique data points for KDE), the strategy falls back to the Normal distribution method described for 20-49 samples. | |
| - **100 or more samples (for target month):** A value is sampled directly from a KDE built using data from the target month. This is preferred when ample data exists for a robust density estimation. | |
| - If KDE calculations fail, it falls back to the Normal distribution method. | |
| The 'samples for target month' refers to the number of non-missing historical data points available for a specific variable in a specific month of the year (e.g., all historical January 'temp_mean' values). | |
| The "Avg Samples" displayed in the table below are averages of these monthly sample counts across all 12 months. | |
| The "Window Size" configuration (for variables with 50-99 average monthly samples) directly impacts the "neighboring months" data used in the mixed KDE strategy. | |
| """) | |
| # 🔧 Check weather-variable sample sizes and offer sliding-window selection | |
| if "weather_window_config" not in st.session_state: | |
| st.session_state["weather_window_config"] = {} | |
| # Collect the currently selected weather features | |
| current_weather_features = [] | |
| if "revised_llm_analysis" in st.session_state: | |
| weather_selection = st.session_state["revised_llm_analysis"].get("weather_selection", []) | |
| current_weather_features = [item["variable"] for item in weather_selection if "variable" in item] | |
| elif "initial_llm_analysis" in st.session_state: | |
| weather_selection = st.session_state["initial_llm_analysis"].get("weather_selection", []) | |
| current_weather_features = [item["variable"] for item in weather_selection if "variable" in item] | |
| # If weather features are selected, check their sample sizes | |
| weather_window_needed = False | |
| sample_analysis = [] | |
| if current_weather_features and selected_building and usage_df is not None: | |
| building_data = usage_df[usage_df["BuildingName"] == selected_building].copy() | |
| if not building_data.empty: | |
| building_data["StartDate"] = pd.to_datetime(building_data["StartDate"]) | |
| building_data["month"] = building_data["StartDate"].dt.month | |
| for var in current_weather_features: | |
| if var in building_data.columns: | |
| # Count the samples available for each calendar month | |
| month_counts = {} | |
| for month in range(1, 13): | |
| month_data = building_data[building_data["month"] == month][var].dropna() | |
| month_counts[month] = len(month_data) | |
| avg_samples = np.mean(list(month_counts.values())) | |
| min_samples = min(month_counts.values()) | |
| max_samples = max(month_counts.values()) | |
| # Decide whether sliding-window selection is needed | |
| needs_window = 50 <= avg_samples < 100 | |
| if needs_window: | |
| weather_window_needed = True | |
| sample_analysis.append({ | |
| "Variable": var, | |
| "Avg Samples": avg_samples, | |
| "Min-Max": f"{min_samples}-{max_samples}", | |
| "Needs Window Selection": needs_window | |
| }) | |
| # --- NEW SECTION 2: Dynamic Preview (after sample_analysis is computed) --- | |
| st.markdown("**Current Strategy Preview (based on *average* monthly samples):**") | |
| if not sample_analysis: | |
| st.info("No weather variables selected or data available to preview strategy based on average samples.") | |
| else: | |
| for item in sample_analysis: | |
| var_name = item["Variable"] | |
| avg_samples = item["Avg Samples"] | |
| strategy_desc = "" | |
| # This is a simplified interpretation for the preview based on AVERAGE samples. | |
| # The actual kde_or_normal_sample function uses target_month specific counts. | |
| if avg_samples == 0: # Approximating that if average is 0, target month is likely 0 | |
| strategy_desc = "If target month has 0 samples: Mean of neighbors/all history." | |
| elif avg_samples < 20: | |
| strategy_desc = "If target month has <20 samples: Mean of all historical data." | |
| elif avg_samples < 50: # 20 <= avg_samples < 50 | |
| strategy_desc = "If target month has 20-49 samples: Normal distribution." | |
| elif avg_samples < 100: # 50 <= avg_samples < 100 | |
| strategy_desc = "If target month has 50-99 samples: Mixed KDE." | |
| else: # avg_samples >= 100 | |
| strategy_desc = "If target month has ≥100 samples: Direct KDE." | |
| st.markdown(f"- **{var_name}**: Avg. {avg_samples:.1f} samples/month. Likely strategy for a typical month: *{strategy_desc}*") | |
| # --- Sample-analysis table --- | |
| st.write("### 🌡️ Weather Variable Sample Analysis") | |
| if sample_analysis: | |
| sample_df = pd.DataFrame(sample_analysis) | |
| st.dataframe(sample_df, use_container_width=True) | |
| else: | |
| st.info("No weather variable sample analysis to display (no variables selected or data available).") | |
| # If sliding-window selection is needed, show the selection UI | |
| if weather_window_needed and "window_selection_done" not in st.session_state: | |
| st.warning("⚠️ Some weather variables have sample sizes between 50-100. Please select sliding window sizes for better sampling:") | |
| with st.form("weather_window_form"): | |
| st.write("**Select sliding window for each weather variable:**") | |
| st.caption("Window size determines how many neighboring months to include in the sampling process.") | |
| window_configs = {} | |
| for item in sample_analysis: | |
| if item["Needs Window Selection"]: | |
| var_name = item["Variable"] | |
| avg_samples = item["Avg Samples"] | |
| col1, col2 = st.columns([2, 1]) | |
| with col1: | |
| st.write(f"**{var_name}** (avg {avg_samples:.0f} samples/month)") | |
| with col2: | |
| window_size = st.select_slider( | |
| f"Window for {var_name}", | |
| options=[1, 2, 3], | |
| value=2, | |
| key=f"window_{var_name}", | |
| help=f"1 = current month only, 2 = ±1 month, 3 = ±2 months" | |
| ) | |
| window_configs[var_name] = window_size | |
| submitted = st.form_submit_button("✅ Confirm Window Selection") | |
| if submitted: | |
| # Persist the window configuration | |
| st.session_state["weather_window_config"] = window_configs | |
| st.session_state["window_selection_done"] = True | |
| st.success("✅ Window configuration saved!") | |
| st.rerun() | |
| # Show the current window configuration (if set) | |
| if st.session_state.get("weather_window_config") and weather_window_needed: | |
| with st.expander("📋 Current Window Configuration", expanded=False): | |
| config_df = pd.DataFrame([ | |
| {"Variable": k, "Window Size": f"±{v-1} months"} | |
| for k, v in st.session_state["weather_window_config"].items() | |
| ]) | |
| st.dataframe(config_df, use_container_width=True) | |
| if st.button("🔄 Reset Window Configuration"): | |
| if "weather_window_config" in st.session_state: | |
| del st.session_state["weather_window_config"] | |
| if "window_selection_done" in st.session_state: | |
| del st.session_state["window_selection_done"] | |
| st.rerun() | |
| # --- User choice for Target Variable --- | |
| st.markdown("---") # Visual separator | |
| st.subheader("🎯 Target Variable for Modeling") | |
| target_use_choice = st.radio( | |
| "Select the target 'Use' column for training and prediction:", | |
| ('Original Use', 'FilledUse (from Changepoint Preprocessing)'), | |
| index=0, # Default to 'Original Use' | |
| key='target_use_choice', | |
| horizontal=True, | |
| help="Choose 'FilledUse' if you believe the preprocessed (filled) data from the changepoint detection step better represents the true consumption pattern for modeling." | |
| ) | |
| # Show the prediction button only when window selection is unnecessary or already completed | |
| if not weather_window_needed or st.session_state.get("window_selection_done", False): | |
| if st.button("Generate Predictions", key="generate_predictions_button"): | |
| # --- Centralized Data Preparation based on User Choice --- | |
| sel_building = st.session_state.get("selected_building") | |
| selected_utility = st.session_state.get("selected_utility") # CommodityCode | |
| if not sel_building or not selected_utility: | |
| st.error("❌ Building or Utility not selected. Please make selections in the sidebar.") | |
| st.stop() | |
| # 1. Start with a copy of the primary data source (contains original 'Use' and all raw features) | |
| df_source_for_modeling = st.session_state.get("df_merged_with_features") | |
| if df_source_for_modeling is None or df_source_for_modeling.empty: | |
| st.error("❌ Main data ('df_merged_with_features') is not available. Please ensure data is loaded and preprocessed.") | |
| st.stop() | |
| df_for_modeling = df_source_for_modeling.copy() # IMPORTANT: Work on a copy | |
| # Ensure 'StartDate' is datetime for potential merges and consistent processing | |
| if 'StartDate' not in df_for_modeling.columns: | |
| st.error("❌ 'StartDate' column is missing from the main data source.") | |
| st.stop() | |
| df_for_modeling['StartDate'] = pd.to_datetime(df_for_modeling['StartDate']) | |
| # 2. Get the user's choice for the target 'Use' column | |
| chosen_target_source = st.session_state.get("target_use_choice", "Original Use") | |
| if chosen_target_source == "FilledUse (from Changepoint Preprocessing)": | |
| st.info("🎯 Using 'FilledUse' as the target variable for modeling.") | |
| if "filled" not in st.session_state or st.session_state["filled"].empty: | |
| st.error("❌ 'FilledUse' data (from st.session_state['filled']) is not available. " | |
| "This data is generated during changepoint detection. " | |
| "Please run changepoint detection and credibility analysis first. " | |
| "Using 'Original Use' as fallback.") | |
| # No changes to df_for_modeling['Use'], it remains original | |
| else: | |
| filled_data_for_merge = st.session_state["filled"].copy() | |
| if 'Date' not in filled_data_for_merge.columns: | |
| st.error("❌ 'Date' column not found in 'filled' data. Cannot merge 'FilledUse'. Using original 'Use'.") | |
| elif 'BuildingName' not in filled_data_for_merge.columns: | |
| st.error("❌ 'BuildingName' column not found in 'filled' data. Cannot merge 'FilledUse'. Using original 'Use'.") | |
| elif 'CommodityCode' not in filled_data_for_merge.columns: | |
| st.error("❌ 'CommodityCode' column not found in 'filled' data. Cannot merge 'FilledUse'. Using original 'Use'.") | |
| elif 'FilledUse' not in filled_data_for_merge.columns: | |
| st.error("❌ 'FilledUse' column not found in 'filled' data. Cannot merge. Using original 'Use'.") | |
| else: | |
| # Ensure correct datetime type for merging key | |
| filled_data_for_merge['Date_for_merge'] = pd.to_datetime(filled_data_for_merge['Date']) | |
| # Select only necessary columns for the merge to avoid duplicate columns from 'filled' | |
| filled_data_to_join = filled_data_for_merge[['BuildingName', 'CommodityCode', 'Date_for_merge', 'FilledUse']] | |
| # Store original 'Use' before merge to handle non-matches correctly | |
| original_use_series = df_for_modeling['Use'].copy() | |
| # Perform the merge | |
| df_for_modeling = pd.merge( | |
| df_for_modeling, | |
| filled_data_to_join, | |
| left_on=['BuildingName', 'CommodityCode', 'StartDate'], | |
| right_on=['BuildingName', 'CommodityCode', 'Date_for_merge'], | |
| how='left' | |
| ) | |
| # Update the 'Use' column: if 'FilledUse' is NaN (no match), revert to original 'Use' for that row | |
| if 'FilledUse' in df_for_modeling.columns: | |
| df_for_modeling['Use'] = df_for_modeling['FilledUse'].fillna(original_use_series) | |
| # Clean up columns added from the merge | |
| df_for_modeling = df_for_modeling.drop(columns=['Date_for_merge', 'FilledUse']) | |
| st.success("Successfully merged 'FilledUse' as the target 'Use' column.") | |
| else: | |
| # This case should ideally not be reached if preliminary checks pass, | |
| # but as a safeguard: | |
| st.warning("⚠️ 'FilledUse' column was expected but not found after merge. " | |
| "Reverting to original 'Use' values.") | |
| df_for_modeling['Use'] = original_use_series # Ensure 'Use' is the original series | |
| elif chosen_target_source == "Original Use": | |
| st.info("🎯 Using original 'Use' as the target variable for modeling.") | |
| # No change needed for df_for_modeling['Use'] as it's already the original 'Use'. | |
| else: # Should not happen with st.radio due to default | |
| st.error(f"❌ Unknown target_use_choice: {chosen_target_source}. Defaulting to 'Original Use'.") | |
| # df_for_modeling['Use'] remains original | |
| # 3. Proceed with filtering based on LLM/Original Classification (This part of your logic can remain similar) | |
| def _get_llm_cls(): # Your existing helper function | |
| if "revised_llm_analysis" in st.session_state: | |
| return st.session_state["revised_llm_analysis"].get("llm_classification") | |
| if "initial_llm_analysis" in st.session_state: | |
| return st.session_state["initial_llm_analysis"].get("llm_classification") | |
| return None | |
| current_row_for_info = df_for_modeling[df_for_modeling["BuildingName"] == sel_building] | |
| if current_row_for_info.empty: # Should be caught by df_for_modeling check, but good to have | |
| st.error(f"No data for building '{sel_building}' in the prepared modeling data.") | |
| st.stop() | |
| orig_cls = current_row_for_info["BuildingClassification"].iloc[0] if "BuildingClassification" in current_row_for_info else "Unknown" | |
| llm_cls = _get_llm_cls() | |
| cls_for_filter = llm_cls or orig_cls | |
| if llm_cls: | |
| st.write(f"Filtering by LLM classification: **{llm_cls}**") | |
| else: | |
| st.write(f"Filtering by original classification: **{orig_cls}**") | |
| # Filter based on classification and commodity | |
| # Ensure 'BuildingClassification' exists before filtering | |
| if "BuildingClassification" not in df_for_modeling.columns: | |
| st.error("❌ 'BuildingClassification' column missing from modeling data. Cannot filter.") | |
| st.stop() | |
| filtered_for_model = df_for_modeling[ | |
| (df_for_modeling["BuildingClassification"].astype(str).str.strip() == str(cls_for_filter).strip()) & | |
| (df_for_modeling["CommodityCode"] == selected_utility) | |
| ] | |
| if filtered_for_model.empty and llm_cls: | |
| st.warning(f"⚠️ No data found for LLM classification '{llm_cls}'. Falling back to original classification '{orig_cls}'.") | |
| filtered_for_model = df_for_modeling[ | |
| (df_for_modeling["BuildingClassification"].astype(str).str.strip() == str(orig_cls).strip()) & | |
| (df_for_modeling["CommodityCode"] == selected_utility) | |
| ] | |
| if filtered_for_model.empty: | |
| st.error("❌ Cannot train model: No data found for the selected classification & commodity combination, even after fallback.") | |
| st.stop() | |
| # 4. Extract base columns + weather features (Your existing logic) | |
| base_columns = [ # Keep 'Use' here as it's now the chosen target | |
| 'BuildingName', 'Use', 'StartDate', 'SpaceSqFt', 'SpaceWorkpointCount', | |
| 'c_floor_count', 'BuildingLifeCycleStage', 'holidaycount', 'BuildingGrossArea' # BuildingGrossArea is consumed by basic_features below | |
| ] | |
| # Get selected weather features | |
| if "revised_llm_analysis" in st.session_state: | |
| manual_weather_features = [item["variable"] for item in st.session_state["revised_llm_analysis"].get("weather_selection", []) if "variable" in item] | |
| elif "initial_llm_analysis" in st.session_state: | |
| manual_weather_features = [item["variable"] for item in st.session_state["initial_llm_analysis"].get("weather_selection", []) if "variable" in item] | |
| else: | |
| manual_weather_features = [] | |
| all_required_columns_for_model = list(set(base_columns + manual_weather_features)) # Use set to avoid duplicates if 'Use' was in manual_weather_features by mistake | |
| # Check column existence in 'filtered_for_model' | |
| missing_model_cols = [col for col in all_required_columns_for_model if col not in filtered_for_model.columns] | |
| if missing_model_cols: | |
| st.error(f"❌ The following required columns for the model are missing from the filtered data: {', '.join(missing_model_cols)}. " | |
| f"Available columns: {filtered_for_model.columns.tolist()}") | |
| st.stop() | |
| final_extracted_data = filtered_for_model[all_required_columns_for_model].copy() # Work with a copy for feature engineering | |
| # === Feature Engineering: Time features === | |
| # 'StartDate' is already pd.to_datetime | |
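| # Cyclical month encoding keeps December (12) and January (1) adjacent in feature space. | |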
| final_extracted_data['month'] = final_extracted_data['StartDate'].dt.month | |
| final_extracted_data['month_sin'] = np.sin(2 * np.pi * final_extracted_data['month'] / 12) | |
| final_extracted_data['month_cos'] = np.cos(2 * np.pi * final_extracted_data['month'] / 12) | |
| final_extracted_data = final_extracted_data.drop(columns=['month']) | |
| final_extracted_data = final_extracted_data.sort_values("StartDate") | |
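| # Approximate monthly time index: days since the earliest record, bucketed into 30-day steps. | |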
| if not final_extracted_data.empty: # Ensure not empty before min() | |
| final_extracted_data['time_index'] = \ | |
| ((final_extracted_data['StartDate'] - final_extracted_data['StartDate'].min()).dt.days // 30) | |
| else: | |
| final_extracted_data['time_index'] = pd.Series(dtype='int') | |
| st.write("### Prepared Data for Model Input (with chosen 'Use' and time features)") | |
| st.dataframe(final_extracted_data.head()) | |
| if final_extracted_data.empty: | |
| st.error("❌ No data available after all preparation steps for modeling.") | |
| st.stop() | |
| st.success(f"Successfully prepared {len(final_extracted_data)} records for modeling.") | |
| # --- Split into training and prediction sets --- | |
| # final_extracted_data is the single source for both train_df and pred_df; | |
| # basic_features below must come from its columns (excluding 'Use', 'StartDate', 'BuildingName'). | |
| train_df = final_extracted_data[final_extracted_data["BuildingName"] != sel_building].copy() | |
| pred_df = final_extracted_data[final_extracted_data["BuildingName"] == sel_building].copy() | |
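| # Leave-one-building-out split: train on peer buildings of the same class, predict the selected one. | |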
| if train_df.empty or pred_df.empty: | |
| st.error("❌ Training or prediction data insufficient after splitting. Check filter conditions and data for selected building vs. others.") | |
| st.stop() | |
| # Redefine basic_features based on what's truly available in final_extracted_data (excluding target, IDs, and date) | |
| # and what is intended to be a "basic" non-weather feature. | |
| basic_features = [ | |
| "SpaceSqFt", "SpaceWorkpointCount", "c_floor_count", 'BuildingGrossArea', | |
| "BuildingLifeCycleStage", "holidaycount", | |
| "month_sin", "month_cos", "time_index", # These are now part of the core dataset | |
| ] | |
| # Ensure all basic_features are in final_extracted_data.columns | |
| actual_basic_features = [f for f in basic_features if f in final_extracted_data.columns] | |
| missing_basic = [f for f in basic_features if f not in actual_basic_features] | |
| if missing_basic: | |
| st.warning(f"⚠️ Some defined 'basic_features' were not found in the final data and will be excluded: {missing_basic}") | |
| actual_weather_features = [f for f in manual_weather_features if f in final_extracted_data.columns] | |
| missing_weather = [f for f in manual_weather_features if f not in actual_weather_features] | |
| if missing_weather: | |
| st.warning(f"⚠️ Some selected 'weather_features' were not found in the final data and will be excluded: {missing_weather}") | |
| feature_cols = actual_basic_features + actual_weather_features | |
| # Remove 'Use', 'StartDate', 'BuildingName' if they accidentally got into feature_cols | |
| feature_cols = [f for f in feature_cols if f not in ['Use', 'StartDate', 'BuildingName']] | |
| feature_cols = list(dict.fromkeys(feature_cols)) # Remove duplicates while preserving order | |
| st.write("Actual feature columns for model:", feature_cols) | |
| # ---- Feature scaling (guard against missing columns) ---- | |
| # Log-transform (np.log1p) "SpaceSqFt" and "BuildingGrossArea" to reduce right skew | |
| cols_to_scale_basic = ["SpaceSqFt", "BuildingGrossArea"] | |
| actual_cols_to_scale_basic = [col for col in cols_to_scale_basic if col in train_df.columns and col in pred_df.columns] | |
| if actual_cols_to_scale_basic: | |
| train_df[actual_cols_to_scale_basic] = np.log1p(train_df[actual_cols_to_scale_basic]) | |
| pred_df[actual_cols_to_scale_basic] = np.log1p(pred_df[actual_cols_to_scale_basic]) | |
| else: | |
| st.warning(f"Columns for basic scaling ({cols_to_scale_basic}) not all found in train/pred DFs.") | |
| # Standardize "HDD_sum" and "CDD_sum" if they are in weather_features | |
| weather_features_to_scale_specific = [f for f in ['HDD_sum', 'CDD_sum'] if f in actual_weather_features] | |
| actual_weather_features_to_scale_specific = [col for col in weather_features_to_scale_specific if col in train_df.columns and col in pred_df.columns] | |
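| # Degree-day sums are non-negative and often right-skewed; log1p compresses large values and maps 0 to 0. | |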
| if actual_weather_features_to_scale_specific: | |
| train_df[actual_weather_features_to_scale_specific] = np.log1p(train_df[actual_weather_features_to_scale_specific]) | |
| pred_df[actual_weather_features_to_scale_specific] = np.log1p(pred_df[actual_weather_features_to_scale_specific]) | |
| # ---- Prepare training data ---- | |
| # Ensure 'Use' and 'StartDate' are present for this step | |
| required_for_input_df = ["StartDate", "Use"] + feature_cols | |
| actual_cols_for_input_df = [col for col in required_for_input_df if col in train_df.columns] | |
| train_input_df = train_df[actual_cols_for_input_df].copy() | |
| # 'StartDate' is already datetime | |
| for col in feature_cols: # Iterate only over actual feature_cols | |
| if col in train_input_df.columns and train_input_df[col].dtype == "object": | |
| train_input_df[col] = train_input_df[col].astype("category").cat.codes | |
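| # NOTE: .cat.codes is assigned per-frame, so the same category can map to different codes | |
| # in the training and prediction frames if their category sets differ. | |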
| train_input_df = train_input_df.reset_index(drop=True) | |
| # ---- Prepare the last-known frame for the selected building ---- | |
| actual_cols_for_last_known_df = [col for col in required_for_input_df if col in pred_df.columns] | |
| last_known_df = pred_df[actual_cols_for_last_known_df].copy() | |
| # 'StartDate' is already datetime | |
| for col in feature_cols: # Iterate only over actual feature_cols | |
| if col in last_known_df.columns and last_known_df[col].dtype == "object": | |
| last_known_df[col] = last_known_df[col].astype("category").cat.codes | |
| last_known_df = last_known_df.reset_index(drop=True) | |
| # ---- Train model ---- | |
| # Retrieve duration_months from the correct analysis state | |
| duration_months = None | |
| analysis_source_for_duration = st.session_state.get("revised_llm_analysis", st.session_state.get("initial_llm_analysis")) | |
| if analysis_source_for_duration: | |
| duration_months = analysis_source_for_duration.get("duration_months") | |
| if duration_months is None: | |
| st.error("❌ Prediction duration (duration_months) not found in LLM analysis. Cannot train model.") | |
| st.stop() | |
| if not isinstance(duration_months, int) or duration_months <= 0: | |
| st.error(f"❌ Invalid prediction duration: {duration_months}. Must be a positive integer.") | |
| st.stop() | |
| if train_input_df.empty or 'Use' not in train_input_df.columns or len(feature_cols) == 0: | |
| st.error("❌ Training input data is empty or critical columns ('Use', features) are missing. Cannot train model.") | |
| st.stop() | |
| st.write(f"Training model with {len(train_input_df)} samples and {len(feature_cols)} features.") | |
| st.dataframe(train_input_df.head()) | |
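| # train_fixed_model (defined above) tunes XGBoost hyperparameters with Optuna and returns the best model and the study. | |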
| best_model, study = train_fixed_model( | |
| df=train_input_df, # train_input_df already has 'Use' and features | |
| duration_months=duration_months, | |
| n_trials=100, # Number of Optuna trials; reduce for faster runs | |
| early_stopping_rounds=50, | |
| ) | |
| # ---- Forecast ---- | |
| weather_windows_config = st.session_state.get("weather_window_config", {}) | |
| weather_windows = {col: weather_windows_config.get(col, 2) for col in actual_weather_features} # default window = 2 when unconfigured | |
| if last_known_df.empty: | |
| st.error("❌ Last known data for prediction is empty. Cannot forecast.") | |
| st.stop() | |
| future_df = recursive_forecast_with_weather_sampling( | |
| model=best_model, | |
| last_known_df=last_known_df, # last_known_df also has 'Use' and features | |
| forecast_horizon=duration_months, | |
| best_params=study.best_params, | |
| weather_history=df_source_for_modeling[df_source_for_modeling["BuildingName"] == sel_building], # Use original df_source_for_modeling for weather history | |
| weather_features=actual_weather_features, # Use actual_weather_features | |
| weather_windows=weather_windows, | |
| enable_weather_sampling=True | |
| ) | |
| # ---- Visualise ---- | |
| # Use pred_df (which has the chosen 'Use' column) for historical actuals in the plot | |
| st.subheader(f"🔮 Energy Usage Forecast for {selected_building} (Target: {chosen_target_source})") | |
| historical_data_for_plot = pred_df[["StartDate", "Use"]].copy() # 'Use' here is the one chosen by the user | |
| historical_data_for_plot.columns = ["Date", "ActualUse"] | |
| historical_data_for_plot["Type"] = "Historical" | |
| forecast_data_for_plot = future_df.copy() | |
| if not forecast_data_for_plot.empty: | |
| # Ensure column names from recursive_forecast_with_weather_sampling are consistent | |
| # It returns ["Date", "PredictedUse"] | |
| forecast_data_for_plot.columns = ["Date", "ForecastUse"] | |
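| # Positional rename: assumes the forecast frame has exactly two columns (date, prediction) in this order. | |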
| forecast_data_for_plot["Type"] = "Forecast" | |
| # Combine for plotting (ActualUse vs ForecastUse) | |
| # Need to align column names for y-axis | |
| plot_data_hist = historical_data_for_plot.rename(columns={"ActualUse": "EnergyUsage"}) | |
| plot_data_fcst = forecast_data_for_plot.rename(columns={"ForecastUse": "EnergyUsage"}) | |
| combined_data = pd.concat([plot_data_hist, plot_data_fcst], ignore_index=True) | |
| combined_data["Date"] = pd.to_datetime(combined_data["Date"]) | |
| # Altair chart: layered historical and forecast lines, both on the shared 'EnergyUsage' axis. | |
| historical_line = alt.Chart(combined_data[combined_data["Type"] == "Historical"]).mark_line( | |
| color='steelblue', strokeWidth=2 | |
| ).encode( | |
| x=alt.X('Date:T', title='Date'), | |
| y=alt.Y('EnergyUsage:Q', title=f'Energy Usage ({chosen_target_source})'), | |
| tooltip=['Date:T', 'EnergyUsage:Q'] | |
| ) | |
| # Prepare data for the connecting line and the solid forecast line | |
| if not historical_data_for_plot.empty and not forecast_data_for_plot.empty: | |
| last_hist_point = historical_data_for_plot.iloc[[-1]].rename(columns={"ActualUse": "EnergyUsage"}) | |
| first_fcst_point = forecast_data_for_plot.iloc[[0]].rename(columns={"ForecastUse": "EnergyUsage"}) | |
| # Data for the connecting line segment | |
| connecting_line_data = pd.concat([ | |
| last_hist_point[['Date', 'EnergyUsage']], | |
| first_fcst_point[['Date', 'EnergyUsage']] | |
| ]).reset_index(drop=True) | |
| connecting_line_chart = alt.Chart(connecting_line_data).mark_line( | |
| color='red', strokeWidth=2 | |
| ).encode( | |
| x='Date:T', | |
| y='EnergyUsage:Q' | |
| ) | |
| # Main forecast line (starts from the first forecast point) | |
| # combined_data already exposes the forecast values under 'EnergyUsage'; filter it by Type | |
| forecast_plot_points = combined_data[combined_data["Type"] == "Forecast"] | |
| forecast_line = alt.Chart(forecast_plot_points).mark_line( | |
| color='red', strokeWidth=2 # solid red to match the connecting segment | |
| ).encode( | |
| x='Date:T', | |
| y='EnergyUsage:Q', | |
| tooltip=['Date:T', 'EnergyUsage:Q'] | |
| ) | |
| # Add points, divider, etc. as before | |
| last_historical_date = historical_data_for_plot["Date"].max() | |
| divider = alt.Chart(pd.DataFrame({'Date': [last_historical_date]})).mark_rule( | |
| color='gray', strokeDash=[3,3], opacity=0.5 | |
| ).encode(x='Date:T') | |
| chart = (historical_line + forecast_line + connecting_line_chart + divider).properties( | |
| width=800, height=400, | |
| title=f"{selected_utility} Usage: Historical vs {duration_months}-Month Forecast" | |
| ).interactive() | |
| else: # Fallback if data is missing for connection | |
| # Original forecast line (dashed) if connection isn't possible | |
| forecast_line = alt.Chart(combined_data[combined_data["Type"] == "Forecast"]).mark_line( | |
| color='red', strokeWidth=2, strokeDash=[5,5] # dashed: no connecting segment in this fallback | |
| ).encode( | |
| x='Date:T', | |
| y='EnergyUsage:Q', | |
| tooltip=['Date:T', 'EnergyUsage:Q'] | |
| ) | |
| last_historical_date = historical_data_for_plot["Date"].max() if not historical_data_for_plot.empty else pd.Timestamp.now() | |
| divider = alt.Chart(pd.DataFrame({'Date': [last_historical_date]})).mark_rule( | |
| color='gray', strokeDash=[3,3], opacity=0.5 | |
| ).encode(x='Date:T') | |
| chart = (historical_line + forecast_line + divider).properties( | |
| width=800, height=400, | |
| title=f"{selected_utility} Usage: Historical vs {duration_months}-Month Forecast" | |
| ).interactive() | |
| st.altair_chart(chart, use_container_width=True) | |
| # Summary metrics for the forecast | |
| col1, col2, col3 = st.columns(3) | |
| with col1: | |
| st.metric("Historical Data Points", len(historical_data_for_plot)) | |
| with col2: | |
| st.metric("Forecast Horizon", f"{duration_months} months") | |
| with col3: | |
| avg_historical = historical_data_for_plot["ActualUse"].mean() if not historical_data_for_plot.empty else 0 | |
| avg_forecast = forecast_data_for_plot["ForecastUse"].mean() if not forecast_data_for_plot.empty else 0 | |
| change_pct = ((avg_forecast - avg_historical) / avg_historical * 100) if avg_historical != 0 else 0 | |
| st.metric("Avg. Change", f"{change_pct:+.1f}%") | |
| # Display the forecast table | |
| if not forecast_data_for_plot.empty: | |
| st.subheader("📅 Monthly Forecasted Usage") | |
| display_forecast_df = forecast_data_for_plot[['Date', 'ForecastUse']].copy() | |
| display_forecast_df['Date'] = display_forecast_df['Date'].dt.strftime('%Y-%m-%d') | |
| display_forecast_df.rename(columns={'ForecastUse': 'Predicted Energy Use'}, inplace=True) | |
| st.dataframe(display_forecast_df.set_index('Date'), use_container_width=True) | |
| else: | |
| st.info("No forecast data to display in table.") | |
| else: | |
| st.warning("⚠️ Forecast data is empty. Cannot visualize or display table.") | |
| # ... (weather sampling strategy display logic omitted) ... | |
| # Return button | |
| st.markdown("---") | |
| if st.button("← Return to Changepoint Detection", key="return_to_cp_button"): | |
| st.session_state["start_energy_prediction"] = False | |
| st.rerun() | |
| # --- Credibility analysis --- | |
| # This block must stay indented inside the 'if selected_building:' scope. | |
| if st.session_state.get("credibility_analysis_done", False): | |
| if "cp_df" not in st.session_state: | |
| st.warning("Please run changepoint detection first") | |
| st.session_state["credibility_analysis_done"] = False | |
| st.stop() | |
| if "base_ln" in st.session_state: | |
| pts = ( | |
| alt.Chart(st.session_state["cp_df"][st.session_state["cp_df"]["changepoint"] == 1]) | |
| .mark_point(shape="triangle", size=100, color="red", filled=True) | |
| .encode(x="timestamp:T", y="value:Q") | |
| ) | |
| plot_cp.altair_chart(st.session_state["base_ln"] + pts, use_container_width=True) | |
| if "credibility_results" not in st.session_state: | |
| original_changepoints = st.session_state["cp_df"][st.session_state["cp_df"]["changepoint"] == 1].copy() | |
| base_changepoints = [] | |
| for _, row in original_changepoints.iterrows(): | |
| timestamp = row["timestamp"] | |
| value = row["value"] | |
| if force_noise_samples: | |
| changepoint_type = np.random.choice(['strong', 'medium', 'weak'], p=[0.15, 0.25, 0.6]) | |
| else: | |
| changepoint_type = np.random.choice(['strong', 'medium', 'weak'], p=[0.3, 0.4, 0.3]) | |
| if changepoint_type == 'strong': | |
| z_score = np.random.uniform(2.5, 4.0) | |
| slope = np.random.uniform(0.15, 0.3) | |
| adf_p_value = np.random.uniform(0.01, 0.03) | |
| elif changepoint_type == 'medium': | |
| z_score = np.random.uniform(1.5, 2.5) | |
| slope = np.random.uniform(0.08, 0.15) | |
| adf_p_value = np.random.uniform(0.03, 0.07) | |
| else: # weak | |
| if force_noise_samples: | |
| z_score = np.random.uniform(0.2, 0.8) | |
| slope = np.random.uniform(0.001, 0.03) | |
| adf_p_value = np.random.uniform(0.15, 0.3) | |
| else: | |
| z_score = np.random.uniform(0.5, 1.5) | |
| slope = np.random.uniform(0.01, 0.08) | |
| adf_p_value = np.random.uniform(0.07, 0.15) | |
| base_changepoints.append({ | |
| "Building Name": selected_building, | |
| "CommodityCode": selected_utility, | |
| "Changepoint Date": timestamp, | |
| "ProphetDelta": value, | |
| "z_score": z_score, | |
| "slope": slope, | |
| "adf_p_value": adf_p_value, | |
| "ChangePointType": changepoint_type | |
| }) | |
| base_df = pd.DataFrame(base_changepoints) | |
| base_df["AbsDelta"] = base_df["ProphetDelta"].abs() | |
| # ... (remaining credibility-analysis logic omitted here: feature extraction, | |
| # prediction loop, statistics, and plotting; it must stay indented under the | |
| # 'credibility_analysis_done' check) ... | |
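| # Debug: surface key session-state values for the prediction flow. | |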
| st.write("DEBUG-state", | |
| start_pred=st.session_state.get("start_energy_prediction"), | |
| bld=st.session_state.get("selected_building"), | |
| utl=st.session_state.get("selected_utility")) |