cameroncameron commited on
Commit
6782585
·
verified ·
1 Parent(s): ec514ce

Upload 4 files

Browse files
Files changed (4) hide show
  1. src/cp_utils.py +437 -0
  2. src/data_utils.py +25 -0
  3. src/rupture_utils.py +48 -0
  4. src/usage_utils.py +433 -0
src/cp_utils.py ADDED
@@ -0,0 +1,437 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """cp_utils.py
2
+ Utilities for evaluating changepoint credibility and performing
3
+ semi-/supervised classification on changepoints.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+ from typing import Tuple
11
+
12
+ from statsmodels.tsa.stattools import adfuller
13
+ from prophet import Prophet
14
+ from sklearn.semi_supervised import SelfTrainingClassifier
15
+ from xgboost import XGBClassifier
16
# 🔧 New: optional CatBoost support
17
+ try:
18
+ from catboost import CatBoostClassifier
19
+ CATBOOST_AVAILABLE = True
20
+ except ImportError:
21
+ CATBOOST_AVAILABLE = False
22
+ print("⚠️ CatBoost not installed. Install with: pip install catboost")
23
+
24
# 1. Score one building's candidate changepoints — ProphetDelta + structural metrics
25
def _validate_cp_metrics(residual: pd.Series, idx: int, win: int = 6,
                         slope_thresh: float = 0.1, p_thresh: float = 0.05,
                         ) -> Tuple[bool, float, float]:
    """Check whether the Prophet residual around ``idx`` shows a structural break.

    Fits a linear trend and runs the ADF test on the residual window
    ``idx ± win``; the candidate is considered significant when the trend is
    steep enough AND the window looks non-stationary (high ADF p-value).

    Parameters
    ----------
    residual : Prophet residual series (y - yhat), positionally indexed.
    idx : positional index of the candidate changepoint.
    win : half-window size (rows) on each side of ``idx``.
    slope_thresh : minimum absolute trend slope to count as significant
        (previously hard-coded; default keeps the original behavior).
    p_thresh : ADF p-value above which the window is treated as
        non-stationary (previously hard-coded; default keeps the original
        behavior).

    Returns
    -------
    (is_valid, slope, adf_p_value); (False, nan, nan) when the window has
    fewer than 4 non-NaN points.
    """
    lo = max(0, idx - win)
    hi = min(len(residual), idx + win + 1)
    seg = residual.iloc[lo:hi].dropna()
    if seg.size < 4:
        # Too few points to fit a trend or run the ADF test.
        return False, np.nan, np.nan
    slope = np.polyfit(range(len(seg)), seg, 1)[0]
    # NOTE(review): adfuller may still raise for very short windows depending
    # on its default maxlag — confirm the minimum window length upstream.
    p_val = adfuller(seg)[1]
    is_valid = (abs(slope) > slope_thresh) and (p_val > p_thresh)
    return is_valid, float(slope), float(p_val)
37
+
38
def building_score_changepoints(
    summary_df: pd.DataFrame,
    filled_df: pd.DataFrame,
    building_name: str,
    model: str = "rbf",
    penalty: float | None = None,
    window_size: int = 6,
    usage_col: str = "FilledUse",
    date_col: str = "Date",
    cp_df: pd.DataFrame | None = None,
) -> pd.DataFrame:
    """Score candidate changepoints for one building.

    For each utility (CommodityCode) of ``building_name``: detect candidate
    changepoints with ruptures (unless ``cp_df`` pre-supplies them), validate
    each candidate against the Prophet residual via ``_validate_cp_metrics``,
    then refit Prophet with the validated changepoints to obtain a delta
    (trend-rate change) per changepoint.

    Parameters
    ----------
    summary_df : table with BuildingName/CommodityCode rows, used to
        enumerate the building's utilities.
    filled_df : gap-filled monthly usage table.
    building_name : building to analyse.
    model : ruptures cost model name.
    penalty : PELT penalty; falls back to 1.0 when None (and when 0 —
        kept for backward compatibility with the original ``or`` fallback).
    window_size : half-window (rows) for residual validation.
    usage_col / date_col : column names in ``filled_df``.
    cp_df : optional pre-computed changepoint table (columns ``timestamp``
        and ``changepoint``); when given it is used for every utility.

    Returns
    -------
    DataFrame with one row per validated changepoint: Building Name,
    CommodityCode, Changepoint Date, ProphetDelta, slope, adf_p_value,
    AbsDelta. Empty DataFrame when nothing validates.
    """
    from rupture_utils import detect_changepoints

    records: list[dict] = []
    utilities = (
        summary_df.loc[summary_df["BuildingName"] == building_name,
                       "CommodityCode"].unique()
    )

    for util in utilities:
        df_util = (
            filled_df[
                (filled_df["BuildingName"] == building_name)
                & (filled_df["CommodityCode"] == util)
            ]
            .sort_values(date_col)
            .reset_index(drop=True)
        )
        # Prophet needs a complete series of reasonable length.
        if df_util[usage_col].isna().any() or len(df_util) < 24:
            continue

        # BUG FIX: the original assigned the detection result back to the
        # *cp_df* parameter, so every utility after the first silently
        # reused the first utility's changepoints. Keep detection local.
        if cp_df is not None:
            cp_util = cp_df
        else:
            cp_util = detect_changepoints(
                df_util[[date_col, usage_col]].rename(
                    columns={date_col: "timestamp", usage_col: "value"}),
                algo="pelt",
                model=model,
                pen=penalty or 1.0,
            )
        if cp_util.empty or cp_util["changepoint"].sum() == 0:
            continue

        # Fit a seasonality-free Prophet model to obtain the trend residual.
        df_p = df_util[[date_col, usage_col]].rename(
            columns={date_col: "ds", usage_col: "y"})
        m_tmp = Prophet(yearly_seasonality=False, weekly_seasonality=False,
                        daily_seasonality=False)
        m_tmp.fit(df_p)
        residual = df_p["y"] - m_tmp.predict(df_p)["yhat"]

        # Keep only candidates whose residual window shows a structural break.
        validated_dates: list[pd.Timestamp] = []
        metrics_map: dict[pd.Timestamp, Tuple[float, float]] = {}
        for d in pd.to_datetime(cp_util.loc[cp_util["changepoint"] == 1,
                                            "timestamp"]):
            if d not in df_util[date_col].values:
                continue
            idx = df_util.index[df_util[date_col] == d][0]
            is_valid, slope, p_val = _validate_cp_metrics(
                residual, idx, win=window_size
            )
            if is_valid:
                validated_dates.append(d)
                metrics_map[d] = (slope, p_val)

        if not validated_dates:
            continue

        # Refit with the validated changepoints; the mean of the posterior
        # delta samples measures the trend-rate change at each changepoint.
        m = Prophet(changepoints=validated_dates, yearly_seasonality=False)
        m.fit(df_p)
        deltas = m.params["delta"].mean(axis=0)
        for cp, delta in zip(m.changepoints, deltas):
            d = pd.to_datetime(cp)
            slope, p_val = metrics_map.get(d, (np.nan, np.nan))
            records.append(
                {
                    "Building Name": building_name,
                    "CommodityCode": util,
                    "Changepoint Date": d,
                    "ProphetDelta": float(delta),
                    "slope": float(slope),
                    "adf_p_value": float(p_val),
                }
            )

    result_df = pd.DataFrame(records)
    if not result_df.empty:
        result_df["AbsDelta"] = result_df["ProphetDelta"].abs()
    return result_df
126
+
127
# 2. Pseudo-label assignment
128
+ def label_changepoints_by_structure_signal(
129
+ df: pd.DataFrame,
130
+ slope_thresh: float = 0.1,
131
+ p_thresh: float = 0.05,
132
+ ) -> pd.DataFrame:
133
+ """Assign pseudo-labels Real / Noise / Unknown based on structure
134
+ signals."""
135
+ def _assign(row):
136
+ s, p = row["slope"], row["adf_p_value"]
137
+ if pd.isna(s) or pd.isna(p):
138
+ return "Unknown"
139
+ if (abs(s) > slope_thresh) and (p > p_thresh):
140
+ return "Real"
141
+ if (abs(s) < slope_thresh * 0.5) and (p < p_thresh * 0.5):
142
+ return "Noise"
143
+ return "Unknown"
144
+ out = df.copy()
145
+ out["Label"] = out.apply(_assign, axis=1)
146
+ return out
147
+
148
# 3. Time-series derived features
149
def extract_changepoint_features(
    cp_df: pd.DataFrame,
    filled_df: pd.DataFrame,
    usage_col: str = "FilledUse",
    date_col: str = "Date",
    mean_win: int = 6,
) -> pd.DataFrame:
    """Derive mean-shift and temporal-context features for each changepoint,
    and merge in the ``holidaycount`` feature when the usage table has it.

    Parameters
    ----------
    cp_df : changepoint table with "Building Name" and "Changepoint Date"
        (datetime) columns.
    filled_df : gap-filled usage table with "BuildingName", *date_col* and
        *usage_col* columns, optionally "holidaycount".
    usage_col / date_col : column names in *filled_df*.
    mean_win : number of rows before/after the changepoint used for the
        before/after mean comparison.

    Returns
    -------
    Copy of *cp_df* extended with TimeIndex, Season, ΔMeanBefore,
    ΔMeanAfter, ΔMeanDiff, ΔMeanRatio, TimeSinceStart and (optionally)
    holidaycount columns.
    """
    cp_df = cp_df.copy()

    # Month-of-year of the changepoint; cast to int64 so the dtype is
    # stable for downstream models.
    cp_df["TimeIndex"] = cp_df["Changepoint Date"].dt.month.astype('int64')

    # Encode season as integer codes to avoid string/numeric dtype
    # conflicts downstream.
    season_mapping = {
        6: 0, 7: 0, 8: 0,  # Summer = 0
        12: 1, 1: 1, 2: 1,  # Winter = 1
    }
    # All other months = 2
    season_col = cp_df["TimeIndex"].map(season_mapping).fillna(2)
    cp_df["Season"] = season_col.astype('int64')

    # First observed month per building — basis of the TimeSinceStart feature.
    min_dates = filled_df.groupby("BuildingName")[date_col].min().to_dict()
    for i, row in cp_df.iterrows():
        bld = row["Building Name"]
        cp_date = row["Changepoint Date"]
        # NOTE(review): rows are filtered per building only — if a building
        # has several commodities their rows are mixed here; confirm intended.
        df_bld = (
            filled_df[filled_df["BuildingName"] == bld]
            .sort_values(date_col)
            .reset_index(drop=True)
        )
        if cp_date not in df_bld[date_col].values:
            continue
        idx = df_bld.index[df_bld[date_col] == cp_date][0]
        # Mean usage in the windows strictly before / strictly after the cp.
        before_vals = df_bld[usage_col].iloc[max(0, idx - mean_win): idx]
        after_vals = df_bld[usage_col].iloc[idx + 1: idx + mean_win + 1]
        before_mean = before_vals.mean() if len(before_vals) else np.nan
        after_mean = after_vals.mean() if len(after_vals) else np.nan
        diff = after_mean - before_mean if np.isfinite(before_mean) and np.isfinite(after_mean) else np.nan
        ratio = after_mean / before_mean if np.isfinite(before_mean) and before_mean != 0 else np.nan
        cp_df.at[i, "ΔMeanBefore"] = before_mean
        cp_df.at[i, "ΔMeanAfter"] = after_mean
        cp_df.at[i, "ΔMeanDiff"] = diff
        cp_df.at[i, "ΔMeanRatio"] = ratio
        cp_df.at[i, "TimeSinceStart"] = (cp_date - min_dates.get(bld, cp_date)).days

    # Normalise every derived numeric column to a consistent dtype.
    numeric_cols = ["ΔMeanBefore", "ΔMeanAfter", "ΔMeanDiff", "ΔMeanRatio",
                    "TimeSinceStart", "TimeIndex", "Season"]
    for col in numeric_cols:
        if col in cp_df.columns:
            cp_df[col] = pd.to_numeric(cp_df[col], errors='coerce')

    if "holidaycount" in filled_df.columns:
        # Keep only the merge keys + feature to avoid duplicate columns.
        holiday_df = filled_df[["BuildingName", date_col, "holidaycount"]].drop_duplicates()
        holiday_df = holiday_df.rename(
            columns={"BuildingName": "Building Name", date_col: "Changepoint Date"}
        )
        # Ensure holidaycount is numeric before merging.
        holiday_df["holidaycount"] = pd.to_numeric(
            holiday_df["holidaycount"], errors='coerce'
        ).fillna(0)

        cp_df = cp_df.merge(
            holiday_df, on=["Building Name", "Changepoint Date"], how="left"
        )
        # Fill any missing holidaycount values with 0
        cp_df["holidaycount"] = cp_df["holidaycount"].fillna(0)

    return cp_df
222
+
223
# 4. Semi-supervised model (Self-Training XGBoost)
224
def run_semi_supervised_cp_model(
    base_df: pd.DataFrame,
    k_best: int = 10,
    feature_cols: list[str] | None = None,
    xgb_params: dict | None = None,
) -> Tuple[pd.DataFrame, dict]:
    """Self-training semi-supervised changepoint classifier (XGBoost base).

    Rows pseudo-labelled "Real"/"Noise" in ``base_df["Label"]`` seed a
    SelfTrainingClassifier; remaining rows (encoded -1) receive labels via
    transduction. Degrades gracefully when the seed set is homogeneous or
    empty.

    Args:
        base_df: features plus the pseudo ``Label`` column.
        k_best: pseudo-labels promoted per self-training iteration.
        feature_cols: feature columns (default set used when None).
        xgb_params: XGBClassifier parameters (default set used when None).

    Returns:
        (copy of base_df with a ``Predicted`` column, stats dict with class
        counts plus the ``k_best`` used).
    """
    if feature_cols is None:
        feature_cols = [
            "AbsDelta",
            "slope",
            "ΔMeanDiff",
            "ΔMeanRatio",
            "TimeSinceStart",
            'holidaycount'
        ]
    if xgb_params is None:
        xgb_params = {
            "max_depth": 3,
            "learning_rate": 0.1,
            "n_estimators": 200,
            "subsample": 0.8,
            "colsample_bytree": 0.8,
            "objective": "binary:logistic",
            "eval_metric": "logloss",
            "verbosity": 0,
        }
    df = base_df.copy()
    # Encode pseudo-labels: 1 = Real, 0 = Noise, -1 = unlabelled.
    y = np.full(len(df), -1, dtype=int)
    y[df["Label"] == "Real"] = 1
    y[df["Label"] == "Noise"] = 0

    X = df[feature_cols].fillna(0).values
    base_clf = XGBClassifier(**xgb_params)
    # NOTE(review): `base_estimator` was renamed to `estimator` in
    # scikit-learn 1.2 and later removed — confirm the pinned sklearn version.
    clf = SelfTrainingClassifier(
        base_estimator=base_clf,
        criterion="k_best",
        k_best=k_best
    )

    # Self-training needs both classes among the seed labels; handle the
    # one-class and zero-class cases without crashing.
    unique_labels_in_y = np.unique(y[y != -1])
    if len(unique_labels_in_y) < 2 and len(unique_labels_in_y) > 0:
        print(f"Initial pseudo-labels only contain one class: "
              f"{unique_labels_in_y}. Self-training may not be effective "
              f"or may fail. Predictions might be skewed or based on "
              f"initial labels only.")
        try:
            clf.fit(X, y)
            trans = clf.transduction_
        except ValueError as e:
            print(f"SelfTrainingClassifier.fit error: {e}. This might be "
                  f"due to homogenous initial labels (e.g., all 'Real' "
                  f"or all 'Noise').")
            trans = y.copy()
    elif len(unique_labels_in_y) == 0:
        print("No initial pseudo-labels (Real/Noise) found. "
              "Self-training cannot proceed. All will be 'Unknown'.")
        trans = y
    else:
        clf.fit(X, y)
        trans = clf.transduction_

    # Robust dtype handling before mapping integer codes back to strings.
    trans = np.asarray(trans, dtype=int)

    # Map codes back to labels via pandas for safe dtype handling;
    # anything unexpected falls through to "Unknown".
    label_map = {1: "Real", 0: "Noise", -1: "Unknown"}

    trans_series = pd.Series(trans)
    predicted_labels = trans_series.map(label_map).fillna("Unknown")

    # NOTE(review): the assignment below aligns on index — assumes base_df
    # has a default RangeIndex; confirm callers pass reset-index frames.
    df = df.copy()  # Ensure we work with a clean copy
    df["Predicted"] = predicted_labels.astype(str)

    stats = df["Predicted"].value_counts(dropna=False).to_dict()
    stats["k_best"] = k_best
    return df, stats
305
+
306
# 🔧 New: CatBoost variant of the semi-supervised model
307
def run_semi_supervised_cp_model_catboost(
    base_df: pd.DataFrame,
    k_best: int = 10,
    feature_cols: list[str] | None = None,
    catboost_params: dict | None = None,
) -> Tuple[pd.DataFrame, dict]:
    """
    CatBoost variant of the semi-supervised changepoint classifier.

    Mirrors ``run_semi_supervised_cp_model`` but uses CatBoostClassifier as
    the base estimator for SelfTrainingClassifier.

    Args:
        base_df: DataFrame with the features and pseudo ``Label`` column.
        k_best: k_best parameter for SelfTrainingClassifier.
        feature_cols: list of feature column names (default set when None).
        catboost_params: CatBoost parameter dict (default set when None).

    Returns:
        Prediction DataFrame (with ``Predicted``) and a stats dict.

    Raises:
        ImportError: when the optional catboost dependency is missing.
    """
    if not CATBOOST_AVAILABLE:
        raise ImportError("CatBoost not available. Install with: "
                          "pip install catboost")

    if feature_cols is None:
        feature_cols = [
            "AbsDelta",
            "slope",
            "ΔMeanDiff",
            "ΔMeanRatio",
            "TimeSinceStart",
            'holidaycount'
        ]

    if catboost_params is None:
        catboost_params = {
            "depth": 3,
            "learning_rate": 0.1,
            "iterations": 200,
            "colsample_bylevel": 0.8,
            "loss_function": "Logloss",
            "eval_metric": "Logloss",
            "verbose": False,
            "allow_writing_files": False,
            "bootstrap_type": "Bayesian",  # Better for small samples
        }

    df = base_df.copy()
    # Encode pseudo-labels: 1 = Real, 0 = Noise, -1 = unlabelled.
    y = np.full(len(df), -1, dtype=int)
    y[df["Label"] == "Real"] = 1
    y[df["Label"] == "Noise"] = 0

    X = df[feature_cols].fillna(0).values

    # CatBoost replaces XGBoost as the base estimator here.
    base_clf = CatBoostClassifier(**catboost_params)
    # NOTE(review): `base_estimator` was renamed to `estimator` in
    # scikit-learn 1.2 and later removed — confirm the pinned sklearn version.
    clf = SelfTrainingClassifier(
        base_estimator=base_clf,
        criterion="k_best",
        k_best=k_best
    )

    # Degrade gracefully when the seed labels are homogeneous or empty.
    unique_labels_in_y = np.unique(y[y != -1])
    if len(unique_labels_in_y) < 2 and len(unique_labels_in_y) > 0:
        print(f"Initial pseudo-labels only contain one class: "
              f"{unique_labels_in_y}. Self-training may not be effective "
              f"or may fail. Predictions might be skewed or based on "
              f"initial labels only.")
        try:
            clf.fit(X, y)
            trans = clf.transduction_
        except ValueError as e:
            print(f"SelfTrainingClassifier.fit error: {e}. This might be "
                  f"due to homogenous initial labels (e.g., all 'Real' "
                  f"or all 'Noise').")
            trans = y.copy()
    elif len(unique_labels_in_y) == 0:
        print("No initial pseudo-labels (Real/Noise) found. "
              "Self-training cannot proceed. All will be 'Unknown'.")
        trans = y
    else:
        clf.fit(X, y)
        trans = clf.transduction_

    # Same robust dtype handling as the XGBoost version.
    trans = np.asarray(trans, dtype=int)

    # Use pandas map for safer type handling.
    label_map = {1: "Real", 0: "Noise", -1: "Unknown"}
    trans_series = pd.Series(trans)
    predicted_labels = trans_series.map(label_map).fillna("Unknown")

    # NOTE(review): assignment aligns on index — assumes base_df has a
    # default RangeIndex; confirm callers pass reset-index frames.
    df = df.copy()
    df["Predicted"] = predicted_labels.astype(str)

    stats = df["Predicted"].value_counts(dropna=False).to_dict()
    stats["k_best"] = k_best
    stats["model_type"] = "CatBoost"
    return df, stats
405
+
406
# 🔧 New: unified model-selection entry point
407
+ def run_semi_supervised_cp_model_unified(
408
+ base_df: pd.DataFrame,
409
+ k_best: int = 10,
410
+ feature_cols: list[str] | None = None,
411
+ model_type: str = "xgboost",
412
+ model_params: dict | None = None,
413
+ ) -> Tuple[pd.DataFrame, dict]:
414
+ """
415
+ 统一的半监督变点分类模型接口,支持XGBoost和CatBoost
416
+
417
+ Args:
418
+ base_df: 包含特征和标签的数据框
419
+ k_best: SelfTrainingClassifier的k_best参数
420
+ feature_cols: 特征列名列表
421
+ model_type: 模型类型,"xgboost" 或 "catboost"
422
+ model_params: 模型参数字典
423
+
424
+ Returns:
425
+ 预测结果数据框和统计信息
426
+ """
427
+ if model_type.lower() == "catboost":
428
+ return run_semi_supervised_cp_model_catboost(
429
+ base_df, k_best, feature_cols, model_params
430
+ )
431
+ elif model_type.lower() == "xgboost":
432
+ return run_semi_supervised_cp_model(
433
+ base_df, k_best, feature_cols, model_params
434
+ )
435
+ else:
436
+ raise ValueError(f"Unsupported model_type: {model_type}. "
437
+ f"Choose from 'xgboost' or 'catboost'")
src/data_utils.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize column names in place: strip edge whitespace, then remove
    NBSP characters, spaces and underscores. Returns the same DataFrame."""
    cleaned = []
    for name in df.columns:
        name = name.strip()
        for ch in ("\xa0", " ", "_"):
            name = name.replace(ch, "")
        cleaned.append(name)
    df.columns = cleaned
    return df
9
+
10
def load_file(file) -> pd.DataFrame:
    """Load an uploaded CSV/XLSX file-like object (must expose ``.name``)
    into a DataFrame with standardized column names.

    Raises ValueError for any other file extension.
    """
    filename = file.name
    if filename.endswith(".csv"):
        frame = pd.read_csv(file)
    elif filename.endswith(".xlsx"):
        frame = pd.read_excel(file)
    else:
        raise ValueError("File type not supported, only CSV or Excel are accepted")
    return standardize_columns(frame)
18
+
19
def recommend_buildings(building_list, query, scorer, limit, threshold, index_builder, fuzzy_engine):
    """Fuzzy-match *query* against the building index and return the
    buildings whose match score reaches *threshold*.

    ``index_builder`` maps the building list to a {key: building} dict;
    ``fuzzy_engine`` is called with the lower-cased query, the index keys,
    and the ``scorer``/``limit`` keywords, returning (key, score, _) tuples.
    An empty query yields an empty list.
    """
    if not query:
        return []
    lookup = index_builder(building_list)
    candidates = fuzzy_engine(query.lower(), list(lookup), scorer=scorer, limit=limit)
    results = []
    for key, score, _extra in candidates:
        if score >= threshold:
            results.append(lookup[key])
    return results
src/rupture_utils.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sklearn.preprocessing import StandardScaler
2
+ import ruptures as rpt
3
+ import pandas as pd
4
+ import streamlit as st
5
+
6
def detect_changepoints(
    df: pd.DataFrame,
    algo: str = "pelt",
    model: str = "rbf",
    pen: float = 1.0,
) -> pd.DataFrame:
    """Detect changepoints in a usage series with ``ruptures``.

    Parameters
    ----------
    df : DataFrame with at least a ``value`` column (other columns such as
        ``timestamp`` are passed through untouched).
    algo : "pelt" (penalty-based) or "window" (fixed width=10, n_bkps=5).
    model : ruptures cost model ("rbf", "l2", "normal", ...).
    pen : penalty for ``Pelt.predict`` (ignored by the "window" branch).

    Returns
    -------
    Copy of *df* with an added 0/1 ``changepoint`` column marking the last
    observation of each detected segment (the series endpoint is excluded).

    Side effects: renders the flagged rows via ``st.write`` and prints
    debug output to stdout.
    """
    # 0. Reset to a 0-based RangeIndex so the positional writes below line up.
    df = df.reset_index(drop=True)
    y = df["value"].values  # raw series

    # Step 1: standardize (strongly recommended for rbf/l2/normal costs),
    # then ensure a 2-D (n_samples, n_features) shape.
    if model in ["rbf", "l2", "normal"]:
        y_scaled = StandardScaler().fit_transform(y.reshape(-1, 1))
        X = y_scaled  # shape (n, 1)
    else:
        X = y  # a 1-D array is fine here

    # Step 2: run the chosen detection algorithm.
    if algo == "pelt":
        algo_obj = rpt.Pelt(model=model).fit(X)
        result = algo_obj.predict(pen=pen)
    elif algo == "window":
        algo_obj = rpt.Window(width=10, model=model).fit(X)
        result = algo_obj.predict(n_bkps=5)
    else:
        raise ValueError("Unknown algo")

    # Step 3: flag changepoints. ruptures returns 1-based segment end
    # positions and the final entry is always the series length, so skip it.
    df_out = df.copy()
    df_out["changepoint"] = 0
    for idx in result[:-1]:
        # idx - 1 converts the 1-based segment end to a 0-based row position.
        if idx - 1 < len(df_out):
            df_out.loc[idx - 1, "changepoint"] = 1

    st.write("Changepoint Index:", df_out[df_out["changepoint"] == 1])

    print("Input sequence:", df["value"].values)
    print("Changepoint detected:", result)

    return df_out
src/usage_utils.py ADDED
@@ -0,0 +1,433 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+
4
+
5
def analyze_and_fill_usage(
    usage_data: pd.DataFrame,
    gap_threshold: int = 62,
    fill_earliest_cutoff: str = "2013-01-01",
    min_fill_gap_months: int = 9,
    rolling_window_size: int = 4,
    rolling_centered: bool = True,
    sequence_fill_method: str = "mean",
    post_missing_threshold: float = 0.1,
) -> pd.DataFrame:
    """Analyse missing-month patterns per (building, commodity) pair.

    Classifies missing months as "Random" (isolated) or "Sequence"
    (consecutive runs), computes ``FillStartDate`` (where gap-filling
    should begin) and the ``NotGonnaUse`` drop flag. Returns a summary
    table of statistics only — no time-series filling happens here (see
    ``fill_usage_with_sequence_check_strict_mean``).

    Parameters
    ----------
    usage_data : raw usage table with StartDate/EndDate, a building-name
        column and CommodityCode.
    gap_threshold : maximum day-gap between missing months that still
        counts as the same consecutive run (~2 months).
    fill_earliest_cutoff : earliest date filling may start from.
    min_fill_gap_months : a post-cutoff consecutive gap at least this long
        pushes FillStartDate past the gap.
    rolling_window_size, rolling_centered, sequence_fill_method :
        currently unused here; kept for interface compatibility.
    post_missing_threshold : missing-ratio above which a series is flagged
        ``NotGonnaUse``.

    Returns
    -------
    summary_df with one row per (BuildingName, CommodityCode).
    """
    df = usage_data.copy()
    df["StartDate"] = pd.to_datetime(df["StartDate"])
    # NOTE(review): EndDate is pulled back 2 days, presumably so billing
    # periods that spill into the next month land in the intended month —
    # confirm against the billing-data convention.
    df["EndDate"] = pd.to_datetime(df["EndDate"]) - pd.Timedelta(days=2)

    # Accept both building-column spellings; normalise to "BuildingName".
    building_col = None
    if "BuildingName" in df.columns:
        building_col = "BuildingName"
    elif "Building Name" in df.columns:
        building_col = "Building Name"
        # Create the standardized column for all downstream use.
        df["BuildingName"] = df["Building Name"]
    else:
        raise ValueError("Neither 'BuildingName' nor 'Building Name' column found in usage data")

    records = []
    # Always group on the normalised BuildingName for consistency.
    for (bld, util), grp in df.groupby(["BuildingName", "CommodityCode"]):
        # Full month index spanning the observed range.
        start = grp["StartDate"].min().replace(day=1)
        end = grp["StartDate"].max().replace(day=1)
        full_idx = pd.date_range(start, end, freq="MS")

        # 1 = month has at least one reading, 0 = missing.
        flag = pd.Series(0, index=full_idx)
        flag.loc[grp["StartDate"].dt.to_period("M").dt.to_timestamp()] = 1
        missing = flag[flag == 0].index

        # Split missing months into Random (isolated) vs Sequence (runs).
        seq_ranges, rand_dates = [], []
        rand_months = seq_months = 0
        if missing.empty:
            mtype = "No Missing"
        else:
            # Consecutive missing months are <= gap_threshold days apart;
            # fillna(9999) seeds the first month into its own group.
            gaps = missing.to_series().diff().dt.days.fillna(9999)
            gid = (gaps > gap_threshold).cumsum()
            for _, seg in missing.to_series().groupby(gid):
                if len(seg) > 1:
                    seq_ranges.append(
                        f"From {seg.min().strftime('%Y-%m')} to {seg.max().strftime('%Y-%m')}"
                    )
                    seq_months += len(seg)
                else:
                    rand_dates.append(seg.iloc[0].strftime("%Y-%m"))
                    rand_months += 1
            has_rand, has_seq = bool(rand_dates), bool(seq_ranges)
            if has_rand and has_seq:
                mtype = "Both"
            elif has_rand:
                mtype = "Random"
            else:
                mtype = "Sequence"

        records.append(
            {
                "BuildingName": bld,
                "CommodityCode": util,
                "MissingType": mtype,
                "SequenceMissingRanges": "; ".join(seq_ranges),
                "RandomMissingDates": "; ".join(rand_dates),
                "TotalMonths": len(full_idx),
                "RandomMissingMonths": rand_months,
                "SequenceMissingMonths": seq_months,
                "RandomMissingRatio": rand_months / len(full_idx)
                if full_idx.size
                else 0,
                "SequenceMissingRatio": seq_months / len(full_idx)
                if full_idx.size
                else 0,
            }
        )

    summary_df = pd.DataFrame(records)

    # -------------------------------------------------------------
    # Compute FillStartDate
    # -------------------------------------------------------------
    cutoff_dt = pd.to_datetime(fill_earliest_cutoff)

    def get_fill_start(r):
        """Compute FillStartDate for one summary row.

        Strategy (as clarified by the stakeholder):
        1. Regardless of pre-cutoff gaps, look for consecutive gaps of at
           least ``min_fill_gap_months`` months starting on/after the
           cutoff (default 2013-01-01).
        2. If one exists, fill from the month after the earliest such gap.
        3. Otherwise fill from the cutoff date.

        Never raises: any failure falls back to the cutoff so that
        ``DataFrame.apply`` cannot produce NaT.
        """
        try:
            # Parse every recorded sequence-missing range.
            seq_ranges = []
            seq_missing_ranges = r.get("SequenceMissingRanges", "")

            if not seq_missing_ranges or pd.isna(seq_missing_ranges):
                # No sequence gaps recorded: start at the cutoff.
                return cutoff_dt

            for rng in str(seq_missing_ranges).split("; "):
                if "to" not in rng:
                    continue
                try:
                    s, e = rng.replace("From ", "").split(" to ")
                    sd, ed = pd.to_datetime(s), pd.to_datetime(e)
                    # Inclusive gap length in months.
                    gap = (ed.to_period("M") - sd.to_period("M")).n + 1
                    seq_ranges.append((sd, ed, gap))
                except Exception:
                    # Skip unparseable ranges and keep processing the rest.
                    continue

            if not seq_ranges:
                # Nothing parseable: start at the cutoff.
                return cutoff_dt

            # Only gaps that start on/after the cutoff AND are long enough.
            post_2013_missing = [
                (sd, ed, gap) for sd, ed, gap in seq_ranges
                if sd >= cutoff_dt and gap >= min_fill_gap_months
            ]

            # A qualifying post-cutoff gap exists: fill from the month
            # right after the earliest one.
            if post_2013_missing:
                post_2013_missing.sort(key=lambda x: x[0])
                sd, ed, gap = post_2013_missing[0]
                return ed + pd.offsets.MonthBegin(1)

            # No qualifying post-cutoff gap: start at the cutoff.
            return cutoff_dt

        except Exception as e:
            # Fallback keeps apply() total (no NaT); log for debugging.
            import traceback
            print(f"get_fill_start exception for {r.get('BuildingName', 'Unknown')}: {e}")
            print(f"Traceback: {traceback.format_exc()}")
            return cutoff_dt

    summary_df["FillStartDate"] = summary_df.apply(get_fill_start, axis=1)

    # -------------------------------------------------------------
    # Missing ratios within the post-FillStartDate window
    # -------------------------------------------------------------
    post_recs = []
    for _, r in summary_df.iterrows():
        bld, util, fsd = r["BuildingName"], r["CommodityCode"], r["FillStartDate"]
        if pd.isna(fsd):
            continue

        grp2 = df[
            (df["BuildingName"] == bld)
            & (df["CommodityCode"] == util)
            & (df["StartDate"] >= fsd)
        ]
        if grp2.empty:
            continue

        idx2 = pd.date_range(
            fsd.replace(day=1), grp2["StartDate"].max().replace(day=1), freq="MS"
        )
        flag2 = pd.Series(0, index=idx2)
        flag2.loc[grp2["StartDate"].dt.to_period("M").dt.to_timestamp()] = 1
        miss2 = flag2[flag2 == 0].index

        # Same random/sequence split as above, restricted to the window.
        gaps2 = pd.Series(miss2).diff().dt.days.fillna(9999)
        gid2 = (gaps2 > gap_threshold).cumsum()
        rm2 = sm2 = 0
        for _, seg2 in pd.Series(miss2).groupby(gid2):
            if len(seg2) > 1:
                sm2 += len(seg2)
            else:
                rm2 += 1

        post_recs.append(
            {
                "BuildingName": bld,
                "CommodityCode": util,
                "PostTotalMonths": len(idx2),
                "PostRandomMissingMonths": rm2,
                "PostSequenceMissingMonths": sm2,
                "PostRandomMissingRatio": rm2 / len(idx2) if idx2.size else 0,
                "PostSequenceMissingRatio": sm2 / len(idx2) if idx2.size else 0,
            }
        )

    post_df = pd.DataFrame(post_recs)

    # An empty post_df still needs the merge columns to exist.
    if post_df.empty:
        post_df = pd.DataFrame(columns=[
            "BuildingName", "CommodityCode", "PostTotalMonths",
            "PostRandomMissingMonths", "PostSequenceMissingMonths",
            "PostRandomMissingRatio", "PostSequenceMissingRatio"
        ])

    summary_df = summary_df.merge(post_df, on=["BuildingName", "CommodityCode"], how="left")

    # Guarantee the post-analysis columns exist with sane defaults.
    post_columns = ["PostTotalMonths", "PostRandomMissingMonths", "PostSequenceMissingMonths",
                    "PostRandomMissingRatio", "PostSequenceMissingRatio"]
    for col in post_columns:
        if col not in summary_df.columns:
            if "Ratio" in col:
                summary_df[col] = 0.0  # ratio columns default to 0.0
            else:
                summary_df[col] = 0  # month-count columns default to 0

    # Flag series whose post-fill missing ratio is still too high to use.
    summary_df["NotGonnaUse"] = (
        (summary_df["PostRandomMissingRatio"] > post_missing_threshold)
        | (summary_df["PostSequenceMissingRatio"] > post_missing_threshold)
    ).astype(int)

    return summary_df
231
+
232
+
233
def fill_usage_with_sequence_check_strict_mean(
    usage_data: pd.DataFrame,
    summary_df: pd.DataFrame,
    method: str = "mean",
    force: bool = False,
    fill_earliest_cutoff: str = "1900-01-01",
) -> pd.DataFrame:
    """Fill monthly usage per summary_df's FillStartDate / NotGonnaUse.

    Parameters
    ----------
    usage_data : raw usage table.
    summary_df : output of ``analyze_and_fill_usage``.
    method : 'mean' or 'median' — statistic used to fill missing months.
    force : when True, ignore NotGonnaUse and NaT start dates and adjust
        the start point automatically so a series is always produced.
    fill_earliest_cutoff : fallback start when FillStartDate is NaT
        (only used when force=True).

    Returns
    -------
    filled_df : ['BuildingName','CommodityCode','Date','FilledUse']
        (an empty frame with those columns when nothing qualifies).
    """
    df = usage_data.copy()
    # Snap every reading to the first day of its month.
    df["StartDate"] = pd.to_datetime(df["StartDate"]).dt.to_period("M").dt.to_timestamp()

    # Accept both building-column spellings; normalise to "BuildingName".
    building_col = None
    if "BuildingName" in df.columns:
        building_col = "BuildingName"
    elif "Building Name" in df.columns:
        building_col = "Building Name"
        # Create the standardized column for all downstream use.
        df["BuildingName"] = df["Building Name"]
    else:
        raise ValueError("Neither 'BuildingName' nor 'Building Name' column found in usage data")

    all_records = []
    for _, row in summary_df.iterrows():
        bld, util, fsd, drop = (
            row["BuildingName"],
            row["CommodityCode"],
            row["FillStartDate"],
            row["NotGonnaUse"],
        )

        # ───────── Gates 1 & 2 ─────────
        if not force and (drop == 1 or pd.isna(fsd)):
            # Strict mode: missing ratio too high, or no valid start → skip.
            continue
        if force and pd.isna(fsd):
            fsd = pd.to_datetime(fill_earliest_cutoff)

        # Raw data on/after the fill start (using normalised BuildingName).
        grp = df[
            (df["BuildingName"] == bld)
            & (df["CommodityCode"] == util)
            & (df["StartDate"] >= fsd)
        ]

        # ───────── Gate 3 ─────────
        if grp.empty:
            if not force:
                continue
            # Force mode: fall back to the pair's earliest observed month.
            grp = df[(df["BuildingName"] == bld) & (df["CommodityCode"] == util)]
            if grp.empty:
                # The pair genuinely has no data at all.
                continue
            fsd = grp["StartDate"].min()

        # Build the month grid from the start to the last observation and
        # fill the gaps with the series mean (or median).
        last_m = grp["StartDate"].max()
        all_months = pd.date_range(fsd, last_m, freq="MS")
        monthly = grp.groupby("StartDate")["Use"].sum().reindex(all_months)

        base = monthly.dropna()
        fill_val = base.median() if method == "median" else base.mean()
        filled = monthly.fillna(fill_val).reset_index()

        filled.columns = ["Date", "FilledUse"]
        filled["BuildingName"] = bld
        filled["CommodityCode"] = util
        all_records.append(filled)

    if not all_records:
        return pd.DataFrame(columns=["BuildingName", "CommodityCode", "Date", "FilledUse"])

    return pd.concat(all_records, ignore_index=True)
320
+
321
+
322
+ # ===============================================================
323
+ # LLM-based Weather Variable Selection Functions
324
+ # ===============================================================
325
+
326
# Building Type → Weather Variable Rule Mapping
# Rule-of-thumb weather drivers per operation type, used as the baseline
# "suggested variables" fed into the LLM prompt builders below. Types with
# an empty list (parking, infrastructure, container) get no weather
# variables suggested.
weather_influence_map = {
    "Office": ["temp_mean", "temp_std", "CDD_sum", "clouds_all_mean"],
    "Instructional": ["temp_mean", "temp_std", "CDD_sum", "humidity_mean"],
    "Residential": ["temp_mean", "HDD_sum", "CDD_sum", "humidity_mean"],
    "Health": ["temp_mean", "HDD_sum", "CDD_sum", "humidity_mean"],
    "Library": ["temp_mean", "temp_std", "humidity_mean", "clouds_all_mean"],
    "Dining": ["temp_mean", "rain_sum", "CDD_sum"],
    "Recreation": ["temp_mean", "rain_sum", "wind_speed_mean"],
    "Assembly or Theater": ["temp_mean", "wind_speed_mean", "rain_sum"],
    "Affiliate": ["temp_mean", "CDD_sum", "rain_sum"],
    "Parking Structure": [],
    "Infrastructure": [],
    "Container": [],
    "Mixed": ["temp_mean", "CDD_sum", "humidity_mean"],
    "Other": ["temp_mean", "CDD_sum"],
}
343
+
344
def infer_building_type(text: str) -> str:
    """Infer a building's operation type from a free-text description.

    Keyword groups are checked in declaration order; the first group with a
    case-insensitive substring hit wins. Falls back to "Mixed" when no
    keyword matches.
    """
    keyword_map = {
        "Instructional": ["Teaching", "Classroom", "School", "Lecture Hall", "Academic", "Education"],
        "Residential": ["Residential", "Apartment", "Dormitory", "Housing", "Student"],
        "Office": ["Office", "Office Building", "Administrative", "Admin"],
        "Health": ["Hospital", "Medical", "Clinic", "Health"],
        "Dining": ["Canteen", "Restaurant", "Dining", "Food", "Kitchen"],
        "Recreation": ["Fitness", "Sports", "Entertainment", "Recreation", "Gym"],
        "Library": ["Library"],
        "Assembly or Theater": ["Theater", "Auditorium", "Performance", "Assembly"],
        "Affiliate": ["Affiliate"],
    }

    haystack = text.lower()
    for building_type, keywords in keyword_map.items():
        for keyword in keywords:
            if keyword.lower() in haystack:
                return building_type
    return "Mixed"
363
+
364
def construct_weather_prompt_static(
    user_description: str, detected_type: str, suggested_vars: list,
    gross_area: float, avg_space_sqft: float, workpoint_count: int, floor_count: int
) -> str:
    """Build the static weather-variable-selection prompt for the LLM.

    Combines the user's building description, the rule-inferred operation
    type, structural stats, and the rule-based suggested variables into a
    single prompt asking the LLM for 3–5 weather variables plus a markdown
    justification table. Scenario z-score offsets are applied downstream,
    not by the LLM.
    """
    scenario_note = """
Weather-Scenario (z-score offset, user will pick one):
• Normal → 0 σ offset (historical monthly mean)
• Hot → +1 σ on temp_mean & CDD_sum, −0.5 σ on humidity_mean
• ColdWet → −1 σ on temp_mean, +1 σ on HDD_sum & humidity_mean
• WindyCloudy → +1 σ on wind_speed_mean & clouds_all_mean
LLM only needs to recommend variables; offsets are applied downstream.
"""

    return f"""
You are an expert in building energy modeling and changepoint detection.

{scenario_note}

Building Description (current use only):
{user_description}

Inferred Operation Type: {detected_type}

Structural Information:
- Building Gross Area: {gross_area:,.0f} sq ft
- Average Space Size: {avg_space_sqft:,.0f} sq ft
- Total Workpoint Count: {workpoint_count}
- Floor Count: {floor_count}

Rule-based Suggested Variables: {', '.join(suggested_vars)}

Candidate Weather Variables:
temp_mean · temp_std · HDD_sum · CDD_sum · rain_sum · clouds_all_mean · humidity_mean · wind_speed_mean

Tasks:
1. Select 3–5 variables that best capture energy-use behaviour under the current configuration.
2. Briefly justify each choice.
3. Return a markdown table with columns: Selected Variable | Reason.
"""
404
+
405
def chat_with_ollama(messages: list, model: str = "mistral") -> str:
    """Send a chat request to a local Ollama server and return the reply text.

    Parameters
    ----------
    messages : chat history in the Ollama/OpenAI message-dict format.
    model : Ollama model name (default "mistral").

    Returns
    -------
    The assistant reply content string.

    Raises
    ------
    Exception : on any network/HTTP failure, or when the response JSON
        lacks the expected ``message.content`` structure. The original
        cause is chained (``from e``) for debuggability.
    """
    # Local import keeps the module importable when requests is absent.
    import requests

    url = "http://localhost:11434/api/chat"
    try:
        response = requests.post(
            url,
            json={"model": model, "messages": messages, "stream": False},
            timeout=30
        )
        response.raise_for_status()
        return response.json()["message"]["content"]
    except requests.exceptions.RequestException as e:
        # FIX: chain the original exception so the root cause is preserved.
        raise Exception(f"Ollama API error: {str(e)}") from e
    except KeyError as e:
        raise Exception("Invalid response format from Ollama API") from e
422
+
423
def parse_selected_vars(md: str) -> list:
    """Extract the first-column values ("Selected Variable") from a
    markdown table returned by the LLM.

    Parameters
    ----------
    md : markdown text containing a table whose rows start with ``|``.

    Returns
    -------
    List of first-column cell values, skipping the header row and any
    separator row.

    FIX: the original only filtered the exact separator ``---``; rows like
    ``:---``, ``----`` or ``:---:`` (standard markdown alignment syntax)
    leaked into the result. Any cell made only of '-', ':' and spaces is
    now treated as a separator.
    """
    vars_ = []
    for row in md.strip().splitlines():
        # Only table rows; drop the canonical header line up front.
        if not row.startswith("|") or row.startswith("| Selected"):
            continue
        parts = row.split("|")
        if len(parts) <= 1:
            continue
        first = parts[1].strip()
        # Skip blanks and a header cell that wasn't caught above.
        if not first or first == "Selected Variable":
            continue
        # Skip alignment/separator rows such as ---, :---, :---:.
        if set(first) <= set("-: "):
            continue
        vars_.append(first)
    return vars_