Upload 4 files
- src/cp_utils.py +437 -0
- src/data_utils.py +25 -0
- src/rupture_utils.py +48 -0
- src/usage_utils.py +433 -0
src/cp_utils.py
ADDED
@@ -0,0 +1,437 @@
"""cp_utils.py

Utilities for evaluating changepoint credibility and performing
semi-/supervised classification on changepoints.
"""

from __future__ import annotations

import numpy as np
import pandas as pd
from typing import Tuple

from statsmodels.tsa.stattools import adfuller
from prophet import Prophet
from sklearn.semi_supervised import SelfTrainingClassifier
from xgboost import XGBClassifier

# 🔧 New: optional CatBoost support
try:
    from catboost import CatBoostClassifier
    CATBOOST_AVAILABLE = True
except ImportError:
    CATBOOST_AVAILABLE = False
    print("⚠️ CatBoost not installed. Install with: pip install catboost")


# 1. Score candidate changepoints for a single building: Prophet delta + structural metrics
def _validate_cp_metrics(residual: pd.Series, idx: int, win: int = 6
                         ) -> Tuple[bool, float, float]:
    """Compute (slope, adf_p) within the idx ± win window and decide whether
    the changepoint is significant."""
    lo = max(0, idx - win)
    hi = min(len(residual), idx + win + 1)
    seg = residual.iloc[lo:hi].dropna()
    if seg.size < 4:
        return False, np.nan, np.nan
    slope = np.polyfit(range(len(seg)), seg, 1)[0]
    p_val = adfuller(seg)[1]
    is_valid = (abs(slope) > 0.1) and (p_val > 0.05)
    return is_valid, float(slope), float(p_val)


def building_score_changepoints(
    summary_df: pd.DataFrame,
    filled_df: pd.DataFrame,
    building_name: str,
    model: str = "rbf",
    penalty: float | None = None,
    window_size: int = 6,
    usage_col: str = "FilledUse",
    date_col: str = "Date",
    cp_df: pd.DataFrame | None = None,
) -> pd.DataFrame:
    """Return a table with Prophet delta & structure metrics for one building."""
    from rupture_utils import detect_changepoints

    records: list[dict] = []
    utilities = (
        summary_df.loc[summary_df["BuildingName"] == building_name,
                       "CommodityCode"].unique()
    )

    for util in utilities:
        df_util = (
            filled_df[
                (filled_df["BuildingName"] == building_name)
                & (filled_df["CommodityCode"] == util)
            ]
            .sort_values(date_col)
            .reset_index(drop=True)
        )
        if df_util[usage_col].isna().any() or len(df_util) < 24:
            continue

        # 🔧 Fix: detect changepoints per utility unless the caller supplied
        # cp_df; a local variable avoids reusing the first utility's
        # changepoints for every later utility.
        cp_util = cp_df
        if cp_util is None:
            cp_util = detect_changepoints(
                df_util[[date_col, usage_col]].rename(
                    columns={date_col: "timestamp", usage_col: "value"}),
                algo="pelt",
                model=model,
                pen=penalty or 1.0,
            )
        if cp_util.empty or cp_util["changepoint"].sum() == 0:
            continue

        df_p = df_util[[date_col, usage_col]].rename(
            columns={date_col: "ds", usage_col: "y"})
        m_tmp = Prophet(yearly_seasonality=False, weekly_seasonality=False,
                        daily_seasonality=False)
        m_tmp.fit(df_p)
        residual = df_p["y"] - m_tmp.predict(df_p)["yhat"]

        validated_dates: list[pd.Timestamp] = []
        metrics_map: dict[pd.Timestamp, Tuple[float, float]] = {}
        for d in pd.to_datetime(cp_util.loc[cp_util["changepoint"] == 1,
                                            "timestamp"]):
            if d not in df_util[date_col].values:
                continue
            idx = df_util.index[df_util[date_col] == d][0]
            is_valid, slope, p_val = _validate_cp_metrics(
                residual, idx, win=window_size
            )
            if is_valid:
                validated_dates.append(d)
                metrics_map[d] = (slope, p_val)

        if not validated_dates:
            continue

        m = Prophet(changepoints=validated_dates, yearly_seasonality=False)
        m.fit(df_p)
        deltas = m.params["delta"].mean(axis=0)
        for cp, delta in zip(m.changepoints, deltas):
            d = pd.to_datetime(cp)
            slope, p_val = metrics_map.get(d, (np.nan, np.nan))
            records.append(
                {
                    "Building Name": building_name,
                    "CommodityCode": util,
                    "Changepoint Date": d,
                    "ProphetDelta": float(delta),
                    "slope": float(slope),
                    "adf_p_value": float(p_val),
                }
            )

    result_df = pd.DataFrame(records)
    if not result_df.empty:
        result_df["AbsDelta"] = result_df["ProphetDelta"].abs()
    return result_df


# 2. Pseudo-labeling
def label_changepoints_by_structure_signal(
    df: pd.DataFrame,
    slope_thresh: float = 0.1,
    p_thresh: float = 0.05,
) -> pd.DataFrame:
    """Assign pseudo-labels Real / Noise / Unknown based on structure
    signals."""
    def _assign(row):
        s, p = row["slope"], row["adf_p_value"]
        if pd.isna(s) or pd.isna(p):
            return "Unknown"
        if (abs(s) > slope_thresh) and (p > p_thresh):
            return "Real"
        if (abs(s) < slope_thresh * 0.5) and (p < p_thresh * 0.5):
            return "Noise"
        return "Unknown"
    out = df.copy()
    out["Label"] = out.apply(_assign, axis=1)
    return out


# 3. Time-series derived features
def extract_changepoint_features(
    cp_df: pd.DataFrame,
    filled_df: pd.DataFrame,
    usage_col: str = "FilledUse",
    date_col: str = "Date",
    mean_win: int = 6,
) -> pd.DataFrame:
    """Derive mean diff/ratio and temporal context features for each
    changepoint, and merge the holidaycount feature if present."""
    cp_df = cp_df.copy()

    # 🔧 Fix: Ensure proper data types for TimeIndex and Season columns
    cp_df["TimeIndex"] = cp_df["Changepoint Date"].dt.month.astype('int64')

    # 🔧 Fix: Encode Season as integer codes to avoid string/numeric
    # dtype conflicts
    season_mapping = {
        6: 0, 7: 0, 8: 0,    # Summer = 0
        12: 1, 1: 1, 2: 1,   # Winter = 1
    }
    # Other = 2
    season_col = cp_df["TimeIndex"].map(season_mapping).fillna(2)
    cp_df["Season"] = season_col.astype('int64')

    min_dates = filled_df.groupby("BuildingName")[date_col].min().to_dict()
    for i, row in cp_df.iterrows():
        bld = row["Building Name"]
        cp_date = row["Changepoint Date"]
        df_bld = (
            filled_df[filled_df["BuildingName"] == bld]
            .sort_values(date_col)
            .reset_index(drop=True)
        )
        if cp_date not in df_bld[date_col].values:
            continue
        idx = df_bld.index[df_bld[date_col] == cp_date][0]
        before_vals = df_bld[usage_col].iloc[max(0, idx - mean_win): idx]
        after_vals = df_bld[usage_col].iloc[idx + 1: idx + mean_win + 1]
        before_mean = before_vals.mean() if len(before_vals) else np.nan
        after_mean = after_vals.mean() if len(after_vals) else np.nan
        diff = after_mean - before_mean if np.isfinite(before_mean) and np.isfinite(after_mean) else np.nan
        ratio = after_mean / before_mean if np.isfinite(before_mean) and before_mean != 0 else np.nan
        cp_df.at[i, "ΔMeanBefore"] = before_mean
        cp_df.at[i, "ΔMeanAfter"] = after_mean
        cp_df.at[i, "ΔMeanDiff"] = diff
        cp_df.at[i, "ΔMeanRatio"] = ratio
        cp_df.at[i, "TimeSinceStart"] = (cp_date - min_dates.get(bld, cp_date)).days

    # 🔧 Fix: Ensure all numeric columns have consistent dtypes
    numeric_cols = ["ΔMeanBefore", "ΔMeanAfter", "ΔMeanDiff", "ΔMeanRatio",
                    "TimeSinceStart", "TimeIndex", "Season"]
    for col in numeric_cols:
        if col in cp_df.columns:
            cp_df[col] = pd.to_numeric(cp_df[col], errors='coerce')

    if "holidaycount" in filled_df.columns:
        # Keep only the columns needed for the merge, avoiding duplicates
        holiday_df = filled_df[["BuildingName", date_col, "holidaycount"]].drop_duplicates()
        holiday_df = holiday_df.rename(
            columns={"BuildingName": "Building Name", date_col: "Changepoint Date"}
        )
        # 🔧 Fix: Ensure holidaycount is numeric
        holiday_df["holidaycount"] = pd.to_numeric(
            holiday_df["holidaycount"], errors='coerce'
        ).fillna(0)

        cp_df = cp_df.merge(
            holiday_df, on=["Building Name", "Changepoint Date"], how="left"
        )
        # Fill any missing holidaycount values with 0
        cp_df["holidaycount"] = cp_df["holidaycount"].fillna(0)

    return cp_df


# 4. Semi-supervised model (self-training XGBoost)
def run_semi_supervised_cp_model(
    base_df: pd.DataFrame,
    k_best: int = 10,
    feature_cols: list[str] | None = None,
    xgb_params: dict | None = None,
) -> Tuple[pd.DataFrame, dict]:
    """Return preds_df (with Predicted) and simple stats."""
    if feature_cols is None:
        feature_cols = [
            "AbsDelta",
            "slope",
            "ΔMeanDiff",
            "ΔMeanRatio",
            "TimeSinceStart",
            "holidaycount",
        ]
    if xgb_params is None:
        xgb_params = {
            "max_depth": 3,
            "learning_rate": 0.1,
            "n_estimators": 200,
            "subsample": 0.8,
            "colsample_bytree": 0.8,
            "objective": "binary:logistic",
            "eval_metric": "logloss",
            "verbosity": 0,
        }
    df = base_df.copy()
    y = np.full(len(df), -1, dtype=int)
    y[df["Label"] == "Real"] = 1
    y[df["Label"] == "Noise"] = 0

    X = df[feature_cols].fillna(0).values
    base_clf = XGBClassifier(**xgb_params)
    clf = SelfTrainingClassifier(
        base_estimator=base_clf,
        criterion="k_best",
        k_best=k_best
    )

    unique_labels_in_y = np.unique(y[y != -1])
    if 0 < len(unique_labels_in_y) < 2:
        print(f"Initial pseudo-labels only contain one class: "
              f"{unique_labels_in_y}. Self-training may not be effective "
              f"or may fail. Predictions might be skewed or based on "
              f"initial labels only.")
        try:
            clf.fit(X, y)
            trans = clf.transduction_
        except ValueError as e:
            print(f"SelfTrainingClassifier.fit error: {e}. This might be "
                  f"due to homogeneous initial labels (e.g., all 'Real' "
                  f"or all 'Noise').")
            trans = y.copy()
    elif len(unique_labels_in_y) == 0:
        print("No initial pseudo-labels (Real/Noise) found. "
              "Self-training cannot proceed. All will be 'Unknown'.")
        trans = y
    else:
        clf.fit(X, y)
        trans = clf.transduction_

    # 🔧 Fix: More robust dtype handling than np.select.
    # Ensure trans is an integer array and map labels via pandas.
    trans = np.asarray(trans, dtype=int)

    label_map = {1: "Real", 0: "Noise", -1: "Unknown"}
    trans_series = pd.Series(trans)
    predicted_labels = trans_series.map(label_map).fillna("Unknown")

    # Assign to the dataframe with an explicit dtype
    df = df.copy()  # ensure we work with a clean copy
    df["Predicted"] = predicted_labels.astype(str)

    stats = df["Predicted"].value_counts(dropna=False).to_dict()
    stats["k_best"] = k_best
    return df, stats


# 🔧 New: CatBoost version of the semi-supervised model
def run_semi_supervised_cp_model_catboost(
    base_df: pd.DataFrame,
    k_best: int = 10,
    feature_cols: list[str] | None = None,
    catboost_params: dict | None = None,
) -> Tuple[pd.DataFrame, dict]:
    """
    CatBoost version of the semi-supervised changepoint classifier.

    Args:
        base_df: DataFrame containing features and labels
        k_best: k_best parameter of SelfTrainingClassifier
        feature_cols: list of feature column names
        catboost_params: CatBoost parameter dict

    Returns:
        Prediction DataFrame and summary statistics
    """
    if not CATBOOST_AVAILABLE:
        raise ImportError("CatBoost not available. Install with: "
                          "pip install catboost")

    if feature_cols is None:
        feature_cols = [
            "AbsDelta",
            "slope",
            "ΔMeanDiff",
            "ΔMeanRatio",
            "TimeSinceStart",
            "holidaycount",
        ]

    if catboost_params is None:
        catboost_params = {
            "depth": 3,
            "learning_rate": 0.1,
            "iterations": 200,
            "colsample_bylevel": 0.8,
            "loss_function": "Logloss",
            "eval_metric": "Logloss",
            "verbose": False,
            "allow_writing_files": False,
            "bootstrap_type": "Bayesian",  # Better for small samples
        }

    df = base_df.copy()
    y = np.full(len(df), -1, dtype=int)
    y[df["Label"] == "Real"] = 1
    y[df["Label"] == "Noise"] = 0

    X = df[feature_cols].fillna(0).values

    # 🎯 Use CatBoost instead of XGBoost
    base_clf = CatBoostClassifier(**catboost_params)
    clf = SelfTrainingClassifier(
        base_estimator=base_clf,
        criterion="k_best",
        k_best=k_best
    )

    unique_labels_in_y = np.unique(y[y != -1])
    if 0 < len(unique_labels_in_y) < 2:
        print(f"Initial pseudo-labels only contain one class: "
              f"{unique_labels_in_y}. Self-training may not be effective "
              f"or may fail. Predictions might be skewed or based on "
              f"initial labels only.")
        try:
            clf.fit(X, y)
            trans = clf.transduction_
        except ValueError as e:
            print(f"SelfTrainingClassifier.fit error: {e}. This might be "
                  f"due to homogeneous initial labels (e.g., all 'Real' "
                  f"or all 'Noise').")
            trans = y.copy()
    elif len(unique_labels_in_y) == 0:
        print("No initial pseudo-labels (Real/Noise) found. "
              "Self-training cannot proceed. All will be 'Unknown'.")
        trans = y
    else:
        clf.fit(X, y)
        trans = clf.transduction_

    # 🔧 Fix: Same robust dtype handling as the XGBoost version
    trans = np.asarray(trans, dtype=int)

    # Use pandas map for safer type handling
    label_map = {1: "Real", 0: "Noise", -1: "Unknown"}
    trans_series = pd.Series(trans)
    predicted_labels = trans_series.map(label_map).fillna("Unknown")

    # Assign to the dataframe with an explicit dtype
    df = df.copy()
    df["Predicted"] = predicted_labels.astype(str)

    stats = df["Predicted"].value_counts(dropna=False).to_dict()
    stats["k_best"] = k_best
    stats["model_type"] = "CatBoost"
    return df, stats


# 🔧 New: unified model-selection entry point
def run_semi_supervised_cp_model_unified(
    base_df: pd.DataFrame,
    k_best: int = 10,
    feature_cols: list[str] | None = None,
    model_type: str = "xgboost",
    model_params: dict | None = None,
) -> Tuple[pd.DataFrame, dict]:
    """
    Unified interface for the semi-supervised changepoint classifier;
    supports XGBoost and CatBoost.

    Args:
        base_df: DataFrame containing features and labels
        k_best: k_best parameter of SelfTrainingClassifier
        feature_cols: list of feature column names
        model_type: "xgboost" or "catboost"
        model_params: model parameter dict

    Returns:
        Prediction DataFrame and summary statistics
    """
    if model_type.lower() == "catboost":
        return run_semi_supervised_cp_model_catboost(
            base_df, k_best, feature_cols, model_params
        )
    elif model_type.lower() == "xgboost":
        return run_semi_supervised_cp_model(
            base_df, k_best, feature_cols, model_params
        )
    else:
        raise ValueError(f"Unsupported model_type: {model_type}. "
                         f"Choose from 'xgboost' or 'catboost'")
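A minimal end-to-end sketch of how these functions chain together (not part of the commit; summary_df and filled_df are assumed to come from usage_utils, and "Example Hall" is a hypothetical building with at least 24 filled months):

    from cp_utils import (
        building_score_changepoints,
        label_changepoints_by_structure_signal,
        extract_changepoint_features,
        run_semi_supervised_cp_model_unified,
    )

    # 1) Score candidate changepoints for one building
    scored = building_score_changepoints(summary_df, filled_df, "Example Hall")
    # 2) Pseudo-label by structural signals (slope + ADF p-value)
    labeled = label_changepoints_by_structure_signal(scored)
    # 3) Derive mean-shift and temporal-context features
    feats = extract_changepoint_features(labeled, filled_df)
    # 4) Semi-supervised classification; swap model_type="catboost" if installed
    preds, stats = run_semi_supervised_cp_model_unified(feats, model_type="xgboost")
    print(stats)  # e.g. {'Real': 3, 'Noise': 1, 'Unknown': 2, 'k_best': 10}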
src/data_utils.py
ADDED
@@ -0,0 +1,25 @@
import pandas as pd

def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Standardize column names: strip spaces, underscores, and invisible characters.
    """
    df.columns = [c.strip().replace("\xa0", "").replace(" ", "").replace("_", "") for c in df.columns]
    return df

def load_file(file) -> pd.DataFrame:
    if file.name.endswith(".csv"):
        df = pd.read_csv(file)
    elif file.name.endswith(".xlsx"):
        df = pd.read_excel(file)
    else:
        raise ValueError("File type not supported, only CSV or Excel are accepted")
    return standardize_columns(df)

def recommend_buildings(building_list, query, scorer, limit, threshold, index_builder, fuzzy_engine):
    if not query:
        return []
    idx_map = index_builder(building_list)
    keys = list(idx_map.keys())
    matches = fuzzy_engine(query.lower(), keys, scorer=scorer, limit=limit)
    return [idx_map[k] for k, score, _ in matches if score >= threshold]
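A usage sketch for recommend_buildings, assuming rapidfuzz as the injected fuzzy backend (any engine returning (match, score, index) tuples works; build_index is a hypothetical helper):

    from rapidfuzz import fuzz, process
    from data_utils import recommend_buildings

    def build_index(buildings):
        # lower-cased name -> original name
        return {b.lower(): b for b in buildings}

    buildings = ["Smith Hall", "Science Library", "South Dining Commons"]
    hits = recommend_buildings(
        buildings,
        query="sci lib",
        scorer=fuzz.WRatio,
        limit=5,
        threshold=60,
        index_builder=build_index,
        fuzzy_engine=process.extract,  # yields (match, score, index) tuples
    )
    print(hits)  # e.g. ['Science Library']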
src/rupture_utils.py
ADDED
@@ -0,0 +1,48 @@
from sklearn.preprocessing import StandardScaler
import ruptures as rpt
import pandas as pd
import streamlit as st

def detect_changepoints(
    df: pd.DataFrame,
    algo: str = "pelt",
    model: str = "rbf",
    pen: float = 1.0,
) -> pd.DataFrame:
    # 0. Make sure the index starts at 0 so positional assignment below is safe
    df = df.reset_index(drop=True)
    y = df["value"].values  # raw series

    # Step 1: standardize (strongly recommended for rbf/l2)
    if model in ["rbf", "l2", "normal"]:
        # Standardize first, then keep the 2-D shape (n_samples, n_features)
        y_scaled = StandardScaler().fit_transform(y.reshape(-1, 1))
        X = y_scaled  # shape (n, 1)
    else:
        X = y  # a 1-D array is fine

    # Step 2: detect changepoints
    if algo == "pelt":
        algo_obj = rpt.Pelt(model=model).fit(X)
        result = algo_obj.predict(pen=pen)
    elif algo == "window":
        algo_obj = rpt.Window(width=10, model=model).fit(X)
        result = algo_obj.predict(n_bkps=5)
    else:
        raise ValueError("Unknown algo")

    # Step 3: mark changepoints
    df_out = df.copy()
    df_out["changepoint"] = 0
    for idx in result[:-1]:  # the last index is the end of the series
        # ruptures returns idx as the (1-based) end position of a segment;
        # write at the corresponding 0-based position
        if idx - 1 < len(df_out):
            df_out.loc[idx - 1, "changepoint"] = 1

    st.write("Changepoint Index:", df_out[df_out["changepoint"] == 1])

    print("Input sequence:", df["value"].values)
    print("Changepoint detected:", result)

    return df_out
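A quick self-contained check on a synthetic step series (st.write only logs a bare-mode warning when run outside a Streamlit app; with this seed the break should be flagged near the end of 2021):

    import numpy as np
    import pandas as pd
    from rupture_utils import detect_changepoints

    rng = np.random.default_rng(0)
    dates = pd.date_range("2020-01-01", periods=48, freq="MS")
    # Mean shifts from 10 to 20 at month 24
    values = np.r_[rng.normal(10, 1, 24), rng.normal(20, 1, 24)]
    demo = pd.DataFrame({"timestamp": dates, "value": values})

    out = detect_changepoints(demo, algo="pelt", model="rbf", pen=3.0)
    print(out.loc[out["changepoint"] == 1, "timestamp"])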
src/usage_utils.py
ADDED
@@ -0,0 +1,433 @@
import pandas as pd
import numpy as np


def analyze_and_fill_usage(
    usage_data: pd.DataFrame,
    gap_threshold: int = 62,
    fill_earliest_cutoff: str = "2013-01-01",
    min_fill_gap_months: int = 9,
    rolling_window_size: int = 4,
    rolling_centered: bool = True,
    sequence_fill_method: str = "mean",
    post_missing_threshold: float = 0.1,
) -> pd.DataFrame:
    """
    Analyze the missing-data pattern of usage_data and compute FillStartDate
    and NotGonnaUse. Returns summary_df with summary statistics only; no
    time-series filling is done here.
    """
    df = usage_data.copy()
    df["StartDate"] = pd.to_datetime(df["StartDate"])
    df["EndDate"] = pd.to_datetime(df["EndDate"]) - pd.Timedelta(days=2)

    # 🔧 Fix: auto-detect the building column name; both formats are accepted
    building_col = None
    if "BuildingName" in df.columns:
        building_col = "BuildingName"
    elif "Building Name" in df.columns:
        building_col = "Building Name"
        # Create the standardized column name for downstream use
        df["BuildingName"] = df["Building Name"]
    else:
        raise ValueError("Neither 'BuildingName' nor 'Building Name' column found in usage data")

    records = []
    # 🔧 Fix: always group by BuildingName for consistency
    for (bld, util), grp in df.groupby(["BuildingName", "CommodityCode"]):
        # Full monthly index
        start = grp["StartDate"].min().replace(day=1)
        end = grp["StartDate"].max().replace(day=1)
        full_idx = pd.date_range(start, end, freq="MS")

        flag = pd.Series(0, index=full_idx)
        flag.loc[grp["StartDate"].dt.to_period("M").dt.to_timestamp()] = 1
        missing = flag[flag == 0].index

        # Count random vs. sequence missing months
        seq_ranges, rand_dates = [], []
        rand_months = seq_months = 0
        if missing.empty:
            mtype = "No Missing"
        else:
            gaps = missing.to_series().diff().dt.days.fillna(9999)
            gid = (gaps > gap_threshold).cumsum()
            for _, seg in missing.to_series().groupby(gid):
                if len(seg) > 1:
                    seq_ranges.append(
                        f"From {seg.min().strftime('%Y-%m')} to {seg.max().strftime('%Y-%m')}"
                    )
                    seq_months += len(seg)
                else:
                    rand_dates.append(seg.iloc[0].strftime("%Y-%m"))
                    rand_months += 1
            has_rand, has_seq = bool(rand_dates), bool(seq_ranges)
            if has_rand and has_seq:
                mtype = "Both"
            elif has_rand:
                mtype = "Random"
            else:
                mtype = "Sequence"

        records.append(
            {
                "BuildingName": bld,
                "CommodityCode": util,
                "MissingType": mtype,
                "SequenceMissingRanges": "; ".join(seq_ranges),
                "RandomMissingDates": "; ".join(rand_dates),
                "TotalMonths": len(full_idx),
                "RandomMissingMonths": rand_months,
                "SequenceMissingMonths": seq_months,
                "RandomMissingRatio": rand_months / len(full_idx)
                if full_idx.size
                else 0,
                "SequenceMissingRatio": seq_months / len(full_idx)
                if full_idx.size
                else 0,
            }
        )

    summary_df = pd.DataFrame(records)

    # -------------------------------------------------------------
    # Compute FillStartDate
    # -------------------------------------------------------------
    cutoff_dt = pd.to_datetime(fill_earliest_cutoff)

    def get_fill_start(r):
        """
        Compute FillStartDate using the clarified strategy:
        1. Regardless of sequence gaps before 2013, check whether a sequence
           gap of >= min_fill_gap_months (9) months exists after 2013.
        2. If such a gap exists after 2013 -> return the month after it ends.
        3. If no such gap exists after 2013 -> return 2013-01-01.
        """
        try:
            # Parse every sequence-missing range
            seq_ranges = []
            seq_missing_ranges = r.get("SequenceMissingRanges", "")

            if not seq_missing_ranges or pd.isna(seq_missing_ranges):
                # No sequence gaps at all; start from 2013
                return cutoff_dt

            for rng in str(seq_missing_ranges).split("; "):
                if "to" not in rng:
                    continue
                try:
                    s, e = rng.replace("From ", "").split(" to ")
                    sd, ed = pd.to_datetime(s), pd.to_datetime(e)
                    gap = (ed.to_period("M") - sd.to_period("M")).n + 1
                    seq_ranges.append((sd, ed, gap))
                except Exception:
                    # Skip ranges that cannot be parsed and keep going
                    continue

            if not seq_ranges:
                # No valid sequence gaps; start from 2013
                return cutoff_dt

            # 🔍 Key: only consider sequence gaps starting on/after 2013-01-01
            # 🔧 Fix: make sure min_fill_gap_months is referenced correctly
            post_2013_missing = [
                (sd, ed, gap) for sd, ed, gap in seq_ranges
                if sd >= cutoff_dt and gap >= min_fill_gap_months
            ]

            # 🔍 A gap of >= min_fill_gap_months months exists after 2013
            if post_2013_missing:
                # Sort by start date and take the first qualifying gap
                post_2013_missing.sort(key=lambda x: x[0])
                sd, ed, gap = post_2013_missing[0]
                return ed + pd.offsets.MonthBegin(1)

            # 🔍 No qualifying gap after 2013; start from 2013
            return cutoff_dt

        except Exception as e:
            # 🔧 Key fix: if any parsing step fails, always return the default
            # cutoff_dt so get_fill_start never raises and pandas.apply never
            # yields NaT. Log the exception for debugging.
            import traceback
            print(f"get_fill_start exception for {r.get('BuildingName', 'Unknown')}: {e}")
            print(f"Traceback: {traceback.format_exc()}")
            return cutoff_dt

    summary_df["FillStartDate"] = summary_df.apply(get_fill_start, axis=1)

    # -------------------------------------------------------------
    # Compute post-fill missing ratios
    # -------------------------------------------------------------
    post_recs = []
    for _, r in summary_df.iterrows():
        bld, util, fsd = r["BuildingName"], r["CommodityCode"], r["FillStartDate"]
        if pd.isna(fsd):
            continue

        grp2 = df[
            (df["BuildingName"] == bld)
            & (df["CommodityCode"] == util)
            & (df["StartDate"] >= fsd)
        ]
        if grp2.empty:
            continue

        idx2 = pd.date_range(
            fsd.replace(day=1), grp2["StartDate"].max().replace(day=1), freq="MS"
        )
        flag2 = pd.Series(0, index=idx2)
        flag2.loc[grp2["StartDate"].dt.to_period("M").dt.to_timestamp()] = 1
        miss2 = flag2[flag2 == 0].index

        gaps2 = pd.Series(miss2).diff().dt.days.fillna(9999)
        gid2 = (gaps2 > gap_threshold).cumsum()
        rm2 = sm2 = 0
        for _, seg2 in pd.Series(miss2).groupby(gid2):
            if len(seg2) > 1:
                sm2 += len(seg2)
            else:
                rm2 += 1

        post_recs.append(
            {
                "BuildingName": bld,
                "CommodityCode": util,
                "PostTotalMonths": len(idx2),
                "PostRandomMissingMonths": rm2,
                "PostSequenceMissingMonths": sm2,
                "PostRandomMissingRatio": rm2 / len(idx2) if idx2.size else 0,
                "PostSequenceMissingRatio": sm2 / len(idx2) if idx2.size else 0,
            }
        )

    post_df = pd.DataFrame(post_recs)

    # 🔧 Fix: if post_df is empty, create an empty DataFrame with all required columns
    if post_df.empty:
        post_df = pd.DataFrame(columns=[
            "BuildingName", "CommodityCode", "PostTotalMonths",
            "PostRandomMissingMonths", "PostSequenceMissingMonths",
            "PostRandomMissingRatio", "PostSequenceMissingRatio"
        ])

    summary_df = summary_df.merge(post_df, on=["BuildingName", "CommodityCode"], how="left")

    # 🔧 Fix: fill any missing post-analysis columns with defaults
    post_columns = ["PostTotalMonths", "PostRandomMissingMonths", "PostSequenceMissingMonths",
                    "PostRandomMissingRatio", "PostSequenceMissingRatio"]
    for col in post_columns:
        if col not in summary_df.columns:
            if "Ratio" in col:
                summary_df[col] = 0.0  # ratio columns default to 0
            else:
                summary_df[col] = 0  # month-count columns default to 0

    summary_df["NotGonnaUse"] = (
        (summary_df["PostRandomMissingRatio"] > post_missing_threshold)
        | (summary_df["PostSequenceMissingRatio"] > post_missing_threshold)
    ).astype(int)

    return summary_df


def fill_usage_with_sequence_check_strict_mean(
    usage_data: pd.DataFrame,
    summary_df: pd.DataFrame,
    method: str = "mean",
    force: bool = False,
    fill_earliest_cutoff: str = "1900-01-01",
) -> pd.DataFrame:
    """
    Fill usage_data according to FillStartDate / NotGonnaUse in summary_df.

    Parameters
    ----------
    usage_data : raw usage table
    summary_df : output of analyze_and_fill_usage
    method : 'mean' or 'median'
    force : if True, ignore NotGonnaUse and NaT and adjust the start date so
        that a series is always produced
    fill_earliest_cutoff : fallback start date when FillStartDate is NaT
        (used only when force=True)

    Returns
    ----------
    filled_df : ['BuildingName','CommodityCode','Date','FilledUse']
    """
    df = usage_data.copy()
    df["StartDate"] = pd.to_datetime(df["StartDate"]).dt.to_period("M").dt.to_timestamp()

    # 🔧 Fix: auto-detect the building column name; both formats are accepted
    building_col = None
    if "BuildingName" in df.columns:
        building_col = "BuildingName"
    elif "Building Name" in df.columns:
        building_col = "Building Name"
        # Create the standardized column name for downstream use
        df["BuildingName"] = df["Building Name"]
    else:
        raise ValueError("Neither 'BuildingName' nor 'Building Name' column found in usage data")

    all_records = []
    for _, row in summary_df.iterrows():
        bld, util, fsd, drop = (
            row["BuildingName"],
            row["CommodityCode"],
            row["FillStartDate"],
            row["NotGonnaUse"],
        )

        # ───────── Gates 1 & 2 ─────────
        if not force and (drop == 1 or pd.isna(fsd)):
            # Strict mode: missing ratio too high, or no valid start date -> skip
            continue
        if force and pd.isna(fsd):
            fsd = pd.to_datetime(fill_earliest_cutoff)

        # Take raw rows with StartDate >= fsd (using the standardized BuildingName column)
        grp = df[
            (df["BuildingName"] == bld)
            & (df["CommodityCode"] == util)
            & (df["StartDate"] >= fsd)
        ]

        # ───────── Gate 3 ─────────
        if grp.empty:
            if not force:
                continue
            # Force mode: fall back to the earliest month for this combination
            grp = df[(df["BuildingName"] == bld) & (df["CommodityCode"] == util)]
            if grp.empty:
                # No data exists at all for this combination
                continue
            fsd = grp["StartDate"].min()

        last_m = grp["StartDate"].max()
        all_months = pd.date_range(fsd, last_m, freq="MS")
        monthly = grp.groupby("StartDate")["Use"].sum().reindex(all_months)

        base = monthly.dropna()
        fill_val = base.median() if method == "median" else base.mean()
        filled = monthly.fillna(fill_val).reset_index()

        filled.columns = ["Date", "FilledUse"]
        filled["BuildingName"] = bld
        filled["CommodityCode"] = util
        all_records.append(filled)

    if not all_records:
        return pd.DataFrame(columns=["BuildingName", "CommodityCode", "Date", "FilledUse"])

    return pd.concat(all_records, ignore_index=True)


# ===============================================================
# LLM-based Weather Variable Selection Functions
# ===============================================================

# Building Type → Weather Variable Rule Mapping
weather_influence_map = {
    "Office": ["temp_mean", "temp_std", "CDD_sum", "clouds_all_mean"],
    "Instructional": ["temp_mean", "temp_std", "CDD_sum", "humidity_mean"],
    "Residential": ["temp_mean", "HDD_sum", "CDD_sum", "humidity_mean"],
    "Health": ["temp_mean", "HDD_sum", "CDD_sum", "humidity_mean"],
    "Library": ["temp_mean", "temp_std", "humidity_mean", "clouds_all_mean"],
    "Dining": ["temp_mean", "rain_sum", "CDD_sum"],
    "Recreation": ["temp_mean", "rain_sum", "wind_speed_mean"],
    "Assembly or Theater": ["temp_mean", "wind_speed_mean", "rain_sum"],
    "Affiliate": ["temp_mean", "CDD_sum", "rain_sum"],
    "Parking Structure": [],
    "Infrastructure": [],
    "Container": [],
    "Mixed": ["temp_mean", "CDD_sum", "humidity_mean"],
    "Other": ["temp_mean", "CDD_sum"],
}

def infer_building_type(text: str) -> str:
    """Infer the building type from a free-text description."""
    patterns = {
        "Instructional": ["Teaching", "Classroom", "School", "Lecture Hall", "Academic", "Education"],
        "Residential": ["Residential", "Apartment", "Dormitory", "Housing", "Student"],
        "Office": ["Office", "Office Building", "Administrative", "Admin"],
        "Health": ["Hospital", "Medical", "Clinic", "Health"],
        "Dining": ["Canteen", "Restaurant", "Dining", "Food", "Kitchen"],
        "Recreation": ["Fitness", "Sports", "Entertainment", "Recreation", "Gym"],
        "Library": ["Library"],
        "Assembly or Theater": ["Theater", "Auditorium", "Performance", "Assembly"],
        "Affiliate": ["Affiliate"],
    }

    text_lower = text.lower()
    for btype, keywords in patterns.items():
        if any(k.lower() in text_lower for k in keywords):
            return btype
    return "Mixed"

def construct_weather_prompt_static(
    user_description: str, detected_type: str, suggested_vars: list,
    gross_area: float, avg_space_sqft: float, workpoint_count: int, floor_count: int
) -> str:
    """Build the static weather-variable selection prompt."""
    scenario_note = """
Weather-Scenario (z-score offset, user will pick one):
• Normal → 0 σ offset (historical monthly mean)
• Hot → +1 σ on temp_mean & CDD_sum, −0.5 σ on humidity_mean
• ColdWet → −1 σ on temp_mean, +1 σ on HDD_sum & humidity_mean
• WindyCloudy → +1 σ on wind_speed_mean & clouds_all_mean
LLM only needs to recommend variables; offsets are applied downstream.
"""

    return f"""
You are an expert in building energy modeling and changepoint detection.

{scenario_note}

Building Description (current use only):
{user_description}

Inferred Operation Type: {detected_type}

Structural Information:
- Building Gross Area: {gross_area:,.0f} sq ft
- Average Space Size: {avg_space_sqft:,.0f} sq ft
- Total Workpoint Count: {workpoint_count}
- Floor Count: {floor_count}

Rule-based Suggested Variables: {', '.join(suggested_vars)}

Candidate Weather Variables:
temp_mean · temp_std · HDD_sum · CDD_sum · rain_sum · clouds_all_mean · humidity_mean · wind_speed_mean

Tasks:
1. Select 3–5 variables that best capture energy-use behaviour under the current configuration.
2. Briefly justify each choice.
3. Return a markdown table with columns: Selected Variable | Reason.
"""

def chat_with_ollama(messages: list, model: str = "mistral") -> str:
    """Chat with the Ollama API."""
    import requests

    url = "http://localhost:11434/api/chat"
    try:
        response = requests.post(
            url,
            json={"model": model, "messages": messages, "stream": False},
            timeout=30
        )
        response.raise_for_status()
        return response.json()["message"]["content"]
    except requests.exceptions.RequestException as e:
        raise Exception(f"Ollama API error: {str(e)}")
    except KeyError:
        raise Exception("Invalid response format from Ollama API")

def parse_selected_vars(md: str) -> list:
    """Parse the selected-variable list out of a markdown table."""
    vars_ = []
    for row in md.strip().splitlines():
        if row.startswith("|") and not row.startswith("| Selected"):
            parts = row.split("|")
            if len(parts) > 1:
                first = parts[1].strip()
                if first and first != '---' and first not in ["Selected Variable", ""]:
                    vars_.append(first)
    return vars_
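A sketch of the LLM weather-variable flow, assuming a local Ollama server is running with the mistral model pulled; the description and structural numbers below are hypothetical:

    from usage_utils import (
        infer_building_type,
        weather_influence_map,
        construct_weather_prompt_static,
        chat_with_ollama,
        parse_selected_vars,
    )

    desc = "Four-storey academic building with lecture halls and offices"
    btype = infer_building_type(desc)  # -> "Instructional"
    prompt = construct_weather_prompt_static(
        desc, btype, weather_influence_map[btype],
        gross_area=120_000, avg_space_sqft=800,
        workpoint_count=350, floor_count=4,
    )
    reply = chat_with_ollama([{"role": "user", "content": prompt}])
    print(parse_selected_vars(reply))  # e.g. ['temp_mean', 'CDD_sum', ...]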