cameroncameron commited on
Commit
6782585
·
verified ·
1 Parent(s): ec514ce

Upload 4 files

Browse files
Files changed (4) hide show
  1. src/cp_utils.py +437 -0
  2. src/data_utils.py +25 -0
  3. src/rupture_utils.py +48 -0
  4. src/usage_utils.py +433 -0
src/cp_utils.py ADDED
@@ -0,0 +1,437 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """cp_utils.py
2
+ Utilities for evaluating changepoint credibility and performing
3
+ semi-/supervised classification on changepoints.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+ from typing import Tuple
11
+
12
+ from statsmodels.tsa.stattools import adfuller
13
+ from prophet import Prophet
14
+ from sklearn.semi_supervised import SelfTrainingClassifier
15
+ from xgboost import XGBClassifier
16
# 🔧 New: optional CatBoost support
17
+ try:
18
+ from catboost import CatBoostClassifier
19
+ CATBOOST_AVAILABLE = True
20
+ except ImportError:
21
+ CATBOOST_AVAILABLE = False
22
+ print("⚠️ CatBoost not installed. Install with: pip install catboost")
23
+
24
# 1. Score one building's candidate changepoints — ProphetDelta + structural metrics
25
def _validate_cp_metrics(residual: pd.Series, idx: int, win: int = 6,
                         slope_thresh: float = 0.1, p_thresh: float = 0.05,
                         ) -> Tuple[bool, float, float]:
    """Check whether the Prophet residual around ``idx`` shows a structural break.

    Fits a linear trend and runs the ADF test on the residual window
    ``idx ± win``; the candidate is considered significant when the trend is
    steep enough AND the window looks non-stationary (high ADF p-value).

    Parameters
    ----------
    residual : Prophet residual series (y - yhat), positionally indexed.
    idx : positional index of the candidate changepoint.
    win : half-window size (rows) on each side of ``idx``.
    slope_thresh : minimum absolute trend slope to count as significant
        (previously hard-coded; default keeps the original behavior).
    p_thresh : ADF p-value above which the window is treated as
        non-stationary (previously hard-coded; default keeps the original
        behavior).

    Returns
    -------
    (is_valid, slope, adf_p_value); (False, nan, nan) when the window has
    fewer than 4 non-NaN points.
    """
    lo = max(0, idx - win)
    hi = min(len(residual), idx + win + 1)
    seg = residual.iloc[lo:hi].dropna()
    if seg.size < 4:
        # Too few points to fit a trend or run the ADF test.
        return False, np.nan, np.nan
    slope = np.polyfit(range(len(seg)), seg, 1)[0]
    # NOTE(review): adfuller may still raise for very short windows depending
    # on its default maxlag — confirm the minimum window length upstream.
    p_val = adfuller(seg)[1]
    is_valid = (abs(slope) > slope_thresh) and (p_val > p_thresh)
    return is_valid, float(slope), float(p_val)
37
+
38
def building_score_changepoints(
    summary_df: pd.DataFrame,
    filled_df: pd.DataFrame,
    building_name: str,
    model: str = "rbf",
    penalty: float | None = None,
    window_size: int = 6,
    usage_col: str = "FilledUse",
    date_col: str = "Date",
    cp_df: pd.DataFrame | None = None,
) -> pd.DataFrame:
    """Score candidate changepoints for one building.

    For each utility (CommodityCode) of ``building_name``: detect candidate
    changepoints with ruptures (unless ``cp_df`` pre-supplies them), validate
    each candidate against the Prophet residual via ``_validate_cp_metrics``,
    then refit Prophet with the validated changepoints to obtain a delta
    (trend-rate change) per changepoint.

    Parameters
    ----------
    summary_df : table with BuildingName/CommodityCode rows, used to
        enumerate the building's utilities.
    filled_df : gap-filled monthly usage table.
    building_name : building to analyse.
    model : ruptures cost model name.
    penalty : PELT penalty; falls back to 1.0 when None (and when 0 —
        kept for backward compatibility with the original ``or`` fallback).
    window_size : half-window (rows) for residual validation.
    usage_col / date_col : column names in ``filled_df``.
    cp_df : optional pre-computed changepoint table (columns ``timestamp``
        and ``changepoint``); when given it is used for every utility.

    Returns
    -------
    DataFrame with one row per validated changepoint: Building Name,
    CommodityCode, Changepoint Date, ProphetDelta, slope, adf_p_value,
    AbsDelta. Empty DataFrame when nothing validates.
    """
    from rupture_utils import detect_changepoints

    records: list[dict] = []
    utilities = (
        summary_df.loc[summary_df["BuildingName"] == building_name,
                       "CommodityCode"].unique()
    )

    for util in utilities:
        df_util = (
            filled_df[
                (filled_df["BuildingName"] == building_name)
                & (filled_df["CommodityCode"] == util)
            ]
            .sort_values(date_col)
            .reset_index(drop=True)
        )
        # Prophet needs a complete series of reasonable length.
        if df_util[usage_col].isna().any() or len(df_util) < 24:
            continue

        # BUG FIX: the original assigned the detection result back to the
        # *cp_df* parameter, so every utility after the first silently
        # reused the first utility's changepoints. Keep detection local.
        if cp_df is not None:
            cp_util = cp_df
        else:
            cp_util = detect_changepoints(
                df_util[[date_col, usage_col]].rename(
                    columns={date_col: "timestamp", usage_col: "value"}),
                algo="pelt",
                model=model,
                pen=penalty or 1.0,
            )
        if cp_util.empty or cp_util["changepoint"].sum() == 0:
            continue

        # Fit a seasonality-free Prophet model to obtain the trend residual.
        df_p = df_util[[date_col, usage_col]].rename(
            columns={date_col: "ds", usage_col: "y"})
        m_tmp = Prophet(yearly_seasonality=False, weekly_seasonality=False,
                        daily_seasonality=False)
        m_tmp.fit(df_p)
        residual = df_p["y"] - m_tmp.predict(df_p)["yhat"]

        # Keep only candidates whose residual window shows a structural break.
        validated_dates: list[pd.Timestamp] = []
        metrics_map: dict[pd.Timestamp, Tuple[float, float]] = {}
        for d in pd.to_datetime(cp_util.loc[cp_util["changepoint"] == 1,
                                            "timestamp"]):
            if d not in df_util[date_col].values:
                continue
            idx = df_util.index[df_util[date_col] == d][0]
            is_valid, slope, p_val = _validate_cp_metrics(
                residual, idx, win=window_size
            )
            if is_valid:
                validated_dates.append(d)
                metrics_map[d] = (slope, p_val)

        if not validated_dates:
            continue

        # Refit with the validated changepoints; the mean of the posterior
        # delta samples measures the trend-rate change at each changepoint.
        m = Prophet(changepoints=validated_dates, yearly_seasonality=False)
        m.fit(df_p)
        deltas = m.params["delta"].mean(axis=0)
        for cp, delta in zip(m.changepoints, deltas):
            d = pd.to_datetime(cp)
            slope, p_val = metrics_map.get(d, (np.nan, np.nan))
            records.append(
                {
                    "Building Name": building_name,
                    "CommodityCode": util,
                    "Changepoint Date": d,
                    "ProphetDelta": float(delta),
                    "slope": float(slope),
                    "adf_p_value": float(p_val),
                }
            )

    result_df = pd.DataFrame(records)
    if not result_df.empty:
        result_df["AbsDelta"] = result_df["ProphetDelta"].abs()
    return result_df
126
+
127
# 2. Pseudo-label assignment
128
+ def label_changepoints_by_structure_signal(
129
+ df: pd.DataFrame,
130
+ slope_thresh: float = 0.1,
131
+ p_thresh: float = 0.05,
132
+ ) -> pd.DataFrame:
133
+ """Assign pseudo-labels Real / Noise / Unknown based on structure
134
+ signals."""
135
+ def _assign(row):
136
+ s, p = row["slope"], row["adf_p_value"]
137
+ if pd.isna(s) or pd.isna(p):
138
+ return "Unknown"
139
+ if (abs(s) > slope_thresh) and (p > p_thresh):
140
+ return "Real"
141
+ if (abs(s) < slope_thresh * 0.5) and (p < p_thresh * 0.5):
142
+ return "Noise"
143
+ return "Unknown"
144
+ out = df.copy()
145
+ out["Label"] = out.apply(_assign, axis=1)
146
+ return out
147
+
148
# 3. Time-series derived features
149
def extract_changepoint_features(
    cp_df: pd.DataFrame,
    filled_df: pd.DataFrame,
    usage_col: str = "FilledUse",
    date_col: str = "Date",
    mean_win: int = 6,
) -> pd.DataFrame:
    """Derive mean-shift and temporal-context features for each changepoint,
    and merge in the ``holidaycount`` feature when the usage table has it.

    Parameters
    ----------
    cp_df : changepoint table with "Building Name" and "Changepoint Date"
        (datetime) columns.
    filled_df : gap-filled usage table with "BuildingName", *date_col* and
        *usage_col* columns, optionally "holidaycount".
    usage_col / date_col : column names in *filled_df*.
    mean_win : number of rows before/after the changepoint used for the
        before/after mean comparison.

    Returns
    -------
    Copy of *cp_df* extended with TimeIndex, Season, ΔMeanBefore,
    ΔMeanAfter, ΔMeanDiff, ΔMeanRatio, TimeSinceStart and (optionally)
    holidaycount columns.
    """
    cp_df = cp_df.copy()

    # Month-of-year of the changepoint; cast to int64 so the dtype is
    # stable for downstream models.
    cp_df["TimeIndex"] = cp_df["Changepoint Date"].dt.month.astype('int64')

    # Encode season as integer codes to avoid string/numeric dtype
    # conflicts downstream.
    season_mapping = {
        6: 0, 7: 0, 8: 0,  # Summer = 0
        12: 1, 1: 1, 2: 1,  # Winter = 1
    }
    # All other months = 2
    season_col = cp_df["TimeIndex"].map(season_mapping).fillna(2)
    cp_df["Season"] = season_col.astype('int64')

    # First observed month per building — basis of the TimeSinceStart feature.
    min_dates = filled_df.groupby("BuildingName")[date_col].min().to_dict()
    for i, row in cp_df.iterrows():
        bld = row["Building Name"]
        cp_date = row["Changepoint Date"]
        # NOTE(review): rows are filtered per building only — if a building
        # has several commodities their rows are mixed here; confirm intended.
        df_bld = (
            filled_df[filled_df["BuildingName"] == bld]
            .sort_values(date_col)
            .reset_index(drop=True)
        )
        if cp_date not in df_bld[date_col].values:
            continue
        idx = df_bld.index[df_bld[date_col] == cp_date][0]
        # Mean usage in the windows strictly before / strictly after the cp.
        before_vals = df_bld[usage_col].iloc[max(0, idx - mean_win): idx]
        after_vals = df_bld[usage_col].iloc[idx + 1: idx + mean_win + 1]
        before_mean = before_vals.mean() if len(before_vals) else np.nan
        after_mean = after_vals.mean() if len(after_vals) else np.nan
        diff = after_mean - before_mean if np.isfinite(before_mean) and np.isfinite(after_mean) else np.nan
        ratio = after_mean / before_mean if np.isfinite(before_mean) and before_mean != 0 else np.nan
        cp_df.at[i, "ΔMeanBefore"] = before_mean
        cp_df.at[i, "ΔMeanAfter"] = after_mean
        cp_df.at[i, "ΔMeanDiff"] = diff
        cp_df.at[i, "ΔMeanRatio"] = ratio
        cp_df.at[i, "TimeSinceStart"] = (cp_date - min_dates.get(bld, cp_date)).days

    # Normalise every derived numeric column to a consistent dtype.
    numeric_cols = ["ΔMeanBefore", "ΔMeanAfter", "ΔMeanDiff", "ΔMeanRatio",
                    "TimeSinceStart", "TimeIndex", "Season"]
    for col in numeric_cols:
        if col in cp_df.columns:
            cp_df[col] = pd.to_numeric(cp_df[col], errors='coerce')

    if "holidaycount" in filled_df.columns:
        # Keep only the merge keys + feature to avoid duplicate columns.
        holiday_df = filled_df[["BuildingName", date_col, "holidaycount"]].drop_duplicates()
        holiday_df = holiday_df.rename(
            columns={"BuildingName": "Building Name", date_col: "Changepoint Date"}
        )
        # Ensure holidaycount is numeric before merging.
        holiday_df["holidaycount"] = pd.to_numeric(
            holiday_df["holidaycount"], errors='coerce'
        ).fillna(0)

        cp_df = cp_df.merge(
            holiday_df, on=["Building Name", "Changepoint Date"], how="left"
        )
        # Fill any missing holidaycount values with 0
        cp_df["holidaycount"] = cp_df["holidaycount"].fillna(0)

    return cp_df
222
+
223
# 4. Semi-supervised model (Self-Training XGBoost)
224
def run_semi_supervised_cp_model(
    base_df: pd.DataFrame,
    k_best: int = 10,
    feature_cols: list[str] | None = None,
    xgb_params: dict | None = None,
) -> Tuple[pd.DataFrame, dict]:
    """Self-training semi-supervised changepoint classifier (XGBoost base).

    Rows pseudo-labelled "Real"/"Noise" in ``base_df["Label"]`` seed a
    SelfTrainingClassifier; remaining rows (encoded -1) receive labels via
    transduction. Degrades gracefully when the seed set is homogeneous or
    empty.

    Args:
        base_df: features plus the pseudo ``Label`` column.
        k_best: pseudo-labels promoted per self-training iteration.
        feature_cols: feature columns (default set used when None).
        xgb_params: XGBClassifier parameters (default set used when None).

    Returns:
        (copy of base_df with a ``Predicted`` column, stats dict with class
        counts plus the ``k_best`` used).
    """
    if feature_cols is None:
        feature_cols = [
            "AbsDelta",
            "slope",
            "ΔMeanDiff",
            "ΔMeanRatio",
            "TimeSinceStart",
            'holidaycount'
        ]
    if xgb_params is None:
        xgb_params = {
            "max_depth": 3,
            "learning_rate": 0.1,
            "n_estimators": 200,
            "subsample": 0.8,
            "colsample_bytree": 0.8,
            "objective": "binary:logistic",
            "eval_metric": "logloss",
            "verbosity": 0,
        }
    df = base_df.copy()
    # Encode pseudo-labels: 1 = Real, 0 = Noise, -1 = unlabelled.
    y = np.full(len(df), -1, dtype=int)
    y[df["Label"] == "Real"] = 1
    y[df["Label"] == "Noise"] = 0

    X = df[feature_cols].fillna(0).values
    base_clf = XGBClassifier(**xgb_params)
    # NOTE(review): `base_estimator` was renamed to `estimator` in
    # scikit-learn 1.2 and later removed — confirm the pinned sklearn version.
    clf = SelfTrainingClassifier(
        base_estimator=base_clf,
        criterion="k_best",
        k_best=k_best
    )

    # Self-training needs both classes among the seed labels; handle the
    # one-class and zero-class cases without crashing.
    unique_labels_in_y = np.unique(y[y != -1])
    if len(unique_labels_in_y) < 2 and len(unique_labels_in_y) > 0:
        print(f"Initial pseudo-labels only contain one class: "
              f"{unique_labels_in_y}. Self-training may not be effective "
              f"or may fail. Predictions might be skewed or based on "
              f"initial labels only.")
        try:
            clf.fit(X, y)
            trans = clf.transduction_
        except ValueError as e:
            print(f"SelfTrainingClassifier.fit error: {e}. This might be "
                  f"due to homogenous initial labels (e.g., all 'Real' "
                  f"or all 'Noise').")
            trans = y.copy()
    elif len(unique_labels_in_y) == 0:
        print("No initial pseudo-labels (Real/Noise) found. "
              "Self-training cannot proceed. All will be 'Unknown'.")
        trans = y
    else:
        clf.fit(X, y)
        trans = clf.transduction_

    # Robust dtype handling before mapping integer codes back to strings.
    trans = np.asarray(trans, dtype=int)

    # Map codes back to labels via pandas for safe dtype handling;
    # anything unexpected falls through to "Unknown".
    label_map = {1: "Real", 0: "Noise", -1: "Unknown"}

    trans_series = pd.Series(trans)
    predicted_labels = trans_series.map(label_map).fillna("Unknown")

    # NOTE(review): the assignment below aligns on index — assumes base_df
    # has a default RangeIndex; confirm callers pass reset-index frames.
    df = df.copy()  # Ensure we work with a clean copy
    df["Predicted"] = predicted_labels.astype(str)

    stats = df["Predicted"].value_counts(dropna=False).to_dict()
    stats["k_best"] = k_best
    return df, stats
305
+
306
# 🔧 New: CatBoost variant of the semi-supervised model
307
def run_semi_supervised_cp_model_catboost(
    base_df: pd.DataFrame,
    k_best: int = 10,
    feature_cols: list[str] | None = None,
    catboost_params: dict | None = None,
) -> Tuple[pd.DataFrame, dict]:
    """
    CatBoost variant of the semi-supervised changepoint classifier.

    Mirrors ``run_semi_supervised_cp_model`` but uses CatBoostClassifier as
    the base estimator for SelfTrainingClassifier.

    Args:
        base_df: DataFrame with the features and pseudo ``Label`` column.
        k_best: k_best parameter for SelfTrainingClassifier.
        feature_cols: list of feature column names (default set when None).
        catboost_params: CatBoost parameter dict (default set when None).

    Returns:
        Prediction DataFrame (with ``Predicted``) and a stats dict.

    Raises:
        ImportError: when the optional catboost dependency is missing.
    """
    if not CATBOOST_AVAILABLE:
        raise ImportError("CatBoost not available. Install with: "
                          "pip install catboost")

    if feature_cols is None:
        feature_cols = [
            "AbsDelta",
            "slope",
            "ΔMeanDiff",
            "ΔMeanRatio",
            "TimeSinceStart",
            'holidaycount'
        ]

    if catboost_params is None:
        catboost_params = {
            "depth": 3,
            "learning_rate": 0.1,
            "iterations": 200,
            "colsample_bylevel": 0.8,
            "loss_function": "Logloss",
            "eval_metric": "Logloss",
            "verbose": False,
            "allow_writing_files": False,
            "bootstrap_type": "Bayesian",  # Better for small samples
        }

    df = base_df.copy()
    # Encode pseudo-labels: 1 = Real, 0 = Noise, -1 = unlabelled.
    y = np.full(len(df), -1, dtype=int)
    y[df["Label"] == "Real"] = 1
    y[df["Label"] == "Noise"] = 0

    X = df[feature_cols].fillna(0).values

    # CatBoost replaces XGBoost as the base estimator here.
    base_clf = CatBoostClassifier(**catboost_params)
    # NOTE(review): `base_estimator` was renamed to `estimator` in
    # scikit-learn 1.2 and later removed — confirm the pinned sklearn version.
    clf = SelfTrainingClassifier(
        base_estimator=base_clf,
        criterion="k_best",
        k_best=k_best
    )

    # Degrade gracefully when the seed labels are homogeneous or empty.
    unique_labels_in_y = np.unique(y[y != -1])
    if len(unique_labels_in_y) < 2 and len(unique_labels_in_y) > 0:
        print(f"Initial pseudo-labels only contain one class: "
              f"{unique_labels_in_y}. Self-training may not be effective "
              f"or may fail. Predictions might be skewed or based on "
              f"initial labels only.")
        try:
            clf.fit(X, y)
            trans = clf.transduction_
        except ValueError as e:
            print(f"SelfTrainingClassifier.fit error: {e}. This might be "
                  f"due to homogenous initial labels (e.g., all 'Real' "
                  f"or all 'Noise').")
            trans = y.copy()
    elif len(unique_labels_in_y) == 0:
        print("No initial pseudo-labels (Real/Noise) found. "
              "Self-training cannot proceed. All will be 'Unknown'.")
        trans = y
    else:
        clf.fit(X, y)
        trans = clf.transduction_

    # Same robust dtype handling as the XGBoost version.
    trans = np.asarray(trans, dtype=int)

    # Use pandas map for safer type handling.
    label_map = {1: "Real", 0: "Noise", -1: "Unknown"}
    trans_series = pd.Series(trans)
    predicted_labels = trans_series.map(label_map).fillna("Unknown")

    # NOTE(review): assignment aligns on index — assumes base_df has a
    # default RangeIndex; confirm callers pass reset-index frames.
    df = df.copy()
    df["Predicted"] = predicted_labels.astype(str)

    stats = df["Predicted"].value_counts(dropna=False).to_dict()
    stats["k_best"] = k_best
    stats["model_type"] = "CatBoost"
    return df, stats
405
+
406
# 🔧 New: unified model-selection entry point
407
+ def run_semi_supervised_cp_model_unified(
408
+ base_df: pd.DataFrame,
409
+ k_best: int = 10,
410
+ feature_cols: list[str] | None = None,
411
+ model_type: str = "xgboost",
412
+ model_params: dict | None = None,
413
+ ) -> Tuple[pd.DataFrame, dict]:
414
+ """
415
+ 统一的半监督变点分类模型接口,支持XGBoost和CatBoost
416
+
417
+ Args:
418
+ base_df: 包含特征和标签的数据框
419
+ k_best: SelfTrainingClassifier的k_best参数
420
+ feature_cols: 特征列名列表
421
+ model_type: 模型类型,"xgboost" 或 "catboost"
422
+ model_params: 模型参数字典
423
+
424
+ Returns:
425
+ 预测结果数据框和统计信息
426
+ """
427
+ if model_type.lower() == "catboost":
428
+ return run_semi_supervised_cp_model_catboost(
429
+ base_df, k_best, feature_cols, model_params
430
+ )
431
+ elif model_type.lower() == "xgboost":
432
+ return run_semi_supervised_cp_model(
433
+ base_df, k_best, feature_cols, model_params
434
+ )
435
+ else:
436
+ raise ValueError(f"Unsupported model_type: {model_type}. "
437
+ f"Choose from 'xgboost' or 'catboost'")
src/data_utils.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize column names in place: strip edge whitespace, then remove
    NBSP characters, spaces and underscores. Returns the same DataFrame."""
    cleaned = []
    for name in df.columns:
        name = name.strip()
        for ch in ("\xa0", " ", "_"):
            name = name.replace(ch, "")
        cleaned.append(name)
    df.columns = cleaned
    return df
9
+
10
def load_file(file) -> pd.DataFrame:
    """Load an uploaded CSV/XLSX file-like object (must expose ``.name``)
    into a DataFrame with standardized column names.

    Raises ValueError for any other file extension.
    """
    filename = file.name
    if filename.endswith(".csv"):
        frame = pd.read_csv(file)
    elif filename.endswith(".xlsx"):
        frame = pd.read_excel(file)
    else:
        raise ValueError("File type not supported, only CSV or Excel are accepted")
    return standardize_columns(frame)
18
+
19
def recommend_buildings(building_list, query, scorer, limit, threshold, index_builder, fuzzy_engine):
    """Fuzzy-match *query* against the building index and return the
    buildings whose match score reaches *threshold*.

    ``index_builder`` maps the building list to a {key: building} dict;
    ``fuzzy_engine`` is called with the lower-cased query, the index keys,
    and the ``scorer``/``limit`` keywords, returning (key, score, _) tuples.
    An empty query yields an empty list.
    """
    if not query:
        return []
    lookup = index_builder(building_list)
    candidates = fuzzy_engine(query.lower(), list(lookup), scorer=scorer, limit=limit)
    results = []
    for key, score, _extra in candidates:
        if score >= threshold:
            results.append(lookup[key])
    return results
src/rupture_utils.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sklearn.preprocessing import StandardScaler
2
+ import ruptures as rpt
3
+ import pandas as pd
4
+ import streamlit as st
5
+
6
def detect_changepoints(
    df: pd.DataFrame,
    algo: str = "pelt",
    model: str = "rbf",
    pen: float = 1.0,
) -> pd.DataFrame:
    """Detect changepoints in a usage series with ``ruptures``.

    Parameters
    ----------
    df : DataFrame with at least a ``value`` column (other columns such as
        ``timestamp`` are passed through untouched).
    algo : "pelt" (penalty-based) or "window" (fixed width=10, n_bkps=5).
    model : ruptures cost model ("rbf", "l2", "normal", ...).
    pen : penalty for ``Pelt.predict`` (ignored by the "window" branch).

    Returns
    -------
    Copy of *df* with an added 0/1 ``changepoint`` column marking the last
    observation of each detected segment (the series endpoint is excluded).

    Side effects: renders the flagged rows via ``st.write`` and prints
    debug output to stdout.
    """
    # 0. Reset to a 0-based RangeIndex so the positional writes below line up.
    df = df.reset_index(drop=True)
    y = df["value"].values  # raw series

    # Step 1: standardize (strongly recommended for rbf/l2/normal costs),
    # then ensure a 2-D (n_samples, n_features) shape.
    if model in ["rbf", "l2", "normal"]:
        y_scaled = StandardScaler().fit_transform(y.reshape(-1, 1))
        X = y_scaled  # shape (n, 1)
    else:
        X = y  # a 1-D array is fine here

    # Step 2: run the chosen detection algorithm.
    if algo == "pelt":
        algo_obj = rpt.Pelt(model=model).fit(X)
        result = algo_obj.predict(pen=pen)
    elif algo == "window":
        algo_obj = rpt.Window(width=10, model=model).fit(X)
        result = algo_obj.predict(n_bkps=5)
    else:
        raise ValueError("Unknown algo")

    # Step 3: flag changepoints. ruptures returns 1-based segment end
    # positions and the final entry is always the series length, so skip it.
    df_out = df.copy()
    df_out["changepoint"] = 0
    for idx in result[:-1]:
        # idx - 1 converts the 1-based segment end to a 0-based row position.
        if idx - 1 < len(df_out):
            df_out.loc[idx - 1, "changepoint"] = 1

    st.write("Changepoint Index:", df_out[df_out["changepoint"] == 1])

    print("Input sequence:", df["value"].values)
    print("Changepoint detected:", result)

    return df_out
src/usage_utils.py ADDED
@@ -0,0 +1,433 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+
4
+
5
def analyze_and_fill_usage(
    usage_data: pd.DataFrame,
    gap_threshold: int = 62,
    fill_earliest_cutoff: str = "2013-01-01",
    min_fill_gap_months: int = 9,
    rolling_window_size: int = 4,
    rolling_centered: bool = True,
    sequence_fill_method: str = "mean",
    post_missing_threshold: float = 0.1,
) -> pd.DataFrame:
    """Analyse missing-month patterns per (building, commodity) pair.

    Classifies missing months as "Random" (isolated) or "Sequence"
    (consecutive runs), computes ``FillStartDate`` (where gap-filling
    should begin) and the ``NotGonnaUse`` drop flag. Returns a summary
    table of statistics only — no time-series filling happens here (see
    ``fill_usage_with_sequence_check_strict_mean``).

    Parameters
    ----------
    usage_data : raw usage table with StartDate/EndDate, a building-name
        column and CommodityCode.
    gap_threshold : maximum day-gap between missing months that still
        counts as the same consecutive run (~2 months).
    fill_earliest_cutoff : earliest date filling may start from.
    min_fill_gap_months : a post-cutoff consecutive gap at least this long
        pushes FillStartDate past the gap.
    rolling_window_size, rolling_centered, sequence_fill_method :
        currently unused here; kept for interface compatibility.
    post_missing_threshold : missing-ratio above which a series is flagged
        ``NotGonnaUse``.

    Returns
    -------
    summary_df with one row per (BuildingName, CommodityCode).
    """
    df = usage_data.copy()
    df["StartDate"] = pd.to_datetime(df["StartDate"])
    # NOTE(review): EndDate is pulled back 2 days, presumably so billing
    # periods that spill into the next month land in the intended month —
    # confirm against the billing-data convention.
    df["EndDate"] = pd.to_datetime(df["EndDate"]) - pd.Timedelta(days=2)

    # Accept both building-column spellings; normalise to "BuildingName".
    building_col = None
    if "BuildingName" in df.columns:
        building_col = "BuildingName"
    elif "Building Name" in df.columns:
        building_col = "Building Name"
        # Create the standardized column for all downstream use.
        df["BuildingName"] = df["Building Name"]
    else:
        raise ValueError("Neither 'BuildingName' nor 'Building Name' column found in usage data")

    records = []
    # Always group on the normalised BuildingName for consistency.
    for (bld, util), grp in df.groupby(["BuildingName", "CommodityCode"]):
        # Full month index spanning the observed range.
        start = grp["StartDate"].min().replace(day=1)
        end = grp["StartDate"].max().replace(day=1)
        full_idx = pd.date_range(start, end, freq="MS")

        # 1 = month has at least one reading, 0 = missing.
        flag = pd.Series(0, index=full_idx)
        flag.loc[grp["StartDate"].dt.to_period("M").dt.to_timestamp()] = 1
        missing = flag[flag == 0].index

        # Split missing months into Random (isolated) vs Sequence (runs).
        seq_ranges, rand_dates = [], []
        rand_months = seq_months = 0
        if missing.empty:
            mtype = "No Missing"
        else:
            # Consecutive missing months are <= gap_threshold days apart;
            # fillna(9999) seeds the first month into its own group.
            gaps = missing.to_series().diff().dt.days.fillna(9999)
            gid = (gaps > gap_threshold).cumsum()
            for _, seg in missing.to_series().groupby(gid):
                if len(seg) > 1:
                    seq_ranges.append(
                        f"From {seg.min().strftime('%Y-%m')} to {seg.max().strftime('%Y-%m')}"
                    )
                    seq_months += len(seg)
                else:
                    rand_dates.append(seg.iloc[0].strftime("%Y-%m"))
                    rand_months += 1
            has_rand, has_seq = bool(rand_dates), bool(seq_ranges)
            if has_rand and has_seq:
                mtype = "Both"
            elif has_rand:
                mtype = "Random"
            else:
                mtype = "Sequence"

        records.append(
            {
                "BuildingName": bld,
                "CommodityCode": util,
                "MissingType": mtype,
                "SequenceMissingRanges": "; ".join(seq_ranges),
                "RandomMissingDates": "; ".join(rand_dates),
                "TotalMonths": len(full_idx),
                "RandomMissingMonths": rand_months,
                "SequenceMissingMonths": seq_months,
                "RandomMissingRatio": rand_months / len(full_idx)
                if full_idx.size
                else 0,
                "SequenceMissingRatio": seq_months / len(full_idx)
                if full_idx.size
                else 0,
            }
        )

    summary_df = pd.DataFrame(records)

    # -------------------------------------------------------------
    # Compute FillStartDate
    # -------------------------------------------------------------
    cutoff_dt = pd.to_datetime(fill_earliest_cutoff)

    def get_fill_start(r):
        """Compute FillStartDate for one summary row.

        Strategy (as clarified by the stakeholder):
        1. Regardless of pre-cutoff gaps, look for consecutive gaps of at
           least ``min_fill_gap_months`` months starting on/after the
           cutoff (default 2013-01-01).
        2. If one exists, fill from the month after the earliest such gap.
        3. Otherwise fill from the cutoff date.

        Never raises: any failure falls back to the cutoff so that
        ``DataFrame.apply`` cannot produce NaT.
        """
        try:
            # Parse every recorded sequence-missing range.
            seq_ranges = []
            seq_missing_ranges = r.get("SequenceMissingRanges", "")

            if not seq_missing_ranges or pd.isna(seq_missing_ranges):
                # No sequence gaps recorded: start at the cutoff.
                return cutoff_dt

            for rng in str(seq_missing_ranges).split("; "):
                if "to" not in rng:
                    continue
                try:
                    s, e = rng.replace("From ", "").split(" to ")
                    sd, ed = pd.to_datetime(s), pd.to_datetime(e)
                    # Inclusive gap length in months.
                    gap = (ed.to_period("M") - sd.to_period("M")).n + 1
                    seq_ranges.append((sd, ed, gap))
                except Exception:
                    # Skip unparseable ranges and keep processing the rest.
                    continue

            if not seq_ranges:
                # Nothing parseable: start at the cutoff.
                return cutoff_dt

            # Only gaps that start on/after the cutoff AND are long enough.
            post_2013_missing = [
                (sd, ed, gap) for sd, ed, gap in seq_ranges
                if sd >= cutoff_dt and gap >= min_fill_gap_months
            ]

            # A qualifying post-cutoff gap exists: fill from the month
            # right after the earliest one.
            if post_2013_missing:
                post_2013_missing.sort(key=lambda x: x[0])
                sd, ed, gap = post_2013_missing[0]
                return ed + pd.offsets.MonthBegin(1)

            # No qualifying post-cutoff gap: start at the cutoff.
            return cutoff_dt

        except Exception as e:
            # Fallback keeps apply() total (no NaT); log for debugging.
            import traceback
            print(f"get_fill_start exception for {r.get('BuildingName', 'Unknown')}: {e}")
            print(f"Traceback: {traceback.format_exc()}")
            return cutoff_dt

    summary_df["FillStartDate"] = summary_df.apply(get_fill_start, axis=1)

    # -------------------------------------------------------------
    # Missing ratios within the post-FillStartDate window
    # -------------------------------------------------------------
    post_recs = []
    for _, r in summary_df.iterrows():
        bld, util, fsd = r["BuildingName"], r["CommodityCode"], r["FillStartDate"]
        if pd.isna(fsd):
            continue

        grp2 = df[
            (df["BuildingName"] == bld)
            & (df["CommodityCode"] == util)
            & (df["StartDate"] >= fsd)
        ]
        if grp2.empty:
            continue

        idx2 = pd.date_range(
            fsd.replace(day=1), grp2["StartDate"].max().replace(day=1), freq="MS"
        )
        flag2 = pd.Series(0, index=idx2)
        flag2.loc[grp2["StartDate"].dt.to_period("M").dt.to_timestamp()] = 1
        miss2 = flag2[flag2 == 0].index

        # Same random/sequence split as above, restricted to the window.
        gaps2 = pd.Series(miss2).diff().dt.days.fillna(9999)
        gid2 = (gaps2 > gap_threshold).cumsum()
        rm2 = sm2 = 0
        for _, seg2 in pd.Series(miss2).groupby(gid2):
            if len(seg2) > 1:
                sm2 += len(seg2)
            else:
                rm2 += 1

        post_recs.append(
            {
                "BuildingName": bld,
                "CommodityCode": util,
                "PostTotalMonths": len(idx2),
                "PostRandomMissingMonths": rm2,
                "PostSequenceMissingMonths": sm2,
                "PostRandomMissingRatio": rm2 / len(idx2) if idx2.size else 0,
                "PostSequenceMissingRatio": sm2 / len(idx2) if idx2.size else 0,
            }
        )

    post_df = pd.DataFrame(post_recs)

    # An empty post_df still needs the merge columns to exist.
    if post_df.empty:
        post_df = pd.DataFrame(columns=[
            "BuildingName", "CommodityCode", "PostTotalMonths",
            "PostRandomMissingMonths", "PostSequenceMissingMonths",
            "PostRandomMissingRatio", "PostSequenceMissingRatio"
        ])

    summary_df = summary_df.merge(post_df, on=["BuildingName", "CommodityCode"], how="left")

    # Guarantee the post-analysis columns exist with sane defaults.
    post_columns = ["PostTotalMonths", "PostRandomMissingMonths", "PostSequenceMissingMonths",
                    "PostRandomMissingRatio", "PostSequenceMissingRatio"]
    for col in post_columns:
        if col not in summary_df.columns:
            if "Ratio" in col:
                summary_df[col] = 0.0  # ratio columns default to 0.0
            else:
                summary_df[col] = 0  # month-count columns default to 0

    # Flag series whose post-fill missing ratio is still too high to use.
    summary_df["NotGonnaUse"] = (
        (summary_df["PostRandomMissingRatio"] > post_missing_threshold)
        | (summary_df["PostSequenceMissingRatio"] > post_missing_threshold)
    ).astype(int)

    return summary_df
231
+
232
+
233
def fill_usage_with_sequence_check_strict_mean(
    usage_data: pd.DataFrame,
    summary_df: pd.DataFrame,
    method: str = "mean",
    force: bool = False,
    fill_earliest_cutoff: str = "1900-01-01",
) -> pd.DataFrame:
    """Fill monthly usage per summary_df's FillStartDate / NotGonnaUse.

    Parameters
    ----------
    usage_data : raw usage table.
    summary_df : output of ``analyze_and_fill_usage``.
    method : 'mean' or 'median' — statistic used to fill missing months.
    force : when True, ignore NotGonnaUse and NaT start dates and adjust
        the start point automatically so a series is always produced.
    fill_earliest_cutoff : fallback start when FillStartDate is NaT
        (only used when force=True).

    Returns
    -------
    filled_df : ['BuildingName','CommodityCode','Date','FilledUse']
        (an empty frame with those columns when nothing qualifies).
    """
    df = usage_data.copy()
    # Snap every reading to the first day of its month.
    df["StartDate"] = pd.to_datetime(df["StartDate"]).dt.to_period("M").dt.to_timestamp()

    # Accept both building-column spellings; normalise to "BuildingName".
    building_col = None
    if "BuildingName" in df.columns:
        building_col = "BuildingName"
    elif "Building Name" in df.columns:
        building_col = "Building Name"
        # Create the standardized column for all downstream use.
        df["BuildingName"] = df["Building Name"]
    else:
        raise ValueError("Neither 'BuildingName' nor 'Building Name' column found in usage data")

    all_records = []
    for _, row in summary_df.iterrows():
        bld, util, fsd, drop = (
            row["BuildingName"],
            row["CommodityCode"],
            row["FillStartDate"],
            row["NotGonnaUse"],
        )

        # ───────── Gates 1 & 2 ─────────
        if not force and (drop == 1 or pd.isna(fsd)):
            # Strict mode: missing ratio too high, or no valid start → skip.
            continue
        if force and pd.isna(fsd):
            fsd = pd.to_datetime(fill_earliest_cutoff)

        # Raw data on/after the fill start (using normalised BuildingName).
        grp = df[
            (df["BuildingName"] == bld)
            & (df["CommodityCode"] == util)
            & (df["StartDate"] >= fsd)
        ]

        # ───────── Gate 3 ─────────
        if grp.empty:
            if not force:
                continue
            # Force mode: fall back to the pair's earliest observed month.
            grp = df[(df["BuildingName"] == bld) & (df["CommodityCode"] == util)]
            if grp.empty:
                # The pair genuinely has no data at all.
                continue
            fsd = grp["StartDate"].min()

        # Build the month grid from the start to the last observation and
        # fill the gaps with the series mean (or median).
        last_m = grp["StartDate"].max()
        all_months = pd.date_range(fsd, last_m, freq="MS")
        monthly = grp.groupby("StartDate")["Use"].sum().reindex(all_months)

        base = monthly.dropna()
        fill_val = base.median() if method == "median" else base.mean()
        filled = monthly.fillna(fill_val).reset_index()

        filled.columns = ["Date", "FilledUse"]
        filled["BuildingName"] = bld
        filled["CommodityCode"] = util
        all_records.append(filled)

    if not all_records:
        return pd.DataFrame(columns=["BuildingName", "CommodityCode", "Date", "FilledUse"])

    return pd.concat(all_records, ignore_index=True)
320
+
321
+
322
+ # ===============================================================
323
+ # LLM-based Weather Variable Selection Functions
324
+ # ===============================================================
325
+
326
# Building Type → Weather Variable Rule Mapping
# Rule-of-thumb weather drivers per operation type, used as the baseline
# "suggested variables" fed into the LLM prompt builders below. Types with
# an empty list (parking, infrastructure, container) get no weather
# variables suggested.
weather_influence_map = {
    "Office": ["temp_mean", "temp_std", "CDD_sum", "clouds_all_mean"],
    "Instructional": ["temp_mean", "temp_std", "CDD_sum", "humidity_mean"],
    "Residential": ["temp_mean", "HDD_sum", "CDD_sum", "humidity_mean"],
    "Health": ["temp_mean", "HDD_sum", "CDD_sum", "humidity_mean"],
    "Library": ["temp_mean", "temp_std", "humidity_mean", "clouds_all_mean"],
    "Dining": ["temp_mean", "rain_sum", "CDD_sum"],
    "Recreation": ["temp_mean", "rain_sum", "wind_speed_mean"],
    "Assembly or Theater": ["temp_mean", "wind_speed_mean", "rain_sum"],
    "Affiliate": ["temp_mean", "CDD_sum", "rain_sum"],
    "Parking Structure": [],
    "Infrastructure": [],
    "Container": [],
    "Mixed": ["temp_mean", "CDD_sum", "humidity_mean"],
    "Other": ["temp_mean", "CDD_sum"],
}
343
+
344
def infer_building_type(text: str) -> str:
    """Infer a building's operation type from a free-text description.

    Keyword groups are checked in declaration order; the first group with a
    case-insensitive substring hit wins. Falls back to "Mixed" when no
    keyword matches.
    """
    keyword_map = {
        "Instructional": ["Teaching", "Classroom", "School", "Lecture Hall", "Academic", "Education"],
        "Residential": ["Residential", "Apartment", "Dormitory", "Housing", "Student"],
        "Office": ["Office", "Office Building", "Administrative", "Admin"],
        "Health": ["Hospital", "Medical", "Clinic", "Health"],
        "Dining": ["Canteen", "Restaurant", "Dining", "Food", "Kitchen"],
        "Recreation": ["Fitness", "Sports", "Entertainment", "Recreation", "Gym"],
        "Library": ["Library"],
        "Assembly or Theater": ["Theater", "Auditorium", "Performance", "Assembly"],
        "Affiliate": ["Affiliate"],
    }

    haystack = text.lower()
    for building_type, keywords in keyword_map.items():
        for keyword in keywords:
            if keyword.lower() in haystack:
                return building_type
    return "Mixed"
363
+
364
def construct_weather_prompt_static(
    user_description: str, detected_type: str, suggested_vars: list,
    gross_area: float, avg_space_sqft: float, workpoint_count: int, floor_count: int
) -> str:
    """Build the static weather-variable-selection prompt for the LLM.

    Combines the user's building description, the rule-inferred operation
    type, structural stats, and the rule-based suggested variables into a
    single prompt asking the LLM for 3–5 weather variables plus a markdown
    justification table. Scenario z-score offsets are applied downstream,
    not by the LLM.
    """
    scenario_note = """
Weather-Scenario (z-score offset, user will pick one):
• Normal → 0 σ offset (historical monthly mean)
• Hot → +1 σ on temp_mean & CDD_sum, −0.5 σ on humidity_mean
• ColdWet → −1 σ on temp_mean, +1 σ on HDD_sum & humidity_mean
• WindyCloudy → +1 σ on wind_speed_mean & clouds_all_mean
LLM only needs to recommend variables; offsets are applied downstream.
"""

    return f"""
You are an expert in building energy modeling and changepoint detection.

{scenario_note}

Building Description (current use only):
{user_description}

Inferred Operation Type: {detected_type}

Structural Information:
- Building Gross Area: {gross_area:,.0f} sq ft
- Average Space Size: {avg_space_sqft:,.0f} sq ft
- Total Workpoint Count: {workpoint_count}
- Floor Count: {floor_count}

Rule-based Suggested Variables: {', '.join(suggested_vars)}

Candidate Weather Variables:
temp_mean · temp_std · HDD_sum · CDD_sum · rain_sum · clouds_all_mean · humidity_mean · wind_speed_mean

Tasks:
1. Select 3–5 variables that best capture energy-use behaviour under the current configuration.
2. Briefly justify each choice.
3. Return a markdown table with columns: Selected Variable | Reason.
"""
404
+
405
def chat_with_ollama(messages: list, model: str = "mistral") -> str:
    """Send a chat request to a local Ollama server and return the reply text.

    Parameters
    ----------
    messages : chat history in the Ollama/OpenAI message-dict format.
    model : Ollama model name (default "mistral").

    Returns
    -------
    The assistant reply content string.

    Raises
    ------
    Exception : on any network/HTTP failure, or when the response JSON
        lacks the expected ``message.content`` structure. The original
        cause is chained (``from e``) for debuggability.
    """
    # Local import keeps the module importable when requests is absent.
    import requests

    url = "http://localhost:11434/api/chat"
    try:
        response = requests.post(
            url,
            json={"model": model, "messages": messages, "stream": False},
            timeout=30
        )
        response.raise_for_status()
        return response.json()["message"]["content"]
    except requests.exceptions.RequestException as e:
        # FIX: chain the original exception so the root cause is preserved.
        raise Exception(f"Ollama API error: {str(e)}") from e
    except KeyError as e:
        raise Exception("Invalid response format from Ollama API") from e
422
+
423
def parse_selected_vars(md: str) -> list:
    """Extract the first-column values ("Selected Variable") from a
    markdown table returned by the LLM.

    Parameters
    ----------
    md : markdown text containing a table whose rows start with ``|``.

    Returns
    -------
    List of first-column cell values, skipping the header row and any
    separator row.

    FIX: the original only filtered the exact separator ``---``; rows like
    ``:---``, ``----`` or ``:---:`` (standard markdown alignment syntax)
    leaked into the result. Any cell made only of '-', ':' and spaces is
    now treated as a separator.
    """
    vars_ = []
    for row in md.strip().splitlines():
        # Only table rows; drop the canonical header line up front.
        if not row.startswith("|") or row.startswith("| Selected"):
            continue
        parts = row.split("|")
        if len(parts) <= 1:
            continue
        first = parts[1].strip()
        # Skip blanks and a header cell that wasn't caught above.
        if not first or first == "Selected Variable":
            continue
        # Skip alignment/separator rows such as ---, :---, :---:.
        if set(first) <= set("-: "):
            continue
        vars_.append(first)
    return vars_