Spaces:
No application file
No application file
File size: 17,082 Bytes
6782585 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 |
import pandas as pd
import numpy as np
def analyze_and_fill_usage(
usage_data: pd.DataFrame,
gap_threshold: int = 62,
fill_earliest_cutoff: str = "2013-01-01",
min_fill_gap_months: int = 9,
rolling_window_size: int = 4,
rolling_centered: bool = True,
sequence_fill_method: str = "mean",
post_missing_threshold: float = 0.1,
) -> pd.DataFrame:
"""
对 usage_data 进行缺失类型分析,计算 FillStartDate 和 NotGonnaUse,
返回 summary_df,只包含统计指标,不做时间序列填补。
"""
df = usage_data.copy()
df["StartDate"] = pd.to_datetime(df["StartDate"])
df["EndDate"] = pd.to_datetime(df["EndDate"]) - pd.Timedelta(days=2)
# 🔧 修复:自动检测建筑列名,兼容两种格式
building_col = None
if "BuildingName" in df.columns:
building_col = "BuildingName"
elif "Building Name" in df.columns:
building_col = "Building Name"
# 创建标准化列名供后续使用
df["BuildingName"] = df["Building Name"]
else:
raise ValueError("Neither 'BuildingName' nor 'Building Name' column found in usage data")
records = []
# 🔧 修复:始终使用BuildingName进行groupby,确保一致性
for (bld, util), grp in df.groupby(["BuildingName", "CommodityCode"]):
# 完整月份索引
start = grp["StartDate"].min().replace(day=1)
end = grp["StartDate"].max().replace(day=1)
full_idx = pd.date_range(start, end, freq="MS")
flag = pd.Series(0, index=full_idx)
flag.loc[grp["StartDate"].dt.to_period("M").dt.to_timestamp()] = 1
missing = flag[flag == 0].index
# 统计 Random / Sequence 缺失
seq_ranges, rand_dates = [], []
rand_months = seq_months = 0
if missing.empty:
mtype = "No Missing"
else:
gaps = missing.to_series().diff().dt.days.fillna(9999)
gid = (gaps > gap_threshold).cumsum()
for _, seg in missing.to_series().groupby(gid):
if len(seg) > 1:
seq_ranges.append(
f"From {seg.min().strftime('%Y-%m')} to {seg.max().strftime('%Y-%m')}"
)
seq_months += len(seg)
else:
rand_dates.append(seg.iloc[0].strftime("%Y-%m"))
rand_months += 1
has_rand, has_seq = bool(rand_dates), bool(seq_ranges)
if has_rand and has_seq:
mtype = "Both"
elif has_rand:
mtype = "Random"
else:
mtype = "Sequence"
records.append(
{
"BuildingName": bld,
"CommodityCode": util,
"MissingType": mtype,
"SequenceMissingRanges": "; ".join(seq_ranges),
"RandomMissingDates": "; ".join(rand_dates),
"TotalMonths": len(full_idx),
"RandomMissingMonths": rand_months,
"SequenceMissingMonths": seq_months,
"RandomMissingRatio": rand_months / len(full_idx)
if full_idx.size
else 0,
"SequenceMissingRatio": seq_months / len(full_idx)
if full_idx.size
else 0,
}
)
summary_df = pd.DataFrame(records)
# -------------------------------------------------------------
# 计算 FillStartDate
# -------------------------------------------------------------
cutoff_dt = pd.to_datetime(fill_earliest_cutoff)
def get_fill_start(r):
"""
根据用户澄清的正确策略计算FillStartDate:
1. 不管2013年之前是否有序列缺失,都检查2013年之后是否存在≥9个月的序列缺失
2. 如果2013年之后存在≥9个月缺失 → 返回缺失结束时间+1个月
3. 如果2013年之后没有≥9个月缺失 → 返回2013-01-01
"""
try:
# 解析所有序列缺失范围
seq_ranges = []
seq_missing_ranges = r.get("SequenceMissingRanges", "")
if not seq_missing_ranges or pd.isna(seq_missing_ranges):
# 没有序列缺失数据,从2013年开始
return cutoff_dt
for rng in str(seq_missing_ranges).split("; "):
if "to" not in rng:
continue
try:
s, e = rng.replace("From ", "").split(" to ")
sd, ed = pd.to_datetime(s), pd.to_datetime(e)
gap = (ed.to_period("M") - sd.to_period("M")).n + 1
seq_ranges.append((sd, ed, gap))
except Exception:
# 跳过无法解析的日期范围,继续处理其他范围
continue
if not seq_ranges:
# 没有有效的序列缺失,从2013年开始
return cutoff_dt
# 🔍 关键:只关注2013年之后的序列缺失(开始时间>=2013-01-01)
# 🔧 修复:确保正确访问min_fill_gap_months变量
post_2013_missing = [
(sd, ed, gap) for sd, ed, gap in seq_ranges
if sd >= cutoff_dt and gap >= min_fill_gap_months
]
# 🔍 如果2013年之后存在≥9个月的序列缺失
if post_2013_missing:
# 按开始时间排序,取第一个符合条件的缺失
post_2013_missing.sort(key=lambda x: x[0])
sd, ed, gap = post_2013_missing[0]
return ed + pd.offsets.MonthBegin(1)
# 🔍 如果2013年之后没有≥9个月的序列缺失,从2013年开始
return cutoff_dt
except Exception as e:
# 🔧 关键修复:如果任何解析步骤失败,总是返回默认的cutoff_dt
# 这确保get_fill_start永远不会抛出异常,从而避免pandas.apply返回NaT
# 🔧 新增:记录异常信息用于调试
import traceback
print(f"get_fill_start exception for {r.get('BuildingName', 'Unknown')}: {e}")
print(f"Traceback: {traceback.format_exc()}")
return cutoff_dt
summary_df["FillStartDate"] = summary_df.apply(get_fill_start, axis=1)
# -------------------------------------------------------------
# 计算填充后缺失比率
# -------------------------------------------------------------
post_recs = []
for _, r in summary_df.iterrows():
bld, util, fsd = r["BuildingName"], r["CommodityCode"], r["FillStartDate"]
if pd.isna(fsd):
continue
grp2 = df[
(df["BuildingName"] == bld)
& (df["CommodityCode"] == util)
& (df["StartDate"] >= fsd)
]
if grp2.empty:
continue
idx2 = pd.date_range(
fsd.replace(day=1), grp2["StartDate"].max().replace(day=1), freq="MS"
)
flag2 = pd.Series(0, index=idx2)
flag2.loc[grp2["StartDate"].dt.to_period("M").dt.to_timestamp()] = 1
miss2 = flag2[flag2 == 0].index
gaps2 = pd.Series(miss2).diff().dt.days.fillna(9999)
gid2 = (gaps2 > gap_threshold).cumsum()
rm2 = sm2 = 0
for _, seg2 in pd.Series(miss2).groupby(gid2):
if len(seg2) > 1:
sm2 += len(seg2)
else:
rm2 += 1
post_recs.append(
{
"BuildingName": bld,
"CommodityCode": util,
"PostTotalMonths": len(idx2),
"PostRandomMissingMonths": rm2,
"PostSequenceMissingMonths": sm2,
"PostRandomMissingRatio": rm2 / len(idx2) if idx2.size else 0,
"PostSequenceMissingRatio": sm2 / len(idx2) if idx2.size else 0,
}
)
post_df = pd.DataFrame(post_recs)
# 🔧 修复:如果post_df为空,需要创建包含所有必要列的空DataFrame
if post_df.empty:
# 创建一个与summary_df结构匹配的空DataFrame
post_df = pd.DataFrame(columns=[
"BuildingName", "CommodityCode", "PostTotalMonths",
"PostRandomMissingMonths", "PostSequenceMissingMonths",
"PostRandomMissingRatio", "PostSequenceMissingRatio"
])
summary_df = summary_df.merge(post_df, on=["BuildingName", "CommodityCode"], how="left")
# 🔧 修复:填充缺失的post分析列为默认值
post_columns = ["PostTotalMonths", "PostRandomMissingMonths", "PostSequenceMissingMonths",
"PostRandomMissingRatio", "PostSequenceMissingRatio"]
for col in post_columns:
if col not in summary_df.columns:
if "Ratio" in col:
summary_df[col] = 0.0 # 比率列默认为0
else:
summary_df[col] = 0 # 月数列默认为0
summary_df["NotGonnaUse"] = (
(summary_df["PostRandomMissingRatio"] > post_missing_threshold)
| (summary_df["PostSequenceMissingRatio"] > post_missing_threshold)
).astype(int)
return summary_df
def fill_usage_with_sequence_check_strict_mean(
usage_data: pd.DataFrame,
summary_df: pd.DataFrame,
method: str = "mean",
force: bool = False,
fill_earliest_cutoff: str = "1900-01-01",
) -> pd.DataFrame:
"""
根据 summary_df 的 FillStartDate / NotGonnaUse 对 usage_data 进行填补。
参数
----------
usage_data : 原始用量表
summary_df : analyze_and_fill_usage 的结果
method : 'mean' 或 'median'
force : True 时忽略 NotGonnaUse 和 NaT,自动调整起点,保证能输出序列
fill_earliest_cutoff : 当 FillStartDate 为 NaT 时的回退起点(仅在 force=True 时使用)
返回
----------
filled_df : ['BuildingName','CommodityCode','Date','FilledUse']
"""
df = usage_data.copy()
df["StartDate"] = pd.to_datetime(df["StartDate"]).dt.to_period("M").dt.to_timestamp()
# 🔧 修复:自动检测建筑列名,兼容两种格式
building_col = None
if "BuildingName" in df.columns:
building_col = "BuildingName"
elif "Building Name" in df.columns:
building_col = "Building Name"
# 创建标准化列名供后续使用
df["BuildingName"] = df["Building Name"]
else:
raise ValueError("Neither 'BuildingName' nor 'Building Name' column found in usage data")
all_records = []
for _, row in summary_df.iterrows():
bld, util, fsd, drop = (
row["BuildingName"],
row["CommodityCode"],
row["FillStartDate"],
row["NotGonnaUse"],
)
# ───────── 闸门 1 & 2 ─────────
if not force and (drop == 1 or pd.isna(fsd)):
# 严格模式:缺失率过高 或 没有有效起点 → 直接跳过
continue
if force and pd.isna(fsd):
fsd = pd.to_datetime(fill_earliest_cutoff)
# 取 >= fsd 的原始数据(使用标准化的BuildingName列)
grp = df[
(df["BuildingName"] == bld)
& (df["CommodityCode"] == util)
& (df["StartDate"] >= fsd)
]
# ───────── 闸门 3 ─────────
if grp.empty:
if not force:
continue
# 强制模式:回退到该组合最早月份
grp = df[(df["BuildingName"] == bld) & (df["CommodityCode"] == util)]
if grp.empty:
# 数据确实不存在
continue
fsd = grp["StartDate"].min()
last_m = grp["StartDate"].max()
all_months = pd.date_range(fsd, last_m, freq="MS")
monthly = grp.groupby("StartDate")["Use"].sum().reindex(all_months)
base = monthly.dropna()
fill_val = base.median() if method == "median" else base.mean()
filled = monthly.fillna(fill_val).reset_index()
filled.columns = ["Date", "FilledUse"]
filled["BuildingName"] = bld
filled["CommodityCode"] = util
all_records.append(filled)
if not all_records:
return pd.DataFrame(columns=["BuildingName", "CommodityCode", "Date", "FilledUse"])
return pd.concat(all_records, ignore_index=True)
# ===============================================================
# LLM-based Weather Variable Selection Functions
# ===============================================================
# Building Type → Weather Variable Rule Mapping
weather_influence_map = {
"Office": ["temp_mean", "temp_std", "CDD_sum", "clouds_all_mean"],
"Instructional": ["temp_mean", "temp_std", "CDD_sum", "humidity_mean"],
"Residential": ["temp_mean", "HDD_sum", "CDD_sum", "humidity_mean"],
"Health": ["temp_mean", "HDD_sum", "CDD_sum", "humidity_mean"],
"Library": ["temp_mean", "temp_std", "humidity_mean", "clouds_all_mean"],
"Dining": ["temp_mean", "rain_sum", "CDD_sum"],
"Recreation": ["temp_mean", "rain_sum", "wind_speed_mean"],
"Assembly or Theater": ["temp_mean", "wind_speed_mean", "rain_sum"],
"Affiliate": ["temp_mean", "CDD_sum", "rain_sum"],
"Parking Structure": [],
"Infrastructure": [],
"Container": [],
"Mixed": ["temp_mean", "CDD_sum", "humidity_mean"],
"Other": ["temp_mean", "CDD_sum"],
}
def infer_building_type(text: str) -> str:
"""推断建筑类型基于文本描述"""
patterns = {
"Instructional": ["Teaching", "Classroom", "School", "Lecture Hall", "Academic", "Education"],
"Residential": ["Residential", "Apartment", "Dormitory", "Housing", "Student"],
"Office": ["Office", "Office Building", "Administrative", "Admin"],
"Health": ["Hospital", "Medical", "Clinic", "Health"],
"Dining": ["Canteen", "Restaurant", "Dining", "Food", "Kitchen"],
"Recreation": ["Fitness", "Sports", "Entertainment", "Recreation", "Gym"],
"Library": ["Library"],
"Assembly or Theater": ["Theater", "Auditorium", "Performance", "Assembly"],
"Affiliate": ["Affiliate"],
}
text_lower = text.lower()
for btype, keywords in patterns.items():
if any(k.lower() in text_lower for k in keywords):
return btype
return "Mixed"
def construct_weather_prompt_static(
user_description: str, detected_type: str, suggested_vars: list,
gross_area: float, avg_space_sqft: float, workpoint_count: int, floor_count: int
) -> str:
"""构建静态weather prompt"""
scenario_note = """
Weather-Scenario (z-score offset, user will pick one):
• Normal → 0 σ offset (historical monthly mean)
• Hot → +1 σ on temp_mean & CDD_sum, −0.5 σ on humidity_mean
• ColdWet → −1 σ on temp_mean, +1 σ on HDD_sum & humidity_mean
• WindyCloudy → +1 σ on wind_speed_mean & clouds_all_mean
LLM only needs to recommend variables; offsets are applied downstream.
"""
return f"""
You are an expert in building energy modeling and changepoint detection.
{scenario_note}
Building Description (current use only):
{user_description}
Inferred Operation Type: {detected_type}
Structural Information:
- Building Gross Area: {gross_area:,.0f} sq ft
- Average Space Size: {avg_space_sqft:,.0f} sq ft
- Total Workpoint Count: {workpoint_count}
- Floor Count: {floor_count}
Rule-based Suggested Variables: {', '.join(suggested_vars)}
Candidate Weather Variables:
temp_mean · temp_std · HDD_sum · CDD_sum · rain_sum · clouds_all_mean · humidity_mean · wind_speed_mean
Tasks:
1. Select 3–5 variables that best capture energy-use behaviour under the current configuration.
2. Briefly justify each choice.
3. Return a markdown table with columns: Selected Variable | Reason.
"""
def chat_with_ollama(messages: list, model: str = "mistral") -> str:
"""与Ollama API聊天"""
import requests
url = "http://localhost:11434/api/chat"
try:
response = requests.post(
url,
json={"model": model, "messages": messages, "stream": False},
timeout=30
)
response.raise_for_status()
return response.json()["message"]["content"]
except requests.exceptions.RequestException as e:
raise Exception(f"Ollama API error: {str(e)}")
except KeyError:
raise Exception("Invalid response format from Ollama API")
def parse_selected_vars(md: str) -> list:
"""解析markdown表格中的变量列表"""
vars_ = []
for row in md.strip().splitlines():
if row.startswith("|") and not row.startswith("| Selected"):
parts = row.split("|")
if len(parts) > 1:
first = parts[1].strip()
if first and first != '---' and first not in ["Selected Variable", ""]:
vars_.append(first)
return vars_ |