Spaces:
Sleeping
Sleeping
Upload bn_core.py
Browse files- bn_core.py +155 -280
bn_core.py
CHANGED
|
@@ -13,17 +13,17 @@ from sklearn.metrics import (
|
|
| 13 |
recall_score, f1_score, roc_curve, roc_auc_score
|
| 14 |
)
|
| 15 |
from pgmpy.metrics import log_likelihood_score, structure_score
|
| 16 |
-
from networkx import is_directed_acyclic_graph, DiGraph
|
| 17 |
import threading
|
| 18 |
from datetime import datetime
|
|
|
|
| 19 |
|
| 20 |
class BayesianNetworkAnalyzer:
|
| 21 |
"""
|
| 22 |
-
貝葉斯網路分析器
|
| 23 |
-
支
|
| 24 |
"""
|
| 25 |
|
| 26 |
-
# 類
|
| 27 |
_lock = threading.Lock()
|
| 28 |
|
| 29 |
# 儲存各 session 的分析結果
|
|
@@ -48,7 +48,7 @@ class BayesianNetworkAnalyzer:
|
|
| 48 |
equivalent_sample_size=3, score_method='BIC',
|
| 49 |
sig_level=0.05, n_bins=10):
|
| 50 |
"""
|
| 51 |
-
執行完整的貝葉斯網路分析 - 對齊 Django
|
| 52 |
|
| 53 |
Args:
|
| 54 |
df: 原始資料框
|
|
@@ -69,38 +69,37 @@ class BayesianNetworkAnalyzer:
|
|
| 69 |
|
| 70 |
with self._lock:
|
| 71 |
try:
|
| 72 |
-
# 1. 選擇
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
# 重置索引 (對齊 Line 599)
|
| 78 |
-
df.index = df.index + 1
|
| 79 |
|
| 80 |
-
# 2.
|
| 81 |
self.train_data, self.test_data = train_test_split(
|
| 82 |
-
|
| 83 |
test_size=test_fraction,
|
| 84 |
-
random_state=
|
| 85 |
-
stratify=
|
| 86 |
)
|
| 87 |
|
| 88 |
-
# 3. 學習網路結構 (
|
| 89 |
self.model = self._learn_structure(
|
| 90 |
algorithm, score_method, sig_level, target_variable
|
| 91 |
)
|
| 92 |
|
| 93 |
-
# 4.
|
| 94 |
-
|
| 95 |
-
self._preprocess_data_inplace(cat_features, con_features, n_bins)
|
| 96 |
|
| 97 |
-
# 5.
|
|
|
|
|
|
|
|
|
|
| 98 |
self._fit_parameters(estimator, equivalent_sample_size)
|
| 99 |
|
| 100 |
-
#
|
| 101 |
self.inference = VariableElimination(self.model)
|
| 102 |
|
| 103 |
-
#
|
| 104 |
train_metrics = self._evaluate_model(
|
| 105 |
self.train_data, target_variable, "train"
|
| 106 |
)
|
|
@@ -108,13 +107,13 @@ class BayesianNetworkAnalyzer:
|
|
| 108 |
self.test_data, target_variable, "test"
|
| 109 |
)
|
| 110 |
|
| 111 |
-
#
|
| 112 |
cpds = self._get_all_cpds()
|
| 113 |
|
| 114 |
-
#
|
| 115 |
scores = self._calculate_scores()
|
| 116 |
|
| 117 |
-
#
|
| 118 |
results = {
|
| 119 |
'model': self.model,
|
| 120 |
'inference': self.inference,
|
|
@@ -146,28 +145,42 @@ class BayesianNetworkAnalyzer:
|
|
| 146 |
except Exception as e:
|
| 147 |
raise Exception(f"Analysis failed: {str(e)}")
|
| 148 |
|
| 149 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
"""
|
| 151 |
-
|
| 152 |
-
|
|
|
|
| 153 |
"""
|
| 154 |
-
|
| 155 |
-
# 處理分類特徵 (對齊 Line 762-764)
|
| 156 |
for col in cat_features:
|
| 157 |
if col in self.train_data.columns:
|
| 158 |
if self.train_data[col].dtype == 'object':
|
| 159 |
self.train_data[col] = self.train_data[col].astype('category').cat.codes
|
|
|
|
| 160 |
if col in self.test_data.columns:
|
| 161 |
if self.test_data[col].dtype == 'object':
|
| 162 |
self.test_data[col] = self.test_data[col].astype('category').cat.codes
|
| 163 |
-
|
| 164 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
self.bins_dict = {}
|
| 166 |
|
| 167 |
-
# 步驟 1: 用 train_data 計算 bins (對齊 Line 769-787)
|
| 168 |
for col in con_features:
|
| 169 |
if col in self.train_data.columns and self.train_data[col].notna().sum() > 0:
|
| 170 |
-
# 計算分箱邊界
|
| 171 |
bin_edges = pd.cut(
|
| 172 |
self.train_data[col],
|
| 173 |
bins=n_bins,
|
|
@@ -175,49 +188,38 @@ class BayesianNetworkAnalyzer:
|
|
| 175 |
duplicates='drop'
|
| 176 |
)[1]
|
| 177 |
|
| 178 |
-
# 儲存 bins 供測試集使用
|
| 179 |
self.bins_dict[col] = bin_edges
|
| 180 |
|
| 181 |
-
# 創建分箱標籤
|
| 182 |
bin_labels = [
|
| 183 |
-
f"{round(bin_edges[i], 2)}
|
| 184 |
for i in range(len(bin_edges) - 1)
|
| 185 |
]
|
| 186 |
|
| 187 |
-
#
|
| 188 |
self.train_data[col] = pd.cut(
|
| 189 |
self.train_data[col],
|
| 190 |
bins=bin_edges,
|
| 191 |
labels=bin_labels,
|
| 192 |
include_lowest=True
|
| 193 |
).astype(object).fillna("Missing")
|
| 194 |
-
|
| 195 |
-
# 步驟 2: 用相同的 bins 處理 test_data (對齊 Line 789-803)
|
| 196 |
-
for col in con_features:
|
| 197 |
-
if col in self.test_data.columns and col in self.bins_dict:
|
| 198 |
-
bin_edges = self.bins_dict[col]
|
| 199 |
|
| 200 |
-
# 使用相同
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
labels=bin_labels,
|
| 211 |
-
include_lowest=True
|
| 212 |
-
).astype(object).fillna("Missing")
|
| 213 |
|
| 214 |
def _learn_structure(self, algorithm, score_method, sig_level, target_variable):
|
| 215 |
-
"""
|
| 216 |
-
學習網路結構 - 對齊 Django Line 607-759
|
| 217 |
-
"""
|
| 218 |
|
| 219 |
if algorithm == 'NB':
|
| 220 |
-
# Naive Bayes
|
| 221 |
edges = [
|
| 222 |
(target_variable, feature)
|
| 223 |
for feature in self.train_data.columns
|
|
@@ -226,8 +228,8 @@ class BayesianNetworkAnalyzer:
|
|
| 226 |
model = BayesianNetwork(edges)
|
| 227 |
|
| 228 |
elif algorithm == 'TAN':
|
| 229 |
-
# Tree-Augmented Naive Bayes
|
| 230 |
-
# 特殊處理: asia
|
| 231 |
if 'asia' in self.train_data.columns and 'either' in self.train_data.columns and target_variable == 'either':
|
| 232 |
tan_search = TreeSearch(self.train_data, root_node='asia')
|
| 233 |
else:
|
|
@@ -240,7 +242,7 @@ class BayesianNetworkAnalyzer:
|
|
| 240 |
model = BayesianNetwork(structure.edges())
|
| 241 |
|
| 242 |
elif algorithm == 'CL':
|
| 243 |
-
# Chow-Liu
|
| 244 |
tan_search = TreeSearch(self.train_data)
|
| 245 |
structure = tan_search.estimate(
|
| 246 |
estimator_type='chow-liu',
|
|
@@ -248,124 +250,54 @@ class BayesianNetworkAnalyzer:
|
|
| 248 |
)
|
| 249 |
model = BayesianNetwork(structure.edges())
|
| 250 |
|
| 251 |
-
elif algorithm == '
|
| 252 |
-
#
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
# 嘗試不同的 max_cond_vars (對齊 Line 669-720)
|
| 256 |
-
model_est = None
|
| 257 |
-
|
| 258 |
-
# max_cond_vars = 5
|
| 259 |
-
try:
|
| 260 |
-
model_est = pc.estimate(
|
| 261 |
-
significance_level=sig_level,
|
| 262 |
-
max_cond_vars=5,
|
| 263 |
-
ci_test='chi_square',
|
| 264 |
-
variant='stable',
|
| 265 |
-
n_jobs=1
|
| 266 |
-
)
|
| 267 |
-
edges = model_est.edges()
|
| 268 |
-
# 驗證: 必須是 DAG 且目標變數在結構中
|
| 269 |
-
if not is_directed_acyclic_graph(DiGraph(edges)) or \
|
| 270 |
-
not any(target_variable in edge for edge in edges):
|
| 271 |
-
model_est = None
|
| 272 |
-
except:
|
| 273 |
-
model_est = None
|
| 274 |
|
| 275 |
-
#
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
n_jobs=1
|
| 284 |
-
)
|
| 285 |
-
edges = model_est.edges()
|
| 286 |
-
if not is_directed_acyclic_graph(DiGraph(edges)) or \
|
| 287 |
-
not any(target_variable in edge for edge in edges):
|
| 288 |
-
model_est = None
|
| 289 |
-
except:
|
| 290 |
-
model_est = None
|
| 291 |
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
significance_level=sig_level,
|
| 297 |
-
max_cond_vars=3,
|
| 298 |
-
ci_test='chi_square',
|
| 299 |
-
variant='stable',
|
| 300 |
-
n_jobs=1
|
| 301 |
-
)
|
| 302 |
-
edges = model_est.edges()
|
| 303 |
-
if not is_directed_acyclic_graph(DiGraph(edges)) or \
|
| 304 |
-
not any(target_variable in edge for edge in edges):
|
| 305 |
-
model_est = None
|
| 306 |
-
except:
|
| 307 |
-
model_est = None
|
| 308 |
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
model_est = pc.estimate(
|
| 313 |
-
significance_level=sig_level,
|
| 314 |
-
max_cond_vars=2,
|
| 315 |
-
ci_test='chi_square',
|
| 316 |
-
variant='stable',
|
| 317 |
-
n_jobs=1
|
| 318 |
-
)
|
| 319 |
-
edges = model_est.edges()
|
| 320 |
-
if not is_directed_acyclic_graph(DiGraph(edges)) or \
|
| 321 |
-
not any(target_variable in edge for edge in edges):
|
| 322 |
-
model_est = None
|
| 323 |
-
except:
|
| 324 |
-
model_est = None
|
| 325 |
|
| 326 |
-
# max_cond_vars
|
| 327 |
-
|
| 328 |
try:
|
| 329 |
-
|
| 330 |
significance_level=sig_level,
|
| 331 |
-
max_cond_vars=
|
| 332 |
ci_test='chi_square',
|
| 333 |
variant='stable',
|
| 334 |
-
n_jobs=1
|
| 335 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 336 |
except:
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
if model_est is None:
|
| 341 |
-
print("⚠️ PC algorithm failed, falling back to Naive Bayes")
|
| 342 |
edges = [
|
| 343 |
(target_variable, feature)
|
| 344 |
for feature in self.train_data.columns
|
| 345 |
if feature != target_variable
|
| 346 |
]
|
| 347 |
model = BayesianNetwork(edges)
|
| 348 |
-
else:
|
| 349 |
-
model = BayesianNetwork(model_est.edges())
|
| 350 |
-
|
| 351 |
-
elif algorithm == 'HC':
|
| 352 |
-
# Hill Climbing (對齊 Line 723-758)
|
| 353 |
-
hc = HillClimbSearch(self.train_data)
|
| 354 |
-
|
| 355 |
-
# 選擇評分方法
|
| 356 |
-
scoring_methods = {
|
| 357 |
-
'AIC': AICScore(self.train_data),
|
| 358 |
-
'BIC': BicScore(self.train_data),
|
| 359 |
-
'K2': K2Score(self.train_data),
|
| 360 |
-
'BDeu': BDeuScore(self.train_data),
|
| 361 |
-
'BDs': BDsScore(self.train_data)
|
| 362 |
-
}
|
| 363 |
-
|
| 364 |
-
structure = hc.estimate(
|
| 365 |
-
scoring_method=scoring_methods[score_method],
|
| 366 |
-
start_dag=None
|
| 367 |
-
)
|
| 368 |
-
model = BayesianNetwork(structure.edges())
|
| 369 |
|
| 370 |
else:
|
| 371 |
raise ValueError(f"Unknown algorithm: {algorithm}")
|
|
@@ -373,32 +305,22 @@ class BayesianNetworkAnalyzer:
|
|
| 373 |
return model
|
| 374 |
|
| 375 |
def _fit_parameters(self, estimator, equivalent_sample_size):
|
| 376 |
-
"""
|
| 377 |
-
參數估計 - 對齊 Django Line 817-834
|
| 378 |
-
"""
|
| 379 |
if estimator == 'bn':
|
| 380 |
self.model.fit(
|
| 381 |
self.train_data,
|
| 382 |
estimator=BayesianEstimator,
|
| 383 |
equivalent_sample_size=equivalent_sample_size
|
| 384 |
)
|
| 385 |
-
elif estimator == 'bn_mcmc':
|
| 386 |
-
# Django 有這個選項但未實作完整
|
| 387 |
-
# 這裡保留相容性
|
| 388 |
-
self.model.fit(
|
| 389 |
-
self.train_data,
|
| 390 |
-
estimator=BayesianEstimator,
|
| 391 |
-
equivalent_sample_size=equivalent_sample_size
|
| 392 |
-
)
|
| 393 |
else:
|
| 394 |
self.model.fit(
|
| 395 |
self.train_data,
|
| 396 |
estimator=MaximumLikelihoodEstimator
|
| 397 |
)
|
| 398 |
|
| 399 |
-
def _predict_probabilities(self, data, target_variable
|
| 400 |
"""
|
| 401 |
-
預測機率 -
|
| 402 |
"""
|
| 403 |
true_labels = []
|
| 404 |
predicted_probs = []
|
|
@@ -406,13 +328,9 @@ class BayesianNetworkAnalyzer:
|
|
| 406 |
model_nodes = set(self.model.nodes())
|
| 407 |
|
| 408 |
for idx, row in data.iterrows():
|
|
|
|
| 409 |
raw_evidence = row.drop(target_variable).to_dict()
|
| 410 |
-
|
| 411 |
-
# 過濾只在模型中的變數
|
| 412 |
-
filtered_evidence = {
|
| 413 |
-
k: v for k, v in raw_evidence.items()
|
| 414 |
-
if k in model_nodes
|
| 415 |
-
}
|
| 416 |
|
| 417 |
true_label = row[target_variable]
|
| 418 |
true_labels.append(true_label)
|
|
@@ -424,39 +342,33 @@ class BayesianNetworkAnalyzer:
|
|
| 424 |
)
|
| 425 |
probs = result.values
|
| 426 |
predicted_probs.append(probs)
|
| 427 |
-
|
| 428 |
except Exception as e:
|
| 429 |
-
# 詳細的錯誤訊息
|
| 430 |
print(f"⚠️ Inference failed at row {idx} | evidence keys: {list(filtered_evidence.keys())} | error: {e}")
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
|
|
|
|
|
|
| 437 |
|
| 438 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 439 |
|
| 440 |
def _evaluate_model(self, data, target_variable, dataset_name):
|
| 441 |
-
"""
|
| 442 |
-
|
| 443 |
-
"""
|
| 444 |
-
threshold = 0.5 # 二元分類閾值
|
| 445 |
-
|
| 446 |
-
# 預測 (對齊 Line 840-892)
|
| 447 |
true_labels, pred_probs = self._predict_probabilities(
|
| 448 |
data, target_variable
|
| 449 |
)
|
| 450 |
|
| 451 |
-
|
| 452 |
-
filtered_data = [
|
| 453 |
-
(label, prob, idx)
|
| 454 |
-
for idx, (label, prob) in enumerate(zip(true_labels, pred_probs))
|
| 455 |
-
if label is not None and prob is not None and len(prob) > 1
|
| 456 |
-
]
|
| 457 |
-
|
| 458 |
-
if not filtered_data:
|
| 459 |
-
print(f"⚠️ No valid predictions for {dataset_name} set")
|
| 460 |
return {
|
| 461 |
'accuracy': 0,
|
| 462 |
'precision': 0,
|
|
@@ -468,74 +380,47 @@ class BayesianNetworkAnalyzer:
|
|
| 468 |
'specificity': 0,
|
| 469 |
'confusion_matrix': [[0, 0], [0, 0]],
|
| 470 |
'fpr': [0],
|
| 471 |
-
'tpr': [0]
|
| 472 |
-
'predicted_probs': []
|
| 473 |
}
|
| 474 |
|
| 475 |
-
|
|
|
|
|
|
|
| 476 |
|
| 477 |
-
#
|
| 478 |
-
pred_probs_array = np.round(
|
| 479 |
-
np.array([prob[1] for prob in pred_probs_filtered]),
|
| 480 |
-
4
|
| 481 |
-
)
|
| 482 |
-
|
| 483 |
-
# 二元預測 (對齊 Line 881)
|
| 484 |
-
pred_labels = (pred_probs_array >= threshold).astype(int)
|
| 485 |
-
|
| 486 |
-
# 確保一致性 (對齊 Line 884-886)
|
| 487 |
-
if len(true_labels_filtered) != len(pred_labels):
|
| 488 |
-
raise ValueError("Mismatch between true labels and predictions after filtering.")
|
| 489 |
-
|
| 490 |
-
true_labels = true_labels_filtered
|
| 491 |
-
|
| 492 |
-
# 計算混淆矩陣 (對齊 Line 888)
|
| 493 |
-
cm = confusion_matrix(true_labels, pred_labels)
|
| 494 |
-
|
| 495 |
-
# 計算 AUC (對齊 Line 890-897)
|
| 496 |
-
try:
|
| 497 |
-
auc = roc_auc_score(
|
| 498 |
-
[1 if label == 1 else 0 for label in true_labels],
|
| 499 |
-
pred_probs_array
|
| 500 |
-
)
|
| 501 |
-
except:
|
| 502 |
-
auc = 0.0
|
| 503 |
-
|
| 504 |
-
# ROC 曲線 (對齊 Line 906)
|
| 505 |
-
try:
|
| 506 |
-
fpr, tpr, _ = roc_curve(true_labels, pred_probs_array)
|
| 507 |
-
except:
|
| 508 |
-
fpr, tpr = [0, 1], [0, 1]
|
| 509 |
-
|
| 510 |
-
# 計算基本指標 (對齊 Line 908-911)
|
| 511 |
accuracy = accuracy_score(true_labels, pred_labels) * 100
|
| 512 |
precision = precision_score(true_labels, pred_labels, zero_division=0) * 100
|
| 513 |
recall = recall_score(true_labels, pred_labels, zero_division=0) * 100
|
| 514 |
f1 = f1_score(true_labels, pred_labels, zero_division=0) * 100
|
| 515 |
|
| 516 |
-
#
|
| 517 |
-
|
|
|
|
|
|
|
| 518 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 519 |
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
|
| 520 |
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
g_mean = np.sqrt(sensitivity * precision_raw) * 100
|
| 524 |
p_mean = np.sqrt(specificity * sensitivity) * 100
|
| 525 |
|
| 526 |
return {
|
| 527 |
-
'accuracy':
|
| 528 |
-
'precision':
|
| 529 |
-
'recall':
|
| 530 |
-
'f1':
|
| 531 |
-
'auc':
|
| 532 |
-
'g_mean':
|
| 533 |
-
'p_mean':
|
| 534 |
-
'specificity':
|
| 535 |
-
'confusion_matrix': cm
|
| 536 |
'fpr': fpr.tolist(),
|
| 537 |
'tpr': tpr.tolist(),
|
| 538 |
-
'predicted_probs':
|
| 539 |
}
|
| 540 |
|
| 541 |
def _get_all_cpds(self):
|
|
@@ -547,24 +432,14 @@ class BayesianNetworkAnalyzer:
|
|
| 547 |
return cpds
|
| 548 |
|
| 549 |
def _calculate_scores(self):
|
| 550 |
-
"""計算模型評分
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
}
|
| 559 |
-
except Exception as e:
|
| 560 |
-
print(f"⚠️ Error calculating scores: {e}")
|
| 561 |
-
scores = {
|
| 562 |
-
'log_likelihood': 0,
|
| 563 |
-
'bic': 0,
|
| 564 |
-
'k2': 0,
|
| 565 |
-
'bdeu': 0,
|
| 566 |
-
'bds': 0
|
| 567 |
-
}
|
| 568 |
return scores
|
| 569 |
|
| 570 |
@classmethod
|
|
@@ -576,4 +451,4 @@ class BayesianNetworkAnalyzer:
|
|
| 576 |
def clear_session_results(cls, session_id):
|
| 577 |
"""清除特定 session 的結果"""
|
| 578 |
if session_id in cls._session_results:
|
| 579 |
-
del cls._session_results[session_id]
|
|
|
|
| 13 |
recall_score, f1_score, roc_curve, roc_auc_score
|
| 14 |
)
|
| 15 |
from pgmpy.metrics import log_likelihood_score, structure_score
|
|
|
|
| 16 |
import threading
|
| 17 |
from datetime import datetime
|
| 18 |
+
from networkx import is_directed_acyclic_graph, DiGraph
|
| 19 |
|
| 20 |
class BayesianNetworkAnalyzer:
|
| 21 |
"""
|
| 22 |
+
貝葉斯網路分析器
|
| 23 |
+
支持多用戶同時使用,每個 session 獨立處理
|
| 24 |
"""
|
| 25 |
|
| 26 |
+
# 類別級的鎖,用於線程安全
|
| 27 |
_lock = threading.Lock()
|
| 28 |
|
| 29 |
# 儲存各 session 的分析結果
|
|
|
|
| 48 |
equivalent_sample_size=3, score_method='BIC',
|
| 49 |
sig_level=0.05, n_bins=10):
|
| 50 |
"""
|
| 51 |
+
執行完整的貝葉斯網路分析 - 完全對齊 Django 版本的順序
|
| 52 |
|
| 53 |
Args:
|
| 54 |
df: 原始資料框
|
|
|
|
| 69 |
|
| 70 |
with self._lock:
|
| 71 |
try:
|
| 72 |
+
# 1. 資料預處理 (只選擇欄位和處理缺失值)
|
| 73 |
+
processed_df = self._preprocess_data(
|
| 74 |
+
df, cat_features, con_features, target_variable
|
| 75 |
+
)
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
+
# 2. 分割訓練/測試集 (✅ random_state=526)
|
| 78 |
self.train_data, self.test_data = train_test_split(
|
| 79 |
+
processed_df,
|
| 80 |
test_size=test_fraction,
|
| 81 |
+
random_state=526,
|
| 82 |
+
stratify=processed_df[target_variable] if target_variable in processed_df.columns else None
|
| 83 |
)
|
| 84 |
|
| 85 |
+
# 3. ✅ 學習網路結構 (在分箱和編碼之前!)
|
| 86 |
self.model = self._learn_structure(
|
| 87 |
algorithm, score_method, sig_level, target_variable
|
| 88 |
)
|
| 89 |
|
| 90 |
+
# 4. ✅ 對分類變數編碼 (在學習結構之後,分箱之前)
|
| 91 |
+
self._encode_categorical_features(cat_features)
|
|
|
|
| 92 |
|
| 93 |
+
# 5. ✅ 對連續變數分箱 (在編碼之後)
|
| 94 |
+
self._bin_continuous_features(con_features, n_bins)
|
| 95 |
+
|
| 96 |
+
# 6. 參數估計
|
| 97 |
self._fit_parameters(estimator, equivalent_sample_size)
|
| 98 |
|
| 99 |
+
# 7. 初始化推論引擎
|
| 100 |
self.inference = VariableElimination(self.model)
|
| 101 |
|
| 102 |
+
# 8. 評估模型
|
| 103 |
train_metrics = self._evaluate_model(
|
| 104 |
self.train_data, target_variable, "train"
|
| 105 |
)
|
|
|
|
| 107 |
self.test_data, target_variable, "test"
|
| 108 |
)
|
| 109 |
|
| 110 |
+
# 9. 獲取 CPD
|
| 111 |
cpds = self._get_all_cpds()
|
| 112 |
|
| 113 |
+
# 10. 計算模型評分
|
| 114 |
scores = self._calculate_scores()
|
| 115 |
|
| 116 |
+
# 11. 整理結果
|
| 117 |
results = {
|
| 118 |
'model': self.model,
|
| 119 |
'inference': self.inference,
|
|
|
|
| 145 |
except Exception as e:
|
| 146 |
raise Exception(f"Analysis failed: {str(e)}")
|
| 147 |
|
| 148 |
+
def _preprocess_data(self, df, cat_features, con_features, target_variable):
|
| 149 |
+
"""資料預處理 - 只選擇欄位和刪除缺失值"""
|
| 150 |
+
# 選擇需要的欄位
|
| 151 |
+
selected_columns = cat_features + con_features + [target_variable]
|
| 152 |
+
processed_df = df[selected_columns].copy()
|
| 153 |
+
|
| 154 |
+
# 處理缺失值
|
| 155 |
+
processed_df = processed_df.dropna()
|
| 156 |
+
|
| 157 |
+
return processed_df
|
| 158 |
+
|
| 159 |
+
def _encode_categorical_features(self, cat_features):
|
| 160 |
"""
|
| 161 |
+
✅ 將分類變數轉為 category codes - 完全對齊 Django
|
| 162 |
+
注意:只對 cat_features 編碼,不對分箱後的連續變數編碼
|
| 163 |
+
Django 只對 train_data 編碼,但我們為了一致性也對 test_data 編碼
|
| 164 |
"""
|
|
|
|
|
|
|
| 165 |
for col in cat_features:
|
| 166 |
if col in self.train_data.columns:
|
| 167 |
if self.train_data[col].dtype == 'object':
|
| 168 |
self.train_data[col] = self.train_data[col].astype('category').cat.codes
|
| 169 |
+
# Django 沒有對 test_data 編碼,但為了預測時一致性,我們也編碼
|
| 170 |
if col in self.test_data.columns:
|
| 171 |
if self.test_data[col].dtype == 'object':
|
| 172 |
self.test_data[col] = self.test_data[col].astype('category').cat.codes
|
| 173 |
+
|
| 174 |
+
def _bin_continuous_features(self, con_features, n_bins):
|
| 175 |
+
"""
|
| 176 |
+
✅ 對連續變數分箱 - 完全對齊 Django 版本
|
| 177 |
+
先用訓練集計算邊界,再套用到測試集
|
| 178 |
+
"""
|
| 179 |
self.bins_dict = {}
|
| 180 |
|
|
|
|
| 181 |
for col in con_features:
|
| 182 |
if col in self.train_data.columns and self.train_data[col].notna().sum() > 0:
|
| 183 |
+
# 使用訓練集計算分箱邊界
|
| 184 |
bin_edges = pd.cut(
|
| 185 |
self.train_data[col],
|
| 186 |
bins=n_bins,
|
|
|
|
| 188 |
duplicates='drop'
|
| 189 |
)[1]
|
| 190 |
|
|
|
|
| 191 |
self.bins_dict[col] = bin_edges
|
| 192 |
|
| 193 |
+
# 創建分箱標籤 (✅ 使用 – 而不是 -)
|
| 194 |
bin_labels = [
|
| 195 |
+
f"{round(bin_edges[i], 2)}–{round(bin_edges[i+1], 2)}"
|
| 196 |
for i in range(len(bin_edges) - 1)
|
| 197 |
]
|
| 198 |
|
| 199 |
+
# 對訓練集分箱
|
| 200 |
self.train_data[col] = pd.cut(
|
| 201 |
self.train_data[col],
|
| 202 |
bins=bin_edges,
|
| 203 |
labels=bin_labels,
|
| 204 |
include_lowest=True
|
| 205 |
).astype(object).fillna("Missing")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
|
| 207 |
+
# 對測試集使用相同邊界分箱
|
| 208 |
+
if col in self.test_data.columns:
|
| 209 |
+
self.test_data[col] = pd.cut(
|
| 210 |
+
self.test_data[col],
|
| 211 |
+
bins=bin_edges,
|
| 212 |
+
labels=bin_labels,
|
| 213 |
+
include_lowest=True
|
| 214 |
+
).astype(object).fillna("Missing")
|
| 215 |
+
else:
|
| 216 |
+
print(f"⚠️ Skipped binning column '{col}' – missing or all NaN")
|
|
|
|
|
|
|
|
|
|
| 217 |
|
| 218 |
def _learn_structure(self, algorithm, score_method, sig_level, target_variable):
|
| 219 |
+
"""學習網路結構 - 完全對齊 Django 版本"""
|
|
|
|
|
|
|
| 220 |
|
| 221 |
if algorithm == 'NB':
|
| 222 |
+
# Naive Bayes
|
| 223 |
edges = [
|
| 224 |
(target_variable, feature)
|
| 225 |
for feature in self.train_data.columns
|
|
|
|
| 228 |
model = BayesianNetwork(edges)
|
| 229 |
|
| 230 |
elif algorithm == 'TAN':
|
| 231 |
+
# Tree-Augmented Naive Bayes
|
| 232 |
+
# ✅ 特殊情況處理: 如果同時存在'asia'和'either'列,特別指定'asia'作為根節點
|
| 233 |
if 'asia' in self.train_data.columns and 'either' in self.train_data.columns and target_variable == 'either':
|
| 234 |
tan_search = TreeSearch(self.train_data, root_node='asia')
|
| 235 |
else:
|
|
|
|
| 242 |
model = BayesianNetwork(structure.edges())
|
| 243 |
|
| 244 |
elif algorithm == 'CL':
|
| 245 |
+
# Chow-Liu
|
| 246 |
tan_search = TreeSearch(self.train_data)
|
| 247 |
structure = tan_search.estimate(
|
| 248 |
estimator_type='chow-liu',
|
|
|
|
| 250 |
)
|
| 251 |
model = BayesianNetwork(structure.edges())
|
| 252 |
|
| 253 |
+
elif algorithm == 'HC':
|
| 254 |
+
# Hill Climbing
|
| 255 |
+
hc = HillClimbSearch(self.train_data)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
|
| 257 |
+
# 選擇評分方法
|
| 258 |
+
scoring_methods = {
|
| 259 |
+
'BIC': BicScore(self.train_data),
|
| 260 |
+
'AIC': AICScore(self.train_data),
|
| 261 |
+
'K2': K2Score(self.train_data),
|
| 262 |
+
'BDeu': BDeuScore(self.train_data),
|
| 263 |
+
'BDs': BDsScore(self.train_data)
|
| 264 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
|
| 266 |
+
structure = hc.estimate(
|
| 267 |
+
scoring_method=scoring_methods[score_method]
|
| 268 |
+
)
|
| 269 |
+
model = BayesianNetwork(structure.edges())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
|
| 271 |
+
elif algorithm == 'PC':
|
| 272 |
+
# PC Algorithm - ✅ 與 Django 完全一致的降級策略
|
| 273 |
+
pc = PC(self.train_data)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
|
| 275 |
+
# 嘗試不同的 max_cond_vars 直到成功
|
| 276 |
+
for max_cond in [5, 4, 3, 2, 1]:
|
| 277 |
try:
|
| 278 |
+
structure = pc.estimate(
|
| 279 |
significance_level=sig_level,
|
| 280 |
+
max_cond_vars=max_cond,
|
| 281 |
ci_test='chi_square',
|
| 282 |
variant='stable',
|
| 283 |
+
n_jobs=1 # ✅ Django 第一次用 1
|
| 284 |
)
|
| 285 |
+
|
| 286 |
+
# 檢查是否有效 (✅ 與 Django 一致)
|
| 287 |
+
edges = structure.edges()
|
| 288 |
+
if is_directed_acyclic_graph(DiGraph(edges)) and any(target_variable in edge for edge in edges):
|
| 289 |
+
model = BayesianNetwork(structure.edges())
|
| 290 |
+
break
|
| 291 |
except:
|
| 292 |
+
continue
|
| 293 |
+
else:
|
| 294 |
+
# 如果都失敗,使用 Naive Bayes (✅ 與 Django 一致)
|
|
|
|
|
|
|
| 295 |
edges = [
|
| 296 |
(target_variable, feature)
|
| 297 |
for feature in self.train_data.columns
|
| 298 |
if feature != target_variable
|
| 299 |
]
|
| 300 |
model = BayesianNetwork(edges)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
|
| 302 |
else:
|
| 303 |
raise ValueError(f"Unknown algorithm: {algorithm}")
|
|
|
|
| 305 |
return model
|
| 306 |
|
| 307 |
def _fit_parameters(self, estimator, equivalent_sample_size):
    """
    Estimate the parameters of ``self.model`` from ``self.train_data``.

    Args:
        estimator: ``'bn'`` selects ``BayesianEstimator``; any other value
            selects ``MaximumLikelihoodEstimator``.
        equivalent_sample_size: Passed through to ``fit`` only on the
            ``'bn'`` path.
    """
    if estimator != 'bn':
        # Every value other than 'bn' takes the maximum-likelihood path.
        self.model.fit(self.train_data, estimator=MaximumLikelihoodEstimator)
        return

    self.model.fit(
        self.train_data,
        estimator=BayesianEstimator,
        equivalent_sample_size=equivalent_sample_size,
    )
|
| 320 |
|
| 321 |
+
def _predict_probabilities(self, data, target_variable):
|
| 322 |
"""
|
| 323 |
+
預測機率 - ✅ 與 Django 版本完全一致
|
| 324 |
"""
|
| 325 |
true_labels = []
|
| 326 |
predicted_probs = []
|
|
|
|
| 328 |
model_nodes = set(self.model.nodes())
|
| 329 |
|
| 330 |
for idx, row in data.iterrows():
|
| 331 |
+
# 準備 evidence (✅ 過濾只在模型中的變數)
|
| 332 |
raw_evidence = row.drop(target_variable).to_dict()
|
| 333 |
+
filtered_evidence = {k: v for k, v in raw_evidence.items() if k in model_nodes}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 334 |
|
| 335 |
true_label = row[target_variable]
|
| 336 |
true_labels.append(true_label)
|
|
|
|
| 342 |
)
|
| 343 |
probs = result.values
|
| 344 |
predicted_probs.append(probs)
|
|
|
|
| 345 |
except Exception as e:
|
|
|
|
| 346 |
print(f"⚠️ Inference failed at row {idx} | evidence keys: {list(filtered_evidence.keys())} | error: {e}")
|
| 347 |
+
predicted_probs.append(None)
|
| 348 |
+
|
| 349 |
+
# ✅ 過濾有效結果 (與 Django 一致)
|
| 350 |
+
valid_data = [
|
| 351 |
+
(label, prob)
|
| 352 |
+
for label, prob in zip(true_labels, predicted_probs)
|
| 353 |
+
if prob is not None and len(prob) > 1
|
| 354 |
+
]
|
| 355 |
|
| 356 |
+
if not valid_data:
|
| 357 |
+
return [], []
|
| 358 |
+
|
| 359 |
+
valid_labels, valid_probs = zip(*valid_data)
|
| 360 |
+
prob_array = np.round(np.array([prob[1] for prob in valid_probs]), 4)
|
| 361 |
+
|
| 362 |
+
return list(valid_labels), prob_array
|
| 363 |
|
| 364 |
def _evaluate_model(self, data, target_variable, dataset_name):
|
| 365 |
+
"""評估模型效能 - ✅ 與 Django 完全一致"""
|
| 366 |
+
# 預測
|
|
|
|
|
|
|
|
|
|
|
|
|
| 367 |
true_labels, pred_probs = self._predict_probabilities(
|
| 368 |
data, target_variable
|
| 369 |
)
|
| 370 |
|
| 371 |
+
if len(true_labels) == 0:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 372 |
return {
|
| 373 |
'accuracy': 0,
|
| 374 |
'precision': 0,
|
|
|
|
| 380 |
'specificity': 0,
|
| 381 |
'confusion_matrix': [[0, 0], [0, 0]],
|
| 382 |
'fpr': [0],
|
| 383 |
+
'tpr': [0]
|
|
|
|
| 384 |
}
|
| 385 |
|
| 386 |
+
# 二元預測 (threshold = 0.1, ✅ 與 Django 一致)
|
| 387 |
+
threshold = 0.1
|
| 388 |
+
pred_labels = (pred_probs >= threshold).astype(int)
|
| 389 |
|
| 390 |
+
# 計算指標
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 391 |
accuracy = accuracy_score(true_labels, pred_labels) * 100
|
| 392 |
precision = precision_score(true_labels, pred_labels, zero_division=0) * 100
|
| 393 |
recall = recall_score(true_labels, pred_labels, zero_division=0) * 100
|
| 394 |
f1 = f1_score(true_labels, pred_labels, zero_division=0) * 100
|
| 395 |
|
| 396 |
+
# ROC 曲線
|
| 397 |
+
pred_probs_clean = np.nan_to_num(pred_probs, nan=0.0)
|
| 398 |
+
fpr, tpr, _ = roc_curve(true_labels, pred_probs_clean)
|
| 399 |
+
auc = roc_auc_score(true_labels, pred_probs_clean)
|
| 400 |
|
| 401 |
+
# 混淆矩陣
|
| 402 |
+
cm = confusion_matrix(true_labels, pred_labels).tolist()
|
| 403 |
+
|
| 404 |
+
# G-mean 和 P-mean (✅ 與 Django 計算方式一致)
|
| 405 |
+
tn, fp, fn, tp = confusion_matrix(true_labels, pred_labels).ravel()
|
| 406 |
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
|
| 407 |
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
|
| 408 |
+
g_mean = np.sqrt(sensitivity * precision / 100) * 100
|
|
|
|
|
|
|
| 409 |
p_mean = np.sqrt(specificity * sensitivity) * 100
|
| 410 |
|
| 411 |
return {
|
| 412 |
+
'accuracy': accuracy,
|
| 413 |
+
'precision': precision,
|
| 414 |
+
'recall': recall,
|
| 415 |
+
'f1': f1,
|
| 416 |
+
'auc': auc,
|
| 417 |
+
'g_mean': g_mean,
|
| 418 |
+
'p_mean': p_mean,
|
| 419 |
+
'specificity': specificity * 100,
|
| 420 |
+
'confusion_matrix': cm,
|
| 421 |
'fpr': fpr.tolist(),
|
| 422 |
'tpr': tpr.tolist(),
|
| 423 |
+
'predicted_probs': pred_probs.tolist()
|
| 424 |
}
|
| 425 |
|
| 426 |
def _get_all_cpds(self):
|
|
|
|
| 432 |
return cpds
|
| 433 |
|
| 434 |
def _calculate_scores(self):
    """
    Score the fitted model against the training data.

    Returns:
        dict with the keys ``'log_likelihood'``, ``'bic'``, ``'k2'``,
        ``'bdeu'`` and ``'bds'``. The first comes from
        ``log_likelihood_score``; the rest come from ``structure_score``
        called with the matching ``scoring_method`` name.
    """
    scores = {
        'log_likelihood': log_likelihood_score(self.model, self.train_data),
    }
    # Each result key doubles as the scoring_method name passed on.
    for method in ('bic', 'k2', 'bdeu', 'bds'):
        scores[method] = structure_score(
            self.model, self.train_data, scoring_method=method
        )
    return scores
|
| 444 |
|
| 445 |
@classmethod
|
|
|
|
| 451 |
def clear_session_results(cls, session_id):
|
| 452 |
"""清除特定 session 的結果"""
|
| 453 |
if session_id in cls._session_results:
|
| 454 |
+
del cls._session_results[session_id]
|