Wen1201 committed on
Commit
72f09a3
·
verified ·
1 Parent(s): 1e931ad

Upload bn_core.py

Browse files
Files changed (1) hide show
  1. bn_core.py +155 -280
bn_core.py CHANGED
@@ -13,17 +13,17 @@ from sklearn.metrics import (
13
  recall_score, f1_score, roc_curve, roc_auc_score
14
  )
15
  from pgmpy.metrics import log_likelihood_score, structure_score
16
- from networkx import is_directed_acyclic_graph, DiGraph
17
  import threading
18
  from datetime import datetime
 
19
 
20
  class BayesianNetworkAnalyzer:
21
  """
22
- 貝葉斯網路分析器 - 完全對齊 Django views.py 的邏輯
23
- 多用戶同時使用,每個 session 獨立處理
24
  """
25
 
26
- # 類別的鎖,用於線程安全
27
  _lock = threading.Lock()
28
 
29
  # 儲存各 session 的分析結果
@@ -48,7 +48,7 @@ class BayesianNetworkAnalyzer:
48
  equivalent_sample_size=3, score_method='BIC',
49
  sig_level=0.05, n_bins=10):
50
  """
51
- 執行完整的貝葉斯網路分析 - 對齊 Django views.py Line 392-926
52
 
53
  Args:
54
  df: 原始資料框
@@ -69,38 +69,37 @@ class BayesianNetworkAnalyzer:
69
 
70
  with self._lock:
71
  try:
72
- # 1. 選擇需要的欄位 (對齊 Line 598)
73
- selected_features = cat_features + con_features
74
- Sum_features = selected_features + [target_variable]
75
- df = df[Sum_features].copy()
76
-
77
- # 重置索引 (對齊 Line 599)
78
- df.index = df.index + 1
79
 
80
- # 2. 分割資料集 (對齊 Line 600-604) - 關鍵!在預處理之前
81
  self.train_data, self.test_data = train_test_split(
82
- df,
83
  test_size=test_fraction,
84
- random_state=42, # Django 用 526, 這裡用 42 (可改)
85
- stratify=df[target_variable] if target_variable in df.columns else None
86
  )
87
 
88
- # 3. 學習網路結構 (對齊 Line 607-759)
89
  self.model = self._learn_structure(
90
  algorithm, score_method, sig_level, target_variable
91
  )
92
 
93
- # 4. 資料預處理 -結構學習之後 (對齊 Line 762-814)
94
- # 關鍵: 用 train_data 計算 bins, 然後應用到 test_data
95
- self._preprocess_data_inplace(cat_features, con_features, n_bins)
96
 
97
- # 5. 參數估計 ( Line 817-834)
 
 
 
98
  self._fit_parameters(estimator, equivalent_sample_size)
99
 
100
- # 6. 初始化推論引擎 (對齊 Line 837)
101
  self.inference = VariableElimination(self.model)
102
 
103
- # 7. 評估模型 (對齊 Line 840-928)
104
  train_metrics = self._evaluate_model(
105
  self.train_data, target_variable, "train"
106
  )
@@ -108,13 +107,13 @@ class BayesianNetworkAnalyzer:
108
  self.test_data, target_variable, "test"
109
  )
110
 
111
- # 8. 獲取 CPD
112
  cpds = self._get_all_cpds()
113
 
114
- # 9. 計算模型評分
115
  scores = self._calculate_scores()
116
 
117
- # 10. 整理結果
118
  results = {
119
  'model': self.model,
120
  'inference': self.inference,
@@ -146,28 +145,42 @@ class BayesianNetworkAnalyzer:
146
  except Exception as e:
147
  raise Exception(f"Analysis failed: {str(e)}")
148
 
149
- def _preprocess_data_inplace(self, cat_features, con_features, n_bins):
 
 
 
 
 
 
 
 
 
 
 
150
  """
151
- 就地預處理訓練和測試資料 - 對齊 Django Line 762-814
152
- 關鍵: 先用 train_data 計算 bins, 再應用到 test_data
 
153
  """
154
-
155
- # 處理分類特徵 (對齊 Line 762-764)
156
  for col in cat_features:
157
  if col in self.train_data.columns:
158
  if self.train_data[col].dtype == 'object':
159
  self.train_data[col] = self.train_data[col].astype('category').cat.codes
 
160
  if col in self.test_data.columns:
161
  if self.test_data[col].dtype == 'object':
162
  self.test_data[col] = self.test_data[col].astype('category').cat.codes
163
-
164
- # 處理連續特徵 - 分箱 (對齊 Line 766-814)
 
 
 
 
165
  self.bins_dict = {}
166
 
167
- # 步驟 1: 用 train_data 計算 bins (對齊 Line 769-787)
168
  for col in con_features:
169
  if col in self.train_data.columns and self.train_data[col].notna().sum() > 0:
170
- # 計算分箱邊界
171
  bin_edges = pd.cut(
172
  self.train_data[col],
173
  bins=n_bins,
@@ -175,49 +188,38 @@ class BayesianNetworkAnalyzer:
175
  duplicates='drop'
176
  )[1]
177
 
178
- # 儲存 bins 供測試集使用
179
  self.bins_dict[col] = bin_edges
180
 
181
- # 創建分箱標籤
182
  bin_labels = [
183
- f"{round(bin_edges[i], 2)}-{round(bin_edges[i+1], 2)}"
184
  for i in range(len(bin_edges) - 1)
185
  ]
186
 
187
- # 應用到訓練集
188
  self.train_data[col] = pd.cut(
189
  self.train_data[col],
190
  bins=bin_edges,
191
  labels=bin_labels,
192
  include_lowest=True
193
  ).astype(object).fillna("Missing")
194
-
195
- # 步驟 2: 用相同的 bins 處理 test_data (對齊 Line 789-803)
196
- for col in con_features:
197
- if col in self.test_data.columns and col in self.bins_dict:
198
- bin_edges = self.bins_dict[col]
199
 
200
- # 使用相同的標籤
201
- bin_labels = [
202
- f"{round(bin_edges[i], 2)}-{round(bin_edges[i+1], 2)}"
203
- for i in range(len(bin_edges) - 1)
204
- ]
205
-
206
- # 應用到測試集
207
- self.test_data[col] = pd.cut(
208
- self.test_data[col],
209
- bins=bin_edges,
210
- labels=bin_labels,
211
- include_lowest=True
212
- ).astype(object).fillna("Missing")
213
 
214
  def _learn_structure(self, algorithm, score_method, sig_level, target_variable):
215
- """
216
- 學習網路結構 - 對齊 Django Line 607-759
217
- """
218
 
219
  if algorithm == 'NB':
220
- # Naive Bayes (對齊 Line 608-609)
221
  edges = [
222
  (target_variable, feature)
223
  for feature in self.train_data.columns
@@ -226,8 +228,8 @@ class BayesianNetworkAnalyzer:
226
  model = BayesianNetwork(edges)
227
 
228
  elif algorithm == 'TAN':
229
- # Tree-Augmented Naive Bayes (對齊 Line 610-623)
230
- # 特殊處理: asia dataset
231
  if 'asia' in self.train_data.columns and 'either' in self.train_data.columns and target_variable == 'either':
232
  tan_search = TreeSearch(self.train_data, root_node='asia')
233
  else:
@@ -240,7 +242,7 @@ class BayesianNetworkAnalyzer:
240
  model = BayesianNetwork(structure.edges())
241
 
242
  elif algorithm == 'CL':
243
- # Chow-Liu (對齊 Line 625-627)
244
  tan_search = TreeSearch(self.train_data)
245
  structure = tan_search.estimate(
246
  estimator_type='chow-liu',
@@ -248,124 +250,54 @@ class BayesianNetworkAnalyzer:
248
  )
249
  model = BayesianNetwork(structure.edges())
250
 
251
- elif algorithm == 'PC':
252
- # PC Algorithm (對齊 Line 629-721)
253
- pc = PC(self.train_data)
254
-
255
- # 嘗試不同的 max_cond_vars (對齊 Line 669-720)
256
- model_est = None
257
-
258
- # max_cond_vars = 5
259
- try:
260
- model_est = pc.estimate(
261
- significance_level=sig_level,
262
- max_cond_vars=5,
263
- ci_test='chi_square',
264
- variant='stable',
265
- n_jobs=1
266
- )
267
- edges = model_est.edges()
268
- # 驗證: 必須是 DAG 且目標變數在結構中
269
- if not is_directed_acyclic_graph(DiGraph(edges)) or \
270
- not any(target_variable in edge for edge in edges):
271
- model_est = None
272
- except:
273
- model_est = None
274
 
275
- # max_cond_vars = 4
276
- if model_est is None:
277
- try:
278
- model_est = pc.estimate(
279
- significance_level=sig_level,
280
- max_cond_vars=4,
281
- ci_test='chi_square',
282
- variant='stable',
283
- n_jobs=1
284
- )
285
- edges = model_est.edges()
286
- if not is_directed_acyclic_graph(DiGraph(edges)) or \
287
- not any(target_variable in edge for edge in edges):
288
- model_est = None
289
- except:
290
- model_est = None
291
 
292
- # max_cond_vars = 3
293
- if model_est is None:
294
- try:
295
- model_est = pc.estimate(
296
- significance_level=sig_level,
297
- max_cond_vars=3,
298
- ci_test='chi_square',
299
- variant='stable',
300
- n_jobs=1
301
- )
302
- edges = model_est.edges()
303
- if not is_directed_acyclic_graph(DiGraph(edges)) or \
304
- not any(target_variable in edge for edge in edges):
305
- model_est = None
306
- except:
307
- model_est = None
308
 
309
- # max_cond_vars = 2
310
- if model_est is None:
311
- try:
312
- model_est = pc.estimate(
313
- significance_level=sig_level,
314
- max_cond_vars=2,
315
- ci_test='chi_square',
316
- variant='stable',
317
- n_jobs=1
318
- )
319
- edges = model_est.edges()
320
- if not is_directed_acyclic_graph(DiGraph(edges)) or \
321
- not any(target_variable in edge for edge in edges):
322
- model_est = None
323
- except:
324
- model_est = None
325
 
326
- # max_cond_vars = 1
327
- if model_est is None:
328
  try:
329
- model_est = pc.estimate(
330
  significance_level=sig_level,
331
- max_cond_vars=1,
332
  ci_test='chi_square',
333
  variant='stable',
334
- n_jobs=1
335
  )
 
 
 
 
 
 
336
  except:
337
- model_est = None
338
-
339
- # 如果全部失敗, fallback to Naive Bayes
340
- if model_est is None:
341
- print("⚠️ PC algorithm failed, falling back to Naive Bayes")
342
  edges = [
343
  (target_variable, feature)
344
  for feature in self.train_data.columns
345
  if feature != target_variable
346
  ]
347
  model = BayesianNetwork(edges)
348
- else:
349
- model = BayesianNetwork(model_est.edges())
350
-
351
- elif algorithm == 'HC':
352
- # Hill Climbing (對齊 Line 723-758)
353
- hc = HillClimbSearch(self.train_data)
354
-
355
- # 選擇評分方法
356
- scoring_methods = {
357
- 'AIC': AICScore(self.train_data),
358
- 'BIC': BicScore(self.train_data),
359
- 'K2': K2Score(self.train_data),
360
- 'BDeu': BDeuScore(self.train_data),
361
- 'BDs': BDsScore(self.train_data)
362
- }
363
-
364
- structure = hc.estimate(
365
- scoring_method=scoring_methods[score_method],
366
- start_dag=None
367
- )
368
- model = BayesianNetwork(structure.edges())
369
 
370
  else:
371
  raise ValueError(f"Unknown algorithm: {algorithm}")
@@ -373,32 +305,22 @@ class BayesianNetworkAnalyzer:
373
  return model
374
 
375
  def _fit_parameters(self, estimator, equivalent_sample_size):
376
- """
377
- 參數估計 - 對齊 Django Line 817-834
378
- """
379
  if estimator == 'bn':
380
  self.model.fit(
381
  self.train_data,
382
  estimator=BayesianEstimator,
383
  equivalent_sample_size=equivalent_sample_size
384
  )
385
- elif estimator == 'bn_mcmc':
386
- # Django 有這個選項但未實作完整
387
- # 這裡保留相容性
388
- self.model.fit(
389
- self.train_data,
390
- estimator=BayesianEstimator,
391
- equivalent_sample_size=equivalent_sample_size
392
- )
393
  else:
394
  self.model.fit(
395
  self.train_data,
396
  estimator=MaximumLikelihoodEstimator
397
  )
398
 
399
- def _predict_probabilities(self, data, target_variable, fallback_prob=None):
400
  """
401
- 預測機率 - 對齊 Django Line 120-158
402
  """
403
  true_labels = []
404
  predicted_probs = []
@@ -406,13 +328,9 @@ class BayesianNetworkAnalyzer:
406
  model_nodes = set(self.model.nodes())
407
 
408
  for idx, row in data.iterrows():
 
409
  raw_evidence = row.drop(target_variable).to_dict()
410
-
411
- # 過濾只在模型中的變數
412
- filtered_evidence = {
413
- k: v for k, v in raw_evidence.items()
414
- if k in model_nodes
415
- }
416
 
417
  true_label = row[target_variable]
418
  true_labels.append(true_label)
@@ -424,39 +342,33 @@ class BayesianNetworkAnalyzer:
424
  )
425
  probs = result.values
426
  predicted_probs.append(probs)
427
-
428
  except Exception as e:
429
- # 詳細的錯誤訊息
430
  print(f"⚠️ Inference failed at row {idx} | evidence keys: {list(filtered_evidence.keys())} | error: {e}")
431
-
432
- # 支援 fallback probability
433
- if fallback_prob is not None:
434
- predicted_probs.append(fallback_prob)
435
- else:
436
- predicted_probs.append(None)
 
 
437
 
438
- return true_labels, predicted_probs
 
 
 
 
 
 
439
 
440
  def _evaluate_model(self, data, target_variable, dataset_name):
441
- """
442
- 評估模型效能 - 對齊 Django Line 840-928
443
- """
444
- threshold = 0.5 # 二元分類閾值
445
-
446
- # 預測 (對齊 Line 840-892)
447
  true_labels, pred_probs = self._predict_probabilities(
448
  data, target_variable
449
  )
450
 
451
- # 過濾有效結果 (對齊 Line 866-874)
452
- filtered_data = [
453
- (label, prob, idx)
454
- for idx, (label, prob) in enumerate(zip(true_labels, pred_probs))
455
- if label is not None and prob is not None and len(prob) > 1
456
- ]
457
-
458
- if not filtered_data:
459
- print(f"⚠️ No valid predictions for {dataset_name} set")
460
  return {
461
  'accuracy': 0,
462
  'precision': 0,
@@ -468,74 +380,47 @@ class BayesianNetworkAnalyzer:
468
  'specificity': 0,
469
  'confusion_matrix': [[0, 0], [0, 0]],
470
  'fpr': [0],
471
- 'tpr': [0],
472
- 'predicted_probs': []
473
  }
474
 
475
- true_labels_filtered, pred_probs_filtered, valid_indices = zip(*filtered_data)
 
 
476
 
477
- # 轉換為 numpy array (對齊 Line 878)
478
- pred_probs_array = np.round(
479
- np.array([prob[1] for prob in pred_probs_filtered]),
480
- 4
481
- )
482
-
483
- # 二元預測 (對齊 Line 881)
484
- pred_labels = (pred_probs_array >= threshold).astype(int)
485
-
486
- # 確保一致性 (對齊 Line 884-886)
487
- if len(true_labels_filtered) != len(pred_labels):
488
- raise ValueError("Mismatch between true labels and predictions after filtering.")
489
-
490
- true_labels = true_labels_filtered
491
-
492
- # 計算混淆矩陣 (對齊 Line 888)
493
- cm = confusion_matrix(true_labels, pred_labels)
494
-
495
- # 計算 AUC (對齊 Line 890-897)
496
- try:
497
- auc = roc_auc_score(
498
- [1 if label == 1 else 0 for label in true_labels],
499
- pred_probs_array
500
- )
501
- except:
502
- auc = 0.0
503
-
504
- # ROC 曲線 (對齊 Line 906)
505
- try:
506
- fpr, tpr, _ = roc_curve(true_labels, pred_probs_array)
507
- except:
508
- fpr, tpr = [0, 1], [0, 1]
509
-
510
- # 計算基本指標 (對齊 Line 908-911)
511
  accuracy = accuracy_score(true_labels, pred_labels) * 100
512
  precision = precision_score(true_labels, pred_labels, zero_division=0) * 100
513
  recall = recall_score(true_labels, pred_labels, zero_division=0) * 100
514
  f1 = f1_score(true_labels, pred_labels, zero_division=0) * 100
515
 
516
- # 計算 G-mean 和 P-mean (對齊 Django calculate_performance_metrics Line 48-66)
517
- tn, fp, fn, tp = cm.ravel()
 
 
518
 
 
 
 
 
 
519
  sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
520
  specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
521
- precision_raw = tp / (tp + fp) if (tp + fp) > 0 else 0
522
-
523
- g_mean = np.sqrt(sensitivity * precision_raw) * 100
524
  p_mean = np.sqrt(specificity * sensitivity) * 100
525
 
526
  return {
527
- 'accuracy': round(accuracy, 2),
528
- 'precision': round(precision, 2),
529
- 'recall': round(recall, 2),
530
- 'f1': round(f1, 2),
531
- 'auc': round(auc, 4),
532
- 'g_mean': round(g_mean, 2),
533
- 'p_mean': round(p_mean, 2),
534
- 'specificity': round(specificity * 100, 2),
535
- 'confusion_matrix': cm.tolist(),
536
  'fpr': fpr.tolist(),
537
  'tpr': tpr.tolist(),
538
- 'predicted_probs': pred_probs_array.tolist()
539
  }
540
 
541
  def _get_all_cpds(self):
@@ -547,24 +432,14 @@ class BayesianNetworkAnalyzer:
547
  return cpds
548
 
549
  def _calculate_scores(self):
550
- """計算模型評分 - 對齊 Django Line 942-947"""
551
- try:
552
- scores = {
553
- 'log_likelihood': log_likelihood_score(self.model, self.train_data),
554
- 'bic': structure_score(self.model, self.train_data, scoring_method='bic'),
555
- 'k2': structure_score(self.model, self.train_data, scoring_method='k2'),
556
- 'bdeu': structure_score(self.model, self.train_data, scoring_method='bdeu'),
557
- 'bds': structure_score(self.model, self.train_data, scoring_method='bds')
558
- }
559
- except Exception as e:
560
- print(f"⚠️ Error calculating scores: {e}")
561
- scores = {
562
- 'log_likelihood': 0,
563
- 'bic': 0,
564
- 'k2': 0,
565
- 'bdeu': 0,
566
- 'bds': 0
567
- }
568
  return scores
569
 
570
  @classmethod
@@ -576,4 +451,4 @@ class BayesianNetworkAnalyzer:
576
  def clear_session_results(cls, session_id):
577
  """清除特定 session 的結果"""
578
  if session_id in cls._session_results:
579
- del cls._session_results[session_id]
 
13
  recall_score, f1_score, roc_curve, roc_auc_score
14
  )
15
  from pgmpy.metrics import log_likelihood_score, structure_score
 
16
  import threading
17
  from datetime import datetime
18
+ from networkx import is_directed_acyclic_graph, DiGraph
19
 
20
  class BayesianNetworkAnalyzer:
21
  """
22
+ 貝葉斯網路分析器
23
+ 多用戶同時使用,每個 session 獨立處理
24
  """
25
 
26
+ # 類別的鎖,用於線程安全
27
  _lock = threading.Lock()
28
 
29
  # 儲存各 session 的分析結果
 
48
  equivalent_sample_size=3, score_method='BIC',
49
  sig_level=0.05, n_bins=10):
50
  """
51
+ 執行完整的貝葉斯網路分析 - 完全對齊 Django 版本的順序
52
 
53
  Args:
54
  df: 原始資料框
 
69
 
70
  with self._lock:
71
  try:
72
+ # 1. 資料預處理 (只選擇欄位和處理缺失值)
73
+ processed_df = self._preprocess_data(
74
+ df, cat_features, con_features, target_variable
75
+ )
 
 
 
76
 
77
+ # 2. 分割訓練/測試集 ( random_state=526)
78
  self.train_data, self.test_data = train_test_split(
79
+ processed_df,
80
  test_size=test_fraction,
81
+ random_state=526,
82
+ stratify=processed_df[target_variable] if target_variable in processed_df.columns else None
83
  )
84
 
85
+ # 3. 學習網路結構 (在分箱和編碼之前!)
86
  self.model = self._learn_structure(
87
  algorithm, score_method, sig_level, target_variable
88
  )
89
 
90
+ # 4. 對分類變數編碼 (在學習結構之後,分箱之前)
91
+ self._encode_categorical_features(cat_features)
 
92
 
93
+ # 5. 連續變數分箱 (在編碼之後)
94
+ self._bin_continuous_features(con_features, n_bins)
95
+
96
+ # 6. 參數估計
97
  self._fit_parameters(estimator, equivalent_sample_size)
98
 
99
+ # 7. 初始化推論引擎
100
  self.inference = VariableElimination(self.model)
101
 
102
+ # 8. 評估模型
103
  train_metrics = self._evaluate_model(
104
  self.train_data, target_variable, "train"
105
  )
 
107
  self.test_data, target_variable, "test"
108
  )
109
 
110
+ # 9. 獲取 CPD
111
  cpds = self._get_all_cpds()
112
 
113
+ # 10. 計算模型評分
114
  scores = self._calculate_scores()
115
 
116
+ # 11. 整理結果
117
  results = {
118
  'model': self.model,
119
  'inference': self.inference,
 
145
  except Exception as e:
146
  raise Exception(f"Analysis failed: {str(e)}")
147
 
148
+ def _preprocess_data(self, df, cat_features, con_features, target_variable):
149
+ """資料預處理 - 只選擇欄位和刪除缺失值"""
150
+ # 選擇需要的欄位
151
+ selected_columns = cat_features + con_features + [target_variable]
152
+ processed_df = df[selected_columns].copy()
153
+
154
+ # 處理缺失值
155
+ processed_df = processed_df.dropna()
156
+
157
+ return processed_df
158
+
159
+ def _encode_categorical_features(self, cat_features):
160
  """
161
+ 將分類變數轉為 category codes - 完全對齊 Django
162
+ 注意:只對 cat_features 編碼,不對分箱後的連續變數編碼
163
+ Django 只對 train_data 編碼,但我們為了一致性也對 test_data 編碼
164
  """
 
 
165
  for col in cat_features:
166
  if col in self.train_data.columns:
167
  if self.train_data[col].dtype == 'object':
168
  self.train_data[col] = self.train_data[col].astype('category').cat.codes
169
+ # Django 沒有對 test_data 編碼,但為了預測時一致性,我們也編碼
170
  if col in self.test_data.columns:
171
  if self.test_data[col].dtype == 'object':
172
  self.test_data[col] = self.test_data[col].astype('category').cat.codes
173
+
174
+ def _bin_continuous_features(self, con_features, n_bins):
175
+ """
176
+ ✅ 對連續變數分箱 - 完全對齊 Django 版本
177
+ 先用訓練集計算邊界,再套用到測試集
178
+ """
179
  self.bins_dict = {}
180
 
 
181
  for col in con_features:
182
  if col in self.train_data.columns and self.train_data[col].notna().sum() > 0:
183
+ # 使用訓練集計算分箱邊界
184
  bin_edges = pd.cut(
185
  self.train_data[col],
186
  bins=n_bins,
 
188
  duplicates='drop'
189
  )[1]
190
 
 
191
  self.bins_dict[col] = bin_edges
192
 
193
+ # 創建分箱標籤 (✅ 使用 – 而不是 -)
194
  bin_labels = [
195
+ f"{round(bin_edges[i], 2)}{round(bin_edges[i+1], 2)}"
196
  for i in range(len(bin_edges) - 1)
197
  ]
198
 
199
+ # 訓練集分箱
200
  self.train_data[col] = pd.cut(
201
  self.train_data[col],
202
  bins=bin_edges,
203
  labels=bin_labels,
204
  include_lowest=True
205
  ).astype(object).fillna("Missing")
 
 
 
 
 
206
 
207
+ # 對測試集使用相同邊界分箱
208
+ if col in self.test_data.columns:
209
+ self.test_data[col] = pd.cut(
210
+ self.test_data[col],
211
+ bins=bin_edges,
212
+ labels=bin_labels,
213
+ include_lowest=True
214
+ ).astype(object).fillna("Missing")
215
+ else:
216
+ print(f"⚠️ Skipped binning column '{col}' – missing or all NaN")
 
 
 
217
 
218
  def _learn_structure(self, algorithm, score_method, sig_level, target_variable):
219
+ """學習網路結構 - 完全對齊 Django 版本"""
 
 
220
 
221
  if algorithm == 'NB':
222
+ # Naive Bayes
223
  edges = [
224
  (target_variable, feature)
225
  for feature in self.train_data.columns
 
228
  model = BayesianNetwork(edges)
229
 
230
  elif algorithm == 'TAN':
231
+ # Tree-Augmented Naive Bayes
232
+ # 特殊情況處理: 如果同時存在'asia'和'either'列,特別指定'asia'作為根節點
233
  if 'asia' in self.train_data.columns and 'either' in self.train_data.columns and target_variable == 'either':
234
  tan_search = TreeSearch(self.train_data, root_node='asia')
235
  else:
 
242
  model = BayesianNetwork(structure.edges())
243
 
244
  elif algorithm == 'CL':
245
+ # Chow-Liu
246
  tan_search = TreeSearch(self.train_data)
247
  structure = tan_search.estimate(
248
  estimator_type='chow-liu',
 
250
  )
251
  model = BayesianNetwork(structure.edges())
252
 
253
+ elif algorithm == 'HC':
254
+ # Hill Climbing
255
+ hc = HillClimbSearch(self.train_data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
 
257
+ # 選擇評分方法
258
+ scoring_methods = {
259
+ 'BIC': BicScore(self.train_data),
260
+ 'AIC': AICScore(self.train_data),
261
+ 'K2': K2Score(self.train_data),
262
+ 'BDeu': BDeuScore(self.train_data),
263
+ 'BDs': BDsScore(self.train_data)
264
+ }
 
 
 
 
 
 
 
 
265
 
266
+ structure = hc.estimate(
267
+ scoring_method=scoring_methods[score_method]
268
+ )
269
+ model = BayesianNetwork(structure.edges())
 
 
 
 
 
 
 
 
 
 
 
 
270
 
271
+ elif algorithm == 'PC':
272
+ # PC Algorithm - ✅ 與 Django 完全一致的降級策略
273
+ pc = PC(self.train_data)
 
 
 
 
 
 
 
 
 
 
 
 
 
274
 
275
+ # 嘗試不同的 max_cond_vars 直到成功
276
+ for max_cond in [5, 4, 3, 2, 1]:
277
  try:
278
+ structure = pc.estimate(
279
  significance_level=sig_level,
280
+ max_cond_vars=max_cond,
281
  ci_test='chi_square',
282
  variant='stable',
283
+ n_jobs=1 # ✅ Django 第一次用 1
284
  )
285
+
286
+ # 檢查是否有效 (✅ 與 Django 一致)
287
+ edges = structure.edges()
288
+ if is_directed_acyclic_graph(DiGraph(edges)) and any(target_variable in edge for edge in edges):
289
+ model = BayesianNetwork(structure.edges())
290
+ break
291
  except:
292
+ continue
293
+ else:
294
+ # 如果失敗,使用 Naive Bayes (✅ 與 Django 一致)
 
 
295
  edges = [
296
  (target_variable, feature)
297
  for feature in self.train_data.columns
298
  if feature != target_variable
299
  ]
300
  model = BayesianNetwork(edges)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
 
302
  else:
303
  raise ValueError(f"Unknown algorithm: {algorithm}")
 
305
  return model
306
 
307
  def _fit_parameters(self, estimator, equivalent_sample_size):
308
+ """參數估計"""
 
 
309
  if estimator == 'bn':
310
  self.model.fit(
311
  self.train_data,
312
  estimator=BayesianEstimator,
313
  equivalent_sample_size=equivalent_sample_size
314
  )
 
 
 
 
 
 
 
 
315
  else:
316
  self.model.fit(
317
  self.train_data,
318
  estimator=MaximumLikelihoodEstimator
319
  )
320
 
321
+ def _predict_probabilities(self, data, target_variable):
322
  """
323
+ 預測機率 - Django 版本完全一致
324
  """
325
  true_labels = []
326
  predicted_probs = []
 
328
  model_nodes = set(self.model.nodes())
329
 
330
  for idx, row in data.iterrows():
331
+ # 準備 evidence (✅ 過濾只在模型中的變數)
332
  raw_evidence = row.drop(target_variable).to_dict()
333
+ filtered_evidence = {k: v for k, v in raw_evidence.items() if k in model_nodes}
 
 
 
 
 
334
 
335
  true_label = row[target_variable]
336
  true_labels.append(true_label)
 
342
  )
343
  probs = result.values
344
  predicted_probs.append(probs)
 
345
  except Exception as e:
 
346
  print(f"⚠️ Inference failed at row {idx} | evidence keys: {list(filtered_evidence.keys())} | error: {e}")
347
+ predicted_probs.append(None)
348
+
349
+ # 過濾有效結果 (與 Django 一致)
350
+ valid_data = [
351
+ (label, prob)
352
+ for label, prob in zip(true_labels, predicted_probs)
353
+ if prob is not None and len(prob) > 1
354
+ ]
355
 
356
+ if not valid_data:
357
+ return [], []
358
+
359
+ valid_labels, valid_probs = zip(*valid_data)
360
+ prob_array = np.round(np.array([prob[1] for prob in valid_probs]), 4)
361
+
362
+ return list(valid_labels), prob_array
363
 
364
  def _evaluate_model(self, data, target_variable, dataset_name):
365
+ """評估模型效能 - ✅ 與 Django 完全一致"""
366
+ # 預測
 
 
 
 
367
  true_labels, pred_probs = self._predict_probabilities(
368
  data, target_variable
369
  )
370
 
371
+ if len(true_labels) == 0:
 
 
 
 
 
 
 
 
372
  return {
373
  'accuracy': 0,
374
  'precision': 0,
 
380
  'specificity': 0,
381
  'confusion_matrix': [[0, 0], [0, 0]],
382
  'fpr': [0],
383
+ 'tpr': [0]
 
384
  }
385
 
386
+ # 二元預測 (threshold = 0.1, ✅ 與 Django 一致)
387
+ threshold = 0.1
388
+ pred_labels = (pred_probs >= threshold).astype(int)
389
 
390
+ # 計算指標
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
391
  accuracy = accuracy_score(true_labels, pred_labels) * 100
392
  precision = precision_score(true_labels, pred_labels, zero_division=0) * 100
393
  recall = recall_score(true_labels, pred_labels, zero_division=0) * 100
394
  f1 = f1_score(true_labels, pred_labels, zero_division=0) * 100
395
 
396
+ # ROC 曲線
397
+ pred_probs_clean = np.nan_to_num(pred_probs, nan=0.0)
398
+ fpr, tpr, _ = roc_curve(true_labels, pred_probs_clean)
399
+ auc = roc_auc_score(true_labels, pred_probs_clean)
400
 
401
+ # 混淆矩陣
402
+ cm = confusion_matrix(true_labels, pred_labels).tolist()
403
+
404
+ # G-mean 和 P-mean (✅ 與 Django 計算方式一致)
405
+ tn, fp, fn, tp = confusion_matrix(true_labels, pred_labels).ravel()
406
  sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
407
  specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
408
+ g_mean = np.sqrt(sensitivity * precision / 100) * 100
 
 
409
  p_mean = np.sqrt(specificity * sensitivity) * 100
410
 
411
  return {
412
+ 'accuracy': accuracy,
413
+ 'precision': precision,
414
+ 'recall': recall,
415
+ 'f1': f1,
416
+ 'auc': auc,
417
+ 'g_mean': g_mean,
418
+ 'p_mean': p_mean,
419
+ 'specificity': specificity * 100,
420
+ 'confusion_matrix': cm,
421
  'fpr': fpr.tolist(),
422
  'tpr': tpr.tolist(),
423
+ 'predicted_probs': pred_probs.tolist()
424
  }
425
 
426
  def _get_all_cpds(self):
 
432
  return cpds
433
 
434
  def _calculate_scores(self):
435
+ """計算模型評分"""
436
+ scores = {
437
+ 'log_likelihood': log_likelihood_score(self.model, self.train_data),
438
+ 'bic': structure_score(self.model, self.train_data, scoring_method='bic'),
439
+ 'k2': structure_score(self.model, self.train_data, scoring_method='k2'),
440
+ 'bdeu': structure_score(self.model, self.train_data, scoring_method='bdeu'),
441
+ 'bds': structure_score(self.model, self.train_data, scoring_method='bds')
442
+ }
 
 
 
 
 
 
 
 
 
 
443
  return scores
444
 
445
  @classmethod
 
451
  def clear_session_results(cls, session_id):
452
  """清除特定 session 的結果"""
453
  if session_id in cls._session_results:
454
+ del cls._session_results[session_id]