Spaces:

Wen1201
/

bayesian-network

Sleeping

App Files Community

Wen1201 committed on Oct 31, 2025

Commit

72f09a3

verified ·

1 Parent(s): 1e931ad

Upload bn_core.py

Browse files

Files changed (1) hide show

bn_core.py +155 -280

bn_core.py CHANGED Viewed

@@ -13,17 +13,17 @@ from sklearn.metrics import (
     recall_score, f1_score, roc_curve, roc_auc_score
 )
 from pgmpy.metrics import log_likelihood_score, structure_score
-from networkx import is_directed_acyclic_graph, DiGraph
 import threading
 from datetime import datetime
 class BayesianNetworkAnalyzer:
     """
-    貝葉斯網路分析器 - 完全對齊 Django views.py 的邏輯
-    支援多用戶同時使用,每個 session 獨立處理
     """
-    # 類級別的鎖,用於線程安全
     _lock = threading.Lock()
     # 儲存各 session 的分析結果
@@ -48,7 +48,7 @@ class BayesianNetworkAnalyzer:
                      equivalent_sample_size=3, score_method='BIC',
                      sig_level=0.05, n_bins=10):
         """
-        執行完整的貝葉斯網路分析 - 對齊 Django views.py Line 392-926
         Args:
             df: 原始資料框
@@ -69,38 +69,37 @@ class BayesianNetworkAnalyzer:
         with self._lock:
             try:
-                # 1. 選擇需要的欄位 (對齊 Line 598)
-                selected_features = cat_features + con_features
-                Sum_features = selected_features + [target_variable]
-                df = df[Sum_features].copy()
-                # 重置索引 (對齊 Line 599)
-                df.index = df.index + 1
-                # 2. 先分割資料集 (對齊 Line 600-604) - 關鍵!在預處理之前
                 self.train_data, self.test_data = train_test_split(
-                    df,
                     test_size=test_fraction,
-                    random_state=42,  # Django 用 526, 這裡用 42 (可改)
-                    stratify=df[target_variable] if target_variable in df.columns else None
                 )
-                # 3. 學習網路結構 (對齊 Line 607-759)
                 self.model = self._learn_structure(
                     algorithm, score_method, sig_level, target_variable
                 )
-                # 4. 資料預處理 - 在結構學習之後 (對齊 Line 762-814)
-                # 關鍵: 用 train_data 計算 bins, 然後應用到 test_data
-                self._preprocess_data_inplace(cat_features, con_features, n_bins)
-                # 5. 參數估計 (對齊 Line 817-834)
                 self._fit_parameters(estimator, equivalent_sample_size)
-                # 6. 初始化推論引擎 (對齊 Line 837)
                 self.inference = VariableElimination(self.model)
-                # 7. 評估模型 (對齊 Line 840-928)
                 train_metrics = self._evaluate_model(
                     self.train_data, target_variable, "train"
                 )
@@ -108,13 +107,13 @@ class BayesianNetworkAnalyzer:
                     self.test_data, target_variable, "test"
                 )
-                # 8. 獲取 CPD
                 cpds = self._get_all_cpds()
-                # 9. 計算模型評分
                 scores = self._calculate_scores()
-                # 10. 整理結果
                 results = {
                     'model': self.model,
                     'inference': self.inference,
@@ -146,28 +145,42 @@ class BayesianNetworkAnalyzer:
             except Exception as e:
                 raise Exception(f"Analysis failed: {str(e)}")
-    def _preprocess_data_inplace(self, cat_features, con_features, n_bins):
         """
-        就地預處理訓練和測試資料 - 對齊 Django Line 762-814
-        關鍵: 先用 train_data 計算 bins, 再應用到 test_data
         """
-        # 處理分類特徵 (對齊 Line 762-764)
         for col in cat_features:
             if col in self.train_data.columns:
                 if self.train_data[col].dtype == 'object':
                     self.train_data[col] = self.train_data[col].astype('category').cat.codes
             if col in self.test_data.columns:
                 if self.test_data[col].dtype == 'object':
                     self.test_data[col] = self.test_data[col].astype('category').cat.codes
-        # 處理連續特徵 - 分箱 (對齊 Line 766-814)
         self.bins_dict = {}
-        # 步驟 1: 用 train_data 計算 bins (對齊 Line 769-787)
         for col in con_features:
             if col in self.train_data.columns and self.train_data[col].notna().sum() > 0:
-                # 計算分箱邊界
                 bin_edges = pd.cut(
                     self.train_data[col],
                     bins=n_bins,
@@ -175,49 +188,38 @@ class BayesianNetworkAnalyzer:
                     duplicates='drop'
                 )[1]
-                # 儲存 bins 供測試集使用
                 self.bins_dict[col] = bin_edges
-                # 創建分箱標籤
                 bin_labels = [
-                    f"{round(bin_edges[i], 2)}-{round(bin_edges[i+1], 2)}"
                     for i in range(len(bin_edges) - 1)
                 ]
-                # 應用到訓練集
                 self.train_data[col] = pd.cut(
                     self.train_data[col],
                     bins=bin_edges,
                     labels=bin_labels,
                     include_lowest=True
                 ).astype(object).fillna("Missing")
-        # 步驟 2: 用相同的 bins 處理 test_data (對齊 Line 789-803)
-        for col in con_features:
-            if col in self.test_data.columns and col in self.bins_dict:
-                bin_edges = self.bins_dict[col]
-                # 使用相同的標籤
-                bin_labels = [
-                    f"{round(bin_edges[i], 2)}-{round(bin_edges[i+1], 2)}"
-                    for i in range(len(bin_edges) - 1)
-                ]
-                # 應用到測試集
-                self.test_data[col] = pd.cut(
-                    self.test_data[col],
-                    bins=bin_edges,
-                    labels=bin_labels,
-                    include_lowest=True
-                ).astype(object).fillna("Missing")
     def _learn_structure(self, algorithm, score_method, sig_level, target_variable):
-        """
-        學習網路結構 - 對齊 Django Line 607-759
-        """
         if algorithm == 'NB':
-            # Naive Bayes (對齊 Line 608-609)
             edges = [
                 (target_variable, feature)
                 for feature in self.train_data.columns
@@ -226,8 +228,8 @@ class BayesianNetworkAnalyzer:
             model = BayesianNetwork(edges)
         elif algorithm == 'TAN':
-            # Tree-Augmented Naive Bayes (對齊 Line 610-623)
-            # 特殊處理: asia dataset
             if 'asia' in self.train_data.columns and 'either' in self.train_data.columns and target_variable == 'either':
                 tan_search = TreeSearch(self.train_data, root_node='asia')
             else:
@@ -240,7 +242,7 @@ class BayesianNetworkAnalyzer:
             model = BayesianNetwork(structure.edges())
         elif algorithm == 'CL':
-            # Chow-Liu (對齊 Line 625-627)
             tan_search = TreeSearch(self.train_data)
             structure = tan_search.estimate(
                 estimator_type='chow-liu',
@@ -248,124 +250,54 @@ class BayesianNetworkAnalyzer:
             )
             model = BayesianNetwork(structure.edges())
-        elif algorithm == 'PC':
-            # PC Algorithm (對齊 Line 629-721)
-            pc = PC(self.train_data)
-            # 嘗試不同的 max_cond_vars (對齊 Line 669-720)
-            model_est = None
-            # max_cond_vars = 5
-            try:
-                model_est = pc.estimate(
-                    significance_level=sig_level,
-                    max_cond_vars=5,
-                    ci_test='chi_square',
-                    variant='stable',
-                    n_jobs=1
-                )
-                edges = model_est.edges()
-                # 驗證: 必須是 DAG 且目標變數在結構中
-                if not is_directed_acyclic_graph(DiGraph(edges)) or \
-                   not any(target_variable in edge for edge in edges):
-                    model_est = None
-            except:
-                model_est = None
-            # max_cond_vars = 4
-            if model_est is None:
-                try:
-                    model_est = pc.estimate(
-                        significance_level=sig_level,
-                        max_cond_vars=4,
-                        ci_test='chi_square',
-                        variant='stable',
-                        n_jobs=1
-                    )
-                    edges = model_est.edges()
-                    if not is_directed_acyclic_graph(DiGraph(edges)) or \
-                       not any(target_variable in edge for edge in edges):
-                        model_est = None
-                except:
-                    model_est = None
-            # max_cond_vars = 3
-            if model_est is None:
-                try:
-                    model_est = pc.estimate(
-                        significance_level=sig_level,
-                        max_cond_vars=3,
-                        ci_test='chi_square',
-                        variant='stable',
-                        n_jobs=1
-                    )
-                    edges = model_est.edges()
-                    if not is_directed_acyclic_graph(DiGraph(edges)) or \
-                       not any(target_variable in edge for edge in edges):
-                        model_est = None
-                except:
-                    model_est = None
-            # max_cond_vars = 2
-            if model_est is None:
-                try:
-                    model_est = pc.estimate(
-                        significance_level=sig_level,
-                        max_cond_vars=2,
-                        ci_test='chi_square',
-                        variant='stable',
-                        n_jobs=1
-                    )
-                    edges = model_est.edges()
-                    if not is_directed_acyclic_graph(DiGraph(edges)) or \
-                       not any(target_variable in edge for edge in edges):
-                        model_est = None
-                except:
-                    model_est = None
-            # max_cond_vars = 1
-            if model_est is None:
                 try:
-                    model_est = pc.estimate(
                         significance_level=sig_level,
-                        max_cond_vars=1,
                         ci_test='chi_square',
                         variant='stable',
-                        n_jobs=1
                     )
                 except:
-                    model_est = None
-            # 如果全部失敗, fallback to Naive Bayes
-            if model_est is None:
-                print("⚠️ PC algorithm failed, falling back to Naive Bayes")
                 edges = [
                     (target_variable, feature)
                     for feature in self.train_data.columns
                     if feature != target_variable
                 ]
                 model = BayesianNetwork(edges)
-            else:
-                model = BayesianNetwork(model_est.edges())
-        elif algorithm == 'HC':
-            # Hill Climbing (對齊 Line 723-758)
-            hc = HillClimbSearch(self.train_data)
-            # 選擇評分方法
-            scoring_methods = {
-                'AIC': AICScore(self.train_data),
-                'BIC': BicScore(self.train_data),
-                'K2': K2Score(self.train_data),
-                'BDeu': BDeuScore(self.train_data),
-                'BDs': BDsScore(self.train_data)
-            }
-            structure = hc.estimate(
-                scoring_method=scoring_methods[score_method],
-                start_dag=None
-            )
-            model = BayesianNetwork(structure.edges())
         else:
             raise ValueError(f"Unknown algorithm: {algorithm}")
@@ -373,32 +305,22 @@ class BayesianNetworkAnalyzer:
         return model
     def _fit_parameters(self, estimator, equivalent_sample_size):
-        """
-        參數估計 - 對齊 Django Line 817-834
-        """
         if estimator == 'bn':
             self.model.fit(
                 self.train_data,
                 estimator=BayesianEstimator,
                 equivalent_sample_size=equivalent_sample_size
             )
-        elif estimator == 'bn_mcmc':
-            # Django 有這個選項但未實作完整
-            # 這裡保留相容性
-            self.model.fit(
-                self.train_data,
-                estimator=BayesianEstimator,
-                equivalent_sample_size=equivalent_sample_size
-            )
         else:
             self.model.fit(
                 self.train_data,
                 estimator=MaximumLikelihoodEstimator
             )
-    def _predict_probabilities(self, data, target_variable, fallback_prob=None):
         """
-        預測機率 - 對齊 Django Line 120-158
         """
         true_labels = []
         predicted_probs = []
@@ -406,13 +328,9 @@ class BayesianNetworkAnalyzer:
         model_nodes = set(self.model.nodes())
         for idx, row in data.iterrows():
             raw_evidence = row.drop(target_variable).to_dict()
-            # 過濾只在模型中的變數
-            filtered_evidence = {
-                k: v for k, v in raw_evidence.items()
-                if k in model_nodes
-            }
             true_label = row[target_variable]
             true_labels.append(true_label)
@@ -424,39 +342,33 @@ class BayesianNetworkAnalyzer:
                 )
                 probs = result.values
                 predicted_probs.append(probs)
             except Exception as e:
-                # 詳細的錯誤訊息
                 print(f"⚠️ Inference failed at row {idx} | evidence keys: {list(filtered_evidence.keys())} | error: {e}")
-                # 支援 fallback probability
-                if fallback_prob is not None:
-                    predicted_probs.append(fallback_prob)
-                else:
-                    predicted_probs.append(None)
-        return true_labels, predicted_probs
     def _evaluate_model(self, data, target_variable, dataset_name):
-        """
-        評估模型效能 - 對齊 Django Line 840-928
-        """
-        threshold = 0.5  # 二元分類閾值
-        # 預測 (對齊 Line 840-892)
         true_labels, pred_probs = self._predict_probabilities(
             data, target_variable
         )
-        # 過濾有效結果 (對齊 Line 866-874)
-        filtered_data = [
-            (label, prob, idx)
-            for idx, (label, prob) in enumerate(zip(true_labels, pred_probs))
-            if label is not None and prob is not None and len(prob) > 1
-        ]
-        if not filtered_data:
-            print(f"⚠️ No valid predictions for {dataset_name} set")
             return {
                 'accuracy': 0,
                 'precision': 0,
@@ -468,74 +380,47 @@ class BayesianNetworkAnalyzer:
                 'specificity': 0,
                 'confusion_matrix': [[0, 0], [0, 0]],
                 'fpr': [0],
-                'tpr': [0],
-                'predicted_probs': []
             }
-        true_labels_filtered, pred_probs_filtered, valid_indices = zip(*filtered_data)
-        # 轉換為 numpy array (對齊 Line 878)
-        pred_probs_array = np.round(
-            np.array([prob[1] for prob in pred_probs_filtered]),
-            4
-        )
-        # 二元預測 (對齊 Line 881)
-        pred_labels = (pred_probs_array >= threshold).astype(int)
-        # 確保一致性 (對齊 Line 884-886)
-        if len(true_labels_filtered) != len(pred_labels):
-            raise ValueError("Mismatch between true labels and predictions after filtering.")
-        true_labels = true_labels_filtered
-        # 計算混淆矩陣 (對齊 Line 888)
-        cm = confusion_matrix(true_labels, pred_labels)
-        # 計算 AUC (對齊 Line 890-897)
-        try:
-            auc = roc_auc_score(
-                [1 if label == 1 else 0 for label in true_labels],
-                pred_probs_array
-            )
-        except:
-            auc = 0.0
-        # ROC 曲線 (對齊 Line 906)
-        try:
-            fpr, tpr, _ = roc_curve(true_labels, pred_probs_array)
-        except:
-            fpr, tpr = [0, 1], [0, 1]
-        # 計算基本指標 (對齊 Line 908-911)
         accuracy = accuracy_score(true_labels, pred_labels) * 100
         precision = precision_score(true_labels, pred_labels, zero_division=0) * 100
         recall = recall_score(true_labels, pred_labels, zero_division=0) * 100
         f1 = f1_score(true_labels, pred_labels, zero_division=0) * 100
-        # 計算 G-mean 和 P-mean (對齊 Django calculate_performance_metrics Line 48-66)
-        tn, fp, fn, tp = cm.ravel()
         sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
         specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
-        precision_raw = tp / (tp + fp) if (tp + fp) > 0 else 0
-        g_mean = np.sqrt(sensitivity * precision_raw) * 100
         p_mean = np.sqrt(specificity * sensitivity) * 100
         return {
-            'accuracy': round(accuracy, 2),
-            'precision': round(precision, 2),
-            'recall': round(recall, 2),
-            'f1': round(f1, 2),
-            'auc': round(auc, 4),
-            'g_mean': round(g_mean, 2),
-            'p_mean': round(p_mean, 2),
-            'specificity': round(specificity * 100, 2),
-            'confusion_matrix': cm.tolist(),
             'fpr': fpr.tolist(),
             'tpr': tpr.tolist(),
-            'predicted_probs': pred_probs_array.tolist()
         }
     def _get_all_cpds(self):
@@ -547,24 +432,14 @@ class BayesianNetworkAnalyzer:
         return cpds
     def _calculate_scores(self):
-        """計算模型評分 - 對齊 Django Line 942-947"""
-        try:
-            scores = {
-                'log_likelihood': log_likelihood_score(self.model, self.train_data),
-                'bic': structure_score(self.model, self.train_data, scoring_method='bic'),
-                'k2': structure_score(self.model, self.train_data, scoring_method='k2'),
-                'bdeu': structure_score(self.model, self.train_data, scoring_method='bdeu'),
-                'bds': structure_score(self.model, self.train_data, scoring_method='bds')
-            }
-        except Exception as e:
-            print(f"⚠️ Error calculating scores: {e}")
-            scores = {
-                'log_likelihood': 0,
-                'bic': 0,
-                'k2': 0,
-                'bdeu': 0,
-                'bds': 0
-            }
         return scores
     @classmethod
@@ -576,4 +451,4 @@ class BayesianNetworkAnalyzer:
     def clear_session_results(cls, session_id):
         """清除特定 session 的結果"""
         if session_id in cls._session_results:
-            del cls._session_results[session_id]

     recall_score, f1_score, roc_curve, roc_auc_score
 )
 from pgmpy.metrics import log_likelihood_score, structure_score
 import threading
 from datetime import datetime
+from networkx import is_directed_acyclic_graph, DiGraph
 class BayesianNetworkAnalyzer:
     """
+    貝葉斯網路分析器
+    支持多用戶同時使用,每個 session 獨立處理
     """
+    # 類別級的鎖,用於線程安全
     _lock = threading.Lock()
     # 儲存各 session 的分析結果
                      equivalent_sample_size=3, score_method='BIC',
                      sig_level=0.05, n_bins=10):
         """
+        執行完整的貝葉斯網路分析 - 完全對齊 Django 版本的順序
         Args:
             df: 原始資料框
         with self._lock:
             try:
+                # 1. 資料預處理 (只選擇欄位和處理缺失值)
+                processed_df = self._preprocess_data(
+                    df, cat_features, con_features, target_variable
+                )
+                # 2. 分割訓練/測試集 (✅ random_state=526)
                 self.train_data, self.test_data = train_test_split(
+                    processed_df,
                     test_size=test_fraction,
+                    random_state=526,
+                    stratify=processed_df[target_variable] if target_variable in processed_df.columns else None
                 )
+                # 3. ✅ 學習網路結構 (在分箱和編碼之前!)
                 self.model = self._learn_structure(
                     algorithm, score_method, sig_level, target_variable
                 )
+                # 4. ✅ 對分類變數編碼 (在學習結構之後,分箱之前)
+                self._encode_categorical_features(cat_features)
+                # 5. ✅ 對連續變數分箱 (在編碼之後)
+                self._bin_continuous_features(con_features, n_bins)
+                # 6. 參數估計
                 self._fit_parameters(estimator, equivalent_sample_size)
+                # 7. 初始化推論引擎
                 self.inference = VariableElimination(self.model)
+                # 8. 評估模型
                 train_metrics = self._evaluate_model(
                     self.train_data, target_variable, "train"
                 )
                     self.test_data, target_variable, "test"
                 )
+                # 9. 獲取 CPD
                 cpds = self._get_all_cpds()
+                # 10. 計算模型評分
                 scores = self._calculate_scores()
+                # 11. 整理結果
                 results = {
                     'model': self.model,
                     'inference': self.inference,
             except Exception as e:
                 raise Exception(f"Analysis failed: {str(e)}")
+    def _preprocess_data(self, df, cat_features, con_features, target_variable):
+        """資料預處理 - 只選擇欄位和刪除缺失值"""
+        # 選擇需要的欄位
+        selected_columns = cat_features + con_features + [target_variable]
+        processed_df = df[selected_columns].copy()
+        # 處理缺失值
+        processed_df = processed_df.dropna()
+        return processed_df
+    def _encode_categorical_features(self, cat_features):
         """
+        ✅ 將分類變數轉為 category codes - 完全對齊 Django
+        注意:只對 cat_features 編碼,不對分箱後的連續變數編碼
+        Django 只對 train_data 編碼,但我們為了一致性也對 test_data 編碼
         """
         for col in cat_features:
             if col in self.train_data.columns:
                 if self.train_data[col].dtype == 'object':
                     self.train_data[col] = self.train_data[col].astype('category').cat.codes
+            # Django 沒有對 test_data 編碼,但為了預測時一致性,我們也編碼
             if col in self.test_data.columns:
                 if self.test_data[col].dtype == 'object':
                     self.test_data[col] = self.test_data[col].astype('category').cat.codes
+    def _bin_continuous_features(self, con_features, n_bins):
+        """
+        ✅ 對連續變數分箱 - 完全對齊 Django 版本
+        先用訓練集計算邊界,再套用到測試集
+        """
         self.bins_dict = {}
         for col in con_features:
             if col in self.train_data.columns and self.train_data[col].notna().sum() > 0:
+                # 使用訓練集計算分箱邊界
                 bin_edges = pd.cut(
                     self.train_data[col],
                     bins=n_bins,
                     duplicates='drop'
                 )[1]
                 self.bins_dict[col] = bin_edges
+                # 創建分箱標籤 (✅ 使用 – 而不是 -)
                 bin_labels = [
+                    f"{round(bin_edges[i], 2)}–{round(bin_edges[i+1], 2)}"
                     for i in range(len(bin_edges) - 1)
                 ]
+                # 對訓練集分箱
                 self.train_data[col] = pd.cut(
                     self.train_data[col],
                     bins=bin_edges,
                     labels=bin_labels,
                     include_lowest=True
                 ).astype(object).fillna("Missing")
+                # 對測試集使用相同邊界分箱
+                if col in self.test_data.columns:
+                    self.test_data[col] = pd.cut(
+                        self.test_data[col],
+                        bins=bin_edges,
+                        labels=bin_labels,
+                        include_lowest=True
+                    ).astype(object).fillna("Missing")
+            else:
+                print(f"⚠️ Skipped binning column '{col}' – missing or all NaN")
     def _learn_structure(self, algorithm, score_method, sig_level, target_variable):
+        """學習網路結構 - 完全對齊 Django 版本"""
         if algorithm == 'NB':
+            # Naive Bayes
             edges = [
                 (target_variable, feature)
                 for feature in self.train_data.columns
             model = BayesianNetwork(edges)
         elif algorithm == 'TAN':
+            # Tree-Augmented Naive Bayes
+            # ✅ 特殊情況處理: 如果同時存在'asia'和'either'列,特別指定'asia'作為根節點
             if 'asia' in self.train_data.columns and 'either' in self.train_data.columns and target_variable == 'either':
                 tan_search = TreeSearch(self.train_data, root_node='asia')
             else:
             model = BayesianNetwork(structure.edges())
         elif algorithm == 'CL':
+            # Chow-Liu
             tan_search = TreeSearch(self.train_data)
             structure = tan_search.estimate(
                 estimator_type='chow-liu',
             )
             model = BayesianNetwork(structure.edges())
+        elif algorithm == 'HC':
+            # Hill Climbing
+            hc = HillClimbSearch(self.train_data)
+            # 選擇評分方法
+            scoring_methods = {
+                'BIC': BicScore(self.train_data),
+                'AIC': AICScore(self.train_data),
+                'K2': K2Score(self.train_data),
+                'BDeu': BDeuScore(self.train_data),
+                'BDs': BDsScore(self.train_data)
+            }
+            structure = hc.estimate(
+                scoring_method=scoring_methods[score_method]
+            )
+            model = BayesianNetwork(structure.edges())
+        elif algorithm == 'PC':
+            # PC Algorithm - ✅ 與 Django 完全一致的降級策略
+            pc = PC(self.train_data)
+            # 嘗試不同的 max_cond_vars 直到成功
+            for max_cond in [5, 4, 3, 2, 1]:
                 try:
+                    structure = pc.estimate(
                         significance_level=sig_level,
+                        max_cond_vars=max_cond,
                         ci_test='chi_square',
                         variant='stable',
+                        n_jobs=1  # ✅ Django 第一次用 1
                     )
+                    # 檢查是否有效 (✅ 與 Django 一致)
+                    edges = structure.edges()
+                    if is_directed_acyclic_graph(DiGraph(edges)) and any(target_variable in edge for edge in edges):
+                        model = BayesianNetwork(structure.edges())
+                        break
                 except:
+                    continue
+            else:
+                # 如果都失敗,使用 Naive Bayes (✅ 與 Django 一致)
                 edges = [
                     (target_variable, feature)
                     for feature in self.train_data.columns
                     if feature != target_variable
                 ]
                 model = BayesianNetwork(edges)
         else:
             raise ValueError(f"Unknown algorithm: {algorithm}")
         return model
     def _fit_parameters(self, estimator, equivalent_sample_size):
+        """參數估計"""
         if estimator == 'bn':
             self.model.fit(
                 self.train_data,
                 estimator=BayesianEstimator,
                 equivalent_sample_size=equivalent_sample_size
             )
         else:
             self.model.fit(
                 self.train_data,
                 estimator=MaximumLikelihoodEstimator
             )
+    def _predict_probabilities(self, data, target_variable):
         """
+        預測機率 - ✅ 與 Django 版本完全一致
         """
         true_labels = []
         predicted_probs = []
         model_nodes = set(self.model.nodes())
         for idx, row in data.iterrows():
+            # 準備 evidence (✅ 過濾只在模型中的變數)
             raw_evidence = row.drop(target_variable).to_dict()
+            filtered_evidence = {k: v for k, v in raw_evidence.items() if k in model_nodes}
             true_label = row[target_variable]
             true_labels.append(true_label)
                 )
                 probs = result.values
                 predicted_probs.append(probs)
             except Exception as e:
                 print(f"⚠️ Inference failed at row {idx} | evidence keys: {list(filtered_evidence.keys())} | error: {e}")
+                predicted_probs.append(None)
+        # ✅ 過濾有效結果 (與 Django 一致)
+        valid_data = [
+            (label, prob)
+            for label, prob in zip(true_labels, predicted_probs)
+            if prob is not None and len(prob) > 1
+        ]
+        if not valid_data:
+            return [], []
+        valid_labels, valid_probs = zip(*valid_data)
+        prob_array = np.round(np.array([prob[1] for prob in valid_probs]), 4)
+        return list(valid_labels), prob_array
     def _evaluate_model(self, data, target_variable, dataset_name):
+        """評估模型效能 - ✅ 與 Django 完全一致"""
+        # 預測
         true_labels, pred_probs = self._predict_probabilities(
             data, target_variable
         )
+        if len(true_labels) == 0:
             return {
                 'accuracy': 0,
                 'precision': 0,
                 'specificity': 0,
                 'confusion_matrix': [[0, 0], [0, 0]],
                 'fpr': [0],
+                'tpr': [0]
             }
+        # 二元預測 (threshold = 0.1, ✅ 與 Django 一致)
+        threshold = 0.1
+        pred_labels = (pred_probs >= threshold).astype(int)
+        # 計算指標
         accuracy = accuracy_score(true_labels, pred_labels) * 100
         precision = precision_score(true_labels, pred_labels, zero_division=0) * 100
         recall = recall_score(true_labels, pred_labels, zero_division=0) * 100
         f1 = f1_score(true_labels, pred_labels, zero_division=0) * 100
+        # ROC 曲線
+        pred_probs_clean = np.nan_to_num(pred_probs, nan=0.0)
+        fpr, tpr, _ = roc_curve(true_labels, pred_probs_clean)
+        auc = roc_auc_score(true_labels, pred_probs_clean)
+        # 混淆矩陣
+        cm = confusion_matrix(true_labels, pred_labels).tolist()
+        # G-mean 和 P-mean (✅ 與 Django 計算方式一致)
+        tn, fp, fn, tp = confusion_matrix(true_labels, pred_labels).ravel()
         sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
         specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
+        g_mean = np.sqrt(sensitivity * precision / 100) * 100
         p_mean = np.sqrt(specificity * sensitivity) * 100
         return {
+            'accuracy': accuracy,
+            'precision': precision,
+            'recall': recall,
+            'f1': f1,
+            'auc': auc,
+            'g_mean': g_mean,
+            'p_mean': p_mean,
+            'specificity': specificity * 100,
+            'confusion_matrix': cm,
             'fpr': fpr.tolist(),
             'tpr': tpr.tolist(),
+            'predicted_probs': pred_probs.tolist()
         }
     def _get_all_cpds(self):
         return cpds
     def _calculate_scores(self):
+        """計算模型評分"""
+        scores = {
+            'log_likelihood': log_likelihood_score(self.model, self.train_data),
+            'bic': structure_score(self.model, self.train_data, scoring_method='bic'),
+            'k2': structure_score(self.model, self.train_data, scoring_method='k2'),
+            'bdeu': structure_score(self.model, self.train_data, scoring_method='bdeu'),
+            'bds': structure_score(self.model, self.train_data, scoring_method='bds')
+        }
         return scores
     @classmethod
     def clear_session_results(cls, session_id):
         """清除特定 session 的結果"""
         if session_id in cls._session_results:
+            del cls._session_results[session_id]