Spaces:

Donlagon007
/

BN_upload

Sleeping

App Files Files Community

Donlagon007 committed on Nov 1, 2025

Commit

c64919d

verified ·

1 Parent(s): 9bb6b99

Upload 8 files

Browse files

Files changed (8) hide show

BC_imputed_micerf_period13_fid_course_D4.csv +0 -0
README.md +8 -18
app.py +1060 -0
bn_core.py +536 -0
llm_assistant.py +360 -0
packages.txt +1 -0
requirements.txt +9 -3
utils.py +313 -0

BC_imputed_micerf_period13_fid_course_D4.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

README.md CHANGED Viewed

@@ -1,20 +1,10 @@
 ---
-title: BN Upload
-emoji: 🚀
-colorFrom: red
-colorTo: red
-sdk: docker
-app_port: 8501
-tags:
-- streamlit
 pinned: false
-short_description: Streamlit template space
-license: mit
----
-# Welcome to Streamlit!
-Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).

 ---
+title: Bayesian Network Analysis
+emoji: 🔬
+colorFrom: blue
+colorTo: indigo
+sdk: streamlit
+sdk_version: 1.31.0
+app_file: app.py
 pinned: false
+---

app.py ADDED Viewed

	@@ -0,0 +1,1060 @@

+import streamlit as st
+import pandas as pd
+import numpy as np
+import plotly.graph_objects as go
+import plotly.express as px
+from io import BytesIO
+import base64
+import json
+from datetime import datetime
+import uuid
+# 頁面配置
+st.set_page_config(
+    page_title="Bayesian Network Analysis System",
+    page_icon="🔬",
+    layout="wide",
+    initial_sidebar_state="expanded"
+)
+# 自定義 CSS - 讓介面更像 Django
+st.markdown("""
+<style>
+    /* Expander 樣式 - 類似 Django 的摺疊區域 */
+    .streamlit-expanderHeader {
+        background-color: #e8f1f8;
+        border: 1px solid #b0cfe8;
+        border-radius: 5px;
+        font-weight: 600;
+        color: #1b4f72;
+    }
+    .streamlit-expanderHeader:hover {
+        background-color: #d0e7f8;
+    }
+    /* Checkbox 樣式 */
+    .stCheckbox {
+        padding: 2px 0;
+    }
+    /* Radio button 樣式 */
+    .stRadio > label {
+        font-weight: 600;
+        color: #1b4f72;
+    }
+    /* 選擇框樣式 */
+    .stSelectbox > label, .stNumberInput > label {
+        font-weight: 600;
+        color: #1b4f72;
+    }
+    /* 分隔線 */
+    hr {
+        margin: 1rem 0;
+        border-top: 2px solid #b0cfe8;
+    }
+    /* 表單容器 */
+    .element-container {
+        margin-bottom: 0.5rem;
+    }
+    /* 摺疊內容區域 */
+    .streamlit-expanderContent {
+        background-color: #f8fbff;
+        border: 1px solid #d0e4f5;
+        border-top: none;
+        padding: 1rem;
+    }
+    /* 按鈕樣式 */
+    .stButton > button {
+        width: 100%;
+        border-radius: 20px;
+        font-weight: 600;
+        transition: all 0.3s ease;
+    }
+    .stButton > button:hover {
+        transform: translateY(-2px);
+        box-shadow: 0 4px 8px rgba(0,0,0,0.2);
+    }
+</style>
+""", unsafe_allow_html=True)
+# 導入自定義模組
+from bn_core import BayesianNetworkAnalyzer
+from llm_assistant import LLMAssistant
+from utils import (
+    plot_roc_curve,
+    plot_confusion_matrix,
+    plot_probability_distribution,
+    generate_network_graph,
+    create_cpd_table,
+    export_results_to_json
+)
+# 初始化 session state
+if 'session_id' not in st.session_state:
+    st.session_state.session_id = str(uuid.uuid4())
+if 'analysis_results' not in st.session_state:
+    st.session_state.analysis_results = None
+if 'trained_model_results' not in st.session_state:
+    st.session_state.trained_model_results = None
+if 'loaded_model_results' not in st.session_state:
+    st.session_state.loaded_model_results = None
+if 'loaded_models' not in st.session_state:
+    st.session_state.loaded_models = []  # List to store multiple loaded models
+if 'chat_history' not in st.session_state:
+    st.session_state.chat_history = []
+if 'model_trained' not in st.session_state:
+    st.session_state.model_trained = False
+# 標題
+st.title("🔬 Bayesian Network Analysis System")
+st.markdown("---")
+# Sidebar: OpenAI API key entry and data-source selection
+with st.sidebar:
+    st.header("⚙️ Configuration")
+    api_key = st.text_input(
+        "OpenAI API Key",
+        type="password",
+        help="Enter your OpenAI API key to use the AI assistant"
+    )
+    if api_key:  # only a non-empty entry is stored; clearing the box leaves the stored key untouched
+        st.session_state.api_key = api_key
+        st.success("✅ API Key loaded")
+    st.markdown("---")
+    # Data source selection: bundled default dataset or a user-uploaded CSV
+    st.subheader("📊 Data Source")
+    data_source = st.radio(
+        "Select data source:",
+        ["Use Default Dataset", "Upload Your Data"]
+    )
+    uploaded_file = None  # stays None unless the upload option is chosen and a file is provided
+    if data_source == "Upload Your Data":
+        uploaded_file = st.file_uploader(
+            "Upload CSV file",
+            type=['csv'],
+            help="Upload your dataset in CSV format"
+        )
+# 主要內容區
+tab1, tab2, tab3 = st.tabs(["📈 Analysis", "💬 AI Assistant", "📂 Load Model"])
+# Tab 1: 分析介面
+with tab1:
+    col1, col2 = st.columns([2, 1])
+    with col1:
+        st.header("Model Configuration")
+        # 載入資料
+        if data_source == "Use Default Dataset":
+            # 使用預設資料集
+            @st.cache_data
+            def load_default_data():
+                # 這裡放入預設資料集的路徑
+                df = pd.read_csv("BC_imputed_micerf_period13_fid_course_D4.csv")
+                return df
+            try:
+                df = load_default_data()
+                st.success(f"✅ Default dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")
+            except:
+                st.error("❌ Default dataset not found. Please upload your own data.")
+                df = None
+        else:
+            if uploaded_file:
+                df = pd.read_csv(uploaded_file)
+                st.success(f"✅ Data loaded: {df.shape[0]} rows, {df.shape[1]} columns")
+            else:
+                st.info("👆 Please upload a CSV file to begin")
+                df = None
+        if df is not None:
+            # 特徵選擇 - 使用 expander (可摺疊)
+            st.subheader("🎯 Input Features")
+            # 手動指定特徵類型 (針對預設乳癌資料集)
+            if data_source == "Use Default Dataset":
+                # 預設資料集的固定分類
+                numeric_cols = ['size', 'stime']  # 只有這兩個是連續變數
+                categorical_cols = [col for col in df.columns if col not in numeric_cols]
+            else:
+                # 上傳資料集才自動判斷
+                numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
+                categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
+            # 二元分類變數(用於目標變數)
+            binary_cols = [col for col in df.columns if df[col].nunique() == 2]
+            col_feat1, col_feat2 = st.columns(2)
+            with col_feat1:
+                with st.expander("**Continuous**", expanded=False):
+                    st.caption("Select continuous features:")
+                    con_features = []
+                    for col in numeric_cols:
+                        if st.checkbox(col, value=False, key=f"con_{col}"):
+                            con_features.append(col)
+            with col_feat2:
+                with st.expander("**Categorical**", expanded=True):
+                    st.caption("Select categorical features:")
+                    cat_features = []
+                    for col in categorical_cols:
+                        # 預設勾選前幾個
+                        default_checked = categorical_cols.index(col) < 5 if len(categorical_cols) > 5 else True
+                        if st.checkbox(col, value=default_checked, key=f"cat_{col}"):
+                            cat_features.append(col)
+            # 目標變數 - 放在特徵選擇下方
+            st.markdown("---")
+            col_target1, col_target2 = st.columns([1, 2])
+            with col_target1:
+                target_variable = st.selectbox(
+                    "Target Variable (Y):",
+                    options=binary_cols,
+                    help="Must be a binary classification variable"
+                )
+            with col_target2:
+                test_fraction = st.number_input(
+                    "Test Dataset Proportion:",
+                    min_value=0.10,
+                    max_value=0.50,
+                    value=0.25,
+                    step=0.05,
+                    format="%.2f"
+                )
+            # 驗證選擇
+            selected_features = cat_features + con_features
+            if target_variable in selected_features:
+                st.error("❌ Target variable cannot be in feature list!")
+                st.stop()
+            st.markdown("---")
+            # 模型參數 - 使用更緊湊的佈局
+            st.subheader("⚙️ Model Configuration")
+            col_param1, col_param2 = st.columns(2)
+            with col_param1:
+                algorithm = st.radio(
+                    "Network Structure:",
+                    options=['NB', 'TAN', 'CL', 'HC', 'PC'],
+                    format_func=lambda x: {
+                        'NB': 'Naive Bayes (NB)',
+                        'TAN': 'Tree-Augmented Naive Bayes (TAN)',
+                        'CL': 'Chow-Liu',
+                        'HC': 'Hill Climbing',
+                        'PC': 'PC'
+                    }[x],
+                    help="Select structure learning algorithm"
+                )
+                # 條件性參數 - HC
+                if algorithm == 'HC':
+                    score_method = st.selectbox(
+                        "Scoring Method:",
+                        options=['BIC', 'AIC', 'K2', 'BDeu', 'BDs'],
+                        help="Select scoring method for Hill Climbing"
+                    )
+                else:
+                    score_method = 'BIC'
+                # 條件性參數 - PC
+                if algorithm == 'PC':
+                    sig_level = st.number_input(
+                        "Significance Level:",
+                        min_value=0.01,
+                        max_value=1.0,
+                        value=0.05,
+                        step=0.01,
+                        help="Significance level for PC algorithm"
+                    )
+                else:
+                    sig_level = 0.05
+            with col_param2:
+                estimator = st.radio(
+                    "Parameter Estimator:",
+                    options=['ml', 'bn'],
+                    format_func=lambda x: {
+                        'ml': 'MaximumLikelihoodEstimator',
+                        'bn': 'BayesianEstimator'
+                    }[x],
+                    help="Select parameter estimation method"
+                )
+                if estimator == 'bn':
+                    equivalent_sample_size = st.number_input(
+                        "Equivalent Sample Size:",
+                        min_value=1,
+                        value=3,
+                        step=1,
+                        help="Prior strength for Bayesian estimation"
+                    )
+                else:
+                    equivalent_sample_size = 3
+                # Decision (如果是預設資料集才顯示)
+                if data_source == "Use Default Dataset":
+                    decision = st.selectbox(
+                        "Decision:",
+                        options=['OverAll', 'Exposed', 'Unexposed'],
+                        index=0,
+                        help="Analysis subset selection"
+                    )
+                else:
+                    decision = 'OverAll'
+            # Provide Evidence - 可摺疊區域
+            st.markdown("---")
+            with st.expander("**Provide Evidence**", expanded=False):
+                st.caption("Enter evidence values for inference (optional):")
+                evidence_cols = st.columns(2)
+                evidence_dict = {}
+                # 為每個非目標變數創建輸入框
+                all_vars = [v for v in selected_features if v != target_variable]
+                for idx, var in enumerate(all_vars):
+                    with evidence_cols[idx % 2]:
+                        val = st.text_input(
+                            f"{var}:",
+                            value="",
+                            key=f"evidence_{var}",
+                            help=f"Enter value for {var} (leave empty to ignore)"
+                        )
+                        if val.strip():
+                            evidence_dict[var] = val.strip()
+            # 進階參數 - 摺疊區域
+            with st.expander("**Advanced Parameters**", expanded=False):
+                n_bins = st.slider(
+                    "Number of Bins (for continuous variables):",
+                    min_value=3,
+                    max_value=20,
+                    value=10,
+                    step=1,
+                    help="Number of bins for discretizing continuous features"
+                )
+            # 執行分析按鈕
+            st.markdown("---")
+            col_btn1, col_btn2 = st.columns([3, 1])
+            with col_btn1:
+                run_button = st.button("🚀 Run Analysis", type="primary", width='stretch')
+            with col_btn2:
+                if st.button("🔄 Reset", width='stretch'):
+                    st.session_state.analysis_results = None
+                    st.session_state.trained_model_results = None
+                    st.session_state.model_trained = False
+                    st.session_state.chat_history = []
+                    st.rerun()
+            # 分析步驟說明
+            with st.expander("ℹ️ Analysis Steps", expanded=False):
+                st.markdown("""
+                **Process:**
+                1. Split data (train/test)
+                2. Learn network structure
+                3. Process features (bins from train)
+                4. Estimate parameters
+                5. Evaluate performance
+                **Note:** Test set bins are derived from training set to prevent data leakage.
+                """)
+            if run_button:
+                # Validation: need at least one feature, and the target must not also be a feature
+                if not selected_features:
+                    st.error("❌ Please select at least one feature!")
+                    st.stop()
+                if target_variable in selected_features:
+                    st.error("❌ Target variable cannot be in feature list!")
+                    st.stop()
+                with st.spinner("🔄 Training Bayesian Network..."):
+                    progress_bar = st.progress(0)
+                    status_text = st.empty()
+                    try:
+                        # Initialise the analyzer for this browser session
+                        status_text.text("📊 Initializing analyzer...")
+                        progress_bar.progress(10)
+                        analyzer = BayesianNetworkAnalyzer(
+                            session_id=st.session_state.session_id
+                        )
+                        status_text.text(f"📐 Learning {algorithm} structure...")
+                        progress_bar.progress(30)
+                        # Run the analysis. NOTE(review): `decision` and `evidence_dict` collected above are not forwarded here - confirm whether that is intended
+                        results = analyzer.run_analysis(
+                            df=df,
+                            cat_features=cat_features,
+                            con_features=con_features,
+                            target_variable=target_variable,
+                            test_fraction=test_fraction,
+                            algorithm=algorithm,
+                            estimator=estimator,
+                            equivalent_sample_size=equivalent_sample_size,
+                            score_method=score_method,
+                            sig_level=sig_level,
+                            n_bins=n_bins
+                        )
+                        status_text.text("✅ Analysis completed!")
+                        progress_bar.progress(100)
+                        # Store the results (the same object under two keys)
+                        st.session_state.trained_model_results = results  # For Tab 1 display
+                        st.session_state.analysis_results = results  # For AI Assistant
+                        st.session_state.model_trained = True
+                        # Keep the analyzer in session_state (used for personalised prediction and model export)
+                        st.session_state.analyzer = analyzer
+                        st.success("✅ Analysis completed successfully!")
+                        st.balloons()
+                        # Clear the progress widgets after a short pause, then rerun so the results section renders
+                        import time
+                        time.sleep(1)
+                        progress_bar.empty()
+                        status_text.empty()
+                        st.rerun()
+                    except Exception as e:
+                        st.error(f"❌ Error during analysis: {str(e)}")
+                        st.exception(e)
+                        progress_bar.empty()
+                        status_text.empty()
+    with col2:
+        st.header("Quick Stats")
+        if df is not None:
+            st.metric("Total Samples", df.shape[0])
+            st.metric("Total Features", df.shape[1])
+            st.metric("Selected Features", len(selected_features) if 'selected_features' in locals() else 0)
+            if st.session_state.model_trained:
+                st.success("✅ Model Trained")
+            else:
+                st.info("⏳ Awaiting Training")
+    # 顯示結果
+    if st.session_state.trained_model_results:
+        st.markdown("---")
+        st.header("📊 Analysis Results")
+        results = st.session_state.trained_model_results
+        # 使用 tabs 來組織結果
+        result_tabs = st.tabs([
+            "🕸️ Network Structure",
+            "📈 Performance Metrics",
+            "📋 CPD Tables",
+            "📊 Model Scores"
+        ])
+        # Tab 1: 網路結構
+        with result_tabs[0]:
+            network_base64 = generate_network_graph(results['model'])# Pi
+            st.image(f"data:image/png;base64,{network_base64}", width='stretch')# Pi
+            # 顯示邊的列表
+            with st.expander("View Network Edges", expanded=False):
+                edges = list(results['model'].edges())
+                st.write(f"Total edges: {len(edges)}")
+                # 每行顯示 3 個邊
+                for i in range(0, len(edges), 3):
+                    cols = st.columns(3)
+                    for j, col in enumerate(cols):
+                        if i + j < len(edges):
+                            edge = edges[i + j]
+                            col.markdown(f"**{edge[0]}** → {edge[1]}")
+        # Tab 2: 效能指標
+        with result_tabs[1]:
+            # Check if metrics are available
+            if 'train_metrics' in results and 'test_metrics' in results:
+                col_m1, col_m2 = st.columns(2)
+                with col_m1:
+                    st.markdown("### Training Set")
+                    train_metrics = results['train_metrics']
+                    # 使用 metrics 卡片
+                    metric_cols = st.columns(4)
+                    metric_cols[0].metric("Accuracy", f"{train_metrics['accuracy']:.2f}%")
+                    metric_cols[1].metric("Precision", f"{train_metrics['precision']:.2f}%")
+                    metric_cols[2].metric("Recall", f"{train_metrics['recall']:.2f}%")
+                    metric_cols[3].metric("F1-Score", f"{train_metrics['f1']:.2f}%")
+                    metric_cols2 = st.columns(4)
+                    metric_cols2[0].metric("AUC", f"{train_metrics['auc']:.4f}")
+                    metric_cols2[1].metric("G-mean", f"{train_metrics['g_mean']:.2f}%")
+                    metric_cols2[2].metric("P-mean", f"{train_metrics['p_mean']:.2f}%")
+                    metric_cols2[3].metric("Specificity", f"{train_metrics['specificity']:.2f}%")
+                    # 混淆矩陣
+                    with st.expander("Confusion Matrix", expanded=True):
+                        conf_fig_train = plot_confusion_matrix(
+                            train_metrics['confusion_matrix'],
+                            title="Training Set"
+                        )
+                        st.plotly_chart(conf_fig_train, width='stretch')
+                    # ROC Curve
+                    with st.expander("ROC Curve", expanded=False):
+                        roc_fig_train = plot_roc_curve(
+                            train_metrics['fpr'],
+                            train_metrics['tpr'],
+                            train_metrics['auc'],
+                            title="Training Set"
+                        )
+                        st.plotly_chart(roc_fig_train, width='stretch')
+                with col_m2:
+                    st.markdown("### Test Set")
+                    test_metrics = results['test_metrics']
+                    metric_cols = st.columns(4)
+                    metric_cols[0].metric("Accuracy", f"{test_metrics['accuracy']:.2f}%")
+                    metric_cols[1].metric("Precision", f"{test_metrics['precision']:.2f}%")
+                    metric_cols[2].metric("Recall", f"{test_metrics['recall']:.2f}%")
+                    metric_cols[3].metric("F1-Score", f"{test_metrics['f1']:.2f}%")
+                    metric_cols2 = st.columns(4)
+                    metric_cols2[0].metric("AUC", f"{test_metrics['auc']:.4f}")
+                    metric_cols2[1].metric("G-mean", f"{test_metrics['g_mean']:.2f}%")
+                    metric_cols2[2].metric("P-mean", f"{test_metrics['p_mean']:.2f}%")
+                    metric_cols2[3].metric("Specificity", f"{test_metrics['specificity']:.2f}%")
+                    # 混淆矩陣
+                    with st.expander("Confusion Matrix", expanded=True):
+                        conf_fig_test = plot_confusion_matrix(
+                            test_metrics['confusion_matrix'],
+                            title="Test Set"
+                        )
+                        st.plotly_chart(conf_fig_test, width='stretch')
+                    # ROC Curve
+                    with st.expander("ROC Curve", expanded=False):
+                        roc_fig_test = plot_roc_curve(
+                            test_metrics['fpr'],
+                            test_metrics['tpr'],
+                            test_metrics['auc'],
+                            title="Test Set"
+                        )
+                        st.plotly_chart(roc_fig_test, width='stretch')
+        # Tab 3: 條件機率表
+        with result_tabs[2]:
+            selected_node = st.selectbox(
+                "Select a node to view its CPD:",
+                options=list(results['cpds'].keys())
+            )
+            if selected_node:
+                cpd_df = create_cpd_table(results['cpds'][selected_node])
+                st.dataframe(cpd_df, width='stretch')
+                # 下載按鈕
+                csv = cpd_df.to_csv()
+                st.download_button(
+                    label="📥 Download CPD as CSV",
+                    data=csv,
+                    file_name=f"cpd_{selected_node}.csv",
+                    mime="text/csv"
+                )
+        # Tab 4: 模型評分
+        with result_tabs[3]:
+            scores = results['scores']
+            score_cols = st.columns(5)
+            score_cols[0].metric("Log-Likelihood", f"{scores['log_likelihood']:.2f}")
+            score_cols[1].metric("BIC Score", f"{scores['bic']:.2f}")
+            score_cols[2].metric("K2 Score", f"{scores['k2']:.2f}")
+            score_cols[3].metric("BDeu Score", f"{scores['bdeu']:.2f}")
+            score_cols[4].metric("BDs Score", f"{scores['bds']:.2f}")
+            # 參數摘要
+            with st.expander("Analysis Parameters", expanded=True):
+                params = results['parameters']
+                col1, col2, col3 = st.columns(3)
+                with col1:
+                    st.markdown("**Algorithm Settings**")
+                    st.write(f"- Algorithm: {params['algorithm']}")
+                    st.write(f"- Estimator: {params['estimator']}")
+                    st.write(f"- Test Fraction: {params['test_fraction']:.2%}")
+                with col2:
+                    st.markdown("**Feature Information**")
+                    st.write(f"- Total Features: {params['n_features']}")
+                    st.write(f"- Categorical: {len(params['cat_features'])}")
+                    st.write(f"- Continuous: {len(params['con_features'])}")
+                    st.write(f"- Target: {params['target_variable']}")
+                with col3:
+                    st.markdown("**Other Parameters**")
+                    st.write(f"- Bins: {params['n_bins']}")
+                    st.write(f"- Score Method: {params['score_method']}")
+                    st.write(f"- Significance Level: {params['sig_level']}")
+                    st.write(f"- Equivalent Sample Size: {params['equivalent_sample_size']}")
+            # 匯出結果
+            with st.expander("Export Results", expanded=False):
+                col1, col2 = st.columns(2)
+                with col1:
+                    # 原本的 JSON 下載
+                    result_json = export_results_to_json(results)
+                    st.download_button(
+                        label="📥 Download Full Results (JSON)",
+                        data=result_json,
+                        file_name=f"bn_analysis_{results['timestamp'][:10]}.json",
+                        mime="application/json"
+                    )
+                with col2:
+                    # 🆕 新增：下載模型
+                    if st.button("💾 Save Trained Model"):
+                        if 'analyzer' in st.session_state:
+                            import tempfile
+                            import os
+                            # 創建臨時文件
+                            with tempfile.NamedTemporaryFile(delete=False, suffix='.pkl') as tmp_file:
+                                model_path = tmp_file.name
+                                st.session_state.analyzer.save_model(model_path)
+                                # 讀取並提供下載
+                                with open(model_path, 'rb') as f:
+                                    st.download_button(
+                                        label="📥 Download Model File (.pkl)",
+                                        data=f,
+                                        file_name=f"bn_model_{results['timestamp'][:10]}.pkl",
+                                        mime="application/octet-stream",
+                                        key="download_model_btn"
+                                    )
+                                # 清理臨時文件
+                                os.unlink(model_path)
+                        else:
+                            st.error("❌ Analyzer not found in session state")
+# Tab 2: AI 助手
+with tab2:
+    st.header("💬 AI Analysis Assistant")
+    if not st.session_state.get('api_key'):
+        st.warning("⚠️ Please enter your OpenAI API Key in the sidebar to use the AI assistant.")
+    elif not st.session_state.model_trained:
+        st.info("ℹ️ Please train a model first in the Analysis tab to use the AI assistant.")
+    else:
+        # 初始化 LLM 助手
+        if 'llm_assistant' not in st.session_state:
+            st.session_state.llm_assistant = LLMAssistant(
+                api_key=st.session_state.api_key,
+                session_id=st.session_state.session_id
+            )
+        # 顯示聊天歷史
+        chat_container = st.container()
+        with chat_container:
+            for message in st.session_state.chat_history:
+                with st.chat_message(message["role"]):
+                    st.markdown(message["content"])
+        # 聊天輸入
+        if prompt := st.chat_input("Ask me anything about your analysis results..."):
+            # 添加用戶訊息
+            st.session_state.chat_history.append({
+                "role": "user",
+                "content": prompt
+            })
+            with st.chat_message("user"):
+                st.markdown(prompt)
+            # 🆕 檢測是否為個人化預測請求
+            prediction_keywords = ['predict', 'risk', 'patient', 'case', 'my risk', 'calculate', 'probability', 'chance']
+            is_prediction_request = any(keyword in prompt.lower() for keyword in prediction_keywords)
+            # 獲取 AI 回應
+            with st.chat_message("assistant"):
+                with st.spinner("Analyzing..." if is_prediction_request else "Thinking..."):
+                    try:
+                        if is_prediction_request:
+                            # 🆕 執行個人化預測
+                            # 從 session_state 取得必要資訊
+                            results = st.session_state.analysis_results
+                            # 重建 analyzer（需要載入模型狀態）
+                            # ⚠️ 這裡需要先把 analyzer 存在 session_state 中
+                            if 'analyzer' not in st.session_state:
+                                st.error("❌ Model not found. Please train a model first in the Analysis tab.")
+                                response = "I cannot perform predictions because the model is not available. Please train a model first."
+                            else:
+                                response = st.session_state.llm_assistant.predict_from_text(
+                                    user_description=prompt,
+                                    analyzer=st.session_state.analyzer,
+                                    target_variable=results['parameters']['target_variable'],
+                                    feature_list=results['parameters']['cat_features'] + results['parameters']['con_features']
+                                )
+                        else:
+                            # 原本的一般對話
+                            response = st.session_state.llm_assistant.get_response(
+                                user_message=prompt,
+                                analysis_results=st.session_state.analysis_results
+                            )
+                        st.markdown(response)
+                    except Exception as e:
+                        error_msg = f"❌ Error: {str(e)}\n\nPlease try rephrasing your question or check the model status."
+                        st.error(error_msg)
+                        response = error_msg
+            # 添加助手訊息
+            st.session_state.chat_history.append({
+                "role": "assistant",
+                "content": response
+            })
+        # 快速問題按鈕
+        st.markdown("---")
+        st.subheader("💡 Quick Questions")
+        quick_questions = [
+            "📊 Give me a summary of the analysis results",
+            "🎯 What is the model's performance?",
+            "🔍 Explain the Bayesian Network structure",
+            "⚠️ What are the limitations of this model?",
+            "💡 How can I improve the model?"
+        ]
+        cols = st.columns(len(quick_questions))
+        for idx, (col, question) in enumerate(zip(cols, quick_questions)):
+            if col.button(question, key=f"quick_{idx}"):
+                st.session_state.chat_history.append({
+                    "role": "user",
+                    "content": question
+                })
+                response = st.session_state.llm_assistant.get_response(
+                    user_message=question,
+                    analysis_results=st.session_state.analysis_results
+                )
+                st.session_state.chat_history.append({
+                    "role": "assistant",
+                    "content": response
+                })
+                st.rerun()
+# Tab 3: Load Model
+with tab3:
+    st.header("📂 Load Pre-trained Models")
+    st.markdown("""
+    Load previously trained Bayesian Network models to view and compare their structures.
+    **Maximum: 2 models**
+    **Supported formats:**
+    - 📦 `.pkl` - Full model with all parameters
+    """)
+    st.markdown("---")
+    # Check if already loaded 2 models
+    if len(st.session_state.loaded_models) >= 2:
+        st.warning("⚠️ Maximum 2 models can be loaded. Please remove a model before loading another.")
+        uploaded_model = None
+    else:
+        # File uploader
+        uploaded_model = st.file_uploader(
+            "Upload model file",
+            type=['pkl', 'bif'],
+            help="Upload a .pkl file containing a Bayesian Network model"
+        )
+    if uploaded_model:
+        file_extension = uploaded_model.name.split('.')[-1].lower()
+        col_load1, col_load2 = st.columns([3, 1])
+        with col_load1:
+            st.info(f"📄 File: **{uploaded_model.name}** ({file_extension.upper()} format)")
+        with col_load2:
+            load_button = st.button("🔄 Load Model", type="primary", width='stretch')
+        if load_button:
+            with st.spinner(f"Loading {file_extension.upper()} model..."):
+                try:
+                    if file_extension == 'pkl':
+                        # Load .pkl file
+                        import pickle
+                        import tempfile
+                        import os
+                        # Save uploaded file to temp location
+                        with tempfile.NamedTemporaryFile(delete=False, suffix='.pkl') as tmp_file:
+                            tmp_file.write(uploaded_model.read())
+                            tmp_path = tmp_file.name
+                        # Load model data
+                        with open(tmp_path, 'rb') as f:
+                            model_data = pickle.load(f)
+                        # Clean up temp file
+                        os.unlink(tmp_path)
+                        # Extract model info - handle multiple formats
+                        from pgmpy.models import BayesianNetwork
+                        if isinstance(model_data, BayesianNetwork):
+                            # Case 1: Direct BayesianNetwork object
+                            model = model_data
+                            bins_dict = None
+                            train_columns = list(model.nodes())
+                            timestamp = 'Unknown'
+                            st.info("ℹ️ Loaded raw BayesianNetwork object (no metadata)")
+                        elif isinstance(model_data, dict):
+                            # Case 2: Dictionary format
+                            if 'model' in model_data:
+                                # Case 2a: Our format or similar
+                                model = model_data['model']
+                                bins_dict = model_data.get('bins_dict', None)
+                                train_columns = model_data.get('train_columns', list(model.nodes()))
+                                timestamp = model_data.get('timestamp', 'Unknown')
+                            else:
+                                # Case 2b: Try to find model in other common keys
+                                possible_keys = ['network', 'bn', 'bayesian_network', 'graph']
+                                model = None
+                                found_key = None
+                                for key in possible_keys:
+                                    if key in model_data and isinstance(model_data[key], BayesianNetwork):
+                                        model = model_data[key]
+                                        found_key = key
+                                        break
+                                if model is None:
+                                    raise ValueError(f"Cannot find BayesianNetwork in pickle file. Available keys: {list(model_data.keys())}. Expected one of: {['model'] + possible_keys}")
+                                bins_dict = model_data.get('bins_dict', None)
+                                train_columns = list(model.nodes())
+                                timestamp = 'Unknown'
+                                st.info(f"ℹ️ Loaded model from key: '{found_key}'")
+                        else:
+                            raise TypeError(f"Unsupported pickle format. Expected BayesianNetwork or dict, got {type(model_data).__name__}")
+                        # Store in session state - append to list (max 2)
+                        if len(st.session_state.loaded_models) < 2:
+                            model_info = {
+                                'model': model,
+                                'source': 'pkl',
+                                'bins_dict': bins_dict,
+                                'train_columns': train_columns,
+                                'timestamp': timestamp,
+                                'file_name': uploaded_model.name
+                            }
+                            st.session_state.loaded_models.append(model_info)
+                            st.success(f"✅ Model #{len(st.session_state.loaded_models)} loaded successfully from .pkl file!")
+                            st.info("ℹ️ This loaded model is displayed below. To use AI Assistant, please train a model in the Analysis tab.")
+                            st.balloons()
+                        else:
+                            st.error("❌ Cannot load more than 2 models. Please remove a model first.")
+                    elif file_extension == 'bif':
+                        # Load .bif file
+                        from pgmpy.readwrite import BIFReader
+                        import tempfile
+                        import os
+                        # Save uploaded file to temp location
+                        with tempfile.NamedTemporaryFile(delete=False, suffix='.bif', mode='w') as tmp_file:
+                            tmp_file.write(uploaded_model.read().decode('utf-8'))
+                            tmp_path = tmp_file.name
+                        # Load model
+                        reader = BIFReader(tmp_path)
+                        model = reader.get_model()
+                        # Clean up temp file
+                        os.unlink(tmp_path)
+                        # Store in session state - append to list (max 2)
+                        if len(st.session_state.loaded_models) < 2:
+                            model_info = {
+                                'model': model,
+                                'source': 'bif',
+                                'bins_dict': None,
+                                'train_columns': list(model.nodes()),
+                                'timestamp': 'Unknown',
+                                'file_name': uploaded_model.name
+                            }
+                            st.session_state.loaded_models.append(model_info)
+                            st.success(f"✅ Model #{len(st.session_state.loaded_models)} loaded successfully from .bif file!")
+                            st.warning("⚠️ Note: .bif files do not contain bins_dict.")
+                            st.info("ℹ️ This loaded model is displayed below. To use AI Assistant, please train a model in the Analysis tab.")
+                            st.balloons()
+                        else:
+                            st.error("❌ Cannot load more than 2 models. Please remove a model first.")
+                except Exception as e:
+                    st.error(f"❌ Error loading model: {str(e)}")
+                    st.exception(e)
+    # Display loaded models information
+    if st.session_state.loaded_models:
+        st.markdown("---")
+        # Header with Clear All button
+        col_header, col_clear = st.columns([3, 1])
+        with col_header:
+            st.header(f"📊 Loaded Models ({len(st.session_state.loaded_models)})")
+        with col_clear:
+            if st.button("🗑️ Clear All", type="secondary", width='stretch'):
+                st.session_state.loaded_models = []
+                st.rerun()
+        # Loop through all loaded models
+        for idx, loaded_model in enumerate(st.session_state.loaded_models):
+            model = loaded_model['model']
+            # Model separator
+            st.markdown("---")
+            # Model header with Remove button
+            col_title, col_remove = st.columns([4, 1])
+            with col_title:
+                st.subheader(f"Model #{idx + 1}: {loaded_model['file_name']}")
+            with col_remove:
+                if st.button(f"❌ Remove", key=f"remove_model_{idx}", width='stretch'):
+                    st.session_state.loaded_models.pop(idx)
+                    st.rerun()
+            # Display network graph and basic info
+            col_graph, col_info = st.columns([2, 1])
+            with col_graph:
+                st.markdown("**🕸️ Network Structure**")
+                try:
+                    network_base64 = generate_network_graph(model)
+                    st.image(f"data:image/png;base64,{network_base64}", width='stretch')
+                except Exception as e:
+                    st.error(f"Error generating network graph: {str(e)}")
+                    st.info("Network structure visualization is not available.")
+            with col_info:
+                st.markdown("**ℹ️ Basic Information**")
+                st.metric("File Name", loaded_model['file_name'])
+                st.metric("Format", loaded_model['source'].upper())
+                st.metric("Total Nodes", len(model.nodes()))
+                st.metric("Total Edges", len(model.edges()))
+                if loaded_model['timestamp'] != 'Unknown':
+                    st.metric("Timestamp", loaded_model['timestamp'][:19])
+                if loaded_model['bins_dict']:
+                    st.metric("Bins Available", "✅ Yes")
+                else:
+                    st.metric("Bins Available", "❌ No")
+            # Network structure details
+            col_nodes, col_edges = st.columns(2)
+            with col_nodes:
+                with st.expander("📋 Node List", expanded=False):
+                    nodes = list(model.nodes())
+                    st.write(f"**Total nodes:** {len(nodes)}")
+                    for i, node in enumerate(nodes, 1):
+                        st.write(f"{i}. {node}")
+            with col_edges:
+                with st.expander("🔗 Edge List", expanded=False):
+                    edges = list(model.edges())
+                    st.write(f"**Total edges:** {len(edges)}")
+                    for i, edge in enumerate(edges, 1):
+                        st.write(f"{i}. **{edge[0]}** → {edge[1]}")
+            # CPD Tables
+            st.markdown("**📋 Conditional Probability Distributions (CPDs)**")
+            selected_node = st.selectbox(
+                "Select a node to view its CPD:",
+                options=list(model.nodes()),
+                key=f"load_model_cpd_select_{idx}"
+            )
+            if selected_node:
+                cpd = model.get_cpds(selected_node)
+                cpd_df = create_cpd_table(cpd)
+                st.dataframe(cpd_df, width='stretch')
+                # Download button
+                csv = cpd_df.to_csv()
+                st.download_button(
+                    label="📥 Download CPD as CSV",
+                    data=csv,
+                    file_name=f"cpd_{selected_node}_model{idx+1}.csv",
+                    mime="text/csv",
+                    key=f"load_model_cpd_download_{idx}"
+                )
+            # Additional information for .pkl files
+            if loaded_model['source'] == 'pkl' and loaded_model['bins_dict']:
+                with st.expander("🔢 Binning Information", expanded=False):
+                    st.write("**Bins dictionary available for continuous variables:**")
+                    st.json(loaded_model['bins_dict'])
+# Footer
+st.markdown("---")
+st.markdown(
+    """
+    <div style='text-align: center'>
+        <p>🔬 Bayesian Network Analysis System | Built with Streamlit</p>
+        <p>Powered by OpenAI GPT-4 | Session ID: {}</p>
+    </div>
+    """.format(st.session_state.session_id[:8]),
+    unsafe_allow_html=True
+)

bn_core.py ADDED Viewed

	@@ -0,0 +1,536 @@

+import pandas as pd
+import numpy as np
+from pgmpy.models import BayesianNetwork
+from pgmpy.estimators import (
+    TreeSearch, HillClimbSearch, PC,
+    MaximumLikelihoodEstimator, BayesianEstimator,
+    BicScore, AICScore, K2Score, BDeuScore, BDsScore
+)
+from pgmpy.inference import VariableElimination
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import (
+    confusion_matrix, accuracy_score, precision_score,
+    recall_score, f1_score, roc_curve, roc_auc_score
+)
+from pgmpy.metrics import log_likelihood_score, structure_score
+import threading
+from datetime import datetime
+from networkx import is_directed_acyclic_graph, DiGraph
+class BayesianNetworkAnalyzer:
+    """
+    Bayesian network analyzer.
+    Intended for simultaneous use by several users; each session is handled separately via its session_id.
+    """
+    # Class-level lock shared by every instance; run_analysis holds it for the whole analysis
+    _lock = threading.Lock()
+    # Analysis results of each session, keyed by session_id
+    _session_results = {}
+    def __init__(self, session_id):
+        """
+        初始化分析器
+        Args:
+            session_id: 唯一的 session 識別碼
+        """
+        self.session_id = session_id
+        self.model = None
+        self.inference = None
+        self.train_data = None
+        self.test_data = None
+        self.bins_dict = {}
+    def run_analysis(self, df, cat_features, con_features, target_variable,
+                     test_fraction=0.25, algorithm='NB', estimator='ml',
+                     equivalent_sample_size=3, score_method='BIC',
+                     sig_level=0.05, n_bins=10):
+        """
+        執行完整的貝葉斯網路分析 - 完全對齊 Django 版本的順序
+        Args:
+            df: 原始資料框
+            cat_features: 分類特徵列表
+            con_features: 連續特徵列表
+            target_variable: 目標變數名稱
+            test_fraction: 測試集比例
+            algorithm: 結構學習演算法
+            estimator: 參數估計方法
+            equivalent_sample_size: 等效樣本大小(用於貝葉斯估計)
+            score_method: 評分方法(用於 Hill Climbing)
+            sig_level: 顯著性水準(用於 PC 演算法)
+            n_bins: 連續變數分箱數量
+        Returns:
+            dict: 包含所有分析結果的字典
+        """
+        with self._lock:
+            try:
+                # 1. 資料預處理 (只選擇欄位和處理缺失值)
+                processed_df = self._preprocess_data(
+                    df, cat_features, con_features, target_variable
+                )
+                # 2. 分割訓練/測試集 (✅ random_state=526)
+                self.train_data, self.test_data = train_test_split(
+                    processed_df,
+                    test_size=test_fraction,
+                    random_state=526,
+                    stratify=processed_df[target_variable] if target_variable in processed_df.columns else None
+                )
+                # 3. ✅ 學習網路結構 (在分箱和編碼之前!)
+                self.model = self._learn_structure(
+                    algorithm, score_method, sig_level, target_variable
+                )
+                # 4. ✅ 對分類變數編碼 (在學習結構之後,分箱之前)
+                self._encode_categorical_features(cat_features)
+                # 5. ✅ 對連續變數分箱 (在編碼之後)
+                self._bin_continuous_features(con_features, n_bins)
+                # 6. 參數估計
+                self._fit_parameters(estimator, equivalent_sample_size)
+                # 7. 初始化推論引擎
+                self.inference = VariableElimination(self.model)
+                # 8. 評估模型
+                train_metrics = self._evaluate_model(
+                    self.train_data, target_variable, "train"
+                )
+                test_metrics = self._evaluate_model(
+                    self.test_data, target_variable, "test"
+                )
+                # 9. 獲取 CPD
+                cpds = self._get_all_cpds()
+                # 10. 計算模型評分
+                scores = self._calculate_scores()
+                # 11. 整理結果
+                results = {
+                    'model': self.model,
+                    'inference': self.inference,
+                    'train_metrics': train_metrics,
+                    'test_metrics': test_metrics,
+                    'cpds': cpds,
+                    'scores': scores,
+                    'parameters': {
+                        'algorithm': algorithm,
+                        'estimator': estimator,
+                        'test_fraction': test_fraction,
+                        'n_features': len(cat_features) + len(con_features),
+                        'cat_features': cat_features,
+                        'con_features': con_features,
+                        'target_variable': target_variable,
+                        'n_bins': n_bins,
+                        'score_method': score_method,
+                        'sig_level': sig_level,
+                        'equivalent_sample_size': equivalent_sample_size
+                    },
+                    'timestamp': datetime.now().isoformat()
+                }
+                # 儲存到 session results
+                self._session_results[self.session_id] = results
+                return results
+            except Exception as e:
+                raise Exception(f"Analysis failed: {str(e)}")
+    def _preprocess_data(self, df, cat_features, con_features, target_variable):
+        """資料預處理 - 只選擇欄位和刪除缺失值"""
+        # 選擇需要的欄位
+        selected_columns = cat_features + con_features + [target_variable]
+        processed_df = df[selected_columns].copy()
+        # 處理缺失值
+        processed_df = processed_df.dropna()
+        return processed_df
+    def _encode_categorical_features(self, cat_features):
+        """
+        ✅ 將分類變數轉為 category codes - 完全對齊 Django
+        注意:只對 cat_features 編碼,不對分箱後的連續變數編碼
+        Django 只對 train_data 編碼,但我們為了一致性也對 test_data 編碼
+        """
+        for col in cat_features:
+            if col in self.train_data.columns:
+                if self.train_data[col].dtype == 'object':
+                    self.train_data[col] = self.train_data[col].astype('category').cat.codes
+            # Django 沒有對 test_data 編碼,但為了預測時一致性,我們也編碼
+            if col in self.test_data.columns:
+                if self.test_data[col].dtype == 'object':
+                    self.test_data[col] = self.test_data[col].astype('category').cat.codes
+    def _bin_continuous_features(self, con_features, n_bins):
+        """
+        ✅ 對連續變數分箱 - 完全對齊 Django 版本
+        先用訓練集計算邊界,再套用到測試集
+        """
+        self.bins_dict = {}
+        for col in con_features:
+            if col in self.train_data.columns and self.train_data[col].notna().sum() > 0:
+                # 使用訓練集計算分箱邊界
+                bin_edges = pd.cut(
+                    self.train_data[col],
+                    bins=n_bins,
+                    retbins=True,
+                    duplicates='drop'
+                )[1]
+                self.bins_dict[col] = bin_edges
+                # 創建分箱標籤 (✅ 使用 – 而不是 -)
+                bin_labels = [
+                    f"{round(bin_edges[i], 2)}–{round(bin_edges[i+1], 2)}"
+                    for i in range(len(bin_edges) - 1)
+                ]
+                # 對訓練集分箱
+                self.train_data[col] = pd.cut(
+                    self.train_data[col],
+                    bins=bin_edges,
+                    labels=bin_labels,
+                    include_lowest=True
+                ).astype(object).fillna("Missing")
+                # 對測試集使用相同邊界分箱
+                if col in self.test_data.columns:
+                    self.test_data[col] = pd.cut(
+                        self.test_data[col],
+                        bins=bin_edges,
+                        labels=bin_labels,
+                        include_lowest=True
+                    ).astype(object).fillna("Missing")
+            else:
+                print(f"⚠️ Skipped binning column '{col}' – missing or all NaN")
+    def _learn_structure(self, algorithm, score_method, sig_level, target_variable):
+        """學習網路結構 - 完全對齊 Django 版本"""
+        if algorithm == 'NB':
+            # Naive Bayes
+            edges = [
+                (target_variable, feature)
+                for feature in self.train_data.columns
+                if feature != target_variable
+            ]
+            model = BayesianNetwork(edges)
+        elif algorithm == 'TAN':
+            # Tree-Augmented Naive Bayes
+            # ✅ 特殊情況處理: 如果同時存在'asia'和'either'列,特別指定'asia'作為根節點
+            if 'asia' in self.train_data.columns and 'either' in self.train_data.columns and target_variable == 'either':
+                tan_search = TreeSearch(self.train_data, root_node='asia')
+            else:
+                tan_search = TreeSearch(self.train_data)
+            structure = tan_search.estimate(
+                estimator_type='tan',
+                class_node=target_variable
+            )
+            model = BayesianNetwork(structure.edges())
+        elif algorithm == 'CL':
+            # Chow-Liu
+            tan_search = TreeSearch(self.train_data)
+            structure = tan_search.estimate(
+                estimator_type='chow-liu',
+                class_node=target_variable
+            )
+            model = BayesianNetwork(structure.edges())
+        elif algorithm == 'HC':
+            # Hill Climbing
+            hc = HillClimbSearch(self.train_data)
+            # 選擇評分方法
+            scoring_methods = {
+                'BIC': BicScore(self.train_data),
+                'AIC': AICScore(self.train_data),
+                'K2': K2Score(self.train_data),
+                'BDeu': BDeuScore(self.train_data),
+                'BDs': BDsScore(self.train_data)
+            }
+            structure = hc.estimate(
+                scoring_method=scoring_methods[score_method]
+            )
+            model = BayesianNetwork(structure.edges())
+        elif algorithm == 'PC':
+            # PC Algorithm - ✅ 與 Django 完全一致的降級策略
+            pc = PC(self.train_data)
+            # 嘗試不同的 max_cond_vars 直到成功
+            for max_cond in [5, 4, 3, 2, 1]:
+                try:
+                    structure = pc.estimate(
+                        significance_level=sig_level,
+                        max_cond_vars=max_cond,
+                        ci_test='chi_square',
+                        variant='stable',
+                        n_jobs=1  # ✅ Django 第一次用 1
+                    )
+                    # 檢查是否有效 (✅ 與 Django 一致)
+                    edges = structure.edges()
+                    if is_directed_acyclic_graph(DiGraph(edges)) and any(target_variable in edge for edge in edges):
+                        model = BayesianNetwork(structure.edges())
+                        break
+                except:
+                    continue
+            else:
+                # 如果都失敗,使用 Naive Bayes (✅ 與 Django 一致)
+                edges = [
+                    (target_variable, feature)
+                    for feature in self.train_data.columns
+                    if feature != target_variable
+                ]
+                model = BayesianNetwork(edges)
+        else:
+            raise ValueError(f"Unknown algorithm: {algorithm}")
+        return model
+    def _fit_parameters(self, estimator, equivalent_sample_size):
+        """參數估計"""
+        if estimator == 'bn':
+            self.model.fit(
+                self.train_data,
+                estimator=BayesianEstimator,
+                equivalent_sample_size=equivalent_sample_size
+            )
+        else:
+            self.model.fit(
+                self.train_data,
+                estimator=MaximumLikelihoodEstimator
+            )
+    def _predict_probabilities(self, data, target_variable):
+        """
+        預測機率 - ✅ 與 Django 版本完全一致
+        """
+        true_labels = []
+        predicted_probs = []
+        model_nodes = set(self.model.nodes())
+        for idx, row in data.iterrows():
+            # 準備 evidence (✅ 過濾只在模型中的變數)
+            raw_evidence = row.drop(target_variable).to_dict()
+            filtered_evidence = {k: v for k, v in raw_evidence.items() if k in model_nodes}
+            true_label = row[target_variable]
+            true_labels.append(true_label)
+            try:
+                result = self.inference.query(
+                    variables=[target_variable],
+                    evidence=filtered_evidence
+                )
+                probs = result.values
+                predicted_probs.append(probs)
+            except Exception as e:
+                print(f"⚠️ Inference failed at row {idx} | evidence keys: {list(filtered_evidence.keys())} | error: {e}")
+                predicted_probs.append(None)
+        # ✅ 過濾有效結果 (與 Django 一致)
+        valid_data = [
+            (label, prob)
+            for label, prob in zip(true_labels, predicted_probs)
+            if prob is not None and len(prob) > 1
+        ]
+        if not valid_data:
+            return [], []
+        valid_labels, valid_probs = zip(*valid_data)
+        prob_array = np.round(np.array([prob[1] for prob in valid_probs]), 4)
+        return list(valid_labels), prob_array
+    def _evaluate_model(self, data, target_variable, dataset_name):
+        """評估模型效能 - ✅ 與 Django 完全一致"""
+        # 預測
+        true_labels, pred_probs = self._predict_probabilities(
+            data, target_variable
+        )
+        if len(true_labels) == 0:
+            return {
+                'accuracy': 0,
+                'precision': 0,
+                'recall': 0,
+                'f1': 0,
+                'auc': 0,
+                'g_mean': 0,
+                'p_mean': 0,
+                'specificity': 0,
+                'confusion_matrix': [[0, 0], [0, 0]],
+                'fpr': [0],
+                'tpr': [0]
+            }
+        # 二元預測 (threshold = 0.1, ✅ 與 Django 一致)
+        threshold = 0.1
+        pred_labels = (pred_probs >= threshold).astype(int)
+        # 計算指標
+        accuracy = accuracy_score(true_labels, pred_labels) * 100
+        precision = precision_score(true_labels, pred_labels, zero_division=0) * 100
+        recall = recall_score(true_labels, pred_labels, zero_division=0) * 100
+        f1 = f1_score(true_labels, pred_labels, zero_division=0) * 100
+        # ROC 曲線
+        pred_probs_clean = np.nan_to_num(pred_probs, nan=0.0)
+        fpr, tpr, _ = roc_curve(true_labels, pred_probs_clean)
+        auc = roc_auc_score(true_labels, pred_probs_clean)
+        # 混淆矩陣
+        cm = confusion_matrix(true_labels, pred_labels).tolist()
+        # G-mean 和 P-mean (✅ 與 Django 計算方式一致)
+        tn, fp, fn, tp = confusion_matrix(true_labels, pred_labels).ravel()
+        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
+        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
+        g_mean = np.sqrt(sensitivity * precision / 100) * 100
+        p_mean = np.sqrt(specificity * sensitivity) * 100
+        return {
+            'accuracy': accuracy,
+            'precision': precision,
+            'recall': recall,
+            'f1': f1,
+            'auc': auc,
+            'g_mean': g_mean,
+            'p_mean': p_mean,
+            'specificity': specificity * 100,
+            'confusion_matrix': cm,
+            'fpr': fpr.tolist(),
+            'tpr': tpr.tolist(),
+            'predicted_probs': pred_probs.tolist()
+        }
+    def _get_all_cpds(self):
+        """獲取所有條件機率表"""
+        cpds = {}
+        for node in self.model.nodes():
+            cpd = self.model.get_cpds(node)
+            cpds[node] = cpd
+        return cpds
+    def _calculate_scores(self):
+        """計算模型評分"""
+        scores = {
+            'log_likelihood': log_likelihood_score(self.model, self.train_data),
+            'bic': structure_score(self.model, self.train_data, scoring_method='bic'),
+            'k2': structure_score(self.model, self.train_data, scoring_method='k2'),
+            'bdeu': structure_score(self.model, self.train_data, scoring_method='bdeu'),
+            'bds': structure_score(self.model, self.train_data, scoring_method='bds')
+        }
+        return scores
+    def save_model(self, filepath):
+        """
+        儲存訓練好的模型
+        包含: model, bins_dict, train_data columns 等資訊
+        """
+        import pickle
+        model_data = {
+            'model': self.model,
+            'bins_dict': self.bins_dict,
+            'train_columns': list(self.train_data.columns),
+            'timestamp': datetime.now().isoformat()
+        }
+        with open(filepath, 'wb') as f:
+            pickle.dump(model_data, f)
+    def load_model(self, filepath):
+        """
+        載入已訓練的模型
+        """
+        import pickle
+        with open(filepath, 'rb') as f:
+            model_data = pickle.load(f)
+        self.model = model_data['model']
+        self.bins_dict = model_data['bins_dict']
+        self.inference = VariableElimination(self.model)
+        return model_data
+    def predict_single_instance(self, evidence_dict, target_variable):
+        """
+        對單一個案進行預測
+        """
+        processed_evidence = {}
+        for key, value in evidence_dict.items():
+            if key in self.bins_dict:
+                # 連續變數需要分箱
+                bins = self.bins_dict[key]
+                # 🆕 處理超出範圍的值
+                if value < bins[0]:
+                    # 小於最小值，使用第一個 bin
+                    processed_evidence[key] = f"{round(bins[0], 2)}–{round(bins[1], 2)}"
+                elif value > bins[-1]:
+                    # 大於最大值，使用最後一個 bin
+                    processed_evidence[key] = f"{round(bins[-2], 2)}–{round(bins[-1], 2)}"
+                else:
+                    # 正常範圍內，找到對應的 bin
+                    for i in range(len(bins)-1):
+                        if bins[i] <= value <= bins[i+1]:
+                            processed_evidence[key] = f"{round(bins[i], 2)}–{round(bins[i+1], 2)}"
+                            break
+            else:
+                # 分類變數直接使用
+                processed_evidence[key] = value
+        # 2. 進行推論
+        result = self.inference.query(
+            variables=[target_variable],
+            evidence=processed_evidence
+        )
+        # 3. 整理結果
+        probs = result.values
+        death_prob = probs[1] if len(probs) > 1 else probs[0]
+        # 判斷風險等級
+        if death_prob >= 0.7:
+            risk_level = "High"
+        elif death_prob >= 0.3:
+            risk_level = "Moderate"
+        else:
+            risk_level = "Low"
+        return {
+            'probability': float(death_prob),
+            'risk_level': risk_level,
+            'all_probs': {i: float(p) for i, p in enumerate(probs)},
+            'processed_evidence': processed_evidence
+        }
+    @classmethod
+    def get_session_results(cls, session_id):
+        """獲取特定 session 的結果"""
+        return cls._session_results.get(session_id)
+    @classmethod
+    def clear_session_results(cls, session_id):
+        """清除特定 session 的結果"""
+        if session_id in cls._session_results:
+            del cls._session_results[session_id]

llm_assistant.py ADDED Viewed

	@@ -0,0 +1,360 @@

+from openai import OpenAI
+import json
+import numpy as np
+class LLMAssistant:
+    """
+    LLM question-answering assistant.
+    Helps users understand their Bayesian network analysis results.
+    """
+    def __init__(self, api_key, session_id):
+        """
+        初始化 LLM 助手
+        Args:
+            api_key: OpenAI API key
+            session_id: 唯一的 session 識別碼
+        """
+        self.client = OpenAI(api_key=api_key)
+        self.session_id = session_id
+        self.conversation_history = []
+        # 系統提示詞
+        self.system_prompt = """You are an expert data scientist specializing in Bayesian Networks and machine learning.
+Your role is to help users understand their Bayesian Network analysis results.
+You should:
+1. Explain complex statistical concepts in simple terms
+2. Provide insights about model performance metrics
+3. Suggest improvements when asked
+4. Explain the structure and relationships in the Bayesian Network
+5. Help interpret conditional probability tables (CPTs)
+6. Discuss limitations and assumptions of the model
+7. Perform personalized risk predictions from patient descriptions**
+8. Provide empathetic, evidence-based interpretations of risk levels**
+When performing predictions:
+- Extract relevant medical features from natural language descriptions
+- Clearly communicate risk levels (High/Moderate/Low) with probabilities
+- Explain key risk factors in understandable terms
+- Always emphasize limitations and the need for professional medical consultation
+Always be clear, concise, and educational. Use examples when helpful.
+Format your responses with proper markdown for better readability."""
+    def get_response(self, user_message, analysis_results):
+        """
+        獲取 AI 回應
+        Args:
+            user_message: 用戶訊息
+            analysis_results: 分析結果字典
+        Returns:
+            str: AI 回應
+        """
+        # 準備上下文資訊
+        context = self._prepare_context(analysis_results)
+        # 添加用戶訊息到歷史
+        self.conversation_history.append({
+            "role": "user",
+            "content": user_message
+        })
+        # 構建訊息列表
+        messages = [
+            {"role": "system", "content": self.system_prompt},
+            {"role": "system", "content": f"Analysis Context:\n{context}"}
+        ] + self.conversation_history
+        try:
+            # 調用 OpenAI API
+            response = self.client.chat.completions.create(
+                model="gpt-4o-mini",
+                messages=messages,
+                temperature=0.7,
+                max_tokens=1500
+            )
+            assistant_message = response.choices[0].message.content
+            # 添加助手回應到歷史
+            self.conversation_history.append({
+                "role": "assistant",
+                "content": assistant_message
+            })
+            return assistant_message
+        except Exception as e:
+            return f"❌ Error: {str(e)}\n\nPlease check your API key and try again."
+    def _prepare_context(self, results):
+        """準備分析結果的上下文資訊"""
+        if not results:
+            return "No analysis results available yet."
+        # 提取關鍵資訊
+        params = results['parameters']
+        train_metrics = results['train_metrics']
+        test_metrics = results['test_metrics']
+        scores = results['scores']
+        # 構建上下文字串
+        context = f"""
+## Model Configuration
+- Algorithm: {params['algorithm']}
+- Estimator: {params['estimator']}
+- Number of Features: {params['n_features']}
+  - Categorical: {len(params['cat_features'])}
+  - Continuous: {len(params['con_features'])}
+- Target Variable: {params['target_variable']}
+- Test Set Proportion: {params['test_fraction']:.0%}
+## Training Set Performance
+- Accuracy: {train_metrics['accuracy']:.2f}%
+- Precision: {train_metrics['precision']:.2f}%
+- Recall: {train_metrics['recall']:.2f}%
+- F1-Score: {train_metrics['f1']:.2f}%
+- AUC: {train_metrics['auc']:.4f}
+- G-mean: {train_metrics['g_mean']:.2f}%
+- P-mean: {train_metrics['p_mean']:.2f}%
+- Specificity: {train_metrics['specificity']:.2f}%
+## Test Set Performance
+- Accuracy: {test_metrics['accuracy']:.2f}%
+- Precision: {test_metrics['precision']:.2f}%
+- Recall: {test_metrics['recall']:.2f}%
+- F1-Score: {test_metrics['f1']:.2f}%
+- AUC: {test_metrics['auc']:.4f}
+- G-mean: {test_metrics['g_mean']:.2f}%
+- P-mean: {test_metrics['p_mean']:.2f}%
+- Specificity: {test_metrics['specificity']:.2f}%
+## Model Scores
+- Log-Likelihood: {scores['log_likelihood']:.2f}
+- BIC Score: {scores['bic']:.2f}
+- K2 Score: {scores['k2']:.2f}
+- BDeu Score: {scores['bdeu']:.2f}
+- BDs Score: {scores['bds']:.2f}
+## Network Structure
+- Total Nodes: {len(results['model'].nodes())}
+- Total Edges: {len(results['model'].edges())}
+- Network Edges: {list(results['model'].edges())[:10]}... (showing first 10)
+"""
+        return context
+    def generate_summary(self, analysis_results):
+        """
+        自動生成分析結果總結
+        Args:
+            analysis_results: 分析結果字典
+        Returns:
+            str: 總結文字
+        """
+        summary_prompt = """Based on the analysis results provided in the context, please generate a comprehensive summary that includes:
+1. **Model Overview**: Brief description of the model type and configuration
+2. **Performance Analysis**:
+   - Overall model performance on both training and test sets
+   - Comparison between training and test performance (overfitting/underfitting)
+   - Key strengths and weaknesses
+3. **Network Structure Insights**: What the learned structure tells us about variable relationships
+4. **Recommendations**: Specific suggestions for improvement
+5. **Limitations**: Important caveats and limitations to consider
+Format the summary in clear markdown with appropriate sections and bullet points."""
+        return self.get_response(summary_prompt, analysis_results)
+    def explain_metric(self, metric_name, analysis_results):
+        """
+        解釋特定指標
+        Args:
+            metric_name: 指標名稱
+            analysis_results: 分析結果字典
+        Returns:
+            str: 指標解釋
+        """
+        explain_prompt = f"""Please explain the following metric in the context of this analysis:
+Metric: {metric_name}
+Include:
+1. What this metric measures
+2. The value obtained in this analysis (training and test)
+3. How to interpret this value
+4. What it tells us about model performance
+5. How it relates to other metrics in the analysis"""
+        return self.get_response(explain_prompt, analysis_results)
+    def suggest_improvements(self, analysis_results):
+        """
+        提供改進建議
+        Args:
+            analysis_results: 分析結果字典
+        Returns:
+            str: 改進建議
+        """
+        improve_prompt = """Based on the current model performance and configuration, please provide specific, actionable recommendations for improvement.
+Consider:
+1. Feature engineering opportunities
+2. Algorithm selection
+3. Hyperparameter tuning
+4. Data quality issues
+5. Model complexity trade-offs
+Prioritize recommendations by potential impact."""
+        return self.get_response(improve_prompt, analysis_results)
+    def explain_network_structure(self, analysis_results):
+        """
+        解釋網路結構
+        Args:
+            analysis_results: 分析結果字典
+        Returns:
+            str: 網路結構解釋
+        """
+        structure_prompt = """Please explain the learned Bayesian Network structure:
+1. What are the key relationships (edges) discovered?
+2. What do these relationships tell us about the domain?
+3. Are there any surprising or interesting patterns?
+4. How does the structure relate to the target variable?
+5. What are the implications for prediction and inference?"""
+        return self.get_response(structure_prompt, analysis_results)
+    def compare_algorithms(self, analysis_results):
+        """
+        比較不同演算法
+        Args:
+            analysis_results: 分析結果字典
+        Returns:
+            str: 演算法比較
+        """
+        compare_prompt = f"""The current model uses the {analysis_results['parameters']['algorithm']} algorithm.
+Please:
+1. Explain the characteristics of this algorithm
+2. Compare it with other available algorithms (NB, TAN, CL, HC, PC)
+3. Discuss when this algorithm is most appropriate
+4. Suggest if a different algorithm might be better for this dataset
+5. Explain the trade-offs involved"""
+        return self.get_response(compare_prompt, analysis_results)
+    def predict_from_text(self, user_description, analyzer, target_variable, feature_list):
+        """
+        從文字描述中提取特徵並進行預測
+        Args:
+            user_description: 用戶的文字描述
+            analyzer: BayesianNetworkAnalyzer 實例
+            target_variable: 目標變數
+            feature_list: 模型使用的特徵列表
+        Returns:
+            str: AI 回應包含預測結果
+        """
+        # Step 1: 使用 LLM 從文字中提取結構化特徵
+        extraction_prompt = f"""
+You are a medical data analyst. Extract the following patient features from the description:
+Required features: {', '.join(feature_list)}
+User description: "{user_description}"
+Please extract the values in JSON format. If a feature is not mentioned, use "unknown".
+Return ONLY the JSON object, no other text.
+Example format:
+{{
+    "age": 65,
+    "size": 25,
+    "grade": 2,
+    "nodes": 1,
+    ...
+}}
+"""
+        # 呼叫 OpenAI API 提取特徵
+        response = self.client.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[
+                {"role": "system", "content": "You are a precise medical data extractor. Return only valid JSON."},
+                {"role": "user", "content": extraction_prompt}
+            ],
+            temperature=0.1
+        )
+        # 解析 JSON
+        extracted_features = json.loads(response.choices[0].message.content)
+        # Step 2: 移除 unknown 值
+        evidence_dict = {k: v for k, v in extracted_features.items()
+                         if v != "unknown" and k != target_variable}
+        # Step 3: 使用模型進行預測
+        prediction = analyzer.predict_single_instance(evidence_dict, target_variable)
+        # Step 4: 讓 LLM 生成易懂的回應
+        interpretation_prompt = f"""
+Based on the Bayesian Network model analysis:
+Patient features: {evidence_dict}
+Predicted death probability: {prediction['probability']:.2%}
+Risk level: {prediction['risk_level']}
+Please provide a clear, empathetic explanation including:
+1. A summary of the patient's key risk factors
+2. The predicted risk level and what it means
+3. Important considerations and limitations
+4. Recommendations for next steps
+Be professional but accessible. Use markdown formatting.
+"""
+        final_response = self.client.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[
+                {"role": "system", "content": "You are a compassionate medical AI assistant."},
+                {"role": "user", "content": interpretation_prompt}
+            ],
+            temperature=0.7
+        )
+        return final_response.choices[0].message.content
+    def reset_conversation(self):
+        """重置對話歷史"""
+        self.conversation_history = []

packages.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ graphviz

requirements.txt CHANGED Viewed

@@ -1,3 +1,9 @@
-altair
-pandas
-streamlit

+streamlit>=1.37.0
+pandas>=2.2.0
+plotly>=5.20.0
+scikit-learn>=1.5.0
+networkx>=3.3
+openai>=1.30.0
+graphviz>=0.20.3
+pgmpy==0.1.26
+numpy>=2.1.0,<3.0.0

utils.py ADDED Viewed

	@@ -0,0 +1,313 @@

+import plotly.graph_objects as go
+import plotly.express as px
+import pandas as pd
+import numpy as np
+import networkx as nx
+from plotly.subplots import make_subplots
+from graphviz import Digraph
+import base64
+def plot_roc_curve(fpr, tpr, auc, title="ROC Curve"):
+    """
+    繪製 ROC 曲線
+    Args:
+        fpr: False positive rate
+        tpr: True positive rate
+        auc: Area under curve
+        title: 圖表標題
+    Returns:
+        plotly figure
+    """
+    fig = go.Figure()
+    # ROC 曲線
+    fig.add_trace(go.Scatter(
+        x=fpr,
+        y=tpr,
+        mode='lines',
+        name=f'ROC Curve (AUC = {auc:.4f})',
+        line=dict(color='#2d6ca2', width=2)
+    ))
+    # 對角線(隨機分類器)
+    fig.add_trace(go.Scatter(
+        x=[0, 1],
+        y=[0, 1],
+        mode='lines',
+        name='Random Classifier',
+        line=dict(color='gray', width=1, dash='dash')
+    ))
+    fig.update_layout(
+        title=title,
+        xaxis_title='False Positive Rate',
+        yaxis_title='True Positive Rate',
+        width=600,
+        height=500,
+        template='plotly_white',
+        legend=dict(x=0.6, y=0.1)
+    )
+    return fig
+def plot_confusion_matrix(cm, title="Confusion Matrix"):
+    """
+    繪製混淆矩陣
+    Args:
+        cm: 混淆矩陣 (2x2 list)
+        title: 圖表標題
+    Returns:
+        plotly figure
+    """
+    # 轉換為 numpy array
+    cm_array = np.array(cm)
+    # 計算百分比
+    cm_percent = cm_array / cm_array.sum() * 100
+    # 創建標籤
+    labels = [
+        [f'{cm_array[i][j]}<br>({cm_percent[i][j]:.1f}%)'
+         for j in range(2)]
+        for i in range(2)
+    ]
+    fig = go.Figure(data=go.Heatmap(
+        z=cm_array,
+        x=['Predicted: 0', 'Predicted: 1'],
+        y=['Actual: 0', 'Actual: 1'],
+        text=labels,
+        texttemplate='%{text}',
+        textfont={"size": 14},
+        colorscale='Blues',
+        showscale=True
+    ))
+    fig.update_layout(
+        title=title,
+        width=500,
+        height=450,
+        template='plotly_white'
+    )
+    return fig
+def plot_probability_distribution(probs, title="Probability Distribution"):
+    """
+    繪製機率分佈圖
+    Args:
+        probs: 預測機率列表
+        title: 圖表標題
+    Returns:
+        plotly figure
+    """
+    fig = go.Figure()
+    fig.add_trace(go.Histogram(
+        x=probs,
+        nbinsx=20,
+        name='Predicted Probabilities',
+        marker=dict(
+            color='#2d6ca2',
+            line=dict(color='white', width=1)
+        )
+    ))
+    fig.update_layout(
+        title=title,
+        xaxis_title='Predicted Probability for Class 1',
+        yaxis_title='Frequency',
+        width=700,
+        height=400,
+        template='plotly_white',
+        showlegend=False
+    )
+    fig.update_xaxes(range=[0, 1])
+    return fig
+def generate_network_graph(model): # Pi
+    """
+    Generate a Graphviz tree from a BayesianNetwork model and return it as a Base64-encoded string.
+    Args:
+        model: BayesianNetwork 模型
+    Returns:
+        Base64-encoded PNG string
+    """
+    dot = Digraph(format='png', engine='dot')
+    dot.attr('node', style='filled', color='lightblue', shape='ellipse')
+    dot.attr(dpi='300')
+    # Add nodes and edges from the BayesianNetwork model
+    for node in model.nodes():
+        dot.node(node)
+    for edge in model.edges():
+        dot.edge(edge[1], edge[0])
+    # Render directly to binary and encode in Base64
+    png_data = dot.pipe(format='png')
+    tree_base64 = base64.b64encode(png_data).decode('utf-8')
+    return tree_base64
+def create_cpd_table(cpd):
+    """
+    創建條件機率表的 DataFrame
+    Args:
+        cpd: CPD 物件
+    Returns:
+        pandas DataFrame
+    """
+    if cpd is None:
+        return pd.DataFrame()
+    # 獲取變數資訊
+    variable = cpd.variable
+    evidence_vars = cpd.variables[1:] if len(cpd.variables) > 1 else []
+    # 如果是根節點(沒有父節點)
+    if not evidence_vars:
+        values = np.round(cpd.values.flatten(), 4)
+        df = pd.DataFrame(
+            {variable: values},
+            index=[f"{variable}({i})" for i in range(len(values))]
+        )
+        return df
+    # 有父節點的情況
+    evidence_card = cpd.cardinality[1:]
+    # 生成多層索引欄位
+    from itertools import product
+    column_values = list(product(*[range(card) for card in evidence_card]))
+    # 創建欄位名稱
+    columns = pd.MultiIndex.from_tuples(
+        [tuple(f"{var}({val})" for var, val in zip(evidence_vars, vals))
+         for vals in column_values],
+        names=evidence_vars
+    )
+    # 重塑 CPD 值
+    reshaped_values = cpd.values.reshape(len(cpd.values), -1)
+    reshaped_values = np.round(reshaped_values, 4)
+    # 創建 DataFrame
+    df = pd.DataFrame(
+        reshaped_values,
+        index=[f"{variable}({i})" for i in range(len(cpd.values))],
+        columns=columns
+    )
+    return df
+def create_metrics_comparison_table(train_metrics, test_metrics):
+    """
+    創建訓練集和測試集指標比較表
+    Args:
+        train_metrics: 訓練集指標字典
+        test_metrics: 測試集指標字典
+    Returns:
+        pandas DataFrame
+    """
+    metrics_data = {
+        'Metric': [
+            'Accuracy', 'Precision', 'Recall', 'F1-Score',
+            'AUC', 'G-mean', 'P-mean', 'Specificity'
+        ],
+        'Training Set': [
+            f"{train_metrics['accuracy']:.2f}%",
+            f"{train_metrics['precision']:.2f}%",
+            f"{train_metrics['recall']:.2f}%",
+            f"{train_metrics['f1']:.2f}%",
+            f"{train_metrics['auc']:.4f}",
+            f"{train_metrics['g_mean']:.2f}%",
+            f"{train_metrics['p_mean']:.2f}%",
+            f"{train_metrics['specificity']:.2f}%"
+        ],
+        'Test Set': [
+            f"{test_metrics['accuracy']:.2f}%",
+            f"{test_metrics['precision']:.2f}%",
+            f"{test_metrics['recall']:.2f}%",
+            f"{test_metrics['f1']:.2f}%",
+            f"{test_metrics['auc']:.4f}",
+            f"{test_metrics['g_mean']:.2f}%",
+            f"{test_metrics['p_mean']:.2f}%",
+            f"{test_metrics['specificity']:.2f}%"
+        ]
+    }
+    df = pd.DataFrame(metrics_data)
+    return df
+def export_results_to_json(results, filename="analysis_results.json"):
+    """
+    將結果匯出為 JSON 格式
+    Args:
+        results: 分析結果字典
+        filename: 檔案名稱
+    Returns:
+        JSON 字串
+    """
+    import json
+    # 移除無法序列化的物件
+    exportable_results = {
+        'parameters': results['parameters'],
+        'train_metrics': {
+            k: v for k, v in results['train_metrics'].items()
+            if k not in ['fpr', 'tpr', 'predicted_probs']
+        },
+        'test_metrics': {
+            k: v for k, v in results['test_metrics'].items()
+            if k not in ['fpr', 'tpr', 'predicted_probs']
+        },
+        'scores': results['scores'],
+        'network_edges': list(results['model'].edges()),
+        'timestamp': results['timestamp']
+    }
+    return json.dumps(exportable_results, indent=2)
+def calculate_performance_gap(train_metrics, test_metrics):
+    """
+    計算訓練集和測試集之間的效能差距
+    Args:
+        train_metrics: 訓練集指標
+        test_metrics: 測試集指標
+    Returns:
+        dict: 效能差距字典
+    """
+    gaps = {
+        'accuracy_gap': train_metrics['accuracy'] - test_metrics['accuracy'],
+        'precision_gap': train_metrics['precision'] - test_metrics['precision'],
+        'recall_gap': train_metrics['recall'] - test_metrics['recall'],
+        'f1_gap': train_metrics['f1'] - test_metrics['f1'],
+        'auc_gap': train_metrics['auc'] - test_metrics['auc']
+    }
+    # 判斷是否有過擬合
+    avg_gap = np.mean([abs(v) for v in gaps.values()])
+    overfitting_status = "High" if avg_gap > 10 else "Moderate" if avg_gap > 5 else "Low"
+    gaps['average_gap'] = avg_gap
+    gaps['overfitting_risk'] = overfitting_status
+    return gaps