Spaces:

Wen1201
/

bayesian-network

Sleeping

App Files Files Community

bayesian-network / app.py

Wen1201

Upload 3 files

0ee744a verified 5 months ago

raw

history blame contribute delete

31.6 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import plotly.graph_objects as go
	import plotly.express as px
	from io import BytesIO
	import base64
	import json
	from datetime import datetime
	import uuid

	# 頁面配置
	st.set_page_config(
	page_title="Bayesian Network Analysis System",
	page_icon="🔬",
	layout="wide",
	initial_sidebar_state="expanded"
	)

	# 自定義 CSS - 讓介面更像 Django
	st.markdown("""
	<style>
	/* Expander 樣式 - 類似 Django 的摺疊區域 */
	.streamlit-expanderHeader {
	background-color: #e8f1f8;
	border: 1px solid #b0cfe8;
	border-radius: 5px;
	font-weight: 600;
	color: #1b4f72;
	}

	.streamlit-expanderHeader:hover {
	background-color: #d0e7f8;
	}

	/* Checkbox 樣式 */
	.stCheckbox {
	padding: 2px 0;
	}

	/* Radio button 樣式 */
	.stRadio > label {
	font-weight: 600;
	color: #1b4f72;
	}

	/* 選擇框樣式 */
	.stSelectbox > label, .stNumberInput > label {
	font-weight: 600;
	color: #1b4f72;
	}

	/* 分隔線 */
	hr {
	margin: 1rem 0;
	border-top: 2px solid #b0cfe8;
	}

	/* 表單容器 */
	.element-container {
	margin-bottom: 0.5rem;
	}

	/* 摺疊內容區域 */
	.streamlit-expanderContent {
	background-color: #f8fbff;
	border: 1px solid #d0e4f5;
	border-top: none;
	padding: 1rem;
	}

	/* 按鈕樣式 */
	.stButton > button {
	width: 100%;
	border-radius: 20px;
	font-weight: 600;
	transition: all 0.3s ease;
	}

	.stButton > button:hover {
	transform: translateY(-2px);
	box-shadow: 0 4px 8px rgba(0,0,0,0.2);
	}
	</style>
	""", unsafe_allow_html=True)

	# 導入自定義模組
	from bn_core import BayesianNetworkAnalyzer
	from llm_assistant import LLMAssistant
	from utils import (
	plot_roc_curve,
	plot_confusion_matrix,
	plot_probability_distribution,
	generate_network_graph,
	create_cpd_table,
	export_results_to_json
	)

	# 初始化 session state
	if 'session_id' not in st.session_state:
	st.session_state.session_id = str(uuid.uuid4())
	if 'analysis_results' not in st.session_state:
	st.session_state.analysis_results = None
	if 'chat_history' not in st.session_state:
	st.session_state.chat_history = []
	if 'model_trained' not in st.session_state:
	st.session_state.model_trained = False

	# 標題
	st.title("🔬 Bayesian Network Analysis System")
	st.markdown("---")

	# Sidebar - OpenAI API Key
	with st.sidebar:
	st.header("⚙️ Configuration")

	api_key = st.text_input(
	"OpenAI API Key",
	type="password",
	help="Enter your OpenAI API key to use the AI assistant"
	)

	if api_key:
	st.session_state.api_key = api_key
	st.success("✅ API Key loaded")

	st.markdown("---")

	# 資料來源選擇
	st.subheader("📊 Data Source")
	data_source = st.radio(
	"Select data source:",
	["Use Default Dataset", "Upload Your Data"]
	)

	uploaded_file = None
	if data_source == "Upload Your Data":
	uploaded_file = st.file_uploader(
	"Upload CSV file",
	type=['csv'],
	help="Upload your dataset in CSV format"
	)

	# 主要內容區
	tab1, tab2 = st.tabs(["📈 Analysis", "💬 AI Assistant"])

	# Tab 1: 分析介面
	with tab1:
	col1, col2 = st.columns([2, 1])

	with col1:
	st.header("Model Configuration")

	# 載入資料
	if data_source == "Use Default Dataset":
	# 使用預設資料集
	@st.cache_data
	def load_default_data():
	# 這裡放入預設資料集的路徑
	df = pd.read_csv("BC_imputed_micerf_period13_fid_course_D4.csv")
	return df

	try:
	df = load_default_data()
	st.success(f"✅ Default dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")
	except:
	st.error("❌ Default dataset not found. Please upload your own data.")
	df = None
	else:
	if uploaded_file:
	df = pd.read_csv(uploaded_file)
	st.success(f"✅ Data loaded: {df.shape[0]} rows, {df.shape[1]} columns")
	else:
	st.info("👆 Please upload a CSV file to begin")
	df = None

	if df is not None:
	# 特徵選擇 - 使用 expander (可摺疊)
	st.subheader("🎯 Input Features")

	# 手動指定特徵類型 (針對預設乳癌資料集)
	if data_source == "Use Default Dataset":
	# 預設資料集的固定分類
	numeric_cols = ['size', 'stime'] # 只有這兩個是連續變數
	categorical_cols = [col for col in df.columns if col not in numeric_cols]
	else:
	# 上傳資料集才自動判斷
	numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
	categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

	# 二元分類變數(用於目標變數)
	binary_cols = [col for col in df.columns if df[col].nunique() == 2]

	col_feat1, col_feat2 = st.columns(2)


	with col_feat1:
	with st.expander("Continuous", expanded=False):
	st.caption("Select continuous features:")
	con_features = []
	for col in numeric_cols:
	if st.checkbox(col, value=False, key=f"con_{col}"):
	con_features.append(col)

	with col_feat2:
	with st.expander("Categorical", expanded=True):
	st.caption("Select categorical features:")
	cat_features = []
	for col in categorical_cols:
	# 預設勾選前幾個
	default_checked = categorical_cols.index(col) < 5 if len(categorical_cols) > 5 else True
	if st.checkbox(col, value=default_checked, key=f"cat_{col}"):
	cat_features.append(col)

	# 目標變數 - 放在特徵選擇下方
	st.markdown("---")

	col_target1, col_target2 = st.columns([1, 2])
	with col_target1:
	target_variable = st.selectbox(
	"Target Variable (Y):",
	options=binary_cols,
	help="Must be a binary classification variable"
	)

	with col_target2:
	test_fraction = st.number_input(
	"Test Dataset Proportion:",
	min_value=0.10,
	max_value=0.50,
	value=0.25,
	step=0.05,
	format="%.2f"
	)

	# 驗證選擇
	selected_features = cat_features + con_features
	if target_variable in selected_features:
	st.error("❌ Target variable cannot be in feature list!")
	st.stop()

	st.markdown("---")

	# 模型參數 - 使用更緊湊的佈局
	st.subheader("⚙️ Model Configuration")

	col_param1, col_param2 = st.columns(2)

	with col_param1:
	algorithm = st.radio(
	"Network Structure:",
	options=['NB', 'TAN', 'CL', 'HC', 'PC'],
	format_func=lambda x: {
	'NB': 'Naive Bayes (NB)',
	'TAN': 'Tree-Augmented Naive Bayes (TAN)',
	'CL': 'Chow-Liu',
	'HC': 'Hill Climbing',
	'PC': 'PC'
	}[x],
	help="Select structure learning algorithm"
	)

	# 條件性參數 - HC
	if algorithm == 'HC':
	score_method = st.selectbox(
	"Scoring Method:",
	options=['BIC', 'AIC', 'K2', 'BDeu', 'BDs'],
	help="Select scoring method for Hill Climbing"
	)
	else:
	score_method = 'BIC'

	# 條件性參數 - PC
	if algorithm == 'PC':
	sig_level = st.number_input(
	"Significance Level:",
	min_value=0.01,
	max_value=1.0,
	value=0.05,
	step=0.01,
	help="Significance level for PC algorithm"
	)
	else:
	sig_level = 0.05

	with col_param2:
	estimator = st.radio(
	"Parameter Estimator:",
	options=['ml', 'bn'],
	format_func=lambda x: {
	'ml': 'MaximumLikelihoodEstimator',
	'bn': 'BayesianEstimator'
	}[x],
	help="Select parameter estimation method"
	)

	if estimator == 'bn':
	equivalent_sample_size = st.number_input(
	"Equivalent Sample Size:",
	min_value=1,
	value=3,
	step=1,
	help="Prior strength for Bayesian estimation"
	)
	else:
	equivalent_sample_size = 3

	# Decision (如果是預設資料集才顯示)
	if data_source == "Use Default Dataset":
	decision = st.selectbox(
	"Decision:",
	options=['OverAll', 'Exposed', 'Unexposed'],
	index=0,
	help="Analysis subset selection"
	)
	else:
	decision = 'OverAll'

	# Provide Evidence - 可摺疊區域
	st.markdown("---")
	with st.expander("Provide Evidence", expanded=False):
	st.caption("Enter evidence values for inference (optional):")

	evidence_cols = st.columns(2)
	evidence_dict = {}

	# 為每個非目標變數創建輸入框
	all_vars = [v for v in selected_features if v != target_variable]

	for idx, var in enumerate(all_vars):
	with evidence_cols[idx % 2]:
	val = st.text_input(
	f"{var}:",
	value="",
	key=f"evidence_{var}",
	help=f"Enter value for {var} (leave empty to ignore)"
	)
	if val.strip():
	evidence_dict[var] = val.strip()

	# 進階參數 - 摺疊區域
	with st.expander("Advanced Parameters", expanded=False):
	n_bins = st.slider(
	"Number of Bins (for continuous variables):",
	min_value=3,
	max_value=20,
	value=10,
	step=1,
	help="Number of bins for discretizing continuous features"
	)


	# 執行分析按鈕
	st.markdown("---")

	col_btn1, col_btn2 = st.columns([3, 1])

	with col_btn1:
	run_button = st.button("🚀 Run Analysis", type="primary", use_container_width=True)

	with col_btn2:
	if st.button("🔄 Reset", use_container_width=True):
	st.session_state.analysis_results = None
	st.session_state.model_trained = False
	st.session_state.chat_history = []
	st.rerun()

	# 分析步驟說明
	with st.expander("ℹ️ Analysis Steps", expanded=False):
	st.markdown("""
	Process:
	1. Split data (train/test)
	2. Learn network structure
	3. Process features (bins from train)
	4. Estimate parameters
	5. Evaluate performance

	Note: Test set bins are derived from training set to prevent data leakage.
	""")

	if run_button:
	# 驗證
	if not selected_features:
	st.error("❌ Please select at least one feature!")
	st.stop()

	if target_variable in selected_features:
	st.error("❌ Target variable cannot be in feature list!")
	st.stop()

	with st.spinner("🔄 Training Bayesian Network..."):
	progress_bar = st.progress(0)
	status_text = st.empty()

	try:
	# 初始化分析器
	status_text.text("📊 Initializing analyzer...")
	progress_bar.progress(10)

	analyzer = BayesianNetworkAnalyzer(
	session_id=st.session_state.session_id
	)

	status_text.text(f"📐 Learning {algorithm} structure...")
	progress_bar.progress(30)

	# 執行分析
	results = analyzer.run_analysis(
	df=df,
	cat_features=cat_features,
	con_features=con_features,
	target_variable=target_variable,
	test_fraction=test_fraction,
	algorithm=algorithm,
	estimator=estimator,
	equivalent_sample_size=equivalent_sample_size,
	score_method=score_method,
	sig_level=sig_level,
	n_bins=n_bins
	)

	status_text.text("✅ Analysis completed!")
	progress_bar.progress(100)

	# 儲存結果
	st.session_state.analysis_results = results
	st.session_state.model_trained = True
	# 🆕 儲存 analyzer 到 session_state（用於個人化預測）
	st.session_state.analyzer = analyzer

	st.success("✅ Analysis completed successfully!")
	st.balloons()


	# 清空進度
	import time
	time.sleep(1)
	progress_bar.empty()
	status_text.empty()

	st.rerun()

	except Exception as e:
	st.error(f"❌ Error during analysis: {str(e)}")
	st.exception(e)
	progress_bar.empty()
	status_text.empty()

	with col2:
	st.header("Quick Stats")

	if df is not None:
	st.metric("Total Samples", df.shape[0])
	st.metric("Total Features", df.shape[1])
	st.metric("Selected Features", len(selected_features) if 'selected_features' in locals() else 0)

	if st.session_state.model_trained:
	st.success("✅ Model Trained")
	else:
	st.info("⏳ Awaiting Training")

	# 顯示結果
	if st.session_state.analysis_results:
	st.markdown("---")
	st.header("📊 Analysis Results")

	results = st.session_state.analysis_results

	# 使用 tabs 來組織結果
	result_tabs = st.tabs([
	"🕸️ Network Structure",
	"📈 Performance Metrics",
	"📋 CPD Tables",
	"📊 Model Scores"
	])

	# Tab 1: 網路結構
	with result_tabs[0]:
	network_fig = generate_network_graph(results['model'])
	st.plotly_chart(network_fig, use_container_width=True)

	# 顯示邊的列表
	with st.expander("View Network Edges", expanded=False):
	edges = list(results['model'].edges())
	st.write(f"Total edges: {len(edges)}")

	# 每行顯示 3 個邊
	for i in range(0, len(edges), 3):
	cols = st.columns(3)
	for j, col in enumerate(cols):
	if i + j < len(edges):
	edge = edges[i + j]
	col.markdown(f"{edge[0]} → {edge[1]}")

	# Tab 2: 效能指標
	with result_tabs[1]:
	col_m1, col_m2 = st.columns(2)

	with col_m1:
	st.markdown("### Training Set")
	train_metrics = results['train_metrics']

	# 使用 metrics 卡片
	metric_cols = st.columns(4)
	metric_cols[0].metric("Accuracy", f"{train_metrics['accuracy']:.2f}%")
	metric_cols[1].metric("Precision", f"{train_metrics['precision']:.2f}%")
	metric_cols[2].metric("Recall", f"{train_metrics['recall']:.2f}%")
	metric_cols[3].metric("F1-Score", f"{train_metrics['f1']:.2f}%")

	metric_cols2 = st.columns(4)
	metric_cols2[0].metric("AUC", f"{train_metrics['auc']:.4f}")
	metric_cols2[1].metric("G-mean", f"{train_metrics['g_mean']:.2f}%")
	metric_cols2[2].metric("P-mean", f"{train_metrics['p_mean']:.2f}%")
	metric_cols2[3].metric("Specificity", f"{train_metrics['specificity']:.2f}%")

	# 混淆矩陣
	with st.expander("Confusion Matrix", expanded=True):
	conf_fig_train = plot_confusion_matrix(
	train_metrics['confusion_matrix'],
	title="Training Set"
	)
	st.plotly_chart(conf_fig_train, use_container_width=True)

	# ROC Curve
	with st.expander("ROC Curve", expanded=False):
	roc_fig_train = plot_roc_curve(
	train_metrics['fpr'],
	train_metrics['tpr'],
	train_metrics['auc'],
	title="Training Set"
	)
	st.plotly_chart(roc_fig_train, use_container_width=True)

	with col_m2:
	st.markdown("### Test Set")
	test_metrics = results['test_metrics']

	metric_cols = st.columns(4)
	metric_cols[0].metric("Accuracy", f"{test_metrics['accuracy']:.2f}%")
	metric_cols[1].metric("Precision", f"{test_metrics['precision']:.2f}%")
	metric_cols[2].metric("Recall", f"{test_metrics['recall']:.2f}%")
	metric_cols[3].metric("F1-Score", f"{test_metrics['f1']:.2f}%")

	metric_cols2 = st.columns(4)
	metric_cols2[0].metric("AUC", f"{test_metrics['auc']:.4f}")
	metric_cols2[1].metric("G-mean", f"{test_metrics['g_mean']:.2f}%")
	metric_cols2[2].metric("P-mean", f"{test_metrics['p_mean']:.2f}%")
	metric_cols2[3].metric("Specificity", f"{test_metrics['specificity']:.2f}%")

	# 混淆矩陣
	with st.expander("Confusion Matrix", expanded=True):
	conf_fig_test = plot_confusion_matrix(
	test_metrics['confusion_matrix'],
	title="Test Set"
	)
	st.plotly_chart(conf_fig_test, use_container_width=True)

	# ROC Curve
	with st.expander("ROC Curve", expanded=False):
	roc_fig_test = plot_roc_curve(
	test_metrics['fpr'],
	test_metrics['tpr'],
	test_metrics['auc'],
	title="Test Set"
	)
	st.plotly_chart(roc_fig_test, use_container_width=True)

	# Tab 3: 條件機率表
	with result_tabs[2]:
	selected_node = st.selectbox(
	"Select a node to view its CPD:",
	options=list(results['cpds'].keys())
	)

	if selected_node:
	cpd_df = create_cpd_table(results['cpds'][selected_node])
	st.dataframe(cpd_df, use_container_width=True)

	# 下載按鈕
	csv = cpd_df.to_csv()
	st.download_button(
	label="📥 Download CPD as CSV",
	data=csv,
	file_name=f"cpd_{selected_node}.csv",
	mime="text/csv"
	)

	# Tab 4: 模型評分
	with result_tabs[3]:
	scores = results['scores']

	score_cols = st.columns(5)
	score_cols[0].metric("Log-Likelihood", f"{scores['log_likelihood']:.2f}")
	score_cols[1].metric("BIC Score", f"{scores['bic']:.2f}")
	score_cols[2].metric("K2 Score", f"{scores['k2']:.2f}")
	score_cols[3].metric("BDeu Score", f"{scores['bdeu']:.2f}")
	score_cols[4].metric("BDs Score", f"{scores['bds']:.2f}")

	# 參數摘要
	with st.expander("Analysis Parameters", expanded=True):
	params = results['parameters']

	col1, col2, col3 = st.columns(3)

	with col1:
	st.markdown("Algorithm Settings")
	st.write(f"- Algorithm: {params['algorithm']}")
	st.write(f"- Estimator: {params['estimator']}")
	st.write(f"- Test Fraction: {params['test_fraction']:.2%}")

	with col2:
	st.markdown("Feature Information")
	st.write(f"- Total Features: {params['n_features']}")
	st.write(f"- Categorical: {len(params['cat_features'])}")
	st.write(f"- Continuous: {len(params['con_features'])}")
	st.write(f"- Target: {params['target_variable']}")

	with col3:
	st.markdown("Other Parameters")
	st.write(f"- Bins: {params['n_bins']}")
	st.write(f"- Score Method: {params['score_method']}")
	st.write(f"- Significance Level: {params['sig_level']}")
	st.write(f"- Equivalent Sample Size: {params['equivalent_sample_size']}")

	# 匯出結果
	with st.expander("Export Results", expanded=False):
	col1, col2 = st.columns(2)

	with col1:
	# 原本的 JSON 下載
	result_json = export_results_to_json(results)
	st.download_button(
	label="📥 Download Full Results (JSON)",
	data=result_json,
	file_name=f"bn_analysis_{results['timestamp'][:10]}.json",
	mime="application/json"
	)

	with col2:
	# 🆕 新增：下載模型
	if st.button("💾 Save Trained Model"):
	if 'analyzer' in st.session_state:
	import tempfile
	import os

	# 創建臨時文件
	with tempfile.NamedTemporaryFile(delete=False, suffix='.pkl') as tmp_file:
	model_path = tmp_file.name
	st.session_state.analyzer.save_model(model_path)

	# 讀取並提供下載
	with open(model_path, 'rb') as f:
	st.download_button(
	label="📥 Download Model File (.pkl)",
	data=f,
	file_name=f"bn_model_{results['timestamp'][:10]}.pkl",
	mime="application/octet-stream",
	key="download_model_btn"
	)

	# 清理臨時文件
	os.unlink(model_path)
	else:
	st.error("❌ Analyzer not found in session state")


	# Tab 2: AI 助手
	with tab2:
	st.header("💬 AI Analysis Assistant")

	if not st.session_state.get('api_key'):
	st.warning("⚠️ Please enter your OpenAI API Key in the sidebar to use the AI assistant.")
	elif not st.session_state.model_trained:
	st.info("ℹ️ Please train a model first in the Analysis tab to use the AI assistant.")
	else:
	# 初始化 LLM 助手
	if 'llm_assistant' not in st.session_state:
	st.session_state.llm_assistant = LLMAssistant(
	api_key=st.session_state.api_key,
	session_id=st.session_state.session_id
	)

	# 顯示聊天歷史
	chat_container = st.container()

	with chat_container:
	for message in st.session_state.chat_history:
	with st.chat_message(message["role"]):
	st.markdown(message["content"])

	# 聊天輸入
	if prompt := st.chat_input("Ask me anything about your analysis results..."):
	# 添加用戶訊息
	st.session_state.chat_history.append({
	"role": "user",
	"content": prompt
	})

	with st.chat_message("user"):
	st.markdown(prompt)

	# 🆕 檢測是否為個人化預測請求
	prediction_keywords = ['predict', 'risk', 'patient', 'case', 'my risk', 'calculate', 'probability', 'chance']
	is_prediction_request = any(keyword in prompt.lower() for keyword in prediction_keywords)

	# 獲取 AI 回應
	with st.chat_message("assistant"):
	with st.spinner("Analyzing..." if is_prediction_request else "Thinking..."):
	try:
	if is_prediction_request:
	# 🆕 執行個人化預測
	# 從 session_state 取得必要資訊
	results = st.session_state.analysis_results

	# 重建 analyzer（需要載入模型狀態）
	# ⚠️ 這裡需要先把 analyzer 存在 session_state 中
	if 'analyzer' not in st.session_state:
	st.error("❌ Model not found. Please train a model first in the Analysis tab.")
	response = "I cannot perform predictions because the model is not available. Please train a model first."
	else:
	response = st.session_state.llm_assistant.predict_from_text(
	user_description=prompt,
	analyzer=st.session_state.analyzer,
	target_variable=results['parameters']['target_variable'],
	feature_list=results['parameters']['cat_features'] + results['parameters']['con_features']
	)
	else:
	# 原本的一般對話
	response = st.session_state.llm_assistant.get_response(
	user_message=prompt,
	analysis_results=st.session_state.analysis_results
	)

	st.markdown(response)

	except Exception as e:
	error_msg = f"❌ Error: {str(e)}\n\nPlease try rephrasing your question or check the model status."
	st.error(error_msg)
	response = error_msg

	# 添加助手訊息
	st.session_state.chat_history.append({
	"role": "assistant",
	"content": response
	})

	# 快速問題按鈕
	st.markdown("---")
	st.subheader("💡 Quick Questions")

	quick_questions = [
	"📊 Give me a summary of the analysis results",
	"🎯 What is the model's performance?",
	"🔍 Explain the Bayesian Network structure",
	"⚠️ What are the limitations of this model?",
	"💡 How can I improve the model?"
	]

	cols = st.columns(len(quick_questions))
	for idx, (col, question) in enumerate(zip(cols, quick_questions)):
	if col.button(question, key=f"quick_{idx}"):
	st.session_state.chat_history.append({
	"role": "user",
	"content": question
	})

	response = st.session_state.llm_assistant.get_response(
	user_message=question,
	analysis_results=st.session_state.analysis_results
	)

	st.session_state.chat_history.append({
	"role": "assistant",
	"content": response
	})

	st.rerun()

	# Footer
	st.markdown("---")
	st.markdown(
	"""
	<div style='text-align: center'>
	<p>🔬 Bayesian Network Analysis System \| Built with Streamlit</p>
	<p>Powered by OpenAI GPT-4 \| Session ID: {}</p>
	</div>
	""".format(st.session_state.session_id[:8]),
	unsafe_allow_html=True
	)