Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import pandas as pd | |
| import torch | |
| from transformers import ( | |
| BertTokenizer, BertForSequenceClassification, | |
| AutoTokenizer, AutoModelForSequenceClassification, | |
| TrainingArguments, Trainer, DataCollatorWithPadding | |
| ) | |
| from peft import ( | |
| LoraConfig, AdaLoraConfig, AdaptionPromptConfig, PrefixTuningConfig, | |
| get_peft_model, TaskType, prepare_model_for_kbit_training | |
| ) | |
| from datasets import Dataset, DatasetDict | |
| from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.utils import resample | |
| from torch import nn | |
| import os | |
| from datetime import datetime | |
| import gc | |
| import numpy as np | |
# Environment tweaks: silence tokenizers fork warnings and reduce CUDA
# allocator fragmentation for repeated train/free cycles.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
torch.backends.cudnn.benchmark = False
if torch.cuda.is_available():
    torch.cuda.empty_cache()
# Global state shared across the Gradio callbacks
trained_models = {}  # model_id -> {'model', 'tokenizer', 'results', 'baseline', 'config', ...}
model_counter = 0  # count of first-stage models trained in this session
baseline_results = {}  # "<base_model>_baseline" -> baseline metric dict
baseline_model_cache = {}  # model_name -> loaded un-finetuned model (reused by predict/eval)
baseline_performance_cache = {}  # "<base_model>_baseline" -> cached baseline metric dict
second_stage_models = {}  # stores second-stage (stage-2) fine-tuned models
def calculate_improvement(baseline_val, finetuned_val):
    """Return the relative improvement of finetuned_val over baseline_val, in percent.

    A zero baseline cannot be used as a divisor: return +inf when the
    fine-tuned value rose above it, and 0.0 when there was no change.
    """
    if baseline_val == 0:
        return float('inf') if finetuned_val > 0 else 0.0
    return (finetuned_val - baseline_val) / baseline_val * 100
def format_improve(val):
    """Render an improvement percentage as a signed string.

    +inf means the baseline metric was zero, so no ratio can be reported.
    """
    return "N/A (baseline=0)" if val == float('inf') else f"{val:+.1f}%"
def thorough_memory_cleanup():
    """Aggressively reclaim Python and CUDA memory between training runs."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    torch.cuda.synchronize()
def compute_metrics(pred):
    """Compute binary-classification metrics from a Trainer EvalPrediction.

    Returns accuracy/f1/precision/recall plus sensitivity, specificity and
    the four raw confusion-matrix counts. Any failure yields an all-zero
    metric dict so evaluation never crashes the training loop.
    """
    try:
        y_true = pred.label_ids
        y_pred = pred.predictions.argmax(-1)
        prec, rec, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average='binary', pos_label=1, zero_division=0
        )
        acc = accuracy_score(y_true, y_pred)
        cm = confusion_matrix(y_true, y_pred)
        # Only unpack when both classes appear; otherwise zero all counts.
        tn, fp, fn, tp = cm.ravel() if cm.shape == (2, 2) else (0, 0, 0, 0)
        sens = tp / (tp + fn) if (tp + fn) > 0 else 0
        spec = tn / (tn + fp) if (tn + fp) > 0 else 0
        return {
            'accuracy': acc, 'f1': f1, 'precision': prec, 'recall': rec,
            'sensitivity': sens, 'specificity': spec,
            'tp': int(tp), 'tn': int(tn), 'fp': int(fp), 'fn': int(fn)
        }
    except Exception as e:
        print(f"Error in compute_metrics: {e}")
        return {
            'accuracy': 0, 'f1': 0, 'precision': 0, 'recall': 0,
            'sensitivity': 0, 'specificity': 0, 'tp': 0, 'tn': 0, 'fp': 0, 'fn': 0
        }
def evaluate_baseline(model, tokenizer, test_dataset, device, is_llama=False):
    """Run inference over a pre-tokenised dataset and return the same
    metric dict schema as compute_metrics (no gradient, batch_size=16).

    For Llama the batch follows ``model.device`` (the model may be sharded
    via ``device_map``); otherwise the supplied ``device`` is used.
    """
    from torch.utils.data import DataLoader
    model.eval()
    collected_preds = []
    collected_labels = []

    def _collate(items):
        # Stack the pre-tokenised fields into batch tensors.
        return {
            'input_ids': torch.stack([torch.tensor(it['input_ids']) for it in items]),
            'attention_mask': torch.stack([torch.tensor(it['attention_mask']) for it in items]),
            'labels': torch.tensor([it['label'] for it in items])
        }

    loader = DataLoader(test_dataset, batch_size=16, collate_fn=_collate)
    with torch.no_grad():
        for batch in loader:
            gold = batch.pop('labels')
            dest = model.device if is_llama else device
            out = model(**{k: v.to(dest) for k, v in batch.items()})
            collected_preds.extend(torch.argmax(out.logits, dim=-1).cpu().numpy())
            collected_labels.extend(gold.numpy())
    prec, rec, f1, _ = precision_recall_fscore_support(
        collected_labels, collected_preds, average='binary', pos_label=1, zero_division=0
    )
    acc = accuracy_score(collected_labels, collected_preds)
    cm = confusion_matrix(collected_labels, collected_preds)
    tn, fp, fn, tp = cm.ravel() if cm.shape == (2, 2) else (0, 0, 0, 0)
    sens = tp / (tp + fn) if (tp + fn) > 0 else 0
    spec = tn / (tn + fp) if (tn + fp) > 0 else 0
    return {
        'accuracy': acc, 'f1': f1, 'precision': prec, 'recall': rec,
        'sensitivity': sens, 'specificity': spec,
        'tp': int(tp), 'tn': int(tn), 'fp': int(fp), 'fn': int(fn)
    }
class WeightedTrainer(Trainer):
    """Trainer variant supporting class-weighted cross-entropy and an
    optional focal-loss modulation for the 2-class setup."""

    def __init__(self, *args, class_weights=None, use_focal_loss=False, focal_gamma=2.0, **kwargs):
        super().__init__(*args, **kwargs)
        # Optional tensor of per-class weights (None = unweighted loss).
        self.class_weights = class_weights
        self.use_focal_loss = use_focal_loss
        self.focal_gamma = focal_gamma

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Weighted CE (optionally focal) over 2-class logits."""
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        weights = None
        if self.class_weights is not None:
            # Match the logits' dtype/device (e.g. fp16 on GPU for Llama).
            weights = self.class_weights.to(logits.dtype).to(logits.device)
        if self.use_focal_loss:
            per_sample_ce = nn.CrossEntropyLoss(reduction='none')(
                logits.view(-1, 2), labels.view(-1)
            )
            # Focal modulation (1 - p_t)^gamma down-weights easy examples.
            modulator = (1 - torch.exp(-per_sample_ce)) ** self.focal_gamma
            focal = modulator * per_sample_ce
            if weights is not None:
                focal = focal * weights[labels.view(-1)]
            loss = focal.mean()
        else:
            loss = nn.CrossEntropyLoss(weight=weights)(logits.view(-1, 2), labels.view(-1))
        return (loss, outputs) if return_outputs else loss
def train_model(csv_file, base_model, method, num_epochs, batch_size, learning_rate,
                weight_decay, dropout, lora_r, lora_alpha, lora_dropout,
                adalora_init_r, adalora_tinit, adalora_tfinal, adalora_deltaT,
                adapter_len, prefix_len, best_metric):
    """Train a PEFT fine-tuned classifier and compare it against the un-tuned baseline.

    Args:
        csv_file: uploaded file object exposing ``.name``; the CSV must hold a
            text column (``Text``/``text``) and a label column (``label``/``nbcd``).
        base_model: UI model choice ("BERT-base" or "Llama-3.2-1B").
        method: PEFT method key ("lora", "adalora", "adapter", "prefix",
            "prompt", "bitfit"); anything else falls back to full fine-tuning.
        num_epochs / batch_size / learning_rate / weight_decay / dropout:
            standard training hyper-parameters (UI values, coerced below).
        lora_r / lora_alpha / lora_dropout: LoRA hyper-parameters.
        adalora_init_r / adalora_tinit / adalora_tfinal / adalora_deltaT:
            AdaLoRA rank-allocation schedule parameters.
        adapter_len / prefix_len: adapter length / number of virtual tokens.
        best_metric: metric key used to select the best checkpoint.

    Returns:
        Tuple of four strings: (training log, baseline report, fine-tuned
        report, comparison report). On error the first element carries the
        traceback and the other three are empty.

    BUGFIX: the Llama info line used to interpolate an undefined
    ``weight_boost`` variable, raising NameError on every Llama run.
    """
    global trained_models, model_counter, baseline_results, baseline_performance_cache
    thorough_memory_cleanup()
    print(f"🧹 GPU 記憶體清理完成")
    # UI label -> Hugging Face hub model id.
    model_mapping = {
        "BERT-base": "bert-base-uncased",
        "Llama-3.2-1B": "meta-llama/Llama-3.2-1B",
    }
    model_name = model_mapping.get(base_model, "bert-base-uncased")
    is_llama = "llama" in model_name.lower()
    try:
        if csv_file is None:
            return "❌ 請上傳 CSV", "", "", ""
        df = pd.read_csv(csv_file.name)
        # Accept either capitalised or lower-case column names.
        text_col = 'Text' if 'Text' in df.columns else 'text'
        label_col = 'label' if 'label' in df.columns else 'nbcd'
        if text_col not in df.columns or label_col not in df.columns:
            return f"❌ 需要 {text_col} 和 {label_col} 欄位", "", "", ""
        df_clean = pd.DataFrame({
            'text': df[text_col].astype(str),
            'label': df[label_col].astype(int)
        }).dropna()
        avg_length = df_clean['text'].str.len().mean()
        min_length = df_clean['text'].str.len().min()
        max_length = df_clean['text'].str.len().max()
        n0_original = int(sum(df_clean['label'] == 0))
        n1_original = int(sum(df_clean['label'] == 1))
        if n1_original == 0:
            return "❌ 無死亡樣本", "", "", ""
        ratio_original = n0_original / n1_original
        info = f"📊 原始資料: {len(df_clean)} 筆\n"
        info += f"📏 文本長度: 平均 {avg_length:.0f} | 最小 {min_length} | 最大 {max_length}\n"
        info += f"📈 原始分布 - 存活: {n0_original} | 死亡: {n1_original} (比例 {ratio_original:.2f}:1)\n"
        # Class balancing is applied for every base model (the original code
        # had an always-true `if True:` left over from a Llama-only gate).
        info += f"\n⚖️ 資料平衡策略:執行平衡處理...\n"
        df_class_0 = df_clean[df_clean['label'] == 0]
        df_class_1 = df_clean[df_clean['label'] == 1]
        # BERT and Llama use different balancing targets.
        target_n = 500 if not is_llama else 700
        if len(df_class_0) > target_n:
            # Undersample the majority (survivor) class without replacement.
            df_class_0_balanced = resample(df_class_0, n_samples=target_n, random_state=42, replace=False)
            info += f" ✅ Class 0 欠採樣: {len(df_class_0)} → {len(df_class_0_balanced)} 筆\n"
        else:
            df_class_0_balanced = df_class_0
            info += f" ⚠️ Class 0 樣本數不足,保持 {len(df_class_0)} 筆\n"
        if len(df_class_1) < target_n:
            # Oversample the minority (death) class with replacement.
            df_class_1_balanced = resample(df_class_1, n_samples=target_n, random_state=42, replace=True)
            info += f" ✅ Class 1 過採樣: {len(df_class_1)} → {len(df_class_1_balanced)} 筆\n"
        else:
            df_class_1_balanced = df_class_1
            info += f" ⚠️ Class 1 樣本數充足,保持 {len(df_class_1)} 筆\n"
        df_clean = pd.concat([df_class_0_balanced, df_class_1_balanced])
        df_clean = df_clean.sample(frac=1, random_state=42).reset_index(drop=True)
        n0 = int(sum(df_clean['label'] == 0))
        n1 = int(sum(df_clean['label'] == 1))
        ratio = n0 / n1
        info += f"\n📊 平衡後資料: {len(df_clean)} 筆\n"
        info += f"📈 平衡後分布 - 存活: {n0} | 死亡: {n1} (比例 {ratio:.2f}:1)\n"
        # Data is balanced, so both classes get equal loss weight.
        w0 = 1.0
        w1 = 1.0
        info += f"🎯 類別權重: {w0:.4f} / {w1:.4f} (資料已平衡,使用相等權重)\n"
        info += f"🤖 模型: {base_model}\n"
        info += f"🔧 方法: {method.upper()}"
        if is_llama:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            if tokenizer.pad_token is None:
                # Llama ships without a pad token; reuse EOS for padding.
                tokenizer.pad_token = tokenizer.eos_token
                tokenizer.pad_token_id = tokenizer.eos_token_id
            max_length = 512
        else:
            tokenizer = BertTokenizer.from_pretrained(model_name)
            max_length = 256
        dataset = Dataset.from_pandas(df_clean[['text', 'label']])
        def preprocess(ex):
            return tokenizer(ex['text'], truncation=True, padding='max_length', max_length=max_length)
        tokenized = dataset.map(preprocess, batched=True, remove_columns=['text'])
        split = tokenized.train_test_split(test_size=0.2, seed=42)
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        info += f"\n裝置: {'GPU ✅' if torch.cuda.is_available() else 'CPU ⚠️'}"
        # Evaluate the untouched baseline once per base model, then cache it.
        baseline_key = f"{base_model}_baseline"
        if baseline_key in baseline_performance_cache:
            info += f"\n✅ 使用快取的 Baseline 評估結果\n"
            baseline_perf = baseline_performance_cache[baseline_key]
        else:
            info += f"\n🔍 首次評估 Baseline 模型...\n"
            if is_llama:
                baseline_model = AutoModelForSequenceClassification.from_pretrained(
                    model_name, num_labels=2,
                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                    device_map="auto" if torch.cuda.is_available() else None
                )
                baseline_model.config.pad_token_id = tokenizer.pad_token_id
            else:
                baseline_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
                baseline_model = baseline_model.to(device)
            baseline_perf = evaluate_baseline(baseline_model, tokenizer, split['test'], device, is_llama=is_llama)
            baseline_performance_cache[baseline_key] = baseline_perf
            baseline_results[baseline_key] = baseline_perf
            del baseline_model
            thorough_memory_cleanup()
        info += f"\n\n🔧 套用 {method.upper()} 微調..."
        if is_llama:
            model = AutoModelForSequenceClassification.from_pretrained(
                model_name, num_labels=2,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                device_map="auto" if torch.cuda.is_available() else None
            )
            model.config.pad_token_id = tokenizer.pad_token_id
        else:
            model = BertForSequenceClassification.from_pretrained(
                model_name, num_labels=2,
                hidden_dropout_prob=dropout,
                attention_probs_dropout_prob=dropout
            )
        peft_applied = False
        # Dispatch on the requested PEFT method.
        if method == "lora":
            if is_llama:
                config = LoraConfig(
                    task_type=TaskType.SEQ_CLS,
                    r=int(lora_r),
                    lora_alpha=int(lora_alpha),
                    lora_dropout=lora_dropout,
                    target_modules=["q_proj", "v_proj"],
                    bias="none"
                )
            else:
                config = LoraConfig(
                    task_type=TaskType.SEQ_CLS,
                    r=int(lora_r),
                    lora_alpha=int(lora_alpha),
                    lora_dropout=lora_dropout,
                    target_modules=["query", "value"],
                    bias="none"
                )
            model = get_peft_model(model, config)
            peft_applied = True
            info += f"\n✅ LoRA 已套用(r={int(lora_r)}, alpha={int(lora_alpha)})"
        elif method == "adalora":
            # Clamp the AdaLoRA rank-allocation schedule to the real step budget.
            steps_per_epoch = len(split['train']) // int(batch_size)
            total_steps = steps_per_epoch * int(num_epochs)
            adjusted_tinit = min(int(adalora_tinit), int(total_steps * 0.2))
            adjusted_tfinal = min(int(adalora_tfinal), int(total_steps * 0.9))
            if adjusted_tinit >= adjusted_tfinal:
                adjusted_tinit = int(total_steps * 0.1)
                adjusted_tfinal = int(total_steps * 0.8)
            info += f"\n📊 AdaLoRA 步數調整:\n"
            info += f" 總訓練步數: {total_steps}\n"
            info += f" tinit: {int(adalora_tinit)} → {adjusted_tinit}\n"
            info += f" tfinal: {int(adalora_tfinal)} → {adjusted_tfinal}\n"
            if is_llama:
                config = AdaLoraConfig(
                    task_type=TaskType.SEQ_CLS,
                    r=int(lora_r),
                    lora_alpha=int(lora_alpha),
                    lora_dropout=lora_dropout,
                    target_modules=["q_proj", "v_proj"],
                    init_r=int(adalora_init_r),
                    tinit=adjusted_tinit,
                    tfinal=adjusted_tfinal,
                    deltaT=int(adalora_deltaT)
                )
            else:
                config = AdaLoraConfig(
                    task_type=TaskType.SEQ_CLS,
                    r=int(lora_r),
                    lora_alpha=int(lora_alpha),
                    lora_dropout=lora_dropout,
                    target_modules=["query", "value"],
                    init_r=int(adalora_init_r),
                    tinit=adjusted_tinit,
                    tfinal=adjusted_tfinal,
                    deltaT=int(adalora_deltaT)
                )
            model = get_peft_model(model, config)
            peft_applied = True
            info += f"\n✅ AdaLoRA 已套用(r={int(lora_r)}, alpha={int(lora_alpha)}, init_r={int(adalora_init_r)})"
        elif method == "adapter":
            # LLaMA-Adapter-style adaption prompt (only defined for Llama).
            if is_llama:
                config = AdaptionPromptConfig(
                    task_type=TaskType.SEQ_CLS,
                    adapter_len=int(adapter_len),
                    adapter_layers=30  # assumes the Llama layer count — TODO confirm for the 1B model
                )
                model = get_peft_model(model, config)
                peft_applied = True
                info += f"\n✅ Adapter 已套用(length={int(adapter_len)})"
            else:
                # BERT has no adaption-prompt support; fall back to LoRA.
                info += f"\n⚠️ Adapter 僅支援 Llama,改用 LoRA"
                config = LoraConfig(
                    task_type=TaskType.SEQ_CLS,
                    r=int(lora_r),
                    lora_alpha=int(lora_alpha),
                    lora_dropout=lora_dropout,
                    target_modules=["query", "value"],
                    bias="none"
                )
                model = get_peft_model(model, config)
                peft_applied = True
        elif method == "prefix":
            config = PrefixTuningConfig(
                task_type=TaskType.SEQ_CLS,
                num_virtual_tokens=int(prefix_len),
                prefix_projection=True  # MLP reparameterisation of the prefix
            )
            model = get_peft_model(model, config)
            peft_applied = True
            info += f"\n✅ Prefix Tuning 已套用(tokens={int(prefix_len)})"
        elif method == "prompt":
            # "Prompt tuning" approximated via prefix tuning with no projection.
            config = PrefixTuningConfig(
                task_type=TaskType.SEQ_CLS,
                num_virtual_tokens=int(prefix_len),
                prefix_projection=False
            )
            model = get_peft_model(model, config)
            peft_applied = True
            info += f"\n✅ Prompt Tuning 已套用(tokens={int(prefix_len)})"
        elif method == "bitfit":
            # BitFit: freeze everything except bias parameters.
            for name, param in model.named_parameters():
                if 'bias' not in name:
                    param.requires_grad = False
            peft_applied = True
            info += f"\n✅ BitFit 已套用(僅訓練 bias 參數)"
        if not peft_applied:
            info += f"\n⚠️ 警告:{method} 方法未被識別,使用 Full Fine-tuning"
        if not is_llama:
            # Llama placement is handled by device_map="auto" above.
            model = model.to(device)
        total = sum(p.numel() for p in model.parameters())
        trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
        info += f"\n\n💾 參數量\n總參數: {total:,}\n可訓練: {trainable:,}\n比例: {trainable/total*100:.2f}%"
        if is_llama:
            weight_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
            weights = torch.tensor([w0, w1], dtype=weight_dtype).to(model.device)
        else:
            weights = torch.tensor([w0, w1], dtype=torch.float32).to(device)
        info += f"\n⚖️ 權重 dtype: {weights.dtype} | device: {weights.device}\n"
        # Only 'loss' is minimised; every other supported metric is maximised.
        metrics_lower_is_better = ['loss']
        is_greater_better = best_metric not in metrics_lower_is_better
        args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=int(num_epochs),
            per_device_train_batch_size=int(batch_size),
            per_device_eval_batch_size=int(batch_size)*2,
            learning_rate=float(learning_rate),
            weight_decay=float(weight_decay),
            evaluation_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model=best_metric,
            greater_is_better=is_greater_better,
            report_to="none",
            logging_steps=10,
            warmup_steps=100 if is_llama else 50,
            warmup_ratio=0.1 if is_llama else 0.0,
            logging_first_step=True,
            bf16=(torch.cuda.is_available() and is_llama),
            gradient_accumulation_steps=4 if is_llama else 1,
            gradient_checkpointing=True if is_llama else False,
            optim="adamw_torch",
            seed=42,
            max_grad_norm=0.3 if is_llama else 1.0,
        )
        info += f"\n📊 最佳模型選擇: {best_metric} ({'越大越好' if is_greater_better else '越小越好'})\n"
        focal_gamma = 2.0
        trainer = WeightedTrainer(
            model=model,
            args=args,
            train_dataset=split['train'],
            eval_dataset=split['test'],
            compute_metrics=compute_metrics,
            class_weights=weights,
            use_focal_loss=True,
            focal_gamma=focal_gamma
        )
        if is_llama:
            # BUGFIX: the original message interpolated an undefined
            # `weight_boost` name, crashing with NameError for Llama runs.
            info += f"\n⚡ Llama 使用 Focal Loss (gamma={focal_gamma})"
        info += "\n\n⏳ 開始訓練..."
        info += f"\n📊 訓練前檢查:"
        info += f"\n - 訓練樣本: {len(split['train'])}"
        info += f"\n - 測試樣本: {len(split['test'])}"
        info += f"\n - 批次數/epoch: {len(split['train']) // int(batch_size)}"
        train_result = trainer.train()
        info += f"\n\n✅ 訓練完成!"
        info += f"\n📉 最終 Training Loss: {train_result.training_loss:.4f}"
        results = trainer.evaluate()
        model_counter += 1
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        model_id = f"{base_model}_{method}_{timestamp}"
        trained_models[model_id] = {
            'model': model,
            'tokenizer': tokenizer,
            'results': results,
            'baseline': baseline_perf,
            'config': {
                'type': base_model,
                'model_name': model_name,
                'method': method,
                'metric': best_metric,
                'is_llama': is_llama
            },
            'timestamp': timestamp,
            'stage': 1  # mark as a first-stage fine-tune
        }
        metric_name_map = {
            'f1': 'F1',
            'accuracy': 'Accuracy',
            'precision': 'Precision',
            'recall': 'Recall',
            'sensitivity': 'Sensitivity',
            'specificity': 'Specificity'
        }
        baseline_val = baseline_perf[best_metric]
        finetuned_val = results[f'eval_{best_metric}']
        improvement = calculate_improvement(baseline_val, finetuned_val)
        baseline_output = f"🔬 純 {base_model}(未微調)\n\n"
        baseline_output += f"📊 {metric_name_map[best_metric]} 表現\n"
        baseline_output += f"{metric_name_map[best_metric]}: {baseline_val:.4f}\n\n"
        baseline_output += f"混淆矩陣\n"
        baseline_output += f"TP: {baseline_perf['tp']} | TN: {baseline_perf['tn']}\n"
        baseline_output += f"FP: {baseline_perf['fp']} | FN: {baseline_perf['fn']}"
        finetuned_output = f"✅ 微調 {base_model}\n"
        finetuned_output += f"模型: {model_id}\n\n"
        finetuned_output += f"📊 {metric_name_map[best_metric]} 表現\n"
        finetuned_output += f"{metric_name_map[best_metric]}: {finetuned_val:.4f}\n\n"
        finetuned_output += f"混淆矩陣\n"
        finetuned_output += f"TP: {results['eval_tp']} | TN: {results['eval_tn']}\n"
        finetuned_output += f"FP: {results['eval_fp']} | FN: {results['eval_fn']}"
        comparison_output = f"📊 純 {base_model} vs 微調 {base_model} 比較\n\n"
        comparison_output += f"🎯 選擇的評估指標: {metric_name_map[best_metric]}\n\n"
        comparison_output += f"{metric_name_map[best_metric]} 改善:\n"
        comparison_output += f"{baseline_val:.4f} → {finetuned_val:.4f} ({format_improve(improvement)})\n\n"
        comparison_output += f"混淆矩陣變化:\n"
        comparison_output += f"TP: {baseline_perf['tp']} → {results['eval_tp']} ({results['eval_tp'] - baseline_perf['tp']:+d})\n"
        comparison_output += f"TN: {baseline_perf['tn']} → {results['eval_tn']} ({results['eval_tn'] - baseline_perf['tn']:+d})\n"
        comparison_output += f"FP: {baseline_perf['fp']} → {results['eval_fp']} ({results['eval_fp'] - baseline_perf['fp']:+d})\n"
        comparison_output += f"FN: {baseline_perf['fn']} → {results['eval_fn']} ({results['eval_fn'] - baseline_perf['fn']:+d})"
        info += "\n\n✅ 訓練完成!"
        thorough_memory_cleanup()
        return info, baseline_output, finetuned_output, comparison_output
    except Exception as e:
        thorough_memory_cleanup()
        import traceback
        error_msg = f"❌ 錯誤: {str(e)}\n\n{traceback.format_exc()}"
        return error_msg, "", "", ""
def second_stage_train(first_model_id, csv_file, num_epochs, batch_size, learning_rate, best_metric):
    """Second-stage fine-tune: continue training an already fine-tuned model
    on a new CSV, then report stage-1 vs stage-2 vs baseline metrics.

    Args:
        first_model_id: key into the global ``trained_models`` registry.
        csv_file: uploaded file object exposing ``.name``; the CSV must hold a
            text column (``Text``/``text``) and a label column (``label``/``nbcd``).
        num_epochs / batch_size / learning_rate: stage-2 hyper-parameters
            (the learning rate is halved below).
        best_metric: metric key used to pick the best checkpoint.

    Returns:
        Tuple of three strings: (log, stage-1 report, stage-2 report);
        on failure the first element carries the traceback.
    """
    global trained_models, second_stage_models
    if not first_model_id or first_model_id not in trained_models:
        return "❌ 請選擇第一階段模型", "", ""
    if csv_file is None:
        return "❌ 請上傳新的訓練資料", "", ""
    try:
        thorough_memory_cleanup()
        # Reuse the stage-1 model object in place (training mutates it).
        first_model_info = trained_models[first_model_id]
        model = first_model_info['model']
        tokenizer = first_model_info['tokenizer']
        config = first_model_info['config']
        is_llama = config['is_llama']
        info = f"🔄 二次微調\n"
        info += f"基於模型: {first_model_id}\n"
        info += f"方法: {config['method'].upper()}\n\n"
        # Load the new training data.
        df = pd.read_csv(csv_file.name)
        text_col = 'Text' if 'Text' in df.columns else 'text'
        label_col = 'label' if 'label' in df.columns else 'nbcd'
        df_clean = pd.DataFrame({
            'text': df[text_col].astype(str),
            'label': df[label_col].astype(int)
        }).dropna()
        n0 = int(sum(df_clean['label'] == 0))
        n1 = int(sum(df_clean['label'] == 1))
        info += f"📊 新資料: {len(df_clean)} 筆\n"
        info += f"📈 分布 - 存活: {n0} | 死亡: {n1}\n\n"
        # Tokenise and split the dataset (same lengths as stage 1).
        max_length = 512 if is_llama else 256
        dataset = Dataset.from_pandas(df_clean[['text', 'label']])
        def preprocess(ex):
            return tokenizer(ex['text'], truncation=True, padding='max_length', max_length=max_length)
        tokenized = dataset.map(preprocess, batched=True, remove_columns=['text'])
        split = tokenized.train_test_split(test_size=0.2, seed=42)
        # Class weights. NOTE(review): unlike stage 1 this path does not
        # rebalance the data, so the minority class is up-weighted instead;
        # n1 == 0 would raise ZeroDivisionError here, caught by the except.
        if is_llama:
            w0 = 1.0
            w1 = (n0 / n1) * 1.5
            weight_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
            weights = torch.tensor([w0, w1], dtype=weight_dtype).to(model.device)
        else:
            w0 = 1.0
            w1 = min((n0 / n1) * 0.8, 15.0)
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            weights = torch.tensor([w0, w1], dtype=torch.float32).to(device)
        info += f"🎯 類別權重: {w0:.4f} / {w1:.4f}\n"
        # Training configuration.
        args = TrainingArguments(
            output_dir='./results_stage2',
            num_train_epochs=int(num_epochs),
            per_device_train_batch_size=int(batch_size),
            per_device_eval_batch_size=int(batch_size)*2,
            learning_rate=float(learning_rate) * 0.5,  # halved LR for the second stage
            weight_decay=0.01,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model=best_metric,
            # NOTE(review): hard-coded True assumes best_metric is
            # higher-is-better; train_model derives this dynamically.
            greater_is_better=True,
            report_to="none",
            logging_steps=10,
            seed=43  # a different seed than stage 1
        )
        info += f"\n⏳ 開始二次微調(學習率減半)...\n"
        trainer = WeightedTrainer(
            model=model,
            args=args,
            train_dataset=split['train'],
            eval_dataset=split['test'],
            compute_metrics=compute_metrics,
            class_weights=weights,
            use_focal_loss=is_llama
        )
        train_result = trainer.train()
        results = trainer.evaluate()
        info += f"\n✅ 二次微調完成!\n"
        info += f"📉 Training Loss: {train_result.training_loss:.4f}\n"
        # Register the stage-2 model.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        model_id = f"{first_model_id}_stage2_{timestamp}"
        second_stage_models[model_id] = {
            'model': model,
            'tokenizer': tokenizer,
            'results': results,
            'first_stage_id': first_model_id,
            'first_stage_results': first_model_info['results'],
            'baseline': first_model_info['baseline'],
            'config': config,
            'timestamp': timestamp,
            'stage': 2
        }
        # Also expose it through trained_models so prediction can use it.
        trained_models[model_id] = second_stage_models[model_id]
        metric_name_map = {
            'f1': 'F1', 'accuracy': 'Accuracy', 'precision': 'Precision',
            'recall': 'Recall', 'sensitivity': 'Sensitivity', 'specificity': 'Specificity'
        }
        # Compare baseline vs stage-1 vs stage-2 on the chosen metric.
        baseline_val = first_model_info['baseline'][best_metric]
        stage1_val = first_model_info['results'][f'eval_{best_metric}']
        stage2_val = results[f'eval_{best_metric}']
        stage1_improve = calculate_improvement(baseline_val, stage1_val)
        stage2_improve = calculate_improvement(stage1_val, stage2_val)
        total_improve = calculate_improvement(baseline_val, stage2_val)
        stage1_output = f"🥇 第一階段微調結果\n\n"
        stage1_output += f"模型: {first_model_id}\n"
        stage1_output += f"{metric_name_map[best_metric]}: {stage1_val:.4f}\n"
        stage1_output += f"較 Baseline 改善: {format_improve(stage1_improve)}\n\n"
        stage1_output += f"混淆矩陣\n"
        stage1_output += f"TP: {first_model_info['results']['eval_tp']} | TN: {first_model_info['results']['eval_tn']}\n"
        stage1_output += f"FP: {first_model_info['results']['eval_fp']} | FN: {first_model_info['results']['eval_fn']}"
        stage2_output = f"🥈 第二階段微調結果\n\n"
        stage2_output += f"模型: {model_id}\n"
        stage2_output += f"{metric_name_map[best_metric]}: {stage2_val:.4f}\n"
        stage2_output += f"較第一階段改善: {format_improve(stage2_improve)}\n"
        stage2_output += f"較 Baseline 總改善: {format_improve(total_improve)}\n\n"
        stage2_output += f"混淆矩陣\n"
        stage2_output += f"TP: {results['eval_tp']} | TN: {results['eval_tn']}\n"
        stage2_output += f"FP: {results['eval_fp']} | FN: {results['eval_fn']}"
        thorough_memory_cleanup()
        return info, stage1_output, stage2_output
    except Exception as e:
        thorough_memory_cleanup()
        import traceback
        return f"❌ 錯誤: {str(e)}\n\n{traceback.format_exc()}", "", ""
def evaluate_on_new_data(csv_file, selected_models):
    """Evaluate several stored models (plus the matching un-tuned baseline)
    on a brand-new CSV and render a markdown comparison report.

    Args:
        csv_file: uploaded file object exposing ``.name``; the CSV must hold a
            text column (``Text``/``text``) and a label column (``label``/``nbcd``).
        selected_models: list of model ids previously stored in ``trained_models``.

    Returns:
        Markdown report string, or an error-message string.

    BUGFIX: the confusion-matrix separator row contained a stray backslash
    and one missing column, breaking the markdown table. Also guards the
    case where every selected id is stale (previously crashed in max()).
    """
    global trained_models, baseline_model_cache
    if csv_file is None:
        return "❌ 請上傳測試資料"
    if not selected_models:
        return "❌ 請至少選擇一個模型"
    try:
        # Load the held-out test data.
        df = pd.read_csv(csv_file.name)
        text_col = 'Text' if 'Text' in df.columns else 'text'
        label_col = 'label' if 'label' in df.columns else 'nbcd'
        df_clean = pd.DataFrame({
            'text': df[text_col].astype(str),
            'label': df[label_col].astype(int)
        }).dropna()
        output = f"# 📊 全新資料評估報告\n\n"
        output += f"## 測試資料概況\n"
        output += f"- 總樣本數: {len(df_clean)}\n"
        output += f"- 存活 (0): {sum(df_clean['label']==0)}\n"
        output += f"- 死亡 (1): {sum(df_clean['label']==1)}\n\n"
        output += f"## 模型表現比較\n\n"
        results_table = []
        for model_id in selected_models:
            if model_id not in trained_models:
                continue  # silently skip ids that no longer exist
            info = trained_models[model_id]
            model = info['model']
            tokenizer = info['tokenizer']
            config = info['config']
            is_llama = config['is_llama']
            # Tokenise the new data with this model's own tokenizer/length.
            max_length = 512 if is_llama else 256
            dataset = Dataset.from_pandas(df_clean[['text', 'label']])
            def preprocess(ex):
                return tokenizer(ex['text'], truncation=True, padding='max_length', max_length=max_length)
            tokenized = dataset.map(preprocess, batched=True, remove_columns=['text'])
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            perf = evaluate_baseline(model, tokenizer, tokenized, device, is_llama=is_llama)
            stage = info.get('stage', 1)
            stage_label = "🔬 Baseline" if "baseline" in model_id else f"🥇 Stage {stage}"
            results_table.append({
                'model': model_id,
                'stage': stage_label,
                'method': config['method'].upper(),
                'f1': perf['f1'],
                'acc': perf['accuracy'],
                'prec': perf['precision'],
                'recall': perf['recall'],
                'sens': perf['sensitivity'],
                'spec': perf['specificity'],
                'tp': perf['tp'],
                'tn': perf['tn'],
                'fp': perf['fp'],
                'fn': perf['fn']
            })
        # Robustness: if every selected id was stale, fail with a clear
        # message instead of crashing further down.
        if not results_table:
            return "❌ 所選模型不存在,請重新選擇"
        # Also evaluate the corresponding un-finetuned baseline, cached by
        # hub model name. Key off the first *existing* selection (the
        # original indexed selected_models[0], which could be stale).
        first_valid_id = next(m for m in selected_models if m in trained_models)
        first_model = trained_models[first_valid_id]
        config = first_model['config']
        model_name = config['model_name']
        is_llama = config['is_llama']
        cache_key = model_name
        if cache_key not in baseline_model_cache:
            if is_llama:
                baseline_model = AutoModelForSequenceClassification.from_pretrained(
                    model_name, num_labels=2,
                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                    device_map="auto" if torch.cuda.is_available() else None
                )
                baseline_model.config.pad_token_id = first_model['tokenizer'].pad_token_id
            else:
                baseline_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
                device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
                baseline_model = baseline_model.to(device)
            baseline_model.eval()
            baseline_model_cache[cache_key] = baseline_model
        else:
            baseline_model = baseline_model_cache[cache_key]
        tokenizer = first_model['tokenizer']
        max_length = 512 if is_llama else 256
        dataset = Dataset.from_pandas(df_clean[['text', 'label']])
        def preprocess(ex):
            return tokenizer(ex['text'], truncation=True, padding='max_length', max_length=max_length)
        tokenized = dataset.map(preprocess, batched=True, remove_columns=['text'])
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        baseline_perf = evaluate_baseline(baseline_model, tokenizer, tokenized, device, is_llama=is_llama)
        results_table.insert(0, {
            'model': f"{config['type']}-Baseline",
            'stage': "🔬 Baseline",
            'method': "None",
            'f1': baseline_perf['f1'],
            'acc': baseline_perf['accuracy'],
            'prec': baseline_perf['precision'],
            'recall': baseline_perf['recall'],
            'sens': baseline_perf['sensitivity'],
            'spec': baseline_perf['specificity'],
            'tp': baseline_perf['tp'],
            'tn': baseline_perf['tn'],
            'fp': baseline_perf['fp'],
            'fn': baseline_perf['fn']
        })
        # Render the metrics table.
        output += "| 模型 | 階段 | 方法 | F1 | Acc | Prec | Recall | Sens | Spec |\n"
        output += "|------|------|------|-----|-----|------|--------|------|------|\n"
        for r in results_table:
            output += f"| {r['model'][:30]} | {r['stage']} | {r['method']} | "
            output += f"{r['f1']:.4f} | {r['acc']:.4f} | {r['prec']:.4f} | "
            output += f"{r['recall']:.4f} | {r['sens']:.4f} | {r['spec']:.4f} |\n"
        output += "\n## 混淆矩陣\n\n"
        output += "| 模型 | TP | TN | FP | FN |\n"
        # BUGFIX: was "|------|----|----|\----|" — stray backslash and a
        # missing column broke markdown rendering of the 5-column table.
        output += "|------|----|----|----|----|\n"
        for r in results_table:
            output += f"| {r['model'][:30]} | {r['tp']} | {r['tn']} | {r['fp']} | {r['fn']} |\n"
        # Pick the best model per headline metric (baseline is row 0).
        output += "\n## 🏆 最佳模型\n\n"
        for metric in ['f1', 'acc', 'sens', 'spec']:
            best = max(results_table, key=lambda x: x[metric])
            baseline_val = results_table[0][metric]
            improve = calculate_improvement(baseline_val, best[metric])
            metric_names = {'f1': 'F1', 'acc': 'Accuracy', 'sens': 'Sensitivity', 'spec': 'Specificity'}
            output += f"**{metric_names[metric]}**: {best['model'][:30]} ({best[metric]:.4f}, 較 Baseline 改善 {format_improve(improve)})\n\n"
        return output
    except Exception as e:
        import traceback
        return f"❌ 錯誤: {str(e)}\n\n{traceback.format_exc()}"
def predict(model_id, text):
    """Predict survival for one case text and compare against the untuned baseline.

    Runs the selected fine-tuned model and the corresponding un-finetuned base
    checkpoint on the same input, then renders a side-by-side report.

    Args:
        model_id: Key into the global ``trained_models`` registry.
        text: Case description to classify (class 0 = 存活, class 1 = 死亡).

    Returns:
        A human-readable report string, or an error message starting with ❌.
    """
    global baseline_model_cache
    # Guard clauses: need a registered model and non-empty input text.
    if not model_id or model_id not in trained_models:
        return "❌ 請選擇模型"
    if not text:
        return "❌ 請輸入文字"
    try:
        info = trained_models[model_id]
        model, tokenizer = info['model'], info['tokenizer']
        config = info['config']
        is_llama = config.get('is_llama', False)
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Llama runs use a longer context window than BERT.
        max_length = 512 if is_llama else 256
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
        if not is_llama:
            inputs_cuda = {k: v.to(device) for k, v in inputs.items()}
        else:
            # Llama may be sharded via device_map, so follow the model's own device.
            inputs_cuda = {k: v.to(model.device) for k, v in inputs.items()}
        model.eval()
        with torch.no_grad():
            outputs = model(**inputs_cuda)
            probs_finetuned = torch.nn.functional.softmax(outputs.logits, dim=-1)
            pred_finetuned = torch.argmax(probs_finetuned, dim=-1).item()
        # Label convention used throughout this app: 0 = survived, 1 = deceased.
        result_finetuned = "存活" if pred_finetuned == 0 else "死亡"
        # Lazily load and memoize the untuned baseline for this base checkpoint,
        # so repeated predictions do not re-download / re-instantiate it.
        cache_key = config['model_name']
        if cache_key not in baseline_model_cache:
            if is_llama:
                baseline_model = AutoModelForSequenceClassification.from_pretrained(
                    config['model_name'], num_labels=2,
                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                    device_map="auto" if torch.cuda.is_available() else None
                )
                baseline_model.config.pad_token_id = tokenizer.pad_token_id
            else:
                baseline_model = BertForSequenceClassification.from_pretrained(config['model_name'], num_labels=2)
                baseline_model = baseline_model.to(device)
            baseline_model.eval()
            baseline_model_cache[cache_key] = baseline_model
        else:
            baseline_model = baseline_model_cache[cache_key]
        with torch.no_grad():
            if is_llama:
                # Re-map inputs to the baseline's device (may differ from the tuned model's).
                inputs_baseline = {k: v.to(baseline_model.device) for k, v in inputs.items()}
            else:
                inputs_baseline = inputs_cuda
            outputs_baseline = baseline_model(**inputs_baseline)
            probs_baseline = torch.nn.functional.softmax(outputs_baseline.logits, dim=-1)
            pred_baseline = torch.argmax(probs_baseline, dim=-1).item()
        result_baseline = "存活" if pred_baseline == 0 else "死亡"
        agreement = "✅ 一致" if pred_finetuned == pred_baseline else "⚠️ 不一致"
        # Display names for the metric the model was selected on.
        metric_name_map = {
            'f1': 'F1',
            'accuracy': 'Accuracy',
            'precision': 'Precision',
            'recall': 'Recall',
            'sensitivity': 'Sensitivity',
            'specificity': 'Specificity'
        }
        selected_metric = config['metric']
        metric_display = metric_name_map[selected_metric]
        baseline_metric_val = info['baseline'][selected_metric]
        finetuned_metric_val = info['results'][f'eval_{selected_metric}']
        improvement = calculate_improvement(baseline_metric_val, finetuned_metric_val)
        stage = info.get('stage', 1)
        stage_label = f"Stage {stage}" if stage > 1 else "微調"
        output = f"""🔮 預測結果比較
📝 輸入文字: {text[:100]}{'...' if len(text) > 100 else ''}
{'='*50}
🧬 {stage_label}模型 ({model_id})
預測: {result_finetuned}
信心: {probs_finetuned[0][pred_finetuned].item():.2%}
機率分布:
• 存活: {probs_finetuned[0][0].item():.2%}
• 死亡: {probs_finetuned[0][1].item():.2%}
{'='*50}
🔬 基準模型(未微調 {config['type']})
預測: {result_baseline}
信心: {probs_baseline[0][pred_baseline].item():.2%}
機率分布:
• 存活: {probs_baseline[0][0].item():.2%}
• 死亡: {probs_baseline[0][1].item():.2%}
{'='*50}
📊 結論
兩模型預測: {agreement}
"""
        # Extra commentary only when the two models disagree.
        if pred_finetuned != pred_baseline:
            output += f"\n💡 分析: {stage_label}模型預測為【{result_finetuned}】,而基準模型預測為【{result_baseline}】"
            output += f"\n 這顯示了 fine-tuning 對此案例的影響!"
        output += f"""
📈 模型表現(基於 {metric_display})
{stage_label}模型 {metric_display}: {finetuned_metric_val:.4f}
基準模型 {metric_display}: {baseline_metric_val:.4f}
改善幅度: {format_improve(improvement)}
"""
        return output
    except Exception as e:
        import traceback
        return f"❌ 錯誤: {str(e)}\n\n{traceback.format_exc()}"
def compare():
    """Render a markdown report comparing every fine-tuned model with its baseline.

    Returns an error string when no model has been trained yet; otherwise a
    markdown document with a fine-tuned table, a deduplicated baseline table,
    and the best model per metric with its improvement over baseline.
    """
    if not trained_models:
        return "❌ 尚未訓練模型"
    parts = ["# 📊 模型比較\n\n"]
    # Section 1: every fine-tuned model's evaluation metrics.
    parts.append("## 微調模型表現\n\n")
    parts.append("| 模型 | 階段 | 基礎 | 方法 | F1 | Acc | Prec | Recall | Sens | Spec |\n")
    parts.append("|------|------|------|------|-----|-----|------|--------|------|------|\n")
    for model_id, entry in trained_models.items():
        res, cfg = entry['results'], entry['config']
        stage_num = entry.get('stage', 1)
        parts.append(
            f"| {model_id} | Stage{stage_num} | {cfg['type']} | {cfg['method'].upper()} | "
            f"{res['eval_f1']:.4f} | {res['eval_accuracy']:.4f} | "
            f"{res['eval_precision']:.4f} | {res['eval_recall']:.4f} | "
            f"{res['eval_sensitivity']:.4f} | {res['eval_specificity']:.4f} |\n"
        )
    # Section 2: each distinct base model's untuned performance (deduplicated).
    parts.append("\n## 基準模型表現(未微調)\n\n")
    parts.append("| 模型 | F1 | Acc | Prec | Recall | Sens | Spec |\n")
    parts.append("|------|-----|-----|------|--------|------|------|\n")
    reported = set()
    for entry in trained_models.values():
        base, cfg = entry['baseline'], entry['config']
        key = f"{cfg['type']}-baseline"
        if key in reported:
            continue
        reported.add(key)
        parts.append(
            f"| {key} | {base['f1']:.4f} | {base['accuracy']:.4f} | "
            f"{base['precision']:.4f} | {base['recall']:.4f} | "
            f"{base['sensitivity']:.4f} | {base['specificity']:.4f} |\n"
        )
    # Section 3: top model per metric and its relative gain over baseline.
    parts.append("\n## 🏆 最佳模型\n\n")
    for metric in ('f1', 'accuracy', 'precision', 'recall', 'sensitivity', 'specificity'):
        best_id, best_entry = max(
            trained_models.items(), key=lambda item: item[1]['results'][f'eval_{metric}']
        )
        base_val = best_entry['baseline'][metric]
        tuned_val = best_entry['results'][f'eval_{metric}']
        gain = calculate_improvement(base_val, tuned_val)
        parts.append(f"**{metric.upper()}**: {best_id} ({tuned_val:.4f}, 改善 {format_improve(gain)})\n\n")
    return "".join(parts)
def refresh_model_list():
    """Return a Dropdown update listing the currently trained model ids."""
    current_ids = list(trained_models.keys())
    return gr.Dropdown(choices=current_ids)
def refresh_model_checkboxes():
    """Return a CheckboxGroup update listing the currently trained model ids."""
    current_ids = list(trained_models.keys())
    return gr.CheckboxGroup(choices=current_ids)
def clear_gpu_memory():
    """Manually drop cached baseline models and release GPU memory.

    Returns a status string with current CUDA memory figures (or a CPU-mode /
    failure message).
    """
    global baseline_model_cache, baseline_performance_cache
    try:
        # Clearing the caches makes the cached baseline models collectable
        # before the cleanup pass runs.
        baseline_model_cache.clear()
        baseline_performance_cache.clear()
        thorough_memory_cleanup()
        if not torch.cuda.is_available():
            return "✅ 記憶體清理完成(CPU 模式)"
        gib = 1024**3
        used = torch.cuda.memory_allocated(0) / gib
        held = torch.cuda.memory_reserved(0) / gib
        peak = torch.cuda.max_memory_allocated(0) / gib
        return f"""✅ GPU 記憶體清理完成!
當前狀態:
已分配: {used:.2f} GB
已保留: {held:.2f} GB
峰值使用: {peak:.2f} GB"""
    except Exception as e:
        return f"❌ 清理失敗: {str(e)}"
def update_method_params(method):
    """Toggle visibility of the per-method hyperparameter panels.

    Args:
        method: The selected fine-tuning method key (e.g. "lora", "prefix").

    Returns:
        A mapping of parameter-group components to visibility updates.
    """
    show_lora = method in ("lora", "adalora")
    show_prefix = method in ("prefix", "prompt")
    return {
        lora_params: gr.update(visible=show_lora),
        adalora_params: gr.update(visible=(method == "adalora")),
        adapter_params: gr.update(visible=(method == "adapter")),
        prefix_params: gr.update(visible=show_prefix),
    }
# Gradio UI: four workflow tabs (stage-1 training, stage-2 fine-tuning,
# unseen-data testing, single-text prediction) plus a comparison tab.
with gr.Blocks(title="完整版 Fine-tuning 平台", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🤖 完整版 BERT & Llama Fine-tuning 平台 v3")
    gr.Markdown("### 支持 6 種微調方法 + 二次微調 + 全新資料測試")
    gr.Markdown("#### ✨ LoRA | AdaLoRA | Adapter | Prefix Tuning | Prompt Tuning | BitFit")
    with gr.Tab("🥇 第一階段訓練"):
        gr.Markdown("## 步驟 1: 選擇基礎模型")
        base_model = gr.Dropdown(
            choices=["BERT-base", "Llama-3.2-1B"],
            value="BERT-base",
            label="基礎模型"
        )
        gr.Markdown("### 🧹 記憶體管理")
        with gr.Row():
            clear_mem_btn = gr.Button("🧹 清理 GPU 記憶體", variant="secondary")
            mem_status = gr.Textbox(label="記憶體狀態", lines=4, interactive=False, scale=2)
        gr.Markdown("## 步驟 2: 選擇微調方法")
        method = gr.Radio(
            choices=["lora", "adalora", "adapter", "prefix", "prompt", "bitfit"],
            value="lora",
            label="微調方法"
        )
        gr.Markdown("## 步驟 3: 上傳資料")
        csv_file = gr.File(label="CSV 檔案 (需包含 Text/text 和 label/nbcd 欄位)", file_types=[".csv"])
        gr.Markdown("## 步驟 4: 設定訓練參數")
        with gr.Row():
            num_epochs = gr.Number(value=8, label="訓練輪數", minimum=1, maximum=100, precision=0)
            batch_size = gr.Number(value=16, label="批次大小", minimum=1, maximum=128, precision=0)
            learning_rate = gr.Number(value=2e-5, label="學習率", minimum=0, maximum=1)
        with gr.Row():
            weight_decay = gr.Number(value=0.01, label="權重衰減", minimum=0, maximum=1)
            dropout = gr.Number(value=0.3, label="Dropout", minimum=0, maximum=1)
        gr.Markdown("### 🔧 方法參數")
        # Method-specific hyperparameter panels; update_method_params toggles
        # their visibility when the radio selection changes.
        with gr.Group(visible=True) as lora_params:
            gr.Markdown("#### LoRA 參數")
            with gr.Row():
                lora_r = gr.Number(value=32, label="Rank (r)", minimum=1, maximum=256, precision=0)
                lora_alpha = gr.Number(value=64, label="Alpha", minimum=1, maximum=512, precision=0)
                lora_dropout = gr.Number(value=0.1, label="Dropout", minimum=0, maximum=1)
        with gr.Group(visible=False) as adalora_params:
            gr.Markdown("#### AdaLoRA 參數")
            with gr.Row():
                adalora_init_r = gr.Number(value=12, label="初始 Rank", minimum=1, maximum=64, precision=0)
                adalora_tinit = gr.Number(value=200, label="Tinit", minimum=0, maximum=1000, precision=0)
            with gr.Row():
                adalora_tfinal = gr.Number(value=1000, label="Tfinal", minimum=0, maximum=5000, precision=0)
                adalora_deltaT = gr.Number(value=10, label="DeltaT", minimum=1, maximum=100, precision=0)
        with gr.Group(visible=False) as adapter_params:
            gr.Markdown("#### Adapter 參數")
            adapter_len = gr.Number(value=10, label="Adapter Length", minimum=1, maximum=50, precision=0,
                                    info="Adapter tokens 數量")
        with gr.Group(visible=False) as prefix_params:
            gr.Markdown("#### Prefix/Prompt 參數")
            prefix_len = gr.Number(value=20, label="Virtual Tokens", minimum=1, maximum=100, precision=0,
                                   info="虛擬 token 數量")
        method.change(
            update_method_params,
            inputs=[method],
            outputs=[lora_params, adalora_params, adapter_params, prefix_params]
        )
        best_metric = gr.Dropdown(
            choices=["f1", "accuracy", "precision", "recall", "sensitivity", "specificity"],
            value="f1",
            label="最佳模型選擇指標"
        )
        train_btn = gr.Button("🚀 開始訓練", variant="primary", size="lg")
        gr.Markdown("## 📊 訓練結果")
        data_info = gr.Textbox(label="📋 資料資訊", lines=10)
        with gr.Row():
            baseline_result = gr.Textbox(label="🔬 Baseline", lines=14)
            finetuned_result = gr.Textbox(label="✅ 微調模型", lines=14)
            comparison_result = gr.Textbox(label="📊 比較", lines=14)
        clear_mem_btn.click(clear_gpu_memory, outputs=[mem_status])
        # train_model is defined earlier in this file (outside this view).
        train_btn.click(
            train_model,
            inputs=[csv_file, base_model, method, num_epochs, batch_size, learning_rate,
                    weight_decay, dropout, lora_r, lora_alpha, lora_dropout,
                    adalora_init_r, adalora_tinit, adalora_tfinal, adalora_deltaT,
                    adapter_len, prefix_len, best_metric],
            outputs=[data_info, baseline_result, finetuned_result, comparison_result]
        )
    with gr.Tab("🥈 第二階段訓練"):
        gr.Markdown("## 二次微調:基於已訓練模型繼續訓練")
        gr.Markdown("### 選擇第一階段模型,上傳新資料,進行二次微調")
        with gr.Row():
            first_model_select = gr.Dropdown(label="選擇第一階段模型", choices=list(trained_models.keys()))
            refresh_stage1 = gr.Button("🔄 刷新模型列表")
        stage2_csv = gr.File(label="上傳新的訓練資料 CSV", file_types=[".csv"])
        gr.Markdown("### 二次微調參數")
        with gr.Row():
            stage2_epochs = gr.Number(value=3, label="訓練輪數", minimum=1, maximum=20, precision=0,
                                      info="建議較少輪數")
            stage2_batch = gr.Number(value=16, label="批次大小", minimum=1, maximum=128, precision=0)
            stage2_lr = gr.Number(value=1e-5, label="學習率", minimum=0, maximum=1,
                                  info="自動減半,建議更小")
        stage2_metric = gr.Dropdown(
            choices=["f1", "accuracy", "precision", "recall", "sensitivity", "specificity"],
            value="f1",
            label="評估指標"
        )
        stage2_train_btn = gr.Button("🔄 開始二次微調", variant="primary", size="lg")
        gr.Markdown("## 📊 二次微調結果")
        stage2_info = gr.Textbox(label="📋 訓練資訊", lines=8)
        with gr.Row():
            stage1_result = gr.Textbox(label="🥇 第一階段", lines=12)
            stage2_result = gr.Textbox(label="🥈 第二階段", lines=12)
        refresh_stage1.click(refresh_model_list, outputs=[first_model_select])
        # second_stage_train is defined earlier in this file (outside this view).
        stage2_train_btn.click(
            second_stage_train,
            inputs=[first_model_select, stage2_csv, stage2_epochs, stage2_batch, stage2_lr, stage2_metric],
            outputs=[stage2_info, stage1_result, stage2_result]
        )
    with gr.Tab("🆕 全新資料測試"):
        gr.Markdown("## 在全新資料上測試所有模型")
        gr.Markdown("### 上傳模型未見過的測試資料,比較 Baseline、Stage1、Stage2 的表現")
        test_csv = gr.File(label="上傳測試資料 CSV", file_types=[".csv"])
        with gr.Row():
            test_models = gr.CheckboxGroup(label="選擇要測試的模型", choices=list(trained_models.keys()))
            refresh_test = gr.Button("🔄 刷新")
        test_btn = gr.Button("🧪 開始測試", variant="primary", size="lg")
        test_output = gr.Markdown(label="測試結果")
        refresh_test.click(refresh_model_checkboxes, outputs=[test_models])
        # evaluate_on_new_data is defined earlier in this file (outside this view).
        test_btn.click(
            evaluate_on_new_data,
            inputs=[test_csv, test_models],
            outputs=[test_output]
        )
    with gr.Tab("🔮 預測"):
        gr.Markdown("## 使用訓練好的模型預測")
        with gr.Row():
            model_drop = gr.Dropdown(label="選擇模型", choices=list(trained_models.keys()))
            refresh = gr.Button("🔄 刷新")
        text_input = gr.Textbox(label="輸入病例描述", lines=4,
                                placeholder="Patient diagnosed with...")
        predict_btn = gr.Button("預測", variant="primary", size="lg")
        pred_output = gr.Textbox(label="預測結果", lines=20)
        refresh.click(refresh_model_list, outputs=[model_drop])
        predict_btn.click(predict, inputs=[model_drop, text_input], outputs=[pred_output])
        gr.Examples(
            examples=[
                ["Patient with stage II breast cancer, good response to treatment."],
                ["Advanced metastatic cancer, multiple organ involvement."]
            ],
            inputs=text_input
        )
    with gr.Tab("📊 比較"):
        gr.Markdown("## 比較所有模型")
        compare_btn = gr.Button("比較", variant="primary", size="lg")
        compare_output = gr.Markdown()
        compare_btn.click(compare, outputs=[compare_output])

# Script entry point: serve the UI on all interfaces (Hugging Face Spaces default port).
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        max_threads=4
    )