# LexiMind/scripts/train_bert_baseline.py
# Author: OliverPerrin — "Ruff formatting fix" (commit 56554a7)
"""
BERT Baseline Training for LexiMind Comparison.
Fine-tunes bert-base-uncased on topic classification and emotion detection
to provide baselines for comparison with LexiMind (FLAN-T5-based).
Supports three training modes to disentangle architecture vs. MTL effects:
1. single-topic — BERT fine-tuned on topic classification only
2. single-emotion — BERT fine-tuned on emotion detection only
3. multitask — BERT fine-tuned on both tasks jointly
Uses the same datasets, splits, label encoders, and evaluation metrics as the
main LexiMind pipeline for fair comparison.
Usage:
python scripts/train_bert_baseline.py --mode single-topic
python scripts/train_bert_baseline.py --mode single-emotion
python scripts/train_bert_baseline.py --mode multitask
python scripts/train_bert_baseline.py --mode all # Run all three sequentially
Author: Oliver Perrin
Date: March 2026
"""
from __future__ import annotations
import argparse
import json
import random
import sys
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from torch.cuda.amp import GradScaler, autocast
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer
# Project imports
PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(PROJECT_ROOT) not in sys.path:
sys.path.insert(0, str(PROJECT_ROOT))
from src.data.dataset import (
EmotionExample,
TopicExample,
load_emotion_jsonl,
load_topic_jsonl,
)
from src.training.metrics import (
bootstrap_confidence_interval,
multilabel_f1,
multilabel_macro_f1,
multilabel_micro_f1,
multilabel_per_class_metrics,
tune_per_class_thresholds,
)
# Configuration
@dataclass
class BertBaselineConfig:
    """Hyperparameters aligned with LexiMind's full.yaml where applicable.

    Defaults deliberately mirror the main LexiMind training configuration
    (optimizer settings, effective batch size, seed, task weights) so the
    BERT baselines are trained under matched conditions for fair comparison.
    """

    # Model
    model_name: str = "bert-base-uncased"
    max_length: int = 256  # Same as LexiMind classification max_len
    # Optimizer (matching LexiMind's full.yaml)
    lr: float = 3e-5
    weight_decay: float = 0.01
    betas: tuple[float, float] = (0.9, 0.98)
    eps: float = 1e-6
    # Training
    batch_size: int = 10  # Same as LexiMind
    gradient_accumulation_steps: int = 4  # Same effective batch = 40
    max_epochs: int = 8
    warmup_steps: int = 300
    gradient_clip_norm: float = 1.0
    early_stopping_patience: int = 3  # epochs without combined-metric improvement
    seed: int = 17  # Same as LexiMind
    # Task weights (for multi-task mode)
    topic_weight: float = 0.3  # Same as LexiMind
    emotion_weight: float = 1.0
    # Temperature sampling (for multi-task mode)
    task_sampling_alpha: float = 0.5
    # Frozen layers: freeze bottom 4 layers (matching LexiMind's encoder strategy)
    freeze_layers: int = 4
    # Precision
    use_amp: bool = True  # BFloat16 mixed precision (autocast dtype=torch.bfloat16)
    # Paths
    data_dir: Path = field(default_factory=lambda: PROJECT_ROOT / "data" / "processed")
    output_dir: Path = field(default_factory=lambda: PROJECT_ROOT / "outputs" / "bert_baseline")
    checkpoint_dir: Path = field(
        default_factory=lambda: PROJECT_ROOT / "checkpoints" / "bert_baseline"
    )
    # Emotion threshold: sigmoid cutoff for multi-label emotion predictions
    emotion_threshold: float = 0.3
# Datasets
class BertEmotionDataset(Dataset):
    """Multi-label emotion dataset yielding BERT-ready tensors.

    Each item is tokenized to a fixed length and paired with a multi-hot
    float vector produced by the fitted ``MultiLabelBinarizer``.
    """

    def __init__(
        self,
        examples: List[EmotionExample],
        tokenizer: AutoTokenizer,
        binarizer: MultiLabelBinarizer,
        max_length: int = 256,
    ):
        self.examples = examples
        self.tokenizer = tokenizer
        self.binarizer = binarizer
        self.max_length = max_length

    def __len__(self) -> int:
        return len(self.examples)

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        example = self.examples[idx]
        encoded = self.tokenizer(
            example.text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # Multi-hot label vector for this example's emotion set.
        multi_hot = self.binarizer.transform([example.emotions])[0]
        return {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "labels": torch.tensor(multi_hot, dtype=torch.float32),
        }
class BertTopicDataset(Dataset):
    """Single-label topic dataset yielding BERT-ready tensors.

    Each item is tokenized to a fixed length; the topic string is mapped to
    an integer class index via the fitted ``LabelEncoder``.
    """

    def __init__(
        self,
        examples: List[TopicExample],
        tokenizer: AutoTokenizer,
        encoder: LabelEncoder,
        max_length: int = 256,
    ):
        self.examples = examples
        self.tokenizer = tokenizer
        self.encoder = encoder
        self.max_length = max_length

    def __len__(self) -> int:
        return len(self.examples)

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        example = self.examples[idx]
        encoded = self.tokenizer(
            example.text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # Integer class index for this example's topic.
        class_idx = self.encoder.transform([example.topic])[0]
        return {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "labels": torch.tensor(class_idx, dtype=torch.long),
        }
# Model
class BertClassificationHead(nn.Module):
    """Pooling + classifier head over BERT token embeddings.

    Pooling options:
      * ``"attention"`` — learned attention pooling (a query vector scores
        tokens, softmax over the sequence), as used for the emotion head.
      * ``"mean"`` — masked mean over valid (non-padding) tokens.
      * anything else — the [CLS] token embedding.

    With ``hidden_dim`` set, the classifier is a 2-layer GELU MLP; otherwise
    a single linear projection.
    """

    def __init__(
        self,
        hidden_size: int,
        num_labels: int,
        pooling: str = "cls",  # "cls" or "attention"
        hidden_dim: Optional[int] = None,
        dropout: float = 0.1,
    ):
        super().__init__()
        self.pooling = pooling
        self.dropout = nn.Dropout(dropout)
        if pooling == "attention":
            self.attn_query = nn.Linear(hidden_size, 1, bias=False)
        if hidden_dim is None:
            self.classifier = nn.Linear(hidden_size, num_labels)
        else:
            self.classifier = nn.Sequential(
                nn.Linear(hidden_size, hidden_dim),
                nn.GELU(),
                nn.Dropout(dropout),
                nn.Linear(hidden_dim, num_labels),
            )

    def _pool(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Collapse (B, L, H) token states into a (B, H) sequence vector."""
        if self.pooling == "attention":
            # Score every token, mask padding to -inf, softmax over the
            # sequence, then take the attention-weighted sum.
            raw_scores = self.attn_query(hidden_states)  # (B, L, 1)
            pad_mask = attention_mask.unsqueeze(-1).bool()
            raw_scores = raw_scores.masked_fill(~pad_mask, float("-inf"))
            attn = F.softmax(raw_scores, dim=1)
            return (attn * hidden_states).sum(dim=1)
        if self.pooling == "mean":
            # Average only over non-padding positions (clamp avoids /0).
            valid = attention_mask.unsqueeze(-1).float()
            summed = (hidden_states * valid).sum(dim=1)
            denom = valid.sum(dim=1).clamp(min=1e-9)
            return summed / denom
        # Default: the [CLS] token embedding.
        return hidden_states[:, 0, :]

    def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        pooled = self._pool(hidden_states, attention_mask)
        return self.classifier(self.dropout(pooled))
class BertBaseline(nn.Module):
    """BERT encoder with task-specific classification heads.

    Supports single-task ("emotion" or "topic") and multi-task setups; head
    architectures mirror LexiMind's (attention-pool MLP for emotion,
    mean-pool linear for topic).
    """

    def __init__(
        self,
        model_name: str = "bert-base-uncased",
        num_emotions: int = 28,
        num_topics: int = 7,
        tasks: Sequence[str] = ("emotion", "topic"),
        freeze_layers: int = 4,
    ):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        dim = self.bert.config.hidden_size  # 768 for bert-base
        self.tasks = list(tasks)
        self.heads = nn.ModuleDict()
        if "emotion" in tasks:
            # Attention pooling + 2-layer MLP (matching LexiMind's emotion head).
            self.heads["emotion"] = BertClassificationHead(
                hidden_size=dim,
                num_labels=num_emotions,
                pooling="attention",
                hidden_dim=dim // 2,  # 384 for bert-base, same ratio as LexiMind
                dropout=0.1,
            )
        if "topic" in tasks:
            # Mean pooling + single linear (matching LexiMind's topic head).
            self.heads["topic"] = BertClassificationHead(
                hidden_size=dim,
                num_labels=num_topics,
                pooling="mean",
                hidden_dim=None,
                dropout=0.1,
            )
        # Freeze embeddings + bottom N transformer layers (LexiMind strategy).
        self._freeze_layers(freeze_layers)

    def _freeze_layers(self, n: int) -> None:
        """Freeze embedding parameters plus the bottom ``n`` encoder layers."""
        for p in self.bert.embeddings.parameters():
            p.requires_grad = False
        depth = len(self.bert.encoder.layer)
        for layer in self.bert.encoder.layer[: min(n, depth)]:
            for p in layer.parameters():
                p.requires_grad = False
        frozen = sum(1 for p in self.bert.parameters() if not p.requires_grad)
        total = sum(1 for p in self.bert.parameters())
        print(f" Frozen {frozen}/{total} BERT parameters (bottom {n} layers + embeddings)")

    def forward(
        self,
        task: str,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
    ) -> torch.Tensor:
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        token_states = encoded.last_hidden_state  # (B, L, hidden_size)
        return self.heads[task](token_states, attention_mask)

    def param_count(self) -> Dict[str, int]:
        """Return parameter counts broken down by component."""
        counts: Dict[str, int] = {
            "bert_encoder": sum(p.numel() for p in self.bert.parameters()),
            "bert_trainable": sum(
                p.numel() for p in self.bert.parameters() if p.requires_grad
            ),
        }
        for name, head in self.heads.items():
            counts[f"head_{name}"] = sum(p.numel() for p in head.parameters())
        counts["total"] = sum(p.numel() for p in self.parameters())
        counts["trainable"] = sum(p.numel() for p in self.parameters() if p.requires_grad)
        return counts
# Training
class BertTrainer:
    """Trainer supporting single-task and multi-task BERT training.

    Single-task mode iterates one DataLoader per epoch; multi-task mode draws
    batches via temperature-based task sampling. Uses gradient accumulation,
    bfloat16 autocast, linear-warmup + cosine-decay LR scheduling, best-model
    checkpointing, and early stopping on a combined validation metric.
    """

    def __init__(
        self,
        model: BertBaseline,
        config: BertBaselineConfig,
        train_loaders: Dict[str, DataLoader],
        val_loaders: Dict[str, DataLoader],
        device: torch.device,
        mode: str,
    ):
        self.model = model
        self.config = config
        self.train_loaders = train_loaders
        self.val_loaders = val_loaders
        self.device = device
        self.mode = mode
        # Optimizer over trainable (unfrozen) parameters only.
        self.optimizer = AdamW(
            [p for p in model.parameters() if p.requires_grad],
            lr=config.lr,
            weight_decay=config.weight_decay,
            betas=config.betas,
            eps=config.eps,
        )
        # Calculate total training steps.
        # NOTE(review): both branches compute the same sum; the multi-task
        # branch presumably once derived a temperature-weighted step count —
        # confirm the intended behavior.
        if len(train_loaders) > 1:
            # Multi-task: use temperature-sampled steps
            sizes = {k: len(v) for k, v in train_loaders.items()}
            total_batches = sum(sizes.values())
        else:
            total_batches = sum(len(v) for v in train_loaders.values())
        self.steps_per_epoch = total_batches // config.gradient_accumulation_steps
        self.total_steps = self.steps_per_epoch * config.max_epochs
        # LR scheduler: linear warmup + cosine decay (matching LexiMind)
        warmup_scheduler = LinearLR(
            self.optimizer,
            start_factor=1e-8 / config.lr,  # warmup starts at an effective LR of ~1e-8
            end_factor=1.0,
            total_iters=config.warmup_steps,
        )
        cosine_scheduler = CosineAnnealingLR(
            self.optimizer,
            T_max=max(self.total_steps - config.warmup_steps, 1),
            eta_min=config.lr * 0.1,  # Decay to 10% of peak (matching LexiMind)
        )
        self.scheduler = SequentialLR(
            self.optimizer,
            schedulers=[warmup_scheduler, cosine_scheduler],
            milestones=[config.warmup_steps],
        )
        # Mixed precision.
        # NOTE(review): GradScaler targets float16; with bfloat16 autocast the
        # loss scaling is effectively a no-op (kept for a uniform step path).
        # torch.cuda.amp is also deprecated in favor of torch.amp — consider
        # migrating.
        self.scaler = GradScaler(enabled=config.use_amp)
        # Loss functions: multi-label BCE for emotion, cross-entropy for topic.
        self.emotion_loss_fn = nn.BCEWithLogitsLoss()
        self.topic_loss_fn = nn.CrossEntropyLoss()
        # Tracking state for checkpointing / early stopping.
        self.global_step = 0
        self.best_metric = -float("inf")
        self.patience_counter = 0
        self.training_history: List[Dict[str, Any]] = []

    def _compute_loss(self, task: str, logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
        # Emotion is multi-label (sigmoid/BCE); topic is single-label (CE).
        if task == "emotion":
            return self.emotion_loss_fn(logits, labels)
        else:
            return self.topic_loss_fn(logits, labels)

    def _get_task_weight(self, task: str) -> float:
        # Loss weighting applies only in multi-task mode (topic down-weighted).
        if self.mode != "multitask":
            return 1.0
        if task == "topic":
            return self.config.topic_weight
        return self.config.emotion_weight

    def _make_multitask_iterator(self):
        """Temperature-based task sampling (matching LexiMind).

        Yields an endless stream of (task, batch) pairs; exhausted loaders are
        restarted, and the caller bounds iteration by a fixed batch count.
        """
        sizes = {k: len(v.dataset) for k, v in self.train_loaders.items()}
        alpha = self.config.task_sampling_alpha
        # Compute sampling probabilities proportional to size ** (1 / alpha).
        # NOTE(review): with alpha = 0.5 this is size**2, which *sharpens*
        # sampling toward the larger dataset; conventional temperature sampling
        # uses size**alpha (alpha < 1) to flatten. Confirm this matches the
        # LexiMind pipeline's convention before changing.
        raw = {k: s ** (1.0 / alpha) for k, s in sizes.items()}
        total = sum(raw.values())
        probs = {k: v / total for k, v in raw.items()}
        # Create iterators
        iters = {k: iter(v) for k, v in self.train_loaders.items()}
        tasks = list(probs.keys())
        weights = [probs[t] for t in tasks]
        while True:
            task = random.choices(tasks, weights=weights, k=1)[0]
            try:
                batch = next(iters[task])
            except StopIteration:
                # Restart an exhausted loader and keep sampling.
                iters[task] = iter(self.train_loaders[task])
                batch = next(iters[task])
            yield task, batch

    def train_epoch(self, epoch: int) -> Dict[str, float]:
        """Train one epoch; returns mean per-task training losses."""
        self.model.train()
        self.optimizer.zero_grad()
        epoch_losses: Dict[str, List[float]] = {t: [] for t in self.train_loaders}
        if len(self.train_loaders) > 1:
            # Multi-task: temperature sampling
            iterator = self._make_multitask_iterator()
            total_batches = sum(len(v) for v in self.train_loaders.values())
        else:
            # Single-task: iterate normally
            task_name = list(self.train_loaders.keys())[0]
            iterator = ((task_name, batch) for batch in self.train_loaders[task_name])
            total_batches = len(self.train_loaders[task_name])
        pbar = tqdm(total=total_batches, desc=f"Epoch {epoch + 1}/{self.config.max_epochs}")
        for step_in_epoch in range(total_batches):
            task, batch = next(iterator)
            input_ids = batch["input_ids"].to(self.device)
            attention_mask = batch["attention_mask"].to(self.device)
            labels = batch["labels"].to(self.device)
            # Forward pass with AMP (bfloat16 autocast)
            with autocast(dtype=torch.bfloat16, enabled=self.config.use_amp):
                logits = self.model(task, input_ids, attention_mask)
                loss = self._compute_loss(task, logits, labels)
                loss = loss * self._get_task_weight(task)
                # Divide so accumulated gradients average over micro-batches.
                loss = loss / self.config.gradient_accumulation_steps
            # Backward
            self.scaler.scale(loss).backward()
            # Record the un-divided (per-batch) loss for logging.
            epoch_losses[task].append(loss.item() * self.config.gradient_accumulation_steps)
            # Optimizer step (every N accumulation steps)
            if (step_in_epoch + 1) % self.config.gradient_accumulation_steps == 0:
                self.scaler.unscale_(self.optimizer)
                torch.nn.utils.clip_grad_norm_(
                    self.model.parameters(), self.config.gradient_clip_norm
                )
                self.scaler.step(self.optimizer)
                self.scaler.update()
                self.optimizer.zero_grad()
                self.scheduler.step()
                self.global_step += 1
            pbar.set_postfix(
                {
                    f"{task}_loss": f"{epoch_losses[task][-1]:.4f}",
                    "lr": f"{self.scheduler.get_last_lr()[0]:.2e}",
                }
            )
            pbar.update(1)
        pbar.close()
        # Aggregate mean loss per task (tasks never sampled this epoch are skipped).
        results = {}
        for task, losses in epoch_losses.items():
            if losses:
                results[f"train_{task}_loss"] = sum(losses) / len(losses)
        return results

    @torch.no_grad()
    def validate(self) -> Dict[str, Any]:
        """Run validation across all tasks.

        Returns per-task losses and metrics plus ``val_combined_metric`` (mean
        of emotion sample-F1 and topic accuracy, whichever are present). Keys
        prefixed with "_" hold raw tensors and are stripped before
        serialization / checkpointing.
        """
        self.model.eval()
        results: Dict[str, Any] = {}
        for task, loader in self.val_loaders.items():
            all_logits = []
            all_labels = []
            total_loss = 0.0
            n_batches = 0
            for batch in loader:
                input_ids = batch["input_ids"].to(self.device)
                attention_mask = batch["attention_mask"].to(self.device)
                labels = batch["labels"].to(self.device)
                with autocast(dtype=torch.bfloat16, enabled=self.config.use_amp):
                    logits = self.model(task, input_ids, attention_mask)
                    loss = self._compute_loss(task, logits, labels)
                total_loss += loss.item()
                n_batches += 1
                all_logits.append(logits.float().cpu())
                all_labels.append(labels.float().cpu())
            all_logits_t = torch.cat(all_logits, dim=0)
            all_labels_t = torch.cat(all_labels, dim=0)
            results[f"val_{task}_loss"] = total_loss / max(n_batches, 1)
            if task == "emotion":
                # Multi-label: threshold sigmoid probabilities at the configured cutoff.
                preds = (torch.sigmoid(all_logits_t) > self.config.emotion_threshold).int()
                targets = all_labels_t.int()
                results["val_emotion_sample_f1"] = multilabel_f1(preds, targets)
                results["val_emotion_macro_f1"] = multilabel_macro_f1(preds, targets)
                results["val_emotion_micro_f1"] = multilabel_micro_f1(preds, targets)
                # Store raw logits for threshold tuning later
                results["_emotion_logits"] = all_logits_t
                results["_emotion_labels"] = all_labels_t
            elif task == "topic":
                preds = all_logits_t.argmax(dim=1).numpy()
                targets = all_labels_t.long().numpy()
                results["val_topic_accuracy"] = float(accuracy_score(targets, preds))
                results["val_topic_macro_f1"] = float(
                    f1_score(targets, preds, average="macro", zero_division=0)
                )
        # Combined metric for early stopping / checkpointing
        metric_parts = []
        if "val_emotion_sample_f1" in results:
            metric_parts.append(results["val_emotion_sample_f1"])
        if "val_topic_accuracy" in results:
            metric_parts.append(results["val_topic_accuracy"])
        results["val_combined_metric"] = sum(metric_parts) / max(len(metric_parts), 1)
        return results

    def save_checkpoint(self, path: Path, epoch: int, metrics: Dict[str, Any]) -> None:
        """Save model/optimizer/scheduler state plus scalar metrics to ``path``."""
        path.parent.mkdir(parents=True, exist_ok=True)
        # Filter out tensors from metrics ("_"-prefixed keys hold raw tensors).
        clean_metrics = {k: v for k, v in metrics.items() if not k.startswith("_")}
        torch.save(
            {
                "epoch": epoch,
                "model_state_dict": self.model.state_dict(),
                "optimizer_state_dict": self.optimizer.state_dict(),
                "scheduler_state_dict": self.scheduler.state_dict(),
                "metrics": clean_metrics,
                "config": {
                    "mode": self.mode,
                    "tasks": self.model.tasks,
                    "model_name": self.config.model_name,
                },
            },
            path,
        )

    def train(self) -> Dict[str, Any]:
        """Full training loop with early stopping; returns a results summary."""
        print(f"\n{'=' * 60}")
        print(f"Training BERT Baseline — Mode: {self.mode}")
        print(f"{'=' * 60}")
        param_counts = self.model.param_count()
        print(f" Total parameters: {param_counts['total']:,}")
        print(f" Trainable parameters: {param_counts['trainable']:,}")
        for name, count in param_counts.items():
            if name.startswith("head_"):
                print(f" {name}: {count:,}")
        print(f" Steps/epoch: {self.steps_per_epoch}")
        print(f" Total steps: {self.total_steps}")
        print()
        all_results: Dict[str, Any] = {"mode": self.mode, "epochs": []}
        start_time = time.time()
        for epoch in range(self.config.max_epochs):
            epoch_start = time.time()
            # Train
            train_metrics = self.train_epoch(epoch)
            # Validate
            val_metrics = self.validate()
            epoch_time = time.time() - epoch_start
            # Log ("_"-prefixed raw tensors are excluded from the record).
            epoch_result = {
                "epoch": epoch + 1,
                "time_seconds": epoch_time,
                **train_metrics,
                **{k: v for k, v in val_metrics.items() if not k.startswith("_")},
            }
            all_results["epochs"].append(epoch_result)
            self.training_history.append(epoch_result)
            # Print summary
            print(f"\n Epoch {epoch + 1} ({epoch_time:.0f}s):")
            for k, v in sorted(epoch_result.items()):
                if k not in ("epoch", "time_seconds") and isinstance(v, float):
                    print(f" {k}: {v:.4f}")
            # Checkpointing: keep the best model by combined validation metric.
            combined = val_metrics["val_combined_metric"]
            if combined > self.best_metric:
                self.best_metric = combined
                self.patience_counter = 0
                self.save_checkpoint(
                    self.config.checkpoint_dir / self.mode / "best.pt",
                    epoch,
                    val_metrics,
                )
                print(f" New best model (combined metric: {combined:.4f})")
            else:
                self.patience_counter += 1
                print(
                    f" No improvement ({self.patience_counter}/{self.config.early_stopping_patience})"
                )
            # Always save epoch checkpoint
            self.save_checkpoint(
                self.config.checkpoint_dir / self.mode / f"epoch_{epoch + 1}.pt",
                epoch,
                val_metrics,
            )
            # Early stopping
            if self.patience_counter >= self.config.early_stopping_patience:
                print(f"\n Early stopping triggered at epoch {epoch + 1}")
                all_results["early_stopped"] = True
                all_results["best_epoch"] = epoch + 1 - self.config.early_stopping_patience
                break
        total_time = time.time() - start_time
        all_results["total_time_seconds"] = total_time
        all_results["total_time_human"] = f"{total_time / 3600:.1f}h"
        if "early_stopped" not in all_results:
            all_results["early_stopped"] = False
            # Best epoch = last epoch minus however many non-improving epochs.
            all_results["best_epoch"] = (
                epoch + 1 - self.patience_counter if self.patience_counter > 0 else epoch + 1
            )
        all_results["param_counts"] = param_counts
        print(f"\n Training complete in {total_time / 3600:.1f}h")
        print(f" Best combined metric: {self.best_metric:.4f}")
        return all_results
# Evaluation
def evaluate_bert_model(
    model: BertBaseline,
    val_loaders: Dict[str, DataLoader],
    device: torch.device,
    config: BertBaselineConfig,
    emotion_classes: Optional[List[str]] = None,
    topic_classes: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """Full evaluation with the same metrics as LexiMind's evaluate.py.

    For emotion (multi-label): sample/macro/micro F1 at the default threshold,
    per-class metrics (when class names are given), per-class threshold
    tuning, and a bootstrap CI over per-sample F1. For topic (single-label):
    accuracy, macro F1, a per-class report, and a bootstrap CI over
    per-sample accuracy.
    """
    model.eval()
    results: Dict[str, Any] = {}
    with torch.no_grad():
        for task, loader in val_loaders.items():
            all_logits = []
            all_labels = []
            for batch in tqdm(loader, desc=f"Evaluating {task}"):
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)
                with autocast(dtype=torch.bfloat16, enabled=config.use_amp):
                    logits = model(task, input_ids, attention_mask)
                all_logits.append(logits.float().cpu())
                all_labels.append(labels.float().cpu())
            all_logits_t = torch.cat(all_logits, dim=0)
            all_labels_t = torch.cat(all_labels, dim=0)
            if task == "emotion":
                # Default threshold
                preds_default = (torch.sigmoid(all_logits_t) > config.emotion_threshold).int()
                targets = all_labels_t.int()
                results["emotion"] = {
                    "default_threshold": config.emotion_threshold,
                    "sample_avg_f1": multilabel_f1(preds_default, targets),
                    "macro_f1": multilabel_macro_f1(preds_default, targets),
                    "micro_f1": multilabel_micro_f1(preds_default, targets),
                }
                # Per-class metrics
                if emotion_classes:
                    per_class = multilabel_per_class_metrics(
                        preds_default, targets, emotion_classes
                    )
                    results["emotion"]["per_class"] = per_class
                # Threshold tuning: per-class sigmoid cutoffs maximizing macro F1.
                best_thresholds, tuned_macro = tune_per_class_thresholds(all_logits_t, all_labels_t)
                tuned_preds = torch.zeros_like(all_logits_t)
                probs = torch.sigmoid(all_logits_t)
                for c in range(all_logits_t.shape[1]):
                    tuned_preds[:, c] = (probs[:, c] >= best_thresholds[c]).float()
                tuned_preds = tuned_preds.int()
                results["emotion"]["tuned_macro_f1"] = tuned_macro
                results["emotion"]["tuned_sample_avg_f1"] = multilabel_f1(tuned_preds, targets)
                results["emotion"]["tuned_micro_f1"] = multilabel_micro_f1(tuned_preds, targets)
                # Bootstrap CI on sample-avg F1.
                # NOTE(review): a sample with no gold AND no predicted labels
                # scores 0 here (some conventions score it 1) — confirm this
                # matches multilabel_f1's convention.
                per_sample_f1 = []
                for i in range(preds_default.shape[0]):
                    p = preds_default[i].float()
                    g = targets[i].float()
                    tp = (p * g).sum()
                    prec = tp / p.sum().clamp(min=1)
                    rec = tp / g.sum().clamp(min=1)
                    f = (2 * prec * rec) / (prec + rec).clamp(min=1e-8)
                    per_sample_f1.append(f.item())
                # mean_f1 is intentionally unused; only the CI bounds are reported.
                mean_f1, ci_low, ci_high = bootstrap_confidence_interval(per_sample_f1)
                results["emotion"]["sample_avg_f1_ci"] = [ci_low, ci_high]
            elif task == "topic":
                preds = all_logits_t.argmax(dim=1).numpy()
                targets = all_labels_t.long().numpy()
                acc = float(accuracy_score(targets, preds))
                macro_f1 = float(f1_score(targets, preds, average="macro", zero_division=0))
                results["topic"] = {
                    "accuracy": acc,
                    "macro_f1": macro_f1,
                }
                # Per-class metrics
                if topic_classes:
                    report = classification_report(
                        targets,
                        preds,
                        target_names=topic_classes,
                        output_dict=True,
                        zero_division=0,
                    )
                    results["topic"]["per_class"] = {
                        name: {
                            "precision": report[name]["precision"],
                            "recall": report[name]["recall"],
                            "f1": report[name]["f1-score"],
                            "support": report[name]["support"],
                        }
                        for name in topic_classes
                        if name in report
                    }
                # Bootstrap CI on accuracy (mean_acc unused; CI bounds reported).
                per_sample_correct = (preds == targets).astype(float).tolist()
                mean_acc, ci_low, ci_high = bootstrap_confidence_interval(per_sample_correct)
                results["topic"]["accuracy_ci"] = [ci_low, ci_high]
    return results
# Main Pipeline
def set_seed(seed: int) -> None:
    """Seed Python, NumPy, and torch RNGs (incl. all CUDA devices) for reproducibility."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
def load_data(config: BertBaselineConfig):
    """Load emotion/topic splits and fit label encoders on the training data.

    For each task, falls back from ``validation.jsonl`` to ``val.jsonl`` when
    the former is missing. Returns a dict holding the four example lists plus
    the fitted ``MultiLabelBinarizer`` (emotion) and ``LabelEncoder`` (topic),
    both fit on training data only — the same protocol as LexiMind.
    """
    data_dir = config.data_dir

    def _val_file(task: str) -> Path:
        # Prefer "validation.jsonl"; some exports name the split "val.jsonl".
        candidate = data_dir / task / "validation.jsonl"
        return candidate if candidate.exists() else data_dir / task / "val.jsonl"

    emo_train = load_emotion_jsonl(str(data_dir / "emotion" / "train.jsonl"))
    emo_val = load_emotion_jsonl(str(_val_file("emotion")))
    top_train = load_topic_jsonl(str(data_dir / "topic" / "train.jsonl"))
    top_val = load_topic_jsonl(str(_val_file("topic")))

    # Fit label encoders on training data (same as LexiMind).
    binarizer = MultiLabelBinarizer()
    binarizer.fit([ex.emotions for ex in emo_train])
    label_encoder = LabelEncoder()
    label_encoder.fit([ex.topic for ex in top_train])

    print(
        f" Emotion: {len(emo_train)} train, {len(emo_val)} val, {len(binarizer.classes_)} classes"
    )
    print(
        f" Topic: {len(top_train)} train, {len(top_val)} val, {len(label_encoder.classes_)} classes"
    )
    print(f" Emotion classes: {list(binarizer.classes_)[:5]}...")
    print(f" Topic classes: {list(label_encoder.classes_)}")

    return {
        "emotion_train": emo_train,
        "emotion_val": emo_val,
        "topic_train": top_train,
        "topic_val": top_val,
        "binarizer": binarizer,
        "label_encoder": label_encoder,
    }
def run_experiment(mode: str, config: BertBaselineConfig) -> Dict[str, Any]:
    """Run a single experiment (single-topic, single-emotion, or multitask).

    Loads data, builds loaders and a model for the mode's task set, trains
    with early stopping, reloads the best checkpoint, runs the full
    evaluation, and writes ``<mode>_results.json`` under ``config.output_dir``.
    """
    print(f"\n{'═' * 60}")
    print(f" BERT BASELINE EXPERIMENT: {mode.upper()}")
    print(f"{'═' * 60}")
    set_seed(config.seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f" Device: {device}")
    if torch.cuda.is_available():
        print(f" GPU: {torch.cuda.get_device_name()}")
        print(f" VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    # CUDA optimizations: cudnn autotune + TF32 matmuls.
    if torch.cuda.is_available():
        torch.backends.cudnn.benchmark = True
        if hasattr(torch.backends, "cuda"):
            torch.backends.cuda.matmul.allow_tf32 = True
    # Load tokenizer
    print(f"\n Loading tokenizer: {config.model_name}")
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    # Load data
    print(" Loading datasets...")
    data = load_data(config)
    # Determine tasks for this mode
    if mode == "single-topic":
        tasks = ["topic"]
    elif mode == "single-emotion":
        tasks = ["emotion"]
    else:
        tasks = ["emotion", "topic"]
    # Create datasets and loaders (validation uses 2x batch size — no grads).
    train_loaders: Dict[str, DataLoader] = {}
    val_loaders: Dict[str, DataLoader] = {}
    if "emotion" in tasks:
        emo_train_ds = BertEmotionDataset(
            data["emotion_train"], tokenizer, data["binarizer"], config.max_length
        )
        emo_val_ds = BertEmotionDataset(
            data["emotion_val"], tokenizer, data["binarizer"], config.max_length
        )
        train_loaders["emotion"] = DataLoader(
            emo_train_ds,
            batch_size=config.batch_size,
            shuffle=True,
            num_workers=4,
            pin_memory=True,
            persistent_workers=True,
        )
        val_loaders["emotion"] = DataLoader(
            emo_val_ds,
            batch_size=config.batch_size * 2,
            shuffle=False,
            num_workers=4,
            pin_memory=True,
        )
    if "topic" in tasks:
        top_train_ds = BertTopicDataset(
            data["topic_train"], tokenizer, data["label_encoder"], config.max_length
        )
        top_val_ds = BertTopicDataset(
            data["topic_val"], tokenizer, data["label_encoder"], config.max_length
        )
        train_loaders["topic"] = DataLoader(
            top_train_ds,
            batch_size=config.batch_size,
            shuffle=True,
            num_workers=4,
            pin_memory=True,
            persistent_workers=True,
        )
        val_loaders["topic"] = DataLoader(
            top_val_ds,
            batch_size=config.batch_size * 2,
            shuffle=False,
            num_workers=4,
            pin_memory=True,
        )
    # Create model
    print(f"\n Creating model with tasks: {tasks}")
    model = BertBaseline(
        model_name=config.model_name,
        num_emotions=len(data["binarizer"].classes_),
        num_topics=len(data["label_encoder"].classes_),
        tasks=tasks,
        freeze_layers=config.freeze_layers,
    ).to(device)
    # Train
    trainer = BertTrainer(model, config, train_loaders, val_loaders, device, mode)
    training_results = trainer.train()
    # Load best checkpoint for final evaluation
    best_path = config.checkpoint_dir / mode / "best.pt"
    if best_path.exists():
        print("\n Loading best checkpoint for final evaluation...")
        # weights_only=False because the checkpoint stores Python objects
        # (metrics/config); only load checkpoints written by this pipeline.
        checkpoint = torch.load(best_path, map_location=device, weights_only=False)
        model.load_state_dict(checkpoint["model_state_dict"])
    # Full evaluation
    print("\n Running final evaluation...")
    eval_results = evaluate_bert_model(
        model,
        val_loaders,
        device,
        config,
        emotion_classes=list(data["binarizer"].classes_) if "emotion" in tasks else None,
        topic_classes=list(data["label_encoder"].classes_) if "topic" in tasks else None,
    )
    # Combine results
    final_results = {
        "mode": mode,
        "model": config.model_name,
        "tasks": tasks,
        "training": training_results,
        "evaluation": eval_results,
    }
    # Save results
    output_path = config.output_dir / f"{mode}_results.json"
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Remove non-serializable fields (numpy scalars/arrays, "_"-prefixed keys).
    def make_serializable(obj):
        if isinstance(obj, dict):
            return {k: make_serializable(v) for k, v in obj.items() if not k.startswith("_")}
        if isinstance(obj, list):
            return [make_serializable(item) for item in obj]
        if isinstance(obj, (np.integer, np.int64)):
            return int(obj)
        if isinstance(obj, (np.floating, np.float64)):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return obj

    with open(output_path, "w") as f:
        json.dump(make_serializable(final_results), f, indent=2)
    print(f"\n Results saved to {output_path}")
    return final_results
def print_comparison_summary(all_results: Dict[str, Dict[str, Any]]) -> None:
    """Print a side-by-side comparison of all experiments.

    One column per trained mode plus a rightmost column of LexiMind reference
    numbers. Metrics missing from a result, or lacking a LexiMind reference,
    are shown as an em dash rather than a misleading 0.0000.

    Args:
        all_results: Mapping of mode name -> dict returned by
            ``run_experiment`` (expects "evaluation" and "training" keys).
    """
    print(f"\n{'═' * 70}")
    print(" BERT BASELINE COMPARISON SUMMARY")
    print(f"{'═' * 70}")
    # Header
    modes = list(all_results.keys())
    header = f"{'Metric':<30}" + "".join(f"{m:>16}" for m in modes) + f"{'LexiMind':>16}"
    print(f"\n {header}")
    print(f" {'─' * len(header)}")
    # LexiMind reference values. Keys must match f"{task}_{metric_name}" as
    # looked up below.
    # BUGFIX: the sample-avg key was "emotion_sample_f1", which never matched
    # the "emotion_sample_avg_f1" lookup, so the 0.3523 reference silently
    # printed as 0.0000.
    lexmind = {
        "topic_accuracy": 0.8571,
        "topic_macro_f1": 0.8539,
        "emotion_sample_avg_f1": 0.3523,
        "emotion_macro_f1": 0.1432,
        "emotion_micro_f1": 0.4430,
        "emotion_tuned_macro_f1": 0.2936,
    }

    def _print_rows(task: str, metrics: List[tuple]) -> None:
        """Print one formatted row per (metric_key, display_name) for *task*."""
        for metric_name, display_name in metrics:
            row = f" {display_name:<30}"
            for mode in modes:
                eval_data = all_results[mode].get("evaluation", {})
                val = eval_data.get(task, {}).get(metric_name, None)
                row += f"{val:>16.4f}" if val is not None else f"{'—':>16}"
            ref = lexmind.get(f"{task}_{metric_name}")
            # No reference value -> show an em dash, not 0.0000.
            row += f"{ref:>16.4f}" if ref is not None else f"{'—':>16}"
            print(row)

    # Topic metrics
    print(f"\n {'Topic Classification':}")
    _print_rows(
        "topic",
        [
            ("accuracy", "Accuracy"),
            ("macro_f1", "Macro F1"),
        ],
    )
    # Emotion metrics
    print(f"\n {'Emotion Detection':}")
    _print_rows(
        "emotion",
        [
            ("sample_avg_f1", "Sample-avg F1 (τ=0.3)"),
            ("macro_f1", "Macro F1 (τ=0.3)"),
            ("micro_f1", "Micro F1 (τ=0.3)"),
            ("tuned_macro_f1", "Tuned Macro F1"),
            ("tuned_sample_avg_f1", "Tuned Sample-avg F1"),
        ],
    )
    # Training time
    print(f"\n {'Training Time':}")
    row = f" {'Hours':<30}"
    for mode in modes:
        t = all_results[mode].get("training", {}).get("total_time_seconds", 0) / 3600
        row += f"{t:>15.1f}h"
    row += f"{'~9.0h':>16}"
    print(row)
    print(f"\n{'═' * 70}\n")
def main():
    """CLI entry point: parse arguments, run experiment(s), report results."""
    parser = argparse.ArgumentParser(description="BERT Baseline Training for LexiMind")
    parser.add_argument(
        "--mode",
        type=str,
        required=True,
        choices=["single-topic", "single-emotion", "multitask", "all"],
        help="Training mode",
    )
    parser.add_argument("--epochs", type=int, default=None, help="Override max epochs")
    parser.add_argument("--lr", type=float, default=None, help="Override learning rate")
    parser.add_argument("--batch-size", type=int, default=None, help="Override batch size")
    parser.add_argument(
        "--model", type=str, default="bert-base-uncased", help="HuggingFace model name"
    )
    args = parser.parse_args()

    # Apply CLI overrides on top of the defaults.
    config = BertBaselineConfig()
    config.model_name = args.model
    if args.epochs is not None:
        config.max_epochs = args.epochs
    if args.lr is not None:
        config.lr = args.lr
    if args.batch_size is not None:
        config.batch_size = args.batch_size

    modes = (
        ["single-topic", "single-emotion", "multitask"] if args.mode == "all" else [args.mode]
    )

    all_results: Dict[str, Dict[str, Any]] = {}
    for mode in modes:
        all_results[mode] = run_experiment(mode, config)
        # Release cached GPU memory between experiments.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # With multiple experiments, persist a combined results file.
    if len(all_results) > 1:
        combined_path = config.output_dir / "combined_results.json"

        def make_serializable(obj):
            """Recursively drop '_'-keys and convert numpy scalars/arrays."""
            if isinstance(obj, dict):
                return {k: make_serializable(v) for k, v in obj.items() if not k.startswith("_")}
            if isinstance(obj, list):
                return [make_serializable(item) for item in obj]
            if isinstance(obj, (np.integer, np.int64)):
                return int(obj)
            if isinstance(obj, (np.floating, np.float64)):
                return float(obj)
            if isinstance(obj, np.ndarray):
                return obj.tolist()
            return obj

        with open(combined_path, "w") as f:
            json.dump(make_serializable(all_results), f, indent=2)
        print(f" Combined results saved to {combined_path}")

    # Summary prints for every run (single column when only one mode ran).
    print_comparison_summary(all_results)
if __name__ == "__main__":
main()