""" Training and evaluation metrics for LexiMind. Provides metric computation utilities for all task types: accuracy for topic classification, multi-label F1 for emotion detection, and ROUGE/BLEU/BERTScore for summarization quality assessment. Author: Oliver Perrin Date: December 2025 """ from __future__ import annotations from typing import Any, Dict, List, Sequence, cast import numpy as np import torch from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support def accuracy(predictions: Sequence[int | str], targets: Sequence[int | str]) -> float: return cast(float, accuracy_score(targets, predictions)) def multilabel_f1(predictions: torch.Tensor, targets: torch.Tensor) -> float: preds = predictions.float() gold = targets.float() true_positive = (preds * gold).sum(dim=1) precision = true_positive / (preds.sum(dim=1).clamp(min=1.0)) recall = true_positive / (gold.sum(dim=1).clamp(min=1.0)) f1 = (2 * precision * recall) / (precision + recall).clamp(min=1e-8) return float(f1.mean().item()) def rouge_like(predictions: Sequence[str], references: Sequence[str]) -> float: if not predictions or not references: return 0.0 scores = [] for pred, ref in zip(predictions, references, strict=False): pred_tokens = pred.split() ref_tokens = ref.split() if not ref_tokens: scores.append(0.0) continue overlap = len(set(pred_tokens) & set(ref_tokens)) scores.append(overlap / len(ref_tokens)) return sum(scores) / len(scores) def calculate_bleu(predictions: Sequence[str], references: Sequence[str]) -> float: """Calculate BLEU-4 score.""" if not predictions or not references: return 0.0 smoother = SmoothingFunction().method1 scores = [] for pred, ref in zip(predictions, references, strict=False): pred_tokens = pred.split() ref_tokens = [ref.split()] # BLEU expects list of references scores.append(sentence_bleu(ref_tokens, pred_tokens, smoothing_function=smoother)) return cast(float, sum(scores) / len(scores)) def calculate_bertscore( predictions: Sequence[str], references: Sequence[str], model_type: str = "roberta-large", # Uses ~1.4GB VRAM vs ~6GB for deberta-xlarge batch_size: int = 16, device: str | None = None, ) -> Dict[str, float]: """ Calculate BERTScore for semantic similarity between predictions and references. BERTScore measures semantic similarity using contextual embeddings, making it more robust than n-gram based metrics like ROUGE for paraphrased content. Args: predictions: Generated summaries/descriptions references: Reference summaries/descriptions model_type: BERT model to use (default: deberta-xlarge-mnli for best quality) batch_size: Batch size for encoding device: Device to use (auto-detected if None) Returns: Dict with 'precision', 'recall', 'f1' BERTScore averages """ if not predictions or not references: return {"precision": 0.0, "recall": 0.0, "f1": 0.0} try: from bert_score import score as bert_score except ImportError: print("Warning: bert-score not installed. 
def rouge_like(predictions: Sequence[str], references: Sequence[str]) -> float:
    """Compute a lightweight, ROUGE-1-recall-style unigram overlap score."""
    if not predictions or not references:
        return 0.0
    scores = []
    for pred, ref in zip(predictions, references, strict=False):
        pred_tokens = pred.split()
        ref_tokens = ref.split()
        if not ref_tokens:
            scores.append(0.0)
            continue
        overlap = len(set(pred_tokens) & set(ref_tokens))
        scores.append(overlap / len(ref_tokens))
    return sum(scores) / len(scores)


def calculate_bleu(predictions: Sequence[str], references: Sequence[str]) -> float:
    """Calculate BLEU-4 score."""
    if not predictions or not references:
        return 0.0
    smoother = SmoothingFunction().method1
    scores = []
    for pred, ref in zip(predictions, references, strict=False):
        pred_tokens = pred.split()
        ref_tokens = [ref.split()]  # BLEU expects a list of references
        scores.append(sentence_bleu(ref_tokens, pred_tokens, smoothing_function=smoother))
    return cast(float, sum(scores) / len(scores))


def calculate_bertscore(
    predictions: Sequence[str],
    references: Sequence[str],
    model_type: str = "roberta-large",  # Uses ~1.4GB VRAM vs ~6GB for deberta-xlarge
    batch_size: int = 16,
    device: str | None = None,
) -> Dict[str, float]:
    """
    Calculate BERTScore for semantic similarity between predictions and references.

    BERTScore measures semantic similarity using contextual embeddings, making it
    more robust than n-gram based metrics like ROUGE for paraphrased content.

    Args:
        predictions: Generated summaries/descriptions
        references: Reference summaries/descriptions
        model_type: Model to use (default: roberta-large; pass
            microsoft/deberta-xlarge-mnli for best quality at a higher VRAM cost)
        batch_size: Batch size for encoding
        device: Device to use (auto-detected if None)

    Returns:
        Dict with 'precision', 'recall', 'f1' BERTScore averages
    """
    if not predictions or not references:
        return {"precision": 0.0, "recall": 0.0, "f1": 0.0}

    try:
        from bert_score import score as bert_score
    except ImportError:
        print("Warning: bert-score not installed. Run: pip install bert-score")
        return {"precision": 0.0, "recall": 0.0, "f1": 0.0}

    # Auto-detect device
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # Calculate BERTScore
    P, R, F1 = bert_score(
        list(predictions),
        list(references),
        model_type=model_type,
        batch_size=batch_size,
        device=device,
        verbose=False,
    )

    return {
        "precision": float(P.mean().item()),
        "recall": float(R.mean().item()),
        "f1": float(F1.mean().item()),
    }


def calculate_rouge(
    predictions: Sequence[str],
    references: Sequence[str],
) -> Dict[str, float]:
    """
    Calculate proper ROUGE scores (ROUGE-1, ROUGE-2, ROUGE-L).

    Args:
        predictions: Generated summaries
        references: Reference summaries

    Returns:
        Dict with rouge1, rouge2, rougeL F1 scores
    """
    if not predictions or not references:
        return {"rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0}

    try:
        from rouge_score import rouge_scorer
    except ImportError:
        print("Warning: rouge-score not installed. Run: pip install rouge-score")
        return {"rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0}

    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

    rouge1_scores = []
    rouge2_scores = []
    rougeL_scores = []
    for pred, ref in zip(predictions, references, strict=False):
        scores = scorer.score(ref, pred)
        rouge1_scores.append(scores["rouge1"].fmeasure)
        rouge2_scores.append(scores["rouge2"].fmeasure)
        rougeL_scores.append(scores["rougeL"].fmeasure)

    return {
        "rouge1": sum(rouge1_scores) / len(rouge1_scores),
        "rouge2": sum(rouge2_scores) / len(rouge2_scores),
        "rougeL": sum(rougeL_scores) / len(rougeL_scores),
    }


def calculate_all_summarization_metrics(
    predictions: Sequence[str],
    references: Sequence[str],
    include_bertscore: bool = True,
    bertscore_model: str = "microsoft/deberta-xlarge-mnli",
) -> Dict[str, float]:
    """
    Calculate comprehensive summarization metrics for research paper reporting.

    Includes:
    - ROUGE-1, ROUGE-2, ROUGE-L (lexical overlap)
    - BLEU-4 (n-gram precision)
    - BERTScore (semantic similarity)

    Args:
        predictions: Generated summaries/descriptions
        references: Reference summaries/descriptions
        include_bertscore: Whether to compute BERTScore (slower but valuable)
        bertscore_model: Model for BERTScore computation

    Returns:
        Dict with all metric scores
    """
    metrics: Dict[str, float] = {}

    # ROUGE scores
    rouge_scores = calculate_rouge(predictions, references)
    metrics.update({f"rouge_{k}": v for k, v in rouge_scores.items()})

    # BLEU score
    metrics["bleu4"] = calculate_bleu(predictions, references)

    # BERTScore (semantic similarity - important for back-cover style descriptions)
    if include_bertscore:
        bert_scores = calculate_bertscore(predictions, references, model_type=bertscore_model)
        metrics.update({f"bertscore_{k}": v for k, v in bert_scores.items()})

    return metrics
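
# Example call (a sketch with made-up strings; BERTScore is disabled so no
# checkpoint is downloaded):
#
#     metrics = calculate_all_summarization_metrics(
#         predictions=["the cat sat on the mat"],
#         references=["a cat was sitting on the mat"],
#         include_bertscore=False,
#     )
#
# The result maps metric names to floats, with ROUGE keys carrying a "rouge_"
# prefix and BLEU reported under "bleu4".
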
def classification_report_dict(
    predictions: Sequence[int | str],
    targets: Sequence[int | str],
    labels: List[str] | None = None,
) -> Dict[str, Any]:
    """Generate a comprehensive classification report."""
    precision, recall, f1, support = precision_recall_fscore_support(
        targets, predictions, labels=labels, average=None, zero_division=0
    )
    # Type hint help for static analysis since average=None returns arrays
    precision = cast(np.ndarray, precision)
    recall = cast(np.ndarray, recall)
    f1 = cast(np.ndarray, f1)
    support = cast(np.ndarray, support)

    report = {}
    if labels:
        for i, label in enumerate(labels):
            report[label] = {
                "precision": float(precision[i]),
                "recall": float(recall[i]),
                "f1-score": float(f1[i]),
                "support": int(support[i]),
            }

    # Macro average
    report["macro avg"] = {
        "precision": float(np.mean(precision)),
        "recall": float(np.mean(recall)),
        "f1-score": float(np.mean(f1)),
        "support": int(np.sum(support)),
    }

    return report


def get_confusion_matrix(
    predictions: Sequence[int | str],
    targets: Sequence[int | str],
    labels: List[str] | None = None,
) -> np.ndarray:
    """Compute confusion matrix."""
    return cast(np.ndarray, confusion_matrix(targets, predictions, labels=labels))
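

# Minimal smoke test (a sketch with illustrative toy data, not a benchmark).
# BERTScore is disabled so the demo runs without downloading a model, and
# calculate_rouge degrades gracefully if rouge-score is not installed.
if __name__ == "__main__":
    preds = ["the cat sat on the mat", "a stormy night at sea"]
    refs = ["a cat was sitting on the mat", "it was a dark and stormy night at sea"]

    print("accuracy:", accuracy([0, 1, 1], [0, 1, 0]))
    print(
        "multilabel_f1:",
        multilabel_f1(torch.tensor([[1.0, 0.0, 1.0]]), torch.tensor([[1.0, 1.0, 0.0]])),
    )
    print("rouge_like:", rouge_like(preds, refs))
    print(
        "summarization:",
        calculate_all_summarization_metrics(preds, refs, include_bertscore=False),
    )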