"""
Training and evaluation metrics for LexiMind.

Provides metric computation utilities for all task types: accuracy for topic
classification, multi-label F1 for emotion detection, and ROUGE/BLEU/BERTScore
for summarization quality assessment.

Author: Oliver Perrin
Date: December 2025
"""

from __future__ import annotations

from typing import Any, Dict, List, Sequence, cast

import numpy as np
import torch
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support


def accuracy(predictions: Sequence[int | str], targets: Sequence[int | str]) -> float:
    """Compute plain accuracy for single-label classification."""
    return cast(float, accuracy_score(targets, predictions))


def multilabel_f1(predictions: torch.Tensor, targets: torch.Tensor) -> float:
    """Compute example-averaged F1 for multi-label classification.

    Both tensors are (batch, num_labels) binary indicator matrices; F1 is
    computed per example and then averaged over the batch.
    """
    preds = predictions.float()
    gold = targets.float()
    true_positive = (preds * gold).sum(dim=1)
    # clamp(min=1.0) guards against division by zero when an example has no
    # predicted or no gold labels (true_positive is 0 there anyway).
    precision = true_positive / preds.sum(dim=1).clamp(min=1.0)
    recall = true_positive / gold.sum(dim=1).clamp(min=1.0)
    f1 = (2 * precision * recall) / (precision + recall).clamp(min=1e-8)
    return float(f1.mean().item())
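

# Hedged usage sketch (illustrative, not part of the module API): multilabel_f1
# expects binary indicator tensors, so raw model logits must be thresholded
# first. The 0.5 sigmoid cutoff below is an assumed default, not a tuned value.
#
#     probs = torch.sigmoid(logits)          # logits: (batch, num_labels)
#     preds = (probs > 0.5).long()
#     score = multilabel_f1(preds, gold_labels)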


def rouge_like(predictions: Sequence[str], references: Sequence[str]) -> float:
    """Cheap ROUGE-1-recall proxy: unigram-set overlap over reference length.

    Fast enough to run every validation step; use calculate_rouge for the
    proper stemmed ROUGE-1/2/L scores.
    """
    if not predictions or not references:
        return 0.0
    scores = []
    for pred, ref in zip(predictions, references, strict=False):
        pred_tokens = pred.split()
        ref_tokens = ref.split()
        if not ref_tokens:
            scores.append(0.0)
            continue
        overlap = len(set(pred_tokens) & set(ref_tokens))
        scores.append(overlap / len(ref_tokens))
    return sum(scores) / len(scores)


def calculate_bleu(predictions: Sequence[str], references: Sequence[str]) -> float:
    """Calculate BLEU-4 score."""
    if not predictions or not references:
        return 0.0

    smoother = SmoothingFunction().method1
    scores = []
    for pred, ref in zip(predictions, references, strict=False):
        pred_tokens = pred.split()
        ref_tokens = [ref.split()]  # BLEU expects list of references
        scores.append(sentence_bleu(ref_tokens, pred_tokens, smoothing_function=smoother))

    return cast(float, sum(scores) / len(scores))


def calculate_bertscore(
    predictions: Sequence[str],
    references: Sequence[str],
    model_type: str = "roberta-large",  # Uses ~1.4GB VRAM vs ~6GB for deberta-xlarge
    batch_size: int = 16,
    device: str | None = None,
) -> Dict[str, float]:
    """
    Calculate BERTScore for semantic similarity between predictions and references.
    
    BERTScore measures semantic similarity using contextual embeddings, making it
    more robust than n-gram based metrics like ROUGE for paraphrased content.
    
    Args:
        predictions: Generated summaries/descriptions
        references: Reference summaries/descriptions
        model_type: BERT model to use (default: roberta-large; deberta-xlarge-mnli
            scores higher but needs ~6GB VRAM)
        batch_size: Batch size for encoding
        device: Device to use (auto-detected if None)
    
    Returns:
        Dict with 'precision', 'recall', 'f1' BERTScore averages
    """
    if not predictions or not references:
        return {"precision": 0.0, "recall": 0.0, "f1": 0.0}
    
    try:
        from bert_score import score as bert_score
    except ImportError:
        print("Warning: bert-score not installed. Run: pip install bert-score")
        return {"precision": 0.0, "recall": 0.0, "f1": 0.0}
    
    # Auto-detect device
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    
    # Calculate BERTScore
    P, R, F1 = bert_score(
        list(predictions),
        list(references),
        model_type=model_type,
        batch_size=batch_size,
        device=device,
        verbose=False,
    )
    
    return {
        "precision": float(P.mean().item()),
        "recall": float(R.mean().item()),
        "f1": float(F1.mean().item()),
    }
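

# Hedged usage sketch (illustrative strings, assuming bert-score is installed):
#
#     scores = calculate_bertscore(
#         ["a gripping tale of loss"],
#         ["a moving story about grief"],
#         device="cpu",  # force CPU for a quick check
#     )
#     scores["f1"]  # averaged BERTScore F1 over the batch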


def calculate_rouge(
    predictions: Sequence[str],
    references: Sequence[str],
) -> Dict[str, float]:
    """
    Calculate proper ROUGE scores (ROUGE-1, ROUGE-2, ROUGE-L).
    
    Args:
        predictions: Generated summaries
        references: Reference summaries
    
    Returns:
        Dict with rouge1, rouge2, rougeL F1 scores
    """
    if not predictions or not references:
        return {"rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0}
    
    try:
        from rouge_score import rouge_scorer
    except ImportError:
        print("Warning: rouge-score not installed. Run: pip install rouge-score")
        return {"rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0}
    
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    
    rouge1_scores = []
    rouge2_scores = []
    rougeL_scores = []
    
    for pred, ref in zip(predictions, references, strict=False):
        scores = scorer.score(ref, pred)
        rouge1_scores.append(scores["rouge1"].fmeasure)
        rouge2_scores.append(scores["rouge2"].fmeasure)
        rougeL_scores.append(scores["rougeL"].fmeasure)
    
    return {
        "rouge1": sum(rouge1_scores) / len(rouge1_scores),
        "rouge2": sum(rouge2_scores) / len(rouge2_scores),
        "rougeL": sum(rougeL_scores) / len(rougeL_scores),
    }


def calculate_all_summarization_metrics(
    predictions: Sequence[str],
    references: Sequence[str],
    include_bertscore: bool = True,
    bertscore_model: str = "microsoft/deberta-xlarge-mnli",
) -> Dict[str, float]:
    """
    Calculate comprehensive summarization metrics for research paper reporting.
    
    Includes:
    - ROUGE-1, ROUGE-2, ROUGE-L (lexical overlap)
    - BLEU-4 (n-gram precision)
    - BERTScore (semantic similarity)
    
    Args:
        predictions: Generated summaries/descriptions
        references: Reference summaries/descriptions
        include_bertscore: Whether to compute BERTScore (slower but valuable)
        bertscore_model: Model for BERTScore computation
    
    Returns:
        Dict with all metric scores
    """
    metrics: Dict[str, float] = {}
    
    # ROUGE scores
    rouge_scores = calculate_rouge(predictions, references)
    metrics.update({f"rouge_{k}": v for k, v in rouge_scores.items()})
    
    # BLEU score
    metrics["bleu4"] = calculate_bleu(predictions, references)
    
    # BERTScore (semantic similarity - important for back-cover style descriptions)
    if include_bertscore:
        bert_scores = calculate_bertscore(
            predictions, references, model_type=bertscore_model
        )
        metrics.update({f"bertscore_{k}": v for k, v in bert_scores.items()})
    
    return metrics
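

# Hedged usage sketch: with the key prefixing above, the returned dict carries
# "rouge_rouge1", "rouge_rouge2", "rouge_rougeL", "bleu4", and, when
# include_bertscore=True, "bertscore_precision"/"bertscore_recall"/"bertscore_f1".
# The strings below are illustrative only.
#
#     metrics = calculate_all_summarization_metrics(
#         ["the cat sat on the mat"],
#         ["a cat was sitting on the mat"],
#         include_bertscore=False,  # skip the model download for a fast check
#     )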


def classification_report_dict(
    predictions: Sequence[int | str], targets: Sequence[int | str], labels: List[str] | None = None
) -> Dict[str, Any]:
    """Generate a comprehensive classification report."""
    precision, recall, f1, support = precision_recall_fscore_support(
        targets, predictions, labels=labels, average=None, zero_division=0
    )
    # Type hint help for static analysis since average=None returns arrays
    precision = cast(np.ndarray, precision)
    recall = cast(np.ndarray, recall)
    f1 = cast(np.ndarray, f1)
    support = cast(np.ndarray, support)

    report: Dict[str, Any] = {}
    if labels:
        for i, label in enumerate(labels):
            report[label] = {
                "precision": float(precision[i]),
                "recall": float(recall[i]),
                "f1-score": float(f1[i]),
                "support": int(support[i]),
            }

    # Macro average
    report["macro avg"] = {
        "precision": float(np.mean(precision)),
        "recall": float(np.mean(recall)),
        "f1-score": float(np.mean(f1)),
        "support": int(np.sum(support)),
    }

    return report


def get_confusion_matrix(
    predictions: Sequence[int | str], targets: Sequence[int | str], labels: List[str] | None = None
) -> np.ndarray:
    """Compute confusion matrix."""
    return cast(np.ndarray, confusion_matrix(targets, predictions, labels=labels))
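

if __name__ == "__main__":
    # Minimal smoke test on tiny hand-made inputs (a sketch, not part of the
    # public API): exercises only the dependency-light metrics, so it avoids
    # the optional bert-score / rouge-score model downloads.
    demo_preds = torch.tensor([[1, 0, 1], [0, 1, 0]])
    demo_gold = torch.tensor([[1, 0, 0], [0, 1, 0]])
    print("accuracy:      ", accuracy([0, 1, 1], [0, 1, 0]))
    print("multilabel_f1: ", multilabel_f1(demo_preds, demo_gold))
    print("rouge_like:    ", rouge_like(["the cat sat"], ["the cat sat down"]))
    print("bleu4:         ", calculate_bleu(["the cat sat on the mat"],
                                            ["the cat sat on a mat"]))
    print("report:        ", classification_report_dict([0, 1, 1], [0, 1, 0]))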