rewrite / src /evaluation /gleu_scorer.py
morpheuslord's picture
Add files using upload-large-folder tool
12fd5f2 verified
"""
GLEU (Generalized Language Evaluation Understanding) score.
Preferred over BLEU for grammatical error correction tasks.
Also computes BERTScore for semantic similarity evaluation.
"""
import sacrebleu
from bert_score import score as bert_score_fn
from typing import List, Tuple
from loguru import logger
class GLEUScorer:
    """Computes GLEU-style and BERTScore metrics for GEC evaluation.

    The scorer keeps no state. Both methods take two parallel lists in
    which ``predictions[i]`` is the system output for ``references[i]``.
    """

    def compute_gleu(
        self,
        predictions: List[str],
        references: List[str],
    ) -> float:
        """Return a corpus-level GLEU approximation (0-100).

        sacrebleu has no native GLEU, so the value is corpus-level BLEU
        with exponential smoothing, used as a stand-in for GLEU.

        Args:
            predictions: System outputs, one string per sentence.
            references: Gold corrections aligned with ``predictions``
                (a single reference per sentence).

        Returns:
            The ``score`` of the sacrebleu result, or ``0.0`` when either
            list is empty.

        Raises:
            ValueError: If both lists are non-empty but differ in length.
        """
        if not predictions or not references:
            return 0.0

        # A corpus-level score is only meaningful over aligned pairs, so a
        # misaligned corpus is reported up-front instead of being handed on.
        if len(predictions) != len(references):
            raise ValueError(
                "predictions and references must have the same length "
                f"(got {len(predictions)} and {len(references)})"
            )

        # sacrebleu takes references as a list of reference streams; there
        # is exactly one stream here.
        reference_streams = [references]

        # NOTE(review): sacrebleu documents ``smooth_value`` for the "floor"
        # and "add-k" methods; confirm it has any effect with "exp".
        result = sacrebleu.corpus_bleu(
            predictions,
            reference_streams,
            smooth_method="exp",
            smooth_value=0.1,
        )
        return result.score

    def compute_bert_score(
        self,
        predictions: List[str],
        references: List[str],
        lang: str = "en",
        device: str = "cpu",
    ) -> Tuple[float, float, float]:
        """Return ``(precision, recall, F1)`` averaged over the batch.

        This is best-effort: any exception raised while scoring is logged
        as a warning and reported as ``(0.0, 0.0, 0.0)`` rather than
        propagated.

        Args:
            predictions: System outputs, one string per sentence.
            references: Gold corrections aligned with ``predictions``.
            lang: Language code forwarded to BERTScore.
            device: Device string forwarded to BERTScore. Defaults to
                ``"cpu"``, the value that was previously hard-coded.

        Returns:
            The mean precision, recall and F1 as plain floats, or all
            zeros when either list is empty or scoring fails.
        """
        if not predictions or not references:
            return (0.0, 0.0, 0.0)

        try:
            precision, recall, f1 = bert_score_fn(
                predictions,
                references,
                lang=lang,
                verbose=False,
                device=device,
            )
            # Reduce the per-sentence results to plain Python floats.
            return (
                precision.mean().item(),
                recall.mean().item(),
                f1.mean().item(),
            )
        except Exception as e:
            logger.warning(f"BERTScore computation failed: {e}")
            return (0.0, 0.0, 0.0)