""" GLEU (Generalized Language Evaluation Understanding) score. Preferred over BLEU for grammatical error correction tasks. Also computes BERTScore for semantic similarity evaluation. """ import sacrebleu from bert_score import score as bert_score_fn from typing import List, Tuple from loguru import logger class GLEUScorer: """Computes GLEU and BERTScore metrics for GEC evaluation.""" def compute_gleu( self, predictions: List[str], references: List[str], ) -> float: """Corpus-level GLEU score (0-100). GLEU is the geometric mean of n-gram precisions and recall, preferred over BLEU for GEC because it equally penalises both under-correction and over-correction. """ if not predictions or not references: return 0.0 # sacrebleu expects references as a list of lists refs = [references] # Use BLEU with smoothing as GLEU approximation # sacrebleu doesn't have a native GLEU, so we use smoothed BLEU bleu = sacrebleu.corpus_bleu( predictions, refs, smooth_method="exp", smooth_value=0.1, ) return bleu.score def compute_bert_score( self, predictions: List[str], references: List[str], lang: str = "en", ) -> Tuple[float, float, float]: """Returns (precision, recall, F1) as averages over the batch.""" if not predictions or not references: return (0.0, 0.0, 0.0) try: P, R, F1 = bert_score_fn( predictions, references, lang=lang, verbose=False, device="cpu", # CPU-optimised ) return ( P.mean().item(), R.mean().item(), F1.mean().item(), ) except Exception as e: logger.warning(f"BERTScore computation failed: {e}") return (0.0, 0.0, 0.0)