| """ |
| GLEU (Generalized Language Evaluation Understanding) score. |
| Preferred over BLEU for grammatical error correction tasks. |
| Also computes BERTScore for semantic similarity evaluation. |
| """ |
|
|
| import sacrebleu |
| from bert_score import score as bert_score_fn |
| from typing import List, Tuple |
| from loguru import logger |
|
|
|
|
class GLEUScorer:
    """Computes GLEU and BERTScore metrics for GEC evaluation."""

    def compute_gleu(
        self,
        predictions: List[str],
        references: List[str],
    ) -> float:
        """Return the corpus-level score (0-100) reported as GLEU.

        The value is obtained from ``sacrebleu.corpus_bleu`` with
        exponential smoothing against a single reference stream, i.e. it is
        a smoothed corpus BLEU used as the GLEU figure; no separate GLEU
        implementation is invoked here.

        Args:
            predictions: System outputs, one string per segment.
            references: Reference texts, aligned index-by-index with
                ``predictions``.

        Returns:
            ``0.0`` when either list is empty, otherwise the ``score``
            attribute of the sacrebleu result.

        Raises:
            ValueError: If ``predictions`` and ``references`` have
                different lengths, since a corpus score over misaligned
                pairs would be meaningless.
        """
        if not predictions or not references:
            return 0.0

        if len(predictions) != len(references):
            raise ValueError(
                "predictions and references must have the same length "
                f"(got {len(predictions)} and {len(references)})"
            )

        # sacrebleu takes a list of reference streams; there is exactly one
        # reference per prediction here, hence a single-element outer list.
        refs = [references]

        # NOTE(review): sacrebleu documents ``smooth_value`` for the
        # 'floor' and 'add-k' methods; it probably has no effect with
        # 'exp' - confirm before relying on it.
        bleu = sacrebleu.corpus_bleu(
            predictions,
            refs,
            smooth_method="exp",
            smooth_value=0.1,
        )
        return bleu.score

    def compute_bert_score(
        self,
        predictions: List[str],
        references: List[str],
        lang: str = "en",
        device: str = "cpu",
    ) -> Tuple[float, float, float]:
        """Return BERTScore (precision, recall, F1) averaged over the batch.

        This method is best-effort: any failure is logged as a warning and
        reported as ``(0.0, 0.0, 0.0)`` instead of propagating.

        Args:
            predictions: System outputs, one string per segment.
            references: Reference texts, aligned index-by-index with
                ``predictions``.
            lang: Language code forwarded to ``bert_score``.
            device: Device string forwarded to ``bert_score``; defaults to
                ``"cpu"``, the value that was previously hard-coded.

        Returns:
            A ``(precision, recall, f1)`` tuple of floats, each the mean of
            the per-segment values; all zeros when either input is empty,
            the lengths differ, or the computation raises.
        """
        zeros = (0.0, 0.0, 0.0)
        if not predictions or not references:
            return zeros

        # Check alignment explicitly instead of relying on the callee to
        # reject mismatched inputs; keep the same warn-and-zero contract.
        if len(predictions) != len(references):
            logger.warning(
                "BERTScore computation failed: "
                f"got {len(predictions)} predictions "
                f"but {len(references)} references"
            )
            return zeros

        try:
            P, R, F1 = bert_score_fn(
                predictions,
                references,
                lang=lang,
                verbose=False,
                device=device,
            )
            return (
                P.mean().item(),
                R.mean().item(),
                F1.mean().item(),
            )
        except Exception as e:
            # Deliberate catch-all: a metric failure must not abort the
            # surrounding evaluation run.
            logger.warning(f"BERTScore computation failed: {e}")
            return zeros
|
|