morpheuslord
/

rewrite

text2text-generation

grammar-correction

style-preservation

Model card Files Files and versions

Metrics Training metrics Community

rewrite / src /evaluation /gleu_scorer.py

morpheuslord's picture

Add files using upload-large-folder tool

12fd5f2 verified 25 days ago

history blame contribute delete

2.04 kB

	"""
	GLEU (Generalized Language Evaluation Understanding) score.
	Preferred over BLEU for grammatical error correction tasks.
	Note: sacrebleu has no native GLEU, so the score is approximated with smoothed corpus BLEU.
	Also computes BERTScore for semantic similarity evaluation.
	"""

	import sacrebleu
	from bert_score import score as bert_score_fn
	from typing import List, Tuple
	from loguru import logger


	class GLEUScorer:
	    """Computes a GLEU approximation and BERTScore metrics for GEC evaluation."""

	    def compute_gleu(
	        self,
	        predictions: List[str],
	        references: List[str],
	    ) -> float:
	        """Return a corpus-level GLEU approximation (0-100).

	        sacrebleu has no native GLEU implementation, so the value returned
	        here is sacrebleu's smoothed corpus BLEU used as a stand-in. It is
	        therefore not the n-gram precision/recall formulation of true GLEU.

	        Args:
	            predictions: System outputs, one string per segment.
	            references: Reference strings, expected to be aligned with
	                ``predictions``; they are passed to sacrebleu as a single
	                reference stream.

	        Returns:
	            The ``score`` attribute of the sacrebleu result, or ``0.0``
	            when either input list is empty.
	        """
	        # Nothing to score: short-circuit before touching sacrebleu.
	        if not predictions or not references:
	            return 0.0

	        # sacrebleu expects references as a list of reference streams;
	        # there is exactly one stream here.
	        reference_streams = [references]

	        # NOTE(review): sacrebleu documents smooth_value as applying to the
	        # "floor" / "add-k" methods; confirm whether it has any effect with
	        # smooth_method="exp". Kept unchanged to preserve current behaviour.
	        bleu = sacrebleu.corpus_bleu(
	            predictions,
	            reference_streams,
	            smooth_method="exp",
	            smooth_value=0.1,
	        )
	        return bleu.score

	    def compute_bert_score(
	        self,
	        predictions: List[str],
	        references: List[str],
	        lang: str = "en",
	        device: str = "cpu",
	    ) -> Tuple[float, float, float]:
	        """Return BERTScore (precision, recall, F1) averaged over the batch.

	        This is a best-effort metric: any failure inside the BERTScore
	        computation is logged as a warning and reported as zeros instead
	        of being raised to the caller.

	        Args:
	            predictions: System outputs, one string per segment.
	            references: Reference strings, forwarded to BERTScore together
	                with ``predictions``.
	            lang: Language code forwarded to BERTScore.
	            device: Device string forwarded to BERTScore. Defaults to
	                ``"cpu"``, which matches the previous hard-coded value.

	        Returns:
	            A ``(precision, recall, f1)`` tuple of floats, or
	            ``(0.0, 0.0, 0.0)`` when either input list is empty or the
	            computation fails.
	        """
	        if not predictions or not references:
	            return (0.0, 0.0, 0.0)

	        try:
	            precision, recall, f1 = bert_score_fn(
	                predictions,
	                references,
	                lang=lang,
	                verbose=False,
	                device=device,
	            )
	            # Collapse the per-segment results into plain Python floats.
	            return (
	                precision.mean().item(),
	                recall.mean().item(),
	                f1.mean().item(),
	            )
	        except Exception as e:
	            # Deliberate best-effort: a scoring failure must not abort the
	            # surrounding evaluation, so log it and fall back to zeros.
	            logger.warning(f"BERTScore computation failed: {e}")
	            return (0.0, 0.0, 0.0)