# Source: repository "evaluator", file evaluators/evaluator.py
# Last commit: "Update evaluators/evaluator.py" (c4d24a3, verified), committed by kleervoyans
import logging
import evaluate
class TranslationEvaluator:
    """Score machine-translation output with BLEU, BERTScore and COMET.

    The metric objects are loaded once in the constructor through
    ``evaluate.load`` and reused by every :meth:`evaluate` call.
    """

    def __init__(self):
        """Load the BLEU, BERTScore and COMET metric modules."""
        self.bleu = evaluate.load("bleu")
        self.bertscore = evaluate.load("bertscore")
        # COMET MQM model
        # NOTE(review): the `comet` module in `evaluate` appears to select its
        # checkpoint through the second positional argument (config_name); the
        # `model_id` keyword may be silently ignored -- confirm which
        # checkpoint is actually loaded before relying on the COMET numbers.
        self.comet = evaluate.load("comet", model_id="unbabel/comet-mqm-qe-da")
        logging.info("Loaded BLEU, BERTScore, COMET metrics")

    def evaluate(self, sources, references, predictions):
        """Compute corpus-level translation quality scores.

        Args:
            sources: List[str] of source-language segments.
            references: List[str] of reference translations, parallel to
                ``sources``.
            predictions: List[str] of system translations, parallel to
                ``sources``.

        Returns:
            A dict ``{"BLEU": float, "BERTScore": float, "BERTurk": float,
            "COMET": float}``. Every value is 0.0 when the inputs are empty.

        Raises:
            ValueError: If the three inputs do not have the same length.
        """
        if not (len(sources) == len(references) == len(predictions)):
            raise ValueError(
                "sources, references and predictions must have the same length "
                f"(got {len(sources)}, {len(references)}, {len(predictions)})"
            )
        if len(predictions) == 0:
            # Nothing to score; mirror the 0.0 fallback used for empty results
            # instead of letting the metric back-ends fail on empty corpora.
            return {"BLEU": 0.0, "BERTScore": 0.0, "BERTurk": 0.0, "COMET": 0.0}

        results = {}

        # BLEU: each prediction is paired with a single-element reference list.
        results["BLEU"] = self.bleu.compute(
            predictions=predictions,
            references=[[r] for r in references],
        )["bleu"]

        # BERTScore (general, lang="xx")
        results["BERTScore"] = self._mean_bertscore_f1(predictions, references, "xx")
        # BERTurk (lang="tr")
        results["BERTurk"] = self._mean_bertscore_f1(predictions, references, "tr")

        # COMET: the metric's input columns are named sources / predictions /
        # references; other keyword names are rejected as missing inputs.
        comet_out = self.comet.compute(
            sources=sources,
            predictions=predictions,
            references=references,
        )
        results["COMET"] = self._corpus_comet(comet_out)
        return results

    def _mean_bertscore_f1(self, predictions, references, lang):
        """Return the mean BERTScore F1 over all segments for ``lang``."""
        f1 = self.bertscore.compute(
            predictions=predictions,
            references=references,
            lang=lang,
        )["f1"]
        return float(sum(f1) / len(f1)) if f1 else 0.0

    @staticmethod
    def _corpus_comet(comet_out):
        """Reduce a COMET result dict to a single corpus-level float."""
        mean_score = comet_out.get("mean_score")
        if mean_score is not None:
            return float(mean_score)
        scores = comet_out.get("scores")
        if scores is None:
            return 0.0
        if isinstance(scores, (list, tuple)):
            # Average every segment score rather than reporting only the first.
            return float(sum(scores) / len(scores)) if scores else 0.0
        return float(scores)