Spaces:
Sleeping
Sleeping
| import logging | |
| import evaluate | |
class TranslationEvaluator:
    """Score machine-translation output with BLEU, BERTScore and COMET.

    The metric modules are loaded once in ``__init__`` through
    ``evaluate.load`` and reused by every call to ``evaluate``.
    """

    def __init__(self):
        """Load the BLEU, BERTScore and COMET metric modules."""
        self.bleu = evaluate.load("bleu")
        self.bertscore = evaluate.load("bertscore")
        # COMET MQM model
        # NOTE(review): the `evaluate` COMET module appears to pick its
        # checkpoint from the config name (second positional argument of
        # `evaluate.load`), not from a `model_id` keyword -- confirm that this
        # call really loads the intended model and not the default one.
        self.comet = evaluate.load("comet", model_id="unbabel/comet-mqm-qe-da")
        logging.info("Loaded BLEU, BERTScore, COMET metrics")

    @staticmethod
    def _mean(values):
        """Return the arithmetic mean of ``values`` as a float (0.0 if empty)."""
        return float(sum(values) / len(values)) if values else 0.0

    def evaluate(self, sources, references, predictions):
        """Compute corpus-level scores for a batch of translations.

        Args:
            sources: List[str] of source-language segments.
            references: List[str] of reference translations.
            predictions: List[str] of system translations.

        Returns:
            A dict ``{"BLEU": float, "BERTScore": float, "BERTurk": float,
            "COMET": float}``. Every value is 0.0 when the input is empty.

        Raises:
            ValueError: If the three input lists differ in length.
        """
        if not (len(sources) == len(references) == len(predictions)):
            raise ValueError(
                "sources, references and predictions must have the same "
                f"length (got {len(sources)}, {len(references)}, "
                f"{len(predictions)})"
            )

        # Nothing to score: skip the metric calls, which are not guaranteed to
        # handle an empty corpus, and return the same 0.0 fallback the
        # individual metrics use below.
        if not predictions:
            return {"BLEU": 0.0, "BERTScore": 0.0, "BERTurk": 0.0, "COMET": 0.0}

        results = {}

        # BLEU: each prediction is paired with a single-element reference list.
        results["BLEU"] = self.bleu.compute(
            predictions=predictions,
            references=[[r] for r in references],
        )["bleu"]

        # BERTScore (general, lang="xx")
        bs = self.bertscore.compute(
            predictions=predictions,
            references=references,
            lang="xx",
        )
        results["BERTScore"] = self._mean(bs["f1"])

        # BERTurk (lang="tr")
        bs_tr = self.bertscore.compute(
            predictions=predictions,
            references=references,
            lang="tr",
        )
        results["BERTurk"] = self._mean(bs_tr["f1"])

        # COMET: the metric's declared inputs are named sources / predictions /
        # references; other keyword names are rejected as missing inputs.
        comet_out = self.comet.compute(
            sources=sources,
            predictions=predictions,
            references=references,
        )
        # Report a corpus-level value: prefer the metric's own mean, otherwise
        # average the per-segment scores instead of taking only the first one.
        comet_score = comet_out.get("mean_score")
        if comet_score is None:
            scores = comet_out.get("scores")
            if isinstance(scores, (list, tuple)):
                comet_score = self._mean(scores)
            elif scores is not None:
                comet_score = scores
            else:
                comet_score = 0.0
        results["COMET"] = float(comet_score)

        return results