| import json |
| import jieba |
| from rouge_chinese import Rouge |
| import numpy as np |
| from evaluate import load |
| from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction |
|
|
def compute_metrics(eval_preds):
    """Compute ROUGE-1/2/L and BLEU-1..4 for Chinese prediction/reference pairs.

    Args:
        eval_preds: a pair ``(preds, labels)`` of equal-length sequences of
            prediction strings and reference strings.

    Returns:
        dict mapping metric name ("rouge-1", "rouge-2", "rouge-l",
        "bleu-1".."bleu-4") to the mean score as a percentage (float),
        averaged over all pairs.
    """
    preds, labels = eval_preds

    metric_names = ["rouge-1", "rouge-2", "rouge-l",
                    "bleu-1", "bleu-2", "bleu-3", "bleu-4"]
    score_dict = {name: [] for name in metric_names}

    # Hoist loop invariants: the ROUGE scorer, the BLEU smoothing function,
    # and the uniform n-gram weight vectors for BLEU-1..4.
    rouge = Rouge()
    smoother = SmoothingFunction().method3
    bleu_weights = [
        (1.0, 0.0, 0.0, 0.0),
        (0.5, 0.5, 0.0, 0.0),
        (1.0 / 3, 1.0 / 3, 1.0 / 3, 0.0),
        (0.25, 0.25, 0.25, 0.25),
    ]

    for pred, label in zip(preds, labels):
        # Word-segment with jieba; rouge_chinese expects space-joined tokens.
        hypothesis = " ".join(jieba.cut(pred))
        reference = " ".join(jieba.cut(label))

        # rouge_chinese raises ValueError when hypothesis or reference is
        # empty; score such pairs as 0 instead of aborting the whole run.
        if not hypothesis.split() or not reference.split():
            for key in ("rouge-1", "rouge-2", "rouge-l"):
                score_dict[key].append(0.0)
        else:
            result = rouge.get_scores(hypothesis, reference)[0]
            for key, value in result.items():
                score_dict[key].append(round(value["f"] * 100, 4))

        # BLEU is computed character-level (list(str)), not on jieba tokens,
        # preserving the original behaviour.
        for order, weights in enumerate(bleu_weights, start=1):
            bleu_score = sentence_bleu(
                [list(label)],
                list(pred),
                weights=weights,
                smoothing_function=smoother,
            )
            score_dict[f"bleu-{order}"].append(round(bleu_score * 100, 4))

    # Average each metric over all examples.
    return {key: float(np.mean(values)) for key, values in score_dict.items()}
|
|
| |
# Path to a newline-delimited JSON file of model outputs; each line is an
# object with a "response" (reference) and a "prediction" field.
model_name = "chatgpt"
fname = f"./{model_name}/result.json"

references, predictions = [], []
# Explicit UTF-8: the file holds Chinese text and the platform default
# encoding may differ (e.g. cp936/cp1252 on Windows).
with open(fname, "r", encoding="utf-8") as fr:
    for line in fr:  # iterate lazily instead of fr.readlines()
        line = line.strip()
        if not line:  # tolerate blank/trailing empty lines
            continue
        sample = json.loads(line)
        references.append(sample["response"])
        predictions.append(sample["prediction"])

result = compute_metrics([predictions, references])
print(result)