import json

import jieba
import numpy as np
from evaluate import load  # NOTE(review): unused in this script — confirm before removing
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from rouge_chinese import Rouge

# n-gram weights for BLEU-1 through BLEU-4 (loop-invariant, so module-level).
_BLEU_WEIGHTS = [
    [1.0, 0.0, 0.0, 0.0],
    [0.5, 0.5, 0.0, 0.0],
    [1.0 / 3] * 3 + [0.0],
    [0.25] * 4,
]

_ROUGE_KEYS = ("rouge-1", "rouge-2", "rouge-l")


def compute_metrics(eval_preds):
    """Compute corpus-averaged ROUGE-1/2/L (F1) and BLEU-1..4 for Chinese text.

    Args:
        eval_preds: pair ``(preds, labels)`` of equal-length sequences of
            strings; ``preds`` are model outputs, ``labels`` are references.

    Returns:
        dict mapping metric name ("rouge-1", "rouge-2", "rouge-l",
        "bleu-1".."bleu-4") to the mean score over all pairs, scaled to 0-100.
        ROUGE uses jieba word segmentation; BLEU uses character tokens with
        NLTK smoothing method 3.
    """
    preds, labels = eval_preds
    score_dict = {k: [] for k in _ROUGE_KEYS}
    score_dict.update({f"bleu-{i}": [] for i in range(1, 5)})

    # Hoisted out of the loop: both are stateless and loop-invariant.
    rouge = Rouge()
    smoother = SmoothingFunction().method3

    for pred, label in zip(preds, labels):
        hypothesis = list(jieba.cut(pred))
        reference = list(jieba.cut(label))

        # rouge_chinese raises ValueError on an empty/whitespace hypothesis
        # or reference; score such pairs as 0 instead of crashing the run.
        if not "".join(hypothesis).strip() or not "".join(reference).strip():
            for k in _ROUGE_KEYS:
                score_dict[k].append(0.0)
        else:
            result = rouge.get_scores(" ".join(hypothesis), " ".join(reference))[0]
            for k, v in result.items():
                score_dict[k].append(round(v["f"] * 100, 4))

        # BLEU over character tokens (not jieba tokens), as in the original.
        for i, weights in enumerate(_BLEU_WEIGHTS, start=1):
            bleu_score = sentence_bleu(
                [list(label)],
                list(pred),
                weights=weights,
                smoothing_function=smoother,
            )
            score_dict[f"bleu-{i}"].append(round(bleu_score * 100, 4))

    # Guard against empty input: np.mean([]) would emit a warning and NaN.
    return {k: float(np.mean(v)) if v else 0.0 for k, v in score_dict.items()}


def main():
    """Load a JSONL result file and print averaged metrics."""
    # model_name = "chatglm"
    model_name = "chatgpt"
    fname = f"./{model_name}/result.json"

    references, predictions = [], []
    # Explicit UTF-8: the file contains Chinese text and must not depend on
    # the platform-default codec.
    with open(fname, "r", encoding="utf-8") as fr:
        for line in fr:
            line = line.strip()
            if not line:  # tolerate trailing blank lines
                continue
            sample = json.loads(line)
            references.append(sample["response"])
            predictions.append(sample["prediction"])

    result = compute_metrics([predictions, references])
    print(result)


if __name__ == "__main__":
    main()