Spaces:
Running
Running
FIX: extraction func of C-Eval; logging metrics (#3)
Browse files
- Delete: print extraction info (868c1b24af35fcd7a5a09f382e15d6e794a7b5d7)
- FIX: extraction func of C-Eval; logging metrics (e637e0c3ca817b49543ee6615f52d7c4cc7158fa)
Co-authored-by: songzhuoyang <Cookize@users.noreply.huggingface.co>
tasks.py
CHANGED
|
@@ -149,14 +149,15 @@ class Task:
|
|
| 149 |
return
|
| 150 |
self.outputs = outputs
|
| 151 |
try:
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
except Exception as e:
|
| 156 |
-
result = self.metric.compute(
|
| 157 |
-
responses=outputs, references=self.dataset[self.label_column]
|
| 158 |
-
)
|
| 159 |
-
finally:
|
| 160 |
result = outputs
|
| 161 |
# if log:
|
| 162 |
# name = name or pipeline.__name__
|
|
@@ -188,7 +189,7 @@ class Metrics:
|
|
| 188 |
mmlu = multichoice
|
| 189 |
|
| 190 |
def ceval(responses: list[str], answers: list[str | int]):
|
| 191 |
-
responses = [
|
| 192 |
return responses, answers
|
| 193 |
|
| 194 |
def winogrande(responses: list[str], answers: list[str | int]):
|
|
@@ -892,7 +893,7 @@ class CEVAL:
|
|
| 892 |
prefix = (
|
| 893 |
f"以下是中国关于{_ch_name}考试的单项选择题,请选出其中的正确答案。\n"
|
| 894 |
if chat
|
| 895 |
-
else "
|
| 896 |
)
|
| 897 |
|
| 898 |
prompt = prefix + f'{example["question"]}'
|
|
@@ -1043,6 +1044,7 @@ class CEVAL:
|
|
| 1043 |
suite = defaultdict(list)
|
| 1044 |
cls.categories = defaultdict(list)
|
| 1045 |
for task, info in cls.ceval_subject_mapping.items():
|
|
|
|
| 1046 |
cls.categories[info[2]].append(task)
|
| 1047 |
cls.categories["all"] = list(cls.ceval_subject_mapping.keys())
|
| 1048 |
for k, v in cls.categories.items():
|
|
|
|
| 149 |
return
|
| 150 |
self.outputs = outputs
|
| 151 |
try:
|
| 152 |
+
try:
|
| 153 |
+
result = self.metric._compute(
|
| 154 |
+
responses=outputs, references=self.dataset[self.label_column]
|
| 155 |
+
)
|
| 156 |
+
except Exception as e:
|
| 157 |
+
result = self.metric.compute(
|
| 158 |
+
responses=outputs, references=self.dataset[self.label_column]
|
| 159 |
+
)
|
| 160 |
except Exception as e:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
result = outputs
|
| 162 |
# if log:
|
| 163 |
# name = name or pipeline.__name__
|
|
|
|
| 189 |
mmlu = multichoice
|
| 190 |
|
| 191 |
def ceval(responses: list[str], answers: list[str | int]):
|
| 192 |
+
responses = [extract_choice_zh(pred) for pred in responses]
|
| 193 |
return responses, answers
|
| 194 |
|
| 195 |
def winogrande(responses: list[str], answers: list[str | int]):
|
|
|
|
| 893 |
prefix = (
|
| 894 |
f"以下是中国关于{_ch_name}考试的单项选择题,请选出其中的正确答案。\n"
|
| 895 |
if chat
|
| 896 |
+
else "问题:"
|
| 897 |
)
|
| 898 |
|
| 899 |
prompt = prefix + f'{example["question"]}'
|
|
|
|
| 1044 |
suite = defaultdict(list)
|
| 1045 |
cls.categories = defaultdict(list)
|
| 1046 |
for task, info in cls.ceval_subject_mapping.items():
|
| 1047 |
+
cls.categories[info[0]].append(task)
|
| 1048 |
cls.categories[info[2]].append(task)
|
| 1049 |
cls.categories["all"] = list(cls.ceval_subject_mapping.keys())
|
| 1050 |
for k, v in cls.categories.items():
|
tlem.py
CHANGED
|
@@ -58,7 +58,7 @@ class ReasoningMetric(evaluate.Metric):
|
|
| 58 |
)
|
| 59 |
df["extract_responses"] = extract_responses
|
| 60 |
df["extract_references"] = extract_references
|
| 61 |
-
print(df)
|
| 62 |
results = {
|
| 63 |
"Accuracy": (df["extract_references"] == df["extract_responses"])
|
| 64 |
.astype(int)
|
|
@@ -139,7 +139,7 @@ class Suite(EvaluationSuite):
|
|
| 139 |
case _ if "test" in name:
|
| 140 |
suite = suite["Test"]
|
| 141 |
|
| 142 |
-
self.suite = suite
|
| 143 |
|
| 144 |
def __init__(self, name="tlem"):
|
| 145 |
super().__init__(name)
|
|
|
|
| 58 |
)
|
| 59 |
df["extract_responses"] = extract_responses
|
| 60 |
df["extract_references"] = extract_references
|
| 61 |
+
# print(df)
|
| 62 |
results = {
|
| 63 |
"Accuracy": (df["extract_references"] == df["extract_responses"])
|
| 64 |
.astype(int)
|
|
|
|
| 139 |
case _ if "test" in name:
|
| 140 |
suite = suite["Test"]
|
| 141 |
|
| 142 |
+
self.suite = [suite] if isinstance(suite, Task) else suite
|
| 143 |
|
| 144 |
def __init__(self, name="tlem"):
|
| 145 |
super().__init__(name)
|