Spaces:

SUSTech
/

tlem

Running

App Files Community

facat committed on Nov 12, 2023

Commit

33a6f85

1 Parent(s): 9199665

update

Browse files

Files changed (3) hide show

tasks.py +22 -4
tlem.py +12 -37
utils.py +28 -0

tasks.py CHANGED Viewed

@@ -6,11 +6,20 @@ from typing import Any, Optional, Protocol, Iterable, Callable
 import logging
 import pandas as pd
 from functools import partial
 from .utils import *
 from evaluate import load
 from collections import defaultdict
 def fake_pipeline(prompts: Iterable[str]) -> list[str]:
@@ -100,6 +109,7 @@ class Task:
         )
         return metric
     def run(
         self,
         pipeline,
@@ -129,14 +139,20 @@ def multichoice(responses: Any, references: list[str]):
     else:
         responses = decode_choice(responses)
-    # return [
-    #     int(response == reference) for reference, response in zip(references, responses)
-    # ]
     return responses, references
 class Metrics:
-    cmmlu = multichoice
     mmlu = multichoice
     def gsm8k(responses: list[str], answers: list[str | int]):
@@ -299,6 +315,7 @@ class CMMLU:
             .to_dict()
         )
         suite = defaultdict(list)
         for k, v in cls.categories.items():
             for subject in v:
                 suite[k].extend(
@@ -429,6 +446,7 @@ class MMLU:
             .to_dict()
         )
         suite = defaultdict(list)
         for k, v in cls.categories.items():
             for subject in v:
                 suite[k].extend(

 import logging
 import pandas as pd
 from functools import partial
+from datasets.utils.logging import disable_progress_bar
 from .utils import *
 from evaluate import load
 from collections import defaultdict
+import sys
+# if sys.version_info >= (3, 9):
+#     from functools import cache
+# else:
+#     from functools import lru_cache as cache
+disable_progress_bar()
 def fake_pipeline(prompts: Iterable[str]) -> list[str]:
         )
         return metric
+    # @cache
     def run(
         self,
         pipeline,
     else:
         responses = decode_choice(responses)
+    return responses, references
def multichoice_zh(responses: Any, references: list[str]):
    """Normalise Chinese multiple-choice responses for comparison with references.

    Args:
        responses: Model outputs for a batch. When the first element is a
            ``str`` every element is run through ``extract_choice_zh``;
            otherwise the whole batch is handed to ``decode_choice``
            (presumably non-text outputs such as scores; verify against
            ``decode_choice`` in ``utils``).
        references: Gold answers; returned unchanged.

    Returns:
        A ``(responses, references)`` tuple with the responses converted.
        An empty batch yields ``([], references)``.
    """
    # Guard the empty batch: the type probe below indexes responses[0] and
    # would otherwise raise IndexError. len() is used instead of truthiness
    # so array-like inputs are not evaluated as booleans.
    if len(responses) == 0:
        return [], references
    if isinstance(responses[0], str):
        responses = [extract_choice_zh(response) for response in responses]
    else:
        responses = decode_choice(responses)
    return responses, references
 class Metrics:
+    cmmlu = multichoice_zh
     mmlu = multichoice
     def gsm8k(responses: list[str], answers: list[str | int]):
             .to_dict()
         )
         suite = defaultdict(list)
+        cls.categories["all"] = list(finer_categories.keys())
         for k, v in cls.categories.items():
             for subject in v:
                 suite[k].extend(
             .to_dict()
         )
         suite = defaultdict(list)
+        cls.categories["all"] = list(finer_categories.keys())
         for k, v in cls.categories.items():
             for subject in v:
                 suite[k].extend(

tlem.py CHANGED Viewed

@@ -16,32 +16,7 @@ import pandas as pd
 from .tasks import *
 from .utils import is_equiv
-# %%
-# %cd ../tlem
-# %load_ext ipytorch
-# %ls
-# TODO: Add BibTeX citation
-_CITATION = """\
-"""
-# TODO: Add description of the module here
-_DESCRIPTION = """\
-"""
-# TODO: Add description of the arguments of the module here
-_KWARGS_DESCRIPTION = """
-"""
-# TODO: Define external resources urls if needed
-BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
-# @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class ReasoningMetric(evaluate.Metric):
     """TODO: Short description of my evaluation module."""
@@ -59,9 +34,9 @@ class ReasoningMetric(evaluate.Metric):
         return evaluate.EvaluationModuleInfo(
             # This is the description that will appear on the modules page.
             # module_type="measurement",
-            description=_DESCRIPTION,
-            citation=_CITATION,
-            inputs_description=_KWARGS_DESCRIPTION,
             # This defines the format of each prediction and reference
             features=features,
             # Homepage of the module for documentation
@@ -106,26 +81,30 @@ class Suite(EvaluationSuite):
     def run(
         self,
         model_or_pipeline: Any,
-        name="tlem",
     ) -> dict[str, float]:
         self.assert_suite_nonempty()
         def run_tasks(tasks):
-            for task in tqdm(tasks):
                 if task.name not in self.cached_result:
                     self.cached_result[task.name] = task.run(model_or_pipeline)
             results = [self.cached_result[task.name] for task in tasks]
             return pd.DataFrame(results).mean().to_dict()
         if isinstance(self.suite, dict):
-            for category, tasks in tqdm(self.suite.items()):
-                logging.warning(f"Combined results: {category}:{run_tasks(tasks)}")
         else:
             logging.warning(f"Combined results: {run_tasks(self.suite)}")
         return self.cached_result
     def add(self, name):
         chat = False
         match name:
             case _ if "chat" in name:
@@ -146,8 +125,4 @@ class Suite(EvaluationSuite):
     def __init__(self, name="tlem"):
         super().__init__(name)
         self.cached_result = {}
-        self.suite = [
-            # TASK_REGISTRY["gsm8k"],
-            # TASK_REGISTRY["competition_math"],
-        ]

 from .tasks import *
 from .utils import is_equiv
 class ReasoningMetric(evaluate.Metric):
     """TODO: Short description of my evaluation module."""
         return evaluate.EvaluationModuleInfo(
             # This is the description that will appear on the modules page.
             # module_type="measurement",
+            description="",
+            citation="",
+            inputs_description="",
             # This defines the format of each prediction and reference
             features=features,
             # Homepage of the module for documentation
     def run(
         self,
         model_or_pipeline: Any,
     ) -> dict[str, float]:
         self.assert_suite_nonempty()
         def run_tasks(tasks):
+            for task in (bar := tqdm(tasks, leave=False)):
+                bar.desc = f"complete {task.name}."
                 if task.name not in self.cached_result:
                     self.cached_result[task.name] = task.run(model_or_pipeline)
             results = [self.cached_result[task.name] for task in tasks]
             return pd.DataFrame(results).mean().to_dict()
         if isinstance(self.suite, dict):
+            for category, tasks in (bar := tqdm(self.suite.items())):
+                bar.desc = f"complete {category}."
+                logging.warning(f"Combined results {category}: {run_tasks(tasks)}")
         else:
             logging.warning(f"Combined results: {run_tasks(self.suite)}")
         return self.cached_result
     def add(self, name):
+        self.load(name)
+    def load(self, name):
         chat = False
         match name:
             case _ if "chat" in name:
     def __init__(self, name="tlem"):
         super().__init__(name)
         self.cached_result = {}
+        self.suite = []

utils.py CHANGED Viewed

@@ -9,6 +9,34 @@ NUMERIC_IN_ZH = (
 )
 def extract_choice(gen):
     # answer is A | choice is A | choose A
     res = re.search(

 )
# Patterns for pulling an option letter out of a Chinese answer, ordered from
# most to least specific. Each one captures the letter in group 1.
_CHOICE_ZH_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        # "the answer is A" | "the option is A" | "should choose option A".
        # The negative lookahead rejects a negation word between the cue
        # ("answer"/"option") and its verb.
        r"(?:(?:选|选择|选定)[：:]?\s*|(?:(?:答案|选项)(?![^ABCD]{0,10}?(?:不|非)[^ABCD]{0,10}?(?:是|选|为|：|:|】))[^ABCD]{0,10}?(?:是|选|为|：|:|】))[^ABCD]{0,10}?)(A|B|C|D)(?:选项)?(?:\)|。|\.|，|,|．|、|A|B|C|D|$|：|:|\)|）)",
        # "option A is correct" | "option A fits the question", again
        # skipping letters that are followed by a negated verdict.
        r"(A|B|C|D)(?:选?项)?(?![^ABCD]{0,4}?(?:不|非)[^ABCD]{0,4}?(?:正确|对[的，。：]|符合))[^ABCD]{0,4}?(?:正确|对[的，。：]|符合)",
        # The response starts with the bare letter, optionally parenthesised.
        r"^[\(（]?(A|B|C|D)(?:。|\)|）|\.|，|,|．|：|:|$)",
        # Last resort: the first A-D that is not part of a longer Latin word
        # and is not immediately followed by "=".
        r"(?<![a-zA-Z])(A|B|C|D)(?![a-zA-Z=])",
    )
)


def extract_choice_zh(gen):
    """Extract the chosen option letter from a Chinese-language response.

    The patterns in ``_CHOICE_ZH_PATTERNS`` are tried in order and the first
    one that matches decides the result.

    Args:
        gen: The generated response text.

    Returns:
        One of ``"A"``, ``"B"``, ``"C"`` or ``"D"``. When no pattern matches
        the function falls back to ``"A"``.
    """
    for pattern in _CHOICE_ZH_PATTERNS:
        found = pattern.search(gen)
        if found is not None:
            return found.group(1)
    # No recognisable letter anywhere: default to "A" rather than failing.
    return "A"
 def extract_choice(gen):
     # answer is A | choice is A | choose A
     res = re.search(