Spaces:

SUSTech
/

tlem

Running

App Files Files Community

facat committed on Oct 29, 2023

Commit

be1543a

1 Parent(s): 044ed98

add mmlu and cmmlu

Browse files

Files changed (3) hide show

tasks.py +364 -49
tlem.py +48 -48
utils.py +37 -9

tasks.py CHANGED Viewed

@@ -3,19 +3,14 @@ from datasets import load_dataset, Dataset
 from functools import cached_property
 from tqdm.auto import tqdm
 from typing import Any, Optional, Protocol, Iterable, Callable
-from .utils import (
-    NUMERIC_IN_ZH,
-    extract_choice_ans,
-    extract_numeric,
-    get_answer,
-    is_equiv,
-)
 from evaluate import load
-TextGenerationPipeline = Callable[[Iterable[str]], list[str]]
 def fake_pipeline(prompts: Iterable[str]) -> list[str]:
     return [prompt for prompt in tqdm(prompts)]
@@ -30,14 +25,25 @@ class Task:
     input_column: str = "question"
     label_column: str = "answer"
     prompt: Optional[Callable | str] = None
-    @cached_property
-    def name(self):
-        return (
-            self.dataset_name
             if isinstance(self.dataset_name, str)
-            else self.dataset_name[0]
-        ) + f"-{self.split}"
     @cached_property
     def samples(self):
@@ -49,20 +55,38 @@ class Task:
             *self.dataset_name
             if isinstance(self.dataset_name, tuple)
             else self.dataset_name,
-            split=self.split,
         )
         if self.prompt is not None:
-            ds = ds.map(
                 lambda example: {
-                    self.input_column: self.prompt.format(
-                        input_column=example[self.input_column]
-                    )
                 }
-                if isinstance(self.prompt, str)
-                else self.prompt(example),
             )
-        return ds
     @cached_property
     def metric(self):
@@ -73,14 +97,44 @@ class Task:
         )
         return metric
-    def run(self, pipeline: TextGenerationPipeline = fake_pipeline):
-        outputs = pipeline(self.samples)
-        return self.metric.compute(
-            responses=outputs, references=self.dataset[self.label_column]
-        )
 class Metrics:
     def gsm8k(responses: list[str], answers: list[str | int]):
         scores = []
         for response, answer in zip(responses, answers):
@@ -112,26 +166,287 @@ class Metrics:
             scores.append(1.0 * (pred == gold))
         return scores
-    def gsm8k_zh(responses: list[str], answers: list[str]):
-        scores = []
-        for response, answer in zip(responses, answers):
-            pred = extract_numeric(response, pattern=NUMERIC_IN_ZH)
-            gold = extract_numeric(answer)
-            scores.append(1.0 * (pred == gold))
-        return scores
-    def svamp(responses: list[float], answers: list[str]):
-        scores = []
-        for response, answer in zip(responses, answers):
-            pred = extract_numeric(response, pattern=NUMERIC_IN_ZH)
-            gold = answer
-            scores.append(1.0 * (float(pred) == gold))
-        return scores
-    def mmlu(responses, answers):
-        scores = []
-        for response, answer in zip(responses, answers):
-            pred = extract_choice_ans(response)
-            gold = answer.lower()
-            scores.append(1.0 * (pred == gold))
-        return scores

 from functools import cached_property
 from tqdm.auto import tqdm
 from typing import Any, Optional, Protocol, Iterable, Callable
+import logging
+import pandas as pd
+from functools import partial
+from .utils import *
 from evaluate import load
 def fake_pipeline(prompts: Iterable[str]) -> list[str]:
+    """Stand-in pipeline: return every prompt unchanged, iterating through tqdm."""
     return [prompt for prompt in tqdm(prompts)]
     input_column: str = "question"
     label_column: str = "answer"
     prompt: Optional[Callable | str] = None
+    few_shot: int = 0
+    few_shot_from: Optional[str] = None
+    # results: dict[str, Any] = field(default_factory=dict)
+    def __post_init__(self):
+        """Derive ``self.name`` and turn a string ``prompt`` into a callable.
+
+        The name is the dataset name part(s) joined with ``-``, with everything
+        up to the last ``/`` stripped from the first part, followed by
+        ``-<split>``. A string ``prompt`` is treated as a ``str.format``
+        template with an ``input_column`` placeholder and is replaced by a
+        function mapping an example to ``{input_column: formatted_text}``.
+        """
+        names = (
+            [self.dataset_name]
             if isinstance(self.dataset_name, str)
+            else list(self.dataset_name)
+        )
+        names[0] = names[0].split("/")[-1]
+        self.name = "-".join(names) + f"-{self.split}"
+        if isinstance(self.prompt, str):
+            # Keep the template in a local: the lambda below is stored back on
+            # ``self.prompt``, so reading ``self.prompt`` inside it would find
+            # the lambda itself and ``.format`` would raise AttributeError.
+            template = self.prompt
+            self.prompt = lambda example: {
+                self.input_column: template.format(
+                    input_column=example[self.input_column]
+                )
+            }
     @cached_property
     def samples(self):
             *self.dataset_name
             if isinstance(self.dataset_name, tuple)
             else self.dataset_name,
+            # split=self.split,
         )
+        test_ds = ds[self.split]
         if self.prompt is not None:
+            test_ds = test_ds.map(self.prompt)
+        if self.few_shot:
+            if self.few_shot_from is None:
+                for name in ["train", "validation", "val", "dev"]:
+                    if name in ds:
+                        self.few_shot_from = name
+                        break
+            shots = ds[self.few_shot_from].select(range(self.few_shot))
+            if self.prompt is not None:
+                shots = shots.map(self.prompt)
+            shots = shots.map(
                 lambda example: {
+                    self.input_column: example[self.input_column]
+                    + example[self.label_column],
+                }
+            )[self.input_column]
+            few_shot_prompts = "\n".join(shots)
+            test_ds = test_ds.map(
+                lambda example: {
+                    self.input_column: few_shot_prompts + example[self.input_column],
                 }
             )
+        return test_ds
     @cached_property
     def metric(self):
         )
         return metric
+    def run(
+        self,
+        pipeline,
+    ):
+        """Feed ``self.samples`` to ``pipeline`` and score what it returns.
+
+        Returns ``None`` (after logging a warning) when the pipeline returns
+        ``None``. Otherwise the outputs are kept on ``self.outputs`` and the
+        metric result is returned.
+        """
+        if (outputs := pipeline(self.samples)) is None:
+            logging.warning("pipeline returns None")
+            return
+        self.outputs = outputs
+        references = self.dataset[self.label_column]
+        try:
+            # First try the metric's ``_compute`` directly.
+            result = self.metric._compute(responses=outputs, references=references)
+        except Exception:
+            # Fall back to the public ``compute`` entry point, but leave a
+            # trace of why the direct call failed instead of discarding it.
+            logging.debug(
+                "metric._compute failed; retrying with metric.compute",
+                exc_info=True,
+            )
+            result = self.metric.compute(responses=outputs, references=references)
+        return result
+def multichoice(responses: Any, references: list[str]):
+    """Score multiple-choice answers: 1 where response equals reference, else 0.
+
+    String responses are parsed with ``extract_choice``; any other kind of
+    response is handed to ``decode_choice``. Empty input yields an empty list.
+    """
+    # Nothing to score; this also keeps ``responses[0]`` below from raising.
+    if len(responses) == 0:
+        return []
+    if isinstance(responses[0], str):
+        responses = [extract_choice(response) for response in responses]
+    else:
+        responses = decode_choice(responses)
+    return [
+        int(response == reference) for reference, response in zip(references, responses)
+    ]
 class Metrics:
+    cmmlu = multichoice
+    mmlu = multichoice
     def gsm8k(responses: list[str], answers: list[str | int]):
         scores = []
         for response, answer in zip(responses, answers):
             scores.append(1.0 * (pred == gold))
         return scores
+class CMMLU:
+    def prompt_cmmlu(example, chat=False):
+        """Build the CMMLU prompt text for one example.
+
+        The prompt is a prefix (a full instruction sentence when ``chat`` is
+        true, otherwise a short question marker), the ``Question`` field, the
+        four options ``A``-``D`` on their own lines, and a final answer cue.
+        Returns ``{"prompt": text}``.
+        """
+        prefix = "以下是一道多项选择题，请从A、B、C和D中选择最合适的答案作为这个问题的答案。\n\n" if chat else "问题："
+        prompt = prefix + example["Question"]
+        for choice in list("ABCD"):
+            prompt += f"\n{choice}. {example[choice]}"
+        # The answer cue is added once, after all options, as in
+        # MMLU.prompt_mmlu; inside the loop it was repeated after every option.
+        prompt += "\n答案："
+        return {"prompt": prompt}
+    subcategories = {
+        "agronomy": ["other"],
+        "anatomy": ["biology"],
+        "ancient_chinese": ["linguistics", "china specific"],
+        "arts": ["arts"],
+        "astronomy": ["physics"],
+        "business_ethics": ["business"],
+        "chinese_civil_service_exam": ["politics", "china specific"],
+        "chinese_driving_rule": ["other", "china specific"],
+        "chinese_food_culture": ["culture", "china specific"],
+        "chinese_foreign_policy": ["politics", "china specific"],
+        "chinese_history": ["history", "china specific"],
+        "chinese_literature": ["literature", "china specific"],
+        "chinese_teacher_qualification": ["education", "china specific"],
+        "college_actuarial_science": ["math"],
+        "college_education": ["education"],
+        "college_engineering_hydrology": ["engineering"],
+        "college_law": ["law"],
+        "college_mathematics": ["math"],
+        "college_medical_statistics": ["statistics"],
+        "clinical_knowledge": ["other"],
+        "college_medicine": ["other"],
+        "computer_science": ["computer science"],
+        "computer_security": ["other"],
+        "conceptual_physics": ["physics"],
+        "construction_project_management": ["other", "china specific"],
+        "economics": ["economics"],
+        "education": ["education"],
+        "elementary_chinese": ["linguistics", "china specific"],
+        "elementary_commonsense": ["other", "china specific"],
+        "elementary_information_and_technology": ["other"],
+        "electrical_engineering": ["engineering"],
+        "elementary_mathematics": ["math"],
+        "ethnology": ["culture", "china specific"],
+        "food_science": ["other"],
+        "genetics": ["biology"],
+        "global_facts": ["global"],
+        "high_school_biology": ["biology"],
+        "high_school_chemistry": ["chemistry"],
+        "high_school_geography": ["geography"],
+        "high_school_mathematics": ["math"],
+        "high_school_physics": ["physics"],
+        "high_school_politics": ["politics", "china specific"],
+        "human_sexuality": ["other"],
+        "international_law": ["law"],
+        "journalism": ["sociology"],
+        "jurisprudence": ["law"],
+        "legal_and_moral_basis": ["other"],
+        "logical": ["philosophy"],
+        "machine_learning": ["computer science"],
+        "management": ["business"],
+        "marketing": ["business"],
+        "marxist_theory": ["philosophy"],
+        "modern_chinese": ["linguistics", "china specific"],
+        "nutrition": ["other"],
+        "philosophy": ["philosophy"],
+        "professional_accounting": ["business"],
+        "professional_law": ["law"],
+        "professional_medicine": ["other"],
+        "professional_psychology": ["psychology"],
+        "public_relations": ["politics"],
+        "security_study": ["politics"],
+        "sociology": ["culture"],
+        "sports_science": ["other"],
+        "traditional_chinese_medicine": ["other", "china specific"],
+        "virology": ["biology"],
+        "world_history": ["history"],
+        "world_religions": ["global"],
+    }
+    categories = {
+        "STEM": [
+            "physics",
+            "chemistry",
+            "biology",
+            "computer science",
+            "math",
+            "engineering",
+            "statistics",
+        ],
+        "Humanities": ["history", "philosophy", "law", "arts", "literature", "global"],
+        "Social Science": [
+            "linguistics",
+            "business",
+            "politics",
+            "culture",
+            "economics",
+            "geography",
+            "psychology",
+            "education",
+            "sociology",
+        ],
+        "Other": ["other"],
+        "China specific": ["china specific"],
+        "Test": ["computer science"],
+    }
+    finer_categories = (
+        pd.Series(subcategories)  # noqa # type: ignore
+        .explode()
+        .reset_index()
+        .set_index(0)
+        .groupby(0)
+        .agg(list)["index"]
+        .to_dict()
+    )
+    @classmethod
+    def suite(cls, chat=False):
+        """Build ``{category: [Task, ...]}`` for every entry in ``cls.categories``.
+
+        Each category gets one Task per CMMLU subset listed in
+        ``cls.finer_categories`` under any of that category's subjects.
+        ``chat`` is forwarded to ``prompt_cmmlu``.
+        """
+        suite = {}
+        for category, subjects in cls.categories.items():
+            # Collect across ALL subjects of the category; assigning inside a
+            # per-subject loop kept only the last subject's tasks.
+            suite[category] = [
+                Task(
+                    ("haonan-li/cmmlu", subcategory),
+                    metric_name=("sustech/tlem", "cmmlu"),
+                    input_column="prompt",
+                    label_column="Answer",
+                    prompt=partial(cls.prompt_cmmlu, chat=chat),
+                )
+                for subject in subjects
+                for subcategory in cls.finer_categories[subject]
+            ]
+        return suite
+class MMLU:
+    input_column = "prompt"
+    label_column = "target"
+    @classmethod
+    def prompt_mmlu(cls, example, chat=False):
+        """Build the MMLU prompt text for one example.
+
+        Layout: a header (full instruction sentence when ``chat`` is true,
+        otherwise ``"Question: "``), the ``input`` field, options ``A``-``D``
+        one per line, then the answer cue. Returns ``{"prompt": text}``.
+        """
+        if chat:
+            header = "The following is a multiple-choice question. Please choose the most suitable one among A, B, C and D as the answer to this question.\n\n"
+        else:
+            header = "Question: "
+        options = "".join(f"\n{letter}. {example[letter]}" for letter in "ABCD")
+        # NOTE(review): the cue ends with a full-width colon; confirm intended.
+        return {"prompt": header + example["input"] + options + "\nAnswer："}
+    subcategories = {
+        "abstract_algebra": ["math"],
+        "anatomy": ["health"],
+        "astronomy": ["physics"],
+        "business_ethics": ["business"],
+        "clinical_knowledge": ["health"],
+        "college_biology": ["biology"],
+        "college_chemistry": ["chemistry"],
+        "college_computer_science": ["computer science"],
+        "college_mathematics": ["math"],
+        "college_medicine": ["health"],
+        "college_physics": ["physics"],
+        "computer_security": ["computer science"],
+        "conceptual_physics": ["physics"],
+        "econometrics": ["economics"],
+        "electrical_engineering": ["engineering"],
+        "elementary_mathematics": ["math"],
+        "formal_logic": ["philosophy"],
+        "global_facts": ["other"],
+        "high_school_biology": ["biology"],
+        "high_school_chemistry": ["chemistry"],
+        "high_school_computer_science": ["computer science"],
+        "high_school_european_history": ["history"],
+        "high_school_geography": ["geography"],
+        "high_school_government_and_politics": ["politics"],
+        "high_school_macroeconomics": ["economics"],
+        "high_school_mathematics": ["math"],
+        "high_school_microeconomics": ["economics"],
+        "high_school_physics": ["physics"],
+        "high_school_psychology": ["psychology"],
+        "high_school_statistics": ["math"],
+        "high_school_us_history": ["history"],
+        "high_school_world_history": ["history"],
+        "human_aging": ["health"],
+        "human_sexuality": ["culture"],
+        "international_law": ["law"],
+        "jurisprudence": ["law"],
+        "logical_fallacies": ["philosophy"],
+        "machine_learning": ["computer science"],
+        "management": ["business"],
+        "marketing": ["business"],
+        "medical_genetics": ["health"],
+        "miscellaneous": ["other"],
+        "moral_disputes": ["philosophy"],
+        "moral_scenarios": ["philosophy"],
+        "nutrition": ["health"],
+        "philosophy": ["philosophy"],
+        "prehistory": ["history"],
+        "professional_accounting": ["other"],
+        "professional_law": ["law"],
+        "professional_medicine": ["health"],
+        "professional_psychology": ["psychology"],
+        "public_relations": ["politics"],
+        "security_studies": ["politics"],
+        "sociology": ["culture"],
+        "us_foreign_policy": ["politics"],
+        "virology": ["health"],
+        "world_religions": ["philosophy"],
+    }
+    categories = {
+        "Math": [
+            "math",
+        ],
+        "STEM": [
+            "physics",
+            "chemistry",
+            "biology",
+            "computer science",
+            "math",
+            "engineering",
+        ],
+        "humanities": ["history", "philosophy", "law"],
+        "social sciences": [
+            "politics",
+            "culture",
+            "economics",
+            "geography",
+            "psychology",
+        ],
+        "Other": ["other", "business", "health"],
+        "All": [
+            "physics",
+            "chemistry",
+            "biology",
+            "computer science",
+            "math",
+            "engineering",
+            "history",
+            "philosophy",
+            "law",
+            "politics",
+            "culture",
+            "economics",
+            "geography",
+            "psychology",
+            "other",
+            "business",
+            "health",
+        ],
+        "Test": ["culture"],
+    }
+    @classmethod
+    def suite(cls, chat=False):
+        """Build ``{category: [Task, ...]}`` for every entry in ``cls.categories``.
+
+        Each category gets one Task per MMLU subset tagged (in
+        ``cls.subcategories``) with any of that category's subjects. In
+        non-chat mode the tasks use 5 few-shot examples drawn from the
+        ``validation`` split; in chat mode they use none.
+        """
+        # Invert ``subcategories``: subject tag -> list of subset names.
+        finer_categories = (
+            pd.Series(cls.subcategories)  # noqa # type: ignore
+            .explode()
+            .reset_index()
+            .set_index(0)
+            .groupby(0)
+            .agg(list)["index"]
+            .to_dict()
+        )
+        suite = {}
+        for category, subjects in cls.categories.items():
+            # Collect across ALL subjects of the category; assigning inside a
+            # per-subject loop kept only the last subject's tasks.
+            suite[category] = [
+                Task(
+                    ("lukaemon/mmlu", subcategory),
+                    metric_name=("sustech/tlem", "mmlu"),
+                    input_column=cls.input_column,
+                    label_column=cls.label_column,
+                    prompt=partial(cls.prompt_mmlu, chat=chat),
+                    few_shot=0 if chat else 5,
+                    few_shot_from="validation",
+                )
+                for subject in subjects
+                for subcategory in finer_categories[subject]
+            ]
+        return suite

tlem.py CHANGED Viewed

@@ -11,7 +11,8 @@ from evaluate.evaluation_suite import EvaluationSuite
 import evaluate
 import numpy as np
 import datasets
-from .tasks import Task, Metrics
 from .utils import is_equiv
 # %%
@@ -24,56 +25,35 @@ from .utils import is_equiv
 # TODO: Add BibTeX citation
 _CITATION = """\
-@InProceedings{huggingface:module,
-title = {A great new module},
-authors={huggingface, Inc.},
-year={2020}
-}
 """
 # TODO: Add description of the module here
 _DESCRIPTION = """\
-A simple measurement that returns the number of elements in dataset.
 """
 # TODO: Add description of the arguments of the module here
 _KWARGS_DESCRIPTION = """
-Calculates number of elements in dataset
-Args:
-    data: list of elements.
-Returns:
-    element_count: number of elements in dataset,
-Examples:
-    >>> measure = evaluate.load("lvwerra/element_count")
-    >>> measure.compute(["a", "b", "c")
-    {"element_count": 3}
 """
 # TODO: Define external resources urls if needed
 BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
-@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class ReasoningMetric(evaluate.Metric):
     """TODO: Short description of my evaluation module."""
     def _info(self):
         features = datasets.Features(
             {
                 "responses": datasets.Value("string"),
                 "references": datasets.Value("string"),
             }
         )
-        if self.config_name == "svamp":
-            features = datasets.Features(
-                {
-                    "responses": datasets.Value("string"),
-                    "references": datasets.Value("float"),
-                }
-            )
         # TODO: Specifies the evaluate.EvaluationModuleInfo object
         return evaluate.EvaluationModuleInfo(
             # This is the description that will appear on the modules page.
@@ -90,38 +70,59 @@ class ReasoningMetric(evaluate.Metric):
             reference_urls=["http://path.to.reference.url/new_module"],
         )
-    def _compute(self, responses, references, verbose=False):
-        results = {}
         scores = getattr(Metrics, self.config_name)(responses, references)
-        acc = np.asarray(scores).mean()
-        results = {
-            "accuracy": acc,
-            "scores": scores,
-        }
-        if verbose:
-            results["references"] = references
-            results["answers"] = responses
-            # results["scores"] = scores
         return results
 class Suite(EvaluationSuite):
     def run(
-        self, model_or_pipeline: Any, prompt: str = "{instruction}"
     ) -> dict[str, float]:
         self.assert_suite_nonempty()
-        results_all = {}
-        for task in tqdm(self.suite, desc="Running tasks"):
-            task_name = task.name
-            results = task.run(model_or_pipeline)
-            results_all[task_name] = results
-        return results_all
-    def __init__(self, name):
         super().__init__(name)
         self.suite = [
             Task(
@@ -136,4 +137,3 @@ class Suite(EvaluationSuite):
 # %%

 import evaluate
 import numpy as np
 import datasets
+import pandas as pd
+from .tasks import *
 from .utils import is_equiv
 # %%
 # TODO: Add BibTeX citation
 _CITATION = """\
 """
 # TODO: Add description of the module here
 _DESCRIPTION = """\
 """
 # TODO: Add description of the arguments of the module here
 _KWARGS_DESCRIPTION = """
 """
 # TODO: Define external resources urls if needed
 BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
+# @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class ReasoningMetric(evaluate.Metric):
     """TODO: Short description of my evaluation module."""
     def _info(self):
+        # if self.config_name in ["cmmlu"]:
         features = datasets.Features(
             {
                 "responses": datasets.Value("string"),
+                # "responses": datasets.Sequence(datasets.Value("float")),
                 "references": datasets.Value("string"),
             }
         )
         # TODO: Specifies the evaluate.EvaluationModuleInfo object
         return evaluate.EvaluationModuleInfo(
             # This is the description that will appear on the modules page.
             reference_urls=["http://path.to.reference.url/new_module"],
         )
+    def _compute(self, responses, references):
+        """Score ``responses`` against ``references`` and return the accuracy.
+
+        The scoring function is the attribute of ``Metrics`` named by
+        ``self.config_name``. Returns ``{"Accuracy": value}`` where the value
+        is the NaN-ignoring mean of the scores; the dict is also logged at
+        info level.
+        """
         scores = getattr(Metrics, self.config_name)(responses, references)
+        results = {"Accuracy": np.nanmean(scores)}
+        logging.info(results)
         return results
 class Suite(EvaluationSuite):
     def run(
+        self,
+        model_or_pipeline: Any,
+        name="tlem",
     ) -> dict[str, float]:
+        """Run every task in the suite and return the per-task result cache.
+
+        Each task is run at most once: its result is stored in
+        ``self.cached_result`` under ``task.name`` and reused afterwards.
+        Combined (averaged) results are only logged, at warning level; the
+        return value is ``self.cached_result`` itself. ``name`` is not used
+        in this method.
+        """
         self.assert_suite_nonempty()
+        # Run the tasks not cached yet, then average all their results column-wise.
+        def run_tasks(tasks):
+            for task in tqdm(tasks):
+                if task.name not in self.cached_result:
+                    self.cached_result[task.name] = task.run(model_or_pipeline)
+            results = [self.cached_result[task.name] for task in tasks]
+            return pd.DataFrame(results).mean().to_dict()
+        # A dict suite maps category -> tasks; anything else is treated as one task list.
+        if isinstance(self.suite, dict):
+            for category, tasks in tqdm(self.suite.items()):
+                logging.warning(f"Combined results: {category}:{run_tasks(tasks)}")
+        else:
+            logging.warning(f"Combined results: {run_tasks(self.suite)}")
+        return self.cached_result
+    def add(self, name):
+        """Select the benchmark suite identified by ``name`` into ``self.suite``.
+
+        ``name`` must start with ``"mmlu"`` or ``"cmmlu"``. If it contains
+        ``"chat"`` the suite is built with ``chat=True``; if it contains
+        ``"test"`` only the suite's ``"Test"`` category is kept.
+
+        Raises:
+            ValueError: if ``name`` starts with neither ``"mmlu"`` nor
+                ``"cmmlu"`` (previously this surfaced as an
+                UnboundLocalError on ``suite``).
+        """
+        chat = "chat" in name
+        if name.startswith("mmlu"):
+            suite = MMLU.suite(chat=chat)
+        elif name.startswith("cmmlu"):
+            suite = CMMLU.suite(chat=chat)
+        else:
+            raise ValueError(f"unknown suite name: {name!r}")
+        if "test" in name:
+            suite = suite["Test"]
+        self.suite = suite
+    def __init__(self, name="tlem"):
         super().__init__(name)
+        self.cached_result = {}
+        match self.name:
+            case "cmmlu":
+                pass
         self.suite = [
             Task(
 # %%

utils.py CHANGED Viewed

@@ -1,5 +1,7 @@
 import logging
 import re
 NUMERIC_IN_EN = r"(?:[\s=+-/<>($:\.\*\\])(?=\S)((?:0|(?:\d{1,3}(?:,\d{3})+(?=\D|$))|(?:\d+))(?:\.\d+)?%?)(?:(?![^\s=+-/>)$:\.\*\\])|(?=, ))"
 NUMERIC_IN_ZH = (
@@ -7,17 +9,43 @@ NUMERIC_IN_ZH = (
 )
-def extract_choice_ans(text):
-    pattern1 = r"\b[ABCDabcd]\b"
-    pattern2 = r"\([ABCDabcd]\)"
-    matches1 = re.findall(pattern1, text)
-    matches2 = re.findall(pattern2, text)
-    matches = matches1 + matches2
-    def standardize(ans):
-        return ans if len(ans) == 1 else ans[1]
-    return standardize(matches[-1]).lower() if matches else "_"
 def extract_numeric(string, pattern=NUMERIC_IN_EN) -> str:

 import logging
 import re
+import numpy as np
+from typing import Any
 NUMERIC_IN_EN = r"(?:[\s=+-/<>($:\.\*\\])(?=\S)((?:0|(?:\d{1,3}(?:,\d{3})+(?=\D|$))|(?:\d+))(?:\.\d+)?%?)(?:(?![^\s=+-/>)$:\.\*\\])|(?=, ))"
 NUMERIC_IN_ZH = (
 )
+def extract_choice(gen):
+    """Extract the chosen option letter (A-D) from generated text.
+
+    The patterns below are tried in order and the first match wins; when
+    none matches, "A" is returned.
+    """
+    patterns = (
+        # answer is A | choice is A | choose A
+        r"(?:(?:[Cc]hoose)|(?:(?:[Aa]nswer|[Cc]hoice)(?![^ABCD]{0,20}?(?:n't|not))[^ABCD]{0,10}?\b(?:|is|:|be))\b)[^ABCD]{0,20}?\b(A|B|C|D)\b",
+        # A is correct | A is right
+        r"\b(A|B|C|D)\b(?![^ABCD]{0,8}?(?:n't|not)[^ABCD]{0,5}?(?:correct|right))[^ABCD]{0,10}?\b(?:correct|right)\b",
+        # straight answer: A
+        r"^(A|B|C|D)(?:\.|,|:|$)",
+        # otherwise take the first letter that appears on its own
+        r"(?<![a-zA-Z])(A|B|C|D)(?![a-zA-Z=])",
+    )
+    for pattern in patterns:
+        found = re.search(pattern, gen)
+        if found is not None:
+            return found.group(1)
+    # Nothing recognisable in the text: fall back to "A".
+    return "A"
+def decode_choice(responses: list[Any]):
+    """Turn per-choice score rows into choice letters.
+
+    ``responses`` is converted with ``np.asarray`` and read as one row per
+    item and one column per choice, so plain nested lists work as well as
+    arrays. Returns a NumPy array holding, for each row, the letter
+    ("A", "B", ...) of the column with the highest score.
+    """
+    scores = np.asarray(responses)
+    num_choices = scores.shape[1]
+    choices = np.argmax(scores, axis=1)
+    return np.array(list("ABCDEFGHIJKL"[:num_choices]))[choices]
 def extract_numeric(string, pattern=NUMERIC_IN_EN) -> str: