Spaces:
Running
Running
| # %% | |
| try: | |
| from ipytorch import logging | |
| except Exception as e: | |
| import logging | |
| from typing import Any, Optional, Protocol, Iterable, Callable | |
| from numpy.lib import extract | |
| from tqdm.auto import tqdm | |
| from evaluate.evaluation_suite import EvaluationSuite | |
| import evaluate | |
| import numpy as np | |
| import datasets | |
| import pandas as pd | |
| from .tasks import * | |
| from .utils import is_equiv | |
| # %% | |
| # %cd ../tlem | |
| # %load_ext ipytorch | |
| # %ls | |
| # TODO: Add BibTeX citation | |
| _CITATION = """\ | |
| """ | |
| # TODO: Add description of the module here | |
| _DESCRIPTION = """\ | |
| """ | |
| # TODO: Add description of the arguments of the module here | |
| _KWARGS_DESCRIPTION = """ | |
| """ | |
| # TODO: Define external resources urls if needed | |
| BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt" | |
| # @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) | |
class ReasoningMetric(evaluate.Metric):
    """Exact-match accuracy over answers extracted from responses and references.

    The extraction routine is the attribute of ``Metrics`` named by
    ``self.config_name``; it is called with the raw responses and references
    and must return the pair ``(extracted_responses, extracted_references)``.
    """

    def _info(self):
        """Return the module description and the expected input schema.

        Both ``responses`` and ``references`` are declared as string features.
        The homepage / codebase / reference URLs are still template
        placeholders (TODO: fill in real values).
        """
        return evaluate.EvaluationModuleInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # Format of each prediction and reference.
            features=datasets.Features(
                {
                    "responses": datasets.Value("string"),
                    "references": datasets.Value("string"),
                }
            ),
            # Homepage of the module for documentation.
            homepage="http://module.homepage",
            # Additional links to the codebase or references.
            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
            reference_urls=["http://path.to.reference.url/new_module"],
        )

    def _compute(self, responses, references, verbose=False):
        """Score ``responses`` against ``references``.

        Args:
            responses: Raw model outputs.
            references: Raw gold answers, aligned with ``responses``.
            verbose: When true, the per-example table is attached to the
                result under the ``"df"`` key.

        Returns:
            A dict with ``"Accuracy"`` (mean of the element-wise equality
            between extracted references and extracted responses) and,
            if ``verbose``, ``"df"`` holding the raw and extracted columns.
        """
        extractor = getattr(Metrics, self.config_name)
        extract_responses, extract_references = extractor(responses, references)

        frame = pd.DataFrame({"responses": responses, "references": references})
        frame["extract_responses"] = extract_responses
        frame["extract_references"] = extract_references

        is_correct = frame["extract_references"] == frame["extract_responses"]
        results = {"Accuracy": is_correct.astype(int).mean()}
        logging.info(results)

        if verbose:
            results["df"] = frame
        return results
# Task definition used by Suite.add("gsm8k"): inputs are read from the
# "question" column, labels from the "answer" column, and scoring uses the
# metric identified by ("sustech/tlem", "gsm8k").
gsm8k = Task(
    input_column="question",
    label_column="answer",
    dataset_name=("gsm8k", "main"),
    metric_name=("sustech/tlem", "gsm8k"),
)
class Suite(EvaluationSuite):
    """Evaluation suite that runs a set of tasks and caches every task result.

    ``self.suite`` is either a flat list of tasks or a dict mapping a category
    name to a list of tasks. ``self.cached_result`` maps ``task.name`` to the
    value returned by ``task.run``, so a given task name is evaluated at most
    once per ``Suite`` instance.
    """

    def __init__(self, name="tlem"):
        """Create an empty suite; use :meth:`add` to select the tasks."""
        super().__init__(name)
        self.cached_result = {}
        self.suite = []

    def add(self, name):
        """Select the tasks to evaluate from a benchmark name.

        Args:
            name: Benchmark selector. It must start with ``"mmlu"`` or
                ``"cmmlu"``, or be exactly ``"gsm8k"``. If it contains
                ``"chat"``, the MMLU/CMMLU suite is built with ``chat=True``.
                If it contains ``"test"``, only the ``"Test"`` entry of the
                selected suite is kept.

        Raises:
            ValueError: If ``name`` matches none of the known benchmarks.
                (Previously this surfaced as an ``UnboundLocalError``.)
        """
        chat = "chat" in name

        if name.startswith("mmlu"):
            suite = MMLU.suite(chat=chat)
        elif name.startswith("cmmlu"):
            suite = CMMLU.suite(chat=chat)
        elif name == "gsm8k":
            suite = [gsm8k]
        else:
            raise ValueError(
                f"Unknown suite name {name!r}: expected a name starting with "
                "'mmlu' or 'cmmlu', or exactly 'gsm8k'."
            )

        if "test" in name:
            # Narrow a category -> tasks mapping down to its "Test" entry.
            suite = suite["Test"]

        self.suite = suite

    def run(
        self,
        model_or_pipeline: Any,
        name="tlem",
    ) -> dict[str, Any]:
        """Run every task in the suite and return the per-task result cache.

        Args:
            model_or_pipeline: Object handed unchanged to each ``task.run``.
            name: Unused; kept so existing callers keep working.

        Returns:
            ``self.cached_result``: a dict mapping each task name to the
            result returned by ``task.run``. The averaged ("combined")
            results are only logged, not returned.
        """
        self.assert_suite_nonempty()

        def run_tasks(tasks):
            """Run uncached tasks, then return the column-wise mean of their results."""
            for task in tqdm(tasks):
                if task.name not in self.cached_result:
                    self.cached_result[task.name] = task.run(model_or_pipeline)
            results = [self.cached_result[task.name] for task in tasks]
            return pd.DataFrame(results).mean().to_dict()

        if isinstance(self.suite, dict):
            # One combined summary per category.
            for category, tasks in tqdm(self.suite.items()):
                combined = run_tasks(tasks)
                logging.warning(f"Combined results: {category}:{combined}")
        else:
            combined = run_tasks(self.suite)
            logging.warning(f"Combined results: {combined}")

        return self.cached_result