koichi12 commited on
Commit
42bd089
·
verified ·
1 Parent(s): 77c6d8e

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes. See the raw diff for the complete change set.
Files changed (50) hide show
  1. scripts/yans/eval/lm-evaluation-harness/lm_eval/__init__.py +0 -0
  2. scripts/yans/eval/lm-evaluation-harness/lm_eval/base.py +1051 -0
  3. scripts/yans/eval/lm-evaluation-harness/lm_eval/evaluator.py +381 -0
  4. scripts/yans/eval/lm-evaluation-harness/lm_eval/jasquad/README.md +2 -0
  5. scripts/yans/eval/lm-evaluation-harness/lm_eval/jasquad/__init__.py +0 -0
  6. scripts/yans/eval/lm-evaluation-harness/lm_eval/jasquad/__pycache__/__init__.cpython-310.pyc +0 -0
  7. scripts/yans/eval/lm-evaluation-harness/lm_eval/jasquad/__pycache__/evaluate.cpython-310.pyc +0 -0
  8. scripts/yans/eval/lm-evaluation-harness/lm_eval/jasquad/__pycache__/jasquad.cpython-310.pyc +0 -0
  9. scripts/yans/eval/lm-evaluation-harness/lm_eval/jasquad/evaluate.py +121 -0
  10. scripts/yans/eval/lm-evaluation-harness/lm_eval/jasquad/jasquad.py +128 -0
  11. scripts/yans/eval/lm-evaluation-harness/lm_eval/jasquad/requirements.txt +1 -0
  12. scripts/yans/eval/lm-evaluation-harness/lm_eval/metrics.py +286 -0
  13. scripts/yans/eval/lm-evaluation-harness/lm_eval/prompts.py +33 -0
  14. scripts/yans/eval/lm-evaluation-harness/lm_eval/suites/__init__.py +56 -0
  15. scripts/yans/eval/lm-evaluation-harness/lm_eval/suites/configs/ja8.conf +33 -0
  16. scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/coqa.py +178 -0
  17. scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/hellaswag.py +77 -0
  18. scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/lambada.py +108 -0
  19. scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/lambada_multilingual.py +123 -0
  20. scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/qa4mre.py +76 -0
  21. scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/squad.py +219 -0
  22. scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/superglue.py +490 -0
  23. scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/translation.py +244 -0
  24. scripts/yans/eval/lm-evaluation-harness/lm_eval/utils.py +301 -0
  25. scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-1b/harness.jsquad-1.2.sh +3 -0
  26. scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-1b/harness.sh +3 -0
  27. scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-1b/result.json +59 -0
  28. scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-1b/result.jsquad-1.2.json +22 -0
  29. scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-1b/result.mgsm.json +0 -0
  30. scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-3b/harness.jsquad-1.2.sh +3 -0
  31. scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-3b/harness.sh +3 -0
  32. scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-3b/result.json +71 -0
  33. scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-3b/result.jsquad-1.2.json +22 -0
  34. scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-3b/result.mgsm.json +0 -0
  35. scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-7b/harness.jsquad-1.2.sh +3 -0
  36. scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-7b/harness.sh +3 -0
  37. scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-7b/result.json +71 -0
  38. scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-7b/result.jsquad-1.2.json +22 -0
  39. scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-7b/result.mgsm.json +0 -0
  40. scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-large/harness.jsquad-1.2.sh +3 -0
  41. scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-large/harness.sh +3 -0
  42. scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-large/result.json +59 -0
  43. scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-large/result.jsquad-1.2.json +22 -0
  44. scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-medium/harness.jsquad-1.2.sh +3 -0
  45. scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-medium/harness.sh +3 -0
  46. scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-medium/result.json +59 -0
  47. scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-medium/result.jsquad-1.2.json +22 -0
  48. scripts/yans/eval/lm-evaluation-harness/models/llama/llama-13b/harness.sh +3 -0
  49. scripts/yans/eval/lm-evaluation-harness/models/llama/llama-13b/result.json +48 -0
  50. scripts/yans/eval/lm-evaluation-harness/models/llama/llama-30b/harness.sh +3 -0
scripts/yans/eval/lm-evaluation-harness/lm_eval/__init__.py ADDED
File without changes
scripts/yans/eval/lm-evaluation-harness/lm_eval/base.py ADDED
@@ -0,0 +1,1051 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import abc
2
+ from collections import defaultdict
3
+ from typing import Iterable
4
+ import numpy as np
5
+ import random
6
+ import re
7
+ import os
8
+ import json
9
+ import hashlib
10
+ import datasets
11
+ from sqlitedict import SqliteDict
12
+ from tqdm import tqdm
13
+ import torch
14
+ import torch.nn.functional as F
15
+
16
+ from lm_eval.metrics import mean, weighted_perplexity, weighted_mean, bits_per_byte
17
+ from lm_eval.metrics import balanced_mean, matthews_corrcoef, macro_f1
18
+ from lm_eval import utils
19
+ from abc import abstractmethod
20
+
21
+
22
+ class LM(abc.ABC):
23
+ def __init__(self):
24
+ self.cache_hook = CacheHook(None)
25
+
26
+ @abstractmethod
27
+ def loglikelihood(self, requests):
28
+ """Compute log-likelihood of generating a continuation from a context.
29
+ Downstream tasks should attempt to use loglikelihood instead of other
30
+ LM calls whenever possible.
31
+
32
+ :param requests: list
33
+ A list of pairs (context, continuation)
34
+ context: str
35
+ Context string. Implementations of LM must be able to handle an
36
+ empty context string.
37
+ continuation: str
38
+ The continuation over which log likelihood will be calculated. If
39
+ there is a word boundary, the space should be in the continuation.
40
+ For example, context="hello" continuation=" world" is correct.
41
+ :return: list
42
+ A list of pairs (logprob, isgreedy)
43
+ logprob: float
44
+ The log probability of `continuation`
45
+ isgreedy:
46
+ Whether `continuation` would be generated by greedy sampling from `context`
47
+ """
48
+ pass
49
+
50
+ @abstractmethod
51
+ def loglikelihood_rolling(self, requests):
52
+ """Compute full log-likelihood of a string, with no truncation, for perplexity computation
53
+ - We will use the full max context length of the model.
54
+ - For inputs that exceed the max context length, we divide the tokenized string into chunks of up to
55
+ the max context length.
56
+ - IMPORTANT: Each document's loglikelihood/perplexity is computed *separately*, unlike other implementations
57
+ which may simply concatenate multiple documents together.
58
+ - IMPORTANT: We maximize the amount of context for each prediction. Specifically, for inputs that we break into
59
+ multiple chunks, the last input will still a full-sized context.
60
+ Example:
61
+ Input tokens: [ 0 1 2 3 4 5 6 7 8 9 ]
62
+ Prefix: EOT
63
+ Max context length: 4
64
+ Resulting input/prediction pairs:
65
+
66
+ INPUT: EOT 0 1 2
67
+ PRED: 0 1 2 3
68
+
69
+ INPUT: 3 4 5 6
70
+ PRED: 4 5 6 7
71
+
72
+ INPUT: 5 6 7 8
73
+ PRED: 8 9
74
+
75
+ Observe that:
76
+ 1. Each token is predicted exactly once
77
+ 2. For the last pair, we provide the full context, but only score the last two tokens
78
+
79
+ :param requests: list
80
+ A list of strings
81
+ string: str
82
+ String for which we are computing per-toke loglikelihood
83
+ :return: list
84
+ A list of pairs (logprob, isgreedy)
85
+ logprob: float
86
+ The log probability of `continuation`
87
+ isgreedy:
88
+ Whether `continuation` would be generated by greedy sampling from `context`
89
+ """
90
+ pass
91
+
92
+ # TODO: Add an optional max length
93
+ @abstractmethod
94
+ def greedy_until(self, requests):
95
+ """Generate greedily until a stopping sequence
96
+
97
+ :param requests: list
98
+ A list of pairs (context, until) or (context, until, max_num_tokens)
99
+ context: str
100
+ Context string
101
+ until: [str]
102
+ The string sequences to generate until. These string sequences
103
+ may each span across multiple tokens, or may be part of one token.
104
+ (optional) max_num_tokens: int
105
+ Indicate the max length of the generation
106
+ :return: list
107
+ A list of strings continuation
108
+ continuation: str
109
+ The generated continuation.
110
+ """
111
+ pass
112
+
113
+ @classmethod
114
+ def create_from_arg_string(cls, arg_string, additional_config=None):
115
+ additional_config = {} if additional_config is None else additional_config
116
+ args = utils.simple_parse_args_string(arg_string)
117
+ args2 = {k: v for k, v in additional_config.items() if v is not None}
118
+ return cls(**args, **args2)
119
+
120
+ def set_cache_hook(self, cache_hook):
121
+ self.cache_hook = cache_hook
122
+
123
+
124
+ class BaseLM(LM):
125
+ @property
126
+ @abstractmethod
127
+ def eot_token_id(self):
128
+ pass
129
+
130
+ @property
131
+ @abstractmethod
132
+ def max_length(self):
133
+ pass
134
+
135
+ @property
136
+ @abstractmethod
137
+ def max_gen_toks(self):
138
+ pass
139
+
140
+ @property
141
+ @abstractmethod
142
+ def batch_size(self):
143
+ pass
144
+
145
+ @property
146
+ @abstractmethod
147
+ def device(self):
148
+ pass
149
+
150
+ @abstractmethod
151
+ def tok_encode(self, string: str):
152
+ pass
153
+
154
+ @abstractmethod
155
+ def tok_decode(self, tokens: Iterable[int]):
156
+ pass
157
+
158
+ @abstractmethod
159
+ def _model_generate(self, context, max_length, eos_token_id):
160
+ pass
161
+
162
+ @abstractmethod
163
+ def _model_call(self, inps):
164
+ """
165
+ inps: a torch tensor of shape [batch, sequence]
166
+ the size of sequence may vary from call to call
167
+
168
+ returns: a torch tensor of shape [batch, sequence, vocab] with the
169
+ logits returned from the model
170
+ """
171
+ pass
172
+
173
+ # subclass must implement properties vocab_size, eot_token_id, max_gen_toks, batch_size, device, max_length.
174
+ # TODO: enforce this somehow
175
+
176
+ def loglikelihood(self, requests):
177
+ new_reqs = []
178
+ for context, continuation in requests:
179
+ if context == "":
180
+ # end of text as context
181
+ context_enc = [self.eot_token_id]
182
+ else:
183
+ context_enc = self.tok_encode(context)
184
+ if continuation == "__lasttoken__":
185
+ # take last token from context
186
+ continuation_enc = [context_enc[-1]]
187
+ context_enc = context_enc[:-1]
188
+ else:
189
+ continuation_enc = self.tok_encode(continuation)
190
+
191
+ new_reqs.append(((context, continuation), context_enc, continuation_enc))
192
+
193
+ return self._loglikelihood_tokens(new_reqs)
194
+
195
+ def loglikelihood_rolling(self, requests):
196
+ # TODO: Implement caching once we've confirmed the perplexity implementation
197
+ # TODO: automatic batch size detection for vectorization
198
+
199
+ loglikelihoods = []
200
+ for (string,) in tqdm(requests):
201
+ rolling_token_windows = list(
202
+ map(
203
+ utils.make_disjoint_window,
204
+ utils.get_rolling_token_windows(
205
+ token_list=self.tok_encode(string),
206
+ prefix_token=self.eot_token_id,
207
+ max_seq_len=self.max_length,
208
+ context_len=1,
209
+ ),
210
+ )
211
+ )
212
+
213
+ rolling_token_windows = [(None,) + x for x in rolling_token_windows]
214
+
215
+ # TODO: extract out this call so it only gets called once and also somehow figure out partial caching for
216
+ # that
217
+ string_nll = self._loglikelihood_tokens(
218
+ rolling_token_windows, disable_tqdm=True
219
+ )
220
+
221
+ # discard is_greedy
222
+ string_nll = [x[0] for x in string_nll]
223
+
224
+ string_nll = sum(string_nll)
225
+ loglikelihoods.append(string_nll)
226
+
227
+ return loglikelihoods
228
+
229
+ def _loglikelihood_tokens(self, requests, disable_tqdm=False):
230
+ # TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
231
+ res = []
232
+
233
+ def _collate(x):
234
+ # the negative sign on len(toks) sorts descending - this has a few advantages:
235
+ # - time estimates will always be over not underestimates, which is more useful for planning
236
+ # - to know the size of a batch when going through the list, you know the first one is always the batch
237
+ # padded context length. this is useful to simplify the batching logic and more importantly to make
238
+ # automatic adaptive batches much much easier to implement
239
+ # - any OOMs will happen right away rather than near the end
240
+
241
+ toks = x[1] + x[2]
242
+ return -len(toks), tuple(toks)
243
+
244
+ # TODO: automatic (variable) batch size detection for vectorization
245
+ re_ord = utils.Reorderer(requests, _collate)
246
+ for chunk in utils.chunks(
247
+ tqdm(re_ord.get_reordered(), disable=disable_tqdm), self.batch_size
248
+ ):
249
+ inps = []
250
+ cont_toks_list = []
251
+ inplens = []
252
+
253
+ padding_length = None
254
+
255
+ # because vectorizing is annoying, we first convert each (context, continuation) pair to padded
256
+ # tensors, then we pack them together into a batch, call the model, and then pick it all apart
257
+ # again because vectorizing is annoying
258
+
259
+ for _, context_enc, continuation_enc in chunk:
260
+ # sanity check
261
+ assert len(context_enc) > 0
262
+ assert len(continuation_enc) > 0
263
+ assert len(continuation_enc) <= self.max_length
264
+
265
+ # how this all works:
266
+ # CTX CONT
267
+ # inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1]
268
+ # gpt2 \ \
269
+ # logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the
270
+ # cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice
271
+
272
+ # when too long to fit in context, truncate from the left
273
+ inp = torch.tensor(
274
+ (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1],
275
+ dtype=torch.long,
276
+ ).to(self.device)
277
+ (inplen,) = inp.shape
278
+
279
+ cont = continuation_enc
280
+
281
+ # since in _collate we make sure length is descending, the longest is always the first one.
282
+ padding_length = (
283
+ padding_length if padding_length is not None else inplen
284
+ )
285
+
286
+ # pad length from seq to padding_length
287
+ inp = torch.cat(
288
+ [
289
+ inp, # [seq]
290
+ torch.zeros(padding_length - inplen, dtype=torch.long).to(
291
+ inp.device
292
+ ), # [padding_length - seq]
293
+ ],
294
+ dim=0,
295
+ )
296
+
297
+ inps.append(inp.unsqueeze(0)) # [1, padding_length]
298
+ cont_toks_list.append(cont)
299
+ inplens.append(inplen)
300
+
301
+ batched_inps = torch.cat(inps, dim=0) # [batch, padding_length
302
+ multi_logits = F.log_softmax(
303
+ self._model_call(batched_inps), dim=-1
304
+ ).cpu() # [batch, padding_length, vocab]
305
+
306
+ for (cache_key, _, _), logits, inp, inplen, cont_toks in zip(
307
+ chunk, multi_logits, inps, inplens, cont_toks_list
308
+ ):
309
+ # Slice to original seq length
310
+ contlen = len(cont_toks)
311
+ logits = logits[inplen - contlen : inplen].unsqueeze(
312
+ 0
313
+ ) # [1, seq, vocab]
314
+
315
+ # Check if per-token argmax is exactly equal to continuation
316
+ greedy_tokens = logits.argmax(dim=-1)
317
+ cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(
318
+ 0
319
+ ) # [1, seq]
320
+ max_equal = (greedy_tokens == cont_toks).all()
321
+
322
+ # Obtain log-probs at the corresponding continuation token indices
323
+ # last_token_slice = logits[:, -1, :].squeeze(0).tolist()
324
+ logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(
325
+ -1
326
+ ) # [1, seq]
327
+
328
+ # Answer: (log prob, is-exact-match)
329
+ answer = (float(logits.sum()), bool(max_equal))
330
+
331
+ # partial caching
332
+ if cache_key is not None:
333
+ self.cache_hook.add_partial("loglikelihood", cache_key, answer)
334
+
335
+ res.append(answer)
336
+
337
+ return re_ord.get_original(res)
338
+
339
+ def greedy_until(self, requests):
340
+ # TODO: implement fully general `until` that handles until that are
341
+ # multiple tokens or that span multiple tokens correctly
342
+
343
+ # TODO: extract to TokenizedLM?
344
+ res = []
345
+
346
+ def _collate(x):
347
+ toks = self.tok_encode(x[0])
348
+ return len(toks), x[0]
349
+
350
+ re_ord = utils.Reorderer(requests, _collate)
351
+ for req in tqdm(re_ord.get_reordered()):
352
+ if len(req) == 2:
353
+ context, until = req
354
+ max_gen_toks = self.max_gen_toks
355
+ elif len(req) == 3:
356
+ context, until, max_num_tokens = req
357
+ max_gen_toks = max_num_tokens
358
+ else:
359
+ raise NotImplementedError
360
+ if isinstance(until, str):
361
+ until = [until]
362
+ # (primary_until,) = self.tok_encode(until[0])
363
+ primary_until = self.tok_encode(until[0])
364
+ if len(primary_until) == 0:
365
+ primary_until = self.tokenizer.eos_token_id
366
+ else:
367
+ primary_until = primary_until[-1]
368
+ context_enc = torch.tensor(
369
+ [self.tok_encode(context)[max_gen_toks - self.max_length :]]
370
+ ).to(self.device)
371
+
372
+ cont = self._model_generate(
373
+ context_enc, context_enc.shape[1] + max_gen_toks, primary_until
374
+ )
375
+
376
+ s = self.tok_decode(cont[0].tolist()[context_enc.shape[1] :])
377
+
378
+ for term in until:
379
+ s = s.split(term)[0]
380
+
381
+ # partial caching
382
+ self.cache_hook.add_partial("greedy_until", (context, until), s)
383
+
384
+ res.append(s)
385
+
386
+ return re_ord.get_original(res)
387
+
388
+
389
+ class Task(abc.ABC):
390
+ """A task represents an entire benchmark including its dataset, problems,
391
+ answers, and evaluation methods. See BoolQ for a simple example implementation
392
+
393
+ A `doc` can be any python object which represents one instance of evaluation.
394
+ This is usually a dictionary e.g.
395
+ {"question": ..., "answer": ...} or
396
+ {"question": ..., question, answer)
397
+ """
398
+
399
+ # The name of the `Task` benchmark as denoted in the HuggingFace datasets Hub
400
+ # or a path to a custom `datasets` loading script.
401
+ DATASET_PATH: str = None
402
+
403
+ # The name of a subset within `DATASET_PATH`.
404
+ DATASET_NAME: str = None
405
+ # Load tokenizer inside Task class
406
+ LOAD_TOKENIZER: bool = False
407
+
408
+ def __init__(self, data_dir=None, cache_dir=None, download_mode=None):
409
+ """
410
+ :param data_dir: str
411
+ Stores the path to a local folder containing the `Task`'s data files.
412
+ Use this to specify the path to manually downloaded data (usually when
413
+ the dataset is not publicly accessible).
414
+ :param cache_dir: str
415
+ The directory to read/write the `Task` dataset. This follows the
416
+ HuggingFace `datasets` API with the default cache directory located at:
417
+ `~/.cache/huggingface/datasets`
418
+ NOTE: You can change the cache location globally for a given process
419
+ by setting the shell environment variable, `HF_DATASETS_CACHE`,
420
+ to another directory:
421
+ `export HF_DATASETS_CACHE="/path/to/another/directory"`
422
+ :param download_mode: datasets.DownloadMode
423
+ How to treat pre-existing `Task` downloads and data.
424
+ - `datasets.DownloadMode.REUSE_DATASET_IF_EXISTS`
425
+ Reuse download and reuse dataset.
426
+ - `datasets.DownloadMode.REUSE_CACHE_IF_EXISTS`
427
+ Reuse download with fresh dataset.
428
+ - `datasets.DownloadMode.FORCE_REDOWNLOAD`
429
+ Fresh download and fresh dataset.
430
+ """
431
+ self.download(data_dir, cache_dir, download_mode)
432
+ self._training_docs = None
433
+ self._fewshot_docs = None
434
+ self._target_to_docs = None
435
+ self._target_to_ratio = None
436
+
437
+ def download(self, data_dir=None, cache_dir=None, download_mode=None):
438
+ """Downloads and returns the task dataset.
439
+ Override this method to download the dataset from a custom API.
440
+
441
+ :param data_dir: str
442
+ Stores the path to a local folder containing the `Task`'s data files.
443
+ Use this to specify the path to manually downloaded data (usually when
444
+ the dataset is not publicly accessible).
445
+ :param cache_dir: str
446
+ The directory to read/write the `Task` dataset. This follows the
447
+ HuggingFace `datasets` API with the default cache directory located at:
448
+ `~/.cache/huggingface/datasets`
449
+ NOTE: You can change the cache location globally for a given process
450
+ by setting the shell environment variable, `HF_DATASETS_CACHE`,
451
+ to another directory:
452
+ `export HF_DATASETS_CACHE="/path/to/another/directory"`
453
+ :param download_mode: datasets.DownloadMode
454
+ How to treat pre-existing `Task` downloads and data.
455
+ - `datasets.DownloadMode.REUSE_DATASET_IF_EXISTS`
456
+ Reuse download and reuse dataset.
457
+ - `datasets.DownloadMode.REUSE_CACHE_IF_EXISTS`
458
+ Reuse download with fresh dataset.
459
+ - `datasets.DownloadMode.FORCE_REDOWNLOAD`
460
+ Fresh download and fresh dataset.
461
+ """
462
+ self.dataset = datasets.load_dataset(
463
+ path=self.DATASET_PATH,
464
+ name=self.DATASET_NAME,
465
+ data_dir=data_dir,
466
+ cache_dir=cache_dir,
467
+ download_mode=download_mode,
468
+ )
469
+
470
+ def should_decontaminate(self):
471
+ """Whether this task supports decontamination against model training set."""
472
+ return False
473
+
474
+ @abstractmethod
475
+ def has_training_docs(self):
476
+ """Whether the task has a training set"""
477
+ pass
478
+
479
+ @abstractmethod
480
+ def has_validation_docs(self):
481
+ """Whether the task has a validation set"""
482
+ pass
483
+
484
+ @abstractmethod
485
+ def has_test_docs(self):
486
+ """Whether the task has a test set"""
487
+ pass
488
+
489
+ def training_docs(self):
490
+ """
491
+ :return: Iterable[obj]
492
+ A iterable of any object, that doc_to_text can handle
493
+ """
494
+ return []
495
+
496
+ def validation_docs(self):
497
+ """
498
+ :return: Iterable[obj]
499
+ A iterable of any object, that doc_to_text can handle
500
+ """
501
+ return []
502
+
503
+ def test_docs(self):
504
+ """
505
+ :return: Iterable[obj]
506
+ A iterable of any object, that doc_to_text can handle
507
+ """
508
+ return []
509
+
510
+ def _process_doc(self, doc):
511
+ """
512
+ Override this to process (detokenize, strip, replace, etc.) individual
513
+ documents. This can be used in a map over documents of a data split.
514
+ E.g. `map(self._process_doc, self.dataset["validation"])`
515
+
516
+ :return: dict
517
+ The processed version of the specified `doc`.
518
+ """
519
+ return doc
520
+
521
+ def fewshot_examples(self, k, rnd, stratified=False):
522
+ """Returns few shot examples from training docs"""
523
+ if self._training_docs is None:
524
+ self._training_docs = list(self.training_docs())
525
+
526
+ if stratified:
527
+ return self._stratified_fewshot_examples(self._training_docs, k, rnd)
528
+ else:
529
+ return rnd.sample(self._training_docs, k)
530
+
531
+ def _stratified_fewshot_examples(self, docs, k, rnd):
532
+ """Returns few shot examples from `docs` with stratified sampling,
533
+ using the target from `self.doc_to_target` as the stratum.
534
+
535
+ WARNING: in order to speed up computation, this method caches the following
536
+ based on `docs`:
537
+ - `self._target_to_docs`, which stores a mapping from target to docs, and
538
+ - `self._target_to_ratio`, which stores a mapping from target to the ratio of docs
539
+ Thus, `docs` MUST be constant across different method calls.
540
+ This assumption should generally hold true, since for a given task `docs`
541
+ will typically be either one of:
542
+ - `self._training_docs` if the dataset for the task has training data, or
543
+ - `self._fewshot_docs` if the dataset for the task does not have any training data
544
+ """
545
+ if self._target_to_docs is None or self._target_to_ratio is None:
546
+ self._target_to_docs = defaultdict(list)
547
+ for doc in docs:
548
+ target = self.doc_to_target(doc)
549
+ self._target_to_docs[target].append(doc)
550
+
551
+ self._target_to_ratio = {
552
+ target: len(_docs) / len(docs)
553
+ for target, _docs in self._target_to_docs.items()
554
+ }
555
+
556
+ # `k` should generally be constant across different method calls
557
+ # (as the number of few-shot is typically fixed for a given task),
558
+ # but this may not be guaranteed, so calculate the number of sample
559
+ # for each target per method call
560
+ target_to_num_samples = {
561
+ target: int(ratio * k) for target, ratio in self._target_to_ratio.items()
562
+ }
563
+ # Handle any rounding discrepancies by adjusting the counts
564
+ remaining_samples = k - sum(target_to_num_samples.values())
565
+ if remaining_samples > 0:
566
+ for _ in range(remaining_samples):
567
+ # Increment the min value
568
+ target = min(target_to_num_samples, key=target_to_num_samples.get)
569
+ target_to_num_samples[target] += 1
570
+
571
+ samples = []
572
+ for target, num_samples in target_to_num_samples.items():
573
+ samples.extend(rnd.sample(self._target_to_docs[target], num_samples))
574
+ # Randomly shuffle the samples to prevent potential biases
575
+ # that may arise from a fixed ordering of the targets
576
+ rnd.shuffle(samples)
577
+ return samples
578
+
579
+ def doc_to_decontamination_query(self, doc):
580
+ print(
581
+ "Override doc_to_decontamination_query with document specific decontamination query."
582
+ )
583
+ assert False
584
+
585
+ @abstractmethod
586
+ def doc_to_text(self, doc):
587
+ pass
588
+
589
+ @abstractmethod
590
+ def doc_to_target(self, doc):
591
+ pass
592
+
593
+ @abstractmethod
594
+ def construct_requests(self, doc, ctx):
595
+ """Uses RequestFactory to construct Requests and returns an iterable of
596
+ Requests which will be sent to the LM.
597
+
598
+ :param doc:
599
+ The document as returned from training_docs, validation_docs, or test_docs.
600
+ :param ctx: str
601
+ The context string, generated by fewshot_context. This includes the natural
602
+ language description, as well as the few shot examples, and the question
603
+ part of the document for `doc`.
604
+ """
605
+ pass
606
+
607
+ @abstractmethod
608
+ def process_results(self, doc, results):
609
+ """Take a single document and the LM results and evaluates, returning a
610
+ dict where keys are the names of submetrics and values are the values of
611
+ the metric for that one document
612
+
613
+ :param doc:
614
+ The document as returned from training_docs, validation_docs, or test_docs.
615
+ :param results:
616
+ The results of the requests created in construct_requests.
617
+ """
618
+ pass
619
+
620
+ @abstractmethod
621
+ def aggregation(self):
622
+ """
623
+ :returns: {str: [metric_score] -> float}
624
+ A dictionary where keys are the names of submetrics and values are
625
+ functions that aggregate a list of metric scores
626
+ """
627
+ pass
628
+
629
+ @abstractmethod
630
+ def higher_is_better(self):
631
+ """
632
+ :returns: {str: bool}
633
+ A dictionary where keys are the names of submetrics and values are
634
+ whether a higher value of the submetric is better
635
+ """
636
+ pass
637
+
638
+ def fewshot_description(self):
639
+ import warnings
640
+
641
+ warnings.warn(
642
+ "`fewshot_description` will be removed in futures versions. Pass "
643
+ "any custom descriptions to the `evaluate` function instead.",
644
+ DeprecationWarning,
645
+ )
646
+ return ""
647
+
648
+ @utils.positional_deprecated
649
+ def fewshot_context(
650
+ self,
651
+ doc,
652
+ num_fewshot,
653
+ provide_description=None,
654
+ rnd=None,
655
+ description=None,
656
+ stratified=False,
657
+ ):
658
+ """Returns a fewshot context string that is made up of a prepended description
659
+ (if provided), the `num_fewshot` number of examples, and an appended prompt example.
660
+
661
+ :param doc: str
662
+ The document as returned from training_docs, validation_docs, or test_docs.
663
+ :param num_fewshot: int
664
+ The number of fewshot examples to provide in the returned context string.
665
+ :param provide_description: bool
666
+ Not implemented, and this option is deprecated and will be removed in a future version in favor of a different description providing method
667
+ :param rnd: random.Random
668
+ The pseudo-random number generator used to randomly sample examples.
669
+ WARNING: This is currently a required arg although it's optionalized with a default `None`.
670
+ :param description: str
671
+ The task's description that will be prepended to the fewshot examples.
672
+ :param stratified: bool
673
+ When true, does stratified sampling, using the target from `self.doc_to_target` as the stratum.
674
+ :returns: str
675
+ The fewshot context.
676
+ """
677
+ assert (
678
+ rnd is not None
679
+ ), "A `random.Random` generator argument must be provided to `rnd`"
680
+ assert not provide_description, (
681
+ "The `provide_description` arg will be removed in future versions. To prepend "
682
+ "a custom description to the context, supply the corresponding string via the "
683
+ "`description` arg."
684
+ )
685
+ if provide_description is not None:
686
+ # nudge people to not specify it at all
687
+ print(
688
+ "WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict"
689
+ )
690
+ if hasattr(self, "FEWSHOT_SEP"):
691
+ FEWSHOT_SEP = self.FEWSHOT_SEP
692
+ elif hasattr(self, "SEP"):
693
+ FEWSHOT_SEP = f"{self.SEP}{self.SEP}"
694
+ else:
695
+ FEWSHOT_SEP = "\n\n"
696
+
697
+ if description:
698
+ description += FEWSHOT_SEP
699
+ elif hasattr(self, "DESCRIPTION"):
700
+ description = self.DESCRIPTION
701
+ else:
702
+ description = ""
703
+
704
+ if num_fewshot == 0:
705
+ labeled_examples = ""
706
+ else:
707
+ # for sets with no training docs, draw from other set *but ensure no overlap with current doc*
708
+ if self.has_training_docs():
709
+ fewshotex = self.fewshot_examples(
710
+ k=num_fewshot, rnd=rnd, stratified=stratified
711
+ )
712
+ else:
713
+ if self._fewshot_docs is None:
714
+ self._fewshot_docs = list(
715
+ self.validation_docs()
716
+ if self.has_validation_docs()
717
+ else self.test_docs()
718
+ )
719
+
720
+ if stratified:
721
+ fewshotex = self._stratified_fewshot_examples(
722
+ self._fewshot_docs, num_fewshot + 1, rnd=rnd
723
+ )
724
+ else:
725
+ fewshotex = rnd.sample(self._fewshot_docs, num_fewshot + 1)
726
+
727
+ # get rid of the doc that's the one we're evaluating, if it's in the fewshot
728
+ fewshotex = [x for x in fewshotex if x != doc][:num_fewshot]
729
+
730
+ labeled_examples = (
731
+ FEWSHOT_SEP.join(
732
+ [
733
+ self.doc_to_text(doc) + self.doc_to_target(doc)
734
+ for doc in fewshotex
735
+ ]
736
+ )
737
+ + FEWSHOT_SEP
738
+ )
739
+
740
+ example = self.doc_to_text(doc)
741
+ return description + labeled_examples + example
742
+
743
    def set_tokenizer(self, tokenizer):
        # Store the model's tokenizer on the task so token-aware tasks can use
        # it (set by the evaluator when the task declares LOAD_TOKENIZER).
        self.tokenizer = tokenizer
745
+
746
+
747
class MultipleChoiceTask(Task):
    """Task whose answer is one of a per-document list of string choices."""

    def doc_to_target(self, doc):
        # The target continuation is the gold choice, space-prefixed so it
        # concatenates cleanly after the prompt text.
        return " " + doc["choices"][doc["gold"]]

    def construct_requests(self, doc, ctx):
        # One loglikelihood request per candidate answer; keep only the
        # loglikelihood component of each (ll, is_greedy) pair.
        return [
            rf.loglikelihood(ctx, f" {choice}")[0] for choice in doc["choices"]
        ]

    def process_results(self, doc, results):
        gold = doc["gold"]

        acc = float(np.argmax(results) == gold)
        # Length-normalized accuracy: divide each score by its choice length
        # so longer answers are not penalized for accumulating more tokens.
        lengths = np.array([float(len(choice)) for choice in doc["choices"]])
        acc_norm = float(np.argmax(results / lengths) == gold)

        return {
            "acc": acc,
            "acc_norm": acc_norm,
            "details": {
                "scores": results,
            },
        }

    def higher_is_better(self):
        return {"acc": True, "acc_norm": True}

    def aggregation(self):
        return {"acc": mean, "acc_norm": mean}
784
+
785
+
786
class BalancedMultipleChoiceTask(MultipleChoiceTask):
    """A task where the choices are the same every time, and accuracy should be
    calculated separately for each class.

    Originally created for marc-ja, which is severely imbalanced, though also
    useful with less weird datasets. Not suitable for datasets where the choices
    change for every question.
    """

    def process_results(self, doc, results):
        gold = doc["gold"]

        # This isn't very clean, but it may be the best we can do since lm ops
        # are submitted as an iterator for batching: a trailing string entry is
        # a raw generated response rather than a loglikelihood score.
        response = results.pop() if isinstance(results[-1], str) else None

        pred = np.argmax(results)
        acc = float(pred == gold)
        lengths = np.array([float(len(choice)) for choice in doc["choices"]])
        acc_norm = float(np.argmax(results / lengths) == gold)

        return {
            "acc": acc,
            "acc_norm": acc_norm,
            # (score, label) / (gold, pred) tuples are aggregated downstream.
            "balanced_acc": (acc, gold),
            "mcc": (gold, pred),
            "macro_f1": (gold, pred),
            "details": {
                "question": self.doc_to_text(doc),
                "response": response,
                "scores": results,
            },
        }

    def higher_is_better(self):
        return {
            metric: True
            for metric in ("acc", "acc_norm", "balanced_acc", "mcc", "macro_f1")
        }

    def aggregation(self):
        return {
            "acc": mean,
            "acc_norm": mean,
            "balanced_acc": balanced_mean,
            "mcc": matthews_corrcoef,
            "macro_f1": macro_f1,
        }
839
+
840
+
841
class PerplexityTask(Task, abc.ABC):
    """Base class for zero-shot rolling-loglikelihood (perplexity) tasks."""

    def should_decontaminate(self):
        """Whether this task supports decontamination against model training set."""
        return True

    def has_training_docs(self):
        return False

    def fewshot_examples(self, k, rnd):
        # Perplexity tasks are strictly zero-shot.
        assert k == 0
        return []

    def fewshot_context(
        self, doc, num_fewshot, provide_description=None, rnd=None, description=None
    ):
        # Validate the zero-shot contract, then return an empty context: the
        # model scores the raw document with no prompt.
        assert (
            num_fewshot == 0
        ), "The number of fewshot examples must be 0 for perplexity tasks."
        assert (
            rnd is not None
        ), "A `random.Random` generator argument must be provided to `rnd`."
        assert not provide_description, (
            "The `provide_description` arg will be removed in future versions. To prepend "
            "a custom description to the context, supply the corresponding string via the "
            "`description` arg."
        )
        if provide_description is not None:
            # nudge people to not specify it at all
            print(
                "WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict"
            )

        return ""

    def higher_is_better(self):
        # All three metrics are "lower is better".
        return {
            metric: False
            for metric in ("word_perplexity", "byte_perplexity", "bits_per_byte")
        }

    def doc_to_decontamination_query(self, doc):
        return doc

    def doc_to_text(self, doc):
        return ""

    def doc_to_target(self, doc):
        return doc

    def construct_requests(self, doc, ctx):
        # Context must be empty (see fewshot_context above).
        assert not ctx
        return rf.loglikelihood_rolling(self.doc_to_target(doc))

    def process_results(self, doc, results):
        (loglikelihood,) = results
        n_words = self.count_words(doc)
        n_bytes = self.count_bytes(doc)
        return {
            "word_perplexity": (loglikelihood, n_words),
            "byte_perplexity": (loglikelihood, n_bytes),
            "bits_per_byte": (loglikelihood, n_bytes),
        }

    def aggregation(self):
        return {
            "word_perplexity": weighted_perplexity,
            "byte_perplexity": weighted_perplexity,
            "bits_per_byte": bits_per_byte,
        }

    @classmethod
    def count_bytes(cls, doc):
        return len(doc.encode("utf-8"))

    @classmethod
    def count_words(cls, doc):
        """Downstream tasks with custom word boundaries should override this!"""
        return len(re.split(r"\s+", doc))
921
+
922
+
923
def hash_args(attr, args):
    """Return a stable hex digest identifying an (attr, args) request pair."""
    payload = json.dumps([attr, *args])
    digest = hashlib.sha256(payload.encode("utf-8"))
    return digest.hexdigest()
926
+
927
+
928
class CacheHook:
    """Write-through hook letting an LM record partial results into a CachingLM db."""

    def __init__(self, cachinglm):
        # A hook built without a CachingLM is a silent no-op.
        self.dbdict = cachinglm.dbdict if cachinglm is not None else None

    def add_partial(self, attr, req, res):
        if self.dbdict is None:
            return
        self.dbdict[hash_args(attr, req)] = res
941
+
942
+
943
class CachingLM:
    def __init__(self, lm, cache_db):
        """LM wrapper that returns cached results if they exist, and uses the underlying LM if not.

        :param lm: LM
            Underlying LM
        :param cache_db: str
            Path to cache db
        """
        self.lm = lm
        self.cache_db = cache_db
        # Create the parent directory only when the path actually has one.
        if os.path.dirname(cache_db):
            os.makedirs(os.path.dirname(cache_db), exist_ok=True)
        self.dbdict = SqliteDict(cache_db, autocommit=True)

        # add hook to lm so the LM itself can write partial results into this cache
        lm.set_cache_hook(self.get_cache_hook())

    def __getattr__(self, attr):
        # Any attribute lookup not found on the wrapper (e.g. `loglikelihood`)
        # returns a function that serves cached results and delegates only the
        # cache misses to the underlying LM.
        # NOTE(review): this intercepts *all* missing attributes, so non-request
        # attributes of the wrapped LM are not proxied — confirm callers only
        # use request-type methods through the wrapper.
        def fn(requests):
            res = []
            remaining_reqs = []

            # figure out which ones are cached and which ones are new;
            # `None` marks a slot to be filled by a fresh LM result below.
            for req in requests:
                hsh = hash_args(attr, req)
                if hsh in self.dbdict:
                    ob = self.dbdict[hsh]

                    assert ob is not None

                    res.append(ob)
                else:
                    res.append(None)
                    remaining_reqs.append(req)

            # actually run the LM on the requests that do not have cached results
            rem_res = getattr(self.lm, attr)(remaining_reqs)

            # stick the new ones back into the list and also cache any of the new ones;
            # `resptr` walks forward to each unfilled (None) slot in request order.
            resptr = 0
            for req, r in zip(remaining_reqs, rem_res):
                while res[resptr] is not None:
                    resptr += 1

                res[resptr] = r

                # caching
                hsh = hash_args(attr, req)
                self.dbdict[hsh] = r
                self.dbdict.commit()

            return res

        return fn

    def get_cache_hook(self):
        return CacheHook(self)
1001
+
1002
+
1003
# Number of indexable values each request type yields per call; None means the
# result is a single opaque value that cannot be indexed or iterated.
REQUEST_RETURN_LENGTHS = {
    "loglikelihood": 2,
    "greedy_until": None,
    "loglikelihood_rolling": None,
}


class Request:
    """A deferred LM call of a given type; `index` selects one return slot."""

    def __init__(self, request_type, args, index=None):
        if request_type not in REQUEST_RETURN_LENGTHS:
            raise NotImplementedError(
                "The request type {} is not implemented!".format(request_type)
            )

        self.request_type = request_type
        self.args = args
        self.index = index

    def __iter__(self):
        # Yield one sub-request per return slot of this request type.
        slots = REQUEST_RETURN_LENGTHS[self.request_type]
        if slots is None:
            raise IndexError("This request type does not return multiple arguments!")
        for slot in range(slots):
            yield Request(self.request_type, self.args, slot)

    def __getitem__(self, i):
        if REQUEST_RETURN_LENGTHS[self.request_type] is None:
            raise IndexError("This request type does not return multiple arguments!")
        return Request(self.request_type, self.args, i)

    def __eq__(self, other):
        return (
            self.request_type == other.request_type
            and self.args == other.args
            and self.index == other.index
        )

    def __repr__(self):
        return f"Req_{self.request_type}{self.args}[{self.index}]\n"
1041
+
1042
+
1043
class RequestFactory:
    """Builds deferred requests: `rf.<request_type>(*args)` -> Request."""

    def __getattr__(self, attr):
        def build(*args):
            return Request(attr, args)

        return build


# Module-level singleton used throughout the harness to construct requests.
rf = RequestFactory()
scripts/yans/eval/lm-evaluation-harness/lm_eval/evaluator.py ADDED
@@ -0,0 +1,381 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import collections
2
+ import itertools
3
+ import numpy as np
4
+ import random
5
+ import lm_eval.metrics
6
+ import lm_eval.models
7
+ import lm_eval.tasks
8
+ import lm_eval.base
9
+ from lm_eval.utils import positional_deprecated, run_task_tests
10
+
11
+
12
@positional_deprecated
def simple_evaluate(
    model,
    model_args=None,
    tasks=None,
    num_fewshot=0,
    batch_size=None,
    device=None,
    no_cache=False,
    limit=None,
    bootstrap_iters=100000,
    description_dict=None,
    check_integrity=False,
    decontamination_ngrams_path=None,
    verbose=False,
):
    """Instantiate and evaluate a model on a list of tasks.

    :param model: Union[str, LM]
        Name of model or LM object, see lm_eval.models.get_model
    :param model_args: Optional[str]
        String arguments for each model class, see LM.create_from_arg_string.
        Ignored if `model` argument is a LM object.
    :param tasks: list[Union[str, Task]]
        List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
    :param num_fewshot: int or list of int
        Number of examples in few-shot context
    :param batch_size: int, optional
        Batch size for model
    :param device: str, optional
        PyTorch device (e.g. "cpu" or "cuda:0") for running models
    :param no_cache: bool
        Whether or not to cache
    :param limit: int or list of int, optional
        Limit the number of examples per task (only use this for testing)
    :param bootstrap_iters:
        Number of iterations for bootstrap statistics
    :param description_dict: dict[str, str]
        Dictionary of custom task descriptions of the form: `task_name: description`
    :param check_integrity: bool
        Whether to run the relevant part of the test suite for the tasks
    :return
        Dictionary of results
    """
    random.seed(1234)
    np.random.seed(1234)

    # Bug fix: `tasks` previously defaulted to a mutable [] (shared across
    # calls). None is the sentinel now; an empty/missing list is still an error.
    assert tasks, "No tasks specified"

    if isinstance(model, str):
        if model_args is None:
            model_args = ""
        lm = lm_eval.models.get_model(model).create_from_arg_string(
            model_args, {"batch_size": batch_size, "device": device}
        )
    else:
        assert isinstance(model, lm_eval.base.LM)
        lm = model

    if not no_cache:
        # Bug fix: the cache filename used to assume `model` was a string and
        # `model_args` was not None, raising TypeError when an LM object was
        # passed directly with caching enabled. Fall back to the class name
        # and an empty args string; the path for string models is unchanged.
        model_name = model if isinstance(model, str) else type(model).__name__
        args_str = (model_args or "").replace("=", "-").replace(",", "_").replace("/", "-")
        lm = lm_eval.base.CachingLM(
            lm,
            "lm_cache/" + model_name + "_" + args_str + ".db",
        )

    task_dict = lm_eval.tasks.get_task_dict(tasks)

    if check_integrity:
        run_task_tests(task_list=tasks)

    results = evaluate(
        lm=lm,
        task_dict=task_dict,
        num_fewshot=num_fewshot,
        limit=limit,
        bootstrap_iters=bootstrap_iters,
        description_dict=description_dict,
        decontamination_ngrams_path=decontamination_ngrams_path,
        verbose=verbose,
    )

    # add info about the model and few shot config
    results["config"] = {
        "model": model,
        "model_args": model_args,
        "num_fewshot": num_fewshot,
        "batch_size": batch_size,
        "device": device,
        "no_cache": no_cache,
        "limit": limit,
        "bootstrap_iters": bootstrap_iters,
        "description_dict": description_dict,
    }

    return results
112
+
113
+
114
decontaminate_suffix = "_decontaminate"


@positional_deprecated
def evaluate(
    lm,
    task_dict,
    provide_description=None,
    num_fewshot=0,
    limit=None,
    bootstrap_iters=100000,
    description_dict=None,
    decontamination_ngrams_path=None,
    verbose=False,
):
    """Instantiate and evaluate a model on a list of tasks.

    :param lm: obj
        Language Model
    :param task_dict: dict[str, Task]
        Dictionary of tasks. Tasks will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
    :param provide_description: bool
        Not implemented, and this option is deprecated and will be removed in a future version in favor of a different description providing method
    :param num_fewshot: int or list of int
        Number of examples in few-shot context
    :param limit: int or list of int, optional
        Limit the number of examples per task (only use this for testing)
    :param bootstrap_iters:
        Number of iterations for bootstrap statistics
    :param description_dict: dict[str, str]
        Dictionary of custom task descriptions of the form: `task_name: description`
    :param verbose: bool
        When True, per-document detail dicts are attached to each task's results.
    :return
        Dictionary of results
    """
    # TODO: completely refactor this entire function to not be a huge mess, ideally breaking it down into smaller pieces

    # TODO: todo: implement proper description-providing system
    assert not provide_description  # not implemented.
    if provide_description is not None:
        # nudge people to not specify it at all
        print(
            "WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict"
        )
    # Broadcast scalar num_fewshot / limit to one value per task.
    if isinstance(num_fewshot, list):
        assert len(task_dict) == len(
            num_fewshot
        ), f"The number of tasks ({len(task_dict)}) must be same as the number of elements in `num_fewshot` ({len(num_fewshot)})"
    else:
        # num_fewshot is int
        num_fewshot = [num_fewshot] * len(task_dict)
    if isinstance(limit, list):
        # Bug fix: this message previously named `num_fewshot` instead of `limit`.
        assert len(task_dict) == len(
            limit
        ), f"The number of tasks ({len(task_dict)}) must be same as the number of elements in `limit` ({len(limit)})"
    else:
        # limit is int or None
        limit = [limit] * len(task_dict)

    decontaminate = decontamination_ngrams_path is not None

    # Only tasks that expose at least one evaluation split can be run.
    task_dict_items = [
        (name, task)
        for name, task in task_dict.items()
        if (task.has_validation_docs() or task.has_test_docs())
    ]

    results = collections.defaultdict(dict)
    versions = collections.defaultdict(dict)

    requests = collections.defaultdict(list)
    requests_origin = collections.defaultdict(list)

    overlaps = collections.defaultdict(list)  # {task_name: contaminated_docs}

    # If we ever run into issues where the eval tasks don't fit in memory and we can't afford a machine with bigger
    # memory, we can always modify this plumbing to support that, but I didn't want to include it just yet because
    # over-engineering is bad (or we could make it write the requests to disk and then read them back out again
    # - probably using an sqlite db because of all the moving parts we have

    # TODO: we need unit tests & sanity checks or something to ensure that the return of `validation_docs` is stable
    docs = {}

    docs_for_decontamination = collections.defaultdict(list)

    # get lists of each type of request
    for idx, (task_name, task) in enumerate(task_dict_items):
        versions[task_name] = task.VERSION
        # default to test doc, fall back to val doc if validation unavailable
        # TODO: the test-fallback-to-val system isn't final, we should revisit it at some point
        if task.has_test_docs():
            task_doc_func = task.test_docs
            task_set = "test"  # Required for caching in the decontamination
        elif task.has_validation_docs():
            task_set = "val"  # Required for caching in the decontamination
            task_doc_func = task.validation_docs
        else:
            raise RuntimeError("Task has neither test_docs nor validation_docs")

        # deterministically shuffle docs and chop off the first `limit` because sometimes docs are in some kind of order
        task_docs = list(task_doc_func())
        rnd = random.Random()
        rnd.seed(42)
        rnd.shuffle(task_docs)

        description = (
            description_dict[task_name]
            if description_dict and task_name in description_dict
            else ""
        )
        # set tokenizer inside task (unwrap CachingLM to reach the real model)
        if task.LOAD_TOKENIZER:
            if isinstance(lm, lm_eval.base.CachingLM):
                task.set_tokenizer(lm.lm.tokenizer)
            else:
                task.set_tokenizer(lm.tokenizer)
        # set max_length / max_gen_toks on the task object
        task.max_length = (
            lm.lm.max_length
            if isinstance(lm, lm_eval.base.CachingLM)
            else lm.max_length
        )
        task.max_gen_toks = (
            lm.lm.max_gen_toks
            if isinstance(lm, lm_eval.base.CachingLM)
            else lm.max_gen_toks
        )

        limit_local = limit[idx]
        if isinstance(limit_local, float):
            # A float limit is interpreted as a ratio of the task's documents.
            limit_local = int(limit_local * len(task_docs))
            print(
                f"Use {limit_local}/{len(task_docs)} samples corresponding to the ratio of {limit[idx]}"
            )
        for doc_id, doc in enumerate(itertools.islice(task_docs, 0, limit_local)):

            if decontaminate and task.should_decontaminate():
                docs_for_decontamination[(task_name, task_set)].append(
                    task.doc_to_decontamination_query(doc)
                )

            docs[(task_name, doc_id)] = doc
            ctx = task.fewshot_context(
                doc=doc, num_fewshot=num_fewshot[idx], rnd=rnd, description=description
            )
            reqs = task.construct_requests(doc, ctx)
            if not isinstance(reqs, (list, tuple)):
                reqs = [reqs]
            for i, req in enumerate(reqs):
                requests[req.request_type].append(req)
                # i: index in requests for a single task instance
                # doc_id: unique id that we can get back to a doc using `docs`
                requests_origin[req.request_type].append((i, task_name, doc, doc_id))

    # Compare all tasks/sets at once to ensure a single training set scan
    if decontaminate:
        from lm_eval.decontamination.decontaminate import get_train_overlap

        print("Finding train/test overlap, please wait...")
        overlaps = get_train_overlap(
            docs_for_decontamination, decontamination_ngrams_path, limit
        )

    # all responses for each (task, doc)
    process_res_queue = collections.defaultdict(list)

    # execute each type of request
    for reqtype, reqs in requests.items():
        # TODO: right now, this code runs multiple separate LM requests for multiple Requests differing
        # only in index. We could implement some kind of caching, but that would be more of a band-aid
        # solution. we could also implement some kind of auto-grouping here;
        # they should end up next to each other.

        print("Running", reqtype, "requests")
        resps = getattr(lm, reqtype)([req.args for req in reqs])
        resps = [
            x if req.index is None else x[req.index] for x, req in zip(resps, reqs)
        ]

        for resp, (i, task_name, doc, doc_id) in zip(resps, requests_origin[reqtype]):
            process_res_queue[(task_name, doc_id)].append((i, resp))

    vals = collections.defaultdict(list)
    # holds detailed responses for error analysis
    details = collections.defaultdict(list)

    # unpack results and sort back in order and return control to Task
    # (loop variable renamed from `requests` to stop shadowing the request
    # queue built above)
    for (task_name, doc_id), doc_responses in process_res_queue.items():
        doc_responses.sort(key=lambda x: x[0])
        doc_responses = [x[1] for x in doc_responses]

        task = task_dict[task_name]
        doc = docs[(task_name, doc_id)]

        metrics = task.process_results(doc, doc_responses)
        if "details" in metrics:
            details[task_name].append(metrics["details"])
            del metrics["details"]
        for metric, value in metrics.items():
            vals[(task_name, metric)].append(value)

            # Re-use the evaluation for the decontaminated set by just ignoring the overlaps
            if decontaminate and task_name in overlaps:
                if doc_id not in overlaps[task_name]:
                    vals[(task_name, metric + decontaminate_suffix)].append(value)

    # aggregate results
    for (task_name, metric), items in vals.items():
        task = task_dict[task_name]
        real_metric = metric  # key when looking up the metric with task.aggregation
        if metric.endswith(decontaminate_suffix):
            real_metric = metric.replace(
                decontaminate_suffix, ""
            )  # decontaminated still uses the same metric
        results[task_name][metric] = task.aggregation()[real_metric](items)

        # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
        # so we run them less iterations. still looking for a cleaner way to do this

        stderr = lm_eval.metrics.stderr_for_metric(
            metric=task.aggregation()[real_metric],
            bootstrap_iters=min(bootstrap_iters, 1000)
            if metric in ["bleu", "chrf", "ter"]
            else bootstrap_iters,
        )

        if stderr is not None:
            results[task_name][metric + "_stderr"] = stderr(items)

        if verbose and task_name in details:
            results[task_name]["details"] = details[task_name]

    return {"results": dict(results), "versions": dict(versions)}
346
+
347
+
348
def make_table(result_dict):
    """Render `result_dict` as a markdown table (the latex writer is kept
    around for parity with upstream but its output is not returned)."""
    from pytablewriter import MarkdownTableWriter, LatexTableWriter

    md_writer = MarkdownTableWriter()
    latex_writer = LatexTableWriter()
    md_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]
    latex_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]

    rows = []

    for task_name, metrics in result_dict["results"].items():
        version = result_dict["versions"][task_name]
        for metric, value in metrics.items():
            # Detail payloads and stderr entries are rendered alongside their
            # base metric, not as their own rows.
            if metric == "details" or metric.endswith("_stderr"):
                continue

            stderr_key = metric + "_stderr"
            if stderr_key in metrics:
                rows.append(
                    [
                        task_name,
                        version,
                        metric,
                        "%.4f" % value,
                        "±",
                        "%.4f" % metrics[stderr_key],
                    ]
                )
            else:
                rows.append([task_name, version, metric, "%.4f" % value, "", ""])
            # Only print the task name / version on its first row.
            task_name = ""
            version = ""
    md_writer.value_matrix = rows
    latex_writer.value_matrix = rows

    # todo: make latex table look good
    # print(latex_writer.dumps())

    return md_writer.dumps()
scripts/yans/eval/lm-evaluation-harness/lm_eval/jasquad/README.md ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Metric Card For Japanese SQuAD
2
+ Heavily based on https://github.com/huggingface/datasets/tree/main/metrics/squad
scripts/yans/eval/lm-evaluation-harness/lm_eval/jasquad/__init__.py ADDED
File without changes
scripts/yans/eval/lm-evaluation-harness/lm_eval/jasquad/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (169 Bytes). View file
 
scripts/yans/eval/lm-evaluation-harness/lm_eval/jasquad/__pycache__/evaluate.cpython-310.pyc ADDED
Binary file (4.06 kB). View file
 
scripts/yans/eval/lm-evaluation-harness/lm_eval/jasquad/__pycache__/jasquad.cpython-310.pyc ADDED
Binary file (4.06 kB). View file
 
scripts/yans/eval/lm-evaluation-harness/lm_eval/jasquad/evaluate.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ Official evaluation script for v1.1 of the SQuAD dataset. """
2
+
3
+ import argparse
4
+ import json
5
+ import re
6
+ import string
7
+ import sys
8
+ from collections import Counter
9
+
10
+
11
def remove_punc(tokens):
    """Drop punctuation tokens (Japanese full-width and ASCII) from *tokens*.

    :param tokens: list of token strings
    :return: new list with punctuation-only tokens removed
    """
    exclude = (
        "!?。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
    )
    exclude += string.punctuation
    # Perf fix: membership tests against a list are O(len(exclude)) per token;
    # a frozenset makes each lookup O(1). Filtering behavior is unchanged.
    punct = frozenset(exclude)
    return [tok for tok in tokens if tok not in punct]
18
+
19
+
20
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    import emoji
    import neologdn

    def collapse_whitespace(text):
        return " ".join(text.split())

    def strip_emoji(text):
        # Two passes: the emoji library's detector first, then a regex sweep
        # over the common pictograph code-point ranges.
        text = "".join("" if emoji.is_emoji(ch) else ch for ch in text)
        emoji_pattern = re.compile(
            "["
            "\U0001F600-\U0001F64F"  # emoticons
            "\U0001F300-\U0001F5FF"  # symbols & pictographs
            "\U0001F680-\U0001F6FF"  # transport & map symbols
            "\U0001F1E0-\U0001F1FF"  # flags (iOS)
            "\U00002702-\U000027B0"
            "]+",
            flags=re.UNICODE,
        )
        return emoji_pattern.sub(r"", text)

    return collapse_whitespace(neologdn.normalize(strip_emoji(s)))
43
+
44
+
45
def f1_score(prediction, ground_truth):
    """Token-level F1 between prediction and ground truth, using MeCab
    (wakati mode) tokenization with punctuation tokens removed."""
    from fugashi import Tagger

    tagger = Tagger("-Owakati")
    prediction_tokens = remove_punc(tagger.parse(normalize_answer(prediction)).split())
    ground_truth_tokens = remove_punc(
        tagger.parse(normalize_answer(ground_truth)).split()
    )
    overlap = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(overlap.values())
    if num_same == 0:
        return 0
    precision = num_same / len(prediction_tokens)
    recall = num_same / len(ground_truth_tokens)
    return (2 * precision * recall) / (precision + recall)
61
+
62
+
63
def exact_match_score(prediction, ground_truth):
    # Exact match after both strings pass through normalize_answer
    # (neologdn normalization, emoji stripped, whitespace collapsed).
    return normalize_answer(prediction) == normalize_answer(ground_truth)
65
+
66
+
67
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Best score of `metric_fn(prediction, gt)` over all gold answers."""
    return max(metric_fn(prediction, gt) for gt in ground_truths)
73
+
74
+
75
def evaluate(dataset, predictions):
    """Corpus-level exact-match / F1 (as percentages) over a SQuAD-format
    dataset; unanswered questions count as zero."""
    f1 = exact_match = total = 0
    for article in dataset:
        for paragraph in article["paragraphs"]:
            for qa in paragraph["qas"]:
                total += 1
                qid = qa["id"]
                if qid not in predictions:
                    message = (
                        "Unanswered question " + qid + " will receive score 0."
                    )
                    print(message, file=sys.stderr)
                    continue
                ground_truths = [answer["text"] for answer in qa["answers"]]
                prediction = predictions[qid]
                # Each question scores against its best-matching gold answer.
                exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths
                )
                f1 += metric_max_over_ground_truths(f1_score, prediction, ground_truths)

    return {
        "exact_match": 100.0 * exact_match / total,
        "f1": 100.0 * f1 / total,
    }
98
+
99
+
100
if __name__ == "__main__":
    # Stand-alone CLI: score a prediction file against a SQuAD v1.1 dataset
    # file and print the EM/F1 summary as JSON on stdout.
    expected_version = "1.1"
    parser = argparse.ArgumentParser(
        description="Evaluation for Japanese SQuAD " + expected_version
    )
    parser.add_argument("dataset_file", help="Dataset file")
    parser.add_argument("prediction_file", help="Prediction File")
    args = parser.parse_args()
    with open(args.dataset_file) as dataset_file:
        dataset_json = json.load(dataset_file)
        # A version mismatch only warns (on stderr); evaluation proceeds anyway.
        if dataset_json["version"] != expected_version:
            print(
                "Evaluation expects v-"
                + expected_version
                + ", but got dataset with v-"
                + dataset_json["version"],
                file=sys.stderr,
            )
        dataset = dataset_json["data"]
    with open(args.prediction_file) as prediction_file:
        predictions = json.load(prediction_file)
    print(json.dumps(evaluate(dataset, predictions)))
scripts/yans/eval/lm-evaluation-harness/lm_eval/jasquad/jasquad.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The HuggingFace Datasets Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """ SQuAD metric. """
15
+
16
+ import datasets
17
+
18
+ from .evaluate import evaluate
19
+
20
+
21
# BibTeX entry surfaced through the metric's `citation` field.
_CITATION = """\
@inproceedings{Rajpurkar2016SQuAD10,
    title={SQuAD: 100, 000+ Questions for Machine Comprehension of Text},
    author={Pranav Rajpurkar and Jian Zhang and Konstantin Lopyrev and Percy Liang},
    booktitle={EMNLP},
    year={2016}
}
"""

# Human-readable summary shown by the `datasets` library for this metric.
_DESCRIPTION = """
This metric wrap the official scoring script for version 1 of the Stanford Question Answering Dataset (SQuAD).

Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by
crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span,
from the corresponding reading passage, or the question might be unanswerable.
"""

# Argument/return documentation injected into the class docstring via
# `add_start_docstrings`; includes a doctest-style usage example.
_KWARGS_DESCRIPTION = """
Computes SQuAD scores (F1 and EM).
Args:
    predictions: List of question-answers dictionaries with the following key-values:
        - 'id': id of the question-answer pair as given in the references (see below)
        - 'prediction_text': the text of the answer
    references: List of question-answers dictionaries with the following key-values:
        - 'id': id of the question-answer pair (see above),
        - 'answers': a Dict in the SQuAD dataset format
            {
                'text': list of possible texts for the answer, as a list of strings
                'answer_start': list of start positions for the answer, as a list of ints
            }
            Note that answer_start values are not taken into account to compute the metric.
Returns:
    'exact_match': Exact match (the normalized answer exactly match the gold answer)
    'f1': The F-score of predicted tokens versus the gold answer
Examples:

    >>> predictions = [{'prediction_text': '1976', 'id': '56e10a3be3433e1400422b22'}]
    >>> references = [{'answers': {'answer_start': [97], 'text': ['1976']}, 'id': '56e10a3be3433e1400422b22'}]
    >>> squad_metric = datasets.load_metric("squad")
    >>> results = squad_metric.compute(predictions=predictions, references=references)
    >>> print(results)
    {'exact_match': 100.0, 'f1': 100.0}
"""
64
+
65
+
66
@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class JaSquad(datasets.Metric):
    # SQuAD-style EM/F1 metric for Japanese QA; scoring is delegated to the
    # bundled `evaluate` module (MeCab tokenization, Japanese punctuation).

    def _info(self):
        # Declares the metric's input schema: SQuAD-format predictions and
        # references. `answer_start` is accepted but ignored by the scorer.
        return datasets.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": {
                        "id": datasets.Value("string"),
                        "prediction_text": datasets.Value("string"),
                    },
                    "references": {
                        "id": datasets.Value("string"),
                        "answers": datasets.features.Sequence(
                            {
                                "text": datasets.Value("string"),
                                "answer_start": datasets.Value("int32"),
                            }
                        ),
                    },
                }
            ),
            codebase_urls=["https://rajpurkar.github.io/SQuAD-explorer/"],
            reference_urls=["https://rajpurkar.github.io/SQuAD-explorer/"],
        )

    def _compute(self, predictions, references):
        # Map prediction id -> answer text, and re-shape the flat references
        # into the nested SQuAD v1.1 dataset layout the scorer expects.
        pred_dict = {
            prediction["id"]: prediction["prediction_text"]
            for prediction in predictions
        }
        dataset = [
            {
                "paragraphs": [
                    {
                        "qas": [
                            {
                                "answers": [
                                    {"text": answer_text}
                                    for answer_text in ref["answers"]["text"]
                                ],
                                "id": ref["id"],
                            }
                            for ref in references
                        ]
                    }
                ]
            }
        ]
        # Memoize the previous result: scoring re-runs MeCab over every answer,
        # so a repeated call with identical inputs returns the cached score.
        # NOTE(review): the equality check assumes predictions/references are
        # list-like and comparable to the cached copies — confirm for
        # iterator-style inputs.
        score = getattr(self, "cached_s", None)
        if score:
            cached_p = getattr(self, "cached_p", None)
            cached_r = getattr(self, "cached_r", None)
            if cached_p == predictions and cached_r == references:
                return score

        score = evaluate(dataset=dataset, predictions=pred_dict)
        setattr(self, "cached_s", score)
        setattr(self, "cached_p", list(predictions))
        setattr(self, "cached_r", list(references))
        return score
scripts/yans/eval/lm-evaluation-harness/lm_eval/jasquad/requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
scripts/yans/eval/lm-evaluation-harness/lm_eval/metrics.py ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from collections.abc import Iterable
3
+
4
+ import numpy as np
5
+ import sacrebleu
6
+ import sklearn.metrics
7
+ import random
8
+ from collections import defaultdict
9
+
10
+
11
def mean(arr):
    """Arithmetic mean of a non-empty sequence."""
    total = sum(arr)
    return total / len(arr)
13
+
14
+
15
def pop_stddev(arr):
    """Population standard deviation (divides by N, not N-1)."""
    mu = mean(arr)
    variance = sum((x - mu) ** 2 for x in arr) / len(arr)
    return math.sqrt(variance)
18
+
19
+
20
def sample_stddev(arr):
    """Sample standard deviation (Bessel-corrected, divides by N-1)."""
    mu = mean(arr)
    variance = sum((x - mu) ** 2 for x in arr) / (len(arr) - 1)
    return math.sqrt(variance)
23
+
24
+
25
def mean_stderr(arr):
    """Standard error of the mean: sample stddev divided by sqrt(N)."""
    n = len(arr)
    return sample_stddev(arr) / math.sqrt(n)
27
+
28
+
29
def median(arr):
    """Upper-middle element of a sequence.

    NOTE(review): does not sort its input, so this is only a true median
    when callers pass sorted data — confirm against call sites.
    """
    midpoint = len(arr) // 2
    return arr[midpoint]
31
+
32
+
33
def balanced_mean(arr):
    """Macro-averaged accuracy over (acc_score, class_label) pairs.

    Scores are grouped by class label, averaged within each class, and the
    per-class averages are then averaged — so each class contributes equally
    regardless of its frequency.
    """
    by_class = defaultdict(list)
    for score, label in arr:
        by_class[label].append(score)

    class_averages = [sum(scores) / len(scores) for scores in by_class.values()]
    return sum(class_averages) / len(class_averages)
47
+
48
+
49
def matthews_corrcoef(items):
    """Matthews correlation coefficient over (gold, pred) pairs."""
    golds, preds = zip(*items)
    return sklearn.metrics.matthews_corrcoef(golds, preds)
54
+
55
+
56
def f1_score(items):
    """Binary F1 over (gold, pred) pairs.

    Uses sklearn's default binary averaging, which returns a single scalar.
    The previous implementation wrapped the scalar in `np.max(...)`, which is
    a no-op that misleadingly suggests an array is being reduced; it has been
    dropped and the scalar is returned as a plain float.
    """
    golds, preds = zip(*items)
    fscore = sklearn.metrics.f1_score(golds, preds)
    return float(fscore)
63
+
64
+
65
def macro_f1(items):
    """Macro-averaged F1 over (gold, pred) pairs.

    Unlike f1_score (binary average), this averages per-class F1 scores.
    """
    golds, preds = zip(*items)
    return sklearn.metrics.f1_score(golds, preds, average="macro")
73
+
74
+
75
def acc_all(items):
    """Question-level accuracy over (pred, doc) pairs.

    A (paragraph, question) group only counts as correct when every one of
    its answers was predicted correctly (as in SuperGLUE MultiRC scoring).
    """
    per_question = defaultdict(list)
    for pred, doc in items:
        key = (doc["idx"]["paragraph"], doc["idx"]["question"])
        gold_label = doc["label"] == 1
        per_question[key].append(gold_label == pred)

    return np.mean([int(all(answers)) for answers in per_question.values()])
92
+
93
+
94
def acc_all_stderr(items):
    """Standard error of the all-answers-correct question accuracy.

    NOTE(review): unlike acc_all, this groups by question id alone (not
    (paragraph, question)) — confirm that is intended.
    """
    per_question = defaultdict(list)
    for pred, doc in items:
        gold_label = doc["label"] == 1
        per_question[doc["idx"]["question"]].append(gold_label == pred)

    return mean_stderr([int(all(answers)) for answers in per_question.values()])
110
+
111
+
112
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Best score of `prediction` against any of the `ground_truths`."""
    return max(metric_fn(prediction, gt) for gt in ground_truths)
119
+
120
+
121
def perplexity(items):
    """Perplexity from per-item log-likelihoods: exp(-mean loglikelihood)."""
    avg_loglikelihood = mean(items)
    return math.exp(-avg_loglikelihood)
123
+
124
+
125
def weighted_mean(items):
    """Sum of numerators over sum of denominators for (num, den) pairs."""
    numerators, denominators = zip(*items)
    return sum(numerators) / sum(denominators)
128
+
129
+
130
def weighted_perplexity(items):
    """Perplexity from (loglikelihood, weight) pairs: exp(-weighted mean)."""
    avg_loglikelihood = weighted_mean(items)
    return math.exp(-avg_loglikelihood)
132
+
133
+
134
def bits_per_byte(items):
    """Bits per byte from (loglikelihood, byte_count) pairs (log2 rescale)."""
    neg_avg_loglikelihood = -weighted_mean(items)
    return neg_avg_loglikelihood / math.log(2)
136
+
137
+
138
def bleu(items):
    """Corpus BLEU over (ref, pred) pairs via sacrebleu. Higher is better.

    BLEU compares candidate n-grams against reference n-grams regardless of
    word order.
    Source: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
    Paper: https://www.aclweb.org/anthology/P02-1040/
    """
    refs, preds = zip(*items)
    refs, preds = _sacreformat(refs, preds)
    return sacrebleu.corpus_bleu(preds, refs).score
153
+
154
+
155
def chrf(items):
    """Corpus chrF++ over (ref, pred) pairs via sacrebleu. Higher is better.

    Character n-gram precision/recall enhanced with word n-grams.
    Source: https://github.com/m-popovic/chrF
    Paper: https://www.aclweb.org/anthology/W15-3049.pdf
    """
    refs, preds = zip(*items)
    refs, preds = _sacreformat(refs, preds)
    return sacrebleu.corpus_chrf(preds, refs).score
167
+
168
+
169
def ter(items):
    """Corpus Translation Error Rate over (ref, pred) pairs. Lower is better.

    TER counts the edits needed to turn a system output into a reference.
    Source: http://www.cs.umd.edu/~snover/tercom/
    Paper: http://mt-archive.info/AMTA-2006-Snover.pdf
    """
    refs, preds = zip(*items)
    refs, preds = _sacreformat(refs, preds)
    return sacrebleu.corpus_ter(preds, refs).score
182
+
183
+
184
def is_non_str_iterable(obj):
    """True for iterables that are not plain strings."""
    return not isinstance(obj, str) and isinstance(obj, Iterable)
186
+
187
+
188
def _sacreformat(refs, preds):
    """Massage refs/preds into sacrebleu's very particular corpus shape.

    sacrebleu expects (List[str] preds, List[List[str]] refs), where each
    inner ref list is the i-th reference *stream* across all predictions —
    i.e. refs has shape (M reference sets, N predictions), not (N, M).
    """
    # refs may arrive as List[str] or List[List[str]] keyed by prediction;
    # normalize to per-prediction lists, then transpose to (M, N).
    if not is_non_str_iterable(refs):
        refs = list(refs)
    if not is_non_str_iterable(refs[0]):
        refs = [[r] for r in refs]
    refs = list(zip(*refs))
    # Note: every transposed ref stream must have one entry per prediction.

    # preds may arrive as List[str] or List[List[str]] with singleton inners;
    # normalize to List[str].
    if not is_non_str_iterable(preds):
        preds = list(preds)
    if is_non_str_iterable(preds[0]):
        assert len(preds[0]) == 1, f"Pred must be a str, was {preds[0]}"
        preds = [p[0] for p in preds]

    return refs, preds
214
+
215
+
216
+ # stderr stuff
217
+
218
+
219
+ class _bootstrap_internal:
220
+ def __init__(self, f, n):
221
+ self.f = f
222
+ self.n = n
223
+
224
+ def __call__(self, v):
225
+ i, xs = v
226
+ rnd = random.Random()
227
+ rnd.seed(i)
228
+ res = []
229
+ for _ in range(self.n):
230
+ res.append(self.f(rnd.choices(xs, k=len(xs))))
231
+ return res
232
+
233
+
234
def bootstrap_stderr(f, xs, iters):
    """Bootstrap estimate of the standard error of statistic `f` over `xs`.

    Runs `iters` resamples in chunks of up to 1000 across all CPU cores.
    NOTE: this is a slightly biased estimator (with the mean it matches a
    stderr computed without Bessel's correction); no clean correction is
    known, and the bias is negligible at the sample sizes we use.
    """
    import multiprocessing as mp
    from tqdm import tqdm

    chunk_size = min(1000, iters)
    num_chunks = iters // chunk_size
    pool = mp.Pool(mp.cpu_count())

    print("bootstrapping for stddev:", f.__name__)
    resampled_stats = []
    chunk_results = pool.imap(
        _bootstrap_internal(f, chunk_size),
        [(i, xs) for i in range(num_chunks)],
    )
    for chunk in tqdm(chunk_results, total=num_chunks):
        # each chunk holds chunk_size statistics of with-replacement resamples
        resampled_stats.extend(chunk)

    pool.close()
    return sample_stddev(resampled_stats)
261
+
262
+
263
def stderr_for_metric(metric, bootstrap_iters):
    """Return a stderr function for `metric`, or None if none is defined.

    Metrics without a closed-form stderr are bootstrapped with
    `bootstrap_iters` resamples; mean and acc_all have analytic versions.
    """
    bootstrappable = {
        median,
        matthews_corrcoef,
        f1_score,
        perplexity,
        bleu,
        chrf,
        ter,
    }
    if metric in bootstrappable:
        return lambda x: bootstrap_stderr(metric, x, iters=bootstrap_iters)

    analytic = {mean: mean_stderr, acc_all: acc_all_stderr}
    return analytic.get(metric, None)
280
+
281
+
282
def yesno(x):
    """Map a truthy value to "yes" and a falsy one to "no"."""
    return "yes" if x else "no"
scripts/yans/eval/lm-evaluation-harness/lm_eval/prompts.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def jslm_beta(task):
    """JSLM Beta uses a different prompt for JCommonSenseQA."""
    if task == "jcommonsenseqa":
        return "0.2.1"
    else:
        return "0.2"


# Maps a short, human-friendly prompt name to its version code. A value may
# also be a callable that takes the task name and returns a code, for models
# whose prompt depends on the task.
PROMPT_CODES = {
    "user": "0.0",
    "jgpt": "0.1",
    "fintan": "0.2",
    "fintan2": "0.2.1",
    "ja-alpaca": "0.3",
    "rinna-sft": "0.4",
    "rinna-bilingual": "0.5",
    "llama2": "0.6",
    "jslm-beta": jslm_beta,
}


def get_prompt_code(short_name, task=None):
    """Get the prompt code given a short name.

    Usually this is a simple dictionary lookup, but some entries are
    callables that choose the code based on the task.

    Raises KeyError for an unknown short_name.
    """
    code = PROMPT_CODES[short_name]

    if callable(code):
        # BUG FIX: this previously returned `callable(task)` — i.e. a bool
        # saying whether the task is callable — instead of invoking the
        # resolver. Call the resolver with the task to get the actual code.
        return code(task)
    else:
        return code
scripts/yans/eval/lm-evaluation-harness/lm_eval/suites/__init__.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Functionality related to "eval suites". A suite is a collection of tasks with
2
+ # options pre-configured. Different models can be run with the same suite to
3
+ # compare them.
4
+ import configparser
5
+ from dataclasses import dataclass
6
+ from typing import Optional
7
+ import os
8
+ from pathlib import Path
9
+
10
+ # This file is the path where suite configs go
11
+ SUITE_DIR = Path(os.path.dirname(os.path.realpath(__file__))) / "configs"
12
+
13
+
14
+ @dataclass
15
+ class TaskSpec:
16
+ """Specification of a task in an eval suite.
17
+
18
+ A suite is a list of these specs, plus a prompt."""
19
+
20
+ # The real arguments have to be massaged into messy strings and parallel
21
+ # lists, but this is a more reasonable structure - we can handle conversion
22
+ # separately.
23
+
24
+ name: str
25
+ fewshot: int
26
+ version: Optional[str]
27
+
28
+
29
def load_suite(name):
    """Read the configuration for a test suite and return its TaskSpecs.

    A suite lives in a config file named `<name>.conf` under SUITE_DIR. Each
    `[tasks.<task>]` section gives the fewshot count and, optionally, a
    version. Example entry:

        [tasks.mgsm]
        version = 1.0
        fewshot = 5
    """
    parser = configparser.ConfigParser()
    parser.read(SUITE_DIR / (name + ".conf"))

    return [
        TaskSpec(
            name=section.split(".", 1)[1],
            version=values.get("version", None),
            fewshot=int(values["fewshot"]),
        )
        for section, values in parser.items()
        if section.startswith("tasks.")
    ]
scripts/yans/eval/lm-evaluation-harness/lm_eval/suites/configs/ja8.conf ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This is the standard eight-task eval suite.
2
+
3
+ [tasks.mgsm]
4
+ version = 1.0
5
+ fewshot = 5
6
+
7
+ [tasks.xwinograd_ja]
8
+ # this has no version
9
+ fewshot = 0
10
+
11
+ [tasks.xlsum_ja]
12
+ version = 1.0
13
+ fewshot = 1
14
+
15
+ [tasks.jaqket_v2]
16
+ version = 0.2
17
+ fewshot = 1
18
+
19
+ [tasks.marc_ja]
20
+ version = 1.1
21
+ fewshot = 3
22
+
23
+ [tasks.jnli]
24
+ version = 1.3
25
+ fewshot = 3
26
+
27
+ [tasks.jcommonsenseqa]
28
+ version = 1.1
29
+ fewshot = 3
30
+
31
+ [tasks.jsquad]
32
+ version = 1.1
33
+ fewshot = 2
scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/coqa.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CoQA: A Conversational Question Answering Challenge
3
+ https://arxiv.org/pdf/1808.07042.pdf
4
+
5
+ CoQA is a large-scale dataset for building Conversational Question Answering
6
+ systems. The goal of the CoQA challenge is to measure the ability of machines to
7
+ understand a text passage and answer a series of interconnected questions that
8
+ appear in a conversation.
9
+
10
+ Homepage: https://stanfordnlp.github.io/coqa/
11
+ """
12
+ import inspect
13
+ import transformers.data.metrics.squad_metrics as squad_metrics
14
+ import lm_eval.datasets.coqa.coqa
15
+ from lm_eval.base import Task, rf, mean
16
+ from itertools import zip_longest
17
+
18
+
19
+ _CITATION = """
20
+ @misc{reddy2018coqa,
21
+ title={CoQA: A Conversational Question Answering Challenge},
22
+ author={Siva Reddy and Danqi Chen and Christopher D. Manning},
23
+ year={2018},
24
+ eprint={1808.07042},
25
+ archivePrefix={arXiv},
26
+ primaryClass={cs.CL}
27
+ }
28
+ """
29
+
30
+
31
class CoQA(Task):
    """Conversational QA over a story.

    Given the passage and all previous Q/A turns, the model must answer the
    final question. Predictions are scored with SQuAD-style EM and F1 against
    every valid gold answer for that turn.
    """

    VERSION = 1
    DATASET_PATH = inspect.getfile(lm_eval.datasets.coqa.coqa)
    DATASET_NAME = None

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        return self.dataset["train"]

    def validation_docs(self):
        return self.dataset["validation"]

    def test_docs(self):
        pass

    def doc_to_text(self, doc):
        # Prompt = story + every prior Q/A turn; the final question ends with
        # a bare "A:" for the model to complete.
        questions = doc["questions"]["input_text"]
        prior_answers = doc["answers"]["input_text"][:-1]  # omit target answer
        pieces = [doc["story"], "\n\n"]
        for question, answer in zip_longest(questions, prior_answers):
            pieces.append(f"Q: {question}\n\n")
            pieces.append(f"A: {answer}\n\n" if answer is not None else "A:")
        return "".join(pieces)

    def should_decontaminate(self):
        return True

    def doc_to_decontamination_query(self, doc):
        return doc["story"] + " " + "\n".join(doc["questions"]["input_text"])

    @classmethod
    def get_answers(cls, doc, turn_id):
        """Unique gold answers for a turn, including alternative annotations.

        Some CoQA questions carry multiple valid answers under
        `additional_answers`; duplicates are filtered case-insensitively.
        """
        answers = [doc["answers"]["input_text"][turn_id - 1]]

        additional_answers = doc.get("additional_answers")
        if additional_answers:
            for annotator in additional_answers:
                candidate = additional_answers[annotator]["input_text"][turn_id - 1]
                if candidate.lower() not in map(str.lower, answers):
                    answers.append(candidate)
        return answers

    @classmethod
    def get_answer_choice(self, raw_text):
        # Map an answer to a CoQA category. ~1/5 of CoQA answers are yes/no
        # and ~2/3 are span-based (overlap with the passage ignoring
        # punctuation and case).
        if raw_text == "unknown":
            return "0"
        normalized = squad_metrics.normalize_answer(raw_text)
        if normalized == "yes":
            return "1"
        if normalized == "no":
            return "2"
        return "3"  # Not a yes/no question

    @staticmethod
    def compute_scores(gold_list, pred):
        """EM/F1 of `pred` against the gold answers.

        With multiple golds, each gold is held out in turn and the prediction
        is scored against the remaining ones (taking the max), then the
        per-hold-out scores are averaged.
        """
        em_sum = 0.0
        f1_sum = 0.0
        if len(gold_list) > 1:
            for i in range(len(gold_list)):
                held_out = gold_list[0:i] + gold_list[i + 1 :]
                em_sum += max(
                    squad_metrics.compute_exact(g, pred) for g in held_out
                )
                f1_sum += max(squad_metrics.compute_f1(g, pred) for g in held_out)
        else:
            em_sum += max(squad_metrics.compute_exact(g, pred) for g in gold_list)
            f1_sum += max(squad_metrics.compute_f1(g, pred) for g in gold_list)

        denom = max(1, len(gold_list))
        return {"em": em_sum / denom, "f1": f1_sum / denom}

    def doc_to_target(self, doc, turnid=None):
        # Default to predicting the last turn's answer.
        if turnid is None:
            turnid = len(doc["questions"]["input_text"])
        return " " + doc["answers"]["input_text"][turnid - 1]

    def construct_requests(self, doc, ctx):
        """Request greedy generation, stopping at the next question marker.

        :param doc:
            The document as returned from training_docs/validation_docs.
        :param ctx: str
            Fewshot context plus the question part of `doc`.
        """
        return rf.greedy_until(ctx, ["\nQ:"])

    def process_results(self, doc, results):
        """Score the first generated line against all golds of the last turn.

        :param doc:
            The document as returned from training_docs/validation_docs.
        :param results:
            The results of the requests created in construct_requests.
        """
        last_turn = len(doc["questions"]["input_text"])
        golds = self.get_answers(doc, last_turn)
        pred = results[0].strip().split("\n")[0]

        scores = self.compute_scores(golds, pred)
        return {"f1": scores["f1"], "em": scores["em"]}

    def higher_is_better(self):
        return {"f1": True, "em": True}

    def aggregation(self):
        return {"f1": mean, "em": mean}
scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/hellaswag.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HellaSwag: Can a Machine Really Finish Your Sentence?
3
+ https://arxiv.org/pdf/1905.07830.pdf
4
+
5
+ Hellaswag is a commonsense inference challenge dataset. Though its questions are
6
+ trivial for humans (>95% accuracy), state-of-the-art models struggle (<48%). This is
7
+ achieved via Adversarial Filtering (AF), a data collection paradigm wherein a
8
+ series of discriminators iteratively select an adversarial set of machine-generated
9
+ wrong answers. AF proves to be surprisingly robust. The key insight is to scale up
10
+ the length and complexity of the dataset examples towards a critical 'Goldilocks'
11
+ zone wherein generated text is ridiculous to humans, yet often misclassified by
12
+ state-of-the-art models.
13
+
14
+ Homepage: https://rowanzellers.com/hellaswag/
15
+ """
16
+ import re
17
+ from lm_eval.base import MultipleChoiceTask
18
+
19
+
20
+ _CITATION = """
21
+ @inproceedings{zellers2019hellaswag,
22
+ title={HellaSwag: Can a Machine Really Finish Your Sentence?},
23
+ author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin},
24
+ booktitle ={Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
25
+ year={2019}
26
+ }
27
+ """
28
+
29
+
30
class HellaSwag(MultipleChoiceTask):
    """HellaSwag commonsense sentence-completion task (4-way multiple choice)."""

    VERSION = 0
    DATASET_PATH = "hellaswag"
    DATASET_NAME = None

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        # Cache the processed train split; it is reused for fewshot sampling.
        if self._training_docs is None:
            self._training_docs = list(map(self._process_doc, self.dataset["train"]))
        return self._training_docs

    def validation_docs(self):
        return map(self._process_doc, self.dataset["validation"])

    def _process_doc(self, doc):
        """Build the query/choices/gold dict the MultipleChoiceTask expects."""
        ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize()
        out_doc = {
            "query": self.preprocess(doc["activity_label"] + ": " + ctx),
            "choices": [self.preprocess(ending) for ending in doc["endings"]],
            "gold": int(doc["label"]),
        }
        return out_doc

    @classmethod
    def preprocess(cls, text):
        """Strip WikiHow artifacts: [title] markers, bracketed tags, and the
        double spaces their removal leaves behind."""
        text = text.strip()
        # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
        text = text.replace(" [title]", ". ")
        text = re.sub("\\[.*?\\]", "", text)
        # BUG FIX: this line previously read `text.replace(" ", " ")` — a
        # no-op replacing a space with a space (the double space was collapsed
        # somewhere in copy/transport). Collapse double spaces to single ones.
        text = text.replace("  ", " ")
        return text

    def doc_to_text(self, doc):
        return doc["query"]

    def should_decontaminate(self):
        return True

    def doc_to_decontamination_query(self, doc):
        return doc["query"]
scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/lambada.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ The LAMBADA dataset: Word prediction requiring a broad discourse context∗
3
+ https://arxiv.org/pdf/1606.06031.pdf
4
+
5
+ LAMBADA is a dataset to evaluate the capabilities of computational models for text
6
+ understanding by means of a word prediction task. LAMBADA is a collection of narrative
7
+ passages sharing the characteristic that human subjects are able to guess their last
8
+ word if they are exposed to the whole passage, but not if they only see the last
9
+ sentence preceding the target word. To succeed on LAMBADA, computational models
10
+ cannot simply rely on local context, but must be able to keep track of information
11
+ in the broader discourse.
12
+
13
+ Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI
14
+ """
15
+ from lm_eval.base import Task, rf
16
+ from lm_eval.metrics import mean, perplexity
17
+
18
+
19
+ _CITATION = """
20
+ @misc{
21
+ author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel},
22
+ title={The LAMBADA dataset},
23
+ DOI={10.5281/zenodo.2630551},
24
+ publisher={Zenodo},
25
+ year={2016},
26
+ month={Aug}
27
+ }
28
+ """
29
+
30
+
31
class LambadaBase(Task):
    """Shared LAMBADA logic: predict the final word of a passage.

    Scored by perplexity of the target word and by accuracy of greedy
    decoding matching it exactly.
    """

    VERSION = None

    def training_docs(self):
        if self.has_training_docs():
            return self.dataset["train"]

    def validation_docs(self):
        if self.has_validation_docs():
            return self.dataset["validation"]

    def test_docs(self):
        if self.has_test_docs():
            return self.dataset["test"]

    def doc_to_text(self, doc):
        # Context is everything up to the final whitespace-separated word.
        return doc["text"].rsplit(" ", 1)[0]

    def should_decontaminate(self):
        return True

    def doc_to_decontamination_query(self, doc):
        return doc["text"]

    def doc_to_target(self, doc):
        # Target is the final word, with its leading space preserved so
        # tokenization lines up with the context.
        return " " + doc["text"].rsplit(" ", 1)[1]

    def construct_requests(self, doc, ctx):
        ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc))
        return ll, is_greedy

    def process_results(self, doc, results):
        ll, is_greedy = results
        return {"ppl": ll, "acc": int(is_greedy)}

    def aggregation(self):
        return {"ppl": perplexity, "acc": mean}

    def higher_is_better(self):
        return {"ppl": False, "acc": True}
73
+
74
+
75
class LambadaStandard(LambadaBase):
    """The LAMBADA task using the standard original LAMBADA dataset.

    Only validation and test splits are available.
    """

    VERSION = 0
    DATASET_PATH = "lambada"

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True
89
+
90
+
91
class LambadaOpenAI(LambadaBase):
    """The LAMBADA task using the LAMBADA OpenAI dataset, a modified version
    of the original LAMBADA dataset created by OpenAI for evaluating GPT-2.

    Only a test split is available.
    Reference: https://github.com/openai/gpt-2/issues/131#issuecomment-497136199
    """

    VERSION = 0
    DATASET_PATH = "EleutherAI/lambada_openai"

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return False

    def has_test_docs(self):
        return True
scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/lambada_multilingual.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ The LAMBADA (OpenAI) dataset: Word prediction requiring a broad discourse context∗
3
+ https://arxiv.org/pdf/1606.06031.pdf
4
+
5
+ The LAMBADA OpenAI dataset machine-translated to other languages.
6
+ LAMBADA is a dataset to evaluate the capabilities of computational models for text
7
+ understanding by means of a word prediction task. LAMBADA is a collection of narrative
8
+ passages sharing the characteristic that human subjects are able to guess their last
9
+ word if they are exposed to the whole passage, but not if they only see the last
10
+ sentence preceding the target word. To succeed on LAMBADA, computational models
11
+ cannot simply rely on local context, but must be able to keep track of information
12
+ in the broader discourse.
13
+
14
+ Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI
15
+
16
+ Reference (OpenAI): https://github.com/openai/gpt-2/issues/131#issuecomment-497136199
17
+ """
18
+ import inspect
19
+ from .lambada import LambadaOpenAI
20
+ from lm_eval.base import rf
21
+ import lm_eval.datasets.lambada_ja.lambada_ja
22
+ from lm_eval.metrics import mean, perplexity
23
+
24
+ _CITATION = """
25
+ @misc{
26
+ author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel},
27
+ title={The LAMBADA dataset},
28
+ DOI={10.5281/zenodo.2630551},
29
+ publisher={Zenodo},
30
+ year={2016},
31
+ month={Aug}
32
+ }
33
+ """
34
+
35
+
36
# Per-language variants of the machine-translated LAMBADA (OpenAI) task.
# Each subclass only selects the dataset config for its language.


class LambadaOpenAIMultilingualEnglish(LambadaOpenAI):
    VERSION = 0
    DATASET_NAME = "en"


class LambadaOpenAIMultilingualFrench(LambadaOpenAI):
    VERSION = 0
    DATASET_NAME = "fr"


class LambadaOpenAIMultilingualGerman(LambadaOpenAI):
    VERSION = 0
    DATASET_NAME = "de"


class LambadaOpenAIMultilingualItalian(LambadaOpenAI):
    VERSION = 0
    DATASET_NAME = "it"


class LambadaOpenAIMultilingualSpanish(LambadaOpenAI):
    VERSION = 0
    DATASET_NAME = "es"
59
+
60
+
61
class LambadaOpenAIMultilingualJapanese(LambadaOpenAI):
    """Japanese LAMBADA variant backed by a local lambada_ja dataset script.

    Unlike the other languages, docs here are plain strings (not dicts), and
    no explicit target word is extracted — a "__lasttoken__" sentinel tells
    the model side to treat the context's last token as the target.
    """

    VERSION = 0
    DATASET_PATH = inspect.getfile(lm_eval.datasets.lambada_ja.lambada_ja)
    DATASET_NAME = "ja"

    def test_docs(self):
        # TODO: not all LAMBADA texts are translated yet, so only the first
        # 1k non-empty translated texts are used.
        texts = [item["text"] for item in self.dataset["test"] if item["text"] != ""][
            :1000
        ]
        # Strip a trailing 。 (Japanese full stop) if present.
        texts = [text[:-1] if text[-1] == "。" else text for text in texts]
        return texts

    def doc_to_text(self, doc):
        # doc is the full text string (see test_docs); the context is the
        # whole passage, since the target word is not split off here.
        return doc

    def doc_to_target(self, doc):
        # Sentinel understood by the model implementation: take the last
        # token of the context as the prediction target.
        return "__lasttoken__"

    def construct_requests(self, doc, ctx):
        ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc))
        return ll, is_greedy
107
+
108
+
109
# All language variants of the multilingual LAMBADA (OpenAI) task.
LANG_CLASSES = [
    LambadaOpenAIMultilingualEnglish,
    LambadaOpenAIMultilingualFrench,
    LambadaOpenAIMultilingualGerman,
    LambadaOpenAIMultilingualItalian,
    LambadaOpenAIMultilingualSpanish,
    LambadaOpenAIMultilingualJapanese,
]


def construct_tasks():
    """Map task names like 'lambada_openai_mt_en' to their task classes."""
    return {
        f"lambada_openai_mt_{lang_class.DATASET_NAME}": lang_class
        for lang_class in LANG_CLASSES
    }
scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/qa4mre.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ QA4MRE 2011-2013: Overview of Question Answering for Machine Reading Evaluation
3
+ https://www.cs.cmu.edu/~./hovy/papers/13CLEF-QA4MRE.pdf
4
+
5
+ The (English only) QA4MRE challenge which was run as a Lab at CLEF 2011-2013.
6
+ The main objective of this exercise is to develop a methodology for evaluating
7
+ Machine Reading systems through Question Answering and Reading Comprehension
8
+ Tests. Systems should be able to extract knowledge from large volumes of text
9
+ and use this knowledge to answer questions. Four different tasks have been
10
+ organized during these years: Main Task, Processing Modality and Negation for
11
+ Machine Reading, Machine Reading of Biomedical Texts about Alzheimer's disease,
12
+ and Entrance Exam.
13
+
14
+ Homepage: http://nlp.uned.es/clef-qa/repository/qa4mre.php
15
+ """
16
+ from lm_eval.base import MultipleChoiceTask
17
+
18
+
19
+ _CITATION = """
20
+ @inproceedings{Peas2013QA4MRE2O,
21
+ title={QA4MRE 2011-2013: Overview of Question Answering for Machine Reading Evaluation},
22
+ author={Anselmo Pe{\~n}as and Eduard H. Hovy and Pamela Forner and {\'A}lvaro Rodrigo and Richard F. E. Sutcliffe and Roser Morante},
23
+ booktitle={CLEF},
24
+ year={2013}
25
+ }
26
+ """ # noqa: W605
27
+
28
+
29
class QA4MRE(MultipleChoiceTask):
    """QA4MRE machine-reading multiple-choice task (CLEF 2011-2013).

    The model reads a document and answers a multiple-choice question
    about it; subclasses pick the year/config.
    """

    VERSION = 0
    DATASET_PATH = "qa4mre"
    DATASET_NAME = None

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return False

    def has_test_docs(self):
        return True

    def test_docs(self):
        # `qa4mre` ships only a train split, so it doubles as the test set.
        return map(self._process_doc, self.dataset["train"])

    def _process_doc(self, doc):
        """Build the query/choices/gold dict the MultipleChoiceTask expects."""
        # NOTE(review): the replace() below looks like a no-op in this copy
        # (same character on both sides); upstream it likely normalizes a
        # non-ASCII apostrophe — confirm the original bytes.
        return {
            "source": doc["document_str"].strip().replace("'", "'"),
            "query": doc["question_str"],
            "choices": doc["answer_options"]["answer_str"],
            "gold": int(doc["correct_answer_id"]) - 1,  # ids are 1-based
        }

    def doc_to_text(self, doc):
        return "{}\nQuestion: {}\nAnswer:".format(doc["source"], doc["query"])

    def should_decontaminate(self):
        return True

    def doc_to_decontamination_query(self, doc):
        return doc["source"] + " " + doc["query"]
65
+
66
+
67
# Year-specific configs of the English QA4MRE main task.


class QA4MRE_2011(QA4MRE):
    DATASET_NAME = "2011.main.EN"


class QA4MRE_2012(QA4MRE):
    DATASET_NAME = "2012.main.EN"


class QA4MRE_2013(QA4MRE):
    DATASET_NAME = "2013.main.EN"
scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/squad.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Know What You Don’t Know: Unanswerable Questions for SQuAD
3
+ https://arxiv.org/pdf/1806.03822.pdf
4
+
5
+ Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset,
6
+ consisting of questions posed by crowdworkers on a set of Wikipedia articles,
7
+ where the answer to every question is a segment of text, or span, from the
8
+ corresponding reading passage, or the question might be unanswerable.
9
+ SQuAD2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable
10
+ questions written adversarially by crowdworkers to look similar to answerable ones.
11
+ To do well on SQuAD2.0, systems must not only answer questions when possible, but
12
+ also determine when no answer is supported by the paragraph and abstain from answering.
13
+
14
+ Homepage: https://rajpurkar.github.io/SQuAD-explorer/
15
+ """
16
+ import datasets
17
+ from math import exp
18
+ from lm_eval.base import rf, Task
19
+ from functools import partial
20
+ from packaging import version
21
+
22
+
23
+ _CITATION = """
24
+ @misc{rajpurkar2018know,
25
+ title={Know What You Don't Know: Unanswerable Questions for SQuAD},
26
+ author={Pranav Rajpurkar and Robin Jia and Percy Liang},
27
+ year={2018},
28
+ eprint={1806.03822},
29
+ archivePrefix={arXiv},
30
+ primaryClass={cs.CL}
31
+ }
32
+ """
33
+
34
+
35
def _squad_metric(predictions, references):
    """Compute the official SQuAD v2 metrics via the HF `datasets` metric."""
    squad_metric = datasets.load_metric("squad_v2")
    return squad_metric.compute(predictions=predictions, references=references)
38
+
39
+
40
def _squad_agg(key, items):
    """Aggregate (prediction, reference) pairs and return one SQuAD submetric.

    `items` is a list of (prediction_dict, reference_dict) tuples collected
    from `process_results`; `key` selects which submetric to report.
    """
    predictions, references = zip(*items)
    scores = _squad_metric(predictions=predictions, references=references)
    return scores.get(key, 0)
44
+
45
+
46
class SQuAD2(Task):
    """SQuAD v2 extractive QA, scored with the official `squad_v2` metric.

    The model answers by greedy generation; unanswerable questions are
    handled by additionally scoring the continuation " unanswerable".
    """

    VERSION = 1
    DATASET_PATH = "squad_v2"
    DATASET_NAME = None

    # HF changed squad on us so we have to make sure we aren't running the old one
    assert version.parse(datasets.__version__) >= version.parse(
        "1.11.0"
    ), "datasets v1.11.0 or later required for SQuAD"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        return self.dataset["train"]

    def validation_docs(self):
        return self.dataset["validation"]

    def doc_to_text(self, doc):
        # Prompt template: Title / Background / Question / Answer sections.
        return (
            "Title: "
            + doc["title"]
            + "\n\n"
            + "Background: "
            + doc["context"]
            + "\n\n"
            + "Question: "
            + doc["question"]
            + "\n\n"
            + "Answer:"
        )

    def should_decontaminate(self):
        return True

    def doc_to_decontamination_query(self, doc):
        return doc["context"]

    def doc_to_target(self, doc):
        # Use the first gold answer; questions with no answers are targeted
        # with the literal string "unanswerable".
        answer_list = doc["answers"]["text"]
        if len(answer_list) > 0:
            answer = answer_list[0]
        else:
            answer = "unanswerable"
        return " " + answer

    def construct_requests(self, doc, ctx):
        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        """
        # One free-form generation (stopped at newline) plus the loglikelihood
        # of answering " unanswerable", used as the no-answer probability.
        continuation = rf.greedy_until(ctx, ["\n"])
        is_unanswerable = rf.loglikelihood(ctx, " " + "unanswerable")
        return continuation, is_unanswerable

    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a
        dict where keys are the names of submetrics and values are the values of
        the metric for that one document

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param results:
            The results of the requests created in construct_requests.
        """
        continuation, (logprob_unanswerable, _) = results

        # Convert the log-likelihood of " unanswerable" into a probability for
        # the squad_v2 metric's no-answer threshold sweep.
        no_answer_probability = exp(logprob_unanswerable)

        predictions = {
            "id": doc["id"],
            "prediction_text": continuation,
            "no_answer_probability": no_answer_probability,
        }

        references = {
            "id": doc["id"],
            "answers": doc["answers"],
        }

        # All submetrics share the same (prediction, reference) pair; the real
        # computation happens corpus-level in `aggregation`.
        return {
            "exact": (
                predictions,
                references,
            ),  # Exact match (the normalized answer exactly match the gold answer)
            "f1": (
                predictions,
                references,
            ),  # The F-score of predicted tokens versus the gold answer
            "HasAns_exact": (
                predictions,
                references,
            ),  # Exact match (the normalized answer exactly match the gold answer)
            "HasAns_f1": (
                predictions,
                references,
            ),  # The F-score of predicted tokens versus the gold answer
            "NoAns_exact": (
                predictions,
                references,
            ),  # Exact match (the normalized answer exactly match the gold answer)
            "NoAns_f1": (
                predictions,
                references,
            ),  # The F-score of predicted tokens versus the gold answer
            "best_exact": (
                predictions,
                references,
            ),  # Best exact match (with varying threshold)
            "best_f1": (predictions, references),  # Best F1 (with varying threshold)
        }

    def aggregation(self):
        """
        :returns: {str: [float] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
        return {
            "exact": partial(
                _squad_agg, "exact"
            ),  # Exact match (the normalized answer exactly match the gold answer)
            "f1": partial(
                _squad_agg, "f1"
            ),  # The F-score of predicted tokens versus the gold answer
            "HasAns_exact": partial(
                _squad_agg, "HasAns_exact"
            ),  # Exact match (the normalized answer exactly match the gold answer)
            "HasAns_f1": partial(
                _squad_agg, "HasAns_f1"
            ),  # The F-score of predicted tokens versus the gold answer
            "NoAns_exact": partial(
                _squad_agg, "NoAns_exact"
            ),  # Exact match (the normalized answer exactly match the gold answer)
            "NoAns_f1": partial(
                _squad_agg, "NoAns_f1"
            ),  # The F-score of predicted tokens versus the gold answer
            "best_exact": partial(
                _squad_agg, "best_exact"
            ),  # Best exact match (with varying threshold)
            "best_f1": partial(
                _squad_agg, "best_f1"
            ),  # Best F1 (with varying threshold)
        }

    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        return {
            "exact": True,  # Exact match (the normalized answer exactly match the gold answer)
            "f1": True,  # The F-score of predicted tokens versus the gold answer
            "HasAns_exact": True,  # Exact match (the normalized answer exactly match the gold answer)
            "HasAns_f1": True,  # The F-score of predicted tokens versus the gold answer
            "NoAns_exact": True,  # Exact match (the normalized answer exactly match the gold answer)
            "NoAns_f1": True,  # The F-score of predicted tokens versus the gold answer
            "best_exact": True,  # Best exact match (with varying threshold)
            "best_f1": True,  # Best F1 (with varying threshold)
        }
scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/superglue.py ADDED
@@ -0,0 +1,490 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems
3
+ https://w4ngatang.github.io/static/papers/superglue.pdf
4
+
5
+ SuperGLUE is a benchmark styled after GLUE with a new set of more difficult language
6
+ understanding tasks.
7
+
8
+ Homepage: https://super.gluebenchmark.com/
9
+
10
+ TODO: WSC requires free-form generation.
11
+ """
12
+ import numpy as np
13
+ import sklearn
14
+ import transformers.data.metrics.squad_metrics as squad_metrics
15
+ from lm_eval.base import rf, Task
16
+ from lm_eval.metrics import mean, acc_all, metric_max_over_ground_truths, yesno
17
+ from lm_eval.utils import general_detokenize
18
+
19
+
20
+ _CITATION = """
21
+ @inproceedings{NEURIPS2019_4496bf24,
22
+ author = {Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel},
23
+ booktitle = {Advances in Neural Information Processing Systems},
24
+ editor = {H. Wallach and H. Larochelle and A. Beygelzimer and F. d\textquotesingle Alch\'{e}-Buc and E. Fox and R. Garnett},
25
+ pages = {},
26
+ publisher = {Curran Associates, Inc.},
27
+ title = {SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},
28
+ url = {https://proceedings.neurips.cc/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf},
29
+ volume = {32},
30
+ year = {2019}
31
+ }
32
+ """
33
+
34
+
35
class BoolQ(Task):
    """SuperGLUE BoolQ: yes/no question answering over a passage."""

    VERSION = 1
    DATASET_PATH = "super_glue"
    DATASET_NAME = "boolq"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        # Materialize and cache the train split for repeated few-shot sampling.
        if self._training_docs is None:
            self._training_docs = list(self.dataset["train"])
        return self._training_docs

    def validation_docs(self):
        return self.dataset["validation"]

    def doc_to_text(self, doc):
        return f"{doc['passage']}\nQuestion: {doc['question']}?\nAnswer:"

    def should_decontaminate(self):
        return True

    def doc_to_decontamination_query(self, doc):
        return doc["passage"]

    def doc_to_target(self, doc):
        return " " + yesno(doc["label"])

    def construct_requests(self, doc, ctx):
        # Score both answer continuations; the likelier one is the prediction.
        lls = []
        for continuation in (" yes", " no"):
            ll, _ = rf.loglikelihood(ctx, continuation)
            lls.append(ll)
        return tuple(lls)

    def process_results(self, doc, results):
        ll_yes, ll_no = results
        predicted_yes = ll_yes > ll_no
        return {"acc": 1.0 if predicted_yes == doc["label"] else 0.0}

    def higher_is_better(self):
        return {"acc": True}

    def aggregation(self):
        return {"acc": mean}
89
+
90
+
91
class CommitmentBank(Task):
    """SuperGLUE CommitmentBank: 3-way entailment (True/False/Neither)."""

    VERSION = 1
    DATASET_PATH = "super_glue"
    DATASET_NAME = "cb"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        # Materialize and cache the train split for repeated few-shot sampling.
        if self._training_docs is None:
            self._training_docs = list(self.dataset["train"])
        return self._training_docs

    def validation_docs(self):
        return self.dataset["validation"]

    def doc_to_text(self, doc):
        return "{}\nQuestion: {}. True, False or Neither?\nAnswer:".format(
            doc["premise"], doc["hypothesis"]
        )

    def doc_to_target(self, doc):
        # Label mapping: 0 = entailment -> "True", 1 = contradiction -> "False",
        # 2 = neutral -> "Neither".
        label_words = {0: "True", 1: "False", 2: "Neither"}
        return " {}".format(label_words[doc["label"]])

    def construct_requests(self, doc, ctx):
        # One loglikelihood request per label word, in label order 0, 1, 2.
        lls = []
        for choice in (" True", " False", " Neither"):
            ll, _ = rf.loglikelihood(ctx, choice)
            lls.append(ll)
        return tuple(lls)

    def process_results(self, doc, results):
        gold = doc["label"]
        pred = np.argmax(results)
        return {"acc": 1.0 if pred == gold else 0.0, "f1": (pred, gold)}

    def higher_is_better(self):
        return {"acc": True, "f1": True}

    @classmethod
    def cb_multi_fi(cls, items):
        # Macro-average the one-vs-rest F1 over the three labels.
        preds, golds = zip(*items)
        preds = np.array(preds)
        golds = np.array(golds)
        per_class_f1 = [
            sklearn.metrics.f1_score(y_true=golds == label, y_pred=preds == label)
            for label in (0, 1, 2)
        ]
        return mean(per_class_f1)

    def aggregation(self):
        return {
            "acc": mean,
            "f1": self.cb_multi_fi,
        }
158
+
159
+
160
class Copa(Task):
    """SuperGLUE COPA: pick the more plausible cause/effect continuation."""

    VERSION = 0
    DATASET_PATH = "super_glue"
    DATASET_NAME = "copa"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        # Materialize and cache the train split for repeated few-shot sampling.
        if self._training_docs is None:
            self._training_docs = list(self.dataset["train"])
        return self._training_docs

    def validation_docs(self):
        return self.dataset["validation"]

    def doc_to_text(self, doc):
        # Join premise and alternative with a causal connective; the premise's
        # trailing period is dropped so the sentence continues naturally.
        connectors = {
            "cause": "because",
            "effect": "therefore",
        }
        return "{} {}".format(
            doc["premise"].strip()[:-1], connectors[doc["question"]]
        )

    def doc_to_target(self, doc):
        chosen = doc["choice2"] if doc["label"] else doc["choice1"]
        # Connect the sentences
        return " " + self.convert_choice(chosen)

    def construct_requests(self, doc, ctx):
        lls = []
        for key in ("choice1", "choice2"):
            ll, _ = rf.loglikelihood(ctx, " " + self.convert_choice(doc[key]))
            lls.append(ll)
        return tuple(lls)

    def process_results(self, doc, results):
        pred = np.argmax(results)
        return {"acc": 1.0 if pred == doc["label"] else 0.0}

    def higher_is_better(self):
        return {"acc": True}

    def aggregation(self):
        return {"acc": mean}

    @staticmethod
    def convert_choice(choice):
        # Lowercase the first character so the choice reads as a continuation.
        return choice[0].lower() + choice[1:]
220
+
221
+
222
class MultiRC(Task):
    """SuperGLUE MultiRC: judge whether a candidate answer is correct."""

    VERSION = 1
    DATASET_PATH = "super_glue"
    DATASET_NAME = "multirc"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        # Materialize and cache the train split for repeated few-shot sampling.
        if self._training_docs is None:
            self._training_docs = list(self.dataset["train"])
        return self._training_docs

    def validation_docs(self):
        return self.dataset["validation"]

    def doc_to_text(self, doc):
        return f"{doc['paragraph']}\nQuestion: {doc['question']}\nAnswer:"

    def doc_to_target(self, doc):
        return " " + self.format_answer(answer=doc["answer"], label=doc["label"])

    @staticmethod
    def format_answer(answer, label):
        verdict = "yes" if label else "no"
        return f"{answer}\nIs the answer correct? {verdict}"

    def construct_requests(self, doc, ctx):
        # Score the same candidate answer followed by "yes" vs "no".
        lls = []
        for label in (True, False):
            choice = self.format_answer(answer=doc["answer"], label=label)
            ll, _ = rf.loglikelihood(ctx, f" {choice}")
            lls.append(ll)
        return tuple(lls)

    def process_results(self, doc, results):
        ll_true_choice, ll_false_choice = results
        # `acc_all` groups predictions per question, so pass the doc along.
        return {"acc": (ll_true_choice > ll_false_choice, doc)}

    def higher_is_better(self):
        return {"acc": True}

    def aggregation(self):
        return {"acc": acc_all}
274
+
275
+
276
class ReCoRD(Task):
    """SuperGLUE ReCoRD: cloze-style QA where the answer is one of the
    passage's marked entities; scored by per-example EM and token F1."""

    VERSION = 0
    DATASET_PATH = "super_glue"
    DATASET_NAME = "record"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        # In ReCoRD, each doc manifests multiple "examples" in the context of few shot example packing.
        # Each doc consists of multiple answer candidates, each of which is scored yes/no.
        if self._training_docs is None:
            self._training_docs = []
            for doc in self.dataset["train"]:
                self._training_docs.append(self._process_doc(doc))
        return self._training_docs

    def validation_docs(self):
        # See: training_docs
        for doc in self.dataset["validation"]:
            yield self._process_doc(doc)

    @classmethod
    def _process_doc(cls, doc):
        # De-duplicate and sort entities/answers so ordering is deterministic.
        return {
            "passage": doc["passage"],
            "query": doc["query"],
            "entities": sorted(list(set(doc["entities"]))),
            "answers": sorted(list(set(doc["answers"]))),
        }

    def doc_to_text(self, doc):
        # The raw passage embeds bullet highlights after "@highlight" markers;
        # render them as a bulleted list under the main text.
        initial_text, *highlights = doc["passage"].strip().split("\n@highlight\n")
        text = initial_text + "\n\n"
        for highlight in highlights:
            text += f"  - {highlight}.\n"
        return text

    @classmethod
    def format_answer(cls, query, entity):
        # Fill the cloze slot "@placeholder" in the query with the entity.
        return f"  - {query}".replace("@placeholder", entity)

    def doc_to_target(self, doc):
        # We only output the first correct entity in a doc
        return self.format_answer(query=doc["query"], entity=doc["answers"][0])

    def construct_requests(self, doc, ctx):
        # One loglikelihood request per candidate entity.
        requests = [
            rf.loglikelihood(ctx, self.format_answer(query=doc["query"], entity=entity))
            for entity in doc["entities"]
        ]
        return requests

    def process_results(self, doc, results):
        # ReCoRD's evaluation is actually deceptively simple:
        # - Pick the maximum likelihood prediction entity
        # - Evaluate the accuracy and token F1 PER EXAMPLE
        # - Average over all examples
        max_idx = np.argmax(np.array([result[0] for result in results]))

        prediction = doc["entities"][max_idx]
        gold_label_set = doc["answers"]
        f1 = metric_max_over_ground_truths(
            squad_metrics.compute_f1, prediction, gold_label_set
        )
        em = metric_max_over_ground_truths(
            squad_metrics.compute_exact, prediction, gold_label_set
        )

        return {
            "f1": f1,
            "em": em,
        }

    def higher_is_better(self):
        return {
            "f1": True,
            "em": True,
        }

    def aggregation(self):
        return {
            "f1": mean,
            "em": mean,
        }
367
+
368
+
369
class WordsInContext(Task):
    """SuperGLUE WiC: does a target word carry the same sense in two sentences?"""

    VERSION = 0
    DATASET_PATH = "super_glue"
    DATASET_NAME = "wic"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        # Materialize and cache the train split for repeated few-shot sampling.
        if self._training_docs is None:
            self._training_docs = list(self.dataset["train"])
        return self._training_docs

    def validation_docs(self):
        return self.dataset["validation"]

    def doc_to_text(self, doc):
        # The target word is recovered from sentence1 via its character span.
        target_word = doc["sentence1"][doc["start1"] : doc["end1"]]
        return (
            "Sentence 1: {}\nSentence 2: {}\nQuestion: Is the word '{}' used in the same way in the"
            " two sentences above?\nAnswer:".format(
                doc["sentence1"], doc["sentence2"], target_word
            )
        )

    def doc_to_target(self, doc):
        return " {}".format({0: "no", 1: "yes"}[doc["label"]])

    def construct_requests(self, doc, ctx):
        lls = []
        for continuation in (" yes", " no"):
            ll, _ = rf.loglikelihood(ctx, continuation)
            lls.append(ll)
        return tuple(lls)

    def process_results(self, doc, results):
        ll_yes, ll_no = results
        predicted_yes = ll_yes > ll_no
        return {"acc": 1.0 if predicted_yes == doc["label"] else 0.0}

    def higher_is_better(self):
        return {"acc": True}

    def aggregation(self):
        return {"acc": mean}
423
+
424
+
425
class SGWinogradSchemaChallenge(Task):
    """SuperGLUE WSC: binary coreference — does the marked pronoun refer to
    the marked noun phrase?"""

    VERSION = 0
    # Note: This implementation differs from Fig G.32 because this is the SuperGLUE,
    # binary version of the task.
    DATASET_PATH = "super_glue"
    DATASET_NAME = "wsc"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        if self.has_training_docs():
            if self._training_docs is None:
                # GPT-3 Paper's format only uses positive examples for fewshot "training"
                self._training_docs = [
                    doc for doc in self.dataset["train"] if doc["label"]
                ]
            return self._training_docs

    def validation_docs(self):
        return self.dataset["validation"]

    def doc_to_text(self, doc):
        raw_passage = doc["text"]
        # NOTE: HuggingFace span indices are word-based not character-based.
        # `pre` is the text before the pronoun; the `+ 1` below skips the
        # space separating `pre` from the pronoun occurrence.
        pre = " ".join(raw_passage.split()[: doc["span2_index"]])
        post = raw_passage[len(pre) + len(doc["span2_text"]) + 1 :]
        # Surround the pronoun with asterisks to mark it in the prompt.
        passage = general_detokenize(pre + " *{}*".format(doc["span2_text"]) + post)
        noun = doc["span1_text"]
        pronoun = doc["span2_text"]
        text = (
            f"Passage: {passage}\n"
            + f'Question: In the passage above, does the pronoun "*{pronoun}*" refer to "*{noun}*"?\n'
            + "Answer:"
        )
        return text

    def doc_to_target(self, doc):
        return " " + yesno(doc["label"])

    def construct_requests(self, doc, ctx):

        ll_yes, _ = rf.loglikelihood(ctx, " yes")
        ll_no, _ = rf.loglikelihood(ctx, " no")

        return ll_yes, ll_no

    def process_results(self, doc, results):
        ll_yes, ll_no = results
        gold = doc["label"]

        acc = 1.0 if (ll_yes > ll_no) == gold else 0.0

        return {"acc": acc}

    def higher_is_better(self):
        return {"acc": True}

    def aggregation(self):
        return {"acc": mean}
scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/translation.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ NOTE: This file implements translation tasks using datasets from WMT conferences,
3
+ provided by sacrebleu. Traditionally they are evaluated with BLEU scores. TER
4
+ and CHRF are other options.
5
+
6
+ We defer citations and descriptions of the many translations tasks used
7
+ here to the SacreBLEU repo from which we've obtained the datasets:
8
+ https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/dataset.py
9
+
10
+ Homepage: https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/dataset.py
11
+ """
12
+ import pycountry
13
+ from pprint import pprint
14
+ from sacrebleu import sacrebleu
15
+ from lm_eval import metrics
16
+ from lm_eval.base import Task, rf
17
+ from typing import List
18
+
19
+ try:
20
+ import nagisa
21
+
22
+ HAS_NAGISA = True
23
+ except ImportError:
24
+ HAS_NAGISA = False
25
+
26
+ try:
27
+ import jieba
28
+
29
+ HAS_JIEBA = True
30
+ except ImportError:
31
+ HAS_JIEBA = False
32
+
33
+
34
+ _CITATION = """
35
+ @inproceedings{post-2018-call,
36
+ title = "A Call for Clarity in Reporting {BLEU} Scores",
37
+ author = "Post, Matt",
38
+ booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers",
39
+ month = oct,
40
+ year = "2018",
41
+ address = "Belgium, Brussels",
42
+ publisher = "Association for Computational Linguistics",
43
+ url = "https://www.aclweb.org/anthology/W18-6319",
44
+ pages = "186--191",
45
+ }
46
+ """
47
+
48
+
49
# Re-exported mapping of all test sets known to sacrebleu.
sacrebleu_datasets = sacrebleu.DATASETS
50
+
51
+
52
def create_tasks_from_benchmarks(benchmark_dict):
    """Creates a dictionary of tasks from a dict
    :param benchmark_dict: { dataset: [lang_pair, ...], }
    :return: {task_name: task}
        e.g. {wmt14-fr-en: Task, wmt16-de-en: Task}
    """

    def version_of(dataset, language_pair):
        # Targets written without spaces (zh/ja) switched to segmenter-based
        # scoring, which bumped the task version.
        return 1 if language_pair[-2:] in ["zh", "ja"] else 0

    tasks = {}
    for dataset, language_pairs in benchmark_dict.items():
        for language_pair in language_pairs:
            task_name = f"{dataset}-{language_pair}"
            tasks[task_name] = create_translation_task(
                dataset, language_pair, version_of(dataset, language_pair)
            )
    return tasks
71
+
72
+
73
+ ########################################
74
+ # Language Specifics
75
+ ########################################
76
+
77
+
78
def zh_split(zh_text: List[str]) -> List[str]:
    """Segment Chinese sentences into space-separated tokens via jieba."""
    if not HAS_JIEBA:
        raise ImportError(
            "Chinese text splitting requires the `jieba` package. "
            "Please install it with:\npip install jieba"
        )

    segmented = []
    for txt in zh_text:
        segmented.append(" ".join(jieba.cut(txt.strip())))
    return segmented
87
+
88
+
89
def ja_split(ja_text: List[str]) -> List[str]:
    """Segment Japanese sentences into space-separated tokens via nagisa."""
    if not HAS_NAGISA:
        raise ImportError(
            "Japanese text splitting requires the `nagisa` package. "
            "Please install it with:\npip install nagisa"
        )

    segmented = []
    for txt in ja_text:
        segmented.append(" ".join(nagisa.tagging(txt.strip()).words))
    return segmented
98
+
99
+
100
# Target languages written without word spaces, mapped to their segmenters.
NO_SPACE_LANG = {"zh": zh_split, "ja": ja_split}
101
+
102
+ ########################################
103
+ # Tasks
104
+ ########################################
105
+
106
+
107
def create_translation_task(dataset, language_pair, version=0):
    """Build a Task subclass bound to one (dataset, language_pair).

    The closure captures `dataset`/`language_pair` so the returned class
    can be instantiated with no arguments by the task registry.
    """

    class TranslationTask(GeneralTranslationTask):
        VERSION = version

        def __init__(self):
            super().__init__(dataset, language_pair)

    return TranslationTask
115
+
116
+
117
class GeneralTranslationTask(Task):
    """Machine-translation task over a sacrebleu test set, scored with
    corpus-level BLEU/CHRF/TER."""

    VERSION = 0

    # e.g. ("wmt14", "fr-en")
    def __init__(self, sacrebleu_dataset, sacrebleu_language_pair=None):
        self.sacrebleu_dataset = sacrebleu_dataset
        self.sacrebleu_language_pair = sacrebleu_language_pair
        # Populated by `download`: source/reference file paths and their lines.
        self.src_file = self.ref_file = self.src_data = self.ref_data = None

        super().__init__()

    def download(self, data_dir=None, cache_dir=None, download_mode=None):
        # This caches in the users home dir automatically
        self.src_file, self.ref_file = sacrebleu.download_test_set(
            self.sacrebleu_dataset, self.sacrebleu_language_pair
        )
        self.src_data, self.ref_data = [
            [line.rstrip() for line in sacrebleu.smart_open(file)]
            for file in (self.src_file, self.ref_file)
        ]

    def has_training_docs(self):
        """Whether the task has a training set"""
        # TODO In the future we could be more discerning. Some more recent tests have train and dev sets
        return False

    def has_validation_docs(self):
        """Whether the task has a validation set"""
        return False

    def has_test_docs(self):
        """Whether the task has a test set"""
        return True

    def test_docs(self):
        """
        :return: Iterable[obj]
            A iterable of any object, that doc_to_text can handle
        """
        return [
            {"src": src, "ref": ref} for src, ref in zip(self.src_data, self.ref_data)
        ]

    def doc_to_text(self, doc):
        # Prompt names both languages, e.g. "French phrase: ...\nEnglish phrase:".
        language_codes = self.sacrebleu_language_pair.split("-")
        src_lang = code_to_language(language_codes[0])
        tar_lang = code_to_language(language_codes[1])
        return f"{src_lang} phrase: " + doc["src"] + f"\n{tar_lang} phrase:"

    def should_decontaminate(self):
        return True

    def doc_to_decontamination_query(self, doc):
        return doc["src"]

    def doc_to_target(self, doc):
        # This shows a single target, though there may be multiple targets in a lang test
        return " " + doc["ref"] if isinstance(doc["ref"], str) else doc["ref"][0]

    def construct_requests(self, doc, ctx):
        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        """
        return rf.greedy_until(ctx, ["\n"])

    def process_results(self, doc, results):
        # Add spaces between words for BLEU score calculation of target languages like Chinese
        tar_lang_code = self.sacrebleu_language_pair.split("-")[-1]
        if tar_lang_code in NO_SPACE_LANG:
            doc["ref"] = NO_SPACE_LANG[tar_lang_code]([doc["ref"]])[0]
            results = NO_SPACE_LANG[tar_lang_code](results)

        # These metrics are corpus-level not sentence level, so we'll hide the
        # results in this dict and compute the corpus score in the aggregate method
        ref_pred = (doc["ref"], results)
        return {
            "bleu": ref_pred,
            "chrf": ref_pred,
            "ter": ref_pred,
        }

    def aggregation(self):
        """
        :returns: {str: [float] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
        return {
            "bleu": metrics.bleu,
            "chrf": metrics.chrf,
            "ter": metrics.ter,
        }

    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        return {
            "bleu": True,
            "chrf": True,
            "ter": False,
        }

    def __str__(self):
        language_codes = self.sacrebleu_language_pair.split("-")
        src_lang = code_to_language(language_codes[0])
        tar_lang = code_to_language(language_codes[1])
        return f"{self.sacrebleu_dataset.upper()} {src_lang} to {tar_lang} Task"
234
+
235
+
236
+ ########################################
237
+ # Util
238
+ ########################################
239
+
240
+
241
def code_to_language(code):
    """Map an ISO 639 language code to its English name via pycountry."""
    # alpha_2 vs alpha_3 lookup is selected by the code's length (2 or 3).
    lookup_key = f"alpha_{len(code)}"
    language_tuple = pycountry.languages.get(**{lookup_key: code})
    return language_tuple.name
scripts/yans/eval/lm-evaluation-harness/lm_eval/utils.py ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pathlib
3
+ import re
4
+ import collections
5
+ import functools
6
+ import inspect
7
+ import sys
8
+ from typing import List, Union
9
+
10
+ import torch
11
+
12
+ from omegaconf import OmegaConf
13
+
14
+ import sacrebleu
15
+ from rouge_score import rouge_scorer, scoring
16
+
17
+
18
class ExitCodeError(Exception):
    """Raised when a shell command run via :func:`sh` exits non-zero."""


def sh(x):
    """Run *x* through the system shell, raising ExitCodeError on failure."""
    status = os.system(x)
    if status:
        raise ExitCodeError()
25
+
26
+
27
def simple_parse_args_string(args_string):
    """
    Parse a comma-separated dotlist such as ``arg1=val1,arg2=val2`` into a
    plain dictionary.

    Delegates to OmegaConf, which also coerces scalar values (ints, bools)
    to native Python types.
    """
    stripped = args_string.strip()
    if not stripped:
        return {}
    dotlist = stripped.split(",")
    return OmegaConf.to_object(OmegaConf.from_dotlist(dotlist))
39
+
40
+
41
def join_iters(iters):
    """Lazily chain several iterables into one flat stream of items."""
    for sub_iter in iters:
        yield from sub_iter
44
+
45
+
46
def chunks(iter, n):
    """Yield successive lists of up to *n* items from *iter*.

    The last chunk may be shorter than *n* when the input length is not an
    exact multiple of *n*.
    """
    batch = []
    for item in iter:
        batch.append(item)
        if len(batch) == n:
            yield batch
            batch = []
    # Flush the (possibly short) trailing chunk.
    if batch:
        yield batch
56
+
57
+
58
def group(arr, fn):
    """Partition *arr* into lists of items sharing the same ``fn(item)`` key.

    Groups are returned in first-seen key order.
    """
    buckets = collections.defaultdict(list)
    for item in arr:
        buckets[fn(item)].append(item)
    return list(buckets.values())
65
+
66
+
67
+ def general_detokenize(string):
68
+ string = string.replace(" n't", "n't")
69
+ string = string.replace(" )", ")")
70
+ string = string.replace("( ", "(")
71
+ string = string.replace('" ', '"')
72
+ string = string.replace(' "', '"')
73
+ string = re.sub(r" (['.,])", r"\1", string)
74
+ return string
75
+
76
+
77
def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len):
    """
    Yield ``(input_tokens, pred_tokens)`` windows over *token_list*.

    - context_len allows for a rolling window context, allowing each
      prediction window to potentially condition on some context.

    :param token_list: list
        List of tokens to be PREDICTED
    :param max_seq_len: int
        max_seq_len of model (or max_seq_len we want to use)
    :param context_len: int
        Amount of desired token context for prediction. Needs to be at least 1.
    :param prefix_token: token
        Dummy token like <eos> so the first token has something to condition on
    :return: generator
        Generator of tuples (input_tokens, pred_tokens).
        Note: Score only the last len(pred_tokens) logits of the LM.
    """
    assert 1 <= context_len <= max_seq_len
    if not token_list:
        return

    # Inputs are offset by one position from targets, so each subsequent
    # window can predict at most this many new tokens.
    pred_len = max_seq_len - context_len + 1
    total = len(token_list)

    # First window is special: conditioned on the dummy prefix token, it
    # predicts every token that fits in a single sequence.
    first_len = min(max_seq_len, total)
    yield [prefix_token] + token_list[: first_len - 1], token_list[:first_len]

    predicted = first_len
    while predicted < total:
        n_pred = min(total - predicted, pred_len)
        end = predicted + n_pred
        inputs = token_list[end - max_seq_len - 1 : end - 1]
        targets = token_list[end - n_pred : end]
        yield inputs, targets
        predicted = end
116
+
117
+
118
def make_disjoint_window(pair):
    """Make a rolling-window pair's context disjoint from its continuation.

    ``get_rolling_token_windows`` yields contexts that overlap the
    continuation by ``len(continuation) - 1`` tokens; trim that overlap off
    the end of the context.
    """
    context, continuation = pair
    keep = len(context) - (len(continuation) - 1)
    return context[:keep], continuation
122
+
123
+
124
def select_continuation_from_batch_left_padding(
    generations: Union[List[List[int]], torch.Tensor], max_context_size: int
):
    """Select the continuation from the batch, removing prompts of different lengths.

    Args:
        generations (Union[List[List[int]], torch.Tensor]):
            A tensor or list-of-lists of shape [batch_size, sequence length].
        max_context_size (int):
            The size of the biggest context; generations will proceed from that
            index.

    Example:
        PAD PAD Continue : The dog chased the cat [every day of the week]
        Riddle me this : The dog chased the cat [yesterday] PAD PAD PAD PAD
    Output:
        [every day of the week]
        [yesterday] PAD PAD PAD PAD

    NOTE(review): the annotation admits a list of lists, but the 2-D slice
    below only works on tensor-like inputs (torch.Tensor / ndarray) — confirm
    callers never pass plain lists.
    """
    return generations[:, max_context_size:]
142
+
143
+
144
class Reorderer:
    """Reorder a list for grouped/batched processing, then restore order.

    Items are grouped by ``fn`` and the groups sorted by that key;
    ``get_reordered`` exposes one representative value per group, and
    ``get_original`` scatters per-group results back onto every original
    position.
    """

    def __init__(self, arr, fn):
        self.size = len(arr)
        indexed = list(enumerate(arr))
        # Group (index, value) pairs by the key of the value, preserving
        # first-seen key order.
        buckets = collections.defaultdict(list)
        for pair in indexed:
            buckets[fn(pair[1])].append(pair)
        # One entry per group: (all original indices, representative value).
        collapsed = [
            ([idx for idx, _ in grp], grp[0][1]) for grp in buckets.values()
        ]
        collapsed.sort(key=lambda entry: fn(entry[1]))
        self.arr = collapsed

    def get_reordered(self):
        """Return one representative value per group, in sorted-key order."""
        return [value for _, value in self.arr]

    def get_original(self, newarr):
        """Map per-group results in *newarr* back onto the original order."""
        res = [None] * self.size
        covered = [False] * self.size

        for (indices, _), value in zip(self.arr, newarr):
            for idx in indices:
                res[idx] = value
                covered[idx] = True

        # Every original slot must have received a result.
        assert all(covered)
        return res
169
+
170
+
171
def positional_deprecated(fn):
    """
    A decorator to nudge users into passing only keyword args (`kwargs`) to
    the wrapped function, `fn`.

    Bound methods are allowed one positional argument (self/cls); plain
    functions are allowed none.
    """

    @functools.wraps(fn)
    def _wrapper(*args, **kwargs):
        # BUG FIX: the original condition `len(args) != 1 if
        # inspect.ismethod(fn) else 0` parsed as
        # `(len(args) != 1) if inspect.ismethod(fn) else 0` because the
        # conditional expression has the lowest precedence, so the warning
        # was unreachable for plain functions. Compute the allowed
        # positional-argument count explicitly instead.
        allowed_positional = 1 if inspect.ismethod(fn) else 0
        if len(args) != allowed_positional:
            print(
                f"WARNING: using {fn.__name__} with positional arguments is "
                "deprecated and will be disallowed in a future version of "
                "lm-evaluation-harness!"
            )
        return fn(*args, **kwargs)

    return _wrapper
188
+
189
+
190
@positional_deprecated
def find_test_root(start_path: pathlib.Path) -> pathlib.Path:
    """
    Search upward in the directory tree to a maximum of three layers
    to find and return the package root (containing the 'tests' folder).

    :raises FileNotFoundError: if no ancestor within three levels contains
        tests/test_version_stable.py.
    """
    cur_path = start_path.resolve()
    max_layers = 3
    for _ in range(max_layers):
        # The package root is identified by the presence of the version
        # stability test file.
        if (cur_path / "tests" / "test_version_stable.py").exists():
            return cur_path
        cur_path = cur_path.parent.resolve()
    # BUG FIX: the original message concatenated "...upwards" + "of ..."
    # with no separating space, producing "upwardsof".
    raise FileNotFoundError(
        f"Unable to find package root within {max_layers} upwards of {start_path}"
    )
206
+
207
+
208
@positional_deprecated
def run_task_tests(task_list: List[str]):
    """
    Find the package root and run the version-stability tests for the given
    tasks, raising ValueError if any test fails.
    """
    # Imported lazily so pytest is only required when tests are actually run.
    import pytest

    package_root = find_test_root(start_path=pathlib.Path(__file__))
    # `-k` selects only tests whose names match one of the requested tasks.
    pytest_args = [
        f"{package_root}/tests/test_version_stable.py",
        f"--rootdir={package_root}",
        "-k",
        " or ".join(task_list),
    ]
    sys.path.append(str(package_root))
    exit_code = pytest.main(pytest_args)
    if exit_code:
        raise ValueError(
            f"Not all tests for the specified tasks ({task_list}) ran successfully! Error code: {exit_code}"
        )
230
+
231
def bleu(refs, preds):
    """
    Returns `t5` style BLEU scores. See the related implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L41

    :param refs:
        A `list` of `list` of reference `str`s.
    :param preds:
        A `list` of predicted `str`s.
    """
    # Settings mirror t5: exponential smoothing, international tokenization,
    # no lowercasing, full n-gram order.
    result = sacrebleu.corpus_bleu(
        preds,
        refs,
        smooth_method="exp",
        smooth_value=0.0,
        force=False,
        lowercase=False,
        tokenize="intl",
        use_effective_order=False,
    )
    return result.score
252
+
253
+
254
def rouge(refs, preds):
    """
    Returns `t5` style ROUGE scores. See the related implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68

    :param refs:
        A `list` of reference `strs`.
    :param preds:
        A `list` of predicted `strs`.
    """
    rouge_types = ["rouge1", "rouge2", "rougeLsum"]
    scorer = rouge_scorer.RougeScorer(rouge_types)

    def _prepare_summary(summary):
        # rougeLsum expects sentences to be separated by newlines.
        return summary.replace(" . ", ".\n")

    # Accumulate scores per example; the aggregator also provides
    # bootstrap confidence intervals.
    aggregator = scoring.BootstrapAggregator()
    for ref, pred in zip(refs, preds):
        aggregator.add_scores(
            scorer.score(_prepare_summary(ref), _prepare_summary(pred))
        )
    result = aggregator.aggregate()
    return {rt: result[rt].mid.fmeasure * 100 for rt in rouge_types}
280
+
281
+
282
def rouge2_mecab(refs, preds, tokenizer):
    """This uses a MeCab tokenizer for Japanese text.

    Besides specifying the tokenizer, this does not perform the rougeLsum
    related sentence/newline normalization, and only calculates rouge2.
    Otherwise it is the same as the generic rouge scoring.
    """
    rouge_types = ["rouge2"]
    # MeCab-based segmentation instead of the default whitespace tokenizer.
    scorer = rouge_scorer.RougeScorer(rouge_types, tokenizer=tokenizer)

    # Accumulate per-example scores with bootstrap confidence intervals.
    aggregator = scoring.BootstrapAggregator()
    for ref, pred in zip(refs, preds):
        aggregator.add_scores(scorer.score(ref, pred))
    result = aggregator.aggregate()
    return {rt: result[rt].mid.fmeasure * 100 for rt in rouge_types}
scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-1b/harness.jsquad-1.2.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ MODEL_ARGS="pretrained=cyberagent/open-calm-1b,device_map=auto,torch_dtype=auto"
2
+ TASK="jsquad-1.2-0.2"
3
+ python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "3" --device "cuda" --output_path "models/cyberagent-open-calm-1b/result.jsquad-1.2.json"
scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-1b/harness.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ MODEL_ARGS="pretrained=cyberagent/open-calm-1b"
2
+ TASK="jcommonsenseqa-1.1-0.2,jnli-1.1-0.2,marc_ja-1.1-0.2,jsquad-1.1-0.2,xlsum_ja"
3
+ python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2,3,3,3,1" --device "cuda" --output_path "models/cyberagent-open-calm-1b/result.json"
scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-1b/result.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "jcommonsenseqa-1.1-0.2": {
4
+ "acc": 0.26899016979445933,
5
+ "acc_stderr": 0.013261996572328063,
6
+ "acc_norm": 0.24754244861483468,
7
+ "acc_norm_stderr": 0.01290758346346734
8
+ },
9
+ "jnli-1.1-0.2": {
10
+ "acc": 0.33566146261298274,
11
+ "acc_stderr": 0.00957358086224245,
12
+ "acc_norm": 0.3331963845521775,
13
+ "acc_norm_stderr": 0.009556042193601356
14
+ },
15
+ "marc_ja-1.1-0.2": {
16
+ "acc": 0.7792117195674921,
17
+ "acc_stderr": 0.005478034657719626,
18
+ "acc_norm": 0.7792117195674921,
19
+ "acc_norm_stderr": 0.005478034657719626
20
+ },
21
+ "jsquad-1.1-0.2": {
22
+ "exact_match": 37.12291760468258,
23
+ "f1": 47.171446643186265
24
+ },
25
+ "xlsum_ja": {
26
+ "rouge2": 2.288077088085482
27
+ },
28
+ "xwinograd_ja": {
29
+ "acc": 0.6089676746611054,
30
+ "acc_stderr": 0.015765969995357912
31
+ }
32
+ },
33
+ "versions": {
34
+ "jcommonsenseqa-1.1-0.2": 1.1,
35
+ "jnli-1.1-0.2": 1.1,
36
+ "jsquad-1.1-0.2": 1.1,
37
+ "marc_ja-1.1-0.2": 1.1,
38
+ "xlsum_ja": 1.0,
39
+ "xwinograd_ja": 1.0
40
+ },
41
+ "config": {
42
+ "model": "hf-causal",
43
+ "model_args": "pretrained=cyberagent/open-calm-1b",
44
+ "num_fewshot": [
45
+ 2,
46
+ 3,
47
+ 3,
48
+ 3,
49
+ 1,
50
+ 0
51
+ ],
52
+ "batch_size": null,
53
+ "device": "cuda",
54
+ "no_cache": false,
55
+ "limit": null,
56
+ "bootstrap_iters": 100000,
57
+ "description_dict": {}
58
+ }
59
+ }
scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-1b/result.jsquad-1.2.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "jsquad-1.2-0.2": {
4
+ "exact_match": 39.53174245835209,
5
+ "f1": 49.49399460234075
6
+ }
7
+ },
8
+ "versions": {
9
+ "jsquad-1.2-0.2": 1.2
10
+ },
11
+ "config": {
12
+ "model": "hf-causal",
13
+ "model_args": "pretrained=cyberagent/open-calm-1b",
14
+ "num_fewshot": 3,
15
+ "batch_size": null,
16
+ "device": "cuda",
17
+ "no_cache": false,
18
+ "limit": null,
19
+ "bootstrap_iters": 100000,
20
+ "description_dict": {}
21
+ }
22
+ }
scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-1b/result.mgsm.json ADDED
The diff for this file is too large to render. See raw diff
 
scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-3b/harness.jsquad-1.2.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ MODEL_ARGS="pretrained=cyberagent/open-calm-3b,device_map=auto,torch_dtype=auto"
2
+ TASK="jsquad-1.2-0.2"
3
+ python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2" --device "cuda" --output_path "models/cyberagent/cyberagent-open-calm-3b/result.jsquad-1.2.json"
scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-3b/harness.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ MODEL_ARGS="pretrained=cyberagent/open-calm-3b"
2
+ TASK="jcommonsenseqa-1.1-0.2,jnli-1.1-0.2,marc_ja-1.1-0.2,jsquad-1.1-0.2,jaqket_v2-0.1-0.2,xlsum_ja,xwinograd_ja,mgsm"
3
+ python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "3,3,3,2,1,1,0,5" --device "cuda" --output_path "models/cyberagent/cyberagent-open-calm-3b/result.json"
scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-3b/result.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "jcommonsenseqa-1.1-0.2": {
4
+ "acc": 0.2779267202859696,
5
+ "acc_stderr": 0.013397843071173697,
6
+ "acc_norm": 0.2529043789097408,
7
+ "acc_norm_stderr": 0.013000060342436679
8
+ },
9
+ "jnli-1.1-0.2": {
10
+ "acc": 0.40345110928512734,
11
+ "acc_stderr": 0.009945976384444125,
12
+ "acc_norm": 0.37674609695973704,
13
+ "acc_norm_stderr": 0.009823942907406487
14
+ },
15
+ "marc_ja-1.1-0.2": {
16
+ "acc": 0.8620509243111266,
17
+ "acc_stderr": 0.004554438976572761,
18
+ "acc_norm": 0.8620509243111266,
19
+ "acc_norm_stderr": 0.004554438976572761
20
+ },
21
+ "xwinograd_ja": {
22
+ "acc": 0.6360792492179353,
23
+ "acc_stderr": 0.015544482535576241
24
+ },
25
+ "jsquad-1.1-0.2": {
26
+ "exact_match": 40.45475011256191,
27
+ "f1": 52.73709875917724
28
+ },
29
+ "jaqket_v2-0.1-0.2": {
30
+ "exact_match": 46.90721649484536,
31
+ "f1": 51.615597556319194
32
+ },
33
+ "xlsum_ja": {
34
+ "rouge2": 1.948450071736146
35
+ },
36
+ "mgsm": {
37
+ "acc": 0.016,
38
+ "acc_stderr": 0.007951661188874344
39
+ }
40
+ },
41
+ "versions": {
42
+ "jcommonsenseqa-1.1-0.2": 1.1,
43
+ "jnli-1.1-0.2": 1.1,
44
+ "marc_ja-1.1-0.2": 1.1,
45
+ "jsquad-1.1-0.2": 1.1,
46
+ "jaqket_v2-0.1-0.2": 0.1,
47
+ "xlsum_ja": 1.0,
48
+ "xwinograd_ja": 1.0,
49
+ "mgsm": 1.0
50
+ },
51
+ "config": {
52
+ "model": "hf-causal",
53
+ "model_args": "pretrained=cyberagent/open-calm-3b",
54
+ "num_fewshot": [
55
+ 3,
56
+ 3,
57
+ 3,
58
+ 2,
59
+ 1,
60
+ 1,
61
+ 0,
62
+ 5
63
+ ],
64
+ "batch_size": null,
65
+ "device": "cuda",
66
+ "no_cache": false,
67
+ "limit": null,
68
+ "bootstrap_iters": 100000,
69
+ "description_dict": {}
70
+ }
71
+ }
scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-3b/result.jsquad-1.2.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "jsquad-1.2-0.2": {
4
+ "exact_match": 44.529491220171096,
5
+ "f1": 56.02141036867636
6
+ }
7
+ },
8
+ "versions": {
9
+ "jsquad-1.2-0.2": 1.2
10
+ },
11
+ "config": {
12
+ "model": "hf-causal",
13
+ "model_args": "pretrained=cyberagent/open-calm-3b,device_map=auto,torch_dtype=auto",
14
+ "num_fewshot": 2,
15
+ "batch_size": null,
16
+ "device": "cuda",
17
+ "no_cache": false,
18
+ "limit": null,
19
+ "bootstrap_iters": 100000,
20
+ "description_dict": {}
21
+ }
22
+ }
scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-3b/result.mgsm.json ADDED
The diff for this file is too large to render. See raw diff
 
scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-7b/harness.jsquad-1.2.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ MODEL_ARGS="pretrained=cyberagent/open-calm-7b,device_map=auto,torch_dtype=auto"
2
+ TASK="jsquad-1.2-0.2"
3
+ python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2" --device "cuda" --output_path "models/cyberagent/cyberagent-open-calm-7b/result.jsquad-1.2.json"
scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-7b/harness.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ MODEL_ARGS="pretrained=cyberagent/open-calm-7b"
2
+ TASK="jcommonsenseqa-1.1-0.2,jnli-1.1-0.2,marc_ja-1.1-0.2,jsquad-1.1-0.2,jaqket_v2-0.1-0.2,xlsum_ja,xwinograd_ja,mgsm"
3
+ python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "3,3,3,2,1,1,0,5" --device "cuda" --output_path "models/cyberagent/cyberagent-open-calm-7b/result.json"
scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-7b/result.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "jcommonsenseqa-1.1-0.2": {
4
+ "acc": 0.2421805183199285,
5
+ "acc_stderr": 0.012812432289317907,
6
+ "acc_norm": 0.24396782841823056,
7
+ "acc_norm_stderr": 0.012844450125623429
8
+ },
9
+ "jnli-1.1-0.2": {
10
+ "acc": 0.3763352506162695,
11
+ "acc_stderr": 0.00982182053150895,
12
+ "acc_norm": 0.3463434675431389,
13
+ "acc_norm_stderr": 0.009646221914241809
14
+ },
15
+ "marc_ja-1.1-0.2": {
16
+ "acc": 0.7411928845483083,
17
+ "acc_stderr": 0.005784459117732042,
18
+ "acc_norm": 0.7411928845483083,
19
+ "acc_norm_stderr": 0.005784459117732042
20
+ },
21
+ "xwinograd_ja": {
22
+ "acc": 0.6506777893639207,
23
+ "acc_stderr": 0.01540328448938605
24
+ },
25
+ "jsquad-1.1-0.2": {
26
+ "exact_match": 45.79018460153084,
27
+ "f1": 59.03158509144496
28
+ },
29
+ "jaqket_v2-0.1-0.2": {
30
+ "exact_match": 60.738831615120276,
31
+ "f1": 64.89929362352039
32
+ },
33
+ "xlsum_ja": {
34
+ "rouge2": 2.0382422339290223
35
+ },
36
+ "mgsm": {
37
+ "acc": 0.008,
38
+ "acc_stderr": 0.005645483676690164
39
+ }
40
+ },
41
+ "versions": {
42
+ "jcommonsenseqa-1.1-0.2": 1.1,
43
+ "jnli-1.1-0.2": 1.1,
44
+ "marc_ja-1.1-0.2": 1.1,
45
+ "jsquad-1.1-0.2": 1.1,
46
+ "jaqket_v2-0.1-0.2": 0.1,
47
+ "xlsum_ja": 1.0,
48
+ "xwinograd_ja": 1.0,
49
+ "mgsm": 1.0
50
+ },
51
+ "config": {
52
+ "model": "hf-causal",
53
+ "model_args": "pretrained=cyberagent/open-calm-7b",
54
+ "num_fewshot": [
55
+ 3,
56
+ 3,
57
+ 3,
58
+ 2,
59
+ 1,
60
+ 1,
61
+ 0,
62
+ 5
63
+ ],
64
+ "batch_size": null,
65
+ "device": "cuda",
66
+ "no_cache": false,
67
+ "limit": null,
68
+ "bootstrap_iters": 100000,
69
+ "description_dict": {}
70
+ }
71
+ }
scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-7b/result.jsquad-1.2.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "jsquad-1.2-0.2": {
4
+ "exact_match": 48.10895992796038,
5
+ "f1": 60.90961937230767
6
+ }
7
+ },
8
+ "versions": {
9
+ "jsquad-1.2-0.2": 1.2
10
+ },
11
+ "config": {
12
+ "model": "hf-causal",
13
+ "model_args": "pretrained=cyberagent/open-calm-7b,device_map=auto,torch_dtype=auto",
14
+ "num_fewshot": 2,
15
+ "batch_size": null,
16
+ "device": "cuda",
17
+ "no_cache": false,
18
+ "limit": null,
19
+ "bootstrap_iters": 100000,
20
+ "description_dict": {}
21
+ }
22
+ }
scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-7b/result.mgsm.json ADDED
The diff for this file is too large to render. See raw diff
 
scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-large/harness.jsquad-1.2.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ MODEL_ARGS="pretrained=cyberagent/open-calm-large,use_fast=True,device_map=auto,torch_dtype=auto"
2
+ TASK="jsquad-1.2-0.2"
3
+ python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "3" --device "cuda" --output_path "models/cyberagent-open-calm-large/result.jsquad-1.2.json"
scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-large/harness.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ MODEL_ARGS="pretrained=cyberagent/open-calm-large,use_fast=True"
2
+ TASK="jcommonsenseqa-1.1-0.2,jnli-1.1-0.2,marc_ja-1.1-0.2,jsquad-1.1-0.2,xlsum_ja"
3
+ python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2,3,3,3,1" --device "cuda" --output_path "models/cyberagent-open-calm-large/result.json"
scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-large/result.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "jcommonsenseqa-1.1-0.2": {
4
+ "acc": 0.2993744414655943,
5
+ "acc_stderr": 0.013697125864334919,
6
+ "acc_norm": 0.2752457551385165,
7
+ "acc_norm_stderr": 0.013357795705028184
8
+ },
9
+ "jnli-1.1-0.2": {
10
+ "acc": 0.40838126540673786,
11
+ "acc_stderr": 0.009965126356916034,
12
+ "acc_norm": 0.3751027115858669,
13
+ "acc_norm_stderr": 0.009815408241248635
14
+ },
15
+ "marc_ja-1.1-0.2": {
16
+ "acc": 0.7912452040460412,
17
+ "acc_stderr": 0.005367632889806105,
18
+ "acc_norm": 0.7912452040460412,
19
+ "acc_norm_stderr": 0.005367632889806105
20
+ },
21
+ "jsquad-1.1-0.2": {
22
+ "exact_match": 37.23547951373255,
23
+ "f1": 48.50349592141573
24
+ },
25
+ "xlsum_ja": {
26
+ "rouge2": 1.9854375467671679
27
+ },
28
+ "xwinograd_ja": {
29
+ "acc": 0.6152241918665277,
30
+ "acc_stderr": 0.015719467393137274
31
+ }
32
+ },
33
+ "versions": {
34
+ "jcommonsenseqa-1.1-0.2": 1.1,
35
+ "jnli-1.1-0.2": 1.1,
36
+ "jsquad-1.1-0.2": 1.1,
37
+ "marc_ja-1.1-0.2": 1.1,
38
+ "xlsum_ja": 1.0,
39
+ "xwinograd_ja": 1.0
40
+ },
41
+ "config": {
42
+ "model": "hf-causal",
43
+ "model_args": "pretrained=cyberagent/open-calm-large,use_fast=True",
44
+ "num_fewshot": [
45
+ 2,
46
+ 3,
47
+ 3,
48
+ 3,
49
+ 1,
50
+ 0
51
+ ],
52
+ "batch_size": null,
53
+ "device": "cuda",
54
+ "no_cache": false,
55
+ "limit": null,
56
+ "bootstrap_iters": 100000,
57
+ "description_dict": {}
58
+ }
59
+ }
scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-large/result.jsquad-1.2.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "jsquad-1.2-0.2": {
4
+ "exact_match": 40.4997748761819,
5
+ "f1": 51.32160467436942
6
+ }
7
+ },
8
+ "versions": {
9
+ "jsquad-1.2-0.2": 1.2
10
+ },
11
+ "config": {
12
+ "model": "hf-causal",
13
+ "model_args": "pretrained=cyberagent/open-calm-large,use_fast=True,device_map=auto,torch_dtype=auto",
14
+ "num_fewshot": 3,
15
+ "batch_size": null,
16
+ "device": "cuda",
17
+ "no_cache": false,
18
+ "limit": null,
19
+ "bootstrap_iters": 100000,
20
+ "description_dict": {}
21
+ }
22
+ }
scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-medium/harness.jsquad-1.2.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ MODEL_ARGS="pretrained=cyberagent/open-calm-medium,use_fast=True,device_map=auto,torch_dtype=auto"
2
+ TASK="jsquad-1.2-0.2"
3
+ python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "3" --device "cuda" --output_path "models/cyberagent-open-calm-medium/result.jsquad-1.2.json"
scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-medium/harness.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ MODEL_ARGS="pretrained=cyberagent/open-calm-medium,use_fast=True"
2
+ TASK="jcommonsenseqa-1.1-0.2,jnli-1.1-0.2,marc_ja-1.1-0.2,jsquad-1.1-0.2,xlsum_ja"
3
+ python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2,3,3,3,1" --device "cuda" --output_path "models/cyberagent-open-calm-medium/result.json"
scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-medium/result.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "jcommonsenseqa-1.1-0.2": {
4
+ "acc": 0.39499553172475427,
5
+ "acc_stderr": 0.0146202392872941,
6
+ "acc_norm": 0.2868632707774799,
7
+ "acc_norm_stderr": 0.013527046208250626
8
+ },
9
+ "jnli-1.1-0.2": {
10
+ "acc": 0.4231717337715694,
11
+ "acc_stderr": 0.010016374130527417,
12
+ "acc_norm": 0.3972884141331142,
13
+ "acc_norm_stderr": 0.009920570907906705
14
+ },
15
+ "marc_ja-1.1-0.2": {
16
+ "acc": 0.8357167771189397,
17
+ "acc_stderr": 0.004893675823612713,
18
+ "acc_norm": 0.8357167771189397,
19
+ "acc_norm_stderr": 0.004893675823612713
20
+ },
21
+ "jsquad-1.1-0.2": {
22
+ "exact_match": 28.725799189554255,
23
+ "f1": 39.80333448254385
24
+ },
25
+ "xlsum_ja": {
26
+ "rouge2": 2.5775988917922406
27
+ },
28
+ "xwinograd_ja": {
29
+ "acc": 0.5964546402502607,
30
+ "acc_stderr": 0.015850834635341565
31
+ }
32
+ },
33
+ "versions": {
34
+ "jcommonsenseqa-1.1-0.2": 1.1,
35
+ "jnli-1.1-0.2": 1.1,
36
+ "jsquad-1.1-0.2": 1.1,
37
+ "marc_ja-1.1-0.2": 1.1,
38
+ "xlsum_ja": 1.0,
39
+ "xwinograd_ja": 1.0
40
+ },
41
+ "config": {
42
+ "model": "hf-causal",
43
+ "model_args": "pretrained=cyberagent/open-calm-medium,use_fast=True",
44
+ "num_fewshot": [
45
+ 2,
46
+ 3,
47
+ 3,
48
+ 3,
49
+ 1,
50
+ 0
51
+ ],
52
+ "batch_size": null,
53
+ "device": "cuda",
54
+ "no_cache": false,
55
+ "limit": null,
56
+ "bootstrap_iters": 100000,
57
+ "description_dict": {}
58
+ }
59
+ }
scripts/yans/eval/lm-evaluation-harness/models/cyberagent/cyberagent-open-calm-medium/result.jsquad-1.2.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "jsquad-1.2-0.2": {
4
+ "exact_match": 29.85141828005403,
5
+ "f1": 40.49655778214922
6
+ }
7
+ },
8
+ "versions": {
9
+ "jsquad-1.2-0.2": 1.2
10
+ },
11
+ "config": {
12
+ "model": "hf-causal",
13
+ "model_args": "pretrained=cyberagent/open-calm-medium,use_fast=True,device_map=auto,torch_dtype=auto",
14
+ "num_fewshot": 3,
15
+ "batch_size": null,
16
+ "device": "cuda",
17
+ "no_cache": false,
18
+ "limit": null,
19
+ "bootstrap_iters": 100000,
20
+ "description_dict": {}
21
+ }
22
+ }
scripts/yans/eval/lm-evaluation-harness/models/llama/llama-13b/harness.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ MODEL_ARGS="pretrained=huggyllama/llama-13b,use_accelerate=True,load_in_8bit=True"
2
+ TASK="jsquad-1.1-0.3,jcommonsenseqa-1.1-0.3,jnli-1.1-0.3,marc_ja-1.1-0.3"
3
+ python main.py --model hf-causal-experimental --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2,3,3,3" --device "cuda" --output_path "models/llama/llama-13b/result.json" --batch_size 2 > models/llama/llama-13b/harness.out 2> models/llama/llama-13b/harness.err
scripts/yans/eval/lm-evaluation-harness/models/llama/llama-13b/result.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "jsquad-1.1-0.3": {
4
+ "exact_match": 51.05808194506979,
5
+ "f1": 65.19689339101781
6
+ },
7
+ "jcommonsenseqa-1.1-0.3": {
8
+ "acc": 0.4932975871313673,
9
+ "acc_stderr": 0.014952371541808172,
10
+ "acc_norm": 0.29848078641644327,
11
+ "acc_norm_stderr": 0.013685386698397504
12
+ },
13
+ "jnli-1.1-0.3": {
14
+ "acc": 0.24116680361544782,
15
+ "acc_stderr": 0.008672830725110452,
16
+ "acc_norm": 0.30156121610517667,
17
+ "acc_norm_stderr": 0.009304239098715018
18
+ },
19
+ "marc_ja-1.1-0.3": {
20
+ "acc": 0.8791419602371817,
21
+ "acc_stderr": 0.004305031232204757,
22
+ "acc_norm": 0.8791419602371817,
23
+ "acc_norm_stderr": 0.004305031232204757
24
+ }
25
+ },
26
+ "versions": {
27
+ "jsquad-1.1-0.3": 1.1,
28
+ "jcommonsenseqa-1.1-0.3": 1.1,
29
+ "jnli-1.1-0.3": 1.1,
30
+ "marc_ja-1.1-0.3": 1.1
31
+ },
32
+ "config": {
33
+ "model": "hf-causal-experimental",
34
+ "model_args": "pretrained=huggyllama/llama-13b,use_accelerate=True,load_in_8bit=True",
35
+ "num_fewshot": [
36
+ 2,
37
+ 3,
38
+ 3,
39
+ 3
40
+ ],
41
+ "batch_size": 2,
42
+ "device": "cuda",
43
+ "no_cache": false,
44
+ "limit": null,
45
+ "bootstrap_iters": 100000,
46
+ "description_dict": {}
47
+ }
48
+ }
scripts/yans/eval/lm-evaluation-harness/models/llama/llama-30b/harness.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ MODEL_ARGS="pretrained=huggyllama/llama-30b,use_accelerate=True,load_in_8bit=True"
2
+ TASK="jsquad-1.1-0.3,jcommonsenseqa-1.1-0.3,jnli-1.1-0.3,marc_ja-1.1-0.3"
3
+ python main.py --model hf-causal-experimental --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2,3,3,3" --device "cuda" --output_path "models/llama/llama-30b/result.json" --batch_size 2 > models/llama/llama-30b/harness.out 2> models/llama/llama-30b/harness.err