yu-val-weiss committed
Commit 8f3cd77 · 1 Parent(s): 803da62
Update blimp.py

blimp.py CHANGED
@@ -15,13 +15,83 @@
 
 import datasets
 import evaluate
-import numpy as np
 import torch
 from evaluate import logging
-from torch.nn import CrossEntropyLoss
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
-_CITATION = """\
+datasets.logging.set_verbosity_error()
+
+BLIMP_PHENOMENA = [
+    "adjunct_island",
+    "anaphor_gender_agreement",
+    "anaphor_number_agreement",
+    "animate_subject_passive",
+    "animate_subject_trans",
+    "causative",
+    "complex_NP_island",
+    "coordinate_structure_constraint_complex_left_branch",
+    "coordinate_structure_constraint_object_extraction",
+    "determiner_noun_agreement_1",
+    "determiner_noun_agreement_2",
+    "determiner_noun_agreement_irregular_1",
+    "determiner_noun_agreement_irregular_2",
+    "determiner_noun_agreement_with_adj_2",
+    "determiner_noun_agreement_with_adj_irregular_1",
+    "determiner_noun_agreement_with_adj_irregular_2",
+    "determiner_noun_agreement_with_adjective_1",
+    "distractor_agreement_relational_noun",
+    "distractor_agreement_relative_clause",
+    "drop_argument",
+    "ellipsis_n_bar_1",
+    "ellipsis_n_bar_2",
+    "existential_there_object_raising",
+    "existential_there_quantifiers_1",
+    "existential_there_quantifiers_2",
+    "existential_there_subject_raising",
+    "expletive_it_object_raising",
+    "inchoative",
+    "intransitive",
+    "irregular_past_participle_adjectives",
+    "irregular_past_participle_verbs",
+    "irregular_plural_subject_verb_agreement_1",
+    "irregular_plural_subject_verb_agreement_2",
+    "left_branch_island_echo_question",
+    "left_branch_island_simple_question",
+    "matrix_question_npi_licensor_present",
+    "npi_present_1",
+    "npi_present_2",
+    "only_npi_licensor_present",
+    "only_npi_scope",
+    "passive_1",
+    "passive_2",
+    "principle_A_c_command",
+    "principle_A_case_1",
+    "principle_A_case_2",
+    "principle_A_domain_1",
+    "principle_A_domain_2",
+    "principle_A_domain_3",
+    "principle_A_reconstruction",
+    "regular_plural_subject_verb_agreement_1",
+    "regular_plural_subject_verb_agreement_2",
+    "sentential_negation_npi_licensor_present",
+    "sentential_negation_npi_scope",
+    "sentential_subject_island",
+    "superlative_quantifiers_1",
+    "superlative_quantifiers_2",
+    "tough_vs_raising_1",
+    "tough_vs_raising_2",
+    "transitive",
+    "wh_island",
+    "wh_questions_object_gap",
+    "wh_questions_subject_gap",
+    "wh_questions_subject_gap_long_distance",
+    "wh_vs_that_no_gap",
+    "wh_vs_that_no_gap_long_distance",
+    "wh_vs_that_with_gap",
+    "wh_vs_that_with_gap_long_distance",
+]
+
+_CITATION = r"""
 @article{warstadt2020blimp,
     author = {Warstadt, Alex and Parrish, Alicia and Liu, Haokun and Mohananey, Anhad and Peng, Wei and Wang, Sheng-Fu and Bowman, Samuel R.},
     title = {BLiMP: The Benchmark of Linguistic Minimal Pairs for English},
@@ -37,8 +107,7 @@ _CITATION = """\
 }
 """
 
-_DESCRIPTION = """
-BLiMP is a challenge set for evaluating what language models (LMs) know about major grammatical phenomena in English.
+_DESCRIPTION = """BLiMP is a challenge set for evaluating what language models (LMs) know about major grammatical phenomena in English.
 BLiMP consists of 67 sub-datasets, each containing 1000 minimal pairs isolating specific contrasts in syntax, morphology, or semantics.
 The data is automatically generated according to expert-crafted grammars. Aggregate human agreement with the labels is 96.4%.
 We use BLiMP to evaluate an n-gram LM, LSTM LM, GPT-2, and Transformer-XL.
@@ -48,9 +117,12 @@ For more info see https://github.com/alexwarstadt/blimp.
 
 _KWARGS_DESCRIPTION = """
 Args:
-    model_id (str): model used for calculating Blimp
+    model_id (str): model used for calculating Blimp, NOTE: should be a causal LM model
+    predictions (list[str]): names of metrics to run. pass empty list or ["*"] to run all of them
     batch_size (int): the batch size to run texts through the model. Defaults to 16.
-    device (str): device to run on, defaults to 'cuda' when available
+    device (str): device to run on, defaults to 'cuda' when available.
+    samples_per_set (int): the number of samples per phenomenon, defaults to 1_000.
+
 Returns:
     blimp: dictionary containing the blimp scores for each of the 67 sub-datasets, as well as the overall accuracy.
     An LM’s overall accuracy on BLiMP is simply the proportion of the 67,000 minimal pairs in which the model assigns a higher probability to the acceptable sentence.
@@ -60,7 +132,7 @@ Examples:
 
 
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
-class Perplexity(evaluate.Metric):
+class Blimp(evaluate.Metric):
     def _info(self):
         return evaluate.MetricInfo(
             module_type="metric",
@@ -80,12 +152,11 @@ class Perplexity(evaluate.Metric):
 
     def _compute(
         self,
-        predictions,
         model_id,
+        predictions=None,
         batch_size: int = 16,
-        add_start_token: bool = True,
         device=None,
-        max_length=None,
+        samples_per_set: int = 1_000,
     ):
         if device is not None:
             assert device in ["gpu", "cpu", "cuda", "mps"], (
@@ -102,6 +173,7 @@ class Perplexity(evaluate.Metric):
 
         model = AutoModelForCausalLM.from_pretrained(model_id)
         model = model.to(device)
+        model.eval()
 
         tokenizer = AutoTokenizer.from_pretrained(model_id)
 
@@ -119,78 +191,93 @@
         # assign one of the special tokens to also be the pad token
         tokenizer.add_special_tokens({"pad_token": existing_special_tokens[0]})
 
-        encodings = tokenizer(
-            predictions,
-            add_special_tokens=False,
-            padding=True,
-            truncation=True if max_tokenized_len else False,
-            max_length=max_tokenized_len,
-            return_tensors="pt",
-            return_attention_mask=True,
-        ).to(device)
-        attn_masks = encodings["attention_mask"]
+        print("PAD", tokenizer.pad_token_id)
+
+        run_all = len(predictions) == 0 or predictions[0] == "*"
+        blimp_sets = (
+            BLIMP_PHENOMENA
+            if run_all
+            else [p for p in BLIMP_PHENOMENA if p.lower() in predictions]
+        )
 
+        assert len(blimp_sets) > 0, "no valid phenomena selected"
 
+        results = {}
+
+        for phenomenon in logging.tqdm(blimp_sets, desc="Evaluating phenomena..."):
+            dataset = datasets.load_dataset("nyu-mll/blimp", phenomenon)["train"]
+
+            # Prepare batches of good and bad sentences
+
+            sents = [(x["sentence_good"], x["sentence_bad"]) for x in dataset]
+            good_sents, bad_sents = zip(*sents[: min(1000, samples_per_set)])
+
+            # Get probabilities in batches
+            good_probs = get_batch_probabilities(
+                model, tokenizer, good_sents, device, batch_size, phenomenon
             )
+            bad_probs = get_batch_probabilities(
+                model,
+                tokenizer,
+                bad_sents,
+                device,
+                batch_size,
+                phenomenon,
+                sent_type="bad",
             )
 
+            # Compare probabilities
+            correct = sum(g > b for g, b in zip(good_probs, bad_probs))
+            accuracy = correct / len(good_probs)
+            results[phenomenon] = accuracy
+
+        # Calculate overall accuracy
+        overall_accuracy = sum(results.values()) / len(results)
+
+        return {"phenomenon_accuracies": results, "overall_accuracy": overall_accuracy}
+
+
+def get_batch_probabilities(
+    model,
+    tokenizer,
+    sentences: list[str],
+    device: str,
+    batch_size: int,
+    phenomenon: str,
+    sent_type: str = "good",
+):
+    """Compute log probabilities for a batch of sentences"""
+    probs = []
+
+    for i in logging.tqdm(
+        range(0, len(sentences), batch_size),
+        desc=f"{phenomenon} - {sent_type} sentences...",
+        leave=False,
+    ):
+        batch = sentences[i : i + batch_size]
+        inputs = tokenizer(
+            batch, padding=batch_size > 1, return_tensors="pt", truncation=True
+        ).to(device)
+
+        with torch.no_grad():
+            outputs = model(**inputs)
+
+        labels = inputs.input_ids
+
+        # Compute log probabilities
+        log_probs = torch.nn.functional.log_softmax(outputs.logits, dim=-1)
+
+        # Get probability of each actual token
+        token_log_probs = torch.gather(log_probs, 2, labels.unsqueeze(-1)).squeeze(-1)
+
+        if batch_size > 1:
+            # Create attention mask for padding
+            mask = (labels != tokenizer.pad_token_id).float()
+            token_log_probs *= mask
+
+        # sum log probabilities
+        sequence_log_probs = (token_log_probs).sum(dim=1)
 
+        probs.extend(sequence_log_probs.cpu().tolist())
 
+    return probs
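For reference, a minimal usage sketch of the updated metric via the evaluate library. The Hub path "yu-val-weiss/blimp" and the model "gpt2" are placeholders, not confirmed by this commit; the argument names and return keys follow the _compute signature and return statement in the diff above.

    import evaluate

    # Load the BLiMP metric module (hypothetical path; substitute the actual repo path).
    blimp = evaluate.load("yu-val-weiss/blimp", module_type="metric")

    # Score a causal LM on two phenomena; pass [] or ["*"] as predictions to run all 67.
    results = blimp.compute(
        model_id="gpt2",
        predictions=["anaphor_gender_agreement", "causative"],
        batch_size=32,
        samples_per_set=100,
    )

    print(results["overall_accuracy"])       # mean accuracy over the selected phenomena
    print(results["phenomenon_accuracies"])  # dict mapping phenomenon -> accuracy

Each phenomenon's accuracy is the fraction of its minimal pairs for which the model assigns a higher total log-probability to the acceptable sentence; overall_accuracy is the unweighted mean over the selected sub-datasets.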