yu-val-weiss
committed on
Commit
·
c26f589
1
Parent(s):
0f3b529
trust remote code
Browse files
README.md
CHANGED
|
@@ -47,7 +47,8 @@ results = blimp.compute(model_id='pico-lm/pico-decoder')
|
|
| 47 |
- **batch_size** (int): the batch size to run texts through the model. Defaults to 16.
|
| 48 |
- **predictions** (list[str]): names of metrics to run. pass empty list or `["*"]` to run all of them
|
| 49 |
- **device** (str): device to run on, defaults to `cuda` when available
|
| 50 |
-
- **samples_per_set** (int): the number of samples per metric
|
|
|
|
| 51 |
|
| 52 |
### Output Values
|
| 53 |
|
|
|
|
| 47 |
- **batch_size** (int): the batch size to run texts through the model. Defaults to 16.
|
| 48 |
- **predictions** (list[str]): names of metrics to run. pass empty list or `["*"]` to run all of them
|
| 49 |
- **device** (str): device to run on, defaults to `cuda` when available
|
| 50 |
+
- **samples_per_set** (Optional[int]): the number of samples per metric. Maximum 1,000 (enforced with a `min` call). If None, defaults to 1,000.
|
| 51 |
+
- **trust_remote_code** (bool): whether to trust the datasets code; defaults to False.
|
| 52 |
|
| 53 |
### Output Values
|
| 54 |
|
blimp.py
CHANGED
|
@@ -125,6 +125,7 @@ Args:
|
|
| 125 |
batch_size (int): the batch size to run texts through the model. Defaults to 16.
|
| 126 |
device (str): device to run on, defaults to 'cuda' when available.
|
| 127 |
samples_per_set (Optional[int]): the number of samples per phenomenon. Max is 1,000 (but will not error if a higher value is given). If None, defaults to 1,000.
|
|
|
|
| 128 |
|
| 129 |
Returns:
|
| 130 |
blimp: dictionary containing the blimp scores for each of the 67 sub-datasets, as well as the overall accuracy.
|
|
@@ -158,6 +159,7 @@ class Blimp(evaluate.Metric):
|
|
| 158 |
batch_size: int = 16,
|
| 159 |
device=None,
|
| 160 |
samples_per_set: Optional[int] = None,
|
|
|
|
| 161 |
):
|
| 162 |
if device is not None:
|
| 163 |
assert device in ["gpu", "cpu", "cuda", "mps"], (
|
|
@@ -175,11 +177,15 @@ class Blimp(evaluate.Metric):
|
|
| 175 |
if samples_per_set is None or samples_per_set <= 0:
|
| 176 |
samples_per_set = 1000
|
| 177 |
|
| 178 |
-
model = AutoModelForCausalLM.from_pretrained(
|
|
|
|
|
|
|
| 179 |
model = model.to(device)
|
| 180 |
model.eval()
|
| 181 |
|
| 182 |
-
tokenizer = AutoTokenizer.from_pretrained(
|
|
|
|
|
|
|
| 183 |
|
| 184 |
# if batch_size > 1 (which generally leads to padding being required), and
|
| 185 |
# if there is not an already assigned pad_token, assign an existing
|
|
@@ -213,7 +219,9 @@ class Blimp(evaluate.Metric):
|
|
| 213 |
phenom_results = defaultdict(list)
|
| 214 |
|
| 215 |
for category in logging.tqdm(blimp_sets, desc="Evaluating phenomena..."):
|
| 216 |
-
dataset = datasets.load_dataset(
|
|
|
|
|
|
|
| 217 |
|
| 218 |
# Prepare batches of good and bad sentences
|
| 219 |
|
|
|
|
| 125 |
batch_size (int): the batch size to run texts through the model. Defaults to 16.
|
| 126 |
device (str): device to run on, defaults to 'cuda' when available.
|
| 127 |
samples_per_set (Optional[int]): the number of samples per phenomenon. Max is 1,000 (but will not error if a higher value is given). If None, defaults to 1,000.
|
| 128 |
+
trust_remote_code (bool): whether to trust the datasets code; defaults to False.
|
| 129 |
|
| 130 |
Returns:
|
| 131 |
blimp: dictionary containing the blimp scores for each of the 67 sub-datasets, as well as the overall accuracy.
|
|
|
|
| 159 |
batch_size: int = 16,
|
| 160 |
device=None,
|
| 161 |
samples_per_set: Optional[int] = None,
|
| 162 |
+
trust_remote_code: bool = False,
|
| 163 |
):
|
| 164 |
if device is not None:
|
| 165 |
assert device in ["gpu", "cpu", "cuda", "mps"], (
|
|
|
|
| 177 |
if samples_per_set is None or samples_per_set <= 0:
|
| 178 |
samples_per_set = 1000
|
| 179 |
|
| 180 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 181 |
+
model_id, trust_remote_code=trust_remote_code
|
| 182 |
+
)
|
| 183 |
model = model.to(device)
|
| 184 |
model.eval()
|
| 185 |
|
| 186 |
+
tokenizer = AutoTokenizer.from_pretrained(
|
| 187 |
+
model_id, trust_remote_code=trust_remote_code
|
| 188 |
+
)
|
| 189 |
|
| 190 |
# if batch_size > 1 (which generally leads to padding being required), and
|
| 191 |
# if there is not an already assigned pad_token, assign an existing
|
|
|
|
| 219 |
phenom_results = defaultdict(list)
|
| 220 |
|
| 221 |
for category in logging.tqdm(blimp_sets, desc="Evaluating phenomena..."):
|
| 222 |
+
dataset = datasets.load_dataset(
|
| 223 |
+
"nyu-mll/blimp", category, trust_remote_code=trust_remote_code
|
| 224 |
+
)["train"]
|
| 225 |
|
| 226 |
# Prepare batches of good and bad sentences
|
| 227 |
|