yu-val-weiss
committed on
Commit
·
c26f589
1
Parent(s):
0f3b529
trust remote code
Browse files
README.md
CHANGED
|
@@ -47,7 +47,8 @@ results = blimp.compute(model_id='pico-lm/pico-decoder')
|
|
| 47 |
- **batch_size** (int): the batch size to run texts through the model. Defaults to 16.
|
| 48 |
- **predictions** (list[str]): names of metrics to run. pass empty list or `["*"]` to run all of them
|
| 49 |
- **device** (str): device to run on, defaults to `cuda` when available
|
| 50 |
-
- **samples_per_set** (int): the number of samples per metric
|
|
|
|
| 51 |
|
| 52 |
### Output Values
|
| 53 |
|
|
|
|
| 47 |
- **batch_size** (int): the batch size to run texts through the model. Defaults to 16.
|
| 48 |
- **predictions** (list[str]): names of metrics to run. pass empty list or `["*"]` to run all of them
|
| 49 |
- **device** (str): device to run on, defaults to `cuda` when available
|
| 50 |
+
- **samples_per_set** (Optional[int]): the number of samples per metric. Maximum 1,000 (enforced with a `min` call). If None, defaults to 1,000.
|
| 51 |
+
- **trust_remote_code** (bool): whether to trust the datasets code; defaults to False.
|
| 52 |
|
| 53 |
### Output Values
|
| 54 |
|
blimp.py
CHANGED
|
@@ -125,6 +125,7 @@ Args:
|
|
| 125 |
batch_size (int): the batch size to run texts through the model. Defaults to 16.
|
| 126 |
device (str): device to run on, defaults to 'cuda' when available.
|
| 127 |
samples_per_set (Optional[int]): the number of samples per phenomenon. Max is 1,000 (but will not error if a higher value is given). If None, defaults to 1,000.
|
|
|
|
| 128 |
|
| 129 |
Returns:
|
| 130 |
blimp: dictionary containing the blimp scores for each of the 67 sub-datasets, as well as the overall accuracy.
|
|
@@ -158,6 +159,7 @@ class Blimp(evaluate.Metric):
|
|
| 158 |
batch_size: int = 16,
|
| 159 |
device=None,
|
| 160 |
samples_per_set: Optional[int] = None,
|
|
|
|
| 161 |
):
|
| 162 |
if device is not None:
|
| 163 |
assert device in ["gpu", "cpu", "cuda", "mps"], (
|
|
@@ -175,11 +177,15 @@ class Blimp(evaluate.Metric):
|
|
| 175 |
if samples_per_set is None or samples_per_set <= 0:
|
| 176 |
samples_per_set = 1000
|
| 177 |
|
| 178 |
-
model = AutoModelForCausalLM.from_pretrained(
|
|
|
|
|
|
|
| 179 |
model = model.to(device)
|
| 180 |
model.eval()
|
| 181 |
|
| 182 |
-
tokenizer = AutoTokenizer.from_pretrained(
|
|
|
|
|
|
|
| 183 |
|
| 184 |
# if batch_size > 1 (which generally leads to padding being required), and
|
| 185 |
# if there is not an already assigned pad_token, assign an existing
|
|
@@ -213,7 +219,9 @@ class Blimp(evaluate.Metric):
|
|
| 213 |
phenom_results = defaultdict(list)
|
| 214 |
|
| 215 |
for category in logging.tqdm(blimp_sets, desc="Evaluating phenomena..."):
|
| 216 |
-
dataset = datasets.load_dataset(
|
|
|
|
|
|
|
| 217 |
|
| 218 |
# Prepare batches of good and bad sentences
|
| 219 |
|
|
|
|
| 125 |
batch_size (int): the batch size to run texts through the model. Defaults to 16.
|
| 126 |
device (str): device to run on, defaults to 'cuda' when available.
|
| 127 |
samples_per_set (Optional[int]): the number of samples per phenomenon. Max is 1,000 (but will not error if a higher value is given). If None, defaults to 1,000.
|
| 128 |
+
trust_remote_code (bool): whether to trust the datasets code; defaults to False.
|
| 129 |
|
| 130 |
Returns:
|
| 131 |
blimp: dictionary containing the blimp scores for each of the 67 sub-datasets, as well as the overall accuracy.
|
|
|
|
| 159 |
batch_size: int = 16,
|
| 160 |
device=None,
|
| 161 |
samples_per_set: Optional[int] = None,
|
| 162 |
+
trust_remote_code: bool = False,
|
| 163 |
):
|
| 164 |
if device is not None:
|
| 165 |
assert device in ["gpu", "cpu", "cuda", "mps"], (
|
|
|
|
| 177 |
if samples_per_set is None or samples_per_set <= 0:
|
| 178 |
samples_per_set = 1000
|
| 179 |
|
| 180 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 181 |
+
model_id, trust_remote_code=trust_remote_code
|
| 182 |
+
)
|
| 183 |
model = model.to(device)
|
| 184 |
model.eval()
|
| 185 |
|
| 186 |
+
tokenizer = AutoTokenizer.from_pretrained(
|
| 187 |
+
model_id, trust_remote_code=trust_remote_code
|
| 188 |
+
)
|
| 189 |
|
| 190 |
# if batch_size > 1 (which generally leads to padding being required), and
|
| 191 |
# if there is not an already assigned pad_token, assign an existing
|
|
|
|
| 219 |
phenom_results = defaultdict(list)
|
| 220 |
|
| 221 |
for category in logging.tqdm(blimp_sets, desc="Evaluating phenomena..."):
|
| 222 |
+
dataset = datasets.load_dataset(
|
| 223 |
+
"nyu-mll/blimp", category, trust_remote_code=trust_remote_code
|
| 224 |
+
)["train"]
|
| 225 |
|
| 226 |
# Prepare batches of good and bad sentences
|
| 227 |
|