Adding filtering option
Browse files
eval.py
CHANGED
|
@@ -1,3 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
import argparse
|
| 3 |
import re
|
|
@@ -126,7 +161,12 @@ def normalize_text(original_text: str, dataset: str) -> str:
|
|
| 126 |
def main(args):
|
| 127 |
# load dataset
|
| 128 |
dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
|
| 129 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
# for testing: only process the first two examples as a test
|
| 131 |
# dataset = dataset.select(range(10))
|
| 132 |
|
|
@@ -197,6 +237,9 @@ if __name__ == "__main__":
|
|
| 197 |
parser.add_argument(
|
| 198 |
"--config", type=str, required=True, help="Config of the dataset. *E.g.* `'en'` for Common Voice"
|
| 199 |
)
|
|
|
|
|
|
|
|
|
|
| 200 |
parser.add_argument("--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`")
|
| 201 |
parser.add_argument(
|
| 202 |
"--text_column", type=str, default="text", help="Column name containing the transcription."
|
|
@@ -222,3 +265,4 @@ if __name__ == "__main__":
|
|
| 222 |
args = parser.parse_args()
|
| 223 |
|
| 224 |
main(args)
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
Hugging Face's logo Hugging Face
|
| 3 |
+
|
| 4 |
+
Models
|
| 5 |
+
Datasets
|
| 6 |
+
Spaces
|
| 7 |
+
Docs
|
| 8 |
+
Pricing
|
| 9 |
+
|
| 10 |
+
NbAiLab
|
| 11 |
+
/
|
| 12 |
+
wav2vec2-1b-npsc-nst
|
| 13 |
+
Automatic Speech Recognition
|
| 14 |
+
PyTorch
|
| 15 |
+
Transformers
|
| 16 |
+
wav2vec2
|
| 17 |
+
generated_from_trainer
|
| 18 |
+
License: apache-2.0
|
| 19 |
+
Model card
|
| 20 |
+
Files and versions
|
| 21 |
+
Community
|
| 22 |
+
Settings
|
| 23 |
+
wav2vec2-1b-npsc-nst
|
| 24 |
+
/ eval.py
|
| 25 |
+
versae's picture
|
| 26 |
+
versae
|
| 27 |
+
Update eval.py
|
| 28 |
+
f9bf2c9
|
| 29 |
+
less than a minute ago
|
| 30 |
+
raw
|
| 31 |
+
history
|
| 32 |
+
blame
|
| 33 |
+
edit
|
| 34 |
+
delete
|
| 35 |
+
8.9 kB
|
| 36 |
#!/usr/bin/env python3
|
| 37 |
import argparse
|
| 38 |
import re
|
|
|
|
| 161 |
def main(args):
|
| 162 |
# load dataset
|
| 163 |
dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
|
| 164 |
+
if args.filter:
|
| 165 |
+
# Split on the first ":" only, so filter values that themselves contain a colon stay intact.
attribute, value = map(str.strip, args.filter.split(":", 1))
|
| 166 |
+
dataset = dataset.filter(
|
| 167 |
+
# Keep an example only when its `attribute` column equals the requested value.
lambda x: x[attribute] == value,
|
| 168 |
+
desc=f"Filtering on {args.filter}",
|
| 169 |
+
)
|
| 170 |
# for testing: only process the first two examples as a test
|
| 171 |
# dataset = dataset.select(range(10))
|
| 172 |
|
|
|
|
| 237 |
parser.add_argument(
|
| 238 |
"--config", type=str, required=True, help="Config of the dataset. *E.g.* `'en'` for Common Voice"
|
| 239 |
)
|
| 240 |
+
parser.add_argument(
|
| 241 |
+
"--filter", type=str, default="", help="Simple filter on attributes. *E.g.* `region_of_youth:Troms` would only keep those samples for which the condition is met"
|
| 242 |
+
)
|
| 243 |
parser.add_argument("--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`")
|
| 244 |
parser.add_argument(
|
| 245 |
"--text_column", type=str, default="text", help="Column name containing the transcription."
|
|
|
|
| 265 |
args = parser.parse_args()
|
| 266 |
|
| 267 |
main(args)
|
| 268 |
+
|