NbAiLab
/

wav2vec2-1b-npsc-nst

Automatic Speech Recognition

Generated from Trainer

Model card Files Files and versions

versae committed on Jan 30, 2023

Commit

1567496

·

1 Parent(s): f9bf2c9

Adding filtering option

Files changed (1) hide show

eval.py +45 -1

eval.py CHANGED Viewed

@@ -1,3 +1,38 @@
 #!/usr/bin/env python3
 import argparse
 import re
@@ -126,7 +161,12 @@ def normalize_text(original_text: str, dataset: str) -> str:
 def main(args):
     # load dataset
     dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
     # for testing: only process the first two examples as a test
     # dataset = dataset.select(range(10))
@@ -197,6 +237,9 @@ if __name__ == "__main__":
     parser.add_argument(
         "--config", type=str, required=True, help="Config of the dataset. *E.g.* `'en'`  for Common Voice"
     )
     parser.add_argument("--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`")
     parser.add_argument(
         "--text_column", type=str, default="text", help="Column name containing the transcription."
@@ -222,3 +265,4 @@ if __name__ == "__main__":
     args = parser.parse_args()
     main(args)

+Hugging Face's logo Hugging Face
+Models
+Datasets
+Spaces
+Docs
+Pricing
+NbAiLab
+/
+wav2vec2-1b-npsc-nst
+Automatic Speech Recognition
+PyTorch
+Transformers
+wav2vec2
+generated_from_trainer
+License: apache-2.0
+Model card
+Files and versions
+Community
+Settings
+wav2vec2-1b-npsc-nst
+/ eval.py
+versae's picture
+versae
+Update eval.py
+f9bf2c9
+less than a minute ago
+raw
+history
+blame
+edit
+delete
+8.9 kB
 #!/usr/bin/env python3
 import argparse
 import re
 def main(args):
     # load dataset
     dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
+    if args.filter:
+        attribute, value = list(map(str.strip, args.filter.split(":")))
+        dataset = dataset.filter(
+            lambda x: x[attribute == value],
+            desc=f"Filtering on {args.filter}",
+        )
     # for testing: only process the first two examples as a test
     # dataset = dataset.select(range(10))
     parser.add_argument(
         "--config", type=str, required=True, help="Config of the dataset. *E.g.* `'en'`  for Common Voice"
     )
+    parser.add_argument(
+        "--filter", type=str, default="", help="Simple filter on attributes. *E.g.* `region_of_youth:Troms` would only keep those samples for which the condition is met"
+    )
     parser.add_argument("--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`")
     parser.add_argument(
         "--text_column", type=str, default="text", help="Column name containing the transcription."
     args = parser.parse_args()
     main(args)