eval results

Browse files

Files changed (10) hide show

.ipynb_checkpoints/README-checkpoint.md +85 -0
.ipynb_checkpoints/eval-checkpoint.py +29 -3
.ipynb_checkpoints/run_eval-checkpoint.sh +8 -0
.ipynb_checkpoints/run_speech_recognition_ctc-checkpoint.py +32 -6
eval.py +29 -3
log_mozilla-foundation_common_voice_7_0_hi_test_predictions.txt +0 -0
log_mozilla-foundation_common_voice_7_0_hi_test_targets.txt +0 -0
mozilla-foundation_common_voice_7_0_hi_test_eval_results.txt +2 -0
run_eval.sh +8 -0
run_speech_recognition_ctc.py +32 -6

.ipynb_checkpoints/README-checkpoint.md ADDED Viewed

	@@ -0,0 +1,85 @@

+---
+language:
+- hi
+license: apache-2.0
+tags:
+- automatic-speech-recognition
+- mozilla-foundation/common_voice_7_0
+- robust-speech-event
+- generated_from_trainer
+datasets:
+- common_voice
+model-index:
+- name: ''
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+#
+This model is a fine-tuned version of [facebook/wav2vec2-xls-r-300m](https://huggingface.co/facebook/wav2vec2-xls-r-300m) on the MOZILLA-FOUNDATION/COMMON_VOICE_7_0 - HI dataset.
+It achieves the following results on the evaluation set:
+- Loss: 0.7346
+- Wer: 1.0479
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0003
+- train_batch_size: 16
+- eval_batch_size: 8
+- seed: 42
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: linear
+- lr_scheduler_warmup_steps: 500
+- training_steps: 8000
+- mixed_precision_training: Native AMP
+### Training results
+| Training Loss | Epoch | Step | Validation Loss | Wer    |
+|:-------------:|:-----:|:----:|:---------------:|:------:|
+| No log        | 1.36  | 400  | 1.4595          | 1.0039 |
+| 4.7778        | 2.71  | 800  | 0.8082          | 1.0115 |
+| 0.6408        | 4.07  | 1200 | 0.7032          | 1.0079 |
+| 0.3937        | 5.42  | 1600 | 0.6889          | 1.0433 |
+| 0.3           | 6.78  | 2000 | 0.6820          | 1.0069 |
+| 0.3           | 8.14  | 2400 | 0.6670          | 1.0196 |
+| 0.226         | 9.49  | 2800 | 0.7216          | 1.0422 |
+| 0.197         | 10.85 | 3200 | 0.7669          | 1.0534 |
+| 0.165         | 12.2  | 3600 | 0.7517          | 1.0200 |
+| 0.1486        | 13.56 | 4000 | 0.7125          | 1.0357 |
+| 0.1486        | 14.92 | 4400 | 0.7447          | 1.0347 |
+| 0.122         | 16.27 | 4800 | 0.6899          | 1.0440 |
+| 0.1069        | 17.63 | 5200 | 0.7212          | 1.0350 |
+| 0.0961        | 18.98 | 5600 | 0.7417          | 1.0408 |
+| 0.086         | 20.34 | 6000 | 0.7402          | 1.0356 |
+| 0.086         | 21.69 | 6400 | 0.7761          | 1.0420 |
+| 0.0756        | 23.05 | 6800 | 0.7346          | 1.0369 |
+| 0.0666        | 24.41 | 7200 | 0.7506          | 1.0449 |
+| 0.0595        | 25.76 | 7600 | 0.7319          | 1.0476 |
+| 0.054         | 27.12 | 8000 | 0.7346          | 1.0479 |
+### Framework versions
+- Transformers 4.16.0.dev0
+- Pytorch 1.10.1+cu102
+- Datasets 1.18.3
+- Tokenizers 0.11.0

.ipynb_checkpoints/eval-checkpoint.py CHANGED Viewed

@@ -47,11 +47,32 @@ def log_results(result: Dataset, args: Dict[str, str]):
             result.map(write_to_file, with_indices=True)
 def normalize_text(text: str) -> str:
     """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
-    chars_to_ignore_regex = '[,?.!\-\;\:"“%‘”�—’…–]'  # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
     text = re.sub(chars_to_ignore_regex, "", text.lower())
     # In addition, we can normalize the target text, e.g. removing new lines characters etc...
@@ -132,6 +153,11 @@ if __name__ == "__main__":
         default=None,
         help="The device to run the pipeline on. -1 for CPU (default), 0 for the first GPU and so on.",
     )
     args = parser.parse_args()
     main(args)

             result.map(write_to_file, with_indices=True)
def replace_text(text):
    """Normalize typographic punctuation and collapse whitespace in a transcript.

    Curly/low double quotes become ``"``, the en dash becomes ``-``, the em dash
    becomes `` - ``, and acute accents / curly / low single quotes become ``'``.
    Two consecutive apostrophes are then folded into one ``"``. Finally every
    run of spaces and/or newlines is collapsed into a single space.

    Args:
        text: The raw transcript string.

    Returns:
        The normalized string.
    """
    # One-pass, per-character substitutions. None of the replacement values is
    # itself a key, so this matches applying the replacements one after another.
    char_map = str.maketrans(
        {
            "„": '"',
            "“": '"',
            "”": '"',
            "–": "-",
            "—": " - ",
            "´": "'",
            "‘": "'",
            "‚": "'",
            "’": "'",
        }
    )
    text = text.translate(char_map)
    # A doubled apostrophe (including one produced from ´´ or ‘’ above) is
    # treated as a double quote. A separate '´´' replacement is unnecessary
    # because every ´ has already been turned into an apostrophe.
    text = text.replace("''", '"')
    # Collapse runs of spaces/newlines of ANY length. The earlier chained
    # split/join on fixed-width sequences left double spaces behind for runs
    # of five or more. Tabs are intentionally left untouched, as before.
    return re.sub(r"[ \n]+", " ", text)
 def normalize_text(text: str) -> str:
     """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
+#     chars_to_ignore_regex = (
+#         f'[{"".join(args.chars_to_ignore)}]' if args.chars_to_ignore is not None else None
+#     )
+    text=replace_text(text)
+    chars_to_ignore_regex = '[,?.!\-\;\:"“%‘”�—’…–"\'-]'  # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
+#     print(chars_to_ignore_regex)
     text = re.sub(chars_to_ignore_regex, "", text.lower())
     # In addition, we can normalize the target text, e.g. removing new lines characters etc...
         default=None,
         help="The device to run the pipeline on. -1 for CPU (default), 0 for the first GPU and so on.",
     )
+    parser.add_argument(
+        "--chars_to_ignore",
+        default=None,
+        help="characters to ignore  in text",
+    )
     args = parser.parse_args()
     main(args)

.ipynb_checkpoints/run_eval-checkpoint.sh ADDED Viewed

	@@ -0,0 +1,8 @@

# Evaluate checkpoint-8000 on the Common Voice 7.0 Hindi test split.
# eval.py declares --chars_to_ignore without nargs, so it accepts exactly one
# value: pass all characters as a single quoted string. Quoting also keeps the
# shell from glob-expanding the bare `?`.
python eval.py \
--model_id "checkpoint-8000" \
--dataset "mozilla-foundation/common_voice_7_0" \
--config "hi" \
--split "test" \
--chars_to_ignore ',?.!-;:"“%‘”�' \
--log_outputs

.ipynb_checkpoints/run_speech_recognition_ctc-checkpoint.py CHANGED Viewed

@@ -435,16 +435,42 @@ def main():
     # that make training complicated and do not help in transcribing the speech
     # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
     # that could be easily picked up by the model
-    chars_to_ignore_regex = (
-        f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
-    )
     text_column_name = data_args.text_column_name
-    def remove_special_characters(batch):
         if chars_to_ignore_regex is not None:
-            batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower() + " "
         else:
-            batch["target_text"] = batch[text_column_name].lower() + " "
         return batch
     with training_args.main_process_first(desc="dataset map special characters removal"):

     # that make training complicated and do not help in transcribing the speech
     # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
     # that could be easily picked up by the model
+#     chars_to_ignore_regex = (
+#         f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
+#     )
+    chars_to_ignore_regex = '[,?.!\-\;\:"“%‘”�—’…–]'
     text_column_name = data_args.text_column_name
+    def replace_text(text):
+        text=text.replace('„', r'"')
+        text=text.replace('“', r'"')
+        text=text.replace('”', r'"')
+        text=text.replace('–', r'-')
+        text=text.replace('—', r' - ')
+        text=text.replace('´', r"'")
+        text=text.replace('‘', r"'")
+        text=text.replace('‚', r"'")
+        text=text.replace('’', r"'")
+        text=text.replace("''", r'"')
+        text=text.replace('´´', r'"')
+        token_sequences_to_ignore = ["\n\n", "\n", "   ", "  "]
+        for t in token_sequences_to_ignore:
+            text = " ".join(text.split(t))
+        return text
+    def remove_special_characters(text):
+        text=batch[text_column_name]
+        text=replace_text(text)
         if chars_to_ignore_regex is not None:
+            target_text = re.sub(chars_to_ignore_regex, "", text).lower() + " "
         else:
+            target_text = text.lower() + " "
+        batch["target_text"]=target_text
         return batch
     with training_args.main_process_first(desc="dataset map special characters removal"):

eval.py CHANGED Viewed

@@ -47,11 +47,32 @@ def log_results(result: Dataset, args: Dict[str, str]):
             result.map(write_to_file, with_indices=True)
 def normalize_text(text: str) -> str:
     """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
-    chars_to_ignore_regex = '[,?.!\-\;\:"“%‘”�—’…–]'  # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
     text = re.sub(chars_to_ignore_regex, "", text.lower())
     # In addition, we can normalize the target text, e.g. removing new lines characters etc...
@@ -132,6 +153,11 @@ if __name__ == "__main__":
         default=None,
         help="The device to run the pipeline on. -1 for CPU (default), 0 for the first GPU and so on.",
     )
     args = parser.parse_args()
     main(args)

             result.map(write_to_file, with_indices=True)
def replace_text(text):
    """Normalize typographic punctuation and collapse whitespace in a transcript.

    Curly/low double quotes become ``"``, the en dash becomes ``-``, the em dash
    becomes `` - ``, and acute accents / curly / low single quotes become ``'``.
    Two consecutive apostrophes are then folded into one ``"``. Finally every
    run of spaces and/or newlines is collapsed into a single space.

    Args:
        text: The raw transcript string.

    Returns:
        The normalized string.
    """
    # One-pass, per-character substitutions. None of the replacement values is
    # itself a key, so this matches applying the replacements one after another.
    char_map = str.maketrans(
        {
            "„": '"',
            "“": '"',
            "”": '"',
            "–": "-",
            "—": " - ",
            "´": "'",
            "‘": "'",
            "‚": "'",
            "’": "'",
        }
    )
    text = text.translate(char_map)
    # A doubled apostrophe (including one produced from ´´ or ‘’ above) is
    # treated as a double quote. A separate '´´' replacement is unnecessary
    # because every ´ has already been turned into an apostrophe.
    text = text.replace("''", '"')
    # Collapse runs of spaces/newlines of ANY length. The earlier chained
    # split/join on fixed-width sequences left double spaces behind for runs
    # of five or more. Tabs are intentionally left untouched, as before.
    return re.sub(r"[ \n]+", " ", text)
 def normalize_text(text: str) -> str:
     """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
+#     chars_to_ignore_regex = (
+#         f'[{"".join(args.chars_to_ignore)}]' if args.chars_to_ignore is not None else None
+#     )
+    text=replace_text(text)
+    chars_to_ignore_regex = '[,?.!\-\;\:"“%‘”�—’…–"\'-]'  # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
+#     print(chars_to_ignore_regex)
     text = re.sub(chars_to_ignore_regex, "", text.lower())
     # In addition, we can normalize the target text, e.g. removing new lines characters etc...
         default=None,
         help="The device to run the pipeline on. -1 for CPU (default), 0 for the first GPU and so on.",
     )
+    parser.add_argument(
+        "--chars_to_ignore",
+        default=None,
+        help="characters to ignore  in text",
+    )
     args = parser.parse_args()
     main(args)

log_mozilla-foundation_common_voice_7_0_hi_test_predictions.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

log_mozilla-foundation_common_voice_7_0_hi_test_targets.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

mozilla-foundation_common_voice_7_0_hi_test_eval_results.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ WER: 0.38507940416102426
2	+ CER: 0.13082663533294167

run_eval.sh ADDED Viewed

	@@ -0,0 +1,8 @@

# Evaluate checkpoint-8000 on the Common Voice 7.0 Hindi test split.
# eval.py declares --chars_to_ignore without nargs, so it accepts exactly one
# value: pass all characters as a single quoted string. Quoting also keeps the
# shell from glob-expanding the bare `?`.
python eval.py \
--model_id "checkpoint-8000" \
--dataset "mozilla-foundation/common_voice_7_0" \
--config "hi" \
--split "test" \
--chars_to_ignore ',?.!-;:"“%‘”�' \
--log_outputs

run_speech_recognition_ctc.py CHANGED Viewed

@@ -435,16 +435,42 @@ def main():
     # that make training complicated and do not help in transcribing the speech
     # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
     # that could be easily picked up by the model
-    chars_to_ignore_regex = (
-        f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
-    )
     text_column_name = data_args.text_column_name
-    def remove_special_characters(batch):
         if chars_to_ignore_regex is not None:
-            batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower() + " "
         else:
-            batch["target_text"] = batch[text_column_name].lower() + " "
         return batch
     with training_args.main_process_first(desc="dataset map special characters removal"):

     # that make training complicated and do not help in transcribing the speech
     # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
     # that could be easily picked up by the model
+#     chars_to_ignore_regex = (
+#         f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
+#     )
+    chars_to_ignore_regex = '[,?.!\-\;\:"“%‘”�—’…–]'
     text_column_name = data_args.text_column_name
+    def replace_text(text):
+        text=text.replace('„', r'"')
+        text=text.replace('“', r'"')
+        text=text.replace('”', r'"')
+        text=text.replace('–', r'-')
+        text=text.replace('—', r' - ')
+        text=text.replace('´', r"'")
+        text=text.replace('‘', r"'")
+        text=text.replace('‚', r"'")
+        text=text.replace('’', r"'")
+        text=text.replace("''", r'"')
+        text=text.replace('´´', r'"')
+        token_sequences_to_ignore = ["\n\n", "\n", "   ", "  "]
+        for t in token_sequences_to_ignore:
+            text = " ".join(text.split(t))
+        return text
+    def remove_special_characters(text):
+        text=batch[text_column_name]
+        text=replace_text(text)
         if chars_to_ignore_regex is not None:
+            target_text = re.sub(chars_to_ignore_regex, "", text).lower() + " "
         else:
+            target_text = text.lower() + " "
+        batch["target_text"]=target_text
         return batch
     with training_args.main_process_first(desc="dataset map special characters removal"):