Model save
Browse files
.ipynb_checkpoints/run_speech_recognition_ctc-checkpoint.py
CHANGED
|
@@ -434,14 +434,11 @@ def main():
|
|
| 434 |
# that make training complicated and do not help in transcribing the speech
|
| 435 |
# E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
|
| 436 |
# that could be easily picked up by the model
|
| 437 |
-
chars_to_ignore_regex = '[
|
| 438 |
text_column_name = data_args.text_column_name
|
| 439 |
|
| 440 |
def remove_and_replace_special_characters(batch):
|
| 441 |
-
|
| 442 |
-
batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower().replace('’', "'") + " "
|
| 443 |
-
else:
|
| 444 |
-
batch["target_text"] = batch[text_column_name].lower().replace('’', "'") + " "
|
| 445 |
return batch
|
| 446 |
|
| 447 |
with training_args.main_process_first(desc="dataset map special characters removal"):
|
|
|
|
| 434 |
# that make training complicated and do not help in transcribing the speech
|
| 435 |
# E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
|
| 436 |
# that could be easily picked up by the model
|
| 437 |
+
chars_to_ignore_regex = '[^a-zàâäçéèêëîïôöùûüÿ\'’ ]'
|
| 438 |
text_column_name = data_args.text_column_name
|
| 439 |
|
| 440 |
def remove_and_replace_special_characters(batch):
|
| 441 |
+
batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name].lower()).replace('’', "'") + " "
|
|
|
|
|
|
|
|
|
|
| 442 |
return batch
|
| 443 |
|
| 444 |
with training_args.main_process_first(desc="dataset map special characters removal"):
|
pytorch_model.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1263088113
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3ec1c7675b56877de46f623ac51149e73d4f88cac691dc72595621d35344ce9b
|
| 3 |
size 1263088113
|
run_speech_recognition_ctc.py
CHANGED
|
@@ -434,14 +434,11 @@ def main():
|
|
| 434 |
# that make training complicated and do not help in transcribing the speech
|
| 435 |
# E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
|
| 436 |
# that could be easily picked up by the model
|
| 437 |
-
chars_to_ignore_regex = '[
|
| 438 |
text_column_name = data_args.text_column_name
|
| 439 |
|
| 440 |
def remove_and_replace_special_characters(batch):
|
| 441 |
-
|
| 442 |
-
batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower().replace('’', "'") + " "
|
| 443 |
-
else:
|
| 444 |
-
batch["target_text"] = batch[text_column_name].lower().replace('’', "'") + " "
|
| 445 |
return batch
|
| 446 |
|
| 447 |
with training_args.main_process_first(desc="dataset map special characters removal"):
|
|
|
|
| 434 |
# that make training complicated and do not help in transcribing the speech
|
| 435 |
# E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
|
| 436 |
# that could be easily picked up by the model
|
| 437 |
+
chars_to_ignore_regex = '[^a-zàâäçéèêëîïôöùûüÿ\'’ ]'
|
| 438 |
text_column_name = data_args.text_column_name
|
| 439 |
|
| 440 |
def remove_and_replace_special_characters(batch):
|
| 441 |
+
batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name].lower()).replace('’', "'") + " "
|
|
|
|
|
|
|
|
|
|
| 442 |
return batch
|
| 443 |
|
| 444 |
with training_args.main_process_first(desc="dataset map special characters removal"):
|