Add kenlm 5gram

Browse files

Files changed (5) hide show

.gitattributes +2 -0
5gram.arpa +3 -0
5gram.arpa.orig +3 -0
npsc.txt +0 -0
prepare.py +79 -0

.gitattributes CHANGED Viewed

@@ -25,3 +25,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+5gram.arpa filter=lfs diff=lfs merge=lfs -text
+5gram.arpa.orig filter=lfs diff=lfs merge=lfs -text

5gram.arpa ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f27d26c69868db542f7ae90deeb4a90ebcbecbcef50366cea55823cab19ae429
+size 117739352

5gram.arpa.orig ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8565e3c62ada6f5d665aebf261b407fa8778593aed57ad75b6bf926201bb7d22
+size 117739333

npsc.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

prepare.py ADDED Viewed

	@@ -0,0 +1,79 @@

+#!/usr/bin/env python
+# coding=utf-8
+import re
+from datasets import load_dataset
+# Name of the dataset column holding the transcript (read by filter_dataset).
+TEXT_COLUMN_NAME = "text"
+# Name of the audio column; not referenced in the visible part of this script.
+AUDIO_COLUMN_NAME = "audio"
+# Regex character class of punctuation and symbols that
+# replace_hatted_characters deletes from every transcript.
+CHARS_TO_IGNORE_REGEX = r"[,?.!\-;:“%‘”�—’…–+\"'#/<>\\]"
+# Pre-processing dataset
+def filter_dataset(batch):
+    return (
+        "inaudible" not in batch[TEXT_COLUMN_NAME].lower()
+        and batch["sentence_language_code"].lower() == "nb-no"
+    )
+def replace_hatted_characters(batch):
+    text = batch["text"]
+    text = re.sub(CHARS_TO_IGNORE_REGEX, '', text).lower()
+    text = re.sub('[áàâ]', 'a', text)
+    text = re.sub('[ä]', 'æ', text)
+    text = re.sub('[éèëê]', 'e', text)
+    text = re.sub('[íìïî]', 'i', text)
+    text = re.sub('[óòöô]', 'o', text)
+    text = re.sub('[ö]', 'ø', text)
+    text = re.sub('[ç]', 'c', text)
+    text = re.sub('[úùüû]', 'u', text)
+    text = re.sub('\xa0', ' ', text)
+    text = re.sub('<ee>', 'eee', text)
+    text = re.sub('<qq>', 'qqq', text)
+    text = re.sub('<mm>', 'mmm', text)
+    text = re.sub('<inaudible>', '?', text)
+    text = re.sub(r'\s+', ' ', text)
+    text = text.strip()
+    return {"text": text}
+def main():
+    dataset = load_dataset(
+        "NbAiLab/NPSC",
+        "16K_mp3",
+        split="train+validation",
+        use_auth_token=True,
+    )
+    dataset = dataset.filter(
+        filter_dataset,
+        desc="filtering out inaudible examples and keeping only nb-NO",
+    ).map(
+        replace_hatted_characters,
+        desc="replacing hesitations and homophones",
+    )
+    # Create file with all text together
+    text = " ".join(dataset["text"])
+    with open("npsc.txt", "w") as text_file:
+        text_file.write(text)
+    # Create KenLM model
+    !~/bin/lmplz -o 5 <"npsc.txt" > "5gram.arpa.orig"
+    # Adjusting for Huggingface decoding
+    with open("5gram.arpa.orig", "r") as read_file, open("5gram.arpa", "w") as write_file:
+        has_added_eos = False
+        for line in read_file:
+          if not has_added_eos and "ngram 1=" in line:
+            count=line.strip().split("=")[-1]
+            write_file.write(line.replace(f"{count}", f"{int(count)+1}"))
+          elif not has_added_eos and "<s>" in line:
+            write_file.write(line)
+            write_file.write(line.replace("<s>", "</s>"))
+            has_added_eos = True
+          else:
+            write_file.write(line)
+if __name__ == "__main__":
+    main()