versae committed on
Commit
9b5a8ef
·
1 Parent(s): 1158eea

Add kenlm 5gram

Browse files
Files changed (5) hide show
  1. .gitattributes +2 -0
  2. 5gram.arpa +3 -0
  3. 5gram.arpa.orig +3 -0
  4. npsc.txt +0 -0
  5. prepare.py +79 -0
.gitattributes CHANGED
@@ -25,3 +25,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
  *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
  *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
28
+ 5gram.arpa filter=lfs diff=lfs merge=lfs -text
29
+ 5gram.arpa.orig filter=lfs diff=lfs merge=lfs -text
5gram.arpa ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f27d26c69868db542f7ae90deeb4a90ebcbecbcef50366cea55823cab19ae429
3
+ size 117739352
5gram.arpa.orig ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8565e3c62ada6f5d665aebf261b407fa8778593aed57ad75b6bf926201bb7d22
3
+ size 117739333
npsc.txt ADDED
The diff for this file is too large to render. See raw diff
 
prepare.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ import re
4
+ from datasets import load_dataset
5
+
6
# Dataset column that holds the transcript text.
TEXT_COLUMN_NAME = "text"
# Dataset column that holds the audio.
AUDIO_COLUMN_NAME = "audio"
# Character class of punctuation and symbols that are deleted from transcripts.
CHARS_TO_IGNORE_REGEX = r"[,?.!\-;:“%‘”�—’…–+\"'#/<>\\]"
9
+
10
+ # Pre-processing dataset
11
def filter_dataset(batch):
    """Return True for examples that should be kept in the corpus.

    An example is kept when its transcript does not contain the substring
    "inaudible" (case-insensitive) and its ``sentence_language_code`` equals
    "nb-no" (case-insensitive).
    """
    transcript = batch[TEXT_COLUMN_NAME].lower()
    if "inaudible" in transcript:
        return False
    language_code = batch["sentence_language_code"].lower()
    return language_code == "nb-no"
16
+
17
# One-pass mapping of accented characters to the letters used in the corpus.
# "ö" is deliberately kept out of the "o" group so that it maps to "ø".
_CHARACTER_MAP = str.maketrans({
    **dict.fromkeys("áàâ", "a"),
    "ä": "æ",
    **dict.fromkeys("éèëê", "e"),
    **dict.fromkeys("íìïî", "i"),
    **dict.fromkeys("óòô", "o"),
    "ö": "ø",
    "ç": "c",
    **dict.fromkeys("úùüû", "u"),
    "\xa0": " ",
})


def replace_hatted_characters(batch):
    """Normalise one transcript for language-model training.

    The transcript is lower-cased, hesitation markers are spelled out
    (``<ee>`` -> ``eee``, ``<qq>`` -> ``qqq``, ``<mm>`` -> ``mmm``),
    ``<inaudible>`` becomes ``?``, the characters matched by
    ``CHARS_TO_IGNORE_REGEX`` are removed, accented characters are mapped via
    ``_CHARACTER_MAP`` and runs of whitespace are collapsed to a single space.

    Args:
        batch: A single example (mapping) holding the transcript under
            ``TEXT_COLUMN_NAME``.

    Returns:
        A dict with the normalised transcript under ``TEXT_COLUMN_NAME``.
    """
    text = batch[TEXT_COLUMN_NAME].lower()

    # The markers must be handled BEFORE punctuation is stripped: the ignore
    # regex removes "<" and ">", after which "<ee>" etc. could never match.
    text = text.replace("<ee>", "eee").replace("<qq>", "qqq").replace("<mm>", "mmm")

    # Strip punctuation segment by segment so that the "?" placeholder that
    # stands in for "<inaudible>" is not itself removed by the ignore regex.
    segments = text.split("<inaudible>")
    text = "?".join(re.sub(CHARS_TO_IGNORE_REGEX, "", segment) for segment in segments)

    text = text.translate(_CHARACTER_MAP)
    text = re.sub(r"\s+", " ", text).strip()
    return {TEXT_COLUMN_NAME: text}
36
+
37
+
38
def main():
    """Build a 5-gram KenLM language model from the NPSC transcripts.

    Steps:
      1. Load the ``train+validation`` split of ``NbAiLab/NPSC`` (``16K_mp3``),
         filter it with ``filter_dataset`` and normalise it with
         ``replace_hatted_characters``.
      2. Write all transcripts, joined by single spaces, to ``npsc.txt``.
      3. Run ``~/bin/lmplz -o 5`` on that file, producing ``5gram.arpa.orig``.
      4. Write ``5gram.arpa``, a copy with an added ``</s>`` unigram entry.
    """
    dataset = load_dataset(
        "NbAiLab/NPSC",
        "16K_mp3",
        split="train+validation",
        use_auth_token=True,
    )
    dataset = dataset.filter(
        filter_dataset,
        desc="filtering out inaudible examples and keeping only nb-NO",
    ).map(
        replace_hatted_characters,
        desc="replacing hesitations and homophones",
    )

    # Create file with all text together
    text = " ".join(dataset["text"])
    # Explicit UTF-8: the transcripts contain non-ASCII letters and the
    # platform default encoding is not guaranteed to handle them.
    with open("npsc.txt", "w", encoding="utf-8") as text_file:
        text_file.write(text)

    # Create KenLM model
    _run_lmplz("npsc.txt", "5gram.arpa.orig")

    # Adjusting for Huggingface decoding
    _add_eos_unigram("5gram.arpa.orig", "5gram.arpa")


def _run_lmplz(corpus_path, arpa_path, order=5):
    """Run ``~/bin/lmplz`` on *corpus_path*, writing its output to *arpa_path*.

    Raises:
        subprocess.CalledProcessError: If ``lmplz`` exits with a non-zero status.
        FileNotFoundError: If the ``lmplz`` binary or the corpus file is missing.
    """
    # Local imports: only this helper needs them.
    import os
    import subprocess

    lmplz = os.path.expanduser("~/bin/lmplz")
    # Binary handles: the bytes are passed straight through to/from lmplz.
    with open(corpus_path, "rb") as corpus, open(arpa_path, "wb") as arpa:
        subprocess.run(
            [lmplz, "-o", str(order)],
            stdin=corpus,
            stdout=arpa,
            check=True,
        )


def _add_eos_unigram(source_path, target_path):
    """Copy an ARPA file, adding a ``</s>`` entry and bumping the unigram count.

    Until the first line containing ``<s>`` has been seen, any line containing
    ``ngram 1=`` has its count increased by one. That first ``<s>`` line is
    written twice: once unchanged and once with ``<s>`` replaced by ``</s>``.
    Every other line is copied verbatim.
    """
    with open(source_path, "r", encoding="utf-8") as read_file, \
            open(target_path, "w", encoding="utf-8") as write_file:
        has_added_eos = False
        for line in read_file:
            if not has_added_eos and "ngram 1=" in line:
                # Rebuild the line around "=" instead of str.replace(): with a
                # count of "1", replace() would also rewrite the n-gram order
                # and turn "ngram 1=1" into "ngram 2=2".
                head, separator, count = line.partition("=")
                write_file.write(f"{head}{separator}{int(count) + 1}\n")
            elif not has_added_eos and "<s>" in line:
                write_file.write(line)
                write_file.write(line.replace("<s>", "</s>"))
                has_added_eos = True
            else:
                write_file.write(line)
75
+
76
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
78
+
79
+