13Aluminium committed on
Commit
79a81b3
·
verified ·
1 Parent(s): 28fe1b0

Upload folder using huggingface_hub

Browse files
Files changed (6) hide show
  1. .gitattributes +2 -0
  2. config.json +7 -0
  3. morpheme_freq.pkl +3 -0
  4. morpheme_tokenizer_api.py +86 -0
  5. trie.json +3 -0
  6. vocab.json +3 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ trie.json filter=lfs diff=lfs merge=lfs -text
37
+ vocab.json filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "vocab_size": 534925,
3
+ "unk_token": "[UNK]",
4
+ "pad_token": "[PAD]",
5
+ "cls_token": "[CLS]",
6
+ "sep_token": "[SEP]"
7
+ }
morpheme_freq.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:082309996584a177f1ac6bcabfb8779b96d99b42b0856eeeaa088cab63c16083
3
+ size 16106435
morpheme_tokenizer_api.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Auto-generated simple loader for the morpheme tokenizer
import json
import re
import unicodedata
from collections import defaultdict
4
+
5
# Tokens filtered out by decode() when skip_special_tokens=True.
SPECIAL_TOKENS = ["[PAD]", "[UNK]", "[BOS]", "[EOS]"]
6
+
7
class TrieNode(defaultdict):
    """A trie node: children are keyed by character and created on demand;
    ``end`` is True when the path from the root to this node spells a full token."""

    def __init__(self):
        # Missing children auto-vivify as fresh TrieNodes.
        defaultdict.__init__(self, TrieNode)
        self.end = False
11
+
12
def dict_to_node(d):
    """Rebuild a TrieNode tree from its JSON dict form.

    The serialized form uses the reserved key "#end" to flag end-of-token
    nodes; every other key is a child character mapping to a sub-dict.
    """
    node = TrieNode()
    node.end = d.get("#end", False)
    for key, sub in d.items():
        if key != "#end":
            node[key] = dict_to_node(sub)
    return node
19
+
20
def normalize_text(s):
    """Return *s* NFC-normalized with all whitespace runs collapsed to single spaces.

    Leading and trailing whitespace is stripped.
    """
    # Fix: the original used an inline __import__("re") hack; `re` is now a
    # proper module-level import.
    s = unicodedata.normalize("NFC", s)
    return re.sub(r"\s+", " ", s).strip()
24
+
25
def is_dev_char(ch):
    """Return True when *ch* lies in a Devanagari-related Unicode block
    (Devanagari, Devanagari Extended, or Vedic Extensions)."""
    cp = ord(ch)
    return any(
        lo <= cp <= hi
        for lo, hi in ((0x0900, 0x097F), (0xA8E0, 0xA8FF), (0x1CD0, 0x1CFF))
    )
31
+
32
# Characters emitted as standalone single-character tokens.
# Fix: set() accepts a string directly — the intermediate list() was redundant.
PUNCT_CHARS = set("।॥,;:—-–()[]{}\"'‘’“”…!?|/\\·•*^`~")
33
+
34
def longest_match_tokenize(text, trie, unk_token="[UNK]"):
    """Greedy longest-match tokenization of *text* against *trie*.

    Behavior, per character class of the current position:
    - whitespace: skipped;
    - non-Devanagari, non-punctuation: the whole contiguous run is emitted
      verbatim as one token;
    - punctuation (PUNCT_CHARS): emitted as a single-character token;
    - Devanagari: the trie is walked for the longest known morpheme; when
      nothing matches, the whole Devanagari span is replaced by *unk_token*.

    Fix: the original UNK branch used `cand if "#FORCE#"=="never" else
    unk_token` — a constant-False condition, so the raw span (`cand`) could
    never be emitted. The dead branch and the unused variable are removed;
    the observable behavior (always emit *unk_token*) is unchanged.
    """
    text = normalize_text(text)
    out = []
    i, n = 0, len(text)
    while i < n:
        ch = text[i]
        if ch.isspace():
            i += 1
            continue
        if not is_dev_char(ch) and ch not in PUNCT_CHARS:
            # Foreign-script / ASCII run: consume until the class changes.
            j = i + 1
            while (j < n and not is_dev_char(text[j])
                   and text[j] not in PUNCT_CHARS and not text[j].isspace()):
                j += 1
            out.append(text[i:j])
            i = j
            continue
        if ch in PUNCT_CHARS:
            out.append(ch)
            i += 1
            continue
        # Devanagari: walk the trie, remembering the last end-of-token depth.
        node, j, last = trie, i, -1
        while j < n and text[j] in node:
            node = node[text[j]]
            j += 1
            if node.end:
                last = j
        if last != -1:
            out.append(text[i:last])
            i = last
        else:
            # No morpheme matched: skip the whole Devanagari span, emit UNK.
            j = i + 1
            while (j < n and is_dev_char(text[j])
                   and text[j] not in PUNCT_CHARS and not text[j].isspace()):
                j += 1
            out.append(unk_token)
            i = j
    return [t for t in out if t != ""]
62
+
63
class MorphemeTokenizer:
    """Morpheme tokenizer loaded from a directory containing vocab.json
    (with "tok2id"/"id2tok" maps) and trie.json (a serialized TrieNode tree)."""

    def __init__(self, path):
        with open(path + "/vocab.json", "r", encoding="utf-8") as f:
            vocab = json.load(f)
        self.tok2id = vocab["tok2id"]
        # JSON object keys are strings; ids must be ints for decode().
        self.id2tok = {int(idx): tok for idx, tok in vocab["id2tok"].items()}
        with open(path + "/trie.json", "r", encoding="utf-8") as f:
            self.trie = dict_to_node(json.load(f))
        self.bos_token = "[BOS]"
        self.eos_token = "[EOS]"
        self.unk_token = "[UNK]"
        self.pad_token = "[PAD]"

    def tokenize(self, text):
        """Split *text* into morpheme tokens via longest-match over the trie."""
        return longest_match_tokenize(text, self.trie, self.unk_token)

    def encode(self, text, add_special_tokens=True):
        """Map *text* to a list of vocabulary ids, optionally wrapped in BOS/EOS."""
        tokens = self.tokenize(text)
        if add_special_tokens:
            tokens = [self.bos_token, *tokens, self.eos_token]
        return [self.tok2id.get(t, self.tok2id[self.unk_token]) for t in tokens]

    def decode(self, ids, skip_special_tokens=True):
        """Join *ids* back into text; unknown ids render as the UNK token."""
        tokens = [self.id2tok.get(i, self.unk_token) for i in ids]
        if skip_special_tokens:
            tokens = [t for t in tokens if t not in SPECIAL_TOKENS]
        return "".join(tokens)
trie.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6b324672f943386f6613a6fc31cd75fde64fe40f25179765600370d4c99c350
3
+ size 24997801
vocab.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b72d7f2dc2fd4250172195320ee16a2b6050eb0c3598c31c95b5fa5b3236f906
3
+ size 20802202