Upload folder using huggingface_hub

- .gitattributes +2 -0
- config.json +7 -0
- morpheme_freq.pkl +3 -0
- morpheme_tokenizer_api.py +86 -0
- trie.json +3 -0
- vocab.json +3 -0
.gitattributes CHANGED

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+trie.json filter=lfs diff=lfs merge=lfs -text
+vocab.json filter=lfs diff=lfs merge=lfs -text
config.json ADDED

@@ -0,0 +1,7 @@
+{
+  "vocab_size": 534925,
+  "unk_token": "[UNK]",
+  "pad_token": "[PAD]",
+  "cls_token": "[CLS]",
+  "sep_token": "[SEP]"
+}
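A minimal sketch of fetching this repo and reading the config with huggingface_hub; the repo id below is a placeholder, not the actual repository name:

from huggingface_hub import snapshot_download
import json

# "user/morpheme-tokenizer" is a hypothetical repo id -- substitute the real one.
path = snapshot_download(repo_id="user/morpheme-tokenizer")
with open(path + "/config.json", encoding="utf-8") as f:
    config = json.load(f)
print(config["vocab_size"])  # 534925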
morpheme_freq.pkl ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:082309996584a177f1ac6bcabfb8779b96d99b42b0856eeeaa088cab63c16083
+size 16106435
morpheme_tokenizer_api.py ADDED

@@ -0,0 +1,86 @@
+# Auto-generated simple loader for the morpheme tokenizer
+import json
+import re
+import unicodedata
+from collections import defaultdict
+
+SPECIAL_TOKENS = ['[PAD]', '[UNK]', '[BOS]', '[EOS]']
+
+# Trie node whose children default to new TrieNodes.
+class TrieNode(defaultdict):
+    def __init__(self):
+        super().__init__(TrieNode)
+        self.end = False  # True if a morpheme ends at this node
+
+# Rebuild a TrieNode tree from the plain-dict form stored in trie.json.
+def dict_to_node(d):
+    node = TrieNode()
+    node.end = d.get("#end", False)
+    for k, v in d.items():
+        if k == "#end":
+            continue
+        node[k] = dict_to_node(v)
+    return node
+
+# NFC-normalize and collapse runs of whitespace.
+def normalize_text(s):
+    s = unicodedata.normalize("NFC", s)
+    return re.sub(r"\s+", " ", s).strip()
+
+# True for code points in the Devanagari, Devanagari Extended, and Vedic Extensions blocks.
+def is_dev_char(ch):
+    cp = ord(ch)
+    return any(a <= cp <= b for a, b in [(0x0900, 0x097F), (0xA8E0, 0xA8FF), (0x1CD0, 0x1CFF)])
+
+PUNCT_CHARS = set("।॥,;:—-–()[]{}\"'‘’“”…!?|/\\·•*^`~")
+
+# Greedy longest-match tokenization against the morpheme trie.
+def longest_match_tokenize(text, trie, unk_token="[UNK]"):
+    text = normalize_text(text)
+    out = []
+    i, n = 0, len(text)
+    while i < n:
+        ch = text[i]
+        if ch.isspace():
+            i += 1
+            continue
+        # Non-Devanagari, non-punctuation runs (Latin words, digits, ...) pass through whole.
+        if not is_dev_char(ch) and ch not in PUNCT_CHARS:
+            j = i + 1
+            while j < n and not is_dev_char(text[j]) and text[j] not in PUNCT_CHARS and not text[j].isspace():
+                j += 1
+            out.append(text[i:j])
+            i = j
+            continue
+        # Punctuation becomes single-character tokens.
+        if ch in PUNCT_CHARS:
+            out.append(ch)
+            i += 1
+            continue
+        # Walk the trie as far as it matches, remembering the last complete morpheme.
+        node, j, last = trie, i, -1
+        while j < n and text[j] in node:
+            node = node[text[j]]
+            j += 1
+            if node.end:
+                last = j
+        if last != -1:
+            out.append(text[i:last])
+            i = last
+        else:
+            # No morpheme matched: skip past this Devanagari run and emit UNK.
+            j = i + 1
+            while j < n and is_dev_char(text[j]) and text[j] not in PUNCT_CHARS and not text[j].isspace():
+                j += 1
+            out.append(unk_token)
+            i = j
+    return [t for t in out if t != ""]
+
+class MorphemeTokenizer:
+    def __init__(self, path):
+        with open(path + "/vocab.json", "r", encoding="utf-8") as f:
+            vv = json.load(f)
+        self.tok2id = vv["tok2id"]
+        # JSON object keys are strings, so id keys are converted back to int.
+        self.id2tok = {int(k): v for k, v in vv["id2tok"].items()}
+        with open(path + "/trie.json", "r", encoding="utf-8") as f:
+            self.trie = dict_to_node(json.load(f))
+        self.bos_token = "[BOS]"
+        self.eos_token = "[EOS]"
+        self.unk_token = "[UNK]"
+        self.pad_token = "[PAD]"
+
+    def tokenize(self, text):
+        return longest_match_tokenize(text, self.trie, self.unk_token)
+
+    def encode(self, text, add_special_tokens=True):
+        toks = self.tokenize(text)
+        if add_special_tokens:
+            toks = [self.bos_token] + toks + [self.eos_token]
+        return [self.tok2id.get(t, self.tok2id[self.unk_token]) for t in toks]
+
+    def decode(self, ids, skip_special_tokens=True):
+        toks = [self.id2tok.get(i, self.unk_token) for i in ids]
+        if skip_special_tokens:
+            toks = [t for t in toks if t not in SPECIAL_TOKENS]
+        # Morphemes are concatenated directly; word boundaries are not restored.
+        return "".join(toks)
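A quick usage sketch for the loader above, assuming the repository files have been downloaded to a local directory; the path and sample sentence are illustrative:

from morpheme_tokenizer_api import MorphemeTokenizer

tok = MorphemeTokenizer("./morpheme-tokenizer")  # directory containing vocab.json and trie.json
ids = tok.encode("नमस्ते दुनिया!")       # [BOS] + morpheme ids + [EOS]
print(tok.tokenize("नमस्ते दुनिया!"))    # morpheme strings; "!" split off as punctuation
print(tok.decode(ids))                  # morphemes concatenated back together

Note that decode() joins tokens with no separator, so inter-word spaces from the input are not recovered.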
trie.json ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f6b324672f943386f6613a6fc31cd75fde64fe40f25179765600370d4c99c350
+size 24997801
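As consumed by dict_to_node above, trie.json is a nested object keyed by single characters, with a boolean "#end" flag marking where a morpheme ends. A hypothetical fragment, written as the Python dict it deserializes to (the real entries live in the ~25 MB LFS file):

# Illustrative shape only; characters and morphemes are invented.
trie_fragment = {
    "क": {
        "#end": False,        # "क" alone is not a stored morpheme
        "र": {"#end": True},  # "कर" is a complete morpheme
    },
}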
vocab.json ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b72d7f2dc2fd4250172195320ee16a2b6050eb0c3598c31c95b5fa5b3236f906
+size 20802202
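From MorphemeTokenizer.__init__, vocab.json holds two parallel maps, "tok2id" and "id2tok"; because JSON object keys are strings, the loader converts the id2tok keys back to int. A hypothetical fragment (the real ~534k entries are in the LFS file):

# Illustrative shape only; ids beyond the special tokens are invented.
vocab_fragment = {
    "tok2id": {"[PAD]": 0, "[UNK]": 1, "[BOS]": 2, "[EOS]": 3},
    "id2tok": {"0": "[PAD]", "1": "[UNK]", "2": "[BOS]", "3": "[EOS]"},
}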