boeing
/

aviation-ner

Token Classification

entity recognition

aviation safety

Model card Files Files and versions

ethaneng committed on Nov 26, 2025

Commit

97fb329

·

verified ·

1 Parent(s): f05971a

Upload ner_tokenization.txt

Files changed (1) hide show

ner_tokenization.txt +93 -0

ner_tokenization.txt ADDED Viewed

	@@ -0,0 +1,93 @@

+import re
+"""Basic Tokenization to separate non-word characters from tokens"""
class NerTokenization:
    """Whitespace tokenizer that separates punctuation from word tokens.

    A string is first split on whitespace. Each resulting token is then:

    1. split around any "internal" separator (``/ ; : \\``) that occurs
       before its last character,
    2. stripped of trailing punctuation (``tok_final``), one character at a
       time, and
    3. stripped of leading punctuation (``tok_initial``), one character at a
       time.

    The pieces are re-joined with single spaces.
    """

    def __init__(self):
        # Characters peeled off the END of a token.
        self.tok_final = {".", ",", "-", "/", "\\", ";", ":", '"', "'", ")", "]"}
        # Characters peeled off the START of a token.
        self.tok_initial = {'"', "'", "(", "["}
        # Characters that split a token when found before its last character.
        self.internal_split = {"/", ";", ":", "\\"}
        # Regex alternation of the escaped internal separators ("a|b|c").
        self.split_regex = "|".join(map(re.escape, self.internal_split))

    def right_strip(self, tok: str) -> list:
        """Split trailing punctuation off ``tok``.

        Args:
            tok: A single token.

        Returns:
            ``[tok]`` unchanged when every character of ``tok`` is in
            ``tok_final`` (this includes the empty string); otherwise the
            remaining core followed by each stripped trailing character, in
            their original order.
        """
        if all(c in self.tok_final for c in tok):
            return [tok]
        # At least one character is not trailing punctuation, so the scan
        # below always stops at an index >= 1.
        end = len(tok)
        while tok[end - 1] in self.tok_final:
            end -= 1
        # Unpacking the tail yields one list entry per stripped character.
        return [tok[:end], *tok[end:]]

    def left_strip(self, token_list):
        """Split leading punctuation off every token in ``token_list``.

        Args:
            token_list: Iterable of tokens (typically ``right_strip`` output).

        Returns:
            A flat list in which each leading ``tok_initial`` character is its
            own entry, followed by the rest of the token when anything is
            left. Tokens made up solely of ``tok_final`` characters are passed
            through untouched.
        """
        retokenized = []
        for tok in token_list:
            # NOTE: the pass-through test deliberately uses tok_final (not
            # tok_initial), so punctuation produced by right_strip is kept
            # as-is.
            if all(c in self.tok_final for c in tok):
                retokenized.append(tok)
                continue
            start = 0
            # Explicit bounds check replaces the former bare ``except`` that
            # was used to detect the token running empty.
            while start < len(tok) and tok[start] in self.tok_initial:
                retokenized.append(tok[start])
                start += 1
            if start < len(tok):
                retokenized.append(tok[start:])
        return retokenized

    def tok_split(self, tok: str) -> list:
        """Split ``tok`` around internal separator characters.

        The split only happens when a separator occurs somewhere before the
        last character; a separator in final position alone is left for
        ``right_strip``. Once triggered, the token is split on every
        separator, and the separators are kept as their own pieces.

        Args:
            tok: A single whitespace-free token.

        Returns:
            The non-empty pieces of the split, or ``[tok]`` when no split
            applies.
        """
        if not any(c in self.internal_split for c in tok[:-1]):
            return [tok]
        # The capturing group makes re.split keep the separators.
        pieces = re.split(f"({self.split_regex})", tok)
        return [piece for piece in pieces if piece]

    def tokenize_string(self, string_to_tokenize: str) -> str:
        """Tokenize ``string_to_tokenize`` and return the space-joined tokens.

        Args:
            string_to_tokenize: Raw text; it is split on any whitespace.

        Returns:
            A single string with every token separated by one space. An empty
            or whitespace-only input yields ``""``.
        """
        tokenized_string = []
        for this_tok in string_to_tokenize.split():
            for piece in self.tok_split(this_tok):
                # Order matters: trailing punctuation is peeled first, then
                # leading punctuation is peeled from each resulting piece.
                tokenized_string.extend(self.left_strip(self.right_strip(piece)))
        return " ".join(tokenized_string)

    @staticmethod
    def convert_to_training_format(conll_file_path):
        """Placeholder; not implemented.

        Currently ignores ``conll_file_path`` and returns ``None``.
        """
        pass
if __name__ == "__main__":
    # Quick manual check: tokenize a sample sentence containing quotes,
    # brackets, trailing punctuation and a hyphenated range, then print it.
    sample_text = 'The "dog." is (here): to stay, I think. 47-52'
    print(NerTokenization().tokenize_string(sample_text))