Spaces:

asynchronousai
/

paragraph-embedder

Sleeping

App Files Files Community

asynchronousai committed on Sep 18, 2024

Commit

bcfa582

verified ·

1 Parent(s): b1111ce

Update tok.py

Browse files

Files changed (1) hide show

tok.py +0 -48

tok.py CHANGED Viewed

@@ -1,38 +1,5 @@
 import string
 from textsearch import TextSearch
-from contractions import contractions_dict, leftovers_dict
-ABBREVS = (
-    "a.m.",
-    "adm.",
-    "bros.",
-    "co.",
-    "corp.",
-    "d.c.",
-    "dr.",
-    "e.g.",
-    "gen.",
-    "gov.",
-    "i.e.",
-    "inc.",
-    "jr.",
-    "ltd.",
-    "md.",
-    "messrs.",
-    "mo.",
-    "mont.",
-    "mr.",
-    "mrs.",
-    "ms.",
-    "p.m.",
-    "ph.d.",
-    "rep.",
-    "rev.",
-    "sen.",
-    "st.",
-    "vs.",
-)
 class Tokenizer:
     def __init__(
@@ -44,14 +11,8 @@ class Tokenizer:
         eol="\n",
         currencies=("$",),
         protected_words=None,
-        contractions=True,
         language="en",
-        abbrevs=ABBREVS,
     ):
-        # set() set() should fallback to just using __iter__ of automaton for a speedboost
-        if language != "en" and contractions:
-            raise ValueError("No contractions known for languages other than English.")
-        self.contractions = contractions
         self.tokenizer = None
         self.handle_http = handle_http
         self.handle_domains = handle_domains
@@ -60,7 +21,6 @@ class Tokenizer:
         self.eol = eol
         self.currencies = currencies or []
         self.protected_words = protected_words or []
-        self.abbrevs = abbrevs
         self.explain_dict = {}
         self.setup()
@@ -77,14 +37,6 @@ class Tokenizer:
                 ] = "regex: when it finds '{}' it will stop after it finds a space.".format(word)
         if self.handle_domains:
             self.add_domain_handler()
-        if self.contractions:
-            if self.contractions == True:
-                self.contractions = {}
-                self.contractions.update(contractions_dict)
-                self.contractions.update(leftovers_dict)
-            self.add_words(self.contractions)
-        if self.abbrevs:
-            self.add_words(self.abbrevs)
     def add_words(self, words):
         words = words.items() if isinstance(words, dict) else words

 import string
 from textsearch import TextSearch
 class Tokenizer:
     def __init__(
         eol="\n",
         currencies=("$",),
         protected_words=None,
         language="en",
     ):
         self.tokenizer = None
         self.handle_http = handle_http
         self.handle_domains = handle_domains
         self.eol = eol
         self.currencies = currencies or []
         self.protected_words = protected_words or []
         self.explain_dict = {}
         self.setup()
                 ] = "regex: when it finds '{}' it will stop after it finds a space.".format(word)
         if self.handle_domains:
             self.add_domain_handler()
     def add_words(self, words):
         words = words.items() if isinstance(words, dict) else words