Remove unused code
Browse files- transforms_cased.py +2 -28
transforms_cased.py
CHANGED
|
@@ -160,14 +160,12 @@ class FilterPOS(BaseTextTransform):
|
|
| 160 |
Args:
|
| 161 |
tags (list): List of POS tags to keep; tokens tagged with anything else are dropped.
|
| 162 |
engine (str): POS tagger to use. Must be one of "nltk" or "flair". Defaults to "nltk".
|
| 163 |
-
keep_compound_nouns (bool): Whether to keep composed words. Defaults to True.
|
| 164 |
"""
|
| 165 |
|
| 166 |
-
def __init__(self, tags: list, engine: str = "nltk", keep_compound_nouns: bool = True) -> None:
|
| 167 |
super().__init__()
|
| 168 |
self.tags = tags
|
| 169 |
self.engine = engine
|
| 170 |
-
self.keep_compound_nouns = keep_compound_nouns
|
| 171 |
|
| 172 |
if engine == "nltk":
|
| 173 |
nltk.download("averaged_perceptron_tagger", quiet=True)
|
|
@@ -189,30 +187,6 @@ class FilterPOS(BaseTextTransform):
|
|
| 189 |
self.tagger(sentence)
|
| 190 |
text = " ".join([token.text for token in sentence.tokens if token.tag in self.tags])
|
| 191 |
|
| 192 |
-
if self.keep_compound_nouns:
|
| 193 |
-
compound_nouns = []
|
| 194 |
-
|
| 195 |
-
if self.engine == "nltk":
|
| 196 |
-
for i in range(len(word_tags) - 1):
|
| 197 |
-
if word_tags[i][1] == "NN" and word_tags[i + 1][1] == "NN":
|
| 198 |
-
# if they are the same word, skip
|
| 199 |
-
if word_tags[i][0] == word_tags[i + 1][0]:
|
| 200 |
-
continue
|
| 201 |
-
|
| 202 |
-
compound_noun = word_tags[i][0] + "_" + word_tags[i + 1][0]
|
| 203 |
-
compound_nouns.append(compound_noun)
|
| 204 |
-
elif self.engine == "flair":
|
| 205 |
-
for i in range(len(sentence.tokens) - 1):
|
| 206 |
-
if sentence.tokens[i].tag == "NN" and sentence.tokens[i + 1].tag == "NN":
|
| 207 |
-
# if they are the same word, skip
|
| 208 |
-
if sentence.tokens[i].text == sentence.tokens[i + 1].text:
|
| 209 |
-
continue
|
| 210 |
-
|
| 211 |
-
compound_noun = sentence.tokens[i].text + "_" + sentence.tokens[i + 1].text
|
| 212 |
-
compound_nouns.append(compound_noun)
|
| 213 |
-
|
| 214 |
-
text = " ".join([text, " ".join(compound_nouns)])
|
| 215 |
-
|
| 216 |
return text
|
| 217 |
|
| 218 |
def __repr__(self) -> str:
|
|
@@ -396,7 +370,7 @@ def default_vocabulary_transforms() -> TextCompose:
|
|
| 396 |
transforms.append(ToSingular())
|
| 397 |
transforms.append(DropWords(words=words_to_drop))
|
| 398 |
transforms.append(FrequencyMinWordCount(min_count=2))
|
| 399 |
-
transforms.append(FilterPOS(tags=pos_tags, engine="flair"
|
| 400 |
transforms.append(RemoveDuplicates())
|
| 401 |
|
| 402 |
transforms = TextCompose(transforms)
|
|
|
|
| 160 |
Args:
|
| 161 |
tags (list): List of POS tags to keep; tokens tagged with anything else are dropped.
|
| 162 |
engine (str): POS tagger to use. Must be one of "nltk" or "flair". Defaults to "nltk".
|
|
|
|
| 163 |
"""
|
| 164 |
|
| 165 |
+
def __init__(self, tags: list, engine: str = "nltk") -> None:
|
| 166 |
super().__init__()
|
| 167 |
self.tags = tags
|
| 168 |
self.engine = engine
|
|
|
|
| 169 |
|
| 170 |
if engine == "nltk":
|
| 171 |
nltk.download("averaged_perceptron_tagger", quiet=True)
|
|
|
|
| 187 |
self.tagger(sentence)
|
| 188 |
text = " ".join([token.text for token in sentence.tokens if token.tag in self.tags])
|
| 189 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
return text
|
| 191 |
|
| 192 |
def __repr__(self) -> str:
|
|
|
|
| 370 |
transforms.append(ToSingular())
|
| 371 |
transforms.append(DropWords(words=words_to_drop))
|
| 372 |
transforms.append(FrequencyMinWordCount(min_count=2))
|
| 373 |
+
transforms.append(FilterPOS(tags=pos_tags, engine="flair"))
|
| 374 |
transforms.append(RemoveDuplicates())
|
| 375 |
|
| 376 |
transforms = TextCompose(transforms)
|