altndrr
/

cased

@@ -160,14 +160,12 @@ class FilterPOS(BaseTextTransform):
     Args:
         tags (list): List of POS tags to filter on. With the "flair" engine, only tokens whose tag is in this list are kept.
         engine (str): POS tagger to use. Must be one of "nltk" or "flair". Defaults to "nltk".
-        keep_compound_nouns (bool): Whether to keep composed words. Defaults to True.
     """
-    def __init__(self, tags: list, engine: str = "nltk", keep_compound_nouns: bool = True) -> None:
         super().__init__()
         self.tags = tags
         self.engine = engine
-        self.keep_compound_nouns = keep_compound_nouns
         if engine == "nltk":
             nltk.download("averaged_perceptron_tagger", quiet=True)
@@ -189,30 +187,6 @@ class FilterPOS(BaseTextTransform):
             self.tagger(sentence)
             text = " ".join([token.text for token in sentence.tokens if token.tag in self.tags])
-        if self.keep_compound_nouns:
-            compound_nouns = []
-            if self.engine == "nltk":
-                for i in range(len(word_tags) - 1):
-                    if word_tags[i][1] == "NN" and word_tags[i + 1][1] == "NN":
-                        # if they are the same word, skip
-                        if word_tags[i][0] == word_tags[i + 1][0]:
-                            continue
-                        compound_noun = word_tags[i][0] + "_" + word_tags[i + 1][0]
-                        compound_nouns.append(compound_noun)
-            elif self.engine == "flair":
-                for i in range(len(sentence.tokens) - 1):
-                    if sentence.tokens[i].tag == "NN" and sentence.tokens[i + 1].tag == "NN":
-                        # if they are the same word, skip
-                        if sentence.tokens[i].text == sentence.tokens[i + 1].text:
-                            continue
-                        compound_noun = sentence.tokens[i].text + "_" + sentence.tokens[i + 1].text
-                        compound_nouns.append(compound_noun)
-            text = " ".join([text, " ".join(compound_nouns)])
         return text
     def __repr__(self) -> str:
@@ -396,7 +370,7 @@ def default_vocabulary_transforms() -> TextCompose:
     transforms.append(ToSingular())
     transforms.append(DropWords(words=words_to_drop))
     transforms.append(FrequencyMinWordCount(min_count=2))
-    transforms.append(FilterPOS(tags=pos_tags, engine="flair", keep_compound_nouns=False))
     transforms.append(RemoveDuplicates())
     transforms = TextCompose(transforms)

     Args:
         tags (list): List of POS tags to filter on. With the "flair" engine, only tokens whose tag is in this list are kept.
         engine (str): POS tagger to use. Must be one of "nltk" or "flair". Defaults to "nltk".
     """
+    def __init__(self, tags: list, engine: str = "nltk") -> None:
         super().__init__()
         self.tags = tags
         self.engine = engine
         if engine == "nltk":
             nltk.download("averaged_perceptron_tagger", quiet=True)
             self.tagger(sentence)
             text = " ".join([token.text for token in sentence.tokens if token.tag in self.tags])
         return text
     def __repr__(self) -> str:
     transforms.append(ToSingular())
     transforms.append(DropWords(words=words_to_drop))
     transforms.append(FrequencyMinWordCount(min_count=2))
+    transforms.append(FilterPOS(tags=pos_tags, engine="flair"))
     transforms.append(RemoveDuplicates())
     transforms = TextCompose(transforms)