Remove unused transform
Browse files
- transforms_cased.py +0 -38
transforms_cased.py
CHANGED
|
@@ -17,7 +17,6 @@ __all__ = [
|
|
| 17 |
"DropWords",
|
| 18 |
"FilterPOS",
|
| 19 |
"FrequencyMinWordCount",
|
| 20 |
-
"FrequencyTopK",
|
| 21 |
"ReplaceSeparators",
|
| 22 |
"ToLowercase",
|
| 23 |
"ToSingular",
|
|
@@ -257,43 +256,6 @@ class FrequencyMinWordCount(BaseTextTransform):
|
|
| 257 |
return f"{self.__class__.__name__}(min_count={self.min_count})"
|
| 258 |
|
| 259 |
|
| 260 |
-
class FrequencyTopK(BaseTextTransform):
|
| 261 |
-
"""Keep only the top k most frequent words in the input text.
|
| 262 |
-
|
| 263 |
-
In case of a tie, all words with the same count as the last word are kept.
|
| 264 |
-
|
| 265 |
-
Args:
|
| 266 |
-
top_k (int): Number of top words to keep.
|
| 267 |
-
"""
|
| 268 |
-
|
| 269 |
-
def __init__(self, top_k: int) -> None:
|
| 270 |
-
super().__init__()
|
| 271 |
-
self.top_k = top_k
|
| 272 |
-
|
| 273 |
-
def __call__(self, text: str) -> str:
|
| 274 |
-
"""
|
| 275 |
-
Args:
|
| 276 |
-
text (str): Text to remove infrequent words from.
|
| 277 |
-
"""
|
| 278 |
-
if self.top_k < 1:
|
| 279 |
-
return text
|
| 280 |
-
|
| 281 |
-
words = text.split()
|
| 282 |
-
word_counts = {word: words.count(word) for word in words}
|
| 283 |
-
top_words = sorted(word_counts, key=word_counts.get, reverse=True)
|
| 284 |
-
|
| 285 |
-
# in case of a tie, keep all words with the same count
|
| 286 |
-
top_words = top_words[: self.top_k]
|
| 287 |
-
top_words = [word for word in top_words if word_counts[word] == word_counts[top_words[-1]]]
|
| 288 |
-
|
| 289 |
-
text = " ".join([word for word in words if word in top_words])
|
| 290 |
-
|
| 291 |
-
return text
|
| 292 |
-
|
| 293 |
-
def __repr__(self) -> str:
|
| 294 |
-
return f"{self.__class__.__name__}(top_k={self.top_k})"
|
| 295 |
-
|
| 296 |
-
|
| 297 |
class ReplaceSeparators(BaseTextTransform):
|
| 298 |
"""Replace underscores and dashes with spaces."""
|
| 299 |
|
|
|
|
| 17 |
"DropWords",
|
| 18 |
"FilterPOS",
|
| 19 |
"FrequencyMinWordCount",
|
|
|
|
| 20 |
"ReplaceSeparators",
|
| 21 |
"ToLowercase",
|
| 22 |
"ToSingular",
|
|
|
|
| 256 |
return f"{self.__class__.__name__}(min_count={self.min_count})"
|
| 257 |
|
| 258 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
class ReplaceSeparators(BaseTextTransform):
|
| 260 |
"""Replace underscores and dashes with spaces."""
|
| 261 |
|