Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
add stopword removal fn
Browse files
Signed-off-by: peter szemraj <peterszemraj@gmail.com>
utils.py
CHANGED
|
@@ -15,8 +15,10 @@ logging.basicConfig(
|
|
| 15 |
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
| 16 |
level=logging.INFO,
|
| 17 |
)
|
|
|
|
| 18 |
import torch
|
| 19 |
from natsort import natsorted
|
|
|
|
| 20 |
from rapidfuzz import fuzz
|
| 21 |
|
| 22 |
# Define stopwords
|
|
@@ -25,6 +27,28 @@ STOPWORDS = set(
|
|
| 25 |
)
|
| 26 |
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
def remove_stagnant_files(
|
| 29 |
freq: str = "hourly",
|
| 30 |
search_path: str = ".",
|
|
|
|
| 15 |
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
| 16 |
level=logging.INFO,
|
| 17 |
)
|
| 18 |
+
|
| 19 |
import torch
|
| 20 |
from natsort import natsorted
|
| 21 |
+
from nltk.tokenize import word_tokenize
|
| 22 |
from rapidfuzz import fuzz
|
| 23 |
|
| 24 |
# Define stopwords
|
|
|
|
| 27 |
)
|
| 28 |
|
| 29 |
|
| 30 |
+
def remove_stopwords(text: str, stopwords: list = STOPWORDS) -> str:
    """
    remove_stopwords - Remove stopwords from a string.

    The text is split with ``word_tokenize``; leading and trailing punctuation
    is stripped from every token, and a token is dropped when it is a stopword
    (compared in lowercase) or when nothing is left after stripping.

    :param str text: text to remove stopwords from
    :param list stopwords: container of lowercase stopwords to remove (anything
        supporting ``in`` works), defaults to STOPWORDS
    :return str: the remaining tokens joined by single spaces
    """
    # strip surrounding punctuation from each token, e.g. '"word,' -> 'word'
    stripped_tokens = (
        token.strip(string.punctuation) for token in word_tokenize(text)
    )

    # A punctuation-only token (",", ".", "``") strips down to "". Keeping it
    # would inject stray double spaces into the joined result, so skip empties.
    return " ".join(
        word
        for word in stripped_tokens
        if word and word.lower() not in stopwords
    )
|
| 50 |
+
|
| 51 |
+
|
| 52 |
def remove_stagnant_files(
|
| 53 |
freq: str = "hourly",
|
| 54 |
search_path: str = ".",
|