Spaces:

pszemraj
/

document-summarization

Running on CPU Upgrade

pszemraj committed on May 28, 2023

Commit

62a2921

1 Parent(s): af3f9ae

🚧 add stopword removal fn

Signed-off-by: peter szemraj <peterszemraj@gmail.com>

Files changed (1) hide show

utils.py CHANGED Viewed

@@ -15,8 +15,10 @@ logging.basicConfig(
     format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
     level=logging.INFO,
 )
 import torch
 from natsort import natsorted
 from rapidfuzz import fuzz
 # Define stopwords
@@ -25,6 +27,28 @@ STOPWORDS = set(
 )
 def remove_stagnant_files(
     freq: str = "hourly",
     search_path: str = ".",

     format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
     level=logging.INFO,
 )
 import torch
 from natsort import natsorted
+from nltk.tokenize import word_tokenize
 from rapidfuzz import fuzz
 # Define stopwords
 )
+def remove_stopwords(text: str, stopwords: list = STOPWORDS) -> str:
+    """
+    remove_stopwords - Remove stopwords from a string.
+    :param str text: text to remove stopwords from
+    :param list stopwords: list of stopwords to remove, defaults to STOPWORDS
+    :return, str: text with stopwords removed
+    """
+    words = word_tokenize(text)
+    filtered_words = []
+    for word in words:
+        word = word.strip(string.punctuation)  # remove punctuation
+        if word.lower() not in stopwords:
+            filtered_words.append(word)
+    filtered_text = " ".join(filtered_words)
+    return filtered_text
 def remove_stagnant_files(
     freq: str = "hourly",
     search_path: str = ".",