Spaces:
Running
Running
File size: 1,173 Bytes
11c72a2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 | import numpy as np
from backend.datasets.data import file_utils
def get_top_words(beta, vocab, num_top_words, verbose=False):
    """Return the highest-weighted vocabulary words for each row of ``beta``.

    Args:
        beta: Iterable of per-topic weight vectors. Each row is passed to
            ``np.argsort`` and the resulting indices are used to index
            ``vocab``.
        vocab: Sequence of words, positionally aligned with the entries of
            each row of ``beta``.
        num_top_words: Number of words to keep per topic. Rows with fewer
            entries than this simply yield all of their words.
        verbose: When true, print each topic's words as they are computed.

    Returns:
        A list with one string per topic; each string holds that topic's
        words joined by single spaces, ordered from highest weight to lowest.
    """
    # The vocabulary is the same for every topic, so convert it only once
    # instead of rebuilding the array on each loop iteration.
    vocab_array = np.array(vocab)

    topic_str_list = []
    for i, topic_dist in enumerate(beta):
        # argsort is ascending, so the reverse slice walks back from the end
        # and picks the ``num_top_words`` largest weights, biggest first.
        topic_words = vocab_array[np.argsort(topic_dist)][:-(num_top_words + 1):-1]
        topic_str = ' '.join(topic_words)
        topic_str_list.append(topic_str)
        if verbose:
            print('Topic {}: {}'.format(i, topic_str))
    return topic_str_list
def get_stopwords_set(stopwords=None):
    """Build an immutable stopword set from a preset name or a word collection.

    Args:
        stopwords: One of
            * ``None`` (default) -- no stopwords; an empty set is returned.
            * ``'English'`` -- use gensim's built-in ``STOPWORDS``.
            * ``'mallet'`` or ``'snowball'`` -- download the ``stopwords``
              dataset into the current directory and read
              ``./stopwords/<name>_stopwords.txt``.
            * any other iterable of words -- used as-is.

    Returns:
        A ``frozenset`` of the selected stopwords.
    """
    if stopwords is None:
        return frozenset()

    # Only compare against preset names when a string was actually passed;
    # comparing e.g. a NumPy array of words with ``==`` would not yield a
    # plain boolean.
    if isinstance(stopwords, str):
        if stopwords == 'English':
            from gensim.parsing.preprocessing import STOPWORDS
            return frozenset(STOPWORDS)

        if stopwords in ('mallet', 'snowball'):
            # Imported here so that callers supplying their own word list do
            # not need the download helper to be importable.
            from backend.datasets.data.download import download_dataset

            download_dataset('stopwords', cache_path='./')
            path = f'./stopwords/{stopwords}_stopwords.txt'
            # presumably read_text returns the file's words as an iterable of
            # strings -- verify against file_utils.
            return frozenset(file_utils.read_text(path))

        # NOTE(review): any other string falls through and is split into its
        # individual characters by frozenset(); this mirrors the previous
        # behaviour but is probably not what a caller intends -- confirm.

    return frozenset(stopwords)
if __name__ == '__main__':
    # Manual smoke check: show up to ten entries from each named preset,
    # then from the default (no-argument) call.
    for preset_name in ('English', 'mallet', 'snowball'):
        preset_words = list(get_stopwords_set(preset_name))
        print(preset_words[:10])

    default_words = list(get_stopwords_set())
    print(default_words[:10])
|