Spaces:

nlomov
/

thematizer

Sleeping

App Files Files Community

thematizer / src /contextualized_topic_models /utils /preprocessing.py

nlomov

Added all source code

ab33b80 23 days ago

Raw

History Blame Contribute Delete

6.34 kB

	from sklearn.feature_extraction.text import CountVectorizer
	import string
	from nltk.corpus import stopwords as stop_words
	from gensim.utils import deaccent
	import warnings

	class WhiteSpacePreprocessing:
	    """
	    Deprecated preprocessing helper that filters infrequent tokens from text.

	    Instantiating it emits a DeprecationWarning; use WhiteSpacePreprocessingStopwords instead.
	    """

	    def __init__(self, documents, stopwords_language="english", vocabulary_size=2000):
	        """

	        :param documents: list of strings
	        :param stopwords_language: string of the language of the stopwords (see nltk stopwords)
	        :param vocabulary_size: the number of most frequent words to include in the documents. Infrequent words will be discarded from the list of preprocessed documents
	        """
	        self.documents = documents
	        self.stopwords = set(stop_words.words(stopwords_language))
	        self.vocabulary_size = vocabulary_size

	        # Python hides DeprecationWarning in most contexts by default, so make sure it is shown.
	        warnings.simplefilter('always', DeprecationWarning)
	        # The category must be passed explicitly (the default is UserWarning, which the filter
	        # above does not target); stacklevel=2 points the warning at the caller's line.
	        warnings.warn(
	            "WhiteSpacePreprocessing is deprecated and will be removed in future versions. "
	            "Use WhiteSpacePreprocessingStopwords.",
	            DeprecationWarning,
	            stacklevel=2,
	        )

	    def preprocess(self):
	        """
	        Lowercase and deaccent the documents, replace punctuation with spaces, drop stopwords and
	        keep only the words selected by a CountVectorizer limited to ``vocabulary_size`` features.

	        Note that if after filtering some documents do not contain words we remove them. That is
	        why we return also the list of unpreprocessed documents and the retained indices (the
	        latter can be used to reshape custom embeddings computed on the full document list).

	        :return: preprocessed documents, unpreprocessed documents, the vocabulary list (sorted)
	            and the indices (into the original documents) of the documents that were kept
	        """
	        # Build the translation table once instead of once per document.
	        punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

	        cleaned_docs = []
	        for doc in self.documents:
	            doc = deaccent(doc.lower()).translate(punctuation_to_space)
	            cleaned_docs.append(' '.join(w for w in doc.split() if w not in self.stopwords))

	        # Only the fitted vocabulary is needed, not the document-term matrix.
	        vectorizer = CountVectorizer(max_features=self.vocabulary_size)
	        vectorizer.fit(cleaned_docs)
	        temp_vocabulary = set(vectorizer.get_feature_names_out())

	        # the size of the preprocessed or unpreprocessed_docs might be less than given docs
	        # for that reason, we need to return retained indices to change the shape of given custom embeddings.
	        preprocessed_docs, unpreprocessed_docs, retained_indices = [], [], []
	        vocabulary = set()
	        for i, doc in enumerate(cleaned_docs):
	            kept_words = [w for w in doc.split() if w in temp_vocabulary]
	            if not kept_words:
	                continue
	            preprocessed_docs.append(' '.join(kept_words))
	            unpreprocessed_docs.append(self.documents[i])
	            retained_indices.append(i)
	            vocabulary.update(kept_words)

	        # Sorted so that the returned vocabulary order is reproducible across runs.
	        return preprocessed_docs, unpreprocessed_docs, sorted(vocabulary), retained_indices


	class WhiteSpacePreprocessingStopwords:
	    """
	    Provides a very simple preprocessing script that filters stopwords and infrequent tokens from text
	    """

	    def __init__(self, documents, stopwords_list=None, vocabulary_size=2000, max_df=1.0, min_words=1,
	                 remove_numbers=True):
	        """

	        :param documents: list of strings
	        :param stopwords_list: list of the stopwords to remove
	        :param vocabulary_size: the number of most frequent words to include in the documents. Infrequent words will be discarded from the list of preprocessed documents
	        :param max_df : float or int, default=1.0
	            When building the vocabulary ignore terms that have a document
	            frequency strictly higher than the given threshold (corpus-specific
	            stop words).
	            If float in range [0.0, 1.0], the parameter represents a proportion of
	            documents, integer absolute counts.
	        :param min_words: int, default=1. Documents with less words than the parameter
	            will be removed
	        :param remove_numbers: bool, default=True. If true, numbers are removed from docs
	        """
	        self.documents = documents
	        # Always store a set so the attribute has a single type whether or not stopwords are given.
	        self.stopwords = set(stopwords_list) if stopwords_list is not None else set()

	        self.vocabulary_size = vocabulary_size
	        self.max_df = max_df
	        self.min_words = min_words
	        self.remove_numbers = remove_numbers

	    def preprocess(self):
	        """
	        Lowercase and deaccent the documents, replace punctuation (and digits, when
	        ``remove_numbers`` is set) with spaces, drop stopwords and keep only the words selected by
	        a CountVectorizer configured with ``vocabulary_size`` and ``max_df``.

	        Note that if after filtering some documents contain fewer than ``min_words`` words (or no
	        words at all) we remove them. That is why we return also the list of unpreprocessed
	        documents and the retained indices.

	        :return: preprocessed documents, unpreprocessed documents, the vocabulary list (sorted)
	            and the indices (into the original documents) of the documents that were kept
	        """
	        # Build a single translation table once instead of one or two tables per document.
	        chars_to_blank = string.punctuation
	        if self.remove_numbers:
	            chars_to_blank += "0123456789"
	        blank_table = str.maketrans(chars_to_blank, ' ' * len(chars_to_blank))

	        cleaned_docs = []
	        for doc in self.documents:
	            doc = deaccent(doc.lower()).translate(blank_table)
	            cleaned_docs.append(' '.join(w for w in doc.split() if w not in self.stopwords))

	        # Only the fitted vocabulary is needed, not the document-term matrix.
	        vectorizer = CountVectorizer(max_features=self.vocabulary_size, max_df=self.max_df)
	        vectorizer.fit(cleaned_docs)
	        temp_vocabulary = set(vectorizer.get_feature_names_out())

	        preprocessed_docs, unpreprocessed_docs, retained_indices = [], [], []
	        vocabulary = set()
	        for i, doc in enumerate(cleaned_docs):
	            kept_words = [w for w in doc.split() if w in temp_vocabulary]
	            # min_words is a number of words, so compare against the token count rather than
	            # the character length of the joined string; empty documents are always dropped.
	            if not kept_words or len(kept_words) < self.min_words:
	                continue
	            preprocessed_docs.append(' '.join(kept_words))
	            unpreprocessed_docs.append(self.documents[i])
	            retained_indices.append(i)
	            vocabulary.update(kept_words)

	        # Sorted so that the returned vocabulary order is reproducible across runs.
	        return preprocessed_docs, unpreprocessed_docs, sorted(vocabulary), retained_indices