# textmeme_search/src/preprocessing/mystem_tokenizer.py
# Author: Futyn-Maker — commit 7e1f5f6 ("Deploy the app")
from typing import List, Union
import nltk
from pymystem3 import Mystem
class MystemTokenizer:
    """Tokenizer that lemmatizes Russian text with Mystem and drops stopwords."""

    def __init__(self, stopwords: Union[List[str], str] = "ru"):
        """
        Initialize the MystemTokenizer.

        Args:
            stopwords (Union[List[str], str]): Either a collection of stopwords
                or the string "ru" to use NLTK's Russian stopword list.
        """
        if stopwords == "ru":
            try:
                words = nltk.corpus.stopwords.words("russian")
            except LookupError:
                # Stopword corpus not installed yet — download it once, then retry.
                nltk.download("stopwords")
                words = nltk.corpus.stopwords.words("russian")
        else:
            words = stopwords
        # Store as a set: makes the per-token membership test in tokenize()
        # O(1) instead of O(n) over the ~150-word NLTK stopword list.
        self.stopwords = set(words)
        self.mystem = Mystem()

    def tokenize(self, text: str) -> List[str]:
        """
        Tokenize and lemmatize the input text, removing stopwords.

        Args:
            text (str): The input text to tokenize.

        Returns:
            List[str]: Lemmatized lowercase tokens with punctuation,
            whitespace, and stopwords filtered out.
        """
        # Mystem returns lemmas interleaved with whitespace/punctuation tokens,
        # so non-alphabetic entries must be filtered below.
        lemmas = self.mystem.lemmatize(text.lower())
        # Keep only purely alphabetic lemmas that are not stopwords.
        return [
            lemma for lemma in lemmas
            if lemma.isalpha() and lemma not in self.stopwords
        ]