Spaces:
Runtime error
Runtime error
File size: 674 Bytes
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources this module relies on: the tokenizer models
# ('punkt', 'punkt_tab'), the stop-word lists and the WordNet corpus.
# Runs at import time, in the order listed.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_resource)

# English stop words held in a set for constant-time membership tests.
stop_words = set(stopwords.words('english'))

# Single lemmatizer instance shared by every preprocess_text() call.
lemmatizer = WordNetLemmatizer()
def preprocess_text(text: str) -> str:
    """Normalise raw text into a space-separated string of lemmas.

    The input is lower-cased, every character that is not an ASCII letter,
    digit or whitespace is replaced by a space, and the result is tokenized
    with ``nltk.word_tokenize``.  Tokens that appear in the module-level
    ``stop_words`` set, or that occur as a substring of
    ``string.punctuation``, are dropped; the remaining tokens are passed
    through the module-level ``lemmatizer`` and joined with single spaces.
    """
    lowered = text.lower()
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', ' ', lowered)

    lemmas = []
    for token in nltk.word_tokenize(cleaned):
        # Skip stop words and anything matched by the punctuation check.
        if token in stop_words or token in string.punctuation:
            continue
        lemmas.append(lemmatizer.lemmatize(token))

    return " ".join(lemmas)
|