import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used below. These calls run at import time,
# exactly as in the original module.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Anything that is not an ASCII letter, digit or whitespace is replaced by a
# space before tokenizing. Compiled once so the pattern is not looked up on
# every call.
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]')

# Set of single punctuation characters. Testing a token against the raw
# ``string.punctuation`` string would be a substring test, which is not the
# intent; a set gives per-character membership.
_PUNCTUATION = frozenset(string.punctuation)


def preprocess_text(text: str) -> str:
    """Clean and preprocess input text.

    The steps are applied in this order:

    1. Lowercase the text.
    2. Replace every character that is not an ASCII letter, digit or
       whitespace with a space.
    3. Tokenize with ``nltk.word_tokenize``.
    4. Drop tokens found in the English stop-word set and tokens that are a
       single punctuation character.
    5. Lemmatize each remaining token with ``WordNetLemmatizer.lemmatize``
       using its default arguments.

    Args:
        text: The raw input string.

    Returns:
        The surviving lemmatized tokens joined by single spaces. The result
        is an empty string when no token survives.
    """
    lowered = text.lower()
    cleaned = _NON_ALNUM_RE.sub(' ', lowered)
    tokens = nltk.word_tokenize(cleaned)
    # Stop words are filtered before lemmatization, so the comparison is made
    # on the surface form of each token.
    kept = [
        token
        for token in tokens
        if token not in stop_words and token not in _PUNCTUATION
    ]
    return " ".join(lemmatizer.lemmatize(token) for token in kept)