import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Ensure nltk resources are downloaded
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    # Lowercase
    text = text.lower()
    # Remove URLs first, then special characters and numbers
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize on whitespace, remove stopwords, and lemmatize
    words = text.split()
    clean_words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return " ".join(clean_words)
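
# A quick usage sketch; the sample sentence is illustrative, not from the
# original. Note that WordNetLemmatizer defaults to noun POS, so verbs like
# "running" are left as-is while plural nouns like "dogs" become "dog".
if __name__ == "__main__":
    sample = "Check out https://example.com the 3 dogs are running happily!"
    print(preprocess_text(sample))  # -> "check dog running happily"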