Spaces:
Sleeping
Sleeping
| import re | |
| import nltk | |
| from nltk.corpus import stopwords | |
| from nltk.stem import WordNetLemmatizer | |
# Bootstrap required NLTK corpora: download each one only if it is not
# already present in the local nltk data path.
for _resource_path, _resource_name in (
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_resource_name)

# Module-level singletons shared by the preprocessing pipeline.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    """Normalize a raw string for downstream NLP.

    Steps: lowercase, strip URLs and any non-letter characters,
    drop English stopwords, and lemmatize the remaining tokens.
    Non-string input yields an empty string.
    """
    # Guard clause: anything that is not a str is treated as empty text.
    if not isinstance(text, str):
        return ""

    lowered = text.lower()
    # Remove URLs before the character filter so their fragments
    # do not survive as stray letter runs.
    without_urls = re.sub(r'http\S+', '', lowered)
    letters_only = re.sub(r'[^a-zA-Z\s]', '', without_urls)

    # Whitespace-tokenize, filter stopwords, and lemmatize lazily.
    kept = (
        lemmatizer.lemmatize(token)
        for token in letters_only.split()
        if token not in stop_words
    )
    return " ".join(kept)