Spaces:
Sleeping
Sleeping
# Coding by Samitha Randika | https://www.linkedin.com/in/samitha-randika-edirisinghe-b3a68a2b6 #
import re
import string
from functools import lru_cache

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK data this module relies on: the sentence/word tokenizer
# models used by word_tokenize and the stop-word corpus.
# Recent NLTK releases (3.9+) load the tokenizer models from 'punkt_tab'
# rather than the older pickled 'punkt' package, so request both to work
# across NLTK versions.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
def clean_text(text):
    """Normalize raw text into lowercase words separated by single spaces.

    The steps, in order: drop URL-like tokens, drop every character that is
    neither a word character nor whitespace, lowercase, drop digit runs,
    drop remaining ASCII punctuation, and collapse whitespace.

    Args:
        text: The value to clean. Anything that is not a ``str`` is treated
            as missing input.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""

    # Remove URL-like tokens. The word boundary keeps the pattern from
    # firing in the middle of ordinary words (e.g. "awwww"), and IGNORECASE
    # is required because lowercasing only happens further down, so
    # "HTTP://..." or "WWW...." would otherwise slip through.
    text = re.sub(r'\b(?:http|www)\S+', '', text, flags=re.IGNORECASE)

    # Remove special characters (anything that is not \w or whitespace).
    text = re.sub(r'[^\w\s]', '', text)

    # Convert to lowercase.
    text = text.lower()

    # Remove numbers.
    text = re.sub(r'\d+', '', text)

    # Remove leftover ASCII punctuation. The regex above keeps "_" because
    # it counts as a word character; this pass strips it.
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Collapse whitespace runs to one space and trim both ends.
    return ' '.join(text.split())
@lru_cache(maxsize=1)
def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loaded once."""
    return frozenset(stopwords.words('english'))


def tokenize_text(text):
    """Clean ``text`` and split it into filtered word tokens.

    The input is first passed through ``clean_text``, then tokenized with
    NLTK's ``word_tokenize``. Tokens that are English stop words, or that
    are two characters long or shorter, are discarded.

    Args:
        text: The raw text to tokenize.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    tokens = word_tokenize(clean_text(text))
    # The stop-word set is cached so the corpus is not re-read on each call.
    stop_words = _english_stop_words()
    return [word for word in tokens if word not in stop_words and len(word) > 2]