Spaces:
Sleeping
Sleeping
| import re | |
| import nltk | |
| from nltk.corpus import stopwords | |
| from nltk.tokenize import word_tokenize | |
| import string | |
# Fetch the NLTK data this module relies on as a side effect of importing it:
# 'punkt' provides the tokenizer models behind word_tokenize, and 'stopwords'
# provides the word lists read via stopwords.words('english').
# quiet=True suppresses the downloader's console output.
# NOTE(review): recent NLTK releases look up 'punkt_tab' instead of 'punkt'
# for word_tokenize -- confirm against the NLTK version this project pins.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
# Patterns are compiled once at import time; the steps in clean_text apply
# them in a fixed order.
_URL_PATTERN = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
_NON_WORD_PATTERN = re.compile(r'[^\w\s]')
_DIGIT_RUN_PATTERN = re.compile(r'\d+')
_WHITESPACE_RUN_PATTERN = re.compile(r'\s+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize raw text into lowercase, letters-only, single-spaced form.

    The steps run in this order: strip URL-like tokens, drop every character
    that is neither a word character nor whitespace, lowercase, delete digit
    runs, delete ASCII punctuation (this is what removes underscores, which
    the word-character filter lets through), and finally collapse whitespace
    runs to one space and trim both ends.

    Args:
        text: The value to clean. Anything that is not a ``str`` is treated
            as missing.

    Returns:
        The cleaned string, or ``""`` when *text* is not a string.
    """
    if not isinstance(text, str):
        return ""

    without_urls = _URL_PATTERN.sub('', text)
    word_chars_only = _NON_WORD_PATTERN.sub('', without_urls)
    lowered = word_chars_only.lower()
    without_digits = _DIGIT_RUN_PATTERN.sub('', lowered)
    without_punctuation = without_digits.translate(_PUNCTUATION_TABLE)
    return _WHITESPACE_RUN_PATTERN.sub(' ', without_punctuation).strip()
# Multi-word job titles that are fused into one underscore-joined token
# before tokenization so the tokenizer does not split them apart.
_JOB_TITLE_PHRASES = {
    "system administrator": "system_administrator",
    "database administrator": "database_administrator",
    "web developer": "web_developer",
    "security analyst": "security_analyst",
    "data scientist": "data_scientist",
    "devops engineer": "devops_engineer",
    "cloud engineer": "cloud_engineer",
    "machine learning engineer": "machine_learning_engineer",
    "software engineer": "software_engineer",
}


def _clean_text_keeping_digits(text):
    """Apply the same cleanup steps as clean_text, but leave digits in place."""
    # NOTE: deliberately mirrors clean_text minus its digit-removal step;
    # keep the two in sync if the cleaning rules change.
    if not isinstance(text, str):
        return ""
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    return re.sub(r'\s+', ' ', text).strip()


def tokenize_text(text, keep_numbers=False):
    """Clean *text* and split it into filtered word tokens.

    The text is normalized (URLs, punctuation and case removed), known
    multi-word job titles are joined with underscores so each becomes a
    single token, and the result is tokenized with NLTK's ``word_tokenize``.
    English stop words and single-character tokens are then discarded.

    Args:
        text: The raw text to tokenize. Non-string input produces an empty
            list.
        keep_numbers: When false (the default), digits are stripped during
            cleaning and purely numeric tokens are dropped. When true,
            digits survive cleaning and numeric tokens are kept, including
            single-digit ones, which are exempt from the minimum-length
            filter.

    Returns:
        A list of token strings in their original order.
    """
    # clean_text() unconditionally deletes every digit, so it can only be
    # used when numbers are being discarded anyway. Routing the
    # keep_numbers=True case through it made the flag a no-op.
    if keep_numbers:
        text = _clean_text_keeping_digits(text)
    else:
        text = clean_text(text)

    # Runs after cleaning on purpose: cleaning strips underscores, so the
    # joined titles must be introduced afterwards.
    for phrase, replacement in _JOB_TITLE_PHRASES.items():
        text = text.replace(phrase, replacement)

    stop_words = set(stopwords.words('english'))

    filtered = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        if token.isdigit():
            # Numeric tokens are decided by the flag alone, so a kept number
            # such as "5" is not lost to the length filter below.
            if keep_numbers:
                filtered.append(token)
            continue
        if len(token) < 2:
            continue
        filtered.append(token)
    return filtered