Spaces:
Sleeping
Sleeping
| import re | |
| import logging | |
# Configure logging once at import time: INFO threshold, and each record is
# rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class JobTitlePreprocessor():
    """Normalize free-text job titles.

    Two independent passes are provided:

    * ``remove_location_unwanted_words_brackets`` strips noise: unwanted
      words, bracketed content, non-alphanumeric characters and numbers
      greater than 10.
    * ``preprocess`` lowercases the title, strips unwanted words, expands
      common abbreviations, maps the Roman numerals I-V to digits and
      collapses seniority-qualified data-science titles to a canonical form.
    """

    # Ordered (pattern, replacement) pairs used by ``preprocess``. Every
    # pattern is applied with re.IGNORECASE, so one spelling per term suffices.
    _TERM_REPLACEMENTS = (
        # The abbreviation's trailing period is consumed only when it ends the
        # token ("sr. engineer"), so no stray "." is left behind.
        (r'\bsr\b(?:\.(?!\w))?|\bsenior\b', 'Senior'),
        (r'\bjr\b(?:\.(?!\w))?|\bjunior\b', 'Junior'),
        (r'\b(?:aiml|ml|machinelearning|machine[_\-]learning)\b', 'Machine Learning'),
        (r'\b(?:genai|generative[_\-]ai|generativeai)\b', 'Generative AI'),
        (r'\b(?:nlp|natural[_\-]language[_\-]processing|natural language processing)\b', 'NLP'),
        # Roman numerals are matched as whole words only.
        (r'\bi\b', '1'),
        (r'\bii\b', '2'),
        (r'\biii\b', '3'),
        (r'\biv\b', '4'),
        (r'\bv\b', '5'),
    )

    # Ordered (pattern, canonical title) pairs. Each pattern spans from the
    # seniority keyword up to the nearest "data scientist"/"data science" and
    # replaces that whole span with the canonical title.
    _DATA_SCIENTIST_RULES = (
        (r'\b(director|dir\.?|dir)\b.*?(data\s*scientist|data\s*science)', 'Director Data Scientist'),
        (r'\b(manager|mgr)\b.*?(data\s*scientist|data\s*science)', 'Manager Data Scientist'),
        (r'\b(lead)\b.*?(data\s*scientist|data\s*science)', 'Lead Data Scientist'),
        (r'\b(associate|associates?)\b.*?(data\s*scientist|data\s*science)', 'Associate Data Scientist'),
        (r'\b(applied)\b.*?(data\s*scientist|data\s*science)', 'Applied Data Scientist'),
        (r'\b(intern|internship|trainee)\b.*?(data\s*scientist|data\s*science)', 'Intern Data Scientist'),
    )

    def __init__(self):
        # Words/phrases deleted wherever they occur as whole words.
        self.unwanted_words = ['remote', 'hybrid', 'flexible location', 'location', 'open to work', 'role', 'job', 'level', 'remot']

    def _strip_unwanted_words(self, title: str) -> str:
        """Delete every whole-word, case-insensitive occurrence of the unwanted words."""
        for word in self.unwanted_words:
            pattern = r'\b{}\b'.format(re.escape(word))
            title = re.sub(pattern, '', title, flags=re.IGNORECASE)
        return title

    def remove_location_unwanted_words_brackets(self, title: str) -> str:
        """Strip noise from a title.

        Removes unwanted words, content inside ``[]``, ``()`` or ``{}``, every
        character that is not a letter, digit or whitespace, and standalone
        numbers greater than 10; then collapses runs of whitespace.

        Non-string input is returned unchanged, matching ``preprocess``.
        """
        if not isinstance(title, str):
            return title
        title = self._strip_unwanted_words(title)
        # Remove content within brackets (non-greedy, one pair at a time).
        title = re.sub(r'\[.*?\]|\(.*?\)|\{.*?\}', '', title)
        # Remove any non-alphanumeric characters (keeping whitespace).
        title = re.sub(r'[^a-zA-Z0-9\s]', '', title)
        # Remove numbers strictly greater than 10: 11-19, 20-99, or 3+ digits.
        # Exactly "10" is kept.
        title = re.sub(r'\b(?:1[1-9]|[2-9][0-9]|[1-9][0-9]{2,})\b', '', title)
        # Clean up extra spaces.
        return re.sub(r'\s+', ' ', title).strip()

    def preprocess(self, title: str) -> str:
        """Lowercase and standardize a title.

        Steps, in order: lowercase, remove unwanted words, apply the term and
        Roman-numeral replacements, apply the data-scientist rules, collapse
        whitespace. Replacement text is inserted with its own capitalization,
        so the result is mixed case.

        Non-string input is returned unchanged.
        """
        if not isinstance(title, str):
            return title
        title = self._strip_unwanted_words(title.lower())
        for pattern, replacement in self._TERM_REPLACEMENTS:
            title = re.sub(pattern, replacement, title, flags=re.IGNORECASE)
        # IGNORECASE matters here: a later rule may match text an earlier rule
        # inserted in title case.
        for pattern, canonical in self._DATA_SCIENTIST_RULES:
            title = re.sub(pattern, canonical, title, flags=re.IGNORECASE)
        # Clean up extra spaces.
        return re.sub(r'\s+', ' ', title).strip()
def preprocess_single_title(title: str) -> str:
    """Run both clean-up passes on a single job title.

    The title first goes through
    ``JobTitlePreprocessor.remove_location_unwanted_words_brackets`` and the
    result is then passed to ``JobTitlePreprocessor.preprocess``.

    Non-string input is returned unchanged, mirroring ``preprocess``; without
    this guard the first pass would raise ``TypeError`` from ``re.sub``.
    """
    if not isinstance(title, str):
        return title
    preprocessor = JobTitlePreprocessor()
    stripped = preprocessor.remove_location_unwanted_words_brackets(title)
    return preprocessor.preprocess(stripped)
if __name__ == "__main__":
    # Demo: clean one sample title and log it before and after.
    title = "Senior Remote Machine Learning Data Scientist (Manager)"
    clean_title = preprocess_single_title(title)
    # Lazy %-style arguments: the message is formatted only if the record is emitted.
    logger.info("Original title: %s", title)
    logger.info("Preprocessed title: %s", clean_title)