File size: 1,870 Bytes
984c70c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Fetch the NLTK resources this module relies on at import time:
# tokenizer models for word_tokenize and the stop-word corpus.
# quiet=True suppresses the downloader's progress output.
nltk.download('punkt', quiet=True)
# NLTK 3.9+ loads the Punkt models from the 'punkt_tab' resource instead of
# the pickled 'punkt' package; without it word_tokenize raises LookupError.
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

# Patterns and tables are built once at import time instead of on every call.
_URL_RE = re.compile(r'http\S+|www\S+|https\S+')
_NON_WORD_RE = re.compile(r'[^\w\s]')
_DIGITS_RE = re.compile(r'\d+')
_WHITESPACE_RE = re.compile(r'\s+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Multi-word job titles that are joined with underscores so that they come
# out of tokenization as a single token.
_JOB_TITLE_TOKENS = {
    "system administrator": "system_administrator",
    "database administrator": "database_administrator",
    "web developer": "web_developer",
    "security analyst": "security_analyst",
    "data scientist": "data_scientist",
    "devops engineer": "devops_engineer",
    "cloud engineer": "cloud_engineer",
    "machine learning engineer": "machine_learning_engineer",
    "software engineer": "software_engineer",
}

_stop_words_cache = None


def _english_stop_words():
    """Return the English stop-word set, loading the corpus only once."""
    global _stop_words_cache
    if _stop_words_cache is None:
        _stop_words_cache = frozenset(stopwords.words('english'))
    return _stop_words_cache


def clean_text(text, keep_numbers=False):
    """Normalize raw text into lowercase words separated by single spaces.

    Steps, in order: drop URL-like runs (``http...``/``www...``), delete
    every character that is neither a word character nor whitespace,
    lowercase, optionally delete digits, delete ASCII punctuation, then
    collapse whitespace runs and strip the ends.

    Note that punctuation is deleted rather than replaced by a space, so
    ``"data-scientist"`` becomes ``"datascientist"``.

    Args:
        text: The text to clean. Any non-``str`` value yields ``""``.
        keep_numbers: When False (the default, matching the historical
            behavior) all digit runs are deleted, including digits inside
            words (``"python3"`` -> ``"python"``). When True digits are
            left in place.

    Returns:
        The cleaned string; ``""`` if ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    text = _URL_RE.sub('', text)
    text = _NON_WORD_RE.sub('', text)
    text = text.lower()
    if not keep_numbers:
        text = _DIGITS_RE.sub('', text)
    # The regex above keeps "_" because it is a word character; this pass
    # is what removes underscores.
    text = text.translate(_PUNCTUATION_TABLE)
    return _WHITESPACE_RE.sub(' ', text).strip()


def tokenize_text(text, keep_numbers=False):
    """Clean and tokenize text, dropping stop words and very short tokens.

    The text is cleaned with ``clean_text``, known multi-word job titles
    are joined with underscores (e.g. ``"data scientist"`` ->
    ``"data_scientist"``), and the result is split with NLTK's
    ``word_tokenize``. English stop words and tokens shorter than two
    characters are discarded.

    Args:
        text: The text to tokenize. Non-``str`` input yields ``[]``.
        keep_numbers: When False (default) digits are stripped during
            cleaning and purely numeric tokens are dropped. When True
            digits survive cleaning and numeric tokens are kept, including
            single-digit ones, which are exempt from the length filter.

    Returns:
        A list of token strings in their original order.
    """
    # The flag must reach clean_text; otherwise every digit is removed
    # before tokenization and keep_numbers=True has no effect.
    text = clean_text(text, keep_numbers=keep_numbers)

    for phrase, replacement in _JOB_TITLE_TOKENS.items():
        text = text.replace(phrase, replacement)

    stop_words = _english_stop_words()

    filtered = []
    for word in word_tokenize(text):
        if word in stop_words:
            continue
        if word.isdigit():
            # Numeric tokens are kept or dropped purely on the flag.
            if keep_numbers:
                filtered.append(word)
            continue
        if len(word) < 2:
            continue
        filtered.append(word)

    return filtered