File size: 4,482 Bytes
96c59c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import re
from word2number import w2n

def parse_sections(text):
    """Bucket each line of *text* under 'requirements', 'company' or 'other'.

    The text is walked line by line. A line whose lower-cased, stripped form
    contains a requirements keyword switches the active bucket to
    'requirements'; otherwise a line containing a company keyword switches it
    to 'company'. The line itself (unmodified) is then stored in whichever
    bucket is active, so a heading line belongs to the section it opens.
    Lines seen before any keyword land in 'other'.

    Returns:
        A dict mapping each of the three section names to its lines joined
        with newlines (an empty string when the section received no lines).
    """
    requirement_markers = (
        'requirements', 'qualifications', 'what you bring', 'skills',
        'who you are', 'ideal candidate', 'what we look for',
    )
    company_markers = (
        'about us', 'who we are', 'company description', 'our mission',
        'about the role',
    )

    buckets = {'requirements': [], 'company': [], 'other': []}
    active = 'other'

    for raw_line in text.split('\n'):
        normalized = raw_line.lower().strip()
        # Requirements markers are checked first, so they win when a line
        # contains both kinds of marker.
        if any(marker in normalized for marker in requirement_markers):
            active = 'requirements'
        elif any(marker in normalized for marker in company_markers):
            active = 'company'
        buckets[active].append(raw_line)

    joined = {}
    for name, collected in buckets.items():
        joined[name] = "\n".join(collected)
    return joined

def convert_words_to_numbers(text):
    """Convert written-out numbers (e.g. 'twelve') to digits, in place.

    Each whitespace-delimited token that consists of a single run of letters,
    optionally wrapped in non-letter characters (e.g. '(five' or 'five+'),
    is handed to ``w2n.word_to_num``. When the result is between 0 and 30
    inclusive, the letters are replaced by the digits; the surrounding
    punctuation is kept, so 'five+' becomes '5+'.

    All whitespace, including newlines, is preserved exactly as in the input,
    so callers can still split the result into lines afterwards.

    Args:
        text: The text to scan.

    Returns:
        The text with qualifying number words replaced by digit strings.
    """
    # Optional non-letter prefix, one run of letters, optional non-letter suffix.
    word_shape = re.compile(r'([^a-zA-Z]*)([a-zA-Z]+)([^a-zA-Z]*)')

    # The capturing group makes re.split keep the whitespace runs, so joining
    # the pieces back together reproduces the original spacing and line breaks.
    pieces = re.split(r'(\s+)', text)

    for i, piece in enumerate(pieces):
        match = word_shape.fullmatch(piece)
        if match is None:
            # Whitespace, pure digits/punctuation, or letters interleaved
            # with other characters: nothing to convert.
            continue
        prefix, core, suffix = match.groups()
        try:
            val = w2n.word_to_num(core)
        except ValueError:
            # Not a number word.
            continue
        # Only replace if it's a realistic YOE number to avoid noise
        if 0 <= val <= 30:
            pieces[i] = prefix + str(val) + suffix

    return "".join(pieces)

def extract_deep_features(text):
    """Extract years-of-experience heuristics and simple metadata flags.

    The text is lower-cased, number words are converted to digits, and the
    text is split into sections with ``parse_sections``. If the requirements
    section is longer than 50 characters only that section is scanned;
    otherwise the whole text is. Each line that does not mention a
    company-facts keyword is matched against a set of years-of-experience
    patterns.

    Args:
        text: Job-description text.

    Returns:
        A dict with the keys:
          min_yoe_found: smallest year value matched, or -1 when none matched.
          max_yoe_found: largest year value matched, or -1 when none matched.
          regex_count: number of pattern hits recorded. Patterns can overlap,
            so one phrase such as "at least 5 years of experience" may be
            counted more than once.
          has_explicit_yoe: 1 when at least one value was matched, else 0.
          extraction_quality: 0 when nothing matched; otherwise 1, plus 1 if
            a requirements section was used, plus 1 if regex_count > 1.
          in_req_section: 1 when the requirements section exceeds 50
            characters, else 0.
          has_phd, has_masters, is_manager: 1/0 substring flags computed over
            the whole lower-cased text.
    """
    text = text.lower()
    # Convert one line at a time and re-join with '\n' so the line breaks are
    # guaranteed to survive number conversion, however the helper treats
    # whitespace. The section parsing and per-line filtering below depend on
    # the text still being split into its original lines.
    text = "\n".join(convert_words_to_numbers(line) for line in text.split('\n'))

    sections = parse_sections(text)
    req_text = sections['requirements']
    in_req_section = len(req_text) > 50

    ignore_keywords = ['founded', 'history', 'ago', 'size', 'employees', 'offices', 'countries', 'revenue']
    # Anchor each keyword at a word start so that words merely containing one
    # (e.g. "chicago" -> "ago", "emphasize" -> "size") do not cause a genuine
    # requirement line to be skipped.
    ignore_re = re.compile(r"\b(?:" + "|".join(re.escape(k) for k in ignore_keywords) + r")")

    patterns = [
        #"3-5 years", "3 to 5 years"
        (r"(\d+)\s*(?:to|-)\s*(\d+)\s*(?:\+?\s*)?(?:years?|yrs?)", "range"),
        # "5+ years", "5 years+"
        (r"(\d+)\s*\+\s*(?:years?|yrs?)", "plus"),
        (r"(\d+)\s*(?:years?|yrs?)\s*\+", "plus"),
        # "at least 4 years", "minimum 6 yrs", "requires 3 years"
        (r"(?:at least|minimum of|minimum|min|around|roughly|requires?|preferred|prefer)\s*(\d+)\s*(?:years?|yrs?)", "min"),
        # "4 years of experience", "6 yrs experience"
        (r"(\d+)\s*(?:years?|yrs?)\s*(?:of)?\s*(?:professional|industry|relevant|applied|working|total)?\s*experience", "exp"),
        # "experience: 5 years", "experience with ... 3 years"
        (r"experience(?:\s+with|\s+in|\s*:)?\s*(\d+)\s*(?:\+?\s*)?(?:years?|yrs?)", "exp_prefix")
    ]

    min_found = []
    max_found = []
    search_scope = req_text if in_req_section else text

    for line in search_scope.split('\n'):
        # Lines about company facts ("founded 10 years ago") are not about
        # candidate experience.
        if ignore_re.search(line):
            continue

        for pattern, _label in patterns:
            for m in re.findall(pattern, line):
                if isinstance(m, tuple):
                    # Two capture groups: the range pattern.
                    start_val = int(m[0])
                    end_val = int(m[1])
                    if 0 <= start_val <= 25 and 0 <= end_val <= 30:
                        min_found.append(min(start_val, end_val))
                        max_found.append(max(start_val, end_val))
                else:
                    val = int(m)
                    if 0 <= val <= 25:
                        min_found.append(val)
                        max_found.append(val)

    # min_found and max_found are always appended to together, so one
    # emptiness check covers both.
    if min_found:
        primary_yoe = min(min_found)
        max_yoe = max(max_found)
    else:
        primary_yoe = -1
        max_yoe = -1

    regex_count = len(min_found)
    has_explicit_yoe = 1 if primary_yoe >= 0 else 0
    extraction_quality = 0
    if has_explicit_yoe:
        extraction_quality = 1
        if in_req_section:
            extraction_quality += 1
        if regex_count > 1:
            extraction_quality += 1

    return {
        'min_yoe_found': primary_yoe,
        'max_yoe_found': max_yoe,
        'regex_count': regex_count,
        'has_explicit_yoe': has_explicit_yoe,
        'extraction_quality': extraction_quality,
        'in_req_section': 1 if in_req_section else 0,
        'has_phd': 1 if 'phd' in text or 'doctorate' in text else 0,
        'has_masters': 1 if 'masters' in text or "master's" in text or ' mba ' in text else 0,
        'is_manager': 1 if any(k in text for k in ['manager', 'director', 'lead', 'principal', 'head of']) else 0
    }