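# NOTE: the next lines appear to be page-header residue captured with the
# file (uploader avatar text, commit message, commit id), not module code.
# PatienceIzere's picture
# Upload 12 files
# 96c59c3 verified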
import re
from word2number import w2n
def parse_sections(text, req_keywords=None, company_keywords=None):
    """Split posting text into requirement, company and other sections.

    The text is scanned line by line (newline-separated). A line that
    contains any requirement keyword switches the current section to
    ``'requirements'``; otherwise a line that contains any company keyword
    switches it to ``'company'``. Each line, including the one that caused
    the switch, is then appended to whichever section is current. Lines seen
    before the first keyword land in ``'other'``.

    Matching is a case-insensitive substring test on the whole line, so a
    keyword that appears inside an ordinary sentence also switches sections.

    Args:
        text: The posting text.
        req_keywords: Optional iterable of phrases that start the
            requirements section. Defaults to the built-in list.
        company_keywords: Optional iterable of phrases that start the
            company section. Defaults to the built-in list.

    Returns:
        A dict with the keys ``'requirements'``, ``'company'`` and
        ``'other'``; each value is that section's lines, in original order
        and casing, joined with newlines (empty string if no lines).
    """
    if req_keywords is None:
        req_keywords = (
            'requirements', 'qualifications', 'what you bring', 'skills',
            'who you are', 'ideal candidate', 'what we look for',
        )
    if company_keywords is None:
        company_keywords = (
            'about us', 'who we are', 'company description', 'our mission',
            'about the role',
        )

    # Lines are lowercased before matching, so normalise the keywords the
    # same way; drop empty phrases, which would otherwise match every line.
    req_keywords = [k.lower() for k in req_keywords if k]
    company_keywords = [k.lower() for k in company_keywords if k]

    sections = {'requirements': [], 'company': [], 'other': []}
    current_section = 'other'

    for line in text.split('\n'):
        line_clean = line.lower().strip()
        # Requirement keywords take precedence when a line matches both lists.
        if any(k in line_clean for k in req_keywords):
            current_section = 'requirements'
        elif any(k in line_clean for k in company_keywords):
            current_section = 'company'
        sections[current_section].append(line)

    return {name: "\n".join(lines) for name, lines in sections.items()}
def convert_words_to_numbers(text):
    """Replace single written-out numbers (e.g. 'twelve') with digits.

    Each whitespace-delimited token is stripped of non-letters and handed to
    ``w2n.word_to_num``. When that yields a value from 0 to 30 the token is
    replaced by the digits; anything else is left untouched.

    All whitespace in ``text`` (including newlines) is preserved, and so is
    punctuation attached to the start or end of a converted token, so
    'five+' becomes '5+' rather than '5'. Callers that split the result into
    lines therefore still see the original line structure.

    Args:
        text: Text to scan.

    Returns:
        The text with qualifying number words replaced by digit strings.
    """
    # The capturing group makes re.split keep the whitespace runs, so joining
    # the pieces back together reproduces the original spacing exactly.
    pieces = re.split(r'(\s+)', text)

    for i, token in enumerate(pieces):
        letters = re.sub(r'[^a-zA-Z]', '', token)
        if not letters:
            # Whitespace separators, pure digits, pure punctuation.
            continue
        # NOTE(review): word2number appears to evaluate a bare "point" as 0
        # (decimal marker with no digits) - confirm against the library.
        # Skipped so the ordinary word "point" is never rewritten to "0".
        if letters.lower() == 'point':
            continue
        try:
            val = w2n.word_to_num(letters)
        except ValueError:
            continue
        # Only replace if it's a realistic YOE number to avoid noise.
        if 0 <= val <= 30:
            lead = re.match(r'[^a-zA-Z0-9]*', token).group()
            trail = re.search(r'[^a-zA-Z0-9]*$', token).group()
            pieces[i] = lead + str(val) + trail

    return "".join(pieces)
def extract_deep_features(text):
    """Extract years-of-experience heuristics and metadata from a posting.

    The text is lowercased, number words are converted to digits, and the
    result is split into sections with ``parse_sections``. Years-of-experience
    (YOE) mentions are then searched line by line, in the requirements section
    when it holds more than 50 characters and in the full text otherwise.
    Lines containing any of the ignore keywords (company-history style
    wording) are skipped.

    Args:
        text: Raw posting text.

    Returns:
        A dict with:
            min_yoe_found: smallest YOE value matched, or -1 if none.
            max_yoe_found: largest YOE value matched, or -1 if none.
            regex_count: number of accepted regex matches. One phrase can be
                matched by several patterns, so this counts matches rather
                than distinct mentions.
            has_explicit_yoe: 1 if any YOE value was matched, else 0.
            extraction_quality: 0 when nothing matched; otherwise 1, plus 1
                if the requirements section exceeds 50 characters, plus 1 if
                there was more than one match.
            in_req_section: 1 if the requirements section exceeds 50
                characters, else 0.
            has_phd, has_masters, is_manager: 1/0 substring flags computed on
                the whole lowercased text.
    """
    text = text.lower()
    # Convert number words one line at a time and re-join with newlines.
    # Running the converter over the whole text at once can collapse every
    # line break, which would leave parse_sections and the per-line ignore
    # filter below with a single giant line to work on.
    text = "\n".join(
        convert_words_to_numbers(line) for line in text.split('\n')
    )

    sections = parse_sections(text)
    req_text = sections['requirements']
    # A very short requirements section is treated as "not found".
    in_req_section = len(req_text) > 50

    ignore_keywords = ['founded', 'history', 'ago', 'size', 'employees', 'offices', 'countries', 'revenue']

    patterns = [
        # "3-5 years", "3 to 5 years"
        re.compile(r"(\d+)\s*(?:to|-)\s*(\d+)\s*(?:\+?\s*)?(?:years?|yrs?)"),
        # "5+ years", "5 years+"
        re.compile(r"(\d+)\s*\+\s*(?:years?|yrs?)"),
        re.compile(r"(\d+)\s*(?:years?|yrs?)\s*\+"),
        # "at least 4 years", "minimum 6 yrs", "requires 3 years"
        re.compile(r"(?:at least|minimum of|minimum|min|around|roughly|requires?|preferred|prefer)\s*(\d+)\s*(?:years?|yrs?)"),
        # "4 years of experience", "6 yrs experience"
        re.compile(r"(\d+)\s*(?:years?|yrs?)\s*(?:of)?\s*(?:professional|industry|relevant|applied|working|total)?\s*experience"),
        # "experience: 5 years", "experience with ... 3 years"
        re.compile(r"experience(?:\s+with|\s+in|\s*:)?\s*(\d+)\s*(?:\+?\s*)?(?:years?|yrs?)"),
    ]

    # Kept in lockstep: every accepted match appends to both lists.
    min_found = []
    max_found = []

    search_scope = req_text if in_req_section else text

    for line in search_scope.split('\n'):
        # Substring test: skips lines that look like company facts
        # ("founded 20 years ago") rather than candidate requirements.
        if any(k in line for k in ignore_keywords):
            continue

        for pattern in patterns:
            for m in pattern.findall(line):
                if isinstance(m, tuple):
                    # Two-group (range) pattern: findall yields a tuple.
                    start_val = int(m[0])
                    end_val = int(m[1])
                    if 0 <= start_val <= 25 and 0 <= end_val <= 30:
                        min_found.append(min(start_val, end_val))
                        max_found.append(max(start_val, end_val))
                else:
                    # Single-group pattern: findall yields the captured string.
                    val = int(m)
                    if 0 <= val <= 25:
                        min_found.append(val)
                        max_found.append(val)

    if min_found:
        primary_yoe = min(min_found)
        max_yoe = max(max_found)
    else:
        # -1 is the "nothing found" sentinel for both bounds.
        primary_yoe = -1
        max_yoe = -1

    regex_count = len(min_found)
    has_explicit_yoe = 1 if primary_yoe >= 0 else 0

    extraction_quality = 0
    if has_explicit_yoe:
        extraction_quality = 1
        if in_req_section:
            extraction_quality += 1
        if regex_count > 1:
            extraction_quality += 1

    return {
        'min_yoe_found': primary_yoe,
        'max_yoe_found': max_yoe,
        'regex_count': regex_count,
        'has_explicit_yoe': has_explicit_yoe,
        'extraction_quality': extraction_quality,
        'in_req_section': 1 if in_req_section else 0,
        'has_phd': 1 if 'phd' in text or 'doctorate' in text else 0,
        'has_masters': 1 if 'masters' in text or "master's" in text or ' mba ' in text else 0,
        'is_manager': 1 if any(k in text for k in ['manager', 'director', 'lead', 'principal', 'head of']) else 0
    }