| import re
|
| from word2number import w2n
|
|
|
def parse_sections(text):
    """Bucket the lines of a posting into requirement, company and other text.

    The text is walked line by line. A line containing one of the marker
    phrases (case-insensitive substring match) switches the active bucket,
    and that line itself is stored in the bucket it switched to. Lines seen
    before any marker go to 'other'.

    Returns a dict with the keys 'requirements', 'company' and 'other', each
    mapped to the newline-joined lines collected for that bucket.
    """
    # Order matters: requirement markers are tested first, so a line that
    # contains both kinds of marker is filed under 'requirements'.
    marker_table = (
        ('requirements', ('requirements', 'qualifications', 'what you bring',
                          'skills', 'who you are', 'ideal candidate',
                          'what we look for')),
        ('company', ('about us', 'who we are', 'company description',
                     'our mission', 'about the role')),
    )

    buckets = {'requirements': [], 'company': [], 'other': []}
    active = 'other'

    for raw_line in text.split('\n'):
        normalized = raw_line.lower().strip()
        for label, markers in marker_table:
            if any(marker in normalized for marker in markers):
                active = label
                break
        # The untouched line (original case and spacing) is what gets stored.
        buckets[active].append(raw_line)

    return {label: "\n".join(collected) for label, collected in buckets.items()}
|
|
|
def convert_words_to_numbers(text):
    """Replace written-out small numbers (e.g. 'twelve') with digits.

    Each whitespace-delimited token is reduced to its ASCII letters and
    handed to ``w2n.word_to_num``. When that yields a value between 0 and 30
    inclusive, the token's letters are replaced by the digits; any other
    token is returned untouched.

    All whitespace, including newlines, is preserved exactly, so callers can
    still split the result into lines. Punctuation attached to the start or
    end of a converted token is kept as well ('five+' -> '5+',
    '(three)' -> '(3)').

    Args:
        text: Input string.

    Returns:
        The input string with qualifying number words replaced by digits.
    """
    def _convert_token(match):
        token = match.group(0)
        letters = re.sub(r'[^a-zA-Z]', '', token)
        if not letters:
            # Pure digits/punctuation such as '5+' or '--': nothing to convert.
            return token
        try:
            val = w2n.word_to_num(letters)
        except ValueError:
            # Raised by w2n for tokens that are not number words.
            return token
        if not 0 <= val <= 30:
            # Larger values ('hundred', 'thousand', ...) are left as words.
            return token
        # Keep whatever non-letter characters wrapped the word. The two spans
        # cannot overlap because the token holds at least one letter.
        leading = re.match(r'[^a-zA-Z]*', token).group(0)
        trailing = re.search(r'[^a-zA-Z]*$', token).group(0)
        return leading + str(val) + trailing

    # Substituting token by token (instead of split()/join()) leaves every
    # whitespace character, newlines included, exactly where it was.
    return re.sub(r'\S+', _convert_token, text)
|
|
|
def extract_deep_features(text):
    """Extract years-of-experience heuristics and simple keyword flags.

    The text is lower-cased, number words are converted to digits, and the
    result is split into sections with ``parse_sections``. Years-of-experience
    patterns are then searched line by line, in the requirements section when
    it is longer than 50 characters and in the whole text otherwise. Lines
    containing any of the ignore keywords ('founded', 'employees', ...) are
    skipped.

    Args:
        text: Raw posting text.

    Returns:
        A dict with:
            min_yoe_found: smallest accepted value, or -1 if none matched.
            max_yoe_found: largest accepted value, or -1 if none matched.
            regex_count: number of accepted pattern matches.
            has_explicit_yoe: 1 if any value was found, else 0.
            extraction_quality: 0 when nothing was found; otherwise 1, plus 1
                if the requirements section is longer than 50 characters,
                plus 1 if more than one match was accepted.
            in_req_section: 1 if the requirements section is longer than 50
                characters, else 0.
            has_phd, has_masters, is_manager: 1/0 keyword flags computed on
                the whole lower-cased text.
    """
    text = text.lower()
    # Convert number words one line at a time and re-join on '\n'. Passing the
    # whole text through the converter in one call can flatten it to a single
    # line, which would defeat both parse_sections and the per-line ignore
    # filter below (one 'employees' anywhere would discard every match).
    text = "\n".join(convert_words_to_numbers(line) for line in text.split('\n'))

    sections = parse_sections(text)
    req_text = sections['requirements']
    in_req_section = len(req_text) > 50

    ignore_keywords = ['founded', 'history', 'ago', 'size', 'employees', 'offices', 'countries', 'revenue']

    patterns = [
        # "3 to 5 years", "3-5 yrs" -> two capture groups (start, end)
        (r"(\d+)\s*(?:to|-)\s*(\d+)\s*(?:\+?\s*)?(?:years?|yrs?)", "range"),
        # "5+ years" and "5 years+"
        (r"(\d+)\s*\+\s*(?:years?|yrs?)", "plus"),
        (r"(\d+)\s*(?:years?|yrs?)\s*\+", "plus"),
        # "at least 5 years", "minimum of 3 yrs", ...
        (r"(?:at least|minimum of|minimum|min|around|roughly|requires?|preferred|prefer)\s*(\d+)\s*(?:years?|yrs?)", "min"),
        # "5 years of relevant experience"
        (r"(\d+)\s*(?:years?|yrs?)\s*(?:of)?\s*(?:professional|industry|relevant|applied|working|total)?\s*experience", "exp"),
        # "experience: 5 years"
        (r"experience(?:\s+with|\s+in|\s*:)?\s*(\d+)\s*(?:\+?\s*)?(?:years?|yrs?)", "exp_prefix"),
    ]

    min_found = []
    max_found = []
    # Fall back to the full text when no usable requirements section exists.
    search_scope = req_text if in_req_section else text

    for line in search_scope.split('\n'):
        if any(k in line for k in ignore_keywords):
            continue

        for pattern, _label in patterns:
            for m in re.findall(pattern, line):
                if isinstance(m, tuple):
                    # Only the two-group range pattern yields tuples.
                    start_val = int(m[0])
                    end_val = int(m[1])
                    if 0 <= start_val <= 25 and 0 <= end_val <= 30:
                        min_found.append(min(start_val, end_val))
                        max_found.append(max(start_val, end_val))
                else:
                    val = int(m)
                    if 0 <= val <= 25:
                        min_found.append(val)
                        max_found.append(val)

    # min_found and max_found are always appended together, so one emptiness
    # check covers both.
    if min_found:
        primary_yoe = min(min_found)
        max_yoe = max(max_found)
    else:
        primary_yoe = -1
        max_yoe = -1

    regex_count = len(min_found)
    has_explicit_yoe = 1 if primary_yoe >= 0 else 0
    extraction_quality = 0
    if has_explicit_yoe:
        extraction_quality = 1
        if in_req_section:
            extraction_quality += 1
        if regex_count > 1:
            extraction_quality += 1

    # Word-boundary match so 'mba' is found at line ends and next to
    # punctuation ('mba,', '(mba)'), not only between two literal spaces.
    has_mba = re.search(r'\bmba\b', text) is not None

    return {
        'min_yoe_found': primary_yoe,
        'max_yoe_found': max_yoe,
        'regex_count': regex_count,
        'has_explicit_yoe': has_explicit_yoe,
        'extraction_quality': extraction_quality,
        'in_req_section': 1 if in_req_section else 0,
        'has_phd': 1 if 'phd' in text or 'doctorate' in text else 0,
        'has_masters': 1 if 'masters' in text or "master's" in text or has_mba else 0,
        'is_manager': 1 if any(k in text for k in ['manager', 'director', 'lead', 'principal', 'head of']) else 0,
    }