import re
from typing import Dict, Optional

try:
    from word2number import w2n
except ImportError:
    # word2number is treated as optional: only single words whose value is in
    # 0..30 are ever accepted below, and _SMALL_NUMBER_WORDS covers all of them.
    w2n = None


# A line containing any of these substrings switches the current section.
_REQ_KEYWORDS = ('requirements', 'qualifications', 'what you bring', 'skills',
                 'who you are', 'ideal candidate', 'what we look for')
_COMPANY_KEYWORDS = ('about us', 'who we are', 'company description',
                     'our mission', 'about the role')

# Lines containing any of these substrings are skipped when searching for
# years of experience (they tend to describe the company, not the candidate).
_IGNORE_KEYWORDS = ('founded', 'history', 'ago', 'size', 'employees',
                    'offices', 'countries', 'revenue')

# Every single English number word whose value lies in 0..30.
_SMALL_NUMBER_WORDS = {
    'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
    'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10, 'eleven': 11,
    'twelve': 12, 'thirteen': 13, 'fourteen': 14, 'fifteen': 15,
    'sixteen': 16, 'seventeen': 17, 'eighteen': 18, 'nineteen': 19,
    'twenty': 20, 'thirty': 30,
}

# A standalone alphabetic word. The look-arounds leave hyphenated word
# compounds such as "twenty-five" untouched, so they are never turned into a
# fake "20-5" range.
_NUMBER_WORD_RE = re.compile(r"(?<![A-Za-z]-)\b[A-Za-z]+\b(?!-[A-Za-z])")

# (compiled pattern, kind). Only the "range" pattern has two capture groups.
# NOTE(review): the patterns overlap (e.g. "at least 4 years of experience"
# matches both "min" and "exp"), so one mention can be counted more than once
# in regex_count. Left as-is to keep feature values stable.
_YOE_PATTERNS = tuple(
    (re.compile(pattern), kind)
    for pattern, kind in (
        # "3-5 years", "3 to 5 years"
        (r"(\d+)\s*(?:to|-)\s*(\d+)\s*(?:\+?\s*)?(?:years?|yrs?)", "range"),
        # "5+ years", "5 years+"
        (r"(\d+)\s*\+\s*(?:years?|yrs?)", "plus"),
        (r"(\d+)\s*(?:years?|yrs?)\s*\+", "plus"),
        # "at least 4 years", "minimum 6 yrs", "requires 3 years"
        (r"(?:at least|minimum of|minimum|min|around|roughly|requires?|preferred|prefer)\s*(\d+)\s*(?:years?|yrs?)", "min"),
        # "4 years of experience", "6 yrs experience"
        (r"(\d+)\s*(?:years?|yrs?)\s*(?:of)?\s*(?:professional|industry|relevant|applied|working|total)?\s*experience", "exp"),
        # "experience: 5 years", "experience with ... 3 years"
        (r"experience(?:\s+with|\s+in|\s*:)?\s*(\d+)\s*(?:\+?\s*)?(?:years?|yrs?)", "exp_prefix"),
    )
)


def parse_sections(text: str) -> Dict[str, str]:
    """Split text into sections to isolate requirements from company fluff.

    The text is walked line by line. A line containing a requirements keyword
    switches the current section to 'requirements'; otherwise a line
    containing a company keyword switches it to 'company'. Each line
    (including the heading line itself) is stored under the section that is
    current after that check. Lines before any keyword go to 'other'.

    Returns a dict with keys 'requirements', 'company' and 'other', each
    mapping to that section's lines joined with newlines.
    """
    sections = {'requirements': [], 'company': [], 'other': []}
    current_section = 'other'
    for line in text.split('\n'):
        line_clean = line.lower().strip()
        if any(k in line_clean for k in _REQ_KEYWORDS):
            current_section = 'requirements'
        elif any(k in line_clean for k in _COMPANY_KEYWORDS):
            current_section = 'company'
        sections[current_section].append(line)
    return {name: "\n".join(lines) for name, lines in sections.items()}


def _small_number_for(word: str) -> Optional[int]:
    """Return the value of a single number word if it is in 0..30, else None."""
    lowered = word.lower()
    # word2number appears to map a lone "point" to 0, which would corrupt
    # ordinary prose ("point of contact"); skipping it is harmless either way.
    if lowered == 'point':
        return None
    if w2n is None:
        return _SMALL_NUMBER_WORDS.get(lowered)
    try:
        value = w2n.word_to_num(lowered)
    except ValueError:
        return None
    # Only accept realistic years-of-experience values to avoid noise.
    return value if 0 <= value <= 30 else None


def convert_words_to_numbers(text: str) -> str:
    """Convert written numbers (e.g. 'twelve') to digits, in place.

    Only standalone number words with a value in 0..30 are replaced. All
    whitespace (including newlines) and surrounding punctuation are kept, so
    "five+ years" becomes "5+ years" and line structure survives for
    line-based processing. Hyphenated word compounds ("twenty-five") are left
    unchanged.
    """
    def _replace(match):
        value = _small_number_for(match.group(0))
        return match.group(0) if value is None else str(value)

    return _NUMBER_WORD_RE.sub(_replace, text)


def extract_deep_features(text: str) -> Dict[str, int]:
    """Extract years-of-experience heuristics and keyword flags from a posting.

    The text is lower-cased, number words are converted to digits, and the
    requirements section is isolated. Years-of-experience patterns are
    searched line by line in the requirements section when it is longer than
    50 characters, otherwise in the whole text. Lines containing an ignore
    keyword are skipped.

    Returns a dict with:
      min_yoe_found / max_yoe_found: smallest / largest accepted value, or -1
        when nothing matched.
      regex_count: number of accepted pattern matches.
      has_explicit_yoe: 1 if any value was found, else 0.
      extraction_quality: 0 if nothing was found; otherwise 1, plus 1 when the
        requirements section is longer than 50 characters, plus 1 when more
        than one match was accepted.
      in_req_section: 1 if the requirements section is longer than 50
        characters, else 0.
      has_phd, has_masters, is_manager: 1/0 substring flags on the full text.
    """
    text = convert_words_to_numbers(text.lower())
    req_text = parse_sections(text)['requirements']
    in_req_section = len(req_text) > 50
    search_scope = req_text if in_req_section else text

    min_found = []
    max_found = []
    for line in search_scope.split('\n'):
        if any(k in line for k in _IGNORE_KEYWORDS):
            continue
        for pattern, kind in _YOE_PATTERNS:
            for match in pattern.findall(line):
                if kind == "range":
                    start_val, end_val = int(match[0]), int(match[1])
                    if 0 <= start_val <= 25 and 0 <= end_val <= 30:
                        min_found.append(min(start_val, end_val))
                        max_found.append(max(start_val, end_val))
                else:
                    val = int(match)
                    if 0 <= val <= 25:
                        min_found.append(val)
                        max_found.append(val)

    if min_found:
        primary_yoe = min(min_found)
        max_yoe = max(max_found)
    else:
        primary_yoe = -1
        max_yoe = -1

    regex_count = len(min_found)
    has_explicit_yoe = 1 if primary_yoe >= 0 else 0

    extraction_quality = 0
    if has_explicit_yoe:
        extraction_quality = 1
        if in_req_section:
            extraction_quality += 1
        if regex_count > 1:
            extraction_quality += 1

    # Keyword flags are evaluated on whitespace-collapsed text so that
    # space-sensitive needles (' mba ', 'head of') still match across line
    # breaks.
    flat_text = " ".join(text.split())

    return {
        'min_yoe_found': primary_yoe,
        'max_yoe_found': max_yoe,
        'regex_count': regex_count,
        'has_explicit_yoe': has_explicit_yoe,
        'extraction_quality': extraction_quality,
        'in_req_section': 1 if in_req_section else 0,
        'has_phd': 1 if 'phd' in flat_text or 'doctorate' in flat_text else 0,
        'has_masters': 1 if 'masters' in flat_text or "master's" in flat_text or ' mba ' in flat_text else 0,
        'is_manager': 1 if any(k in flat_text for k in ['manager', 'director', 'lead', 'principal', 'head of']) else 0,
    }