Spaces:

PatienceIzere
/

CandoramClassification

Sleeping

App Files Files Community

CandoramClassification / utils.py

PatienceIzere

Upload 12 files

96c59c3 verified 9 days ago

raw

history blame contribute delete

4.48 kB

	import re
	from word2number import w2n

	def parse_sections(text):
	    """Split a job posting into 'requirements', 'company' and 'other' text.

	    The text is walked line by line. A line containing a requirements
	    keyword (case-insensitive substring match) switches the current
	    section to 'requirements'; otherwise a line containing a company
	    keyword switches it to 'company'. Every line, including the one that
	    triggered the switch, is appended to whichever section is current.
	    Lines seen before any keyword go to 'other'.

	    Args:
	        text: Full posting text, with lines separated by '\\n'.

	    Returns:
	        A dict with the keys 'requirements', 'company' and 'other', each
	        mapping to the newline-joined lines collected for that section
	        (an empty string when nothing was collected).
	    """
	    req_keywords = ['requirements', 'qualifications', 'what you bring', 'skills', 'who you are', 'ideal candidate', 'what we look for']
	    company_keywords = ['about us', 'who we are', 'company description', 'our mission', 'about the role']

	    sections = {'requirements': [], 'company': [], 'other': []}
	    current_section = 'other'

	    for line in text.split('\n'):
	        line_clean = line.lower().strip()
	        # Requirements keywords win when a line contains both kinds.
	        if any(k in line_clean for k in req_keywords):
	            current_section = 'requirements'
	        elif any(k in line_clean for k in company_keywords):
	            current_section = 'company'
	        sections[current_section].append(line)

	    return {name: "\n".join(lines) for name, lines in sections.items()}

	def convert_words_to_numbers(text):
	    """Replace spelled-out small numbers (e.g. 'twelve') with digits.

	    Each whitespace-separated token is reduced to its ASCII letters and
	    passed to ``w2n.word_to_num``. When that yields a value between 0 and
	    30 inclusive, the letters are replaced by the digits; any other token
	    is left exactly as it was.

	    All whitespace (including newlines) is preserved, so callers that
	    process the result line by line still see the original line
	    structure. Punctuation before the first letter and after the last
	    letter of a replaced token is kept as well, so 'five+' becomes '5+'
	    rather than '5'.

	    Args:
	        text: Input text.

	    Returns:
	        The text with qualifying number words replaced by digits.
	    """
	    # A capturing split keeps the whitespace runs in the list, so joining
	    # the pieces back together reproduces the original spacing.
	    pieces = re.split(r'(\s+)', text)
	    for i, token in enumerate(pieces):
	        letters = re.sub(r'[^a-zA-Z]', '', token)
	        if not letters:
	            # Whitespace runs, pure digits and pure punctuation: nothing
	            # that could be a number word.
	            continue
	        try:
	            val = w2n.word_to_num(letters)
	        except ValueError:
	            continue
	        # Only replace if it's a realistic YOE number to avoid noise
	        if 0 <= val <= 30:
	            lead = re.match(r'[^a-zA-Z]*', token).group()
	            trail = re.search(r'[^a-zA-Z]*$', token).group()
	            pieces[i] = lead + str(val) + trail
	    return "".join(pieces)

	def extract_deep_features(text):
	    """Extract years-of-experience heuristics and keyword flags from a posting.

	    The text is lower-cased, spelled-out numbers are converted to digits,
	    and the result is split into sections with ``parse_sections``. Years
	    of experience (YOE) are then searched for with a set of regular
	    expressions, line by line, in the requirements section when it is
	    longer than 50 characters and in the whole text otherwise. Lines
	    containing any of the ignore keywords ('founded', 'employees', ...)
	    are skipped.

	    A single value is accepted when it lies in 0..25; a range is accepted
	    when its first number lies in 0..25 and its second in 0..30.

	    Args:
	        text: Full posting text.

	    Returns:
	        A dict with:
	            'min_yoe_found': smallest accepted value, or -1 if none.
	            'max_yoe_found': largest accepted value, or -1 if none.
	            'regex_count': number of accepted matches; a mention matched
	                by several patterns is counted once per pattern.
	            'has_explicit_yoe': 1 if any value was accepted, else 0.
	            'extraction_quality': 0 when nothing was found; otherwise 1,
	                plus 1 if the requirements section is longer than 50
	                characters, plus 1 if there was more than one match.
	            'in_req_section': 1 if the requirements section is longer
	                than 50 characters, else 0.
	            'has_phd', 'has_masters', 'is_manager': 1/0 keyword flags
	                computed over the whole text.
	    """
	    # Convert number words one line at a time so the newline structure
	    # survives; both the section parser and the ignore-keyword filter
	    # below work per line.
	    text = "\n".join(
	        convert_words_to_numbers(line) for line in text.lower().split('\n')
	    )

	    sections = parse_sections(text)
	    req_text = sections['requirements']
	    in_req_section = len(req_text) > 50

	    ignore_keywords = ['founded', 'history', 'ago', 'size', 'employees', 'offices', 'countries', 'revenue']

	    # Alternatives are separated by a bare `|` (an escaped `\|` would only
	    # match a literal pipe), and separators are `\s*` so that compact
	    # forms such as "3-5 years" and "5+ years" match.
	    patterns = [
	        # "3-5 years", "3 to 5 years"
	        (r"(\d+)\s*(?:to|-)\s*(\d+)\s*(?:\+?\s*)?(?:years?|yrs?)", "range"),
	        # "5+ years", "5 years+"
	        (r"(\d+)\s*\+\s*(?:years?|yrs?)", "plus"),
	        (r"(\d+)\s*(?:years?|yrs?)\s*\+", "plus"),
	        # "at least 4 years", "minimum 6 yrs", "requires 3 years"
	        (r"(?:at least|minimum of|minimum|min|around|roughly|requires?|preferred|prefer)\s*(\d+)\s*(?:years?|yrs?)", "min"),
	        # "4 years of experience", "6 yrs experience"
	        (r"(\d+)\s*(?:years?|yrs?)\s*(?:of)?\s*(?:professional|industry|relevant|applied|working|total)?\s*experience", "exp"),
	        # "experience: 5 years", "experience with ... 3 years"
	        (r"experience(?:\s+with|\s+in|\s*:)?\s*(\d+)\s*(?:\+?\s*)?(?:years?|yrs?)", "exp_prefix"),
	    ]

	    min_found = []
	    max_found = []
	    search_scope = req_text if in_req_section else text

	    for line in search_scope.split('\n'):
	        if any(k in line for k in ignore_keywords):
	            continue

	        for pattern, _label in patterns:
	            # findall yields a tuple for the two-group range pattern and
	            # a plain string for the single-group patterns.
	            for m in re.findall(pattern, line):
	                if isinstance(m, tuple):
	                    start_val = int(m[0])
	                    end_val = int(m[1])
	                    if 0 <= start_val <= 25 and 0 <= end_val <= 30:
	                        min_found.append(min(start_val, end_val))
	                        max_found.append(max(start_val, end_val))
	                else:
	                    val = int(m)
	                    if 0 <= val <= 25:
	                        min_found.append(val)
	                        max_found.append(val)

	    if min_found:
	        primary_yoe = min(min_found)
	        max_yoe = max(max_found)
	    else:
	        primary_yoe = -1
	        max_yoe = -1

	    regex_count = len(min_found)
	    has_explicit_yoe = 1 if primary_yoe >= 0 else 0
	    extraction_quality = 0
	    if has_explicit_yoe:
	        extraction_quality = 1
	        if in_req_section:
	            extraction_quality += 1
	        if regex_count > 1:
	            extraction_quality += 1

	    # Keyword flags are checked on a copy with every whitespace run
	    # collapsed to one space, so space-delimited keys such as ' mba ' and
	    # 'head of' still match when the words sit next to a line break.
	    flat_text = " ".join(text.split())

	    return {
	        'min_yoe_found': primary_yoe,
	        'max_yoe_found': max_yoe,
	        'regex_count': regex_count,
	        'has_explicit_yoe': has_explicit_yoe,
	        'extraction_quality': extraction_quality,
	        'in_req_section': 1 if in_req_section else 0,
	        'has_phd': 1 if 'phd' in flat_text or 'doctorate' in flat_text else 0,
	        'has_masters': 1 if 'masters' in flat_text or "master's" in flat_text or ' mba ' in flat_text else 0,
	        'is_manager': 1 if any(k in flat_text for k in ['manager', 'director', 'lead', 'principal', 'head of']) else 0
	    }