Spaces:

ketannnn
/

coderound

Sleeping

App Files Files Community

coderound / backend /src /ml /feature_builder.py

ketannnn

fix: growth_velocity always 60% bug - handle JSON string and empty work_exp

4b9553f 27 days ago

raw

history blame contribute delete

9.78 kB

	import re
	import math
	from typing import Any


	# Job-title keyword -> numeric seniority level (0 is the most junior
	# level in this table, 8 the most senior).
	SENIORITY_MAP = {
	    "intern": 0,
	    "trainee": 0,
	    "junior": 1,
	    "associate": 1,
	    "mid": 2,
	    "senior": 3,
	    "lead": 4,
	    "staff": 4,
	    "principal": 5,
	    "architect": 5,
	    "manager": 4,
	    "director": 6,
	    "vp": 7,
	    "cto": 8,
	}

	# Lower-case institution keywords that earn the tier-1 bonus in
	# education_match.
	TIER1_EDU = {
	    "iit",
	    "iim",
	    "nit",
	    "bits",
	    "iiit",
	    "mit",
	    "stanford",
	    "cmu",
	    "berkeley",
	}


	def build_candidate_text(candidate: dict[str, Any]) -> str:
	    """Flatten a candidate record into a single " | "-separated string.

	    Sections are emitted in this order, each only when it has content:
	    summary, skills, programming languages, backend + frontend frameworks,
	    the first three work-experience entries ("company: description"), and
	    the most recent company's description.

	    List-valued fields may arrive as a real list, as a JSON-encoded string,
	    or as a single plain string; all three are accepted.

	    Args:
	        candidate: Candidate record keyed by field name.

	    Returns:
	        The joined text, or "" when no section has content.
	    """
	    import json

	    def _as_list(value: Any) -> list:
	        """Coerce a missing, scalar, or JSON-encoded field to a list."""
	        if not value:
	            return []
	        if isinstance(value, (list, tuple)):
	            return list(value)
	        if isinstance(value, str):
	            try:
	                decoded = json.loads(value)
	            except ValueError:
	                return [value]
	            # A JSON scalar/object is not a list of items: keep the raw text.
	            return decoded if isinstance(decoded, list) else [value]
	        return [value]

	    parts: list[str] = []

	    summary = candidate.get("parsed_summary")
	    if summary:
	        parts.append(str(summary))

	    skills = candidate.get("parsed_skills")
	    if skills:
	        if isinstance(skills, (list, tuple)):
	            # Avoid embedding a Python list repr in the text.
	            skills = ", ".join(str(s) for s in skills if s)
	        parts.append(f"Skills: {skills}")

	    langs = [str(x) for x in _as_list(candidate.get("programming_languages")) if x]
	    if langs:
	        parts.append(f"Languages: {', '.join(langs)}")

	    frameworks = [
	        str(x)
	        for x in (
	            _as_list(candidate.get("backend_frameworks"))
	            + _as_list(candidate.get("frontend_technologies"))
	        )
	        if x
	    ]
	    if frameworks:
	        parts.append(f"Frameworks: {', '.join(frameworks)}")

	    for we in _as_list(candidate.get("parsed_work_experience"))[:3]:
	        if not isinstance(we, dict):
	            continue
	        desc = we.get("description") or we.get("role") or ""
	        company = we.get("company") or ""
	        if desc or company:
	            # strip(": ") removes the dangling separator when one side is empty.
	            parts.append(f"{company}: {desc}".strip(": "))

	    company_desc = candidate.get("most_recent_company_description")
	    if company_desc:
	        parts.append(str(company_desc))

	    return " | ".join(filter(None, parts))


	def _parse_duration_months(entry: dict) -> float:
	duration = entry.get("duration") or entry.get("tenure") or ""
	if not duration:
	return 12.0
	years = re.findall(r"(\d+\.?\d)\s(?:year\|yr)", duration, re.IGNORECASE)
	months = re.findall(r"(\d+\.?\d)\s(?:month\|mo)", duration, re.IGNORECASE)
	total = sum(float(y) * 12 for y in years) + sum(float(m) for m in months)
	return total if total > 0 else 12.0


	def _extract_seniority(title: str) -> int:
	    """Map a job title to a numeric seniority level using SENIORITY_MAP.

	    Keywords are tried from the highest level down, so a title carrying
	    several keywords resolves to the most senior one. Keywords must match
	    as whole words: plain substring matching would classify "Director" as
	    "cto" (the letters c-t-o occur inside it) and "International" as
	    "intern".

	    Args:
	        title: Free-text job title; an empty or missing title is accepted.

	    Returns:
	        The level of the first matching keyword, or 2 when none matches.
	    """
	    title_lower = str(title or "").lower()
	    for key, val in sorted(SENIORITY_MAP.items(), key=lambda kv: -kv[1]):
	        if re.search(rf"\b{re.escape(key)}\b", title_lower):
	            return val
	    return 2


	def compute_growth_velocity(work_experience: list[dict], is_funded: bool = False) -> float:
	    """Score how fast a candidate's seniority has grown, in [0.0, 1.0].

	    The score is the seniority gained between the first and last entry
	    (ordered by ``start_date``) divided by the total years covered, then
	    mapped through ``(velocity + 1) / 3`` and clamped to [0, 1]. A funded
	    company adds 0.1, capped at 1.0.

	    Args:
	        work_experience: List of work-experience dicts. A JSON-encoded
	            string is also accepted and decoded. Entries without a
	            ``title`` or ``role`` are ignored.
	        is_funded: Whether the candidate's company is funded.

	    Returns:
	        The normalised score rounded to 4 places. When fewer than two
	        usable entries exist there is nothing to compare, so a neutral
	        0.5 (0.6 if ``is_funded``) is returned.
	    """
	    import json as _json

	    # The field sometimes arrives as a JSON string that was never parsed.
	    if isinstance(work_experience, str):
	        try:
	            work_experience = _json.loads(work_experience)
	        except ValueError:  # json.JSONDecodeError subclasses ValueError
	            work_experience = []

	    # JSON such as "5" or "{}" decodes to a non-list; treat it as no data
	    # instead of failing when iterating over it.
	    if not isinstance(work_experience, (list, tuple)):
	        work_experience = []

	    valid_entries = [
	        e for e in work_experience
	        if isinstance(e, dict) and (e.get("title") or e.get("role"))
	    ]

	    if len(valid_entries) < 2:
	        return 0.6 if is_funded else 0.5

	    # str() keeps the sort from raising when start_date types are mixed.
	    # NOTE(review): this is a plain string sort, so it is chronological only
	    # for sortable formats such as ISO dates; confirm the upstream format.
	    entries = sorted(valid_entries, key=lambda e: str(e.get("start_date") or ""))

	    seniority_levels = [
	        _extract_seniority(str(e.get("title") or e.get("role") or ""))
	        for e in entries
	    ]
	    total_months = sum(_parse_duration_months(e) for e in entries)

	    seniority_gain = seniority_levels[-1] - seniority_levels[0]
	    # Floor at half a year so very short histories cannot inflate velocity.
	    years_elapsed = max(total_months / 12, 0.5)
	    velocity = seniority_gain / years_elapsed

	    # velocity -1 maps to 0.0 and +2 maps to 1.0.
	    normalized = min(max((velocity + 1) / 3, 0.0), 1.0)

	    if is_funded:
	        normalized = min(normalized + 0.1, 1.0)

	    return round(normalized, 4)


	def skill_jaccard(jd_skills: list[str], candidate_skills: list[str]) -> float:
	    """Jaccard similarity between JD skills and candidate skills.

	    Skills are compared case-insensitively with surrounding whitespace
	    removed; blank and non-string items are ignored.

	    Args:
	        jd_skills: Skills required by the job description.
	        candidate_skills: Skills listed by the candidate (may be None).

	    Returns:
	        ``|intersection| / |union|`` in [0.0, 1.0]. Returns the neutral 0.5
	        when the JD lists no usable skills, and 0.0 when the JD has skills
	        but the candidate lists none.
	    """
	    def _normalise(skills) -> set[str]:
	        return {
	            s.strip().lower()
	            for s in (skills or [])
	            if isinstance(s, str) and s.strip()
	        }

	    jd_set = _normalise(jd_skills)
	    if not jd_set:
	        return 0.5

	    cand_set = _normalise(candidate_skills)
	    if not cand_set:
	        return 0.0

	    # The union is non-empty here because jd_set is non-empty.
	    return len(jd_set & cand_set) / len(jd_set | cand_set)


	def yoe_match(min_yoe: float \| None, max_yoe: float \| None, candidate_yoe: float \| None) -> float:
	if candidate_yoe is None:
	return 0.5
	if min_yoe is None and max_yoe is None:
	return 0.7
	candidate_yoe = float(candidate_yoe)
	if min_yoe is not None and candidate_yoe < min_yoe:
	gap = min_yoe - candidate_yoe
	return max(0.0, 1.0 - gap * 0.2)
	if max_yoe is not None and candidate_yoe > max_yoe + 3:
	return 0.7
	return 1.0


	def company_quality_signal(candidate: dict[str, Any]) -> float:
	    """Score the candidate's most recent company, in [0.5, 1.0].

	    Starts at 0.5 and adds:
	      * 0.2  if ``most_recent_company_is_product_company`` is truthy,
	      * 0.15 if ``most_recent_company_is_funded`` is truthy,
	      * 0.1  if total funding exceeds 10,000,000,
	      * 0.05 more if total funding exceeds 100,000,000.

	    Args:
	        candidate: Candidate record keyed by field name.

	    Returns:
	        The accumulated score, capped at 1.0.
	    """
	    score = 0.5
	    if candidate.get("most_recent_company_is_product_company"):
	        score += 0.2
	    if candidate.get("most_recent_company_is_funded"):
	        score += 0.15

	    # Funding may arrive as a numeric string; comparing str with int would
	    # raise TypeError, so coerce and treat unparseable values as no funding.
	    try:
	        funding = float(candidate.get("most_recent_company_total_funding") or 0)
	    except (TypeError, ValueError):
	        funding = 0.0

	    if funding > 10_000_000:
	        score += 0.1
	    if funding > 100_000_000:
	        score += 0.05
	    return min(score, 1.0)


	def education_match(candidate: dict[str, Any]) -> float:
	    """Score a candidate's education, in [0.5, 1.0].

	    The base score comes from the ``degree`` text: 0.6 for a bachelor's
	    (bachelor / B.Tech / BE), 0.8 for a master's (master / M.Tech / MBA),
	    0.9 for a doctorate (PhD / doctorate), otherwise 0.5. The highest
	    matching level wins. A one-off 0.15 bonus, capped at 1.0, is added when
	    a TIER1_EDU keyword appears in ``degree`` or ``education_status``.

	    Args:
	        candidate: Candidate record keyed by field name.

	    Returns:
	        The education score.
	    """
	    degree = str(candidate.get("degree") or "").lower()
	    status = str(candidate.get("education_status") or "").lower()

	    score = 0.5
	    # "BE" must be a whole word: a plain substring test misses a bare "BE"
	    # or "B.E." and matches inside unrelated words.
	    if "bachelor" in degree or "b.tech" in degree or re.search(r"\bb\.?e\b", degree):
	        score = 0.6
	    if "master" in degree or "m.tech" in degree or "mba" in degree:
	        score = 0.8
	    if "phd" in degree or "doctorate" in degree:
	        score = 0.9

	    # Keywords must start a word, otherwise "nit" fires on "community" and
	    # "mit" on "submitted". One trailing letter is tolerated so fused campus
	    # abbreviations (e.g. "IITB", "NITK") still count.
	    alternatives = "|".join(re.escape(uni) for uni in sorted(TIER1_EDU))
	    tier1 = re.compile(rf"\b(?:{alternatives})(?![a-z]{{2,}})")
	    if tier1.search(degree) or tier1.search(status):
	        score = min(score + 0.15, 1.0)
	    return score


	def compute_jd_quality(raw_text: str, parsed: dict[str, Any], candidate_count: int = 0) -> dict[str, Any]:
	    """Assess how specific and self-consistent a job description is.

	    Args:
	        raw_text: The job description text (None is treated as empty).
	        parsed: Parsed requirements; reads ``required_skills`` and
	            ``min_yoe`` (None is treated as empty).
	        candidate_count: Number of candidates being matched; the breadth
	            check only applies when this is positive.

	    Returns:
	        A dict with:
	          * ``overall``: "poor" when the JD is vague (> 0.6), contradictory
	            or too broad; "fair" when vagueness > 0.35; otherwise "good".
	          * ``vagueness_score``: 0.2 / 0.5 / 0.75 / 1.0 for >=5 / >=3 /
	            >=1 / 0 required skills, plus 0.3 (capped at 1.0) when the
	            text has fewer than 50 words.
	          * ``breadth_score``: 0.9 when there are candidates and fewer
	            than 2 required skills, else 0.0.
	          * ``skill_count``, ``contradictions``, ``warnings``.
	    """
	    text = raw_text or ""
	    text_lower = text.lower()
	    parsed = parsed or {}

	    skill_count = len(parsed.get("required_skills") or [])

	    if skill_count >= 5:
	        vagueness_score = 0.2
	    elif skill_count >= 3:
	        vagueness_score = 0.5
	    elif skill_count >= 1:
	        vagueness_score = 0.75
	    else:
	        vagueness_score = 1.0

	    if len(text.split()) < 50:
	        vagueness_score = min(vagueness_score + 0.3, 1.0)

	    contradictions = []
	    min_yoe = parsed.get("min_yoe")
	    # Compare against None, not truthiness: min_yoe == 0 is a valid
	    # entry-level requirement and must still be checked.
	    if min_yoe is not None:
	        if min_yoe >= 5 and "junior" in text_lower:
	            contradictions.append("Requires 5+ YOE but mentions junior role")
	        if min_yoe <= 1 and "senior" in text_lower:
	            contradictions.append("Entry-level YOE but expects senior candidate")

	    breadth_score = 0.9 if candidate_count > 0 and skill_count < 2 else 0.0

	    too_vague = vagueness_score > 0.6
	    too_broad = breadth_score > 0.7

	    warnings = []
	    if too_vague:
	        warnings.append("JD is too vague — add more specific skill requirements for better match quality")
	    if contradictions:
	        warnings.append(f"Contradictions detected: {'; '.join(contradictions)}")
	    if too_broad:
	        warnings.append("Requirements are too broad — almost all candidates will match")

	    if too_vague or contradictions or too_broad:
	        overall = "poor"
	    elif vagueness_score > 0.35:
	        overall = "fair"
	    else:
	        overall = "good"

	    return {
	        "overall": overall,
	        "vagueness_score": round(vagueness_score, 3),
	        "breadth_score": round(breadth_score, 3),
	        "skill_count": skill_count,
	        "contradictions": contradictions,
	        "warnings": warnings,
	    }


	def parse_jd_requirements(raw_text: str) -> dict[str, Any]:
	    """Extract structured requirements from free-text job description.

	    Everything is keyword/regex based; no field is guaranteed to be found.

	    Args:
	        raw_text: The job description text (None is treated as empty).

	    Returns:
	        A dict with:
	          * ``required_skills``: lower-cased known skill keywords, without
	            duplicates, grouped by category in the order listed below.
	          * ``min_yoe`` / ``max_yoe``: floats from the first
	            "N years experience" or "N-M years experience" phrase, else
	            None (``max_yoe`` is only set for a range).
	          * ``role_type``: "full-time", "contract", "part-time" or None.
	          * ``engineer_type``: "backend", "frontend", "fullstack", "ai",
	            "data" or None (first matching category in that order).
	          * ``remote_allowed``: True when the word "remote" appears.
	          * ``location``: the first known place name mentioned, else None.
	    """
	    text = raw_text or ""

	    skill_groups = [
	        r"python|javascript|typescript|java|go|golang|rust|c\+\+|ruby|php|scala|kotlin|swift",
	        r"react|angular|vue|nextjs|fastapi|django|flask|express|springboot|rails",
	        r"postgresql|mysql|mongodb|redis|elasticsearch|kafka|rabbitmq|cassandra",
	        r"aws|gcp|azure|docker|kubernetes|terraform|ansible|ci/cd|devops",
	        r"machine learning|deep learning|nlp|llm|rag|vector|embedding|pytorch|tensorflow",
	        r"sql|nosql|graphql|rest|grpc|microservices|api",
	    ]
	    skills: list[str] = []
	    for group in skill_groups:
	        # Lookarounds instead of \b: \b cannot match after the "+" of
	        # "c++" when it is followed by a space or punctuation.
	        pattern = rf"(?<!\w)(?:{group})(?!\w)"
	        skills.extend(m.lower() for m in re.findall(pattern, text, re.IGNORECASE))
	    skills = list(dict.fromkeys(skills))  # de-duplicate, keep first-seen order

	    # Optional "<low> - " / "<low> to " prefix captures ranges like
	    # "3-5 years"; without it the upper bound would be read as the minimum.
	    yoe_match_obj = re.search(
	        r"(?:(\d+(?:\.\d+)?)\s*(?:-|–|to)\s*)?"
	        r"(\d+(?:\.\d+)?)\+?\s*(?:years?|yrs?)\s*(?:of\s*)?(?:experience|exp)",
	        text,
	        re.IGNORECASE,
	    )
	    min_yoe = None
	    max_yoe = None
	    if yoe_match_obj:
	        low, high = yoe_match_obj.group(1), yoe_match_obj.group(2)
	        if low is None:
	            min_yoe = float(high)
	        else:
	            min_yoe, max_yoe = sorted((float(low), float(high)))

	    role_type = None
	    if re.search(r"\bfull.?time\b", text, re.IGNORECASE):
	        role_type = "full-time"
	    elif re.search(r"\bcontract\b", text, re.IGNORECASE):
	        role_type = "contract"
	    elif re.search(r"\bpart.?time\b", text, re.IGNORECASE):
	        role_type = "part-time"

	    engineer_type = None
	    if re.search(r"\bbackend\b", text, re.IGNORECASE):
	        engineer_type = "backend"
	    elif re.search(r"\bfrontend\b", text, re.IGNORECASE):
	        engineer_type = "frontend"
	    elif re.search(r"\bfull.?stack\b", text, re.IGNORECASE):
	        engineer_type = "fullstack"
	    # \b on both alternatives so "html engineer" is not read as "ml engineer".
	    elif re.search(r"\b(?:ai|ml)\s+engineer\b|\bmachine\s+learning\b", text, re.IGNORECASE):
	        engineer_type = "ai"
	    elif re.search(r"\bdata\s+engineer\b", text, re.IGNORECASE):
	        engineer_type = "data"

	    remote_allowed = bool(re.search(r"\bremote\b", text, re.IGNORECASE))

	    # "US" is matched case-sensitively: lower-case "us" is almost always
	    # the pronoun ("join us"), not the country.
	    location_match = re.search(
	        r"\b(?:(?i:bangalore|mumbai|delhi|hyderabad|chennai|pune|kolkata|remote|india"
	        r"|usa|uk|london|new york|san francisco)|US)\b",
	        text,
	    )
	    location = None
	    if location_match:
	        place = location_match.group(0)
	        # Country codes stay upper-case; str.title() would give "Usa"/"Uk".
	        location = place.upper() if place.lower() in {"us", "usa", "uk"} else place.title()

	    return {
	        "required_skills": skills,
	        "min_yoe": min_yoe,
	        "max_yoe": max_yoe,
	        "role_type": role_type,
	        "engineer_type": engineer_type,
	        "remote_allowed": remote_allowed,
	        "location": location,
	    }