coderound / backend /src /ml /feature_builder.py
ketannnn's picture
fix: growth_velocity always 60% bug - handle JSON string and empty work_exp
4b9553f
import re
import math
from typing import Any
# Job-title keyword -> integer seniority level (0 is the lowest, 8 the highest).
# _extract_seniority looks these keywords up inside a lowercased job title.
SENIORITY_MAP = {
    # level 0
    "intern": 0,
    "trainee": 0,
    # level 1
    "junior": 1,
    "associate": 1,
    # level 2
    "mid": 2,
    # level 3
    "senior": 3,
    # level 4
    "lead": 4,
    "staff": 4,
    # level 5
    "principal": 5,
    "architect": 5,
    # management / executive track
    "manager": 4,
    "director": 6,
    "vp": 7,
    "cto": 8,
}
# Lowercase institution keywords; education_match adds a bonus when one of them
# appears in a candidate's degree or education_status text.
TIER1_EDU = {
    "iit",
    "iim",
    "nit",
    "bits",
    "iiit",
    "mit",
    "stanford",
    "cmu",
    "berkeley",
}
def build_candidate_text(candidate: dict[str, Any]) -> str:
    """Flatten a candidate record into one ``" | "``-separated text string.

    Sections are appended in this order, each only when it has content:
    ``parsed_summary``, ``parsed_skills``, ``programming_languages``,
    ``backend_frameworks`` + ``frontend_technologies``, the first three
    ``parsed_work_experience`` entries (as ``"company: description"``, falling
    back to ``role`` when there is no description) and
    ``most_recent_company_description``.

    Args:
        candidate: Candidate record; every key is optional.

    Returns:
        The joined text, or ``""`` when no section has content.
    """
    import json as _json  # local import, same approach as compute_growth_velocity

    parts = []

    if candidate.get("parsed_summary"):
        parts.append(candidate["parsed_summary"])

    skills = candidate.get("parsed_skills")
    if isinstance(skills, (list, tuple)):
        # Join a skill list instead of embedding its Python repr ("['a', 'b']").
        skills = ", ".join(str(s) for s in skills if s)
    if skills:
        parts.append(f"Skills: {skills}")

    langs = candidate.get("programming_languages") or []
    if langs:
        parts.append(f"Languages: {', '.join(langs)}")

    frameworks = [
        *(candidate.get("backend_frameworks") or []),
        *(candidate.get("frontend_technologies") or []),
    ]
    if frameworks:
        parts.append(f"Frameworks: {', '.join(frameworks)}")

    work_exp = candidate.get("parsed_work_experience") or []
    if isinstance(work_exp, str):
        # Work experience may arrive as a not-yet-parsed JSON string; slicing
        # the raw string would iterate characters and silently yield nothing.
        try:
            work_exp = _json.loads(work_exp)
        except ValueError:
            work_exp = []
    if not isinstance(work_exp, (list, tuple)):
        work_exp = []

    for we in work_exp[:3]:
        if not isinstance(we, dict):
            continue
        desc = we.get("description") or we.get("role") or ""
        company = we.get("company") or ""
        if desc or company:
            # strip(": ") removes the dangling separator when one side is empty.
            parts.append(f"{company}: {desc}".strip(": "))

    if candidate.get("most_recent_company_description"):
        parts.append(candidate["most_recent_company_description"])

    # filter(None, ...) drops entries that became empty after stripping.
    return " | ".join(filter(None, parts))
def _parse_duration_months(entry: dict) -> float:
duration = entry.get("duration") or entry.get("tenure") or ""
if not duration:
return 12.0
years = re.findall(r"(\d+\.?\d*)\s*(?:year|yr)", duration, re.IGNORECASE)
months = re.findall(r"(\d+\.?\d*)\s*(?:month|mo)", duration, re.IGNORECASE)
total = sum(float(y) * 12 for y in years) + sum(float(m) for m in months)
return total if total > 0 else 12.0
def _extract_seniority(title: str) -> int:
    """Map a job title to a seniority level using ``SENIORITY_MAP`` keywords.

    Keywords are tried from the highest level down, so the most senior keyword
    found in the title wins. A keyword is ignored when it sits strictly inside
    a longer word (a letter on both sides): plain substring matching made
    "director" match "cto" (dire-cto-r) and score 8 instead of 6. Keywords at
    the start or end of a word still match, so "leader", "managers" and "svp"
    keep matching as before.

    Args:
        title: Job title; an empty or missing title yields the default level.

    Returns:
        The matched level, or 2 when no keyword is found.
    """
    title_lower = (title or "").lower()
    for key, val in sorted(SENIORITY_MAP.items(), key=lambda x: -x[1]):
        token = re.escape(key)
        # Accept the keyword unless it has a letter on BOTH sides.
        if re.search(rf"(?<![a-z]){token}|{token}(?![a-z])", title_lower):
            return val
    return 2
def compute_growth_velocity(work_experience: list[dict], is_funded: bool = False) -> float:
    """Score how quickly a candidate's seniority grew, as a value in [0, 1].

    Entries are ordered by their ``start_date`` value compared as text
    (NOTE(review): this presumably expects ISO-like date strings; confirm
    against the producer). The score is derived from::

        velocity   = (seniority of last entry - seniority of first entry) / years
        normalized = clamp((velocity + 1) / 3, 0, 1)

    where ``years`` is the summed duration of all entries, floored at 0.5.
    A funded most-recent company adds 0.1, capped at 1.0.

    Args:
        work_experience: List of work-experience dicts, or a JSON string that
            encodes such a list. Anything else is treated as empty.
        is_funded: Whether the candidate's company is funded.

    Returns:
        The normalized score rounded to 4 decimals. With fewer than two
        entries that carry a ``title`` or ``role`` there is nothing to
        compare, so a neutral 0.5 (0.6 when funded) is returned.
    """
    import json as _json

    fallback = 0.6 if is_funded else 0.5

    # Work experience may arrive as a JSON string that has not been parsed yet.
    if isinstance(work_experience, str):
        try:
            work_experience = _json.loads(work_experience)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            work_experience = []
    # Valid JSON can still decode to a scalar or an object; only sequences of
    # entries are meaningful here.
    if not isinstance(work_experience, (list, tuple)):
        work_experience = []

    # Keep only dict entries that actually name a title or role.
    valid_entries = [
        e for e in work_experience
        if isinstance(e, dict) and (e.get("title") or e.get("role"))
    ]
    if len(valid_entries) < 2:
        return fallback

    # str() keeps the sort from raising when start_date values have mixed types.
    entries = sorted(valid_entries, key=lambda e: str(e.get("start_date") or ""))

    def _title_of(entry: dict) -> str:
        return str(entry.get("title") or entry.get("role") or "")

    seniority_gain = _extract_seniority(_title_of(entries[-1])) - _extract_seniority(_title_of(entries[0]))
    total_months = sum(_parse_duration_months(entry) for entry in entries)

    years_elapsed = max(total_months / 12, 0.5)
    velocity = seniority_gain / years_elapsed
    normalized = min(max((velocity + 1) / 3, 0.0), 1.0)
    if is_funded:
        normalized = min(normalized + 0.1, 1.0)
    return round(normalized, 4)
def skill_jaccard(jd_skills: list[str], candidate_skills: list[str]) -> float:
    """Return the Jaccard similarity between JD skills and candidate skills.

    Skills are compared case-insensitively with surrounding whitespace
    removed. Blank and non-string entries are ignored, so they neither count
    as a skill nor inflate the union.

    Args:
        jd_skills: Skills required by the job description (may be ``None``).
        candidate_skills: Skills listed by the candidate (may be ``None``).

    Returns:
        0.5 (neutral) when the JD lists no usable skills, 0.0 when the
        candidate lists none, otherwise ``|intersection| / |union|``.
    """
    def _normalize(skills) -> set[str]:
        cleaned = (s.lower().strip() for s in (skills or []) if isinstance(s, str))
        return {token for token in cleaned if token}

    jd_set = _normalize(jd_skills)
    if not jd_set:
        return 0.5

    cand_set = _normalize(candidate_skills)
    if not cand_set:
        return 0.0

    # Both sets are non-empty here, so the union can never be empty.
    return len(jd_set & cand_set) / len(jd_set | cand_set)
def yoe_match(min_yoe: float | None, max_yoe: float | None, candidate_yoe: float | None) -> float:
if candidate_yoe is None:
return 0.5
if min_yoe is None and max_yoe is None:
return 0.7
candidate_yoe = float(candidate_yoe)
if min_yoe is not None and candidate_yoe < min_yoe:
gap = min_yoe - candidate_yoe
return max(0.0, 1.0 - gap * 0.2)
if max_yoe is not None and candidate_yoe > max_yoe + 3:
return 0.7
return 1.0
def company_quality_signal(candidate: dict[str, Any]) -> float:
    """Score the candidate's most recent company, from 0.5 up to 1.0.

    Starts at 0.5 and adds, in order: 0.2 for a product company, 0.15 for a
    funded company, 0.1 when total funding exceeds 10,000,000 and a further
    0.05 when it exceeds 100,000,000. The sum is capped at 1.0.
    """
    total_funding = candidate.get("most_recent_company_total_funding") or 0

    # (condition, bonus) pairs, applied in this order.
    bonuses = (
        (candidate.get("most_recent_company_is_product_company"), 0.2),
        (candidate.get("most_recent_company_is_funded"), 0.15),
        (total_funding > 10_000_000, 0.1),
        (total_funding > 100_000_000, 0.05),
    )

    signal = 0.5
    for applies, bonus in bonuses:
        if applies:
            signal += bonus
    return min(signal, 1.0)
def education_match(candidate: dict[str, Any]) -> float:
    """Score a candidate's education from the ``degree`` and ``education_status`` text.

    Base score is 0.5, raised to 0.6 for a bachelor's degree, 0.8 for a
    master's/MBA and 0.9 for a doctorate (the highest applicable level wins).
    A ``TIER1_EDU`` institution keyword found in either field adds 0.15,
    capped at 1.0.

    Keywords are matched at the start of a word rather than as raw substrings,
    so "admitted" or "submitted" no longer count as "mit" and "unit" no longer
    counts as "nit", while forms such as "iitb" still match. "BE" / "B.E." is
    matched as a standalone token instead of the literal ``"be "``, which
    missed a trailing "BE" and matched words like "maybe ".

    Returns:
        The education score in the range 0.5 to 1.0.
    """
    degree = str(candidate.get("degree") or "").lower()
    status = str(candidate.get("education_status") or "").lower()

    score = 0.5
    if "bachelor" in degree or "b.tech" in degree or re.search(r"\bb\.?e\b", degree):
        score = 0.6
    if "master" in degree or "m.tech" in degree or "mba" in degree:
        score = 0.8
    if "phd" in degree or "doctorate" in degree:
        score = 0.9

    # The bonus is applied at most once, however many keywords appear.
    for uni in TIER1_EDU:
        pattern = rf"\b{re.escape(uni)}"
        if re.search(pattern, degree) or re.search(pattern, status):
            score = min(score + 0.15, 1.0)
            break
    return score
def compute_jd_quality(raw_text: str, parsed: dict[str, Any], candidate_count: int = 0) -> dict[str, Any]:
    """Assess how specific and self-consistent a job description is.

    Args:
        raw_text: The job description text.
        parsed: Parsed requirements; ``required_skills`` and ``min_yoe`` are read.
        candidate_count: Size of the candidate pool; the breadth check only
            runs when it is positive.

    Returns:
        A dict with:
        ``overall`` - "poor" when the JD is vague (> 0.6), contradictory or
        too broad (> 0.7); "fair" when vagueness is above 0.35; else "good".
        ``vagueness_score`` - 0.2 / 0.5 / 0.75 / 1.0 for >=5 / >=3 / >=1 / 0
        required skills, plus 0.3 (capped at 1.0) for texts under 50 words.
        ``breadth_score`` - 0.9 when candidates exist but fewer than two
        skills are required, else 0.0.
        ``skill_count``, ``contradictions`` and ``warnings``.
    """
    raw_text = raw_text or ""
    parsed = parsed or {}
    text_lower = raw_text.lower()

    required_skills = parsed.get("required_skills") or []
    skill_count = len(required_skills)

    if skill_count >= 5:
        vagueness_score = 0.2
    elif skill_count >= 3:
        vagueness_score = 0.5
    elif skill_count >= 1:
        vagueness_score = 0.75
    else:
        vagueness_score = 1.0

    # Very short descriptions are considered vaguer regardless of skills.
    if len(raw_text.split()) < 50:
        vagueness_score = min(vagueness_score + 0.3, 1.0)

    contradictions = []
    min_yoe = parsed.get("min_yoe")
    # Compare against None explicitly: a parsed minimum of 0 years is a real
    # value and must still trigger the entry-level check.
    if min_yoe is not None:
        if min_yoe >= 5 and "junior" in text_lower:
            contradictions.append("Requires 5+ YOE but mentions junior role")
        if min_yoe <= 1 and "senior" in text_lower:
            contradictions.append("Entry-level YOE but expects senior candidate")

    breadth_score = 0.0
    if candidate_count > 0 and skill_count < 2:
        breadth_score = 0.9

    warnings = []
    if vagueness_score > 0.6:
        warnings.append("JD is too vague — add more specific skill requirements for better match quality")
    if contradictions:
        warnings.append(f"Contradictions detected: {'; '.join(contradictions)}")
    if breadth_score > 0.7:
        warnings.append("Requirements are too broad — almost all candidates will match")

    overall = "good"
    if vagueness_score > 0.6 or contradictions or breadth_score > 0.7:
        overall = "poor"
    elif vagueness_score > 0.35:
        overall = "fair"

    return {
        "overall": overall,
        "vagueness_score": round(vagueness_score, 3),
        "breadth_score": round(breadth_score, 3),
        "skill_count": skill_count,
        "contradictions": contradictions,
        "warnings": warnings,
    }
def parse_jd_requirements(raw_text: str) -> dict[str, Any]:
    """Extract structured requirements from free-text job description.

    Args:
        raw_text: The job description text (``None`` is treated as empty).

    Returns:
        A dict with:
        ``required_skills`` - lowercase skill keywords, de-duplicated, grouped
        by keyword family and in order of appearance within each family.
        ``min_yoe`` / ``max_yoe`` - from the first "N years of experience"
        style phrase; a range such as "3-5 years" fills both, a single number
        fills only ``min_yoe``. ``None`` when absent.
        ``role_type`` - "full-time", "contract", "part-time" or ``None``.
        ``engineer_type`` - "backend", "frontend", "fullstack", "ai", "data"
        or ``None`` (first rule that matches, in that order).
        ``remote_allowed`` - whether the word "remote" appears.
        ``location`` - the first known location keyword, title-cased, or ``None``.
    """
    raw_text = raw_text or ""

    # Lookarounds are used instead of \b because \b cannot match after the
    # trailing "+" of "c++" when whitespace or punctuation follows.
    skill_patterns = [
        r"(?<!\w)(python|javascript|typescript|java|go|golang|rust|c\+\+|ruby|php|scala|kotlin|swift)(?!\w)",
        r"(?<!\w)(react|angular|vue|nextjs|fastapi|django|flask|express|springboot|rails)(?!\w)",
        r"(?<!\w)(postgresql|mysql|mongodb|redis|elasticsearch|kafka|rabbitmq|cassandra)(?!\w)",
        r"(?<!\w)(aws|gcp|azure|docker|kubernetes|terraform|ansible|ci\/cd|devops)(?!\w)",
        r"(?<!\w)(machine learning|deep learning|nlp|llm|rag|vector|embedding|pytorch|tensorflow)(?!\w)",
        r"(?<!\w)(sql|nosql|graphql|rest|grpc|microservices|api)(?!\w)",
    ]
    skills = []
    for pattern in skill_patterns:
        skills.extend(hit.lower() for hit in re.findall(pattern, raw_text, re.IGNORECASE))
    skills = list(dict.fromkeys(skills))  # de-duplicate, keep first occurrence

    # Accept decimals ("2.5 years") and ranges ("3-5 years", "3 to 5 years").
    # Previously only the number right before "years" was captured, so a range
    # or decimal reported its trailing number as the minimum.
    yoe_found = re.search(
        r"(\d+(?:\.\d+)?)\s*(?:(?:[-–]|to)\s*(\d+(?:\.\d+)?))?\+?\s*(?:years?|yrs?)\s*(?:of\s*)?(?:experience|exp)",
        raw_text,
        re.IGNORECASE,
    )
    min_yoe = None
    max_yoe = None
    if yoe_found:
        min_yoe = float(yoe_found.group(1))
        if yoe_found.group(2) is not None:
            max_yoe = float(yoe_found.group(2))
            if max_yoe < min_yoe:
                min_yoe, max_yoe = max_yoe, min_yoe

    role_type = None
    if re.search(r"\bfull.?time\b", raw_text, re.IGNORECASE):
        role_type = "full-time"
    elif re.search(r"\bcontract\b", raw_text, re.IGNORECASE):
        role_type = "contract"
    elif re.search(r"\bpart.?time\b", raw_text, re.IGNORECASE):
        role_type = "part-time"

    engineer_type = None
    if re.search(r"\bbackend\b", raw_text, re.IGNORECASE):
        engineer_type = "backend"
    elif re.search(r"\bfrontend\b", raw_text, re.IGNORECASE):
        engineer_type = "frontend"
    elif re.search(r"\bfull.?stack\b", raw_text, re.IGNORECASE):
        engineer_type = "fullstack"
    # The alternatives are grouped so the word boundary applies to each of
    # them; ungrouped, "html engineer" matched "ml engineer".
    elif re.search(r"\b(?:ai\s+engineer|ml\s+engineer|machine\s+learning)", raw_text, re.IGNORECASE):
        engineer_type = "ai"
    elif re.search(r"\bdata\s+engineer\b", raw_text, re.IGNORECASE):
        engineer_type = "data"

    remote_allowed = bool(re.search(r"\bremote\b", raw_text, re.IGNORECASE))

    # City/country names are case-insensitive, but the US/USA/UK acronyms must
    # be upper-case so the pronoun in "join us" is not read as a location.
    location_match = re.search(
        r"\b(?:(?i:bangalore|mumbai|delhi|hyderabad|chennai|pune|kolkata|remote|india|london|new york|san francisco)|USA|US|UK)\b",
        raw_text,
    )
    # NOTE(review): .title() renders acronyms as "Us"/"Usa"/"Uk"; kept as-is
    # because callers may already depend on that output format.
    location = location_match.group(0).title() if location_match else None

    return {
        "required_skills": skills,
        "min_yoe": min_yoe,
        "max_yoe": max_yoe,
        "role_type": role_type,
        "engineer_type": engineer_type,
        "remote_allowed": remote_allowed,
        "location": location,
    }