import re import math from typing import Any SENIORITY_MAP = { "intern": 0, "trainee": 0, "junior": 1, "associate": 1, "mid": 2, "senior": 3, "lead": 4, "staff": 4, "principal": 5, "architect": 5, "manager": 4, "director": 6, "vp": 7, "cto": 8, } TIER1_EDU = {"iit", "iim", "nit", "bits", "iiit", "mit", "stanford", "cmu", "berkeley"} def build_candidate_text(candidate: dict[str, Any]) -> str: parts = [] if candidate.get("parsed_summary"): parts.append(candidate["parsed_summary"]) if candidate.get("parsed_skills"): parts.append(f"Skills: {candidate['parsed_skills']}") langs = candidate.get("programming_languages") or [] if langs: parts.append(f"Languages: {', '.join(langs)}") frameworks = (candidate.get("backend_frameworks") or []) + (candidate.get("frontend_technologies") or []) if frameworks: parts.append(f"Frameworks: {', '.join(frameworks)}") work_exp = candidate.get("parsed_work_experience") or [] for we in work_exp[:3]: if isinstance(we, dict): desc = we.get("description") or we.get("role") or "" company = we.get("company") or "" if desc or company: parts.append(f"{company}: {desc}".strip(": ")) if candidate.get("most_recent_company_description"): parts.append(candidate["most_recent_company_description"]) return " | ".join(filter(None, parts)) def _parse_duration_months(entry: dict) -> float: duration = entry.get("duration") or entry.get("tenure") or "" if not duration: return 12.0 years = re.findall(r"(\d+\.?\d*)\s*(?:year|yr)", duration, re.IGNORECASE) months = re.findall(r"(\d+\.?\d*)\s*(?:month|mo)", duration, re.IGNORECASE) total = sum(float(y) * 12 for y in years) + sum(float(m) for m in months) return total if total > 0 else 12.0 def _extract_seniority(title: str) -> int: title_lower = title.lower() for key, val in sorted(SENIORITY_MAP.items(), key=lambda x: -x[1]): if key in title_lower: return val return 2 def compute_growth_velocity(work_experience: list[dict], is_funded: bool = False) -> float: import json as _json # Handle case where work_experience arrives as a JSON string (not yet parsed) if isinstance(work_experience, str): try: work_experience = _json.loads(work_experience) except Exception: work_experience = [] # Filter to only valid dict entries that have a title/role valid_entries = [e for e in (work_experience or []) if isinstance(e, dict) and (e.get("title") or e.get("role"))] if len(valid_entries) < 2: # Fallback: compute from YOE-like numeric if available, # otherwise use funded signal base = 0.6 if is_funded else 0.5 return base entries = sorted(valid_entries, key=lambda x: x.get("start_date", "") or "") seniority_levels = [] total_months = 0.0 for entry in entries: title = entry.get("title") or entry.get("role") or "" seniority_levels.append(_extract_seniority(title)) total_months += _parse_duration_months(entry) if len(seniority_levels) < 2: return 0.5 seniority_gain = seniority_levels[-1] - seniority_levels[0] years_elapsed = max(total_months / 12, 0.5) velocity = seniority_gain / years_elapsed normalized = min(max((velocity + 1) / 3, 0.0), 1.0) if is_funded: normalized = min(normalized + 0.1, 1.0) return round(normalized, 4) def skill_jaccard(jd_skills: list[str], candidate_skills: list[str]) -> float: if not jd_skills: return 0.5 jd_set = {s.lower().strip() for s in jd_skills if s} cand_set = {s.lower().strip() for s in candidate_skills if s} if not cand_set: return 0.0 intersection = jd_set & cand_set union = jd_set | cand_set return len(intersection) / len(union) if union else 0.0 def yoe_match(min_yoe: float | None, max_yoe: float | None, candidate_yoe: float | None) -> float: if candidate_yoe is None: return 0.5 if min_yoe is None and max_yoe is None: return 0.7 candidate_yoe = float(candidate_yoe) if min_yoe is not None and candidate_yoe < min_yoe: gap = min_yoe - candidate_yoe return max(0.0, 1.0 - gap * 0.2) if max_yoe is not None and candidate_yoe > max_yoe + 3: return 0.7 return 1.0 def company_quality_signal(candidate: dict[str, Any]) -> float: score = 0.5 if candidate.get("most_recent_company_is_product_company"): score += 0.2 if candidate.get("most_recent_company_is_funded"): score += 0.15 funding = candidate.get("most_recent_company_total_funding") or 0 if funding > 10_000_000: score += 0.1 if funding > 100_000_000: score += 0.05 return min(score, 1.0) def education_match(candidate: dict[str, Any]) -> float: degree = (candidate.get("degree") or "").lower() status = (candidate.get("education_status") or "").lower() score = 0.5 if "bachelor" in degree or "b.tech" in degree or "be " in degree: score = 0.6 if "master" in degree or "m.tech" in degree or "mba" in degree: score = 0.8 if "phd" in degree or "doctorate" in degree: score = 0.9 for uni in TIER1_EDU: if uni in degree or uni in status: score = min(score + 0.15, 1.0) break return score def compute_jd_quality(raw_text: str, parsed: dict[str, Any], candidate_count: int = 0) -> dict[str, Any]: required_skills = parsed.get("required_skills") or [] skill_count = len(required_skills) vagueness_score = 1.0 if skill_count >= 5: vagueness_score = 0.2 elif skill_count >= 3: vagueness_score = 0.5 elif skill_count >= 1: vagueness_score = 0.75 word_count = len(raw_text.split()) if word_count < 50: vagueness_score = min(vagueness_score + 0.3, 1.0) contradictions = [] min_yoe = parsed.get("min_yoe") engineer_type = (parsed.get("engineer_type") or "").lower() if min_yoe and min_yoe >= 5 and "junior" in raw_text.lower(): contradictions.append("Requires 5+ YOE but mentions junior role") if min_yoe and min_yoe <= 1 and "senior" in raw_text.lower(): contradictions.append("Entry-level YOE but expects senior candidate") breadth_score = 0.0 if candidate_count > 0 and skill_count < 2: breadth_score = 0.9 warnings = [] if vagueness_score > 0.6: warnings.append("JD is too vague — add more specific skill requirements for better match quality") if contradictions: warnings.append(f"Contradictions detected: {'; '.join(contradictions)}") if breadth_score > 0.7: warnings.append("Requirements are too broad — almost all candidates will match") overall = "good" if vagueness_score > 0.6 or contradictions or breadth_score > 0.7: overall = "poor" elif vagueness_score > 0.35: overall = "fair" return { "overall": overall, "vagueness_score": round(vagueness_score, 3), "breadth_score": round(breadth_score, 3), "skill_count": skill_count, "contradictions": contradictions, "warnings": warnings, } def parse_jd_requirements(raw_text: str) -> dict[str, Any]: skills = [] skill_patterns = [ r"\b(python|javascript|typescript|java|go|golang|rust|c\+\+|ruby|php|scala|kotlin|swift)\b", r"\b(react|angular|vue|nextjs|fastapi|django|flask|express|springboot|rails)\b", r"\b(postgresql|mysql|mongodb|redis|elasticsearch|kafka|rabbitmq|cassandra)\b", r"\b(aws|gcp|azure|docker|kubernetes|terraform|ansible|ci\/cd|devops)\b", r"\b(machine learning|deep learning|nlp|llm|rag|vector|embedding|pytorch|tensorflow)\b", r"\b(sql|nosql|graphql|rest|grpc|microservices|api)\b", ] for pattern in skill_patterns: found = re.findall(pattern, raw_text, re.IGNORECASE) skills.extend([f.lower() for f in found]) skills = list(dict.fromkeys(skills)) yoe_match_obj = re.search(r"(\d+)\+?\s*(?:years?|yrs?)\s*(?:of\s*)?(?:experience|exp)", raw_text, re.IGNORECASE) min_yoe = float(yoe_match_obj.group(1)) if yoe_match_obj else None role_type = None if re.search(r"\bfull.?time\b", raw_text, re.IGNORECASE): role_type = "full-time" elif re.search(r"\bcontract\b", raw_text, re.IGNORECASE): role_type = "contract" elif re.search(r"\bpart.?time\b", raw_text, re.IGNORECASE): role_type = "part-time" engineer_type = None if re.search(r"\bbackend\b", raw_text, re.IGNORECASE): engineer_type = "backend" elif re.search(r"\bfrontend\b", raw_text, re.IGNORECASE): engineer_type = "frontend" elif re.search(r"\bfullstack\b|full.?stack\b", raw_text, re.IGNORECASE): engineer_type = "fullstack" elif re.search(r"\bai\s+engineer|ml\s+engineer|machine\s+learning", raw_text, re.IGNORECASE): engineer_type = "ai" elif re.search(r"\bdata\s+engineer\b", raw_text, re.IGNORECASE): engineer_type = "data" remote_allowed = bool(re.search(r"\bremote\b", raw_text, re.IGNORECASE)) location_match = re.search( r"\b(bangalore|mumbai|delhi|hyderabad|chennai|pune|kolkata|remote|india|us|usa|uk|london|new york|san francisco)\b", raw_text, re.IGNORECASE ) location = location_match.group(0).title() if location_match else None return { "required_skills": skills, "min_yoe": min_yoe, "max_yoe": None, "role_type": role_type, "engineer_type": engineer_type, "remote_allowed": remote_allowed, "location": location, }