"""
jd_parser.py

Extracts a structured JDConfig from data/skill_aliases.json.
All downstream modules obtain it through parse_jd() — never rebuild this
object at runtime.

No network calls. No datetime.now(). Pure parsing only.
"""
|
|
| from __future__ import annotations |
|
|
| import json |
| import os |
| from dataclasses import dataclass, field |
| from typing import Dict, List, Set |
|
|
|
|
@dataclass
class JDConfig:
    """
    Structured representation of the Job Description requirements.

    Populated from data/skill_aliases.json, which is the authoritative taxonomy.
    Every field defaults to an empty container, so a bare JDConfig() is valid.
    """

    # Canonical requirement name -> list of aliases (hard requirements).
    hard_requirements: Dict[str, List[str]] = field(default_factory=dict)

    # Canonical requirement name -> list of aliases (preferred requirements).
    preferred_requirements: Dict[str, List[str]] = field(default_factory=dict)

    # Negative-signal group name -> list of aliases.
    negative_signals: Dict[str, List[str]] = field(default_factory=dict)

    # NOTE(review): presumably keywords signalling production experience;
    # how they are consumed is not visible in this file.
    production_keywords: List[str] = field(default_factory=list)

    # NOTE(review): presumably terms treated specially downstream — confirm.
    rare_terms: List[str] = field(default_factory=list)

    # Flat alias sets, one per requirement category, for O(1) membership tests.
    all_hard_aliases: Set[str] = field(default_factory=set)
    all_preferred_aliases: Set[str] = field(default_factory=set)
    all_negative_aliases: Set[str] = field(default_factory=set)

    def get_all_query_terms(self) -> List[str]:
        """Return all hard + preferred aliases for BM25 Pass A query.

        Duplicates are removed while keeping first-seen order (hard
        requirements first, then preferred). A plain set() would make the
        order depend on string-hash randomization and differ between runs.
        """
        alias_groups = [
            *self.hard_requirements.values(),
            *self.preferred_requirements.values(),
        ]
        # dict preserves insertion order, so this is a stable de-duplication.
        return list(
            dict.fromkeys(alias for group in alias_groups for alias in group)
        )

    def hard_req_names(self) -> List[str]:
        """Canonical names for the hard requirements (for coverage scoring)."""
        return list(self.hard_requirements.keys())

    def preferred_req_names(self) -> List[str]:
        """Canonical names for the preferred requirements."""
        return list(self.preferred_requirements.keys())
|
|
|
|
def _normalize_aliases(raw_aliases: object, owner: str) -> List[str]:
    """Lower-case and strip a JSON alias list, dropping blank entries.

    Raises ValueError when raw_aliases is not a list of strings; owner names
    the offending entry in the error message.
    """
    if not isinstance(raw_aliases, list):
        raise ValueError(
            f"skill_aliases.json: aliases for {owner!r} must be a list of strings"
        )
    cleaned: List[str] = []
    for alias in raw_aliases:
        if not isinstance(alias, str):
            raise ValueError(
                f"skill_aliases.json: non-string alias {alias!r} in {owner!r}"
            )
        alias = alias.lower().strip()
        # A blank alias would match any text in substring checks; drop it.
        if alias:
            cleaned.append(alias)
    return cleaned


def parse_jd(skill_aliases_path: str) -> JDConfig:
    """
    Parse data/skill_aliases.json into a JDConfig object.

    Sections read (both optional):
        "jd_requirements":  {canonical_name: {"type": ..., "aliases": [...]}}
            Entries whose type is "hard_requirement" become hard requirements;
            any other (or missing) type is treated as preferred.
        "negative_signals": {group_name: [aliases...]}
    Keys starting with "_" are treated as metadata and skipped in both
    sections. Aliases are lower-cased and stripped; blank aliases are dropped.

    Args:
        skill_aliases_path: Absolute or relative path to skill_aliases.json.

    Returns:
        JDConfig with all fields populated.

    Raises:
        FileNotFoundError: If the aliases file doesn't exist.
        ValueError: If the file is malformed (invalid JSON, or a section,
            entry, or alias of the wrong type).
    """
    if not os.path.isfile(skill_aliases_path):
        raise FileNotFoundError(
            f"skill_aliases.json not found at: {skill_aliases_path}"
        )

    with open(skill_aliases_path, "r", encoding="utf-8") as f:
        # json.JSONDecodeError subclasses ValueError, matching the contract.
        raw = json.load(f)

    if not isinstance(raw, dict):
        raise ValueError("skill_aliases.json: top-level value must be an object")

    jd = JDConfig()

    # --- requirements: split into hard vs. preferred -----------------------
    jd_reqs = raw.get("jd_requirements", {})
    if not isinstance(jd_reqs, dict):
        raise ValueError("skill_aliases.json: 'jd_requirements' must be an object")
    for canonical_name, req_data in jd_reqs.items():
        if canonical_name.startswith("_"):
            continue  # metadata key, same convention as negative_signals
        if not isinstance(req_data, dict):
            raise ValueError(
                f"skill_aliases.json: requirement {canonical_name!r} must be an object"
            )
        req_type = req_data.get("type", "preferred")
        aliases = _normalize_aliases(req_data.get("aliases", []), canonical_name)

        if req_type == "hard_requirement":
            jd.hard_requirements[canonical_name] = aliases
            jd.all_hard_aliases.update(aliases)
        else:
            # Any non-hard type is treated as preferred.
            jd.preferred_requirements[canonical_name] = aliases
            jd.all_preferred_aliases.update(aliases)

    # --- negative signals ----------------------------------------------------
    neg = raw.get("negative_signals", {})
    if not isinstance(neg, dict):
        raise ValueError("skill_aliases.json: 'negative_signals' must be an object")
    for group_name, alias_list in neg.items():
        if group_name.startswith("_"):
            continue
        aliases = _normalize_aliases(alias_list, group_name)
        jd.negative_signals[group_name] = aliases
        jd.all_negative_aliases.update(aliases)

    # These two lists are fixed here rather than read from the JSON file.
    jd.production_keywords = [
        "deployed", "scale", "serving", "latency",
        "production", "inference", "throughput", "real-time",
        "pipeline", "distributed"
    ]

    jd.rare_terms = ["pinecone", "lambdarank"]

    return jd
|
|
|
|
def hard_req_coverage_score(candidate: dict, jd_config: JDConfig) -> float:
    """
    Compute fraction of hard requirements covered by the candidate.

    A hard requirement is "covered" if any of its aliases either equals one
    of the candidate's skill names (lower-cased and stripped) or occurs as a
    substring of a lower-cased career-history description. Aliases are used
    as stored in jd_config, so they are expected to be lower-case already.
    Blank aliases are ignored, because an empty string is a substring of
    every text and would mark the requirement as always covered.

    Because the career-history check is a substring test, a short alias can
    match inside a longer word.

    A candidate with missing/empty skills scores 0.0; career history is not
    consulted in that case.

    Schema fields read: skills[].name, career_history[].description

    Returns: float in [0.0, 1.0]
    """
    skills = candidate.get("skills", [])
    if not skills or not jd_config.hard_requirements:
        return 0.0

    # Normalised skill names; entries may be {"name": ...} dicts or bare strings.
    candidate_skill_names: Set[str] = set()
    for s in skills:
        name = s.get("name", "") if isinstance(s, dict) else s
        if isinstance(name, str) and name:
            candidate_skill_names.add(name.lower().strip())

    # Newline-joined so a multi-word alias cannot match across two entries.
    # "or []" tolerates an explicit "career_history": null.
    career_text = "\n".join(
        (ch.get("description", "") or "").lower()
        for ch in (candidate.get("career_history") or [])
        if isinstance(ch, dict)
    )

    covered = 0
    total = len(jd_config.hard_requirements)

    for aliases in jd_config.hard_requirements.values():
        usable = [alias for alias in aliases if alias and alias.strip()]
        if any(alias in candidate_skill_names for alias in usable):
            covered += 1
        elif any(alias in career_text for alias in usable):
            covered += 1

    return covered / total if total > 0 else 0.0
|
|
|
|
# Manual smoke test: parse data/skill_aliases.json located next to this file
# and print a summary of the resulting JDConfig.
if __name__ == "__main__":
    base_dir = os.path.dirname(os.path.abspath(__file__))
    aliases_path = os.path.join(base_dir, "data", "skill_aliases.json")

    jd = parse_jd(aliases_path)

    print("=== JDConfig ===")
    print(f"\nHard Requirements ({len(jd.hard_requirements)}):")
    for name, aliases in jd.hard_requirements.items():
        print(f" {name}: {len(aliases)} aliases")

    print(f"\nPreferred Requirements ({len(jd.preferred_requirements)}):")
    for name, aliases in jd.preferred_requirements.items():
        print(f" {name}: {len(aliases)} aliases")

    print(f"\nNegative Signal Groups ({len(jd.negative_signals)}):")
    for group, aliases in jd.negative_signals.items():
        print(f" {group}: {len(aliases)} aliases")

    print(f"\nProduction Keywords ({len(jd.production_keywords)}): {jd.production_keywords}")
    print(f"Rare Terms ({len(jd.rare_terms)}): {jd.rare_terms}")
    print(f"\nTotal hard aliases (flat set): {len(jd.all_hard_aliases)}")
    print(f"Total preferred aliases (flat set): {len(jd.all_preferred_aliases)}")
    print(f"Total query terms (Pass A): {len(jd.get_all_query_terms())}")
|
|