Expand skill aliases and canonicalize JD skills for fair matching
Browse files- utilities/skills.py +51 -21
utilities/skills.py
CHANGED
|
@@ -181,20 +181,40 @@ def extract_skills(text: str) -> set:
|
|
| 181 |
# ---------------------------------------------------------------------------
|
| 182 |
SKILL_ALIASES: dict[str, str] = {
|
| 183 |
# ML / AI shorthands
|
| 184 |
-
"ml":
|
| 185 |
-
"ai":
|
| 186 |
-
"dl":
|
| 187 |
-
"cv":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
# API plurals / variants — chain: fastapi/rest apis → rest api → api
|
| 189 |
-
"apis":
|
| 190 |
-
"rest apis":
|
| 191 |
-
"restful apis":
|
| 192 |
-
"rest api":
|
| 193 |
-
"restful api":
|
| 194 |
-
"api gateway":
|
| 195 |
-
"fastapi":
|
| 196 |
-
"
|
| 197 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
}
|
| 199 |
|
| 200 |
|
|
@@ -216,11 +236,21 @@ def _expand_with_aliases(skills: set) -> set:
|
|
| 216 |
return expanded
|
| 217 |
|
| 218 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
def extract_required_skills_from_jd(jd_text: str) -> dict:
|
| 220 |
-
"""Return JD skills with mention-frequency as
|
| 221 |
-
|
| 222 |
cleaned = clean_text(jd_text)
|
| 223 |
-
|
|
|
|
| 224 |
|
| 225 |
|
| 226 |
def extract_resume_skills(resume_text: str) -> set:
|
|
@@ -228,14 +258,14 @@ def extract_resume_skills(resume_text: str) -> set:
|
|
| 228 |
|
| 229 |
|
| 230 |
def find_missing_skills(resume_text: str, jd_text: str) -> list:
|
| 231 |
-
jd_skills
|
| 232 |
-
resume_skills = extract_resume_skills(resume_text)
|
| 233 |
-
return
|
| 234 |
|
| 235 |
|
| 236 |
def calculate_skill_overlap(resume_text: str, jd_text: str) -> float:
|
| 237 |
-
jd_skills
|
| 238 |
-
resume_skills = extract_resume_skills(resume_text)
|
| 239 |
if not jd_skills:
|
| 240 |
return 0.0
|
| 241 |
return round(len(jd_skills & resume_skills) / len(jd_skills) * 100, 2)
|
|
|
|
| 181 |
# ---------------------------------------------------------------------------
|
| 182 |
# Maps a shorthand or variant spelling (key) to the term it should be treated
# as (value). Some values are themselves keys, e.g. "rest apis" -> "rest api"
# -> "api", so a lookup may need to be repeated to reach the final term.
SKILL_ALIASES: dict[str, str] = {
    # --- Machine-learning / AI abbreviations ---
    "ml": "machine learning",
    "ai": "machine learning",
    "dl": "deep learning",
    "cv": "computer vision",
    "nlp": "natural language processing",
    "gen ai": "generative ai",
    "llm": "large language model",
    "sklearn": "scikit-learn",
    "scikit learn": "scikit-learn",
    # --- API wording: plurals point at their singular form, singular forms
    # --- and API frameworks/protocols point at the generic "api" ---
    "apis": "api",
    "rest apis": "rest api",
    "restful apis": "restful api",
    "rest api": "api",
    "restful api": "api",
    "api gateway": "api",
    "fastapi": "api",
    "flask": "api",
    "grpc": "api",
    "graphql": "api",
    # --- Infrastructure, data stores and language nicknames ---
    "k8s": "kubernetes",
    "postgres": "postgresql",
    "mongo": "mongodb",
    "hf": "huggingface",
    "hugging face": "huggingface",
    "aws": "amazon web services",
    "gcp": "google cloud",
    "ci cd": "ci/cd",
    "node": "javascript",
    "nodejs": "javascript",
    "js": "javascript",
    "ts": "typescript",
    "py": "python",
}
|
| 219 |
|
| 220 |
|
|
|
|
| 236 |
return expanded
|
| 237 |
|
| 238 |
|
| 239 |
+
def _canonicalize_jd_frequencies(freq: dict[str, int]) -> dict[str, int]:
    """Re-key JD skill counts by the terms their aliases expand to.

    Each skill in ``freq`` is passed on its own to ``_expand_with_aliases``;
    every term that call returns is credited with the skill's full count.
    When several skills expand to the same term, their counts are summed.
    """
    merged: dict[str, int] = {}
    for raw_skill, mentions in freq.items():
        expanded_terms = _expand_with_aliases({raw_skill})
        for term in expanded_terms:
            if term in merged:
                merged[term] += mentions
            else:
                merged[term] = mentions
    return merged
|
| 246 |
+
|
| 247 |
+
|
| 248 |
def extract_required_skills_from_jd(jd_text: str) -> dict:
    """Return JD skills (alias-expanded) with mention-frequency as importance.

    Skills are detected with ``extract_skills`` on the raw text, counted in
    the ``clean_text``-normalised text, and then merged onto alias-expanded
    terms by ``_canonicalize_jd_frequencies``.

    Args:
        jd_text: Raw job-description text.

    Returns:
        A dict mapping each skill term to its importance weight (an int >= 1).
    """
    # Function-scope import: the file's top-level imports are not visible here.
    import re

    raw = extract_skills(jd_text)
    cleaned = clean_text(jd_text)

    freq = {}
    for skill in raw:
        # Count whole-term mentions only. A plain substring count would see
        # "ai" in "maintain", "ts" in "requirements", or "api" inside "apis"
        # (which alias merging would then count a second time).
        # Lookarounds are used instead of \b so that skills ending in
        # non-word characters (e.g. "c++") still match.
        whole_term = rf"(?<!\w){re.escape(skill)}(?!\w)"
        mentions = len(re.findall(whole_term, cleaned))
        # A skill that was detected is mentioned at least once, even if text
        # normalisation changed its spelling; never give it a weight of 0.
        freq[skill] = max(mentions, 1)

    return _canonicalize_jd_frequencies(freq)
|
| 254 |
|
| 255 |
|
| 256 |
def extract_resume_skills(resume_text: str) -> set:
|
|
|
|
| 258 |
|
| 259 |
|
| 260 |
def find_missing_skills(resume_text: str, jd_text: str) -> list:
    """List, in sorted order, the JD skill terms the resume does not contain.

    JD terms are the keys returned by ``extract_required_skills_from_jd``;
    resume terms come from ``extract_resume_skills``.
    """
    required = set(extract_required_skills_from_jd(jd_text))
    present = extract_resume_skills(resume_text)
    missing = [skill for skill in required if skill not in present]
    missing.sort()
    return missing
|
| 264 |
|
| 265 |
|
| 266 |
def calculate_skill_overlap(resume_text: str, jd_text: str) -> float:
    """Return the percentage of JD skill terms also found in the resume.

    The result is rounded to two decimals. A JD that yields no skill terms
    gives 0.0.
    """
    required = set(extract_required_skills_from_jd(jd_text))
    present = extract_resume_skills(resume_text)
    if required:
        matched = required & present
        return round(len(matched) / len(required) * 100, 2)
    return 0.0
|