add more skill abbreviation
Browse files- utilities/skills.py +55 -8
utilities/skills.py
CHANGED
|
@@ -28,9 +28,10 @@ FRONTEND = {
|
|
| 28 |
|
| 29 |
# --- APIs & Architecture ---
|
| 30 |
API_ARCH = {
|
| 31 |
-
"rest api", "restful api", "
|
|
|
|
| 32 |
"microservices", "microservice", "event driven", "message queue",
|
| 33 |
-
"api gateway", "api",
|
| 34 |
}
|
| 35 |
|
| 36 |
# --- Databases ---
|
|
@@ -60,6 +61,8 @@ ML_AI = {
|
|
| 60 |
"data analysis", "data analytics", "data science",
|
| 61 |
"statistical analysis", "statistics", "probability",
|
| 62 |
"a/b testing", "hypothesis testing",
|
|
|
|
|
|
|
| 63 |
}
|
| 64 |
|
| 65 |
# --- Data Engineering ---
|
|
@@ -153,9 +156,12 @@ def extract_skills(text: str) -> set:
|
|
| 153 |
break
|
| 154 |
end = idx + len(skill)
|
| 155 |
|
| 156 |
-
# Boundary check - skill must
|
|
|
|
| 157 |
before_ok = (idx == 0 or cleaned[idx - 1] == ' ')
|
| 158 |
-
|
|
|
|
|
|
|
| 159 |
|
| 160 |
if before_ok and after_ok:
|
| 161 |
span = set(range(idx, end))
|
|
@@ -169,6 +175,47 @@ def extract_skills(text: str) -> set:
|
|
| 169 |
return found
|
| 170 |
|
| 171 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
def extract_required_skills_from_jd(jd_text: str) -> dict:
|
| 173 |
"""Return JD skills with mention-frequency as an importance signal."""
|
| 174 |
skills = extract_skills(jd_text)
|
|
@@ -177,18 +224,18 @@ def extract_required_skills_from_jd(jd_text: str) -> dict:
|
|
| 177 |
|
| 178 |
|
| 179 |
def extract_resume_skills(resume_text: str) -> set:
    """Return the skills that ``extract_skills`` detects in *resume_text*."""
    detected = extract_skills(resume_text)
    return detected
|
| 181 |
|
| 182 |
|
| 183 |
def find_missing_skills(resume_text: str, jd_text: str) -> list:
|
| 184 |
-
jd_skills
|
| 185 |
-
resume_skills = extract_resume_skills(resume_text)
|
| 186 |
return [s for s in jd_skills if s not in resume_skills]
|
| 187 |
|
| 188 |
|
| 189 |
def calculate_skill_overlap(resume_text: str, jd_text: str) -> float:
    """Return the percentage of JD skills that also appear in the resume.

    The result is rounded to two decimals; it is ``0.0`` when no skills are
    extracted from the job description.
    """
    required = set(extract_required_skills_from_jd(jd_text).keys())
    present = extract_resume_skills(resume_text)
    if len(required) == 0:
        # Nothing to match against; also avoids dividing by zero below.
        return 0.0
    matched = required & present
    return round(len(matched) / len(required) * 100, 2)
|
|
|
|
| 28 |
|
| 29 |
# --- APIs & Architecture ---
|
| 30 |
# Lower-case phrases; singular and plural spellings are separate entries.
API_ARCH = {
    # API styles and protocols
    "api",
    "apis",
    "rest api",
    "rest apis",
    "restful api",
    "restful apis",
    "graphql",
    "grpc",
    "websocket",
    "api gateway",
    # Architecture patterns
    "microservice",
    "microservices",
    "event driven",
    "message queue",
}
|
| 36 |
|
| 37 |
# --- Databases ---
|
|
|
|
| 61 |
"data analysis", "data analytics", "data science",
|
| 62 |
"statistical analysis", "statistics", "probability",
|
| 63 |
"a/b testing", "hypothesis testing",
|
| 64 |
+
# Common abbreviations / shorthands
|
| 65 |
+
"ml", "ai", "dl", "cv",
|
| 66 |
}
|
| 67 |
|
| 68 |
# --- Data Engineering ---
|
|
|
|
| 156 |
break
|
| 157 |
end = idx + len(skill)
|
| 158 |
|
| 159 |
+
# Boundary check - skill must start and end on a word boundary.
|
| 160 |
+
# Allow a trailing 's' so "rest apis" matches "rest api" etc.
|
| 161 |
before_ok = (idx == 0 or cleaned[idx - 1] == ' ')
|
| 162 |
+
after_char = cleaned[end] if end < len(cleaned) else ' '
|
| 163 |
+
after_ok = (after_char == ' ' or after_char == 's' and
|
| 164 |
+
(end + 1 == len(cleaned) or cleaned[end + 1] == ' '))
|
| 165 |
|
| 166 |
if before_ok and after_ok:
|
| 167 |
span = set(range(idx, end))
|
|
|
|
| 175 |
return found
|
| 176 |
|
| 177 |
|
| 178 |
+
# ---------------------------------------------------------------------------
# Alias map - resume variant -> canonical JD term
# If a resume has any alias, it counts as having the canonical skill.
# ---------------------------------------------------------------------------
SKILL_ALIASES: dict[str, str] = {
    # ML / AI shorthands
    "ml": "machine learning",
    "ai": "machine learning",
    "dl": "deep learning",
    "cv": "computer vision",
    # API plurals / variants - entries chain, e.g.
    # rest apis -> rest api -> api, and fastapi -> api
    "apis": "api",
    "rest apis": "rest api",
    "restful apis": "restful api",
    "rest api": "api",
    "restful api": "api",
    "api gateway": "api",
    "fastapi": "api",
    "grpc": "api",
    "graphql": "api",
}


def _expand_with_aliases(skills: set[str]) -> set[str]:
    """Return *skills* plus every term reachable through ``SKILL_ALIASES``.

    Aliases are followed transitively until no new term appears,
    e.g. ``fastapi -> api`` and ``rest apis -> rest api -> api``.

    Args:
        skills: Skill terms to expand. The argument is not modified.

    Returns:
        A new set containing the input terms and all of their alias targets.
    """
    expanded = set(skills)
    # Only terms added in the previous round can contribute new aliases, so
    # each round looks up the frontier rather than rescanning everything.
    frontier = set(expanded)
    while frontier:
        frontier = {
            SKILL_ALIASES[term] for term in frontier if term in SKILL_ALIASES
        } - expanded
        expanded |= frontier
    return expanded
|
| 217 |
+
|
| 218 |
+
|
| 219 |
def extract_required_skills_from_jd(jd_text: str) -> dict:
|
| 220 |
"""Return JD skills with mention-frequency as an importance signal."""
|
| 221 |
skills = extract_skills(jd_text)
|
|
|
|
| 224 |
|
| 225 |
|
| 226 |
def extract_resume_skills(resume_text: str) -> set:
    """Return the skills detected in *resume_text*, expanded with aliases.

    The text is passed through ``extract_skills`` and the result is then
    widened by ``_expand_with_aliases``.
    """
    detected = extract_skills(resume_text)
    return _expand_with_aliases(detected)
|
| 228 |
|
| 229 |
|
| 230 |
def find_missing_skills(resume_text: str, jd_text: str) -> list:
    """Return the JD skills that are not present in the resume.

    Args:
        resume_text: Raw resume text; its skills come from
            ``extract_resume_skills``.
        jd_text: Raw job-description text; its skills are the keys of the
            dict returned by ``extract_required_skills_from_jd``.

    Returns:
        The JD skills absent from the resume skills, listed in the iteration
        order of the JD skill dict.
    """
    # Iterate the dict itself rather than a set built from its keys: set
    # iteration order for strings changes between interpreter runs (hash
    # randomization), which made the order of the returned list unstable.
    jd_skills = extract_required_skills_from_jd(jd_text)
    resume_skills = extract_resume_skills(resume_text)
    return [skill for skill in jd_skills if skill not in resume_skills]
|
| 234 |
|
| 235 |
|
| 236 |
def calculate_skill_overlap(resume_text: str, jd_text: str) -> float:
    """Return the share of JD skills found in the resume, as a percentage.

    The value is rounded to two decimals. When no skills are extracted from
    the job description the function returns ``0.0``.
    """
    required = set(extract_required_skills_from_jd(jd_text).keys())
    present = extract_resume_skills(resume_text)
    if len(required) == 0:
        # No JD skills: nothing to compare, and no division by zero.
        return 0.0
    matched = required & present
    return round(len(matched) / len(required) * 100, 2)
|