Siggmoid Cursor committed on
Commit
2717aab
·
1 Parent(s): dfc11de

Expand skill aliases and canonicalize JD skills for fair matching

Browse files
Files changed (1) hide show
  1. utilities/skills.py +51 -21
utilities/skills.py CHANGED
@@ -181,20 +181,40 @@ def extract_skills(text: str) -> set:
181
  # ---------------------------------------------------------------------------
182
  SKILL_ALIASES: dict[str, str] = {
183
  # ML / AI shorthands
184
- "ml": "machine learning",
185
- "ai": "machine learning",
186
- "dl": "deep learning",
187
- "cv": "computer vision",
 
 
 
 
 
188
  # API plurals / variants — chain: fastapi/rest apis → rest api → api
189
- "apis": "api",
190
- "rest apis": "rest api",
191
- "restful apis": "restful api",
192
- "rest api": "api",
193
- "restful api": "api",
194
- "api gateway": "api",
195
- "fastapi": "api",
196
- "grpc": "api",
197
- "graphql": "api",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  }
199
 
200
 
@@ -216,11 +236,21 @@ def _expand_with_aliases(skills: set) -> set:
216
  return expanded
217
 
218
 
 
 
 
 
 
 
 
 
 
219
  def extract_required_skills_from_jd(jd_text: str) -> dict:
220
- """Return JD skills with mention-frequency as an importance signal."""
221
- skills = extract_skills(jd_text)
222
  cleaned = clean_text(jd_text)
223
- return {skill: cleaned.count(skill) for skill in skills}
 
224
 
225
 
226
  def extract_resume_skills(resume_text: str) -> set:
@@ -228,14 +258,14 @@ def extract_resume_skills(resume_text: str) -> set:
228
 
229
 
230
  def find_missing_skills(resume_text: str, jd_text: str) -> list:
231
- jd_skills = set(extract_required_skills_from_jd(jd_text).keys())
232
- resume_skills = extract_resume_skills(resume_text) # already alias-expanded
233
- return [s for s in jd_skills if s not in resume_skills]
234
 
235
 
236
  def calculate_skill_overlap(resume_text: str, jd_text: str) -> float:
237
- jd_skills = set(extract_required_skills_from_jd(jd_text).keys())
238
- resume_skills = extract_resume_skills(resume_text) # already alias-expanded
239
  if not jd_skills:
240
  return 0.0
241
  return round(len(jd_skills & resume_skills) / len(jd_skills) * 100, 2)
 
181
  # ---------------------------------------------------------------------------
182
  SKILL_ALIASES: dict[str, str] = {
183
  # ML / AI shorthands
184
+ "ml": "machine learning",
185
+ "ai": "machine learning",
186
+ "dl": "deep learning",
187
+ "cv": "computer vision",
188
+ "nlp": "natural language processing",
189
+ "gen ai": "generative ai",
190
+ "llm": "large language model",
191
+ "sklearn": "scikit-learn",
192
+ "scikit learn": "scikit-learn",
193
  # API plurals / variants — chain: fastapi/rest apis → rest api → api
194
+ "apis": "api",
195
+ "rest apis": "rest api",
196
+ "restful apis": "restful api",
197
+ "rest api": "api",
198
+ "restful api": "api",
199
+ "api gateway": "api",
200
+ "fastapi": "api",
201
+ "flask": "api",
202
+ "grpc": "api",
203
+ "graphql": "api",
204
+ # Infra / data aliases
205
+ "k8s": "kubernetes",
206
+ "postgres": "postgresql",
207
+ "mongo": "mongodb",
208
+ "hf": "huggingface",
209
+ "hugging face": "huggingface",
210
+ "aws": "amazon web services",
211
+ "gcp": "google cloud",
212
+ "ci cd": "ci/cd",
213
+ "node": "javascript",
214
+ "nodejs": "javascript",
215
+ "js": "javascript",
216
+ "ts": "typescript",
217
+ "py": "python",
218
  }
219
 
220
 
 
236
  return expanded
237
 
238
 
239
+ def _canonicalize_jd_frequencies(freq: dict[str, int]) -> dict[str, int]:
240
+ """Merge JD skill counts onto alias-expanded canonical terms."""
241
+ canonical: dict[str, int] = {}
242
+ for skill, count in freq.items():
243
+ for term in _expand_with_aliases({skill}):
244
+ canonical[term] = canonical.get(term, 0) + count
245
+ return canonical
246
+
247
+
248
  def extract_required_skills_from_jd(jd_text: str) -> dict:
249
+ """Return JD skills (alias-expanded) with mention-frequency as importance."""
250
+ raw = extract_skills(jd_text)
251
  cleaned = clean_text(jd_text)
252
+ freq = {skill: cleaned.count(skill) for skill in raw}
253
+ return _canonicalize_jd_frequencies(freq)
254
 
255
 
256
  def extract_resume_skills(resume_text: str) -> set:
 
258
 
259
 
260
  def find_missing_skills(resume_text: str, jd_text: str) -> list:
261
+ jd_skills = set(extract_required_skills_from_jd(jd_text).keys())
262
+ resume_skills = extract_resume_skills(resume_text)
263
+ return sorted(s for s in jd_skills if s not in resume_skills)
264
 
265
 
266
  def calculate_skill_overlap(resume_text: str, jd_text: str) -> float:
267
+ jd_skills = set(extract_required_skills_from_jd(jd_text).keys())
268
+ resume_skills = extract_resume_skills(resume_text)
269
  if not jd_skills:
270
  return 0.0
271
  return round(len(jd_skills & resume_skills) / len(jd_skills) * 100, 2)