Siggmoid committed on
Commit
dfc11de
·
1 Parent(s): 9c72612

add more skill abbreviation

Browse files
Files changed (1) hide show
  1. utilities/skills.py +55 -8
utilities/skills.py CHANGED
@@ -28,9 +28,10 @@ FRONTEND = {
28
 
29
  # --- APIs & Architecture ---
30
  API_ARCH = {
31
- "rest api", "restful api", "graphql", "grpc", "websocket",
 
32
  "microservices", "microservice", "event driven", "message queue",
33
- "api gateway", "api",
34
  }
35
 
36
  # --- Databases ---
@@ -60,6 +61,8 @@ ML_AI = {
60
  "data analysis", "data analytics", "data science",
61
  "statistical analysis", "statistics", "probability",
62
  "a/b testing", "hypothesis testing",
 
 
63
  }
64
 
65
  # --- Data Engineering ---
@@ -153,9 +156,12 @@ def extract_skills(text: str) -> set:
153
  break
154
  end = idx + len(skill)
155
 
156
- # Boundary check — skill must be a whole word / phrase
 
157
  before_ok = (idx == 0 or cleaned[idx - 1] == ' ')
158
- after_ok = (end == len(cleaned) or cleaned[end] == ' ')
 
 
159
 
160
  if before_ok and after_ok:
161
  span = set(range(idx, end))
@@ -169,6 +175,47 @@ def extract_skills(text: str) -> set:
169
  return found
170
 
171
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  def extract_required_skills_from_jd(jd_text: str) -> dict:
173
  """Return JD skills with mention-frequency as an importance signal."""
174
  skills = extract_skills(jd_text)
@@ -177,18 +224,18 @@ def extract_required_skills_from_jd(jd_text: str) -> dict:
177
 
178
 
179
  def extract_resume_skills(resume_text: str) -> set:
180
- return extract_skills(resume_text)
181
 
182
 
183
  def find_missing_skills(resume_text: str, jd_text: str) -> list:
184
- jd_skills = set(extract_required_skills_from_jd(jd_text).keys())
185
- resume_skills = extract_resume_skills(resume_text)
186
  return [s for s in jd_skills if s not in resume_skills]
187
 
188
 
189
  def calculate_skill_overlap(resume_text: str, jd_text: str) -> float:
190
  jd_skills = set(extract_required_skills_from_jd(jd_text).keys())
191
- resume_skills = extract_resume_skills(resume_text)
192
  if not jd_skills:
193
  return 0.0
194
  return round(len(jd_skills & resume_skills) / len(jd_skills) * 100, 2)
 
28
 
29
  # --- APIs & Architecture ---
30
  API_ARCH = {
31
+ "rest api", "restful api", "rest apis", "restful apis",
32
+ "graphql", "grpc", "websocket",
33
  "microservices", "microservice", "event driven", "message queue",
34
+ "api gateway", "api", "apis",
35
  }
36
 
37
  # --- Databases ---
 
61
  "data analysis", "data analytics", "data science",
62
  "statistical analysis", "statistics", "probability",
63
  "a/b testing", "hypothesis testing",
64
+ # Common abbreviations / shorthands
65
+ "ml", "ai", "dl", "cv",
66
  }
67
 
68
  # --- Data Engineering ---
 
156
  break
157
  end = idx + len(skill)
158
 
159
+ # Boundary check — skill must start and end on a word boundary.
160
+ # Allow a trailing 's' so "rest apis" matches "rest api" etc.
161
  before_ok = (idx == 0 or cleaned[idx - 1] == ' ')
162
+ after_char = cleaned[end] if end < len(cleaned) else ' '
163
+ after_ok = (after_char == ' ' or after_char == 's' and
164
+ (end + 1 == len(cleaned) or cleaned[end + 1] == ' '))
165
 
166
  if before_ok and after_ok:
167
  span = set(range(idx, end))
 
175
  return found
176
 
177
 
178
+ # ---------------------------------------------------------------------------
179
+ # Alias map — resume variant → canonical JD term
180
+ # If a resume has any alias, it counts as having the canonical skill.
181
+ # ---------------------------------------------------------------------------
182
+ SKILL_ALIASES: dict[str, str] = {
183
+ # ML / AI shorthands
184
+ "ml": "machine learning",
185
+ "ai": "machine learning",
186
+ "dl": "deep learning",
187
+ "cv": "computer vision",
188
+ # API plurals / variants — chain: fastapi/rest apis → rest api → api
189
+ "apis": "api",
190
+ "rest apis": "rest api",
191
+ "restful apis": "restful api",
192
+ "rest api": "api",
193
+ "restful api": "api",
194
+ "api gateway": "api",
195
+ "fastapi": "api",
196
+ "grpc": "api",
197
+ "graphql": "api",
198
+ }
199
+
200
+
201
+ def _expand_with_aliases(skills: set) -> set:
202
+ """
203
+ Transitively expand aliases until no new terms are added.
204
+ e.g. fastapi → api; rest apis → rest api → api
205
+ """
206
+ expanded = set(skills)
207
+ while True:
208
+ new_terms = {
209
+ SKILL_ALIASES[s]
210
+ for s in expanded
211
+ if s in SKILL_ALIASES and SKILL_ALIASES[s] not in expanded
212
+ }
213
+ if not new_terms:
214
+ break
215
+ expanded |= new_terms
216
+ return expanded
217
+
218
+
219
  def extract_required_skills_from_jd(jd_text: str) -> dict:
220
  """Return JD skills with mention-frequency as an importance signal."""
221
  skills = extract_skills(jd_text)
 
224
 
225
 
226
  def extract_resume_skills(resume_text: str) -> set:
227
+ return _expand_with_aliases(extract_skills(resume_text))
228
 
229
 
230
  def find_missing_skills(resume_text: str, jd_text: str) -> list:
231
+ jd_skills = set(extract_required_skills_from_jd(jd_text).keys())
232
+ resume_skills = extract_resume_skills(resume_text) # already alias-expanded
233
  return [s for s in jd_skills if s not in resume_skills]
234
 
235
 
236
  def calculate_skill_overlap(resume_text: str, jd_text: str) -> float:
237
  jd_skills = set(extract_required_skills_from_jd(jd_text).keys())
238
+ resume_skills = extract_resume_skills(resume_text) # already alias-expanded
239
  if not jd_skills:
240
  return 0.0
241
  return round(len(jd_skills & resume_skills) / len(jd_skills) * 100, 2)