Siggmoid Cursor committed on
Commit
1905876
·
1 Parent(s): 02e400a

Update scoring: MS MARCO embeddings and skill-centric semantic matching

Browse files
Files changed (2) hide show
  1. Dockerfile +1 -1
  2. utilities/keyword_match.py +48 -31
Dockerfile CHANGED
@@ -23,7 +23,7 @@ COPY --chown=user requirements.txt requirements.txt
23
  RUN pip install --no-cache-dir -r requirements.txt
24
 
25
  # Pre-download embedding model at build time (faster Space cold start)
26
- RUN su - user -c "python -c \"from sentence_transformers import SentenceTransformer; SentenceTransformer('all-mpnet-base-v2')\""
27
 
28
  COPY --chown=user . /app
29
 
 
23
  RUN pip install --no-cache-dir -r requirements.txt
24
 
25
  # Pre-download embedding model at build time (faster Space cold start)
26
+ RUN su - user -c "python -c \"from sentence_transformers import SentenceTransformer; SentenceTransformer('msmarco-distilbert-base-v4')\""
27
 
28
  COPY --chown=user . /app
29
 
utilities/keyword_match.py CHANGED
@@ -12,7 +12,7 @@ from utilities.skills import (
12
  )
13
 
14
  # MPNet is stronger than MiniLM for long-form resume/JD similarity.
15
- SEMANTIC_MODEL_ID = os.getenv("SEMANTIC_MODEL", "all-mpnet-base-v2")
16
  MAX_DOC_CHARS = 8000
17
  MAX_CHUNKS = 24
18
  MIN_CHUNK_CHARS = 35
@@ -88,8 +88,8 @@ def calibrate_semantic_score(cosine: float) -> float:
88
  """
89
  Map raw cosine similarity to a 0–100 ATS-style scale.
90
 
91
- MPNet cosine for resume/JD pairs sits roughly in 0.20–0.78.
92
- Floor at 0.20 (not 0.32) so valid weak matches aren't clamped to zero.
93
  """
94
  cosine = float(np.clip(cosine, 0.0, 1.0))
95
  low, high = 0.20, 0.78
@@ -154,29 +154,47 @@ def keyword_match_score(resume_text: str, jd_text: str) -> float:
154
  return round(matched_weight / total_weight * 100, 2)
155
 
156
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  def semantic_match_score(resume_text: str, jd_text: str) -> float:
158
  """
159
- Semantic similarity tuned for resume ↔ JD alignment.
160
 
161
- Combines:
162
- 1. Full-document embedding (overall theme)
163
- 2. Chunk-level bi-directional match (handles length mismatch)
164
- 3. Skill-heavy lines (secondary boost)
165
 
166
- Raw cosine is calibrated to a more intuitive 0–100 scale.
 
 
 
167
  """
168
- # Keep raw text for chunking (needs \n, ., ? boundaries intact)
169
- # Only clean for the full-doc embedding where punctuation adds no value
170
  if not resume_text.strip() or not jd_text.strip():
171
  return 0.0
172
 
173
- resume_doc = truncate_text(clean_text(resume_text))
174
- jd_doc = truncate_text(clean_text(jd_text))
175
-
176
- doc_emb = model.encode([resume_doc, jd_doc], convert_to_numpy=True)
177
- full_sim = _pairwise_cosine(doc_emb[0], doc_emb[1])
178
 
179
- # Chunk on RAW text so sentence/line boundaries are preserved
180
  resume_chunks = split_into_chunks(resume_text)
181
  jd_chunks = split_into_chunks(jd_text)
182
  if len(resume_chunks) > 1 and len(jd_chunks) > 1:
@@ -184,19 +202,20 @@ def semantic_match_score(resume_text: str, jd_text: str) -> float:
184
  else:
185
  chunk_sim = full_sim
186
 
187
- # Skill sentences — also extract from raw then clean per-segment (done inside)
188
- resume_skills_text = extract_skill_sentences(resume_text)
189
- jd_skills_text = extract_skill_sentences(jd_text)
190
- if resume_skills_text and jd_skills_text:
191
  skill_emb = model.encode(
192
- [truncate_text(resume_skills_text, 4000), truncate_text(jd_skills_text, 4000)],
 
193
  convert_to_numpy=True,
194
  )
195
  skill_sim = _pairwise_cosine(skill_emb[0], skill_emb[1])
196
  else:
197
  skill_sim = full_sim
198
 
199
- raw_cosine = 0.30 * full_sim + 0.55 * chunk_sim + 0.15 * skill_sim
200
  return calibrate_semantic_score(raw_cosine)
201
 
202
 
@@ -235,21 +254,19 @@ def experience_level_penalty(resume_text: str, jd_text: str) -> float:
235
  def final_ats_score(resume_text: str, jd_text: str) -> dict:
236
  """
237
  Composite ATS score weighted as:
238
- 45% semantic similarity (contextual understanding)
239
- 55% keyword match (skill taxonomy match, frequency-weighted)
240
-
241
- Keyword is weighted slightly higher because it is deterministic and
242
- directly reflects JD skill requirements. Semantic rewards phrasing
243
- quality but shouldn't dominate when skills strongly match.
244
 
245
  A seniority mismatch penalty (0–10 pts) is subtracted from the final score.
 
 
246
  """
247
  semantic = semantic_match_score(resume_text, jd_text)
248
  keyword = keyword_match_score(resume_text, jd_text)
249
  penalty = experience_level_penalty(resume_text, jd_text)
250
 
251
- raw_score = round(0.45 * semantic + 0.55 * keyword, 2)
252
- final = round(max(0.0, raw_score - penalty), 2)
253
 
254
  return {
255
  "semantic_score": round(semantic, 2),
 
12
  )
13
 
14
  # MPNet is stronger than MiniLM for long-form resume/JD similarity.
15
+ SEMANTIC_MODEL_ID = os.getenv("SEMANTIC_MODEL", "msmarco-distilbert-base-v4")
16
  MAX_DOC_CHARS = 8000
17
  MAX_CHUNKS = 24
18
  MIN_CHUNK_CHARS = 35
 
88
  """
89
  Map raw cosine similarity to a 0–100 ATS-style scale.
90
 
91
+ MPNet/MiniLM cosine for related resume/JD pairs usually sits in ~0.35–0.82,
92
+ not 0.9+, so raw cosine understates good matches without calibration.
93
  """
94
  cosine = float(np.clip(cosine, 0.0, 1.0))
95
  low, high = 0.20, 0.78
 
154
  return round(matched_weight / total_weight * 100, 2)
155
 
156
 
157
+ def _normalize_for_embedding(text: str) -> str:
158
+ """
159
+ Convert resume or JD into a neutral skill-centric representation.
160
+
161
+ Problem: resumes use first-person achievement language; JDs use
162
+ third-person requirement language. A general-purpose model sees these
163
+ as stylistically distant (cosine ~0.40) even when skills match perfectly.
164
+
165
+ Fix: extract skills + skill-heavy sentences and represent both docs
166
+ in the same "skills: X Y Z context: ..." format so the model compares
167
+ skill vocabulary, not writing style.
168
+ """
169
+ cleaned = clean_text(text)
170
+ extracted_skills = extract_resume_skills(cleaned)
171
+ skill_list = " ".join(sorted(extracted_skills))
172
+ skill_context = clean_text(extract_skill_sentences(text))
173
+ return f"skills: {skill_list} context: {skill_context}"
174
+
175
+
176
  def semantic_match_score(resume_text: str, jd_text: str) -> float:
177
  """
178
+ Semantic similarity tuned for resume <-> JD alignment.
179
 
180
+ Both documents are normalised into skill-centric representations
181
+ before embedding so the model compares skill overlap, not writing style.
 
 
182
 
183
+ Combines:
184
+ 1. Normalised full-doc embedding (50%) - fixes style mismatch
185
+ 2. Chunk-level bi-directional on RAW text (35%) - preserves sentence boundaries
186
+ 3. Skill-sentences-only embedding (15%) - fine-grained skill context
187
  """
 
 
188
  if not resume_text.strip() or not jd_text.strip():
189
  return 0.0
190
 
191
+ # Signal 1: normalised doc (style-agnostic skill comparison)
192
+ resume_norm = _normalize_for_embedding(resume_text)
193
+ jd_norm = _normalize_for_embedding(jd_text)
194
+ doc_emb = model.encode([resume_norm, jd_norm], convert_to_numpy=True)
195
+ full_sim = _pairwise_cosine(doc_emb[0], doc_emb[1])
196
 
197
+ # Signal 2: chunk-level on RAW text (needs \n/. boundaries intact)
198
  resume_chunks = split_into_chunks(resume_text)
199
  jd_chunks = split_into_chunks(jd_text)
200
  if len(resume_chunks) > 1 and len(jd_chunks) > 1:
 
202
  else:
203
  chunk_sim = full_sim
204
 
205
+ # Signal 3: skill-sentence embedding
206
+ resume_skill_text = extract_skill_sentences(resume_text)
207
+ jd_skill_text = extract_skill_sentences(jd_text)
208
+ if resume_skill_text and jd_skill_text:
209
  skill_emb = model.encode(
210
+ [truncate_text(resume_skill_text, 4000),
211
+ truncate_text(jd_skill_text, 4000)],
212
  convert_to_numpy=True,
213
  )
214
  skill_sim = _pairwise_cosine(skill_emb[0], skill_emb[1])
215
  else:
216
  skill_sim = full_sim
217
 
218
+ raw_cosine = 0.50 * full_sim + 0.35 * chunk_sim + 0.15 * skill_sim
219
  return calibrate_semantic_score(raw_cosine)
220
 
221
 
 
254
  def final_ats_score(resume_text: str, jd_text: str) -> dict:
255
  """
256
  Composite ATS score weighted as:
257
+ 45% semantic similarity (contextual understanding)
258
+ 55% keyword match (skill taxonomy match, frequency-weighted)
 
 
 
 
259
 
260
  A seniority mismatch penalty (0–10 pts) is subtracted from the final score.
261
+
262
+ Returns a dict compatible with ScoreResponse schema.
263
  """
264
  semantic = semantic_match_score(resume_text, jd_text)
265
  keyword = keyword_match_score(resume_text, jd_text)
266
  penalty = experience_level_penalty(resume_text, jd_text)
267
 
268
+ raw_score = round(0.45 * semantic + 0.55 * keyword, 2)
269
+ final = round(max(0.0, raw_score - penalty), 2)
270
 
271
  return {
272
  "semantic_score": round(semantic, 2),