Siggmoid Cursor committed on
Commit
02e400a
·
1 Parent(s): 2717aab

Tune keyword_match: raw-text chunking and 45/55 score weights

Browse files
Files changed (1) hide show
  1. utilities/keyword_match.py +24 -21
utilities/keyword_match.py CHANGED
@@ -88,8 +88,8 @@ def calibrate_semantic_score(cosine: float) -> float:
88
  """
89
  Map raw cosine similarity to a 0–100 ATS-style scale.
90
 
91
- MPNet/MiniLM cosine for related resume/JD pairs usually sits in ~0.35–0.82,
92
- not 0.9+, so raw cosine understates good matches without calibration.
93
  """
94
  cosine = float(np.clip(cosine, 0.0, 1.0))
95
  low, high = 0.20, 0.78
@@ -165,30 +165,31 @@ def semantic_match_score(resume_text: str, jd_text: str) -> float:
165
 
166
  Raw cosine is calibrated to a more intuitive 0–100 scale.
167
  """
168
- resume_clean = clean_text(resume_text)
169
- jd_clean = clean_text(jd_text)
170
-
171
- if not resume_clean or not jd_clean:
172
  return 0.0
173
 
174
- resume_doc = truncate_text(resume_clean)
175
- jd_doc = truncate_text(jd_clean)
176
 
177
- doc_emb = model.encode([resume_doc, jd_doc], convert_to_numpy=True)
178
  full_sim = _pairwise_cosine(doc_emb[0], doc_emb[1])
179
 
180
- resume_chunks = split_into_chunks(resume_clean)
181
- jd_chunks = split_into_chunks(jd_clean)
 
182
  if len(resume_chunks) > 1 and len(jd_chunks) > 1:
183
  chunk_sim = _chunk_bidirectional_score(resume_chunks, jd_chunks)
184
  else:
185
  chunk_sim = full_sim
186
 
187
- resume_skills = extract_skill_sentences(resume_clean)
188
- jd_skills = extract_skill_sentences(jd_clean)
189
- if resume_skills and jd_skills:
 
190
  skill_emb = model.encode(
191
- [truncate_text(resume_skills, 4000), truncate_text(jd_skills, 4000)],
192
  convert_to_numpy=True,
193
  )
194
  skill_sim = _pairwise_cosine(skill_emb[0], skill_emb[1])
@@ -234,19 +235,21 @@ def experience_level_penalty(resume_text: str, jd_text: str) -> float:
234
  def final_ats_score(resume_text: str, jd_text: str) -> dict:
235
  """
236
  Composite ATS score weighted as:
237
- 60% semantic similarity (contextual understanding)
238
- 40% keyword match (skill taxonomy match, frequency-weighted)
239
 
240
- A seniority mismatch penalty (0–10 pts) is subtracted from the final score.
 
 
241
 
242
- Returns a dict compatible with ScoreResponse schema.
243
  """
244
  semantic = semantic_match_score(resume_text, jd_text)
245
  keyword = keyword_match_score(resume_text, jd_text)
246
  penalty = experience_level_penalty(resume_text, jd_text)
247
 
248
- raw_score = round(0.6 * semantic + 0.4 * keyword, 2)
249
- final = round(max(0.0, raw_score - penalty), 2)
250
 
251
  return {
252
  "semantic_score": round(semantic, 2),
 
88
  """
89
  Map raw cosine similarity to a 0–100 ATS-style scale.
90
 
91
+ MPNet cosine for resume/JD pairs sits roughly in 0.20–0.78.
92
+ Floor at 0.20 (not 0.32) so valid weak matches aren't clamped to zero.
93
  """
94
  cosine = float(np.clip(cosine, 0.0, 1.0))
95
  low, high = 0.20, 0.78
 
165
 
166
  Raw cosine is calibrated to a more intuitive 0–100 scale.
167
  """
168
+ # Keep raw text for chunking (needs \n, ., ? boundaries intact)
169
+ # Only clean for the full-doc embedding where punctuation adds no value
170
+ if not resume_text.strip() or not jd_text.strip():
 
171
  return 0.0
172
 
173
+ resume_doc = truncate_text(clean_text(resume_text))
174
+ jd_doc = truncate_text(clean_text(jd_text))
175
 
176
+ doc_emb = model.encode([resume_doc, jd_doc], convert_to_numpy=True)
177
  full_sim = _pairwise_cosine(doc_emb[0], doc_emb[1])
178
 
179
+ # Chunk on RAW text so sentence/line boundaries are preserved
180
+ resume_chunks = split_into_chunks(resume_text)
181
+ jd_chunks = split_into_chunks(jd_text)
182
  if len(resume_chunks) > 1 and len(jd_chunks) > 1:
183
  chunk_sim = _chunk_bidirectional_score(resume_chunks, jd_chunks)
184
  else:
185
  chunk_sim = full_sim
186
 
187
+ # Skill sentences — also extract from raw then clean per-segment (done inside)
188
+ resume_skills_text = extract_skill_sentences(resume_text)
189
+ jd_skills_text = extract_skill_sentences(jd_text)
190
+ if resume_skills_text and jd_skills_text:
191
  skill_emb = model.encode(
192
+ [truncate_text(resume_skills_text, 4000), truncate_text(jd_skills_text, 4000)],
193
  convert_to_numpy=True,
194
  )
195
  skill_sim = _pairwise_cosine(skill_emb[0], skill_emb[1])
 
235
  def final_ats_score(resume_text: str, jd_text: str) -> dict:
236
  """
237
  Composite ATS score weighted as:
238
+ 45% semantic similarity (contextual understanding)
239
+ 55% keyword match (skill taxonomy match, frequency-weighted)
240
 
241
+ Keyword is weighted slightly higher because it is deterministic and
242
+ directly reflects JD skill requirements. Semantic rewards phrasing
243
+ quality but shouldn't dominate when skills strongly match.
244
 
245
+ A seniority mismatch penalty (0–10 pts) is subtracted from the final score.
246
  """
247
  semantic = semantic_match_score(resume_text, jd_text)
248
  keyword = keyword_match_score(resume_text, jd_text)
249
  penalty = experience_level_penalty(resume_text, jd_text)
250
 
251
+ raw_score = round(0.45 * semantic + 0.55 * keyword, 2)
252
+ final = round(max(0.0, raw_score - penalty), 2)
253
 
254
  return {
255
  "semantic_score": round(semantic, 2),