Tune keyword_match: raw-text chunking and 45/55 score weights
Browse files- utilities/keyword_match.py +24 -21
utilities/keyword_match.py
CHANGED
|
@@ -88,8 +88,8 @@ def calibrate_semantic_score(cosine: float) -> float:
|
|
| 88 |
"""
|
| 89 |
Map raw cosine similarity to a 0–100 ATS-style scale.
|
| 90 |
|
| 91 |
-
MPNet
|
| 92 |
-
not 0.
|
| 93 |
"""
|
| 94 |
cosine = float(np.clip(cosine, 0.0, 1.0))
|
| 95 |
low, high = 0.20, 0.78
|
|
@@ -165,30 +165,31 @@ def semantic_match_score(resume_text: str, jd_text: str) -> float:
|
|
| 165 |
|
| 166 |
Raw cosine is calibrated to a more intuitive 0–100 scale.
|
| 167 |
"""
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
if not resume_clean or not jd_clean:
|
| 172 |
return 0.0
|
| 173 |
|
| 174 |
-
resume_doc = truncate_text(
|
| 175 |
-
jd_doc
|
| 176 |
|
| 177 |
-
doc_emb
|
| 178 |
full_sim = _pairwise_cosine(doc_emb[0], doc_emb[1])
|
| 179 |
|
| 180 |
-
|
| 181 |
-
|
|
|
|
| 182 |
if len(resume_chunks) > 1 and len(jd_chunks) > 1:
|
| 183 |
chunk_sim = _chunk_bidirectional_score(resume_chunks, jd_chunks)
|
| 184 |
else:
|
| 185 |
chunk_sim = full_sim
|
| 186 |
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
|
|
|
| 190 |
skill_emb = model.encode(
|
| 191 |
-
[truncate_text(
|
| 192 |
convert_to_numpy=True,
|
| 193 |
)
|
| 194 |
skill_sim = _pairwise_cosine(skill_emb[0], skill_emb[1])
|
|
@@ -234,19 +235,21 @@ def experience_level_penalty(resume_text: str, jd_text: str) -> float:
|
|
| 234 |
def final_ats_score(resume_text: str, jd_text: str) -> dict:
|
| 235 |
"""
|
| 236 |
Composite ATS score weighted as:
|
| 237 |
-
|
| 238 |
-
|
| 239 |
|
| 240 |
-
|
|
|
|
|
|
|
| 241 |
|
| 242 |
-
|
| 243 |
"""
|
| 244 |
semantic = semantic_match_score(resume_text, jd_text)
|
| 245 |
keyword = keyword_match_score(resume_text, jd_text)
|
| 246 |
penalty = experience_level_penalty(resume_text, jd_text)
|
| 247 |
|
| 248 |
-
raw_score
|
| 249 |
-
final
|
| 250 |
|
| 251 |
return {
|
| 252 |
"semantic_score": round(semantic, 2),
|
|
|
|
| 88 |
"""
|
| 89 |
Map raw cosine similarity to a 0–100 ATS-style scale.
|
| 90 |
|
| 91 |
+
MPNet cosine for resume/JD pairs sits roughly in 0.20–0.78.
|
| 92 |
+
Floor at 0.20 (not 0.32) so valid weak matches aren't clamped to zero.
|
| 93 |
"""
|
| 94 |
cosine = float(np.clip(cosine, 0.0, 1.0))
|
| 95 |
low, high = 0.20, 0.78
|
|
|
|
| 165 |
|
| 166 |
Raw cosine is calibrated to a more intuitive 0–100 scale.
|
| 167 |
"""
|
| 168 |
+
# Keep raw text for chunking (needs \n, ., ? boundaries intact)
|
| 169 |
+
# Only clean for the full-doc embedding where punctuation adds no value
|
| 170 |
+
if not resume_text.strip() or not jd_text.strip():
|
|
|
|
| 171 |
return 0.0
|
| 172 |
|
| 173 |
+
resume_doc = truncate_text(clean_text(resume_text))
|
| 174 |
+
jd_doc = truncate_text(clean_text(jd_text))
|
| 175 |
|
| 176 |
+
doc_emb = model.encode([resume_doc, jd_doc], convert_to_numpy=True)
|
| 177 |
full_sim = _pairwise_cosine(doc_emb[0], doc_emb[1])
|
| 178 |
|
| 179 |
+
# Chunk on RAW text so sentence/line boundaries are preserved
|
| 180 |
+
resume_chunks = split_into_chunks(resume_text)
|
| 181 |
+
jd_chunks = split_into_chunks(jd_text)
|
| 182 |
if len(resume_chunks) > 1 and len(jd_chunks) > 1:
|
| 183 |
chunk_sim = _chunk_bidirectional_score(resume_chunks, jd_chunks)
|
| 184 |
else:
|
| 185 |
chunk_sim = full_sim
|
| 186 |
|
| 187 |
+
# Skill sentences — also extract from raw then clean per-segment (done inside)
|
| 188 |
+
resume_skills_text = extract_skill_sentences(resume_text)
|
| 189 |
+
jd_skills_text = extract_skill_sentences(jd_text)
|
| 190 |
+
if resume_skills_text and jd_skills_text:
|
| 191 |
skill_emb = model.encode(
|
| 192 |
+
[truncate_text(resume_skills_text, 4000), truncate_text(jd_skills_text, 4000)],
|
| 193 |
convert_to_numpy=True,
|
| 194 |
)
|
| 195 |
skill_sim = _pairwise_cosine(skill_emb[0], skill_emb[1])
|
|
|
|
| 235 |
def final_ats_score(resume_text: str, jd_text: str) -> dict:
|
| 236 |
"""
|
| 237 |
Composite ATS score weighted as:
|
| 238 |
+
45% semantic similarity (contextual understanding)
|
| 239 |
+
55% keyword match (skill taxonomy match, frequency-weighted)
|
| 240 |
|
| 241 |
+
Keyword is weighted slightly higher because it is deterministic and
|
| 242 |
+
directly reflects JD skill requirements. Semantic rewards phrasing
|
| 243 |
+
quality but shouldn't dominate when skills strongly match.
|
| 244 |
|
| 245 |
+
A seniority mismatch penalty (0–10 pts) is subtracted from the final score.
|
| 246 |
"""
|
| 247 |
semantic = semantic_match_score(resume_text, jd_text)
|
| 248 |
keyword = keyword_match_score(resume_text, jd_text)
|
| 249 |
penalty = experience_level_penalty(resume_text, jd_text)
|
| 250 |
|
| 251 |
+
raw_score = round(0.45 * semantic + 0.55 * keyword, 2)
|
| 252 |
+
final = round(max(0.0, raw_score - penalty), 2)
|
| 253 |
|
| 254 |
return {
|
| 255 |
"semantic_score": round(semantic, 2),
|