Update scoring: MS MARCO embeddings and skill-centric semantic matching
Browse files
- Dockerfile +1 -1
- utilities/keyword_match.py +48 -31
Dockerfile
CHANGED
|
@@ -23,7 +23,7 @@ COPY --chown=user requirements.txt requirements.txt
|
|
| 23 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 24 |
|
| 25 |
# Pre-download embedding model at build time (faster Space cold start)
|
| 26 |
-
RUN su - user -c "python -c \"from sentence_transformers import SentenceTransformer; SentenceTransformer('
|
| 27 |
|
| 28 |
COPY --chown=user . /app
|
| 29 |
|
|
|
|
| 23 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 24 |
|
| 25 |
# Pre-download embedding model at build time (faster Space cold start)
|
| 26 |
+
RUN su - user -c "python -c \"from sentence_transformers import SentenceTransformer; SentenceTransformer('msmarco-distilbert-base-v4')\""
|
| 27 |
|
| 28 |
COPY --chown=user . /app
|
| 29 |
|
utilities/keyword_match.py
CHANGED
|
@@ -12,7 +12,7 @@ from utilities.skills import (
|
|
| 12 |
)
|
| 13 |
|
| 14 |
# MPNet is stronger than MiniLM for long-form resume/JD similarity.
|
| 15 |
-
SEMANTIC_MODEL_ID = os.getenv("SEMANTIC_MODEL", "
|
| 16 |
MAX_DOC_CHARS = 8000
|
| 17 |
MAX_CHUNKS = 24
|
| 18 |
MIN_CHUNK_CHARS = 35
|
|
@@ -88,8 +88,8 @@ def calibrate_semantic_score(cosine: float) -> float:
|
|
| 88 |
"""
|
| 89 |
Map raw cosine similarity to a 0–100 ATS-style scale.
|
| 90 |
|
| 91 |
-
MPNet cosine for resume/JD pairs sits
|
| 92 |
-
|
| 93 |
"""
|
| 94 |
cosine = float(np.clip(cosine, 0.0, 1.0))
|
| 95 |
low, high = 0.20, 0.78
|
|
@@ -154,29 +154,47 @@ def keyword_match_score(resume_text: str, jd_text: str) -> float:
|
|
| 154 |
return round(matched_weight / total_weight * 100, 2)
|
| 155 |
|
| 156 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
def semantic_match_score(resume_text: str, jd_text: str) -> float:
|
| 158 |
"""
|
| 159 |
-
Semantic similarity tuned for resume
|
| 160 |
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
2. Chunk-level bi-directional match (handles length mismatch)
|
| 164 |
-
3. Skill-heavy lines (secondary boost)
|
| 165 |
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
| 167 |
"""
|
| 168 |
-
# Keep raw text for chunking (needs \n, ., ? boundaries intact)
|
| 169 |
-
# Only clean for the full-doc embedding where punctuation adds no value
|
| 170 |
if not resume_text.strip() or not jd_text.strip():
|
| 171 |
return 0.0
|
| 172 |
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
doc_emb
|
| 177 |
-
full_sim
|
| 178 |
|
| 179 |
-
#
|
| 180 |
resume_chunks = split_into_chunks(resume_text)
|
| 181 |
jd_chunks = split_into_chunks(jd_text)
|
| 182 |
if len(resume_chunks) > 1 and len(jd_chunks) > 1:
|
|
@@ -184,19 +202,20 @@ def semantic_match_score(resume_text: str, jd_text: str) -> float:
|
|
| 184 |
else:
|
| 185 |
chunk_sim = full_sim
|
| 186 |
|
| 187 |
-
#
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
if
|
| 191 |
skill_emb = model.encode(
|
| 192 |
-
[truncate_text(
|
|
|
|
| 193 |
convert_to_numpy=True,
|
| 194 |
)
|
| 195 |
skill_sim = _pairwise_cosine(skill_emb[0], skill_emb[1])
|
| 196 |
else:
|
| 197 |
skill_sim = full_sim
|
| 198 |
|
| 199 |
-
raw_cosine = 0.
|
| 200 |
return calibrate_semantic_score(raw_cosine)
|
| 201 |
|
| 202 |
|
|
@@ -235,21 +254,19 @@ def experience_level_penalty(resume_text: str, jd_text: str) -> float:
|
|
| 235 |
def final_ats_score(resume_text: str, jd_text: str) -> dict:
|
| 236 |
"""
|
| 237 |
Composite ATS score weighted as:
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
Keyword is weighted slightly higher because it is deterministic and
|
| 242 |
-
directly reflects JD skill requirements. Semantic rewards phrasing
|
| 243 |
-
quality but shouldn't dominate when skills strongly match.
|
| 244 |
|
| 245 |
A seniority mismatch penalty (0–10 pts) is subtracted from the final score.
|
|
|
|
|
|
|
| 246 |
"""
|
| 247 |
semantic = semantic_match_score(resume_text, jd_text)
|
| 248 |
keyword = keyword_match_score(resume_text, jd_text)
|
| 249 |
penalty = experience_level_penalty(resume_text, jd_text)
|
| 250 |
|
| 251 |
-
raw_score
|
| 252 |
-
final
|
| 253 |
|
| 254 |
return {
|
| 255 |
"semantic_score": round(semantic, 2),
|
|
|
|
| 12 |
)
|
| 13 |
|
| 14 |
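# Default embedding model is MS MARCO DistilBERT (override via the SEMANTIC_MODEL env var).
# Earlier note, written for the previous default: MPNet is stronger than MiniLM for long-form resume/JD similarity.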
|
| 15 |
+
SEMANTIC_MODEL_ID = os.getenv("SEMANTIC_MODEL", "msmarco-distilbert-base-v4")
|
| 16 |
MAX_DOC_CHARS = 8000
|
| 17 |
MAX_CHUNKS = 24
|
| 18 |
MIN_CHUNK_CHARS = 35
|
|
|
|
| 88 |
"""
|
| 89 |
Map raw cosine similarity to a 0–100 ATS-style scale.
|
| 90 |
|
| 91 |
+
MPNet/MiniLM cosine for related resume/JD pairs usually sits in ~0.35–0.82,
|
| 92 |
+
not 0.9+, so raw cosine understates good matches without calibration.
|
| 93 |
"""
|
| 94 |
cosine = float(np.clip(cosine, 0.0, 1.0))
|
| 95 |
low, high = 0.20, 0.78
|
|
|
|
| 154 |
return round(matched_weight / total_weight * 100, 2)
|
| 155 |
|
| 156 |
|
| 157 |
+
def _normalize_for_embedding(text: str) -> str:
|
| 158 |
+
"""
|
| 159 |
+
Convert resume or JD into a neutral skill-centric representation.
|
| 160 |
+
|
| 161 |
+
Problem: resumes use first-person achievement language; JDs use
|
| 162 |
+
third-person requirement language. A general-purpose model sees these
|
| 163 |
+
as stylistically distant (cosine ~0.40) even when skills match perfectly.
|
| 164 |
+
|
| 165 |
+
Fix: extract skills + skill-heavy sentences and represent both docs
|
| 166 |
+
in the same "skills: X Y Z context: ..." format so the model compares
|
| 167 |
+
skill vocabulary, not writing style.
|
| 168 |
+
"""
|
| 169 |
+
cleaned = clean_text(text)
|
| 170 |
+
extracted_skills = extract_resume_skills(cleaned)
|
| 171 |
+
skill_list = " ".join(sorted(extracted_skills))
|
| 172 |
+
skill_context = clean_text(extract_skill_sentences(text))
|
| 173 |
+
return f"skills: {skill_list} context: {skill_context}"
|
| 174 |
+
|
| 175 |
+
|
| 176 |
def semantic_match_score(resume_text: str, jd_text: str) -> float:
|
| 177 |
"""
|
| 178 |
+
Semantic similarity tuned for resume <-> JD alignment.
|
| 179 |
|
| 180 |
+
Both documents are normalised into skill-centric representations
|
| 181 |
+
before embedding so the model compares skill overlap, not writing style.
|
|
|
|
|
|
|
| 182 |
|
| 183 |
+
Combines:
|
| 184 |
+
1. Normalised full-doc embedding (50%) - fixes style mismatch
|
| 185 |
+
2. Chunk-level bi-directional on RAW text (35%) - preserves sentence boundaries
|
| 186 |
+
3. Skill-sentences-only embedding (15%) - fine-grained skill context
|
| 187 |
"""
|
|
|
|
|
|
|
| 188 |
if not resume_text.strip() or not jd_text.strip():
|
| 189 |
return 0.0
|
| 190 |
|
| 191 |
+
# Signal 1: normalised doc (style-agnostic skill comparison)
|
| 192 |
+
resume_norm = _normalize_for_embedding(resume_text)
|
| 193 |
+
jd_norm = _normalize_for_embedding(jd_text)
|
| 194 |
+
doc_emb = model.encode([resume_norm, jd_norm], convert_to_numpy=True)
|
| 195 |
+
full_sim = _pairwise_cosine(doc_emb[0], doc_emb[1])
|
| 196 |
|
| 197 |
+
# Signal 2: chunk-level on RAW text (needs \n/. boundaries intact)
|
| 198 |
resume_chunks = split_into_chunks(resume_text)
|
| 199 |
jd_chunks = split_into_chunks(jd_text)
|
| 200 |
if len(resume_chunks) > 1 and len(jd_chunks) > 1:
|
|
|
|
| 202 |
else:
|
| 203 |
chunk_sim = full_sim
|
| 204 |
|
| 205 |
+
# Signal 3: skill-sentence embedding
|
| 206 |
+
resume_skill_text = extract_skill_sentences(resume_text)
|
| 207 |
+
jd_skill_text = extract_skill_sentences(jd_text)
|
| 208 |
+
if resume_skill_text and jd_skill_text:
|
| 209 |
skill_emb = model.encode(
|
| 210 |
+
[truncate_text(resume_skill_text, 4000),
|
| 211 |
+
truncate_text(jd_skill_text, 4000)],
|
| 212 |
convert_to_numpy=True,
|
| 213 |
)
|
| 214 |
skill_sim = _pairwise_cosine(skill_emb[0], skill_emb[1])
|
| 215 |
else:
|
| 216 |
skill_sim = full_sim
|
| 217 |
|
| 218 |
+
raw_cosine = 0.50 * full_sim + 0.35 * chunk_sim + 0.15 * skill_sim
|
| 219 |
return calibrate_semantic_score(raw_cosine)
|
| 220 |
|
| 221 |
|
|
|
|
| 254 |
def final_ats_score(resume_text: str, jd_text: str) -> dict:
|
| 255 |
"""
|
| 256 |
Composite ATS score weighted as:
|
| 257 |
+
45% semantic similarity (contextual understanding)
|
| 258 |
+
55% keyword match (skill taxonomy match, frequency-weighted)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
|
| 260 |
A seniority mismatch penalty (0–10 pts) is subtracted from the final score.
|
| 261 |
+
|
| 262 |
+
Returns a dict compatible with ScoreResponse schema.
|
| 263 |
"""
|
| 264 |
semantic = semantic_match_score(resume_text, jd_text)
|
| 265 |
keyword = keyword_match_score(resume_text, jd_text)
|
| 266 |
penalty = experience_level_penalty(resume_text, jd_text)
|
| 267 |
|
| 268 |
+
raw_score = round(0.45 * semantic + 0.55 * keyword, 2)
|
| 269 |
+
final = round(max(0.0, raw_score - penalty), 2)
|
| 270 |
|
| 271 |
return {
|
| 272 |
"semantic_score": round(semantic, 2),
|