Spaces:

Siggmoid
/

ATS-Intelligence-Engine

Running

App Files Community

Siggmoid Cursor committed on 10 days ago

Commit

1905876

1 Parent(s): 02e400a

Update scoring: MS MARCO embeddings and skill-centric semantic matching

Browse files

Files changed (2) hide show

Dockerfile +1 -1
utilities/keyword_match.py +48 -31

Dockerfile CHANGED Viewed

@@ -23,7 +23,7 @@ COPY --chown=user requirements.txt requirements.txt
 RUN pip install --no-cache-dir -r requirements.txt
 # Pre-download embedding model at build time (faster Space cold start)
-RUN su - user -c "python -c \"from sentence_transformers import SentenceTransformer; SentenceTransformer('all-mpnet-base-v2')\""
 COPY --chown=user . /app

 RUN pip install --no-cache-dir -r requirements.txt
 # Pre-download embedding model at build time (faster Space cold start)
+RUN su - user -c "python -c \"from sentence_transformers import SentenceTransformer; SentenceTransformer('msmarco-distilbert-base-v4')\""
 COPY --chown=user . /app

utilities/keyword_match.py CHANGED Viewed

@@ -12,7 +12,7 @@ from utilities.skills import (
 )
 # MPNet is stronger than MiniLM for long-form resume/JD similarity.
-SEMANTIC_MODEL_ID = os.getenv("SEMANTIC_MODEL", "all-mpnet-base-v2")
 MAX_DOC_CHARS = 8000
 MAX_CHUNKS = 24
 MIN_CHUNK_CHARS = 35
@@ -88,8 +88,8 @@ def calibrate_semantic_score(cosine: float) -> float:
     """
     Map raw cosine similarity to a 0–100 ATS-style scale.
-    MPNet cosine for resume/JD pairs sits roughly in 0.20–0.78.
-    Floor at 0.20 (not 0.32) so valid weak matches aren't clamped to zero.
     """
     cosine = float(np.clip(cosine, 0.0, 1.0))
     low, high = 0.20, 0.78
@@ -154,29 +154,47 @@ def keyword_match_score(resume_text: str, jd_text: str) -> float:
     return round(matched_weight / total_weight * 100, 2)
 def semantic_match_score(resume_text: str, jd_text: str) -> float:
     """
-    Semantic similarity tuned for resume ↔ JD alignment.
-    Combines:
-    1. Full-document embedding (overall theme)
-    2. Chunk-level bi-directional match (handles length mismatch)
-    3. Skill-heavy lines (secondary boost)
-    Raw cosine is calibrated to a more intuitive 0–100 scale.
     """
-    # Keep raw text for chunking (needs \n, ., ? boundaries intact)
-    # Only clean for the full-doc embedding where punctuation adds no value
     if not resume_text.strip() or not jd_text.strip():
         return 0.0
-    resume_doc = truncate_text(clean_text(resume_text))
-    jd_doc     = truncate_text(clean_text(jd_text))
-    doc_emb  = model.encode([resume_doc, jd_doc], convert_to_numpy=True)
-    full_sim = _pairwise_cosine(doc_emb[0], doc_emb[1])
-    # Chunk on RAW text so sentence/line boundaries are preserved
     resume_chunks = split_into_chunks(resume_text)
     jd_chunks     = split_into_chunks(jd_text)
     if len(resume_chunks) > 1 and len(jd_chunks) > 1:
@@ -184,19 +202,20 @@ def semantic_match_score(resume_text: str, jd_text: str) -> float:
     else:
         chunk_sim = full_sim
-    # Skill sentences — also extract from raw then clean per-segment (done inside)
-    resume_skills_text = extract_skill_sentences(resume_text)
-    jd_skills_text     = extract_skill_sentences(jd_text)
-    if resume_skills_text and jd_skills_text:
         skill_emb = model.encode(
-            [truncate_text(resume_skills_text, 4000), truncate_text(jd_skills_text, 4000)],
             convert_to_numpy=True,
         )
         skill_sim = _pairwise_cosine(skill_emb[0], skill_emb[1])
     else:
         skill_sim = full_sim
-    raw_cosine = 0.30 * full_sim + 0.55 * chunk_sim + 0.15 * skill_sim
     return calibrate_semantic_score(raw_cosine)
@@ -235,21 +254,19 @@ def experience_level_penalty(resume_text: str, jd_text: str) -> float:
 def final_ats_score(resume_text: str, jd_text: str) -> dict:
     """
     Composite ATS score weighted as:
-      45%  semantic similarity  (contextual understanding)
-      55%  keyword match        (skill taxonomy match, frequency-weighted)
-    Keyword is weighted slightly higher because it is deterministic and
-    directly reflects JD skill requirements. Semantic rewards phrasing
-    quality but shouldn't dominate when skills strongly match.
     A seniority mismatch penalty (0–10 pts) is subtracted from the final score.
     """
     semantic = semantic_match_score(resume_text, jd_text)
     keyword  = keyword_match_score(resume_text, jd_text)
     penalty  = experience_level_penalty(resume_text, jd_text)
-    raw_score = round(0.45 * semantic + 0.55 * keyword, 2)
-    final     = round(max(0.0, raw_score - penalty), 2)
     return {
         "semantic_score":  round(semantic, 2),

 )
 # MPNet is stronger than MiniLM for long-form resume/JD similarity.
+SEMANTIC_MODEL_ID = os.getenv("SEMANTIC_MODEL", "msmarco-distilbert-base-v4")
 MAX_DOC_CHARS = 8000
 MAX_CHUNKS = 24
 MIN_CHUNK_CHARS = 35
     """
     Map raw cosine similarity to a 0–100 ATS-style scale.
+    MPNet/MiniLM cosine for related resume/JD pairs usually sits in ~0.35–0.82,
+    not 0.9+, so raw cosine understates good matches without calibration.
     """
     cosine = float(np.clip(cosine, 0.0, 1.0))
     low, high = 0.20, 0.78
     return round(matched_weight / total_weight * 100, 2)
+def _normalize_for_embedding(text: str) -> str:
+    """
+    Convert resume or JD into a neutral skill-centric representation.
+    Problem: resumes use first-person achievement language; JDs use
+    third-person requirement language. A general-purpose model sees these
+    as stylistically distant (cosine ~0.40) even when skills match perfectly.
+    Fix: extract skills + skill-heavy sentences and represent both docs
+    in the same "skills: X Y Z context: ..." format so the model compares
+    skill vocabulary, not writing style.
+    """
+    cleaned = clean_text(text)
+    extracted_skills = extract_resume_skills(cleaned)
+    skill_list = " ".join(sorted(extracted_skills))
+    skill_context = clean_text(extract_skill_sentences(text))
+    return f"skills: {skill_list} context: {skill_context}"
 def semantic_match_score(resume_text: str, jd_text: str) -> float:
     """
+    Semantic similarity tuned for resume <-> JD alignment.
+    Both documents are normalised into skill-centric representations
+    before embedding so the model compares skill overlap, not writing style.
+    Combines:
+    1. Normalised full-doc embedding  (50%) - fixes style mismatch
+    2. Chunk-level bi-directional on RAW text (35%) - preserves sentence boundaries
+    3. Skill-sentences-only embedding (15%) - fine-grained skill context
     """
     if not resume_text.strip() or not jd_text.strip():
         return 0.0
+    # Signal 1: normalised doc (style-agnostic skill comparison)
+    resume_norm = _normalize_for_embedding(resume_text)
+    jd_norm     = _normalize_for_embedding(jd_text)
+    doc_emb     = model.encode([resume_norm, jd_norm], convert_to_numpy=True)
+    full_sim    = _pairwise_cosine(doc_emb[0], doc_emb[1])
+    # Signal 2: chunk-level on RAW text (needs \n/. boundaries intact)
     resume_chunks = split_into_chunks(resume_text)
     jd_chunks     = split_into_chunks(jd_text)
     if len(resume_chunks) > 1 and len(jd_chunks) > 1:
     else:
         chunk_sim = full_sim
+    # Signal 3: skill-sentence embedding
+    resume_skill_text = extract_skill_sentences(resume_text)
+    jd_skill_text     = extract_skill_sentences(jd_text)
+    if resume_skill_text and jd_skill_text:
         skill_emb = model.encode(
+            [truncate_text(resume_skill_text, 4000),
+             truncate_text(jd_skill_text, 4000)],
             convert_to_numpy=True,
         )
         skill_sim = _pairwise_cosine(skill_emb[0], skill_emb[1])
     else:
         skill_sim = full_sim
+    raw_cosine = 0.50 * full_sim + 0.35 * chunk_sim + 0.15 * skill_sim
     return calibrate_semantic_score(raw_cosine)
 def final_ats_score(resume_text: str, jd_text: str) -> dict:
     """
     Composite ATS score weighted as:
+      60%  semantic similarity  (contextual understanding)
+      40%  keyword match        (skill taxonomy match, frequency-weighted)
     A seniority mismatch penalty (0–10 pts) is subtracted from the final score.
+    Returns a dict compatible with ScoreResponse schema.
     """
     semantic = semantic_match_score(resume_text, jd_text)
     keyword  = keyword_match_score(resume_text, jd_text)
     penalty  = experience_level_penalty(resume_text, jd_text)
+    raw_score  = round(0.45 * semantic + 0.55 * keyword, 2)
+    final      = round(max(0.0, raw_score - penalty), 2)
     return {
         "semantic_score":  round(semantic, 2),