Spaces:

Siggmoid
/

ATS-Intelligence-Engine

Running

App Files Community

Siggmoid Cursor committed on 10 days ago

Commit

02e400a

1 Parent(s): 2717aab

Tune keyword_match: raw-text chunking and 45/55 score weights

Browse files

Files changed (1) hide show

utilities/keyword_match.py +24 -21

utilities/keyword_match.py CHANGED Viewed

@@ -88,8 +88,8 @@ def calibrate_semantic_score(cosine: float) -> float:
     """
     Map raw cosine similarity to a 0–100 ATS-style scale.
-    MPNet/MiniLM cosine for related resume/JD pairs usually sits in ~0.35–0.82,
-    not 0.9+, so raw cosine understates good matches without calibration.
     """
     cosine = float(np.clip(cosine, 0.0, 1.0))
     low, high = 0.20, 0.78
@@ -165,30 +165,31 @@ def semantic_match_score(resume_text: str, jd_text: str) -> float:
     Raw cosine is calibrated to a more intuitive 0–100 scale.
     """
-    resume_clean = clean_text(resume_text)
-    jd_clean = clean_text(jd_text)
-    if not resume_clean or not jd_clean:
         return 0.0
-    resume_doc = truncate_text(resume_clean)
-    jd_doc = truncate_text(jd_clean)
-    doc_emb = model.encode([resume_doc, jd_doc], convert_to_numpy=True)
     full_sim = _pairwise_cosine(doc_emb[0], doc_emb[1])
-    resume_chunks = split_into_chunks(resume_clean)
-    jd_chunks = split_into_chunks(jd_clean)
     if len(resume_chunks) > 1 and len(jd_chunks) > 1:
         chunk_sim = _chunk_bidirectional_score(resume_chunks, jd_chunks)
     else:
         chunk_sim = full_sim
-    resume_skills = extract_skill_sentences(resume_clean)
-    jd_skills = extract_skill_sentences(jd_clean)
-    if resume_skills and jd_skills:
         skill_emb = model.encode(
-            [truncate_text(resume_skills, 4000), truncate_text(jd_skills, 4000)],
             convert_to_numpy=True,
         )
         skill_sim = _pairwise_cosine(skill_emb[0], skill_emb[1])
@@ -234,19 +235,21 @@ def experience_level_penalty(resume_text: str, jd_text: str) -> float:
 def final_ats_score(resume_text: str, jd_text: str) -> dict:
     """
     Composite ATS score weighted as:
-      60%  semantic similarity  (contextual understanding)
-      40%  keyword match        (skill taxonomy match, frequency-weighted)
-    A seniority mismatch penalty (0–10 pts) is subtracted from the final score.
-    Returns a dict compatible with ScoreResponse schema.
     """
     semantic = semantic_match_score(resume_text, jd_text)
     keyword  = keyword_match_score(resume_text, jd_text)
     penalty  = experience_level_penalty(resume_text, jd_text)
-    raw_score  = round(0.6 * semantic + 0.4 * keyword, 2)
-    final      = round(max(0.0, raw_score - penalty), 2)
     return {
         "semantic_score":  round(semantic, 2),

     """
     Map raw cosine similarity to a 0–100 ATS-style scale.
+    MPNet cosine for resume/JD pairs sits roughly in 0.20–0.78.
+    Floor at 0.20 (not 0.32) so valid weak matches aren't clamped to zero.
     """
     cosine = float(np.clip(cosine, 0.0, 1.0))
     low, high = 0.20, 0.78
     Raw cosine is calibrated to a more intuitive 0–100 scale.
     """
+    # Keep raw text for chunking (needs \n, ., ? boundaries intact)
+    # Only clean for the full-doc embedding where punctuation adds no value
+    if not resume_text.strip() or not jd_text.strip():
         return 0.0
+    resume_doc = truncate_text(clean_text(resume_text))
+    jd_doc     = truncate_text(clean_text(jd_text))
+    doc_emb  = model.encode([resume_doc, jd_doc], convert_to_numpy=True)
     full_sim = _pairwise_cosine(doc_emb[0], doc_emb[1])
+    # Chunk on RAW text so sentence/line boundaries are preserved
+    resume_chunks = split_into_chunks(resume_text)
+    jd_chunks     = split_into_chunks(jd_text)
     if len(resume_chunks) > 1 and len(jd_chunks) > 1:
         chunk_sim = _chunk_bidirectional_score(resume_chunks, jd_chunks)
     else:
         chunk_sim = full_sim
+    # Skill sentences — also extract from raw then clean per-segment (done inside)
+    resume_skills_text = extract_skill_sentences(resume_text)
+    jd_skills_text     = extract_skill_sentences(jd_text)
+    if resume_skills_text and jd_skills_text:
         skill_emb = model.encode(
+            [truncate_text(resume_skills_text, 4000), truncate_text(jd_skills_text, 4000)],
             convert_to_numpy=True,
         )
         skill_sim = _pairwise_cosine(skill_emb[0], skill_emb[1])
 def final_ats_score(resume_text: str, jd_text: str) -> dict:
     """
     Composite ATS score weighted as:
+      45%  semantic similarity  (contextual understanding)
+      55%  keyword match        (skill taxonomy match, frequency-weighted)
+    Keyword is weighted slightly higher because it is deterministic and
+    directly reflects JD skill requirements. Semantic rewards phrasing
+    quality but shouldn't dominate when skills strongly match.
+    A seniority mismatch penalty (0–10 pts) is subtracted from the final score.
     """
     semantic = semantic_match_score(resume_text, jd_text)
     keyword  = keyword_match_score(resume_text, jd_text)
     penalty  = experience_level_penalty(resume_text, jd_text)
+    raw_score = round(0.45 * semantic + 0.55 * keyword, 2)
+    final     = round(max(0.0, raw_score - penalty), 2)
     return {
         "semantic_score":  round(semantic, 2),