# ATS-Intelligence-Engine / utilities / keyword_match.py
# Siggmoid's picture
# Update scoring: MS MARCO embeddings and skill-centric semantic matching
# 1905876
import os
import re
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from utilities.skills import (
extract_resume_skills,
extract_required_skills_from_jd,
SKILLS_SORTED_BY_LENGTH,
clean_text,
)
# Sentence-embedding checkpoint used for every semantic signal in this module.
# It can be overridden through the SEMANTIC_MODEL environment variable; the
# default is the MS MARCO DistilBERT model.
# NOTE(review): this comment previously claimed "MPNet is stronger than MiniLM",
# which does not describe the default below — confirm which model is intended.
SEMANTIC_MODEL_ID = os.getenv("SEMANTIC_MODEL", "msmarco-distilbert-base-v4")
# Default character cap applied by truncate_text().
MAX_DOC_CHARS = 8000
# Default upper bound on the number of segments split_into_chunks() returns.
MAX_CHUNKS = 24
# Segments shorter than this are dropped by split_into_chunks() and
# extract_skill_sentences().
MIN_CHUNK_CHARS = 35
# Instantiated once at import time and shared by all encode() calls below.
model = SentenceTransformer(SEMANTIC_MODEL_ID)
# ---------------------------------------------------------------------------
# Stop-word list — common English words that pollute keyword matching
# ---------------------------------------------------------------------------
STOP_WORDS: set = {
    # Articles, conjunctions and prepositions
    "a", "an", "the", "and", "or", "but",
    "in", "on", "at", "to", "for", "of", "with", "by", "from", "as",
    "about", "up", "out", "over", "into", "through", "during", "including",
    # Auxiliary and modal verbs
    "is", "was", "are", "were", "be", "been", "being",
    "have", "has", "had", "do", "does", "did",
    "will", "would", "could", "should", "may", "might", "shall", "can",
    "need",
    # Pronouns and determiners
    "that", "this", "these", "those", "it", "its",
    "we", "our", "you", "your", "they", "their",
    "he", "she", "his", "her", "i", "my",
    # Negations, connectives and adverbs
    "not", "no", "so", "if", "then", "than", "also", "just", "only",
    # Generic resume / job-description filler ("hands on" contributes "hands";
    # "on" is already listed among the prepositions)
    "used", "use", "using", "work", "working", "works", "strong", "good",
    "experience", "experiences", "role", "team", "company", "environment",
    "ability", "skills", "skill", "looking", "required", "requirement",
    "plus", "bonus", "nice", "preferred", "knowledge", "understanding",
    "familiarity", "proficiency", "proficient", "hands",
}
# ---------------------------------------------------------------------------
# Text utilities
# ---------------------------------------------------------------------------
def truncate_text(text: str, max_chars: int = MAX_DOC_CHARS) -> str:
    """Cap *text* at ``max_chars`` characters without cutting a word in half.

    Args:
        text: The string to shorten.
        max_chars: Maximum number of characters to keep.

    Returns:
        ``text`` unchanged when it already fits. Otherwise the longest prefix
        of at most ``max_chars`` characters that ends on a space-delimited
        word boundary. If the prefix contains no space at all, the raw
        ``max_chars``-character prefix is returned.
    """
    if len(text) <= max_chars:
        return text

    head = text[:max_chars]

    # When the first dropped character is a space, ``head`` already ends on a
    # complete word. Splitting at the last space would throw that whole word
    # away, so keep it (minus any trailing spaces).
    if max_chars >= 0 and text[max_chars] == " ":
        return head.rstrip(" ")

    # Otherwise the cut fell inside a word: discard the partial word.
    return head.rsplit(" ", 1)[0]
def split_into_chunks(text: str, max_chunks: int = MAX_CHUNKS) -> list[str]:
    """
    Break a resume/JD into segments suitable for pairwise comparison.

    The text is split on line breaks and on whitespace that follows ``.``,
    ``!`` or ``?``. Stripped segments shorter than MIN_CHUNK_CHARS are
    discarded. If nothing survives but the text is not blank, the text is
    re-cut into 55-word windows instead (the same minimum length applies).
    At most ``max_chunks`` segments are returned.
    """
    if not text:
        return []

    segments = []
    for raw_part in re.split(r"[\n\r]+|(?<=[.!?])\s+", text):
        candidate = raw_part.strip()
        if len(candidate) >= MIN_CHUNK_CHARS:
            segments.append(candidate)

    # Either the primary split produced usable segments, or the text is
    # whitespace-only and there is nothing to fall back on.
    if segments or not text.strip():
        return segments[:max_chunks]

    # Fallback: fixed-size word windows.
    tokens = text.split()
    words_per_window = 55
    windows = (
        " ".join(tokens[start : start + words_per_window])
        for start in range(0, len(tokens), words_per_window)
    )
    segments = [window for window in windows if len(window) >= MIN_CHUNK_CHARS]
    return segments[:max_chunks]
def extract_skill_sentences(text: str) -> str:
    """
    Keep only the segments of ``text`` that mention a known skill.

    The text is split on line breaks, ``.`` and ``;``. Each segment is passed
    through ``clean_text`` and kept when it is at least MIN_CHUNK_CHARS long
    and contains any entry of SKILLS_SORTED_BY_LENGTH as a substring. The kept
    segments are joined with single spaces. When no segment qualifies, the
    cleaned full text is returned instead.
    """
    cleaned_segments = (
        clean_text(fragment) for fragment in re.split(r"[\n\r.;]+", text)
    )
    skill_segments = [
        segment
        for segment in cleaned_segments
        if len(segment) >= MIN_CHUNK_CHARS
        and any(skill in segment for skill in SKILLS_SORTED_BY_LENGTH)
    ]
    if skill_segments:
        return " ".join(skill_segments)
    return clean_text(text)
def calibrate_semantic_score(cosine: float) -> float:
    """
    Rescale a raw cosine similarity onto a 0-100 ATS-style score.

    The cosine is first clamped to [0, 1], then mapped linearly so that 0.20
    becomes 0 and 0.78 becomes 100; anything outside that band saturates at
    the nearest end. The result is rounded to two decimals.

    NOTE(review): the 0.20/0.78 band was presumably chosen from the cosine
    range observed for related resume/JD pairs — confirm it still suits the
    configured embedding model.
    """
    floor, ceiling = 0.20, 0.78
    bounded = float(np.clip(cosine, 0.0, 1.0))
    percent = (bounded - floor) / (ceiling - floor) * 100.0
    percent = float(np.clip(percent, 0.0, 100.0))
    return round(percent, 2)
def _pairwise_cosine(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity between two single vectors as a float."""
    # cosine_similarity works on batches, so wrap each vector as a one-row
    # batch and read back the only cell of the resulting 1x1 matrix.
    similarity = cosine_similarity([a], [b])
    return float(similarity[0][0])
def _chunk_bidirectional_score(resume_chunks: list[str], jd_chunks: list[str]) -> float:
    """
    Average of two coverage measures between resume and JD segments.

    Every resume segment is compared with every JD segment. JD coverage is the
    mean, over JD segments, of the best-matching resume segment's similarity;
    resume coverage is the same in the other direction. Returns 0.0 when
    either list is empty.
    """
    if not (resume_chunks and jd_chunks):
        return 0.0

    resume_vectors = model.encode(resume_chunks, convert_to_numpy=True)
    jd_vectors = model.encode(jd_chunks, convert_to_numpy=True)

    # Rows correspond to resume segments, columns to JD segments.
    similarities = cosine_similarity(resume_vectors, jd_vectors)
    best_match_per_jd = similarities.max(axis=0)
    best_match_per_resume = similarities.max(axis=1)

    jd_side = float(best_match_per_jd.mean())
    resume_side = float(best_match_per_resume.mean())
    return (jd_side + resume_side) / 2.0
def remove_stop_words(text: str) -> set:
    """
    Return the distinct whitespace-separated tokens of ``text`` that are not
    in STOP_WORDS. Tokens are compared exactly as written (no case folding).
    """
    return {token for token in text.split() if token not in STOP_WORDS}
# ---------------------------------------------------------------------------
# Scoring functions
# ---------------------------------------------------------------------------
def keyword_match_score(resume_text: str, jd_text: str) -> float:
    """
    Frequency-weighted skill coverage of the JD by the resume, as 0-100.

    Skills are extracted from both texts with the taxonomy helpers, so only
    recognised skills count. Each JD skill carries a weight equal to its
    frequency in the JD, which makes a missing frequently-mentioned skill
    cost more than a missing one-off:

        score = (weight of JD skills found in the resume) / (total JD weight) * 100

    Returns 0.0 when no skill is found in the JD. Rounded to two decimals.
    """
    required = extract_required_skills_from_jd(jd_text)  # mapping skill -> frequency
    candidate_skills = extract_resume_skills(resume_text)

    if not required:
        return 0.0

    weight_total = 0
    weight_matched = 0
    for skill, weight in required.items():
        weight_total += weight
        if skill in candidate_skills:
            weight_matched += weight

    return round(weight_matched / weight_total * 100, 2)
def _normalize_for_embedding(text: str) -> str:
    """
    Rewrite a resume or JD as a neutral, skill-centric string.

    Resumes tend to use first-person achievement language while JDs use
    requirement language, so embedding the raw documents compares writing
    style as much as content. Both are therefore rendered in one shared
    layout, "skills: <sorted skills> context: <skill-bearing sentences>",
    before being embedded.
    """
    # Alphabetical order makes the skill list independent of where each skill
    # appeared in the document.
    found_skills = extract_resume_skills(clean_text(text))
    skill_list = " ".join(sorted(found_skills))

    # The sentence extractor is fed the raw text because it splits on line
    # breaks and punctuation.
    skill_context = clean_text(extract_skill_sentences(text))

    return f"skills: {skill_list} context: {skill_context}"
def semantic_match_score(resume_text: str, jd_text: str) -> float:
    """
    Semantic resume/JD similarity on a calibrated 0-100 scale.

    Blends three cosine signals before calibration:
        50%  whole-document similarity of the skill-centric normalised texts
        35%  bi-directional segment coverage computed on the raw texts
        15%  similarity of the skill-bearing sentences only

    The segment signal falls back to the whole-document similarity unless both
    texts yield more than one segment; the skill signal falls back the same
    way when either skill text is empty. Returns 0.0 if either input is blank.
    """
    if not (resume_text.strip() and jd_text.strip()):
        return 0.0

    # Signal 1: style-agnostic whole-document comparison.
    normalised_pair = [
        _normalize_for_embedding(resume_text),
        _normalize_for_embedding(jd_text),
    ]
    resume_vec, jd_vec = model.encode(normalised_pair, convert_to_numpy=True)
    doc_similarity = _pairwise_cosine(resume_vec, jd_vec)

    # Signal 2: segment-level coverage; raw text keeps the line and sentence
    # boundaries the splitter relies on.
    resume_segments = split_into_chunks(resume_text)
    jd_segments = split_into_chunks(jd_text)
    both_multi_segment = len(resume_segments) > 1 and len(jd_segments) > 1
    if both_multi_segment:
        segment_similarity = _chunk_bidirectional_score(resume_segments, jd_segments)
    else:
        segment_similarity = doc_similarity

    # Signal 3: skill-bearing sentences only, each capped at 4000 characters.
    resume_skill_text = extract_skill_sentences(resume_text)
    jd_skill_text = extract_skill_sentences(jd_text)
    skill_similarity = doc_similarity
    if resume_skill_text and jd_skill_text:
        skill_vectors = model.encode(
            [
                truncate_text(resume_skill_text, 4000),
                truncate_text(jd_skill_text, 4000),
            ],
            convert_to_numpy=True,
        )
        skill_similarity = _pairwise_cosine(skill_vectors[0], skill_vectors[1])

    blended_cosine = (
        0.50 * doc_similarity + 0.35 * segment_similarity + 0.15 * skill_similarity
    )
    return calibrate_semantic_score(blended_cosine)
# Seniority vocabulary. Terms are matched as whole words (see
# _seniority_pattern), so "intern" no longer fires on "internal" and "lead"
# no longer fires on "leadership". "internship" is listed explicitly because
# the whole-word rule would otherwise stop it from matching via "intern".
_SENIOR_TERMS = ("senior", "lead", "principal", "architect", "staff", "head of")
_JUNIOR_TERMS = (
    "junior", "entry level", "entry-level", "graduate",
    "intern", "internship", "fresher",
)


def _seniority_pattern(terms):
    """Compile a case-insensitive whole-word matcher for ``terms`` (optional plural "s")."""
    alternatives = "|".join(
        re.escape(term) for term in sorted(terms, key=len, reverse=True)
    )
    return re.compile(rf"\b(?:{alternatives})s?\b", re.IGNORECASE)


_SENIOR_RE = _seniority_pattern(_SENIOR_TERMS)
_JUNIOR_RE = _seniority_pattern(_JUNIOR_TERMS)


def experience_level_penalty(resume_text: str, jd_text: str) -> float:
    """
    Detect a seniority mismatch and return the points to subtract.

    Each text is scanned for senior-level and junior-level title words. Words
    must appear as whole words (a trailing plural "s" is accepted); plain
    substring hits such as "intern" inside "international" or "lead" inside
    "leadership" are ignored.

    Args:
        resume_text: Raw resume text.
        jd_text: Raw job-description text.

    Returns:
        10.0 when the JD signals a senior role and the resume signals junior;
        3.0 when the JD signals a junior role and the resume signals senior
        (over-qualification, minor penalty); 0.0 otherwise.
    """
    jd_is_senior = _SENIOR_RE.search(jd_text) is not None
    jd_is_junior = _JUNIOR_RE.search(jd_text) is not None
    resume_is_senior = _SENIOR_RE.search(resume_text) is not None
    resume_is_junior = _JUNIOR_RE.search(resume_text) is not None

    # The senior-JD check comes first, so a JD carrying both kinds of signal
    # is treated as senior when the resume looks junior.
    if jd_is_senior and resume_is_junior:
        return 10.0
    if jd_is_junior and resume_is_senior:
        return 3.0
    return 0.0
# ---------------------------------------------------------------------------
# Final composite score
# ---------------------------------------------------------------------------
def final_ats_score(resume_text: str, jd_text: str) -> dict:
    """
    Composite ATS score for a resume against a job description.

    The blend is weighted as:
        45% semantic similarity (contextual understanding)
        55% keyword match (skill taxonomy match, frequency-weighted)
    The seniority mismatch penalty (0-10 pts) is then subtracted and the
    result is floored at 0.

    Returns:
        A dict with ``semantic_score``, ``keyword_score`` and
        ``final_ats_score``, each rounded to two decimals. The layout is meant
        to be compatible with the ScoreResponse schema.
    """
    # Weights of the two component scores; they sum to 1 so the blend stays
    # on the same 0-100 scale as its inputs.
    semantic_weight = 0.45
    keyword_weight = 0.55

    semantic = semantic_match_score(resume_text, jd_text)
    keyword = keyword_match_score(resume_text, jd_text)
    penalty = experience_level_penalty(resume_text, jd_text)

    raw_score = round(semantic_weight * semantic + keyword_weight * keyword, 2)
    # Never report a negative score, however large the penalty.
    final = round(max(0.0, raw_score - penalty), 2)

    return {
        "semantic_score": round(semantic, 2),
        "keyword_score": round(keyword, 2),
        "final_ats_score": final,
    }
# ---------------------------------------------------------------------------
# Smoke-test
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # (resume, job description) pairs scored end to end as a quick manual check.
    sample_pairs = [
        (
            "Python developer with FastAPI, SQL, and machine learning experience",
            "Looking for a Python developer with FastAPI, SQL, and ML skills",
        ),
        (
            "Built backend services using Python frameworks and databases",
            "Python developer with FastAPI and SQL",
        ),
        (
            "Python Python Python SQL SQL FastAPI",
            "Python developer with FastAPI and SQL",
        ),
        (
            "Professional photographer specialising in portraits and wildlife",
            "Hiring a machine learning engineer with Python and PyTorch",
        ),
        (
            "Led ML teams, deployed large-scale models, optimised transformers",
            "Junior Python developer with basic ML",
        ),
        (
            "NLP engineer: PyTorch, HuggingFace transformers, LLM fine-tuning, RAG pipelines",
            "Senior ML engineer: LLM, RAG, fine-tuning, Python, AWS SageMaker",
        ),
    ]

    # Table header followed by one row per pair, numbered from 0.
    print(f"{'#':<3} {'Semantic':>10} {'Keyword':>10} {'Final ATS':>10}")
    print("-" * 38)
    for row_number, (sample_resume, sample_jd) in enumerate(sample_pairs):
        scores = final_ats_score(sample_resume, sample_jd)
        print(
            f"{row_number:<3} {scores['semantic_score']:>10} "
            f"{scores['keyword_score']:>10} "
            f"{scores['final_ats_score']:>10}"
        )