Spaces:

Siggmoid
/

ATS-Intelligence-Engine

Running

App Files Files Community

ATS-Intelligence-Engine / utilities /keyword_match.py

Siggmoid

Update scoring: MS MARCO embeddings and skill-centric semantic matching

1905876 11 days ago

raw

history blame contribute delete

12 kB

	import os
	import re

	import numpy as np
	from sklearn.metrics.pairwise import cosine_similarity
	from sentence_transformers import SentenceTransformer
	from utilities.skills import (
	extract_resume_skills,
	extract_required_skills_from_jd,
	SKILLS_SORTED_BY_LENGTH,
	clean_text,
	)

	# Sentence-embedding model name. Defaults to the MS MARCO DistilBERT
	# checkpoint; set the SEMANTIC_MODEL environment variable to override it.
	SEMANTIC_MODEL_ID = os.getenv("SEMANTIC_MODEL", "msmarco-distilbert-base-v4")
	# Default character cap applied by truncate_text().
	MAX_DOC_CHARS = 8000
	# Default upper bound on the number of segments split_into_chunks() returns.
	MAX_CHUNKS = 24
	# Segments shorter than this (in characters) are dropped by
	# split_into_chunks() and extract_skill_sentences().
	MIN_CHUNK_CHARS = 35

	# Instantiated once at import time and shared by every scoring function below.
	model = SentenceTransformer(SEMANTIC_MODEL_ID)


	# ---------------------------------------------------------------------------
	# Stop-word list — common English words that pollute keyword matching
	# ---------------------------------------------------------------------------
	# All entries are lowercase single tokens. Besides ordinary function words
	# the list also holds generic job-posting vocabulary ("experience",
	# "required", "preferred", ...). "hands" and "on" together cover the
	# phrase "hands on"; "on" is listed once, on the first row.
	STOP_WORDS: set[str] = {
	    "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
	    "of", "with", "by", "from", "as", "is", "was", "are", "were", "be",
	    "been", "being", "have", "has", "had", "do", "does", "did", "will",
	    "would", "could", "should", "may", "might", "shall", "can", "need",
	    "that", "this", "these", "those", "it", "its", "we", "our", "you",
	    "your", "they", "their", "he", "she", "his", "her", "i", "my",
	    "not", "no", "so", "if", "then", "than", "also", "just", "only",
	    "about", "up", "out", "over", "into", "through", "during", "including",
	    "used", "use", "using", "work", "working", "works", "strong", "good",
	    "experience", "experiences", "role", "team", "company", "environment",
	    "ability", "skills", "skill", "looking", "required", "requirement",
	    "plus", "bonus", "nice", "preferred", "knowledge", "understanding",
	    "familiarity", "proficiency", "proficient", "hands",
	}


	# ---------------------------------------------------------------------------
	# Text utilities
	# ---------------------------------------------------------------------------

	def truncate_text(text: str, max_chars: int = MAX_DOC_CHARS) -> str:
	    """Cap *text* at ``max_chars`` characters, cutting back to a space.

	    Text that already fits is returned unchanged. Otherwise the first
	    ``max_chars`` characters are taken and everything from the last space
	    in that slice onwards is dropped. A slice without any space is
	    returned whole.
	    """
	    fits = len(text) <= max_chars
	    if fits:
	        return text

	    head = text[:max_chars]
	    last_space = head.rfind(" ")
	    # No space at all: nothing sensible to cut back to, keep the hard slice.
	    if last_space == -1:
	        return head
	    return head[:last_space]


	def split_into_chunks(text: str, max_chunks: int = MAX_CHUNKS) -> list[str]:
	    """Split resume/JD text into comparable segments (lines, sentences).

	    The text is cut at runs of newline characters and at whitespace that
	    follows sentence-ending punctuation (``.``, ``!``, ``?``). Each piece is
	    stripped, and pieces shorter than ``MIN_CHUNK_CHARS`` are discarded.
	    If nothing survives that filter but the text is not blank, the text is
	    instead cut into windows of 55 words, again keeping only windows of at
	    least ``MIN_CHUNK_CHARS`` characters.

	    Args:
	        text: Raw document text with its line breaks and punctuation intact.
	        max_chunks: Upper bound on the number of segments returned.

	    Returns:
	        At most ``max_chunks`` segments, in document order. Empty for
	        empty input.
	    """
	    if not text:
	        return []

	    # Two alternatives joined by an unescaped "|": a newline run, OR
	    # whitespace preceded by . ! ?  With the pipe escaped the pattern
	    # demands a literal "|" followed by a lookbehind for punctuation, which
	    # can never match, so the text would never be split.
	    parts = re.split(r"[\n\r]+|(?<=[.!?])\s+", text)
	    chunks = [p.strip() for p in parts if len(p.strip()) >= MIN_CHUNK_CHARS]

	    if not chunks and text.strip():
	        # Every segment was too short (e.g. a list of terse bullets):
	        # fall back to fixed-size word windows so there is something to embed.
	        words = text.split()
	        window = 55
	        windows = (
	            " ".join(words[i : i + window])
	            for i in range(0, len(words), window)
	        )
	        chunks = [piece for piece in windows if len(piece) >= MIN_CHUNK_CHARS]

	    return chunks[:max_chunks]


	def extract_skill_sentences(text: str) -> str:
	    """
	    Keep only the segments of *text* that mention a known skill.

	    The text is split on newlines, periods and semicolons. Each segment is
	    passed through ``clean_text``; it is kept when the cleaned form has at
	    least ``MIN_CHUNK_CHARS`` characters and contains any entry of
	    ``SKILLS_SORTED_BY_LENGTH`` as a substring. Kept segments are joined
	    with single spaces. When no segment qualifies, the cleaned full text
	    is returned instead.

	    Used as a secondary signal, not the main embedding.
	    """
	    kept = []
	    for raw_segment in re.split(r"[\n\r.;]+", text):
	        candidate = clean_text(raw_segment)
	        long_enough = len(candidate) >= MIN_CHUNK_CHARS
	        # Plain substring test against every taxonomy entry.
	        if long_enough and any(
	            skill in candidate for skill in SKILLS_SORTED_BY_LENGTH
	        ):
	            kept.append(candidate)

	    if kept:
	        return " ".join(kept)
	    return clean_text(text)


	def calibrate_semantic_score(cosine: float) -> float:
	    """
	    Rescale a raw cosine similarity onto a 0–100 ATS-style score.

	    The input is clamped to [0, 1]; the band 0.20–0.78 is then stretched
	    linearly across 0–100. Anything at or below 0.20 scores 0, anything at
	    or above 0.78 scores 100. The result is rounded to two decimals.

	    NOTE(review): the band was chosen because related resume/JD pairs were
	    observed to sit well below 0.9 cosine; confirm it still suits the
	    currently configured embedding model.
	    """
	    floor, ceiling = 0.20, 0.78
	    bounded = float(np.clip(cosine, 0.0, 1.0))
	    stretched = (bounded - floor) / (ceiling - floor) * 100.0
	    saturated = float(np.clip(stretched, 0.0, 100.0))
	    return round(saturated, 2)


	def _pairwise_cosine(a: np.ndarray, b: np.ndarray) -> float:
	    """Return the cosine similarity of two single vectors as a plain float."""
	    # cosine_similarity operates on batches, so each vector is wrapped as a
	    # one-row batch and the lone cell of the 1x1 result is read back out.
	    similarity = cosine_similarity([a], [b])
	    first_row = similarity[0]
	    return float(first_row[0])


	def _chunk_bidirectional_score(resume_chunks: list[str], jd_chunks: list[str]) -> float:
	    """Average of JD-by-resume and resume-by-JD best-match similarity.

	    Both chunk lists are embedded with the shared ``model`` and compared
	    pairwise. For each JD chunk the best-matching resume chunk is taken
	    and those maxima are averaged (JD coverage); the same is done from
	    the resume side (resume coverage). The return value is the mean of
	    the two coverages, or 0.0 when either list is empty.
	    """
	    if not (resume_chunks and jd_chunks):
	        return 0.0

	    resume_vectors = model.encode(resume_chunks, convert_to_numpy=True)
	    jd_vectors = model.encode(jd_chunks, convert_to_numpy=True)
	    # Rows index resume chunks, columns index JD chunks.
	    similarities = cosine_similarity(resume_vectors, jd_vectors)

	    best_per_jd_chunk = similarities.max(axis=0)
	    best_per_resume_chunk = similarities.max(axis=1)
	    jd_coverage = float(best_per_jd_chunk.mean())
	    resume_coverage = float(best_per_resume_chunk.mean())
	    return (jd_coverage + resume_coverage) / 2.0


	def remove_stop_words(text: str) -> set:
	    """Return the distinct whitespace-separated tokens of *text* not in STOP_WORDS.

	    Tokens are compared exactly as they appear in the text; no case
	    folding or punctuation stripping is done here.
	    """
	    return {token for token in text.split() if token not in STOP_WORDS}


	# ---------------------------------------------------------------------------
	# Scoring functions
	# ---------------------------------------------------------------------------

	def keyword_match_score(resume_text: str, jd_text: str) -> float:
	    """
	    Skill-only keyword match, returned as a percentage (0–100).

	    Strategy:
	    - Extract recognised skills from both texts with the skills helpers.
	    - Unweighted, the score would be |resume_skills ∩ jd_skills| / |jd_skills|.
	    - This eliminates stop-word noise and counts only meaningful tech terms.

	    Weighting:
	    - Each JD skill counts with its frequency in the JD, so a skill the JD
	      repeats is treated as high-priority and missing it costs more.

	    Returns 0.0 when the JD yields no skills (or no positive weight);
	    otherwise matched weight / total weight * 100, rounded to two decimals.
	    """
	    # Expected shape: {skill: frequency in the JD}.
	    jd_skills_freq = extract_required_skills_from_jd(jd_text)
	    resume_skills = extract_resume_skills(resume_text)

	    if not jd_skills_freq:
	        return 0.0

	    total_weight = sum(jd_skills_freq.values())
	    # Defensive: a mapping whose frequencies sum to zero would otherwise
	    # raise ZeroDivisionError below.
	    if total_weight <= 0:
	        return 0.0

	    # Weighted scoring: skills mentioned more in the JD carry more weight.
	    matched_weight = sum(
	        freq
	        for skill, freq in jd_skills_freq.items()
	        if skill in resume_skills
	    )

	    return round(matched_weight / total_weight * 100, 2)


	def _normalize_for_embedding(text: str) -> str:
	    """
	    Rewrite a resume or JD as a neutral, skill-centric string.

	    Resumes and JDs are phrased differently (achievements vs requirements),
	    which can pull their embeddings apart even when the skills agree. To
	    reduce that, both kinds of document are rendered in one shared layout:

	        skills: <sorted extracted skills> context: <skill-bearing sentences>

	    The skill list comes from ``extract_resume_skills`` run on the cleaned
	    text (the same extractor is used for JDs here); the context comes from
	    ``extract_skill_sentences`` run on the original text and cleaned again.
	    """
	    normalised_source = clean_text(text)
	    found_skills = extract_resume_skills(normalised_source)
	    # Sorting makes the skill list independent of extraction order.
	    skills_part = " ".join(sorted(found_skills))
	    context_part = clean_text(extract_skill_sentences(text))
	    return f"skills: {skills_part} context: {context_part}"


	def semantic_match_score(resume_text: str, jd_text: str) -> float:
	    """
	    Semantic resume <-> JD similarity on a calibrated 0–100 scale.

	    Three cosine signals are blended and then passed through
	    ``calibrate_semantic_score``:

	    1. Normalised full-doc embedding (50%) - both texts are rewritten by
	       ``_normalize_for_embedding`` first, to damp writing-style differences.
	    2. Chunk-level bi-directional match on the RAW text (35%) - used only
	       when both documents split into more than one chunk; otherwise
	       signal 1 stands in.
	    3. Skill-sentences-only embedding (15%) - each side capped at 4000
	       characters; signal 1 stands in if either side is empty.

	    Returns 0.0 immediately when either input is blank.
	    """
	    if not (resume_text.strip() and jd_text.strip()):
	        return 0.0

	    # Signal 1: normalised doc (style-agnostic skill comparison)
	    normalised_pair = [
	        _normalize_for_embedding(resume_text),
	        _normalize_for_embedding(jd_text),
	    ]
	    resume_vec, jd_vec = model.encode(normalised_pair, convert_to_numpy=True)
	    full_sim = _pairwise_cosine(resume_vec, jd_vec)

	    # Signal 2: chunk-level on RAW text (needs newline / sentence boundaries intact)
	    resume_chunks = split_into_chunks(resume_text)
	    jd_chunks = split_into_chunks(jd_text)
	    both_multi_chunk = len(resume_chunks) > 1 and len(jd_chunks) > 1
	    chunk_sim = (
	        _chunk_bidirectional_score(resume_chunks, jd_chunks)
	        if both_multi_chunk
	        else full_sim
	    )

	    # Signal 3: skill-sentence embedding
	    resume_skill_text = extract_skill_sentences(resume_text)
	    jd_skill_text = extract_skill_sentences(jd_text)
	    skill_sim = full_sim
	    if resume_skill_text and jd_skill_text:
	        capped_pair = [
	            truncate_text(resume_skill_text, 4000),
	            truncate_text(jd_skill_text, 4000),
	        ]
	        resume_skill_vec, jd_skill_vec = model.encode(
	            capped_pair, convert_to_numpy=True
	        )
	        skill_sim = _pairwise_cosine(resume_skill_vec, jd_skill_vec)

	    blended = 0.50 * full_sim + 0.35 * chunk_sim + 0.15 * skill_sim
	    return calibrate_semantic_score(blended)


	# Seniority cue phrases, matched case-insensitively as whole words/phrases.
	_SENIOR_SIGNALS = ("senior", "lead", "principal", "architect", "staff", "head of")
	_JUNIOR_SIGNALS = ("junior", "entry level", "entry-level", "graduate", "intern", "fresher")


	def _signal_pattern(signals):
	    """Compile one regex that matches any of *signals* as a standalone word or phrase."""
	    alternatives = "|".join(re.escape(signal) for signal in signals)
	    # Lookarounds reject a match that is glued to other word characters, so
	    # "intern" no longer fires on "internal" nor "lead" on "leadership".
	    return re.compile(rf"(?<!\w)(?:{alternatives})(?!\w)", re.IGNORECASE)


	_SENIOR_RE = _signal_pattern(_SENIOR_SIGNALS)
	_JUNIOR_RE = _signal_pattern(_JUNIOR_SIGNALS)


	def experience_level_penalty(resume_text: str, jd_text: str) -> float:
	    """
	    Detects seniority mismatch and returns a 0–10 penalty.

	    Each text is scanned for senior cues (senior, lead, principal,
	    architect, staff, head of) and junior cues (junior, entry level,
	    entry-level, graduate, intern, fresher). Cues must appear as whole
	    words: "internal", "graduated", "staffing" or "leadership" do not count.

	    Returns:
	        10.0 when the JD reads senior and the resume reads junior,
	        3.0 when the JD reads junior and the resume reads senior
	        (over-qualification, minor penalty), otherwise 0.0. The senior-JD
	        check takes precedence when both conditions hold.
	    """
	    jd_is_senior = _SENIOR_RE.search(jd_text) is not None
	    jd_is_junior = _JUNIOR_RE.search(jd_text) is not None
	    resume_is_senior = _SENIOR_RE.search(resume_text) is not None
	    resume_is_junior = _JUNIOR_RE.search(resume_text) is not None

	    # JD wants senior but resume signals junior
	    if jd_is_senior and resume_is_junior:
	        return 10.0
	    # JD wants junior but resume is over-qualified (minor penalty)
	    if jd_is_junior and resume_is_senior:
	        return 3.0
	    return 0.0


	# ---------------------------------------------------------------------------
	# Final composite score
	# ---------------------------------------------------------------------------

	def final_ats_score(resume_text: str, jd_text: str) -> dict:
	    """
	    Composite ATS score weighted as:
	    45% semantic similarity (contextual understanding)
	    55% keyword match (skill taxonomy match, frequency-weighted)

	    A seniority mismatch penalty (0–10 pts) is subtracted from the weighted
	    sum, and the result is floored at 0.

	    Returns a dict with the keys ``semantic_score``, ``keyword_score`` and
	    ``final_ats_score`` (each rounded to two decimals), compatible with the
	    ScoreResponse schema. The penalty itself is not reported separately.
	    """
	    semantic_weight, keyword_weight = 0.45, 0.55

	    semantic = semantic_match_score(resume_text, jd_text)
	    keyword = keyword_match_score(resume_text, jd_text)
	    penalty = experience_level_penalty(resume_text, jd_text)

	    raw_score = round(semantic_weight * semantic + keyword_weight * keyword, 2)
	    # Never report a negative score when the penalty exceeds the weighted sum.
	    final = round(max(0.0, raw_score - penalty), 2)

	    return {
	        "semantic_score": round(semantic, 2),
	        "keyword_score": round(keyword, 2),
	        "final_ats_score": final,
	    }


	# ---------------------------------------------------------------------------
	# Smoke-test
	# ---------------------------------------------------------------------------
	if __name__ == "__main__":
	    # (resume, job description) pairs covering: near-identical wording,
	    # paraphrase, keyword stuffing, an unrelated field, an over-qualified
	    # candidate for a junior role, and a partial skill overlap.
	    examples = [
	        (
	            "Python developer with FastAPI, SQL, and machine learning experience",
	            "Looking for a Python developer with FastAPI, SQL, and ML skills",
	        ),
	        (
	            "Built backend services using Python frameworks and databases",
	            "Python developer with FastAPI and SQL",
	        ),
	        (
	            "Python Python Python SQL SQL FastAPI",
	            "Python developer with FastAPI and SQL",
	        ),
	        (
	            "Professional photographer specialising in portraits and wildlife",
	            "Hiring a machine learning engineer with Python and PyTorch",
	        ),
	        (
	            "Led ML teams, deployed large-scale models, optimised transformers",
	            "Junior Python developer with basic ML",
	        ),
	        (
	            "NLP engineer: PyTorch, HuggingFace transformers, LLM fine-tuning, RAG pipelines",
	            "Senior ML engineer: LLM, RAG, fine-tuning, Python, AWS SageMaker",
	        ),
	    ]

	    header = f"{'#':<3} {'Semantic':>10} {'Keyword':>10} {'Final ATS':>10}"
	    print(header)
	    print("-" * 38)

	    # One table row per example, indexed from 0.
	    for index, pair in enumerate(examples):
	        resume, jd = pair
	        scores = final_ats_score(resume, jd)
	        row = (
	            f"{index:<3} {scores['semantic_score']:>10} "
	            f"{scores['keyword_score']:>10} "
	            f"{scores['final_ats_score']:>10}"
	        )
	        print(row)