"""Resume <-> job-description (JD) scoring: semantic + keyword + seniority penalty."""

import os
import re

import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

from utilities.skills import (
    extract_resume_skills,
    extract_required_skills_from_jd,
    SKILLS_SORTED_BY_LENGTH,
    clean_text,
)

# Embedding model id. Defaults to "msmarco-distilbert-base-v4"; override it by
# setting the SEMANTIC_MODEL environment variable.
SEMANTIC_MODEL_ID = os.getenv("SEMANTIC_MODEL", "msmarco-distilbert-base-v4")
MAX_DOC_CHARS = 8000
MAX_CHUNKS = 24
MIN_CHUNK_CHARS = 35

# Weights used by final_ats_score() to blend the two sub-scores.
SEMANTIC_WEIGHT = 0.45
KEYWORD_WEIGHT = 0.55

# NOTE: the model is loaded at import time; importing this module is not free.
model = SentenceTransformer(SEMANTIC_MODEL_ID)

# ---------------------------------------------------------------------------
# Stop-word list — common English words that pollute keyword matching
# ---------------------------------------------------------------------------
STOP_WORDS: set[str] = {
    "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
    "of", "with", "by", "from", "as", "is", "was", "are", "were", "be",
    "been", "being", "have", "has", "had", "do", "does", "did", "will",
    "would", "could", "should", "may", "might", "shall", "can", "need",
    "that", "this", "these", "those", "it", "its", "we", "our", "you",
    "your", "they", "their", "he", "she", "his", "her", "i", "my",
    "not", "no", "so", "if", "then", "than", "also", "just", "only",
    "about", "up", "out", "over", "into", "through", "during", "including",
    "used", "use", "using", "work", "working", "works", "strong", "good",
    "experience", "experiences", "role", "team", "company", "environment",
    "ability", "skills", "skill", "looking", "required", "requirement",
    "plus", "bonus", "nice", "preferred", "knowledge", "understanding",
    "familiarity", "proficiency", "proficient",
    "hands",  # "hands-on" splits into "hands" + "on"; "on" is listed above
}

# Seniority vocabulary used by experience_level_penalty().
_SENIOR_SIGNALS = ("senior", "lead", "principal", "architect", "staff", "head of")
# "internship" is listed explicitly: whole-word matching on "intern" alone
# would no longer catch it.
_JUNIOR_SIGNALS = (
    "junior", "entry level", "entry-level", "graduate", "intern",
    "internship", "fresher",
)


def _compile_signal_pattern(signals: tuple[str, ...]) -> re.Pattern:
    """Build a case-insensitive, whole-word regex matching any of *signals*."""
    alternation = "|".join(re.escape(signal) for signal in signals)
    # \b ... \b prevents hits inside longer words ("intern" in "internal",
    # "lead" in "leading"); the optional "s" accepts simple plurals.
    return re.compile(rf"\b(?:{alternation})s?\b", re.IGNORECASE)


_SENIOR_PATTERN = _compile_signal_pattern(_SENIOR_SIGNALS)
_JUNIOR_PATTERN = _compile_signal_pattern(_JUNIOR_SIGNALS)


# ---------------------------------------------------------------------------
# Text utilities
# ---------------------------------------------------------------------------
def truncate_text(text: str, max_chars: int = MAX_DOC_CHARS) -> str:
    """Return *text* cut to at most *max_chars* characters.

    When truncation is needed, the trailing (possibly partial) word after the
    last space is dropped so the result does not end mid-word. If the cut
    segment contains no space it is returned as-is.
    """
    if len(text) <= max_chars:
        return text
    return text[:max_chars].rsplit(" ", 1)[0]


def split_into_chunks(text: str, max_chunks: int = MAX_CHUNKS) -> list[str]:
    """Split resume/JD into comparable segments (bullets, lines, sentences).

    Segments shorter than MIN_CHUNK_CHARS are discarded. If nothing survives
    (e.g. the text is one long unpunctuated fragment list), the text is split
    into fixed windows of 55 words instead. At most *max_chunks* chunks are
    returned.
    """
    if not text:
        return []

    # Break on line endings, or on whitespace that follows ., ! or ?
    parts = re.split(r"[\n\r]+|(?<=[.!?])\s+", text)
    stripped_parts = (part.strip() for part in parts)
    chunks = [part for part in stripped_parts if len(part) >= MIN_CHUNK_CHARS]

    if not chunks and text.strip():
        words = text.split()
        window = 55
        for start in range(0, len(words), window):
            piece = " ".join(words[start : start + window])
            if len(piece) >= MIN_CHUNK_CHARS:
                chunks.append(piece)

    return chunks[:max_chunks]


def extract_skill_sentences(text: str) -> str:
    """Return only the skill-heavy segments of *text*, joined by spaces.

    Used as a secondary signal, not the main embedding. A segment is kept when
    its cleaned form is at least MIN_CHUNK_CHARS long and contains any entry of
    SKILLS_SORTED_BY_LENGTH as a substring. If no segment qualifies, the whole
    cleaned text is returned instead.
    """
    segments = re.split(r"[\n\r.;]+", text)
    relevant = []
    for seg in segments:
        seg_clean = clean_text(seg)
        if len(seg_clean) < MIN_CHUNK_CHARS:
            continue
        if any(skill in seg_clean for skill in SKILLS_SORTED_BY_LENGTH):
            relevant.append(seg_clean)
    return " ".join(relevant) if relevant else clean_text(text)


def calibrate_semantic_score(cosine: float) -> float:
    """Map raw cosine similarity to a 0–100 ATS-style scale.

    The cosine is clipped to [0, 1], then linearly rescaled so that 0.20 maps
    to 0 and 0.78 maps to 100 (values outside that band saturate).

    Rationale (original author's note): cosine for related resume/JD pairs
    usually sits around 0.35–0.82 rather than 0.9+, so the raw value would
    understate good matches without calibration.
    """
    cosine = float(np.clip(cosine, 0.0, 1.0))
    low, high = 0.20, 0.78
    scaled = (cosine - low) / (high - low) * 100.0
    return round(float(np.clip(scaled, 0.0, 100.0)), 2)


def _pairwise_cosine(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity between two single embedding vectors."""
    return float(cosine_similarity([a], [b])[0][0])


def _chunk_bidirectional_score(resume_chunks: list[str], jd_chunks: list[str]) -> float:
    """How well JD requirements are covered by resume (and vice versa).

    Each chunk is embedded; for every JD chunk the best-matching resume chunk
    is found (and the reverse), and the two mean best-match similarities are
    averaged. Returns 0.0 when either list is empty.
    """
    if not resume_chunks or not jd_chunks:
        return 0.0

    resume_emb = model.encode(resume_chunks, convert_to_numpy=True)
    jd_emb = model.encode(jd_chunks, convert_to_numpy=True)
    # Rows index resume chunks, columns index JD chunks.
    sim_matrix = cosine_similarity(resume_emb, jd_emb)

    jd_coverage = float(sim_matrix.max(axis=0).mean())
    resume_coverage = float(sim_matrix.max(axis=1).mean())
    return (jd_coverage + resume_coverage) / 2.0


def remove_stop_words(text: str) -> set[str]:
    """Return the unique whitespace-separated tokens of *text* minus stop words.

    Stop-word comparison is case-insensitive (STOP_WORDS is all lowercase), so
    "The" is removed just like "the"; surviving tokens keep their original case.
    """
    return {token for token in text.split() if token.lower() not in STOP_WORDS}


# ---------------------------------------------------------------------------
# Scoring functions
# ---------------------------------------------------------------------------
def keyword_match_score(resume_text: str, jd_text: str) -> float:
    """Frequency-weighted skill match between a resume and a JD, on 0–100.

    Strategy:
      - Extract skills from both texts with the helpers from utilities.skills.
      - Each JD skill carries a weight equal to its frequency in the JD, so a
        missing high-frequency skill costs more than a rarely mentioned one.
      - Score = matched weight / total weight * 100, rounded to 2 decimals.

    Returns 0.0 when the JD yields no skills (or no positive total weight).
    """
    jd_skills_freq = extract_required_skills_from_jd(jd_text)  # {skill: freq}
    resume_skills = extract_resume_skills(resume_text)

    if not jd_skills_freq:
        return 0.0

    total_weight = sum(jd_skills_freq.values())
    if total_weight <= 0:
        # Defensive: avoid ZeroDivisionError if every frequency is zero.
        return 0.0

    matched_weight = sum(
        freq for skill, freq in jd_skills_freq.items() if skill in resume_skills
    )
    return round(matched_weight / total_weight * 100, 2)


def _normalize_for_embedding(text: str) -> str:
    """Convert a resume or JD into a neutral skill-centric representation.

    Problem: resumes use first-person achievement language; JDs use
    third-person requirement language, so a general-purpose model can see them
    as stylistically distant even when skills match.

    Fix: represent both documents in the same
    "skills: X Y Z context: ..." format, built from the extracted skills
    (sorted, space-joined) and the skill-heavy sentences, so the model compares
    skill vocabulary rather than writing style.
    """
    cleaned = clean_text(text)
    extracted_skills = extract_resume_skills(cleaned)
    skill_list = " ".join(sorted(extracted_skills))
    skill_context = clean_text(extract_skill_sentences(text))
    return f"skills: {skill_list} context: {skill_context}"


def semantic_match_score(resume_text: str, jd_text: str) -> float:
    """Semantic similarity tuned for resume <-> JD alignment, on 0–100.

    Combines three cosine signals, then calibrates the blend:
      1. Normalised full-doc embedding (50%) - reduces style mismatch
      2. Chunk-level bi-directional on RAW text (35%) - keeps sentence boundaries
      3. Skill-sentences-only embedding (15%) - fine-grained skill context

    Signals 2 and 3 fall back to signal 1 when they cannot be computed.
    Returns 0.0 if either input is empty, whitespace-only or None.
    """
    if not (resume_text or "").strip() or not (jd_text or "").strip():
        return 0.0

    # Signal 1: normalised doc (style-agnostic skill comparison)
    resume_norm = _normalize_for_embedding(resume_text)
    jd_norm = _normalize_for_embedding(jd_text)
    doc_emb = model.encode([resume_norm, jd_norm], convert_to_numpy=True)
    full_sim = _pairwise_cosine(doc_emb[0], doc_emb[1])

    # Signal 2: chunk-level on RAW text (needs \n/. boundaries intact)
    resume_chunks = split_into_chunks(resume_text)
    jd_chunks = split_into_chunks(jd_text)
    if len(resume_chunks) > 1 and len(jd_chunks) > 1:
        chunk_sim = _chunk_bidirectional_score(resume_chunks, jd_chunks)
    else:
        chunk_sim = full_sim

    # Signal 3: skill-sentence embedding
    resume_skill_text = extract_skill_sentences(resume_text)
    jd_skill_text = extract_skill_sentences(jd_text)
    if resume_skill_text and jd_skill_text:
        skill_emb = model.encode(
            [truncate_text(resume_skill_text, 4000), truncate_text(jd_skill_text, 4000)],
            convert_to_numpy=True,
        )
        skill_sim = _pairwise_cosine(skill_emb[0], skill_emb[1])
    else:
        skill_sim = full_sim

    raw_cosine = 0.50 * full_sim + 0.35 * chunk_sim + 0.15 * skill_sim
    return calibrate_semantic_score(raw_cosine)


def experience_level_penalty(resume_text: str, jd_text: str) -> float:
    """Detect a seniority mismatch and return a penalty in points.

    Seniority words are matched as whole words, case-insensitively, so that
    e.g. "internal" does not count as "intern" nor "leading" as "lead".

    Returns:
        10.0 if the JD signals senior and the resume signals junior;
        3.0 if the JD signals junior and the resume signals senior
        (over-qualified, minor penalty); otherwise 0.0.
        None inputs are treated as empty text.
    """
    jd = jd_text or ""
    resume = resume_text or ""

    # JD wants senior but resume signals junior
    if _SENIOR_PATTERN.search(jd) and _JUNIOR_PATTERN.search(resume):
        return 10.0

    # JD wants junior but resume is over-qualified (minor penalty)
    if _JUNIOR_PATTERN.search(jd) and _SENIOR_PATTERN.search(resume):
        return 3.0

    return 0.0


# ---------------------------------------------------------------------------
# Final composite score
# ---------------------------------------------------------------------------
def final_ats_score(resume_text: str, jd_text: str) -> dict:
    """Compute the composite ATS score for a resume against a JD.

    Weighted as:
      45% semantic similarity (SEMANTIC_WEIGHT)
      55% keyword match       (KEYWORD_WEIGHT; frequency-weighted skill match)

    The seniority mismatch penalty from experience_level_penalty() is
    subtracted from the blended score, which is floored at 0.

    Returns:
        A dict with keys "semantic_score", "keyword_score" and
        "final_ats_score", each rounded to 2 decimals (intended to be
        compatible with the ScoreResponse schema, which is defined elsewhere).
    """
    semantic = semantic_match_score(resume_text, jd_text)
    keyword = keyword_match_score(resume_text, jd_text)
    penalty = experience_level_penalty(resume_text, jd_text)

    raw_score = round(SEMANTIC_WEIGHT * semantic + KEYWORD_WEIGHT * keyword, 2)
    final = round(max(0.0, raw_score - penalty), 2)

    return {
        "semantic_score": round(semantic, 2),
        "keyword_score": round(keyword, 2),
        "final_ats_score": final,
    }


# ---------------------------------------------------------------------------
# Smoke-test
# ---------------------------------------------------------------------------
def _run_smoke_test() -> None:
    """Score a handful of (resume, JD) pairs and print a small results table."""
    examples = [
        (
            "Python developer with FastAPI, SQL, and machine learning experience",
            "Looking for a Python developer with FastAPI, SQL, and ML skills",
        ),
        (
            "Built backend services using Python frameworks and databases",
            "Python developer with FastAPI and SQL",
        ),
        (
            "Python Python Python SQL SQL FastAPI",
            "Python developer with FastAPI and SQL",
        ),
        (
            "Professional photographer specialising in portraits and wildlife",
            "Hiring a machine learning engineer with Python and PyTorch",
        ),
        (
            "Led ML teams, deployed large-scale models, optimised transformers",
            "Junior Python developer with basic ML",
        ),
        (
            "NLP engineer: PyTorch, HuggingFace transformers, LLM fine-tuning, RAG pipelines",
            "Senior ML engineer: LLM, RAG, fine-tuning, Python, AWS SageMaker",
        ),
    ]

    print(f"{'#':<3} {'Semantic':>10} {'Keyword':>10} {'Final ATS':>10}")
    print("-" * 38)
    for i, (resume, jd) in enumerate(examples):
        result = final_ats_score(resume, jd)
        print(
            f"{i:<3} {result['semantic_score']:>10} "
            f"{result['keyword_score']:>10} "
            f"{result['final_ats_score']:>10}"
        )


if __name__ == "__main__":
    _run_smoke_test()