| import os |
| import re |
|
|
| import numpy as np |
| from sklearn.metrics.pairwise import cosine_similarity |
| from sentence_transformers import SentenceTransformer |
| from utilities.skills import ( |
| extract_resume_skills, |
| extract_required_skills_from_jd, |
| SKILLS_SORTED_BY_LENGTH, |
| clean_text, |
| ) |
|
|
| |
# Identifier of the sentence-embedding model. The SEMANTIC_MODEL environment
# variable overrides the built-in default.
SEMANTIC_MODEL_ID = os.environ.get("SEMANTIC_MODEL", "msmarco-distilbert-base-v4")

# Default character cap used by truncate_text().
MAX_DOC_CHARS = 8_000
# Default upper bound on the number of chunks split_into_chunks() returns.
MAX_CHUNKS = 24
# Segments shorter than this many characters are discarded when chunking
# and when collecting skill sentences.
MIN_CHUNK_CHARS = 35

# Instantiated once at import time; every embedding call in this module
# goes through this shared instance.
model = SentenceTransformer(SEMANTIC_MODEL_ID)
|
|
|
|
| |
| |
| |
# Tokens that carry no matching signal and are dropped by remove_stop_words().
# Entries are lowercase; membership tests are exact (case-sensitive).
STOP_WORDS: set[str] = {
    # Articles, conjunctions and prepositions
    "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
    "of", "with", "by", "from", "as",
    # Auxiliary and modal verbs
    "is", "was", "are", "were", "be", "been", "being", "have", "has", "had",
    "do", "does", "did", "will", "would", "could", "should", "may", "might",
    "shall", "can", "need",
    # Pronouns and determiners
    "that", "this", "these", "those", "it", "its", "we", "our", "you",
    "your", "they", "their", "he", "she", "his", "her", "i", "my",
    # Common adverbs and connectives
    "not", "no", "so", "if", "then", "than", "also", "just", "only",
    "about", "up", "out", "over", "into", "through", "during", "including",
    # Generic resume / job-description filler
    "used", "use", "using", "work", "working", "works", "strong", "good",
    "experience", "experiences", "role", "team", "company", "environment",
    "ability", "skills", "skill", "looking", "required", "requirement",
    "plus", "bonus", "nice", "preferred", "knowledge", "understanding",
    "familiarity", "proficiency", "proficient",
    # First half of "hands on"; the second half ("on") is already listed above.
    "hands",
}
|
|
|
|
| |
| |
| |
|
|
def truncate_text(text: str, max_chars: int = MAX_DOC_CHARS) -> str:
    """Shorten *text* to at most *max_chars* characters without splitting a word.

    Args:
        text: The string to shorten.
        max_chars: Maximum length of the result. Values <= 0 yield "".

    Returns:
        *text* unchanged when it already fits. Otherwise the longest prefix
        that fits and ends on a space-delimited word boundary. If the prefix
        contains no space at all, the hard cut is returned as-is.
    """
    if max_chars <= 0:
        # A negative limit would otherwise slice from the end of the string.
        return ""
    if len(text) <= max_chars:
        return text

    head = text[:max_chars]
    if text[max_chars] == " ":
        # The cut fell exactly between two words, so the last word in `head`
        # is complete and must not be discarded.
        return head.rstrip(" ")
    # The cut landed inside a word: back up to the previous space.
    return head.rsplit(" ", 1)[0]
|
|
|
|
def split_into_chunks(text: str, max_chunks: int = MAX_CHUNKS) -> list[str]:
    """Break a resume or job description into comparable segments.

    The text is split on line breaks and on whitespace that follows ".", "!"
    or "?". Stripped segments shorter than MIN_CHUNK_CHARS are discarded. If
    nothing survives, the text is re-cut into 55-word windows with the same
    minimum-length filter.

    Args:
        text: Raw document text.
        max_chunks: Maximum number of segments to return.

    Returns:
        At most *max_chunks* segments, in document order; [] for empty input.
    """
    if not text:
        return []

    stripped_parts = (
        part.strip() for part in re.split(r"[\n\r]+|(?<=[.!?])\s+", text)
    )
    segments = [part for part in stripped_parts if len(part) >= MIN_CHUNK_CHARS]

    # Normal case: line/sentence splitting found something (or the text was
    # whitespace only, in which case there is nothing to fall back to).
    if segments or not text.strip():
        return segments[:max_chunks]

    # Fallback: no line or sentence was long enough, so use fixed word windows.
    tokens = text.split()
    step = 55
    windows = (
        " ".join(tokens[start : start + step])
        for start in range(0, len(tokens), step)
    )
    segments = [window for window in windows if len(window) >= MIN_CHUNK_CHARS]
    return segments[:max_chunks]
|
|
|
|
def extract_skill_sentences(text: str) -> str:
    """Return only the segments of *text* that mention a known skill.

    The text is split on line breaks, "." and ";". Each piece is passed
    through clean_text() and kept when it is at least MIN_CHUNK_CHARS long
    and contains any entry of SKILLS_SORTED_BY_LENGTH as a substring.

    Returns:
        The kept segments joined by single spaces, or clean_text(text) when
        no segment qualifies.
    """
    cleaned_segments = (clean_text(piece) for piece in re.split(r"[\n\r.;]+", text))
    skill_segments = [
        segment
        for segment in cleaned_segments
        if len(segment) >= MIN_CHUNK_CHARS
        # Plain substring containment, not a whole-word match.
        and any(skill in segment for skill in SKILLS_SORTED_BY_LENGTH)
    ]
    if skill_segments:
        return " ".join(skill_segments)
    return clean_text(text)
|
|
|
|
def calibrate_semantic_score(
    cosine: float, *, low: float = 0.20, high: float = 0.78
) -> float:
    """Map a raw cosine similarity onto a 0-100 ATS-style scale.

    The cosine is clamped to [0, 1] and then linearly rescaled so that *low*
    maps to 0 and *high* maps to 100; anything outside that band saturates.

    NOTE(review): the default band was originally justified by typical
    MPNet/MiniLM cosine ranges, while this module's default model is
    "msmarco-distilbert-base-v4" - confirm the band suits the model in use.

    Args:
        cosine: Raw cosine similarity.
        low: Cosine value that maps to a score of 0.
        high: Cosine value that maps to a score of 100.

    Returns:
        The calibrated score rounded to two decimals. NaN input yields 0.0.

    Raises:
        ValueError: If *high* is not greater than *low*.
    """
    if high <= low:
        raise ValueError("high must be greater than low")
    if np.isnan(cosine):
        # np.clip passes NaN through, which would otherwise surface as a NaN
        # score; treat an undefined similarity as no similarity.
        return 0.0

    cosine = float(np.clip(cosine, 0.0, 1.0))
    scaled = (cosine - low) / (high - low) * 100.0
    return round(float(np.clip(scaled, 0.0, 100.0)), 2)
|
|
|
|
def _pairwise_cosine(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity between vectors *a* and *b* as a float."""
    # Each vector is wrapped in a list so cosine_similarity receives it as a
    # single-row matrix; the result then has exactly one entry.
    similarity = cosine_similarity([a], [b])
    return float(similarity[0][0])
|
|
|
|
def _chunk_bidirectional_score(resume_chunks: list[str], jd_chunks: list[str]) -> float:
    """Score how well the two chunk lists cover each other.

    Every chunk is embedded with the module-level model. For each JD chunk
    the best-matching resume chunk is found (and vice versa); the two mean
    best-match similarities are then averaged.

    Returns:
        The averaged coverage, or 0.0 when either list is empty.
    """
    if not (resume_chunks and jd_chunks):
        return 0.0

    resume_vectors = model.encode(resume_chunks, convert_to_numpy=True)
    jd_vectors = model.encode(jd_chunks, convert_to_numpy=True)

    # Rows correspond to resume chunks, columns to JD chunks.
    similarities = cosine_similarity(resume_vectors, jd_vectors)

    best_per_jd_chunk = similarities.max(axis=0)
    best_per_resume_chunk = similarities.max(axis=1)

    jd_coverage = float(best_per_jd_chunk.mean())
    resume_coverage = float(best_per_resume_chunk.mean())
    return (jd_coverage + resume_coverage) / 2.0
|
|
|
|
def remove_stop_words(text: str) -> set:
    """Return the distinct whitespace-separated tokens of *text* that are not stop words.

    Tokens are compared to STOP_WORDS exactly as they appear; no case folding
    or punctuation stripping happens here.
    """
    return {token for token in text.split() if token not in STOP_WORDS}
|
|
|
|
| |
| |
| |
|
|
def keyword_match_score(resume_text: str, jd_text: str) -> float:
    """Frequency-weighted skill overlap between a resume and a job description.

    Skills are taken from the JD via extract_required_skills_from_jd() and
    from the resume via extract_resume_skills(). The JD result is used as a
    mapping of skill -> numeric weight (presumably an occurrence count;
    verify against the helper).

    Score = (sum of weights of JD skills found in the resume)
            / (sum of all JD skill weights) * 100

    A heavily weighted JD skill that is missing from the resume therefore
    costs more than a lightly weighted one.

    Returns:
        Percentage in [0, 100] rounded to two decimals; 0.0 when the JD
        yields no skills.
    """
    jd_weights = extract_required_skills_from_jd(jd_text)
    candidate_skills = extract_resume_skills(resume_text)

    if not jd_weights:
        return 0.0

    # Accumulate both totals in one pass over the JD skills.
    total = 0
    matched = 0
    for skill, weight in jd_weights.items():
        total += weight
        if skill in candidate_skills:
            matched += weight

    return round(matched / total * 100, 2)
|
|
|
|
def _normalize_for_embedding(text: str) -> str:
    """Rewrite a resume or JD into a common skill-centric form before embedding.

    Motivation (carried over from the original author's notes): resumes are
    written as first-person achievements and JDs as third-person
    requirements, so embedding the raw prose lets writing style dominate the
    similarity. Putting both documents into the same
    "skills: ... context: ..." layout makes the comparison about skill
    vocabulary instead.

    Returns:
        "skills: <sorted skills> context: <cleaned skill-bearing sentences>"
    """
    # The same extractor is applied to resumes and JDs alike.
    skills = extract_resume_skills(clean_text(text))
    skills_part = " ".join(sorted(skills))

    context_part = clean_text(extract_skill_sentences(text))
    return f"skills: {skills_part} context: {context_part}"
|
|
|
|
def semantic_match_score(resume_text: str, jd_text: str) -> float:
    """Semantic similarity between a resume and a job description (0-100).

    Three cosine signals are blended and then calibrated:
      1. 50% - both documents in the normalised skill-centric form
      2. 35% - chunk-level bidirectional coverage on the raw text
      3. 15% - skill-bearing sentences only

    Signals 2 and 3 fall back to signal 1 when they cannot be computed.

    Returns:
        Calibrated score; 0.0 when either input is blank.
    """
    if not (resume_text.strip() and jd_text.strip()):
        return 0.0

    # Signal 1: whole documents in the normalised representation.
    normalised_docs = [
        _normalize_for_embedding(resume_text),
        _normalize_for_embedding(jd_text),
    ]
    doc_vectors = model.encode(normalised_docs, convert_to_numpy=True)
    full_sim = _pairwise_cosine(doc_vectors[0], doc_vectors[1])

    # Signal 2: chunk coverage, only meaningful with several chunks per side.
    resume_chunks = split_into_chunks(resume_text)
    jd_chunks = split_into_chunks(jd_text)
    chunk_sim = full_sim
    if len(resume_chunks) > 1 and len(jd_chunks) > 1:
        chunk_sim = _chunk_bidirectional_score(resume_chunks, jd_chunks)

    # Signal 3: skill-bearing sentences, each side capped at 4000 characters.
    resume_skill_text = extract_skill_sentences(resume_text)
    jd_skill_text = extract_skill_sentences(jd_text)
    skill_sim = full_sim
    if resume_skill_text and jd_skill_text:
        skill_pair = [
            truncate_text(resume_skill_text, 4000),
            truncate_text(jd_skill_text, 4000),
        ]
        skill_vectors = model.encode(skill_pair, convert_to_numpy=True)
        skill_sim = _pairwise_cosine(skill_vectors[0], skill_vectors[1])

    blended_cosine = 0.50 * full_sim + 0.35 * chunk_sim + 0.15 * skill_sim
    return calibrate_semantic_score(blended_cosine)
|
|
|
|
# Seniority signals are matched as whole words/phrases. Plain substring tests
# would also fire on "leadership", "architecture", "staffing", "internal",
# "international", "graduated" or "undergraduate" and produce false penalties.
_SENIOR_SIGNAL_RE = re.compile(
    r"\b(?:senior|lead|principal|architect|staff|head\s+of)\b",
    re.IGNORECASE,
)
_JUNIOR_SIGNAL_RE = re.compile(
    r"\b(?:junior|entry[\s-]level|graduate|intern|fresher)\b",
    re.IGNORECASE,
)


def experience_level_penalty(resume_text: str, jd_text: str) -> float:
    """Return a penalty (in score points) for a seniority mismatch.

    Each text is scanned, case-insensitively and on word boundaries, for
    senior-level and junior-level signal words.

    Args:
        resume_text: Candidate resume text.
        jd_text: Job description text.

    Returns:
        10.0 when the JD looks senior and the resume looks junior,
        3.0 when the JD looks junior and the resume looks senior,
        0.0 otherwise. The senior-JD check takes precedence when both apply.
    """
    jd_is_senior = _SENIOR_SIGNAL_RE.search(jd_text) is not None
    jd_is_junior = _JUNIOR_SIGNAL_RE.search(jd_text) is not None

    resume_is_senior = _SENIOR_SIGNAL_RE.search(resume_text) is not None
    resume_is_junior = _JUNIOR_SIGNAL_RE.search(resume_text) is not None

    # Under-qualified candidate for a senior role: the heavier penalty.
    if jd_is_senior and resume_is_junior:
        return 10.0

    # Over-qualified candidate for a junior role: a milder penalty.
    if jd_is_junior and resume_is_senior:
        return 3.0
    return 0.0
|
|
|
|
| |
| |
| |
|
|
def final_ats_score(resume_text: str, jd_text: str) -> dict:
    """Compute the composite ATS score for a resume against a job description.

    Weights applied by the code:
        45% semantic similarity (semantic_match_score)
        55% keyword match       (keyword_match_score)

    The seniority-mismatch penalty from experience_level_penalty() is
    subtracted from the weighted sum, and the result is floored at 0.

    Args:
        resume_text: Candidate resume text.
        jd_text: Job description text.

    Returns:
        A dict with keys "semantic_score", "keyword_score" and
        "final_ats_score", each rounded to two decimals.
        NOTE(review): the original docstring says this matches a
        ScoreResponse schema, which is not visible here - confirm.
    """
    # Named so the documented weights and the arithmetic cannot drift apart.
    semantic_weight = 0.45
    keyword_weight = 0.55

    semantic = semantic_match_score(resume_text, jd_text)
    keyword = keyword_match_score(resume_text, jd_text)
    penalty = experience_level_penalty(resume_text, jd_text)

    raw_score = round(semantic_weight * semantic + keyword_weight * keyword, 2)
    final = round(max(0.0, raw_score - penalty), 2)

    return {
        "semantic_score": round(semantic, 2),
        "keyword_score": round(keyword, 2),
        "final_ats_score": final,
    }
|
|
|
|
| |
| |
| |
if __name__ == "__main__":
    # Demo run: each entry is a (resume, job description) pair, scored and
    # printed as one row of a small table.
    demo_pairs = [
        (
            "Python developer with FastAPI, SQL, and machine learning experience",
            "Looking for a Python developer with FastAPI, SQL, and ML skills",
        ),
        (
            "Built backend services using Python frameworks and databases",
            "Python developer with FastAPI and SQL",
        ),
        (
            "Python Python Python SQL SQL FastAPI",
            "Python developer with FastAPI and SQL",
        ),
        (
            "Professional photographer specialising in portraits and wildlife",
            "Hiring a machine learning engineer with Python and PyTorch",
        ),
        (
            "Led ML teams, deployed large-scale models, optimised transformers",
            "Junior Python developer with basic ML",
        ),
        (
            "NLP engineer: PyTorch, HuggingFace transformers, LLM fine-tuning, RAG pipelines",
            "Senior ML engineer: LLM, RAG, fine-tuning, Python, AWS SageMaker",
        ),
    ]

    header = f"{'#':<3} {'Semantic':>10} {'Keyword':>10} {'Final ATS':>10}"
    print(header)
    print("-" * 38)

    # Rows are numbered from 0, in the order the pairs are listed.
    for index, (resume, jd) in enumerate(demo_pairs):
        scores = final_ats_score(resume, jd)
        row = (
            f"{index:<3} {scores['semantic_score']:>10} "
            f"{scores['keyword_score']:>10} "
            f"{scores['final_ats_score']:>10}"
        )
        print(row)