import re
import logging
from io import BytesIO
from typing import Dict, Any

import pdfplumber
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.tokenize import wordpunct_tokenize  # avoids needing the punkt / punkt_tab tokenizer data

# Requires the NLTK "stopwords" corpus and the averaged perceptron tagger
# (download once via nltk.download if they are not already installed).

logging.getLogger("pdfminer").setLevel(logging.ERROR)

STOPWORDS = set(stopwords.words("english"))
model = SentenceTransformer("all-MiniLM-L6-v2")


def f(x):
    """Round to two decimal places and return a plain float."""
    return float(round(x, 2))


def clean(text: str) -> str:
    """Lowercase, keep only letters, digits, and hyphens, and collapse whitespace."""
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s\-]", " ", text)
    return re.sub(r"\s+", " ", text).strip()


def extract_pdf_text(file_bytes: bytes) -> str:
    """Concatenate the extracted text of every page in the PDF."""
    text = ""
    with pdfplumber.open(BytesIO(file_bytes)) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    return text.strip()


def extract_email(text: str):
    m = re.search(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)
    return m.group(0) if m else None


def extract_phone(text: str):
    # Indian mobile numbers first, then a generic international fallback.
    patterns = [
        r"\b(\+91[-\s]?)?[6-9]\d{9}\b",
        r"\b\+?\d{1,3}[-\s]?\(?\d{2,4}\)?[-\s]?\d{3,4}[-\s]?\d{4}\b",
    ]
    for p in patterns:
        m = re.search(p, text)
        if m:
            return m.group(0)
    return None


def extract_name(text: str):
    """Heuristic: the first short, title-cased line near the top that is not a heading."""
    BAD = {
        "resume", "curriculum", "vitae", "engineer", "developer",
        "analyst", "software", "machine", "data",
    }
    lines = [l.strip() for l in text.split("\n") if l.strip()]
    for line in lines[:5]:
        words = line.split()
        if 2 <= len(words) <= 4 and all(w[0].isupper() for w in words):
            low = line.lower()
            if not any(b in low for b in BAD):
                return line
    return None


def embed_sim(a: str, b: str) -> float:
    """Cosine similarity between the sentence embeddings of two strings."""
    emb = model.encode([a, b])
    return float(cosine_similarity([emb[0]], [emb[1]])[0][0])


def chunk_text(text: str, size=120):
    words = text.split()
    return [" ".join(words[i:i + size]) for i in range(0, len(words), size)]


def chunked_similarity(long_text: str, short_text: str) -> float:
    """Compare the short text against word chunks of the long text; average the top-5 similarities."""
    chunks = chunk_text(long_text)
    if not chunks:
        return 0.0
    sims = [embed_sim(c, short_text) for c in chunks]
    sims.sort(reverse=True)
    return sum(sims[:5]) / min(5, len(sims))


def extract_keywords(text: str):
    """Return nouns (by POS tag) that are not stopwords and are longer than two characters."""
    tokens = wordpunct_tokenize(text)  # no punkt / punkt_tab download needed
    tagged = pos_tag(tokens)
    return {
        w for w, t in tagged
        if t.startswith("NN") and w not in STOPWORDS and len(w) > 2
    }


def formatting_score(text: str) -> float:
    """Crude structural check: section headings, bullet markers, and a sensible word count."""
    score = 0
    t = text.lower()
    if "experience" in t:
        score += 2
    if "skills" in t:
        score += 2
    if "education" in t:
        score += 2
    if "-" in text or "•" in text:
        score += 2
    wc = len(text.split())
    if 300 <= wc <= 900:
        score += 2
    return score / 10.0


def generic_penalty(text: str) -> float:
    """Small deduction, capped at 0.20, for boilerplate phrases."""
    GENERIC = [
        "hardworking", "team player", "looking for opportunity",
        "learn new things", "technology", "computer",
        "passionate", "motivated", "self learner",
    ]
    penalty = sum(0.05 for g in GENERIC if g in text)
    return min(penalty, 0.20)


def verdict(score: float) -> str:
    if score < 35:
        return "Poor Fit"
    if score < 50:
        return "Below Average"
    if score < 65:
        return "Average Fit"
    if score < 80:
        return "Good Fit"
    return "Strong Fit"


def calibrate(raw: float) -> float:
    """Map the clamped raw weighted score onto a 20-90 scale."""
    RAW_MIN, RAW_MAX = 0.08, 0.70
    raw = max(min(raw, RAW_MAX), RAW_MIN)
    scaled = (raw - RAW_MIN) / (RAW_MAX - RAW_MIN)
    return 20 + scaled * 70  # → [20, 90]


def ats_score(resume_text: str, jd_text: str, role: str) -> Dict[str, Any]:
    resume_clean = clean(resume_text)
    jd_clean = clean(jd_text)

    # Contact info
    name = extract_name(resume_text)
    email = extract_email(resume_text)
    phone = extract_phone(resume_text)

    # Similarities (both components currently use the same resume-vs-JD similarity)
    skill_sim = chunked_similarity(resume_clean, jd_clean)
    exp_sim = chunked_similarity(resume_clean, jd_clean)

    # Keywords
    r_kw = extract_keywords(resume_clean)
    j_kw = extract_keywords(jd_clean)
    kw_ratio = min(len(r_kw & j_kw) / max(len(j_kw), 1), 1.0)

    # Formatting
    fmt = formatting_score(resume_clean)

    # Weights
    if role in {"researcher", "research_engineer"}:
        w_skill, w_kw, w_exp, w_fmt = 0.45, 0.20, 0.25, 0.10
    else:
        w_skill, w_kw, w_exp, w_fmt = 0.40, 0.25, 0.25, 0.10

    raw = (
        w_skill * skill_sim
        + w_kw * kw_ratio
        + w_exp * exp_sim
        + w_fmt * fmt
    )
    raw -= generic_penalty(resume_clean)
    raw = max(raw, 0.0)

    final_score = f(calibrate(raw))

    # Component maxima
    MAX = {
        "skill": 40,
        "keyword": 30,
        "experience": 20,
        "formatting": 10,
    }

    components = {
        "skill": skill_sim * MAX["skill"],
        "keyword": kw_ratio * MAX["keyword"],
        "experience": exp_sim * MAX["experience"],
        "formatting": fmt * MAX["formatting"],
    }

    # Rescale the components so the breakdown sums to the calibrated final score.
    component_sum = sum(components.values()) or 1.0
    scale = final_score / component_sum

    breakdown = {
        "skill_match": {"score": f(components["skill"] * scale), "max": MAX["skill"]},
        "keyword_match": {"score": f(components["keyword"] * scale), "max": MAX["keyword"]},
        "experience_match": {"score": f(components["experience"] * scale), "max": MAX["experience"]},
        "formatting": {"score": f(components["formatting"] * scale), "max": MAX["formatting"]},
    }

    return {
        "name": name,
        "email": email,
        "phone": phone,
        "ats_score": final_score,
        "verdict": verdict(final_score),
        "score_breakdown": breakdown,
        "missing_keywords": sorted(j_kw - r_kw)[:10],
    }
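

# --- Example usage ----------------------------------------------------------
# Minimal sketch of how the module above might be driven end to end. The PDF
# path, job-description string, and role label below are placeholders, not
# part of the module itself.
if __name__ == "__main__":
    with open("sample_resume.pdf", "rb") as fh:  # placeholder file path
        resume_text = extract_pdf_text(fh.read())

    jd_text = (
        "Machine learning engineer with experience in Python, NLP, "
        "embeddings, and deploying models to production."
    )

    result = ats_score(resume_text, jd_text, role="ml_engineer")
    print(result["ats_score"], result["verdict"])
    print(result["score_breakdown"])
    print("Missing keywords:", result["missing_keywords"])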