Spaces:
Runtime error
Runtime error
| import re | |
| import logging | |
| from io import BytesIO | |
| from typing import Dict, Any | |
| import pdfplumber | |
| from sentence_transformers import SentenceTransformer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import nltk | |
| from nltk.corpus import stopwords | |
| from nltk import pos_tag | |
| from nltk.tokenize import wordpunct_tokenize # ✅ SAFE TOKENIZER | |
# pdfminer (used internally by pdfplumber) is very chatty; silence everything
# below ERROR so page extraction doesn't flood the logs.
logging.getLogger("pdfminer").setLevel(logging.ERROR)

# English stopword set used to filter keyword candidates in extract_keywords.
STOPWORDS = set(stopwords.words("english"))

# Sentence-embedding model shared by every similarity computation below.
# Loaded once at import time; NOTE(review): this downloads weights on first
# run — confirm the deployment caches the model.
model = SentenceTransformer("all-MiniLM-L6-v2")
def f(x):
    """Round *x* to two decimal places, always returning a float."""
    rounded = round(x, 2)
    return float(rounded)
def clean(text: str) -> str:
    """Normalise text: lowercase, keep only [a-z0-9 -], squeeze whitespace."""
    lowered = text.lower()
    kept = re.sub(r"[^a-z0-9\s\-]", " ", lowered)
    collapsed = re.sub(r"\s+", " ", kept)
    return collapsed.strip()
def extract_pdf_text(file_bytes: bytes) -> str:
    """Extract plain text from a PDF given as raw bytes.

    Concatenates the extractable text of every page, newline-separated,
    skipping pages where pdfplumber returns None/empty text.
    """
    pages = []
    with pdfplumber.open(BytesIO(file_bytes)) as pdf:
        for page in pdf.pages:
            # extract_text() is expensive; call it once per page and reuse
            # (the original called it twice: once to test, once to append).
            page_text = page.extract_text()
            if page_text:
                pages.append(page_text)
    # join + strip matches the original "text += page + '\n'" then strip().
    return "\n".join(pages).strip()
def extract_email(text: str):
    """Return the first email-address-looking substring in *text*, or None."""
    match = re.search(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)
    if match:
        return match.group(0)
    return None
def extract_phone(text: str):
    """Return the first phone-number-looking substring in *text*, or None.

    Tries an Indian mobile pattern first, then a generic international one.
    """
    indian = r"\b(\+91[-\s]?)?[6-9]\d{9}\b"
    generic = r"\b\+?\d{1,3}[-\s]?\(?\d{2,4}\)?[-\s]?\d{3,4}[-\s]?\d{4}\b"
    for pattern in (indian, generic):
        match = re.search(pattern, text)
        if match is not None:
            return match.group(0)
    return None
def extract_name(text: str):
    """Guess the candidate's name from the top of the resume text.

    Scans the first five non-empty lines for a 2-4 word line whose words
    all start with an uppercase character and that contains no obvious
    heading / job-title word. Returns the matching line, else None.
    """
    blocked = (
        "resume", "curriculum", "vitae",
        "engineer", "developer", "analyst",
        "software", "machine", "data",
    )
    non_empty = [ln.strip() for ln in text.split("\n") if ln.strip()]
    for candidate in non_empty[:5]:
        tokens = candidate.split()
        if not (2 <= len(tokens) <= 4):
            continue
        if not all(tok[0].isupper() for tok in tokens):
            continue
        lowered = candidate.lower()
        if any(word in lowered for word in blocked):
            continue
        return candidate
    return None
def embed_sim(a: str, b: str) -> float:
    """Cosine similarity between the sentence embeddings of *a* and *b*."""
    vec_a, vec_b = model.encode([a, b])
    similarity = cosine_similarity([vec_a], [vec_b])[0][0]
    return float(similarity)
def chunk_text(text: str, size=120):
    """Split *text* into chunks of at most *size* whitespace-separated words."""
    words = text.split()
    chunks = []
    for start in range(0, len(words), size):
        chunks.append(" ".join(words[start:start + size]))
    return chunks
def chunked_similarity(long_text: str, short_text: str) -> float:
    """Average of the best (up to) five chunk-vs-text similarities.

    The long text is split into word chunks; each chunk is compared to
    the short text with embed_sim and the top five scores are averaged.
    Returns 0.0 when the long text has no words at all.
    """
    pieces = chunk_text(long_text)
    if not pieces:
        return 0.0
    scores = sorted((embed_sim(piece, short_text) for piece in pieces),
                    reverse=True)
    top = scores[:5]
    return sum(top) / len(top)
def extract_keywords(text: str):
    """Return the set of noun tokens (length > 2, non-stopword) in *text*.

    wordpunct_tokenize is regex-based, so no punkt model download is needed.
    """
    keywords = set()
    for word, tag in pos_tag(wordpunct_tokenize(text)):
        if tag.startswith("NN") and word not in STOPWORDS and len(word) > 2:
            keywords.add(word)
    return keywords
def formatting_score(text: str) -> float:
    """Heuristic structure score in [0.0, 1.0].

    Two points each (out of ten) for: an 'experience' section, a 'skills'
    section, an 'education' section, bullet characters, and a word count
    between 300 and 900.
    """
    lowered = text.lower()
    word_count = len(text.split())
    checks = [
        "experience" in lowered,
        "skills" in lowered,
        "education" in lowered,
        "-" in text or "•" in text,
        300 <= word_count <= 900,
    ]
    return 2 * sum(checks) / 10.0
def generic_penalty(text: str) -> float:
    """Penalty for generic filler phrases: 0.05 each, capped at 0.20."""
    fillers = (
        "hardworking", "team player", "looking for opportunity",
        "learn new things", "technology", "computer",
        "passionate", "motivated", "self learner",
    )
    penalty = sum(0.05 for phrase in fillers if phrase in text)
    return min(penalty, 0.20)
def verdict(score: float) -> str:
    """Map a 0-100 ATS score onto a human-readable fit label."""
    bands = (
        (35, "Poor Fit"),
        (50, "Below Average"),
        (65, "Average Fit"),
        (80, "Good Fit"),
    )
    for upper_bound, label in bands:
        if score < upper_bound:
            return label
    return "Strong Fit"
def calibrate(raw: float) -> float:
    """Map a raw score linearly onto the [20, 90] presentation band.

    Raw values are clamped to the empirically observed [0.08, 0.70] range
    before scaling, so the output never leaves [20, 90].
    """
    RAW_MIN, RAW_MAX = 0.08, 0.70
    clamped = min(max(raw, RAW_MIN), RAW_MAX)
    fraction = (clamped - RAW_MIN) / (RAW_MAX - RAW_MIN)
    return 20 + fraction * 70
def ats_score(resume_text: str, jd_text: str, role: str) -> Dict[str, Any]:
    """Score a resume against a job description.

    Combines embedding similarity, noun-keyword overlap and formatting
    heuristics into a calibrated 0-100 ATS score, and returns it with
    contact info, a verdict label, a per-component breakdown that sums
    to the final score, and the top missing JD keywords.

    NOTE(review): with only two input texts, the 'skill' and 'experience'
    components are the same similarity signal, just weighted separately —
    confirm whether separate resume sections were intended.
    """
    resume_clean = clean(resume_text)
    jd_clean = clean(jd_text)

    # Contact info comes from the raw text so casing and line breaks survive.
    name = extract_name(resume_text)
    email = extract_email(resume_text)
    phone = extract_phone(resume_text)

    # The original computed this twice with identical arguments; the chunked
    # embedding pass is the expensive step, so compute once and reuse.
    similarity = chunked_similarity(resume_clean, jd_clean)
    skill_sim = similarity
    exp_sim = similarity

    # Keyword overlap ratio: resume nouns vs JD nouns, capped at 1.0
    # (max(..., 1) guards against an empty JD keyword set).
    r_kw = extract_keywords(resume_clean)
    j_kw = extract_keywords(jd_clean)
    kw_ratio = min(len(r_kw & j_kw) / max(len(j_kw), 1), 1.0)

    fmt = formatting_score(resume_clean)

    # Role-dependent weighting: research roles lean more on semantic match.
    if role in {"researcher", "research_engineer"}:
        w_skill, w_kw, w_exp, w_fmt = 0.45, 0.20, 0.25, 0.10
    else:
        w_skill, w_kw, w_exp, w_fmt = 0.40, 0.25, 0.25, 0.10

    raw = (
        w_skill * skill_sim +
        w_kw * kw_ratio +
        w_exp * exp_sim +
        w_fmt * fmt
    )
    raw -= generic_penalty(resume_clean)
    raw = max(raw, 0.0)

    final_score = f(calibrate(raw))

    # Per-component maxima shown in the user-facing breakdown.
    MAX = {
        "skill": 40,
        "keyword": 30,
        "experience": 20,
        "formatting": 10
    }
    components = {
        "skill": skill_sim * MAX["skill"],
        "keyword": kw_ratio * MAX["keyword"],
        "experience": exp_sim * MAX["experience"],
        "formatting": fmt * MAX["formatting"]
    }
    # Rescale the components so the breakdown sums exactly to final_score;
    # `or 1.0` avoids a ZeroDivisionError when every component is zero.
    component_sum = sum(components.values()) or 1.0
    scale = final_score / component_sum

    breakdown = {
        "skill_match": {
            "score": f(components["skill"] * scale),
            "max": MAX["skill"]
        },
        "keyword_match": {
            "score": f(components["keyword"] * scale),
            "max": MAX["keyword"]
        },
        "experience_match": {
            "score": f(components["experience"] * scale),
            "max": MAX["experience"]
        },
        "formatting": {
            "score": f(components["formatting"] * scale),
            "max": MAX["formatting"]
        }
    }

    return {
        "name": name,
        "email": email,
        "phone": phone,
        "ats_score": final_score,
        "verdict": verdict(final_score),
        "score_breakdown": breakdown,
        "missing_keywords": sorted(j_kw - r_kw)[:10]
    }