import re
import logging
from io import BytesIO
from typing import Dict, Any
import pdfplumber
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.tokenize import wordpunct_tokenize # ✅ SAFE TOKENIZER
# Silence pdfminer's very chatty layout warnings, which pdfplumber triggers.
logging.getLogger("pdfminer").setLevel(logging.ERROR)
# English stopword set; assumes the NLTK 'stopwords' corpus is already downloaded.
STOPWORDS = set(stopwords.words("english"))
# Shared sentence-embedding model, loaded once at import time.
model = SentenceTransformer("all-MiniLM-L6-v2")
def f(x):
    """Round *x* to two decimal places and return it as a plain float."""
    rounded = round(x, 2)
    return float(rounded)
def clean(text: str) -> str:
    """Normalize text: lowercase, keep only [a-z0-9 -], collapse whitespace."""
    lowered = text.lower()
    letters_only = re.sub(r"[^a-z0-9\s\-]", " ", lowered)
    collapsed = re.sub(r"\s+", " ", letters_only)
    return collapsed.strip()
def extract_pdf_text(file_bytes: bytes) -> str:
    """Extract plain text from a PDF given as raw bytes.

    Pages that yield no text (e.g. scanned-image pages) are skipped.
    Returns the page texts joined with newlines, stripped of outer whitespace.
    """
    pages = []
    with pdfplumber.open(BytesIO(file_bytes)) as pdf:
        for page in pdf.pages:
            # Call extract_text() once per page: the original called it twice,
            # and pdfplumber re-runs layout analysis on every call.
            page_text = page.extract_text()
            if page_text:
                pages.append(page_text)
    # join() instead of repeated += avoids quadratic string building.
    return "\n".join(pages).strip()
def extract_email(text: str):
    """Return the first email address found in *text*, or None."""
    match = re.search(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)
    if match:
        return match.group(0)
    return None
def extract_phone(text: str):
patterns = [
r"\b(\+91[-\s]?)?[6-9]\d{9}\b",
r"\b\+?\d{1,3}[-\s]?\(?\d{2,4}\)?[-\s]?\d{3,4}[-\s]?\d{4}\b"
]
for p in patterns:
m = re.search(p, text)
if m:
return m.group(0)
return None
def extract_name(text: str):
    """Heuristically pull the candidate's name from the resume header.

    Scans the first five non-empty lines for a 2-4 word Title-Case line
    that contains no resume/job-title boilerplate; returns it, else None.
    """
    boilerplate = {
        "resume", "curriculum", "vitae",
        "engineer", "developer", "analyst",
        "software", "machine", "data",
    }
    stripped = (ln.strip() for ln in text.split("\n"))
    top_lines = [ln for ln in stripped if ln][:5]
    for candidate in top_lines:
        parts = candidate.split()
        if not 2 <= len(parts) <= 4:
            continue
        if any(not part[0].isupper() for part in parts):
            continue
        lowered = candidate.lower()
        if all(word not in lowered for word in boilerplate):
            return candidate
    return None
def embed_sim(a: str, b: str) -> float:
    """Cosine similarity between the sentence embeddings of *a* and *b*."""
    vectors = model.encode([a, b])
    score = cosine_similarity([vectors[0]], [vectors[1]])
    return float(score[0][0])
def chunk_text(text: str, size=120):
    """Split *text* into consecutive chunks of at most *size* words each."""
    words = text.split()
    chunks = []
    for start in range(0, len(words), size):
        chunks.append(" ".join(words[start:start + size]))
    return chunks
def chunked_similarity(long_text: str, short_text: str) -> float:
    """Mean of the top-5 chunk-vs-text embedding similarities.

    The long text is split into word chunks; each chunk is compared to
    the short text and the best (up to five) scores are averaged.
    Returns 0.0 when the long text is empty.
    """
    pieces = chunk_text(long_text)
    if not pieces:
        return 0.0
    scores = sorted((embed_sim(piece, short_text) for piece in pieces), reverse=True)
    top = scores[:5]
    return sum(top) / len(top)
def extract_keywords(text: str):
    """Return the set of noun tokens longer than 2 chars, minus stopwords.

    Uses wordpunct_tokenize, which needs no punkt/punkt_tab corpus data.
    """
    tagged = pos_tag(wordpunct_tokenize(text))
    keywords = set()
    for word, tag in tagged:
        if tag.startswith("NN") and word not in STOPWORDS and len(word) > 2:
            keywords.add(word)
    return keywords
def formatting_score(text: str) -> float:
    """Crude structural score in [0, 1].

    Awards 2 points each for the presence of the experience/skills/education
    sections, bullet-ish characters, and a 300-900 word length.
    """
    lowered = text.lower()
    points = 0
    for section in ("experience", "skills", "education"):
        if section in lowered:
            points += 2
    if "-" in text or "•" in text:
        points += 2
    word_count = len(text.split())
    if 300 <= word_count <= 900:
        points += 2
    return points / 10.0
def generic_penalty(text: str) -> float:
    """Penalize boilerplate resume phrases: 0.05 each, capped at 0.20."""
    boilerplate = (
        "hardworking", "team player", "looking for opportunity",
        "learn new things", "technology", "computer",
        "passionate", "motivated", "self learner",
    )
    penalty = sum(0.05 for phrase in boilerplate if phrase in text)
    return min(penalty, 0.20)
def verdict(score: float) -> str:
    """Map a 0-100 ATS score onto a human-readable fit label."""
    bands = (
        (35, "Poor Fit"),
        (50, "Below Average"),
        (65, "Average Fit"),
        (80, "Good Fit"),
    )
    for upper_bound, label in bands:
        if score < upper_bound:
            return label
    return "Strong Fit"
def calibrate(raw: float) -> float:
    """Map a raw weighted score onto the displayed 20-90 range.

    Raw values are clamped to [0.08, 0.70] before linear rescaling, so the
    published score never leaves [20, 90].
    """
    low, high = 0.08, 0.70
    clamped = min(max(raw, low), high)
    fraction = (clamped - low) / (high - low)
    return 20 + fraction * 70
def ats_score(resume_text: str, jd_text: str, role: str) -> Dict[str, Any]:
    """Score a resume against a job description.

    Args:
        resume_text: Raw resume text (e.g. from extract_pdf_text).
        jd_text: Raw job-description text.
        role: Role key; "researcher"/"research_engineer" get slightly
            different component weights.

    Returns:
        Dict with contact info, the calibrated 0-100 ATS score, a verdict
        label, a per-component score breakdown, and up to ten JD keywords
        missing from the resume.
    """
    resume_clean = clean(resume_text)
    jd_clean = clean(jd_text)

    # Contact info comes from the RAW text: clean() lowercases everything,
    # which would break extract_name's Title-Case heuristic.
    name = extract_name(resume_text)
    email = extract_email(resume_text)
    phone = extract_phone(resume_text)

    # Semantic similarity. The original called chunked_similarity twice with
    # identical arguments (skill_sim and exp_sim were always equal), doubling
    # the most expensive step of the pipeline; compute it once and reuse.
    similarity = chunked_similarity(resume_clean, jd_clean)
    skill_sim = similarity
    exp_sim = similarity

    # Noun-keyword overlap ratio, capped at 1.0 (max guards empty JD).
    r_kw = extract_keywords(resume_clean)
    j_kw = extract_keywords(jd_clean)
    kw_ratio = min(len(r_kw & j_kw) / max(len(j_kw), 1), 1.0)

    # Structural formatting heuristic in [0, 1].
    fmt = formatting_score(resume_clean)

    # Role-dependent component weights (each set sums to 1.0).
    if role in {"researcher", "research_engineer"}:
        w_skill, w_kw, w_exp, w_fmt = 0.45, 0.20, 0.25, 0.10
    else:
        w_skill, w_kw, w_exp, w_fmt = 0.40, 0.25, 0.25, 0.10

    raw = (
        w_skill * skill_sim +
        w_kw * kw_ratio +
        w_exp * exp_sim +
        w_fmt * fmt
    )
    raw -= generic_penalty(resume_clean)
    raw = max(raw, 0.0)
    final_score = f(calibrate(raw))

    # Nominal per-component maxima, used only for the displayed breakdown.
    MAX = {
        "skill": 40,
        "keyword": 30,
        "experience": 20,
        "formatting": 10
    }
    components = {
        "skill": skill_sim * MAX["skill"],
        "keyword": kw_ratio * MAX["keyword"],
        "experience": exp_sim * MAX["experience"],
        "formatting": fmt * MAX["formatting"]
    }
    # Rescale components so they sum to final_score; the `or 1.0` guards
    # against division by zero when every component is 0.
    component_sum = sum(components.values()) or 1.0
    scale = final_score / component_sum
    breakdown = {
        "skill_match": {
            "score": f(components["skill"] * scale),
            "max": MAX["skill"]
        },
        "keyword_match": {
            "score": f(components["keyword"] * scale),
            "max": MAX["keyword"]
        },
        "experience_match": {
            "score": f(components["experience"] * scale),
            "max": MAX["experience"]
        },
        "formatting": {
            "score": f(components["formatting"] * scale),
            "max": MAX["formatting"]
        }
    }
    return {
        "name": name,
        "email": email,
        "phone": phone,
        "ats_score": final_score,
        "verdict": verdict(final_score),
        "score_breakdown": breakdown,
        "missing_keywords": sorted(j_kw - r_kw)[:10]
    }