# ats-score-analyzer / ats_core.py
# Author: Dev1012 — "adding ats suggestion code (basic)" (commit b4b6f00)
import re
import logging
from io import BytesIO
from typing import Dict, Any
import pdfplumber
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.tokenize import wordpunct_tokenize # ✅ SAFE TOKENIZER
# Silence pdfminer's very chatty per-page warnings routed through logging.
logging.getLogger("pdfminer").setLevel(logging.ERROR)
# English stopwords used to filter keyword candidates.
# NOTE(review): requires the NLTK "stopwords" corpus to be present — confirm
# the deployment downloads it (nltk.download("stopwords")) before import.
STOPWORDS = set(stopwords.words("english"))
# Shared sentence-embedding model, loaded once at import time.
model = SentenceTransformer("all-MiniLM-L6-v2")
def f(x):
    """Round *x* to two decimal places, always returning a float."""
    rounded = round(x, 2)
    return float(rounded)
def clean(text: str) -> str:
    """Lowercase *text*, blank out everything except a-z, 0-9, whitespace
    and hyphens, then collapse whitespace runs to single spaces."""
    lowered = text.lower()
    normalized = re.sub(r"[^a-z0-9\s\-]", " ", lowered)
    return re.sub(r"\s+", " ", normalized).strip()
def extract_pdf_text(file_bytes: bytes) -> str:
    """Extract and concatenate text from every page of a PDF.

    Parameters
    ----------
    file_bytes : raw bytes of a PDF document.

    Returns the page texts joined by newlines, stripped of surrounding
    whitespace; empty string if no page yields text.
    """
    pages = []
    with pdfplumber.open(BytesIO(file_bytes)) as pdf:
        for page in pdf.pages:
            # BUG FIX: extract_text() was previously called twice per page
            # (once for the truthiness check, once for the append) — it is
            # an expensive layout-analysis pass, so call it exactly once.
            page_text = page.extract_text()
            if page_text:
                pages.append(page_text)
    # join() avoids the quadratic repeated string concatenation of the
    # original `text += ...` loop.
    return "\n".join(pages).strip()
def extract_email(text: str):
    """Return the first email address found in *text*, or None."""
    match = re.search(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)
    if match:
        return match.group(0)
    return None
def extract_phone(text: str):
    """Return the first phone number in *text*, or None.

    Tries an Indian mobile pattern first (optional +91 prefix, leading
    digit 6-9), then a generic international pattern.
    """
    indian = r"\b(\+91[-\s]?)?[6-9]\d{9}\b"
    generic = r"\b\+?\d{1,3}[-\s]?\(?\d{2,4}\)?[-\s]?\d{3,4}[-\s]?\d{4}\b"
    for pattern in (indian, generic):
        match = re.search(pattern, text)
        if match:
            return match.group(0)
    return None
def extract_name(text: str):
    """Heuristically pick a candidate name from a resume's first lines.

    Scans the first five non-empty lines for one with 2-4 words, all
    capitalized, containing none of the common non-name resume terms.
    Returns the matching line verbatim, or None.
    """
    blocked = {
        "resume", "curriculum", "vitae",
        "engineer", "developer", "analyst",
        "software", "machine", "data"
    }
    candidates = [ln.strip() for ln in text.split("\n") if ln.strip()]
    for candidate in candidates[:5]:
        parts = candidate.split()
        if not (2 <= len(parts) <= 4):
            continue
        if any(not p[0].isupper() for p in parts):
            continue
        lowered = candidate.lower()
        if all(term not in lowered for term in blocked):
            return candidate
    return None
def embed_sim(a: str, b: str) -> float:
    """Cosine similarity between the sentence embeddings of *a* and *b*."""
    vec_a, vec_b = model.encode([a, b])
    return float(cosine_similarity([vec_a], [vec_b])[0][0])
def chunk_text(text: str, size=120):
    """Split *text* into consecutive chunks of at most *size* words each."""
    words = text.split()
    chunks = []
    for start in range(0, len(words), size):
        chunks.append(" ".join(words[start:start + size]))
    return chunks
def chunked_similarity(long_text: str, short_text: str) -> float:
    """Mean of the top-5 chunk-vs-text embedding similarities.

    *long_text* is split into word chunks; each chunk is compared to
    *short_text* and the five best scores are averaged. Returns 0.0 when
    *long_text* produces no chunks.
    """
    pieces = chunk_text(long_text)
    if not pieces:
        return 0.0
    scores = sorted((embed_sim(p, short_text) for p in pieces), reverse=True)
    top = scores[:5]
    return sum(top) / len(top)
def extract_keywords(text: str):
    """Return the set of noun tokens in *text* (length > 2, non-stopword).

    Uses wordpunct_tokenize, which needs no punkt/punkt_tab download.
    """
    keywords = set()
    for word, tag in pos_tag(wordpunct_tokenize(text)):
        if tag.startswith("NN") and word not in STOPWORDS and len(word) > 2:
            keywords.add(word)
    return keywords
def formatting_score(text: str) -> float:
    """Score resume formatting on [0, 1].

    Two points each for: an "experience" section, a "skills" section, an
    "education" section, bullet characters ('-' or '•'), and a word count
    in the 300-900 range; the 10-point total is normalized to 0-1.
    """
    lowered = text.lower()
    points = 0
    for section in ("experience", "skills", "education"):
        if section in lowered:
            points += 2
    if "-" in text or "•" in text:
        points += 2
    word_count = len(text.split())
    if 300 <= word_count <= 900:
        points += 2
    return points / 10.0
def generic_penalty(text: str) -> float:
    """Penalty for generic resume filler phrases, 0.05 each, capped at 0.20."""
    filler = (
        "hardworking", "team player", "looking for opportunity",
        "learn new things", "technology", "computer",
        "passionate", "motivated", "self learner"
    )
    # Accumulate additively (not 0.05 * count) to keep float results
    # bit-identical to the original summation.
    penalty = 0
    for phrase in filler:
        if phrase in text:
            penalty += 0.05
    return min(penalty, 0.20)
def verdict(score: float) -> str:
    """Map a 0-100 ATS score to a human-readable fit label."""
    bands = (
        (35, "Poor Fit"),
        (50, "Below Average"),
        (65, "Average Fit"),
        (80, "Good Fit"),
    )
    for ceiling, label in bands:
        if score < ceiling:
            return label
    return "Strong Fit"
def calibrate(raw: float) -> float:
    """Clamp *raw* to [0.08, 0.70] and rescale it linearly onto [20, 90]."""
    floor, ceiling = 0.08, 0.70
    clamped = min(max(raw, floor), ceiling)
    fraction = (clamped - floor) / (ceiling - floor)
    return 20 + fraction * 70
def ats_score(resume_text: str, jd_text: str, role: str) -> Dict[str, Any]:
    """Compute an ATS-style match report for a resume against a job description.

    Parameters
    ----------
    resume_text : raw resume text (e.g. from extract_pdf_text).
    jd_text     : raw job-description text.
    role        : target role key; "researcher" / "research_engineer" shift
                  weight toward semantic skill similarity.

    Returns a dict with contact info, a calibrated score in [20, 90], a
    verdict label, a per-component breakdown (rescaled to sum to the final
    score), and up to 10 JD keywords missing from the resume.
    """
    resume_clean = clean(resume_text)
    jd_clean = clean(jd_text)

    # Contact info comes from the *raw* text: cleaning lowercases and strips
    # '@', which would break name/email extraction.
    name = extract_name(resume_text)
    email = extract_email(resume_text)
    phone = extract_phone(resume_text)

    # BUG FIX: chunked_similarity was previously called twice with identical
    # arguments for skill_sim and exp_sim, doubling the embedding cost for
    # the exact same number. Compute once and reuse.
    # NOTE(review): both components use the same resume-vs-JD similarity —
    # there is no separate experience signal yet; flagging for a follow-up.
    sim = chunked_similarity(resume_clean, jd_clean)
    skill_sim = sim
    exp_sim = sim

    # Keyword overlap ratio: fraction of JD noun keywords present in resume.
    r_kw = extract_keywords(resume_clean)
    j_kw = extract_keywords(jd_clean)
    kw_ratio = min(len(r_kw & j_kw) / max(len(j_kw), 1), 1.0)

    # Formatting heuristics (sections, bullets, length).
    fmt = formatting_score(resume_clean)

    # Role-dependent weights; research roles weight semantic similarity more.
    if role in {"researcher", "research_engineer"}:
        w_skill, w_kw, w_exp, w_fmt = 0.45, 0.20, 0.25, 0.10
    else:
        w_skill, w_kw, w_exp, w_fmt = 0.40, 0.25, 0.25, 0.10

    raw = (
        w_skill * skill_sim +
        w_kw * kw_ratio +
        w_exp * exp_sim +
        w_fmt * fmt
    )
    raw -= generic_penalty(resume_clean)
    raw = max(raw, 0.0)
    final_score = f(calibrate(raw))

    # Component maxima used only for the display breakdown.
    MAX = {
        "skill": 40,
        "keyword": 30,
        "experience": 20,
        "formatting": 10
    }
    components = {
        "skill": skill_sim * MAX["skill"],
        "keyword": kw_ratio * MAX["keyword"],
        "experience": exp_sim * MAX["experience"],
        "formatting": fmt * MAX["formatting"]
    }
    # Rescale the components so they sum to the calibrated final score
    # (guard against division by zero when every component is 0).
    component_sum = sum(components.values()) or 1.0
    scale = final_score / component_sum
    breakdown = {
        "skill_match": {
            "score": f(components["skill"] * scale),
            "max": MAX["skill"]
        },
        "keyword_match": {
            "score": f(components["keyword"] * scale),
            "max": MAX["keyword"]
        },
        "experience_match": {
            "score": f(components["experience"] * scale),
            "max": MAX["experience"]
        },
        "formatting": {
            "score": f(components["formatting"] * scale),
            "max": MAX["formatting"]
        }
    }
    return {
        "name": name,
        "email": email,
        "phone": phone,
        "ats_score": final_score,
        "verdict": verdict(final_score),
        "score_breakdown": breakdown,
        "missing_keywords": sorted(j_kw - r_kw)[:10]
    }