Resume_Screening_Model / utils /nlp_utils.py
chirag1121's picture
Create nlp_utils.py
54b9947 verified
"""
nlp_utils.py — NLP utilities for resume analysis.
Responsibilities:
- Named Entity Recognition (NER) using spaCy
- Section detection (Skills, Education, Experience, Projects)
- Skill keyword extraction from a predefined skill list
- Resume classification heuristic (Good / Average / Poor)
"""
import re
import spacy
# ---------------------------------------------------------------------------
# spaCy model — loaded once at import time
# ---------------------------------------------------------------------------
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # The load failed with OSError, which this module treats as "model not
    # installed": fetch it with the interpreter that is currently running,
    # then retry the load exactly once. check=True makes a failed download
    # raise instead of silently falling through to a second failed load.
    import subprocess
    import sys

    download_cmd = [sys.executable, "-m", "spacy", "download", "en_core_web_sm"]
    subprocess.run(download_cmd, check=True)
    nlp = spacy.load("en_core_web_sm")
# ---------------------------------------------------------------------------
# Predefined skill taxonomy
# ---------------------------------------------------------------------------
# Lower-case technical skill names, grouped by area. Every entry is matched
# against lower-cased resume text, so entries must stay lower-case.
TECHNICAL_SKILLS = {
    # Programming languages
    "python", "java", "javascript", "typescript", "c++", "c#", "c", "go",
    "rust", "kotlin", "swift", "ruby", "php", "scala", "r", "matlab",
    "bash", "shell", "perl", "lua",
    # Web / frontend
    "html", "css", "react", "angular", "vue", "next.js", "nuxt.js",
    "svelte", "tailwind", "bootstrap", "jquery", "webpack", "vite",
    # Backend / frameworks
    "node.js", "django", "flask", "fastapi", "spring", "express",
    "rails", "laravel", "asp.net",
    # Databases
    "sql", "mysql", "postgresql", "mongodb", "redis", "elasticsearch",
    "sqlite", "oracle", "cassandra", "dynamodb", "firebase",
    # Cloud & DevOps
    "aws", "azure", "gcp", "docker", "kubernetes", "terraform",
    "ansible", "jenkins", "github actions", "ci/cd", "linux",
    "nginx", "apache",
    # ML / AI
    "machine learning", "deep learning", "nlp", "computer vision",
    "tensorflow", "pytorch", "keras", "scikit-learn", "pandas",
    "numpy", "matplotlib", "seaborn", "hugging face", "transformers",
    "langchain", "openai", "llm",
    # Data
    "data analysis", "data science", "power bi", "tableau", "excel",
    "spark", "hadoop", "airflow", "dbt", "snowflake", "bigquery",
    # Version control & tools
    "git", "github", "gitlab", "bitbucket", "jira", "confluence",
    "postman", "swagger",
    # Other ("linux" already appears under Cloud & DevOps; not repeated here)
    "rest api", "graphql", "microservices", "agile", "scrum",
    "unit testing", "pytest", "jest", "selenium",
}

# Lower-case soft-skill phrases, matched the same way as technical skills.
SOFT_SKILLS = {
    "leadership", "communication", "teamwork", "problem solving",
    "critical thinking", "time management", "adaptability",
    "collaboration", "creativity", "project management",
}

# Union of both taxonomies.
ALL_SKILLS = TECHNICAL_SKILLS | SOFT_SKILLS
# ---------------------------------------------------------------------------
# Section header keywords
# ---------------------------------------------------------------------------
# Maps a canonical section name to the lower-case phrases whose presence in
# the resume text is taken as evidence that the section exists.
SECTION_KEYWORDS = {
    "skills": [
        "skills",
        "technical skills",
        "core competencies",
        "technologies",
        "tools",
        "expertise",
        "proficiencies",
    ],
    "education": [
        "education",
        "academic background",
        "qualification",
        "degree",
        "university",
        "college",
        "school",
    ],
    "experience": [
        "experience",
        "work experience",
        "employment history",
        "professional experience",
        "work history",
        "career",
        "internship",
        "internships",
    ],
    "projects": [
        "projects",
        "personal projects",
        "side projects",
        "academic projects",
        "portfolio",
    ],
    "summary": [
        "summary",
        "objective",
        "profile",
        "about me",
        "professional summary",
        "career objective",
    ],
    "certifications": [
        "certifications",
        "certificates",
        "licenses",
        "awards",
    ],
    "contact": [
        "contact",
        "contact information",
        "personal details",
    ],
}
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def extract_entities(text: str) -> dict:
    """
    Run spaCy NER over the resume text and group the entities of interest.

    Only the first 5000 characters are passed to the spaCy pipeline (for
    speed); the name fallback heuristic looks at the full text.

    Args:
        text: raw resume text. Empty or None yields an "empty" result.

    Returns:
        dict with keys:
            "name"          - first PERSON entity, else the first-line
                              heuristic result, else None
            "organizations" - unique ORG entities, in order of appearance
            "locations"     - unique GPE/LOC entities, in order of appearance
            "dates"         - up to 10 unique DATE entities, in order of
                              appearance
    """
    if not text:
        return {"name": None, "organizations": [], "locations": [], "dates": []}

    doc = nlp(text[:5000])  # limit to first 5000 chars for speed

    # Route each entity label to the bucket it contributes to.
    label_to_bucket = {
        "PERSON": "persons",
        "ORG": "organizations",
        "GPE": "locations",
        "LOC": "locations",
        "DATE": "dates",
    }
    found = {"persons": [], "organizations": [], "locations": [], "dates": []}
    for ent in doc.ents:
        bucket = label_to_bucket.get(ent.label_)
        if bucket is not None:
            found[bucket].append(ent.text.strip())

    persons = found["persons"]
    name = persons[0] if persons else _guess_name_from_first_line(text)

    # dict.fromkeys() de-duplicates while preserving first-seen order.
    # Dates are de-duplicated BEFORE truncating so that repeated mentions do
    # not eat into the 10-entry budget.
    return {
        "name": name,
        "organizations": list(dict.fromkeys(found["organizations"])),
        "locations": list(dict.fromkeys(found["locations"])),
        "dates": list(dict.fromkeys(found["dates"]))[:10],
    }
def _guess_name_from_first_line(text: str) -> str | None:
"""Heuristic: the first short line often contains the candidate's name."""
for line in text.splitlines():
line = line.strip()
if 2 <= len(line.split()) <= 5 and line.replace(" ", "").isalpha():
return line
return None
def detect_sections(text: str) -> dict:
"""
Detect which resume sections are present.
Returns:
dict mapping section name β†’ True/False
"""
text_lower = text.lower()
detected = {}
for section, keywords in SECTION_KEYWORDS.items():
detected[section] = any(kw in text_lower for kw in keywords)
return detected
def extract_skills(text: str) -> dict:
"""
Extract skills from resume text.
Returns:
dict with keys 'technical' and 'soft' β€” each a sorted list of found skills.
"""
text_lower = text.lower()
found_technical = []
found_soft = []
for skill in TECHNICAL_SKILLS:
# Use word-boundary matching to avoid partial matches
pattern = r"\b" + re.escape(skill) + r"\b"
if re.search(pattern, text_lower):
found_technical.append(skill)
for skill in SOFT_SKILLS:
pattern = r"\b" + re.escape(skill) + r"\b"
if re.search(pattern, text_lower):
found_soft.append(skill)
return {
"technical": sorted(found_technical),
"soft": sorted(found_soft),
"all": sorted(found_technical + found_soft),
}
def get_missing_sections(sections: dict) -> list:
    """
    List the important resume sections that were not detected.

    Args:
        sections: result of detect_sections() (section name -> bool).

    Returns:
        Human-readable labels of the missing sections, in a fixed order:
        skills, education, experience, projects, summary. A key that is
        absent from `sections` counts as missing.
    """
    labelled_sections = (
        ("skills", "Skills section"),
        ("education", "Education section"),
        ("experience", "Work Experience section"),
        ("projects", "Projects section"),
        ("summary", "Professional Summary / Objective"),
    )
    return [
        label
        for key, label in labelled_sections
        if not sections.get(key, False)
    ]
def classify_resume(score: float) -> dict:
    """
    Classify a resume based on its ATS score.

    Args:
        score: ATS score (0-100).

    Returns:
        dict with 'label' (Good / Average / Poor, each followed by an emoji)
        and 'color' (green / orange / red) for UI display.
        Thresholds: >= 70 is Good, >= 45 is Average, anything lower is Poor.
    """
    # The "Good" label previously carried a mis-encoded check-mark emoji
    # ("βœ…"); it is restored to "✅" so all three labels render the same way.
    if score >= 70:
        return {"label": "Good ✅", "color": "green"}
    if score >= 45:
        return {"label": "Average ⚠️", "color": "orange"}
    return {"label": "Poor ❌", "color": "red"}
def generate_suggestions(
    sections: dict,
    skills: dict,
    score: float,
    job_match: float,
) -> list:
    """
    Rule-based suggestions engine.

    Looks at which sections were detected, how many skills were found, and
    the two scores, and returns improvement tips. Two general tips
    (formatting, quantified achievements) are always appended.

    Args:
        sections  : result of detect_sections() (section name -> bool)
        skills    : result of extract_skills() ('technical' / 'soft' lists)
        score     : resume base score (0-100)
        job_match : job description match % (0-100)

    Returns:
        List of suggestion strings, in the order: missing sections, skills,
        score, job match, general tips.
    """
    # NOTE(review): the emoji and dashes in these user-facing strings were
    # mis-encoded and have been restored. The leading glyphs of the summary
    # (📝), length (📏) and formatting (📐) tips and the handshake (🤝) were
    # ambiguous in the garbled text and are a best guess -- confirm against
    # the intended UI copy.
    suggestions = []

    # Section-based suggestions: (section key, tip shown when it is missing).
    section_tips = (
        (
            "summary",
            "📝 Add a Professional Summary at the top of your resume "
            "(2–3 lines highlighting your key strengths and career goal).",
        ),
        (
            "skills",
            "🛠️ Add a dedicated Skills section listing your technical "
            "and soft skills clearly.",
        ),
        (
            "experience",
            "💼 Add a Work Experience section with job titles, company names, "
            "dates, and bullet-point achievements.",
        ),
        (
            "projects",
            "🚀 Include a Projects section. Showcase 2–3 projects with a brief "
            "description, technologies used, and impact or outcome.",
        ),
        (
            "certifications",
            "🏆 Consider adding Certifications or Awards if you have any relevant ones.",
        ),
    )
    suggestions.extend(tip for key, tip in section_tips if not sections.get(key))

    # Skill-based suggestions
    tech_count = len(skills.get("technical", []))
    if tech_count < 5:
        suggestions.append(
            f"⚙️ Only {tech_count} technical skill(s) found. "
            "Add more relevant technical skills (aim for 8–15)."
        )
    if not skills.get("soft"):
        suggestions.append(
            "🤝 Mention soft skills such as 'Leadership', 'Teamwork', or "
            "'Communication' — many ATS systems look for these."
        )

    # Score-based suggestions
    if score < 60:
        suggestions.append(
            "📏 Your resume may be too short. ATS systems reward detailed resumes. "
            "Aim for at least 400–600 words."
        )

    # Job-match suggestions: below 50 is "low", 50 up to (not including) 70
    # is "moderate", 70 and above produces no tip.
    if job_match < 50:
        suggestions.append(
            "🎯 Low job description match. Tailor your resume keywords to match "
            "the exact terms in the job posting."
        )
    elif job_match < 70:
        suggestions.append(
            "🎯 Moderate job match. Review the job description and ensure your "
            "skills and experience directly address its requirements."
        )

    # General tips, always included.
    suggestions.append(
        "📐 Use clean formatting: clear headings, consistent font, and bullet points. "
        "Avoid tables or graphics — they confuse most ATS parsers."
    )
    suggestions.append(
        "📊 Quantify your achievements where possible "
        "(e.g., 'Reduced load time by 40%', 'Led a team of 5 engineers')."
    )
    return suggestions