Spaces:

chirag1121
/

Resume_Screening_Model

Sleeping

App Files Files Community

Resume_Screening_Model / utils /nlp_utils.py

chirag1121

Create nlp_utils.py

54b9947 verified about 1 month ago

raw

history blame contribute delete

11.1 kB

	"""
	nlp_utils.py — NLP utilities for resume analysis.

	Responsibilities:
	- Named Entity Recognition (NER) using spaCy
	- Section detection (Skills, Education, Experience, Projects)
	- Skill keyword extraction from a predefined skill list
	- Resume classification heuristic (Good / Average / Poor)
	"""

	import re
	import spacy

	# ---------------------------------------------------------------------------
	# spaCy model — loaded once at import time
	# ---------------------------------------------------------------------------
	try:
	    # Normal path: the small English pipeline is already installed.
	    nlp = spacy.load("en_core_web_sm")
	except OSError:
	    # spacy.load raises OSError when the model package cannot be found.
	    # Fall back to downloading it with the current interpreter, then retry.
	    import subprocess
	    import sys

	    subprocess.run(
	        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
	        check=True,  # surface a failed download instead of continuing
	    )
	    nlp = spacy.load("en_core_web_sm")


	# ---------------------------------------------------------------------------
	# Predefined skill taxonomy
	# ---------------------------------------------------------------------------
	# Lower-case technical skill keywords, grouped by area for readability.
	# Matching code lower-cases the resume text, so every entry must be lower-case.
	TECHNICAL_SKILLS = {
	    # Programming languages
	    "python", "java", "javascript", "typescript", "c++", "c#", "c", "go",
	    "rust", "kotlin", "swift", "ruby", "php", "scala", "r", "matlab",
	    "bash", "shell", "perl", "lua",
	    # Web / frontend
	    "html", "css", "react", "angular", "vue", "next.js", "nuxt.js",
	    "svelte", "tailwind", "bootstrap", "jquery", "webpack", "vite",
	    # Backend / frameworks
	    "node.js", "django", "flask", "fastapi", "spring", "express",
	    "rails", "laravel", "asp.net",
	    # Databases
	    "sql", "mysql", "postgresql", "mongodb", "redis", "elasticsearch",
	    "sqlite", "oracle", "cassandra", "dynamodb", "firebase",
	    # Cloud & DevOps
	    "aws", "azure", "gcp", "docker", "kubernetes", "terraform",
	    "ansible", "jenkins", "github actions", "ci/cd", "linux",
	    "nginx", "apache",
	    # ML / AI
	    "machine learning", "deep learning", "nlp", "computer vision",
	    "tensorflow", "pytorch", "keras", "scikit-learn", "pandas",
	    "numpy", "matplotlib", "seaborn", "hugging face", "transformers",
	    "langchain", "openai", "llm",
	    # Data
	    "data analysis", "data science", "power bi", "tableau", "excel",
	    "spark", "hadoop", "airflow", "dbt", "snowflake", "bigquery",
	    # Version control & tools
	    "git", "github", "gitlab", "bitbucket", "jira", "confluence",
	    "postman", "swagger",
	    # Other ("linux" is already listed under Cloud & DevOps)
	    "rest api", "graphql", "microservices", "agile", "scrum",
	    "unit testing", "pytest", "jest", "selenium",
	}

	# Lower-case soft-skill keywords.
	SOFT_SKILLS = {
	    "leadership", "communication", "teamwork", "problem solving",
	    "critical thinking", "time management", "adaptability",
	    "collaboration", "creativity", "project management",
	}

	# Union of both taxonomies.
	ALL_SKILLS = TECHNICAL_SKILLS | SOFT_SKILLS


	# ---------------------------------------------------------------------------
	# Section header keywords
	# ---------------------------------------------------------------------------
	# Maps a canonical section name to the lower-case phrases that signal it.
	SECTION_KEYWORDS = {
	    "skills": [
	        "skills",
	        "technical skills",
	        "core competencies",
	        "technologies",
	        "tools",
	        "expertise",
	        "proficiencies",
	    ],
	    "education": [
	        "education",
	        "academic background",
	        "qualification",
	        "degree",
	        "university",
	        "college",
	        "school",
	    ],
	    "experience": [
	        "experience",
	        "work experience",
	        "employment history",
	        "professional experience",
	        "work history",
	        "career",
	        "internship",
	        "internships",
	    ],
	    "projects": [
	        "projects",
	        "personal projects",
	        "side projects",
	        "academic projects",
	        "portfolio",
	    ],
	    "summary": [
	        "summary",
	        "objective",
	        "profile",
	        "about me",
	        "professional summary",
	        "career objective",
	    ],
	    "certifications": [
	        "certifications",
	        "certificates",
	        "licenses",
	        "awards",
	    ],
	    "contact": [
	        "contact",
	        "contact information",
	        "personal details",
	    ],
	}


	# ---------------------------------------------------------------------------
	# Public API
	# ---------------------------------------------------------------------------

	def extract_entities(text: str) -> dict:
	    """
	    Run spaCy NER over the resume and collect the entities of interest.

	    Only the first 5000 characters are passed to spaCy (for speed); the
	    name fallback in _guess_name_from_first_line receives the full text.

	    Args:
	        text: raw resume text. Empty or None yields an "empty" result.

	    Returns:
	        dict with keys:
	          - "name": first PERSON entity, otherwise the line-heuristic
	            guess, otherwise None.
	          - "organizations": unique ORG entities, in order of appearance.
	          - "locations": unique GPE / LOC entities, in order of appearance.
	          - "dates": up to 10 unique DATE entities, in order of appearance.
	    """
	    if not text:
	        return {"name": None, "organizations": [], "locations": [], "dates": []}

	    doc = nlp(text[:5000])  # limit to first 5000 chars for speed

	    persons = []
	    orgs = []
	    locations = []
	    dates = []

	    for ent in doc.ents:
	        value = ent.text.strip()
	        if ent.label_ == "PERSON":
	            persons.append(value)
	        elif ent.label_ == "ORG":
	            orgs.append(value)
	        elif ent.label_ in ("GPE", "LOC"):
	            locations.append(value)
	        elif ent.label_ == "DATE":
	            dates.append(value)

	    # Best guess for the name: first PERSON entity, else a name-like line.
	    name = persons[0] if persons else _guess_name_from_first_line(text)

	    # dict.fromkeys de-duplicates while preserving first-seen order.
	    # Dates are de-duplicated BEFORE truncating so repeated dates do not
	    # use up slots among the 10 that are returned.
	    return {
	        "name": name,
	        "organizations": list(dict.fromkeys(orgs)),
	        "locations": list(dict.fromkeys(locations)),
	        "dates": list(dict.fromkeys(dates))[:10],
	    }


	def _guess_name_from_first_line(text: str) -> str \| None:
	"""Heuristic: the first short line often contains the candidate's name."""
	for line in text.splitlines():
	line = line.strip()
	if 2 <= len(line.split()) <= 5 and line.replace(" ", "").isalpha():
	return line
	return None


	def detect_sections(text: str) -> dict:
	    """
	    Detect which resume sections are present.

	    A section counts as present when any of its SECTION_KEYWORDS phrases
	    occurs anywhere in the lower-cased text. This is a plain substring
	    test, so e.g. "qualification" also matches "qualifications".

	    Args:
	        text: raw resume text. Empty or None marks every section absent,
	            matching how extract_entities treats missing text.

	    Returns:
	        dict mapping section name → True/False
	    """
	    text_lower = text.lower() if text else ""
	    return {
	        section: any(kw in text_lower for kw in keywords)
	        for section, keywords in SECTION_KEYWORDS.items()
	    }


	def extract_skills(text: str) -> dict:
	    """
	    Extract known skills from resume text.

	    Matching is case-insensitive and token-based: a skill is only counted
	    when it is not glued to a neighbouring word character.

	    Args:
	        text: raw resume text. Empty or None yields empty lists.

	    Returns:
	        dict with keys 'technical', 'soft' and 'all' — each a sorted list
	        of the skills found ('all' is the two lists combined).
	    """
	    if not text:
	        return {"technical": [], "soft": [], "all": []}

	    text_lower = text.lower()

	    def _is_mentioned(skill: str) -> bool:
	        # A plain r"\b" on both sides is wrong for skills ending in a
	        # non-word character: r"\b" needs a word character on one side, so
	        # "c++" / "c#" followed by a space, punctuation or end-of-text
	        # never match, while "c" matches inside both of them.
	        # Lookarounds express the intent directly: no word character right
	        # before, and no word character, "++" or "#" right after.
	        pattern = r"(?<!\w)" + re.escape(skill) + r"(?!\w|\+\+|#)"
	        return re.search(pattern, text_lower) is not None

	    found_technical = sorted(s for s in TECHNICAL_SKILLS if _is_mentioned(s))
	    found_soft = sorted(s for s in SOFT_SKILLS if _is_mentioned(s))

	    return {
	        "technical": found_technical,
	        "soft": found_soft,
	        "all": sorted(found_technical + found_soft),
	    }


	def get_missing_sections(sections: dict) -> list:
	    """
	    List the important resume sections that were not detected.

	    Args:
	        sections: result of detect_sections(); a key that is absent is
	            treated the same as a False value.

	    Returns:
	        Human-readable names of the missing sections, in a fixed order
	        (skills, education, experience, projects, summary).
	    """
	    # (detect_sections key, label shown to the user) — order is the output order.
	    required_sections = (
	        ("skills", "Skills section"),
	        ("education", "Education section"),
	        ("experience", "Work Experience section"),
	        ("projects", "Projects section"),
	        ("summary", "Professional Summary / Objective"),
	    )
	    return [
	        display_name
	        for section_key, display_name in required_sections
	        if not sections.get(section_key, False)
	    ]


	def classify_resume(score: float) -> dict:
	    """
	    Map an ATS score to a quality band.

	    Args:
	        score: ATS score (0–100).

	    Returns:
	        dict with 'label' (Good/Average/Poor, with an emoji) and 'color'
	        (green/orange/red) for UI display.
	    """
	    # (inclusive lower bound, label, color), checked from highest to lowest.
	    bands = (
	        (70, "Good ✅", "green"),
	        (45, "Average ⚠️", "orange"),
	    )
	    for lower_bound, label, color in bands:
	        if score >= lower_bound:
	            return {"label": label, "color": color}
	    # Anything below the lowest bound.
	    return {"label": "Poor ❌", "color": "red"}


	def generate_suggestions(
	    sections: dict,
	    skills: dict,
	    score: float,
	    job_match: float,
	) -> list:
	    """
	    Rule-based suggestions engine.

	    Turns the detected structure, extracted skills and scores into a list
	    of improvement tips. Tips are emitted in a fixed order: missing
	    sections, skills, score, job match, then two generic formatting tips
	    that are always included.

	    Args:
	        sections  : result of detect_sections()
	        skills    : result of extract_skills()
	        score     : resume base score (0–100)
	        job_match : job description match % (0–100)

	    Returns:
	        List of suggestion strings.
	    """
	    # (section key, tip shown when that section is missing) — output order.
	    section_tips = (
	        (
	            "summary",
	            "📝 Add a Professional Summary at the top of your resume "
	            "(2–3 lines highlighting your key strengths and career goal).",
	        ),
	        (
	            "skills",
	            "🛠️ Add a dedicated Skills section listing your technical "
	            "and soft skills clearly.",
	        ),
	        (
	            "experience",
	            "💼 Add a Work Experience section with job titles, company names, "
	            "dates, and bullet-point achievements.",
	        ),
	        (
	            "projects",
	            "🚀 Include a Projects section. Showcase 2–3 projects with a brief "
	            "description, technologies used, and impact or outcome.",
	        ),
	        (
	            "certifications",
	            "🏆 Consider adding Certifications or Awards if you have any relevant ones.",
	        ),
	    )
	    tips = [tip for key, tip in section_tips if not sections.get(key)]

	    # Skill-based tips
	    technical_found = len(skills.get("technical", []))
	    if technical_found < 5:
	        tips.append(
	            f"⚙️ Only {technical_found} technical skill(s) found. "
	            "Add more relevant technical skills (aim for 8–15)."
	        )
	    if not skills.get("soft"):
	        tips.append(
	            "🤝 Mention soft skills such as 'Leadership', 'Teamwork', or "
	            "'Communication' — many ATS systems look for these."
	        )

	    # Score-based tip
	    if score < 60:
	        tips.append(
	            "📏 Your resume may be too short. ATS systems reward detailed resumes. "
	            "Aim for at least 400–600 words."
	        )

	    # Job-match tip: below 50 is "low", 50 up to (not including) 70 is "moderate".
	    if job_match < 70:
	        if job_match < 50:
	            tips.append(
	                "🎯 Low job description match. Tailor your resume keywords to match "
	                "the exact terms in the job posting."
	            )
	        else:
	            tips.append(
	                "🎯 Moderate job match. Review the job description and ensure your "
	                "skills and experience directly address its requirements."
	            )

	    # Generic formatting tips, always included.
	    tips.extend(
	        [
	            "📐 Use clean formatting: clear headings, consistent font, and bullet points. "
	            "Avoid tables or graphics — they confuse most ATS parsers.",
	            "📊 Quantify your achievements where possible "
	            "(e.g., 'Reduced load time by 40%', 'Led a team of 5 engineers').",
	        ]
	    )

	    return tips