"""
nlp_utils.py — NLP utilities for resume analysis.
Responsibilities:
- Named Entity Recognition (NER) using spaCy
- Section detection (Skills, Education, Experience, Projects)
- Skill keyword extraction from a predefined skill list
- Resume classification heuristic (Good / Average / Poor)
"""
import re
import spacy
# ---------------------------------------------------------------------------
# spaCy model — loaded once at import time
# ---------------------------------------------------------------------------
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # The model package is not installed: fetch it with the interpreter that
    # is running this module, then retry the load. check=True lets a failed
    # download surface as CalledProcessError instead of a second OSError.
    import subprocess
    import sys

    download_command = [sys.executable, "-m", "spacy", "download", "en_core_web_sm"]
    subprocess.run(download_command, check=True)
    nlp = spacy.load("en_core_web_sm")
# ---------------------------------------------------------------------------
# Predefined skill taxonomy
# ---------------------------------------------------------------------------
# Lower-case skill vocabulary matched against lower-cased resume text.
# Every entry must be unique; multi-word entries are matched as phrases.
TECHNICAL_SKILLS = {
    # Programming languages
    "python", "java", "javascript", "typescript", "c++", "c#", "c", "go",
    "rust", "kotlin", "swift", "ruby", "php", "scala", "r", "matlab",
    "bash", "shell", "perl", "lua",
    # Web / frontend
    "html", "css", "react", "angular", "vue", "next.js", "nuxt.js",
    "svelte", "tailwind", "bootstrap", "jquery", "webpack", "vite",
    # Backend / frameworks
    "node.js", "django", "flask", "fastapi", "spring", "express",
    "rails", "laravel", "asp.net",
    # Databases
    "sql", "mysql", "postgresql", "mongodb", "redis", "elasticsearch",
    "sqlite", "oracle", "cassandra", "dynamodb", "firebase",
    # Cloud & DevOps
    "aws", "azure", "gcp", "docker", "kubernetes", "terraform",
    "ansible", "jenkins", "github actions", "ci/cd", "linux",
    "nginx", "apache",
    # ML / AI
    "machine learning", "deep learning", "nlp", "computer vision",
    "tensorflow", "pytorch", "keras", "scikit-learn", "pandas",
    "numpy", "matplotlib", "seaborn", "hugging face", "transformers",
    "langchain", "openai", "llm",
    # Data
    "data analysis", "data science", "power bi", "tableau", "excel",
    "spark", "hadoop", "airflow", "dbt", "snowflake", "bigquery",
    # Version control & tools
    "git", "github", "gitlab", "bitbucket", "jira", "confluence",
    "postman", "swagger",
    # Other ("linux" is already listed under Cloud & DevOps)
    "rest api", "graphql", "microservices", "agile", "scrum",
    "unit testing", "pytest", "jest", "selenium",
}

# Interpersonal skills, kept separate so callers can report them on their own.
SOFT_SKILLS = {
    "leadership", "communication", "teamwork", "problem solving",
    "critical thinking", "time management", "adaptability",
    "collaboration", "creativity", "project management",
}

# Union of both vocabularies.
ALL_SKILLS = TECHNICAL_SKILLS | SOFT_SKILLS
# ---------------------------------------------------------------------------
# Section header keywords
# ---------------------------------------------------------------------------
# Maps a canonical section name to the lower-case phrases that signal it.
# Dict order and list order are preserved from the original definition.
SECTION_KEYWORDS = {
    "skills": [
        "skills",
        "technical skills",
        "core competencies",
        "technologies",
        "tools",
        "expertise",
        "proficiencies",
    ],
    "education": [
        "education",
        "academic background",
        "qualification",
        "degree",
        "university",
        "college",
        "school",
    ],
    "experience": [
        "experience",
        "work experience",
        "employment history",
        "professional experience",
        "work history",
        "career",
        "internship",
        "internships",
    ],
    "projects": [
        "projects",
        "personal projects",
        "side projects",
        "academic projects",
        "portfolio",
    ],
    "summary": [
        "summary",
        "objective",
        "profile",
        "about me",
        "professional summary",
        "career objective",
    ],
    "certifications": ["certifications", "certificates", "licenses", "awards"],
    "contact": ["contact", "contact information", "personal details"],
}
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def extract_entities(text: str) -> dict:
    """
    Run spaCy NER over the resume and collect the entities of interest.

    Only the first 5000 characters are passed to spaCy, to keep the call fast.

    Args:
        text: raw resume text; may be empty or None.

    Returns:
        dict with keys:
            'name'          - first PERSON entity, else the first-line
                              heuristic, else None
            'organizations' - unique ORG entities in order of appearance
            'locations'     - unique GPE / LOC entities in order of appearance
            'dates'         - up to 10 unique DATE entities in order of
                              appearance
    """
    if not text:
        return {"name": None, "organizations": [], "locations": [], "dates": []}

    doc = nlp(text[:5000])  # limit to first 5000 chars for speed

    persons: list[str] = []
    orgs: list[str] = []
    locations: list[str] = []
    dates: list[str] = []
    # GPE and LOC both land in `locations`; every other label is ignored.
    buckets = {
        "PERSON": persons,
        "ORG": orgs,
        "GPE": locations,
        "LOC": locations,
        "DATE": dates,
    }
    for ent in doc.ents:
        bucket = buckets.get(ent.label_)
        if bucket is not None:
            bucket.append(ent.text.strip())

    # The fallback heuristic scans the full text, not just the NER window.
    name = persons[0] if persons else _guess_name_from_first_line(text)

    return {
        "name": name,
        "organizations": list(dict.fromkeys(orgs)),  # dedupe, keep order
        "locations": list(dict.fromkeys(locations)),
        # Dedupe BEFORE truncating so repeated dates cannot crowd out
        # distinct ones further down the resume.
        "dates": list(dict.fromkeys(dates))[:10],
    }
def _guess_name_from_first_line(text: str) -> str | None:
"""Heuristic: the first short line often contains the candidate's name."""
for line in text.splitlines():
line = line.strip()
if 2 <= len(line.split()) <= 5 and line.replace(" ", "").isalpha():
return line
return None
def detect_sections(text: str) -> dict:
"""
Detect which resume sections are present.
Returns:
dict mapping section name β True/False
"""
text_lower = text.lower()
detected = {}
for section, keywords in SECTION_KEYWORDS.items():
detected[section] = any(kw in text_lower for kw in keywords)
return detected
def extract_skills(text: str) -> dict:
"""
Extract skills from resume text.
Returns:
dict with keys 'technical' and 'soft' β each a sorted list of found skills.
"""
text_lower = text.lower()
found_technical = []
found_soft = []
for skill in TECHNICAL_SKILLS:
# Use word-boundary matching to avoid partial matches
pattern = r"\b" + re.escape(skill) + r"\b"
if re.search(pattern, text_lower):
found_technical.append(skill)
for skill in SOFT_SKILLS:
pattern = r"\b" + re.escape(skill) + r"\b"
if re.search(pattern, text_lower):
found_soft.append(skill)
return {
"technical": sorted(found_technical),
"soft": sorted(found_soft),
"all": sorted(found_technical + found_soft),
}
def get_missing_sections(sections: dict) -> list:
    """
    List the important resume sections that were not detected.

    Args:
        sections: result of detect_sections(); a key that is absent or
            falsy counts as missing.

    Returns:
        Human-readable labels of the missing sections, in a fixed order
        (skills, education, experience, projects, summary).
    """
    required_labels = (
        ("skills", "Skills section"),
        ("education", "Education section"),
        ("experience", "Work Experience section"),
        ("projects", "Projects section"),
        ("summary", "Professional Summary / Objective"),
    )
    return [
        label
        for section_key, label in required_labels
        if not sections.get(section_key, False)
    ]
def classify_resume(score: float) -> dict:
    """
    Classify a resume based on its ATS score.

    Args:
        score: ATS score (0–100).

    Returns:
        dict with 'label' (Good / Average / Poor, each followed by a status
        emoji) and 'color' (green / orange / red) for UI display.
        Thresholds: >= 70 is Good, >= 45 is Average, anything lower is Poor.
    """
    if score >= 70:
        return {"label": "Good ✅", "color": "green"}
    if score >= 45:
        return {"label": "Average ⚠️", "color": "orange"}
    return {"label": "Poor ❌", "color": "red"}
def generate_suggestions(
    sections: dict,
    skills: dict,
    score: float,
    job_match: float,
) -> list:
    """
    Rule-based suggestions engine.

    Analyzes resume structure and scores to generate actionable improvement
    tips. Tips are emitted in a fixed order: missing sections, skills,
    score, job match, then two general formatting tips that are always
    included.

    Args:
        sections  : result of detect_sections(); None is treated as empty
        skills    : result of extract_skills(); None is treated as empty
        score     : resume base score (0–100)
        job_match : job description match % (0–100)

    Returns:
        List of suggestion strings.
    """
    sections = sections or {}
    skills = skills or {}
    suggestions = []

    # Section-based suggestions, in display order. Education is included so
    # every section get_missing_sections() reports also has a tip.
    section_tips = (
        (
            "summary",
            "📝 Add a Professional Summary at the top of your resume "
            "(2–3 lines highlighting your key strengths and career goal).",
        ),
        (
            "skills",
            "🛠️ Add a dedicated Skills section listing your technical "
            "and soft skills clearly.",
        ),
        (
            "experience",
            "💼 Add a Work Experience section with job titles, company names, "
            "dates, and bullet-point achievements.",
        ),
        (
            "education",
            "🎓 Add an Education section listing your degree(s), institution(s), "
            "and graduation dates.",
        ),
        (
            "projects",
            "🚀 Include a Projects section. Showcase 2–3 projects with a brief "
            "description, technologies used, and impact or outcome.",
        ),
        (
            "certifications",
            "🏆 Consider adding Certifications or Awards if you have any relevant ones.",
        ),
    )
    suggestions.extend(tip for key, tip in section_tips if not sections.get(key))

    # Skill-based suggestions
    tech_count = len(skills.get("technical", []))
    if tech_count < 5:
        suggestions.append(
            f"⚙️ Only {tech_count} technical skill(s) found. "
            "Add more relevant technical skills (aim for 8–15)."
        )
    if not skills.get("soft"):
        suggestions.append(
            "🤝 Mention soft skills such as 'Leadership', 'Teamwork', or "
            "'Communication' — many ATS systems look for these."
        )

    # Score-based suggestions
    if score < 60:
        suggestions.append(
            "📄 Your resume may be too short. ATS systems reward detailed resumes. "
            "Aim for at least 400–600 words."
        )

    # Job-match suggestions: below 50 is "low", 50 up to 70 is "moderate".
    if job_match < 50:
        suggestions.append(
            "🎯 Low job description match. Tailor your resume keywords to match "
            "the exact terms in the job posting."
        )
    elif job_match < 70:
        suggestions.append(
            "🎯 Moderate job match. Review the job description and ensure your "
            "skills and experience directly address its requirements."
        )

    # Formatting suggestions (always shown)
    suggestions.append(
        "📋 Use clean formatting: clear headings, consistent font, and bullet points. "
        "Avoid tables or graphics — they confuse most ATS parsers."
    )
    suggestions.append(
        "📊 Quantify your achievements where possible "
        "(e.g., 'Reduced load time by 40%', 'Led a team of 5 engineers')."
    )
    return suggestions