"""
nlp_utils.py — NLP utilities for resume analysis.

Responsibilities:
  - Named Entity Recognition (NER) using spaCy
  - Section detection (Skills, Education, Experience, Projects)
  - Skill keyword extraction from a predefined skill list
  - Resume classification heuristic (Good / Average / Poor)
"""

import re

import spacy

# ---------------------------------------------------------------------------
# spaCy model — loaded once at import time
# ---------------------------------------------------------------------------
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # Fallback: download the model at runtime if it is missing, then retry.
    import subprocess
    import sys

    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm")

# ---------------------------------------------------------------------------
# Predefined skill taxonomy
# ---------------------------------------------------------------------------
TECHNICAL_SKILLS = {
    # Programming languages
    "python", "java", "javascript", "typescript", "c++", "c#", "c", "go",
    "rust", "kotlin", "swift", "ruby", "php", "scala", "r", "matlab",
    "bash", "shell", "perl", "lua",
    # Web / frontend
    "html", "css", "react", "angular", "vue", "next.js", "nuxt.js",
    "svelte", "tailwind", "bootstrap", "jquery", "webpack", "vite",
    # Backend / frameworks
    "node.js", "django", "flask", "fastapi", "spring", "express", "rails",
    "laravel", "asp.net",
    # Databases
    "sql", "mysql", "postgresql", "mongodb", "redis", "elasticsearch",
    "sqlite", "oracle", "cassandra", "dynamodb", "firebase",
    # Cloud & DevOps
    "aws", "azure", "gcp", "docker", "kubernetes", "terraform", "ansible",
    "jenkins", "github actions", "ci/cd", "linux", "nginx", "apache",
    # ML / AI
    "machine learning", "deep learning", "nlp", "computer vision",
    "tensorflow", "pytorch", "keras", "scikit-learn", "pandas", "numpy",
    "matplotlib", "seaborn", "hugging face", "transformers", "langchain",
    "openai", "llm",
    # Data
    "data analysis", "data science", "power bi", "tableau", "excel",
    "spark", "hadoop", "airflow", "dbt", "snowflake", "bigquery",
    # Version control & tools
    "git", "github", "gitlab", "bitbucket", "jira", "confluence",
    "postman", "swagger",
    # Other
    "rest api", "graphql", "microservices", "agile", "scrum",
    "unit testing", "pytest", "jest", "selenium",
}

SOFT_SKILLS = {
    "leadership", "communication", "teamwork", "problem solving",
    "critical thinking", "time management", "adaptability",
    "collaboration", "creativity", "project management",
}

ALL_SKILLS = TECHNICAL_SKILLS | SOFT_SKILLS


def _compile_skill_pattern(skill: str) -> "re.Pattern[str]":
    """Build the whole-token matcher for one (lower-case) skill name.

    ``\\b`` is not usable here: it only matches next to a word character, so
    ``\\bc\\+\\+\\b`` never matches "c++" followed by a space, comma or end of
    text. Lookarounds express "not glued to a neighbouring word character"
    without that limitation. The lookahead additionally rejects a following
    ``+`` or ``#`` so that "c" is not reported for every "c++" / "c#" mention.
    """
    return re.compile(r"(?<!\w)" + re.escape(skill) + r"(?![\w+#])")


# Compiled once at import time; extract_skills() runs these on every resume.
_TECHNICAL_SKILL_PATTERNS = {
    skill: _compile_skill_pattern(skill) for skill in TECHNICAL_SKILLS
}
_SOFT_SKILL_PATTERNS = {
    skill: _compile_skill_pattern(skill) for skill in SOFT_SKILLS
}

# ---------------------------------------------------------------------------
# Section header keywords
# ---------------------------------------------------------------------------
SECTION_KEYWORDS = {
    "skills": [
        "skills", "technical skills", "core competencies", "technologies",
        "tools", "expertise", "proficiencies",
    ],
    "education": [
        "education", "academic background", "qualification", "degree",
        "university", "college", "school",
    ],
    "experience": [
        "experience", "work experience", "employment history",
        "professional experience", "work history", "career",
        "internship", "internships",
    ],
    "projects": [
        "projects", "personal projects", "side projects",
        "academic projects", "portfolio",
    ],
    "summary": [
        "summary", "objective", "profile", "about me",
        "professional summary", "career objective",
    ],
    "certifications": [
        "certifications", "certificates", "licenses", "awards",
    ],
    "contact": [
        "contact", "contact information", "personal details",
    ],
}

# Every known header phrase, used to keep section titles from being mistaken
# for the candidate's name.
_SECTION_HEADER_PHRASES = frozenset(
    keyword for keywords in SECTION_KEYWORDS.values() for keyword in keywords
)

# Only the start of the document is sent through spaCy, for speed.
_MAX_NER_CHARS = 5000
# Upper bound on the number of distinct DATE entities returned.
_MAX_DATES = 10

# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------


def extract_entities(text: str) -> dict:
    """
    Run spaCy NER over the start of the resume and collect key entities.

    Args:
        text: raw resume text; may be empty or None.

    Returns:
        dict with keys:
          - 'name'          : first PERSON entity, else a first-lines
                              heuristic guess, else None
          - 'organizations' : unique ORG entities, in order of appearance
          - 'locations'     : unique GPE / LOC entities, in order of appearance
          - 'dates'         : up to 10 unique DATE entities, in order of
                              appearance
    """
    if not text:
        return {"name": None, "organizations": [], "locations": [], "dates": []}

    doc = nlp(text[:_MAX_NER_CHARS])

    persons: list = []
    orgs: list = []
    locations: list = []
    dates: list = []
    bucket_for_label = {
        "PERSON": persons,
        "ORG": orgs,
        "GPE": locations,
        "LOC": locations,
        "DATE": dates,
    }

    for ent in doc.ents:
        bucket = bucket_for_label.get(ent.label_)
        if bucket is not None:
            bucket.append(ent.text.strip())

    # Best guess for the name: first PERSON entity, else a line heuristic.
    name = persons[0] if persons else _guess_name_from_first_line(text)

    return {
        "name": name,
        # dict.fromkeys() de-duplicates while keeping first-seen order.
        "organizations": list(dict.fromkeys(orgs)),
        "locations": list(dict.fromkeys(locations)),
        # De-duplicate BEFORE truncating so repeated dates do not use up the
        # 10 available slots.
        "dates": list(dict.fromkeys(dates))[:_MAX_DATES],
    }


def _guess_name_from_first_line(text: str) -> str | None:
    """
    Heuristic: the first short, purely alphabetic line is likely the name.

    Scans lines top to bottom and returns the first one made of 2–5
    alphabetic words that is not a known section header (so a heading such
    as "Professional Summary" is not returned as a name). Returns None when
    no line qualifies.
    """
    for raw_line in text.splitlines():
        line = raw_line.strip()
        words = line.split()
        if not 2 <= len(words) <= 5:
            continue
        # Check per word so any whitespace separator (tabs, double spaces)
        # is tolerated, matching how split() tokenised the line.
        if not all(word.isalpha() for word in words):
            continue
        if " ".join(words).lower() in _SECTION_HEADER_PHRASES:
            continue
        return line
    return None


def detect_sections(text: str) -> dict:
    """
    Detect which resume sections are present.

    Matching is a case-insensitive substring search for each keyword in
    SECTION_KEYWORDS, so it is a heuristic and not a strict header parser.

    Args:
        text: raw resume text; empty or None yields every section as False.

    Returns:
        dict mapping section name → True/False
    """
    text_lower = text.lower() if text else ""
    return {
        section: any(keyword in text_lower for keyword in keywords)
        for section, keywords in SECTION_KEYWORDS.items()
    }


def extract_skills(text: str) -> dict:
    """
    Extract skills from resume text.

    Skills are matched case-insensitively as whole tokens against
    TECHNICAL_SKILLS and SOFT_SKILLS, including symbol-bearing names such as
    "c++", "c#" and "node.js".

    Args:
        text: raw resume text; empty or None yields empty lists.

    Returns:
        dict with keys 'technical', 'soft' and 'all' — each a sorted list of
        the skills found ('all' is the sorted combination of the other two).
    """
    if not text:
        return {"technical": [], "soft": [], "all": []}

    text_lower = text.lower()

    found_technical = sorted(
        skill
        for skill, pattern in _TECHNICAL_SKILL_PATTERNS.items()
        if pattern.search(text_lower)
    )
    found_soft = sorted(
        skill
        for skill, pattern in _SOFT_SKILL_PATTERNS.items()
        if pattern.search(text_lower)
    )

    return {
        "technical": found_technical,
        "soft": found_soft,
        "all": sorted(found_technical + found_soft),
    }


def get_missing_sections(sections: dict) -> list:
    """
    Return list of important sections that are missing from the resume.

    Args:
        sections: result of detect_sections()

    Returns:
        List of human-readable missing section names, in a fixed order.
    """
    important = {
        "skills": "Skills section",
        "education": "Education section",
        "experience": "Work Experience section",
        "projects": "Projects section",
        "summary": "Professional Summary / Objective",
    }
    return [
        label for key, label in important.items() if not sections.get(key, False)
    ]


def classify_resume(score: float) -> dict:
    """
    Classify a resume based on its ATS score.

    Args:
        score: ATS score (0–100).

    Returns:
        dict with 'label' (Good/Average/Poor) and 'color' for UI display.
        Thresholds: >= 70 is Good, >= 45 is Average, anything lower is Poor.
    """
    if score >= 70:
        return {"label": "Good ✅", "color": "green"}
    if score >= 45:
        return {"label": "Average ⚠️", "color": "orange"}
    return {"label": "Poor ❌", "color": "red"}


# (section key, tip shown when that section is absent), in display order.
_MISSING_SECTION_TIPS = (
    (
        "summary",
        "📝 Add a Professional Summary at the top of your resume "
        "(2–3 lines highlighting your key strengths and career goal).",
    ),
    (
        "skills",
        "🛠️ Add a dedicated Skills section listing your technical "
        "and soft skills clearly.",
    ),
    (
        "experience",
        "💼 Add a Work Experience section with job titles, company names, "
        "dates, and bullet-point achievements.",
    ),
    (
        "projects",
        "🚀 Include a Projects section. Showcase 2–3 projects with a brief "
        "description, technologies used, and impact or outcome.",
    ),
    (
        "certifications",
        "🏆 Consider adding Certifications or Awards if you have any relevant ones.",
    ),
)


def generate_suggestions(
    sections: dict,
    skills: dict,
    score: float,
    job_match: float,
) -> list:
    """
    Rule-based suggestions engine.

    Analyzes resume structure and scores to generate actionable improvement
    tips.

    Args:
        sections  : result of detect_sections()
        skills    : result of extract_skills()
        score     : resume base score (0–100)
        job_match : job description match % (0–100)

    Returns:
        List of suggestion strings, ordered: missing sections, skills,
        score, job match, then two general formatting tips that are always
        included.
    """
    # Section-based suggestions
    suggestions = [
        tip for section, tip in _MISSING_SECTION_TIPS if not sections.get(section)
    ]

    # Skill-based suggestions
    tech_count = len(skills.get("technical", []))
    if tech_count < 5:
        suggestions.append(
            f"⚙️ Only {tech_count} technical skill(s) found. "
            "Add more relevant technical skills (aim for 8–15)."
        )

    if not skills.get("soft"):
        suggestions.append(
            "🤝 Mention soft skills such as 'Leadership', 'Teamwork', or "
            "'Communication' — many ATS systems look for these."
        )

    # Score-based suggestions
    if score < 60:
        suggestions.append(
            "📏 Your resume may be too short. ATS systems reward detailed resumes. "
            "Aim for at least 400–600 words."
        )

    # Job-match suggestions
    if job_match < 50:
        suggestions.append(
            "🎯 Low job description match. Tailor your resume keywords to match "
            "the exact terms in the job posting."
        )
    elif job_match < 70:
        suggestions.append(
            "🎯 Moderate job match. Review the job description and ensure your "
            "skills and experience directly address its requirements."
        )

    # Formatting suggestions (always shown)
    suggestions.append(
        "📐 Use clean formatting: clear headings, consistent font, and bullet points. "
        "Avoid tables or graphics — they confuse most ATS parsers."
    )
    suggestions.append(
        "📊 Quantify your achievements where possible "
        "(e.g., 'Reduced load time by 40%', 'Led a team of 5 engineers')."
    )

    return suggestions