Spaces:
Sleeping
Sleeping
| """ | |
| nlp_utils.py — NLP utilities for resume analysis. | |
| Responsibilities: | |
| - Named Entity Recognition (NER) using spaCy | |
| - Section detection (Skills, Education, Experience, Projects) | |
| - Skill keyword extraction from a predefined skill list | |
| - Resume classification heuristic (Good / Average / Poor) | |
| """ | |
| import re | |
| import spacy | |
# ---------------------------------------------------------------------------
# spaCy model — loaded once at import time
# ---------------------------------------------------------------------------
_SPACY_MODEL = "en_core_web_sm"

try:
    nlp = spacy.load(_SPACY_MODEL)
except OSError:
    # The load failed with OSError, which this module treats as "model not
    # installed": download it with the running interpreter, then load again.
    # check=True makes a failed download raise instead of continuing.
    import subprocess
    import sys

    subprocess.run(
        [sys.executable, "-m", "spacy", "download", _SPACY_MODEL],
        check=True,
    )
    nlp = spacy.load(_SPACY_MODEL)
| # --------------------------------------------------------------------------- | |
| # Predefined skill taxonomy | |
| # --------------------------------------------------------------------------- | |
# Technical skill vocabulary, grouped by area. Entries are lowercase because
# extract_skills() searches for them in lowercased resume text.
TECHNICAL_SKILLS = {
    # Programming languages
    "python", "java", "javascript", "typescript", "c++", "c#", "c", "go",
    "rust", "kotlin", "swift", "ruby", "php", "scala", "r", "matlab",
    "bash", "shell", "perl", "lua",
    # Web / frontend
    "html", "css", "react", "angular", "vue", "next.js", "nuxt.js",
    "svelte", "tailwind", "bootstrap", "jquery", "webpack", "vite",
    # Backend / frameworks
    "node.js", "django", "flask", "fastapi", "spring", "express",
    "rails", "laravel", "asp.net",
    # Databases
    "sql", "mysql", "postgresql", "mongodb", "redis", "elasticsearch",
    "sqlite", "oracle", "cassandra", "dynamodb", "firebase",
    # Cloud & DevOps
    "aws", "azure", "gcp", "docker", "kubernetes", "terraform",
    "ansible", "jenkins", "github actions", "ci/cd", "linux",
    "nginx", "apache",
    # ML / AI
    "machine learning", "deep learning", "nlp", "computer vision",
    "tensorflow", "pytorch", "keras", "scikit-learn", "pandas",
    "numpy", "matplotlib", "seaborn", "hugging face", "transformers",
    "langchain", "openai", "llm",
    # Data
    "data analysis", "data science", "power bi", "tableau", "excel",
    "spark", "hadoop", "airflow", "dbt", "snowflake", "bigquery",
    # Version control & tools
    "git", "github", "gitlab", "bitbucket", "jira", "confluence",
    "postman", "swagger",
    # Other ("linux" is already listed under Cloud & DevOps)
    "rest api", "graphql", "microservices", "agile", "scrum",
    "unit testing", "pytest", "jest", "selenium",
}
# Soft-skill vocabulary (lowercase, one entry per line, alphabetical).
SOFT_SKILLS = {
    "adaptability",
    "collaboration",
    "communication",
    "creativity",
    "critical thinking",
    "leadership",
    "problem solving",
    "project management",
    "teamwork",
    "time management",
}
# Combined vocabulary: every technical and soft skill in a single set.
ALL_SKILLS = TECHNICAL_SKILLS.union(SOFT_SKILLS)
| # --------------------------------------------------------------------------- | |
| # Section header keywords | |
| # --------------------------------------------------------------------------- | |
# Maps each section name to the lowercase phrases whose presence in the
# resume text marks that section as detected (see detect_sections()).
SECTION_KEYWORDS = {
    "skills": [
        "skills",
        "technical skills",
        "core competencies",
        "technologies",
        "tools",
        "expertise",
        "proficiencies",
    ],
    "education": [
        "education",
        "academic background",
        "qualification",
        "degree",
        "university",
        "college",
        "school",
    ],
    "experience": [
        "experience",
        "work experience",
        "employment history",
        "professional experience",
        "work history",
        "career",
        "internship",
        "internships",
    ],
    "projects": [
        "projects",
        "personal projects",
        "side projects",
        "academic projects",
        "portfolio",
    ],
    "summary": [
        "summary",
        "objective",
        "profile",
        "about me",
        "professional summary",
        "career objective",
    ],
    "certifications": [
        "certifications",
        "certificates",
        "licenses",
        "awards",
    ],
    "contact": [
        "contact",
        "contact information",
        "personal details",
    ],
}
| # --------------------------------------------------------------------------- | |
| # Public API | |
| # --------------------------------------------------------------------------- | |
def extract_entities(text: str) -> dict:
    """
    Run spaCy NER over the resume text and group the entities of interest.

    Only the first 5000 characters are passed to the pipeline. Entities
    labelled PERSON, ORG, GPE/LOC and DATE are collected; all other labels
    are ignored.

    Args:
        text: raw resume text. Empty or None yields an empty result.

    Returns:
        dict with keys:
          - 'name': first PERSON entity, otherwise the result of
            _guess_name_from_first_line(text) (may be None)
          - 'organizations': unique ORG entities, in order of appearance
          - 'locations': unique GPE/LOC entities, in order of appearance
          - 'dates': up to 10 unique DATE entities, in order of appearance
    """
    if not text:
        return {"name": None, "organizations": [], "locations": [], "dates": []}

    doc = nlp(text[:5000])  # limit to first 5000 chars for speed

    # spaCy label -> result bucket; GPE and LOC are both reported as locations.
    bucket_for_label = {
        "PERSON": "persons",
        "ORG": "organizations",
        "GPE": "locations",
        "LOC": "locations",
        "DATE": "dates",
    }
    found = {"persons": [], "organizations": [], "locations": [], "dates": []}
    for ent in doc.ents:
        bucket = bucket_for_label.get(ent.label_)
        if bucket is not None:
            found[bucket].append(ent.text.strip())

    # Best guess for the name: first PERSON entity, else the first-line heuristic.
    if found["persons"]:
        name = found["persons"][0]
    else:
        name = _guess_name_from_first_line(text)

    return {
        "name": name,
        # dict.fromkeys de-duplicates while keeping first-seen order.
        "organizations": list(dict.fromkeys(found["organizations"])),
        "locations": list(dict.fromkeys(found["locations"])),
        # De-duplicate BEFORE truncating so repeated dates cannot crowd out
        # later unique ones and the cap really means "10 unique dates".
        "dates": list(dict.fromkeys(found["dates"]))[:10],
    }
| def _guess_name_from_first_line(text: str) -> str | None: | |
| """Heuristic: the first short line often contains the candidate's name.""" | |
| for line in text.splitlines(): | |
| line = line.strip() | |
| if 2 <= len(line.split()) <= 5 and line.replace(" ", "").isalpha(): | |
| return line | |
| return None | |
def detect_sections(text: str) -> dict:
    """
    Detect which resume sections are present.

    A section counts as present when any of its SECTION_KEYWORDS phrases
    occurs in the lowercased text at the start of a word. Anchoring the
    start stops a keyword from matching the tail of an unrelated word
    (e.g. "awards" inside "towards"), while plural or suffixed forms such
    as "qualifications" still match "qualification".

    Args:
        text: raw resume text. Empty or None yields all-False results.

    Returns:
        dict mapping every section name in SECTION_KEYWORDS to True/False.
    """
    text_lower = text.lower() if text else ""
    return {
        section: any(
            # (?<!\w): the keyword must not be preceded by a word character.
            re.search(r"(?<!\w)" + re.escape(keyword), text_lower)
            for keyword in keywords
        )
        for section, keywords in SECTION_KEYWORDS.items()
    }
def extract_skills(text: str) -> dict:
    """
    Extract known skills from resume text.

    The text is lowercased and searched for each entry of TECHNICAL_SKILLS
    and SOFT_SKILLS as a whole token.

    Args:
        text: raw resume text. Empty or None yields empty lists.

    Returns:
        dict with keys 'technical', 'soft' and 'all', each a sorted list of
        the skills found ('all' is the sorted combination of the other two).
    """
    if not text:
        return {"technical": [], "soft": [], "all": []}

    text_lower = text.lower()

    def _mentions(skill: str) -> bool:
        # A plain \b...\b wrapper breaks for skills ending in a non-word
        # character: the trailing \b after "c++" or "c#" only matches when a
        # word character follows, so "c++ developer" was never detected.
        # Lookarounds express the real intent:
        #   (?<!\w)         not glued to a preceding word character
        #   (?!\w|\+\+|#)   not glued to a following word character, and not
        #                   the prefix of "c++" / "c#" (so bare "c" is not
        #                   reported for them)
        pattern = r"(?<!\w)" + re.escape(skill) + r"(?!\w|\+\+|#)"
        return re.search(pattern, text_lower) is not None

    found_technical = sorted(s for s in TECHNICAL_SKILLS if _mentions(s))
    found_soft = sorted(s for s in SOFT_SKILLS if _mentions(s))

    return {
        "technical": found_technical,
        "soft": found_soft,
        "all": sorted(found_technical + found_soft),
    }
def get_missing_sections(sections: dict) -> list:
    """
    List the important resume sections that were not detected.

    Args:
        sections: result of detect_sections(); a key that is absent or
            falsy counts as missing.

    Returns:
        Human-readable labels of the missing sections, in a fixed order
        (skills, education, experience, projects, summary).
    """
    required = (
        ("skills", "Skills section"),
        ("education", "Education section"),
        ("experience", "Work Experience section"),
        ("projects", "Projects section"),
        ("summary", "Professional Summary / Objective"),
    )
    return [label for key, label in required if not sections.get(key, False)]
def classify_resume(score: float) -> dict:
    """
    Map an ATS score to a quality tier.

    Args:
        score: ATS score (documented range 0-100; the range is not enforced).

    Returns:
        dict with 'label' (Good / Average / Poor plus a status glyph) and
        'color' (green / orange / red) for UI display.
    """
    # NOTE(review): the glyphs after each label look mis-encoded in this copy
    # of the file (probably emoji) — confirm against the original source.
    # Tiers are checked from the highest floor down; the first floor the
    # score reaches wins, anything below the last floor is "Poor".
    tiers = (
        (70, "Good β ", "green"),
        (45, "Average β οΈ", "orange"),
    )
    for floor, label, color in tiers:
        if score >= floor:
            return {"label": label, "color": color}
    return {"label": "Poor β", "color": "red"}
def generate_suggestions(
    sections: dict,
    skills: dict,
    score: float,
    job_match: float,
) -> list:
    """
    Build a list of rule-based resume improvement tips.

    Tips are emitted in a fixed order: missing sections, skill coverage,
    overall score, job-description match, then two formatting tips that are
    always included.

    Args:
        sections  : result of detect_sections()
        skills    : result of extract_skills()
        score     : resume base score (documented as 0-100)
        job_match : job description match % (documented as 0-100)

    Returns:
        List of suggestion strings.
    """
    # One tip per section, emitted when that section is absent or falsy.
    section_tips = (
        (
            "summary",
            "π Add a Professional Summary at the top of your resume "
            "(2β3 lines highlighting your key strengths and career goal).",
        ),
        (
            "skills",
            "π οΈ Add a dedicated Skills section listing your technical "
            "and soft skills clearly.",
        ),
        (
            "experience",
            "πΌ Add a Work Experience section with job titles, company names, "
            "dates, and bullet-point achievements.",
        ),
        (
            "projects",
            "π Include a Projects section. Showcase 2β3 projects with a brief "
            "description, technologies used, and impact or outcome.",
        ),
        (
            "certifications",
            "π Consider adding Certifications or Awards if you have any relevant ones.",
        ),
    )
    tips = [tip for key, tip in section_tips if not sections.get(key)]

    # Skill coverage
    technical_total = len(skills.get("technical", []))
    if technical_total < 5:
        tips.append(
            f"βοΈ Only {technical_total} technical skill(s) found. "
            "Add more relevant technical skills (aim for 8β15)."
        )
    if not skills.get("soft"):
        tips.append(
            "π€ Mention soft skills such as 'Leadership', 'Teamwork', or "
            "'Communication' β many ATS systems look for these."
        )

    # Overall score
    if score < 60:
        tips.append(
            "π Your resume may be too short. ATS systems reward detailed resumes. "
            "Aim for at least 400β600 words."
        )

    # Job-description match: below 50 is "low", 50 up to (not including) 70 is "moderate".
    if job_match < 50:
        tips.append(
            "π― Low job description match. Tailor your resume keywords to match "
            "the exact terms in the job posting."
        )
    elif job_match < 70:
        tips.append(
            "π― Moderate job match. Review the job description and ensure your "
            "skills and experience directly address its requirements."
        )

    # Formatting tips are always included.
    tips.extend(
        [
            "π Use clean formatting: clear headings, consistent font, and bullet points. "
            "Avoid tables or graphics β they confuse most ATS parsers.",
            "π Quantify your achievements where possible "
            "(e.g., 'Reduced load time by 40%', 'Led a team of 5 engineers').",
        ]
    )
    return tips