# Spaces: chirag1121 / Resume_Screening_Model — Sleeping
# File size: 11,113 Bytes — 54b9947
"""
nlp_utils.py — NLP utilities for resume analysis.
 
Responsibilities:
  - Named Entity Recognition (NER) using spaCy
  - Section detection (Skills, Education, Experience, Projects)
  - Skill keyword extraction from a predefined skill list
  - Resume classification heuristic (Good / Average / Poor)
  - Missing-section reporting and rule-based improvement suggestions
"""
 
import re
import spacy
 
# ---------------------------------------------------------------------------
# spaCy model — loaded once at import time
# ---------------------------------------------------------------------------
try:
    # Load the small English pipeline; the resulting module-level `nlp`
    # object is what extract_entities() calls on resume text.
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # Fallback: download the model at runtime if missing
    # NOTE(review): this treats any OSError from spacy.load as "model not
    # installed" — confirm no other OSError causes need different handling.
    import subprocess, sys
    # Invoke spaCy's download command with the interpreter running this
    # module (sys.executable). check=True makes subprocess.run raise
    # CalledProcessError when the command exits with a non-zero status, so a
    # failed download surfaces here instead of at the retry below.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    # Second attempt, after the download command reported success.
    nlp = spacy.load("en_core_web_sm")
 
 
# ---------------------------------------------------------------------------
# Predefined skill taxonomy
# ---------------------------------------------------------------------------
# Lower-case technical skill names, grouped by area for readability only
# (the grouping has no runtime meaning). Every entry must be unique: a
# repeated element in a set literal is silently collapsed and usually signals
# a copy/paste slip.
TECHNICAL_SKILLS = {
    # Programming languages
    "python", "java", "javascript", "typescript", "c++", "c#", "c", "go",
    "rust", "kotlin", "swift", "ruby", "php", "scala", "r", "matlab",
    "bash", "shell", "perl", "lua",
    # Web / frontend
    "html", "css", "react", "angular", "vue", "next.js", "nuxt.js",
    "svelte", "tailwind", "bootstrap", "jquery", "webpack", "vite",
    # Backend / frameworks
    "node.js", "django", "flask", "fastapi", "spring", "express",
    "rails", "laravel", "asp.net",
    # Databases
    "sql", "mysql", "postgresql", "mongodb", "redis", "elasticsearch",
    "sqlite", "oracle", "cassandra", "dynamodb", "firebase",
    # Cloud & DevOps
    "aws", "azure", "gcp", "docker", "kubernetes", "terraform",
    "ansible", "jenkins", "github actions", "ci/cd", "linux",
    "nginx", "apache",
    # ML / AI
    "machine learning", "deep learning", "nlp", "computer vision",
    "tensorflow", "pytorch", "keras", "scikit-learn", "pandas",
    "numpy", "matplotlib", "seaborn", "hugging face", "transformers",
    "langchain", "openai", "llm",
    # Data
    "data analysis", "data science", "power bi", "tableau", "excel",
    "spark", "hadoop", "airflow", "dbt", "snowflake", "bigquery",
    # Version control & tools
    "git", "github", "gitlab", "bitbucket", "jira", "confluence",
    "postman", "swagger",
    # Other ("linux" is already listed under Cloud & DevOps)
    "rest api", "graphql", "microservices", "agile", "scrum",
    "unit testing", "pytest", "jest", "selenium",
}

# Lower-case interpersonal / non-technical skill phrases.
SOFT_SKILLS = {
    "leadership", "communication", "teamwork", "problem solving",
    "critical thinking", "time management", "adaptability",
    "collaboration", "creativity", "project management",
}

# Union of both taxonomies.
ALL_SKILLS = TECHNICAL_SKILLS | SOFT_SKILLS
 
 
# ---------------------------------------------------------------------------
# Section header keywords
# ---------------------------------------------------------------------------
# Maps a canonical section name to the lower-case phrases that indicate the
# section is present in a resume. Insertion order is the order in which
# sections are reported by code that iterates this mapping.
SECTION_KEYWORDS = {
    # Skills / tooling
    "skills": [
        "skills",
        "technical skills",
        "core competencies",
        "technologies",
        "tools",
        "expertise",
        "proficiencies",
    ],
    # Academic history
    "education": [
        "education",
        "academic background",
        "qualification",
        "degree",
        "university",
        "college",
        "school",
    ],
    # Employment history
    "experience": [
        "experience",
        "work experience",
        "employment history",
        "professional experience",
        "work history",
        "career",
        "internship",
        "internships",
    ],
    # Project work
    "projects": [
        "projects",
        "personal projects",
        "side projects",
        "academic projects",
        "portfolio",
    ],
    # Opening summary / objective statement
    "summary": [
        "summary",
        "objective",
        "profile",
        "about me",
        "professional summary",
        "career objective",
    ],
    # Certificates and awards
    "certifications": [
        "certifications",
        "certificates",
        "licenses",
        "awards",
    ],
    # Contact details
    "contact": [
        "contact",
        "contact information",
        "personal details",
    ],
}
 
 
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
 
def extract_entities(text: str) -> dict:
    """
    Run spaCy NER over the resume text and collect the entities of interest.

    Only the first 5000 characters are passed to the pipeline, to bound the
    processing time; the first-line name heuristic still sees the full text.

    Args:
        text: raw resume text. Empty or None input yields an empty result.

    Returns:
        dict with keys:
          - "name": the first PERSON entity if any, otherwise the result of
            _guess_name_from_first_line() (which may be None)
          - "organizations": unique ORG entities, in order of appearance
          - "locations": unique GPE / LOC entities, in order of appearance
          - "dates": up to 10 unique DATE entities, in order of appearance
    """
    if not text:
        return {"name": None, "organizations": [], "locations": [], "dates": []}

    doc = nlp(text[:5000])  # limit to first 5000 chars for speed

    persons, orgs, locations, dates = [], [], [], []
    # Entity label -> list collecting its values; GPE and LOC share a bucket.
    buckets = {
        "PERSON": persons,
        "ORG": orgs,
        "GPE": locations,
        "LOC": locations,
        "DATE": dates,
    }
    for ent in doc.ents:
        bucket = buckets.get(ent.label_)
        if bucket is not None:
            bucket.append(ent.text.strip())

    # Best guess for the name: first PERSON entity, else first-line heuristic.
    name = persons[0] if persons else _guess_name_from_first_line(text)

    return {
        "name": name,
        # dict.fromkeys() de-duplicates while keeping first-seen order.
        "organizations": list(dict.fromkeys(orgs)),
        "locations": list(dict.fromkeys(locations)),
        # De-duplicate BEFORE truncating so repeated dates near the top do
        # not crowd out later distinct ones from the 10 returned.
        "dates": list(dict.fromkeys(dates))[:10],
    }
 
 
def _guess_name_from_first_line(text: str) -> str | None:
    """Heuristic: the first short line often contains the candidate's name."""
    for line in text.splitlines():
        line = line.strip()
        if 2 <= len(line.split()) <= 5 and line.replace(" ", "").isalpha():
            return line
    return None
 
 
def detect_sections(text: str) -> dict:
    """
    Report, for every known resume section, whether the text mentions it.

    A section counts as present when at least one of its keywords from
    SECTION_KEYWORDS occurs as a substring anywhere in the lower-cased text.

    Args:
        text: full resume text.

    Returns:
        dict mapping each section name in SECTION_KEYWORDS → True/False.
    """
    haystack = text.lower()
    return {
        name: any(phrase in haystack for phrase in phrases)
        for name, phrases in SECTION_KEYWORDS.items()
    }
 
 
def _skill_regex(skill: str) -> str:
    """Build the regex source that finds *skill* as a standalone term.

    A plain ``\\b`` on both sides is wrong for skills that end in a non-word
    character: after "c++" or "c#" a ``\\b`` only matches when a word character
    follows, so "C++," or "C# " would never be detected. Instead:

      - the left edge must not be preceded by a word character;
      - a skill ending in a word character must not be followed by another
        word character, nor by "++" or "#" (so "c" is not reported for text
        that actually says "c++" or "c#");
      - a skill ending in a non-word character gets no right-edge guard, so
        "c++", "c++," and "c++11" all match.
    """
    last_char = skill[-1]
    if last_char.isalnum() or last_char == "_":
        right_guard = r"(?![\w#]|\+\+)"
    else:
        right_guard = ""
    return r"(?<!\w)" + re.escape(skill) + right_guard


def extract_skills(text: str) -> dict:
    """
    Extract skills from resume text.

    Matching is case-insensitive (the text is lower-cased and the taxonomies
    are lower-case) and restricted to standalone terms; see _skill_regex().

    Args:
        text: full resume text. Empty or None input yields empty lists.

    Returns:
        dict with keys 'technical', 'soft' and 'all' — each a sorted list of
        the skills found ('all' is the sorted combination of the other two).
    """
    if not text:
        return {"technical": [], "soft": [], "all": []}

    text_lower = text.lower()

    def _present(skills) -> list:
        # Sorted list of the skills whose pattern occurs in the text.
        return sorted(
            skill for skill in skills
            if re.search(_skill_regex(skill), text_lower)
        )

    found_technical = _present(TECHNICAL_SKILLS)
    found_soft = _present(SOFT_SKILLS)

    return {
        "technical": found_technical,
        "soft": found_soft,
        "all": sorted(found_technical + found_soft),
    }
 
 
def get_missing_sections(sections: dict) -> list:
    """
    List the important resume sections that were not detected.

    Args:
        sections: mapping of section name → bool, as produced by
            detect_sections(). A key that is absent counts as not detected.

    Returns:
        Human-readable labels of the missing sections, always in the fixed
        order skills, education, experience, projects, summary.
    """
    # (section key, label shown to the user), in reporting order.
    required = (
        ("skills", "Skills section"),
        ("education", "Education section"),
        ("experience", "Work Experience section"),
        ("projects", "Projects section"),
        ("summary", "Professional Summary / Objective"),
    )
    return [label for key, label in required if not sections.get(key, False)]
 
 
def classify_resume(score: float) -> dict:
    """
    Map an ATS score onto a quality tier.

    Args:
        score: ATS score (0–100).

    Returns:
        dict with 'label' (Good/Average/Poor, with an emoji marker) and
        'color' (green/orange/red) for UI display. Scores of 70 and above are
        Good, 45 and above are Average, everything else is Poor.
    """
    # (inclusive lower bound, label, color), highest tier first.
    tiers = (
        (70, "Good ✅", "green"),
        (45, "Average ⚠️", "orange"),
    )
    for floor, label, color in tiers:
        if score >= floor:
            return {"label": label, "color": color}
    return {"label": "Poor ❌", "color": "red"}
 
 
def generate_suggestions(
    sections: dict,
    skills: dict,
    score: float,
    job_match: float,
) -> list:
    """
    Build a list of improvement tips from simple rules.

    Tips are emitted in a fixed order: missing sections, skill coverage,
    overall score, job-description match, then two general tips that are
    always included.

    Args:
        sections   : result of detect_sections()
        skills     : result of extract_skills()
        score      : resume base score (0–100)
        job_match  : job description match % (0–100)

    Returns:
        List of suggestion strings.
    """
    # (section key, tip emitted when that section is absent or falsy).
    section_tips = (
        (
            "summary",
            "📝 Add a Professional Summary at the top of your resume "
            "(2–3 lines highlighting your key strengths and career goal).",
        ),
        (
            "skills",
            "🛠️ Add a dedicated Skills section listing your technical "
            "and soft skills clearly.",
        ),
        (
            "experience",
            "💼 Add a Work Experience section with job titles, company names, "
            "dates, and bullet-point achievements.",
        ),
        (
            "projects",
            "🚀 Include a Projects section. Showcase 2–3 projects with a brief "
            "description, technologies used, and impact or outcome.",
        ),
        (
            "certifications",
            "🏆 Consider adding Certifications or Awards if you have any relevant ones.",
        ),
    )
    tips = [tip for key, tip in section_tips if not sections.get(key)]

    # Skill coverage: fewer than 5 technical skills, or no soft skills at all.
    technical_found = len(skills.get("technical", []))
    if technical_found < 5:
        tips.append(
            f"⚙️ Only {technical_found} technical skill(s) found. "
            "Add more relevant technical skills (aim for 8–15)."
        )
    if not skills.get("soft"):
        tips.append(
            "🤝 Mention soft skills such as 'Leadership', 'Teamwork', or "
            "'Communication' — many ATS systems look for these."
        )

    # Overall score below 60.
    if score < 60:
        tips.append(
            "📏 Your resume may be too short. ATS systems reward detailed resumes. "
            "Aim for at least 400–600 words."
        )

    # Job match: below 50 is "low", from 50 up to (not including) 70 is "moderate".
    if job_match < 70:
        if job_match < 50:
            match_tip = (
                "🎯 Low job description match. Tailor your resume keywords to match "
                "the exact terms in the job posting."
            )
        else:
            match_tip = (
                "🎯 Moderate job match. Review the job description and ensure your "
                "skills and experience directly address its requirements."
            )
        tips.append(match_tip)

    # General formatting / content tips, always included.
    tips.extend(
        [
            "📐 Use clean formatting: clear headings, consistent font, and bullet points. "
            "Avoid tables or graphics — they confuse most ATS parsers.",
            "📊 Quantify your achievements where possible "
            "(e.g., 'Reduced load time by 40%', 'Led a team of 5 engineers').",
        ]
    )

    return tips