| import re |
|
|
| |
| |
| |
| |
| |
|
|
| |
# Skill vocabulary, grouped by category. Every entry is lowercase because
# text is lowercased before it is matched against these terms.

# Programming languages
LANGUAGES: set[str] = {
    "python", "java", "javascript", "typescript", "golang", "go",
    "rust", "ruby", "scala", "kotlin", "swift", "php", "r", "matlab",
    "bash", "shell", "c", "cpp", "c++", "csharp", "c#",
}

# Backend / web frameworks
WEB_FRAMEWORKS: set[str] = {
    "fastapi", "flask", "django", "spring boot", "spring",
    "express", "nestjs", "nextjs", "nuxtjs", "rails",
    "laravel", "fiber", "gin",
}

# Frontend libraries and tooling
FRONTEND: set[str] = {
    "react", "angular", "vue", "svelte", "html", "css",
    "tailwind", "bootstrap", "redux", "webpack", "vite",
}

# API styles and service architecture
API_ARCH: set[str] = {
    "rest api", "restful api", "rest apis", "restful apis",
    "graphql", "grpc", "websocket",
    "microservices", "microservice", "event driven", "message queue",
    "api gateway", "api", "apis",
}

# Databases and data stores
DATABASES: set[str] = {
    "postgresql", "postgres", "mysql", "sqlite", "oracle",
    "mongodb", "mongo", "redis", "cassandra", "dynamodb",
    "elasticsearch", "neo4j", "firebase", "supabase",
    "sql", "nosql", "vector database", "pinecone", "weaviate",
}

# Machine learning, AI and data science
ML_AI: set[str] = {
    "machine learning", "deep learning", "reinforcement learning",
    "supervised learning", "unsupervised learning",
    "natural language processing", "nlp", "computer vision",
    "large language model", "llm", "generative ai", "gen ai",
    "transformers", "bert", "gpt", "llama", "mistral",
    "scikit-learn", "scikit learn", "sklearn",
    "pytorch", "torch", "tensorflow", "keras", "jax",
    "hugging face", "huggingface", "langchain", "llamaindex",
    "xgboost", "lightgbm", "catboost",
    "pandas", "numpy", "scipy", "matplotlib", "seaborn", "plotly",
    "mlflow", "mlops", "model serving", "model deployment",
    "feature engineering", "hyperparameter tuning",
    "rag", "retrieval augmented generation", "fine tuning", "fine-tuning",
    "diffusion models", "stable diffusion",
    "data analysis", "data analytics", "data science",
    "statistical analysis", "statistics", "probability",
    "a/b testing", "hypothesis testing",
    # Short abbreviations
    "ml", "ai", "dl", "cv",
}

# Data engineering
DATA_ENGINEERING: set[str] = {
    "apache spark", "spark", "hadoop", "kafka", "airflow",
    "dbt", "flink", "hive", "presto", "trino",
    "etl", "elt", "data pipeline", "data warehouse",
    "snowflake", "bigquery", "redshift", "databricks",
}

# Cloud platforms and infrastructure-as-code
CLOUD: set[str] = {
    "aws", "amazon web services", "azure", "gcp", "google cloud",
    "ec2", "s3", "lambda", "sagemaker", "bedrock",
    "cloudformation", "terraform", "pulumi",
    "serverless", "cloud functions",
}

# DevOps, operations and observability
DEVOPS: set[str] = {
    "docker", "kubernetes", "k8s", "helm",
    "ci/cd", "ci cd", "github actions", "gitlab ci", "jenkins",
    "ansible", "chef", "puppet",
    "linux", "unix", "nginx", "apache",
    "monitoring", "observability", "prometheus", "grafana",
    "opentelemetry", "datadog", "new relic",
}

# Version control
VCS: set[str] = {
    "git", "github", "gitlab", "bitbucket", "version control",
}

# Software engineering practices and testing
ENGINEERING: set[str] = {
    "system design", "software design", "object oriented", "oop",
    "design patterns", "solid principles", "clean code",
    "distributed systems", "high availability", "scalability",
    "load balancing", "caching", "message broker",
    "unit testing", "integration testing", "tdd", "bdd",
    "pytest", "junit", "jest", "mocha",
    "code review", "agile", "scrum", "kanban",
}

# Security
SECURITY: set[str] = {
    "cybersecurity", "penetration testing", "pen testing",
    "oauth", "jwt", "ssl", "tls", "encryption",
    "owasp", "security auditing",
}

# Union of every category: the complete vocabulary that can be matched.
GENERAL_TECH_SKILLS: set[str] = (
    LANGUAGES | WEB_FRAMEWORKS | FRONTEND | API_ARCH |
    DATABASES | ML_AI | DATA_ENGINEERING | CLOUD |
    DEVOPS | VCS | ENGINEERING | SECURITY
)

# Longest phrases first, so a multi-word skill is tried before the shorter
# terms it contains. Equal-length entries are ordered alphabetically: sorting
# on length alone would leave ties in set-iteration order, which changes from
# run to run under string-hash randomisation.
SKILLS_SORTED_BY_LENGTH: list[str] = sorted(
    GENERAL_TECH_SKILLS, key=lambda skill: (-len(skill), skill)
)
|
|
|
|
| |
| |
| |
|
|
def clean_text(text: str) -> str:
    """Normalise free text into lowercase, single-space-separated tokens.

    Letters, digits and '/' are kept ('/' is needed for terms such as
    "ci/cd" and "a/b testing"). '+' and '#' are kept only when they trail a
    token, so "C++" and "C#" survive as "c++" and "c#" instead of collapsing
    to "c"; anywhere else (e.g. "python+django", "#ml", "a + b") they are
    treated as separators. Every other character becomes a space, and runs
    of whitespace are collapsed.

    Args:
        text: Raw input text.

    Returns:
        The cleaned text with no leading or trailing whitespace.
    """
    text = text.lower()
    # Replace everything that can never be part of a skill token.
    text = re.sub(r'[^a-z0-9\s/+#]', ' ', text)
    # Drop '+'/'#' runs that do not trail a token: either nothing
    # alphanumeric precedes them, or more token characters follow.
    text = re.sub(r'(?<![a-z0-9+#])[+#]+|[+#]+(?=[a-z0-9])', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()
|
|
|
|
def _is_whole_token(cleaned: str, idx: int, end: int) -> bool:
    """Return True if cleaned[idx:end] sits on token boundaries.

    The match must start at the beginning of the text or right after a
    space, and must be followed by the end of the text, a space, or a single
    plural 's' that is itself followed by the end of the text or a space.
    """
    if idx > 0 and cleaned[idx - 1] != ' ':
        return False
    tail = cleaned[end:end + 2]
    return tail == '' or tail[0] == ' ' or tail in ('s', 's ')


def extract_skills(text: str) -> set:
    """Return the vocabulary skills mentioned in *text*.

    Skills are tried longest-first (the order of SKILLS_SORTED_BY_LENGTH).
    Every whole-token occurrence of a skill claims its character positions,
    so the shorter terms inside a multi-word skill (e.g. 'learning' tokens
    inside 'machine learning', or 'spark' inside 'apache spark') are not
    reported again. All occurrences are claimed, not only the first;
    otherwise a phrase that appears twice would leak its constituent terms
    from the second appearance.

    Args:
        text: Raw resume or job-description text; it is cleaned with
            clean_text before matching.

    Returns:
        The set of matched skill strings, exactly as spelled in the
        vocabulary.
    """
    cleaned = clean_text(text)
    found: set = set()
    consumed_positions: set = set()

    for skill in SKILLS_SORTED_BY_LENGTH:
        start = 0
        while True:
            idx = cleaned.find(skill, start)
            if idx == -1:
                break
            end = idx + len(skill)
            # Default: resume the scan one character further on.
            start = idx + 1

            if not _is_whole_token(cleaned, idx, end):
                continue

            span = range(idx, end)
            if consumed_positions.isdisjoint(span):
                found.add(skill)
                consumed_positions.update(span)
                # Keep scanning past this hit so later occurrences are
                # claimed too.
                start = end

    return found
|
|
|
|
| |
| |
| |
| |
# Maps a matched term to the broader or canonical term it also counts as.
# Chains are allowed (e.g. "rest apis" -> "rest api" -> "api").
SKILL_ALIASES: dict[str, str] = {
    # ML / AI abbreviations and spelling variants
    "ml": "machine learning",
    "ai": "machine learning",
    "dl": "deep learning",
    "cv": "computer vision",
    "nlp": "natural language processing",
    "gen ai": "generative ai",
    "llm": "large language model",
    "sklearn": "scikit-learn",
    "scikit learn": "scikit-learn",

    # API-related terms all roll up to "api"
    "apis": "api",
    "rest apis": "rest api",
    "restful apis": "restful api",
    "rest api": "api",
    "restful api": "api",
    "api gateway": "api",
    "fastapi": "api",
    "flask": "api",
    "grpc": "api",
    "graphql": "api",

    # Tool and platform short forms
    "k8s": "kubernetes",
    "postgres": "postgresql",
    "mongo": "mongodb",
    "hf": "huggingface",
    "hugging face": "huggingface",
    "aws": "amazon web services",
    "gcp": "google cloud",
    "ci cd": "ci/cd",
    "node": "javascript",
    "nodejs": "javascript",
    "js": "javascript",
    "ts": "typescript",
    "py": "python",
}
|
|
|
|
def _expand_with_aliases(skills: set) -> set:
    """Return *skills* plus every term reachable through SKILL_ALIASES.

    Aliases are followed transitively, e.g.
    fastapi -> api; rest apis -> rest api -> api.
    The input set is not modified.
    """
    closure = set(skills)
    # Terms whose alias target has not been looked up yet.
    pending = list(closure)
    while pending:
        target = SKILL_ALIASES.get(pending.pop())
        if target is not None and target not in closure:
            closure.add(target)
            pending.append(target)
    return closure
|
|
|
|
def _canonicalize_jd_frequencies(freq: dict[str, int]) -> dict[str, int]:
    """Spread each skill's count over the skill and all of its alias targets.

    A term reached from several input skills accumulates the sum of their
    counts.
    """
    merged: dict[str, int] = {}
    for name, mentions in freq.items():
        for term in _expand_with_aliases({name}):
            if term in merged:
                merged[term] += mentions
            else:
                merged[term] = mentions
    return merged
|
|
|
|
def extract_required_skills_from_jd(jd_text: str) -> dict:
    """Return JD skills (alias-expanded) with mention frequency as importance.

    Mentions are counted as whole tokens, using the same boundary rule as
    extract_skills: the term must be delimited by spaces or the text edges,
    with an optional trailing plural 's'. A plain substring count would
    inflate short terms such as "r", "c", "go" or "ai" with hits from inside
    unrelated words.

    Args:
        jd_text: Raw job-description text.

    Returns:
        A dict mapping each skill (and its alias targets) to its count.
    """
    raw = extract_skills(jd_text)
    cleaned = clean_text(jd_text)
    freq = {}
    for skill in raw:
        pattern = rf'(?<!\S){re.escape(skill)}s?(?!\S)'
        # Every extracted skill has at least one whole-token occurrence;
        # the floor of 1 only guards against the two passes disagreeing.
        freq[skill] = max(1, len(re.findall(pattern, cleaned)))
    return _canonicalize_jd_frequencies(freq)
|
|
|
|
def extract_resume_skills(resume_text: str) -> set:
    """Return the skills found in a resume, including their alias targets."""
    direct_matches = extract_skills(resume_text)
    return _expand_with_aliases(direct_matches)
|
|
|
|
def find_missing_skills(resume_text: str, jd_text: str) -> list:
    """Return, sorted alphabetically, the JD skills absent from the resume."""
    required = set(extract_required_skills_from_jd(jd_text))
    present = extract_resume_skills(resume_text)
    return sorted(required - present)
|
|
|
|
def calculate_skill_overlap(resume_text: str, jd_text: str) -> float:
    """Return the percentage of JD skills that the resume also has.

    The result is rounded to two decimals; it is 0.0 when the job
    description yields no skills at all.
    """
    required = set(extract_required_skills_from_jd(jd_text))
    present = extract_resume_skills(resume_text)
    if len(required) == 0:
        return 0.0
    matched = required.intersection(present)
    return round(len(matched) / len(required) * 100, 2)
|
|
|
|
| |
| |
| |
if __name__ == "__main__":
    # Small demo: compare a sample resume with a sample job description.
    sample_resume = clean_text(
        "Python, NumPy, Pandas, Scikit-learn, PyTorch, TensorFlow, spaCy. "
        "Machine Learning, NLP, Feature Engineering, Model Evaluation. "
        "Flask, FastAPI, Git, GitHub, Linux, MLflow, Docker."
    )
    sample_jd = clean_text(
        "Machine Learning Engineer. Strong Python. Amazon SageMaker. "
        "ML model deployment. APIs. GenAI / LLM solutions. "
        "MLOps: model monitoring, drift detection, retraining. "
        "Data pipelines. CI/CD. Kubernetes."
    )

    missing = find_missing_skills(sample_resume, sample_jd)
    print("Missing skills :", missing)

    overlap = calculate_skill_overlap(sample_resume, sample_jd)
    print("Skill overlap :", overlap, "%")

    resume_skills = extract_resume_skills(sample_resume)
    print("Resume skills :", resume_skills)

    jd_skills = set(extract_required_skills_from_jd(sample_jd).keys())
    print("JD skills :", jd_skills)