Spaces:
Sleeping
Sleeping
| """ | |
| Enhanced Job Role to Skill Recommendation API | |
| ============================================== | |
| Features: | |
| 1. Hybrid recommendations (embeddings + collaborative + rules) | |
| 2. Confidence scores and explanations | |
| 3. Skill gap analysis with priorities | |
| 4. Learning path suggestions | |
| 5. Similar role discovery | |
| 6. Skill clustering | |
| 7. Advanced filtering and ranking | |
| Author: Enhanced Version | |
| Date: 2024 | |
| """ | |
| import os | |
| import json | |
| import numpy as np | |
| from typing import List, Dict, Tuple, Optional | |
| from collections import defaultdict | |
| from dataclasses import dataclass | |
| from fastapi import FastAPI, HTTPException, Query | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from pydantic import BaseModel, Field | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| from rapidfuzz import process, fuzz | |
# Import configuration.
# Tunables come from an optional ``config`` module; if that import fails the
# built-in defaults in the ``except`` branch are used instead.
try:
    from config import (
        RECOMMENDATION_WEIGHTS,
        MIN_CONFIDENCE_THRESHOLD,
        MIN_SOURCES_REQUIRED,
        DATASET_SKILLS_BYPASS_SOURCE_CHECK,
        PREFER_DATASET_SKILLS,
        DATASET_BONUS,
        ROLE_MATCH_THRESHOLD,
        SKILL_MATCH_THRESHOLD,
        DEFAULT_TOP_K,
        DEFAULT_MIN_CONFIDENCE,
        MAX_TOP_K,
        CANDIDATE_MULTIPLIER,
        USE_CUSTOM_FILTER,
        custom_skill_filter
    )
except ImportError:
    # Fallback defaults if config.py not found.
    # NOTE: ImportError is also raised when config.py exists but is missing
    # any one of the names above, in which case ALL values below are used.
    # Per-source weights applied to each source's score before summing.
    RECOMMENDATION_WEIGHTS = {'embedding': 0.3, 'dataset': 0.6, 'collaborative': 0.1}
    # Combined-score floor and minimum number of contributing sources.
    MIN_CONFIDENCE_THRESHOLD = 0.15
    MIN_SOURCES_REQUIRED = 1
    DATASET_SKILLS_BYPASS_SOURCE_CHECK = True
    # When True, DATASET_BONUS is added to the total of dataset-listed skills.
    PREFER_DATASET_SKILLS = True
    DATASET_BONUS = 0.1
    # Fuzzy-match cut-offs; compared against rapidfuzz scores (divided by
    # 100 elsewhere to obtain a 0-1 confidence).
    ROLE_MATCH_THRESHOLD = 70
    SKILL_MATCH_THRESHOLD = 80
    # Request-model defaults and bounds.
    DEFAULT_TOP_K = 20
    DEFAULT_MIN_CONFIDENCE = 0.25
    MAX_TOP_K = 100
    # Candidate pool size is top_k * CANDIDATE_MULTIPLIER for the
    # embedding and collaborative sources.
    CANDIDATE_MULTIPLIER = 3
    # Optional user hook; called as
    # custom_skill_filter(skill, total, sources, role) when enabled.
    USE_CUSTOM_FILTER = False
    custom_skill_filter = None
| # ============================================================================ | |
| # APP INITIALIZATION | |
| # ============================================================================ | |
app = FastAPI(
    title="Enhanced Job Role → Skill Recommendation API",
    description="Advanced skill recommendation system with hybrid algorithms",
    version="2.0"
)
# Add CORS middleware.
# Every origin may call this API. Credentialed cross-origin requests are
# deliberately NOT enabled: a wildcard origin combined with
# allow_credentials=True is disallowed by the CORS rules FastAPI documents,
# because it would let any site make cookie-bearing requests. If credentials
# are ever required, list the trusted origins explicitly and re-enable it.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
| # ============================================================================ | |
| # CONFIGURATION | |
| # ============================================================================ | |
# Directory (relative to the working directory) holding the artifact files
# read by load_artifacts().
ARTIFACT_PATH = "artifacts"
# Set to True by load_artifacts() once loading and validation succeed.
ARTIFACTS_LOADED = False
# Global data structures (all populated by load_artifacts()).
JOB_ROLE_TO_IDX: Dict[str, int] = {}  # role name -> row index into job_role_emb
IDX_TO_ROLE: Dict[int, str] = {}  # inverse of JOB_ROLE_TO_IDX
IDX_TO_SKILL: Dict[int, str] = {}  # row index into skill_emb -> skill name
SKILL_TO_IDX: Dict[str, int] = {}  # inverse of IDX_TO_SKILL
ROLE_TO_SKILLS: Dict[str, List[str]] = {}  # role -> skills listed for it in the dataset
ROLE_TO_SKILL_SCORES: Dict[str, Dict[str, float]] = {}  # role -> {skill: dataset score}
ROLE_PROFILES: Dict[str, Dict] = {}  # role -> profile; 'core_skills' / 'nice_to_have' keys are read
SKILL_COOCCURRENCE: Dict[str, Dict[str, int]] = {}  # skill -> {other skill: value}; presumably a co-occurrence count
CONFIG: Dict = {}  # "config" section of mappings.json, echoed by the health response
job_role_emb: Optional[np.ndarray] = None  # one row per job role
skill_emb: Optional[np.ndarray] = None  # one row per skill
# Optional matrix indexed [role index][skill index]; None when the file is absent.
collab_similarity: Optional[np.ndarray] = None
| # ============================================================================ | |
| # LOAD ARTIFACTS | |
| # ============================================================================ | |
def load_artifacts():
    """Load all artifacts at startup.

    Reads ``mappings.json``, ``job_role_emb.npy`` and ``skill_emb.npy`` from
    ARTIFACT_PATH, plus the optional ``collab_similarity.npy`` and
    ``skill_cooccurrence.json`` when those files exist.

    Everything is read into local variables and validated first; the module
    globals are only replaced once validation has passed, so a failed load
    never leaves them half-populated. Validation raises explicit errors
    (rather than ``assert``) so it still runs under ``python -O`` and the
    failure message says what was wrong.

    Never raises: any failure is printed and ARTIFACTS_LOADED is set to
    False, which the endpoints use to answer 503.
    """
    global ARTIFACTS_LOADED, JOB_ROLE_TO_IDX, IDX_TO_ROLE, IDX_TO_SKILL
    global SKILL_TO_IDX, ROLE_TO_SKILLS, ROLE_TO_SKILL_SCORES, ROLE_PROFILES
    global SKILL_COOCCURRENCE, CONFIG
    global job_role_emb, skill_emb, collab_similarity
    try:
        print("Loading artifacts...")
        # 1. Load mappings
        with open(os.path.join(ARTIFACT_PATH, "mappings.json"), "r", encoding="utf-8") as f:
            mappings = json.load(f)
        role_to_idx = mappings["job_role_to_idx"]
        # JSON object keys are always strings, so skill indices are cast back to int.
        idx_to_skill = {int(k): v for k, v in mappings["idx_to_skill"].items()}

        # 2. Load embeddings
        role_embeddings = np.load(os.path.join(ARTIFACT_PATH, "job_role_emb.npy"))
        skill_embeddings = np.load(os.path.join(ARTIFACT_PATH, "skill_emb.npy"))

        # 3. Load collaborative similarity if available
        collab = None
        collab_path = os.path.join(ARTIFACT_PATH, "collab_similarity.npy")
        if os.path.exists(collab_path):
            collab = np.load(collab_path)

        # 4. Load skill co-occurrence if available
        cooccurrence = {}
        cooccur_path = os.path.join(ARTIFACT_PATH, "skill_cooccurrence.json")
        if os.path.exists(cooccur_path):
            with open(cooccur_path, "r", encoding="utf-8") as f:
                cooccurrence = json.load(f)

        # Validation: the mappings must line up with the embedding rows.
        if len(role_to_idx) != role_embeddings.shape[0]:
            raise ValueError(
                f"mappings list {len(role_to_idx)} job roles but "
                f"job_role_emb has {role_embeddings.shape[0]} rows"
            )
        if len(idx_to_skill) != skill_embeddings.shape[0]:
            raise ValueError(
                f"mappings list {len(idx_to_skill)} skills but "
                f"skill_emb has {skill_embeddings.shape[0]} rows"
            )

        # Commit: replace the globals only after everything checked out.
        JOB_ROLE_TO_IDX = role_to_idx
        IDX_TO_ROLE = {v: k for k, v in role_to_idx.items()}
        IDX_TO_SKILL = idx_to_skill
        SKILL_TO_IDX = {name: idx for idx, name in idx_to_skill.items()}
        ROLE_TO_SKILLS = mappings.get("role_to_skills", {})
        ROLE_TO_SKILL_SCORES = mappings.get("role_to_skill_scores", {})
        ROLE_PROFILES = mappings.get("role_profiles", {})
        CONFIG = mappings.get("config", {})
        job_role_emb = role_embeddings
        skill_emb = skill_embeddings
        collab_similarity = collab
        SKILL_COOCCURRENCE = cooccurrence
        ARTIFACTS_LOADED = True
        print("✓ Artifacts loaded successfully")
        print(f" - {len(JOB_ROLE_TO_IDX)} job roles")
        print(f" - {len(IDX_TO_SKILL)} skills")
        print(f" - Collaborative: {collab_similarity is not None}")
    except Exception as e:
        # Deliberately broad: this is the startup boundary, and the service
        # should come up (reporting "unhealthy") rather than crash on import.
        print(f"✗ Failed to load artifacts: {e}")
        ARTIFACTS_LOADED = False


# Load on startup
load_artifacts()
| # ============================================================================ | |
| # REQUEST/RESPONSE MODELS | |
| # ============================================================================ | |
class SkillRecommendation(BaseModel):
    # A single recommended skill with its score and provenance.
    # (Kept as '#' comments so the generated API schema is unchanged.)
    skill: str
    # Combined score; the hybrid engine caps it at 1.0 before building this model.
    confidence: float = Field(..., ge=0, le=1, description="Confidence score 0-1")
    importance: str = Field(..., description="core, important, or nice-to-have")
    # Names of the sources that proposed the skill:
    # 'embedding', 'dataset' and/or 'collaborative'.
    sources: List[str] = Field(..., description="Recommendation sources")
    related_skills: List[str] = Field(default=[], description="Frequently co-occurring skills")
class RecommendationsRequest(BaseModel):
    # Request body for the skill-recommendation endpoint.
    # Free-text role name; fuzzy-matched against the known roles.
    job_role: str
    top_k: int = Field(default=DEFAULT_TOP_K, ge=1, le=MAX_TOP_K)
    # False -> only the dataset source is consulted.
    use_hybrid: bool = Field(default=True, description="Use hybrid recommendations")
    min_confidence: float = Field(default=DEFAULT_MIN_CONFIDENCE, ge=0, le=1, description="Minimum confidence threshold")
class RecommendationsResponse(BaseModel):
    # Response body for the skill-recommendation endpoint.
    input_role: str  # role text exactly as sent by the client
    matched_role: str  # known role selected by fuzzy matching
    match_confidence: float  # fuzzy-match score divided by 100
    total_recommendations: int  # len(recommendations)
    recommendations: List[SkillRecommendation]
    # Entry from ROLE_PROFILES for the matched role, or None when absent.
    role_profile: Optional[Dict] = None
class SkillGapRequest(BaseModel):
    # Request body for the skill-gap analysis endpoint.
    job_role: str  # free-text role name; fuzzy-matched against the known roles
    current_skills: List[str]  # skills the user already has, as free text
    top_k: int = Field(default=15, ge=5, le=50, description="Number of top skills to consider")
    use_hybrid: bool = Field(default=True)
    include_learning_path: bool = Field(default=True)
    min_confidence: float = Field(default=0.30, ge=0, le=1, description="Minimum confidence for required skills")
class SkillGapResponse(BaseModel):
    # Response body for the skill-gap analysis endpoint.
    input_role: str  # role text exactly as sent by the client
    matched_role: str  # known role selected by fuzzy matching
    total_required: int  # number of required skills considered
    matched_count: int
    # Counts ALL missing skills, while missing_skills below is capped at 10 entries.
    missing_count: int
    matched_skills: List[str]
    # Each entry has 'skill', 'confidence', 'importance' and 'related_skills'.
    missing_skills: List[Dict]
    skill_coverage: float  # matched_count / total_required, rounded to 3 places
    # Phased plan; None when not requested or when nothing is missing.
    learning_path: Optional[List[Dict]] = None
class SimilarRolesRequest(BaseModel):
    # Request body for the similar-roles endpoint.
    job_role: str  # free-text role name; fuzzy-matched against the known roles
    top_k: int = Field(default=5, ge=1, le=20)  # number of similar roles to return
class SimilarRolesResponse(BaseModel):
    # Response body for the similar-roles endpoint.
    input_role: str  # role text exactly as sent by the client
    matched_role: str  # known role selected by fuzzy matching
    # Each entry has 'role', 'similarity', 'shared_skills', 'total_skills'
    # and 'overlap_percentage'.
    similar_roles: List[Dict]
class RoleGapRequest(BaseModel):
    # Request body for the role-transition (role-gap) endpoint.
    # Both role names are free text and are fuzzy-matched against the known roles.
    current_role: str
    target_role: str
    include_transition_path: bool = Field(default=True, description="Include skill transition recommendations")
    top_k: int = Field(default=15, ge=5, le=30, description="Skills to consider for each role")
class RoleGapResponse(BaseModel):
    # Response body for the role-transition (role-gap) endpoint.
    # current_role / target_role hold the MATCHED role names, not the raw input.
    current_role: str
    target_role: str
    # Cosine similarity of the two role embeddings, rounded to 3 places.
    role_similarity: float = Field(..., description="How similar the roles are (0-1)")
    transferable_skills: List[str] = Field(..., description="Skills you already have that transfer")
    # Each entry has 'skill', 'confidence', 'importance' and 'related_skills'.
    skills_to_learn: List[Dict] = Field(..., description="New skills needed for target role")
    skills_to_deemphasize: List[str] = Field(..., description="Skills less relevant in target role")
    # The endpoint fills this with "<level> - <explanation>", not the bare level.
    difficulty_level: str = Field(..., description="easy/medium/hard transition")
    # None when not requested or when there is nothing to learn.
    transition_path: Optional[List[Dict]] = None
| # ============================================================================ | |
| # UTILITY FUNCTIONS | |
| # ============================================================================ | |
def normalize_text(text: str) -> str:
    """Return *text* as a lower-cased string without surrounding whitespace.

    Non-string input is converted with ``str()`` first.
    """
    lowered = str(text).lower()
    return lowered.strip()
def find_closest_role(role_input: str, min_score: int = None) -> Tuple[str, float]:
    """Resolve free-text *role_input* to the best-matching known role.

    Args:
        role_input: Role name as typed by the client.
        min_score: Lowest acceptable fuzzy score (0-100 scale);
            ROLE_MATCH_THRESHOLD is used when omitted.

    Returns:
        ``(role, confidence)`` where confidence is the fuzzy score / 100.

    Raises:
        HTTPException: 404 when there is no candidate at all, or when the
            best candidate scores below the threshold.
    """
    cutoff = ROLE_MATCH_THRESHOLD if min_score is None else min_score
    known_roles = list(JOB_ROLE_TO_IDX.keys())
    # Only the query is normalized; the known role names are compared as stored.
    best = process.extractOne(
        normalize_text(role_input),
        known_roles,
        scorer=fuzz.token_sort_ratio,
    )
    if not best:
        raise HTTPException(
            status_code=404,
            detail=f"Job role '{role_input}' not found",
        )
    role, score, _ = best
    if score < cutoff:
        raise HTTPException(
            status_code=404,
            detail=f"No close match found for '{role_input}' (best: {role}, score: {score})",
        )
    return role, score / 100.0
# Common skill synonyms/variations: normalized USER skill -> phrases which,
# when found inside a required skill's name, count as that user skill.
_SKILL_SYNONYMS = {
    'python': ['programming and coding', 'coding', 'programming'],
    'sql': ['data analytics', 'database', 'data querying'],
    'excel': ['spreadsheet applications', 'data analysis', 'spreadsheet'],
    'r': ['programming and coding', 'statistical programming'],
    'tableau': ['data visualization', 'data storytelling and visualisation', 'infographics and data visualisation'],
    'power bi': ['data visualization', 'business intelligence and data analytics'],
    'java': ['programming and coding', 'coding'],
    'javascript': ['programming and coding', 'web development'],
    'machine learning': ['data mining and modelling', 'ai', 'artificial intelligence'],
    'deep learning': ['data mining and modelling', 'neural networks'],
    'statistics': ['data analytics and computational modelling', 'statistical analysis'],
    'communication': ['stakeholder management', 'stakeholder engagement'],
    'leadership': ['project management', 'team management']
}


def match_user_skills(
    required_skills: List[str],
    user_skills: List[str],
    threshold: int = None
) -> Tuple[List[str], Dict[str, str]]:
    """Work out which required skills the user already covers.

    Each required skill is tried against the user's skills in three stages,
    stopping at the first hit: exact match on normalized text, then
    synonym/substring match, then fuzzy match.

    Args:
        required_skills: Skill names the role calls for.
        user_skills: Skill names supplied by the user.
        threshold: Minimum fuzzy score (0-100 scale) for stage three;
            SKILL_MATCH_THRESHOLD is used when omitted.

    Returns:
        ``(matched, mapping)``: the required skills that were matched, in
        input order, and a dict from each matched required skill to the
        user skill (original spelling) that satisfied it.
    """
    if threshold is None:
        threshold = SKILL_MATCH_THRESHOLD

    covered = []
    required_to_user = {}
    normalized_user = [normalize_text(s) for s in user_skills]

    for wanted in required_skills:
        wanted_norm = normalize_text(wanted)

        # Stage 1: exact match on normalized text (first occurrence wins).
        if wanted_norm in normalized_user:
            covered.append(wanted)
            required_to_user[wanted] = user_skills[normalized_user.index(wanted_norm)]
            continue

        # Stage 2: a synonym of the user skill occurs inside the required
        # skill, or the required skill occurs inside the user skill.
        # NOTE(review): both tests are plain substring checks, so a very
        # short required name (e.g. "r") matches any user skill containing
        # that letter - confirm this is intended.
        for original, candidate in zip(user_skills, normalized_user):
            phrases = _SKILL_SYNONYMS.get(candidate, [])
            if any(phrase in wanted_norm for phrase in phrases) or wanted_norm in candidate:
                covered.append(wanted)
                required_to_user[wanted] = original
                break
        else:
            # Stage 3: fuzzy match, reached only when stage 2 found nothing.
            fuzzy = process.extractOne(
                wanted_norm,
                normalized_user,
                scorer=fuzz.token_set_ratio
            )
            if fuzzy and fuzzy[1] >= threshold:
                covered.append(wanted)
                required_to_user[wanted] = user_skills[normalized_user.index(fuzzy[0])]

    return covered, required_to_user
| # ============================================================================ | |
| # RECOMMENDATION ENGINE | |
| # ============================================================================ | |
def get_embedding_recommendations(
    role: str,
    top_k: int = 50
) -> List[Tuple[str, float]]:
    """Rank skills by cosine similarity between the role and skill embeddings.

    Args:
        role: A key of JOB_ROLE_TO_IDX (already resolved by the caller).
        top_k: Requested size; ``top_k * CANDIDATE_MULTIPLIER`` candidates
            are returned so later stages have a larger pool.

    Returns:
        ``(skill, similarity)`` pairs, most similar first; an empty list
        when either embedding matrix has not been loaded.
    """
    if job_role_emb is None or skill_emb is None:
        return []

    query = job_role_emb[JOB_ROLE_TO_IDX[role]].reshape(1, -1)
    # One similarity per skill row.
    similarity = cosine_similarity(query, skill_emb)[0]

    pool_size = top_k * CANDIDATE_MULTIPLIER
    best_first = np.argsort(similarity)[::-1][:pool_size]

    ranked = []
    for skill_idx in best_first:
        ranked.append((IDX_TO_SKILL[skill_idx], float(similarity[skill_idx])))
    return ranked
def get_dataset_recommendations(
    role: str,
    top_k: int = 50
) -> List[Tuple[str, float]]:
    """Return the skills the dataset lists for *role* (ground truth).

    Only the first *top_k* entries of the role's skill list are used, in
    their stored order. A skill without an entry in ROLE_TO_SKILL_SCORES
    gets the default score 0.5.

    Returns:
        ``(skill, score)`` pairs, or an empty list for a role that is not
        in ROLE_TO_SKILLS.
    """
    try:
        listed = ROLE_TO_SKILLS[role]
    except KeyError:
        return []

    known_scores = ROLE_TO_SKILL_SCORES.get(role, {})
    return [(name, known_scores.get(name, 0.5)) for name in listed[:top_k]]
def get_collaborative_recommendations(
    role: str,
    top_k: int = 50
) -> List[Tuple[str, float]]:
    """Rank skills using the role's row of the collaborative matrix.

    The top ``top_k * CANDIDATE_MULTIPLIER`` entries of the row are taken,
    entries that are not strictly positive are dropped, and the rest are
    divided by the row's best score so the top candidate scores 1.0.

    Returns:
        ``(skill, normalized score)`` pairs, best first; an empty list when
        no collaborative matrix was loaded.
    """
    if collab_similarity is None:
        return []

    row = collab_similarity[JOB_ROLE_TO_IDX[role]]
    pool_size = top_k * CANDIDATE_MULTIPLIER
    best_first = np.argsort(row)[::-1][:pool_size]

    # Normalize against the best score in the row (1.0 for an empty row).
    peak = row[best_first[0]] if len(best_first) > 0 else 1.0

    ranked = []
    for skill_idx in best_first:
        # The positivity test runs before the division, so a non-positive
        # peak never reaches the divide.
        if row[skill_idx] > 0:
            ranked.append((IDX_TO_SKILL[skill_idx], float(row[skill_idx] / peak)))
    return ranked
def _add_source_scores(skill_scores, recs, source, weight, bonus=0.0):
    """Fold one source's (skill, score) pairs into the per-skill running totals."""
    for skill, score in recs:
        entry = skill_scores[skill]
        entry['scores'][source] = score
        if source not in entry['sources']:
            entry['sources'].append(source)
        entry['total'] += score * weight
        entry['total'] += bonus


def get_hybrid_recommendations(
    role: str,
    top_k: int = 50,
    weights: Dict[str, float] = None,
    min_confidence: float = None,
    prefer_dataset: bool = None
) -> List[SkillRecommendation]:
    """
    Get hybrid recommendations combining multiple sources.

    Each source (embedding, dataset, collaborative) contributes
    ``score * weight`` to a per-skill total; skills are then filtered,
    labelled with an importance level and sorted by confidence.

    Args:
        role: Job role name (a key of JOB_ROLE_TO_IDX).
        top_k: Number of recommendations to return.
        weights: Weights for each source (embedding, dataset, collaborative);
            RECOMMENDATION_WEIGHTS when omitted. A missing key counts as 0.
        min_confidence: Minimum combined score to include;
            MIN_CONFIDENCE_THRESHOLD when omitted.
        prefer_dataset: Add DATASET_BONUS to skills found in the dataset
            (ground truth); PREFER_DATASET_SKILLS when omitted.

    Returns:
        Up to *top_k* SkillRecommendation objects, highest confidence first.
    """
    # Use config defaults if not specified
    if weights is None:
        weights = RECOMMENDATION_WEIGHTS
    if min_confidence is None:
        min_confidence = MIN_CONFIDENCE_THRESHOLD
    if prefer_dataset is None:
        prefer_dataset = PREFER_DATASET_SKILLS

    # Get recommendations from all sources
    emb_recs = get_embedding_recommendations(role, top_k)
    dataset_recs = get_dataset_recommendations(role, top_k)
    collab_recs = get_collaborative_recommendations(role, top_k)

    # Track which skills are in dataset (ground truth)
    dataset_skills = {s for s, _ in dataset_recs}

    # Combine scores
    skill_scores = defaultdict(lambda: {'total': 0.0, 'sources': [], 'scores': {}})
    _add_source_scores(skill_scores, emb_recs, 'embedding', weights.get('embedding', 0.0))
    _add_source_scores(
        skill_scores, dataset_recs, 'dataset', weights.get('dataset', 0.0),
        bonus=DATASET_BONUS if prefer_dataset else 0.0,
    )
    _add_source_scores(skill_scores, collab_recs, 'collaborative', weights.get('collaborative', 0.0))

    # FILTER: Remove skills based on criteria
    filtered_skills = {}
    for skill, data in skill_scores.items():
        # Minimum-sources requirement, optionally bypassed for dataset skills.
        has_enough_sources = len(data['sources']) >= MIN_SOURCES_REQUIRED
        if DATASET_SKILLS_BYPASS_SOURCE_CHECK and 'dataset' in data['sources']:
            has_enough_sources = True
        if not has_enough_sources:
            continue
        # Minimum confidence: honour the caller's threshold. (Previously the
        # global MIN_CONFIDENCE_THRESHOLD was used here and the
        # min_confidence argument was silently ignored.)
        if data['total'] < min_confidence:
            continue
        # Apply custom filter if enabled
        if USE_CUSTOM_FILTER and custom_skill_filter:
            if not custom_skill_filter(skill, data['total'], data['sources'], role):
                continue
        filtered_skills[skill] = data

    # Determine importance level
    role_profile = ROLE_PROFILES.get(role, {})
    core_skills = set(role_profile.get('core_skills', []))
    nice_to_have = set(role_profile.get('nice_to_have', []))

    # Create recommendations
    recommendations = []
    for skill, data in filtered_skills.items():
        if skill in core_skills:
            importance = 'core'
        elif skill in nice_to_have:
            importance = 'nice-to-have'
        else:
            # If in dataset but not classified, it's important
            importance = 'important' if skill in dataset_skills else 'nice-to-have'

        # Get related skills from co-occurrence (five strongest)
        related_skills = []
        if skill in SKILL_COOCCURRENCE:
            related = sorted(
                SKILL_COOCCURRENCE[skill].items(),
                key=lambda x: x[1],
                reverse=True
            )[:5]
            related_skills = [s for s, _ in related]

        recommendations.append(SkillRecommendation(
            skill=skill,
            # Clamp into the model's 0-1 range: weights plus bonus can push
            # the total above 1, and a negative min_confidence could let a
            # negative total through.
            confidence=max(0.0, min(data['total'], 1.0)),
            importance=importance,
            sources=data['sources'],
            related_skills=related_skills
        ))

    # Sort by confidence and return top-k
    recommendations.sort(key=lambda x: x.confidence, reverse=True)
    return recommendations[:top_k]
| # ============================================================================ | |
| # API ENDPOINTS | |
| # ============================================================================ | |
# Registered explicitly: without a route decorator the function is never
# attached to ``app`` and the path it advertises does not exist.
@app.get("/")
def root():
    """API information: name, version, load status and an endpoint index."""
    return {
        "name": "Enhanced Job Role to Skill Recommendation API",
        "version": "2.0",
        "status": "running",
        "artifacts_loaded": ARTIFACTS_LOADED,
        "endpoints": {
            "/health": "Health check and system stats",
            "/recommendations": "Get skill recommendations for a role",
            "/skill-gap": "Analyze skill gaps for a role",
            "/role-gap": "Analyze transition from current role to target role",
            "/similar-roles": "Find similar job roles",
            "/roles": "List all available roles",
            "/skills": "Search skills",
            "/debug/role/{name}": "Debug role details",
            "/debug/match-skills": "Test skill matching"
        }
    }
# Registered explicitly: without a route decorator the function is never
# attached to ``app``.
@app.get("/health")
def health():
    """Health check with detailed stats.

    Reports "healthy" only when the artifacts loaded successfully, along
    with the sizes of the loaded data and the config stored in the mappings
    file. Works even when nothing loaded (counts are then 0).
    """
    return {
        "status": "healthy" if ARTIFACTS_LOADED else "unhealthy",
        "artifacts_loaded": ARTIFACTS_LOADED,
        "statistics": {
            "total_roles": len(JOB_ROLE_TO_IDX),
            "total_skills": len(IDX_TO_SKILL),
            "has_collaborative": collab_similarity is not None,
            "has_cooccurrence": len(SKILL_COOCCURRENCE) > 0,
            "embedding_dimension": int(skill_emb.shape[1]) if skill_emb is not None else 0
        },
        "config": CONFIG
    }
# Registered explicitly: without a route decorator the function is never
# attached to ``app``.
@app.post("/recommendations", response_model=RecommendationsResponse)
def get_recommendations(req: RecommendationsRequest):
    """
    Get skill recommendations for a job role.

    Uses hybrid algorithm combining:
    - Semantic embeddings
    - Historical data (dataset)
    - Collaborative filtering

    With ``use_hybrid=False`` only the dataset source is used.

    Raises:
        HTTPException: 503 when artifacts are not loaded; 404 (from
            find_closest_role) when the role cannot be matched.
    """
    if not ARTIFACTS_LOADED:
        raise HTTPException(status_code=503, detail="Service not ready")

    # Find closest matching role
    role, match_conf = find_closest_role(req.job_role)

    if req.use_hybrid:
        # Hybrid results never go below 0.20, whatever the client asked for.
        effective_min = max(req.min_confidence, 0.20)
        candidates = get_hybrid_recommendations(
            role,
            req.top_k * 2,  # Get more candidates
            min_confidence=effective_min
        )
        # The floor is re-checked here so this endpoint does not depend on
        # how the engine applies its min_confidence argument.
        # Keep skills backed by the dataset or by several sources;
        # single-source skills need a confidence of at least 0.35.
        kept = [
            rec for rec in candidates
            if rec.confidence >= effective_min
            and (
                len(rec.sources) >= 2
                or 'dataset' in rec.sources
                or rec.confidence >= 0.35
            )
        ]
        recommendations = kept[:req.top_k]
    else:
        # Use dataset only
        dataset_recs = get_dataset_recommendations(role, req.top_k)
        recommendations = [
            SkillRecommendation(
                skill=skill,
                # Clamp: the model rejects values outside 0-1, and stored
                # dataset scores are not guaranteed to be in that range.
                confidence=min(max(score, 0.0), 1.0),
                importance='important',
                sources=['dataset'],
                related_skills=[]
            )
            for skill, score in dataset_recs
            if score >= req.min_confidence
        ][:req.top_k]

    return RecommendationsResponse(
        input_role=req.job_role,
        matched_role=role,
        match_confidence=match_conf,
        total_recommendations=len(recommendations),
        recommendations=recommendations,
        role_profile=ROLE_PROFILES.get(role)
    )
def _build_learning_path(missing_details: List[Dict]) -> List[Dict]:
    """Group prioritized missing skills into at most two learning phases."""
    core_missing = [s['skill'] for s in missing_details if s['importance'] == 'core']
    important_missing = [s['skill'] for s in missing_details if s['importance'] == 'important']

    path = []
    # Foundation: Max 3 core skills
    if core_missing:
        path.append({
            'phase': 'Foundation',
            'priority': 'high',
            'skills': core_missing[:3],
            'description': 'Essential skills to acquire first',
            'estimated_time': '2-3 months'
        })
    # Development: Max 4 important skills
    if important_missing:
        path.append({
            'phase': 'Development',
            'priority': 'medium',
            'skills': important_missing[:4],
            'description': 'Build core competency in these areas',
            'estimated_time': '3-6 months'
        })
    return path


# Registered explicitly: without a route decorator the function is never
# attached to ``app``.
@app.post("/skill-gap", response_model=SkillGapResponse)
def analyze_skill_gap(req: SkillGapRequest):
    """
    Analyze skill gaps between current and required skills.

    Provides:
    - Matched skills (fuzzy matching)
    - Missing skills with priorities (core first), at most 10 listed
    - Focused learning path (not overwhelming)

    Raises:
        HTTPException: 503 when artifacts are not loaded; 404 when the role
            cannot be matched or no required skill passes the filters.
    """
    if not ARTIFACTS_LOADED:
        raise HTTPException(status_code=503, detail="Service not ready")

    # Find closest role
    role, _ = find_closest_role(req.job_role)

    # Get required skills with STRICT filtering
    if req.use_hybrid:
        # Get more candidates initially
        all_recs = get_hybrid_recommendations(
            role,
            top_k=req.top_k * 2,
            min_confidence=req.min_confidence
        )
        # min_confidence is re-checked here so the request's threshold is
        # honoured regardless of how the engine applies its argument.
        # Beyond that, a skill must be in the dataset OR have a confidence
        # above 0.40.
        required_recs = [
            rec for rec in all_recs
            if rec.confidence >= req.min_confidence
            and ('dataset' in rec.sources or rec.confidence > 0.40)
        ][:req.top_k]
        required_skills = [r.skill for r in required_recs]
        skill_info = {r.skill: r for r in required_recs}
    else:
        # Dataset only
        dataset_recs = get_dataset_recommendations(role, req.top_k)
        required_skills = [s for s, score in dataset_recs if score >= req.min_confidence]
        skill_info = {}

    if not required_skills:
        raise HTTPException(
            status_code=404,
            detail=f"No high-confidence skills found for {role}. Try lowering min_confidence."
        )

    # Match user skills with better fuzzy matching
    matched, _skill_mapping = match_user_skills(
        required_skills,
        req.current_skills,
        threshold=75  # More lenient matching
    )

    # Identify missing skills (set lookup instead of scanning the list each time)
    matched_lookup = set(matched)
    missing = [s for s in required_skills if s not in matched_lookup]

    # Create detailed missing skills list (prioritized)
    missing_details = []
    for skill in missing:
        info = skill_info.get(skill)
        if info:
            missing_details.append({
                'skill': skill,
                'confidence': round(info.confidence, 3),
                'importance': info.importance,
                'related_skills': info.related_skills[:3]  # Only top 3
            })
        else:
            # Fallback for non-hybrid mode
            missing_details.append({
                'skill': skill,
                'confidence': 0.5,
                'importance': 'important',
                'related_skills': []
            })

    # Sort by importance then confidence
    importance_order = {'core': 0, 'important': 1, 'nice-to-have': 2}
    missing_details.sort(
        key=lambda x: (importance_order.get(x['importance'], 1), -x['confidence'])
    )
    # LIMIT output: max 10 missing skills shown (missing_count still counts all)
    missing_details = missing_details[:10]

    # Calculate coverage (required_skills is non-empty here)
    coverage = len(matched) / len(required_skills)

    # Generate FOCUSED learning path
    learning_path = None
    if req.include_learning_path and missing_details:
        learning_path = _build_learning_path(missing_details)

    return SkillGapResponse(
        input_role=req.job_role,
        matched_role=role,
        total_required=len(required_skills),
        matched_count=len(matched),
        missing_count=len(missing),
        matched_skills=matched,
        missing_skills=missing_details,
        skill_coverage=round(coverage, 3),
        learning_path=learning_path
    )
# Registered explicitly: without a route decorator the function is never
# attached to ``app``.
@app.post("/similar-roles", response_model=SimilarRolesResponse)
def find_similar_roles(req: SimilarRolesRequest):
    """Find the roles whose embeddings are closest to the requested role.

    Roles are ranked by cosine similarity of their embeddings; each result
    also reports how many dataset skills it shares with the requested role.

    Raises:
        HTTPException: 503 when artifacts are not loaded; 404 (from
            find_closest_role) when the role cannot be matched.
    """
    if not ARTIFACTS_LOADED:
        raise HTTPException(status_code=503, detail="Service not ready")

    # Find input role
    role, _ = find_closest_role(req.job_role)
    role_idx = JOB_ROLE_TO_IDX[role]

    # Compute similarity to all roles
    role_vec = job_role_emb[role_idx].reshape(1, -1)
    similarities = cosine_similarity(role_vec, job_role_emb)[0]

    # Get top similar, excluding the role itself BY INDEX. Dropping the
    # first sorted entry is not safe: with tied similarities or an all-zero
    # embedding the role itself is not guaranteed to sort first.
    ranked = np.argsort(similarities)[::-1]
    top_indices = [int(i) for i in ranked if int(i) != role_idx][:req.top_k]

    # The requested role's skills are the same for every comparison.
    role_skills = set(ROLE_TO_SKILLS.get(role, []))

    similar_roles = []
    for idx in top_indices:
        similar_role = IDX_TO_ROLE[idx]
        similar_skills = set(ROLE_TO_SKILLS.get(similar_role, []))
        overlap = role_skills & similar_skills
        similar_roles.append({
            'role': similar_role,
            'similarity': round(float(similarities[idx]), 3),
            'shared_skills': len(overlap),
            'total_skills': len(similar_skills),
            # Share of the REQUESTED role's skills also listed for the other role.
            'overlap_percentage': round(len(overlap) / len(role_skills) * 100, 1) if role_skills else 0
        })

    return SimilarRolesResponse(
        input_role=req.job_role,
        matched_role=role,
        similar_roles=similar_roles
    )
def _build_transition_path(transferable: List[str], skills_to_learn: List[Dict]) -> List[Dict]:
    """Lay out the phased plan for moving into the target role."""
    path = []
    # Phase 1: Leverage transferable skills
    if transferable:
        path.append({
            'phase': 'Leverage Current Strengths',
            'duration': '1-2 weeks',
            'description': 'Focus on these skills you already have',
            'skills': transferable[:5],
            'action': 'Highlight these in resume and interviews'
        })
    # Phase 2: Core new skills
    core_to_learn = [s['skill'] for s in skills_to_learn if s['importance'] == 'core']
    if core_to_learn:
        path.append({
            'phase': 'Build Core Competencies',
            'duration': '2-4 months',
            'description': 'Essential skills for the target role',
            'skills': core_to_learn[:4],
            'action': 'Take courses, build projects, get certifications'
        })
    # Phase 3: Important skills
    important_to_learn = [s['skill'] for s in skills_to_learn if s['importance'] == 'important']
    if important_to_learn:
        path.append({
            'phase': 'Expand Capabilities',
            'duration': '2-3 months',
            'description': 'Important skills to be competitive',
            'skills': important_to_learn[:4],
            'action': 'Apply in side projects, volunteer work, or current role'
        })
    # Phase 4: Apply (always present)
    path.append({
        'phase': 'Transition & Apply',
        'duration': '1-2 months',
        'description': 'Start applying and interviewing',
        'skills': transferable[:3],
        'action': 'Update resume, network, apply for target roles'
    })
    return path


# Registered explicitly: without a route decorator the function is never
# attached to ``app``.
@app.post("/role-gap", response_model=RoleGapResponse)
def analyze_role_gap(req: RoleGapRequest):
    """
    Analyze the gap between current role and target role.

    Provides:
    - Role similarity score
    - Transferable skills (already have)
    - Skills to learn (need to acquire)
    - Skills to deemphasize (less important)
    - Transition difficulty assessment
    - Step-by-step transition path

    All skill lists are ordered by confidence, so the same request returns
    the same answer on every run (they were previously taken from set
    iteration order, which differs between processes).

    Raises:
        HTTPException: 503 when artifacts are not loaded; 404 when either
            role cannot be matched; 400 when both resolve to the same role.
    """
    if not ARTIFACTS_LOADED:
        raise HTTPException(status_code=503, detail="Service not ready")

    # Find both roles
    current_role, _ = find_closest_role(req.current_role)
    target_role, _ = find_closest_role(req.target_role)
    if current_role == target_role:
        raise HTTPException(
            status_code=400,
            detail="Current and target roles are the same. No transition needed."
        )

    # Get embeddings for similarity
    current_vec = job_role_emb[JOB_ROLE_TO_IDX[current_role]].reshape(1, -1)
    target_vec = job_role_emb[JOB_ROLE_TO_IDX[target_role]].reshape(1, -1)
    role_similarity = float(cosine_similarity(current_vec, target_vec)[0][0])

    # Get skills for both roles. The 0.25 floor is re-checked here so it
    # holds regardless of how the engine applies its min_confidence argument.
    min_conf = 0.25
    current_recs = [
        r for r in get_hybrid_recommendations(current_role, top_k=req.top_k, min_confidence=min_conf)
        if r.confidence >= min_conf
    ]
    target_recs = [
        r for r in get_hybrid_recommendations(target_role, top_k=req.top_k, min_confidence=min_conf)
        if r.confidence >= min_conf
    ]
    current_skill_names = {r.skill for r in current_recs}
    target_skill_names = {r.skill for r in target_recs}

    # Transferable skills (in both roles), most relevant to the target first
    transferable = [r.skill for r in target_recs if r.skill in current_skill_names]

    # Skills to learn (in target but not current)
    skills_to_learn = [
        {
            'skill': r.skill,
            'confidence': round(r.confidence, 3),
            'importance': r.importance,
            'related_skills': r.related_skills[:3]
        }
        for r in target_recs
        if r.skill not in current_skill_names
    ]
    # Sort by importance and confidence (stable, so ties keep engine order)
    importance_order = {'core': 0, 'important': 1, 'nice-to-have': 2}
    skills_to_learn.sort(
        key=lambda x: (importance_order.get(x['importance'], 1), -x['confidence'])
    )
    # Limit to top 10
    skills_to_learn = skills_to_learn[:10]

    # Skills to deemphasize (in current but not target): the five the
    # current role weights most heavily
    to_deemphasize = [
        r.skill for r in current_recs if r.skill not in target_skill_names
    ][:5]

    # Determine difficulty level
    overlap_pct = len(transferable) / len(target_skill_names) if target_skill_names else 0
    if overlap_pct >= 0.7 or role_similarity >= 0.85:
        difficulty = "easy"
        difficulty_desc = "High skill overlap - smooth transition"
    elif overlap_pct >= 0.4 or role_similarity >= 0.70:
        difficulty = "medium"
        difficulty_desc = "Moderate overlap - some new skills needed"
    else:
        difficulty = "hard"
        difficulty_desc = "Low overlap - significant reskilling required"

    # Generate transition path
    transition_path = None
    if req.include_transition_path and skills_to_learn:
        transition_path = _build_transition_path(transferable, skills_to_learn)

    return RoleGapResponse(
        current_role=current_role,
        target_role=target_role,
        role_similarity=round(role_similarity, 3),
        transferable_skills=transferable,
        skills_to_learn=skills_to_learn,
        skills_to_deemphasize=to_deemphasize,
        difficulty_level=f"{difficulty} - {difficulty_desc}",
        transition_path=transition_path
    )
def list_roles(
    search: Optional[str] = Query(None, description="Search query"),
    limit: int = Query(50, ge=1, le=500)
):
    """List available job roles, optionally filtered by a fuzzy search.

    Args:
        search: Optional free-text query. When provided (and non-empty), roles
            are ranked by fuzzy similarity to the normalized query and matches
            scoring below 60 are dropped. Otherwise roles are returned in
            sorted order.
        limit: Maximum number of roles to return.

    Returns:
        A dict with ``total`` (number of known roles), ``returned`` (number of
        entries in this response) and ``roles``. Every entry has ``role`` and
        ``total_skills``; search results additionally carry ``match_score``,
        the fuzzy score divided by 100.

    Raises:
        HTTPException: 503 when the artifacts have not been loaded.
    """
    # Fail fast with a 503 instead of serving from lookup tables that were
    # never populated.
    if not ARTIFACTS_LOADED:
        raise HTTPException(status_code=503, detail="Service not ready")

    roles = list(JOB_ROLE_TO_IDX)

    if search:
        # Let rapidfuzz discard weak matches itself via score_cutoff rather
        # than filtering the returned list afterwards.
        matches = process.extract(
            normalize_text(search),
            roles,
            scorer=fuzz.token_sort_ratio,
            limit=limit,
            score_cutoff=60,
        )
        results = [
            {
                'role': role,
                'match_score': score / 100.0,
                'total_skills': len(ROLE_TO_SKILLS.get(role, [])),
            }
            for role, score, _ in matches
        ]
    else:
        # No query: return the first `limit` roles in sorted order
        results = [
            {
                'role': role,
                'total_skills': len(ROLE_TO_SKILLS.get(role, [])),
            }
            for role in sorted(roles)[:limit]
        ]

    return {
        'total': len(roles),
        'returned': len(results),
        'roles': results,
    }
def search_skills(
    search: str = Query(..., min_length=2, description="Search query"),
    limit: int = Query(20, ge=1, le=100)
):
    """Fuzzy-search the known skills.

    Args:
        search: Free-text query (at least 2 characters); it is normalized
            before being compared against the skill names.
        limit: Maximum number of skills to return.

    Returns:
        A dict with the original ``query``, ``total_results`` and ``skills``.
        Each skill entry has ``skill``, ``match_score`` (the fuzzy score
        divided by 100) and ``used_in_roles`` (how many entries of
        ``ROLE_TO_SKILLS`` contain the skill).

    Raises:
        HTTPException: 503 when the artifacts have not been loaded.
    """
    # Fail fast with a 503 instead of serving from lookup tables that were
    # never populated.
    if not ARTIFACTS_LOADED:
        raise HTTPException(status_code=503, detail="Service not ready")

    # score_cutoff makes rapidfuzz drop matches scoring below 60, so no
    # post-filtering of the result list is needed.
    matches = process.extract(
        normalize_text(search),
        list(SKILL_TO_IDX),
        scorer=fuzz.token_sort_ratio,
        limit=limit,
        score_cutoff=60,
    )

    results = []
    for skill, score, _ in matches:
        # Count how many roles list this skill
        role_count = sum(
            1 for role_skills in ROLE_TO_SKILLS.values()
            if skill in role_skills
        )
        results.append({
            'skill': skill,
            'match_score': score / 100.0,
            'used_in_roles': role_count,
        })

    return {
        'query': search,
        'total_results': len(results),
        'skills': results,
    }
def debug_role(role_name: str):
    """Debug endpoint to inspect how a role name resolves and what backs it.

    Args:
        role_name: Free-text role name; it is resolved with
            ``find_closest_role``.

    Returns:
        A dict with the input, the matched role and its match confidence, the
        role's index, min/max/mean/std of the cosine similarity between the
        role embedding and every skill embedding, flags describing which
        dataset tables contain the role, and the role's profile (``None`` if
        it has none).

    Raises:
        HTTPException: 503 when the artifacts have not been loaded; 404 when
            the resolved role is not present in the role index.
    """
    if not ARTIFACTS_LOADED:
        raise HTTPException(status_code=503, detail="Service not ready")

    role, confidence = find_closest_role(role_name)

    # NOTE(review): find_closest_role's no-match behaviour is not visible from
    # here. Guard the lookup so an unresolved role becomes a 404 instead of an
    # unhandled KeyError.
    role_idx = JOB_ROLE_TO_IDX.get(role)
    if role_idx is None:
        raise HTTPException(
            status_code=404,
            detail=f"Role not found: {role_name}"
        )

    # Similarity of this role's embedding against every skill embedding;
    # cosine_similarity needs a 2-D input, hence the reshape to one row.
    role_vec = job_role_emb[role_idx].reshape(1, -1)
    sims = cosine_similarity(role_vec, skill_emb)[0]

    return {
        'input': role_name,
        'matched_role': role,
        'match_confidence': confidence,
        'role_index': role_idx,
        'embedding_stats': {
            # Cast NumPy scalars to plain floats for the response
            'min_similarity': float(np.min(sims)),
            'max_similarity': float(np.max(sims)),
            'mean_similarity': float(np.mean(sims)),
            'std_similarity': float(np.std(sims)),
        },
        'dataset_info': {
            'total_skills': len(ROLE_TO_SKILLS.get(role, [])),
            'has_scores': role in ROLE_TO_SKILL_SCORES,
            'has_profile': role in ROLE_PROFILES,
        },
        'profile': ROLE_PROFILES.get(role),
    }
def debug_match_skills(
    job_role: str,
    current_skills: List[str]
):
    """Debug endpoint showing how user skills match a role's requirements.

    Helps understand why skills are or are not matching.

    Args:
        job_role: Free-text role name; it is resolved with
            ``find_closest_role``.
        current_skills: The skills the user claims to have.

    Returns:
        A dict with the resolved ``role``, counts of required / provided /
        matched skills, per-skill ``match_details`` for the first 15 required
        skills, and a static list of ``suggestions``. Matched entries report
        the user skill and whether the match was ``exact`` or ``synonym``;
        unmatched entries report the closest user skill, its similarity score
        and ``SKILL_MATCH_THRESHOLD``.

    Raises:
        HTTPException: 503 when the artifacts have not been loaded.
    """
    if not ARTIFACTS_LOADED:
        raise HTTPException(status_code=503, detail="Service not ready")

    role, _ = find_closest_role(job_role)

    # Required skills come from the hybrid recommender
    recs = get_hybrid_recommendations(role, top_k=20, min_confidence=0.20)
    required_skills = [r.skill for r in recs]

    matched, skill_mapping = match_user_skills(required_skills, current_skills)

    # Normalize the user's skills once; the list is the same for every
    # unmatched requirement, so there is no need to rebuild it in the loop.
    normalized_user_skills = [normalize_text(s) for s in current_skills]

    match_details = []
    for req_skill in required_skills[:15]:  # Top 15
        if req_skill in matched:
            user_skill = skill_mapping.get(req_skill)
            # Compare case-insensitively on BOTH sides (lowercasing only the
            # user skill mislabels exact matches whenever the required skill
            # contains capitals), and tolerate a missing/None mapping entry.
            is_exact = (
                user_skill is not None
                and user_skill.strip().lower() == req_skill.strip().lower()
            )
            match_details.append({
                'required_skill': req_skill,
                'matched': True,
                'user_skill': user_skill,
                'match_type': 'exact' if is_exact else 'synonym'
            })
        else:
            # Report the closest user skill even when it is below threshold.
            # extractOne returns (choice, score, index), or None when there
            # are no choices; the index points back into current_skills.
            best = process.extractOne(
                normalize_text(req_skill),
                normalized_user_skills,
                scorer=fuzz.token_set_ratio
            )
            match_details.append({
                'required_skill': req_skill,
                'matched': False,
                'closest_user_skill': current_skills[best[2]] if best else None,
                'similarity_score': best[1] if best else 0,
                'threshold': SKILL_MATCH_THRESHOLD
            })

    return {
        'role': role,
        'total_required': len(required_skills),
        'user_provided': len(current_skills),
        'matched_count': len(matched),
        'match_details': match_details,
        'suggestions': [
            'Try variations like "Programming" instead of "Python"',
            'Use broader terms like "Data Analysis" instead of "Excel"',
            'Check spelling and exact phrasing'
        ]
    }
| # ============================================================================ | |
| # STARTUP/SHUTDOWN | |
| # ============================================================================ | |
async def startup_event():
    """Print the startup banner and report whether artifacts are available."""
    rule = "=" * 60
    for banner_line in (rule, "Enhanced Job Role to Skill Recommendation API", rule):
        print(banner_line)

    # Nothing more to report when the artifacts failed to load
    if not ARTIFACTS_LOADED:
        print("✗ Artifacts not loaded - service unavailable")
        return

    print(f"✓ Ready with {len(JOB_ROLE_TO_IDX)} roles and {len(IDX_TO_SKILL)} skills")
async def shutdown_event():
    """Announce on stdout that the service is stopping."""
    farewell = "Shutting down..."
    print(farewell)