"""Match a user's skills to job titles using TF-IDF cosine similarity."""

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


class SkillMapper:
    """Rank job titles by how closely their required skills match a query.

    The reference data is read from a CSV file that must provide at least
    the ``skills_required`` and ``job_title`` columns. Rows containing any
    missing value are dropped before the TF-IDF model is fitted.
    """

    DEFAULT_CSV_PATH = "job_titles_classification_extended.csv"

    def __init__(self, csv_path=DEFAULT_CSV_PATH):
        """Load the job data and fit the TF-IDF model on the skills column.

        Args:
            csv_path: Path of the CSV file to load. Defaults to the
                file name the class has always used, so ``SkillMapper()``
                keeps working unchanged.
        """
        self.df = pd.read_csv(csv_path).dropna()

        # Lowercase once here so queries (also lowercased) compare equally.
        self.df["skills_required"] = self.df["skills_required"].str.lower()

        # Unigrams and bigrams, so multi-word skills contribute as a unit.
        self.vectorizer = TfidfVectorizer(ngram_range=(1, 2))
        self.skill_matrix = self.vectorizer.fit_transform(
            self.df["skills_required"]
        )

    def predict_top_roles(self, skills_list, top_n=3):
        """Return the best-matching job titles with their similarity scores.

        Args:
            skills_list: Iterable of skill strings. A single string is
                accepted and treated as one skill.
            top_n: Maximum number of results to return.

        Returns:
            A list of ``(job_title, score)`` tuples ordered from the highest
            to the lowest cosine similarity, with scores rounded to three
            decimals. The list is empty when ``skills_list`` is empty or
            ``top_n`` is not positive.
        """
        # Guard top_n explicitly: a ``[-top_n:]`` slice with top_n == 0 is
        # ``[-0:]`` and would select every row instead of none.
        if not skills_list or top_n <= 0:
            return []

        # Joining a bare string would split it into single characters,
        # which the vectorizer's tokenizer then discards entirely.
        if isinstance(skills_list, str):
            skills_list = [skills_list]

        skills_text = " ".join(skills_list).lower()
        user_vec = self.vectorizer.transform([skills_text])
        similarities = cosine_similarity(user_vec, self.skill_matrix)[0]

        # Descending order of similarity, truncated to the requested count.
        top_indices = similarities.argsort()[::-1][:top_n]

        # ``iloc`` is positional, matching the row order of skill_matrix
        # even though dropna() may have left gaps in the index labels.
        return [
            (self.df.iloc[idx]["job_title"], round(float(similarities[idx]), 3))
            for idx in top_indices
        ]

    def predict_role(self, skills_list):
        """Return the single best-matching ``(job_title, score)`` tuple.

        Falls back to ``("Unknown Job Title", 0.0)`` when no candidate is
        available, e.g. for an empty ``skills_list``.
        """
        top_roles = self.predict_top_roles(skills_list, top_n=1)
        if top_roles:
            return top_roles[0]
        return ("Unknown Job Title", 0.0)