# Step 2: train an ATS-score regressor on similarity features derived from
# the TF-IDF vectorizer produced in Step 1 (train_model.py).
| import pandas as pd | |
| import pickle | |
| import numpy as np | |
| from datasets import load_dataset | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| from sklearn.ensemble import GradientBoostingRegressor | |
| from sklearn.model_selection import train_test_split | |
| import re | |
| import time | |
def train_ats_scorer():
    """Train and persist a GradientBoostingRegressor that predicts ATS scores.

    Pipeline:
      1. Load the TF-IDF vectorizer produced by train_model.py ('tfidf.pkl').
      2. Load the 0xnbk/resume-ats-score-v1-en dataset.
      3. Build one "prototype" document per label by concatenating resumes,
         simulating a job description for that category.
      4. Derive two features per resume: cosine similarity to its category
         prototype, and the fraction of prototype keywords it contains.
      5. Fit the regressor on those features and save it to 'ats_scorer.pkl'.

    Raises:
        SystemExit: with a non-zero code if 'tfidf.pkl' is missing or the
            dataset fails to download/load.
    """
    # 1. Load dependencies (fail fast if Step 1 has not been run).
    print("Loading TF-IDF Vectorizer (from Step 1)...")
    try:
        # Context manager closes the handle (the original
        # pickle.load(open(...)) leaked it).
        with open('tfidf.pkl', 'rb') as f:
            tfidf = pickle.load(f)
    except FileNotFoundError:
        print("ERROR: 'tfidf.pkl' not found. Run 'train_model.py' first!")
        # exit() would report success (code 0); signal the failure instead.
        raise SystemExit(1)

    # 2. Load ATS dataset (0xnbk).
    print("Loading 0xnbk/resume-ats-score-v1-en...")
    try:
        ds = load_dataset("0xnbk/resume-ats-score-v1-en")
        df = pd.DataFrame(ds['train'])
        print(f"Loaded {len(df)} rows.")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        raise SystemExit(1)

    # 3. Pre-process: coerce scores to numeric, drop unusable rows.
    res_col = 'text'
    score_col = 'ats_score'
    cat_col = 'original_label'
    df[score_col] = pd.to_numeric(df[score_col], errors='coerce')
    df.dropna(subset=[score_col, res_col], inplace=True)

    # 4. Generate training prototypes: group resumes by label to simulate
    # "job descriptions".
    print("Generating Training Prototypes...")
    train_prototypes = df.groupby(cat_col)[res_col].apply(' '.join).to_dict()

    # Optimization: pre-compute each prototype's TF-IDF vector and token set
    # once, outside the per-row loop below.
    print("Pre-calculating vectors...")
    word_re = re.compile(r'\w+')  # hoisted: compiled once, reused per row
    proto_vectors = {}
    proto_tokens = {}
    for cat, text in train_prototypes.items():
        proto_vectors[cat] = tfidf.transform([text])
        proto_tokens[cat] = set(word_re.findall(text.lower()))

    # 5. Feature engineering: similarity + keyword overlap per resume.
    print("Calculating features...")
    cosine_sims = []
    keyword_matches = []
    for row in df.itertuples():
        text = str(getattr(row, res_col))
        cat = getattr(row, cat_col)
        if cat in proto_vectors:
            # Feature 1: cosine similarity to the category prototype.
            vec = tfidf.transform([text])
            sim = cosine_similarity(vec, proto_vectors[cat])[0][0]
            # Feature 2: fraction of prototype keywords found in the resume.
            tokens = set(word_re.findall(text.lower()))
            target_tokens = proto_tokens[cat]
            match = (len(tokens & target_tokens) / len(target_tokens)
                     if target_tokens else 0)
        else:
            # Unknown category: no prototype to compare against.
            sim = 0
            match = 0
        cosine_sims.append(sim)
        keyword_matches.append(match)
    df['cosine_sim'] = cosine_sims
    df['keyword_match'] = keyword_matches

    # 6. Train the regressor on the two engineered features.
    print("Training ATS Regressor...")
    X = df[['cosine_sim', 'keyword_match']]
    y = df[score_col]
    reg = GradientBoostingRegressor()
    reg.fit(X, y)

    # 7. Persist the trained model (with-block fixes the leaked handle).
    with open('ats_scorer.pkl', 'wb') as f:
        pickle.dump(reg, f)
    print("SUCCESS: 'ats_scorer.pkl' saved.")
| if __name__ == "__main__": | |
| train_ats_scorer() |