import pandas as pd
import pickle
import numpy as np
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
import re
import time
def train_ats_scorer():
    """Train and persist a GradientBoostingRegressor ATS scorer.

    Pipeline:
      1. Load the TF-IDF vectorizer produced by the earlier training step.
      2. Load the 0xnbk/resume-ats-score-v1-en dataset into a DataFrame.
      3. Coerce scores to numeric and drop rows missing score or text.
      4. Build one "job description" prototype per category by joining
         all resumes that share a label.
      5. Derive two features per resume (cosine similarity to its category
         prototype, keyword-overlap ratio) and fit the regressor on them.
      6. Pickle the fitted model to 'ats_scorer.pkl'.

    Raises:
        SystemExit: if the vectorizer pickle or the dataset cannot be loaded.
    """
    # 1. Load dependencies. The vectorizer must come from the prior step so
    # that training and inference share the exact same vocabulary.
    print("Loading TF-IDF Vectorizer (from Step 1)...")
    try:
        # NOTE(review): unpickling an untrusted file executes arbitrary code;
        # acceptable here only because 'tfidf.pkl' is produced locally.
        with open('tfidf.pkl', 'rb') as fh:  # context manager: no leaked handle
            tfidf = pickle.load(fh)
    except FileNotFoundError:
        print("ERROR: 'tfidf.pkl' not found. Run 'train_model.py' first!")
        raise SystemExit(1)  # nonzero status so callers can detect failure

    # 2. Load ATS dataset (0xnbk).
    print("Loading 0xnbk/resume-ats-score-v1-en...")
    try:
        ds = load_dataset("0xnbk/resume-ats-score-v1-en")
        df = pd.DataFrame(ds['train'])
        print(f"Loaded {len(df)} rows.")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        raise SystemExit(1)

    # 3. Pre-process: non-numeric scores become NaN and are dropped together
    # with rows that have no resume text.
    res_col = 'text'
    score_col = 'ats_score'
    cat_col = 'original_label'
    df[score_col] = pd.to_numeric(df[score_col], errors='coerce')
    df.dropna(subset=[score_col, res_col], inplace=True)

    # 4. Generate training prototypes: concatenating all resumes of a label
    # simulates a "job description" for that category.
    print("Generating Training Prototypes...")
    train_prototypes = df.groupby(cat_col)[res_col].apply(' '.join).to_dict()

    # Optimization: pre-compute each prototype's TF-IDF vector and token set
    # once instead of re-deriving them for every resume row.
    print("Pre-calculating vectors...")
    token_pattern = re.compile(r'\w+')  # hoisted: compiled once, reused in loop
    proto_vectors = {}
    proto_tokens = {}
    for cat, text in train_prototypes.items():
        proto_vectors[cat] = tfidf.transform([text])
        proto_tokens[cat] = set(token_pattern.findall(text.lower()))

    # 5. Feature engineering: two scalar features per resume.
    print("Calculating features...")
    cosine_sims = []
    keyword_matches = []
    for row in df.itertuples():  # enumerate index was unused — dropped
        text = str(getattr(row, res_col))
        cat = getattr(row, cat_col)
        if cat in proto_vectors:
            # Feature 1: cosine similarity to the category prototype.
            vec = tfidf.transform([text])
            sim = cosine_similarity(vec, proto_vectors[cat])[0][0]
            # Feature 2: fraction of prototype keywords present in the resume.
            tokens = set(token_pattern.findall(text.lower()))
            target_tokens = proto_tokens[cat]
            match = (len(tokens & target_tokens) / len(target_tokens)
                     if target_tokens else 0)
        else:
            # Unknown category: no prototype to compare against.
            sim = 0
            match = 0
        cosine_sims.append(sim)
        keyword_matches.append(match)
    df['cosine_sim'] = cosine_sims
    df['keyword_match'] = keyword_matches

    # 6. Train regressor on the two engineered features only.
    print("Training ATS Regressor...")
    X = df[['cosine_sim', 'keyword_match']]
    y = df[score_col]
    reg = GradientBoostingRegressor()
    reg.fit(X, y)

    # 7. Persist the fitted model (file handle closed by the context manager).
    with open('ats_scorer.pkl', 'wb') as fh:
        pickle.dump(reg, fh)
    print("SUCCESS: 'ats_scorer.pkl' saved.")
# Script entry point: train and save the ATS scorer when run directly.
# (Fixes the garbled trailing " |" residue and restores the required indent.)
if __name__ == "__main__":
    train_ats_scorer()