# resume-analyzer-app / train_ats_model.py
# (Hugging Face upload metadata: user SoS13, "Upload 10 files", commit 74f28d3 verified)
import pandas as pd
import pickle
import numpy as np
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
import re
import time
def train_ats_scorer():
    """Train and save a GradientBoostingRegressor that predicts ATS scores.

    Pipeline:
      1. Load the TF-IDF vectorizer produced by train_model.py ('tfidf.pkl').
      2. Download the 0xnbk/resume-ats-score-v1-en dataset.
      3. Clean rows (numeric score, non-null resume text).
      4. Build one "job description" prototype per category label by
         concatenating that category's resumes.
      5. Derive two features per resume against its category prototype:
         cosine similarity and keyword-overlap fraction.
      6. Fit the regressor on those features and pickle it.

    Returns:
        None. Side effects: writes 'ats_scorer.pkl' on success; prints
        progress/error messages and returns early on failure (the original
        called exit(), which would kill a host interpreter on import/use).
    """
    # 1. Load Dependencies
    print("Loading TF-IDF Vectorizer (from Step 1)...")
    try:
        # Context manager closes the handle (pickle.load(open(...)) leaked it).
        with open('tfidf.pkl', 'rb') as f:
            tfidf = pickle.load(f)
    except FileNotFoundError:
        print("ERROR: 'tfidf.pkl' not found. Run 'train_model.py' first!")
        return

    # 2. Load ATS Dataset (0xnbk)
    print("Loading 0xnbk/resume-ats-score-v1-en...")
    try:
        ds = load_dataset("0xnbk/resume-ats-score-v1-en")
        df = pd.DataFrame(ds['train'])
        print(f"Loaded {len(df)} rows.")
    except Exception as e:
        # Best-effort reporting of network/dataset errors, then bail out.
        print(f"Error loading dataset: {e}")
        return

    # 3. Pre-Process
    res_col = 'text'            # resume body
    score_col = 'ats_score'     # regression target
    cat_col = 'original_label'  # category label used to build prototypes
    df[score_col] = pd.to_numeric(df[score_col], errors='coerce')
    df.dropna(subset=[score_col, res_col], inplace=True)
    if df.empty:
        # Guard: reg.fit() would raise on an empty frame.
        print("ERROR: no usable rows after cleaning; aborting.")
        return

    # 4. Generate Training Prototypes
    print("Generating Training Prototypes...")
    # Concatenate each label's resumes to simulate a "Job Description".
    train_prototypes = df.groupby(cat_col)[res_col].apply(' '.join).to_dict()

    # Optimization: pre-compute prototype vectors and token sets once.
    print("Pre-calculating vectors...")
    proto_vectors = {}
    proto_tokens = {}
    word_re = re.compile(r'\w+')  # hoisted: compiled once, reused per row
    for cat, text in train_prototypes.items():
        proto_vectors[cat] = tfidf.transform([text])
        proto_tokens[cat] = set(word_re.findall(text.lower()))

    # 5. Feature Engineering
    print("Calculating features...")
    cosine_sims = []
    keyword_matches = []
    for text, cat in zip(df[res_col].astype(str), df[cat_col]):
        if cat in proto_vectors:
            # Feature 1: cosine similarity to the category prototype.
            vec = tfidf.transform([text])
            sim = cosine_similarity(vec, proto_vectors[cat])[0][0]
            # Feature 2: fraction of prototype keywords present in the resume.
            tokens = set(word_re.findall(text.lower()))
            target_tokens = proto_tokens[cat]
            match = (len(tokens & target_tokens) / len(target_tokens)
                     if target_tokens else 0)
        else:
            # Unknown/NaN category: neutral features.
            sim = 0
            match = 0
        cosine_sims.append(sim)
        keyword_matches.append(match)
    df['cosine_sim'] = cosine_sims
    df['keyword_match'] = keyword_matches

    # 6. Train Regressor
    print("Training ATS Regressor...")
    X = df[['cosine_sim', 'keyword_match']]
    y = df[score_col]
    reg = GradientBoostingRegressor()
    reg.fit(X, y)

    # 7. Save
    with open('ats_scorer.pkl', 'wb') as f:
        pickle.dump(reg, f)
    print("SUCCESS: 'ats_scorer.pkl' saved.")
# Script entry point: run the full ATS-scorer training pipeline only when
# executed directly (not when imported as a module).
if __name__ == "__main__":
    train_ats_scorer()