|
|
import pandas as pd
|
|
|
import pickle
|
|
|
import numpy as np
|
|
|
from datasets import load_dataset
|
|
|
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
from sklearn.ensemble import GradientBoostingRegressor
|
|
|
from sklearn.model_selection import train_test_split
|
|
|
import re
|
|
|
import time
|
|
|
|
|
|
def train_ats_scorer():
    """Train a GradientBoosting ATS-score regressor and save it as 'ats_scorer.pkl'.

    Pipeline:
      1. Load the TF-IDF vectorizer produced by train_model.py ('tfidf.pkl').
      2. Load the '0xnbk/resume-ats-score-v1-en' dataset from the Hugging Face Hub.
      3. Build one concatenated-text "prototype" per resume category and
         pre-compute its TF-IDF vector and token set.
      4. For every resume, compute two features against its category prototype:
         cosine similarity (TF-IDF space) and keyword-overlap ratio.
      5. Fit a GradientBoostingRegressor on those two features to predict the
         ATS score and pickle the fitted model to 'ats_scorer.pkl'.

    Raises:
        SystemExit: (exit code 1) if 'tfidf.pkl' is missing or the dataset
            cannot be loaded.
    """
    print("Loading TF-IDF Vectorizer (from Step 1)...")
    try:
        # Context manager closes the handle even if unpickling fails
        # (the original `pickle.load(open(...))` leaked the file handle).
        with open('tfidf.pkl', 'rb') as fh:
            tfidf = pickle.load(fh)
    except FileNotFoundError:
        print("ERROR: 'tfidf.pkl' not found. Run 'train_model.py' first!")
        # raise SystemExit(1) instead of exit(): exit() is a `site`-module
        # convenience (absent under -S / frozen apps), and the bare call
        # reported success (status 0) on this error path.
        raise SystemExit(1)

    print("Loading 0xnbk/resume-ats-score-v1-en...")
    try:
        ds = load_dataset("0xnbk/resume-ats-score-v1-en")
        df = pd.DataFrame(ds['train'])
        print(f"Loaded {len(df)} rows.")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        raise SystemExit(1)

    # Column names -- assumed schema of the HF dataset; verify if it changes.
    res_col = 'text'
    score_col = 'ats_score'
    cat_col = 'original_label'

    # Coerce scores to numeric and drop rows missing a score or resume text.
    df[score_col] = pd.to_numeric(df[score_col], errors='coerce')
    df.dropna(subset=[score_col, res_col], inplace=True)

    print("Generating Training Prototypes...")
    # One "prototype" document per category: all resumes in that category joined.
    train_prototypes = df.groupby(cat_col)[res_col].apply(lambda x: ' '.join(x)).to_dict()

    print("Pre-calculating vectors...")
    # Hoist the per-category TF-IDF vector and token set out of the row loop.
    proto_vectors = {}
    proto_tokens = {}
    for cat, text in train_prototypes.items():
        proto_vectors[cat] = tfidf.transform([text])
        proto_tokens[cat] = set(re.findall(r'\w+', text.lower()))

    print("Calculating features...")
    cosine_sims = []
    keyword_matches = []
    for row in df.itertuples():  # enumerate() index was unused; dropped
        text = str(getattr(row, res_col))
        cat = getattr(row, cat_col)
        if cat in proto_vectors:
            # Feature 1: cosine similarity between the resume and its
            # category prototype in TF-IDF space.
            vec = tfidf.transform([text])
            sim = cosine_similarity(vec, proto_vectors[cat])[0][0]
            # Feature 2: fraction of the prototype's vocabulary covered by
            # the resume (keyword-overlap ratio); 0 for an empty prototype.
            tokens = set(re.findall(r'\w+', text.lower()))
            target_tokens = proto_tokens[cat]
            match = len(tokens.intersection(target_tokens)) / len(target_tokens) if target_tokens else 0
        else:
            # Category unseen during prototype building: no basis for either feature.
            sim = 0
            match = 0
        cosine_sims.append(sim)
        keyword_matches.append(match)

    df['cosine_sim'] = cosine_sims
    df['keyword_match'] = keyword_matches

    print("Training ATS Regressor...")
    X = df[['cosine_sim', 'keyword_match']]
    y = df[score_col]
    reg = GradientBoostingRegressor()
    reg.fit(X, y)

    # Context manager ensures the model file is flushed and closed.
    with open('ats_scorer.pkl', 'wb') as fh:
        pickle.dump(reg, fh)
    print("SUCCESS: 'ats_scorer.pkl' saved.")
|
|
|
|
|
|
# Script entry point: run the full training pipeline only when executed
# directly, not when imported as a module.
if __name__ == "__main__":
    train_ats_scorer()