File size: 3,029 Bytes
74f28d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import pandas as pd
import pickle
import numpy as np
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
import re
import time

def train_ats_scorer():
    """Train and persist a GradientBoostingRegressor that predicts ATS scores.

    Pipeline:
      1. Load the TF-IDF vectorizer saved by a previous step ('tfidf.pkl').
      2. Download the '0xnbk/resume-ats-score-v1-en' dataset.
      3. Clean the score column and drop unusable rows.
      4. Build one "prototype" document per category by concatenating all
         resumes with that label (simulates a job description).
      5. Derive two features per resume: cosine similarity to its category
         prototype and keyword-overlap ratio with it.
      6. Fit a GradientBoostingRegressor on those features and save it to
         'ats_scorer.pkl'.

    Returns:
        None. Side effects: reads 'tfidf.pkl', writes 'ats_scorer.pkl',
        prints progress. Returns early (without raising) if the vectorizer
        file is missing or the dataset download fails.
    """
    # 1. Load Dependencies
    print("Loading TF-IDF Vectorizer (from Step 1)...")
    try:
        # Context manager closes the handle; the original open() leaked it.
        with open('tfidf.pkl', 'rb') as f:
            tfidf = pickle.load(f)
    except FileNotFoundError:
        print("ERROR: 'tfidf.pkl' not found. Run 'train_model.py' first!")
        # return instead of exit(): exit() raises SystemExit on any caller
        # that imports this function; a plain return keeps it importable.
        return

    # 2. Load ATS Dataset (0xnbk)
    print("Loading 0xnbk/resume-ats-score-v1-en...")
    try:
        ds = load_dataset("0xnbk/resume-ats-score-v1-en")
        df = pd.DataFrame(ds['train'])
        print(f"Loaded {len(df)} rows.")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return

    # 3. Pre-Process: coerce scores to numeric, drop rows missing either
    # the score or the resume text.
    res_col = 'text'
    score_col = 'ats_score'
    cat_col = 'original_label'

    df[score_col] = pd.to_numeric(df[score_col], errors='coerce')
    df.dropna(subset=[score_col, res_col], inplace=True)

    # 4. Generate Training Prototypes
    print("Generating Training Prototypes...")
    # Group resumes by label to simulate "Job Descriptions"
    train_prototypes = df.groupby(cat_col)[res_col].apply(' '.join).to_dict()

    # Optimization: Pre-calculate vectors and token sets per category,
    # so the per-row loop below only transforms the row itself.
    print("Pre-calculating vectors...")
    token_re = re.compile(r'\w+')  # hoisted: compiled once, used per row below
    proto_vectors = {}
    proto_tokens = {}

    for cat, text in train_prototypes.items():
        proto_vectors[cat] = tfidf.transform([text])
        proto_tokens[cat] = set(token_re.findall(text.lower()))

    # 5. Feature Engineering
    print("Calculating features...")
    cosine_sims = []
    keyword_matches = []

    for row in df.itertuples():  # enumerate index was unused; dropped
        text = str(getattr(row, res_col))
        cat = getattr(row, cat_col)

        if cat in proto_vectors:
            # Feature 1: cosine similarity to the category prototype
            vec = tfidf.transform([text])
            sim = cosine_similarity(vec, proto_vectors[cat])[0][0]

            # Feature 2: fraction of prototype tokens present in the resume
            tokens = set(token_re.findall(text.lower()))
            target_tokens = proto_tokens[cat]
            match = len(tokens & target_tokens) / len(target_tokens) if target_tokens else 0
        else:
            # Unknown category: no prototype to compare against.
            sim = 0
            match = 0

        cosine_sims.append(sim)
        keyword_matches.append(match)

    df['cosine_sim'] = cosine_sims
    df['keyword_match'] = keyword_matches

    # 6. Train Regressor on the two engineered features
    print("Training ATS Regressor...")
    X = df[['cosine_sim', 'keyword_match']]
    y = df[score_col]

    reg = GradientBoostingRegressor()
    reg.fit(X, y)

    # 7. Save (with-block closes the file; original dump leaked the handle)
    with open('ats_scorer.pkl', 'wb') as f:
        pickle.dump(reg, f)
    print("SUCCESS: 'ats_scorer.pkl' saved.")

# Entry guard: run training only when executed as a script, so the module
# can be imported (e.g. for reuse of train_ats_scorer) without side effects.
if __name__ == "__main__":
    train_ats_scorer()