# resume-analyzer-app / train_model.py
# Uploaded by SoS13 ("Upload 10 files", commit 74f28d3, verified)
import pandas as pd
import pickle
import re
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
# Cleaning rules applied, in this order, by _clean_resume. Every match is
# replaced with a single space.
# NOTE(review): the patterns are intentionally kept exactly as they were so
# that newly trained artifacts stay comparable with any text cleaned the same
# way elsewhere (presumably the consuming app -- confirm). Known quirks to
# confirm before changing them:
#   * r'RT|cc' has no word boundaries, so it also blanks "RT"/"cc" inside
#     longer words (e.g. "account" becomes "a ount").
#   * the URL and hashtag rules require trailing whitespace, so a URL or
#     hashtag at the very end of the text is not matched by those two rules.
_CLEANING_PATTERNS = (
    re.compile(r'http\S+\s'),   # URLs followed by whitespace
    re.compile(r'RT|cc'),       # "RT" / "cc" markers (see note above)
    re.compile(r'#\S+\s'),      # hashtags followed by whitespace
    re.compile(r'@\S+'),        # @mentions
    re.compile(r'[!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~]'),  # ASCII punctuation
    re.compile(r'[^\x00-\x7f]'),  # non-ASCII characters
    re.compile(r'\s+'),         # collapse whitespace runs
)


def _clean_resume(txt: object) -> str:
    """Return ``str(txt)`` with every cleaning pattern replaced by a space."""
    cleaned = str(txt)
    for pattern in _CLEANING_PATTERNS:
        cleaned = pattern.sub(' ', cleaned)
    return cleaned


def _save_pickle(obj: object, path: str) -> None:
    """Pickle *obj* to *path*, closing the file even if dumping fails."""
    with open(path, 'wb') as fh:
        pickle.dump(obj, fh)


def train_classifier() -> None:
    """Train the resume-role classifier and write its artifacts to disk.

    Steps:
      1. Load the ``train`` split of ``AzharAli05/Resume-Screening-Dataset``.
      2. Clean the ``Resume`` text column.
      3. Join all cleaned resumes per ``Role`` into one "master profile"
         string and save the mapping as ``prototypes.pkl``.
      4. Label-encode ``Role``, TF-IDF vectorize the cleaned text, and fit a
         one-vs-rest k-nearest-neighbours classifier.
      5. Save ``clf.pkl``, ``tfidf.pkl`` and ``encoder.pkl``.

    All output paths are relative, so files land in the current working
    directory.

    Raises:
        SystemExit: with exit status 1 if the dataset cannot be loaded.
        KeyError: if the dataset lacks the expected text or label column.
    """
    # 1. Load Dataset (AzharAli05)
    print("Loading AzharAli05/Resume-Screening-Dataset...")
    try:
        ds = load_dataset("AzharAli05/Resume-Screening-Dataset")
        df = pd.DataFrame(ds['train'])
    except Exception as e:
        # Top-level boundary of the script: report and stop with a non-zero
        # status. SystemExit is raised directly because the bare ``exit()``
        # helper is injected by the ``site`` module and is not always present.
        print(f"Error loading dataset: {e}")
        raise SystemExit(1) from e
    print(f"Loaded {len(df)} resumes.")

    # 2. Setup Columns
    # Based on the dataset check: Text='Resume', Label='Role'
    text_col = 'Resume'
    label_col = 'Role'
    missing = [col for col in (text_col, label_col) if col not in df.columns]
    if missing:
        raise KeyError(
            f"Dataset is missing expected column(s): {missing}; "
            f"found {list(df.columns)}"
        )

    # 3. Cleaning
    print("Cleaning data...")
    df['cleaned_resume'] = df[text_col].apply(_clean_resume)

    # 4. Generate & Save Prototypes (Crucial for App)
    print("Generating Master Profiles (Prototypes)...")
    # All resumes of a role are concatenated into one "Master Profile" string.
    prototypes = (
        df.groupby(label_col)['cleaned_resume'].apply(' '.join).to_dict()
    )
    _save_pickle(prototypes, 'prototypes.pkl')

    # 5. Encoding Labels
    le = LabelEncoder()
    df['Category_ID'] = le.fit_transform(df[label_col])

    # 6. Vectorizing
    print("Vectorizing...")
    tfidf = TfidfVectorizer(stop_words='english', max_features=200)
    # fit_transform learns the vocabulary and builds the matrix in one pass.
    features = tfidf.fit_transform(df['cleaned_resume'])

    # 7. Training
    print("Training Classifier...")
    clf = OneVsRestClassifier(KNeighborsClassifier())
    clf.fit(features, df['Category_ID'])

    # 8. Saving Models
    print("Saving models...")
    _save_pickle(clf, 'clf.pkl')
    _save_pickle(tfidf, 'tfidf.pkl')
    _save_pickle(le, 'encoder.pkl')
    print("SUCCESS: Classification models + Prototypes saved.")
# Script entry point: run the training pipeline only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    train_classifier()