"""Train the resume role classifier and save its artifacts as pickle files."""

import pickle
import re

import pandas as pd
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder


def clean_resume(txt: object) -> str:
    """Return *txt* with URLs, tags, punctuation and non-ASCII replaced by spaces.

    The input is coerced with ``str()`` first, so non-string cells (for
    example ``NaN``) are cleaned as their string representation.  Runs of
    whitespace are collapsed to a single space at the end.
    """
    # NOTE(review): this pattern only matches a URL that is followed by a
    # whitespace character, so a URL at the very end of the text is kept.
    cleaned = re.sub(r'http\S+\s', ' ', str(txt))
    # NOTE(review): there are no word boundaries here, so "cc" is also
    # removed from inside ordinary words (e.g. "account" -> "a ount").
    # Left unchanged so the text matches whatever consumes the saved models;
    # confirm before tightening.
    cleaned = re.sub(r'RT|cc', ' ', cleaned)
    cleaned = re.sub(r'#\S+\s', ' ', cleaned)
    cleaned = re.sub(r'@\S+', ' ', cleaned)
    cleaned = re.sub(r'[!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~]', ' ', cleaned)
    cleaned = re.sub(r'[^\x00-\x7f]', ' ', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned)
    return cleaned


def _dump_pickle(obj: object, path: str) -> None:
    """Pickle *obj* to *path*, closing the file even if pickling fails."""
    with open(path, 'wb') as fh:
        pickle.dump(obj, fh)


def train_classifier() -> None:
    """Download the dataset, train the role classifier and save all artifacts.

    Writes four files into the current working directory:
    ``prototypes.pkl`` (role -> concatenated cleaned resumes), ``clf.pkl``
    (the fitted classifier), ``tfidf.pkl`` (the fitted vectorizer) and
    ``encoder.pkl`` (the fitted label encoder).

    Raises:
        SystemExit: with status 1 when the dataset cannot be loaded.
    """
    # 1. Load Dataset (AzharAli05)
    print("Loading AzharAli05/Resume-Screening-Dataset...")
    try:
        ds = load_dataset("AzharAli05/Resume-Screening-Dataset")
        df = pd.DataFrame(ds['train'])
        print(f"Loaded {len(df)} resumes.")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        # Exit with a non-zero status so callers can detect the failure.
        raise SystemExit(1) from e

    # 2. Setup Columns
    # Based on the dataset check: Text='Resume', Label='Role'
    text_col = 'Resume'
    label_col = 'Role'

    # 3. Cleaning
    print("Cleaning data...")
    df['cleaned_resume'] = df[text_col].apply(clean_resume)

    # 4. Generate & Save Prototypes (Crucial for App)
    print("Generating Master Profiles (Prototypes)...")
    # Combine all resumes for a specific role to create a "Master Profile".
    prototypes = df.groupby(label_col)['cleaned_resume'].apply(' '.join).to_dict()
    _dump_pickle(prototypes, 'prototypes.pkl')

    # 5. Encoding Labels
    le = LabelEncoder()
    df['Category_ID'] = le.fit_transform(df[label_col])

    # 6. Vectorizing
    print("Vectorizing...")
    tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
    features = tfidf.fit_transform(df['cleaned_resume'])

    # 7. Training
    print("Training Classifier...")
    clf = OneVsRestClassifier(KNeighborsClassifier())
    clf.fit(features, df['Category_ID'])

    # 8. Saving Models
    print("Saving models...")
    _dump_pickle(clf, 'clf.pkl')
    _dump_pickle(tfidf, 'tfidf.pkl')
    _dump_pickle(le, 'encoder.pkl')

    print("SUCCESS: Classification models + Prototypes saved.")


if __name__ == "__main__":
    train_classifier()