import pandas as pd
import pickle
import re
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder

def _clean_resume(txt):
    """Return *txt* as a string with URLs, hashtags, mentions, punctuation,
    non-ASCII characters and repeated whitespace replaced by single spaces.

    NOTE(review): the ``RT|cc`` pattern has no word boundaries, so it also
    matches inside ordinary words (e.g. "account" becomes "a ount"). The
    patterns are kept exactly as they were because any code that cleans text
    at prediction time presumably has to apply the same rules -- confirm
    before tightening them.
    """
    cleaned = re.sub(r'http\S+\s', ' ', str(txt))
    cleaned = re.sub(r'RT|cc', ' ', cleaned)
    cleaned = re.sub(r'#\S+\s', ' ', cleaned)
    cleaned = re.sub(r'@\S+', '  ', cleaned)
    cleaned = re.sub(r'[!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~]', ' ', cleaned)
    cleaned = re.sub(r'[^\x00-\x7f]', ' ', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned)
    return cleaned


def _dump_pickle(obj, path):
    """Pickle *obj* to *path*, closing the file even if pickling fails."""
    with open(path, 'wb') as fh:
        pickle.dump(obj, fh)


def train_classifier():
    """Train the resume role classifier and write its artifacts to disk.

    Steps: download the "AzharAli05/Resume-Screening-Dataset" train split,
    clean the 'Resume' text, build one concatenated "master profile" string
    per 'Role', label-encode the roles, fit a TF-IDF vectorizer and a
    one-vs-rest k-nearest-neighbours classifier on the cleaned text.

    Files written (relative to the current working directory):
        prototypes.pkl -- dict mapping each role to its joined cleaned resumes
        clf.pkl        -- the fitted classifier
        tfidf.pkl      -- the fitted TfidfVectorizer
        encoder.pkl    -- the fitted LabelEncoder

    Raises:
        SystemExit: with exit status 1 if the dataset cannot be loaded.
    """
    # 1. Load Dataset (AzharAli05)
    print("Loading AzharAli05/Resume-Screening-Dataset...")
    try:
        ds = load_dataset("AzharAli05/Resume-Screening-Dataset")
        df = pd.DataFrame(ds['train'])
    except Exception as e:
        print(f"Error loading dataset: {e}")
        # Raise SystemExit directly instead of calling the site-provided
        # exit() helper: that helper is not guaranteed to exist, and calling
        # it without an argument reports success (status 0) to the caller.
        raise SystemExit(1) from e
    print(f"Loaded {len(df)} resumes.")

    # 2. Setup Columns (text lives in 'Resume', the label in 'Role')
    text_col = 'Resume'
    label_col = 'Role'

    # 3. Cleaning
    print("Cleaning data...")
    df['cleaned_resume'] = df[text_col].apply(_clean_resume)

    # 4. Generate & Save Prototypes
    print("Generating Master Profiles (Prototypes)...")
    # All cleaned resumes of a role are joined into one "Master Profile".
    prototypes = df.groupby(label_col)['cleaned_resume'].agg(' '.join).to_dict()
    _dump_pickle(prototypes, 'prototypes.pkl')

    # 5. Encoding Labels
    le = LabelEncoder()
    df['Category_ID'] = le.fit_transform(df[label_col])

    # 6. Vectorizing (fit and transform in a single pass over the corpus)
    print("Vectorizing...")
    tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
    features = tfidf.fit_transform(df['cleaned_resume'])

    # 7. Training
    print("Training Classifier...")
    clf = OneVsRestClassifier(KNeighborsClassifier())
    clf.fit(features, df['Category_ID'])

    # 8. Saving Models
    print("Saving models...")
    _dump_pickle(clf, 'clf.pkl')
    _dump_pickle(tfidf, 'tfidf.pkl')
    _dump_pickle(le, 'encoder.pkl')
    print("SUCCESS: Classification models + Prototypes saved.")

# Run the full training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    train_classifier()