XGB_VALIDATION / train.py
subbunanepalli's picture
Create train.py
0a7836a verified
import pandas as pd
import joblib
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
# === Load Dataset ===
# Name of the column whose text is vectorized further down the script.
TEXT_COLUMN = "Sanction_Context"
# Target columns; the training loop fits one classifier per entry.
LABEL_COLUMNS = [
    "Red_Flag_Reason",
    "Maker_Action",
    "Escalation_Level",
    "Risk_Category",
    "Risk_Drivers",
    "Investigation_Outcome",
]
# The path is relative, so it is resolved against the current working directory.
df = pd.read_csv("synthetic_transactions_samples_5000.csv")
# === TF-IDF Vectorization ===
# max_features=10000 limits the vectorizer to at most 10,000 terms.
tfidf = TfidfVectorizer(max_features=10000)
# Missing text is replaced with an empty string so every row can be vectorized.
_text_series = df[TEXT_COLUMN].fillna("")
# Fit the vocabulary and transform the same rows in a single pass.
X = tfidf.fit_transform(_text_series)
# === Multi-Label Model Training ===
# One independent classifier is fitted per label column; every classifier
# is trained on the same TF-IDF feature matrix X.
models = {}
label_encoders = {}
for label in LABEL_COLUMNS:
    # Missing labels become the literal class "Unknown" so that every row
    # has a target, then the labels are encoded as integer class codes.
    le = LabelEncoder()
    y = le.fit_transform(df[label].fillna("Unknown"))

    model = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss")
    model.fit(X, y)

    # Store the encoder alongside its model, keyed by the label column
    # name, so predicted codes can later be mapped back with
    # le.inverse_transform.
    models[label] = model
    label_encoders[label] = le
# === Save Models ===
# Create the output directory if it does not exist yet.
os.makedirs("models", exist_ok=True)
# Persist the fitted vectorizer, the per-label models and the per-label
# encoders, each to its own file and in this order.
for _artifact, _destination in (
    (tfidf, "models/tfidf_vectorizer.pkl"),
    (models, "models/xgb_models.pkl"),
    (label_encoders, "models/label_encoders.pkl"),
):
    joblib.dump(_artifact, _destination)
print("Training complete and models saved.")