"""Train one XGBoost text classifier per label column on TF-IDF features.

The script reads a CSV of transactions, vectorizes the free-text column
named by ``TEXT_COLUMN`` with TF-IDF, trains an independent
``XGBClassifier`` for every column in ``LABEL_COLUMNS``, and pickles the
vectorizer, the models, and the per-label encoders.
"""

import os
import pickle

import pandas as pd
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# --- Input data -------------------------------------------------------------
DATA_PATH = "synthetic_transactions_samples_5000.csv"
TEXT_COLUMN = "Sanction_Context"
LABEL_COLUMNS = [
    "Red_Flag_Reason",
    "Maker_Action",
    "Escalation_Level",
    "Risk_Category",
    "Risk_Drivers",
    "Investigation_Outcome",
]

# --- Feature extraction -----------------------------------------------------
TFIDF_MAX_FEATURES = 5000
NGRAM_RANGE = (1, 2)
USE_STOPWORDS = True

# --- Train/test split -------------------------------------------------------
RANDOM_STATE = 42
TEST_SIZE = 0.2

# --- Output artifacts -------------------------------------------------------
OUTPUT_DIR = "./"
MODEL_SAVE_PATH = os.path.join(OUTPUT_DIR, "xgb_models.pkl")
LABEL_ENCODERS_PATH = os.path.join(OUTPUT_DIR, "label_encoders.pkl")
TFIDF_VECTORIZER_PATH = os.path.join(OUTPUT_DIR, "tfidf_vectorizer.pkl")


def load_data(path: str) -> pd.DataFrame:
    """Read the CSV at *path* and drop rows with missing text or labels.

    Args:
        path: Location of the CSV file.

    Returns:
        A DataFrame in which ``TEXT_COLUMN`` and every column in
        ``LABEL_COLUMNS`` is non-null for every remaining row.

    Raises:
        KeyError: If any required column is absent from the file.
    """
    df = pd.read_csv(path)
    return df.dropna(subset=[TEXT_COLUMN] + LABEL_COLUMNS)


def save_pickle(obj: object, path: str) -> None:
    """Serialize *obj* to *path* with :mod:`pickle`, overwriting any file."""
    with open(path, "wb") as f:
        pickle.dump(obj, f)


def train() -> None:
    """Fit the TF-IDF vectorizer and one classifier per label, then save all.

    Only the training side of the train/test split is used; the held-out
    rows are discarded here. Artifacts are written to
    ``TFIDF_VECTORIZER_PATH``, ``MODEL_SAVE_PATH`` and
    ``LABEL_ENCODERS_PATH`` once every model has trained successfully.

    Raises:
        ValueError: If no rows remain after dropping incomplete records.
    """
    print("📥 Loading data...")
    df = load_data(DATA_PATH)
    if df.empty:
        # Fail with a clear message instead of TF-IDF's "empty vocabulary".
        raise ValueError(
            f"No usable rows in {DATA_PATH!r} after dropping missing values."
        )
    X = df[TEXT_COLUMN]

    print("🔤 Fitting TF-IDF...")
    stop_words = "english" if USE_STOPWORDS else None
    tfidf = TfidfVectorizer(
        max_features=TFIDF_MAX_FEATURES,
        ngram_range=NGRAM_RANGE,
        stop_words=stop_words,
    )
    # NOTE(review): the vectorizer is fit on all rows, including those that
    # land in the held-out split; revisit if that split is ever evaluated.
    X_tfidf = tfidf.fit_transform(X)

    # The partition depends only on the row count and RANDOM_STATE, so one
    # split over all label columns selects the same training rows that a
    # per-label split would.
    X_train, _, labels_train, _ = train_test_split(
        X_tfidf,
        df[LABEL_COLUMNS],
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE,
    )

    models = {}
    label_encoders = {}

    for label in LABEL_COLUMNS:
        print(f"🔁 Processing label: {label}")

        # Fit the encoder on the training labels only so the encoded classes
        # are always the contiguous range 0..k-1. Encoding the full column
        # first can leave gaps when a rare class falls entirely in the
        # held-out split, which XGBClassifier rejects.
        le = LabelEncoder()
        y_train = le.fit_transform(labels_train[label])

        model = xgb.XGBClassifier(
            # NOTE(review): `use_label_encoder` appears to be deprecated in
            # newer xgboost releases; confirm against the pinned version.
            use_label_encoder=False,
            eval_metric="mlogloss",
            random_state=RANDOM_STATE,
        )
        model.fit(X_train, y_train)

        models[label] = model
        label_encoders[label] = le

    # Write all artifacts together, after training succeeded, so a failed run
    # cannot leave a fresh vectorizer next to stale models.
    if OUTPUT_DIR:
        os.makedirs(OUTPUT_DIR, exist_ok=True)
    save_pickle(tfidf, TFIDF_VECTORIZER_PATH)
    save_pickle(models, MODEL_SAVE_PATH)
    save_pickle(label_encoders, LABEL_ENCODERS_PATH)
    print("✅ Training complete.")


if __name__ == "__main__":
    train()