# Spaces: No application file
import os

import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
# === Load Dataset ===
# The CSV supplies one text column (vectorized with TF-IDF below) and the
# columns listed in LABEL_COLUMNS, each used as a separate classification target.
df = pd.read_csv("synthetic_transactions_samples_5000.csv")

TEXT_COLUMN = "Sanction_Context"
LABEL_COLUMNS = [
    "Red_Flag_Reason",
    "Maker_Action",
    "Escalation_Level",
    "Risk_Category",
    "Risk_Drivers",
    "Investigation_Outcome",
]

# Fail fast with a readable message rather than a bare KeyError raised
# part-way through training, after earlier models have already been fitted.
missing_columns = [
    column for column in [TEXT_COLUMN, *LABEL_COLUMNS] if column not in df.columns
]
if missing_columns:
    raise KeyError(f"Dataset is missing required column(s): {missing_columns}")

# === TF-IDF Vectorization ===
# Missing text is replaced by an empty string so the vectorizer never sees NaN.
tfidf = TfidfVectorizer(max_features=10000)
X = tfidf.fit_transform(df[TEXT_COLUMN].fillna(""))

# === Multi-Label Model Training ===
# One independent classifier is trained per target column; all of them share
# the same TF-IDF feature matrix. Both dicts are keyed by the column name.
models = {}
label_encoders = {}
for label in LABEL_COLUMNS:
    # Missing labels are folded into an explicit "Unknown" class before encoding.
    le = LabelEncoder()
    y = le.fit_transform(df[label].fillna("Unknown"))

    # `use_label_encoder` is deliberately not passed: it is a deprecated
    # XGBoost option and the targets are already integer codes from `le`.
    # NOTE(review): "mlogloss" is the multiclass log-loss metric; confirm every
    # target column has more than two classes, or choose the metric per column.
    model = XGBClassifier(eval_metric="mlogloss")
    model.fit(X, y)

    models[label] = model
    # Kept alongside the model so predictions can be mapped back to the
    # original label values with `inverse_transform`.
    label_encoders[label] = le

# === Save Models ===
# The vectorizer, the per-column models, and the per-column encoders are
# persisted as three separate joblib files under ./models.
os.makedirs("models", exist_ok=True)
joblib.dump(tfidf, "models/tfidf_vectorizer.pkl")
joblib.dump(models, "models/xgb_models.pkl")
joblib.dump(label_encoders, "models/label_encoders.pkl")
print("Training complete and models saved.")