Spaces:
Build error
Build error
| import pandas as pd | |
| import pickle | |
| import os | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.preprocessing import LabelEncoder | |
| from sklearn.model_selection import train_test_split | |
| import xgboost as xgb | |
# --- Input data -------------------------------------------------------------
# CSV file read by load_data(); TEXT_COLUMN is the free-text feature column and
# LABEL_COLUMNS are the target columns (one classifier is trained per entry).
DATA_PATH = "synthetic_transactions_samples_5000.csv"
TEXT_COLUMN = "Sanction_Context"
LABEL_COLUMNS = [
    "Red_Flag_Reason",
    "Maker_Action",
    "Escalation_Level",
    "Risk_Category",
    "Risk_Drivers",
    "Investigation_Outcome",
]

# --- TF-IDF settings --------------------------------------------------------
TFIDF_MAX_FEATURES = 5000
NGRAM_RANGE = (1, 2)
USE_STOPWORDS = True  # when True, train() passes stop_words='english'

# --- Train/hold-out split ---------------------------------------------------
RANDOM_STATE = 42
TEST_SIZE = 0.2

# --- Output artifacts (all written under OUTPUT_DIR) ------------------------
OUTPUT_DIR = "./"
MODEL_SAVE_PATH = os.path.join(OUTPUT_DIR, "xgb_models.pkl")
LABEL_ENCODERS_PATH = os.path.join(OUTPUT_DIR, "label_encoders.pkl")
TFIDF_VECTORIZER_PATH = os.path.join(OUTPUT_DIR, "tfidf_vectorizer.pkl")
def load_data(path):
    """Read the CSV at *path* and return only the fully populated rows.

    A row is kept when neither TEXT_COLUMN nor any column listed in
    LABEL_COLUMNS is missing. The original row index is preserved (it is
    not reset after filtering).
    """
    required_columns = [TEXT_COLUMN, *LABEL_COLUMNS]
    return pd.read_csv(path).dropna(subset=required_columns)
def save_pickle(obj, path):
    """Serialize *obj* with pickle and write it to *path*.

    The file is opened in binary write mode, so an existing file at *path*
    is overwritten.
    """
    with open(path, mode="wb") as sink:
        pickle.Pickler(sink).dump(obj)
def train():
    """Train one XGBoost classifier per label column on TF-IDF text features.

    Steps:
      1. Load the CSV at DATA_PATH via load_data().
      2. Fit a TfidfVectorizer on TEXT_COLUMN and pickle it to
         TFIDF_VECTORIZER_PATH.
      3. Split the rows once with train_test_split; only the training part is
         used here, the held-out part is discarded.
      4. For every column in LABEL_COLUMNS, fit a LabelEncoder and an
         XGBClassifier on the training rows.
      5. Pickle the ``{label: model}`` dict to MODEL_SAVE_PATH and the
         ``{label: encoder}`` dict to LABEL_ENCODERS_PATH.

    Each LabelEncoder is fitted on the training labels only, so its
    ``classes_`` lists exactly the classes the matching model was trained on.
    """
    print("π₯ Loading data...")
    df = load_data(DATA_PATH)
    X = df[TEXT_COLUMN]

    print("π€ Fitting TF-IDF...")
    stop_words = 'english' if USE_STOPWORDS else None
    tfidf = TfidfVectorizer(
        max_features=TFIDF_MAX_FEATURES,
        ngram_range=NGRAM_RANGE,
        stop_words=stop_words
    )
    X_tfidf = tfidf.fit_transform(X)
    save_pickle(tfidf, TFIDF_VECTORIZER_PATH)

    # The row selection depends only on the row count, TEST_SIZE and
    # RANDOM_STATE, so it is the same for every label: split once, carrying
    # all label columns along, instead of re-splitting inside the loop.
    X_train, _, labels_train, _ = train_test_split(
        X_tfidf, df[LABEL_COLUMNS], test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

    models = {}
    label_encoders = {}
    for label in LABEL_COLUMNS:
        print(f"π Processing label: {label}")
        le = LabelEncoder()
        # Encode AFTER splitting. Fitting the encoder on all rows and then
        # dropping the hold-out rows can leave gaps in the encoded training
        # labels (e.g. 0, 2, 3) when a rare class lands only in the hold-out
        # part; XGBClassifier rejects such labels because it expects
        # 0..n_classes-1. When every class is present in the training rows,
        # the encoding is identical to fitting on the full column.
        y_train = le.fit_transform(labels_train[label])
        model = xgb.XGBClassifier(
            use_label_encoder=False,
            eval_metric="mlogloss",
            random_state=RANDOM_STATE
        )
        model.fit(X_train, y_train)
        models[label] = model
        label_encoders[label] = le

    save_pickle(models, MODEL_SAVE_PATH)
    save_pickle(label_encoders, LABEL_ENCODERS_PATH)
    print("β Training complete.")
# Run the full training pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    train()