# logreg / train.py
# Uploaded by ganeshkonapalli (commit f2aec1b, verified)
import pandas as pd
import pickle
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import xgboost as xgb
# ---- Configuration -------------------------------------------------------
# Input dataset and the free-text column fed to the TF-IDF vectorizer.
DATA_PATH = "synthetic_transactions_samples_5000.csv"
TEXT_COLUMN = "Sanction_Context"
# One independent classifier is trained for each of these target columns.
LABEL_COLUMNS = [
    "Red_Flag_Reason",
    "Maker_Action",
    "Escalation_Level",
    "Risk_Category",
    "Risk_Drivers",
    "Investigation_Outcome"
]
# TF-IDF feature-extraction settings.
TFIDF_MAX_FEATURES = 5000  # cap on vocabulary size
NGRAM_RANGE = (1, 2)       # unigrams and bigrams
USE_STOPWORDS = True       # drop English stop words when True
# Reproducibility seed and hold-out fraction for train_test_split.
RANDOM_STATE = 42
TEST_SIZE = 0.2
# Output artifact locations (all pickles written next to this script).
OUTPUT_DIR = "./"
MODEL_SAVE_PATH = os.path.join(OUTPUT_DIR, "xgb_models.pkl")
LABEL_ENCODERS_PATH = os.path.join(OUTPUT_DIR, "label_encoders.pkl")
TFIDF_VECTORIZER_PATH = os.path.join(OUTPUT_DIR, "tfidf_vectorizer.pkl")
def load_data(path, text_column=None, label_columns=None):
    """Read the CSV at *path* and drop rows missing text or any label.

    Parameters
    ----------
    path : str
        Path to the CSV file to load.
    text_column : str, optional
        Name of the free-text column. Defaults to the module-level
        ``TEXT_COLUMN`` constant, preserving the original call signature.
    label_columns : list[str], optional
        Names of the label columns. Defaults to the module-level
        ``LABEL_COLUMNS`` constant.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with rows lacking the text column or any label
        removed (such rows are unusable for supervised training).
    """
    if text_column is None:
        text_column = TEXT_COLUMN
    if label_columns is None:
        label_columns = LABEL_COLUMNS
    df = pd.read_csv(path)
    # Return a filtered frame instead of mutating in place.
    return df.dropna(subset=[text_column] + list(label_columns))
def save_pickle(obj, path):
    """Persist *obj* to disk as a pickle file at *path*."""
    with open(path, "wb") as handle:
        pickle.dump(obj, handle)
def train():
    """Fit one XGBoost classifier per label column on shared TF-IDF features.

    Side effects: writes three pickle artifacts to the paths configured at
    module level — the fitted TF-IDF vectorizer, a dict of per-label XGBoost
    models, and a dict of per-label LabelEncoders.
    """
    print("πŸ“₯ Loading data...")
    df = load_data(DATA_PATH)
    X = df[TEXT_COLUMN]

    print("πŸ”€ Fitting TF-IDF...")
    stop_words = 'english' if USE_STOPWORDS else None
    tfidf = TfidfVectorizer(
        max_features=TFIDF_MAX_FEATURES,
        ngram_range=NGRAM_RANGE,
        stop_words=stop_words
    )
    X_tfidf = tfidf.fit_transform(X)
    save_pickle(tfidf, TFIDF_VECTORIZER_PATH)

    # With a fixed random_state, train_test_split's shuffle depends only on
    # the number of rows, so the per-label splits in the original loop were
    # all identical. Compute the training indices once and reuse them.
    train_idx, _ = train_test_split(
        list(range(X_tfidf.shape[0])),
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE
    )
    X_train = X_tfidf[train_idx]

    models = {}
    label_encoders = {}
    for label in LABEL_COLUMNS:
        print(f"πŸ” Processing label: {label}")
        # Encode string labels to integer class ids for XGBoost.
        le = LabelEncoder()
        y = le.fit_transform(df[label])
        # NOTE: `use_label_encoder` was deprecated in xgboost 1.7 and removed
        # in 2.x; labels are already integer-encoded above, so it is dropped.
        model = xgb.XGBClassifier(
            eval_metric="mlogloss",
            random_state=RANDOM_STATE
        )
        model.fit(X_train, y[train_idx])
        models[label] = model
        label_encoders[label] = le

    save_pickle(models, MODEL_SAVE_PATH)
    save_pickle(label_encoders, LABEL_ENCODERS_PATH)
    print("βœ… Training complete.")
# Script entry point: run the full training pipeline when executed directly.
if __name__ == "__main__":
    train()