# logreg / train.py
# Uploaded by ganeshkonapalli (commit f2aec1b, verified)
import pandas as pd
import pickle
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import xgboost as xgb
# ---- Configuration -------------------------------------------------------
# Input dataset and the free-text column fed to the TF-IDF vectorizer.
DATA_PATH = "synthetic_transactions_samples_5000.csv"
TEXT_COLUMN = "Sanction_Context"
# One independent classifier is trained for each of these target columns.
LABEL_COLUMNS = [
    "Red_Flag_Reason",
    "Maker_Action",
    "Escalation_Level",
    "Risk_Category",
    "Risk_Drivers",
    "Investigation_Outcome"
]
# TF-IDF feature-extraction settings.
TFIDF_MAX_FEATURES = 5000  # cap on vocabulary size
NGRAM_RANGE = (1, 2)       # unigrams and bigrams
USE_STOPWORDS = True       # drop English stop words when True
# Reproducibility seed and hold-out fraction for train_test_split.
RANDOM_STATE = 42
TEST_SIZE = 0.2
# Output artifact locations (all pickles written next to this script).
OUTPUT_DIR = "./"
MODEL_SAVE_PATH = os.path.join(OUTPUT_DIR, "xgb_models.pkl")
LABEL_ENCODERS_PATH = os.path.join(OUTPUT_DIR, "label_encoders.pkl")
TFIDF_VECTORIZER_PATH = os.path.join(OUTPUT_DIR, "tfidf_vectorizer.pkl")
def load_data(path, text_column=None, label_columns=None):
    """Read the CSV at *path* and drop rows missing text or any label.

    Parameters
    ----------
    path : str
        Path to the CSV file to load.
    text_column : str, optional
        Name of the free-text column. Defaults to the module-level
        ``TEXT_COLUMN`` constant, preserving the original call signature.
    label_columns : list[str], optional
        Names of the label columns. Defaults to the module-level
        ``LABEL_COLUMNS`` constant.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with rows lacking the text column or any label
        removed (such rows are unusable for supervised training).
    """
    if text_column is None:
        text_column = TEXT_COLUMN
    if label_columns is None:
        label_columns = LABEL_COLUMNS
    df = pd.read_csv(path)
    # Return a filtered frame instead of mutating in place.
    return df.dropna(subset=[text_column] + list(label_columns))
def save_pickle(obj, path):
    """Persist *obj* to disk as a pickle file at *path*."""
    with open(path, "wb") as handle:
        pickle.dump(obj, handle)
def train():
    """Fit one XGBoost classifier per label column on shared TF-IDF features.

    Side effects: writes three pickle artifacts to the paths configured at
    module level — the fitted TF-IDF vectorizer, a dict of per-label XGBoost
    models, and a dict of per-label LabelEncoders.
    """
    print("πŸ“₯ Loading data...")
    df = load_data(DATA_PATH)
    X = df[TEXT_COLUMN]

    print("πŸ”€ Fitting TF-IDF...")
    stop_words = 'english' if USE_STOPWORDS else None
    tfidf = TfidfVectorizer(
        max_features=TFIDF_MAX_FEATURES,
        ngram_range=NGRAM_RANGE,
        stop_words=stop_words
    )
    X_tfidf = tfidf.fit_transform(X)
    save_pickle(tfidf, TFIDF_VECTORIZER_PATH)

    # With a fixed random_state, train_test_split's shuffle depends only on
    # the number of rows, so the per-label splits in the original loop were
    # all identical. Compute the training indices once and reuse them.
    train_idx, _ = train_test_split(
        list(range(X_tfidf.shape[0])),
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE
    )
    X_train = X_tfidf[train_idx]

    models = {}
    label_encoders = {}
    for label in LABEL_COLUMNS:
        print(f"πŸ” Processing label: {label}")
        # Encode string labels to integer class ids for XGBoost.
        le = LabelEncoder()
        y = le.fit_transform(df[label])
        # NOTE: `use_label_encoder` was deprecated in xgboost 1.7 and removed
        # in 2.x; labels are already integer-encoded above, so it is dropped.
        model = xgb.XGBClassifier(
            eval_metric="mlogloss",
            random_state=RANDOM_STATE
        )
        model.fit(X_train, y[train_idx])
        models[label] = model
        label_encoders[label] = le

    save_pickle(models, MODEL_SAVE_PATH)
    save_pickle(label_encoders, LABEL_ENCODERS_PATH)
    print("βœ… Training complete.")
# Script entry point: run the full training pipeline when executed directly.
if __name__ == "__main__":
    train()