import logging
import os
from typing import Any, Dict

import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier

from config import DATA_PATH, MODEL_PATH, TFIDF_PATH, MODEL_SAVE_DIR

logger = logging.getLogger(__name__)

# Column of the training CSV that holds the free-text input.
_TEXT_COLUMN = "Sanction_Context"

# Columns of the training CSV predicted by the multi-output classifier,
# in the order the model emits them.
_LABEL_COLUMNS = [
    "Maker_Action",
    "Escalation_Level",
    "Risk_Category",
    "Risk_Drivers",
    "Red_Flag_Reason",
    "Investigation_Outcome",
]


def train_model() -> Dict[str, Any]:
    """Train the multi-output text classifier and persist it to disk.

    Reads the CSV at ``DATA_PATH``, fits a TF-IDF vectorizer on the
    ``Sanction_Context`` column and one logistic-regression classifier per
    label column, evaluates on a 20% hold-out split stratified by
    ``Maker_Action``, then saves the model to ``MODEL_PATH`` and the
    vectorizer to ``TFIDF_PATH``.

    Rows with a missing value in the text column or in any label column are
    dropped before splitting, because neither the vectorizer nor the
    classifiers accept NaN.

    Returns:
        On success, ``{"message": ..., "accuracy": {label: score}}`` where
        each score is the hold-out accuracy for that label rounded to four
        decimals. On failure, ``{"message": "Training failed", "error": ...}``
        with the exception text; the exception itself is logged, not raised.
    """
    try:
        # Ensure the model save directory exists.
        os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

        df = pd.read_csv(DATA_PATH)

        # Fail with an explicit message instead of a bare KeyError repr.
        required = [_TEXT_COLUMN, *_LABEL_COLUMNS]
        missing = [col for col in required if col not in df.columns]
        if missing:
            raise ValueError(
                f"Training data is missing required column(s): {missing}"
            )

        # Incomplete rows would make fit() raise; drop them up front.
        complete = df.dropna(subset=required)
        dropped = len(df) - len(complete)
        if dropped:
            logger.warning(
                "Dropped %d of %d training rows with missing values",
                dropped,
                len(df),
            )

        # Features and labels.
        X = complete[_TEXT_COLUMN].astype(str)
        y = complete[_LABEL_COLUMNS]

        # Hold out 20% for evaluation, keeping the Maker_Action class
        # proportions the same in both partitions.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y["Maker_Action"]
        )

        # TF-IDF vectorization: vocabulary capped at 10,000 terms with
        # English stop words removed. Fit on the training split only.
        vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
        X_train_vec = vectorizer.fit_transform(X_train)
        X_test_vec = vectorizer.transform(X_test)

        # One independent logistic-regression classifier per label column.
        model = MultiOutputClassifier(LogisticRegression(max_iter=1000))
        model.fit(X_train_vec, y_train)

        # predict() returns one column per label, in y's column order.
        y_pred = model.predict(X_test_vec)
        accuracy = {
            col: round(accuracy_score(y_test[col], y_pred[:, i]), 4)
            for i, col in enumerate(y.columns)
        }

        # Save model and vectorizer.
        joblib.dump(model, MODEL_PATH)
        joblib.dump(vectorizer, TFIDF_PATH)

        return {
            "message": f"Model trained and saved to '{MODEL_SAVE_DIR}'",
            "accuracy": accuracy
        }

    except Exception as e:
        # Callers receive failures as a result dict rather than an exception,
        # so record the traceback here before it is lost.
        logger.exception("Model training failed")
        return {
            "message": "Training failed",
            "error": str(e)
        }