subbunanepalli committed on
Commit
1370132
·
verified ·
1 Parent(s): 69093fd

Update train.py

Browse files
Files changed (1) hide show
  1. train.py +52 -38
train.py CHANGED
@@ -6,41 +6,55 @@ from sklearn.multioutput import MultiOutputClassifier
6
  from sklearn.metrics import accuracy_score
7
  import joblib
8
  import os
9
-
10
- from config import DATA_PATH, MODEL_PATH, TFIDF_PATH
11
-
12
- def train_model():
13
- os.makedirs("saved_models", exist_ok=True)
14
- df = pd.read_csv(DATA_PATH)
15
-
16
- # Features and labels
17
- X = df["Sanction_Context"]
18
- y = df[["Maker_Action", "Escalation_Level", "Risk_Category", "Risk_Drivers", "Investigation_Outcome"]]
19
-
20
- # Split for evaluation
21
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
22
-
23
- # TF-IDF vectorization
24
- vectorizer = TfidfVectorizer()
25
- X_train_vec = vectorizer.fit_transform(X_train)
26
- X_test_vec = vectorizer.transform(X_test)
27
-
28
- # Multi-output Logistic Regression
29
- model = MultiOutputClassifier(LogisticRegression(max_iter=1000))
30
- model.fit(X_train_vec, y_train)
31
-
32
- # Predict and calculate accuracy per label
33
- y_pred = model.predict(X_test_vec)
34
- accuracy = {
35
- col: accuracy_score(y_test[col], [pred[i] for pred in y_pred])
36
- for i, col in enumerate(y.columns)
37
- }
38
-
39
- # Save model and vectorizer
40
- joblib.dump(model, MODEL_PATH)
41
- joblib.dump(vectorizer, TFIDF_PATH)
42
-
43
- return {
44
- "message": "Model trained and saved to 'saved_models/'",
45
- "accuracy": accuracy
46
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  from sklearn.metrics import accuracy_score
7
  import joblib
8
  import os
9
+ from typing import Dict, Any
10
+
11
+ from config import DATA_PATH, MODEL_PATH, TFIDF_PATH, MODEL_SAVE_DIR
12
+
13
+ def train_model() -> Dict[str, Any]:
14
+ try:
15
+ # Ensure the model save directory exists
16
+ os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
17
+
18
+ # Load data
19
+ df = pd.read_csv(DATA_PATH)
20
+
21
+ # Features and labels
22
+ X = df["Sanction_Context"]
23
+ y = df[["Maker_Action", "Escalation_Level", "Risk_Category", "Risk_Drivers", "Red_Flag_Reason", "Investigation_Outcome"]]
24
+
25
+ # Train-test split for evaluation
26
+ X_train, X_test, y_train, y_test = train_test_split(
27
+ X, y, test_size=0.2, random_state=42, stratify=y["Maker_Action"]
28
+ )
29
+
30
+ # TF-IDF vectorization
31
+ vectorizer = TfidfVectorizer(max_features=10000, stop_words='english') # Added max_features and stop_words
32
+ X_train_vec = vectorizer.fit_transform(X_train)
33
+ X_test_vec = vectorizer.transform(X_test)
34
+
35
+ # Multi-output Logistic Regression model
36
+ model = MultiOutputClassifier(LogisticRegression(max_iter=1000))
37
+ model.fit(X_train_vec, y_train)
38
+
39
+ # Predict on test set
40
+ y_pred = model.predict(X_test_vec)
41
+
42
+ # Calculate accuracy per label
43
+ accuracy = {}
44
+ for i, col in enumerate(y.columns):
45
+ accuracy[col] = round(accuracy_score(y_test[col], y_pred[:, i]), 4)
46
+
47
+ # Save model and vectorizer
48
+ joblib.dump(model, MODEL_PATH)
49
+ joblib.dump(vectorizer, TFIDF_PATH)
50
+
51
+ return {
52
+ "message": f"Model trained and saved to '{MODEL_SAVE_DIR}'",
53
+ "accuracy": accuracy
54
+ }
55
+
56
+ except Exception as e:
57
+ return {
58
+ "message": "Training failed",
59
+ "error": str(e)
60
+ }