ganeshkonapalli committed on
Commit
278d368
·
verified ·
1 Parent(s): 277edc8

Create validate.py

Browse files
Files changed (1) hide show
  1. validate.py +75 -0
validate.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
import pickle
import os  # NOTE(review): unused in this script — confirm before removing
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# === File Paths ===
DATA_PATH = "data.csv" # Change this to your actual CSV

# Columns whose text is concatenated into the single TF-IDF input document per row.
TEXT_COLUMNS = [
    "Transaction_Id", "Origin", "Designation", "Keywords", "Name", "SWIFT_Tag", "Currency",
    "Entity", "Message", "City", "Country", "State", "Hit_Type", "Record_Matching_String",
    "WatchList_Match_String", "Payment_Sender_Name", "Payment_Reciever_Name", "Swift_Message_Type",
    "Text_Sanction_Data", "Matched_Sanctioned_Entity", "Red_Flag_Reason", "Risk_Level",
    "Risk_Score", "CDD_Level", "PEP_Status", "Sanction_Description", "Checker_Notes",
    "Sanction_Context", "Maker_Action", "Customer_Type", "Industry", "Transaction_Type",
    "Transaction_Channel", "Geographic_Origin", "Geographic_Destination", "Risk_Category",
    "Risk_Drivers", "Alert_Status", "Investigation_Outcome", "Source_Of_Funds",
    "Purpose_Of_Transaction", "Beneficial_Owner"
]

# Target columns — one fitted model and one label encoder per entry.
# NOTE(review): several of these also appear in TEXT_COLUMNS, so the targets
# leak into the features; validation metrics will be optimistic.
LABEL_COLUMNS = [
    "Red_Flag_Reason", "Maker_Action", "Escalation_Level",
    "Risk_Category", "Risk_Drivers", "Investigation_Outcome"
]

# Pickled artifacts produced by the training pipeline.
VECTORIZER_PATH = "tfidf_vectorizer (2).pkl"
MODELS_PATH = "xgb_model.pkl"
ENCODERS_PATH = "label_encoders (5).pkl"

# Split settings — presumably match those used at training time; TODO confirm.
RANDOM_STATE = 42
TEST_SIZE = 0.2
32
+
33
# === Utils ===
def load_pickle(path):
    """Deserialize and return the object stored in the pickle file at *path*."""
    with open(path, "rb") as handle:
        return pickle.load(handle)
37
+
38
# === Main ===
def validate():
    """Score each per-label model on a held-out split of DATA_PATH.

    Loads the CSV, rebuilds the TF-IDF features from TEXT_COLUMNS, then for
    every label in LABEL_COLUMNS encodes the targets, splits off a test set,
    and prints a classification report. Prints to stdout; returns None.
    """
    print("📥 Loading data...")
    df = pd.read_csv(DATA_PATH)
    # Rows missing the free-text context or any target label cannot be scored.
    df.dropna(subset=["Sanction_Context"] + LABEL_COLUMNS, inplace=True)

    print("🧠 Loading vectorizer, models, encoders...")
    # NOTE(review): pickle.load executes arbitrary code — only load trusted artifacts.
    tfidf = load_pickle(VECTORIZER_PATH)
    models = load_pickle(MODELS_PATH)
    label_encoders = load_pickle(ENCODERS_PATH)

    print("📊 TF-IDF transforming text...")
    def concat_text(row):
        # Join every available text field into one newline-separated document,
        # skipping columns that are absent or NaN for this row.
        return "\n".join([str(row[col]) for col in TEXT_COLUMNS if col in row and pd.notna(row[col])])

    # NOTE(review): several LABEL_COLUMNS are also in TEXT_COLUMNS, so the
    # targets leak into the features; the reports below will be optimistic.
    df["combined_text"] = df.apply(concat_text, axis=1)
    X = tfidf.transform(df["combined_text"])

    for label in LABEL_COLUMNS:
        print(f"\n🔍 Validating: {label}")
        encoder = label_encoders[label]

        # Fix: encoder.transform raises ValueError on label values never seen
        # during training, which previously aborted the whole run. Keep only
        # rows whose label the encoder knows.
        known_mask = df[label].isin(set(encoder.classes_))
        y_raw = df.loc[known_mask, label]
        y_encoded = encoder.transform(y_raw)
        X_label = X[known_mask.to_numpy()]

        # Hold-out split; uses the same seed/size as training config.
        _, X_test, _, y_test = train_test_split(
            X_label, y_encoded, test_size=TEST_SIZE, random_state=RANDOM_STATE
        )

        model = models[label]
        y_pred = model.predict(X_test)

        print(classification_report(
            encoder.inverse_transform(y_test),
            encoder.inverse_transform(y_pred)
        ))
73
+
74
# Run the validation pass when executed as a script (not on import).
if __name__ == "__main__":
    validate()