Spaces:
Sleeping
Sleeping
Create validate.py
Browse files- validate.py +75 -0
validate.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import pickle
|
| 3 |
+
import os
|
| 4 |
+
from sklearn.metrics import classification_report
|
| 5 |
+
from sklearn.model_selection import train_test_split
|
| 6 |
+
|
| 7 |
+
# === File Paths ===
DATA_PATH = "data.csv"  # Change this to your actual CSV

# Columns whose values are concatenated into one free-text blob per row and
# fed to the saved TF-IDF vectorizer.
# NOTE(review): several of these also appear in LABEL_COLUMNS below
# (Red_Flag_Reason, Maker_Action, Risk_Category, Risk_Drivers,
# Investigation_Outcome), so the prediction targets leak verbatim into the
# input features — confirm this matches the training pipeline on purpose.
TEXT_COLUMNS = [
    "Transaction_Id", "Origin", "Designation", "Keywords", "Name", "SWIFT_Tag", "Currency",
    "Entity", "Message", "City", "Country", "State", "Hit_Type", "Record_Matching_String",
    "WatchList_Match_String", "Payment_Sender_Name", "Payment_Reciever_Name", "Swift_Message_Type",
    "Text_Sanction_Data", "Matched_Sanctioned_Entity", "Red_Flag_Reason", "Risk_Level",
    "Risk_Score", "CDD_Level", "PEP_Status", "Sanction_Description", "Checker_Notes",
    "Sanction_Context", "Maker_Action", "Customer_Type", "Industry", "Transaction_Type",
    "Transaction_Channel", "Geographic_Origin", "Geographic_Destination", "Risk_Category",
    "Risk_Drivers", "Alert_Status", "Investigation_Outcome", "Source_Of_Funds",
    "Purpose_Of_Transaction", "Beneficial_Owner"
]

# Target columns — validate() evaluates one saved model per entry.
LABEL_COLUMNS = [
    "Red_Flag_Reason", "Maker_Action", "Escalation_Level",
    "Risk_Category", "Risk_Drivers", "Investigation_Outcome"
]

# Pickled artifacts produced by the training run.
VECTORIZER_PATH = "tfidf_vectorizer (2).pkl"   # fitted TF-IDF vectorizer
MODELS_PATH = "xgb_model.pkl"                  # indexed per label in validate()
ENCODERS_PATH = "label_encoders (5).pkl"       # indexed per label in validate()

# Split parameters — presumably the same values used at training time, so the
# recreated test fold matches the one the models never saw; TODO confirm.
RANDOM_STATE = 42
TEST_SIZE = 0.2
| 32 |
+
|
| 33 |
+
# === Utils ===
def load_pickle(path):
    """Deserialize and return the object stored in the pickle file at *path*.

    NOTE: unpickling executes arbitrary code — only load trusted artifact files.
    """
    with open(path, "rb") as handle:
        obj = pickle.load(handle)
    return obj
| 37 |
+
|
| 38 |
+
# === Main ===
def validate():
    """Re-score the saved model for every label and print a classification report.

    Loads the CSV at DATA_PATH, rebuilds the TF-IDF feature matrix with the
    saved vectorizer, then for each label in LABEL_COLUMNS recreates the
    train/test split and reports the saved model's performance on the test fold.
    """
    print("📥 Loading data...")
    df = pd.read_csv(DATA_PATH)
    # Drop rows missing the context text or any target, so every label's
    # encoder/model is evaluated on the same rows of the same frame.
    df.dropna(subset=["Sanction_Context"] + LABEL_COLUMNS, inplace=True)

    print("🧠 Loading vectorizer, models, encoders...")
    tfidf = load_pickle(VECTORIZER_PATH)
    models = load_pickle(MODELS_PATH)            # mapping: label -> fitted model
    label_encoders = load_pickle(ENCODERS_PATH)  # mapping: label -> fitted encoder

    print("📊 TF-IDF transforming text...")
    def concat_text(row):
        # Newline-join every non-null text field; this must mirror the
        # concatenation used at training time for the vectorizer to be valid
        # — TODO confirm against the training script.
        return "\n".join([str(row[col]) for col in TEXT_COLUMNS if col in row and pd.notna(row[col])])

    # NOTE(review): TEXT_COLUMNS includes several LABEL_COLUMNS, so targets
    # appear verbatim in the input text (label leakage) — verify intentional.
    df["combined_text"] = df.apply(concat_text, axis=1)
    X = tfidf.transform(df["combined_text"])

    for label in LABEL_COLUMNS:
        print(f"\n🔍 Validating: {label}")
        y_raw = df[label]
        encoder = label_encoders[label]
        # transform() will raise on categories unseen during fitting — assumes
        # the CSV's label values match the encoder's vocabulary.
        y_encoded = encoder.transform(y_raw)

        # Recreate the split; yields the same test fold as training only if
        # the identical data, TEST_SIZE and RANDOM_STATE were used there.
        _, X_test, _, y_test = train_test_split(
            X, y_encoded, test_size=TEST_SIZE, random_state=RANDOM_STATE
        )

        model = models[label]
        y_pred = model.predict(X_test)

        # Report per-class precision/recall/F1 on the original string labels.
        print(classification_report(
            encoder.inverse_transform(y_test),
            encoder.inverse_transform(y_pred)
        ))
| 73 |
+
|
| 74 |
+
# Script entry point — run the validation pass when executed directly.
if __name__ == "__main__":
    validate()