Spaces:
Sleeping
Sleeping
Create validate.py
Browse files- validate.py +75 -0
validate.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import pickle
|
| 3 |
+
import os
|
| 4 |
+
from sklearn.metrics import classification_report
|
| 5 |
+
from sklearn.model_selection import train_test_split
|
| 6 |
+
|
| 7 |
+
# === File Paths ===
DATA_PATH = "data.csv"  # Change this to your actual CSV

# Columns whose values are concatenated into one free-text blob per row and
# fed to the saved TF-IDF vectorizer.
# NOTE(review): several of these also appear in LABEL_COLUMNS below
# (Red_Flag_Reason, Maker_Action, Risk_Category, Risk_Drivers,
# Investigation_Outcome), so the prediction targets leak verbatim into the
# input features — confirm this matches the training pipeline on purpose.
TEXT_COLUMNS = [
    "Transaction_Id", "Origin", "Designation", "Keywords", "Name", "SWIFT_Tag", "Currency",
    "Entity", "Message", "City", "Country", "State", "Hit_Type", "Record_Matching_String",
    "WatchList_Match_String", "Payment_Sender_Name", "Payment_Reciever_Name", "Swift_Message_Type",
    "Text_Sanction_Data", "Matched_Sanctioned_Entity", "Red_Flag_Reason", "Risk_Level",
    "Risk_Score", "CDD_Level", "PEP_Status", "Sanction_Description", "Checker_Notes",
    "Sanction_Context", "Maker_Action", "Customer_Type", "Industry", "Transaction_Type",
    "Transaction_Channel", "Geographic_Origin", "Geographic_Destination", "Risk_Category",
    "Risk_Drivers", "Alert_Status", "Investigation_Outcome", "Source_Of_Funds",
    "Purpose_Of_Transaction", "Beneficial_Owner"
]

# Target columns — validate() evaluates one saved model per entry.
LABEL_COLUMNS = [
    "Red_Flag_Reason", "Maker_Action", "Escalation_Level",
    "Risk_Category", "Risk_Drivers", "Investigation_Outcome"
]

# Pickled artifacts produced by the training run.
VECTORIZER_PATH = "tfidf_vectorizer (2).pkl"   # fitted TF-IDF vectorizer
MODELS_PATH = "xgb_model.pkl"                  # indexed per label in validate()
ENCODERS_PATH = "label_encoders (5).pkl"       # indexed per label in validate()

# Split parameters — presumably the same values used at training time, so the
# recreated test fold matches the one the models never saw; TODO confirm.
RANDOM_STATE = 42
TEST_SIZE = 0.2
| 32 |
+
|
| 33 |
+
# === Utils ===
def load_pickle(path):
    """Deserialize and return the object stored in the pickle file at *path*.

    NOTE: unpickling executes arbitrary code — only load trusted artifact files.
    """
    with open(path, "rb") as handle:
        obj = pickle.load(handle)
    return obj
| 37 |
+
|
| 38 |
+
# === Main ===
def validate():
    """Re-score the saved model for every label and print a classification report.

    Loads the CSV at DATA_PATH, rebuilds the TF-IDF feature matrix with the
    saved vectorizer, then for each label in LABEL_COLUMNS recreates the
    train/test split and reports the saved model's performance on the test fold.
    """
    print("📥 Loading data...")
    df = pd.read_csv(DATA_PATH)
    # Drop rows missing the context text or any target, so every label's
    # encoder/model is evaluated on the same rows of the same frame.
    df.dropna(subset=["Sanction_Context"] + LABEL_COLUMNS, inplace=True)

    print("🧠 Loading vectorizer, models, encoders...")
    tfidf = load_pickle(VECTORIZER_PATH)
    models = load_pickle(MODELS_PATH)            # mapping: label -> fitted model
    label_encoders = load_pickle(ENCODERS_PATH)  # mapping: label -> fitted encoder

    print("📊 TF-IDF transforming text...")
    def concat_text(row):
        # Newline-join every non-null text field; this must mirror the
        # concatenation used at training time for the vectorizer to be valid
        # — TODO confirm against the training script.
        return "\n".join([str(row[col]) for col in TEXT_COLUMNS if col in row and pd.notna(row[col])])

    # NOTE(review): TEXT_COLUMNS includes several LABEL_COLUMNS, so targets
    # appear verbatim in the input text (label leakage) — verify intentional.
    df["combined_text"] = df.apply(concat_text, axis=1)
    X = tfidf.transform(df["combined_text"])

    for label in LABEL_COLUMNS:
        print(f"\n🔍 Validating: {label}")
        y_raw = df[label]
        encoder = label_encoders[label]
        # transform() will raise on categories unseen during fitting — assumes
        # the CSV's label values match the encoder's vocabulary.
        y_encoded = encoder.transform(y_raw)

        # Recreate the split; yields the same test fold as training only if
        # the identical data, TEST_SIZE and RANDOM_STATE were used there.
        _, X_test, _, y_test = train_test_split(
            X, y_encoded, test_size=TEST_SIZE, random_state=RANDOM_STATE
        )

        model = models[label]
        y_pred = model.predict(X_test)

        # Report per-class precision/recall/F1 on the original string labels.
        print(classification_report(
            encoder.inverse_transform(y_test),
            encoder.inverse_transform(y_pred)
        ))
| 73 |
+
|
| 74 |
+
# Script entry point — run the validation pass when executed directly.
if __name__ == "__main__":
    validate()