Spaces:

point9
/

LOGREG_TTV

Sleeping

App Files Files Community

ganeshkonapalli commited on Jun 17, 2025

Commit

c778365

verified ·

1 Parent(s): 4281e33

Create train_test_validate.py

Browse files

Files changed (1) hide show

train_test_validate.py +242 -0

train_test_validate.py ADDED Viewed

	@@ -0,0 +1,242 @@

+import os
+import json
+import joblib
+import requests
+import pandas as pd
+from typing import List
+from sklearn.model_selection import train_test_split
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.multioutput import MultiOutputClassifier
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import LabelEncoder
+from sklearn.linear_model import LogisticRegression
+from pydantic import BaseModel, ValidationError
+import argparse
+# --- CONFIG ---
+DATA_PATH = "data.csv"
+TEXT_COLUMN = "Sanction_Context"
+LABEL_COLUMNS = [
+    "Red_Flag_Reason", "Maker_Action", "Escalation_Level",
+    "Risk_Category", "Risk_Drivers", "Investigation_Outcome"
+]
+MODEL_SAVE_DIR = "models"
+LABEL_ENCODERS_PATH = os.path.join(MODEL_SAVE_DIR, "label_encoders.pkl")
+TFIDF_MAX_FEATURES = 1000
+NGRAM_RANGE = (1, 2)
+USE_STOPWORDS = True
+RANDOM_STATE = 42
+TEST_SIZE = 0.2
+API_URL = "https://your-hf-api-url.hf.space/predict"  # Replace with actual URL
+os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
+# --- Pydantic schema ---
+class TransactionData(BaseModel):
+    Transaction_Id: str
+    Hit_Seq: int
+    Hit_Id_List: str
+    Origin: str
+    Designation: str
+    Keywords: str
+    Name: str
+    SWIFT_Tag: str
+    Currency: str
+    Entity: str
+    Message: str
+    City: str
+    Country: str
+    State: str
+    Hit_Type: str
+    Record_Matching_String: str
+    WatchList_Match_String: str
+    Payment_Sender_Name: str
+    Payment_Reciever_Name: str
+    Swift_Message_Type: str
+    Text_Sanction_Data: str
+    Matched_Sanctioned_Entity: str
+    Is_Match: int
+    Red_Flag_Reason: str
+    Risk_Level: str
+    Risk_Score: float
+    Risk_Score_Description: str
+    CDD_Level: str
+    PEP_Status: str
+    Value_Date: str
+    Last_Review_Date: str
+    Next_Review_Date: str
+    Sanction_Description: str
+    Checker_Notes: str
+    Sanction_Context: str
+    Maker_Action: str
+    Customer_ID: int
+    Customer_Type: str
+    Industry: str
+    Transaction_Date_Time: str
+    Transaction_Type: str
+    Transaction_Channel: str
+    Originating_Bank: str
+    Beneficiary_Bank: str
+    Geographic_Origin: str
+    Geographic_Destination: str
+    Match_Score: float
+    Match_Type: str
+    Sanctions_List_Version: str
+    Screening_Date_Time: str
+    Risk_Category: str
+    Risk_Drivers: str
+    Alert_Status: str
+    Investigation_Outcome: str
+    Case_Owner_Analyst: str
+    Escalation_Level: str
+    Escalation_Date: str
+    Regulatory_Reporting_Flags: bool
+    Audit_Trail_Timestamp: str
+    Source_Of_Funds: str
+    Purpose_Of_Transaction: str
+    Beneficial_Owner: str
+    Sanctions_Exposure_History: bool
+# --- Train function ---
+def train_pipeline():
+    print("📥 Loading dataset...")
+    df = pd.read_csv(DATA_PATH)
+    df.dropna(subset=[TEXT_COLUMN] + LABEL_COLUMNS, inplace=True)
+    label_encoders = {}
+    for col in LABEL_COLUMNS:
+        le = LabelEncoder()
+        df[col] = le.fit_transform(df[col])
+        label_encoders[col] = le
+    X = df[TEXT_COLUMN]
+    Y = df[LABEL_COLUMNS]
+    print("✂️ Splitting train/test...")
+    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=TEST_SIZE, random_state=RANDOM_STATE)
+    print("🔧 Building pipeline with Logistic Regression...")
+    stop_words = "english" if USE_STOPWORDS else None
+    pipeline = Pipeline([
+        ('tfidf', TfidfVectorizer(max_features=TFIDF_MAX_FEATURES, ngram_range=NGRAM_RANGE, stop_words=stop_words)),
+        ('clf', MultiOutputClassifier(LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)))
+    ])
+    print("🏋️ Training...")
+    pipeline.fit(X_train, y_train)
+    model_path = os.path.join(MODEL_SAVE_DIR, "logreg_model.pkl")
+    print(f"💾 Saving model to {model_path}")
+    joblib.dump(pipeline, model_path)
+    print(f"💾 Saving label encoders to {LABEL_ENCODERS_PATH}")
+    joblib.dump(label_encoders, LABEL_ENCODERS_PATH)
+    tfidf_path = os.path.join(MODEL_SAVE_DIR, "tfidf_vectorizer.pkl")
+    joblib.dump(pipeline.named_steps["tfidf"], tfidf_path)
+    print("✅ Training complete.")
+# --- Input Validator ---
+def validate_sample_input(sample_input):
+    try:
+        validated = TransactionData(**sample_input)
+        print("✅ Input is valid.")
+    except ValidationError as e:
+        print("❌ Validation error:")
+        print(e.json(indent=2))
+# --- API Test ---
+def test_api(sample_payload):
+    headers = {"Content-Type": "application/json"}
+    print(f"🚀 Posting to {API_URL}")
+    response = requests.post(API_URL, headers=headers, data=json.dumps(sample_payload))
+    print("📥 Status Code:", response.status_code)
+    try:
+        print("📤 Response:", json.dumps(response.json(), indent=2))
+    except Exception as e:
+        print("❌ Failed to parse response:", str(e))
+# --- Sample Payload (unchanged) ---
+sample_payload = {
+    "transaction_data": {
+        "Transaction_Id": "TXN12345",
+        "Hit_Seq": 1,
+        "Hit_Id_List": "HIT789",
+        "Origin": "India",
+        "Designation": "Manager",
+        "Keywords": "fraud",
+        "Name": "John Doe",
+        "SWIFT_Tag": "TAG001",
+        "Currency": "INR",
+        "Entity": "ABC Ltd",
+        "Message": "Payment for services",
+        "City": "Hyderabad",
+        "Country": "India",
+        "State": "Telangana",
+        "Hit_Type": "Individual",
+        "Record_Matching_String": "John Doe",
+        "WatchList_Match_String": "Doe, John",
+        "Payment_Sender_Name": "John Doe",
+        "Payment_Reciever_Name": "Jane Smith",
+        "Swift_Message_Type": "MT103",
+        "Text_Sanction_Data": "Suspicious transfer to offshore account",
+        "Matched_Sanctioned_Entity": "John Doe",
+        "Is_Match": 1,
+        "Red_Flag_Reason": "High value transaction",
+        "Risk_Level": "High",
+        "Risk_Score": 87.5,
+        "Risk_Score_Description": "Very High",
+        "CDD_Level": "Enhanced",
+        "PEP_Status": "Yes",
+        "Value_Date": "2023-01-01",
+        "Last_Review_Date": "2023-06-01",
+        "Next_Review_Date": "2024-06-01",
+        "Sanction_Description": "OFAC List",
+        "Checker_Notes": "Urgent check required",
+        "Sanction_Context": "Payment matched with OFAC entry",
+        "Maker_Action": "Escalate",
+        "Customer_ID": 1001,
+        "Customer_Type": "Corporate",
+        "Industry": "Finance",
+        "Transaction_Date_Time": "2023-12-15T10:00:00",
+        "Transaction_Type": "Credit",
+        "Transaction_Channel": "Online",
+        "Originating_Bank": "ABC Bank",
+        "Beneficiary_Bank": "XYZ Bank",
+        "Geographic_Origin": "India",
+        "Geographic_Destination": "USA",
+        "Match_Score": 96.2,
+        "Match_Type": "Exact",
+        "Sanctions_List_Version": "2023-V5",
+        "Screening_Date_Time": "2023-12-15T09:55:00",
+        "Risk_Category": "Sanctions",
+        "Risk_Drivers": "PEP, High Value",
+        "Alert_Status": "Open",
+        "Investigation_Outcome": "Pending",
+        "Case_Owner_Analyst": "analyst1",
+        "Escalation_Level": "L2",
+        "Escalation_Date": "2023-12-16",
+        "Regulatory_Reporting_Flags": True,
+        "Audit_Trail_Timestamp": "2023-12-15T10:05:00",
+        "Source_Of_Funds": "Corporate Account",
+        "Purpose_Of_Transaction": "Service Payment",
+        "Beneficial_Owner": "John Doe",
+        "Sanctions_Exposure_History": False
+    }
+}
+# --- Main Entry ---
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--train", action="store_true", help="Train the model")
+    parser.add_argument("--validate", action="store_true", help="Validate sample input")
+    parser.add_argument("--test", action="store_true", help="Test prediction API")
+    args = parser.parse_args()
+    if args.train:
+        train_pipeline()
+    if args.validate:
+        validate_sample_input(sample_payload["transaction_data"])
+    if args.test:
+        test_api(sample_payload)