{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[{"file_id":"18nXuMYjrxwX-viL5rRN1wqryrVBQymqO","timestamp":1755060968785},{"file_id":"1qyHIahzNxJaA2R8v0eLU6Oz916q_ClEd","timestamp":1754580111827}],"gpuType":"T4","collapsed_sections":["7ZAjRBVfieVb","dMzAIFLoiMkH","v6Icoc935YEM","ijEICHy37mAS","bu8AEbpQiWqh","RvxGHyn2gtoq","NoPz1HGTXyIn","PHtAoA-mhb_Y"],"authorship_tag":"ABX9TyNFzZdE1nHewkTFyc0AshJ6"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"},"accelerator":"GPU"},"cells":[{"cell_type":"markdown","source":["# Import master data and conditions files"],"metadata":{"id":"7ZAjRBVfieVb"}},{"cell_type":"code","execution_count":null,"metadata":{"id":"_4KJEzFBzlTY"},"outputs":[],"source":["#Import the necessary libraries\n","import pandas as pd\n","import numpy as np\n","import random"]},{"cell_type":"code","source":["#Install Faker to enable generation of random customer names\n","!pip install faker"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"XC7I8W0z1HXi","executionInfo":{"status":"ok","timestamp":1755093291359,"user_tz":-330,"elapsed":6086,"user":{"displayName":"Babu Seshiah","userId":"00614315087314266727"}},"outputId":"b8776ca6-7e90-41f1-e695-2f0e8b55a321"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Collecting faker\n"," Downloading faker-37.5.3-py3-none-any.whl.metadata (15 kB)\n","Requirement already satisfied: tzdata in /usr/local/lib/python3.11/dist-packages (from faker) (2025.2)\n","Downloading faker-37.5.3-py3-none-any.whl (1.9 MB)\n","\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/1.9 MB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.7/1.9 MB\u001b[0m \u001b[31m22.8 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.9/1.9 MB\u001b[0m \u001b[31m33.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hInstalling collected packages: faker\n","Successfully installed faker-37.5.3\n"]}]},{"cell_type":"code","source":["from faker import Faker\n","faker = Faker()\n","Faker.seed(42)"],"metadata":{"id":"gXG07tuR00Wb"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# Set the path to the master data file and conditions file\n","master_data_path = \"MasterData.xlsx\"\n","conditions_data_path = \"Conditions.xlsx\"\n","\n","# Load all sheets in both the files\n","master_data = pd.read_excel(master_data_path, sheet_name=None)\n","conditions_data = pd.read_excel(conditions_data_path, sheet_name=None)\n","\n","# Print the sheet names\n","print(\"Master Data Sheets:\", list(master_data.keys()))\n","print(\"Conditions Data Sheets:\", list(conditions_data.keys()))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"rHcJMebjztxh","executionInfo":{"status":"ok","timestamp":1755093308518,"user_tz":-330,"elapsed":424,"user":{"displayName":"Babu Seshiah","userId":"00614315087314266727"}},"outputId":"afa26325-4689-4216-c013-c8aa394f3179"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Master Data Sheets: ['Customer Information', 'Account Information', 'Transaction data', 'Business', 'Geography', 'Customer Type', 'Product', 'Channels', 'TranType', 'Currency', 'watch list data']\n","Conditions Data Sheets: ['Prod-TranType Mapping', 'Channe-TranType Mapping']\n"]}]},{"cell_type":"code","source":["#Extract each masterdata excel sheet into a separate dataframe\n","business_master = master_data[\"Business\"]\n","currency_master = master_data[\"Currency\"]\n","geo_master = master_data[\"Geography\"]\n","ctype_master = master_data[\"Customer Type\"]\n","prod_master = master_data[\"Product\"]\n","chann_master = master_data[\"Channels\"]\n","tran_master = master_data[\"TranType\"]"],"metadata":{"id":"xOnEasfg0FLp"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["#Check the business_master dataframe\n","business_master"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":833},"collapsed":true,"id":"KPczQnGSTtoj","executionInfo":{"status":"ok","timestamp":1755070641632,"user_tz":-330,"elapsed":152,"user":{"displayName":"Babu Seshiah","userId":"00614315087314266727"}},"outputId":"7625377e-b2bd-4d32-cd01-d2db26257058"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" BusinessCode BusinessDesc RiskRating\n","0 1001 Food LR\n","1 1002 Textiles LR\n","2 1003 Machinary and Equipment LR\n","3 1004 Chemicals LR\n","4 1005 Electronics LR\n","5 1006 Metal & Metal Products LR\n","6 1007 Retail LR\n","7 1008 Information Tech LR\n","8 1009 Construction LR\n","9 1010 Transport MR\n","10 1011 Oil and Gas LR\n","11 1012 Hotels MR\n","12 1013 Financial Institutions LR\n","13 1014 Casinos & Gambling HR\n","14 1015 Real Estate LR\n","15 1016 Precious Metals & Stones LR\n","16 1017 NA - Individual LR\n","17 1018 Steel & Alloys LR\n","18 1019 Heavy Metal LR\n","19 1020 Press Products LR\n","20 1021 Auto Ancillaries LR\n","21 1022 Automobiles LR\n","22 1023 Startups MR\n","23 1024 Hospitality LR\n","24 1025 Jewellery LR"],"text/html":["\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
BusinessCodeBusinessDescRiskRating
01001FoodLR
11002TextilesLR
21003Machinary and EquipmentLR
31004ChemicalsLR
41005ElectronicsLR
51006Metal & Metal ProductsLR
61007RetailLR
71008Information TechLR
81009ConstructionLR
91010TransportMR
101011Oil and GasLR
111012HotelsMR
121013Financial InstitutionsLR
131014Casinos & GamblingHR
141015Real EstateLR
151016Precious Metals & StonesLR
161017NA - IndividualLR
171018Steel & AlloysLR
181019Heavy MetalLR
191020Press ProductsLR
201021Auto AncillariesLR
211022AutomobilesLR
221023StartupsMR
231024HospitalityLR
241025JewelleryLR
\n","
\n","
\n","\n","
\n"," \n","\n"," \n","\n"," \n","
\n","\n","\n","
\n"," \n","\n","\n","\n"," \n","
\n","\n","
\n"," \n"," \n"," \n","
\n","\n","
\n","
\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","variable_name":"business_master","summary":"{\n \"name\": \"business_master\",\n \"rows\": 25,\n \"fields\": [\n {\n \"column\": \"BusinessCode\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 7,\n \"min\": 1001,\n \"max\": 1025,\n \"num_unique_values\": 25,\n \"samples\": [\n 1009,\n 1017,\n 1001\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"BusinessDesc\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 25,\n \"samples\": [\n \"Construction \",\n \"NA - Individual \",\n \"Food\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"RiskRating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"LR\",\n \"MR\",\n \"HR\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"}},"metadata":{},"execution_count":6}]},{"cell_type":"code","source":["#Check the geo_master dataframe\n","geo_master"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":582},"collapsed":true,"id":"7DQqTgvOUmcb","executionInfo":{"status":"ok","timestamp":1755070645108,"user_tz":-330,"elapsed":83,"user":{"displayName":"Babu Seshiah","userId":"00614315087314266727"}},"outputId":"572169a2-7cdd-4f1f-e63c-c9c85c7c8db3"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" GeoCode GeoDesc RiskRating\n","0 CAN Canada LR\n","1 AUT Austria LR\n","2 COL Colambia LR\n","3 BRA Brazil LR\n","4 NPL Nepal HR\n","5 IND India LR\n","6 USD United States of America LR\n","7 ARE UAE LR\n","8 AUS Australia LR\n","9 HKG Hongkong LR\n","10 GBR United Kingdom LR\n","11 KEN Kenya MR\n","12 ZAF South Africa MR\n","13 ARG Argentina LR\n","14 BHR Bahrain LR\n","15 CHN China LR\n","16 DNK Denmark LR"],"text/html":["\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
GeoCodeGeoDescRiskRating
0CANCanadaLR
1AUTAustriaLR
2COLColambiaLR
3BRABrazilLR
4NPLNepalHR
5INDIndiaLR
6USDUnited States of AmericaLR
7AREUAELR
8AUSAustraliaLR
9HKGHongkongLR
10GBRUnited KingdomLR
11KENKenyaMR
12ZAFSouth AfricaMR
13ARGArgentinaLR
14BHRBahrainLR
15CHNChinaLR
16DNKDenmarkLR
\n","
\n","
\n","\n","
\n"," \n","\n"," \n","\n"," \n","
\n","\n","\n","
\n"," \n","\n","\n","\n"," \n","
\n","\n","
\n"," \n"," \n"," \n","
\n","\n","
\n","
\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","variable_name":"geo_master","summary":"{\n \"name\": \"geo_master\",\n \"rows\": 17,\n \"fields\": [\n {\n \"column\": \"GeoCode\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 17,\n \"samples\": [\n \"CAN\",\n \"AUT\",\n \"IND\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"GeoDesc\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 17,\n \"samples\": [\n \"Canada\",\n \"Austria\",\n \"India\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"RiskRating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"LR\",\n \"HR\",\n \"MR\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"}},"metadata":{},"execution_count":7}]},{"cell_type":"markdown","source":["# Generate customer data"],"metadata":{"id":"dMzAIFLoiMkH"}},{"cell_type":"code","source":["#Define a function to ensure the synthetic customer generation has a realistic mix of High Risk, Med Risk and Low risk master data (Business Type, Geography, Customer Type)\n","#Set thresholds for the same High Risk-0.5%, Med Risk-1% and Low risk-98.5% for Business type for example\n","def sample_with_risk_distribution(master_df, risk_col, code_col, target_dist, total_needed):\n"," samples = []\n","\n"," for risk_level, fraction in target_dist.items():\n"," subset = master_df[master_df[risk_col] == risk_level]\n"," n_samples = int(total_needed * fraction)\n","\n"," if len(subset) == 0:\n"," continue # Skip if no rows with this risk rating\n","\n"," sampled = subset.sample(n=n_samples, replace=(n_samples > len(subset)))\n"," samples.append(sampled[[code_col]])\n","\n"," return pd.concat(samples, ignore_index=True)\n","\n","# Desired distributions\n","business_risk_dist = {\"HR\": 0.005, \"MR\": 0.01, \"LR\": 0.985}\n","geo_risk_dist = {\"HR\": 0.005, \"MR\": 0.01, \"LR\": 0.985}\n","cust_type_risk_dist = {\"HR\": 0.005, \"MR\": 0.01, \"LR\": 0.985}\n","\n","num_customers = 400\n","\n","# Sample for business\n","business_sample = sample_with_risk_distribution(\n"," business_master, \"RiskRating\", \"BusinessCode\", business_risk_dist, num_customers\n",")\n","\n","geo_sample = sample_with_risk_distribution(\n"," geo_master, \"RiskRating\", \"GeoCode\", geo_risk_dist, num_customers\n",")\n","\n","cust_type_sample = sample_with_risk_distribution(\n"," ctype_master, \"RiskRating\", \"CustomerTypeCode\", cust_type_risk_dist, num_customers)\n"],"metadata":{"id":"2Hi_3OUj4Xu_"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# Generate 400 customers\n","# Add a meaningful suffix with the names of the customers based on customer type\n","customers = []\n","for i in range(400):\n"," cust_type = cust_type_sample.iloc[i % len(cust_type_sample)][\"CustomerTypeCode\"]\n"," name = faker.name()\n"," suffix_map = {\n"," \"PUBL\": \" Pub Ltd Company\",\n"," \"PVTL\": \" Pvt Ltd Company\",\n"," \"GOVT\": \" Govt Co\",\n"," \"SPRF\": \" Sole Prop firm\",\n"," \"PART\": \" and Partners\",\n"," \"LLPF\": \" Limited LP\",\n"," \"TRST\": \" Trust Co\",\n"," \"CLSO\": \" Club\",\n"," \"NBFC\": \" Non-Banking Fin Co\",\n"," \"BFIS\": \" Banking Ltd\",\n"," \"SGOV\": \" State Gov Co\",\n"," \"ASSO\": \" & Association\",\n"," \"MBNK\": \" Banking Ltd\",\n"," \"NGOS\": \" Non-Govt Org\",\n"," \"FCOM\": \" Foreign Co\"\n"," }\n"," suffix = suffix_map.get(cust_type, \"\")\n"," name += suffix\n"," cust = {\n"," \"CustomerID\": f\"CUST{i+1:04}\",\n"," \"Name\": name,\n"," \"PEP_Flag\": 'Y' if np.random.rand() < 0.02 else 'N',\n"," \"Business\": business_sample.iloc[i % len(business_sample)][\"BusinessCode\"],\n"," \"Geography\": geo_sample.iloc[i % len(geo_sample)][\"GeoCode\"],\n"," \"CustomerType\": cust_type\n"," }\n"," customers.append(cust)\n","\n","\n","df_customers = pd.DataFrame(customers)"],"metadata":{"id":"ksUAP28dgi5q"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["#Check the structure of the customers dataframe\n","df_customers.info()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"3wt73AVy5KK3","executionInfo":{"status":"ok","timestamp":1755093334118,"user_tz":-330,"elapsed":20,"user":{"displayName":"Babu Seshiah","userId":"00614315087314266727"}},"outputId":"b09b3021-44b1-4f36-d7ec-8f32904a0f10"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["\n","RangeIndex: 400 entries, 0 to 399\n","Data columns (total 6 columns):\n"," # Column Non-Null Count Dtype \n","--- ------ -------------- ----- \n"," 0 CustomerID 400 non-null object\n"," 1 Name 400 non-null object\n"," 2 PEP_Flag 400 non-null object\n"," 3 Business 400 non-null int64 \n"," 4 Geography 400 non-null object\n"," 5 CustomerType 400 non-null object\n","dtypes: int64(1), object(5)\n","memory usage: 18.9+ KB\n"]}]},{"cell_type":"code","source":["#Check the sample data in the customers dataframe\n","df_customers.head()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"id":"uJWJK-Z05NZ8","executionInfo":{"status":"ok","timestamp":1755093339650,"user_tz":-330,"elapsed":82,"user":{"displayName":"Babu Seshiah","userId":"00614315087314266727"}},"outputId":"6be32658-db4a-4f5c-f5c7-1a165289ac4b"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" CustomerID Name PEP_Flag Business Geography \\\n","0 CUST0001 Allison Hill Club N 1014 NPL \n","1 CUST0002 Noah Rhodes Club N 1014 NPL \n","2 CUST0003 Angie Henderson Sole Prop firm N 1012 KEN \n","3 CUST0004 Daniel Wagner Sole Prop firm N 1012 ZAF \n","4 CUST0005 Cristian Santos Sole Prop firm N 1010 KEN \n","\n"," CustomerType \n","0 CLSO \n","1 CLSO \n","2 SPRF \n","3 SPRF \n","4 SPRF "],"text/html":["\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
CustomerIDNamePEP_FlagBusinessGeographyCustomerType
0CUST0001Allison Hill ClubN1014NPLCLSO
1CUST0002Noah Rhodes ClubN1014NPLCLSO
2CUST0003Angie Henderson Sole Prop firmN1012KENSPRF
3CUST0004Daniel Wagner Sole Prop firmN1012ZAFSPRF
4CUST0005Cristian Santos Sole Prop firmN1010KENSPRF
\n","
\n","
\n","\n","
\n"," \n","\n"," \n","\n"," \n","
\n","\n","\n","
\n"," \n","\n","\n","\n"," \n","
\n","\n","
\n","
\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","variable_name":"df_customers","summary":"{\n \"name\": \"df_customers\",\n \"rows\": 400,\n \"fields\": [\n {\n \"column\": \"CustomerID\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 400,\n \"samples\": [\n \"CUST0210\",\n \"CUST0281\",\n \"CUST0034\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 400,\n \"samples\": [\n \"Brian Lee Pvt Ltd Company\",\n \"Amanda Jones Trust Co\",\n \"Nathan Maldonado Trust Co\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PEP_Flag\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"Y\",\n \"N\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Business\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 7,\n \"min\": 1001,\n \"max\": 1025,\n \"num_unique_values\": 25,\n \"samples\": [\n 1021,\n 1004\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Geography\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 17,\n \"samples\": [\n \"NPL\",\n \"KEN\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"CustomerType\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 17,\n \"samples\": [\n \"CLSO\",\n \"SPRF\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"}},"metadata":{},"execution_count":9}]},{"cell_type":"code","source":["#Download the synthetic customers dataframe to a csv file\n","df_customers.to_csv(\"customers.csv\", index=False)"],"metadata":{"id":"-nCF_rpA3SJx"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["# Generate account Data"],"metadata":{"id":"v6Icoc935YEM"}},{"cell_type":"code","source":["#Define a function to ensure the synthetic accounts data generation has a realistic mix of High Risk, Med Risk and Low risk master data (Product Type and Channel)\n","#Set thresholds for the same High Risk-1%, Med Risk-2% and Low risk-96% for Product for example\n","product_risk_dist = {\"HR\": 0.01, \"MR\": 0.02, \"LR\": 0.97}\n","currency_risk_dist = {\"HR\": 0.005, \"MR\": 0.01, \"LR\": 0.985}\n","num_accounts = 1000\n","\n","# Sample products and currencies as per risk split\n","product_sample = sample_with_risk_distribution(\n","prod_master, \"RiskRating\", \"ProductCode\", product_risk_dist, num_accounts\n",")\n","\n","currency_sample = sample_with_risk_distribution(\n"," currency_master, \"RiskRating\", \"CurrencyCode\", currency_risk_dist, num_accounts\n",")"],"metadata":{"id":"Sa8HqStp5bA-"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["#Define thresholds for synthetic generation based on currency, for example 70% of the accounts to be in USD currency, to reflect a realistic composition of accounts\n","num_accounts = 1000\n","num_customers = len(df_customers)\n","assert num_accounts >= num_customers, \"Number of accounts must be >= number of customers\"\n","\n","# Phase 1: One account per customer\n","accounts = []\n","\n","currency_list = currency_master[\"CurrencyCode\"].tolist()\n","\n","explicit_weights = {\n"," \"USD\": 0.7,\n"," \"EUR\": 0.1,\n"," \"GBP\": 0.05,\n"," \"CAD\": 0.03,\n"," \"INR\": 0.02,\n"," \"JPY\": 0.02,\n"," \"AUD\": 0.02,\n","}\n","\n","def choose_currency_weighted(currency_list, weights_dict):\n"," weights = [weights_dict.get(curr, 0.06 / (len(currency_list) - len(weights_dict)))\n"," if curr not in weights_dict else weights_dict[curr]\n"," for curr in currency_list]\n"," return random.choices(currency_list, weights=weights, k=1)[0]\n","\n","# Sample products and currencies upfront\n","product_sample = sample_with_risk_distribution(\n"," prod_master, \"RiskRating\", \"ProductCode\", product_risk_dist, num_accounts\n",")\n","currency_sample = sample_with_risk_distribution(\n"," currency_master, \"RiskRating\", \"CurrencyCode\", currency_risk_dist, num_accounts\n",")\n","\n","for i, cust_id in enumerate(df_customers[\"CustomerID\"]):\n"," acc = {\n"," \"AccountNumber\": f\"ACC{i+1:05}\",\n"," \"CustomerID\": cust_id,\n"," \"ProductCode\": product_sample.iloc[i % len(product_sample)][\"ProductCode\"],\n"," \"CurrencyCode\": choose_currency_weighted(currency_list, explicit_weights),\n"," \"Balance\": round(np.random.uniform(1000, 1000000), 2)\n"," }\n"," accounts.append(acc)\n","\n","# Phase 2: Remaining random accounts\n","remaining_accounts = num_accounts - num_customers\n","for i in range(remaining_accounts):\n"," acc = {\n"," \"AccountNumber\": f\"ACC{num_customers + i + 1:05}\",\n"," \"CustomerID\": np.random.choice(df_customers[\"CustomerID\"]),\n"," \"ProductCode\": product_sample.iloc[(num_customers + i) % len(product_sample)][\"ProductCode\"],\n"," \"CurrencyCode\": choose_currency_weighted(currency_list, explicit_weights),\n"," \"Balance\": round(np.random.uniform(1000, 1000000), 2)\n"," }\n"," accounts.append(acc)\n","\n","df_accounts = pd.DataFrame(accounts)"],"metadata":{"id":"7kERPknzXFBk"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["#Check the structure of the accounts dataframe\n","df_accounts.info()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"TwuFlTZF6zH6","executionInfo":{"status":"ok","timestamp":1755093354156,"user_tz":-330,"elapsed":22,"user":{"displayName":"Babu Seshiah","userId":"00614315087314266727"}},"outputId":"c34f0771-7836-4637-dbf7-55a991b0f70f"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["\n","RangeIndex: 1000 entries, 0 to 999\n","Data columns (total 5 columns):\n"," # Column Non-Null Count Dtype \n","--- ------ -------------- ----- \n"," 0 AccountNumber 1000 non-null object \n"," 1 CustomerID 1000 non-null object \n"," 2 ProductCode 1000 non-null object \n"," 3 CurrencyCode 1000 non-null object \n"," 4 Balance 1000 non-null float64\n","dtypes: float64(1), object(4)\n","memory usage: 39.2+ KB\n"]}]},{"cell_type":"code","source":["#Check the sample data in the accounts dataframe\n","df_accounts.head()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"id":"sTpJM7vR61oM","executionInfo":{"status":"ok","timestamp":1755093356653,"user_tz":-330,"elapsed":73,"user":{"displayName":"Babu Seshiah","userId":"00614315087314266727"}},"outputId":"b429c805-c599-4289-d73b-799be6a07591"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" AccountNumber CustomerID ProductCode CurrencyCode Balance\n","0 ACC00001 CUST0001 BUSL USD 787832.95\n","1 ACC00002 CUST0002 BUSL USD 431871.46\n","2 ACC00003 CUST0003 BUSL USD 981516.44\n","3 ACC00004 CUST0004 BUSL USD 202038.16\n","4 ACC00005 CUST0005 BUSL USD 525102.99"],"text/html":["\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
AccountNumberCustomerIDProductCodeCurrencyCodeBalance
0ACC00001CUST0001BUSLUSD787832.95
1ACC00002CUST0002BUSLUSD431871.46
2ACC00003CUST0003BUSLUSD981516.44
3ACC00004CUST0004BUSLUSD202038.16
4ACC00005CUST0005BUSLUSD525102.99
\n","
\n","
\n","\n","
\n"," \n","\n"," \n","\n"," \n","
\n","\n","\n","
\n"," \n","\n","\n","\n"," \n","
\n","\n","
\n","
\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","variable_name":"df_accounts","summary":"{\n \"name\": \"df_accounts\",\n \"rows\": 1000,\n \"fields\": [\n {\n \"column\": \"AccountNumber\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1000,\n \"samples\": [\n \"ACC00522\",\n \"ACC00738\",\n \"ACC00741\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"CustomerID\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 400,\n \"samples\": [\n \"CUST0210\",\n \"CUST0281\",\n \"CUST0034\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ProductCode\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 14,\n \"samples\": [\n \"AUTL\",\n \"GLON\",\n \"BUSL\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"CurrencyCode\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 13,\n \"samples\": [\n \"DKK\",\n \"CNY\",\n \"USD\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Balance\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 287950.2255022987,\n \"min\": 1787.26,\n \"max\": 999449.91,\n \"num_unique_values\": 1000,\n \"samples\": [\n 602474.76,\n 883296.44,\n 279381.28\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"}},"metadata":{},"execution_count":13}]},{"cell_type":"code","source":["#Download the synthetic accounts dataframe to a csv file\n","df_accounts.to_csv(\"accounts.csv\", index=False)"],"metadata":{"id":"DFiLw9J03WXx"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["# Generate transactions data"],"metadata":{"id":"ijEICHy37mAS"}},{"cell_type":"code","source":["#Define a function to set thresolds based on channel, currency, geography and transaction type\n","def weighted_sample(df, rating_col, code_col, total=10000, weights={\"HR\":0.005, \"MR\":0.01, \"LR\":0.985}):\n"," samples = []\n"," for risk, pct in weights.items():\n"," subset = df[df[rating_col] == risk]\n"," if not subset.empty:\n"," count = int(total * pct)\n"," samples.extend(subset.sample(n=count, replace=True)[code_col].tolist())\n"," return samples\n","\n","tran_types = weighted_sample(tran_master, \"RiskRating\", \"TranTypeCode\")\n","channels = weighted_sample(chann_master, \"RiskRating\", \"ChannelCode\", total=len(tran_types))\n","currencies = weighted_sample(currency_master, \"RiskRating\", \"CurrencyCode\", total=len(tran_types))\n","geos = weighted_sample(geo_master, \"RiskRating\", \"GeoCode\", total=len(tran_types))"],"metadata":{"id":"FGNO_s5064mK"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["#Import the conditions of allowed combinations (Product-Transaction type and Product-Channel) in transactions\n","prod_tran_map = conditions_data[\"Prod-TranType Mapping\"]\n","channel_tran_map = conditions_data[\"Channe-TranType Mapping\"]"],"metadata":{"id":"9tBL4CAp8cam"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["#Check the allowed combination for Channel-TranType mapping in the dataframe\n","channel_tran_map"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":394},"id":"riK26LtIxcYy","executionInfo":{"status":"ok","timestamp":1755070682777,"user_tz":-330,"elapsed":88,"user":{"displayName":"Babu Seshiah","userId":"00614315087314266727"}},"outputId":"7d297d18-2400-4071-8abe-06ff9570d480","collapsed":true},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" ChannelCode Channel Name TranTypeCode\n","0 BRAN Branch CDEP, CWIT,FTDO,FTIN, LDIST\n","1 CATM ATM CDEP, CWIT,FEEC\n","2 CPOS POS MPAY, FEEC,LREP\n","3 IBAN Internet Banking FTDO, FTIN, CCPY,MPAY, LREP, BPAY, FEEC\n","4 MBAN Mobile Banking FTDO, FTIN, CCPY,MPAY, LREP, BPAY, FEEC\n","5 CUPI UPI CDEP, CWIT, LREP\n","6 CHEQ Cheque / DD FTDO,CCPY, LDIST,LREP,BPAY,FEEC\n","7 CARD Cards FTDO, FTIN, CCPY,MPAY, FEEC\n","8 WALL Wallets FTDO, FTIN, CCPY,MPAY,LDIST\n","9 CUST Customer Terminal FTDO, FTIN, CCPY,MPAY,LREP\n","10 SELF Self Service Kiosk FTDO, FTIN, CCPY,MPAY,FEEC"],"text/html":["\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
ChannelCodeChannel NameTranTypeCode
0BRANBranchCDEP, CWIT,FTDO,FTIN, LDIST
1CATMATMCDEP, CWIT,FEEC
2CPOSPOSMPAY, FEEC,LREP
3IBANInternet BankingFTDO, FTIN, CCPY,MPAY, LREP, BPAY, FEEC
4MBANMobile BankingFTDO, FTIN, CCPY,MPAY, LREP, BPAY, FEEC
5CUPIUPICDEP, CWIT, LREP
6CHEQCheque / DDFTDO,CCPY, LDIST,LREP,BPAY,FEEC
7CARDCardsFTDO, FTIN, CCPY,MPAY, FEEC
8WALLWalletsFTDO, FTIN, CCPY,MPAY,LDIST
9CUSTCustomer TerminalFTDO, FTIN, CCPY,MPAY,LREP
10SELFSelf Service KioskFTDO, FTIN, CCPY,MPAY,FEEC
\n","
\n","
\n","\n","
\n"," \n","\n"," \n","\n"," \n","
\n","\n","\n","
\n"," \n","\n","\n","\n"," \n","
\n","\n","
\n"," \n"," \n"," \n","
\n","\n","
\n","
\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","variable_name":"channel_tran_map","summary":"{\n \"name\": \"channel_tran_map\",\n \"rows\": 11,\n \"fields\": [\n {\n \"column\": \"ChannelCode\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 11,\n \"samples\": [\n \"CUPI\",\n \"BRAN\",\n \"CUST\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Channel Name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 11,\n \"samples\": [\n \"UPI\",\n \"Branch\",\n \"Customer Terminal\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"TranTypeCode\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"FTDO, FTIN, CCPY,MPAY,LREP\",\n \"CDEP, CWIT,FEEC\",\n \"FTDO,CCPY, LDIST,LREP,BPAY,FEEC\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"}},"metadata":{},"execution_count":18}]},{"cell_type":"code","source":["#Check the allowed combination for Product-TranType mapping in the dataframe\n","prod_tran_map"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":488},"id":"E28VdwN09etf","executionInfo":{"status":"ok","timestamp":1755070686629,"user_tz":-330,"elapsed":95,"user":{"displayName":"Babu Seshiah","userId":"00614315087314266727"}},"outputId":"c0e6cbaf-3613-4256-9cd5-2785ea34b5a4","collapsed":true},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" ProductCode Prod Name \\\n","0 FIXD Fixed Deposit \n","1 BUSL Business Loans \n","2 SAVS Savings \n","3 CHEK Checking \n","4 OVDF Overdraft \n","5 AUTL Auto Loan \n","6 HOUL Housing Loan \n","7 BILLS Bills Discounting \n","8 GLON Gold Loans \n","9 ELON Education Loans \n","10 PLON Personal Loans \n","11 SMEL SME Loans \n","12 LAPR Loan against Prop \n","13 LTLO Long Term Loans \n","\n"," TranTypeCode \n","0 CDEP, CWIT, FTDO, BPAY, FEEC \n","1 CDEP, CWIT, FTDO, FTIN,MPAY,CCPY, LDIST,LREP,B... \n","2 CDEP, CWIT, CCPAY, FTDO, FTIN \n","3 CDEP, CWIT, CCPAY, FTDO, FTIN, BPAY \n","4 CDEP, CWIT, FTDO, FTIN,MPAY,CCPY \n","5 FTDO, CDEP, FEEC \n","6 FTDO, CDEP, FEEC \n","7 CDEP, CWIT, CCPAY, FTDO, FTIN, BPAY \n","8 CDEP, CWIT, FTDO, FTIN,MPAY,CCPY, LDIST,LREP,B... \n","9 CDEP, CWIT, FTDO, FTIN,MPAY,CCPY \n","10 CDEP, CWIT, FTDO, FTIN,MPAY,CCPY, LDIST,LREP,B... \n","11 CDEP, CWIT, FTDO, FTIN,MPAY,CCPY \n","12 CDEP, CWIT, FTDO, FTIN,MPAY,CCPY, LDIST,LREP,B... \n","13 CWIT, FTDO, FTIN,MPAY "],"text/html":["\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
ProductCodeProd NameTranTypeCode
0FIXDFixed DepositCDEP, CWIT, FTDO, BPAY, FEEC
1BUSLBusiness LoansCDEP, CWIT, FTDO, FTIN,MPAY,CCPY, LDIST,LREP,B...
2SAVSSavingsCDEP, CWIT, CCPAY, FTDO, FTIN
3CHEKCheckingCDEP, CWIT, CCPAY, FTDO, FTIN, BPAY
4OVDFOverdraftCDEP, CWIT, FTDO, FTIN,MPAY,CCPY
5AUTLAuto LoanFTDO, CDEP, FEEC
6HOULHousing LoanFTDO, CDEP, FEEC
7BILLSBills DiscountingCDEP, CWIT, CCPAY, FTDO, FTIN, BPAY
8GLONGold LoansCDEP, CWIT, FTDO, FTIN,MPAY,CCPY, LDIST,LREP,B...
9ELONEducation LoansCDEP, CWIT, FTDO, FTIN,MPAY,CCPY
10PLONPersonal LoansCDEP, CWIT, FTDO, FTIN,MPAY,CCPY, LDIST,LREP,B...
11SMELSME LoansCDEP, CWIT, FTDO, FTIN,MPAY,CCPY
12LAPRLoan against PropCDEP, CWIT, FTDO, FTIN,MPAY,CCPY, LDIST,LREP,B...
13LTLOLong Term LoansCWIT, FTDO, FTIN,MPAY
\n","
\n","
\n","\n","
\n"," \n","\n"," \n","\n"," \n","
\n","\n","\n","
\n"," \n","\n","\n","\n"," \n","
\n","\n","
\n"," \n"," \n"," \n","
\n","\n","
\n","
\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","variable_name":"prod_tran_map","summary":"{\n \"name\": \"prod_tran_map\",\n \"rows\": 14,\n \"fields\": [\n {\n \"column\": \"ProductCode\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 14,\n \"samples\": [\n \"ELON\",\n \"SMEL \",\n \"FIXD\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Prod Name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 14,\n \"samples\": [\n \"Education Loans\",\n \"SME Loans \",\n \"Fixed Deposit\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"TranTypeCode\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 7,\n \"samples\": [\n \"CDEP, CWIT, FTDO, BPAY, FEEC\",\n \"CDEP, CWIT, FTDO, FTIN,MPAY,CCPY, LDIST,LREP,BPAY,FEEC\",\n \"FTDO, CDEP, FEEC\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"}},"metadata":{},"execution_count":19}]},{"cell_type":"code","source":["#Seperate the allowed transaction types (which were mapped with a ',')\n","normalized_prod_tran_map = (\n"," prod_tran_map.assign(TranTypeCode=prod_tran_map[\"TranTypeCode\"].str.split(\",\"))\n"," .explode(\"TranTypeCode\")\n",")\n","normalized_prod_tran_map[\"TranTypeCode\"] = normalized_prod_tran_map[\"TranTypeCode\"].str.strip()\n","\n","normalized_chan_tran_map = (\n"," channel_tran_map.assign(TranTypeCode=channel_tran_map[\"TranTypeCode\"].str.split(\",\"))\n"," .explode(\"TranTypeCode\")\n",")\n","normalized_chan_tran_map[\"TranTypeCode\"] = normalized_chan_tran_map[\"TranTypeCode\"].str.strip()"],"metadata":{"id":"sprJyC985x9k"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["#Define the exchange rates to be used to convert the transaction amount to 'amount in local currency(LCY)'\n","\n","exchange_rates = {\n"," \"USD\": 1.0,\n"," \"EUR\": 1.1,\n"," \"GBP\": 1.25,\n"," \"SGD\": 0.78,\n"," \"INR\": 0.012,\n"," \"VEF\": 0.08,\n"," \"CAD\": 0.73,\n"," \"JPY\": 0.0068,\n"," \"ZAR\": 0.056,\n"," \"CNY\": 0.14,\n"," \"DKK\": 0.15,\n"," \"EGP\": 0.021,\n"," \"AUD\": 0.65\n","}"],"metadata":{"id":"Eya59Z8gWFZ-"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["#Generate 10000 synthetic transactions with (a) thresolds on the currency of the transaction (b)thresolds on the amount of the transactions\n","from collections import defaultdict\n","num_transactions=10000\n","transactions = []\n","valid_combinations = [] # (TranType, Channel, Product)\n","for _, row in normalized_prod_tran_map.iterrows():\n"," tran_types = [t.strip() for t in row[\"TranTypeCode\"].split(\",\")]\n"," product = row[\"ProductCode\"]\n"," for tran in tran_types:\n"," allowed_channels = normalized_chan_tran_map[normalized_chan_tran_map[\"TranTypeCode\"] == tran][\"ChannelCode\"].tolist()\n"," for channel in allowed_channels:\n"," valid_combinations.append((tran, channel, product))\n","\n","currency_list = currency_master[\"CurrencyCode\"].tolist()\n","explicit_weights = {\n"," \"USD\": 0.7,\n"," \"EUR\": 0.1,\n"," \"GBP\": 0.05,\n"," \"CAD\": 0.03,\n"," \"INR\": 0.02,\n"," \"JPY\": 0.02,\n"," \"AUD\": 0.02\n","}\n","\n","def choose_currency_weighted(currency_list, explicit_weights, other_weight=0.06):\n"," all_weights = {}\n"," others = [c for c in currency_list if c not in explicit_weights]\n","\n"," # Distribute 'others' weight equally\n"," if others:\n"," per_other_weight = other_weight / len(others)\n"," for c in others:\n"," all_weights[c] = per_other_weight\n","\n"," # Add explicit weights\n"," all_weights.update(explicit_weights)\n","\n"," # Normalize and sample\n"," currencies = list(all_weights.keys())\n"," weights = list(all_weights.values())\n"," return random.choices(currencies, weights=weights, k=1)[0]\n","\n","amount_slabs = [\n"," (50, 1000),\n"," (1001, 5000),\n"," (5001, 20000),\n"," (20001, 100000)\n","]\n","slab_weights = [0.4, 0.3, 0.2, 0.1] # Should sum to 1.0\n","\n","def generate_transaction_amount(slabs, weights):\n"," selected_slab = random.choices(slabs, weights=weights, k=1)[0]\n"," return round(random.uniform(selected_slab[0], selected_slab[1]), 2)\n","\n","# Now generate transactions using these valid combinations\n","\n","for i in range(num_transactions):\n"," tran_type, channel, product = random.choice(valid_combinations)\n"," #amount = round(np.random.uniform(50, 100000), 2)\n"," amount = generate_transaction_amount(amount_slabs, slab_weights)\n"," #currency = random.choice(currency_master[\"CurrencyCode\"].tolist())\n"," currency = choose_currency_weighted(currency_list, explicit_weights)\n"," exchange_rate = exchange_rates.get(currency, 1.0) # default to 1.0 if not found\n"," tlcy_amount = round(amount * exchange_rate, 2)\n","\n"," start_date = pd.to_datetime(\"2025-07-20\")\n"," end_date = pd.to_datetime(\"2025-08-02\")\n"," date_range = pd.date_range(start=start_date, end=end_date).to_list()\n","\n"," # Choose initiating and counterparty customers\n"," initiating = df_customers.sample(1).iloc[0]\n"," counterparty = df_customers.sample(1).iloc[0]\n"," while counterparty[\"CustomerID\"] == initiating[\"CustomerID\"]:\n"," counterparty = df_customers.sample(1).iloc[0]\n","\n"," initiating = df_customers.sample(1).iloc[0]\n"," # get associated account\n"," initiating_accounts = df_accounts[df_accounts[\"CustomerID\"] == initiating[\"CustomerID\"]]\n"," initiating_account_id = initiating_accounts.sample(1).iloc[0][\"AccountNumber\"] if not initiating_accounts.empty else \"Unknown\"\n","\n"," # Set geography rules for FTDO/FTIN\n"," if tran_type == \"FTDO\":\n"," counter_geo = initiating[\"Geography\"]\n"," elif tran_type == \"FTIN\":\n"," counter_geo = random.choice([g for g in geo_master[\"GeoCode\"] if g != initiating[\"Geography\"]])\n"," else:\n"," counter_geo = counterparty[\"Geography\"]\n","\n"," transaction = {\n"," \"TransactionDate\": random.choice(date_range).strftime(\"%Y-%m-%d\"),\n"," \"TransactionID\": f\"T{i+1:05d}\",\n"," \"InitiatingCustomer\": initiating[\"CustomerID\"],\n"," \"AccountNumber\": initiating_account_id,\n"," \"Counterparty\": counterparty[\"CustomerID\"],\n"," \"TranType\": tran_type,\n"," \"Channel\": channel,\n"," \"Currency\": currency,\n"," \"Credit/Debit\": random.choice([\"Cr\", \"Dr\"]),\n"," \"Amount\": amount,\n"," \"Amount in LCY\":tlcy_amount,\n"," \"Product\": product,\n"," \"InitiatingGeo\": initiating[\"Geography\"],\n"," \"CounterpartyGeo\": counter_geo\n"," #\"Timestamp\": faker.date_time_between(start_date='-30d', end_date='now')\n"," }\n"," transactions.append(transaction)\n"],"metadata":{"id":"vgwYBZvh8kmS"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["#Save the transactions file in a csv format\n","transaction_df = pd.DataFrame(transactions)\n","transaction_df.to_csv(\"transactions.csv\", index=False)\n","print(\"✅ Synthetic transaction dataset created with\", len(transaction_df), \"rows.\")"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"54zByFBA4jet","executionInfo":{"status":"ok","timestamp":1755093537643,"user_tz":-330,"elapsed":127,"user":{"displayName":"Babu Seshiah","userId":"00614315087314266727"}},"outputId":"9bdbda00-dd28-42cc-c6f9-b865243f41ee"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["✅ Synthetic transaction dataset created with 10000 rows.\n"]}]},{"cell_type":"markdown","source":["# Generate watchlist data"],"metadata":{"id":"bu8AEbpQiWqh"}},{"cell_type":"code","source":["# Generate Watchlist by randomly marking 10 customers as watchlisted\n","watchlist_customers = df_customers.sample(10)\n","df_watchlist_cust = watchlist_customers[[\"Name\"]].copy()\n","df_watchlist_cust[\"WatchReason\"] = \"FATF\"\n","#df_watchlist_cust.to_csv(\"watchlist_cust.csv\", index=False)"],"metadata":{"id":"B47slCqoczms"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## 1.1 Import synthetic data"],"metadata":{"id":"vzpqnhUVf-YR"}},{"cell_type":"markdown","source":["## 1. 2 Enrich Synthetic data with other relevant columns\n","By linking transaction file with customer and account files"],"metadata":{"id":"FufF4vi0gEeg"}},{"cell_type":"code","source":["#Load the synthetic data generated (customers, accounts, transactions and watchlist)\n","accounts = df_accounts\n","customers = df_customers\n","transactions = transaction_df\n","watchlist = df_watchlist_cust"],"metadata":{"id":"sLD5jhLC17ee"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["#Add other relevant columns to the base transaction file, by merging with the relevant master data file\n","#Other relevant fields-customer related: BusinessCode, GeoCode,Customer Type (along with the Risk levels for these fields), PEP Flag and Name of both Initiating customer and Counterparty\n","#Other relevant fields-transaction related: Channel, Product, Transaction Type (along with the Risk levels for these fields)\n","transactions_merge = transactions.merge(\n"," customers.set_index(\"CustomerID\")[[\"Business\",\"CustomerType\",\"PEP_Flag\",\"Name\"]],\n"," left_on=\"InitiatingCustomer\",\n"," right_index=True,\n"," how=\"left\"\n",").rename(columns={\"Business\": \"BusinessCode_IC\",\n"," \"CustomerType\": \"CustTypeCode_IC\",\n"," \"Name\":\"ICName\",\n"," \"PEP_Flag\":\"PEP_Flag_IC\"})\n","\n","transactions_merge = transactions_merge.merge(\n"," customers.set_index(\"CustomerID\")[[\"Business\",\"CustomerType\",\"PEP_Flag\",\"Name\"]],\n"," left_on=\"Counterparty\",\n"," right_index=True,\n"," how=\"left\"\n",").rename(columns={\"Business\": \"BusinessCode_CP\",\n"," \"CustomerType\": \"CustTypeCode_CP\",\n"," \"Name\":\"CounterpartyName\",\n"," \"PEP_Flag\":\"PEP_Flag_CP\"})\n","\n","transactions_merge = transactions_merge.merge(accounts.set_index(\"AccountNumber\")[[\"ProductCode\"]],\n"," on=\"AccountNumber\",\n"," how='left')\n","\n","transactions_merge = transactions_merge.merge(\n"," business_master.set_index(\"BusinessCode\")[\"RiskRating\"],\n"," left_on=\"BusinessCode_IC\",\n"," right_index=True,\n"," how=\"left\"\n",").rename(columns={\"RiskRating\": \"IC_BusinessRisk\"})\n","\n","transactions_merge = transactions_merge.merge(\n"," business_master.set_index(\"BusinessCode\")[\"RiskRating\"],\n"," left_on=\"BusinessCode_CP\",\n"," right_index=True,\n"," how=\"left\"\n",").rename(columns={\"RiskRating\": \"CP_BusinessRisk\"})\n","\n","\n","transactions_merge = transactions_merge.merge(\n"," chann_master.set_index(\"ChannelCode\")[\"RiskRating\"],\n"," right_index=True,\n"," left_on =\"Channel\",\n"," how=\"left\"\n",").rename(columns={\"RiskRating\": \"ChannelRisk\"})\n","\n","transactions_merge = transactions_merge.merge(\n"," geo_master.set_index(\"GeoCode\")[\"RiskRating\"],\n"," right_index=True,\n"," left_on =\"InitiatingGeo\",\n"," how=\"left\"\n",").rename(columns={\"RiskRating\": \"IC_GeoRisk\"})\n","\n","transactions_merge = transactions_merge.merge(\n"," geo_master.set_index(\"GeoCode\")[\"RiskRating\"],\n"," right_index=True,\n"," left_on =\"CounterpartyGeo\",\n"," how=\"left\"\n",").rename(columns={\"RiskRating\": \"CP_GeoRisk\"})\n","\n","transactions_merge = transactions_merge.merge(\n"," ctype_master.set_index(\"CustomerTypeCode\")[\"RiskRating\"],\n"," right_index=True,\n"," left_on =\"CustTypeCode_IC\",\n"," how=\"left\"\n",").rename(columns={\"RiskRating\": \"IC_CustTypeRisk\"})\n","\n","transactions_merge = transactions_merge.merge(\n"," ctype_master.set_index(\"CustomerTypeCode\")[\"RiskRating\"],\n"," right_index=True,\n"," left_on =\"CustTypeCode_CP\",\n"," how=\"left\"\n",").rename(columns={\"RiskRating\": \"CP_CustTypeRisk\"})\n","\n","\n","transactions_merge = transactions_merge.merge(\n"," prod_master.set_index(\"ProductCode\")[\"RiskRating\"],\n"," right_index=True,\n"," left_on =\"Product\",\n"," how=\"left\"\n",").rename(columns={\"RiskRating\": \"ProductRisk\"})\n","\n","transactions_merge = transactions_merge.merge(\n"," tran_master.set_index(\"TranTypeCode\")[\"RiskRating\"],\n"," right_index=True,\n"," left_on =\"TranType\",\n"," how=\"left\"\n",").rename(columns={\"RiskRating\": \"TranTypeRisk\"})\n","\n","transactions_merge = transactions_merge.merge(\n"," currency_master.set_index(\"CurrencyCode\")[[\"RiskRating\"]],\n"," left_on=\"Currency\",\n"," right_index=True,\n"," how=\"left\"\n",").rename(columns={\"RiskRating\": \"CurrencyRisk\"})\n","\n","watchlist_names = set(watchlist[\"Name\"])\n","\n","# Add WL_IC flag\n","transactions_merge[\"WL_IC\"] = transactions_merge[\"ICName\"].apply(\n"," lambda x: \"Y\" if x in watchlist_names else \"N\"\n",")\n","\n","# Add WL_CP flag\n","transactions_merge[\"WL_CP\"] = transactions_merge[\"CounterpartyName\"].apply(\n"," lambda x: \"Y\" if x in watchlist_names else \"N\"\n",")"],"metadata":{"id":"ND0a0WCx8bZB"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["#Check the transaction fields after merging the other relevant fields\n","transactions_merge.info()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"dc9QydrlU8Hg","executionInfo":{"status":"ok","timestamp":1755093552449,"user_tz":-330,"elapsed":21,"user":{"displayName":"Babu Seshiah","userId":"00614315087314266727"}},"outputId":"9789dd53-7151-4664-8f10-e6af2de7aa78"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["\n","RangeIndex: 10000 entries, 0 to 9999\n","Data columns (total 35 columns):\n"," # Column Non-Null Count Dtype \n","--- ------ -------------- ----- \n"," 0 TransactionDate 10000 non-null object \n"," 1 TransactionID 10000 non-null object \n"," 2 InitiatingCustomer 10000 non-null object \n"," 3 AccountNumber 10000 non-null object \n"," 4 Counterparty 10000 non-null object \n"," 5 TranType 10000 non-null object \n"," 6 Channel 10000 non-null object \n"," 7 Currency 10000 non-null object \n"," 8 Credit/Debit 10000 non-null object \n"," 9 Amount 10000 non-null float64\n"," 10 Amount in LCY 10000 non-null float64\n"," 11 Product 10000 non-null object \n"," 12 InitiatingGeo 10000 non-null object \n"," 13 CounterpartyGeo 10000 non-null object \n"," 14 BusinessCode_IC 10000 non-null int64 \n"," 15 CustTypeCode_IC 10000 non-null object \n"," 16 PEP_Flag_IC 10000 non-null object \n"," 17 ICName 10000 non-null object \n"," 18 BusinessCode_CP 10000 non-null int64 \n"," 19 CustTypeCode_CP 10000 non-null object \n"," 20 PEP_Flag_CP 10000 non-null object \n"," 21 CounterpartyName 10000 non-null object \n"," 22 ProductCode 10000 non-null object \n"," 23 IC_BusinessRisk 10000 non-null object \n"," 24 CP_BusinessRisk 10000 non-null object \n"," 25 ChannelRisk 10000 non-null object \n"," 26 IC_GeoRisk 10000 non-null object \n"," 27 CP_GeoRisk 10000 non-null object \n"," 28 IC_CustTypeRisk 10000 non-null object \n"," 29 CP_CustTypeRisk 10000 non-null object \n"," 30 ProductRisk 10000 non-null object \n"," 31 TranTypeRisk 10000 non-null object \n"," 32 CurrencyRisk 10000 non-null object \n"," 33 WL_IC 10000 non-null object \n"," 34 WL_CP 10000 non-null object \n","dtypes: float64(2), int64(2), object(31)\n","memory usage: 2.7+ MB\n"]}]},{"cell_type":"markdown","source":["## 1.3 Assign appropriate Risk levels for the relevant columns in the transaction file"],"metadata":{"id":"RIraS80TgUtK"}},{"cell_type":"code","source":["#Replace the Risk levels with numerics (LR-LowRisk, MR-MediumRisk, HR-HighRisk with 1,5 and 10 respectively)\n","risk_map = {\"LR\": 1, \"MR\": 5, \"HR\": 10}\n","risk_columns = [col for col in transactions_merge.columns if col.endswith(\"Risk\")]\n","transactions_merge[risk_columns] = transactions_merge[risk_columns].replace(risk_map)"],"metadata":{"id":"IIrmKnSMC96R","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1755093557717,"user_tz":-330,"elapsed":74,"user":{"displayName":"Babu Seshiah","userId":"00614315087314266727"}},"outputId":"ddf22020-73e9-480e-d88d-3cf14d74d8de"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stderr","text":["/tmp/ipython-input-3666971339.py:4: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n"," transactions_merge[risk_columns] = transactions_merge[risk_columns].replace(risk_map)\n"]}]},{"cell_type":"code","source":["transactions_features_hr = transactions_merge.iloc[:,[0,1,2,10,16,20,23,24,25,26,27,28,29,30,31,32,33,34]]"],"metadata":{"id":"7huvfvHR3fep"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["#transactions_features_hr.drop(columns=[\"predlabel\"],inplace=True)\n","transactions_features_hr.info()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"tGo0hhyJ4EDh","executionInfo":{"status":"ok","timestamp":1755093560678,"user_tz":-330,"elapsed":19,"user":{"displayName":"Babu Seshiah","userId":"00614315087314266727"}},"outputId":"635fb300-2e3f-4bb2-f5b2-cddd701c85ff"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["\n","RangeIndex: 10000 entries, 0 to 9999\n","Data columns (total 18 columns):\n"," # Column Non-Null Count Dtype \n","--- ------ -------------- ----- \n"," 0 TransactionDate 10000 non-null object \n"," 1 TransactionID 10000 non-null object \n"," 2 InitiatingCustomer 10000 non-null object \n"," 3 Amount in LCY 10000 non-null float64\n"," 4 PEP_Flag_IC 10000 non-null object \n"," 5 PEP_Flag_CP 10000 non-null object \n"," 6 IC_BusinessRisk 10000 non-null int64 \n"," 7 CP_BusinessRisk 10000 non-null int64 \n"," 8 ChannelRisk 10000 non-null int64 \n"," 9 IC_GeoRisk 10000 non-null int64 \n"," 10 CP_GeoRisk 10000 non-null int64 \n"," 11 IC_CustTypeRisk 10000 non-null int64 \n"," 12 CP_CustTypeRisk 10000 non-null int64 \n"," 13 ProductRisk 10000 non-null int64 \n"," 14 TranTypeRisk 10000 non-null int64 \n"," 15 CurrencyRisk 10000 non-null int64 \n"," 16 WL_IC 10000 non-null object \n"," 17 WL_CP 10000 non-null object \n","dtypes: float64(1), int64(10), object(7)\n","memory usage: 1.4+ MB\n"]}]},{"cell_type":"markdown","source":["# 2. Apply Rule based logic and generate alerts (A-P1, A-P2) and identify transactions related to alerts\n","\n","A-P1 - Priority 1 alert\n","A-P2 - Priority 2 alert"],"metadata":{"id":"RvxGHyn2gtoq"}},{"cell_type":"code","source":["#Calculate the Overall Transaction Risk and update the same in merged transaction file in a new field'Overall_Tranx_Risk'\n","conditions_tx = [\n"," # High Risk\n"," (transactions_merge[\"PEP_Flag_CP\"] == \"Y\") |\n"," (transactions_merge[\"CP_BusinessRisk\"] == 10) |\n"," (transactions_merge[\"CP_GeoRisk\"] == 10) |\n"," (transactions_merge[\"WL_CP\"] == \"Y\") |\n"," (transactions_merge[\"ProductRisk\"] == 10) |\n"," (transactions_merge[\"CurrencyRisk\"] == 10) |\n"," (transactions_merge[\"ChannelRisk\"] == 10),\n","\n"," # Medium Risk\n"," (transactions_merge[\"PEP_Flag_CP\"] == \"N\") &\n"," (transactions_merge[\"WL_CP\"] == \"N\") &\n"," (\n"," (transactions_merge[\"CP_BusinessRisk\"] == 5) |\n"," (transactions_merge[\"CP_GeoRisk\"] == 5) |\n"," (transactions_merge[\"ProductRisk\"] == 5) |\n"," (transactions_merge[\"CurrencyRisk\"] == 5) |\n"," (transactions_merge[\"ChannelRisk\"] == 5)\n"," ),\n","\n"," # Low Risk\n"," (transactions_merge[\"PEP_Flag_CP\"] == \"N\") &\n"," (transactions_merge[\"WL_CP\"] == \"N\") &\n"," (\n"," (transactions_merge[\"CP_BusinessRisk\"] == 1) |\n"," (transactions_merge[\"CP_GeoRisk\"] == 1) |\n"," (transactions_merge[\"ProductRisk\"] == 1) |\n"," (transactions_merge[\"CurrencyRisk\"] == 1) |\n"," (transactions_merge[\"ChannelRisk\"] == 1)\n"," )\n","]\n","\n","choices_tx = [\"HighRisk\", \"MedRisk\", \"LowRisk\"]\n","\n","transactions_merge[\"Overall_Tranx_Risk\"] = np.select(conditions_tx, choices_tx, default=\"Unknown\")"],"metadata":{"id":"vXH_y3Gjbr46"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["#Calculate the Initiating Customer overall Risk and update the same in merged transaction file in a new field'Overall_IC_Risk'\n","conditions_ic = [\n"," # High Risk\n"," (transactions_merge[\"PEP_Flag_IC\"] == \"Y\") |\n"," (transactions_merge[\"IC_BusinessRisk\"] == 10) |\n"," (transactions_merge[\"IC_GeoRisk\"] == 10) |\n"," (transactions_merge[\"WL_IC\"] == \"Y\"),\n","\n"," # Medium Risk\n"," (transactions_merge[\"PEP_Flag_IC\"] == \"N\") &\n"," (transactions_merge[\"WL_IC\"] == \"N\") &\n"," ((transactions_merge[\"IC_BusinessRisk\"] == 5) | (transactions_merge[\"IC_GeoRisk\"] == 5)),\n","\n"," # Low Risk\n"," (transactions_merge[\"PEP_Flag_IC\"] == \"N\") &\n"," (transactions_merge[\"WL_IC\"] == \"N\") &\n"," ((transactions_merge[\"IC_BusinessRisk\"] == 1) | (transactions_merge[\"IC_GeoRisk\"] == 1))\n","]\n","\n","choices_ic = [\"HighRisk\", \"MedRisk\", \"LowRisk\"]\n","\n","transactions_merge[\"Overall_IC_Risk\"] = np.select(conditions_ic, choices_ic, default=\"Unknown\")"],"metadata":{"id":"xUIgYpjHbWjt"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["#Check the merged transaction dataframe for the new fields\n","transactions_merge.info()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"5dE-Osc9cYci","executionInfo":{"status":"ok","timestamp":1755093568613,"user_tz":-330,"elapsed":21,"user":{"displayName":"Babu Seshiah","userId":"00614315087314266727"}},"outputId":"59e3db6f-173f-4ce6-ff99-480c3d79ae42"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["\n","RangeIndex: 10000 entries, 0 to 9999\n","Data columns (total 37 columns):\n"," # Column Non-Null Count Dtype \n","--- ------ -------------- ----- \n"," 0 TransactionDate 10000 non-null object \n"," 1 TransactionID 10000 non-null object \n"," 2 InitiatingCustomer 10000 non-null object \n"," 3 AccountNumber 10000 non-null object \n"," 4 Counterparty 10000 non-null object \n"," 5 TranType 10000 non-null object \n"," 6 Channel 10000 non-null object \n"," 7 Currency 10000 non-null object \n"," 8 Credit/Debit 10000 non-null object \n"," 9 Amount 10000 non-null float64\n"," 10 Amount in LCY 10000 non-null float64\n"," 11 Product 10000 non-null object \n"," 12 InitiatingGeo 10000 non-null object \n"," 13 CounterpartyGeo 10000 non-null object \n"," 14 BusinessCode_IC 10000 non-null int64 \n"," 15 CustTypeCode_IC 10000 non-null object \n"," 16 PEP_Flag_IC 10000 non-null object \n"," 17 ICName 10000 non-null object \n"," 18 BusinessCode_CP 10000 non-null int64 \n"," 19 CustTypeCode_CP 10000 non-null object \n"," 20 PEP_Flag_CP 10000 non-null object \n"," 21 CounterpartyName 10000 non-null object \n"," 22 ProductCode 10000 non-null object \n"," 23 IC_BusinessRisk 10000 non-null int64 \n"," 24 CP_BusinessRisk 10000 non-null int64 \n"," 25 ChannelRisk 10000 non-null int64 \n"," 26 IC_GeoRisk 10000 non-null int64 \n"," 27 CP_GeoRisk 10000 non-null int64 \n"," 28 IC_CustTypeRisk 10000 non-null int64 \n"," 29 CP_CustTypeRisk 10000 non-null int64 \n"," 30 ProductRisk 10000 non-null int64 \n"," 31 TranTypeRisk 10000 non-null int64 \n"," 32 CurrencyRisk 10000 non-null int64 \n"," 33 WL_IC 10000 non-null object \n"," 34 WL_CP 10000 non-null object \n"," 35 Overall_Tranx_Risk 10000 non-null object \n"," 36 Overall_IC_Risk 10000 non-null object \n","dtypes: float64(2), int64(12), object(23)\n","memory usage: 2.8+ MB\n"]}]},{"cell_type":"code","source":["#Check the spread of initiating customers Risk profile\n","transactions_merge[\"Overall_IC_Risk\"].value_counts()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":210},"id":"9yn5HkaIdlPj","executionInfo":{"status":"ok","timestamp":1755093570929,"user_tz":-330,"elapsed":7,"user":{"displayName":"Babu Seshiah","userId":"00614315087314266727"}},"outputId":"cd66d675-5f14-49f8-f39a-67d7b13d929f"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["Overall_IC_Risk\n","LowRisk 9457\n","HighRisk 444\n","MedRisk 99\n","Name: count, dtype: int64"],"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
count
Overall_IC_Risk
LowRisk9457
HighRisk444
MedRisk99
\n","

"]},"metadata":{},"execution_count":32}]},{"cell_type":"code","source":["#Check the spread of transactions Risk profile\n","transactions_merge[\"Overall_Tranx_Risk\"].value_counts()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":210},"id":"aq0flHN7dsGq","executionInfo":{"status":"ok","timestamp":1755093576004,"user_tz":-330,"elapsed":22,"user":{"displayName":"Babu Seshiah","userId":"00614315087314266727"}},"outputId":"b9cd2704-e24b-47ea-d1a5-a35a217bcd96"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["Overall_Tranx_Risk\n","LowRisk 5730\n","HighRisk 2754\n","MedRisk 1516\n","Name: count, dtype: int64"],"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
count
Overall_Tranx_Risk
LowRisk5730
HighRisk2754
MedRisk1516
\n","

"]},"metadata":{},"execution_count":33}]},{"cell_type":"code","source":["#Apply the rule based logic on the merged transaction file (Rule based logic is explained in the document)\n","#Rule based logic - Step 1 - aggregate the transaction amount by Overall_IC_Risk and Overall_Tranx_Risk\n","transactions_merge[\"TransactionID\"] = transactions_merge[\"TransactionID\"].astype(str)\n","agg_df = transactions_merge.groupby(\n"," [\"InitiatingCustomer\", \"Overall_IC_Risk\", \"Overall_Tranx_Risk\"]\n",").agg({\n"," \"TransactionID\": lambda x: \",\".join(x), # Join IDs with commas\n"," \"Amount in LCY\": \"sum\" # Total transaction amount\n","}).reset_index()\n","\n","agg_df.rename(columns={\n"," \"TransactionID\": \"Transaction IDs\",\n"," \"Amount in LCY\": \"Total Amount\"\n","}, inplace=True)\n","\n","# Add 'Total Tranx' column by counting commas + 1\n","agg_df[\"Total Tranx\"] = agg_df[\"Transaction IDs\"].apply(lambda x: len(x.split(\",\")))\n","\n","print(agg_df.head())"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"TqESGm_yfMuX","executionInfo":{"status":"ok","timestamp":1755093580303,"user_tz":-330,"elapsed":83,"user":{"displayName":"Babu Seshiah","userId":"00614315087314266727"}},"outputId":"41a64f92-bda1-4f9c-b787-09bb49d4cee1","collapsed":true},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":[" InitiatingCustomer Overall_IC_Risk Overall_Tranx_Risk \\\n","0 CUST0001 HighRisk HighRisk \n","1 CUST0001 HighRisk LowRisk \n","2 CUST0001 HighRisk MedRisk \n","3 CUST0002 HighRisk HighRisk \n","4 CUST0002 HighRisk LowRisk \n","\n"," Transaction IDs Total Amount \\\n","0 T01617,T01780,T01894,T01936,T02732,T03453,T044... 39255.59 \n","1 T01218,T01458,T01555,T04695,T04786,T05237,T05369 137990.55 \n","2 T02355,T09122 3877.35 \n","3 T00227,T00505,T01384,T03237,T05523,T06055,T066... 16041.11 \n","4 T01669,T02364,T03421,T03940,T04612,T05747,T070... 152152.73 \n","\n"," Total Tranx \n","0 11 \n","1 7 \n","2 2 \n","3 9 \n","4 13 \n"]}]},{"cell_type":"code","source":["#Rule based logic - Step 2 - Flag the alerts based on thresholds\n","risk_summary = transactions_merge.groupby(\n"," ['InitiatingCustomer', 'Overall_Tranx_Risk']\n",")['Amount in LCY'].sum().unstack(fill_value=0).reset_index()\n","\n","# Ensure all risk levels are represented (fill if missing)\n","for level in ['HighRisk', 'MedRisk', 'LowRisk']:\n"," if level not in risk_summary.columns:\n"," risk_summary[level] = 0\n","\n","# Step 2: Add total transaction amount per customer (T)\n","risk_summary['TotalAmount'] = (\n"," risk_summary['HighRisk'] + risk_summary['MedRisk'] + risk_summary['LowRisk']\n",")\n","\n","# Step 3: Compute H/T and M/T ratios\n","risk_summary['H_ratio'] = risk_summary['HighRisk'] / risk_summary['TotalAmount']\n","risk_summary['M_ratio'] = risk_summary['MedRisk'] / risk_summary['TotalAmount']\n","\n","# Step 4: Prepare a mapping from InitiatingCustomer to Alert value\n","def determine_alert(row):\n"," if row['H_ratio'] >= 0.80 and row['HighRisk'] >= 1500:\n"," return 'A-P1'\n"," elif row['M_ratio'] >= 0.60 and row['MedRisk'] >= 1000:\n"," return 'A-P2'\n"," else:\n"," return 'N'\n","\n","risk_summary['Alert'] = risk_summary.apply(determine_alert, axis=1)\n","\n","# Step 5: Merge the Alert info back into agg_df\n","agg_df = agg_df.merge(\n"," risk_summary[['InitiatingCustomer', 'Alert']],\n"," on='InitiatingCustomer',\n"," how='left'\n",")"],"metadata":{"id":"-9xHi8n1ESlR"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["agg_df.info()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"OPRBiM1HJhQ1","executionInfo":{"status":"ok","timestamp":1755093586494,"user_tz":-330,"elapsed":14,"user":{"displayName":"Babu Seshiah","userId":"00614315087314266727"}},"outputId":"ed65d758-7396-4490-f57f-311c78acde4c"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["\n","RangeIndex: 1190 entries, 0 to 1189\n","Data columns (total 7 columns):\n"," # Column Non-Null Count Dtype \n","--- ------ -------------- ----- \n"," 0 InitiatingCustomer 1190 non-null object \n"," 1 Overall_IC_Risk 1190 non-null object \n"," 2 Overall_Tranx_Risk 1190 non-null object \n"," 3 Transaction IDs 1190 non-null object \n"," 4 Total Amount 1190 non-null float64\n"," 5 Total Tranx 1190 non-null int64 \n"," 6 Alert 1190 non-null object \n","dtypes: float64(1), int64(1), object(5)\n","memory usage: 65.2+ KB\n"]}]},{"cell_type":"code","source":["risk_summary[risk_summary['Alert']=='A-P2']"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":269},"id":"ZTdPK8okJ1RO","executionInfo":{"status":"ok","timestamp":1755093589892,"user_tz":-330,"elapsed":76,"user":{"displayName":"Babu Seshiah","userId":"00614315087314266727"}},"outputId":"85846e0f-5675-4a01-af07-4cc63b528984"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["Overall_Tranx_Risk InitiatingCustomer HighRisk LowRisk MedRisk \\\n","10 CUST0011 4835.82 86999.37 157961.07 \n","147 CUST0148 32848.61 63993.54 206865.94 \n","162 CUST0163 26303.46 16461.66 103396.17 \n","163 CUST0164 13747.41 19260.94 130419.90 \n","181 CUST0182 336.15 63525.89 108810.41 \n","286 CUST0287 22143.91 43007.52 100272.09 \n","387 CUST0388 29959.36 27145.89 203083.18 \n","\n","Overall_Tranx_Risk TotalAmount H_ratio M_ratio Alert \n","10 249796.26 0.019359 0.632360 A-P2 \n","147 303708.09 0.108158 0.681134 A-P2 \n","162 146161.29 0.179962 0.707411 A-P2 \n","163 163428.25 0.084119 0.798025 A-P2 \n","181 172672.45 0.001947 0.630155 A-P2 \n","286 165423.52 0.133862 0.606154 A-P2 \n","387 260188.43 0.115145 0.780523 A-P2 "],"text/html":["\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
Overall_Tranx_RiskInitiatingCustomerHighRiskLowRiskMedRiskTotalAmountH_ratioM_ratioAlert
10CUST00114835.8286999.37157961.07249796.260.0193590.632360A-P2
147CUST014832848.6163993.54206865.94303708.090.1081580.681134A-P2
162CUST016326303.4616461.66103396.17146161.290.1799620.707411A-P2
163CUST016413747.4119260.94130419.90163428.250.0841190.798025A-P2
181CUST0182336.1563525.89108810.41172672.450.0019470.630155A-P2
286CUST028722143.9143007.52100272.09165423.520.1338620.606154A-P2
387CUST038829959.3627145.89203083.18260188.430.1151450.780523A-P2
\n","
\n","
\n","\n","
\n"," \n","\n"," \n","\n"," \n","
\n","\n","\n","
\n"," \n","\n","\n","\n"," \n","
\n","\n","
\n","
\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","summary":"{\n \"name\": \"risk_summary[risk_summary['Alert']=='A-P2']\",\n \"rows\": 7,\n \"fields\": [\n {\n \"column\": \"InitiatingCustomer\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 7,\n \"samples\": [\n \"CUST0011\",\n \"CUST0148\",\n \"CUST0287\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"HighRisk\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 12586.559941113679,\n \"min\": 336.15,\n \"max\": 32848.61,\n \"num_unique_values\": 7,\n \"samples\": [\n 4835.82,\n 32848.61,\n 22143.91\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"LowRisk\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 26656.54332366052,\n \"min\": 16461.66,\n \"max\": 86999.37,\n \"num_unique_values\": 7,\n \"samples\": [\n 86999.37,\n 63993.54,\n 43007.520000000004\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"MedRisk\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 45862.585534749865,\n \"min\": 100272.09000000001,\n \"max\": 206865.94,\n \"num_unique_values\": 7,\n \"samples\": [\n 157961.07,\n 206865.94,\n 100272.09000000001\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"TotalAmount\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 61234.42002097477,\n \"min\": 146161.28999999998,\n \"max\": 303708.08999999997,\n \"num_unique_values\": 7,\n \"samples\": [\n 249796.26,\n 303708.08999999997,\n 165423.52000000002\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"H_ratio\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.06289488543776585,\n \"min\": 0.0019467494669821385,\n \"max\": 0.17996187636274968,\n \"num_unique_values\": 7,\n \"samples\": [\n 0.01935905685697616,\n 0.10815849521822091,\n 0.13386191999783342\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"M_ratio\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.07547078809553161,\n \"min\": 0.606153768218691,\n \"max\": 0.7980254331793921,\n \"num_unique_values\": 7,\n \"samples\": [\n 0.6323596278022737,\n 0.6811341113764866,\n 0.606153768218691\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Alert\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"A-P2\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"}},"metadata":{},"execution_count":37}]},{"cell_type":"code","source":["risk_summary[risk_summary['Alert']=='A-P1']"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"id":"BGy_SK0z3_9x","executionInfo":{"status":"ok","timestamp":1755093591860,"user_tz":-330,"elapsed":81,"user":{"displayName":"Babu Seshiah","userId":"00614315087314266727"}},"outputId":"a1ba7523-4065-475a-8c08-1ff0756ccd50"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["Overall_Tranx_Risk InitiatingCustomer HighRisk LowRisk MedRisk \\\n","44 CUST0045 161501.72 32855.53 4322.41 \n","79 CUST0080 128858.02 9789.65 13615.97 \n","90 CUST0091 98082.21 4111.65 7909.00 \n","337 CUST0338 179639.58 33447.37 7904.22 \n","346 CUST0347 236090.97 19138.33 1364.00 \n","\n","Overall_Tranx_Risk TotalAmount H_ratio M_ratio Alert \n","44 198679.66 0.812875 0.021756 A-P1 \n","79 152263.64 0.846282 0.089424 A-P1 \n","90 110102.86 0.890823 0.071833 A-P1 \n","337 220991.17 0.812881 0.035767 A-P1 \n","346 256593.30 0.920098 0.005316 A-P1 "],"text/html":["\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
Overall_Tranx_RiskInitiatingCustomerHighRiskLowRiskMedRiskTotalAmountH_ratioM_ratioAlert
44CUST0045161501.7232855.534322.41198679.660.8128750.021756A-P1
79CUST0080128858.029789.6513615.97152263.640.8462820.089424A-P1
90CUST009198082.214111.657909.00110102.860.8908230.071833A-P1
337CUST0338179639.5833447.377904.22220991.170.8128810.035767A-P1
346CUST0347236090.9719138.331364.00256593.300.9200980.005316A-P1
\n","
\n","
\n","\n","
\n"," \n","\n"," \n","\n"," \n","
\n","\n","\n","
\n"," \n","\n","\n","\n"," \n","
\n","\n","
\n","
\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","summary":"{\n \"name\": \"risk_summary[risk_summary['Alert']=='A-P1']\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"InitiatingCustomer\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"CUST0080\",\n \"CUST0347\",\n \"CUST0091\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"HighRisk\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 52387.87471025667,\n \"min\": 98082.20999999999,\n \"max\": 236090.97,\n \"num_unique_values\": 5,\n \"samples\": [\n 128858.01999999999,\n 236090.97,\n 98082.20999999999\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"LowRisk\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 13261.253204085955,\n \"min\": 4111.65,\n \"max\": 33447.37,\n \"num_unique_values\": 5,\n \"samples\": [\n 9789.650000000001,\n 19138.33,\n 4111.65\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"MedRisk\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 4592.011143970581,\n \"min\": 1364.0,\n \"max\": 13615.97,\n \"num_unique_values\": 5,\n \"samples\": [\n 13615.97,\n 1364.0,\n 7909.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"TotalAmount\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 57559.07845849411,\n \"min\": 110102.85999999999,\n \"max\": 256593.3,\n \"num_unique_values\": 5,\n \"samples\": [\n 152263.63999999998,\n 256593.3,\n 110102.85999999999\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"H_ratio\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.04778309791493816,\n \"min\": 0.8128749566009927,\n \"max\": 0.9200979526745243,\n \"num_unique_values\": 5,\n \"samples\": [\n 0.8462822772396614,\n 0.9200979526745243,\n 0.890823453632358\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"M_ratio\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.03497741151157996,\n \"min\": 0.005315805206137495,\n \"max\": 0.08942364703746739,\n \"num_unique_values\": 5,\n \"samples\": [\n 0.08942364703746739,\n 0.005315805206137495,\n 0.07183282977390415\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Alert\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"A-P1\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"}},"metadata":{},"execution_count":38}]},{"cell_type":"code","source":["agg_df['Alert'].value_counts()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":210},"id":"_-HItInvJj3n","executionInfo":{"status":"ok","timestamp":1755093594496,"user_tz":-330,"elapsed":8,"user":{"displayName":"Babu Seshiah","userId":"00614315087314266727"}},"outputId":"52be776f-a7a7-42d0-b411-15c6203190ee"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["Alert\n","N 1154\n","A-P2 21\n","A-P1 15\n","Name: count, dtype: int64"],"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
count
Alert
N1154
A-P221
A-P115
\n","

"]},"metadata":{},"execution_count":39}]},{"cell_type":"code","source":["#Rule based logic - Step 3 - Identify the transactions related to the alerts\n","# Step 1: Filter agg_df for alerts A-P1 and A-P2\n","filtered_agg_df = agg_df[agg_df['Alert'].isin(['A-P1', 'A-P2'])].copy()\n","\n","# Step 2: Split 'Transaction IDs' string into a list\n","filtered_agg_df['Transaction IDs'] = filtered_agg_df['Transaction IDs'].str.split(',')\n","\n","# Step 3: Explode into individual rows\n","flat_trx_df = filtered_agg_df.explode('Transaction IDs').reset_index(drop=True)\n","\n","# Step 4: Trim whitespaces if any\n","flat_trx_df['Transaction IDs'] = flat_trx_df['Transaction IDs'].str.strip()\n","\n","# Step 5: Keep only needed columns\n","flat_trx_df = flat_trx_df[['InitiatingCustomer', 'Transaction IDs', 'Alert']]"],"metadata":{"id":"GslPm1IIcpWQ"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["flat_trx_df.info()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"dn1Q-mzHfHue","executionInfo":{"status":"ok","timestamp":1755093599181,"user_tz":-330,"elapsed":13,"user":{"displayName":"Babu Seshiah","userId":"00614315087314266727"}},"outputId":"ab9e4fc3-9763-4209-99c8-52c035f84f6a"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["\n","RangeIndex: 246 entries, 0 to 245\n","Data columns (total 3 columns):\n"," # Column Non-Null Count Dtype \n","--- ------ -------------- ----- \n"," 0 InitiatingCustomer 246 non-null object\n"," 1 Transaction IDs 246 non-null object\n"," 2 Alert 246 non-null object\n","dtypes: object(3)\n","memory usage: 5.9+ KB\n"]}]},{"cell_type":"code","source":["flat_trx_df[\"Alert\"].value_counts()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":178},"id":"uOORur6ILFhq","executionInfo":{"status":"ok","timestamp":1755093601333,"user_tz":-330,"elapsed":26,"user":{"displayName":"Babu Seshiah","userId":"00614315087314266727"}},"outputId":"2f2f2da2-7818-4c01-e607-f1a715ac579b"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["Alert\n","A-P2 156\n","A-P1 90\n","Name: count, dtype: int64"],"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
count
Alert
A-P2156
A-P190
\n","

"]},"metadata":{},"execution_count":42}]},{"cell_type":"code","source":["flat_trx_df.to_csv(\"transaction_alerts.csv\")"],"metadata":{"id":"hOoq7qPF-dOa"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["#3. Build & train Variational Auto-Encoder (VAE) model"],"metadata":{"id":"NoPz1HGTXyIn"}},{"cell_type":"code","source":["#Import necessary libraries\n","from sklearn.model_selection import train_test_split\n","from sklearn.preprocessing import MinMaxScaler, OneHotEncoder\n","from sklearn.compose import ColumnTransformer\n","from sklearn.pipeline import Pipeline\n","import matplotlib.pyplot as plt\n","import seaborn as sns\n","import tensorflow as tf\n","from tensorflow.keras import layers, Model"],"metadata":{"id":"gWv3BHtss8m8"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["transactions_features = transactions_merge.iloc[:,[10,20,24,25,27,30,32,34]]"],"metadata":{"id":"DBNYItACHMHP"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["transactions_features.info()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"p5zm9rPeP5EJ","executionInfo":{"status":"ok","timestamp":1755093619626,"user_tz":-330,"elapsed":4,"user":{"displayName":"Babu Seshiah","userId":"00614315087314266727"}},"outputId":"d7b0729b-1544-42f4-976d-65d22f9cd144"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["\n","RangeIndex: 10000 entries, 0 to 9999\n","Data columns (total 8 columns):\n"," # Column Non-Null Count Dtype \n","--- ------ -------------- ----- \n"," 0 Amount in LCY 10000 non-null float64\n"," 1 PEP_Flag_CP 10000 non-null object \n"," 2 CP_BusinessRisk 10000 non-null int64 \n"," 3 ChannelRisk 10000 non-null int64 \n"," 4 CP_GeoRisk 10000 non-null int64 \n"," 5 ProductRisk 10000 non-null int64 \n"," 6 CurrencyRisk 10000 non-null int64 \n"," 7 WL_CP 10000 non-null object \n","dtypes: float64(1), int64(5), object(2)\n","memory usage: 625.1+ KB\n"]}]},{"cell_type":"code","source":["all_indices = np.arange(len(transactions_features))\n","\n","X_train_raw, X_test_raw, idx_train, idx_test = train_test_split(\n"," transactions_features, all_indices, test_size=0.2, random_state=55\n",")"],"metadata":{"id":"2axuq7iMT4em"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["from sklearn.preprocessing import MinMaxScaler, OneHotEncoder\n","from sklearn.compose import ColumnTransformer\n","from sklearn.pipeline import Pipeline\n","import matplotlib.pyplot as plt\n","import seaborn as sns\n","import keras\n","\n","# Step 1: Define feature groups\n","numeric_features = [\"Amount in LCY\", \"CP_BusinessRisk\",\"ChannelRisk\", \"CP_GeoRisk\", \"ProductRisk\",\"CurrencyRisk\"]\n","\n","binary_features = [\"PEP_Flag_CP\", \"WL_CP\"]\n","\n","# Step 2: Define transformers\n","preprocessor = ColumnTransformer(\n"," transformers=[\n"," (\"num\", MinMaxScaler(), numeric_features),\n"," (\"bin\", OneHotEncoder(drop='if_binary', dtype=int), binary_features)\n"," ]\n",")\n","\n","# Step 3: Set up pipeline\n","pipeline = Pipeline(steps=[(\"preprocessor\", preprocessor)])\n","#X_preprocessed = pipeline.fit_transform(transactions_features_hr_vae)\n","pipeline.fit(X_train_raw)\n","X_train = pipeline.transform(X_train_raw)\n","X_test = pipeline.transform(X_test_raw)"],"metadata":{"id":"EcI3bmK6-6af"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["@keras.saving.register_keras_serializable()\n","class VAE(Model):\n"," def __init__(self, input_dim, latent_dim, **kwargs):\n"," super().__init__(**kwargs)\n"," self.input_dim = input_dim\n"," self.latent_dim = latent_dim\n","\n"," self.encoder = tf.keras.Sequential([\n"," layers.Input(shape=(input_dim,)),\n"," layers.Dense(512, activation='relu'),\n"," layers.Dense(256, activation='relu'),\n"," layers.Dense(128, activation='relu'),\n"," layers.Dense(64, activation='relu'),\n"," layers.Dense(32, activation='relu'),\n"," layers.Dense(16, activation='relu'),\n"," layers.Dense(8, activation='relu'),\n"," layers.Dense(latent_dim * 2), # z_mean and z_log_var\n"," ])\n","\n"," self.decoder = tf.keras.Sequential([\n"," layers.Input(shape=(latent_dim,)),\n"," layers.Dense(8, activation='relu'),\n"," layers.Dense(16, activation='relu'),\n"," layers.Dense(32, activation='relu'),\n"," layers.Dense(64, activation='relu'),\n"," layers.Dense(128, activation='relu'),\n"," layers.Dense(256, activation='relu'),\n"," layers.Dense(512, activation='relu'),\n"," layers.Dense(input_dim, activation='sigmoid'),\n"," ])\n","\n"," def sample(self, z_mean, z_log_var):\n"," eps = tf.random.normal(shape=tf.shape(z_mean))\n"," return z_mean + tf.exp(0.5 * z_log_var) * eps\n","\n"," def call(self, inputs):\n"," # Encode\n"," z_params = self.encoder(inputs)\n"," z_mean, z_log_var = tf.split(z_params, num_or_size_splits=2, axis=1)\n"," z = self.sample(z_mean, z_log_var)\n","\n"," # Decode\n"," reconstructed = self.decoder(z)\n","\n"," # Compute VAE Loss\n"," reconstruction_loss = tf.reduce_mean(tf.square(inputs - reconstructed), axis=1)\n"," kl_loss = -0.5 * tf.reduce_mean(\n"," 1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=1\n"," )\n"," total_loss = tf.reduce_mean(reconstruction_loss + kl_loss)\n"," self.add_loss(total_loss)\n","\n"," return reconstructed\n","\n"," def get_config(self):\n"," config = super().get_config()\n"," config.update({\n"," \"input_dim\": self.input_dim,\n"," \"latent_dim\": self.latent_dim\n"," })\n"," return config"],"metadata":{"id":"8LDgPG74pNoi"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["vae.summary()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":209},"id":"mJjIlqLsvVj7","executionInfo":{"status":"ok","timestamp":1755093711873,"user_tz":-330,"elapsed":64,"user":{"displayName":"Babu Seshiah","userId":"00614315087314266727"}},"outputId":"423d82dc-8f6c-470c-ca42-1fb3a30891d7"},"execution_count":null,"outputs":[{"output_type":"display_data","data":{"text/plain":["\u001b[1mModel: \"vae\"\u001b[0m\n"],"text/html":["
Model: \"vae\"\n","
\n"]},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n","┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\n","┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n","│ sequential (\u001b[38;5;33mSequential\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m12\u001b[0m) │ \u001b[38;5;34m179,940\u001b[0m │\n","├─────────────────────────────────┼────────────────────────┼───────────────┤\n","│ sequential_1 (\u001b[38;5;33mSequential\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m8\u001b[0m) │ \u001b[38;5;34m179,888\u001b[0m │\n","└─────────────────────────────────┴────────────────────────┴───────────────┘\n"],"text/html":["
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n","┃ Layer (type)                     Output Shape                  Param # ┃\n","┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n","│ sequential (Sequential)         │ (None, 12)             │       179,940 │\n","├─────────────────────────────────┼────────────────────────┼───────────────┤\n","│ sequential_1 (Sequential)       │ (None, 8)              │       179,888 │\n","└─────────────────────────────────┴────────────────────────┴───────────────┘\n","
\n"]},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["\u001b[1m Total params: \u001b[0m\u001b[38;5;34m1,079,486\u001b[0m (4.12 MB)\n"],"text/html":["
 Total params: 1,079,486 (4.12 MB)\n","
\n"]},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m359,828\u001b[0m (1.37 MB)\n"],"text/html":["
 Trainable params: 359,828 (1.37 MB)\n","
\n"]},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"],"text/html":["
 Non-trainable params: 0 (0.00 B)\n","
\n"]},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["\u001b[1m Optimizer params: \u001b[0m\u001b[38;5;34m719,658\u001b[0m (2.75 MB)\n"],"text/html":["
 Optimizer params: 719,658 (2.75 MB)\n","
\n"]},"metadata":{}}]},{"cell_type":"code","source":["#Compile and train\n","vae = VAE(input_dim=X_train.shape[1], latent_dim=6)\n","vae.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005))\n","vae.fit(X_train, X_train, epochs=50, batch_size=32, validation_data=(X_test, X_test))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"BBhoFYEcA9vR","executionInfo":{"status":"ok","timestamp":1755093705791,"user_tz":-330,"elapsed":75359,"user":{"displayName":"Babu Seshiah","userId":"00614315087314266727"}},"outputId":"8b54c6cb-c02c-419b-e931-7b19f3cfab15","collapsed":true},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Epoch 1/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m11s\u001b[0m 14ms/step - loss: 0.0821 - val_loss: 0.0427\n","Epoch 2/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m4s\u001b[0m 5ms/step - loss: 0.0415 - val_loss: 0.0423\n","Epoch 3/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 5ms/step - loss: 0.0407 - val_loss: 0.0419\n","Epoch 4/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0420 - val_loss: 0.0419\n","Epoch 5/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0398 - val_loss: 0.0419\n","Epoch 6/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0414 - val_loss: 0.0421\n","Epoch 7/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0407 - val_loss: 0.0419\n","Epoch 8/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0401 - val_loss: 0.0419\n","Epoch 9/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0404 - val_loss: 0.0418\n","Epoch 10/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0389 - val_loss: 0.0420\n","Epoch 11/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0401 - val_loss: 0.0422\n","Epoch 12/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0407 - val_loss: 0.0419\n","Epoch 13/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 5ms/step - loss: 0.0396 - val_loss: 0.0418\n","Epoch 14/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 5ms/step - loss: 0.0397 - val_loss: 0.0420\n","Epoch 15/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0404 - val_loss: 0.0422\n","Epoch 16/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0401 - val_loss: 0.0420\n","Epoch 17/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0391 - val_loss: 0.0419\n","Epoch 18/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0407 - val_loss: 0.0419\n","Epoch 19/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0401 - val_loss: 0.0419\n","Epoch 20/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0410 - val_loss: 0.0421\n","Epoch 21/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0405 - val_loss: 0.0418\n","Epoch 22/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0404 - val_loss: 0.0419\n","Epoch 23/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 5ms/step - loss: 0.0403 - val_loss: 0.0419\n","Epoch 24/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 5ms/step - loss: 0.0405 - val_loss: 0.0418\n","Epoch 25/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0410 - val_loss: 0.0419\n","Epoch 26/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0403 - val_loss: 0.0419\n","Epoch 27/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 5ms/step - loss: 0.0397 - val_loss: 0.0419\n","Epoch 28/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 5ms/step - loss: 0.0399 - val_loss: 0.0419\n","Epoch 29/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 5ms/step - loss: 0.0385 - val_loss: 0.0418\n","Epoch 30/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0402 - val_loss: 0.0420\n","Epoch 31/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0397 - val_loss: 0.0418\n","Epoch 32/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0402 - val_loss: 0.0420\n","Epoch 33/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 5ms/step - loss: 0.0406 - val_loss: 0.0418\n","Epoch 34/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 6ms/step - loss: 0.0407 - val_loss: 0.0419\n","Epoch 35/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0398 - val_loss: 0.0418\n","Epoch 36/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0398 - val_loss: 0.0420\n","Epoch 37/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0386 - val_loss: 0.0418\n","Epoch 38/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0397 - val_loss: 0.0418\n","Epoch 39/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0404 - val_loss: 0.0419\n","Epoch 40/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0402 - val_loss: 0.0418\n","Epoch 41/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0412 - val_loss: 0.0418\n","Epoch 42/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 5ms/step - loss: 0.0405 - val_loss: 0.0421\n","Epoch 43/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 5ms/step - loss: 0.0396 - val_loss: 0.0420\n","Epoch 44/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 6ms/step - loss: 0.0408 - val_loss: 0.0419\n","Epoch 45/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0408 - val_loss: 0.0418\n","Epoch 46/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0391 - val_loss: 0.0420\n","Epoch 47/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0396 - val_loss: 0.0418\n","Epoch 48/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0398 - val_loss: 0.0419\n","Epoch 49/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0397 - val_loss: 0.0418\n","Epoch 50/50\n","\u001b[1m250/250\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step - loss: 0.0392 - val_loss: 0.0420\n"]},{"output_type":"execute_result","data":{"text/plain":[""]},"metadata":{},"execution_count":49}]},{"cell_type":"code","source":["reconstructed = vae.predict(X_test)\n","reconstruction_error = tf.reduce_mean(tf.square(X_test - reconstructed), axis=1).numpy()\n","threshold = np.percentile(reconstruction_error, 87.7)\n","\n","anomalies = reconstruction_error > threshold\n","print(f\"Anomalies detected: {np.sum(anomalies)} / {len(X_test)}\")"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"BqMA-MBVDMok","executionInfo":{"status":"ok","timestamp":1755093785450,"user_tz":-330,"elapsed":190,"user":{"displayName":"Babu Seshiah","userId":"00614315087314266727"}},"outputId":"c140d98d-202e-4b20-c77c-21036064a993"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["\u001b[1m63/63\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 2ms/step\n","Anomalies detected: 246 / 2000\n"]}]},{"cell_type":"code","source":["# Get indices of anomalies\n","anomaly_indices = np.where(anomalies)[0]\n","\n","# View the actual anomalous transaction(s)\n","anomalous_transactions = X_test[anomaly_indices]\n"],"metadata":{"collapsed":true,"id":"6iKDVHmIDbgO"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["print(\"Anomaly found at row(s):\", anomaly_indices)"],"metadata":{"collapsed":true,"colab":{"base_uri":"https://localhost:8080/"},"id":"Y1vUmD4JDhHn","executionInfo":{"status":"ok","timestamp":1755093790195,"user_tz":-330,"elapsed":16,"user":{"displayName":"Babu Seshiah","userId":"00614315087314266727"}},"outputId":"43271337-2a60-43fa-b93b-60131bf62fb5"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Anomaly found at row(s): [ 2 6 17 18 27 35 40 47 49 50 63 65 67 71\n"," 72 82 86 87 101 125 128 149 151 155 156 162 163 171\n"," 185 191 192 196 204 224 238 242 244 249 254 264 268 287\n"," 301 306 309 323 325 327 336 367 375 377 387 390 392 403\n"," 412 417 418 419 422 434 435 439 440 445 450 462 464 465\n"," 485 486 512 529 539 547 556 565 569 578 579 581 595 604\n"," 617 625 634 641 651 663 667 682 686 692 697 716 717 727\n"," 739 749 768 785 795 802 803 807 809 811 822 841 861 862\n"," 876 886 891 893 909 921 931 965 988 992 993 998 1000 1008\n"," 1009 1030 1035 1040 1042 1052 1070 1073 1096 1101 1106 1111 1120 1124\n"," 1125 1137 1147 1165 1169 1171 1178 1186 1187 1188 1199 1201 1203 1215\n"," 1216 1218 1227 1233 1236 1239 1244 1247 1251 1252 1256 1279 1296 1297\n"," 1301 1302 1319 1323 1328 1334 1347 1348 1352 1369 1372 1373 1374 1381\n"," 1389 1398 1400 1411 1412 1417 1424 1426 1428 1439 1443 1452 1453 1455\n"," 1474 1485 1494 1527 1542 1561 1577 1586 1587 1592 1600 1601 1604 1621\n"," 1653 1662 1669 1671 1676 1682 1688 1696 1697 1703 1708 1712 1743 1753\n"," 1756 1757 1763 1799 1803 1812 1817 1824 1843 1853 1872 1890 1903 1911\n"," 1920 1922 1926 1971 1974 1977 1994 1996]\n"]}]},{"cell_type":"code","source":["# Step 3: Get global indices (in original DataFrame)\n","global_anomaly_indices = idx_test[anomaly_indices]\n","\n","# Step 4: Fetch those rows from transactions_features_hr\n","anomaly_transactions_df = transactions_merge.iloc[global_anomaly_indices]\n","\n","# Optional: View it\n","import pandas as pd\n","pd.set_option('display.max_columns', None)\n","display(anomaly_transactions_df)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":877},"id":"fyLneOyVDnkW","executionInfo":{"status":"ok","timestamp":1755093793194,"user_tz":-330,"elapsed":171,"user":{"displayName":"Babu Seshiah","userId":"00614315087314266727"}},"outputId":"7d9dccbe-2aa9-404b-d036-d0a1796d4c07","collapsed":true},"execution_count":null,"outputs":[{"output_type":"display_data","data":{"text/plain":[" TransactionDate TransactionID InitiatingCustomer AccountNumber \\\n","2751 2025-07-29 T02752 CUST0144 ACC00144 \n","4798 2025-07-20 T04799 CUST0393 ACC00393 \n","1284 2025-07-24 T01285 CUST0105 ACC00562 \n","1990 2025-07-20 T01991 CUST0305 ACC00808 \n","4449 2025-07-21 T04450 CUST0203 ACC00203 \n","... ... ... ... ... \n","3185 2025-07-24 T03186 CUST0132 ACC00698 \n","4464 2025-07-28 T04465 CUST0258 ACC00992 \n","9705 2025-07-30 T09706 CUST0001 ACC00001 \n","2261 2025-07-22 T02262 CUST0115 ACC00424 \n","6969 2025-08-01 T06970 CUST0024 ACC00024 \n","\n"," Counterparty TranType Channel Currency Credit/Debit Amount \\\n","2751 CUST0070 FTDO MBAN VEF Cr 2606.47 \n","4798 CUST0205 FTDO MBAN USD Dr 502.92 \n","1284 CUST0065 MPAY IBAN USD Cr 849.94 \n","1990 CUST0183 FTDO CUST VEF Dr 1186.68 \n","4449 CUST0273 FEEC IBAN USD Cr 15073.67 \n","... ... ... ... ... ... ... \n","3185 CUST0226 BPAY IBAN USD Cr 12471.58 \n","4464 CUST0288 BPAY CHEQ INR Dr 22486.55 \n","9705 CUST0380 FTDO CUST USD Dr 7266.25 \n","2261 CUST0117 FEEC MBAN USD Dr 58381.93 \n","6969 CUST0002 FTDO SELF USD Cr 4603.60 \n","\n"," Amount in LCY Product InitiatingGeo CounterpartyGeo BusinessCode_IC \\\n","2751 208.52 PLON ARE ARE 1020 \n","4798 502.92 BUSL ARE ARE 1015 \n","1284 849.94 GLON CHN USD 1002 \n","1990 94.93 BUSL BRA BRA 1019 \n","4449 15073.67 BUSL COL CAN 1005 \n","... ... ... ... ... ... \n","3185 12471.58 BUSL ARE GBR 1006 \n","4464 269.84 PLON CHN HKG 1025 \n","9705 7266.25 BILLS NPL NPL 1014 \n","2261 58381.93 BUSL AUS IND 1009 \n","6969 4603.60 HOUL CAN CAN 1004 \n","\n"," CustTypeCode_IC PEP_Flag_IC ICName \\\n","2751 MBNK N Christopher Rubio Banking Ltd \n","4798 NBFC N Brian Smith Non-Banking Fin Co \n","1284 NBFC N Richard Henson Non-Banking Fin Co \n","1990 GOVT N Maria Parker Govt Co \n","4449 BFIS N Elizabeth Perkins Banking Ltd \n","... ... ... ... \n","3185 LLPF N Dustin Jordan Limited LP \n","4464 ASSO N Kathleen Moran & Association \n","9705 CLSO N Allison Hill Club \n","2261 FCOM N Garrett Lin Foreign Co \n","6969 BFIS N Tommy Walter Banking Ltd \n","\n"," BusinessCode_CP CustTypeCode_CP PEP_Flag_CP \\\n","2751 1008 SGOV N \n","4798 1013 PVTL N \n","1284 1022 INDM Y \n","1990 1019 PART N \n","4449 1024 MBNK N \n","... ... ... ... \n","3185 1011 MBNK N \n","4464 1013 PUBL Y \n","9705 1019 BFIS N \n","2261 1007 SGOV N \n","6969 1014 CLSO N \n","\n"," CounterpartyName ProductCode IC_BusinessRisk \\\n","2751 William Baker State Gov Co OVDF 1 \n","4798 Erin Warner Pvt Ltd Company PLON 1 \n","1284 Jessica Callahan BILLS 1 \n","1990 Angela Vaughn and Partners SAVS 1 \n","4449 Jonathan Lawrence Banking Ltd ELON 1 \n","... ... ... ... \n","3185 Benjamin Smith Banking Ltd SAVS 1 \n","4464 Carolyn Miller Pub Ltd Company CHEK 1 \n","9705 Megan Nelson Banking Ltd BUSL 10 \n","2261 William Herrera State Gov Co PLON 1 \n","6969 Noah Rhodes Club HOUL 1 \n","\n"," CP_BusinessRisk ChannelRisk IC_GeoRisk CP_GeoRisk IC_CustTypeRisk \\\n","2751 1 5 1 1 1 \n","4798 1 5 1 1 1 \n","1284 1 10 1 1 1 \n","1990 1 1 1 1 1 \n","4449 1 10 1 1 1 \n","... ... ... ... ... ... \n","3185 1 10 1 1 1 \n","4464 1 1 1 1 1 \n","9705 1 1 10 10 10 \n","2261 1 5 1 1 1 \n","6969 10 1 1 1 1 \n","\n"," CP_CustTypeRisk ProductRisk TranTypeRisk CurrencyRisk WL_IC WL_CP \\\n","2751 1 1 5 10 N N \n","4798 1 10 5 1 Y N \n","1284 1 1 1 1 N N \n","1990 1 10 5 10 N N \n","4449 1 10 1 1 N N \n","... ... ... ... ... ... ... \n","3185 1 10 1 1 N N \n","4464 1 1 1 1 N N \n","9705 1 1 5 1 N N \n","2261 1 10 1 1 N N \n","6969 10 5 5 1 N N \n","\n"," Overall_Tranx_Risk Overall_IC_Risk \n","2751 HighRisk LowRisk \n","4798 HighRisk HighRisk \n","1284 HighRisk LowRisk \n","1990 HighRisk LowRisk \n","4449 HighRisk LowRisk \n","... ... ... \n","3185 HighRisk LowRisk \n","4464 HighRisk LowRisk \n","9705 HighRisk HighRisk \n","2261 HighRisk LowRisk \n","6969 HighRisk LowRisk \n","\n","[246 rows x 37 columns]"],"text/html":["\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
TransactionDateTransactionIDInitiatingCustomerAccountNumberCounterpartyTranTypeChannelCurrencyCredit/DebitAmountAmount in LCYProductInitiatingGeoCounterpartyGeoBusinessCode_ICCustTypeCode_ICPEP_Flag_ICICNameBusinessCode_CPCustTypeCode_CPPEP_Flag_CPCounterpartyNameProductCodeIC_BusinessRiskCP_BusinessRiskChannelRiskIC_GeoRiskCP_GeoRiskIC_CustTypeRiskCP_CustTypeRiskProductRiskTranTypeRiskCurrencyRiskWL_ICWL_CPOverall_Tranx_RiskOverall_IC_Risk
27512025-07-29T02752CUST0144ACC00144CUST0070FTDOMBANVEFCr2606.47208.52PLONAREARE1020MBNKNChristopher Rubio Banking Ltd1008SGOVNWilliam Baker State Gov CoOVDF11511111510NNHighRiskLowRisk
47982025-07-20T04799CUST0393ACC00393CUST0205FTDOMBANUSDDr502.92502.92BUSLAREARE1015NBFCNBrian Smith Non-Banking Fin Co1013PVTLNErin Warner Pvt Ltd CompanyPLON11511111051YNHighRiskHighRisk
12842025-07-24T01285CUST0105ACC00562CUST0065MPAYIBANUSDCr849.94849.94GLONCHNUSD1002NBFCNRichard Henson Non-Banking Fin Co1022INDMYJessica CallahanBILLS11101111111NNHighRiskLowRisk
19902025-07-20T01991CUST0305ACC00808CUST0183FTDOCUSTVEFDr1186.6894.93BUSLBRABRA1019GOVTNMaria Parker Govt Co1019PARTNAngela Vaughn and PartnersSAVS111111110510NNHighRiskLowRisk
44492025-07-21T04450CUST0203ACC00203CUST0273FEECIBANUSDCr15073.6715073.67BUSLCOLCAN1005BFISNElizabeth Perkins Banking Ltd1024MBNKNJonathan Lawrence Banking LtdELON111011111011NNHighRiskLowRisk
..................................................................................................................
31852025-07-24T03186CUST0132ACC00698CUST0226BPAYIBANUSDCr12471.5812471.58BUSLAREGBR1006LLPFNDustin Jordan Limited LP1011MBNKNBenjamin Smith Banking LtdSAVS111011111011NNHighRiskLowRisk
44642025-07-28T04465CUST0258ACC00992CUST0288BPAYCHEQINRDr22486.55269.84PLONCHNHKG1025ASSONKathleen Moran & Association1013PUBLYCarolyn Miller Pub Ltd CompanyCHEK1111111111NNHighRiskLowRisk
97052025-07-30T09706CUST0001ACC00001CUST0380FTDOCUSTUSDDr7266.257266.25BILLSNPLNPL1014CLSONAllison Hill Club1019BFISNMegan Nelson Banking LtdBUSL10111010101151NNHighRiskHighRisk
22612025-07-22T02262CUST0115ACC00424CUST0117FEECMBANUSDDr58381.9358381.93BUSLAUSIND1009FCOMNGarrett Lin Foreign Co1007SGOVNWilliam Herrera State Gov CoPLON11511111011NNHighRiskLowRisk
69692025-08-01T06970CUST0024ACC00024CUST0002FTDOSELFUSDCr4603.604603.60HOULCANCAN1004BFISNTommy Walter Banking Ltd1014CLSONNoah Rhodes ClubHOUL110111110551NNHighRiskLowRisk
\n","

246 rows × 37 columns

\n","
\n","
\n","\n","
\n"," \n","\n"," \n","\n"," \n","
\n","\n","\n","
\n"," \n","\n","\n","\n"," \n","
\n","\n","
\n"," \n"," \n"," \n","
\n","\n","
\n","
\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","variable_name":"anomaly_transactions_df"}},"metadata":{}}]},{"cell_type":"markdown","source":["#4. Compare the results of rule-based logic and AI based model results (Limited comparison)"],"metadata":{"id":"PHtAoA-mhb_Y"}},{"cell_type":"code","source":["# Step 1: Extract transaction IDs from both sources\n","vae_anomaly_ids = anomaly_transactions_df[\"TransactionID\"].astype(str).unique()\n","rule_based_ids = flat_trx_df[\"Transaction IDs\"].astype(str).unique()\n","\n","# Step 2: Find matches (overlap) and mismatches\n","matched_ids = np.intersect1d(vae_anomaly_ids, rule_based_ids)\n","vae_only_ids = np.setdiff1d(vae_anomaly_ids, rule_based_ids)\n","rule_only_ids = np.setdiff1d(rule_based_ids, vae_anomaly_ids)\n","\n","# Step 3: Print results\n","print(f\"Total VAE anomalies: {len(vae_anomaly_ids)}\")\n","print(f\"Total Rule-based alerts: {len(rule_based_ids)}\")\n","print(f\"Matched (VAE ∩ Rule-based): {len(matched_ids)}\")\n","print(f\"Match % from VAE anomalies: {len(matched_ids) / len(vae_anomaly_ids) * 100:.2f}%\")\n","\n","# Step 4 (Optional): Get full details of matches and mismatches\n","matched_df = anomaly_transactions_df[anomaly_transactions_df[\"TransactionID\"].isin(matched_ids)]\n","vae_only_df = anomaly_transactions_df[anomaly_transactions_df[\"TransactionID\"].isin(vae_only_ids)]\n","rule_only_tranx = flat_trx_df[flat_trx_df[\"Transaction IDs\"].isin(rule_only_ids)]\n","rule_only_tranx = rule_only_tranx.rename(columns={'Transaction IDs': 'TransactionID'})\n","rule_only_df = rule_only_tranx.merge(transactions_merge, on='TransactionID', how='left')\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"YHYomPnZKNq-","executionInfo":{"status":"ok","timestamp":1755093797040,"user_tz":-330,"elapsed":23,"user":{"displayName":"Babu Seshiah","userId":"00614315087314266727"}},"outputId":"f741efc2-63cf-43d7-8963-e046dea69a13"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Total VAE anomalies: 246\n","Total Rule-based alerts: 246\n","Matched (VAE ∩ Rule-based): 6\n","Match % from VAE anomalies: 2.44%\n"]}]},{"cell_type":"code","source":["#Save the model for use on new/unseen data in the other notebook\n","vae.save(\"vae.keras\")"],"metadata":{"id":"htNxM6Ksdh9a"},"execution_count":null,"outputs":[]}]}