AJAY KASU committed on
Commit
7b4b1dd
·
1 Parent(s): 2471c97

Feature: Expanded column mapping with NLP variants and extra AML fields

Browse files
Files changed (1) hide show
  1. modules/etl.py +22 -11
modules/etl.py CHANGED
@@ -3,30 +3,41 @@ import numpy as np
3
 
4
  # Column mapping — maps ANY known variant to internal name
5
  COLUMN_MAP = {
6
- # transaction_id variants
 
7
  "tx_id": "transaction_id",
8
  "txn_id": "transaction_id",
9
 
10
- # customer_id — ONLY map sender, not receiver
11
- "sender_account_id": "customer_id", # ← sender is the customer
12
- "account_id": "customer_id",
13
  "cust_id": "customer_id",
14
 
15
- # Keep receiver separate — DO NOT map to customer_id
16
- # "receiver_account_id" stays as receiver_account_id
17
-
18
- # amount variants
19
  "tx_amount": "amount",
20
  "txn_amount": "amount",
21
 
22
- # timestamp variants
 
23
  "transaction_date": "timestamp",
24
  "tx_date": "timestamp",
25
 
26
- # transaction_type variants
 
27
  "tx_type": "transaction_type",
28
  "txn_type": "transaction_type",
29
- "type": "transaction_type",
 
 
 
 
 
 
 
 
 
 
30
  }
31
 
32
  def load_and_validate(file):
 
3
 
4
  # Column mapping — maps ANY known variant to internal name
5
  COLUMN_MAP = {
6
+ # Normalize column names first (lowercase + strip)
7
+ "transaction id": "transaction_id",
8
  "tx_id": "transaction_id",
9
  "txn_id": "transaction_id",
10
 
11
+ # customer
12
+ "person involved": "customer_id",
13
+ "sender_account_id": "customer_id",
14
  "cust_id": "customer_id",
15
 
16
+ # amount — handles spaces and special chars
17
+ "amount (usd)": "amount",
 
 
18
  "tx_amount": "amount",
19
  "txn_amount": "amount",
20
 
21
+ # timestamp
22
+ "date of transaction": "timestamp",
23
  "transaction_date": "timestamp",
24
  "tx_date": "timestamp",
25
 
26
+ # transaction type
27
+ "transaction type": "transaction_type",
28
  "tx_type": "transaction_type",
29
  "txn_type": "transaction_type",
30
+
31
+ # countries
32
+ "country": "origin_country",
33
+ "destination country": "dest_country",
34
+
35
+ # extra AML fields
36
+ "money laundering risk score": "ml_risk_score",
37
+ "shell companies involved": "shell_companies",
38
+ "tax haven country": "tax_haven",
39
+ "source of money": "source_of_money",
40
+ "reported by authority": "reported_by_authority",
41
  }
42
 
43
  def load_and_validate(file):