namanpenguin committed on
Commit
7df50b4
·
verified ·
1 Parent(s): 652507c

Upload 2 files

Browse files
Files changed (2) hide show
  1. config.py +69 -0
  2. dataset_utils.py +165 -0
config.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # config.py
2
+
3
+ import torch
4
+ import os
5
+
6
# --- Paths ---
# Point DATA_PATH at the CSV that holds your transactions.
DATA_PATH = './data/synthetic_transactions_samples_5000.csv'
TOKENIZER_PATH = './tokenizer/'
LABEL_ENCODERS_PATH = './label_encoders.pkl'
MODEL_SAVE_DIR = './saved_models/'
# Per-model predictions are written here so a voting ensemble can read them.
PREDICTIONS_SAVE_DIR = './predictions/'

# --- Data Columns ---
TEXT_COLUMN = "Sanction_Context"
# Every target column the models predict (one classification head per entry).
LABEL_COLUMNS = [
    "Red_Flag_Reason", "Maker_Action", "Escalation_Level",
    "Risk_Category", "Risk_Drivers", "Investigation_Outcome",
]
# Optional numerical/categorical metadata columns, e.g.
# ["Risk_Score", "Transaction_Amount"]. Empty for now; anything listed here
# must exist in the CSV and be numeric (or convertible to numeric).
METADATA_COLUMNS = []

# --- Model Hyperparameters ---
# Maximum token sequence length fed to the transformer tokenizers.
MAX_LEN = 128
# Samples per batch for both training and evaluation.
BATCH_SIZE = 16
# Step size for the AdamW optimizer.
LEARNING_RATE = 2e-5
# Training epochs; tune according to convergence.
NUM_EPOCHS = 3
# Dropout probability used for regularization.
DROPOUT_RATE = 0.3

# --- Device Configuration ---
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Specific Model Configurations ---
BERT_MODEL_NAME = 'bert-base-uncased'
ROBERTA_MODEL_NAME = 'roberta-base'
DEBERTA_MODEL_NAME = 'microsoft/deberta-base'

# TF-IDF: upper bound on the vectorizer vocabulary size.
TFIDF_MAX_FEATURES = 5000

# --- Field-Specific Strategy (Conceptual) ---
# Conceptual per-field enhancement notes. Nothing in this module acts on them;
# implementing them means adapting the models (custom losses, metadata inputs).
FIELD_STRATEGIES = {
    "Maker_Action": {
        # Needs a custom Focal Loss implementation.
        "loss": "focal_loss",
        # Advanced NLP concepts.
        "enhancements": ["action_templates", "context_prompt_tuning"],
    },
    "Risk_Category": {
        # Would integrate METADATA_COLUMNS.
        "enhancements": ["numerical_metadata", "transaction_patterns"],
    },
    "Escalation_Level": {
        # Handled by class weights / metadata.
        "enhancements": ["class_balancing", "policy_keyword_patterns"],
    },
    "Investigation_Outcome": {
        # If treated as generation, a T5/BART-style model would be needed.
        "type": "classification_or_generation",
    },
}

# Create the output directories at import time so later saves cannot fail on
# a missing folder (same order as before: models, predictions, tokenizer).
for _directory in (MODEL_SAVE_DIR, PREDICTIONS_SAVE_DIR, TOKENIZER_PATH):
    os.makedirs(_directory, exist_ok=True)
del _directory
dataset_utils.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # dataset_utils.py
2
+
3
+ import pandas as pd
4
+ import torch
5
+ from torch.utils.data import Dataset, DataLoader
6
+ from sklearn.preprocessing import LabelEncoder
7
+ from transformers import BertTokenizer, RobertaTokenizer, DebertaTokenizer
8
+ import pickle
9
+ import os
10
+
11
+ from config import TEXT_COLUMN, LABEL_COLUMNS, MAX_LEN, TOKENIZER_PATH, LABEL_ENCODERS_PATH, METADATA_COLUMNS
12
+
13
class ComplianceDataset(Dataset):
    """
    Custom Dataset class for handling text and multi-output labels for PyTorch models.

    Args:
        texts: Indexable collection of text samples (list, NumPy array or
            pandas Series).
        labels: Indexable collection of per-sample label rows (list of lists,
            NumPy array or pandas DataFrame/Series).
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=..., truncation=..., max_length=..., return_tensors="pt")``
            that returns a mapping of tensors with a leading batch dimension.
        max_len (int): Length every sample is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = self._positional(texts)
        self.labels = self._positional(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    @staticmethod
    def _positional(values):
        """Return `values` in a form where ``values[i]`` means the i-th row.

        ``[]`` on a pandas Series looks up by index *label* and on a DataFrame
        by *column*, so after a shuffle/split/filter the positional index used
        by ``__getitem__`` would fetch the wrong row or raise KeyError.
        Converting pandas objects to NumPy arrays makes the lookup positional;
        every other container is returned untouched.
        """
        if hasattr(values, "iloc"):
            return values.to_numpy()
        return values

    def __len__(self):
        """Returns the total number of samples in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """
        Retrieves the sample at position `idx`.

        Returns:
            tuple: ``(inputs, labels)`` where ``inputs`` is a dict of tokenizer
            outputs with the batch dimension removed and ``labels`` is a
            ``torch.long`` tensor built from the label row.
        """
        text = str(self.texts[idx])
        # Pad to max_length and truncate longer texts so every sample has the
        # same length; return_tensors="pt" asks for PyTorch tensors.
        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        # The tokenizer is called on one text, so drop the batch dimension of size 1.
        inputs = {key: val.squeeze(0) for key, val in inputs.items()}
        labels = torch.tensor(self.labels[idx], dtype=torch.long)
        return inputs, labels
47
+
48
class ComplianceDatasetWithMetadata(Dataset):
    """
    Custom Dataset class for handling text, additional numerical metadata, and multi-output labels.
    Used for hybrid models combining text and tabular features.

    Args:
        texts: Indexable collection of text samples (list, NumPy array or
            pandas Series).
        metadata: Per-sample numeric feature rows (NumPy array, list of lists
            or pandas DataFrame).
        labels: Per-sample label rows (list of lists, NumPy array or pandas
            DataFrame/Series).
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=..., truncation=..., max_length=..., return_tensors="pt")``
            that returns a mapping of tensors with a leading batch dimension.
        max_len (int): Length every sample is padded/truncated to.
    """

    def __init__(self, texts, metadata, labels, tokenizer, max_len):
        self.texts = self._positional(texts)
        self.metadata = self._positional(metadata)
        self.labels = self._positional(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    @staticmethod
    def _positional(values):
        """Return `values` in a form where ``values[i]`` means the i-th row.

        ``[]`` on a pandas Series looks up by index *label* and on a DataFrame
        by *column*, so after a shuffle/split/filter the positional index used
        by ``__getitem__`` would fetch the wrong row or raise KeyError.
        Converting pandas objects to NumPy arrays makes the lookup positional;
        every other container is returned untouched.
        """
        if hasattr(values, "iloc"):
            return values.to_numpy()
        return values

    def __len__(self):
        """Returns the total number of samples in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """
        Retrieves the sample at position `idx` together with its metadata and labels.

        Returns:
            tuple: ``(inputs, metadata, labels)`` where ``inputs`` is a dict of
            tokenizer outputs with the batch dimension removed, ``metadata`` is
            a ``torch.float`` tensor and ``labels`` is a ``torch.long`` tensor.
        """
        text = str(self.texts[idx])
        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        # The tokenizer is called on one text, so drop the batch dimension of size 1.
        inputs = {key: val.squeeze(0) for key, val in inputs.items()}
        metadata = torch.tensor(self.metadata[idx], dtype=torch.float)
        labels = torch.tensor(self.labels[idx], dtype=torch.long)
        return inputs, metadata, labels
82
+
83
def load_and_preprocess_data(data_path):
    """
    Loads data from a CSV, fills missing values, and encodes categorical labels.
    Also handles converting specified METADATA_COLUMNS to numeric.

    Args:
        data_path (str): Path to the CSV data file.

    Returns:
        tuple: A tuple containing:
            - data (pd.DataFrame): The preprocessed DataFrame, with every
              column in LABEL_COLUMNS replaced by its integer encoding.
            - label_encoders (dict): A dictionary of fitted LabelEncoder
              objects, keyed by label column name.

    Raises:
        KeyError: If one or more LABEL_COLUMNS are absent from the CSV.
    """
    data = pd.read_csv(data_path)
    data.fillna("Unknown", inplace=True)  # Replace every missing cell with "Unknown"

    # Convert metadata columns to numeric. Unparseable cells (including the
    # "Unknown" placeholder inserted above) become NaN and are then set to 0.
    for col in METADATA_COLUMNS:
        if col in data.columns:
            data[col] = pd.to_numeric(data[col], errors='coerce').fillna(0)

    # Report all missing label columns at once instead of failing on the first.
    missing = [col for col in LABEL_COLUMNS if col not in data.columns]
    if missing:
        raise KeyError(f"Label columns missing from {data_path}: {missing}")

    label_encoders = {}
    for col in LABEL_COLUMNS:
        values = data[col]
        # A numeric column that had missing cells now mixes numbers with the
        # "Unknown" string; LabelEncoder cannot sort mixed types, so encode
        # object columns by their string form.
        if values.dtype == object:
            values = values.astype(str)
        encoder = LabelEncoder()
        data[col] = encoder.fit_transform(values)
        label_encoders[col] = encoder
    return data, label_encoders
110
+
111
def get_tokenizer(model_name):
    """
    Returns the appropriate Hugging Face tokenizer based on the model name.

    The family is detected by case-insensitive substring match. Because "bert"
    is itself a substring of "roberta" and "deberta", the more specific names
    are tested first; otherwise every model would get a BertTokenizer.

    Args:
        model_name (str): The name of the pre-trained model (e.g., 'bert-base-uncased').

    Returns:
        transformers.PreTrainedTokenizer: The initialized tokenizer.

    Raises:
        ValueError: If the model name matches none of the supported families.
    """
    name = model_name.lower()
    if "deberta" in name:
        return DebertaTokenizer.from_pretrained(model_name)
    if "roberta" in name:
        return RobertaTokenizer.from_pretrained(model_name)
    if "bert" in name:
        return BertTokenizer.from_pretrained(model_name)
    raise ValueError(f"Unsupported tokenizer for model: {model_name}")
129
+
130
def save_label_encoders(label_encoders):
    """
    Pickles the label-encoder dictionary to LABEL_ENCODERS_PATH.

    The saved encoders are what later maps predicted class indices back to the
    original label values.

    Args:
        label_encoders (dict): Dictionary of LabelEncoder objects.
    """
    with open(LABEL_ENCODERS_PATH, "wb") as handle:
        pickle.dump(label_encoders, handle)
    print(f"Label encoders saved to {LABEL_ENCODERS_PATH}")
141
+
142
def load_label_encoders():
    """
    Loads a dictionary of label encoders from the pickle file at LABEL_ENCODERS_PATH.

    Returns:
        dict: Loaded dictionary of LabelEncoder objects.

    Raises:
        FileNotFoundError: If LABEL_ENCODERS_PATH does not exist.
    """
    # NOTE: unpickling can execute arbitrary code; only load files this
    # project wrote itself.
    with open(LABEL_ENCODERS_PATH, "rb") as f:
        label_encoders = pickle.load(f)
    # Print before returning; a statement placed after `return` never runs.
    print(f"Label encoders loaded from {LABEL_ENCODERS_PATH}")
    return label_encoders
152
+
153
+
154
def get_num_labels(label_encoders):
    """
    Counts the classes learned by each label encoder, in LABEL_COLUMNS order.

    The resulting list gives the output size of each classification head.

    Args:
        label_encoders (dict): Dictionary of LabelEncoder objects.

    Returns:
        list: A list of integers, where each integer is the number of classes for a label.
    """
    class_counts = []
    for column in LABEL_COLUMNS:
        encoder = label_encoders[column]
        class_counts.append(len(encoder.classes_))
    return class_counts