subbunanepalli committed on
Commit
ccba496
·
verified ·
1 Parent(s): 356cf69

Upload 4 files

Browse files
Files changed (4) hide show
  1. config.py +61 -0
  2. deberta_model.py +22 -0
  3. label_encoders.pkl +3 -0
  4. requirements.txt +8 -0
config.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Shared configuration constants: paths, data columns, hyperparameters,
device selection and pretrained model names.

Importing this module has a side effect: the model, prediction and
tokenizer directories are created if they do not already exist.
"""
import os

import torch

# --- Paths ---
# The default dataset location is an absolute Kaggle input mount. Set the
# DATA_PATH environment variable to point at the CSV when running anywhere
# else; an unset or empty variable falls back to the default.
_DEFAULT_DATA_PATH = '/kaggle/input/synthesis-data/synthetic_transactions_samples_5000.csv'
DATA_PATH = os.environ.get('DATA_PATH') or _DEFAULT_DATA_PATH
TOKENIZER_PATH = './tokenizer/'
LABEL_ENCODERS_PATH = './label_encoders.pkl'
MODEL_SAVE_DIR = './saved_models/'
PREDICTIONS_SAVE_DIR = './predictions/'

# --- Data Columns ---
# Name of the free-text input column.
TEXT_COLUMN = "Sanction_Context"
# Names of the target columns; the order of this list is significant to any
# consumer that pairs it positionally with per-label outputs.
LABEL_COLUMNS = [
    "Red_Flag_Reason",
    "Maker_Action",
    "Escalation_Level",
    "Risk_Category",
    "Risk_Drivers",
    "Investigation_Outcome",
]
# No metadata columns are configured.
METADATA_COLUMNS = []

# --- Model Hyperparameters ---
MAX_LEN = 128
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
NUM_EPOCHS = 3
DROPOUT_RATE = 0.3

# --- Device Configuration ---
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Model Names ---
BERT_MODEL_NAME = 'bert-base-uncased'
ROBERTA_MODEL_NAME = 'roberta-base'
DEBERTA_MODEL_NAME = 'microsoft/deberta-base'

# --- TF-IDF ---
TFIDF_MAX_FEATURES = 5000

# --- Optional Strategy Definitions ---
# Per-label hints keyed by label column name.
# NOTE(review): how (or whether) these entries are consumed is not visible
# from this module -- confirm against the training code.
FIELD_STRATEGIES = {
    "Maker_Action": {
        "loss": "focal_loss",
        "enhancements": ["action_templates", "context_prompt_tuning"],
    },
    "Risk_Category": {
        "enhancements": ["numerical_metadata", "transaction_patterns"],
    },
    "Escalation_Level": {
        "enhancements": ["class_balancing", "policy_keyword_patterns"],
    },
    "Investigation_Outcome": {
        "type": "classification_or_generation",
    },
}

# Ensure save directories exist (created in the same order as before).
for _directory in (MODEL_SAVE_DIR, PREDICTIONS_SAVE_DIR, TOKENIZER_PATH):
    os.makedirs(_directory, exist_ok=True)
deberta_model.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import torch
3
+ import torch.nn as nn
4
+ from transformers import DebertaModel
5
+ from config import DROPOUT_RATE, DEBERTA_MODEL_NAME
6
+
7
class DebertaMultiOutputModel(nn.Module):
    """DeBERTa encoder followed by one linear classification head per label.

    The encoder output at sequence position 0 is passed through dropout and
    then fed to every head, so ``forward`` yields one tensor per head.
    """

    # Pretrained checkpoint name exposed on the class.
    # NOTE(review): presumably read by callers to load the matching
    # tokenizer -- confirm against the code that uses it.
    tokenizer_name = DEBERTA_MODEL_NAME

    def __init__(self, num_labels):
        """Create the encoder, the dropout layer and the classification heads.

        Args:
            num_labels: Iterable of class counts. One ``nn.Linear`` head is
                built per entry, in iteration order.
        """
        super().__init__()
        self.deberta = DebertaModel.from_pretrained(DEBERTA_MODEL_NAME)
        self.dropout = nn.Dropout(DROPOUT_RATE)

        # Every head maps the encoder's hidden size to its own class count.
        heads = []
        for class_count in num_labels:
            heads.append(nn.Linear(self.deberta.config.hidden_size, class_count))
        self.classifiers = nn.ModuleList(heads)

    def forward(self, input_ids, attention_mask):
        """Run the encoder and return the output of every head.

        Args:
            input_ids: Passed to the encoder as ``input_ids``.
            attention_mask: Passed to the encoder as ``attention_mask``.

        Returns:
            A list with one head output per entry of ``self.classifiers``,
            in the same order.
        """
        encoder_output = self.deberta(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the sequence axis: the [CLS] token representation.
        first_token = encoder_output.last_hidden_state[:, 0]
        features = self.dropout(first_token)

        per_head_outputs = []
        for head in self.classifiers:
            per_head_outputs.append(head(features))
        return per_head_outputs
label_encoders.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be834abbaaa80f915d0a0015f541a17ae6fda5c75d9485cb23c6a7b7bb7b7c97
3
+ size 2047
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.104.1
2
+ uvicorn==0.24.0
3
+ pydantic==2.4.2
4
+ transformers==4.35.2
5
+ torch==2.1.1
6
+ numpy==1.24.3
7
+ pandas==2.1.3
8
+ scikit-learn==1.3.2