ganeshkonapalli committed on
Commit c3598f5 · verified · 1 Parent(s): de734ab

Upload 9 files

Files changed (9)
  1. BERT_model.pth +3 -0
  2. Dockerfile +38 -0
  3. app.py +77 -0
  4. bert_model.py +59 -0
  5. config.py +69 -0
  6. dataset_utils.py +165 -0
  7. docker-compose.yml +15 -0
  8. label_encoders.pkl +3 -0
  9. requirements.txt +8 -0
BERT_model.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a7050d02ac599ef72d7b0410a79a72537fb44d4ac66eb8a1dc719329c8c4b07b
+ size 438239057
Dockerfile ADDED
@@ -0,0 +1,38 @@
+ # Use an official Python runtime as a parent image
+ FROM python:3.9-slim
+
+ # Set working directory
+ WORKDIR /app
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Create a non-root user
+ RUN useradd -m -u 1000 appuser
+
+ # Copy requirements first to leverage Docker cache
+ COPY requirements.txt .
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Create necessary directories and set permissions
+ RUN mkdir -p /app/saved_models /app/tokenizer /app/predictions /app/.cache \
+     && chown -R appuser:appuser /app
+
+ # Switch to non-root user
+ USER appuser
+
+ # Copy the application code
+ COPY --chown=appuser:appuser . .
+
+ # Download the BERT tokenizer
+ RUN python -c "from transformers import BertTokenizer; BertTokenizer.from_pretrained('bert-base-uncased', cache_dir='/app/.cache')"
+
+ # Expose the port the app runs on (7860 for Hugging Face Spaces)
+ EXPOSE 7860
+
+ # Command to run the application
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,77 @@
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+ import torch
+ from transformers import BertTokenizer
+ from bert_model import BertMultiOutputModel  # bert_model.py sits at the repo root in this upload
+ from config import TEXT_COLUMN, LABEL_COLUMNS, MAX_LEN, DEVICE
+ from dataset_utils import load_label_encoders
+ import numpy as np
+ import os
+
+ app = FastAPI()
+
+ # Load the model and tokenizer
+ model_path = "BERT_model.pth"
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ model = BertMultiOutputModel([len(load_label_encoders()[col].classes_) for col in LABEL_COLUMNS]).to(DEVICE)
+ model.load_state_dict(torch.load(model_path, map_location=DEVICE))
+ model.eval()
+
+ class PredictionRequest(BaseModel):
+     sanction_context: str
+
+ @app.get("/")
+ async def root():
+     return {"status": "healthy", "message": "BERT API is running"}
+
+ @app.get("/health")
+ async def health_check():
+     return {"status": "healthy"}
+
+ @app.post("/predict")
+ async def predict(request: PredictionRequest):
+     try:
+         # Tokenize the input text
+         inputs = tokenizer(
+             request.sanction_context,
+             padding='max_length',
+             truncation=True,
+             max_length=MAX_LEN,
+             return_tensors="pt"
+         )
+
+         # Move inputs to device
+         input_ids = inputs['input_ids'].to(DEVICE)
+         attention_mask = inputs['attention_mask'].to(DEVICE)
+
+         # Get predictions
+         with torch.no_grad():
+             outputs = model(input_ids=input_ids, attention_mask=attention_mask)
+             probabilities = [torch.softmax(output, dim=1).cpu().numpy() for output in outputs]
+             predictions = [np.argmax(prob, axis=1) for prob in probabilities]
+
+         # Load label encoders to decode predictions
+         label_encoders = load_label_encoders()
+
+         # Format response
+         response = {}
+         for col, pred, prob in zip(LABEL_COLUMNS, predictions, probabilities):
+             decoded_pred = label_encoders[col].inverse_transform(pred)[0]
+             response[col] = {
+                 "prediction": decoded_pred,
+                 "probabilities": {
+                     label: float(prob[0][j])
+                     for j, label in enumerate(label_encoders[col].classes_)
+                 }
+             }
+
+         return response
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+ if __name__ == "__main__":
+     import uvicorn
+     # For Hugging Face Spaces, we need to use port 7860
+     port = int(os.environ.get("PORT", 7860))
+     uvicorn.run(app, host="0.0.0.0", port=port)
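For reference, a minimal client-side sketch of calling the /predict endpoint once the container is up. The local URL, port 7860, the `requests` dependency, and the example sanction text are illustrative assumptions, not part of the upload:

    import requests  # assumed client-side dependency, not listed in requirements.txt

    # Hypothetical example text; any free-form sanction context string fits the request schema.
    payload = {"sanction_context": "Wire transfer flagged during sanctions screening review."}

    # Assumes the API is reachable locally on port 7860 (the port exposed in the Dockerfile).
    resp = requests.post("http://localhost:7860/predict", json=payload, timeout=60)
    resp.raise_for_status()

    # The response maps each label column to its decoded prediction and per-class probabilities.
    for label, result in resp.json().items():
        print(label, "->", result["prediction"])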
bert_model.py ADDED
@@ -0,0 +1,59 @@
+ # models/bert_model.py
+
+ import torch
+ import torch.nn as nn
+ from transformers import BertModel
+ from config import DROPOUT_RATE, BERT_MODEL_NAME  # Import BERT_MODEL_NAME from config
+
+ class BertMultiOutputModel(nn.Module):
+     """
+     BERT-based model for multi-output classification.
+     It uses a pre-trained BERT model as its backbone and adds a dropout layer
+     followed by separate linear classification heads for each target label.
+     """
+     # Statically set tokenizer name for easy access in main.py
+     tokenizer_name = BERT_MODEL_NAME
+
+     def __init__(self, num_labels):
+         """
+         Initializes the BertMultiOutputModel.
+
+         Args:
+             num_labels (list): A list where each element is the number of classes
+                                for a corresponding label column.
+         """
+         super(BertMultiOutputModel, self).__init__()
+         # Load the pre-trained BERT model.
+         # BertModel provides contextual embeddings and a pooled output for classification.
+         self.bert = BertModel.from_pretrained(BERT_MODEL_NAME)
+         self.dropout = nn.Dropout(DROPOUT_RATE)  # Dropout layer for regularization
+
+         # Create a list of classification heads, one for each label column.
+         # Each head is a linear layer mapping BERT's pooled output size to the number of classes for that label.
+         self.classifiers = nn.ModuleList([
+             nn.Linear(self.bert.config.hidden_size, n_classes) for n_classes in num_labels
+         ])
+
+     def forward(self, input_ids, attention_mask):
+         """
+         Performs the forward pass of the model.
+
+         Args:
+             input_ids (torch.Tensor): Tensor of token IDs (from tokenizer).
+             attention_mask (torch.Tensor): Tensor indicating attention (from tokenizer).
+
+         Returns:
+             list: A list of logit tensors, one for each classification head.
+                   Each tensor has shape (batch_size, num_classes_for_that_label).
+         """
+         # Pass input_ids and attention_mask through BERT.
+         # .pooler_output typically represents the hidden state of the [CLS] token,
+         # processed through a linear layer and tanh activation, often used for classification.
+         pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output
+
+         # Apply dropout for regularization
+         pooled_output = self.dropout(pooled_output)
+
+         # Pass the pooled output through each classification head.
+         # The result is a list of logits (raw scores before softmax/sigmoid) for each label.
+         return [classifier(pooled_output) for classifier in self.classifiers]
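A short sketch of how the multi-head output behaves, assuming an illustrative label layout of three columns with 4, 5, and 3 classes (in app.py the real class counts come from the fitted label encoders):

    import torch
    from transformers import BertTokenizer
    from bert_model import BertMultiOutputModel  # root-level module in this upload

    # Illustrative class counts only; production code derives them from label_encoders.pkl.
    model = BertMultiOutputModel(num_labels=[4, 5, 3])
    model.eval()

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    enc = tokenizer("Example sanction context", padding="max_length",
                    truncation=True, max_length=128, return_tensors="pt")

    with torch.no_grad():
        logits = model(input_ids=enc["input_ids"], attention_mask=enc["attention_mask"])

    # One logits tensor per label column: shapes (1, 4), (1, 5), (1, 3)
    print([tuple(t.shape) for t in logits])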
config.py ADDED
@@ -0,0 +1,69 @@
+ # config.py
+
+ import torch
+ import os
+
+ # --- Paths ---
+ # Adjust DATA_PATH to your actual data location
+ DATA_PATH = './data/synthetic_transactions_samples_5000.csv'
+ TOKENIZER_PATH = './tokenizer/'
+ LABEL_ENCODERS_PATH = './label_encoders.pkl'
+ MODEL_SAVE_DIR = './saved_models/'
+ PREDICTIONS_SAVE_DIR = './predictions/'  # To save predictions for voting ensemble
+
+ # --- Data Columns ---
+ TEXT_COLUMN = "Sanction_Context"
+ # Define all your target label columns
+ LABEL_COLUMNS = [
+     "Red_Flag_Reason",
+     "Maker_Action",
+     "Escalation_Level",
+     "Risk_Category",
+     "Risk_Drivers",
+     "Investigation_Outcome"
+ ]
+ # Example metadata columns. Add actual numerical/categorical metadata if available in your CSV.
+ # For now, it's an empty list. If you add metadata, ensure these columns exist and are numeric or can be encoded.
+ METADATA_COLUMNS = []  # e.g., ["Risk_Score", "Transaction_Amount"]
+
+ # --- Model Hyperparameters ---
+ MAX_LEN = 128  # Maximum sequence length for transformer tokenizers
+ BATCH_SIZE = 16  # Batch size for training and evaluation
+ LEARNING_RATE = 2e-5  # Learning rate for AdamW optimizer
+ NUM_EPOCHS = 3  # Number of training epochs. Adjust based on convergence.
+ DROPOUT_RATE = 0.3  # Dropout rate for regularization
+
+ # --- Device Configuration ---
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # --- Specific Model Configurations ---
+ BERT_MODEL_NAME = 'bert-base-uncased'
+ ROBERTA_MODEL_NAME = 'roberta-base'
+ DEBERTA_MODEL_NAME = 'microsoft/deberta-base'
+
+ # TF-IDF
+ TFIDF_MAX_FEATURES = 5000  # Max features for TF-IDF vectorizer
+
+ # --- Field-Specific Strategy (Conceptual) ---
+ # This dictionary provides conceptual strategies for enhancing specific fields.
+ # Actual implementation requires adapting the models (e.g., custom loss functions, metadata integration).
+ FIELD_STRATEGIES = {
+     "Maker_Action": {
+         "loss": "focal_loss",  # Requires custom Focal Loss implementation
+         "enhancements": ["action_templates", "context_prompt_tuning"]  # Advanced NLP concepts
+     },
+     "Risk_Category": {
+         "enhancements": ["numerical_metadata", "transaction_patterns"]  # Integrate METADATA_COLUMNS
+     },
+     "Escalation_Level": {
+         "enhancements": ["class_balancing", "policy_keyword_patterns"]  # Handled by class weights/metadata
+     },
+     "Investigation_Outcome": {
+         "type": "classification_or_generation"  # If generation, T5/BART would be needed.
+     }
+ }
+
+ # Ensure model save and predictions directories exist
+ os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
+ os.makedirs(PREDICTIONS_SAVE_DIR, exist_ok=True)
+ os.makedirs(TOKENIZER_PATH, exist_ok=True)
dataset_utils.py ADDED
@@ -0,0 +1,165 @@
+ # dataset_utils.py
+
+ import pandas as pd
+ import torch
+ from torch.utils.data import Dataset, DataLoader
+ from sklearn.preprocessing import LabelEncoder
+ from transformers import BertTokenizer, RobertaTokenizer, DebertaTokenizer
+ import pickle
+ import os
+
+ from config import TEXT_COLUMN, LABEL_COLUMNS, MAX_LEN, TOKENIZER_PATH, LABEL_ENCODERS_PATH, METADATA_COLUMNS
+
+ class ComplianceDataset(Dataset):
+     """
+     Custom Dataset class for handling text and multi-output labels for PyTorch models.
+     """
+     def __init__(self, texts, labels, tokenizer, max_len):
+         self.texts = texts
+         self.labels = labels
+         self.tokenizer = tokenizer
+         self.max_len = max_len
+
+     def __len__(self):
+         """Returns the total number of samples in the dataset."""
+         return len(self.texts)
+
+     def __getitem__(self, idx):
+         """
+         Retrieves a sample from the dataset at the given index.
+         Tokenizes the text and converts labels to a PyTorch tensor.
+         """
+         text = str(self.texts[idx])
+         # Tokenize the text, padding to max_length and truncating if longer.
+         # return_tensors="pt" ensures PyTorch tensors are returned.
+         inputs = self.tokenizer(
+             text,
+             padding='max_length',
+             truncation=True,
+             max_length=self.max_len,
+             return_tensors="pt"
+         )
+         # Squeeze removes the batch dimension (which is 1 here because we process one sample at a time)
+         inputs = {key: val.squeeze(0) for key, val in inputs.items()}
+         # Convert labels to a PyTorch long tensor
+         labels = torch.tensor(self.labels[idx], dtype=torch.long)
+         return inputs, labels
+
+ class ComplianceDatasetWithMetadata(Dataset):
+     """
+     Custom Dataset class for handling text, additional numerical metadata, and multi-output labels.
+     Used for hybrid models combining text and tabular features.
+     """
+     def __init__(self, texts, metadata, labels, tokenizer, max_len):
+         self.texts = texts
+         self.metadata = metadata  # Expects metadata as a NumPy array or list of lists
+         self.labels = labels
+         self.tokenizer = tokenizer
+         self.max_len = max_len
+
+     def __len__(self):
+         """Returns the total number of samples in the dataset."""
+         return len(self.texts)
+
+     def __getitem__(self, idx):
+         """
+         Retrieves a sample, its metadata, and labels from the dataset at the given index.
+         Tokenizes text, converts metadata and labels to PyTorch tensors.
+         """
+         text = str(self.texts[idx])
+         inputs = self.tokenizer(
+             text,
+             padding='max_length',
+             truncation=True,
+             max_length=self.max_len,
+             return_tensors="pt"
+         )
+         inputs = {key: val.squeeze(0) for key, val in inputs.items()}
+         # Convert metadata for the current sample to a float tensor
+         metadata = torch.tensor(self.metadata[idx], dtype=torch.float)
+         labels = torch.tensor(self.labels[idx], dtype=torch.long)
+         return inputs, metadata, labels
+
+ def load_and_preprocess_data(data_path):
+     """
+     Loads data from a CSV, fills missing values, and encodes categorical labels.
+     Also handles converting specified METADATA_COLUMNS to numeric.
+
+     Args:
+         data_path (str): Path to the CSV data file.
+
+     Returns:
+         tuple: A tuple containing:
+             - data (pd.DataFrame): The preprocessed DataFrame.
+             - label_encoders (dict): A dictionary of LabelEncoder objects for each label column.
+     """
+     data = pd.read_csv(data_path)
+     data.fillna("Unknown", inplace=True)  # Fill any missing text values with "Unknown"
+
+     # Convert metadata columns to numeric, coercing errors and filling NaNs with 0.
+     # This ensures metadata is suitable for neural networks.
+     for col in METADATA_COLUMNS:
+         if col in data.columns:
+             data[col] = pd.to_numeric(data[col], errors='coerce').fillna(0)  # Fill NaN with 0 or a suitable value
+
+     label_encoders = {col: LabelEncoder() for col in LABEL_COLUMNS}
+     for col in LABEL_COLUMNS:
+         # Fit and transform each label column using its respective LabelEncoder
+         data[col] = label_encoders[col].fit_transform(data[col])
+     return data, label_encoders
+
+ def get_tokenizer(model_name):
+     """
+     Returns the appropriate Hugging Face tokenizer based on the model name.
+
+     Args:
+         model_name (str): The name of the pre-trained model (e.g., 'bert-base-uncased').
+
+     Returns:
+         transformers.PreTrainedTokenizer: The initialized tokenizer.
+     """
+     # Check RoBERTa/DeBERTa first, since "bert" is a substring of both model names.
+     if "roberta" in model_name.lower():
+         return RobertaTokenizer.from_pretrained(model_name)
+     elif "deberta" in model_name.lower():
+         return DebertaTokenizer.from_pretrained(model_name)
+     elif "bert" in model_name.lower():
+         return BertTokenizer.from_pretrained(model_name)
+     else:
+         raise ValueError(f"Unsupported tokenizer for model: {model_name}")
+
+ def save_label_encoders(label_encoders):
+     """
+     Saves a dictionary of label encoders to a pickle file.
+     This is crucial for decoding predictions back to original labels.
+
+     Args:
+         label_encoders (dict): Dictionary of LabelEncoder objects.
+     """
+     with open(LABEL_ENCODERS_PATH, "wb") as f:
+         pickle.dump(label_encoders, f)
+     print(f"Label encoders saved to {LABEL_ENCODERS_PATH}")
+
+ def load_label_encoders():
+     """
+     Loads a dictionary of label encoders from a pickle file.
+
+     Returns:
+         dict: Loaded dictionary of LabelEncoder objects.
+     """
+     with open(LABEL_ENCODERS_PATH, "rb") as f:
+         encoders = pickle.load(f)
+     print(f"Label encoders loaded from {LABEL_ENCODERS_PATH}")
+     return encoders
+
+
+ def get_num_labels(label_encoders):
+     """
+     Returns a list containing the number of unique classes for each label column.
+     This list is used to define the output dimensions of the model's classification heads.
+
+     Args:
+         label_encoders (dict): Dictionary of LabelEncoder objects.
+
+     Returns:
+         list: A list of integers, where each integer is the number of classes for a label.
+     """
+     return [len(label_encoders[col].classes_) for col in LABEL_COLUMNS]
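A rough usage sketch tying these utilities together for training-time data loading; it assumes DATA_PATH points at a CSV containing the TEXT_COLUMN and all LABEL_COLUMNS defined in config.py:

    from torch.utils.data import DataLoader

    from config import DATA_PATH, TEXT_COLUMN, LABEL_COLUMNS, MAX_LEN, BATCH_SIZE, BERT_MODEL_NAME
    from dataset_utils import ComplianceDataset, get_tokenizer, load_and_preprocess_data, save_label_encoders

    # Load the CSV, encode the label columns, and persist the encoders for later decoding.
    data, label_encoders = load_and_preprocess_data(DATA_PATH)
    save_label_encoders(label_encoders)

    tokenizer = get_tokenizer(BERT_MODEL_NAME)
    texts = data[TEXT_COLUMN].tolist()
    labels = data[LABEL_COLUMNS].values  # shape: (num_samples, num_label_columns)

    dataset = ComplianceDataset(texts, labels, tokenizer, MAX_LEN)
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

    inputs, batch_labels = next(iter(loader))
    print(inputs["input_ids"].shape, batch_labels.shape)  # (BATCH_SIZE, MAX_LEN) and (BATCH_SIZE, len(LABEL_COLUMNS))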
docker-compose.yml ADDED
@@ -0,0 +1,15 @@
+ version: '3.8'
+
+ services:
+   bert-api:
+     build: .
+     ports:
+       - "8000:7860"  # container listens on 7860 (see Dockerfile CMD); expose it on host port 8000
+     volumes:
+       - ../saved_models:/app/saved_models
+       - ../tokenizer:/app/tokenizer
+       - ../predictions:/app/predictions
+       - ../label_encoders.pkl:/app/label_encoders.pkl
+     environment:
+       - PYTHONUNBUFFERED=1
+     restart: unless-stopped
label_encoders.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c336fd07858af76d40c7200de1a769099abeec25d4f48b999351318680d4e4d6
+ size 2047
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ fastapi==0.104.1
+ uvicorn==0.24.0
+ pydantic==2.4.2
+ torch==2.1.1
+ transformers==4.35.2
+ numpy==1.24.3
+ scikit-learn==1.3.2
+ pandas==2.1.3