ganeshkonapalli commited on
Commit
46f994e
·
verified ·
1 Parent(s): 2985c2f

Upload 8 files

Browse files
Files changed (8) hide show
  1. Dockerfile +13 -0
  2. ROBERTA_model.pth +3 -0
  3. config.py +21 -0
  4. dataset_utils.py +5 -0
  5. docker-compose.yml +14 -0
  6. main.py +40 -0
  7. requirements.txt +9 -0
  8. roberta_model.py +19 -0
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

WORKDIR /app

# Install Python dependencies FIRST so this expensive layer is cached and
# only rebuilt when requirements.txt changes — not on every source edit.
# (Original copied the code before installing, busting the cache each build.)
COPY app/requirements.txt /app/
RUN pip install --no-cache-dir -r /app/requirements.txt

# Application code and model artifacts (tokenizer files, fine-tuned weights,
# fitted label encoders) laid out where app/config.py expects them.
COPY ./app /app/app
COPY ./tokenizer /app/tokenizer
COPY ./saved_models /app/saved_models
COPY label_encoders.pkl /app/

# Serve the FastAPI app on the port published by docker-compose.yml.
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
ROBERTA_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61d5f8d8c55420afc000f5033303795d1cf2544f451dac80b878e2054899b4bb
3
+ size 18
config.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Shared configuration constants for the sanction-screening RoBERTa service."""
import torch
import os

# ---- Filesystem locations --------------------------------------------------
# The dataset is addressed by its absolute path inside the container image;
# everything else is resolved relative to the working directory.
DATA_PATH = '/app/synthetic_transactions_samples_5000.csv'
TOKENIZER_PATH = './tokenizer/'
LABEL_ENCODERS_PATH = './label_encoders.pkl'
MODEL_SAVE_DIR = './saved_models/'
PREDICTIONS_SAVE_DIR = './predictions/'

# ---- Task definition -------------------------------------------------------
# Input text column, plus the six categorical targets predicted per record.
TEXT_COLUMN = "Sanction_Context"
LABEL_COLUMNS = [
    "Red_Flag_Reason",
    "Maker_Action",
    "Escalation_Level",
    "Risk_Category",
    "Risk_Drivers",
    "Investigation_Outcome",
]

# ---- Model / tokenizer hyper-parameters ------------------------------------
MAX_LEN = 128  # tokenizer truncation/padding length
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ROBERTA_MODEL_NAME = 'roberta-base'
dataset_utils.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
import pickle


def load_label_encoders(path='./label_encoders.pkl'):
    """Deserialize and return the fitted label encoders stored at *path*.

    NOTE(review): ``pickle.load`` executes arbitrary code from the file —
    only load encoder files bundled with this project, never user uploads.
    """
    with open(path, 'rb') as handle:
        encoders = pickle.load(handle)
    return encoders
docker-compose.yml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Compose definition for the RoBERTa prediction API.
version: '3.9'

services:
  roberta-api:
    # Image is built from the Dockerfile in this directory.
    build:
      context: .
      dockerfile: Dockerfile
    # Publish the uvicorn port set in the Dockerfile CMD.
    ports:
      - "7860:7860"
    # Bind-mount code and model artifacts so local edits are visible in the
    # container without rebuilding the image.
    volumes:
      - ./app:/app/app
      - ./saved_models:/app/saved_models
      - ./tokenizer:/app/tokenizer
      - ./label_encoders.pkl:/app/label_encoders.pkl
main.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import FastAPI
from pydantic import BaseModel
import torch
from transformers import RobertaTokenizer
from app.roberta_model import RobertaMultiOutputModel
from app.dataset_utils import load_label_encoders
from app.config import MAX_LEN, LABEL_COLUMNS, MODEL_SAVE_DIR, LABEL_ENCODERS_PATH, TOKENIZER_PATH

app = FastAPI()

class InputText(BaseModel):
    # Request schema: the sanction narrative text to classify.
    sanction_context: str

# --- Module-level startup (runs once, at import time) -----------------------
# Fitted label encoders, one per label column; used both to size each
# classifier head and to decode predicted indices back to class names.
label_encoders = load_label_encoders(LABEL_ENCODERS_PATH)
num_classes_per_label = [len(label_encoders[col].classes_) for col in LABEL_COLUMNS]

# Rebuild the architecture, then load the fine-tuned weights onto CPU.
model = RobertaMultiOutputModel(num_classes_per_label)
model.load_state_dict(torch.load(f"{MODEL_SAVE_DIR}/ROBERTA_model.pth", map_location="cpu"))
model.eval()  # disable dropout for deterministic inference

# Tokenizer files are shipped with the image (see Dockerfile COPY ./tokenizer).
tokenizer = RobertaTokenizer.from_pretrained(TOKENIZER_PATH)

@app.post("/predict")
def predict(input_text: InputText):
    """Classify one sanction-context string.

    Returns ``{"predictions": {label_column: decoded_class_name}}`` with one
    entry for each column in LABEL_COLUMNS.
    """
    inputs = tokenizer(
        input_text.sanction_context,
        padding='max_length',
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="pt"
    )
    with torch.no_grad():
        # The model returns one logits tensor per label head, in
        # LABEL_COLUMNS order (see RobertaMultiOutputModel.forward).
        outputs = model(**inputs)
        predicted = [torch.argmax(logit, dim=1).item() for logit in outputs]

    # Map each predicted class index back to its original string label.
    decoded = {
        label: label_encoders[label].inverse_transform([pred])[0]
        for label, pred in zip(LABEL_COLUMNS, predicted)
    }
    return {"predictions": decoded}
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ transformers>=4.41.2
4
+ torch>=2.0.0
5
+ scikit-learn
6
+ pandas
7
+ numpy
8
+ tqdm
9
+ regex
roberta_model.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
import torch.nn as nn
from transformers import RobertaModel
from app.config import ROBERTA_MODEL_NAME


class RobertaMultiOutputModel(nn.Module):
    """RoBERTa encoder with one independent linear head per label column.

    ``num_labels`` is a sequence of class counts, one per target column;
    ``forward`` returns one logits tensor per head, in the same order.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(ROBERTA_MODEL_NAME)
        self.dropout = nn.Dropout(0.3)
        hidden_size = self.roberta.config.hidden_size
        heads = [nn.Linear(hidden_size, count) for count in num_labels]
        self.classifiers = nn.ModuleList(heads)

    def forward(self, input_ids, attention_mask):
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Shared pooled [CLS] representation feeds every head.
        pooled = self.dropout(encoded.pooler_output)
        logits = []
        for head in self.classifiers:
            logits.append(head(pooled))
        return logits