Spaces:
Runtime error
Runtime error
Upload 8 files
Browse files
- Dockerfile +13 -0
- ROBERTA_model.pth +3 -0
- config.py +21 -0
- dataset_utils.py +5 -0
- docker-compose.yml +14 -0
- main.py +40 -0
- requirements.txt +9 -0
- roberta_model.py +19 -0
Dockerfile
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Slim Python base keeps the image small; 3.10 matches the app's target runtime.
FROM python:3.10-slim

WORKDIR /app

# Copy only the dependency manifest first and install it, so this slow layer
# is cached by Docker and rebuilt only when requirements.txt itself changes —
# not on every source or model-artifact edit (the original did this last,
# busting the pip cache layer on every code change).
COPY app/requirements.txt /app/
RUN pip install --no-cache-dir -r /app/requirements.txt

# Application code, tokenizer files, trained weights, and fitted label encoders.
COPY ./app /app/app
COPY ./tokenizer /app/tokenizer
COPY ./saved_models /app/saved_models
COPY label_encoders.pkl /app/

# Bind on all interfaces; 7860 is the conventional Hugging Face Spaces port.
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
|
ROBERTA_model.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:61d5f8d8c55420afc000f5033303795d1cf2544f451dac80b878e2054899b4bb
|
| 3 |
+
size 18
|
config.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Shared configuration for the sanctions-screening RoBERTa service."""

import torch
import os

# Filesystem locations; relative paths resolve against the container's /app workdir.
DATA_PATH = '/app/synthetic_transactions_samples_5000.csv'
TOKENIZER_PATH = './tokenizer/'
LABEL_ENCODERS_PATH = './label_encoders.pkl'
MODEL_SAVE_DIR = './saved_models/'
PREDICTIONS_SAVE_DIR = './predictions/'

# Name of the free-text input column fed to the tokenizer.
TEXT_COLUMN = "Sanction_Context"
# The six target columns; order must match the model's classifier heads
# and the fitted label encoders.
LABEL_COLUMNS = [
    "Red_Flag_Reason",
    "Maker_Action",
    "Escalation_Level",
    "Risk_Category",
    "Risk_Drivers",
    "Investigation_Outcome"
]
# Tokenizer padding/truncation length.
MAX_LEN = 128
# Prefer GPU when available, otherwise fall back to CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hugging Face checkpoint the encoder is initialised from.
ROBERTA_MODEL_NAME = 'roberta-base'
|
dataset_utils.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pickle

def load_label_encoders(path='./label_encoders.pkl'):
    """Deserialize and return the fitted label encoders from *path*.

    NOTE(review): pickle.load executes arbitrary code from the file —
    only load encoder files produced by this project's own training run.
    """
    with open(path, 'rb') as encoder_file:
        return pickle.load(encoder_file)
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Compose file for local development of the RoBERTa inference API.
version: '3.9'

services:
  roberta-api:
    # Build the image from the Dockerfile in this directory.
    build:
      context: .
      dockerfile: Dockerfile
    # Host port matches the port uvicorn binds inside the container.
    ports:
      - "7860:7860"
    # Bind-mount code and artifacts over the baked-in copies so local
    # edits are visible without rebuilding the image.
    volumes:
      - ./app:/app/app
      - ./saved_models:/app/saved_models
      - ./tokenizer:/app/tokenizer
      - ./label_encoders.pkl:/app/label_encoders.pkl
|
main.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FastAPI inference service for the multi-output RoBERTa sanctions classifier."""

import os

from fastapi import FastAPI
from pydantic import BaseModel
import torch
from transformers import RobertaTokenizer
from app.roberta_model import RobertaMultiOutputModel
from app.dataset_utils import load_label_encoders
from app.config import MAX_LEN, LABEL_COLUMNS, MODEL_SAVE_DIR, LABEL_ENCODERS_PATH, TOKENIZER_PATH

app = FastAPI()

class InputText(BaseModel):
    # Free-text sanction context to classify.
    sanction_context: str

# Load artifacts once at import time (server startup) rather than per request.
label_encoders = load_label_encoders(LABEL_ENCODERS_PATH)
num_classes_per_label = [len(label_encoders[col].classes_) for col in LABEL_COLUMNS]

model = RobertaMultiOutputModel(num_classes_per_label)
# weights_only=True restricts torch.load to tensors/containers, preventing
# arbitrary-code execution from a tampered checkpoint (available since the
# torch>=2.0 pinned in requirements.txt). os.path.join avoids the double
# slash the original f-string produced (MODEL_SAVE_DIR already ends in '/').
model.load_state_dict(
    torch.load(
        os.path.join(MODEL_SAVE_DIR, "ROBERTA_model.pth"),
        map_location="cpu",
        weights_only=True,
    )
)
model.eval()

tokenizer = RobertaTokenizer.from_pretrained(TOKENIZER_PATH)

@app.post("/predict")
def predict(input_text: InputText):
    """Classify one sanction context; returns one decoded label per category."""
    inputs = tokenizer(
        input_text.sanction_context,
        padding='max_length',
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="pt"
    )
    with torch.no_grad():
        outputs = model(**inputs)
        # One logits tensor of shape (1, n_classes) per label column.
        predicted = [torch.argmax(logit, dim=1).item() for logit in outputs]

    # Map each class index back to its human-readable label string.
    decoded = {
        label: label_encoders[label].inverse_transform([pred])[0]
        for label, pred in zip(LABEL_COLUMNS, predicted)
    }
    return {"predictions": decoded}
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Web service
fastapi
uvicorn

# Model inference
transformers>=4.41.2
torch>=2.0.0

# Data handling / label encoding
scikit-learn
pandas
numpy

# Utilities
tqdm
regex

# NOTE(review): only transformers and torch carry version bounds; consider
# pinning the remaining packages for reproducible image builds.
|
roberta_model.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
import torch.nn as nn
from transformers import RobertaModel
from app.config import ROBERTA_MODEL_NAME

class RobertaMultiOutputModel(nn.Module):
    """RoBERTa encoder with one independent linear head per label column.

    The attribute names (``roberta``, ``dropout``, ``classifiers``) are the
    keys under which the trained state_dict was saved and must not change.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(ROBERTA_MODEL_NAME)
        self.dropout = nn.Dropout(0.3)
        hidden_size = self.roberta.config.hidden_size
        heads = [nn.Linear(hidden_size, n_classes) for n_classes in num_labels]
        self.classifiers = nn.ModuleList(heads)

    def forward(self, input_ids, attention_mask):
        """Return a list of logits tensors, one per classification head."""
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        features = self.dropout(encoded.pooler_output)
        return [head(features) for head in self.classifiers]
|