Spaces:
Sleeping
Sleeping
Upload 4 files
Browse files- app.py +83 -0
- config.py +61 -0
- dataset_utils.py +92 -0
- label_encoders.pkl +3 -0
app.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, HTTPException
|
| 2 |
+
from pydantic import BaseModel
|
| 3 |
+
import torch
|
| 4 |
+
from transformers import RobertaTokenizer
|
| 5 |
+
from models.roberta_model import RobertaMultiOutputModel
|
| 6 |
+
from config import TEXT_COLUMN, LABEL_COLUMNS, MAX_LEN, DEVICE
|
| 7 |
+
from dataset_utils import load_label_encoders
|
| 8 |
+
import numpy as np
|
| 9 |
+
import os
|
| 10 |
+
|
| 11 |
+
app = FastAPI()

# Load the model and tokenizer.
# NOTE(review): path is relative to the process working directory and
# hard-codes the filename; it should agree with config.MODEL_SAVE_DIR
# ('./saved_models/') -- confirm at deploy time.
model_path = "saved_models/ROBERTA_model.pth"  # Adjust if different
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Load the label encoders fitted at training time (pickled dict keyed by
# label-column name; see dataset_utils.load_label_encoders).
label_encoders = load_label_encoders()
# One output head per label column; head size = classes seen during training.
num_classes = [len(label_encoders[col].classes_) for col in LABEL_COLUMNS]

# Initialize the model and load the trained weights onto the configured device.
model = RobertaMultiOutputModel(num_classes).to(DEVICE)
model.load_state_dict(torch.load(model_path, map_location=DEVICE))
model.eval()  # inference mode: disables dropout etc.
| 25 |
+
|
| 26 |
+
# Request format
|
| 27 |
+
# Request format
class PredictionRequest(BaseModel):
    """Request body for /predict.

    `sanction_context` carries the free-text transaction narrative; it matches
    the training text column (config.TEXT_COLUMN == "Sanction_Context").
    """
    sanction_context: str
|
| 29 |
+
|
| 30 |
+
# Root health check
@app.get("/")
async def root():
    """Liveness probe at the root path; confirms the API process is up."""
    payload = {"status": "healthy", "message": "RoBERTa API is running"}
    return payload
|
| 34 |
+
|
| 35 |
+
@app.get("/health")
|
| 36 |
+
async def health_check():
|
| 37 |
+
return {"status": "healthy"}
|
| 38 |
+
|
| 39 |
+
# Prediction endpoint
@app.post("/predict")
async def predict(request: PredictionRequest):
    """Classify a sanction-context string across every configured label column.

    Returns a dict keyed by label column; each value holds the decoded top
    prediction plus the full per-class probability distribution.

    Raises:
        HTTPException(500): on any failure during tokenization or inference.
    """
    try:
        # Tokenize exactly as at training time: fixed-length padding/truncation.
        inputs = tokenizer(
            request.sanction_context,
            padding='max_length',
            truncation=True,
            max_length=MAX_LEN,
            return_tensors="pt"
        )

        # Move inputs to the model's device.
        input_ids = inputs['input_ids'].to(DEVICE)
        attention_mask = inputs['attention_mask'].to(DEVICE)

        # One forward pass; the model yields one logits tensor per label column.
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            probabilities = [torch.softmax(output, dim=1).cpu().numpy() for output in outputs]
            predictions = [np.argmax(prob, axis=1) for prob in probabilities]

        # Decode each head's argmax back to its original string label and
        # attach the per-class distribution. (Fixed: dropped an unused
        # enumerate() index around this zip.)
        response = {}
        for col, pred, prob in zip(LABEL_COLUMNS, predictions, probabilities):
            decoded_pred = label_encoders[col].inverse_transform(pred)[0]
            response[col] = {
                "prediction": decoded_pred,
                "probabilities": {
                    label: float(prob[0][j])
                    for j, label in enumerate(label_encoders[col].classes_)
                }
            }

        return response

    except Exception as e:
        # API boundary: surface any failure as a 500 rather than crash the worker.
        # NOTE(review): str(e) may leak internals; consider a generic message in prod.
        raise HTTPException(status_code=500, detail=str(e))
|
| 78 |
+
|
| 79 |
+
# For local or Spaces deployment
if __name__ == "__main__":
    import uvicorn

    # Spaces injects PORT; default to 7860 (the Spaces convention) locally.
    server_port = int(os.environ.get("PORT", 7860))
    uvicorn.run(app, host="0.0.0.0", port=server_port)
|
config.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Central configuration: file paths, label schema, hyperparameters, device."""
import torch
import os

# --- Paths ---
# NOTE(review): DATA_PATH is a Kaggle input mount and will not exist in the
# Spaces deployment -- presumably only consumed by offline training; confirm.
DATA_PATH = '/kaggle/input/synthesisss/synthetic_transactions_samples_5000.csv'
TOKENIZER_PATH = './tokenizer/'
LABEL_ENCODERS_PATH = './label_encoders.pkl'  # pickled dict of fitted LabelEncoders
MODEL_SAVE_DIR = './saved_models/'
PREDICTIONS_SAVE_DIR = './predictions/'

# --- Data Columns ---
TEXT_COLUMN = "Sanction_Context"  # free-text column fed to the tokenizer
# Target columns: one classification head is built per entry.
LABEL_COLUMNS = [
    "Red_Flag_Reason",
    "Maker_Action",
    "Escalation_Level",
    "Risk_Category",
    "Risk_Drivers",
    "Investigation_Outcome"
]
METADATA_COLUMNS = []  # optional numeric feature columns (empty = text-only)

# --- Model Hyperparameters ---
MAX_LEN = 128  # tokenizer max sequence length (pad/truncate target)
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
NUM_EPOCHS = 3
DROPOUT_RATE = 0.3

# --- Device Configuration ---
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Model Names ---
BERT_MODEL_NAME = 'bert-base-uncased'
ROBERTA_MODEL_NAME = 'roberta-base'
DEBERTA_MODEL_NAME = 'microsoft/deberta-base'

# --- TF-IDF ---
TFIDF_MAX_FEATURES = 5000

# --- Optional Strategy Definitions ---
# Free-form per-field hints; no consumer is visible in this file, so treat
# these as documentation of intent rather than enforced behavior.
FIELD_STRATEGIES = {
    "Maker_Action": {
        "loss": "focal_loss",
        "enhancements": ["action_templates", "context_prompt_tuning"]
    },
    "Risk_Category": {
        "enhancements": ["numerical_metadata", "transaction_patterns"]
    },
    "Escalation_Level": {
        "enhancements": ["class_balancing", "policy_keyword_patterns"]
    },
    "Investigation_Outcome": {
        "type": "classification_or_generation"
    }
}

# Ensure save directories exist (import-time side effect).
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
os.makedirs(PREDICTIONS_SAVE_DIR, exist_ok=True)
os.makedirs(TOKENIZER_PATH, exist_ok=True)
|
dataset_utils.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import torch
|
| 3 |
+
from torch.utils.data import Dataset, DataLoader
|
| 4 |
+
from sklearn.preprocessing import LabelEncoder
|
| 5 |
+
from transformers import BertTokenizer, RobertaTokenizer, DebertaTokenizer
|
| 6 |
+
import pickle
|
| 7 |
+
import os
|
| 8 |
+
|
| 9 |
+
from config import TEXT_COLUMN, LABEL_COLUMNS, MAX_LEN, TOKENIZER_PATH, LABEL_ENCODERS_PATH, METADATA_COLUMNS
|
| 10 |
+
|
| 11 |
+
class ComplianceDataset(Dataset):
    """Text-only dataset: tokenizes each sample on access and pairs it with its labels."""

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            str(self.texts[idx]),
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        # Drop the batch axis added by return_tensors="pt" so the DataLoader
        # can stack samples itself.
        features = {name: tensor.squeeze(0) for name, tensor in encoded.items()}
        targets = torch.tensor(self.labels[idx], dtype=torch.long)
        return features, targets
|
| 33 |
+
|
| 34 |
+
class ComplianceDatasetWithMetadata(Dataset):
    """Dataset variant yielding (tokenized text, numeric metadata, labels) per sample."""

    def __init__(self, texts, metadata, labels, tokenizer, max_len):
        self.texts = texts
        self.metadata = metadata
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            str(self.texts[idx]),
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        # Strip the batch axis so collation can stack samples.
        features = {name: tensor.squeeze(0) for name, tensor in encoded.items()}
        extra = torch.tensor(self.metadata[idx], dtype=torch.float)
        targets = torch.tensor(self.labels[idx], dtype=torch.long)
        return features, extra, targets
|
| 58 |
+
|
| 59 |
+
def load_and_preprocess_data(data_path):
    """Read the CSV at *data_path*, fill gaps, and label-encode every target column.

    Returns the transformed DataFrame and a dict of fitted LabelEncoders keyed
    by column name.
    """
    frame = pd.read_csv(data_path)
    frame.fillna("Unknown", inplace=True)

    # Coerce optional numeric metadata columns; unparsable values become 0.
    for meta_col in METADATA_COLUMNS:
        if meta_col in frame.columns:
            frame[meta_col] = pd.to_numeric(frame[meta_col], errors='coerce').fillna(0)

    encoders = {col: LabelEncoder() for col in LABEL_COLUMNS}
    for col in LABEL_COLUMNS:
        frame[col] = encoders[col].fit_transform(frame[col])
    return frame, encoders
|
| 71 |
+
|
| 72 |
+
def get_tokenizer(model_name):
    """Return a pretrained tokenizer matching *model_name*.

    Bug fix: the previous version tested ``"bert" in model_name`` first, but
    "roberta-base" and "microsoft/deberta-base" both contain "bert" as a
    substring, so every supported model was routed to BertTokenizer. Check
    the more specific names first.

    Raises:
        ValueError: if the model name matches none of the supported families.
    """
    lowered = model_name.lower()
    if "roberta" in lowered:
        return RobertaTokenizer.from_pretrained(model_name)
    elif "deberta" in lowered:
        return DebertaTokenizer.from_pretrained(model_name)
    elif "bert" in lowered:
        return BertTokenizer.from_pretrained(model_name)
    else:
        raise ValueError(f"Unsupported tokenizer for model: {model_name}")
|
| 81 |
+
|
| 82 |
+
def save_label_encoders(label_encoders):
    """Pickle the fitted encoder dict to LABEL_ENCODERS_PATH for later inference."""
    with open(LABEL_ENCODERS_PATH, "wb") as handle:
        pickle.dump(label_encoders, handle)
    print(f"Label encoders saved to {LABEL_ENCODERS_PATH}")
|
| 86 |
+
|
| 87 |
+
def load_label_encoders():
    """Restore the encoder dict previously written by save_label_encoders.

    NOTE: unpickling executes arbitrary code; only load this file from a
    trusted source (here it ships with the repo).
    """
    with open(LABEL_ENCODERS_PATH, "rb") as handle:
        return pickle.load(handle)
|
| 90 |
+
|
| 91 |
+
def get_num_labels(label_encoders):
    """Return the class count for each target column, in LABEL_COLUMNS order."""
    return [len(label_encoders[column].classes_) for column in LABEL_COLUMNS]
|
label_encoders.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:be834abbaaa80f915d0a0015f541a17ae6fda5c75d9485cb23c6a7b7bb7b7c97
|
| 3 |
+
size 2047
|