ganeshkonapalli committed on
Commit c3598f5 · verified · 1 Parent(s): de734ab

Upload 9 files

Files changed (9)
  1. BERT_model.pth +3 -0
  2. Dockerfile +38 -0
  3. app.py +77 -0
  4. bert_model.py +59 -0
  5. config.py +69 -0
  6. dataset_utils.py +165 -0
  7. docker-compose.yml +15 -0
  8. label_encoders.pkl +3 -0
  9. requirements.txt +8 -0
BERT_model.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a7050d02ac599ef72d7b0410a79a72537fb44d4ac66eb8a1dc719329c8c4b07b
+ size 438239057
Dockerfile ADDED
@@ -0,0 +1,38 @@
+ # Use an official Python runtime as a parent image
+ FROM python:3.9-slim
+
+ # Set working directory
+ WORKDIR /app
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Create a non-root user
+ RUN useradd -m -u 1000 appuser
+
+ # Copy requirements first to leverage Docker cache
+ COPY requirements.txt .
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Create necessary directories and set permissions
+ RUN mkdir -p /app/saved_models /app/tokenizer /app/predictions /app/.cache \
+     && chown -R appuser:appuser /app
+
+ # Switch to non-root user
+ USER appuser
+
+ # Copy the application code
+ COPY --chown=appuser:appuser . .
+
+ # Download the BERT tokenizer
+ RUN python -c "from transformers import BertTokenizer; BertTokenizer.from_pretrained('bert-base-uncased', cache_dir='/app/.cache')"
+
+ # Expose the port the app runs on (7860 for Hugging Face Spaces)
+ EXPOSE 7860
+
+ # Command to run the application
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,77 @@
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+ import torch
+ from transformers import BertTokenizer
+ from bert_model import BertMultiOutputModel  # bert_model.py sits at the repo root in this upload
+ from config import TEXT_COLUMN, LABEL_COLUMNS, MAX_LEN, DEVICE
+ from dataset_utils import load_label_encoders
+ import numpy as np
+ import os
+
+ app = FastAPI()
+
+ # Load the model and tokenizer
+ model_path = "BERT_model.pth"
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ model = BertMultiOutputModel([len(load_label_encoders()[col].classes_) for col in LABEL_COLUMNS]).to(DEVICE)
+ model.load_state_dict(torch.load(model_path, map_location=DEVICE))
+ model.eval()
+
+ class PredictionRequest(BaseModel):
+     sanction_context: str
+
+ @app.get("/")
+ async def root():
+     return {"status": "healthy", "message": "BERT API is running"}
+
+ @app.get("/health")
+ async def health_check():
+     return {"status": "healthy"}
+
+ @app.post("/predict")
+ async def predict(request: PredictionRequest):
+     try:
+         # Tokenize the input text
+         inputs = tokenizer(
+             request.sanction_context,
+             padding='max_length',
+             truncation=True,
+             max_length=MAX_LEN,
+             return_tensors="pt"
+         )
+
+         # Move inputs to device
+         input_ids = inputs['input_ids'].to(DEVICE)
+         attention_mask = inputs['attention_mask'].to(DEVICE)
+
+         # Get predictions
+         with torch.no_grad():
+             outputs = model(input_ids=input_ids, attention_mask=attention_mask)
+             probabilities = [torch.softmax(output, dim=1).cpu().numpy() for output in outputs]
+             predictions = [np.argmax(prob, axis=1) for prob in probabilities]
+
+         # Load label encoders to decode predictions
+         label_encoders = load_label_encoders()
+
+         # Format response
+         response = {}
+         for col, pred, prob in zip(LABEL_COLUMNS, predictions, probabilities):
+             decoded_pred = label_encoders[col].inverse_transform(pred)[0]
+             response[col] = {
+                 "prediction": decoded_pred,
+                 "probabilities": {
+                     label: float(prob[0][j])
+                     for j, label in enumerate(label_encoders[col].classes_)
+                 }
+             }
+
+         return response
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+ if __name__ == "__main__":
+     import uvicorn
+     # For Hugging Face Spaces, we need to use port 7860
+     port = int(os.environ.get("PORT", 7860))
+     uvicorn.run(app, host="0.0.0.0", port=port)
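For reference, a minimal client-side sketch of calling the /predict endpoint once the container is up. The local URL, port 7860, the `requests` dependency, and the example sanction text are illustrative assumptions, not part of the upload:

    import requests  # assumed client-side dependency, not listed in requirements.txt

    # Hypothetical example text; any free-form sanction context string fits the request schema.
    payload = {"sanction_context": "Wire transfer flagged during sanctions screening review."}

    # Assumes the API is reachable locally on port 7860 (the port exposed in the Dockerfile).
    resp = requests.post("http://localhost:7860/predict", json=payload, timeout=60)
    resp.raise_for_status()

    # The response maps each label column to its decoded prediction and per-class probabilities.
    for label, result in resp.json().items():
        print(label, "->", result["prediction"])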
bert_model.py ADDED
@@ -0,0 +1,59 @@
+ # models/bert_model.py
+
+ import torch
+ import torch.nn as nn
+ from transformers import BertModel
+ from config import DROPOUT_RATE, BERT_MODEL_NAME  # Import BERT_MODEL_NAME from config
+
+ class BertMultiOutputModel(nn.Module):
+     """
+     BERT-based model for multi-output classification.
+     It uses a pre-trained BERT model as its backbone and adds a dropout layer
+     followed by separate linear classification heads for each target label.
+     """
+     # Statically set tokenizer name for easy access in main.py
+     tokenizer_name = BERT_MODEL_NAME
+
+     def __init__(self, num_labels):
+         """
+         Initializes the BertMultiOutputModel.
+
+         Args:
+             num_labels (list): A list where each element is the number of classes
+                                for a corresponding label column.
+         """
+         super(BertMultiOutputModel, self).__init__()
+         # Load the pre-trained BERT model.
+         # BertModel provides contextual embeddings and a pooled output for classification.
+         self.bert = BertModel.from_pretrained(BERT_MODEL_NAME)
+         self.dropout = nn.Dropout(DROPOUT_RATE)  # Dropout layer for regularization
+
+         # Create a list of classification heads, one for each label column.
+         # Each head is a linear layer mapping BERT's pooled output size to the number of classes for that label.
+         self.classifiers = nn.ModuleList([
+             nn.Linear(self.bert.config.hidden_size, n_classes) for n_classes in num_labels
+         ])
+
+     def forward(self, input_ids, attention_mask):
+         """
+         Performs the forward pass of the model.
+
+         Args:
+             input_ids (torch.Tensor): Tensor of token IDs (from tokenizer).
+             attention_mask (torch.Tensor): Tensor indicating attention (from tokenizer).
+
+         Returns:
+             list: A list of logit tensors, one for each classification head.
+                   Each tensor has shape (batch_size, num_classes_for_that_label).
+         """
+         # Pass input_ids and attention_mask through BERT.
+         # .pooler_output typically represents the hidden state of the [CLS] token,
+         # processed through a linear layer and tanh activation, often used for classification.
+         pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output
+
+         # Apply dropout for regularization
+         pooled_output = self.dropout(pooled_output)
+
+         # Pass the pooled output through each classification head.
+         # The result is a list of logits (raw scores before softmax/sigmoid) for each label.
+         return [classifier(pooled_output) for classifier in self.classifiers]
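A short sketch of how the multi-head output behaves, assuming an illustrative label layout of three columns with 4, 5, and 3 classes (in app.py the real class counts come from the fitted label encoders):

    import torch
    from transformers import BertTokenizer
    from bert_model import BertMultiOutputModel  # root-level module in this upload

    # Illustrative class counts only; production code derives them from label_encoders.pkl.
    model = BertMultiOutputModel(num_labels=[4, 5, 3])
    model.eval()

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    enc = tokenizer("Example sanction context", padding="max_length",
                    truncation=True, max_length=128, return_tensors="pt")

    with torch.no_grad():
        logits = model(input_ids=enc["input_ids"], attention_mask=enc["attention_mask"])

    # One logits tensor per label column: shapes (1, 4), (1, 5), (1, 3)
    print([tuple(t.shape) for t in logits])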
config.py ADDED
@@ -0,0 +1,69 @@
+ # config.py
+
+ import torch
+ import os
+
+ # --- Paths ---
+ # Adjust DATA_PATH to your actual data location
+ DATA_PATH = './data/synthetic_transactions_samples_5000.csv'
+ TOKENIZER_PATH = './tokenizer/'
+ LABEL_ENCODERS_PATH = './label_encoders.pkl'
+ MODEL_SAVE_DIR = './saved_models/'
+ PREDICTIONS_SAVE_DIR = './predictions/'  # To save predictions for voting ensemble
+
+ # --- Data Columns ---
+ TEXT_COLUMN = "Sanction_Context"
+ # Define all your target label columns
+ LABEL_COLUMNS = [
+     "Red_Flag_Reason",
+     "Maker_Action",
+     "Escalation_Level",
+     "Risk_Category",
+     "Risk_Drivers",
+     "Investigation_Outcome"
+ ]
+ # Example metadata columns. Add actual numerical/categorical metadata if available in your CSV.
+ # For now, it's an empty list. If you add metadata, ensure these columns exist and are numeric or can be encoded.
+ METADATA_COLUMNS = []  # e.g., ["Risk_Score", "Transaction_Amount"]
+
+ # --- Model Hyperparameters ---
+ MAX_LEN = 128  # Maximum sequence length for transformer tokenizers
+ BATCH_SIZE = 16  # Batch size for training and evaluation
+ LEARNING_RATE = 2e-5  # Learning rate for AdamW optimizer
+ NUM_EPOCHS = 3  # Number of training epochs. Adjust based on convergence.
+ DROPOUT_RATE = 0.3  # Dropout rate for regularization
+
+ # --- Device Configuration ---
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # --- Specific Model Configurations ---
+ BERT_MODEL_NAME = 'bert-base-uncased'
+ ROBERTA_MODEL_NAME = 'roberta-base'
+ DEBERTA_MODEL_NAME = 'microsoft/deberta-base'
+
+ # TF-IDF
+ TFIDF_MAX_FEATURES = 5000  # Max features for TF-IDF vectorizer
+
+ # --- Field-Specific Strategy (Conceptual) ---
+ # This dictionary provides conceptual strategies for enhancing specific fields.
+ # Actual implementation requires adapting the models (e.g., custom loss functions, metadata integration).
+ FIELD_STRATEGIES = {
+     "Maker_Action": {
+         "loss": "focal_loss",  # Requires custom Focal Loss implementation
+         "enhancements": ["action_templates", "context_prompt_tuning"]  # Advanced NLP concepts
+     },
+     "Risk_Category": {
+         "enhancements": ["numerical_metadata", "transaction_patterns"]  # Integrate METADATA_COLUMNS
+     },
+     "Escalation_Level": {
+         "enhancements": ["class_balancing", "policy_keyword_patterns"]  # Handled by class weights/metadata
+     },
+     "Investigation_Outcome": {
+         "type": "classification_or_generation"  # If generation, T5/BART would be needed.
+     }
+ }
+
+ # Ensure model save and predictions directories exist
+ os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
+ os.makedirs(PREDICTIONS_SAVE_DIR, exist_ok=True)
+ os.makedirs(TOKENIZER_PATH, exist_ok=True)
dataset_utils.py ADDED
@@ -0,0 +1,165 @@
+ # dataset_utils.py
+
+ import pandas as pd
+ import torch
+ from torch.utils.data import Dataset, DataLoader
+ from sklearn.preprocessing import LabelEncoder
+ from transformers import BertTokenizer, RobertaTokenizer, DebertaTokenizer
+ import pickle
+ import os
+
+ from config import TEXT_COLUMN, LABEL_COLUMNS, MAX_LEN, TOKENIZER_PATH, LABEL_ENCODERS_PATH, METADATA_COLUMNS
+
+ class ComplianceDataset(Dataset):
+     """
+     Custom Dataset class for handling text and multi-output labels for PyTorch models.
+     """
+     def __init__(self, texts, labels, tokenizer, max_len):
+         self.texts = texts
+         self.labels = labels
+         self.tokenizer = tokenizer
+         self.max_len = max_len
+
+     def __len__(self):
+         """Returns the total number of samples in the dataset."""
+         return len(self.texts)
+
+     def __getitem__(self, idx):
+         """
+         Retrieves a sample from the dataset at the given index.
+         Tokenizes the text and converts labels to a PyTorch tensor.
+         """
+         text = str(self.texts[idx])
+         # Tokenize the text, padding to max_length and truncating if longer.
+         # return_tensors="pt" ensures PyTorch tensors are returned.
+         inputs = self.tokenizer(
+             text,
+             padding='max_length',
+             truncation=True,
+             max_length=self.max_len,
+             return_tensors="pt"
+         )
+         # Squeeze removes the batch dimension (which is 1 here because we process one sample at a time)
+         inputs = {key: val.squeeze(0) for key, val in inputs.items()}
+         # Convert labels to a PyTorch long tensor
+         labels = torch.tensor(self.labels[idx], dtype=torch.long)
+         return inputs, labels
+
+ class ComplianceDatasetWithMetadata(Dataset):
+     """
+     Custom Dataset class for handling text, additional numerical metadata, and multi-output labels.
+     Used for hybrid models combining text and tabular features.
+     """
+     def __init__(self, texts, metadata, labels, tokenizer, max_len):
+         self.texts = texts
+         self.metadata = metadata  # Expects metadata as a NumPy array or list of lists
+         self.labels = labels
+         self.tokenizer = tokenizer
+         self.max_len = max_len
+
+     def __len__(self):
+         """Returns the total number of samples in the dataset."""
+         return len(self.texts)
+
+     def __getitem__(self, idx):
+         """
+         Retrieves a sample, its metadata, and labels from the dataset at the given index.
+         Tokenizes text, converts metadata and labels to PyTorch tensors.
+         """
+         text = str(self.texts[idx])
+         inputs = self.tokenizer(
+             text,
+             padding='max_length',
+             truncation=True,
+             max_length=self.max_len,
+             return_tensors="pt"
+         )
+         inputs = {key: val.squeeze(0) for key, val in inputs.items()}
+         # Convert metadata for the current sample to a float tensor
+         metadata = torch.tensor(self.metadata[idx], dtype=torch.float)
+         labels = torch.tensor(self.labels[idx], dtype=torch.long)
+         return inputs, metadata, labels
+
+ def load_and_preprocess_data(data_path):
+     """
+     Loads data from a CSV, fills missing values, and encodes categorical labels.
+     Also handles converting specified METADATA_COLUMNS to numeric.
+
+     Args:
+         data_path (str): Path to the CSV data file.
+
+     Returns:
+         tuple: A tuple containing:
+             - data (pd.DataFrame): The preprocessed DataFrame.
+             - label_encoders (dict): A dictionary of LabelEncoder objects for each label column.
+     """
+     data = pd.read_csv(data_path)
+     data.fillna("Unknown", inplace=True)  # Fill any missing text values with "Unknown"
+
+     # Convert metadata columns to numeric, coercing errors and filling NaNs with 0.
+     # This ensures metadata is suitable for neural networks.
+     for col in METADATA_COLUMNS:
+         if col in data.columns:
+             data[col] = pd.to_numeric(data[col], errors='coerce').fillna(0)  # Fill NaN with 0 or a suitable value
+
+     label_encoders = {col: LabelEncoder() for col in LABEL_COLUMNS}
+     for col in LABEL_COLUMNS:
+         # Fit and transform each label column using its respective LabelEncoder
+         data[col] = label_encoders[col].fit_transform(data[col])
+     return data, label_encoders
+
+ def get_tokenizer(model_name):
+     """
+     Returns the appropriate Hugging Face tokenizer based on the model name.
+
+     Args:
+         model_name (str): The name of the pre-trained model (e.g., 'bert-base-uncased').
+
+     Returns:
+         transformers.PreTrainedTokenizer: The initialized tokenizer.
+     """
+     # Check RoBERTa/DeBERTa first, since "bert" is a substring of both model names.
+     if "roberta" in model_name.lower():
+         return RobertaTokenizer.from_pretrained(model_name)
+     elif "deberta" in model_name.lower():
+         return DebertaTokenizer.from_pretrained(model_name)
+     elif "bert" in model_name.lower():
+         return BertTokenizer.from_pretrained(model_name)
+     else:
+         raise ValueError(f"Unsupported tokenizer for model: {model_name}")
+
+ def save_label_encoders(label_encoders):
+     """
+     Saves a dictionary of label encoders to a pickle file.
+     This is crucial for decoding predictions back to original labels.
+
+     Args:
+         label_encoders (dict): Dictionary of LabelEncoder objects.
+     """
+     with open(LABEL_ENCODERS_PATH, "wb") as f:
+         pickle.dump(label_encoders, f)
+     print(f"Label encoders saved to {LABEL_ENCODERS_PATH}")
+
+ def load_label_encoders():
+     """
+     Loads a dictionary of label encoders from a pickle file.
+
+     Returns:
+         dict: Loaded dictionary of LabelEncoder objects.
+     """
+     with open(LABEL_ENCODERS_PATH, "rb") as f:
+         encoders = pickle.load(f)
+     print(f"Label encoders loaded from {LABEL_ENCODERS_PATH}")
+     return encoders
+
+
+ def get_num_labels(label_encoders):
+     """
+     Returns a list containing the number of unique classes for each label column.
+     This list is used to define the output dimensions of the model's classification heads.
+
+     Args:
+         label_encoders (dict): Dictionary of LabelEncoder objects.
+
+     Returns:
+         list: A list of integers, where each integer is the number of classes for a label.
+     """
+     return [len(label_encoders[col].classes_) for col in LABEL_COLUMNS]
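A rough usage sketch tying these utilities together for training-time data loading; it assumes DATA_PATH points at a CSV containing the TEXT_COLUMN and all LABEL_COLUMNS defined in config.py:

    from torch.utils.data import DataLoader

    from config import DATA_PATH, TEXT_COLUMN, LABEL_COLUMNS, MAX_LEN, BATCH_SIZE, BERT_MODEL_NAME
    from dataset_utils import ComplianceDataset, get_tokenizer, load_and_preprocess_data, save_label_encoders

    # Load the CSV, encode the label columns, and persist the encoders for later decoding.
    data, label_encoders = load_and_preprocess_data(DATA_PATH)
    save_label_encoders(label_encoders)

    tokenizer = get_tokenizer(BERT_MODEL_NAME)
    texts = data[TEXT_COLUMN].tolist()
    labels = data[LABEL_COLUMNS].values  # shape: (num_samples, num_label_columns)

    dataset = ComplianceDataset(texts, labels, tokenizer, MAX_LEN)
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

    inputs, batch_labels = next(iter(loader))
    print(inputs["input_ids"].shape, batch_labels.shape)  # (BATCH_SIZE, MAX_LEN) and (BATCH_SIZE, len(LABEL_COLUMNS))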
docker-compose.yml ADDED
@@ -0,0 +1,15 @@
+ version: '3.8'
+
+ services:
+   bert-api:
+     build: .
+     ports:
+       - "8000:7860"  # container listens on 7860 (see Dockerfile CMD); expose it on host port 8000
+     volumes:
+       - ../saved_models:/app/saved_models
+       - ../tokenizer:/app/tokenizer
+       - ../predictions:/app/predictions
+       - ../label_encoders.pkl:/app/label_encoders.pkl
+     environment:
+       - PYTHONUNBUFFERED=1
+     restart: unless-stopped
label_encoders.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c336fd07858af76d40c7200de1a769099abeec25d4f48b999351318680d4e4d6
+ size 2047
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ fastapi==0.104.1
+ uvicorn==0.24.0
+ pydantic==2.4.2
+ torch==2.1.1
+ transformers==4.35.2
+ numpy==1.24.3
+ scikit-learn==1.3.2
+ pandas==2.1.3