# Install required packages
!pip install -q transformers scikit-learn

# Imports
import pickle

import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import BertTokenizer, BertModel

# Constants
TEXT_COLUMN = 'Sanction_Context'
LABEL_COLUMNS = [
    'Red_Flag_Reason', 'Maker_Action', 'Escalation_Level',
    'Risk_Category', 'Risk_Drivers', 'Investigation_Outcome'
]

# Load and clean data
df = pd.read_csv('/kaggle/input/systhesis/synthetic_transactions_samples_5000.csv')
df = df.dropna(subset=[TEXT_COLUMN])
df[LABEL_COLUMNS] = df[LABEL_COLUMNS].fillna('Unknown')  # Fill missing labels

# Encode labels using MultiLabelBinarizer. Each row's six column values are
# treated as one label set, so identical strings appearing in different
# columns (e.g. 'Unknown') collapse into a single class.
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(df[LABEL_COLUMNS].astype(str).values.tolist())
X = df[TEXT_COLUMN].tolist()

# Save the label classes for decoding later
with open("mlb_classes.pkl", "wb") as f:
    pickle.dump(mlb.classes_, f)

# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Dataset class
class BertMultiLabelDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        # Drop the batch dimension added by return_tensors="pt"
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.FloatTensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.texts)

# Model definition
class BertForMultiLabel(nn.Module):
    def __init__(self, num_labels):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = self.dropout(outputs.pooler_output)
        logits = self.classifier(pooled_output)
        return logits

# Prepare data (fixed random_state so the split is reproducible)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
train_dataset = BertMultiLabelDataset(X_train, Y_train, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training on device: {device}")

# Model, optimizer, loss
model = BertForMultiLabel(num_labels=Y.shape[1]).to(device)
optimizer = Adam(model.parameters(), lr=2e-5)
loss_fn = nn.BCEWithLogitsLoss()  # sigmoid per label, so labels can co-occur

# Training loop
for epoch in range(3):
    model.train()
    total_loss = 0
    for i, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        if i % 10 == 0:
            print(f"Epoch {epoch+1}, Step {i}, Loss: {loss.item():.4f}")

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} finished. Avg Loss: {avg_loss:.4f}")
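
# --- Evaluation sketch (added; not in the original script) ---
# The split above produces X_test/Y_test but never scores them. A minimal
# multi-label evaluation, assuming the objects defined above are in scope;
# the 0.5 sigmoid threshold is an untuned assumption, not a tuned choice.
import numpy as np
from sklearn.metrics import f1_score

test_dataset = BertMultiLabelDataset(X_test, Y_test, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=16)

model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        logits = model(batch['input_ids'].to(device),
                       batch['attention_mask'].to(device))
        probs = torch.sigmoid(logits)  # per-label probabilities
        all_preds.append((probs > 0.5).float().cpu().numpy())
        all_labels.append(batch['labels'].numpy())

all_preds = np.vstack(all_preds)
all_labels = np.vstack(all_labels)
print(f"Micro-F1: {f1_score(all_labels, all_preds, average='micro', zero_division=0):.4f}")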
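
# --- Decoding sketch (added; not in the original script) ---
# Shows how the binarizer saved to mlb_classes.pkl maps a thresholded
# prediction back to label names; `sample_text` simply reuses a test
# example, and the 0.5 threshold matches the evaluation above.
sample_text = X_test[0]
encoding = tokenizer(sample_text, padding='max_length', truncation=True,
                     max_length=128, return_tensors="pt")
model.eval()
with torch.no_grad():
    logits = model(encoding['input_ids'].to(device),
                   encoding['attention_mask'].to(device))
pred_mask = (torch.sigmoid(logits) > 0.5).int().cpu().numpy()
print(f"Predicted labels: {mlb.inverse_transform(pred_mask)[0]}")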
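
# --- Checkpoint sketch (added; not in the original script) ---
# Persists the fine-tuned weights next to mlb_classes.pkl so inference can
# run without retraining; the filename here is an arbitrary choice.
torch.save(model.state_dict(), "bert_multilabel_state.pt")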