# deberta_multilabel_train.py
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import DebertaTokenizer, DebertaModel
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
import pickle
from tqdm import tqdm

# --- Config ---
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
TEXT_COLUMN = "Sanction_Context"
LABEL_COLUMNS = ['Red_Flag_Reason', 'Maker_Action', 'Escalation_Level',
                 'Risk_Category', 'Risk_Drivers', 'Investigation_Outcome']
MODEL_NAME = "microsoft/deberta-base"
BATCH_SIZE = 8
EPOCHS = 3
MAX_LEN = 256

# --- Load Data ---
df = pd.read_csv("/kaggle/input/deberta-model/synthetic_transactions_samples_5000.csv")  # Ensure TEXT_COLUMN and LABEL_COLUMNS are in this CSV

label_encoders = []
# Encode each label column to integer class indices; keep one encoder per column
for col in LABEL_COLUMNS:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders.append(le)

# Save label encoders so predictions can be decoded back to strings later
with open("label_encoders.pkl", "wb") as f:
    pickle.dump(label_encoders, f)

# Train/val split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# --- Tokenizer ---
tokenizer = DebertaTokenizer.from_pretrained(MODEL_NAME)

# --- Dataset ---
class TextDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.tokenizer = tokenizer
        self.texts = list(dataframe[TEXT_COLUMN])
        self.labels = dataframe[LABEL_COLUMNS].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encodings = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=MAX_LEN,
            return_tensors="pt"
        )
        item = {key: val.squeeze(0) for key, val in encodings.items()}
        # Integer class indices, one per label column (long dtype for CrossEntropyLoss)
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

train_dataset = TextDataset(train_df, tokenizer)
val_dataset = TextDataset(val_df, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
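
# Optional sanity check (an illustrative addition, not in the original script):
# confirm one batch has the expected tensor shapes before training.
sample_batch = next(iter(train_loader))
print(sample_batch["input_ids"].shape)  # torch.Size([BATCH_SIZE, MAX_LEN])
print(sample_batch["labels"].shape)     # torch.Size([BATCH_SIZE, len(LABEL_COLUMNS)])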

# --- Model ---
class DebertaMultiOutput(nn.Module):
    """DeBERTa encoder with one classification head per label column."""

    def __init__(self, num_classes_per_label):
        super().__init__()
        self.deberta = DebertaModel.from_pretrained(MODEL_NAME)
        self.dropout = nn.Dropout(0.3)
        hidden_size = self.deberta.config.hidden_size
        # One linear head per label column, sized to that column's class count
        self.heads = nn.ModuleList(
            nn.Linear(hidden_size, n) for n in num_classes_per_label
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.deberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]  # first ([CLS]) token
        dropped = self.dropout(pooled_output)
        return [head(dropped) for head in self.heads]  # one logits tensor per column

model = DebertaMultiOutput([len(le.classes_) for le in label_encoders]).to(DEVICE)
# Each label column is a multi-class task (LabelEncoder produces class indices),
# so the loss is cross-entropy per head rather than BCEWithLogitsLoss.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# --- Training Loop ---
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        # Sum the per-column cross-entropy losses
        loss = sum(criterion(logits, labels[:, i]) for i, logits in enumerate(outputs))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} Loss: {total_loss/len(train_loader):.4f}")

# --- Save Model & Tokenizer ---
torch.save(model.state_dict(), "deberta_model.pth")
tokenizer.save_pretrained("deberta_tokenizer")
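
# Illustrative inference sketch (an addition, not part of the original script):
# decode one prediction back to the original label strings via the saved encoders.
# The helper name `predict_one` is hypothetical.
def predict_one(text, model, tokenizer, label_encoders):
    model.eval()
    enc = tokenizer(text, truncation=True, padding="max_length",
                    max_length=MAX_LEN, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        outputs = model(enc["input_ids"], enc["attention_mask"])
    # argmax over each head's logits, then map the class index back to its string
    return {
        col: label_encoders[i].inverse_transform([outputs[i].argmax(dim=1).item()])[0]
        for i, col in enumerate(LABEL_COLUMNS)
    }

# Example: predict_one("Payment routed through a sanctioned intermediary bank",
#                      model, tokenizer, label_encoders)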