ganeshkonapalli committed on
Commit
c8a11a5
·
verified ·
1 Parent(s): f49f5c0

Create deberta_model.pth

Browse files
Files changed (1) hide show
  1. deberta_model.pth +108 -0
deberta_model.pth ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # deberta_multilabel_train.py
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from torch.utils.data import Dataset, DataLoader
6
+ from transformers import DebertaTokenizer, DebertaModel
7
+ from sklearn.preprocessing import LabelEncoder
8
+ from sklearn.model_selection import train_test_split
9
+ import pandas as pd
10
+ import pickle
11
+ from tqdm import tqdm
12
+
13
+ # --- Config ---
14
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
15
+ TEXT_COLUMN = "Sanction_Context"
16
+ LABEL_COLUMNS = ['Red_Flag_Reason', 'Maker_Action', 'Escalation_Level',
17
+ 'Risk_Category', 'Risk_Drivers', 'Investigation_Outcome']
18
+ MODEL_NAME = "microsoft/deberta-base"
19
+ BATCH_SIZE = 8
20
+ EPOCHS = 3
21
+ MAX_LEN = 256
22
+
23
+ # --- Load Data ---
24
+ df = pd.read_csv("/kaggle/input/deberta-model/synthetic_transactions_samples_5000.csv") # Ensure TEXT_COLUMN and LABEL_COLUMNS are in this CSV
25
+ label_encoders = []
26
+
27
+ # Encode each label column
28
+ for col in LABEL_COLUMNS:
29
+ le = LabelEncoder()
30
+ df[col] = le.fit_transform(df[col])
31
+ label_encoders.append(le)
32
+
33
+ # Save label encoders
34
+ with open("label_encoders.pkl", "wb") as f:
35
+ pickle.dump(label_encoders, f)
36
+
37
+ # Train/val split
38
+ train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
39
+
40
+ # --- Tokenizer ---
41
+ tokenizer = DebertaTokenizer.from_pretrained(MODEL_NAME)
42
+
43
+ # --- Dataset ---
44
class TextDataset(Dataset):
    """Tokenize rows of a dataframe and yield tensors for multi-label training.

    Each item is the dict of tokenizer outputs (squeezed to per-sample shape)
    plus a 'labels' float tensor holding the integer-encoded value of every
    column in LABEL_COLUMNS for that row.
    """

    # BUG FIX: the original defined _init_/_len_/_getitem_ with single
    # underscores, so Python never treated them as the Dataset dunder
    # protocol — TextDataset(...) raised TypeError and len()/indexing failed.
    # Renamed to the proper double-underscore forms.
    def __init__(self, dataframe, tokenizer):
        self.tokenizer = tokenizer
        self.texts = list(dataframe[TEXT_COLUMN])
        self.labels = dataframe[LABEL_COLUMNS].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encodings = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=MAX_LEN,
            return_tensors="pt"
        )
        # squeeze(0): the tokenizer returns batch-of-1 tensors; the DataLoader
        # re-adds the batch dimension when collating.
        item = {key: val.squeeze(0) for key, val in encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item
64
+
65
+ train_dataset = TextDataset(train_df, tokenizer)
66
+ val_dataset = TextDataset(val_df, tokenizer)
67
+ train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
68
+ val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
69
+
70
+ # --- Model ---
71
class DebertaMultiOutput(nn.Module):
    """DeBERTa encoder with one linear head emitting a logit per label column."""

    # BUG FIX: the original used _init_ (single underscores) for both the
    # method name and the super() call, so nn.Module was never initialized
    # and the class could not be constructed. Renamed to __init__ and
    # simplified to the zero-argument super() form.
    def __init__(self, num_labels):
        super().__init__()
        self.deberta = DebertaModel.from_pretrained(MODEL_NAME)
        self.dropout = nn.Dropout(0.3)
        self.output = nn.Linear(self.deberta.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return raw (pre-sigmoid) logits of shape (batch, num_labels)."""
        outputs = self.deberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]  # first ([CLS]) token as pooled representation
        dropped = self.dropout(pooled_output)
        return self.output(dropped)
83
+
84
# Build model, loss and optimizer.
# NOTE(review): BCEWithLogitsLoss is applied to float-cast integer class ids
# (one logit per label column) — this reproduces the original objective, but
# a per-column CrossEntropyLoss over per-class heads is the conventional
# formulation for multi-task classification; confirm intent before changing.
model = DebertaMultiOutput(num_labels=len(LABEL_COLUMNS)).to(DEVICE)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# --- Training Loop ---
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1} Loss: {total_loss/len(train_loader):.4f}")

    # FIX: val_loader was constructed but never consumed — evaluate after each
    # epoch so the split actually serves its purpose. eval() + no_grad()
    # disables dropout and gradient tracking; training behavior is unchanged.
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)
            outputs = model(input_ids, attention_mask)
            val_loss += criterion(outputs, labels).item()
    print(f"Epoch {epoch+1} Val Loss: {val_loss/len(val_loader):.4f}")

# --- Save Model & Tokenizer ---
torch.save(model.state_dict(), "deberta_model.pth")
tokenizer.save_pretrained("deberta_tokenizer")