ganeshkonapalli committed on
Commit
7faa9a5
·
verified ·
1 Parent(s): 1d75db1

Create deberta_model.py

Browse files
Files changed (1) hide show
  1. deberta_model.py +88 -0
deberta_model.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Notebook-only setup: `!pip install ...` is IPython shell syntax and is a
# SyntaxError in a plain .py module. Install the dependency beforehand with:
#     pip install -q transformers

import pickle

import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.optim import AdamW
from tqdm import tqdm
from transformers import DebertaTokenizer, DebertaModel
12
+
# --- Config ---
# CSV column holding the free-text input that is passed to the tokenizer.
TEXT_COLUMN = 'Sanction_Context'
# Target columns; each one is label-encoded and gets its own classifier head.
LABEL_COLUMNS = [
    'Red_Flag_Reason',
    'Maker_Action',
    'Escalation_Level',
    'Risk_Category',
    'Risk_Drivers',
    'Investigation_Outcome',
]
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
18
+
# --- Load Data ---
df = pd.read_csv('/kaggle/input/deberta/synthetic_transactions_samples_5000.csv')
# pandas reads empty text cells as float NaN, which the tokenizer cannot
# process; normalise every entry to a (possibly empty) string up front.
X = df[TEXT_COLUMN].fillna("").astype(str).tolist()
y = df[LABEL_COLUMNS]

# --- Label Encode ---
# One LabelEncoder per target column. The fitted encoders are kept in
# `label_encoders` (keyed by column name) and pickled with the model later.
label_encoders = {}
y_encoded = pd.DataFrame(index=df.index)  # keep rows aligned with `df`
for col in LABEL_COLUMNS:
    le = LabelEncoder()
    y_encoded[col] = le.fit_transform(y[col])
    label_encoders[col] = le

# --- Train/Test Split ---
# Only the 80% training portion is used; the held-out 20% is discarded here.
X_train, _, y_train, _ = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# --- Tokenize ---
# The whole training set is tokenized once, truncated to 128 tokens and
# padded to a common length, and returned as PyTorch tensors.
tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")
train_encodings = tokenizer(
    X_train,
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors="pt",
)
38
+
# --- Model Definition ---
class DebertaMultiOutput(nn.Module):
    """DeBERTa encoder shared by several independent classification heads.

    The first-token hidden state of the encoder output is passed through
    dropout and then through one linear layer per output.
    """

    def __init__(self, num_labels_per_output, model_name="microsoft/deberta-base", dropout=0.3):
        """Build the encoder and one linear head per output.

        Args:
            num_labels_per_output: Iterable of ints; entry ``k`` is the number
                of classes predicted by head ``k``.
            model_name: Checkpoint passed to ``DebertaModel.from_pretrained``.
            dropout: Dropout probability applied to the pooled representation.
        """
        super().__init__()
        self.deberta = DebertaModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        hidden_size = self.deberta.config.hidden_size
        self.classifiers = nn.ModuleList(
            [nn.Linear(hidden_size, n_labels) for n_labels in num_labels_per_output]
        )

    def forward(self, input_ids, attention_mask):
        """Return a list of logits tensors, one per classifier head.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Attention mask tensor forwarded to the encoder.

        Returns:
            A list with one tensor per head, in the order the heads were
            created; each is the head's linear output for the pooled vector.
        """
        outputs = self.deberta(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the sequence is used as the pooled representation.
        pooled = self.dropout(outputs.last_hidden_state[:, 0])
        return [classifier(pooled) for classifier in self.classifiers]
53
+
# --- Prepare Labels ---
# One target tensor per label column. CrossEntropyLoss needs int64 class
# indices, so pin the dtype instead of relying on NumPy's platform-dependent
# integer type.
labels = [torch.tensor(y_train[col].values, dtype=torch.long) for col in LABEL_COLUMNS]
# Look the encoders up by column so the head sizes follow LABEL_COLUMNS order,
# the same order as `labels` (they are zipped together in the loss below).
num_labels = [len(label_encoders[col].classes_) for col in LABEL_COLUMNS]

# --- Init Model ---
model = DebertaMultiOutput(num_labels).to(DEVICE)
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()

# --- Train Loop ---
BATCH_SIZE = 16
NUM_EPOCHS = 3

model.train()
for epoch in range(NUM_EPOCHS):
    total_loss = 0.0
    # Walk the pre-tokenized training set in contiguous mini-batches.
    for i in tqdm(range(0, len(X_train), BATCH_SIZE)):
        ids = train_encodings['input_ids'][i:i + BATCH_SIZE].to(DEVICE)
        mask = train_encodings['attention_mask'][i:i + BATCH_SIZE].to(DEVICE)
        y_batch = [label[i:i + BATCH_SIZE].to(DEVICE) for label in labels]

        optimizer.zero_grad()
        outputs = model(ids, mask)
        # Total loss is the unweighted sum of the per-head cross-entropies.
        loss = sum(loss_fn(o, y) for o, y in zip(outputs, y_batch))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} Loss: {total_loss:.2f}")
79
+
# --- Save to Pickle ---
# Move every weight tensor to the CPU before pickling. A tensor pickled while
# on a CUDA device is restored onto CUDA, so loading the file on a CPU-only
# host would fail. `state_dict()` returns a fresh mapping, so reassigning its
# entries does not touch the live model.
state_dict = model.state_dict()
for name, tensor in list(state_dict.items()):
    state_dict[name] = tensor.cpu()

# NOTE: pickle files execute code on load; only unpickle this file from a
# trusted source.
with open("deberta_model.pkl", "wb") as f:
    pickle.dump({
        'model_state_dict': state_dict,
        'tokenizer': tokenizer,
        'label_encoders': label_encoders,
    }, f)

print("✅ DeBERTa model saved to 'deberta_model.pkl'")