Spaces:

ganeshkonapalli
/

deberta

Build error

App Files Files Community

ganeshkonapalli committed on Jun 10, 2025

Commit

7faa9a5

verified ·

1 Parent(s): 1d75db1

Create deberta_model.py

Browse files

Files changed (1) hide show

deberta_model.py +88 -0

deberta_model.py ADDED Viewed

	@@ -0,0 +1,88 @@

+!pip install -q transformers
+import torch
+import torch.nn as nn
+import pandas as pd
+import pickle
+from sklearn.preprocessing import LabelEncoder
+from sklearn.model_selection import train_test_split
+from transformers import DebertaTokenizer, DebertaModel
+from torch.optim import AdamW
+from tqdm import tqdm
+# --- Config ---
+TEXT_COLUMN = 'Sanction_Context'
+LABEL_COLUMNS = ['Red_Flag_Reason', 'Maker_Action', 'Escalation_Level',
+                 'Risk_Category', 'Risk_Drivers', 'Investigation_Outcome']
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# --- Load Data ---
+df = pd.read_csv('/kaggle/input/deberta/synthetic_transactions_samples_5000.csv')
+X = df[TEXT_COLUMN].tolist()
+y = df[LABEL_COLUMNS]
+# --- Label Encode ---
+label_encoders = {}
+y_encoded = pd.DataFrame()
+for col in LABEL_COLUMNS:
+    le = LabelEncoder()
+    y_encoded[col] = le.fit_transform(y[col])
+    label_encoders[col] = le
+# --- Train/Test Split ---
+X_train, _, y_train, _ = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
+# --- Tokenize ---
+tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")
+train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=128, return_tensors="pt")
+# --- Model Definition ---
+class DebertaMultiOutput(nn.Module):
+    def __init__(self, num_labels_per_output):
+        super().__init__()
+        self.deberta = DebertaModel.from_pretrained("microsoft/deberta-base")
+        self.dropout = nn.Dropout(0.3)
+        self.classifiers = nn.ModuleList([
+            nn.Linear(self.deberta.config.hidden_size, n_labels) for n_labels in num_labels_per_output
+        ])
+    def forward(self, input_ids, attention_mask):
+        outputs = self.deberta(input_ids=input_ids, attention_mask=attention_mask)
+        pooled = self.dropout(outputs.last_hidden_state[:, 0])  # Use CLS token
+        return [classifier(pooled) for classifier in self.classifiers]
+# --- Prepare Labels ---
+labels = [torch.tensor(y_train[col].values) for col in LABEL_COLUMNS]
+num_labels = [len(le.classes_) for le in label_encoders.values()]
+# --- Init Model ---
+model = DebertaMultiOutput(num_labels).to(DEVICE)
+optimizer = AdamW(model.parameters(), lr=2e-5)
+loss_fn = nn.CrossEntropyLoss()
+# --- Train Loop ---
+model.train()
+for epoch in range(3):
+    total_loss = 0
+    for i in tqdm(range(0, len(X_train), 16)):
+        ids = train_encodings['input_ids'][i:i+16].to(DEVICE)
+        mask = train_encodings['attention_mask'][i:i+16].to(DEVICE)
+        y_batch = [label[i:i+16].to(DEVICE) for label in labels]
+        optimizer.zero_grad()
+        outputs = model(ids, mask)
+        loss = sum(loss_fn(o, y) for o, y in zip(outputs, y_batch))
+        loss.backward()
+        optimizer.step()
+        total_loss += loss.item()
+    print(f"Epoch {epoch+1} Loss: {total_loss:.2f}")
+# --- Save to Pickle ---
+with open("deberta_model.pkl", "wb") as f:
+    pickle.dump({
+        'model_state_dict': model.state_dict(),
+        'tokenizer': tokenizer,
+        'label_encoders': label_encoders
+    }, f)
+print("✅ DeBERTa model saved to 'deberta_model.pkl'")