# train.py — demo: fine-tune DeBERTa for 3-class sentiment classification
# (Hugging Face Space "deberta.space", uploaded by ganeshkonapalli, commit 0e73d34)
import os
import pickle

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import DebertaTokenizer, DebertaForSequenceClassification
# Sample data
data = pd.DataFrame({
'text': [
'This is a positive message',
'This is negative',
'I am neutral',
'Absolutely wonderful',
'Terrible and bad'
],
'label': ['positive', 'negative', 'neutral', 'positive', 'negative']
})
# Encode labels
le = LabelEncoder()
data['label_enc'] = le.fit_transform(data['label'])
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label_enc'], test_size=0.2)
# Tokenization
tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, return_tensors="pt")
# Model
model = DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base", num_labels=len(le.classes_))
inputs = train_encodings['input_ids']
attention_mask = train_encodings['attention_mask']
labels = torch.tensor(y_train.values)
# Training (single epoch for demo)
model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
for epoch in range(1):
outputs = model(inputs, attention_mask=attention_mask, labels=labels)
loss = outputs.loss
loss.backward()
optimizer.step()
optimizer.zero_grad()
# Save model and tokenizer
with open("app/model.pkl", "wb") as f:
pickle.dump(model, f)
with open("app/tokenizer.pkl", "wb") as f:
pickle.dump(tokenizer, f)
with open("app/label_encoder.pkl", "wb") as f:
pickle.dump(le, f)