import os
import pickle

import pandas as pd
import torch
from transformers import DebertaTokenizer, DebertaForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Sample data
data = pd.DataFrame({
    'text': [
        'This is a positive message',
        'This is negative',
        'I am neutral',
        'Absolutely wonderful',
        'Terrible and bad'
    ],
    'label': ['positive', 'negative', 'neutral', 'positive', 'negative']
})

# Encode string labels as integer class ids
le = LabelEncoder()
data['label_enc'] = le.fit_transform(data['label'])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    data['text'], data['label_enc'], test_size=0.2
)

# Tokenization: pad/truncate the training texts into a single batch of tensors
tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, return_tensors="pt")

# Model with a classification head sized to the number of label classes
model = DebertaForSequenceClassification.from_pretrained(
    "microsoft/deberta-base", num_labels=len(le.classes_)
)

inputs = train_encodings['input_ids']
attention_mask = train_encodings['attention_mask']
labels = torch.tensor(y_train.values)

# Training (single epoch over one batch, for demo only)
model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
for epoch in range(1):
    outputs = model(inputs, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

# Save model, tokenizer, and label encoder (ensure the target directory exists)
os.makedirs("app", exist_ok=True)
with open("app/model.pkl", "wb") as f:
    pickle.dump(model, f)
with open("app/tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)
with open("app/label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)
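
# --- Inference sketch (not part of the original script) ---
# A minimal example of how the three pickled artifacts above could be re-loaded
# to classify a new text; it assumes app/model.pkl, app/tokenizer.pkl, and
# app/label_encoder.pkl were written by the training script. Note that pickling
# a transformers model works, but model.save_pretrained() / from_pretrained()
# is the more robust, version-safe convention for production use.

with open("app/model.pkl", "rb") as f:
    model = pickle.load(f)
with open("app/tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)
with open("app/label_encoder.pkl", "rb") as f:
    le = pickle.load(f)

model.eval()
enc = tokenizer(["What a great day"], truncation=True, padding=True, return_tensors="pt")
with torch.no_grad():
    logits = model(**enc).logits

# Map the highest-scoring class id back to its string label
pred_id = logits.argmax(dim=-1).item()
print(le.inverse_transform([pred_id])[0])  # e.g. 'positive'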