import inspect
import os

import pandas as pd
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import (
    DistilBertForSequenceClassification,
    DistilBertTokenizerFast,
    Trainer,
    TrainingArguments,
)

# Load the raw tweet CSV. The file has no header row, so column names are
# assigned by position. The location may be overridden with the
# TWITTER_TRAINING_CSV environment variable; the default is the path this
# script was originally written against.
csv_path = os.environ.get(
    "TWITTER_TRAINING_CSV",
    "/Users/milanradovanovich/Downloads/twitter_training.csv/twitter_training.csv",
)
df = pd.read_csv(csv_path, header=None)
df.columns = ['tweet_id', 'topic', 'sentiment', 'text']
# Rows without text or without a sentiment cannot be used for training.
df.dropna(subset=['text', 'sentiment'], inplace=True)
# Normalise the sentiment strings (surrounding whitespace, case) before they
# are mapped to integer labels.
df['sentiment'] = df['sentiment'].str.strip().str.lower()

# Map each sentiment string to the integer class id used as the training label.
label_map = {'positive': 0, 'negative': 1, 'neutral': 2, 'irrelevant': 3}
df['label'] = df['sentiment'].map(label_map)
# Series.map leaves NaN for any sentiment missing from label_map, which also
# turns the column into floats. Drop those rows and restore an integer dtype
# so every label is a valid class index.
df.dropna(subset=['label'], inplace=True)
df['label'] = df['label'].astype(int)

# Hold out 10% of the rows for validation. Stratifying on the label keeps the
# class proportions the same in both splits; the fixed seed makes the split
# reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    # astype(str) guarantees the tokenizer only ever receives strings, even
    # if some cells were parsed as numbers when the CSV was read.
    df['text'].astype(str).tolist(),
    df['label'].tolist(),
    test_size=0.1,
    stratify=df['label'],
    random_state=42,
)

# Tokenise both splits up front with the pretrained DistilBERT tokenizer.
# Sequences longer than 128 tokens are truncated; padding=True pads the
# remaining sequences so each split can be indexed as rectangular lists.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

# Map-style dataset pairing tokenizer output with integer class labels.
class SentimentDataset(torch.utils.data.Dataset):
    """Expose pre-tokenised examples and their labels as tensors.

    Args:
        encodings: Mapping from feature name (for example ``input_ids``) to a
            sequence holding one entry per example.
        labels: Sequence of class ids, aligned by position with ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``labels``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Class indices must be integer tensors for the classification loss;
        # forcing long keeps that true even if a label arrives as a float.
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)


# Wrap each split's encodings and labels so the Trainer can index them.
train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)

# Pretrained DistilBERT with a sequence-classification head. The head size is
# taken from label_map so it cannot drift from the label set (currently 4).
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=len(label_map)
)

# Training configuration: evaluate and checkpoint once per epoch, 3 epochs,
# batch size 16 per device, logging every 50 steps.
#
# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases, and the old keyword is rejected by the most recent ones. Select
# whichever name the installed TrainingArguments accepts so the script runs
# on both older and newer versions.
_training_arg_names = inspect.signature(TrainingArguments).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./model_output",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    **{_eval_strategy_key: "epoch"},
)

# Bind the model, the training configuration and both dataset splits together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Run fine-tuning with the configuration held by the trainer.
trainer.train()

# Save the fine-tuned weights and the tokenizer files into the same directory
# so the pair can later be reloaded together from that one path.
model.save_pretrained("./sentiment_model")
tokenizer.save_pretrained("./sentiment_model")

print(" Training complete and model saved to ./sentiment_model")