"""Fine-tune DistilBERT for 4-way tweet sentiment classification.

Pipeline: load the CSV, normalise and encode the sentiment labels, make a
stratified train/validation split, tokenize, train with the Hugging Face
``Trainer``, and save the model and tokenizer to ``./sentiment_model``.
"""

import inspect

import pandas as pd
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import (
    DistilBertForSequenceClassification,
    DistilBertTokenizerFast,
    Trainer,
    TrainingArguments,
)

# Location of the raw training CSV (the file has no header row).
DATA_PATH = "/Users/milanradovanovich/Downloads/twitter_training.csv/twitter_training.csv"
# Directory the fine-tuned model and tokenizer are written to.
MODEL_DIR = "./sentiment_model"

# 1. Load and clean data
df = pd.read_csv(DATA_PATH, header=None)
df.columns = ['tweet_id', 'topic', 'sentiment', 'text']
df = df.dropna(subset=['text', 'sentiment'])
df['sentiment'] = df['sentiment'].str.strip().str.lower()
# The tokenizer only accepts strings; a cell that pandas parsed as a number
# (e.g. a tweet consisting solely of digits) would otherwise raise.
df['text'] = df['text'].astype(str)

# 2. Encode sentiment labels
label_map = {'positive': 0, 'negative': 1, 'neutral': 2, 'irrelevant': 3}
df['label'] = df['sentiment'].map(label_map)

# Sentiments outside label_map map to NaN, which would turn the label column
# into floats and break both the stratified split and the classification
# loss. Drop those rows and restore an integer dtype.
unknown_mask = df['label'].isna()
if unknown_mask.any():
    print(
        f"Dropping {int(unknown_mask.sum())} rows with unrecognised sentiment: "
        f"{sorted(df.loc[unknown_mask, 'sentiment'].unique())}"
    )
    df = df.loc[~unknown_mask]
df['label'] = df['label'].astype(int)

# 3. Split into train and validation (stratified so class ratios match)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(),
    df['label'].tolist(),
    test_size=0.1,
    stratify=df['label'],
    random_state=42,
)

# 4. Tokenize
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)


# 5. Build Dataset class
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels."""

    def __init__(self, encodings, labels):
        """Store the tokenizer output and the parallel list of labels.

        Args:
            encodings: Mapping from field name (as produced by the tokenizer
                call above) to a per-example sequence of values.
            labels: Sequence of integer class ids, one per example.
        """
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, label under ``"labels"``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)


train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)

# 6. Load model (one output per entry in label_map)
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_map),
)

# 7. Training config
# Newer transformers releases renamed `evaluation_strategy` to
# `eval_strategy`; pick whichever keyword the installed version accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)
training_args = TrainingArguments(
    output_dir="./model_output",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    **{_eval_strategy_key: "epoch"},
)

# 8. Train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()

# 9. Save model and tokenizer
model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)
print(" Training complete and model saved to ./sentiment_model")