import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer

from src.config import Config
from src.multilingual_data import get_multilingual_data


class HateDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Each item is a dict with ``input_ids``, ``attention_mask`` and ``labels``
    tensors. Tokenization happens lazily in ``__getitem__`` rather than up
    front in ``__init__``.
    """

    def __init__(self, texts, labels, tokenizer):
        """Store the parallel text/label sequences and the tokenizer.

        Args:
            texts: Indexable, sized sequence of raw texts.
            labels: Indexable, sized sequence of integer class labels,
                parallel to ``texts``.
            tokenizer: Callable tokenizer; ``get_dataset`` passes the result
                of ``AutoTokenizer.from_pretrained``.

        Raises:
            ValueError: If ``texts`` and ``labels`` differ in length.
        """
        # Fail fast: __len__ is driven by texts alone, so a mismatch would
        # otherwise surface as an IndexError in the middle of an epoch (or
        # silently ignore surplus labels).
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self) -> int:
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx) -> dict:
        """Tokenize example ``idx`` and return its tensors.

        The text is padded/truncated to ``Config.MAX_LENGTH`` tokens.
        """
        encoding = self.tokenizer(
            str(self.texts[idx]),
            padding="max_length",
            truncation=True,
            max_length=Config.MAX_LENGTH,
            return_tensors="pt",
        )
        return {
            # return_tensors="pt" adds a leading batch axis of size 1;
            # squeeze(0) drops it so a DataLoader can stack items itself.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }


def load_data():
    """Load the CSV at ``Config.DATA_PATH`` and append the multilingual data.

    The CSV must provide ``tweet`` and ``class`` columns. Labels are
    binarised: ``class == 2`` becomes 0 and every other class becomes 1
    (presumably class 2 is the non-hateful category -- confirm against the
    dataset documentation).

    Returns:
        A ``(texts, labels)`` tuple of equal-length lists.

    Raises:
        ValueError: If, after merging the multilingual data, the number of
            texts and labels differ.
    """
    # --- English data ---
    df = pd.read_csv(Config.DATA_PATH)

    # Rows with a missing tweet would be tokenized as the literal string
    # "nan", and rows with a missing class would be labelled 1 (NaN != 2),
    # so drop them before labelling.
    df = df.dropna(subset=["tweet", "class"])

    texts = df["tweet"].tolist()
    labels = (df["class"] != 2).astype(int).tolist()

    # --- Multilingual data ---
    multi_texts, multi_labels = get_multilingual_data()
    texts.extend(multi_texts)
    labels.extend(multi_labels)

    # A length mismatch here means the text/label pairing is broken.
    if len(texts) != len(labels):
        raise ValueError(
            f"texts and labels are misaligned after merging multilingual "
            f"data (got {len(texts)} texts and {len(labels)} labels)"
        )

    print(f"Total dataset size (with multilingual): {len(texts)}")

    return texts, labels


def get_dataset():
    """Build a ``HateDataset`` from ``load_data()`` and the configured tokenizer.

    The tokenizer is loaded with
    ``AutoTokenizer.from_pretrained(Config.MODEL_NAME)``.
    """
    texts, labels = load_data()
    tokenizer = AutoTokenizer.from_pretrained(Config.MODEL_NAME)
    return HateDataset(texts, labels, tokenizer)