| import csv | |
| from typing import TypedDict | |
| import numpy as np | |
| from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer | |
| from datasets import load_dataset, Dataset | |
| import pandas as pd | |
| import evaluate | |
| import os | |
| import torch | |
# Resolve dataset and output paths relative to this script's directory.
_here = os.path.dirname(__file__)

data_files = {
    "train": os.path.realpath(os.path.join(_here, "datasets", "train.csv")),
    "test": os.path.realpath(os.path.join(_here, "datasets", "test.csv")),
}
output_dir = os.path.realpath(os.path.join(_here, "outputs", "v2-deberta-100-max"))

# Tokenizer matching the model checkpoint fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")

# Stance string -> integer class id used by the 3-way classification head.
label_map = {"FAVOR": 0, "NONE": 1, "AGAINST": 2}

# Free any cached GPU memory before building the model.
torch.cuda.empty_cache()
def tokenize(examples):
    """Map stance labels to ids, prepend each example's target to its tweet,
    and tokenize the combined text padded/truncated to 100 tokens.

    Mutates ``examples`` in place (``label`` and ``text``) and returns the
    tokenizer's encoding batch, as expected by ``Dataset.map(batched=True)``.
    """
    examples["label"] = [label_map[stance] for stance in examples["label"]]
    targets = examples["Target"]
    examples["text"] = [
        targets[idx] + " [SEP] " + tweet
        for idx, tweet in enumerate(examples["text"])
    ]
    return tokenizer(
        examples["text"],
        padding="max_length",
        return_tensors='pt',
        truncation=True,
        max_length=100,
    )
def load_dataset(path: str) -> Dataset:
    """Load a stance-detection CSV and return a tokenized ``Dataset``.

    NOTE(review): this function shadows ``datasets.load_dataset`` imported at
    the top of the file; kept under this name so existing callers still work,
    but consider renaming it (e.g. ``load_stance_dataset``).

    Args:
        path: Path to a CSV containing at least the columns ``Tweet``,
            ``Target``, ``Stance``, ``Opinion Towards`` and ``Sentiment``.

    Returns:
        A ``datasets.Dataset`` with ``text``/``label`` columns, tokenized via
        the module-level ``tokenize`` (labels mapped to ids, target prepended).
    """
    dataframe = pd.read_csv(path)
    # Drop the unused annotation columns in a single call.
    dataframe = dataframe.drop(columns=["Opinion Towards", "Sentiment"])
    dataset = Dataset.from_pandas(dataframe)
    dataset = dataset.rename_column('Tweet', 'text')
    dataset = dataset.rename_column("Stance", "label")
    return dataset.map(tokenize, batched=True)
# Build tokenized train/eval splits from the local CSVs
# (uses the local `load_dataset` defined above, not `datasets.load_dataset`).
train_ds = load_dataset(data_files["train"])
test_ds = load_dataset(data_files["test"])
# 3 labels to match `label_map` (FAVOR / NONE / AGAINST).
model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-v3-base", num_labels=3)
# Accuracy metric consumed by `compute_metrics` below.
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` evaluation pair."""
    logits, references = eval_pred
    # Predicted class = argmax over the last (class) dimension.
    predicted = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predicted, references=references)
# Evaluate on the test split once per epoch; all other arguments use
# Trainer defaults (batch size, learning rate, epochs, etc.).
training_args = TrainingArguments(output_dir=output_dir, evaluation_strategy="epoch")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,  # NOTE(review): test split doubles as eval set — no separate validation split
    compute_metrics=compute_metrics,
)
print("TRAINING")
trainer.train()