from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_from_disk, concatenate_datasets
import evaluate
import numpy as np
import os
from huggingface_hub import HfApi


# Read the Hub token from the environment; .get() keeps the script runnable
# locally when no token is set (pushing to the Hub is simply skipped).
hf_token = os.environ.get("HF_TOKEN")

# Configuration

MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"
DATA_PATH = "data/processed/tweet_eval_tokenized"
OUTPUT_DIR = "models/sentiment_model"
HF_REPO = "Lordemarco/SentimentAnalysis"


def compute_metrics(eval_pred):
    """Calcola metriche standard: accuracy e F1."""
    metric_acc = evaluate.load("accuracy")
    metric_f1 = evaluate.load("f1")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = metric_acc.compute(predictions=predictions, references=labels)
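    # Weighted F1 averages per-class F1 by class frequency, so the imbalanced
    # negative/neutral/positive label distribution does not skew the score.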
    f1 = metric_f1.compute(predictions=predictions, references=labels, average="weighted")
    return {"accuracy": acc["accuracy"], "f1": f1["f1"]}


def train_model(
    additional_data=None, sample_train_size=1000, sample_eval_size=300, output_dir=OUTPUT_DIR
):
    print("Caricamento dataset Tweet eval preprocessato")
    dataset = load_from_disk(DATA_PATH)
    if additional_data is not None:
        print("Aggiungo dati YouTube al training set...")
        dataset["train"] = concatenate_datasets([dataset["train"], additional_data])

    # Subsample both splits so a quick run stays cheap
    print(
        f"Subsampling dataset: {sample_train_size} examples for training, {sample_eval_size} for validation."
    )
    train_data = dataset["train"].select(range(min(sample_train_size, len(dataset["train"]))))
    eval_data = dataset["validation"].select(
        range(min(sample_eval_size, len(dataset["validation"])))
    )

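    # Load the pretrained checkpoint; its classification head already has the
    # three TweetEval sentiment labels (negative / neutral / positive).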
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

    # Training hyperparameters
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=1,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_dir="./logs",
        logging_steps=10,
        load_best_model_at_end=True,
        report_to="none",
    )

    print("Avvio training")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=eval_data,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    os.makedirs(output_dir, exist_ok=True)
    trainer.save_model(output_dir)
    print(f"Modello salvato in: {OUTPUT_DIR}")

    if os.getenv("HF_TOKEN"):
        print("Pushing model to Hugging Face Hub...")
        trainer.push_to_hub("Lordemarco/SentimentAnalysis")


if __name__ == "__main__":
    train_model()
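

# Hypothetical usage with extra data (path and split are assumptions, not a
# confirmed part of this repo): pass a dataset tokenized with the same
# tokenizer so it can be concatenated with the TweetEval train split.
#
#   from datasets import load_from_disk
#   youtube = load_from_disk("data/processed/youtube_tokenized")["train"]
#   train_model(additional_data=youtube, sample_train_size=2000)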