Spaces:
Sleeping
Sleeping
File size: 2,677 Bytes
1180a53 0ac2632 5753b42 0ac2632 1180a53 0ac2632 a66d87f 1180a53 0ac2632 a66d87f 0ac2632 1180a53 0ac2632 1180a53 0ac2632 1180a53 0ac2632 46a01d1 0ac2632 5394111 1180a53 0ac2632 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_from_disk, concatenate_datasets
import evaluate
import numpy as np
import os
from huggingface_hub import HfApi
# Hugging Face auth token. Read with .get() so the script still imports and
# trains when the token is absent (pushing to the Hub is skipped later via
# the os.getenv guard); the original os.environ["HF_TOKEN"] raised KeyError
# at import time, contradicting that guard.
hf_token = os.environ.get("HF_TOKEN")

# Base checkpoint fine-tuned for sentiment analysis.
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# On-disk location of the pre-tokenized TweetEval dataset.
DATA_PATH = "data/processed/tweet_eval_tokenized"
# Default directory for checkpoints and the saved model.
OUTPUT_DIR = "models/sentiment_model"
# Target repository on the Hugging Face Hub.
HF_REPO = "Lordemarco/SentimentAnalysis"
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for a Trainer evaluation step.

    Args:
        eval_pred: ``(logits, labels)`` pair as supplied by ``transformers.Trainer``;
            ``logits`` has shape (n_samples, n_classes), ``labels`` shape (n_samples,).

    Returns:
        dict with float keys ``"accuracy"`` and ``"f1"`` (support-weighted F1,
        same definition as sklearn/evaluate's ``average="weighted"``).

    Note:
        Computed directly with NumPy instead of ``evaluate.load(...)`` — the
        original reloaded both metric modules on every evaluation call, which
        scans disk and may hit the network each epoch.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)
    n = labels.shape[0]
    if n == 0:  # degenerate empty batch: avoid division by zero
        return {"accuracy": 0.0, "f1": 0.0}

    accuracy = float(np.mean(predictions == labels))

    # Support-weighted F1: per-class F1 weighted by that class's share of the
    # true labels. Classes appearing only in predictions have zero support and
    # contribute nothing, matching sklearn's behavior.
    weighted_f1 = 0.0
    for cls in np.unique(labels):
        tp = int(np.sum((predictions == cls) & (labels == cls)))
        fp = int(np.sum((predictions == cls) & (labels != cls)))
        fn = int(np.sum((predictions != cls) & (labels == cls)))
        precision = tp / (tp + fp) if (tp + fp) else 0.0
        recall = tp / (tp + fn) if (tp + fn) else 0.0
        f1_cls = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
        weighted_f1 += f1_cls * (tp + fn) / n

    return {"accuracy": accuracy, "f1": float(weighted_f1)}
def train_model(
    additional_data=None, sample_train_size=1000, sample_eval_size=300, output_dir=OUTPUT_DIR
):
    """Fine-tune the sentiment model on the preprocessed TweetEval dataset.

    Args:
        additional_data: optional ``datasets.Dataset`` (already tokenized)
            concatenated onto the training split before sampling.
        sample_train_size: maximum number of training examples to keep.
        sample_eval_size: maximum number of validation examples to keep.
        output_dir: directory for checkpoints and the final saved model.

    Side effects: writes checkpoints/model to ``output_dir`` and, when the
    HF_TOKEN environment variable is set, pushes the model to ``HF_REPO``.
    """
    print("Caricamento dataset Tweet eval preprocessato")
    dataset = load_from_disk(DATA_PATH)

    if additional_data is not None:
        print("Aggiungo dati YouTube al training set...")
        dataset["train"] = concatenate_datasets([dataset["train"], additional_data])

    print(
        f"Riduzione dataset: {sample_train_size} per il train, {sample_eval_size} per la validazione."
    )
    # Cap both splits; min() guards against requesting more rows than exist.
    train_data = dataset["train"].select(range(min(sample_train_size, len(dataset["train"]))))
    eval_data = dataset["validation"].select(
        range(min(sample_eval_size, len(dataset["validation"])))
    )

    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

    # Parametri training.
    # BUG FIX: output_dir was hard-coded to OUTPUT_DIR, silently ignoring the
    # function's output_dir parameter for all checkpoints.
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=1,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_dir="./logs",
        logging_steps=10,
        load_best_model_at_end=True,
        report_to="none",
        # Target repo + token for Trainer.push_to_hub (see bug fix below).
        hub_model_id=HF_REPO,
        hub_token=os.getenv("HF_TOKEN"),
    )

    print("Avvio training")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=eval_data,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    os.makedirs(output_dir, exist_ok=True)
    trainer.save_model(output_dir)
    # BUG FIX: previously printed the OUTPUT_DIR constant even when a custom
    # output_dir was passed.
    print(f"Modello salvato in: {output_dir}")

    if os.getenv("HF_TOKEN"):
        print("Pushing model to Hugging Face Hub...")
        # BUG FIX: the first positional argument of Trainer.push_to_hub is
        # commit_message, not the repo id — the original passed the repo name
        # as a commit message. The target repo is set via hub_model_id above.
        trainer.push_to_hub(commit_message="Upload fine-tuned sentiment model")
# Script entry point: run a training pass with the default sample sizes.
if __name__ == "__main__":
    train_model()
|