Spaces:
Sleeping
Sleeping
| from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments | |
| from datasets import load_from_disk, concatenate_datasets | |
| import evaluate | |
| import numpy as np | |
| import os | |
| from huggingface_hub import HfApi | |
# BUG FIX: os.environ["HF_TOKEN"] raised KeyError at import time whenever the
# token was unset, even though pushing to the Hub is optional (train_model only
# pushes when os.getenv("HF_TOKEN") is truthy). os.getenv returns None instead.
hf_token = os.getenv("HF_TOKEN")

# Base checkpoint to fine-tune.
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# Path of the tokenized TweetEval dataset saved with datasets.save_to_disk.
DATA_PATH = "data/processed/tweet_eval_tokenized"
# Default local directory for the fine-tuned model.
OUTPUT_DIR = "models/sentiment_model"
# Target repository on the Hugging Face Hub.
HF_REPO = "Lordemarco/SentimentAnalysis"
def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for a Trainer evaluation step."""
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    # evaluate.load fetches the metric implementations by name.
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")
    return {
        "accuracy": accuracy_metric.compute(predictions=preds, references=labels)["accuracy"],
        "f1": f1_metric.compute(predictions=preds, references=labels, average="weighted")["f1"],
    }
def train_model(
    additional_data=None, sample_train_size=1000, sample_eval_size=300, output_dir=OUTPUT_DIR
):
    """Fine-tune the sentiment model on a sample of the preprocessed TweetEval data.

    Args:
        additional_data: Optional ``datasets.Dataset`` concatenated onto the
            train split (e.g. extra YouTube comments) before sampling.
        sample_train_size: Maximum number of training examples to keep.
        sample_eval_size: Maximum number of validation examples to keep.
        output_dir: Directory where checkpoints and the final model are saved.

    Side effects: writes the trained model under ``output_dir`` and, when the
    HF_TOKEN environment variable is set, pushes it to the ``HF_REPO`` Hub repo.
    """
    print("Caricamento dataset Tweet eval preprocessato")
    dataset = load_from_disk(DATA_PATH)

    if additional_data is not None:
        print("Aggiungo dati YouTube al training set...")
        dataset["train"] = concatenate_datasets([dataset["train"], additional_data])

    print(
        f"Riduzione dataset: {sample_train_size} per il train, {sample_eval_size} per la validazione."
    )
    # Cap the sample sizes so .select() never requests more rows than exist.
    train_data = dataset["train"].select(range(min(sample_train_size, len(dataset["train"]))))
    eval_data = dataset["validation"].select(
        range(min(sample_eval_size, len(dataset["validation"])))
    )

    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

    # Parametri training.
    # BUG FIX: output_dir was hard-coded to OUTPUT_DIR here, silently ignoring
    # the output_dir parameter for checkpoints.
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=1,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_dir="./logs",
        logging_steps=10,
        load_best_model_at_end=True,
        report_to="none",
        # Target repo for push_to_hub; replaces the hard-coded literal below
        # and reuses the module-level HF_REPO constant.
        hub_model_id=HF_REPO,
    )

    print("Avvio training")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=eval_data,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    os.makedirs(output_dir, exist_ok=True)
    trainer.save_model(output_dir)
    # BUG FIX: the message reported OUTPUT_DIR even when a different
    # output_dir was passed in; report where the model was actually saved.
    print(f"Modello salvato in: {output_dir}")

    if os.getenv("HF_TOKEN"):
        print("Pushing model to Hugging Face Hub...")
        # BUG FIX: the repo id was passed as the first positional argument of
        # Trainer.push_to_hub(), which is `commit_message`, not the repo.
        # The destination repo is configured via hub_model_id above.
        trainer.push_to_hub()
if __name__ == "__main__":
    # Script entry point: run a default training pass with the sample sizes
    # baked into train_model's defaults.
    train_model()