LorenzoBioinfo committed on
Commit
0ac2632
·
1 Parent(s): 26ff02c

Add train and monitoring with tests

Browse files
models/__init__.py ADDED
File without changes
reports/__init__.py ADDED
File without changes
src/monitoring.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
2
+ from datasets import load_from_disk
3
+ from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
4
+ import torch
5
+ import numpy as np
6
+ import json
7
+ import os
8
+
9
# Directory from which the model and its tokenizer are loaded.
MODEL_PATH = "models/sentiment_model"
# On-disk datasets (read with load_from_disk) that the model is evaluated on.
TWEET_PATH = "data/processed/tweet_eval_tokenized"
YT_PATH = "data/processed/youtube_comments"
# Directory that receives the metrics.json report.
REPORTS_DIR = "reports"
13
+
14
+
15
def evaluate_model(model, tokenizer, dataset, dataset_name, sample_size=300, batch_size=32):
    """Evaluate a sequence-classification model on the head of a test split.

    Args:
        model: Model called as ``model(**inputs)`` whose output exposes
            ``.logits``.
        tokenizer: Callable that converts a list of texts into ``"pt"``
            tensors.
        dataset: Mapping with a ``"test"`` split that provides ``"text"`` and
            ``"label"`` columns and a ``select`` method.
        dataset_name: Label used in the printed messages and in the result.
        sample_size: Maximum number of test examples to evaluate, taken from
            the start of the split.
        batch_size: Number of texts sent through the model per forward pass.

    Returns:
        A JSON-serialisable dict with the keys ``"dataset"``, ``"accuracy"``,
        ``"f1"`` (weighted average) and ``"confusion_matrix"`` (nested lists).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 or no test example is
            selected.
    """
    if batch_size < 1:
        raise ValueError("batch_size deve essere >= 1")

    print(f"Valutazione su {dataset_name}")
    test_split = dataset["test"]
    subset = test_split.select(range(min(sample_size, len(test_split))))

    # Materialise the columns as plain lists so they can be sliced per batch
    # and handed to the tokenizer regardless of the column container type.
    texts = list(subset["text"])
    labels = list(subset["label"])
    if not texts:
        raise ValueError(f"Nessun esempio di test disponibile per {dataset_name}")

    # Run inference on whatever device the model lives on (if it reports one).
    device = getattr(model, "device", None)

    batch_preds = []
    with torch.no_grad():
        # Batching keeps peak memory bounded instead of pushing the whole
        # sample through the model in a single forward pass.
        for start in range(0, len(texts), batch_size):
            inputs = tokenizer(
                texts[start:start + batch_size],
                truncation=True,
                padding=True,
                return_tensors="pt",
            )
            if device is not None:
                inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
            logits = model(**inputs).logits
            # Move predictions back to the CPU before converting to NumPy.
            batch_preds.append(torch.argmax(logits, dim=1).cpu())
    preds = torch.cat(batch_preds).numpy()

    acc = float(accuracy_score(labels, preds))
    f1 = float(f1_score(labels, preds, average="weighted"))
    cm = confusion_matrix(labels, preds).tolist()

    print(f"{dataset_name} — Accuracy: {acc:.3f}, F1: {f1:.3f}")
    return {"dataset": dataset_name, "accuracy": acc, "f1": f1, "confusion_matrix": cm}
33
+
34
def monitor_model():
    """Evaluate the saved model on both datasets and write the metrics report.

    Loads the model and tokenizer from ``MODEL_PATH``, evaluates them on the
    datasets stored at ``TWEET_PATH`` and ``YT_PATH``, and writes the results
    to ``metrics.json`` inside ``REPORTS_DIR``.

    Returns:
        The dict written to disk, keyed by ``"TweetEval"`` and ``"YouTube"``;
        each value is the result of ``evaluate_model`` for that dataset.
    """
    print("Caricamento del modello")
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model.eval()  # inference mode: disables dropout

    tweet_ds = load_from_disk(TWEET_PATH)
    youtube_ds = load_from_disk(YT_PATH)

    tweet_metrics = evaluate_model(model, tokenizer, tweet_ds, "TweetEval")
    youtube_metrics = evaluate_model(model, tokenizer, youtube_ds, "YouTube Comments")

    os.makedirs(REPORTS_DIR, exist_ok=True)
    metrics_path = os.path.join(REPORTS_DIR, "metrics.json")

    results = {"TweetEval": tweet_metrics, "YouTube": youtube_metrics}
    with open(metrics_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4)

    print(f"Risultati salvati in: {metrics_path}")
    return results


def main():
    """Script entry point: run the monitoring evaluation."""
    monitor_model()
54
+
55
# Run the monitoring evaluation only when this file is executed as a script.
if __name__ == "__main__":
    main()
src/train_model.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from transformers import (
3
+ AutoModelForSequenceClassification,
4
+ Trainer,
5
+ TrainingArguments,
6
+ AutoTokenizer
7
+ )
8
+ from datasets import load_from_disk
9
+ import evaluate
10
+ import numpy as np
11
+ import os
12
+
13
# Pretrained checkpoint that train_model() loads as the starting model.
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# On-disk dataset read with load_from_disk(); its "train" and "validation" splits are used.
DATA_PATH = "data/processed/tweet_eval_tokenized"
# Trainer output directory, also the location where the final model is saved.
OUTPUT_DIR = "models/sentiment_model"
16
+
17
# Metric objects are loaded on first use and reused by later evaluation calls.
_LOADED_METRICS = {}


def _get_metric(name):
    """Return the ``evaluate`` metric called ``name``, loading it only once."""
    if name not in _LOADED_METRICS:
        _LOADED_METRICS[name] = evaluate.load(name)
    return _LOADED_METRICS[name]


def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for a ``(logits, labels)`` pair.

    Args:
        eval_pred: Two-item iterable ``(logits, labels)``; the predicted class
            is the argmax of ``logits`` over the last axis.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = _get_metric("accuracy").compute(
        predictions=predictions, references=labels
    )
    f1 = _get_metric("f1").compute(
        predictions=predictions, references=labels, average="weighted"
    )
    return {"accuracy": accuracy["accuracy"], "f1": f1["f1"]}
26
+
27
def train_model(sample_train_size=1000, sample_eval_size=300):
    """Fine-tune the base model on a subset of the dataset and save the result.

    Loads the dataset from ``DATA_PATH``, trains ``MODEL_NAME`` for one epoch
    on the first ``sample_train_size`` training examples, evaluates on the
    first ``sample_eval_size`` validation examples, and saves both the model
    and its tokenizer to ``OUTPUT_DIR``.

    Args:
        sample_train_size: Maximum number of training examples to use.
        sample_eval_size: Maximum number of validation examples to use.
    """
    print("Caricamento dataset Tweet eval preprocessato")
    dataset = load_from_disk(DATA_PATH)

    print(f"Riduzione dataset: {sample_train_size} per il train, {sample_eval_size} per la validazione.")
    train_split = dataset["train"]
    eval_split = dataset["validation"]
    train_data = train_split.select(range(min(sample_train_size, len(train_split))))
    eval_data = eval_split.select(range(min(sample_eval_size, len(eval_split))))

    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
    # Load the tokenizer before training so a download failure surfaces early;
    # it is saved next to the model at the end of this function.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # Training parameters.
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` — confirm against the pinned version.
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=1,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_dir="./logs",
        logging_steps=10,
        load_best_model_at_end=True,
        report_to="none",
    )

    print("Avvio training")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=eval_data,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    trainer.save_model(OUTPUT_DIR)
    # The Trainer was not given the tokenizer, so save it explicitly: code that
    # loads a tokenizer from OUTPUT_DIR needs the tokenizer files there.
    tokenizer.save_pretrained(OUTPUT_DIR)
    print(f"Modello salvato in: {OUTPUT_DIR}")
67
+
68
# Start a training run with the default sample sizes when executed as a script.
if __name__ == "__main__":
    train_model()
tests/integration/test_monitoring.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import pytest
4
+ from src.monitoring import monitor_model
5
+
6
# Report file the test expects monitoring to create; the fixture deletes it around each test.
METRICS_PATH = "reports/metrics.json"
7
+
8
@pytest.fixture(autouse=True)
def cleanup_metrics():
    """Delete the metrics file, if present, before and after each test."""

    def _discard_metrics_file():
        if os.path.exists(METRICS_PATH):
            os.remove(METRICS_PATH)

    _discard_metrics_file()
    yield
    _discard_metrics_file()
16
+
17
def test_monitoring_creates_metrics():
    """Check that monitoring writes metrics.json with accuracy and F1 per dataset."""
    monitor_model()
    assert os.path.exists(METRICS_PATH), "metrics.json non è stato generato"

    with open(METRICS_PATH, "r", encoding="utf-8") as f:
        metrics = json.load(f)

    # The report is nested: one entry per evaluated dataset, each carrying its
    # own accuracy and F1, so the keys must be checked inside every entry.
    for dataset_key in ("TweetEval", "YouTube"):
        assert dataset_key in metrics, f"Metriche mancanti per {dataset_key}"
        dataset_metrics = metrics[dataset_key]
        assert "accuracy" in dataset_metrics and "f1" in dataset_metrics, (
            f"Metriche principali mancanti per {dataset_key}"
        )
tests/integration/test_train.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import shutil

import pytest

# The training entry point is defined in src/train_model.py, so import it from
# that module (there is no src/train.py in this change).
from src.train_model import train_model

# Directory where train_model() saves the model; removed around each test.
MODEL_DIR = "models/sentiment_model"
7
+
8
@pytest.fixture(autouse=True)
def cleanup():
    """Remove the model directory, if it exists, before and after each test."""

    def _remove_model_dir():
        model_dir_present = os.path.exists(MODEL_DIR)
        if model_dir_present:
            shutil.rmtree(MODEL_DIR)

    _remove_model_dir()
    yield
    _remove_model_dir()
15
+
16
def test_train_model_runs():
    """Run a tiny training job and check that a model was written to disk."""
    train_model(sample_train_size=10, sample_eval_size=5)

    model_dir_created = os.path.exists(MODEL_DIR)
    assert model_dir_created, "La directory del modello non è stata creata"

    config_file = os.path.join(MODEL_DIR, "config.json")
    assert os.path.exists(config_file), "File config.json mancante"