GitHub Actions committed on
Commit 1180a53 · 1 Parent(s): 0362599

Auto-deploy new version [skip ci]

src/app.py CHANGED
@@ -4,14 +4,14 @@ from pydantic import BaseModel
 from fastapi.responses import HTMLResponse
 from fastapi.templating import Jinja2Templates
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
-from datasets import load_from_disk
+from datasets import load_from_disk
 import torch
 import random
 import subprocess
 import json
 
 # Caricamento del modello e dei dati se già scaricati
-MODEL= "cardiffnlp/twitter-roberta-base-sentiment-latest"
+MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
 TWEET_PROCESSED_PATH = "data/processed/tweet_eval_tokenized"
 YT_PROCESSED_PATH = "data/processed/youtube_tokenized"
 
@@ -22,7 +22,6 @@ model = AutoModelForSequenceClassification.from_pretrained(MODEL)
 labels = ["negative", "neutral", "positive"]
 
 
-
 # TWEET EVAL
 if not os.path.exists(TWEET_PROCESSED_PATH):
     print(f"Dataset Tweet Eval non trovato in {TWEET_PROCESSED_PATH}. Lo genero...")
@@ -41,11 +40,10 @@ if not os.path.exists(YT_PROCESSED_PATH):
     subprocess.run(["python", "src/data_preparation.py", "youtube"], check=True)
 youtube_ds = load_from_disk(YT_PROCESSED_PATH)
 
-app = FastAPI(
-    title="Sentiment Analysis API"
-)
+app = FastAPI(title="Sentiment Analysis API")
 templates = Jinja2Templates(directory="app_templates/")
 
+
 class TextInput(BaseModel):
     text: str
 
@@ -60,58 +58,53 @@ def predict_sentiment(text: str):
     return {"label": labels[pred], "confidence": round(confidence, 3)}
 
 
-@app.get("/",response_class=HTMLResponse)
-async def home( request: Request):
+@app.get("/", response_class=HTMLResponse)
+async def home(request: Request):
     return templates.TemplateResponse("index.html", {"request": request})
-
+
+
 @app.get("/random_tweet", response_class=HTMLResponse)
 def random_tweet(request: Request):
-    # sample = random.choice(tweet_eval["test"])
+    # sample = random.choice(tweet_eval["test"])
     sample = tweet_eval["test"][random.randrange(len(tweet_eval["test"]))]
-    text = sample["text"] if "text" in sample else tokenizer.decode(sample["input_ids"], skip_special_tokens=True)
-
+    text = (
+        sample["text"]
+        if "text" in sample
+        else tokenizer.decode(sample["input_ids"], skip_special_tokens=True)
+    )
+
     result = predict_sentiment(text)
 
-
-
-    true_label=labels[sample["label"]]
+    true_label = labels[sample["label"]]
 
     return templates.TemplateResponse(
         "random_tweet.html",
-        {
-            "request": request,
-            "text": text,
-            "true_label": true_label,
-            "result": result
-        }
+        {"request": request, "text": text, "true_label": true_label, "result": result},
    )
 
 
-
-
-
 @app.get("/predict", response_class=HTMLResponse)
 def predict_page(request: Request):
     return templates.TemplateResponse("predict.html", {"request": request, "result": None})
 
+
 @app.post("/predict", response_class=HTMLResponse)
 def predict_text(request: Request, text: str = Form(...)):
     result = predict_sentiment(text)
     return templates.TemplateResponse(
-        "predict.html",
-        {"request": request, "text": text, "result": result}
+        "predict.html", {"request": request, "text": text, "result": result}
     )
 
 
 @app.get("/random_youtube_comment", response_class=HTMLResponse)
 def random_youtube_comment(request: Request):
-    sample = random.choice(youtube_ds["train"])
+    sample = random.choice(youtube_ds["train"])
 
     text = sample["text"] if "text" in sample else sample["text"]
     true_label = sample["label"] if "label" in sample else "N/A"
 
     if isinstance(true_label, int):
-
+
         label_map = {0: "negative", 1: "neutral", 2: "positive"}
         true_label = label_map.get(true_label, "N/A")
 
@@ -119,16 +112,10 @@ def random_youtube_comment(request: Request):
 
 
     return templates.TemplateResponse(
         "random_youtube.html",
-        {
-            "request": request,
-            "text": text,
-            "true_label": true_label,
-            "result": result
-        }
+        {"request": request, "text": text, "true_label": true_label, "result": result},
     )
 
 
-
 @app.get("/admin", response_class=HTMLResponse)
 async def admin_dashboard(request: Request):
     """Pagina principale dell'area admin."""
@@ -137,10 +124,8 @@ async def admin_dashboard(request: Request):
     if os.path.exists(metrics_path):
         with open(metrics_path, "r") as f:
             metrics = json.load(f)
-    return templates.TemplateResponse(
-        "admin.html",
-        {"request": request, "metrics": metrics}
-    )
+    return templates.TemplateResponse("admin.html", {"request": request, "metrics": metrics})
+
 
 @app.post("/admin/train")
 async def retrain_model():
@@ -148,12 +133,14 @@ async def retrain_model():
     subprocess.run(["python", "src/train.py"], check=True)
     return {"status": "Training completato"}
 
+
 @app.post("/admin/monitor")
 async def run_monitoring():
     """Esegue il monitoring e aggiorna metrics.json."""
     subprocess.run(["python", "src/monitoring.py"], check=True)
     return {"status": "Monitoring completato"}
 
+
 @app.get("/admin/metrics", response_class=HTMLResponse)
 def view_metrics(request: Request):
     """Visualizza i risultati del monitoring in forma tabellare e grafica."""
@@ -162,13 +149,10 @@ def view_metrics(request: Request):
     if os.path.exists(metrics_path):
         with open(metrics_path, "r") as f:
             metrics = json.load(f)
-    return templates.TemplateResponse(
-        "metrics.html",
-        {"request": request, "metrics": metrics}
-    )
+    return templates.TemplateResponse("metrics.html", {"request": request, "metrics": metrics})
 
 
-
-if __name__=="__main__":
+if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app,host="0.0.0.0",port=8000)
+
+    uvicorn.run(app, host="0.0.0.0", port=8000)
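
Note: the body of predict_sentiment (old lines 52-59) sits between hunks, so only its return statement is visible above. A minimal sketch of a body consistent with that return line, assuming the usual tokenize / forward / softmax pipeline; treat it as a reconstruction, not the committed code:

    def predict_sentiment(text: str):
        # Tokenize, run the model without gradients, and softmax the logits
        inputs = tokenizer(text, return_tensors="pt", truncation=True)
        with torch.no_grad():
            logits = model(**inputs).logits
        probs = torch.nn.functional.softmax(logits, dim=-1)
        pred = int(probs.argmax(dim=-1))
        confidence = float(probs[0, pred])
        # This return line is the one confirmed by the diff
        return {"label": labels[pred], "confidence": round(confidence, 3)}
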
src/data_preparation.py CHANGED
@@ -12,7 +12,8 @@ PROCESSED_DIR = "data/processed/"
 os.makedirs(PROCESSED_DIR, exist_ok=True)
 
 
-# FUNZIONI DI SUPPORTO
+# FUNZIONI DI SUPPORTO
+
 
 def clean_text(text):
     """Pulisce il testo da URL, menzioni, hashtag, simboli HTML"""
@@ -23,6 +24,7 @@ def clean_text(text):
     text = re.sub(r"\s+", " ", text)
     return text.strip()
 
+
 def map_label(label):
     """Mappa le etichette di sentiment a numeri"""
     mapping = {"negative": 0, "neutral": 1, "positive": 2}
@@ -30,9 +32,11 @@ def map_label(label):
         return mapping.get(label.lower(), 1)
     return label
 
+
 # Tokenizer globale
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 
+
 def tokenize_function(examples):
     return tokenizer(
         examples["text"],
@@ -42,12 +46,11 @@ def tokenize_function(examples):
     )
 
 
-
-
 # ----------------------------- #
 # PREPARAZIONE DEI DATASET #
 # ----------------------------- #
 
+
 def safe_load_dataset(name, config=None, max_retries=3, fallback_data=None):
     """
     Gestisce i retry del download e crea un dataset di fallback se fallisce.
@@ -79,7 +82,9 @@ def prepare_tweet_eval(tokenizer, output_path):
         reduced_splits = {}
         for split in ds.keys():
             reduced_splits[split] = ds[split].select(range(min(1000, len(ds[split]))))
-            reduced_splits[split] = reduced_splits[split].map(lambda x: {"text": clean_text(x["text"])})
+            reduced_splits[split] = reduced_splits[split].map(
+                lambda x: {"text": clean_text(x["text"])}
+            )
             reduced_splits[split] = reduced_splits[split].map(tokenize_function, batched=True)
         ds = DatasetDict(reduced_splits)
     else:
@@ -98,7 +103,7 @@ def prepare_youtube(tokenizer, output_path):
         "Sentiment": ["positive", "negative", "neutral", "positive", "negative"],
     }
     ds = safe_load_dataset("AmaanP314/youtube-comment-sentiment", fallback_data=fallback_data)
-
+
     if isinstance(ds, dict) or "train" in ds:
         reduced_splits = {}
         for split in ds.keys():
@@ -112,7 +117,7 @@ def prepare_youtube(tokenizer, output_path):
            reduced_splits[split] = reduced_splits[split].map(tokenize_function, batched=True)
         ds = DatasetDict(reduced_splits)
     else:
-
+
         ds = ds.select(range(min(1000, len(ds))))
         ds = ds.map(
             lambda x: {
@@ -120,19 +125,18 @@ def prepare_youtube(tokenizer, output_path):
                 "label": map_label(x["Sentiment"]),
             }
         )
-        # ds = ds.map(lambda x: {"text": clean_text(x["CommentText"])})
-        # ds = ds.map(lambda x: {"label": map_label(x["Sentiment"])})
-        # ds = ds.map(tokenize_function, batched=True)
+        # ds = ds.map(lambda x: {"text": clean_text(x["CommentText"])})
+        # ds = ds.map(lambda x: {"label": map_label(x["Sentiment"])})
+        # ds = ds.map(tokenize_function, batched=True)
     ds.save_to_disk(output_path)
     print(f"Dataset YouTube salvato in {output_path}")
 
 
-
-
-
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Prepara dataset per sentiment analysis.")
-    parser.add_argument("dataset", choices=["tweet_eval", "youtube"], help="Nome del dataset da preparare.")
+    parser.add_argument(
+        "dataset", choices=["tweet_eval", "youtube"], help="Nome del dataset da preparare."
+    )
     args = parser.parse_args()
 
     if args.dataset == "tweet_eval":
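
Note: only the whitespace-normalization line of clean_text falls inside a hunk, but its docstring names URLs, mentions, hashtags, and HTML symbols. A sketch of what the full function plausibly looks like; every substitution except the last one is an assumption inferred from the docstring:

    import re

    def clean_text(text):
        """Cleans text of URLs, mentions, hashtags, HTML symbols."""
        text = re.sub(r"http\S+", "", text)  # URLs (assumed)
        text = re.sub(r"@\w+", "", text)     # mentions (assumed)
        text = re.sub(r"#\w+", "", text)     # hashtags (assumed)
        text = re.sub(r"&\w+;", " ", text)   # HTML entities (assumed)
        text = re.sub(r"\s+", " ", text)     # shown in the diff
        return text.strip()
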
src/download_data.py CHANGED
@@ -5,4 +5,4 @@ dataset.save_to_disk("/workspaces/MLOps_Project_SentimentAnalysis/data/raw/tweet
 
 
 dataset2 = load_dataset("AmaanP314/youtube-comment-sentiment")
-dataset2.save_to_disk("/workspaces/MLOps_Project_SentimentAnalysis/data/youtube-comment-sentiment")
+dataset2.save_to_disk("/workspaces/MLOps_Project_SentimentAnalysis/data/youtube-comment-sentiment")
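
Note: datasets saved with save_to_disk are read back with load_from_disk, which is how src/app.py and src/train_model.py consume them; a quick round-trip check:

    from datasets import load_from_disk

    # Reload the dataset saved above and show its splits and row counts
    ds = load_from_disk("/workspaces/MLOps_Project_SentimentAnalysis/data/youtube-comment-sentiment")
    print(ds)
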
src/train_model.py CHANGED
@@ -1,10 +1,5 @@
-
-from transformers import (
-    AutoModelForSequenceClassification,
-    Trainer,
-    TrainingArguments
-)
-from datasets import load_from_disk,concatenate_datasets
+from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
+from datasets import load_from_disk, concatenate_datasets
 import evaluate
 import numpy as np
 import os
@@ -18,7 +13,8 @@ hf_token = os.environ["HF_TOKEN"]
 MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"
 DATA_PATH = "data/processed/tweet_eval_tokenized"
 OUTPUT_DIR = "models/sentiment_model"
-HF_REPO = "Lordemarco/SentimentAnalysis"
+HF_REPO = "Lordemarco/SentimentAnalysis"
+
 
 def compute_metrics(eval_pred):
     """Calcola metriche standard: accuracy e F1."""
@@ -31,19 +27,24 @@ def compute_metrics(eval_pred):
     return {"accuracy": acc["accuracy"], "f1": f1["f1"]}
 
 
-def train_model(additional_data=None,sample_train_size=1000, sample_eval_size=300,output_dir=OUTPUT_DIR):
+def train_model(
+    additional_data=None, sample_train_size=1000, sample_eval_size=300, output_dir=OUTPUT_DIR
+):
     print("Caricamento dataset Tweet eval preprocessato")
     dataset = load_from_disk(DATA_PATH)
     if additional_data is not None:
         print("Aggiungo dati YouTube al training set...")
         dataset["train"] = concatenate_datasets([dataset["train"], additional_data])
 
-    #
-    print(f"Riduzione dataset: {sample_train_size} per il train, {sample_eval_size} per la validazione.")
+    #
+    print(
+        f"Riduzione dataset: {sample_train_size} per il train, {sample_eval_size} per la validazione."
+    )
     train_data = dataset["train"].select(range(min(sample_train_size, len(dataset["train"]))))
-    eval_data = dataset["validation"].select(range(min(sample_eval_size, len(dataset["validation"]))))
+    eval_data = dataset["validation"].select(
+        range(min(sample_eval_size, len(dataset["validation"])))
+    )
 
-
     model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
 
     # Parametri training
@@ -57,7 +58,7 @@ def train_model(additional_data=None,sample_train_size=1000, sample_eval_size=30
         logging_dir="./logs",
         logging_steps=10,
         load_best_model_at_end=True,
-        report_to="none",
+        report_to="none",
     )
 
     print("Avvio training")
@@ -74,11 +75,11 @@ def train_model(additional_data=None,sample_train_size=1000, sample_eval_size=30
     os.makedirs(output_dir, exist_ok=True)
     trainer.save_model(output_dir)
     print(f"Modello salvato in: {OUTPUT_DIR}")
-
 
     if os.getenv("HF_TOKEN"):
         print("Pushing model to Hugging Face Hub...")
         trainer.push_to_hub("Lordemarco/SentimentAnalysis")
-
+
+
 if __name__ == "__main__":
     train_model()
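
Note: the body of compute_metrics sits between hunks; only its return line appears above. A sketch consistent with that return line; loading the metrics via evaluate.load and the average="weighted" choice are assumptions (plain F1 needs an averaging mode for three classes):

    import evaluate
    import numpy as np

    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")

    def compute_metrics(eval_pred):
        # Trainer passes (logits, labels); predictions are the argmax over classes
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=-1)
        acc = accuracy_metric.compute(predictions=preds, references=labels)
        f1 = f1_metric.compute(predictions=preds, references=labels, average="weighted")
        # This return line is the one confirmed by the diff
        return {"accuracy": acc["accuracy"], "f1": f1["f1"]}
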
tests/integration/test_app.py CHANGED
@@ -1,35 +1,43 @@
 from fastapi.testclient import TestClient
 from src.app import app
-import os
+import os
 
 os.environ["SKIP_DATA_PREP"] = "true"
 
 client = TestClient(app)
 
+
 def test_home_page():
     response = client.get("/")
     assert response.status_code == 200
     assert "Benvenuto" in response.text
 
+
 def test_predict_endpoint_get():
     response = client.get("/predict")
     assert response.status_code == 200
     assert "Testa il Modello" in response.text
 
+
 def test_predict_endpoint_post():
     response = client.post("/predict", data={"text": "I love this!"})
     assert response.status_code == 200
     assert any(label in response.text for label in ["positive", "neutral", "negative"])
 
+
 def test_random_tweet_page():
     response = client.get("/random_tweet")
     assert response.status_code == 200
-    assert any(lbl in response.text for lbl in ["positive", "neutral", "negative", "Positivo", "Neutro", "Negativo"])
-
-
+    assert any(
+        lbl in response.text
+        for lbl in ["positive", "neutral", "negative", "Positivo", "Neutro", "Negativo"]
+    )
+
+
 def test_random_youtube_page():
     response = client.get("/random_youtube_comment")
     assert response.status_code == 200
-    assert any(lbl in response.text for lbl in ["positive", "neutral", "negative", "Positivo", "Neutro", "Negativo"])
-
-
+    assert any(
+        lbl in response.text
+        for lbl in ["positive", "neutral", "negative", "Positivo", "Neutro", "Negativo"]
+    )
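
Note: predict_text declares text: str = Form(...), so the POST test must send a form-encoded body via data=, not json=. A standalone sketch of the same call for manual checks (the input string is arbitrary):

    from fastapi.testclient import TestClient
    from src.app import app

    client = TestClient(app)
    # data= sends application/x-www-form-urlencoded, matching the Form(...) parameter
    response = client.post("/predict", data={"text": "great video"})
    print(response.status_code)  # 200 expected
    print(any(lbl in response.text for lbl in ["positive", "neutral", "negative"]))
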
tests/integration/test_monitoring.py CHANGED
@@ -18,19 +18,16 @@ def cleanup_metrics():
 
 def test_monitoring_creates_metrics():
     """Verifica che il monitoring crei correttamente il file metrics.json e contenga i dati previsti."""
-    main()
+    main()
 
     assert os.path.exists(METRICS_PATH), "metrics.json non è stato generato"
 
-
     with open(METRICS_PATH, "r") as f:
         metrics = json.load(f)
 
     assert "TweetEval" in metrics, "Mancano metriche TweetEval"
     assert "YouTube" in metrics, "Mancano metriche YouTube"
 
-
     for dataset_name, data in metrics.items():
         assert "accuracy" in data, f"Manca accuracy per {dataset_name}"
         assert "f1" in data, f"Manca F1 per {dataset_name}"
-
 
tests/integration/test_train.py CHANGED
@@ -5,6 +5,7 @@ from src.train_model import train_model
 
 MODEL_DIR = "models/sentiment_model"
 
+
 @pytest.fixture(autouse=True)
 def cleanup():
     if os.path.exists(MODEL_DIR):
@@ -13,8 +14,9 @@ def cleanup():
     if os.path.exists(MODEL_DIR):
         shutil.rmtree(MODEL_DIR)
 
+
 def test_train_model_runs():
     """Testa che il training parta e salvi un modello."""
-    train_model(sample_train_size=10, sample_eval_size=5)
+    train_model(sample_train_size=10, sample_eval_size=5)
     assert os.path.exists(MODEL_DIR), "La directory del modello non è stata creata"
     assert os.path.exists(os.path.join(MODEL_DIR, "config.json")), "File config.json mancante"
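
Note: only the head and tail of the cleanup fixture fall inside the hunks; the yield separating setup from teardown is an assumption about the elided middle:

    import os
    import shutil
    import pytest

    MODEL_DIR = "models/sentiment_model"

    @pytest.fixture(autouse=True)
    def cleanup():
        # Remove any stale model before the test runs...
        if os.path.exists(MODEL_DIR):
            shutil.rmtree(MODEL_DIR)
        yield  # assumed; the diff does not show this line
        # ...and clean up again afterwards
        if os.path.exists(MODEL_DIR):
            shutil.rmtree(MODEL_DIR)
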
tests/unit/test_data.py CHANGED
@@ -6,13 +6,12 @@ from datasets import load_from_disk
 TWEET_PROCESSED_PATH = "data/processed/tweet_eval_tokenized"
 YT_PROCESSED_PATH = "data/processed/youtube_tokenized"
 
+
 def run_data_preparation(dataset_name):
     """Esegue lo script di data preparation per il dataset richiesto."""
     print(f"⚙️ Avvio data_preparation.py per il dataset: {dataset_name}")
-    subprocess.run(
-        ["python", "src/data_preparation.py", "--dataset", dataset_name],
-        check=True
-    )
+    subprocess.run(["python", "src/data_preparation.py", "--dataset", dataset_name], check=True)
+
 
 def test_tweet_eval_dataset_exists_or_create():
     """Controlla o crea il dataset Tweet Eval preprocessato."""
@@ -20,21 +19,25 @@ def test_tweet_eval_dataset_exists_or_create():
         run_data_preparation("tweet_eval")
     assert os.path.exists(TWEET_PROCESSED_PATH), "Tweet Eval non disponibile dopo la preparazione"
 
+
 def test_youtube_dataset_exists_or_create():
     """Controlla o crea il dataset YouTube preprocessato."""
     if not os.path.exists(YT_PROCESSED_PATH):
         run_data_preparation("youtube")
     assert os.path.exists(YT_PROCESSED_PATH), "YouTube dataset non disponibile dopo la preparazione"
 
+
 def test_tweet_eval_structure():
     """Verifica che il dataset Tweet Eval abbia la struttura corretta."""
     ds = load_from_disk(TWEET_PROCESSED_PATH)
     assert "text" in ds["test"].features, "Campo 'text' mancante in Tweet Eval"
     assert "label" in ds["test"].features, "Campo 'label' mancante in Tweet Eval"
 
+
 def test_youtube_structure():
     """Verifica che il dataset YouTube abbia la struttura corretta."""
     ds = load_from_disk(YT_PROCESSED_PATH)
-    assert "CommentText" in ds["train"].features or "CommentText" in ds["train"].features, \
-        "Campo testuale mancante in YouTube dataset"
+    assert (
+        "CommentText" in ds["train"].features or "CommentText" in ds["train"].features
+    ), "Campo testuale mancante in YouTube dataset"
     assert "Sentiment" in ds["train"].features, "Campo 'label' mancante in YouTube dataset"
tests/unit/test_model.py CHANGED
@@ -7,10 +7,12 @@ LABELS = ["negative", "neutral", "positive"]
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
 
+
 def test_model_loads():
     assert model is not None
     assert tokenizer is not None
 
+
 def test_model_prediction_shape():
     text = "I love this product!"
     inputs = tokenizer(text, return_tensors="pt")
@@ -18,9 +20,10 @@ def test_model_prediction_shape():
     outputs = model(**inputs)
     assert outputs.logits.shape[-1] == len(LABELS)
 
+
 def test_sentiment_confidence():
     text = "I hate this"
     inputs = tokenizer(text, return_tensors="pt")
     with torch.no_grad():
         probs = torch.nn.functional.softmax(model(**inputs).logits, dim=-1)
-    assert torch.isclose(probs.sum(), torch.tensor(1.0), atol=1e-3)
+    assert torch.isclose(probs.sum(), torch.tensor(1.0), atol=1e-3)
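
Note: once the probabilities sum to one, the app-side prediction is just an argmax over them, mirroring labels[pred] in src/app.py. A self-contained sketch tying the two together:

    import torch
    from transformers import AutoTokenizer, AutoModelForSequenceClassification

    MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    LABELS = ["negative", "neutral", "positive"]

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

    inputs = tokenizer("I hate this", return_tensors="pt")
    with torch.no_grad():
        probs = torch.nn.functional.softmax(model(**inputs).logits, dim=-1)
    pred = int(probs.argmax(dim=-1))
    print(LABELS[pred], round(float(probs[0, pred]), 3))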