Spaces:

Lordemarco
/

sentiment-fastapi

Sleeping

App Files Files Community

GitHub Actions committed on Nov 5, 2025

Commit

1180a53

1 Parent(s): 0362599

Auto-deploy new version [skip ci]

Browse files

Files changed (9) hide show

src/app.py +30 -46
src/data_preparation.py +17 -13
src/download_data.py +1 -1
src/train_model.py +17 -16
tests/integration/test_app.py +15 -7
tests/integration/test_monitoring.py +1 -4
tests/integration/test_train.py +3 -1
tests/unit/test_data.py +9 -6
tests/unit/test_model.py +4 -1

src/app.py CHANGED Viewed

@@ -4,14 +4,14 @@ from pydantic import BaseModel
 from fastapi.responses import HTMLResponse
 from fastapi.templating import Jinja2Templates
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
-from datasets import  load_from_disk
 import torch
 import random
 import subprocess
 import json
 # Caricamento del modello e dei dati se già scaricati
-MODEL= "cardiffnlp/twitter-roberta-base-sentiment-latest"
 TWEET_PROCESSED_PATH = "data/processed/tweet_eval_tokenized"
 YT_PROCESSED_PATH = "data/processed/youtube_tokenized"
@@ -22,7 +22,6 @@ model = AutoModelForSequenceClassification.from_pretrained(MODEL)
 labels = ["negative", "neutral", "positive"]
 # TWEET EVAL
 if not os.path.exists(TWEET_PROCESSED_PATH):
     print(f"Dataset Tweet Eval non trovato in {TWEET_PROCESSED_PATH}. Lo genero...")
@@ -41,11 +40,10 @@ if not os.path.exists(YT_PROCESSED_PATH):
             subprocess.run(["python", "src/data_preparation.py", "youtube"], check=True)
 youtube_ds = load_from_disk(YT_PROCESSED_PATH)
-app = FastAPI(
-    title="Sentiment Analysis API"
-)
 templates = Jinja2Templates(directory="app_templates/")
 class TextInput(BaseModel):
     text: str
@@ -60,58 +58,53 @@ def predict_sentiment(text: str):
     return {"label": labels[pred], "confidence": round(confidence, 3)}
-@app.get("/",response_class=HTMLResponse)
-async def home( request: Request):
     return templates.TemplateResponse("index.html", {"request": request})
 @app.get("/random_tweet", response_class=HTMLResponse)
 def random_tweet(request: Request):
-  #  sample = random.choice(tweet_eval["test"])
     sample = tweet_eval["test"][random.randrange(len(tweet_eval["test"]))]
-    text = sample["text"] if "text" in sample else tokenizer.decode(sample["input_ids"], skip_special_tokens=True)
     result = predict_sentiment(text)
-    true_label=labels[sample["label"]]
     return templates.TemplateResponse(
         "random_tweet.html",
-        {
-            "request": request,
-            "text": text,
-            "true_label": true_label,
-            "result": result
-        }
     )
 @app.get("/predict", response_class=HTMLResponse)
 def predict_page(request: Request):
     return templates.TemplateResponse("predict.html", {"request": request, "result": None})
 @app.post("/predict", response_class=HTMLResponse)
 def predict_text(request: Request, text: str = Form(...)):
     result = predict_sentiment(text)
     return templates.TemplateResponse(
-        "predict.html",
-        {"request": request, "text": text, "result": result}
     )
 @app.get("/random_youtube_comment", response_class=HTMLResponse)
 def random_youtube_comment(request: Request):
-    sample = random.choice(youtube_ds["train"])
     text = sample["text"] if "text" in sample else sample["text"]
     true_label = sample["label"] if "label" in sample else "N/A"
     if isinstance(true_label, int):
         label_map = {0: "negative", 1: "neutral", 2: "positive"}
         true_label = label_map.get(true_label, "N/A")
@@ -119,16 +112,10 @@ def random_youtube_comment(request: Request):
     return templates.TemplateResponse(
         "random_youtube.html",
-        {
-            "request": request,
-            "text": text,
-            "true_label": true_label,
-            "result": result
-        }
     )
 @app.get("/admin", response_class=HTMLResponse)
 async def admin_dashboard(request: Request):
     """Pagina principale dell'area admin."""
@@ -137,10 +124,8 @@ async def admin_dashboard(request: Request):
     if os.path.exists(metrics_path):
         with open(metrics_path, "r") as f:
             metrics = json.load(f)
-    return templates.TemplateResponse(
-        "admin.html",
-        {"request": request, "metrics": metrics}
-    )
 @app.post("/admin/train")
 async def retrain_model():
@@ -148,12 +133,14 @@ async def retrain_model():
     subprocess.run(["python", "src/train.py"], check=True)
     return {"status": "Training completato"}
 @app.post("/admin/monitor")
 async def run_monitoring():
     """Esegue il monitoring e aggiorna metrics.json."""
     subprocess.run(["python", "src/monitoring.py"], check=True)
     return {"status": "Monitoring completato"}
 @app.get("/admin/metrics", response_class=HTMLResponse)
 def view_metrics(request: Request):
     """Visualizza i risultati del monitoring in forma tabellare e grafica."""
@@ -162,13 +149,10 @@ def view_metrics(request: Request):
     if os.path.exists(metrics_path):
         with open(metrics_path, "r") as f:
             metrics = json.load(f)
-    return templates.TemplateResponse(
-        "metrics.html",
-        {"request": request, "metrics": metrics}
-    )
-if __name__=="__main__":
     import uvicorn
-    uvicorn.run(app,host="0.0.0.0",port=8000)

 from fastapi.responses import HTMLResponse
 from fastapi.templating import Jinja2Templates
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from datasets import load_from_disk
 import torch
 import random
 import subprocess
 import json
 # Caricamento del modello e dei dati se già scaricati
+MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
 TWEET_PROCESSED_PATH = "data/processed/tweet_eval_tokenized"
 YT_PROCESSED_PATH = "data/processed/youtube_tokenized"
 labels = ["negative", "neutral", "positive"]
 # TWEET EVAL
 if not os.path.exists(TWEET_PROCESSED_PATH):
     print(f"Dataset Tweet Eval non trovato in {TWEET_PROCESSED_PATH}. Lo genero...")
             subprocess.run(["python", "src/data_preparation.py", "youtube"], check=True)
 youtube_ds = load_from_disk(YT_PROCESSED_PATH)
+app = FastAPI(title="Sentiment Analysis API")
 templates = Jinja2Templates(directory="app_templates/")
 class TextInput(BaseModel):
     text: str
     return {"label": labels[pred], "confidence": round(confidence, 3)}
+@app.get("/", response_class=HTMLResponse)
+async def home(request: Request):
     return templates.TemplateResponse("index.html", {"request": request})
 @app.get("/random_tweet", response_class=HTMLResponse)
 def random_tweet(request: Request):
+    #  sample = random.choice(tweet_eval["test"])
     sample = tweet_eval["test"][random.randrange(len(tweet_eval["test"]))]
+    text = (
+        sample["text"]
+        if "text" in sample
+        else tokenizer.decode(sample["input_ids"], skip_special_tokens=True)
+    )
     result = predict_sentiment(text)
+    true_label = labels[sample["label"]]
     return templates.TemplateResponse(
         "random_tweet.html",
+        {"request": request, "text": text, "true_label": true_label, "result": result},
     )
 @app.get("/predict", response_class=HTMLResponse)
 def predict_page(request: Request):
     return templates.TemplateResponse("predict.html", {"request": request, "result": None})
 @app.post("/predict", response_class=HTMLResponse)
 def predict_text(request: Request, text: str = Form(...)):
     result = predict_sentiment(text)
     return templates.TemplateResponse(
+        "predict.html", {"request": request, "text": text, "result": result}
     )
 @app.get("/random_youtube_comment", response_class=HTMLResponse)
 def random_youtube_comment(request: Request):
+    sample = random.choice(youtube_ds["train"])
     text = sample["text"] if "text" in sample else sample["text"]
     true_label = sample["label"] if "label" in sample else "N/A"
     if isinstance(true_label, int):
         label_map = {0: "negative", 1: "neutral", 2: "positive"}
         true_label = label_map.get(true_label, "N/A")
     return templates.TemplateResponse(
         "random_youtube.html",
+        {"request": request, "text": text, "true_label": true_label, "result": result},
     )
 @app.get("/admin", response_class=HTMLResponse)
 async def admin_dashboard(request: Request):
     """Pagina principale dell'area admin."""
     if os.path.exists(metrics_path):
         with open(metrics_path, "r") as f:
             metrics = json.load(f)
+    return templates.TemplateResponse("admin.html", {"request": request, "metrics": metrics})
 @app.post("/admin/train")
 async def retrain_model():
     subprocess.run(["python", "src/train.py"], check=True)
     return {"status": "Training completato"}
 @app.post("/admin/monitor")
 async def run_monitoring():
     """Esegue il monitoring e aggiorna metrics.json."""
     subprocess.run(["python", "src/monitoring.py"], check=True)
     return {"status": "Monitoring completato"}
 @app.get("/admin/metrics", response_class=HTMLResponse)
 def view_metrics(request: Request):
     """Visualizza i risultati del monitoring in forma tabellare e grafica."""
     if os.path.exists(metrics_path):
         with open(metrics_path, "r") as f:
             metrics = json.load(f)
+    return templates.TemplateResponse("metrics.html", {"request": request, "metrics": metrics})
+if __name__ == "__main__":
     import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)

src/data_preparation.py CHANGED Viewed

@@ -12,7 +12,8 @@ PROCESSED_DIR = "data/processed/"
 os.makedirs(PROCESSED_DIR, exist_ok=True)
-#     FUNZIONI DI SUPPORTO
 def clean_text(text):
     """Pulisce il testo da URL, menzioni, hashtag, simboli HTML"""
@@ -23,6 +24,7 @@ def clean_text(text):
     text = re.sub(r"\s+", " ", text)
     return text.strip()
 def map_label(label):
     """Mappa le etichette di sentiment a numeri"""
     mapping = {"negative": 0, "neutral": 1, "positive": 2}
@@ -30,9 +32,11 @@ def map_label(label):
         return mapping.get(label.lower(), 1)
     return label
 # Tokenizer globale
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 def tokenize_function(examples):
     return tokenizer(
         examples["text"],
@@ -42,12 +46,11 @@ def tokenize_function(examples):
     )
 # ----------------------------- #
 #   PREPARAZIONE DEI DATASET    #
 # ----------------------------- #
 def safe_load_dataset(name, config=None, max_retries=3, fallback_data=None):
     """
     Gestisce i retry del download e crea un dataset di fallback se fallisce.
@@ -79,7 +82,9 @@ def prepare_tweet_eval(tokenizer, output_path):
         reduced_splits = {}
         for split in ds.keys():
             reduced_splits[split] = ds[split].select(range(min(1000, len(ds[split]))))
-            reduced_splits[split] = reduced_splits[split].map(lambda x: {"text": clean_text(x["text"])})
             reduced_splits[split] = reduced_splits[split].map(tokenize_function, batched=True)
         ds = DatasetDict(reduced_splits)
     else:
@@ -98,7 +103,7 @@ def prepare_youtube(tokenizer, output_path):
         "Sentiment": ["positive", "negative", "neutral", "positive", "negative"],
     }
     ds = safe_load_dataset("AmaanP314/youtube-comment-sentiment", fallback_data=fallback_data)
     if isinstance(ds, dict) or "train" in ds:
         reduced_splits = {}
         for split in ds.keys():
@@ -112,7 +117,7 @@ def prepare_youtube(tokenizer, output_path):
             reduced_splits[split] = reduced_splits[split].map(tokenize_function, batched=True)
         ds = DatasetDict(reduced_splits)
     else:
         ds = ds.select(range(min(1000, len(ds))))
         ds = ds.map(
             lambda x: {
@@ -120,19 +125,18 @@ def prepare_youtube(tokenizer, output_path):
                 "label": map_label(x["Sentiment"]),
             }
         )
-  #  ds = ds.map(lambda x: {"text": clean_text(x["CommentText"])})
-  #  ds = ds.map(lambda x: {"label": map_label(x["Sentiment"])})
-   # ds = ds.map(tokenize_function, batched=True)
     ds.save_to_disk(output_path)
     print(f"Dataset YouTube salvato in {output_path}")
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Prepara dataset per sentiment analysis.")
-    parser.add_argument("dataset", choices=["tweet_eval", "youtube"], help="Nome del dataset da preparare.")
     args = parser.parse_args()
     if args.dataset == "tweet_eval":

 os.makedirs(PROCESSED_DIR, exist_ok=True)
+#     FUNZIONI DI SUPPORTO
 def clean_text(text):
     """Pulisce il testo da URL, menzioni, hashtag, simboli HTML"""
     text = re.sub(r"\s+", " ", text)
     return text.strip()
 def map_label(label):
     """Mappa le etichette di sentiment a numeri"""
     mapping = {"negative": 0, "neutral": 1, "positive": 2}
         return mapping.get(label.lower(), 1)
     return label
 # Tokenizer globale
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 def tokenize_function(examples):
     return tokenizer(
         examples["text"],
     )
 # ----------------------------- #
 #   PREPARAZIONE DEI DATASET    #
 # ----------------------------- #
 def safe_load_dataset(name, config=None, max_retries=3, fallback_data=None):
     """
     Gestisce i retry del download e crea un dataset di fallback se fallisce.
         reduced_splits = {}
         for split in ds.keys():
             reduced_splits[split] = ds[split].select(range(min(1000, len(ds[split]))))
+            reduced_splits[split] = reduced_splits[split].map(
+                lambda x: {"text": clean_text(x["text"])}
+            )
             reduced_splits[split] = reduced_splits[split].map(tokenize_function, batched=True)
         ds = DatasetDict(reduced_splits)
     else:
         "Sentiment": ["positive", "negative", "neutral", "positive", "negative"],
     }
     ds = safe_load_dataset("AmaanP314/youtube-comment-sentiment", fallback_data=fallback_data)
     if isinstance(ds, dict) or "train" in ds:
         reduced_splits = {}
         for split in ds.keys():
             reduced_splits[split] = reduced_splits[split].map(tokenize_function, batched=True)
         ds = DatasetDict(reduced_splits)
     else:
         ds = ds.select(range(min(1000, len(ds))))
         ds = ds.map(
             lambda x: {
                 "label": map_label(x["Sentiment"]),
             }
         )
+    #  ds = ds.map(lambda x: {"text": clean_text(x["CommentText"])})
+    #  ds = ds.map(lambda x: {"label": map_label(x["Sentiment"])})
+    # ds = ds.map(tokenize_function, batched=True)
     ds.save_to_disk(output_path)
     print(f"Dataset YouTube salvato in {output_path}")
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Prepara dataset per sentiment analysis.")
+    parser.add_argument(
+        "dataset", choices=["tweet_eval", "youtube"], help="Nome del dataset da preparare."
+    )
     args = parser.parse_args()
     if args.dataset == "tweet_eval":

src/download_data.py CHANGED Viewed

@@ -5,4 +5,4 @@ dataset.save_to_disk("/workspaces/MLOps_Project_SentimentAnalysis/data/raw/tweet
 dataset2 = load_dataset("AmaanP314/youtube-comment-sentiment")
-dataset2.save_to_disk("/workspaces/MLOps_Project_SentimentAnalysis/data/youtube-comment-sentiment")


5
6
7	dataset2 = load_dataset("AmaanP314/youtube-comment-sentiment")
8	+ dataset2.save_to_disk("/workspaces/MLOps_Project_SentimentAnalysis/data/youtube-comment-sentiment")

src/train_model.py CHANGED Viewed

@@ -1,10 +1,5 @@
-from transformers import (
-    AutoModelForSequenceClassification,
-    Trainer,
-    TrainingArguments
-)
-from datasets import load_from_disk,concatenate_datasets
 import evaluate
 import numpy as np
 import os
@@ -18,7 +13,8 @@ hf_token = os.environ["HF_TOKEN"]
 MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"
 DATA_PATH = "data/processed/tweet_eval_tokenized"
 OUTPUT_DIR = "models/sentiment_model"
-HF_REPO = "Lordemarco/SentimentAnalysis"
 def compute_metrics(eval_pred):
     """Calcola metriche standard: accuracy e F1."""
@@ -31,19 +27,24 @@ def compute_metrics(eval_pred):
     return {"accuracy": acc["accuracy"], "f1": f1["f1"]}
-def train_model(additional_data=None,sample_train_size=1000, sample_eval_size=300,output_dir=OUTPUT_DIR):
     print("Caricamento dataset Tweet eval preprocessato")
     dataset = load_from_disk(DATA_PATH)
     if additional_data is not None:
         print("Aggiungo dati YouTube al training set...")
         dataset["train"] = concatenate_datasets([dataset["train"], additional_data])
-    #
-    print(f"Riduzione dataset: {sample_train_size} per il train, {sample_eval_size} per la validazione.")
     train_data = dataset["train"].select(range(min(sample_train_size, len(dataset["train"]))))
-    eval_data = dataset["validation"].select(range(min(sample_eval_size, len(dataset["validation"]))))
     model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
     # Parametri training
@@ -57,7 +58,7 @@ def train_model(additional_data=None,sample_train_size=1000, sample_eval_size=30
         logging_dir="./logs",
         logging_steps=10,
         load_best_model_at_end=True,
-        report_to="none",
     )
     print("Avvio training")
@@ -74,11 +75,11 @@ def train_model(additional_data=None,sample_train_size=1000, sample_eval_size=30
     os.makedirs(output_dir, exist_ok=True)
     trainer.save_model(output_dir)
     print(f"Modello salvato in: {OUTPUT_DIR}")
     if os.getenv("HF_TOKEN"):
         print("Pushing model to Hugging Face Hub...")
         trainer.push_to_hub("Lordemarco/SentimentAnalysis")
 if __name__ == "__main__":
     train_model()

+from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
+from datasets import load_from_disk, concatenate_datasets
 import evaluate
 import numpy as np
 import os
 MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"
 DATA_PATH = "data/processed/tweet_eval_tokenized"
 OUTPUT_DIR = "models/sentiment_model"
+HF_REPO = "Lordemarco/SentimentAnalysis"
 def compute_metrics(eval_pred):
     """Calcola metriche standard: accuracy e F1."""
     return {"accuracy": acc["accuracy"], "f1": f1["f1"]}
+def train_model(
+    additional_data=None, sample_train_size=1000, sample_eval_size=300, output_dir=OUTPUT_DIR
+):
     print("Caricamento dataset Tweet eval preprocessato")
     dataset = load_from_disk(DATA_PATH)
     if additional_data is not None:
         print("Aggiungo dati YouTube al training set...")
         dataset["train"] = concatenate_datasets([dataset["train"], additional_data])
+    #
+    print(
+        f"Riduzione dataset: {sample_train_size} per il train, {sample_eval_size} per la validazione."
+    )
     train_data = dataset["train"].select(range(min(sample_train_size, len(dataset["train"]))))
+    eval_data = dataset["validation"].select(
+        range(min(sample_eval_size, len(dataset["validation"])))
+    )
     model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
     # Parametri training
         logging_dir="./logs",
         logging_steps=10,
         load_best_model_at_end=True,
+        report_to="none",
     )
     print("Avvio training")
     os.makedirs(output_dir, exist_ok=True)
     trainer.save_model(output_dir)
     print(f"Modello salvato in: {OUTPUT_DIR}")
     if os.getenv("HF_TOKEN"):
         print("Pushing model to Hugging Face Hub...")
         trainer.push_to_hub("Lordemarco/SentimentAnalysis")
 if __name__ == "__main__":
     train_model()

tests/integration/test_app.py CHANGED Viewed

@@ -1,35 +1,43 @@
 from fastapi.testclient import TestClient
 from src.app import app
-import os
 os.environ["SKIP_DATA_PREP"] = "true"
 client = TestClient(app)
 def test_home_page():
     response = client.get("/")
     assert response.status_code == 200
     assert "Benvenuto" in response.text
 def test_predict_endpoint_get():
     response = client.get("/predict")
     assert response.status_code == 200
     assert "Testa il Modello" in response.text
 def test_predict_endpoint_post():
     response = client.post("/predict", data={"text": "I love this!"})
     assert response.status_code == 200
     assert any(label in response.text for label in ["positive", "neutral", "negative"])
 def test_random_tweet_page():
     response = client.get("/random_tweet")
     assert response.status_code == 200
-    assert any(lbl in response.text for lbl in ["positive", "neutral", "negative", "Positivo", "Neutro", "Negativo"])
 def test_random_youtube_page():
     response = client.get("/random_youtube_comment")
     assert response.status_code == 200
-    assert any(lbl in response.text for lbl in ["positive", "neutral", "negative", "Positivo", "Neutro", "Negativo"])

 from fastapi.testclient import TestClient
 from src.app import app
+import os
 os.environ["SKIP_DATA_PREP"] = "true"
 client = TestClient(app)
 def test_home_page():
     response = client.get("/")
     assert response.status_code == 200
     assert "Benvenuto" in response.text
 def test_predict_endpoint_get():
     response = client.get("/predict")
     assert response.status_code == 200
     assert "Testa il Modello" in response.text
 def test_predict_endpoint_post():
     response = client.post("/predict", data={"text": "I love this!"})
     assert response.status_code == 200
     assert any(label in response.text for label in ["positive", "neutral", "negative"])
 def test_random_tweet_page():
     response = client.get("/random_tweet")
     assert response.status_code == 200
+    assert any(
+        lbl in response.text
+        for lbl in ["positive", "neutral", "negative", "Positivo", "Neutro", "Negativo"]
+    )
 def test_random_youtube_page():
     response = client.get("/random_youtube_comment")
     assert response.status_code == 200
+    assert any(
+        lbl in response.text
+        for lbl in ["positive", "neutral", "negative", "Positivo", "Neutro", "Negativo"]
+    )

tests/integration/test_monitoring.py CHANGED Viewed

@@ -18,19 +18,16 @@ def cleanup_metrics():
 def test_monitoring_creates_metrics():
     """Verifica che il monitoring crei correttamente il file metrics.json e contenga i dati previsti."""
-    main()
     assert os.path.exists(METRICS_PATH), "metrics.json non è stato generato"
     with open(METRICS_PATH, "r") as f:
         metrics = json.load(f)
     assert "TweetEval" in metrics, "Mancano metriche TweetEval"
     assert "YouTube" in metrics, "Mancano metriche YouTube"
     for dataset_name, data in metrics.items():
         assert "accuracy" in data, f"Manca accuracy per {dataset_name}"
         assert "f1" in data, f"Manca F1 per {dataset_name}"

 def test_monitoring_creates_metrics():
     """Verifica che il monitoring crei correttamente il file metrics.json e contenga i dati previsti."""
+    main()
     assert os.path.exists(METRICS_PATH), "metrics.json non è stato generato"
     with open(METRICS_PATH, "r") as f:
         metrics = json.load(f)
     assert "TweetEval" in metrics, "Mancano metriche TweetEval"
     assert "YouTube" in metrics, "Mancano metriche YouTube"
     for dataset_name, data in metrics.items():
         assert "accuracy" in data, f"Manca accuracy per {dataset_name}"
         assert "f1" in data, f"Manca F1 per {dataset_name}"

tests/integration/test_train.py CHANGED Viewed

@@ -5,6 +5,7 @@ from src.train_model import train_model
 MODEL_DIR = "models/sentiment_model"
 @pytest.fixture(autouse=True)
 def cleanup():
     if os.path.exists(MODEL_DIR):
@@ -13,8 +14,9 @@ def cleanup():
     if os.path.exists(MODEL_DIR):
         shutil.rmtree(MODEL_DIR)
 def test_train_model_runs():
     """Testa che il training parta e salvi un modello."""
-    train_model(sample_train_size=10, sample_eval_size=5)
     assert os.path.exists(MODEL_DIR), "La directory del modello non è stata creata"
     assert os.path.exists(os.path.join(MODEL_DIR, "config.json")), "File config.json mancante"

 MODEL_DIR = "models/sentiment_model"
 @pytest.fixture(autouse=True)
 def cleanup():
     if os.path.exists(MODEL_DIR):
     if os.path.exists(MODEL_DIR):
         shutil.rmtree(MODEL_DIR)
 def test_train_model_runs():
     """Testa che il training parta e salvi un modello."""
+    train_model(sample_train_size=10, sample_eval_size=5)
     assert os.path.exists(MODEL_DIR), "La directory del modello non è stata creata"
     assert os.path.exists(os.path.join(MODEL_DIR, "config.json")), "File config.json mancante"

tests/unit/test_data.py CHANGED Viewed

@@ -6,13 +6,12 @@ from datasets import load_from_disk
 TWEET_PROCESSED_PATH = "data/processed/tweet_eval_tokenized"
 YT_PROCESSED_PATH = "data/processed/youtube_tokenized"
 def run_data_preparation(dataset_name):
     """Esegue lo script di data preparation per il dataset richiesto."""
     print(f"⚙️  Avvio data_preparation.py per il dataset: {dataset_name}")
-    subprocess.run(
-        ["python", "src/data_preparation.py", "--dataset", dataset_name],
-        check=True
-    )
 def test_tweet_eval_dataset_exists_or_create():
     """Controlla o crea il dataset Tweet Eval preprocessato."""
@@ -20,21 +19,25 @@ def test_tweet_eval_dataset_exists_or_create():
         run_data_preparation("tweet_eval")
     assert os.path.exists(TWEET_PROCESSED_PATH), "Tweet Eval non disponibile dopo la preparazione"
 def test_youtube_dataset_exists_or_create():
     """Controlla o crea il dataset YouTube preprocessato."""
     if not os.path.exists(YT_PROCESSED_PATH):
         run_data_preparation("youtube")
     assert os.path.exists(YT_PROCESSED_PATH), "YouTube dataset non disponibile dopo la preparazione"
 def test_tweet_eval_structure():
     """Verifica che il dataset Tweet Eval abbia la struttura corretta."""
     ds = load_from_disk(TWEET_PROCESSED_PATH)
     assert "text" in ds["test"].features, "Campo 'text' mancante in Tweet Eval"
     assert "label" in ds["test"].features, "Campo 'label' mancante in Tweet Eval"
 def test_youtube_structure():
     """Verifica che il dataset YouTube abbia la struttura corretta."""
     ds = load_from_disk(YT_PROCESSED_PATH)
-    assert "CommentText" in ds["train"].features or "CommentText" in ds["train"].features, \
-        "Campo testuale mancante in YouTube dataset"
     assert "Sentiment" in ds["train"].features, "Campo 'label' mancante in YouTube dataset"

 TWEET_PROCESSED_PATH = "data/processed/tweet_eval_tokenized"
 YT_PROCESSED_PATH = "data/processed/youtube_tokenized"
 def run_data_preparation(dataset_name):
     """Esegue lo script di data preparation per il dataset richiesto."""
     print(f"⚙️  Avvio data_preparation.py per il dataset: {dataset_name}")
+    subprocess.run(["python", "src/data_preparation.py", "--dataset", dataset_name], check=True)
 def test_tweet_eval_dataset_exists_or_create():
     """Controlla o crea il dataset Tweet Eval preprocessato."""
         run_data_preparation("tweet_eval")
     assert os.path.exists(TWEET_PROCESSED_PATH), "Tweet Eval non disponibile dopo la preparazione"
 def test_youtube_dataset_exists_or_create():
     """Controlla o crea il dataset YouTube preprocessato."""
     if not os.path.exists(YT_PROCESSED_PATH):
         run_data_preparation("youtube")
     assert os.path.exists(YT_PROCESSED_PATH), "YouTube dataset non disponibile dopo la preparazione"
 def test_tweet_eval_structure():
     """Verifica che il dataset Tweet Eval abbia la struttura corretta."""
     ds = load_from_disk(TWEET_PROCESSED_PATH)
     assert "text" in ds["test"].features, "Campo 'text' mancante in Tweet Eval"
     assert "label" in ds["test"].features, "Campo 'label' mancante in Tweet Eval"
 def test_youtube_structure():
     """Verifica che il dataset YouTube abbia la struttura corretta."""
     ds = load_from_disk(YT_PROCESSED_PATH)
+    assert (
+        "CommentText" in ds["train"].features or "CommentText" in ds["train"].features
+    ), "Campo testuale mancante in YouTube dataset"
     assert "Sentiment" in ds["train"].features, "Campo 'label' mancante in YouTube dataset"

tests/unit/test_model.py CHANGED Viewed

@@ -7,10 +7,12 @@ LABELS = ["negative", "neutral", "positive"]
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
 def test_model_loads():
     assert model is not None
     assert tokenizer is not None
 def test_model_prediction_shape():
     text = "I love this product!"
     inputs = tokenizer(text, return_tensors="pt")
@@ -18,9 +20,10 @@ def test_model_prediction_shape():
         outputs = model(**inputs)
     assert outputs.logits.shape[-1] == len(LABELS)
 def test_sentiment_confidence():
     text = "I hate this"
     inputs = tokenizer(text, return_tensors="pt")
     with torch.no_grad():
         probs = torch.nn.functional.softmax(model(**inputs).logits, dim=-1)
-    assert torch.isclose(probs.sum(), torch.tensor(1.0), atol=1e-3)

 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
 def test_model_loads():
     assert model is not None
     assert tokenizer is not None
 def test_model_prediction_shape():
     text = "I love this product!"
     inputs = tokenizer(text, return_tensors="pt")
         outputs = model(**inputs)
     assert outputs.logits.shape[-1] == len(LABELS)
 def test_sentiment_confidence():
     text = "I hate this"
     inputs = tokenizer(text, return_tensors="pt")
     with torch.no_grad():
         probs = torch.nn.functional.softmax(model(**inputs).logits, dim=-1)
+    assert torch.isclose(probs.sum(), torch.tensor(1.0), atol=1e-3)