Spaces:
Sleeping
Sleeping
GitHub Actions committed on
Commit ·
1180a53
1
Parent(s): 0362599
Auto-deploy new version [skip ci]
Browse files
- src/app.py +30 -46
- src/data_preparation.py +17 -13
- src/download_data.py +1 -1
- src/train_model.py +17 -16
- tests/integration/test_app.py +15 -7
- tests/integration/test_monitoring.py +1 -4
- tests/integration/test_train.py +3 -1
- tests/unit/test_data.py +9 -6
- tests/unit/test_model.py +4 -1
src/app.py
CHANGED
|
@@ -4,14 +4,14 @@ from pydantic import BaseModel
|
|
| 4 |
from fastapi.responses import HTMLResponse
|
| 5 |
from fastapi.templating import Jinja2Templates
|
| 6 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 7 |
-
from datasets import
|
| 8 |
import torch
|
| 9 |
import random
|
| 10 |
import subprocess
|
| 11 |
import json
|
| 12 |
|
| 13 |
# Caricamento del modello e dei dati se già scaricati
|
| 14 |
-
MODEL= "cardiffnlp/twitter-roberta-base-sentiment-latest"
|
| 15 |
TWEET_PROCESSED_PATH = "data/processed/tweet_eval_tokenized"
|
| 16 |
YT_PROCESSED_PATH = "data/processed/youtube_tokenized"
|
| 17 |
|
|
@@ -22,7 +22,6 @@ model = AutoModelForSequenceClassification.from_pretrained(MODEL)
|
|
| 22 |
labels = ["negative", "neutral", "positive"]
|
| 23 |
|
| 24 |
|
| 25 |
-
|
| 26 |
# TWEET EVAL
|
| 27 |
if not os.path.exists(TWEET_PROCESSED_PATH):
|
| 28 |
print(f"Dataset Tweet Eval non trovato in {TWEET_PROCESSED_PATH}. Lo genero...")
|
|
@@ -41,11 +40,10 @@ if not os.path.exists(YT_PROCESSED_PATH):
|
|
| 41 |
subprocess.run(["python", "src/data_preparation.py", "youtube"], check=True)
|
| 42 |
youtube_ds = load_from_disk(YT_PROCESSED_PATH)
|
| 43 |
|
| 44 |
-
app = FastAPI(
|
| 45 |
-
title="Sentiment Analysis API"
|
| 46 |
-
)
|
| 47 |
templates = Jinja2Templates(directory="app_templates/")
|
| 48 |
|
|
|
|
| 49 |
class TextInput(BaseModel):
|
| 50 |
text: str
|
| 51 |
|
|
@@ -60,58 +58,53 @@ def predict_sentiment(text: str):
|
|
| 60 |
return {"label": labels[pred], "confidence": round(confidence, 3)}
|
| 61 |
|
| 62 |
|
| 63 |
-
@app.get("/",response_class=HTMLResponse)
|
| 64 |
-
async def home(
|
| 65 |
return templates.TemplateResponse("index.html", {"request": request})
|
| 66 |
-
|
|
|
|
| 67 |
@app.get("/random_tweet", response_class=HTMLResponse)
|
| 68 |
def random_tweet(request: Request):
|
| 69 |
-
|
| 70 |
sample = tweet_eval["test"][random.randrange(len(tweet_eval["test"]))]
|
| 71 |
-
text =
|
| 72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
result = predict_sentiment(text)
|
| 74 |
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
true_label=labels[sample["label"]]
|
| 78 |
|
| 79 |
return templates.TemplateResponse(
|
| 80 |
"random_tweet.html",
|
| 81 |
-
{
|
| 82 |
-
"request": request,
|
| 83 |
-
"text": text,
|
| 84 |
-
"true_label": true_label,
|
| 85 |
-
"result": result
|
| 86 |
-
}
|
| 87 |
)
|
| 88 |
|
| 89 |
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
@app.get("/predict", response_class=HTMLResponse)
|
| 94 |
def predict_page(request: Request):
|
| 95 |
return templates.TemplateResponse("predict.html", {"request": request, "result": None})
|
| 96 |
|
|
|
|
| 97 |
@app.post("/predict", response_class=HTMLResponse)
|
| 98 |
def predict_text(request: Request, text: str = Form(...)):
|
| 99 |
result = predict_sentiment(text)
|
| 100 |
return templates.TemplateResponse(
|
| 101 |
-
"predict.html",
|
| 102 |
-
{"request": request, "text": text, "result": result}
|
| 103 |
)
|
| 104 |
|
| 105 |
|
| 106 |
@app.get("/random_youtube_comment", response_class=HTMLResponse)
|
| 107 |
def random_youtube_comment(request: Request):
|
| 108 |
-
sample = random.choice(youtube_ds["train"])
|
| 109 |
|
| 110 |
text = sample["text"] if "text" in sample else sample["text"]
|
| 111 |
true_label = sample["label"] if "label" in sample else "N/A"
|
| 112 |
|
| 113 |
if isinstance(true_label, int):
|
| 114 |
-
|
| 115 |
label_map = {0: "negative", 1: "neutral", 2: "positive"}
|
| 116 |
true_label = label_map.get(true_label, "N/A")
|
| 117 |
|
|
@@ -119,16 +112,10 @@ def random_youtube_comment(request: Request):
|
|
| 119 |
|
| 120 |
return templates.TemplateResponse(
|
| 121 |
"random_youtube.html",
|
| 122 |
-
{
|
| 123 |
-
"request": request,
|
| 124 |
-
"text": text,
|
| 125 |
-
"true_label": true_label,
|
| 126 |
-
"result": result
|
| 127 |
-
}
|
| 128 |
)
|
| 129 |
|
| 130 |
|
| 131 |
-
|
| 132 |
@app.get("/admin", response_class=HTMLResponse)
|
| 133 |
async def admin_dashboard(request: Request):
|
| 134 |
"""Pagina principale dell'area admin."""
|
|
@@ -137,10 +124,8 @@ async def admin_dashboard(request: Request):
|
|
| 137 |
if os.path.exists(metrics_path):
|
| 138 |
with open(metrics_path, "r") as f:
|
| 139 |
metrics = json.load(f)
|
| 140 |
-
return templates.TemplateResponse(
|
| 141 |
-
|
| 142 |
-
{"request": request, "metrics": metrics}
|
| 143 |
-
)
|
| 144 |
|
| 145 |
@app.post("/admin/train")
|
| 146 |
async def retrain_model():
|
|
@@ -148,12 +133,14 @@ async def retrain_model():
|
|
| 148 |
subprocess.run(["python", "src/train.py"], check=True)
|
| 149 |
return {"status": "Training completato"}
|
| 150 |
|
|
|
|
| 151 |
@app.post("/admin/monitor")
|
| 152 |
async def run_monitoring():
|
| 153 |
"""Esegue il monitoring e aggiorna metrics.json."""
|
| 154 |
subprocess.run(["python", "src/monitoring.py"], check=True)
|
| 155 |
return {"status": "Monitoring completato"}
|
| 156 |
|
|
|
|
| 157 |
@app.get("/admin/metrics", response_class=HTMLResponse)
|
| 158 |
def view_metrics(request: Request):
|
| 159 |
"""Visualizza i risultati del monitoring in forma tabellare e grafica."""
|
|
@@ -162,13 +149,10 @@ def view_metrics(request: Request):
|
|
| 162 |
if os.path.exists(metrics_path):
|
| 163 |
with open(metrics_path, "r") as f:
|
| 164 |
metrics = json.load(f)
|
| 165 |
-
return templates.TemplateResponse(
|
| 166 |
-
"metrics.html",
|
| 167 |
-
{"request": request, "metrics": metrics}
|
| 168 |
-
)
|
| 169 |
|
| 170 |
|
| 171 |
-
|
| 172 |
-
if __name__=="__main__":
|
| 173 |
import uvicorn
|
| 174 |
-
|
|
|
|
|
|
| 4 |
from fastapi.responses import HTMLResponse
|
| 5 |
from fastapi.templating import Jinja2Templates
|
| 6 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 7 |
+
from datasets import load_from_disk
|
| 8 |
import torch
|
| 9 |
import random
|
| 10 |
import subprocess
|
| 11 |
import json
|
| 12 |
|
| 13 |
# Caricamento del modello e dei dati se già scaricati
|
| 14 |
+
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
|
| 15 |
TWEET_PROCESSED_PATH = "data/processed/tweet_eval_tokenized"
|
| 16 |
YT_PROCESSED_PATH = "data/processed/youtube_tokenized"
|
| 17 |
|
|
|
|
| 22 |
labels = ["negative", "neutral", "positive"]
|
| 23 |
|
| 24 |
|
|
|
|
| 25 |
# TWEET EVAL
|
| 26 |
if not os.path.exists(TWEET_PROCESSED_PATH):
|
| 27 |
print(f"Dataset Tweet Eval non trovato in {TWEET_PROCESSED_PATH}. Lo genero...")
|
|
|
|
| 40 |
subprocess.run(["python", "src/data_preparation.py", "youtube"], check=True)
|
| 41 |
youtube_ds = load_from_disk(YT_PROCESSED_PATH)
|
| 42 |
|
| 43 |
+
app = FastAPI(title="Sentiment Analysis API")
|
|
|
|
|
|
|
| 44 |
templates = Jinja2Templates(directory="app_templates/")
|
| 45 |
|
| 46 |
+
|
| 47 |
class TextInput(BaseModel):
|
| 48 |
text: str
|
| 49 |
|
|
|
|
| 58 |
return {"label": labels[pred], "confidence": round(confidence, 3)}
|
| 59 |
|
| 60 |
|
| 61 |
+
@app.get("/", response_class=HTMLResponse)
|
| 62 |
+
async def home(request: Request):
|
| 63 |
return templates.TemplateResponse("index.html", {"request": request})
|
| 64 |
+
|
| 65 |
+
|
| 66 |
@app.get("/random_tweet", response_class=HTMLResponse)
|
| 67 |
def random_tweet(request: Request):
|
| 68 |
+
# sample = random.choice(tweet_eval["test"])
|
| 69 |
sample = tweet_eval["test"][random.randrange(len(tweet_eval["test"]))]
|
| 70 |
+
text = (
|
| 71 |
+
sample["text"]
|
| 72 |
+
if "text" in sample
|
| 73 |
+
else tokenizer.decode(sample["input_ids"], skip_special_tokens=True)
|
| 74 |
+
)
|
| 75 |
+
|
| 76 |
result = predict_sentiment(text)
|
| 77 |
|
| 78 |
+
true_label = labels[sample["label"]]
|
|
|
|
|
|
|
| 79 |
|
| 80 |
return templates.TemplateResponse(
|
| 81 |
"random_tweet.html",
|
| 82 |
+
{"request": request, "text": text, "true_label": true_label, "result": result},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
)
|
| 84 |
|
| 85 |
|
|
|
|
|
|
|
|
|
|
| 86 |
@app.get("/predict", response_class=HTMLResponse)
|
| 87 |
def predict_page(request: Request):
|
| 88 |
return templates.TemplateResponse("predict.html", {"request": request, "result": None})
|
| 89 |
|
| 90 |
+
|
| 91 |
@app.post("/predict", response_class=HTMLResponse)
|
| 92 |
def predict_text(request: Request, text: str = Form(...)):
|
| 93 |
result = predict_sentiment(text)
|
| 94 |
return templates.TemplateResponse(
|
| 95 |
+
"predict.html", {"request": request, "text": text, "result": result}
|
|
|
|
| 96 |
)
|
| 97 |
|
| 98 |
|
| 99 |
@app.get("/random_youtube_comment", response_class=HTMLResponse)
|
| 100 |
def random_youtube_comment(request: Request):
|
| 101 |
+
sample = random.choice(youtube_ds["train"])
|
| 102 |
|
| 103 |
text = sample["text"] if "text" in sample else sample["text"]
|
| 104 |
true_label = sample["label"] if "label" in sample else "N/A"
|
| 105 |
|
| 106 |
if isinstance(true_label, int):
|
| 107 |
+
|
| 108 |
label_map = {0: "negative", 1: "neutral", 2: "positive"}
|
| 109 |
true_label = label_map.get(true_label, "N/A")
|
| 110 |
|
|
|
|
| 112 |
|
| 113 |
return templates.TemplateResponse(
|
| 114 |
"random_youtube.html",
|
| 115 |
+
{"request": request, "text": text, "true_label": true_label, "result": result},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
)
|
| 117 |
|
| 118 |
|
|
|
|
| 119 |
@app.get("/admin", response_class=HTMLResponse)
|
| 120 |
async def admin_dashboard(request: Request):
|
| 121 |
"""Pagina principale dell'area admin."""
|
|
|
|
| 124 |
if os.path.exists(metrics_path):
|
| 125 |
with open(metrics_path, "r") as f:
|
| 126 |
metrics = json.load(f)
|
| 127 |
+
return templates.TemplateResponse("admin.html", {"request": request, "metrics": metrics})
|
| 128 |
+
|
|
|
|
|
|
|
| 129 |
|
| 130 |
@app.post("/admin/train")
|
| 131 |
async def retrain_model():
|
|
|
|
| 133 |
subprocess.run(["python", "src/train.py"], check=True)
|
| 134 |
return {"status": "Training completato"}
|
| 135 |
|
| 136 |
+
|
| 137 |
@app.post("/admin/monitor")
|
| 138 |
async def run_monitoring():
|
| 139 |
"""Esegue il monitoring e aggiorna metrics.json."""
|
| 140 |
subprocess.run(["python", "src/monitoring.py"], check=True)
|
| 141 |
return {"status": "Monitoring completato"}
|
| 142 |
|
| 143 |
+
|
| 144 |
@app.get("/admin/metrics", response_class=HTMLResponse)
|
| 145 |
def view_metrics(request: Request):
|
| 146 |
"""Visualizza i risultati del monitoring in forma tabellare e grafica."""
|
|
|
|
| 149 |
if os.path.exists(metrics_path):
|
| 150 |
with open(metrics_path, "r") as f:
|
| 151 |
metrics = json.load(f)
|
| 152 |
+
return templates.TemplateResponse("metrics.html", {"request": request, "metrics": metrics})
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
|
| 155 |
+
if __name__ == "__main__":
|
|
|
|
| 156 |
import uvicorn
|
| 157 |
+
|
| 158 |
+
uvicorn.run(app, host="0.0.0.0", port=8000)
|
src/data_preparation.py
CHANGED
|
@@ -12,7 +12,8 @@ PROCESSED_DIR = "data/processed/"
|
|
| 12 |
os.makedirs(PROCESSED_DIR, exist_ok=True)
|
| 13 |
|
| 14 |
|
| 15 |
-
# FUNZIONI DI SUPPORTO
|
|
|
|
| 16 |
|
| 17 |
def clean_text(text):
|
| 18 |
"""Pulisce il testo da URL, menzioni, hashtag, simboli HTML"""
|
|
@@ -23,6 +24,7 @@ def clean_text(text):
|
|
| 23 |
text = re.sub(r"\s+", " ", text)
|
| 24 |
return text.strip()
|
| 25 |
|
|
|
|
| 26 |
def map_label(label):
|
| 27 |
"""Mappa le etichette di sentiment a numeri"""
|
| 28 |
mapping = {"negative": 0, "neutral": 1, "positive": 2}
|
|
@@ -30,9 +32,11 @@ def map_label(label):
|
|
| 30 |
return mapping.get(label.lower(), 1)
|
| 31 |
return label
|
| 32 |
|
|
|
|
| 33 |
# Tokenizer globale
|
| 34 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
| 35 |
|
|
|
|
| 36 |
def tokenize_function(examples):
|
| 37 |
return tokenizer(
|
| 38 |
examples["text"],
|
|
@@ -42,12 +46,11 @@ def tokenize_function(examples):
|
|
| 42 |
)
|
| 43 |
|
| 44 |
|
| 45 |
-
|
| 46 |
-
|
| 47 |
# ----------------------------- #
|
| 48 |
# PREPARAZIONE DEI DATASET #
|
| 49 |
# ----------------------------- #
|
| 50 |
|
|
|
|
| 51 |
def safe_load_dataset(name, config=None, max_retries=3, fallback_data=None):
|
| 52 |
"""
|
| 53 |
Gestisce i retry del download e crea un dataset di fallback se fallisce.
|
|
@@ -79,7 +82,9 @@ def prepare_tweet_eval(tokenizer, output_path):
|
|
| 79 |
reduced_splits = {}
|
| 80 |
for split in ds.keys():
|
| 81 |
reduced_splits[split] = ds[split].select(range(min(1000, len(ds[split]))))
|
| 82 |
-
reduced_splits[split] = reduced_splits[split].map(
|
|
|
|
|
|
|
| 83 |
reduced_splits[split] = reduced_splits[split].map(tokenize_function, batched=True)
|
| 84 |
ds = DatasetDict(reduced_splits)
|
| 85 |
else:
|
|
@@ -98,7 +103,7 @@ def prepare_youtube(tokenizer, output_path):
|
|
| 98 |
"Sentiment": ["positive", "negative", "neutral", "positive", "negative"],
|
| 99 |
}
|
| 100 |
ds = safe_load_dataset("AmaanP314/youtube-comment-sentiment", fallback_data=fallback_data)
|
| 101 |
-
|
| 102 |
if isinstance(ds, dict) or "train" in ds:
|
| 103 |
reduced_splits = {}
|
| 104 |
for split in ds.keys():
|
|
@@ -112,7 +117,7 @@ def prepare_youtube(tokenizer, output_path):
|
|
| 112 |
reduced_splits[split] = reduced_splits[split].map(tokenize_function, batched=True)
|
| 113 |
ds = DatasetDict(reduced_splits)
|
| 114 |
else:
|
| 115 |
-
|
| 116 |
ds = ds.select(range(min(1000, len(ds))))
|
| 117 |
ds = ds.map(
|
| 118 |
lambda x: {
|
|
@@ -120,19 +125,18 @@ def prepare_youtube(tokenizer, output_path):
|
|
| 120 |
"label": map_label(x["Sentiment"]),
|
| 121 |
}
|
| 122 |
)
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
ds.save_to_disk(output_path)
|
| 127 |
print(f"Dataset YouTube salvato in {output_path}")
|
| 128 |
|
| 129 |
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
if __name__ == "__main__":
|
| 134 |
parser = argparse.ArgumentParser(description="Prepara dataset per sentiment analysis.")
|
| 135 |
-
parser.add_argument(
|
|
|
|
|
|
|
| 136 |
args = parser.parse_args()
|
| 137 |
|
| 138 |
if args.dataset == "tweet_eval":
|
|
|
|
| 12 |
os.makedirs(PROCESSED_DIR, exist_ok=True)
|
| 13 |
|
| 14 |
|
| 15 |
+
# FUNZIONI DI SUPPORTO
|
| 16 |
+
|
| 17 |
|
| 18 |
def clean_text(text):
|
| 19 |
"""Pulisce il testo da URL, menzioni, hashtag, simboli HTML"""
|
|
|
|
| 24 |
text = re.sub(r"\s+", " ", text)
|
| 25 |
return text.strip()
|
| 26 |
|
| 27 |
+
|
| 28 |
def map_label(label):
|
| 29 |
"""Mappa le etichette di sentiment a numeri"""
|
| 30 |
mapping = {"negative": 0, "neutral": 1, "positive": 2}
|
|
|
|
| 32 |
return mapping.get(label.lower(), 1)
|
| 33 |
return label
|
| 34 |
|
| 35 |
+
|
| 36 |
# Tokenizer globale
|
| 37 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
| 38 |
|
| 39 |
+
|
| 40 |
def tokenize_function(examples):
|
| 41 |
return tokenizer(
|
| 42 |
examples["text"],
|
|
|
|
| 46 |
)
|
| 47 |
|
| 48 |
|
|
|
|
|
|
|
| 49 |
# ----------------------------- #
|
| 50 |
# PREPARAZIONE DEI DATASET #
|
| 51 |
# ----------------------------- #
|
| 52 |
|
| 53 |
+
|
| 54 |
def safe_load_dataset(name, config=None, max_retries=3, fallback_data=None):
|
| 55 |
"""
|
| 56 |
Gestisce i retry del download e crea un dataset di fallback se fallisce.
|
|
|
|
| 82 |
reduced_splits = {}
|
| 83 |
for split in ds.keys():
|
| 84 |
reduced_splits[split] = ds[split].select(range(min(1000, len(ds[split]))))
|
| 85 |
+
reduced_splits[split] = reduced_splits[split].map(
|
| 86 |
+
lambda x: {"text": clean_text(x["text"])}
|
| 87 |
+
)
|
| 88 |
reduced_splits[split] = reduced_splits[split].map(tokenize_function, batched=True)
|
| 89 |
ds = DatasetDict(reduced_splits)
|
| 90 |
else:
|
|
|
|
| 103 |
"Sentiment": ["positive", "negative", "neutral", "positive", "negative"],
|
| 104 |
}
|
| 105 |
ds = safe_load_dataset("AmaanP314/youtube-comment-sentiment", fallback_data=fallback_data)
|
| 106 |
+
|
| 107 |
if isinstance(ds, dict) or "train" in ds:
|
| 108 |
reduced_splits = {}
|
| 109 |
for split in ds.keys():
|
|
|
|
| 117 |
reduced_splits[split] = reduced_splits[split].map(tokenize_function, batched=True)
|
| 118 |
ds = DatasetDict(reduced_splits)
|
| 119 |
else:
|
| 120 |
+
|
| 121 |
ds = ds.select(range(min(1000, len(ds))))
|
| 122 |
ds = ds.map(
|
| 123 |
lambda x: {
|
|
|
|
| 125 |
"label": map_label(x["Sentiment"]),
|
| 126 |
}
|
| 127 |
)
|
| 128 |
+
# ds = ds.map(lambda x: {"text": clean_text(x["CommentText"])})
|
| 129 |
+
# ds = ds.map(lambda x: {"label": map_label(x["Sentiment"])})
|
| 130 |
+
# ds = ds.map(tokenize_function, batched=True)
|
| 131 |
ds.save_to_disk(output_path)
|
| 132 |
print(f"Dataset YouTube salvato in {output_path}")
|
| 133 |
|
| 134 |
|
|
|
|
|
|
|
|
|
|
| 135 |
if __name__ == "__main__":
|
| 136 |
parser = argparse.ArgumentParser(description="Prepara dataset per sentiment analysis.")
|
| 137 |
+
parser.add_argument(
|
| 138 |
+
"dataset", choices=["tweet_eval", "youtube"], help="Nome del dataset da preparare."
|
| 139 |
+
)
|
| 140 |
args = parser.parse_args()
|
| 141 |
|
| 142 |
if args.dataset == "tweet_eval":
|
src/download_data.py
CHANGED
|
@@ -5,4 +5,4 @@ dataset.save_to_disk("/workspaces/MLOps_Project_SentimentAnalysis/data/raw/tweet
|
|
| 5 |
|
| 6 |
|
| 7 |
dataset2 = load_dataset("AmaanP314/youtube-comment-sentiment")
|
| 8 |
-
dataset2.save_to_disk("/workspaces/MLOps_Project_SentimentAnalysis/data/youtube-comment-sentiment")
|
|
|
|
| 5 |
|
| 6 |
|
| 7 |
dataset2 = load_dataset("AmaanP314/youtube-comment-sentiment")
|
| 8 |
+
dataset2.save_to_disk("/workspaces/MLOps_Project_SentimentAnalysis/data/youtube-comment-sentiment")
|
src/train_model.py
CHANGED
|
@@ -1,10 +1,5 @@
|
|
| 1 |
-
|
| 2 |
-
from
|
| 3 |
-
AutoModelForSequenceClassification,
|
| 4 |
-
Trainer,
|
| 5 |
-
TrainingArguments
|
| 6 |
-
)
|
| 7 |
-
from datasets import load_from_disk,concatenate_datasets
|
| 8 |
import evaluate
|
| 9 |
import numpy as np
|
| 10 |
import os
|
|
@@ -18,7 +13,8 @@ hf_token = os.environ["HF_TOKEN"]
|
|
| 18 |
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"
|
| 19 |
DATA_PATH = "data/processed/tweet_eval_tokenized"
|
| 20 |
OUTPUT_DIR = "models/sentiment_model"
|
| 21 |
-
HF_REPO = "Lordemarco/SentimentAnalysis"
|
|
|
|
| 22 |
|
| 23 |
def compute_metrics(eval_pred):
|
| 24 |
"""Calcola metriche standard: accuracy e F1."""
|
|
@@ -31,19 +27,24 @@ def compute_metrics(eval_pred):
|
|
| 31 |
return {"accuracy": acc["accuracy"], "f1": f1["f1"]}
|
| 32 |
|
| 33 |
|
| 34 |
-
def train_model(
|
|
|
|
|
|
|
| 35 |
print("Caricamento dataset Tweet eval preprocessato")
|
| 36 |
dataset = load_from_disk(DATA_PATH)
|
| 37 |
if additional_data is not None:
|
| 38 |
print("Aggiungo dati YouTube al training set...")
|
| 39 |
dataset["train"] = concatenate_datasets([dataset["train"], additional_data])
|
| 40 |
|
| 41 |
-
#
|
| 42 |
-
print(
|
|
|
|
|
|
|
| 43 |
train_data = dataset["train"].select(range(min(sample_train_size, len(dataset["train"]))))
|
| 44 |
-
eval_data = dataset["validation"].select(
|
|
|
|
|
|
|
| 45 |
|
| 46 |
-
|
| 47 |
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
|
| 48 |
|
| 49 |
# Parametri training
|
|
@@ -57,7 +58,7 @@ def train_model(additional_data=None,sample_train_size=1000, sample_eval_size=30
|
|
| 57 |
logging_dir="./logs",
|
| 58 |
logging_steps=10,
|
| 59 |
load_best_model_at_end=True,
|
| 60 |
-
report_to="none",
|
| 61 |
)
|
| 62 |
|
| 63 |
print("Avvio training")
|
|
@@ -74,11 +75,11 @@ def train_model(additional_data=None,sample_train_size=1000, sample_eval_size=30
|
|
| 74 |
os.makedirs(output_dir, exist_ok=True)
|
| 75 |
trainer.save_model(output_dir)
|
| 76 |
print(f"Modello salvato in: {OUTPUT_DIR}")
|
| 77 |
-
|
| 78 |
|
| 79 |
if os.getenv("HF_TOKEN"):
|
| 80 |
print("Pushing model to Hugging Face Hub...")
|
| 81 |
trainer.push_to_hub("Lordemarco/SentimentAnalysis")
|
| 82 |
-
|
|
|
|
| 83 |
if __name__ == "__main__":
|
| 84 |
train_model()
|
|
|
|
| 1 |
+
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
|
| 2 |
+
from datasets import load_from_disk, concatenate_datasets
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
import evaluate
|
| 4 |
import numpy as np
|
| 5 |
import os
|
|
|
|
| 13 |
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"
|
| 14 |
DATA_PATH = "data/processed/tweet_eval_tokenized"
|
| 15 |
OUTPUT_DIR = "models/sentiment_model"
|
| 16 |
+
HF_REPO = "Lordemarco/SentimentAnalysis"
|
| 17 |
+
|
| 18 |
|
| 19 |
def compute_metrics(eval_pred):
|
| 20 |
"""Calcola metriche standard: accuracy e F1."""
|
|
|
|
| 27 |
return {"accuracy": acc["accuracy"], "f1": f1["f1"]}
|
| 28 |
|
| 29 |
|
| 30 |
+
def train_model(
|
| 31 |
+
additional_data=None, sample_train_size=1000, sample_eval_size=300, output_dir=OUTPUT_DIR
|
| 32 |
+
):
|
| 33 |
print("Caricamento dataset Tweet eval preprocessato")
|
| 34 |
dataset = load_from_disk(DATA_PATH)
|
| 35 |
if additional_data is not None:
|
| 36 |
print("Aggiungo dati YouTube al training set...")
|
| 37 |
dataset["train"] = concatenate_datasets([dataset["train"], additional_data])
|
| 38 |
|
| 39 |
+
#
|
| 40 |
+
print(
|
| 41 |
+
f"Riduzione dataset: {sample_train_size} per il train, {sample_eval_size} per la validazione."
|
| 42 |
+
)
|
| 43 |
train_data = dataset["train"].select(range(min(sample_train_size, len(dataset["train"]))))
|
| 44 |
+
eval_data = dataset["validation"].select(
|
| 45 |
+
range(min(sample_eval_size, len(dataset["validation"])))
|
| 46 |
+
)
|
| 47 |
|
|
|
|
| 48 |
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
|
| 49 |
|
| 50 |
# Parametri training
|
|
|
|
| 58 |
logging_dir="./logs",
|
| 59 |
logging_steps=10,
|
| 60 |
load_best_model_at_end=True,
|
| 61 |
+
report_to="none",
|
| 62 |
)
|
| 63 |
|
| 64 |
print("Avvio training")
|
|
|
|
| 75 |
os.makedirs(output_dir, exist_ok=True)
|
| 76 |
trainer.save_model(output_dir)
|
| 77 |
print(f"Modello salvato in: {OUTPUT_DIR}")
|
|
|
|
| 78 |
|
| 79 |
if os.getenv("HF_TOKEN"):
|
| 80 |
print("Pushing model to Hugging Face Hub...")
|
| 81 |
trainer.push_to_hub("Lordemarco/SentimentAnalysis")
|
| 82 |
+
|
| 83 |
+
|
| 84 |
if __name__ == "__main__":
|
| 85 |
train_model()
|
tests/integration/test_app.py
CHANGED
|
@@ -1,35 +1,43 @@
|
|
| 1 |
from fastapi.testclient import TestClient
|
| 2 |
from src.app import app
|
| 3 |
-
import os
|
| 4 |
|
| 5 |
os.environ["SKIP_DATA_PREP"] = "true"
|
| 6 |
|
| 7 |
client = TestClient(app)
|
| 8 |
|
|
|
|
| 9 |
def test_home_page():
|
| 10 |
response = client.get("/")
|
| 11 |
assert response.status_code == 200
|
| 12 |
assert "Benvenuto" in response.text
|
| 13 |
|
|
|
|
| 14 |
def test_predict_endpoint_get():
|
| 15 |
response = client.get("/predict")
|
| 16 |
assert response.status_code == 200
|
| 17 |
assert "Testa il Modello" in response.text
|
| 18 |
|
|
|
|
| 19 |
def test_predict_endpoint_post():
|
| 20 |
response = client.post("/predict", data={"text": "I love this!"})
|
| 21 |
assert response.status_code == 200
|
| 22 |
assert any(label in response.text for label in ["positive", "neutral", "negative"])
|
| 23 |
|
|
|
|
| 24 |
def test_random_tweet_page():
|
| 25 |
response = client.get("/random_tweet")
|
| 26 |
assert response.status_code == 200
|
| 27 |
-
assert any(
|
| 28 |
-
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
| 30 |
def test_random_youtube_page():
|
| 31 |
response = client.get("/random_youtube_comment")
|
| 32 |
assert response.status_code == 200
|
| 33 |
-
assert any(
|
| 34 |
-
|
| 35 |
-
|
|
|
|
|
|
| 1 |
from fastapi.testclient import TestClient
|
| 2 |
from src.app import app
|
| 3 |
+
import os
|
| 4 |
|
| 5 |
os.environ["SKIP_DATA_PREP"] = "true"
|
| 6 |
|
| 7 |
client = TestClient(app)
|
| 8 |
|
| 9 |
+
|
| 10 |
def test_home_page():
|
| 11 |
response = client.get("/")
|
| 12 |
assert response.status_code == 200
|
| 13 |
assert "Benvenuto" in response.text
|
| 14 |
|
| 15 |
+
|
| 16 |
def test_predict_endpoint_get():
|
| 17 |
response = client.get("/predict")
|
| 18 |
assert response.status_code == 200
|
| 19 |
assert "Testa il Modello" in response.text
|
| 20 |
|
| 21 |
+
|
| 22 |
def test_predict_endpoint_post():
|
| 23 |
response = client.post("/predict", data={"text": "I love this!"})
|
| 24 |
assert response.status_code == 200
|
| 25 |
assert any(label in response.text for label in ["positive", "neutral", "negative"])
|
| 26 |
|
| 27 |
+
|
| 28 |
def test_random_tweet_page():
|
| 29 |
response = client.get("/random_tweet")
|
| 30 |
assert response.status_code == 200
|
| 31 |
+
assert any(
|
| 32 |
+
lbl in response.text
|
| 33 |
+
for lbl in ["positive", "neutral", "negative", "Positivo", "Neutro", "Negativo"]
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
def test_random_youtube_page():
|
| 38 |
response = client.get("/random_youtube_comment")
|
| 39 |
assert response.status_code == 200
|
| 40 |
+
assert any(
|
| 41 |
+
lbl in response.text
|
| 42 |
+
for lbl in ["positive", "neutral", "negative", "Positivo", "Neutro", "Negativo"]
|
| 43 |
+
)
|
tests/integration/test_monitoring.py
CHANGED
|
@@ -18,19 +18,16 @@ def cleanup_metrics():
|
|
| 18 |
|
| 19 |
def test_monitoring_creates_metrics():
|
| 20 |
"""Verifica che il monitoring crei correttamente il file metrics.json e contenga i dati previsti."""
|
| 21 |
-
main()
|
| 22 |
|
| 23 |
assert os.path.exists(METRICS_PATH), "metrics.json non è stato generato"
|
| 24 |
|
| 25 |
-
|
| 26 |
with open(METRICS_PATH, "r") as f:
|
| 27 |
metrics = json.load(f)
|
| 28 |
|
| 29 |
assert "TweetEval" in metrics, "Mancano metriche TweetEval"
|
| 30 |
assert "YouTube" in metrics, "Mancano metriche YouTube"
|
| 31 |
|
| 32 |
-
|
| 33 |
for dataset_name, data in metrics.items():
|
| 34 |
assert "accuracy" in data, f"Manca accuracy per {dataset_name}"
|
| 35 |
assert "f1" in data, f"Manca F1 per {dataset_name}"
|
| 36 |
-
|
|
|
|
| 18 |
|
| 19 |
def test_monitoring_creates_metrics():
|
| 20 |
"""Verifica che il monitoring crei correttamente il file metrics.json e contenga i dati previsti."""
|
| 21 |
+
main()
|
| 22 |
|
| 23 |
assert os.path.exists(METRICS_PATH), "metrics.json non è stato generato"
|
| 24 |
|
|
|
|
| 25 |
with open(METRICS_PATH, "r") as f:
|
| 26 |
metrics = json.load(f)
|
| 27 |
|
| 28 |
assert "TweetEval" in metrics, "Mancano metriche TweetEval"
|
| 29 |
assert "YouTube" in metrics, "Mancano metriche YouTube"
|
| 30 |
|
|
|
|
| 31 |
for dataset_name, data in metrics.items():
|
| 32 |
assert "accuracy" in data, f"Manca accuracy per {dataset_name}"
|
| 33 |
assert "f1" in data, f"Manca F1 per {dataset_name}"
|
|
|
tests/integration/test_train.py
CHANGED
|
@@ -5,6 +5,7 @@ from src.train_model import train_model
|
|
| 5 |
|
| 6 |
MODEL_DIR = "models/sentiment_model"
|
| 7 |
|
|
|
|
| 8 |
@pytest.fixture(autouse=True)
|
| 9 |
def cleanup():
|
| 10 |
if os.path.exists(MODEL_DIR):
|
|
@@ -13,8 +14,9 @@ def cleanup():
|
|
| 13 |
if os.path.exists(MODEL_DIR):
|
| 14 |
shutil.rmtree(MODEL_DIR)
|
| 15 |
|
|
|
|
| 16 |
def test_train_model_runs():
|
| 17 |
"""Testa che il training parta e salvi un modello."""
|
| 18 |
-
train_model(sample_train_size=10, sample_eval_size=5)
|
| 19 |
assert os.path.exists(MODEL_DIR), "La directory del modello non è stata creata"
|
| 20 |
assert os.path.exists(os.path.join(MODEL_DIR, "config.json")), "File config.json mancante"
|
|
|
|
| 5 |
|
| 6 |
MODEL_DIR = "models/sentiment_model"
|
| 7 |
|
| 8 |
+
|
| 9 |
@pytest.fixture(autouse=True)
|
| 10 |
def cleanup():
|
| 11 |
if os.path.exists(MODEL_DIR):
|
|
|
|
| 14 |
if os.path.exists(MODEL_DIR):
|
| 15 |
shutil.rmtree(MODEL_DIR)
|
| 16 |
|
| 17 |
+
|
| 18 |
def test_train_model_runs():
|
| 19 |
"""Testa che il training parta e salvi un modello."""
|
| 20 |
+
train_model(sample_train_size=10, sample_eval_size=5)
|
| 21 |
assert os.path.exists(MODEL_DIR), "La directory del modello non è stata creata"
|
| 22 |
assert os.path.exists(os.path.join(MODEL_DIR, "config.json")), "File config.json mancante"
|
tests/unit/test_data.py
CHANGED
|
@@ -6,13 +6,12 @@ from datasets import load_from_disk
|
|
| 6 |
TWEET_PROCESSED_PATH = "data/processed/tweet_eval_tokenized"
|
| 7 |
YT_PROCESSED_PATH = "data/processed/youtube_tokenized"
|
| 8 |
|
|
|
|
| 9 |
def run_data_preparation(dataset_name):
|
| 10 |
"""Esegue lo script di data preparation per il dataset richiesto."""
|
| 11 |
print(f"⚙️ Avvio data_preparation.py per il dataset: {dataset_name}")
|
| 12 |
-
subprocess.run(
|
| 13 |
-
|
| 14 |
-
check=True
|
| 15 |
-
)
|
| 16 |
|
| 17 |
def test_tweet_eval_dataset_exists_or_create():
|
| 18 |
"""Controlla o crea il dataset Tweet Eval preprocessato."""
|
|
@@ -20,21 +19,25 @@ def test_tweet_eval_dataset_exists_or_create():
|
|
| 20 |
run_data_preparation("tweet_eval")
|
| 21 |
assert os.path.exists(TWEET_PROCESSED_PATH), "Tweet Eval non disponibile dopo la preparazione"
|
| 22 |
|
|
|
|
| 23 |
def test_youtube_dataset_exists_or_create():
|
| 24 |
"""Controlla o crea il dataset YouTube preprocessato."""
|
| 25 |
if not os.path.exists(YT_PROCESSED_PATH):
|
| 26 |
run_data_preparation("youtube")
|
| 27 |
assert os.path.exists(YT_PROCESSED_PATH), "YouTube dataset non disponibile dopo la preparazione"
|
| 28 |
|
|
|
|
| 29 |
def test_tweet_eval_structure():
|
| 30 |
"""Verifica che il dataset Tweet Eval abbia la struttura corretta."""
|
| 31 |
ds = load_from_disk(TWEET_PROCESSED_PATH)
|
| 32 |
assert "text" in ds["test"].features, "Campo 'text' mancante in Tweet Eval"
|
| 33 |
assert "label" in ds["test"].features, "Campo 'label' mancante in Tweet Eval"
|
| 34 |
|
|
|
|
| 35 |
def test_youtube_structure():
|
| 36 |
"""Verifica che il dataset YouTube abbia la struttura corretta."""
|
| 37 |
ds = load_from_disk(YT_PROCESSED_PATH)
|
| 38 |
-
assert
|
| 39 |
-
"
|
|
|
|
| 40 |
assert "Sentiment" in ds["train"].features, "Campo 'label' mancante in YouTube dataset"
|
|
|
|
| 6 |
TWEET_PROCESSED_PATH = "data/processed/tweet_eval_tokenized"
|
| 7 |
YT_PROCESSED_PATH = "data/processed/youtube_tokenized"
|
| 8 |
|
| 9 |
+
|
| 10 |
def run_data_preparation(dataset_name):
|
| 11 |
"""Esegue lo script di data preparation per il dataset richiesto."""
|
| 12 |
print(f"⚙️ Avvio data_preparation.py per il dataset: {dataset_name}")
|
| 13 |
+
subprocess.run(["python", "src/data_preparation.py", "--dataset", dataset_name], check=True)
|
| 14 |
+
|
|
|
|
|
|
|
| 15 |
|
| 16 |
def test_tweet_eval_dataset_exists_or_create():
|
| 17 |
"""Controlla o crea il dataset Tweet Eval preprocessato."""
|
|
|
|
| 19 |
run_data_preparation("tweet_eval")
|
| 20 |
assert os.path.exists(TWEET_PROCESSED_PATH), "Tweet Eval non disponibile dopo la preparazione"
|
| 21 |
|
| 22 |
+
|
| 23 |
def test_youtube_dataset_exists_or_create():
|
| 24 |
"""Controlla o crea il dataset YouTube preprocessato."""
|
| 25 |
if not os.path.exists(YT_PROCESSED_PATH):
|
| 26 |
run_data_preparation("youtube")
|
| 27 |
assert os.path.exists(YT_PROCESSED_PATH), "YouTube dataset non disponibile dopo la preparazione"
|
| 28 |
|
| 29 |
+
|
| 30 |
def test_tweet_eval_structure():
|
| 31 |
"""Verifica che il dataset Tweet Eval abbia la struttura corretta."""
|
| 32 |
ds = load_from_disk(TWEET_PROCESSED_PATH)
|
| 33 |
assert "text" in ds["test"].features, "Campo 'text' mancante in Tweet Eval"
|
| 34 |
assert "label" in ds["test"].features, "Campo 'label' mancante in Tweet Eval"
|
| 35 |
|
| 36 |
+
|
| 37 |
def test_youtube_structure():
|
| 38 |
"""Verifica che il dataset YouTube abbia la struttura corretta."""
|
| 39 |
ds = load_from_disk(YT_PROCESSED_PATH)
|
| 40 |
+
assert (
|
| 41 |
+
"CommentText" in ds["train"].features or "CommentText" in ds["train"].features
|
| 42 |
+
), "Campo testuale mancante in YouTube dataset"
|
| 43 |
assert "Sentiment" in ds["train"].features, "Campo 'label' mancante in YouTube dataset"
|
tests/unit/test_model.py
CHANGED
|
@@ -7,10 +7,12 @@ LABELS = ["negative", "neutral", "positive"]
|
|
| 7 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
| 8 |
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
|
| 9 |
|
|
|
|
| 10 |
def test_model_loads():
|
| 11 |
assert model is not None
|
| 12 |
assert tokenizer is not None
|
| 13 |
|
|
|
|
| 14 |
def test_model_prediction_shape():
|
| 15 |
text = "I love this product!"
|
| 16 |
inputs = tokenizer(text, return_tensors="pt")
|
|
@@ -18,9 +20,10 @@ def test_model_prediction_shape():
|
|
| 18 |
outputs = model(**inputs)
|
| 19 |
assert outputs.logits.shape[-1] == len(LABELS)
|
| 20 |
|
|
|
|
| 21 |
def test_sentiment_confidence():
|
| 22 |
text = "I hate this"
|
| 23 |
inputs = tokenizer(text, return_tensors="pt")
|
| 24 |
with torch.no_grad():
|
| 25 |
probs = torch.nn.functional.softmax(model(**inputs).logits, dim=-1)
|
| 26 |
-
assert torch.isclose(probs.sum(), torch.tensor(1.0), atol=1e-3)
|
|
|
|
| 7 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
| 8 |
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
|
| 9 |
|
| 10 |
+
|
| 11 |
def test_model_loads():
|
| 12 |
assert model is not None
|
| 13 |
assert tokenizer is not None
|
| 14 |
|
| 15 |
+
|
| 16 |
def test_model_prediction_shape():
|
| 17 |
text = "I love this product!"
|
| 18 |
inputs = tokenizer(text, return_tensors="pt")
|
|
|
|
| 20 |
outputs = model(**inputs)
|
| 21 |
assert outputs.logits.shape[-1] == len(LABELS)
|
| 22 |
|
| 23 |
+
|
| 24 |
def test_sentiment_confidence():
|
| 25 |
text = "I hate this"
|
| 26 |
inputs = tokenizer(text, return_tensors="pt")
|
| 27 |
with torch.no_grad():
|
| 28 |
probs = torch.nn.functional.softmax(model(**inputs).logits, dim=-1)
|
| 29 |
+
assert torch.isclose(probs.sum(), torch.tensor(1.0), atol=1e-3)
|