Spaces:
Sleeping
Sleeping
LorenzoBioinfo
committed on
Commit
·
66028cc
1
Parent(s):
1754526
Add app and first data
Browse files- app_templates/__init__.py +0 -0
- app_templates/index.html +16 -0
- app_templates/predict.html +28 -0
- app_templates/random_tweet.html +33 -0
- app_templates/random_youtube.html +34 -0
- src/app.py +132 -0
- src/data_preparation.py +63 -0
app_templates/__init__.py
ADDED
|
File without changes
|
app_templates/index.html
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="it">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<title>Sentiment Analysis App</title>
|
| 6 |
+
<script src="https://cdn.tailwindcss.com"></script>
|
| 7 |
+
</head>
|
| 8 |
+
<body class="bg-gradient-to-br from-blue-50 to-purple-100 flex flex-col items-center justify-center h-screen text-center">
|
| 9 |
+
<h1 class="text-4xl font-bold text-gray-800 mb-8">🎯 Benvenuto nella Sentiment Analysis App di MachineInnovators Inc.</h1>
|
| 10 |
+
<div class="flex flex-col gap-4">
|
| 11 |
+
<a href="/random_tweet" class="px-6 py-3 bg-blue-600 text-white rounded-xl hover:bg-blue-700 shadow-lg transition">🧪 Testa il modello su dati di training (Twitter)</a>
|
| 12 |
+
<a href="/predict" class="px-6 py-3 bg-green-600 text-white rounded-xl hover:bg-green-700 shadow-lg transition">🧠 Testa il modello con un tuo testo</a>
|
| 13 |
+
<a href="/random_youtube_comment" class="px-6 py-3 bg-purple-600 text-white rounded-xl hover:bg-purple-700 shadow-lg transition">🌍 Testa il modello su nuovi dati (YouTube)</a>
|
| 14 |
+
</div>
|
| 15 |
+
</body>
|
| 16 |
+
</html>
|
app_templates/predict.html
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="it">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<title>Predici il Sentiment</title>
|
| 6 |
+
<script src="https://cdn.tailwindcss.com"></script>
|
| 7 |
+
</head>
|
| 8 |
+
<body class="bg-gray-50 flex flex-col items-center justify-center min-h-screen">
|
| 9 |
+
<h2 class="text-3xl font-semibold text-gray-800 mb-6">🧠 Testa il Modello con un tuo testo</h2>
|
| 10 |
+
|
| 11 |
+
<form method="post" class="bg-white rounded-2xl shadow-md p-6 w-3/4 text-center">
|
| 12 |
+
<textarea name="text" rows="3" class="w-full border border-gray-300 rounded-lg p-3 focus:outline-none focus:ring-2 focus:ring-blue-400" placeholder="Scrivi qui il tuo testo...">{{ text if text else "" }}</textarea>
|
| 13 |
+
<button type="submit" class="mt-4 px-6 py-2 bg-blue-600 text-white rounded-lg hover:bg-blue-700 transition">Analizza Sentiment</button>
|
| 14 |
+
</form>
|
| 15 |
+
|
| 16 |
+
{% if result %}
|
| 17 |
+
<div class="mt-6 bg-white rounded-xl shadow p-4 w-3/4 text-center">
|
| 18 |
+
<p class="text-gray-700 text-lg mb-2">Risultato:</p>
|
| 19 |
+
<p class="text-2xl font-bold text-blue-600">{{ result.label }}</p>
|
| 20 |
+
<p class="text-sm text-gray-500 mt-1">Confidence: {{ result.confidence }}</p>
|
| 21 |
+
</div>
|
| 22 |
+
{% endif %}
|
| 23 |
+
|
| 24 |
+
<div class="mt-6">
|
| 25 |
+
<a href="/" class="text-blue-600 hover:underline">⬅️ Torna alla Home</a>
|
| 26 |
+
</div>
|
| 27 |
+
</body>
|
| 28 |
+
</html>
|
app_templates/random_tweet.html
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="it">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<title>Random Tweet</title>
|
| 6 |
+
<script src="https://cdn.tailwindcss.com"></script>
|
| 7 |
+
</head>
|
| 8 |
+
<body class="bg-gray-50 flex flex-col items-center justify-center min-h-screen">
|
| 9 |
+
<h2 class="text-3xl font-semibold text-gray-800 mb-4">🔀 Random Tweet Test</h2>
|
| 10 |
+
|
| 11 |
+
<div class="bg-white rounded-2xl shadow-md p-6 w-3/4 text-center">
|
| 12 |
+
<p class="text-lg text-gray-700 italic mb-4">"{{ text }}"</p>
|
| 13 |
+
|
| 14 |
+
<div class="grid grid-cols-2 gap-4 mt-4">
|
| 15 |
+
<div class="p-3 border rounded-xl">
|
| 16 |
+
<h3 class="text-gray-600 text-sm">🧩 Predizione del Modello</h3>
|
| 17 |
+
<p class="text-xl font-semibold text-blue-600">{{ result.label }}</p>
|
| 18 |
+
<p class="text-xs text-gray-500">Confidence: {{ result.confidence }}</p>
|
| 19 |
+
</div>
|
| 20 |
+
|
| 21 |
+
<div class="p-3 border rounded-xl">
|
| 22 |
+
<h3 class="text-gray-600 text-sm">🎯 Etichetta Reale</h3>
|
| 23 |
+
<p class="text-xl font-semibold text-green-600">{{ true_label }}</p>
|
| 24 |
+
</div>
|
| 25 |
+
</div>
|
| 26 |
+
</div>
|
| 27 |
+
|
| 28 |
+
<div class="mt-6 flex gap-4">
|
| 29 |
+
<a href="/random_tweet" class="px-5 py-2 bg-blue-600 text-white rounded-lg hover:bg-blue-700 transition">🔁 Altro Tweet</a>
|
| 30 |
+
<a href="/" class="px-5 py-2 bg-gray-300 text-gray-800 rounded-lg hover:bg-gray-400 transition">⬅️ Torna alla Home</a>
|
| 31 |
+
</div>
|
| 32 |
+
</body>
|
| 33 |
+
</html>
|
app_templates/random_youtube.html
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="it">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<title>Random YouTube Comment</title>
|
| 6 |
+
<script src="https://cdn.tailwindcss.com"></script>
|
| 7 |
+
</head>
|
| 8 |
+
<body class="bg-gray-50 flex flex-col items-center justify-center min-h-screen">
|
| 9 |
+
<h2 class="text-3xl font-semibold text-gray-800 mb-4">🌍 Test su Dati Nuovi (YouTube Comments)</h2>
|
| 10 |
+
<p class="text-gray-600 mb-6">Questa sezione testa il modello su dati reali non visti durante il training (generalizzazione).</p>
|
| 11 |
+
|
| 12 |
+
<div class="bg-white rounded-2xl shadow-md p-6 w-3/4 text-center">
|
| 13 |
+
<p class="text-lg text-gray-700 italic mb-4">"{{ text }}"</p>
|
| 14 |
+
|
| 15 |
+
<div class="grid grid-cols-2 gap-4 mt-4">
|
| 16 |
+
<div class="p-3 border rounded-xl">
|
| 17 |
+
<h3 class="text-gray-600 text-sm">🧩 Predizione del Modello</h3>
|
| 18 |
+
<p class="text-xl font-semibold text-blue-600">{{ result.label }}</p>
|
| 19 |
+
<p class="text-xs text-gray-500">Confidence: {{ result.confidence }}</p>
|
| 20 |
+
</div>
|
| 21 |
+
|
| 22 |
+
<div class="p-3 border rounded-xl">
|
| 23 |
+
<h3 class="text-gray-600 text-sm">🎯 Etichetta Reale</h3>
|
| 24 |
+
<p class="text-xl font-semibold text-green-600">{{ true_label }}</p>
|
| 25 |
+
</div>
|
| 26 |
+
</div>
|
| 27 |
+
</div>
|
| 28 |
+
|
| 29 |
+
<div class="mt-6 flex gap-4">
|
| 30 |
+
<a href="/random_youtube_comment" class="px-5 py-2 bg-blue-600 text-white rounded-lg hover:bg-blue-700 transition">🔁 Altro Commento</a>
|
| 31 |
+
<a href="/" class="px-5 py-2 bg-gray-300 text-gray-800 rounded-lg hover:bg-gray-400 transition">⬅️ Torna alla Home</a>
|
| 32 |
+
</div>
|
| 33 |
+
</body>
|
| 34 |
+
</html>
|
src/app.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from fastapi import FastAPI, Request, Form
|
| 3 |
+
from pydantic import BaseModel
|
| 4 |
+
from fastapi.responses import HTMLResponse
|
| 5 |
+
from fastapi.templating import Jinja2Templates
|
| 6 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 7 |
+
from datasets import load_dataset, load_from_disk
|
| 8 |
+
import torch
|
| 9 |
+
import random
|
| 10 |
+
|
| 11 |
+
# Load the model and the pre-processed datasets. The datasets must already be
# on disk; the error messages below point to src/data_preparation.py.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
TWEET_PROCESSED_PATH = "data/processed/tweet_eval_tokenized"
YT_PROCESSED_PATH = "data/processed/youtube_tokenized"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Class names, indexed by the integer class id.
labels = ["negative", "neutral", "positive"]

# Fail fast when a processed dataset is missing. The raw dataset is
# deliberately NOT downloaded first: the exception is raised unconditionally,
# so a download here would only waste time and bandwidth.
if not os.path.exists(TWEET_PROCESSED_PATH):
    raise FileNotFoundError(
        f"Dati non trovati in {TWEET_PROCESSED_PATH}. "
        "Esegui src/data_preparation.py per crearlo."
    )

tweet_eval = load_from_disk(TWEET_PROCESSED_PATH)

if not os.path.exists(YT_PROCESSED_PATH):
    raise FileNotFoundError(
        f"Dati non trovati in {YT_PROCESSED_PATH}. "
        "Esegui src/data_preparation.py per crearlo."
    )

youtube_ds = load_from_disk(YT_PROCESSED_PATH)
|
| 41 |
+
|
| 42 |
+
# FastAPI application object and the Jinja2 template loader used by the
# HTML routes below.
_APP_DESCRIPTION = (
    "Testa il modello RoBERTa di CardiffNLP su frasi personalizzate o su esempi random dal dataset TweetEval."
)
app = FastAPI(title="Sentiment Analysis API", description=_APP_DESCRIPTION)
templates = Jinja2Templates(directory="app_templates/")
|
| 47 |
+
|
| 48 |
+
class TextInput(BaseModel):
    """Pydantic model with a single string field, ``text``."""

    text: str
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def predict_sentiment(text: str, max_length: int = 512):
    """Classify the sentiment of ``text`` with the module-level model.

    Args:
        text: The text to classify.
        max_length: Maximum number of tokens kept after truncation.

    Returns:
        A dict with ``"label"`` (an entry of the module-level ``labels``
        list) and ``"confidence"`` (the softmax probability of that label,
        rounded to three decimals).
    """
    # Pass max_length explicitly: truncation=True on its own falls back to the
    # tokenizer's configured limit, which may be missing for this checkpoint
    # (NOTE(review): confirm), letting over-long user input reach the model
    # untruncated.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length,
    )
    with torch.no_grad():
        logits = model(**inputs).logits
        probs = torch.nn.functional.softmax(logits, dim=-1)
        pred = torch.argmax(probs, dim=1).item()
        confidence = probs[0][pred].item()
    return {"label": labels[pred], "confidence": round(confidence, 3)}
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
@app.get("/", response_class=HTMLResponse)
async def home(request: Request):
    """Render the landing page from ``index.html``."""
    context = {"request": request}
    return templates.TemplateResponse("index.html", context)
|
| 67 |
+
|
| 68 |
+
@app.get("/random_tweet", response_class=HTMLResponse)
def random_tweet(request: Request):
    """Pick a random row of the tweet ``test`` split and render its prediction.

    The page shows the text, the model prediction and the row's own label.
    """
    row = random.choice(tweet_eval["test"])

    # Use the stored text when present; otherwise rebuild it from token ids.
    if "text" in row:
        text = row["text"]
    else:
        text = tokenizer.decode(row["input_ids"], skip_special_tokens=True)

    prediction = predict_sentiment(text)
    expected = labels[row["label"]]

    context = {
        "request": request,
        "text": text,
        "true_label": expected,
        "result": prediction,
    }
    return templates.TemplateResponse("random_tweet.html", context)
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
@app.get("/predict", response_class=HTMLResponse)
def predict_page(request: Request):
    """Render the empty prediction form (no result yet)."""
    context = {"request": request, "result": None}
    return templates.TemplateResponse("predict.html", context)
|
| 95 |
+
|
| 96 |
+
@app.post("/predict", response_class=HTMLResponse)
def predict_text(request: Request, text: str = Form(...)):
    """Classify the submitted form text and re-render the form with the result."""
    context = {
        "request": request,
        "text": text,
        "result": predict_sentiment(text),
    }
    return templates.TemplateResponse("predict.html", context)
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
@app.get("/random_youtube_comment", response_class=HTMLResponse)
def random_youtube_comment(request: Request):
    """Pick a random row of the YouTube ``train`` split and render its prediction.

    The page shows the comment text, the model prediction and the row's own
    label ("N/A" when the row has no usable label).
    """
    sample = random.choice(youtube_ds["train"])

    # Prefer the cleaned "text" column; fall back to the raw "CommentText"
    # column (the one data_preparation.py reads) instead of raising KeyError.
    text = sample["text"] if "text" in sample else sample.get("CommentText", "")

    true_label = sample.get("label", "N/A")
    if isinstance(true_label, int):
        # Integer ids share the model's label order; out-of-range ids
        # are displayed as "N/A".
        in_range = 0 <= true_label < len(labels)
        true_label = labels[true_label] if in_range else "N/A"

    result = predict_sentiment(text)

    return templates.TemplateResponse(
        "random_youtube.html",
        {
            "request": request,
            "text": text,
            "true_label": true_label,
            "result": result,
        },
    )
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
if __name__ == "__main__":
    # Start a uvicorn server on all interfaces, port 8000, when this file is
    # executed as a script.
    from uvicorn import run as _serve

    _serve(app, host="0.0.0.0", port=8000)
|
src/data_preparation.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datasets import load_dataset
|
| 2 |
+
from transformers import AutoTokenizer
|
| 3 |
+
import re
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"
PROCESSED_DIR = "data/processed/"

# Create the output directory. exist_ok=True already tolerates an existing
# directory, so no separate existence check is needed; a path that exists but
# is not a directory now fails here instead of later at save time.
os.makedirs(PROCESSED_DIR, exist_ok=True)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
### Funzioni di supporto
|
| 14 |
+
def clean_text(text):
    """Strip URLs, HTML entities, mentions and hashtags from ``text``.

    Args:
        text: Raw text to clean. ``None`` is treated as empty text.

    Returns:
        The cleaned text, with every whitespace run collapsed to one space
        and no leading or trailing whitespace.
    """
    if text is None:
        return ""
    text = re.sub(r"http\S+", "", text)
    # Entities are removed before hashtags: otherwise the hashtag pattern eats
    # the "#39" inside a numeric entity such as "&#39;" and leaves "&;" behind.
    # Named, decimal and hexadecimal entities are all covered.
    text = re.sub(r"&(?:#\d+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*);", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#\w+", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()
|
| 22 |
+
|
| 23 |
+
def map_label(label):
    """Map a sentiment label to its integer class id.

    - 0: negative
    - 1: neutral
    - 2: positive

    Args:
        label: A sentiment name (case and surrounding whitespace are ignored)
            or any non-string value.

    Returns:
        The class id for a string label, with unrecognised strings mapped to
        1 (neutral). Non-string values are returned unchanged.
    """
    mapping = {"negative": 0, "neutral": 1, "positive": 2}
    if isinstance(label, str):
        # Strip first so a padded label such as "Positive " is not silently
        # turned into neutral by the fallback.
        return mapping.get(label.strip().lower(), 1)
    return label
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# Download the tweet_eval sentiment dataset and the YouTube comment dataset.
tweet_eval = load_dataset("tweet_eval", "sentiment")
youtube = load_dataset("AmaanP314/youtube-comment-sentiment")

# Store the cleaned text of each row under the "text" column.
tweet_eval = tweet_eval.map(lambda row: {"text": clean_text(row["text"])})
youtube = youtube.map(lambda row: {"text": clean_text(row["CommentText"])})

# Convert the YouTube "Sentiment" column into a "label" column via map_label.
youtube = youtube.map(lambda row: {"label": map_label(row["Sentiment"])})

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
| 50 |
+
|
| 51 |
+
def tokenize_function(examples):
    """Tokenize ``examples["text"]``, truncating and padding to 128 tokens."""
    options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(examples["text"], **options)
|
| 58 |
+
|
| 59 |
+
# Tokenize both datasets in batches, then write each one to its own folder
# under PROCESSED_DIR.
tweet_tokenized = tweet_eval.map(tokenize_function, batched=True)
youtube_tokenized = youtube.map(tokenize_function, batched=True)

for _dataset, _folder in (
    (tweet_tokenized, "tweet_eval_tokenized"),
    (youtube_tokenized, "youtube_tokenized"),
):
    _dataset.save_to_disk(os.path.join(PROCESSED_DIR, _folder))
|