LorenzoBioinfo committed on
Commit
66028cc
·
1 Parent(s): 1754526

Add app and first data

Browse files
app_templates/__init__.py ADDED
File without changes
app_templates/index.html ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!DOCTYPE html>
<!-- Landing page: links to the three demo routes of the sentiment app. -->
<html lang="it">
<head>
    <meta charset="UTF-8">
    <!-- Without a viewport meta, mobile browsers render at desktop width. -->
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>Sentiment Analysis App</title>
    <script src="https://cdn.tailwindcss.com"></script>
</head>
<!-- min-h-screen (not h-screen) lets the page grow when the content is taller than the viewport. -->
<body class="bg-gradient-to-br from-blue-50 to-purple-100 flex flex-col items-center justify-center min-h-screen p-4 text-center">
    <h1 class="text-4xl font-bold text-gray-800 mb-8">🎯 Benvenuto nella Sentiment Analysis App di MachineInnovators Inc.</h1>
    <div class="flex flex-col gap-4">
        <a href="/random_tweet" class="px-6 py-3 bg-blue-600 text-white rounded-xl hover:bg-blue-700 shadow-lg transition">🧪 Testa il modello su dati di training (Twitter)</a>
        <a href="/predict" class="px-6 py-3 bg-green-600 text-white rounded-xl hover:bg-green-700 shadow-lg transition">🧠 Testa il modello con un tuo testo</a>
        <a href="/random_youtube_comment" class="px-6 py-3 bg-purple-600 text-white rounded-xl hover:bg-purple-700 shadow-lg transition">🌍 Testa il modello su nuovi dati (YouTube)</a>
    </div>
</body>
</html>
app_templates/predict.html ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!DOCTYPE html>
<!-- Free-text prediction page. Context variables used: text (optional), result (optional). -->
<html lang="it">
<head>
    <meta charset="UTF-8">
    <!-- Without a viewport meta, mobile browsers render at desktop width. -->
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>Predici il Sentiment</title>
    <script src="https://cdn.tailwindcss.com"></script>
</head>
<body class="bg-gray-50 flex flex-col items-center justify-center min-h-screen">
    <h2 class="text-3xl font-semibold text-gray-800 mb-6">🧠 Testa il Modello con un tuo testo</h2>

    <form method="post" class="bg-white rounded-2xl shadow-md p-6 w-3/4 text-center">
        <!-- "required" stops an empty submission in the browser; the POST handler declares the field as mandatory. -->
        <textarea name="text" rows="3" required aria-label="Testo da analizzare" class="w-full border border-gray-300 rounded-lg p-3 focus:outline-none focus:ring-2 focus:ring-blue-400" placeholder="Scrivi qui il tuo testo...">{{ text if text else "" }}</textarea>
        <button type="submit" class="mt-4 px-6 py-2 bg-blue-600 text-white rounded-lg hover:bg-blue-700 transition">Analizza Sentiment</button>
    </form>

    <!-- The result card is rendered only after a prediction has been made. -->
    {% if result %}
    <div class="mt-6 bg-white rounded-xl shadow p-4 w-3/4 text-center">
        <p class="text-gray-700 text-lg mb-2">Risultato:</p>
        <p class="text-2xl font-bold text-blue-600">{{ result.label }}</p>
        <p class="text-sm text-gray-500 mt-1">Confidence: {{ result.confidence }}</p>
    </div>
    {% endif %}

    <div class="mt-6">
        <a href="/" class="text-blue-600 hover:underline">⬅️ Torna alla Home</a>
    </div>
</body>
</html>
app_templates/random_tweet.html ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!DOCTYPE html>
<!-- Shows one sampled tweet. Context variables used: text, result (label, confidence), true_label. -->
<html lang="it">
<head>
    <meta charset="UTF-8">
    <!-- Without a viewport meta, mobile browsers render at desktop width. -->
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>Random Tweet</title>
    <script src="https://cdn.tailwindcss.com"></script>
</head>
<body class="bg-gray-50 flex flex-col items-center justify-center min-h-screen">
    <h2 class="text-3xl font-semibold text-gray-800 mb-4">🔀 Random Tweet Test</h2>

    <div class="bg-white rounded-2xl shadow-md p-6 w-3/4 text-center">
        <p class="text-lg text-gray-700 italic mb-4">"{{ text }}"</p>

        <!-- Model prediction and dataset label side by side. -->
        <div class="grid grid-cols-2 gap-4 mt-4">
            <div class="p-3 border rounded-xl">
                <h3 class="text-gray-600 text-sm">🧩 Predizione del Modello</h3>
                <p class="text-xl font-semibold text-blue-600">{{ result.label }}</p>
                <p class="text-xs text-gray-500">Confidence: {{ result.confidence }}</p>
            </div>

            <div class="p-3 border rounded-xl">
                <h3 class="text-gray-600 text-sm">🎯 Etichetta Reale</h3>
                <p class="text-xl font-semibold text-green-600">{{ true_label }}</p>
            </div>
        </div>
    </div>

    <div class="mt-6 flex gap-4">
        <a href="/random_tweet" class="px-5 py-2 bg-blue-600 text-white rounded-lg hover:bg-blue-700 transition">🔁 Altro Tweet</a>
        <a href="/" class="px-5 py-2 bg-gray-300 text-gray-800 rounded-lg hover:bg-gray-400 transition">⬅️ Torna alla Home</a>
    </div>
</body>
</html>
app_templates/random_youtube.html ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!DOCTYPE html>
<!-- Shows one sampled YouTube comment. Context variables used: text, result (label, confidence), true_label. -->
<html lang="it">
<head>
    <meta charset="UTF-8">
    <!-- Without a viewport meta, mobile browsers render at desktop width. -->
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>Random YouTube Comment</title>
    <script src="https://cdn.tailwindcss.com"></script>
</head>
<body class="bg-gray-50 flex flex-col items-center justify-center min-h-screen">
    <h2 class="text-3xl font-semibold text-gray-800 mb-4">🌍 Test su Dati Nuovi (YouTube Comments)</h2>
    <p class="text-gray-600 mb-6">Questa sezione testa il modello su dati reali non visti durante il training (generalizzazione).</p>

    <div class="bg-white rounded-2xl shadow-md p-6 w-3/4 text-center">
        <p class="text-lg text-gray-700 italic mb-4">"{{ text }}"</p>

        <!-- Model prediction and dataset label side by side. -->
        <div class="grid grid-cols-2 gap-4 mt-4">
            <div class="p-3 border rounded-xl">
                <h3 class="text-gray-600 text-sm">🧩 Predizione del Modello</h3>
                <p class="text-xl font-semibold text-blue-600">{{ result.label }}</p>
                <p class="text-xs text-gray-500">Confidence: {{ result.confidence }}</p>
            </div>

            <div class="p-3 border rounded-xl">
                <h3 class="text-gray-600 text-sm">🎯 Etichetta Reale</h3>
                <p class="text-xl font-semibold text-green-600">{{ true_label }}</p>
            </div>
        </div>
    </div>

    <div class="mt-6 flex gap-4">
        <a href="/random_youtube_comment" class="px-5 py-2 bg-blue-600 text-white rounded-lg hover:bg-blue-700 transition">🔁 Altro Commento</a>
        <a href="/" class="px-5 py-2 bg-gray-300 text-gray-800 rounded-lg hover:bg-gray-400 transition">⬅️ Torna alla Home</a>
    </div>
</body>
</html>
src/app.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from fastapi import FastAPI, Request, Form
3
+ from pydantic import BaseModel
4
+ from fastapi.responses import HTMLResponse
5
+ from fastapi.templating import Jinja2Templates
6
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
7
+ from datasets import load_dataset, load_from_disk
8
+ import torch
9
+ import random
10
+
11
# Model checkpoint and on-disk locations of the pre-processed datasets.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
TWEET_PROCESSED_PATH = "data/processed/tweet_eval_tokenized"
YT_PROCESSED_PATH = "data/processed/youtube_tokenized"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# predict_sentiment indexes this list with the argmax of the model logits.
labels = ["negative", "neutral", "positive"]


def _load_processed_dataset(path):
    """Load a dataset previously saved to *path*, failing fast when it is absent.

    Raises:
        FileNotFoundError: if *path* does not exist. Nothing is downloaded
            first, since a downloaded copy would be discarded by the raise.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(
            f"Dati non trovati in {path}. "
            "Esegui src/data_preparation.py per crearlo."
        )
    return load_from_disk(path)


tweet_eval = _load_processed_dataset(TWEET_PROCESSED_PATH)
youtube_ds = _load_processed_dataset(YT_PROCESSED_PATH)

app = FastAPI(
    title="Sentiment Analysis API",
    description="Testa il modello RoBERTa di CardiffNLP su frasi personalizzate o su esempi random dal dataset TweetEval.",
)
templates = Jinja2Templates(directory="app_templates/")


class TextInput(BaseModel):
    # Request schema with a single free-text field.
    # NOTE(review): no route visible in this file uses it - confirm before removing.
    text: str
50
+
51
+
52
def predict_sentiment(text: str):
    """Classify *text* with the module-level model.

    Returns:
        A dict with "label" (an entry of ``labels``) and "confidence"
        (the softmax probability of that label, rounded to 3 decimals).
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**encoded).logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1)
        best_index = torch.argmax(probabilities, dim=1).item()
        score = probabilities[0][best_index].item()
    return {"label": labels[best_index], "confidence": round(score, 3)}
60
+
61
+
62
# Landing page: renders index.html.
@app.get("/", response_class=HTMLResponse)
async def home(request: Request):
    context = {"request": request}
    return templates.TemplateResponse("index.html", context)
67
+
68
# Picks a random example from the TweetEval test split and renders the model
# prediction next to the dataset label.
@app.get("/random_tweet", response_class=HTMLResponse)
def random_tweet(request: Request):
    example = random.choice(tweet_eval["test"])

    # Use the stored text when present; otherwise rebuild it from the token ids.
    if "text" in example:
        tweet_text = example["text"]
    else:
        tweet_text = tokenizer.decode(example["input_ids"], skip_special_tokens=True)

    prediction = predict_sentiment(tweet_text)
    context = {
        "request": request,
        "text": tweet_text,
        "true_label": labels[example["label"]],
        "result": prediction,
    }
    return templates.TemplateResponse("random_tweet.html", context)
87
+
88
+
89
+
90
+
91
+
92
# Renders the empty prediction form (no result yet).
@app.get("/predict", response_class=HTMLResponse)
def predict_page(request: Request):
    context = {"request": request, "result": None}
    return templates.TemplateResponse("predict.html", context)
95
+
96
# Handles the form submission: classifies the posted text and re-renders the
# form with the text and its prediction.
@app.post("/predict", response_class=HTMLResponse)
def predict_text(request: Request, text: str = Form(...)):
    prediction = predict_sentiment(text)
    context = {"request": request, "text": text, "result": prediction}
    return templates.TemplateResponse("predict.html", context)
103
+
104
+
105
# Picks a random comment from the YouTube dataset's "train" split and renders
# the model prediction next to the dataset label.
@app.get("/random_youtube_comment", response_class=HTMLResponse)
def random_youtube_comment(request: Request):
    sample = random.choice(youtube_ds["train"])

    # Prefer the cleaned "text" column; fall back to the raw "CommentText"
    # column instead of re-reading the missing "text" key (a KeyError).
    text = sample.get("text") or sample.get("CommentText") or ""

    true_label = sample.get("label", "N/A")
    if isinstance(true_label, int):
        # Integer labels use the same index order as the shared `labels` list;
        # anything out of range is shown as "N/A".
        true_label = labels[true_label] if 0 <= true_label < len(labels) else "N/A"

    result = predict_sentiment(text)

    return templates.TemplateResponse(
        "random_youtube.html",
        {
            "request": request,
            "text": text,
            "true_label": true_label,
            "result": result,
        },
    )
128
+
129
+
130
# Start a uvicorn server on all interfaces, port 8000, when run as a script.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
src/data_preparation.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ from transformers import AutoTokenizer
3
+ import re
4
+ import os
5
+
6
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"
PROCESSED_DIR = "data/processed/"

# exist_ok=True already makes this a no-op when the directory exists, so a
# separate existence check is redundant (and race-prone).
os.makedirs(PROCESSED_DIR, exist_ok=True)
11
+
12
+
13
+ ### Funzioni di supporto
14
def clean_text(text):
    """Remove URLs, mentions, hashtags and HTML entities from *text*.

    Whitespace runs are collapsed to a single space and the result is
    stripped. ``None`` or an empty value yields ``""``.

    Args:
        text: The raw string to clean (may be ``None``).

    Returns:
        The cleaned string.
    """
    if not text:
        return ""
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    # Entities are removed BEFORE hashtags: otherwise the hashtag pattern eats
    # the "#39" of "&#39;" and leaves a stray "&;" behind. Named, decimal and
    # hexadecimal entities are all handled.
    text = re.sub(r"&(?:#\d+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);", "", text)
    text = re.sub(r"#\w+", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()
22
+
23
def map_label(label):
    """Map a sentiment label to its numeric id.

    - 0: negative
    - 1: neutral
    - 2: positive

    String labels are matched case-insensitively, ignoring surrounding
    whitespace; unrecognised strings map to 1 (neutral). Non-string values
    are returned unchanged.
    """
    mapping = {"negative": 0, "neutral": 1, "positive": 2}
    if isinstance(label, str):
        # Strip first so values such as " Positive " are not silently
        # treated as neutral.
        return mapping.get(label.strip().lower(), 1)
    return label
34
+
35
+
36
# Download the TweetEval sentiment dataset.
tweet_eval = load_dataset("tweet_eval", "sentiment")
# Download the YouTube comment dataset.
youtube = load_dataset("AmaanP314/youtube-comment-sentiment")

# Write the cleaned text to a "text" column in both datasets.
tweet_eval = tweet_eval.map(lambda row: {"text": clean_text(row["text"])})
youtube = youtube.map(lambda row: {"text": clean_text(row["CommentText"])})

# Derive a "label" column from the YouTube "Sentiment" column.
youtube = youtube.map(lambda row: {"label": map_label(row["Sentiment"])})

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
50
+
51
def tokenize_function(examples):
    """Tokenize the "text" field, truncating and padding to 128 tokens."""
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(examples["text"], **encoding_options)
58
+
59
# Tokenize both datasets in batches.
tweet_tokenized = tweet_eval.map(tokenize_function, batched=True)
youtube_tokenized = youtube.map(tokenize_function, batched=True)

# Save each tokenized dataset under PROCESSED_DIR.
tweet_output_dir = os.path.join(PROCESSED_DIR, "tweet_eval_tokenized")
youtube_output_dir = os.path.join(PROCESSED_DIR, "youtube_tokenized")
tweet_tokenized.save_to_disk(tweet_output_dir)
youtube_tokenized.save_to_disk(youtube_output_dir)