Spaces:
Sleeping
Sleeping
LorenzoBioinfo
committed on
Commit
·
4e05a46
1
Parent(s):
66028cc
Add loading dataset
Browse files- .gitignore +2 -1
- src/app.py +8 -12
- src/data_preparation.py +30 -13
.gitignore
CHANGED
|
@@ -1,3 +1,4 @@
|
|
| 1 |
env
|
| 2 |
data/raw/tweet_eval_sentiment
|
| 3 |
-
data/raw/youtube-comment-sentiment
|
|
|
|
|
|
| 1 |
env
|
| 2 |
data/raw/tweet_eval_sentiment
|
| 3 |
+
data/raw/youtube-comment-sentiment
|
| 4 |
+
data/processed
|
src/app.py
CHANGED
|
@@ -7,6 +7,7 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
|
| 7 |
from datasets import load_dataset, load_from_disk
|
| 8 |
import torch
|
| 9 |
import random
|
|
|
|
| 10 |
|
| 11 |
# Caricamento del modello e dei dati se già scaricati
|
| 12 |
MODEL= "cardiffnlp/twitter-roberta-base-sentiment-latest"
|
|
@@ -20,23 +21,18 @@ model = AutoModelForSequenceClassification.from_pretrained(MODEL)
|
|
| 20 |
labels = ["negative", "neutral", "positive"]
|
| 21 |
|
| 22 |
|
| 23 |
-
if not os.path.exists(TWEET_PROCESSED_PATH):
|
| 24 |
-
tweet_eval = load_dataset("tweet_eval", "sentiment")
|
| 25 |
-
raise FileNotFoundError(
|
| 26 |
-
f"Dati non trovati in {TWEET_PROCESSED_PATH}. "
|
| 27 |
-
"Esegui src/data_preparation.py per crearlo."
|
| 28 |
-
)
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
tweet_eval = load_from_disk(TWEET_PROCESSED_PATH)
|
| 31 |
|
| 32 |
|
|
|
|
| 33 |
if not os.path.exists(YT_PROCESSED_PATH):
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
f"Dati non trovati in {YT_PROCESSED_PATH}. "
|
| 37 |
-
"Esegui src/data_preparation.py per crearlo."
|
| 38 |
-
)
|
| 39 |
-
|
| 40 |
youtube_ds = load_from_disk(YT_PROCESSED_PATH)
|
| 41 |
|
| 42 |
app = FastAPI(
|
|
|
|
| 7 |
from datasets import load_dataset, load_from_disk
|
| 8 |
import torch
|
| 9 |
import random
|
| 10 |
+
import subprocess
|
| 11 |
|
| 12 |
# Caricamento del modello e dei dati se già scaricati
|
| 13 |
MODEL= "cardiffnlp/twitter-roberta-base-sentiment-latest"
|
|
|
|
| 21 |
labels = ["negative", "neutral", "positive"]
|
| 22 |
|
| 23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
+
# TWEET EVAL
|
| 26 |
+
if not os.path.exists(TWEET_PROCESSED_PATH):
|
| 27 |
+
print(f"Dataset Tweet Eval non trovato in {TWEET_PROCESSED_PATH}. Lo genero...")
|
| 28 |
+
subprocess.run(["python", "src/data_preparation.py", "tweet_eval"], check=True)
|
| 29 |
tweet_eval = load_from_disk(TWEET_PROCESSED_PATH)
|
| 30 |
|
| 31 |
|
| 32 |
+
# YOUTUBE COMMENTS
|
| 33 |
if not os.path.exists(YT_PROCESSED_PATH):
|
| 34 |
+
print(f" Dataset YouTube non trovato in {YT_PROCESSED_PATH}. Lo genero...")
|
| 35 |
+
subprocess.run(["python", "src/data_preparation.py", "youtube"], check=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
youtube_ds = load_from_disk(YT_PROCESSED_PATH)
|
| 37 |
|
| 38 |
app = FastAPI(
|
src/data_preparation.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
from datasets import load_dataset
|
| 2 |
from transformers import AutoTokenizer
|
|
|
|
| 3 |
import re
|
| 4 |
import os
|
| 5 |
|
|
@@ -33,19 +34,6 @@ def map_label(label):
|
|
| 33 |
return label
|
| 34 |
|
| 35 |
|
| 36 |
-
# Download tweet_eval
|
| 37 |
-
tweet_eval = load_dataset("tweet_eval", "sentiment")
|
| 38 |
-
# Download youtube comment dataset
|
| 39 |
-
youtube = load_dataset("AmaanP314/youtube-comment-sentiment")
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
tweet_eval = tweet_eval.map(lambda x: {"text": clean_text(x["text"])})
|
| 43 |
-
youtube = youtube.map(lambda x: {"text": clean_text(x["CommentText"])})
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
youtube = youtube.map(lambda x: {"label": map_label(x["Sentiment"])})
|
| 47 |
-
|
| 48 |
-
|
| 49 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
| 50 |
|
| 51 |
def tokenize_function(examples):
|
|
@@ -61,3 +49,32 @@ youtube_tokenized = youtube.map(tokenize_function, batched=True)
|
|
| 61 |
|
| 62 |
tweet_tokenized.save_to_disk(os.path.join(PROCESSED_DIR, "tweet_eval_tokenized"))
|
| 63 |
youtube_tokenized.save_to_disk(os.path.join(PROCESSED_DIR, "youtube_tokenized"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from datasets import load_dataset
|
| 2 |
from transformers import AutoTokenizer
|
| 3 |
+
import argparse
|
| 4 |
import re
|
| 5 |
import os
|
| 6 |
|
|
|
|
| 34 |
return label
|
| 35 |
|
| 36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
| 38 |
|
| 39 |
def tokenize_function(examples):
|
|
|
|
| 49 |
|
| 50 |
tweet_tokenized.save_to_disk(os.path.join(PROCESSED_DIR, "tweet_eval_tokenized"))
|
| 51 |
youtube_tokenized.save_to_disk(os.path.join(PROCESSED_DIR, "youtube_tokenized"))
|
| 52 |
+
|
| 53 |
+
def prepare_tweet_eval(tokenizer, output_path):
|
| 54 |
+
print("Scarico e preparo il dataset Tweet Eval...")
|
| 55 |
+
ds = load_dataset("tweet_eval", "sentiment")
|
| 56 |
+
ds=ds.map(lambda x: {"text": clean_text(x["text"])})
|
| 57 |
+
ds=ds.map(tokenize_function, batched=True)
|
| 58 |
+
ds.save_to_disk(output_path)
|
| 59 |
+
print(f"Dataset Tweet Eval salvato in {output_path}")
|
| 60 |
+
|
| 61 |
+
def prepare_youtube(tokenizer, output_path):
|
| 62 |
+
print("Scarico e preparo il dataset YouTube Comments...")
|
| 63 |
+
ds = load_dataset("AmaanP314/youtube-comment-sentiment")
|
| 64 |
+
ds = ds.map(lambda x: {"text": clean_text(x["CommentText"])})
|
| 65 |
+
ds = ds.map(lambda x: {"label": map_label(x["Sentiment"])})
|
| 66 |
+
ds.save_to_disk(output_path)
|
| 67 |
+
print(f"Dataset YouTube salvato in {output_path}")
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
if __name__ == "__main__":
|
| 71 |
+
parser = argparse.ArgumentParser(description="Prepara dataset per sentiment analysis.")
|
| 72 |
+
parser.add_argument("dataset", choices=["tweet_eval", "youtube"], help="Nome del dataset da preparare.")
|
| 73 |
+
args = parser.parse_args()
|
| 74 |
+
|
| 75 |
+
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
|
| 76 |
+
|
| 77 |
+
if args.dataset == "tweet_eval":
|
| 78 |
+
prepare_tweet_eval(tokenizer, "data/processed/tweet_eval_tokenized")
|
| 79 |
+
elif args.dataset == "youtube":
|
| 80 |
+
prepare_youtube(tokenizer, "data/processed/youtube_comments")
|