from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer
import argparse
import re
import os
import time


MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"
PROCESSED_DIR = "data/processed/"

os.makedirs(PROCESSED_DIR, exist_ok=True)


#     HELPER FUNCTIONS


def clean_text(text):
    """Strip URLs, mentions, hashtags, and HTML entities from the text."""
    text = re.sub(r"http\S+", "", text)  # URLs
    text = re.sub(r"@\w+", "", text)  # @mentions
    text = re.sub(r"#\w+", "", text)  # hashtags
    text = re.sub(r"&[a-z]+;", "", text)  # HTML entities such as &amp;
    text = re.sub(r"\s+", " ", text)  # collapse repeated whitespace
    return text.strip()

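# For example, clean_text("Check this out! https://t.co/xyz @user #wow &amp;")
# returns "Check this out!".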

def map_label(label):
    """Map sentiment labels to integer ids: negative=0, neutral=1, positive=2."""
    mapping = {"negative": 0, "neutral": 1, "positive": 2}
    if isinstance(label, str):
        return mapping.get(label.lower(), 1)  # unknown strings default to neutral
    return label

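# For example, map_label("Positive") returns 2, while an integer label is returned unchanged.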

# Global tokenizer, used by tokenize_function below
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )

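# For example, tokenize_function({"text": ["great video"]}) returns "input_ids" and
# "attention_mask", each padded or truncated to 128 tokens per example.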

# ----------------------------- #
#      DATASET PREPARATION      #
# ----------------------------- #


def safe_load_dataset(name, config=None, max_retries=3, fallback_data=None):
    """
    Gestisce i retry del download e crea un dataset di fallback se fallisce.
    """
    for attempt in range(max_retries):
        try:
            if config:
                return load_dataset(name, config)
            return load_dataset(name)
        except Exception as e:
            print(f"Tentativo {attempt+1}/{max_retries} fallito per {name}: {e}")
            if attempt < max_retries - 1:
                time.sleep(10)
            else:
                print(f"Errore persistente nel download {name}. Uso dataset di fallback.")
                if fallback_data:
                    return Dataset.from_dict(fallback_data).train_test_split(test_size=0.4)
                raise e


def prepare_tweet_eval(tokenizer, output_path):
    print("Scarico e preparo il dataset Tweet Eval...")
    fallback_data = {
        "text": ["I love this!", "This is bad", "Just okay", "Great!", "Terrible experience"],
        "label": [2, 0, 1, 2, 0],
    }
    ds = safe_load_dataset("tweet_eval", "sentiment", fallback_data=fallback_data)
    if isinstance(ds, dict) or "train" in ds:  # DatasetDict with named splits
        reduced_splits = {}
        for split in ds.keys():
            reduced_splits[split] = ds[split].select(range(min(1000, len(ds[split]))))
            reduced_splits[split] = reduced_splits[split].map(
                lambda x: {"text": clean_text(x["text"])}
            )
            reduced_splits[split] = reduced_splits[split].map(tokenize_function, batched=True)
        ds = DatasetDict(reduced_splits)
    else:
        ds = ds.select(range(min(1000, len(ds))))
        ds = ds.map(lambda x: {"text": clean_text(x["text"])})
        ds = ds.map(tokenize_function, batched=True)

    ds.save_to_disk(output_path)
    print(f"Dataset Tweet Eval salvato in {output_path}")


def prepare_youtube(tokenizer, output_path):
    print("📥 Scarico e preparo il dataset YouTube Comments...")
    fallback_data = {
        "CommentText": ["Amazing video!", "I hated this", "Not bad", "Loved it", "Awful content"],
        "Sentiment": ["positive", "negative", "neutral", "positive", "negative"],
    }
    ds = safe_load_dataset("AmaanP314/youtube-comment-sentiment", fallback_data=fallback_data)

    if isinstance(ds, dict) or "train" in ds:  # DatasetDict with named splits
        reduced_splits = {}
        for split in ds.keys():
            reduced_splits[split] = ds[split].select(range(min(1000, len(ds[split]))))
            reduced_splits[split] = reduced_splits[split].map(
                lambda x: {
                    "text": clean_text(x["CommentText"]),
                    "label": map_label(x["Sentiment"]),
                }
            )
            reduced_splits[split] = reduced_splits[split].map(tokenize_function, batched=True)
        ds = DatasetDict(reduced_splits)
    else:
        ds = ds.select(range(min(1000, len(ds))))
        ds = ds.map(
            lambda x: {
                "text": clean_text(x["CommentText"]),
                "label": map_label(x["Sentiment"]),
            }
        )
        ds = ds.map(tokenize_function, batched=True)
    ds.save_to_disk(output_path)
    print(f"Dataset YouTube salvato in {output_path}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Prepare datasets for sentiment analysis.")
    parser.add_argument(
        "dataset", choices=["tweet_eval", "youtube"], help="Name of the dataset to prepare."
    )
    args = parser.parse_args()

    if args.dataset == "tweet_eval":
        prepare_tweet_eval(tokenizer, os.path.join(PROCESSED_DIR, "tweet_eval_tokenized"))
    elif args.dataset == "youtube":
        prepare_youtube(tokenizer, os.path.join(PROCESSED_DIR, "youtube_tokenized"))
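
# Example invocation (the script filename "prepare_datasets.py" is assumed here):
#   python prepare_datasets.py tweet_eval
#   python prepare_datasets.py youtube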