from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer
import argparse
import re
import os
import time
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"
PROCESSED_DIR = "data/processed/"
os.makedirs(PROCESSED_DIR, exist_ok=True)

# HELPER FUNCTIONS
def clean_text(text):
    """Remove URLs, mentions, hashtags, and HTML entities from the text."""
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#\w+", "", text)
    text = re.sub(r"&[a-z]+;", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()
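
# Illustrative example (not part of the original pipeline): the regexes above strip
# URLs, @mentions, #hashtags, and HTML entities, then collapse whitespace, e.g.
#   clean_text("Check this https://t.co/abc @user #cool &amp; more")
#   -> "Check this more"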

def map_label(label):
    """Map sentiment labels to integer ids."""
    mapping = {"negative": 0, "neutral": 1, "positive": 2}
    if isinstance(label, str):
        return mapping.get(label.lower(), 1)
    return label
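
# Example (for illustration only): map_label("Positive") -> 2, map_label("NEUTRAL") -> 1,
# map_label(0) -> 0; unrecognised strings fall back to 1 (neutral).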

# Global tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )
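
# Note (illustrative assumption): with padding="max_length" and max_length=128 the
# tokenizer returns fixed-length encodings, e.g.
#   enc = tokenize_function({"text": ["I love this!"]})
#   # enc["input_ids"][0] and enc["attention_mask"][0] both have length 128
# (the RoBERTa tokenizer does not emit token_type_ids).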

# ----------------------------- #
#     DATASET PREPARATION       #
# ----------------------------- #
def safe_load_dataset(name, config=None, max_retries=3, fallback_data=None):
    """
    Retry the dataset download and fall back to a small in-memory dataset if it keeps failing.
    """
    for attempt in range(max_retries):
        try:
            if config:
                return load_dataset(name, config)
            return load_dataset(name)
        except Exception as e:
            print(f"Attempt {attempt+1}/{max_retries} failed for {name}: {e}")
            if attempt < max_retries - 1:
                time.sleep(10)
            else:
                print(f"Persistent error downloading {name}. Using the fallback dataset.")
                if fallback_data:
                    return Dataset.from_dict(fallback_data).train_test_split(test_size=0.4)
                raise e

def prepare_tweet_eval(tokenizer, output_path):
    print("Downloading and preparing the Tweet Eval dataset...")
    fallback_data = {
        "text": ["I love this!", "This is bad", "Just okay", "Great!", "Terrible experience"],
        "label": [2, 0, 1, 2, 0],
    }
    ds = safe_load_dataset("tweet_eval", "sentiment", fallback_data=fallback_data)
    if isinstance(ds, (dict, DatasetDict)):
        reduced_splits = {}
        for split in ds.keys():
            reduced_splits[split] = ds[split].select(range(min(1000, len(ds[split]))))
            reduced_splits[split] = reduced_splits[split].map(
                lambda x: {"text": clean_text(x["text"])}
            )
            reduced_splits[split] = reduced_splits[split].map(tokenize_function, batched=True)
        ds = DatasetDict(reduced_splits)
    else:
        ds = ds.select(range(min(1000, len(ds))))
        ds = ds.map(lambda x: {"text": clean_text(x["text"])})
        ds = ds.map(tokenize_function, batched=True)
    ds.save_to_disk(output_path)
    print(f"Tweet Eval dataset saved to {output_path}")

def prepare_youtube(tokenizer, output_path):
    print("📥 Downloading and preparing the YouTube Comments dataset...")
    fallback_data = {
        "CommentText": ["Amazing video!", "I hated this", "Not bad", "Loved it", "Awful content"],
        "Sentiment": ["positive", "negative", "neutral", "positive", "negative"],
    }
    ds = safe_load_dataset("AmaanP314/youtube-comment-sentiment", fallback_data=fallback_data)
    if isinstance(ds, (dict, DatasetDict)):
        reduced_splits = {}
        for split in ds.keys():
            reduced_splits[split] = ds[split].select(range(min(1000, len(ds[split]))))
            reduced_splits[split] = reduced_splits[split].map(
                lambda x: {
                    "text": clean_text(x["CommentText"]),
                    "label": map_label(x["Sentiment"]),
                }
            )
            reduced_splits[split] = reduced_splits[split].map(tokenize_function, batched=True)
        ds = DatasetDict(reduced_splits)
    else:
        ds = ds.select(range(min(1000, len(ds))))
        ds = ds.map(
            lambda x: {
                "text": clean_text(x["CommentText"]),
                "label": map_label(x["Sentiment"]),
            }
        )
        ds = ds.map(tokenize_function, batched=True)
    ds.save_to_disk(output_path)
    print(f"YouTube dataset saved to {output_path}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Prepare datasets for sentiment analysis.")
    parser.add_argument(
        "dataset", choices=["tweet_eval", "youtube"], help="Name of the dataset to prepare."
    )
    args = parser.parse_args()

    if args.dataset == "tweet_eval":
        prepare_tweet_eval(tokenizer, os.path.join(PROCESSED_DIR, "tweet_eval_tokenized"))
    elif args.dataset == "youtube":
        prepare_youtube(tokenizer, os.path.join(PROCESSED_DIR, "youtube_tokenized"))
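
# Example usage (sketch; the actual script file name is an assumption):
#   python prepare_data.py tweet_eval
#   python prepare_data.py youtube
# The processed splits can then be reloaded with datasets.load_from_disk, e.g.:
#   from datasets import load_from_disk
#   ds = load_from_disk("data/processed/tweet_eval_tokenized")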