Spaces:
Sleeping
Sleeping
LorenzoBioinfo committed on
Commit ·
11c7c73
1
Parent(s): 7695575
Speed test
Browse files
- src/data_preparation.py +3 -1
src/data_preparation.py
CHANGED
|
@@ -60,7 +60,7 @@ def safe_load_dataset(name, config=None, max_retries=3, fallback_data=None):
|
|
| 60 |
if attempt < max_retries - 1:
|
| 61 |
time.sleep(10)
|
| 62 |
else:
|
| 63 |
-
print(f"
|
| 64 |
if fallback_data:
|
| 65 |
return Dataset.from_dict(fallback_data).train_test_split(test_size=0.4)
|
| 66 |
raise e
|
|
@@ -73,6 +73,7 @@ def prepare_tweet_eval(tokenizer, output_path):
|
|
| 73 |
"label": [2, 0, 1, 2, 0],
|
| 74 |
}
|
| 75 |
ds = safe_load_dataset("tweet_eval", "sentiment", fallback_data=fallback_data)
|
|
|
|
| 76 |
ds = ds.map(lambda x: {"text": clean_text(x["text"])})
|
| 77 |
ds = ds.map(tokenize_function, batched=True)
|
| 78 |
ds.save_to_disk(output_path)
|
|
@@ -86,6 +87,7 @@ def prepare_youtube(tokenizer, output_path):
|
|
| 86 |
"Sentiment": ["positive", "negative", "neutral", "positive", "negative"],
|
| 87 |
}
|
| 88 |
ds = safe_load_dataset("AmaanP314/youtube-comment-sentiment", fallback_data=fallback_data)
|
|
|
|
| 89 |
ds = ds.map(lambda x: {"text": clean_text(x["CommentText"])})
|
| 90 |
ds = ds.map(lambda x: {"label": map_label(x["Sentiment"])})
|
| 91 |
ds = ds.map(tokenize_function, batched=True)
|
|
|
|
| 60 |
if attempt < max_retries - 1:
|
| 61 |
time.sleep(10)
|
| 62 |
else:
|
| 63 |
+
print(f"Errore persistente nel download {name}. Uso dataset di fallback.")
|
| 64 |
if fallback_data:
|
| 65 |
return Dataset.from_dict(fallback_data).train_test_split(test_size=0.4)
|
| 66 |
raise e
|
|
|
|
| 73 |
"label": [2, 0, 1, 2, 0],
|
| 74 |
}
|
| 75 |
ds = safe_load_dataset("tweet_eval", "sentiment", fallback_data=fallback_data)
|
| 76 |
+
ds = ds.select(range(1000))
|
| 77 |
ds = ds.map(lambda x: {"text": clean_text(x["text"])})
|
| 78 |
ds = ds.map(tokenize_function, batched=True)
|
| 79 |
ds.save_to_disk(output_path)
|
|
|
|
| 87 |
"Sentiment": ["positive", "negative", "neutral", "positive", "negative"],
|
| 88 |
}
|
| 89 |
ds = safe_load_dataset("AmaanP314/youtube-comment-sentiment", fallback_data=fallback_data)
|
| 90 |
+
ds = ds.select(range(1000))
|
| 91 |
ds = ds.map(lambda x: {"text": clean_text(x["CommentText"])})
|
| 92 |
ds = ds.map(lambda x: {"label": map_label(x["Sentiment"])})
|
| 93 |
ds = ds.map(tokenize_function, batched=True)
|