LorenzoBioinfo committed
Commit 11c7c73 · 1 Parent(s): 7695575
Speed test

Files changed (1)
1. src/data_preparation.py  +3 -1
src/data_preparation.py CHANGED

@@ -60,7 +60,7 @@ def safe_load_dataset(name, config=None, max_retries=3, fallback_data=None):
         if attempt < max_retries - 1:
             time.sleep(10)
         else:
-            print(f"⚠️ Errore persistente nel download {name}. Uso dataset di fallback.")
+            print(f"Errore persistente nel download {name}. Uso dataset di fallback.")
             if fallback_data:
                 return Dataset.from_dict(fallback_data).train_test_split(test_size=0.4)
             raise e
@@ -73,6 +73,7 @@ def prepare_tweet_eval(tokenizer, output_path):
         "label": [2, 0, 1, 2, 0],
     }
     ds = safe_load_dataset("tweet_eval", "sentiment", fallback_data=fallback_data)
+    ds = ds.select(range(1000))
     ds = ds.map(lambda x: {"text": clean_text(x["text"])})
     ds = ds.map(tokenize_function, batched=True)
     ds.save_to_disk(output_path)
@@ -86,6 +87,7 @@ def prepare_youtube(tokenizer, output_path):
         "Sentiment": ["positive", "negative", "neutral", "positive", "negative"],
     }
     ds = safe_load_dataset("AmaanP314/youtube-comment-sentiment", fallback_data=fallback_data)
+    ds = ds.select(range(1000))
     ds = ds.map(lambda x: {"text": clean_text(x["CommentText"])})
     ds = ds.map(lambda x: {"label": map_label(x["Sentiment"])})
     ds = ds.map(tokenize_function, batched=True)
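For context, not part of the commit: the fallback branch visible in safe_load_dataset builds an in-memory dataset from a plain Python dict and splits it 60/40. A minimal sketch of that pattern with the Hugging Face datasets library, using hypothetical text values as a stand-in for the script's real fallback_data:

from datasets import Dataset

# Hypothetical stand-in for the fallback_data dict defined in the script.
fallback_data = {
    "text": ["great video", "terrible", "it's ok", "love it", "awful"],
    "label": [2, 0, 1, 2, 0],
}

# Build a Dataset from Python lists, then split it as the code above does (test_size=0.4).
ds = Dataset.from_dict(fallback_data).train_test_split(test_size=0.4)
print(ds["train"].num_rows, ds["test"].num_rows)  # roughly a 3/2 split of the 5 rows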
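Likewise for the lines this commit adds: in the datasets library, Dataset.select(range(n)) returns a new dataset containing only the first n rows, which keeps the cleaning and tokenization pass short for a speed test. A standalone sketch, assuming the tweet_eval train split is loaded directly rather than through safe_load_dataset:

from datasets import load_dataset

# Load a single split so that .select(), a Dataset method, applies directly.
ds = load_dataset("tweet_eval", "sentiment", split="train")

# Keep only the first 1000 examples before cleaning/tokenizing.
ds_small = ds.select(range(1000))
print(len(ds_small))  # 1000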