LorenzoBioinfo committed
Commit 11c7c73 · 1 Parent(s): 7695575
Speed test

Files changed (1)
1. src/data_preparation.py  +3 -1
src/data_preparation.py CHANGED

@@ -60,7 +60,7 @@ def safe_load_dataset(name, config=None, max_retries=3, fallback_data=None):
         if attempt < max_retries - 1:
             time.sleep(10)
         else:
-            print(f"⚠️ Errore persistente nel download {name}. Uso dataset di fallback.")
+            print(f"Errore persistente nel download {name}. Uso dataset di fallback.")
             if fallback_data:
                 return Dataset.from_dict(fallback_data).train_test_split(test_size=0.4)
             raise e
@@ -73,6 +73,7 @@ def prepare_tweet_eval(tokenizer, output_path):
         "label": [2, 0, 1, 2, 0],
     }
     ds = safe_load_dataset("tweet_eval", "sentiment", fallback_data=fallback_data)
+    ds = ds.select(range(1000))
     ds = ds.map(lambda x: {"text": clean_text(x["text"])})
     ds = ds.map(tokenize_function, batched=True)
     ds.save_to_disk(output_path)
@@ -86,6 +87,7 @@ def prepare_youtube(tokenizer, output_path):
         "Sentiment": ["positive", "negative", "neutral", "positive", "negative"],
     }
     ds = safe_load_dataset("AmaanP314/youtube-comment-sentiment", fallback_data=fallback_data)
+    ds = ds.select(range(1000))
     ds = ds.map(lambda x: {"text": clean_text(x["CommentText"])})
     ds = ds.map(lambda x: {"label": map_label(x["Sentiment"])})
     ds = ds.map(tokenize_function, batched=True)
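For context, not part of the commit: the fallback branch visible in safe_load_dataset builds an in-memory dataset from a plain Python dict and splits it 60/40. A minimal sketch of that pattern with the Hugging Face datasets library, using hypothetical text values as a stand-in for the script's real fallback_data:

from datasets import Dataset

# Hypothetical stand-in for the fallback_data dict defined in the script.
fallback_data = {
    "text": ["great video", "terrible", "it's ok", "love it", "awful"],
    "label": [2, 0, 1, 2, 0],
}

# Build a Dataset from Python lists, then split it as the code above does (test_size=0.4).
ds = Dataset.from_dict(fallback_data).train_test_split(test_size=0.4)
print(ds["train"].num_rows, ds["test"].num_rows)  # roughly a 3/2 split of the 5 rows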
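Likewise for the lines this commit adds: in the datasets library, Dataset.select(range(n)) returns a new dataset containing only the first n rows, which keeps the cleaning and tokenization pass short for a speed test. A standalone sketch, assuming the tweet_eval train split is loaded directly rather than through safe_load_dataset:

from datasets import load_dataset

# Load a single split so that .select(), a Dataset method, applies directly.
ds = load_dataset("tweet_eval", "sentiment", split="train")

# Keep only the first 1000 examples before cleaning/tokenizing.
ds_small = ds.select(range(1000))
print(len(ds_small))  # 1000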