LorenzoBioinfo committed on
Commit
d32d7f7
·
1 Parent(s): 4597604

Fix youtube import

Browse files
Files changed (1) hide show
  1. src/data_preparation.py +25 -4
src/data_preparation.py CHANGED
@@ -96,10 +96,31 @@ def prepare_youtube(tokenizer, output_path):
96
  "Sentiment": ["positive", "negative", "neutral", "positive", "negative"],
97
  }
98
  ds = safe_load_dataset("AmaanP314/youtube-comment-sentiment", fallback_data=fallback_data)
99
- ds = ds["train"].select(range(1000))
100
- ds = ds.map(lambda x: {"text": clean_text(x["CommentText"])})
101
- ds = ds.map(lambda x: {"label": map_label(x["Sentiment"])})
102
- ds = ds.map(tokenize_function, batched=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  ds.save_to_disk(output_path)
104
  print(f"Dataset YouTube salvato in {output_path}")
105
 
 
96
  "Sentiment": ["positive", "negative", "neutral", "positive", "negative"],
97
  }
98
  ds = safe_load_dataset("AmaanP314/youtube-comment-sentiment", fallback_data=fallback_data)
99
+
100
+ if isinstance(ds, dict) or "train" in ds:
101
+ reduced_splits = {}
102
+ for split in ds.keys():
103
+ reduced_splits[split] = ds[split].select(range(min(1000, len(ds[split]))))
104
+ reduced_splits[split] = reduced_splits[split].map(
105
+ lambda x: {
106
+ "text": clean_text(x["CommentText"]),
107
+ "label": map_label(x["Sentiment"]),
108
+ }
109
+ )
110
+ reduced_splits[split] = reduced_splits[split].map(tokenize_function, batched=True)
111
+ ds = DatasetDict(reduced_splits)
112
+ else:
113
+
114
+ ds = ds.select(range(min(1000, len(ds))))
115
+ ds = ds.map(
116
+ lambda x: {
117
+ "text": clean_text(x["CommentText"]),
118
+ "label": map_label(x["Sentiment"]),
119
+ }
120
+ )
121
+ # ds = ds.map(lambda x: {"text": clean_text(x["CommentText"])})
122
+ # ds = ds.map(lambda x: {"label": map_label(x["Sentiment"])})
123
+ # ds = ds.map(tokenize_function, batched=True)
124
  ds.save_to_disk(output_path)
125
  print(f"Dataset YouTube salvato in {output_path}")
126