Spaces:
Sleeping
Sleeping
LorenzoBioinfo
committed on
Commit
·
d32d7f7
1
Parent(s):
4597604
Fix YouTube import
Browse files
- src/data_preparation.py +25 -4
src/data_preparation.py
CHANGED
|
@@ -96,10 +96,31 @@ def prepare_youtube(tokenizer, output_path):
|
|
| 96 |
"Sentiment": ["positive", "negative", "neutral", "positive", "negative"],
|
| 97 |
}
|
| 98 |
ds = safe_load_dataset("AmaanP314/youtube-comment-sentiment", fallback_data=fallback_data)
|
| 99 |
-
|
| 100 |
-
ds
|
| 101 |
-
|
| 102 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
ds.save_to_disk(output_path)
|
| 104 |
print(f"Dataset YouTube salvato in {output_path}")
|
| 105 |
|
|
|
|
| 96 |
"Sentiment": ["positive", "negative", "neutral", "positive", "negative"],
|
| 97 |
}
|
| 98 |
ds = safe_load_dataset("AmaanP314/youtube-comment-sentiment", fallback_data=fallback_data)
|
| 99 |
+
|
| 100 |
+
if isinstance(ds, dict) or "train" in ds:
|
| 101 |
+
reduced_splits = {}
|
| 102 |
+
for split in ds.keys():
|
| 103 |
+
reduced_splits[split] = ds[split].select(range(min(1000, len(ds[split]))))
|
| 104 |
+
reduced_splits[split] = reduced_splits[split].map(
|
| 105 |
+
lambda x: {
|
| 106 |
+
"text": clean_text(x["CommentText"]),
|
| 107 |
+
"label": map_label(x["Sentiment"]),
|
| 108 |
+
}
|
| 109 |
+
)
|
| 110 |
+
reduced_splits[split] = reduced_splits[split].map(tokenize_function, batched=True)
|
| 111 |
+
ds = DatasetDict(reduced_splits)
|
| 112 |
+
else:
|
| 113 |
+
|
| 114 |
+
ds = ds.select(range(min(1000, len(ds))))
|
| 115 |
+
ds = ds.map(
|
| 116 |
+
lambda x: {
|
| 117 |
+
"text": clean_text(x["CommentText"]),
|
| 118 |
+
"label": map_label(x["Sentiment"]),
|
| 119 |
+
}
|
| 120 |
+
)
|
| 121 |
+
# ds = ds.map(lambda x: {"text": clean_text(x["CommentText"])})
|
| 122 |
+
# ds = ds.map(lambda x: {"label": map_label(x["Sentiment"])})
|
| 123 |
+
# ds = ds.map(tokenize_function, batched=True)
|
| 124 |
ds.save_to_disk(output_path)
|
| 125 |
print(f"Dataset YouTube salvato in {output_path}")
|
| 126 |
|