# rewrite / scripts /download_all_huggingface_datasets.py
# morpheuslord's picture
# Add files using upload-large-folder tool
# 3df5819 verified
"""
Downloads all publicly available HuggingFace datasets automatically.
Datasets requiring registration/request are flagged with instructions.
Run: python scripts/download_all_huggingface_datasets.py
"""
import os
import shutil

from datasets import load_dataset
# Module-level setup: make sure the shared output root exists. This runs at
# import time, before main() is called.
os.makedirs("data/raw/hf", exist_ok=True)

# Download table. Each entry is a 4-tuple:
#   (hf_identifier, config, split, output_subdir)
#
#   hf_identifier  dataset id handed to ``load_dataset``
#   config         configuration name, or None when no configuration is passed
#   split          split requested from ``load_dataset``
#   output_subdir  directory name under data/raw/hf/ the dataset is saved to
#
# History notes:
#   * ``trust_remote_code`` was removed — it is deprecated in newer
#     ``datasets`` versions.
#   * Datasets that no longer exist or that require custom loading scripts
#     were removed from this table.
HF_DATASETS = [
    ("liamdugan/raid", None, "train", "raid"),
    ("Hello-SimpleAI/HC3", "all", "train", "hc3"),
    ("yaful/MAGE", None, "train", "mage"),
    ("aadityaubhat/GPT-wiki-intro", None, "train", "gpt_wiki_intro"),
    ("euclaise/writingprompts", None, "train", "writing_prompts"),
    ("wikitext", "wikitext-103-raw-v1", "train", "wikitext103"),
    ("paws", "labeled_final", "train", "paws"),
]
def main():
    """Download every dataset in ``HF_DATASETS`` and list the manual ones.

    For each ``(hf_identifier, config, split, output_subdir)`` entry the
    dataset is loaded with ``load_dataset`` and written with ``save_to_disk``
    to ``data/raw/hf/<output_subdir>``. Entries whose output path already
    exists are skipped, so the script can be re-run to pick up where it left
    off.

    A failure on one dataset is reported on stdout and does not stop the
    remaining downloads (deliberate best-effort behaviour). After the loop a
    fixed list of datasets that need manual action is printed.

    Returns:
        None.
    """
    for hf_id, config, split, subdir in HF_DATASETS:
        out_path = f"data/raw/hf/{subdir}"
        if os.path.exists(out_path):
            print(f"✓ Already exists: {subdir}")
            continue

        # The skip check above only looks at whether ``out_path`` exists, so
        # the dataset is staged in a sibling directory and renamed into place
        # only after the save has finished. An interrupted or failed save
        # therefore never leaves something at ``out_path`` that a later run
        # would mistake for a complete download.
        staging_path = f"{out_path}.partial"
        try:
            print(f"Downloading: {hf_id}...")
            # A falsy config means "no configuration"; None is what
            # load_dataset receives when the argument is omitted.
            ds = load_dataset(hf_id, config or None, split=split)
            # Clear leftovers from a previous run that died mid-save.
            shutil.rmtree(staging_path, ignore_errors=True)
            ds.save_to_disk(staging_path)
            os.replace(staging_path, out_path)
            print(f" ✓ Saved to {out_path} ({len(ds)} examples)")
        except Exception as e:
            # Best-effort: drop any partial output, report, and move on.
            shutil.rmtree(staging_path, ignore_errors=True)
            print(f" ✗ Failed: {hf_id} — {e}")

    # Datasets requiring manual action
    MANUAL_DATASETS = {
        "google/clang8": "Requires custom loading script — download manually from HF page",
        "openwebtext": "Very large (40GB) — download separately if needed",
        "W&I+LOCNESS": "✓ Already downloaded (data/raw/wi+locness/)",
        "FCE Corpus": "✓ Already downloaded (data/raw/fce/)",
        "GYAFC": "Unavailable — skipped",
        "Kaggle shanegerami": "Run: bash scripts/download_kaggle_datasets.sh",
        "Kaggle starblasters8": "Run: bash scripts/download_kaggle_datasets.sh",
    }
    print("\n── Datasets requiring manual action ──")
    for name, note in MANUAL_DATASETS.items():
        print(f" {name}: {note}")


# Run the downloader only when executed as a script, not when imported.
if __name__ == "__main__":
    main()