"""
Downloads all publicly available HuggingFace datasets automatically.
Datasets requiring registration/request are flagged with instructions.
Run: python scripts/download_all_huggingface_datasets.py
"""
import os
import shutil

from datasets import load_dataset
# Create the output root at import time so every dataset can be saved beneath it.
os.makedirs("data/raw/hf", exist_ok=True)
# Datasets to download, one tuple per dataset:
#   (hf_identifier, config, split, output_subdir)
# config is None when no configuration name is passed to load_dataset;
# output_subdir is the folder name used under data/raw/hf/.
# Removed trust_remote_code — deprecated in newer datasets versions.
# Removed datasets that no longer exist or require custom loading scripts.
HF_DATASETS = [
("liamdugan/raid", None, "train", "raid"),
("Hello-SimpleAI/HC3", "all", "train", "hc3"),
("yaful/MAGE", None, "train", "mage"),
("aadityaubhat/GPT-wiki-intro", None, "train", "gpt_wiki_intro"),
("euclaise/writingprompts", None, "train", "writing_prompts"),
("wikitext", "wikitext-103-raw-v1", "train", "wikitext103"),
("paws", "labeled_final", "train", "paws"),
]
def main():
    """Download every dataset listed in HF_DATASETS into data/raw/hf/.

    For each ``(hf_id, config, split, subdir)`` entry the requested split is
    loaded with ``load_dataset`` and written to ``data/raw/hf/<subdir>`` with
    ``save_to_disk``. Entries whose output directory already exists are
    skipped, so the script can safely be re-run. A failure on one dataset is
    reported and the loop continues with the next one (best effort).

    After the download loop, the datasets that cannot be fetched
    automatically are printed together with the manual step each one needs.
    """
    for hf_id, config, split, subdir in HF_DATASETS:
        out_path = f"data/raw/hf/{subdir}"
        if os.path.exists(out_path):
            print(f"✓ Already exists: {subdir}")
            continue

        print(f"Downloading: {hf_id}...")
        try:
            # A falsy config means "no named configuration"; passing None is
            # equivalent to omitting the argument.
            ds = load_dataset(hf_id, config or None, split=split)
            ds.save_to_disk(out_path)
        except Exception as e:
            # out_path did not exist before this attempt, so anything found
            # there now is a partial save. Remove it; otherwise the next run
            # would report the dataset as already downloaded and skip it.
            shutil.rmtree(out_path, ignore_errors=True)
            print(f" ✗ Failed: {hf_id} — {e}")
        else:
            print(f" ✓ Saved to {out_path} ({len(ds)} examples)")

    # Datasets requiring manual action
    MANUAL_DATASETS = {
        "google/clang8": "Requires custom loading script — download manually from HF page",
        "openwebtext": "Very large (40GB) — download separately if needed",
        "W&I+LOCNESS": "✓ Already downloaded (data/raw/wi+locness/)",
        "FCE Corpus": "✓ Already downloaded (data/raw/fce/)",
        "GYAFC": "Unavailable — skipped",
        "Kaggle shanegerami": "Run: bash scripts/download_kaggle_datasets.sh",
        "Kaggle starblasters8": "Run: bash scripts/download_kaggle_datasets.sh",
    }
    print("\n── Datasets requiring manual action ──")
    for name, note in MANUAL_DATASETS.items():
        print(f" {name}: {note}")


if __name__ == "__main__":
    main()