File size: 2,524 Bytes
3df5819
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
"""
Downloads all publicly available HuggingFace datasets automatically.
Datasets requiring registration/request are flagged with instructions.

Run: python scripts/download_all_huggingface_datasets.py
"""

from datasets import load_dataset
import os

# Make sure the download root is present before anything is fetched.
os.makedirs("data/raw/hf", exist_ok=True)

# Datasets that can be fetched from the HuggingFace Hub without manual steps.
# Every entry is a 4-tuple:
#     (hf_identifier, config, split, output_subdir)
# A config of None means the dataset is loaded without a configuration name;
# output_subdir is the directory name used under data/raw/hf/.
#
# Notes:
#   * trust_remote_code is not passed anywhere - it is deprecated in newer
#     versions of the datasets library.
#   * Datasets that no longer exist, or that need a custom loading script,
#     have been dropped from this table.
HF_DATASETS = [
    ("liamdugan/raid", None, "train", "raid"),
    ("Hello-SimpleAI/HC3", "all", "train", "hc3"),
    ("yaful/MAGE", None, "train", "mage"),
    ("aadityaubhat/GPT-wiki-intro", None, "train", "gpt_wiki_intro"),
    ("euclaise/writingprompts", None, "train", "writing_prompts"),
    ("wikitext", "wikitext-103-raw-v1", "train", "wikitext103"),
    ("paws", "labeled_final", "train", "paws"),
]


def main():
    """Download every dataset in HF_DATASETS, then list the manual ones.

    For each ``(hf_identifier, config, split, output_subdir)`` entry the
    requested split is loaded with ``load_dataset`` and written to
    ``data/raw/hf/<output_subdir>`` with ``save_to_disk``. Entries whose
    output directory already exists are skipped. A failure on one dataset is
    reported and does not stop the remaining downloads.

    Each dataset is first saved to a sibling ``<output_subdir>.partial``
    directory and renamed into place only once the save has finished. An
    interrupted run therefore never leaves a half-written directory that a
    later run would mistake for a completed download.

    After the download loop, prints the datasets that need manual action.
    """
    import shutil  # local import: only needed to clean up partial saves

    for hf_id, config, split, subdir in HF_DATASETS:
        out_path = f"data/raw/hf/{subdir}"
        if os.path.exists(out_path):
            print(f"✓ Already exists: {subdir}")
            continue

        # Staging directory; promoted to out_path only after a full save.
        tmp_path = f"{out_path}.partial"
        try:
            print(f"Downloading: {hf_id}...")
            # A config name of None is the same as not passing one, so a
            # single call covers both the configured and unconfigured case.
            ds = load_dataset(hf_id, config or None, split=split)
            # Drop leftovers from a previously interrupted run before writing.
            shutil.rmtree(tmp_path, ignore_errors=True)
            ds.save_to_disk(tmp_path)
            os.replace(tmp_path, out_path)
            print(f"  ✓ Saved to {out_path} ({len(ds)} examples)")
        except Exception as e:
            # Best-effort batch: clean up, report, and move on to the next one.
            shutil.rmtree(tmp_path, ignore_errors=True)
            print(f"  ✗ Failed: {hf_id} — {e}")

    # Datasets requiring manual action
    MANUAL_DATASETS = {
        "google/clang8":       "Requires custom loading script — download manually from HF page",
        "openwebtext":         "Very large (40GB) — download separately if needed",
        "W&I+LOCNESS":         "✓ Already downloaded (data/raw/wi+locness/)",
        "FCE Corpus":          "✓ Already downloaded (data/raw/fce/)",
        "GYAFC":               "Unavailable — skipped",
        "Kaggle shanegerami":  "Run: bash scripts/download_kaggle_datasets.sh",
        "Kaggle starblasters8":"Run: bash scripts/download_kaggle_datasets.sh",
    }

    print("\n── Datasets requiring manual action ──")
    for name, note in MANUAL_DATASETS.items():
        print(f"  {name}: {note}")


# Run the downloads only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()