""" Downloads all publicly available HuggingFace datasets automatically. Datasets requiring registration/request are flagged with instructions. Run: python scripts/download_all_huggingface_datasets.py """ from datasets import load_dataset import os os.makedirs("data/raw/hf", exist_ok=True) # (hf_identifier, config, split, output_subdir) # Removed trust_remote_code — deprecated in newer datasets versions. # Removed datasets that no longer exist or require custom loading scripts. HF_DATASETS = [ ("liamdugan/raid", None, "train", "raid"), ("Hello-SimpleAI/HC3", "all", "train", "hc3"), ("yaful/MAGE", None, "train", "mage"), ("aadityaubhat/GPT-wiki-intro", None, "train", "gpt_wiki_intro"), ("euclaise/writingprompts", None, "train", "writing_prompts"), ("wikitext", "wikitext-103-raw-v1", "train", "wikitext103"), ("paws", "labeled_final", "train", "paws"), ] def main(): for hf_id, config, split, subdir in HF_DATASETS: out_path = f"data/raw/hf/{subdir}" if os.path.exists(out_path): print(f"✓ Already exists: {subdir}") continue try: print(f"Downloading: {hf_id}...") if config: ds = load_dataset(hf_id, config, split=split) else: ds = load_dataset(hf_id, split=split) ds.save_to_disk(out_path) print(f" ✓ Saved to {out_path} ({len(ds)} examples)") except Exception as e: print(f" ✗ Failed: {hf_id} — {e}") # Datasets requiring manual action MANUAL_DATASETS = { "google/clang8": "Requires custom loading script — download manually from HF page", "openwebtext": "Very large (40GB) — download separately if needed", "W&I+LOCNESS": "✓ Already downloaded (data/raw/wi+locness/)", "FCE Corpus": "✓ Already downloaded (data/raw/fce/)", "GYAFC": "Unavailable — skipped", "Kaggle shanegerami": "Run: bash scripts/download_kaggle_datasets.sh", "Kaggle starblasters8":"Run: bash scripts/download_kaggle_datasets.sh", } print("\n── Datasets requiring manual action ──") for name, note in MANUAL_DATASETS.items(): print(f" {name}: {note}") if __name__ == "__main__": main()