"""
Download the publicly available Hugging Face datasets listed in HF_DATASETS.

Each dataset is saved under data/raw/hf/. Datasets that cannot be fetched
automatically (for example those requiring registration/request) are
printed with instructions for handling them manually.

Run: python scripts/download_all_huggingface_datasets.py
"""

import os
import shutil

from datasets import load_dataset

# Create the output root that the per-dataset paths in main() live under.
os.makedirs("data/raw/hf", exist_ok=True)


# Datasets that main() downloads automatically.
# Each entry is (hf_id, config, split, subdir):
#   hf_id  - dataset identifier passed to load_dataset
#   config - configuration name passed to load_dataset, or None for no config
#   split  - split requested from load_dataset
#   subdir - directory name under data/raw/hf/ that the dataset is saved to
HF_DATASETS = [
    ("liamdugan/raid", None, "train", "raid"),
    ("Hello-SimpleAI/HC3", "all", "train", "hc3"),
    ("yaful/MAGE", None, "train", "mage"),
    ("aadityaubhat/GPT-wiki-intro", None, "train", "gpt_wiki_intro"),
    ("euclaise/writingprompts", None, "train", "writing_prompts"),
    ("wikitext", "wikitext-103-raw-v1", "train", "wikitext103"),
    ("paws", "labeled_final", "train", "paws"),
]


# Datasets this script does not fetch, mapped to the note printed for each
# one at the end of main().
MANUAL_DATASETS = {
    "google/clang8": "Requires custom loading script — download manually from HF page",
    "openwebtext": "Very large (40GB) — download separately if needed",
    "W&I+LOCNESS": "✓ Already downloaded (data/raw/wi+locness/)",
    "FCE Corpus": "✓ Already downloaded (data/raw/fce/)",
    "GYAFC": "Unavailable — skipped",
    "Kaggle shanegerami": "Run: bash scripts/download_kaggle_datasets.sh",
    "Kaggle starblasters8": "Run: bash scripts/download_kaggle_datasets.sh",
}


def main():
    """Download every dataset in ``HF_DATASETS``, then list the manual ones.

    Each entry is fetched with ``load_dataset`` and written with
    ``save_to_disk`` to ``data/raw/hf/<subdir>``. Entries whose output path
    already exists are skipped. A failure is reported and the loop moves on
    to the next entry, so one broken dataset does not stop the batch.

    After the loop, the notes in ``MANUAL_DATASETS`` are printed.
    """
    for hf_id, config, split, subdir in HF_DATASETS:
        out_path = f"data/raw/hf/{subdir}"
        if os.path.exists(out_path):
            print(f"✓ Already exists: {subdir}")
            continue

        # Save into a sibling directory and rename it into place only after
        # the save completed. Otherwise an interrupted run would leave a
        # half-written out_path that the existence check above would treat
        # as a finished download on every later run.
        partial_path = f"{out_path}.partial"
        try:
            print(f"Downloading: {hf_id}...")
            # config may be None; load_dataset's configuration name defaults
            # to None, so no separate call is needed for that case.
            ds = load_dataset(hf_id, config, split=split)
            # Drop leftovers from a previously interrupted attempt.
            shutil.rmtree(partial_path, ignore_errors=True)
            ds.save_to_disk(partial_path)
            os.replace(partial_path, out_path)
            print(f" ✓ Saved to {out_path} ({len(ds)} examples)")
        except Exception as e:  # deliberate best-effort: keep going
            shutil.rmtree(partial_path, ignore_errors=True)
            print(f" ✗ Failed: {hf_id} — {e}")

    print("\n── Datasets requiring manual action ──")
    for name, note in MANUAL_DATASETS.items():
        print(f" {name}: {note}")


# Run the downloads only when this file is executed as a script.
if __name__ == "__main__":
    main()