rewrite / scripts /download_all_huggingface_datasets.py

Add files using upload-large-folder tool

3df5819 verified 17 days ago

2.52 kB

	"""
	Downloads the publicly available Hugging Face datasets listed in HF_DATASETS.
	Datasets that need manual action (registration, custom loaders, separate
	download scripts) are listed with instructions at the end of the run.

	Run: python scripts/download_all_huggingface_datasets.py
	"""

	from datasets import load_dataset
	import os

	# Make sure the shared output root exists before anything is saved under it.
	os.makedirs("data/raw/hf", exist_ok=True)

	# Datasets fetched by main(). Each entry is a 4-tuple:
	#   (hf_identifier, config, split, output_subdir)
	# A config of None means the dataset is loaded without a configuration name;
	# output_subdir is the directory name used under data/raw/hf/.
	#
	# History:
	#   - trust_remote_code was removed (deprecated in newer datasets versions).
	#   - Datasets that no longer exist or that require custom loading scripts
	#     were removed from this list.
	HF_DATASETS = [
	    ("liamdugan/raid", None, "train", "raid"),
	    ("Hello-SimpleAI/HC3", "all", "train", "hc3"),
	    ("yaful/MAGE", None, "train", "mage"),
	    ("aadityaubhat/GPT-wiki-intro", None, "train", "gpt_wiki_intro"),
	    ("euclaise/writingprompts", None, "train", "writing_prompts"),
	    ("wikitext", "wikitext-103-raw-v1", "train", "wikitext103"),
	    ("paws", "labeled_final", "train", "paws"),
	]


	def main():
	    """Download every dataset in HF_DATASETS, then list the manual ones.

	    Each entry is loaded with ``load_dataset`` and written with
	    ``save_to_disk`` to ``data/raw/hf/<output_subdir>``. Entries whose output
	    directory already exists are skipped. A failure for one dataset is
	    printed and does not stop the remaining downloads.

	    The dataset is first saved to a ``<output_subdir>.partial`` sibling
	    directory and renamed into place only after the save has finished, so an
	    interrupted or failed save never leaves a half-written directory that a
	    later run would mistake for a completed download.

	    After the download loop, the notes for datasets that need manual action
	    are printed.
	    """
	    import shutil  # local import: only needed to clean up partial saves

	    for hf_id, config, split, subdir in HF_DATASETS:
	        out_path = f"data/raw/hf/{subdir}"
	        if os.path.exists(out_path):
	            print(f"✓ Already exists: {subdir}")
	            continue

	        partial_path = f"{out_path}.partial"
	        try:
	            print(f"Downloading: {hf_id}...")
	            # A falsy config means "load without a configuration name".
	            ds = load_dataset(hf_id, name=config or None, split=split)
	            # Drop leftovers from an earlier interrupted run before saving.
	            shutil.rmtree(partial_path, ignore_errors=True)
	            ds.save_to_disk(partial_path)
	            # Publish the finished directory under its final name.
	            os.replace(partial_path, out_path)
	            print(f" ✓ Saved to {out_path} ({len(ds)} examples)")
	        except Exception as e:
	            # Best-effort: clean up, report, and move on to the next dataset.
	            shutil.rmtree(partial_path, ignore_errors=True)
	            print(f" ✗ Failed: {hf_id} — {e}")

	    # Datasets requiring manual action
	    MANUAL_DATASETS = {
	        "google/clang8": "Requires custom loading script — download manually from HF page",
	        "openwebtext": "Very large (40GB) — download separately if needed",
	        "W&I+LOCNESS": "✓ Already downloaded (data/raw/wi+locness/)",
	        "FCE Corpus": "✓ Already downloaded (data/raw/fce/)",
	        "GYAFC": "Unavailable — skipped",
	        "Kaggle shanegerami": "Run: bash scripts/download_kaggle_datasets.sh",
	        "Kaggle starblasters8": "Run: bash scripts/download_kaggle_datasets.sh",
	    }

	    print("\n── Datasets requiring manual action ──")
	    for name, note in MANUAL_DATASETS.items():
	        print(f" {name}: {note}")


	# Run the downloads only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()