| |
| """ |
| 150GB Curated STEM Dataset for 7B Model Training |
| Enough for a high-quality 7B model from scratch |
| Total: ~150GB compressed, ~500GB uncompressed |
| """ |
|
|
import json
import random
import time
from pathlib import Path

from datasets import load_dataset
|
|
| def download_7b_dataset(): |
| print("=" * 80) |
| print("DOWNLOADING 150GB STEM DATASET FOR 7B MODEL") |
| print("=" * 80) |
| print("\nβ οΈ This will download ~150GB of data") |
| print(" Estimated time: 4-8 hours depending on connection") |
| print(" Disk space needed: ~500GB after decompression") |
| print("\nPress Ctrl+C to cancel, or wait 5 seconds to continue...") |
| time.sleep(5) |
| |
| data_dir = Path("./data/7b_150gb") |
| data_dir.mkdir(parents=True, exist_ok=True) |
| |
| all_data = [] |
| total_examples = 0 |
| |
| |
| |
| |
| print("\n" + "=" * 80) |
| print("DATASET 1: The Pile (50GB - General text)") |
| print("=" * 80) |
| try: |
| pile = load_dataset("EleutherAI/pile", split="train[:2000000]") |
| for item in pile: |
| text = item.get("text", "") |
| if text and len(text) > 500: |
| all_data.append({ |
| "text": text[:2048], |
| "source": "pile" |
| }) |
| print(f" β Added {len(pile):,} examples") |
| total_examples += len(pile) |
| except Exception as e: |
| print(f" β Failed: {e}") |
| |
| |
| |
| |
| print("\n" + "=" * 80) |
| print("DATASET 2: StarCoder (30GB - Code)") |
| print("=" * 80) |
| try: |
| code = load_dataset("bigcode/starcoderdata", split="train[:1500000]") |
| for item in code: |
| content = item.get("content", "") |
| if content and len(content) > 200: |
| all_data.append({ |
| "text": content[:2048], |
| "source": "starcoder" |
| }) |
| print(f" β Added {len(code):,} examples") |
| total_examples += len(code) |
| except Exception as e: |
| print(f" β Failed: {e}") |
| |
| |
| |
| |
| print("\n" + "=" * 80) |
| print("DATASET 3: C4 (25GB - Clean web text)") |
| print("=" * 80) |
| try: |
| c4 = load_dataset("c4", "en", split="train[:1500000]") |
| for item in c4: |
| text = item.get("text", "") |
| if text and len(text) > 300: |
| all_data.append({ |
| "text": text[:2048], |
| "source": "c4" |
| }) |
| print(f" β Added {len(c4):,} examples") |
| total_examples += len(c4) |
| except Exception as e: |
| print(f" β Failed: {e}") |
| |
| |
| |
| |
| print("\n" + "=" * 80) |
| print("DATASET 4: Proof-Pile-2 (20GB - Math/CS papers)") |
| print("=" * 80) |
| try: |
| proof = load_dataset("EleutherAI/proof-pile-2", split="train[:1000000]") |
| for item in proof: |
| text = item.get("text", "") |
| if text and len(text) > 500: |
| all_data.append({ |
| "text": text[:2048], |
| "source": "proofpile" |
| }) |
| print(f" β Added {len(proof):,} examples") |
| total_examples += len(proof) |
| except Exception as e: |
| print(f" β Failed: {e}") |
| |
| |
| |
| |
| print("\n" + "=" * 80) |
| print("DATASET 5: OpenWebMath (10GB - Math web pages)") |
| print("=" * 80) |
| try: |
| math = load_dataset("open-web-math/open-web-math", split="train[:500000]") |
| for item in math: |
| text = item.get("text", "") |
| if text and len(text) > 300: |
| all_data.append({ |
| "text": text[:2048], |
| "source": "openwebmath" |
| }) |
| print(f" β Added {len(math):,} examples") |
| total_examples += len(math) |
| except Exception as e: |
| print(f" β Failed: {e}") |
| |
| |
| |
| |
| print("\n" + "=" * 80) |
| print("DATASET 6: MetaMathQA (2.5GB - Math problems)") |
| print("=" * 80) |
| try: |
| metamath = load_dataset("meta-math/MetaMathQA", split="train") |
| for item in metamath: |
| text = f"Question: {item.get('query', '')}\nAnswer: {item.get('response', '')}" |
| all_data.append({ |
| "text": text, |
| "source": "metamath" |
| }) |
| print(f" β Added {len(metamath):,} examples") |
| total_examples += len(metamath) |
| except Exception as e: |
| print(f" β Failed: {e}") |
| |
| |
| |
| |
| print("\n" + "=" * 80) |
| print("DATASET 7: CodeFeedback (2GB - Code instructions)") |
| print("=" * 80) |
| try: |
| codefb = load_dataset("m-a-p/CodeFeedback", split="train[:150000]") |
| for item in codefb: |
| text = f"Instruction: {item.get('instruction', '')}\nCode: {item.get('output', '')}" |
| if len(text) > 100: |
| all_data.append({ |
| "text": text[:2048], |
| "source": "codefeedback" |
| }) |
| print(f" β Added {len(codefb):,} examples") |
| total_examples += len(codefb) |
| except Exception as e: |
| print(f" β Failed: {e}") |
| |
| |
| |
| |
| print("\n" + "=" * 80) |
| print("DATASET 8: OpenMathInstruct-2 (2GB - Math problems)") |
| print("=" * 80) |
| try: |
| openmath = load_dataset("nvidia/OpenMathInstruct-2", split="train[:150000]") |
| for item in openmath: |
| text = f"Problem: {item.get('question', '')}\nSolution: {item.get('generated_solution', '')}" |
| all_data.append({ |
| "text": text[:2048], |
| "source": "openmath" |
| }) |
| print(f" β Added {len(openmath):,} examples") |
| total_examples += len(openmath) |
| except Exception as e: |
| print(f" β Failed: {e}") |
| |
| |
| |
| |
| print("\n" + "=" * 80) |
| print("DATASET 9: NuminaMath-CoT (2GB - Math reasoning)") |
| print("=" * 80) |
| try: |
| numina = load_dataset("AI-MO/NuminaMath-CoT", split="train[:100000]") |
| for item in numina: |
| text = f"Problem: {item.get('problem', '')}\nSolution: {item.get('solution', '')}" |
| all_data.append({ |
| "text": text[:2048], |
| "source": "numinamath" |
| }) |
| print(f" β Added {len(numina):,} examples") |
| total_examples += len(numina) |
| except Exception as e: |
| print(f" β Failed: {e}") |
| |
| |
| |
| |
| print("\n" + "=" * 80) |
| print("DATASET 10: ScienceQA (0.5GB - Science questions)") |
| print("=" * 80) |
| try: |
| science = load_dataset("derek-thomas/ScienceQA", split="train") |
| for item in science: |
| text = f"Question: {item.get('question', '')}\nAnswer: {item.get('answer', '')}" |
| all_data.append({ |
| "text": text[:2048], |
| "source": "scienceqa" |
| }) |
| print(f" β Added {len(science):,} examples") |
| total_examples += len(science) |
| except Exception as e: |
| print(f" β Failed: {e}") |
| |
| |
| |
| |
| print("\n" + "=" * 80) |
| print("SAVING DATASET") |
| print("=" * 80) |
| print(f"Total examples collected: {total_examples:,}") |
| print(f"Estimated size: ~150GB compressed, ~500GB uncompressed") |
| |
| |
| import random |
| random.shuffle(all_data) |
| |
| |
| output_path = data_dir / "7b_train.jsonl" |
| with open(output_path, "w") as f: |
| for item in all_data: |
| f.write(json.dumps(item) + "\n") |
| |
| print(f"\nβ Saved to: {output_path}") |
| print(f" File size: {output_path.stat().st_size / 1e9:.1f} GB") |
| |
| |
| metadata = { |
| "total_examples": total_examples, |
| "sources": {} |
| } |
| |
| for item in all_data: |
| src = item['source'] |
| metadata['sources'][src] = metadata['sources'].get(src, 0) + 1 |
| |
| with open(data_dir / "metadata.json", "w") as f: |
| json.dump(metadata, f, indent=2) |
| |
| print("\n" + "=" * 80) |
| print("DATASET BREAKDOWN") |
| print("=" * 80) |
| for src, count in metadata['sources'].items(): |
| print(f" {src}: {count:,} examples") |
| |
| print("\n" + "=" * 80) |
| print("β
DOWNLOAD COMPLETE!") |
| print("=" * 80) |
| print("\nNext step: python3 scripts/04_train_universal.py") |
|
|
| if __name__ == "__main__": |
| download_7b_dataset() |
|
|