# SHOREKEEPER / scripts/01_download_15b_data.py
# NOTE(review): the lines above were Hugging Face page-scrape artifacts
# (author avatar caption, commit message "Restructure to src/ layout with
# attention, per-layer MoE, and working chat", commit 73400c8) — converted
# to comments so the file is valid Python.
#!/usr/bin/env python3
"""
Download MASSIVE datasets for 15B training
200B+ tokens from verified STEM sources
"""
import json
from pathlib import Path
from datasets import load_dataset
def download_15b_data():
    """Download large pretraining corpora and save them as one JSONL file.

    Best-effort downloads five corpora from the Hugging Face Hub
    (The Pile, Proof-Pile-2, StarCoder, C4, OpenWebMath), drops very short
    documents, and writes every kept record to
    ``./data/15b_data/15b_train.jsonl`` as ``{"text": ..., "source": ...}``,
    one JSON object per line.

    Returns:
        None. Side effects: creates the data directory, downloads datasets
        (network), and writes the JSONL file to disk.
    """
    print("=" * 80)
    print("DOWNLOADING 200B+ TOKENS FOR 15B MODEL")
    print("=" * 80)

    data_dir = Path("./data/15b_data")
    data_dir.mkdir(parents=True, exist_ok=True)

    all_data = []

    def _collect(loader, field, min_len, source):
        """Load one corpus and append its usable documents to ``all_data``.

        Args:
            loader: zero-arg callable returning an iterable of dicts
                (deferred so a failing download raises inside the try).
            field: dict key holding the document text for this corpus.
            min_len: minimum text length (exclusive) for a document to be kept.
            source: tag recorded with each kept record.
        """
        try:
            dataset = loader()
            added = 0
            for item in dataset:
                text = item.get(field, "")
                if text and len(text) > min_len:
                    all_data.append({"text": text, "source": source})
                    added += 1
            # Report documents actually kept — the original printed
            # len(dataset), over-counting whenever the filter dropped docs.
            print(f"  βœ“ Added {added:,} examples")
        except Exception as e:
            # Broad catch is deliberate: any single corpus failing (network,
            # auth, schema change) must not abort the other downloads.
            print(f"  βœ— Failed: {e}")

    # 1. The Pile - 800GB, 300B tokens (take 50B)
    print("\n1. The Pile (300B tokens - taking 50B)...")
    print("   This will take 1-2 days...")
    _collect(
        lambda: load_dataset("EleutherAI/pile", split="train[:5000000]"),
        "text", 200, "pile",
    )

    # 2. Proof-Pile-2 - 50B tokens of math/CS papers
    print("\n2. Proof-Pile-2 (50B tokens - taking 20B)...")
    _collect(
        lambda: load_dataset("EleutherAI/proof-pile-2", split="train[:2000000]"),
        "text", 200, "proofpile",
    )

    # 3. StarCoder - 100B tokens of code; note the text field is "content"
    # and the length cutoff is lower (code files are often short).
    print("\n3. StarCoder (100B tokens - taking 30B)...")
    _collect(
        lambda: load_dataset("bigcode/starcoderdata", split="train[:3000000]"),
        "content", 100, "starcoder",
    )

    # 4. C4 - 156GB, 150B tokens (take 30B)
    print("\n4. C4 (150B tokens - taking 30B)...")
    _collect(
        lambda: load_dataset("c4", "en", split="train[:3000000]"),
        "text", 200, "c4",
    )

    # 5. OpenWebMath - 14B tokens of math (take everything)
    print("\n5. OpenWebMath (14B tokens - taking all)...")
    _collect(
        lambda: load_dataset("open-web-math/open-web-math", split="train"),
        "text", 200, "openwebmath",
    )

    print("\n" + "=" * 80)
    print(f"TOTAL EXAMPLES: {len(all_data):,}")
    # Rough estimate only: assumes ~500 tokens per document on average.
    print(f"ESTIMATED TOKENS: {len(all_data) * 500:,}")
    print("=" * 80)

    # Save everything as JSONL, one record per line.
    print("\nSaving to disk...")
    with open(data_dir / "15b_train.jsonl", "w", encoding="utf-8") as f:
        for item in all_data:
            f.write(json.dumps(item) + "\n")

    print(f"βœ“ Saved to: {data_dir}/15b_train.jsonl")
# Script entry point: kick off the full download when run directly
# (not when imported as a module).
if __name__ == "__main__":
    download_15b_data()