#!/usr/bin/env python3
"""
Download MASSIVE datasets for 15B training.

200B+ tokens from verified STEM sources, streamed into a single JSONL
file under ./data/15b_data/.
"""
import json
from pathlib import Path


def write_records(out_file, items, text_key, min_len, source):
    """Filter *items* and append them to *out_file* as JSONL records.

    Each kept item becomes one line: ``{"text": ..., "source": source}``.
    An item is kept when its *text_key* field is present, non-empty, and
    strictly longer than *min_len* characters (matching the original
    per-dataset filters).

    Returns the number of lines actually written.
    """
    written = 0
    for item in items:
        text = item.get(text_key, "")
        if text and len(text) > min_len:
            out_file.write(json.dumps({"text": text, "source": source}) + "\n")
            written += 1
    return written


def download_15b_data():
    """Download all corpora and stream them into 15b_train.jsonl.

    Records are written to disk as they arrive instead of being
    accumulated in a Python list first — at the advertised 200B-token
    scale the original list would exhaust RAM long before completion.
    A failure on any single corpus is reported and skipped so the
    remaining downloads still run (best-effort, as in the original).
    """
    # Imported lazily: `datasets` is a heavy third-party dependency that
    # is only needed when the download actually runs.
    from datasets import load_dataset

    print("=" * 80)
    print("DOWNLOADING 200B+ TOKENS FOR 15B MODEL")
    print("=" * 80)

    data_dir = Path("./data/15b_data")
    data_dir.mkdir(parents=True, exist_ok=True)
    out_path = data_dir / "15b_train.jsonl"

    # One entry per corpus:
    # (banner, load_dataset positional args, split slice, record key,
    #  minimum text length, source tag).
    # Split slices use HF's "train[:N]" syntax to cap how much of each
    # corpus is pulled, mirroring the original per-dataset limits.
    corpora = [
        ("\n1. The Pile (300B tokens - taking 50B)...\n   This will take 1-2 days...",
         ("EleutherAI/pile",), "train[:5000000]", "text", 200, "pile"),
        ("\n2. Proof-Pile-2 (50B tokens - taking 20B)...",
         ("EleutherAI/proof-pile-2",), "train[:2000000]", "text", 200, "proofpile"),
        ("\n3. StarCoder (100B tokens - taking 30B)...",
         ("bigcode/starcoderdata",), "train[:3000000]", "content", 100, "starcoder"),
        ("\n4. C4 (150B tokens - taking 30B)...",
         ("c4", "en"), "train[:3000000]", "text", 200, "c4"),
        ("\n5. OpenWebMath (14B tokens - taking all)...",
         ("open-web-math/open-web-math",), "train", "text", 200, "openwebmath"),
    ]

    total = 0
    with open(out_path, "w") as f:
        for banner, args, split, key, min_len, source in corpora:
            print(banner)
            try:
                ds = load_dataset(*args, split=split)
                added = write_records(f, ds, key, min_len, source)
                total += added
                # Report the count actually kept; the original printed the
                # raw split size even though short texts were dropped.
                print(f"   ✓ Added {added:,} examples")
            except Exception as e:  # best-effort: skip corpora that fail to load
                print(f"   ✗ Failed: {e}")

    print("\n" + "=" * 80)
    print(f"TOTAL EXAMPLES: {total:,}")
    # Rough estimate assuming ~500 tokens per example, as in the original.
    print(f"ESTIMATED TOKENS: {total * 500:,}")
    print("=" * 80)
    print(f"✓ Saved to: {out_path}")


if __name__ == "__main__":
    download_15b_data()