"""
Download MASSIVE datasets for 15B training
200B+ tokens from verified STEM sources
"""
|
|
import json
from pathlib import Path

from datasets import load_dataset
|
|
def _harvest(writer, banner, dataset_id, config, split, field, min_chars, source):
    """Stream one Hugging Face dataset split straight to *writer* as JSONL.

    Each qualifying record is written as ``{"text": ..., "source": source}``.
    Any failure (network, missing dataset, schema change) is reported and
    swallowed so the remaining sources still get a chance to download.

    Returns the number of records actually written.
    """
    print(banner)
    written = 0
    try:
        # C4 needs its "en" config; the other datasets take no config arg.
        args = (dataset_id, config) if config else (dataset_id,)
        ds = load_dataset(*args, split=split)
        for item in ds:
            text = item.get(field, "")
            # Drop empty / near-empty documents below the length floor.
            if text and len(text) > min_chars:
                writer.write(json.dumps({"text": text, "source": source}) + "\n")
                written += 1
        # NOTE: reports the split size (pre-filter), matching the original
        # script's progress output.
        print(f" β Added {len(ds):,} examples")
    except Exception as e:  # best-effort: keep going if one source fails
        print(f" β Failed: {e}")
    return written


def download_15b_data():
    """Download ~200B tokens of STEM-heavy text for 15B-model pretraining.

    Pulls slices of five public corpora (The Pile, Proof-Pile-2, StarCoder,
    C4, OpenWebMath) and streams them into ``./data/15b_data/15b_train.jsonl``,
    one ``{"text", "source"}`` JSON object per line.  Records are written as
    they arrive instead of being accumulated in memory, so the
    multi-hundred-GB download cannot exhaust RAM.
    """
    print("=" * 80)
    print("DOWNLOADING 200B+ TOKENS FOR 15B MODEL")
    print("=" * 80)

    data_dir = Path("./data/15b_data")
    data_dir.mkdir(parents=True, exist_ok=True)
    out_path = data_dir / "15b_train.jsonl"

    # (banner, dataset_id, config, split, text field, min chars, source tag)
    sources = [
        ("\n1. The Pile (300B tokens - taking 50B)...\n   This will take 1-2 days...",
         "EleutherAI/pile", None, "train[:5000000]", "text", 200, "pile"),
        ("\n2. Proof-Pile-2 (50B tokens - taking 20B)...",
         "EleutherAI/proof-pile-2", None, "train[:2000000]", "text", 200, "proofpile"),
        ("\n3. StarCoder (100B tokens - taking 30B)...",
         "bigcode/starcoderdata", None, "train[:3000000]", "content", 100, "starcoder"),
        ("\n4. C4 (150B tokens - taking 30B)...",
         "c4", "en", "train[:3000000]", "text", 200, "c4"),
        ("\n5. OpenWebMath (14B tokens - taking all)...",
         "open-web-math/open-web-math", None, "train", "text", 200, "openwebmath"),
    ]

    total = 0
    with open(out_path, "w", encoding="utf-8") as f:
        for spec in sources:
            total += _harvest(f, *spec)

    print("\n" + "=" * 80)
    print(f"TOTAL EXAMPLES: {total:,}")
    # Rough estimate: ~500 tokens per example on average.
    print(f"ESTIMATED TOKENS: {total * 500:,}")
    print("=" * 80)

    print(f"β Saved to: {out_path}")
|
|
| if __name__ == "__main__": |
| download_15b_data() |
|
|