| |
| """ |
| 150GB Curated STEM Dataset for 7B Model Training |
| Enough for a high-quality 7B model from scratch |
| Total: ~150GB compressed, ~500GB uncompressed |
| """ |
|
|
import json
import random
import time
from pathlib import Path

from datasets import load_dataset
|
|
| def download_7b_dataset(): |
| print("=" * 80) |
| print("DOWNLOADING 150GB STEM DATASET FOR 7B MODEL") |
| print("=" * 80) |
| print("\nβ οΈ This will download ~150GB of data") |
| print(" Estimated time: 4-8 hours depending on connection") |
| print(" Disk space needed: ~500GB after decompression") |
| print("\nPress Ctrl+C to cancel, or wait 5 seconds to continue...") |
| time.sleep(5) |
| |
| data_dir = Path("./data/7b_150gb") |
| data_dir.mkdir(parents=True, exist_ok=True) |
| |
| all_data = [] |
| total_examples = 0 |
| |
| |
| |
| |
| print("\n" + "=" * 80) |
| print("DATASET 1: The Pile (50GB - General text)") |
| print("=" * 80) |
| try: |
| pile = load_dataset("EleutherAI/pile", split="train[:2000000]") |
| for item in pile: |
| text = item.get("text", "") |
| if text and len(text) > 500: |
| all_data.append({ |
| "text": text[:2048], |
| "source": "pile" |
| }) |
| print(f" β Added {len(pile):,} examples") |
| total_examples += len(pile) |
| except Exception as e: |
| print(f" β Failed: {e}") |
| |
| |
| |
| |
| print("\n" + "=" * 80) |
| print("DATASET 2: StarCoder (30GB - Code)") |
| print("=" * 80) |
| try: |
| code = load_dataset("bigcode/starcoderdata", split="train[:1500000]") |
| for item in code: |
| content = item.get("content", "") |
| if content and len(content) > 200: |
| all_data.append({ |
| "text": content[:2048], |
| "source": "starcoder" |
| }) |
| print(f" β Added {len(code):,} examples") |
| total_examples += len(code) |
| except Exception as e: |
| print(f" β Failed: {e}") |
| |
| |
| |
| |
| print("\n" + "=" * 80) |
| print("DATASET 3: C4 (25GB - Clean web text)") |
| print("=" * 80) |
| try: |
| c4 = load_dataset("c4", "en", split="train[:1500000]") |
| for item in c4: |
| text = item.get("text", "") |
| if text and len(text) > 300: |
| all_data.append({ |
| "text": text[:2048], |
| "source": "c4" |
| }) |
| print(f" β Added {len(c4):,} examples") |
| total_examples += len(c4) |
| except Exception as e: |
| print(f" β Failed: {e}") |
| |
| |
| |
| |
| print("\n" + "=" * 80) |
| print("DATASET 4: Proof-Pile-2 (20GB - Math/CS papers)") |
| print("=" * 80) |
| try: |
| proof = load_dataset("EleutherAI/proof-pile-2", split="train[:1000000]") |
| for item in proof: |
| text = item.get("text", "") |
| if text and len(text) > 500: |
| all_data.append({ |
| "text": text[:2048], |
| "source": "proofpile" |
| }) |
| print(f" β Added {len(proof):,} examples") |
| total_examples += len(proof) |
| except Exception as e: |
| print(f" β Failed: {e}") |
| |
| |
| |
| |
| print("\n" + "=" * 80) |
| print("DATASET 5: OpenWebMath (10GB - Math web pages)") |
| print("=" * 80) |
| try: |
| math = load_dataset("open-web-math/open-web-math", split="train[:500000]") |
| for item in math: |
| text = item.get("text", "") |
| if text and len(text) > 300: |
| all_data.append({ |
| "text": text[:2048], |
| "source": "openwebmath" |
| }) |
| print(f" β Added {len(math):,} examples") |
| total_examples += len(math) |
| except Exception as e: |
| print(f" β Failed: {e}") |
| |
| |
| |
| |
| print("\n" + "=" * 80) |
| print("DATASET 6: MetaMathQA (2.5GB - Math problems)") |
| print("=" * 80) |
| try: |
| metamath = load_dataset("meta-math/MetaMathQA", split="train") |
| for item in metamath: |
| text = f"Question: {item.get('query', '')}\nAnswer: {item.get('response', '')}" |
| all_data.append({ |
| "text": text, |
| "source": "metamath" |
| }) |
| print(f" β Added {len(metamath):,} examples") |
| total_examples += len(metamath) |
| except Exception as e: |
| print(f" β Failed: {e}") |
| |
| |
| |
| |
| print("\n" + "=" * 80) |
| print("DATASET 7: CodeFeedback (2GB - Code instructions)") |
| print("=" * 80) |
| try: |
| codefb = load_dataset("m-a-p/CodeFeedback", split="train[:150000]") |
| for item in codefb: |
| text = f"Instruction: {item.get('instruction', '')}\nCode: {item.get('output', '')}" |
| if len(text) > 100: |
| all_data.append({ |
| "text": text[:2048], |
| "source": "codefeedback" |
| }) |
| print(f" β Added {len(codefb):,} examples") |
| total_examples += len(codefb) |
| except Exception as e: |
| print(f" β Failed: {e}") |
| |
| |
| |
| |
| print("\n" + "=" * 80) |
| print("DATASET 8: OpenMathInstruct-2 (2GB - Math problems)") |
| print("=" * 80) |
| try: |
| openmath = load_dataset("nvidia/OpenMathInstruct-2", split="train[:150000]") |
| for item in openmath: |
| text = f"Problem: {item.get('question', '')}\nSolution: {item.get('generated_solution', '')}" |
| all_data.append({ |
| "text": text[:2048], |
| "source": "openmath" |
| }) |
| print(f" β Added {len(openmath):,} examples") |
| total_examples += len(openmath) |
| except Exception as e: |
| print(f" β Failed: {e}") |
| |
| |
| |
| |
| print("\n" + "=" * 80) |
| print("DATASET 9: NuminaMath-CoT (2GB - Math reasoning)") |
| print("=" * 80) |
| try: |
| numina = load_dataset("AI-MO/NuminaMath-CoT", split="train[:100000]") |
| for item in numina: |
| text = f"Problem: {item.get('problem', '')}\nSolution: {item.get('solution', '')}" |
| all_data.append({ |
| "text": text[:2048], |
| "source": "numinamath" |
| }) |
| print(f" β Added {len(numina):,} examples") |
| total_examples += len(numina) |
| except Exception as e: |
| print(f" β Failed: {e}") |
| |
| |
| |
| |
| print("\n" + "=" * 80) |
| print("DATASET 10: ScienceQA (0.5GB - Science questions)") |
| print("=" * 80) |
| try: |
| science = load_dataset("derek-thomas/ScienceQA", split="train") |
| for item in science: |
| text = f"Question: {item.get('question', '')}\nAnswer: {item.get('answer', '')}" |
| all_data.append({ |
| "text": text[:2048], |
| "source": "scienceqa" |
| }) |
| print(f" β Added {len(science):,} examples") |
| total_examples += len(science) |
| except Exception as e: |
| print(f" β Failed: {e}") |
| |
| |
| |
| |
| print("\n" + "=" * 80) |
| print("SAVING DATASET") |
| print("=" * 80) |
| print(f"Total examples collected: {total_examples:,}") |
| print(f"Estimated size: ~150GB compressed, ~500GB uncompressed") |
| |
| |
| import random |
| random.shuffle(all_data) |
| |
| |
| output_path = data_dir / "7b_train.jsonl" |
| with open(output_path, "w") as f: |
| for item in all_data: |
| f.write(json.dumps(item) + "\n") |
| |
| print(f"\nβ Saved to: {output_path}") |
| print(f" File size: {output_path.stat().st_size / 1e9:.1f} GB") |
| |
| |
| metadata = { |
| "total_examples": total_examples, |
| "sources": {} |
| } |
| |
| for item in all_data: |
| src = item['source'] |
| metadata['sources'][src] = metadata['sources'].get(src, 0) + 1 |
| |
| with open(data_dir / "metadata.json", "w") as f: |
| json.dump(metadata, f, indent=2) |
| |
| print("\n" + "=" * 80) |
| print("DATASET BREAKDOWN") |
| print("=" * 80) |
| for src, count in metadata['sources'].items(): |
| print(f" {src}: {count:,} examples") |
| |
| print("\n" + "=" * 80) |
| print("β
DOWNLOAD COMPLETE!") |
| print("=" * 80) |
| print("\nNext step: python3 scripts/04_train_universal.py") |
|
|
| if __name__ == "__main__": |
| download_7b_dataset() |
|
|