"""
prepare_large_data.py - Prepares large dataset (50-100MB) for memory validation.

Unlike the code completion dataset, this downloads MUCH more code
to train a model that truly learns long-term patterns.

Usage:
    python validation/memory/prepare_large_data.py --size 50   # 50MB
    python validation/memory/prepare_large_data.py --size 100  # 100MB
"""

import os
import sys
import pickle
import argparse

import numpy as np

# Settings
DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
MIN_FILE_SIZE = 200
MAX_FILE_SIZE = 15000
TRAIN_SPLIT = 0.95  # 95% train, 5% validation (more training data)

# Files whose share of non-ASCII characters exceeds this ratio are dropped.
MAX_NON_ASCII_RATIO = 0.05
# Marker inserted between consecutive source files in the joined corpus.
FILE_SEPARATOR = "\n\n# === END OF FILE ===\n\n"
# Number of characters encoded per write, to bound peak memory.
ENCODE_CHUNK_SIZE = 10_000_000
# On-disk dtype of the token ids in train.bin / val.bin.
TOKEN_DTYPE = np.uint16


def _passes_quality_filters(code) -> bool:
    """Return True when *code* is a non-empty string that passes all filters.

    A sample is rejected when it is not a string, is empty, falls outside
    the [MIN_FILE_SIZE, MAX_FILE_SIZE] length window, or has more than
    MAX_NON_ASCII_RATIO of its characters above code point 127.
    """
    # Non-string payloads (None, bytes, ...) cannot be character-encoded;
    # skip them here instead of letting one bad sample abort a whole dataset.
    if not isinstance(code, str) or not code:
        return False
    if len(code) < MIN_FILE_SIZE or len(code) > MAX_FILE_SIZE:
        return False
    non_ascii = sum(1 for c in code if ord(c) > 127)
    return non_ascii / len(code) <= MAX_NON_ASCII_RATIO


def _normalize_code(code: str) -> str:
    """Replace tabs with a space and convert CRLF line endings to LF."""
    return code.replace('\t', ' ').replace('\r\n', '\n')


def download_large_python_dataset(target_mb: int) -> str:
    """
    Downloads a large Python code dataset.

    Streams samples from each candidate dataset in turn, keeps those that
    pass the quality filters, and stops once roughly ``target_mb`` megabytes
    of text have been collected. A failure while loading or streaming one
    dataset is reported and the next dataset is tried.

    Args:
        target_mb: Target size in megabytes (50, 100, etc)

    Returns:
        The collected files joined with FILE_SEPARATOR. May be shorter than
        requested (or empty) when the sources cannot supply enough data.
    """
    # Download-only dependencies are imported lazily so the pure helpers in
    # this module stay importable without them.
    from datasets import load_dataset
    from tqdm import tqdm

    target_chars = target_mb * 1_000_000  # ~1 char = 1 byte

    print(f"🔹 Downloading ~{target_mb}MB of Python code...")
    print(" This may take a few minutes...")

    # Try multiple datasets to get enough data
    datasets_to_try = [
        ("bigcode/the-stack-smol", "data/python"),
        ("codeparrot/codeparrot-clean", None),
    ]

    code_samples = []
    current_len = 0

    for dataset_name, data_dir in datasets_to_try:
        if current_len >= target_chars:
            break

        try:
            print(f"\n 📦 Loading: {dataset_name}")

            load_kwargs = {"split": "train", "streaming": True}
            if data_dir:
                load_kwargs["data_dir"] = data_dir
            dataset = load_dataset(dataset_name, **load_kwargs)

            # The context manager closes the bar even if streaming fails.
            with tqdm(
                desc=f" Collecting from {dataset_name.split('/')[-1]}",
                total=target_chars - current_len,
                unit="chars",
            ) as progress:
                for sample in dataset:
                    code = sample.get('content', sample.get('code', ''))
                    if not _passes_quality_filters(code):
                        continue

                    code = _normalize_code(code)
                    code_samples.append(code)
                    current_len += len(code)
                    progress.update(len(code))

                    if current_len >= target_chars:
                        break

        except Exception as e:
            # Best effort: report the failure and fall through to the next source.
            print(f" ⚠️ Error with {dataset_name}: {e}")
            continue

    if current_len < target_chars * 0.5:
        print(f"\n⚠️ Warning: We only got {current_len / 1e6:.1f}MB of {target_mb}MB")

    # Join with separator
    return FILE_SEPARATOR.join(code_samples)


def build_vocabulary(text: str) -> dict:
    """Builds character vocabulary.

    Args:
        text: Corpus whose distinct characters form the vocabulary.

    Returns:
        A dict with ``vocab_size`` (number of distinct characters), ``chars``
        (the characters in sorted order), ``stoi`` (character -> id) and
        ``itos`` (id -> character). Ids are positions in ``chars``.
    """
    chars = sorted(set(text))
    return {
        'vocab_size': len(chars),
        'stoi': {ch: i for i, ch in enumerate(chars)},
        'itos': dict(enumerate(chars)),
        'chars': chars,
    }


def _encode_to_file(text: str, stoi: dict, path: str, progress_label=None) -> None:
    """Encode *text* through *stoi* and write the ids to *path* as TOKEN_DTYPE.

    The text is processed ENCODE_CHUNK_SIZE characters at a time to avoid
    holding the ids of the whole corpus in memory. When *progress_label* is
    given, a single self-overwriting progress line is printed.
    """
    total = len(text)
    with open(path, 'wb') as f:
        for start in range(0, total, ENCODE_CHUNK_SIZE):
            chunk = text[start:start + ENCODE_CHUNK_SIZE]
            ids = np.fromiter(
                (stoi[c] for c in chunk), dtype=TOKEN_DTYPE, count=len(chunk)
            )
            ids.tofile(f)
            if progress_label is not None:
                done = min(start + ENCODE_CHUNK_SIZE, total)
                print(f"\r {progress_label}: {done/1e6:.1f}MB processed", end="")
    if progress_label is not None:
        print()  # terminate the carriage-return progress line


def prepare_large_dataset(target_mb: int = 50):
    """Main preparation pipeline.

    Downloads the corpus, builds the character vocabulary, splits the text
    into train/validation by TRAIN_SPLIT, and writes ``meta.pkl``,
    ``train.bin``, ``val.bin`` and ``stats.pkl`` into DATA_DIR.

    Args:
        target_mb: Target corpus size in megabytes.

    Returns:
        A dict with ``target_mb``, ``actual_mb``, ``train_chars``,
        ``val_chars`` and ``vocab_size``.

    Raises:
        RuntimeError: If no usable code could be downloaded.
        ValueError: If the vocabulary has more entries than TOKEN_DTYPE can
            represent.
    """
    print("=" * 60)
    print(f"🧠 PREPARING LARGE DATASET ({target_mb}MB) FOR KILLER TEST")
    print("=" * 60)

    os.makedirs(DATA_DIR, exist_ok=True)

    # 1. Download code
    code_text = download_large_python_dataset(target_mb)
    if not code_text:
        # Writing empty .bin files and reporting success would only defer
        # the failure to training time.
        raise RuntimeError(
            "No code could be downloaded; check network access and dataset availability."
        )

    actual_mb = len(code_text) / 1e6
    print("\n📊 Final Statistics:")
    print(f" Total characters: {len(code_text):,}")
    print(f" Actual size: {actual_mb:.2f} MB")

    # 2. Vocabulary
    print("\n🔤 Building vocabulary...")
    vocab = build_vocabulary(code_text)
    print(f" Vocab size: {vocab['vocab_size']}")

    # Ids run from 0 to vocab_size - 1 and must all fit in TOKEN_DTYPE.
    max_vocab = int(np.iinfo(TOKEN_DTYPE).max) + 1
    if vocab['vocab_size'] > max_vocab:
        raise ValueError(
            f"Vocabulary size {vocab['vocab_size']} exceeds the {max_vocab} "
            f"ids representable by {np.dtype(TOKEN_DTYPE).name}"
        )

    meta_path = os.path.join(DATA_DIR, 'meta.pkl')
    with open(meta_path, 'wb') as f:
        pickle.dump(vocab, f)

    # 3. Split
    print("\n✂️ Splitting train/validation...")
    split_idx = int(len(code_text) * TRAIN_SPLIT)
    train_text = code_text[:split_idx]
    val_text = code_text[split_idx:]

    print(f" Train: {len(train_text)/1e6:.2f} MB")
    print(f" Validation: {len(val_text)/1e6:.2f} MB")

    # 4. Encode and save
    print("\n💾 Encoding and saving (this may take a while)...")
    stoi = vocab['stoi']

    train_path = os.path.join(DATA_DIR, 'train.bin')
    val_path = os.path.join(DATA_DIR, 'val.bin')

    _encode_to_file(train_text, stoi, train_path, progress_label="Train")
    _encode_to_file(val_text, stoi, val_path)

    # 5. Stats
    stats = {
        'target_mb': target_mb,
        'actual_mb': actual_mb,
        'train_chars': len(train_text),
        'val_chars': len(val_text),
        'vocab_size': vocab['vocab_size'],
    }

    with open(os.path.join(DATA_DIR, 'stats.pkl'), 'wb') as f:
        pickle.dump(stats, f)

    print("\n" + "=" * 60)
    print("✅ LARGE DATASET PREPARED!")
    print("=" * 60)
    print("\nNext step: python validation/memory/train_large.py --config medium")

    return stats


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Prepares large dataset for Killer Test')
    parser.add_argument('--size', type=int, default=50, help='Size in MB (default: 50)')
    args = parser.parse_args()

    if args.size <= 0:
        parser.error('--size must be a positive number of megabytes')

    prepare_large_dataset(args.size)