"""
prepare_large_data.py - Prepares large dataset (50-100MB) for memory validation.
Unlike the code completion dataset, this downloads MUCH more code
to train a model that truly learns long-term patterns.
Usage:
python validation/memory/prepare_large_data.py --size 50 # 50MB
python validation/memory/prepare_large_data.py --size 100 # 100MB
"""
import os
import pickle
import argparse

import numpy as np
from tqdm import tqdm
# Settings
DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
MIN_FILE_SIZE = 200    # minimum file length, in characters
MAX_FILE_SIZE = 15000  # maximum file length, in characters
TRAIN_SPLIT = 0.95     # 95% train, 5% validation (more training data)


def download_large_python_dataset(target_mb: int) -> str:
    """
    Downloads a large Python code dataset.

    Args:
        target_mb: Target size in megabytes (50, 100, etc.)
    """
    from datasets import load_dataset

    target_chars = target_mb * 1_000_000  # ~1 char = 1 byte

    print(f"🔹 Downloading ~{target_mb}MB of Python code...")
    print("   This may take a few minutes...")

    # Try multiple datasets to collect enough data
    datasets_to_try = [
        ("bigcode/the-stack-smol", "data/python"),
        ("codeparrot/codeparrot-clean", None),
    ]

    code_samples = []
    current_len = 0

    for dataset_name, data_dir in datasets_to_try:
        if current_len >= target_chars:
            break
        try:
            print(f"\n   📦 Loading: {dataset_name}")
            if data_dir:
                dataset = load_dataset(
                    dataset_name,
                    data_dir=data_dir,
                    split="train",
                    streaming=True,
                )
            else:
                dataset = load_dataset(
                    dataset_name,
                    split="train",
                    streaming=True,
                )
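            # streaming=True iterates samples lazily over the network, so the
            # full dataset is never downloaded; we stop once we have enough.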
            progress = tqdm(
                desc=f"   Collecting from {dataset_name.split('/')[-1]}",
                total=target_chars - current_len,
                unit="chars",
            )
            for sample in dataset:
                code = sample.get('content', sample.get('code', ''))
                if not code:
                    continue
                # Quality filters
                if len(code) < MIN_FILE_SIZE or len(code) > MAX_FILE_SIZE:
                    continue
                # Skip files with too much non-ASCII content
                try:
                    non_ascii = sum(1 for c in code if ord(c) > 127)
                except TypeError:  # non-string content
                    continue
                if non_ascii / len(code) > 0.05:
                    continue
                # Normalize whitespace (tabs -> 4 spaces, CRLF -> LF)
                code = code.replace('\t', '    ')
                code = code.replace('\r\n', '\n')
                code_samples.append(code)
                current_len += len(code)
                progress.update(len(code))
                if current_len >= target_chars:
                    break
            progress.close()
        except Exception as e:
            print(f"   ⚠️ Error with {dataset_name}: {e}")
            continue

    if current_len < target_chars * 0.5:
        print(f"\n⚠️ Warning: only collected {current_len / 1e6:.1f}MB of the requested {target_mb}MB")

    # Join files with a separator so boundaries stay visible in the stream
    separator = "\n\n# === END OF FILE ===\n\n"
    full_text = separator.join(code_samples)
    return full_text


def build_vocabulary(text: str) -> dict:
    """Builds a character-level vocabulary."""
    chars = sorted(set(text))
    vocab_size = len(chars)
    stoi = {ch: i for i, ch in enumerate(chars)}
    itos = {i: ch for i, ch in enumerate(chars)}
    return {
        'vocab_size': vocab_size,
        'stoi': stoi,
        'itos': itos,
        'chars': chars,
    }
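# Hypothetical usage sketch: stoi/itos round-trip text to ids and back.
#   v = build_vocabulary("abca")          # chars -> ['a', 'b', 'c']
#   ids = [v['stoi'][c] for c in "cab"]   # -> [2, 0, 1]
#   ''.join(v['itos'][i] for i in ids)    # -> "cab"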


def prepare_large_dataset(target_mb: int = 50):
    """Main preparation pipeline."""
    print("=" * 60)
    print(f"🧠 PREPARING LARGE DATASET ({target_mb}MB) FOR KILLER TEST")
    print("=" * 60)

    os.makedirs(DATA_DIR, exist_ok=True)

    # 1. Download code
    code_text = download_large_python_dataset(target_mb)
    actual_mb = len(code_text) / 1e6
    print("\n📊 Final statistics:")
    print(f"   Total characters: {len(code_text):,}")
    print(f"   Actual size: {actual_mb:.2f} MB")

    # 2. Vocabulary
    print("\n🔤 Building vocabulary...")
    vocab = build_vocabulary(code_text)
    print(f"   Vocab size: {vocab['vocab_size']}")
    meta_path = os.path.join(DATA_DIR, 'meta.pkl')
    with open(meta_path, 'wb') as f:
        pickle.dump(vocab, f)

    # 3. Split
    print("\n✂️ Splitting train/validation...")
    n = len(code_text)
    split_idx = int(n * TRAIN_SPLIT)
    train_text = code_text[:split_idx]
    val_text = code_text[split_idx:]
    print(f"   Train: {len(train_text)/1e6:.2f} MB")
    print(f"   Validation: {len(val_text)/1e6:.2f} MB")

    # 4. Encode and save
    print("\n💾 Encoding and saving (this may take a while)...")
    stoi = vocab['stoi']
    # Process in chunks to avoid memory overflow
    chunk_size = 10_000_000
    train_path = os.path.join(DATA_DIR, 'train.bin')
    val_path = os.path.join(DATA_DIR, 'val.bin')
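    # Note: uint16 ids assume vocab_size < 65,536; a character-level vocabulary
    # over ASCII-filtered Python source is typically a few hundred symbols.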
    # Train
    with open(train_path, 'wb') as f:
        for i in range(0, len(train_text), chunk_size):
            chunk = train_text[i:i + chunk_size]
            ids = np.array([stoi[c] for c in chunk], dtype=np.uint16)
            ids.tofile(f)
            print(f"\r   Train: {min(i + chunk_size, len(train_text))/1e6:.1f}MB processed", end="")
    print()

    # Val
    with open(val_path, 'wb') as f:
        for i in range(0, len(val_text), chunk_size):
            chunk = val_text[i:i + chunk_size]
            ids = np.array([stoi[c] for c in chunk], dtype=np.uint16)
            ids.tofile(f)

    # 5. Stats
    stats = {
        'target_mb': target_mb,
        'actual_mb': actual_mb,
        'train_chars': len(train_text),
        'val_chars': len(val_text),
        'vocab_size': vocab['vocab_size'],
    }
    with open(os.path.join(DATA_DIR, 'stats.pkl'), 'wb') as f:
        pickle.dump(stats, f)

    print("\n" + "=" * 60)
    print("✅ LARGE DATASET PREPARED!")
    print("=" * 60)
    print("\nNext step: python validation/memory/train_large.py --config medium")
    return stats
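

# Read-back sketch (an assumption about how train_large.py consumes the files):
# the .bin files can be memory-mapped so training never loads them fully into RAM.
#   data = np.memmap(os.path.join(DATA_DIR, 'train.bin'), dtype=np.uint16, mode='r')
#   block = data[0:1024]  # one training window of character ids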

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Prepares the large dataset for the Killer Test')
    parser.add_argument('--size', type=int, default=50, help='Size in MB (default: 50)')
    args = parser.parse_args()
    prepare_large_dataset(args.size)