"""
prepare_code_data.py - Prepares the-stack-smol dataset for code completion validation.

This script:
1. Downloads Python code from HuggingFace (streaming)
2. Filters and cleans the code
3. Tokenizes at character level
4. Saves in binary format for training

Usage:
    python validation/prepare_code_data.py
"""

import os
import pickle
import numpy as np
from tqdm import tqdm

# Settings
DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
TARGET_SIZE_CHARS = 5_000_000  # ~5MB of Python code
MIN_FILE_SIZE = 100  # Skip files shorter than 100 characters
MAX_FILE_SIZE = 10000  # Skip files longer than 10,000 characters
TRAIN_SPLIT = 0.9  # 90% train, 10% validation


def download_python_code(target_chars: int) -> str:
    """
    Downloads Python code from the-stack-smol via streaming.
    Does not download the entire dataset, only what is needed.
    """
    from datasets import load_dataset
    
    print("🔹 Downloading Python code from the-stack-smol...")
    print("   (Using streaming, not downloading entire dataset)")
    
    try:
        # Streaming: download only what we need
        dataset = load_dataset(
            "bigcode/the-stack-smol",
            data_dir="data/python",
            split="train",
            streaming=True
        )
    except Exception as e:
        print(f"❌ Error accessing HuggingFace: {e}")
        print("   Trying alternative dataset...")
        # Fallback to another code dataset
        dataset = load_dataset(
            "codeparrot/codeparrot-clean",
            split="train",
            streaming=True
        )
    
    code_samples = []
    current_len = 0
    
    progress = tqdm(desc="Collecting code", total=target_chars, unit="chars")
    
    for sample in dataset:
        # Extract code content
        code = sample.get('content', sample.get('code', ''))
        
        if not code:
            continue
            
        # Quality filters
        if len(code) < MIN_FILE_SIZE or len(code) > MAX_FILE_SIZE:
            continue
            
        # Skip files dominated by non-ASCII chars (binaries, non-English text, etc.)
        try:
            code.encode('ascii')
        except UnicodeEncodeError:
            # Allow occasional special characters, but skip files where
            # non-ASCII content exceeds 10% of all characters
            non_ascii = sum(1 for c in code if ord(c) > 127)
            if non_ascii / len(code) > 0.1:
                continue
        
        # Normalize indentation (convert tabs to 4 spaces)
        code = code.replace('\t', '    ')
        
        code_samples.append(code)
        current_len += len(code)
        progress.update(len(code))
        
        if current_len >= target_chars:
            break
    
    progress.close()
    
    # Join with special separator
    separator = "\n\n# === END OF FILE ===\n\n"
    full_text = separator.join(code_samples)
    
    return full_text


def build_vocabulary(text: str) -> dict:
    """
    Builds a character-level vocabulary.
    Returns a dict with vocab_size, stoi (char->int), itos (int->char),
    and the sorted character list.
    """
    chars = sorted(list(set(text)))
    vocab_size = len(chars)
    
    stoi = {ch: i for i, ch in enumerate(chars)}
    itos = {i: ch for i, ch in enumerate(chars)}
    
    return {
        'vocab_size': vocab_size,
        'stoi': stoi,
        'itos': itos,
        'chars': chars
    }
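
# Toy illustration of the mapping (for the reader; not executed):
#     build_vocabulary("aba") -> {'vocab_size': 2,
#                                 'stoi': {'a': 0, 'b': 1},
#                                 'itos': {0: 'a', 1: 'b'},
#                                 'chars': ['a', 'b']}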


def encode_text(text: str, stoi: dict) -> np.ndarray:
    """Encodes text to an integer array (uint16 assumes vocab_size <= 65,536)."""
    return np.array([stoi[c] for c in text], dtype=np.uint16)
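

# Hypothetical convenience helper, not in the original script: the inverse
# of encode_text, handy for spot-checking that encoding round-trips.
def decode_text(ids: np.ndarray, itos: dict) -> str:
    """Decodes an integer array back to text via the itos mapping."""
    return ''.join(itos[int(i)] for i in ids)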


def prepare_dataset():
    """Main preparation pipeline."""
    
    print("=" * 60)
    print("🧪 PREPARING CODE DATASET FOR VALIDATION")
    print("=" * 60)
    
    # Create data directory
    os.makedirs(DATA_DIR, exist_ok=True)
    
    # 1. Download code
    print(f"\n📥 Downloading ~{TARGET_SIZE_CHARS / 1e6:.1f}MB of Python code...")
    code_text = download_python_code(TARGET_SIZE_CHARS)
    
    print(f"\n📊 Statistics:")
    print(f"   Total characters: {len(code_text):,}")
    print(f"   Size on disk: {len(code_text) / 1024 / 1024:.2f} MB")
    
    # 2. Build vocabulary
    print("\n🔤 Building vocabulary...")
    vocab = build_vocabulary(code_text)
    print(f"   Vocab size: {vocab['vocab_size']}")
    print(f"   Characters (sample): {''.join(vocab['chars'][:50])}...")
    
    # Save vocabulary
    meta_path = os.path.join(DATA_DIR, 'meta.pkl')
    with open(meta_path, 'wb') as f:
        pickle.dump(vocab, f)
    print(f"   Saved to: {meta_path}")
    
    # 3. Split train/validation
    print("\n✂️ Splitting train/validation...")
    n = len(code_text)
    split_idx = int(n * TRAIN_SPLIT)
    
    train_text = code_text[:split_idx]
    val_text = code_text[split_idx:]
    
    print(f"   Train: {len(train_text):,} chars ({TRAIN_SPLIT*100:.0f}%)")
    print(f"   Validation: {len(val_text):,} chars ({(1-TRAIN_SPLIT)*100:.0f}%)")
    
    # 4. Encode and save
    print("\n💾 Encoding and saving...")
    
    train_ids = encode_text(train_text, vocab['stoi'])
    val_ids = encode_text(val_text, vocab['stoi'])
    
    train_path = os.path.join(DATA_DIR, 'train.bin')
    val_path = os.path.join(DATA_DIR, 'val.bin')
    
    train_ids.tofile(train_path)
    val_ids.tofile(val_path)
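
    # Optional sanity check (illustrative, commented out): with a decoder
    # like decode_text above, verify that encoding round-trips losslessly:
    #     assert decode_text(train_ids[:1000], vocab['itos']) == train_text[:1000]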
    
    print(f"   Train saved to: {train_path}")
    print(f"   Validation saved to: {val_path}")
    
    # 5. Create statistics file
    stats = {
        'total_chars': len(code_text),
        'train_chars': len(train_text),
        'val_chars': len(val_text),
        'vocab_size': vocab['vocab_size'],
        'source': 'bigcode/the-stack-smol'  # NOTE: not updated if the fallback dataset was used
    }
    
    stats_path = os.path.join(DATA_DIR, 'stats.pkl')
    with open(stats_path, 'wb') as f:
        pickle.dump(stats, f)
    
    print("\n" + "=" * 60)
    print("✅ DATASET PREPARED SUCCESSFULLY!")
    print("=" * 60)
    print(f"\nNext step: python validation/code/train_code.py")
    
    return stats


if __name__ == '__main__':
    prepare_dataset()