""" prepare_code_data.py - Prepares the-stack-smol dataset for code completion validation. This script: 1. Downloads Python code from HuggingFace (streaming) 2. Filters and cleans the code 3. Tokenizes at character level 4. Saves in binary format for training Usage: python validation/prepare_code_data.py """ import os import pickle import numpy as np from tqdm import tqdm # Settings DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') TARGET_SIZE_CHARS = 5_000_000 # ~5MB of Python code MIN_FILE_SIZE = 100 # Ignore very small files MAX_FILE_SIZE = 10000 # Ignore very large files TRAIN_SPLIT = 0.9 # 90% train, 10% validation def download_python_code(target_chars: int) -> str: """ Downloads Python code from the-stack-smol via streaming. Does not download the entire dataset, only what is needed. """ from datasets import load_dataset print("๐Ÿ”น Downloading Python code from the-stack-smol...") print(" (Using streaming, not downloading entire dataset)") try: # Streaming: download only what we need dataset = load_dataset( "bigcode/the-stack-smol", data_dir="data/python", split="train", streaming=True ) except Exception as e: print(f"โŒ Error accessing HuggingFace: {e}") print(" Trying alternative dataset...") # Fallback to another code dataset dataset = load_dataset( "codeparrot/codeparrot-clean", split="train", streaming=True ) code_samples = [] current_len = 0 progress = tqdm(desc="Collecting code", total=target_chars, unit="chars") for sample in dataset: # Extract code content code = sample.get('content', sample.get('code', '')) if not code: continue # Quality filters if len(code) < MIN_FILE_SIZE or len(code) > MAX_FILE_SIZE: continue # Ignore files with many non-ASCII chars (binaries, etc) try: code.encode('ascii') except UnicodeEncodeError: # Allow some special characters but filter too many non_ascii = sum(1 for c in code if ord(c) > 127) if non_ascii / len(code) > 0.1: # More than 10% non-ASCII continue # Normalize indentation (convert tabs to 4 spaces) code = code.replace('\t', ' ') code_samples.append(code) current_len += len(code) progress.update(len(code)) if current_len >= target_chars: break progress.close() # Join with special separator separator = "\n\n# === END OF FILE ===\n\n" full_text = separator.join(code_samples) return full_text def build_vocabulary(text: str) -> dict: """ Builds character vocabulary. Returns dictionaries stoi (char->int) and itos (int->char). """ chars = sorted(list(set(text))) vocab_size = len(chars) stoi = {ch: i for i, ch in enumerate(chars)} itos = {i: ch for i, ch in enumerate(chars)} return { 'vocab_size': vocab_size, 'stoi': stoi, 'itos': itos, 'chars': chars } def encode_text(text: str, stoi: dict) -> np.ndarray: """Encodes text to integer array.""" return np.array([stoi[c] for c in text], dtype=np.uint16) def prepare_dataset(): """Main preparation pipeline.""" print("=" * 60) print("๐Ÿงช PREPARING CODE DATASET FOR VALIDATION") print("=" * 60) # Create data directory os.makedirs(DATA_DIR, exist_ok=True) # 1. Download code print(f"\n๐Ÿ“ฅ Downloading ~{TARGET_SIZE_CHARS / 1e6:.1f}MB of Python code...") code_text = download_python_code(TARGET_SIZE_CHARS) print(f"\n๐Ÿ“Š Statistics:") print(f" Total characters: {len(code_text):,}") print(f" Size on disk: {len(code_text) / 1024 / 1024:.2f} MB") # 2. Build vocabulary print("\n๐Ÿ”ค Building vocabulary...") vocab = build_vocabulary(code_text) print(f" Vocab size: {vocab['vocab_size']}") print(f" Characters (sample): {''.join(vocab['chars'][:50])}...") # Save vocabulary meta_path = os.path.join(DATA_DIR, 'meta.pkl') with open(meta_path, 'wb') as f: pickle.dump(vocab, f) print(f" Saved to: {meta_path}") # 3. Split train/validation print("\nโœ‚๏ธ Splitting train/validation...") n = len(code_text) split_idx = int(n * TRAIN_SPLIT) train_text = code_text[:split_idx] val_text = code_text[split_idx:] print(f" Train: {len(train_text):,} chars ({TRAIN_SPLIT*100:.0f}%)") print(f" Validation: {len(val_text):,} chars ({(1-TRAIN_SPLIT)*100:.0f}%)") # 4. Encode and save print("\n๐Ÿ’พ Encoding and saving...") train_ids = encode_text(train_text, vocab['stoi']) val_ids = encode_text(val_text, vocab['stoi']) train_path = os.path.join(DATA_DIR, 'train.bin') val_path = os.path.join(DATA_DIR, 'val.bin') train_ids.tofile(train_path) val_ids.tofile(val_path) print(f" Train saved to: {train_path}") print(f" Validation saved to: {val_path}") # 5. Create statistics file stats = { 'total_chars': len(code_text), 'train_chars': len(train_text), 'val_chars': len(val_text), 'vocab_size': vocab['vocab_size'], 'source': 'bigcode/the-stack-smol' } stats_path = os.path.join(DATA_DIR, 'stats.pkl') with open(stats_path, 'wb') as f: pickle.dump(stats, f) print("\n" + "=" * 60) print("โœ… DATASET PREPARED SUCCESSFULLY!") print("=" * 60) print(f"\nNext step: python validation/code/train_code.py") return stats if __name__ == '__main__': prepare_dataset()