# RippleGPT-Nano / validation / code / prepare_code_data.py
# Uploaded via huggingface_hub by Tavernari (commit 148b631, verified).
"""
prepare_code_data.py - Prepares the-stack-smol dataset for code completion validation.
This script:
1. Downloads Python code from HuggingFace (streaming)
2. Filters and cleans the code
3. Tokenizes at character level
4. Saves in binary format for training
Usage:
python validation/prepare_code_data.py
"""
import os
import pickle
import numpy as np
from tqdm import tqdm
# Settings for dataset collection and the train/validation split.
DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')  # output dir next to this script
TARGET_SIZE_CHARS = 5_000_000 # ~5MB of Python code to collect (in characters)
MIN_FILE_SIZE = 100 # Ignore very small files (chars) — likely stubs/empty shells
MAX_FILE_SIZE = 10000 # Ignore very large files (chars) — likely generated/data dumps
TRAIN_SPLIT = 0.9 # 90% train, 10% validation (contiguous split, no shuffling)
def download_python_code(target_chars: int) -> str:
    """
    Download roughly *target_chars* characters of Python code via streaming.

    Streams samples from ``bigcode/the-stack-smol`` (falling back to
    ``codeparrot/codeparrot-clean`` if the primary dataset is unreachable),
    applies size and ASCII-ratio quality filters, normalizes tabs to
    4 spaces, and returns the collected files joined by an explicit
    end-of-file separator.

    Args:
        target_chars: Approximate number of characters to collect;
            collection stops once this total is reached.

    Returns:
        A single string containing the concatenated, filtered files.
    """
    from datasets import load_dataset
    print("🔹 Downloading Python code from the-stack-smol...")
    print(" (Using streaming, not downloading entire dataset)")
    try:
        # Streaming: download only what we need, never the full dataset.
        dataset = load_dataset(
            "bigcode/the-stack-smol",
            data_dir="data/python",
            split="train",
            streaming=True
        )
    except Exception as e:
        print(f"❌ Error accessing HuggingFace: {e}")
        print(" Trying alternative dataset...")
        # Fallback to another code dataset.
        dataset = load_dataset(
            "codeparrot/codeparrot-clean",
            split="train",
            streaming=True
        )
    code_samples = []
    current_len = 0
    progress = tqdm(desc="Collecting code", total=target_chars, unit="chars")
    for sample in dataset:
        # Extract code content (field name differs between the two datasets).
        code = sample.get('content', sample.get('code', ''))
        if not code:
            continue
        # Quality filter: skip files outside the size window.
        if len(code) < MIN_FILE_SIZE or len(code) > MAX_FILE_SIZE:
            continue
        # Ignore files dominated by non-ASCII chars (binaries, etc.).
        try:
            code.encode('ascii')
        except UnicodeEncodeError:
            # Allow some special characters but filter out heavy offenders.
            non_ascii = sum(1 for c in code if ord(c) > 127)
            if non_ascii / len(code) > 0.1:  # more than 10% non-ASCII
                continue
        # Normalize indentation: convert tabs to 4 spaces.
        # BUG FIX: the original replaced each tab with a SINGLE space,
        # contradicting its own comment and collapsing indentation levels.
        code = code.replace('\t', '    ')
        code_samples.append(code)
        current_len += len(code)
        progress.update(len(code))
        if current_len >= target_chars:
            break
    progress.close()
    # Join files with a separator the model can learn as a file boundary.
    separator = "\n\n# === END OF FILE ===\n\n"
    full_text = separator.join(code_samples)
    return full_text
def build_vocabulary(text: str) -> dict:
    """
    Build a character-level vocabulary from *text*.

    Returns a dict with 'vocab_size', 'stoi' (char -> id),
    'itos' (id -> char), and the sorted 'chars' list.
    """
    alphabet = sorted(set(text))
    char_to_id = {}
    id_to_char = {}
    for idx, symbol in enumerate(alphabet):
        char_to_id[symbol] = idx
        id_to_char[idx] = symbol
    return {
        'vocab_size': len(alphabet),
        'stoi': char_to_id,
        'itos': id_to_char,
        'chars': alphabet
    }
def encode_text(text: str, stoi: dict) -> np.ndarray:
    """Map each character of *text* to its integer id as a uint16 array."""
    ids = (stoi[ch] for ch in text)
    return np.fromiter(ids, dtype=np.uint16, count=len(text))
def prepare_dataset():
    """
    Run the full preparation pipeline: download code, build the character
    vocabulary, split train/validation, encode, and persist everything
    under DATA_DIR.

    Returns the statistics dict (also pickled to stats.pkl).
    """
    banner = "=" * 60
    print(banner)
    print("🧪 PREPARING CODE DATASET FOR VALIDATION")
    print(banner)

    # Ensure the output directory exists.
    os.makedirs(DATA_DIR, exist_ok=True)

    # Step 1: collect raw Python source.
    print(f"\n📥 Downloading ~{TARGET_SIZE_CHARS / 1e6:.1f}MB of Python code...")
    code_text = download_python_code(TARGET_SIZE_CHARS)
    print(f"\n📊 Statistics:")
    print(f" Total characters: {len(code_text):,}")
    print(f" Size on disk: {len(code_text) / 1024 / 1024:.2f} MB")

    # Step 2: build and persist the character vocabulary.
    print("\n🔤 Building vocabulary...")
    vocab = build_vocabulary(code_text)
    print(f" Vocab size: {vocab['vocab_size']}")
    print(f" Characters (sample): {''.join(vocab['chars'][:50])}...")
    meta_path = os.path.join(DATA_DIR, 'meta.pkl')
    with open(meta_path, 'wb') as f:
        pickle.dump(vocab, f)
    print(f" Saved to: {meta_path}")

    # Step 3: contiguous train/validation split.
    print("\n✂️ Splitting train/validation...")
    cut = int(len(code_text) * TRAIN_SPLIT)
    train_text, val_text = code_text[:cut], code_text[cut:]
    print(f" Train: {len(train_text):,} chars ({TRAIN_SPLIT*100:.0f}%)")
    print(f" Validation: {len(val_text):,} chars ({(1-TRAIN_SPLIT)*100:.0f}%)")

    # Step 4: encode to uint16 ids and dump raw binaries.
    print("\n💾 Encoding and saving...")
    train_path = os.path.join(DATA_DIR, 'train.bin')
    val_path = os.path.join(DATA_DIR, 'val.bin')
    encode_text(train_text, vocab['stoi']).tofile(train_path)
    encode_text(val_text, vocab['stoi']).tofile(val_path)
    print(f" Train saved to: {train_path}")
    print(f" Validation saved to: {val_path}")

    # Step 5: summary statistics for downstream scripts.
    stats = {
        'total_chars': len(code_text),
        'train_chars': len(train_text),
        'val_chars': len(val_text),
        'vocab_size': vocab['vocab_size'],
        'source': 'bigcode/the-stack-smol'
    }
    stats_path = os.path.join(DATA_DIR, 'stats.pkl')
    with open(stats_path, 'wb') as f:
        pickle.dump(stats, f)

    print("\n" + banner)
    print("✅ DATASET PREPARED SUCCESSFULLY!")
    print(banner)
    print(f"\nNext step: python validation/code/train_code.py")
    return stats
# Script entry point: build the code dataset when run directly.
if __name__ == '__main__':
    prepare_dataset()