# rag-intelligence / prepare_data.py
# Last commit: "Fix bug" (b3f3041), author: aagzamov
import os
import json
from datasets import load_dataset
from tqdm import tqdm
# --- Configuration ---
# Path of the JSONL file the collected samples are written to.
OUTPUT_FILE = "programming_data.jsonl"
# Maximum number of files kept per language; adjust based on storage/cost
# constraints.
SAMPLE_SIZE_PER_LANG = 1000
# Languages to collect, spelled in lowercase.
LANGUAGES = [
    "python",
    "javascript",
    "php",
]
def prepare_dataset():
    """
    Collect a per-language sample of source files and save it as JSONL.

    Streams the ``train`` split of ``bigcode/the-stack-v2-train-sm``, keeps up
    to ``SAMPLE_SIZE_PER_LANG`` non-blank files for each language listed in
    ``LANGUAGES``, and writes them to ``OUTPUT_FILE`` as one JSON object per
    line with the keys ``language``, ``content``, ``repository`` and ``path``.

    Iteration stops as soon as every language has reached its quota. If the
    stream is exhausted first, whatever was collected is still written.

    Returns:
        None. The result is the file written to ``OUTPUT_FILE``.

    NOTE(review): the record fields read here (``language``, ``content``,
    ``repository_name``, ``path``) are assumed to match the dataset schema;
    confirm against the dataset card before relying on the output.
    """
    print(f"🚀 Starting data preparation for {LANGUAGES}...")

    # streaming=True iterates records lazily instead of downloading the
    # entire dataset before the first record is available.
    ds = load_dataset("bigcode/the-stack-v2-train-sm", split="train", streaming=True)

    counts = {lang: 0 for lang in LANGUAGES}
    data_out = []

    def quotas_filled():
        # True once every tracked language has reached its per-language cap.
        return all(c >= SAMPLE_SIZE_PER_LANG for c in counts.values())

    # The progress total is based on the unique languages actually tracked in
    # `counts`, so a duplicated entry in LANGUAGES cannot make the bar
    # unreachable. The context manager closes the bar even if the stream
    # raises part-way through.
    with tqdm(total=SAMPLE_SIZE_PER_LANG * len(counts), desc="Collecting samples") as pbar:
        # Skip the stream entirely when there is nothing to collect
        # (no languages configured, or a non-positive sample size).
        if not quotas_filled():
            for entry in ds:
                # `or ""` guards against fields that are present but None,
                # which the .get() default alone does not cover.
                lang = (entry.get("language") or "").lower()
                if lang not in counts or counts[lang] >= SAMPLE_SIZE_PER_LANG:
                    continue

                content = entry.get("content") or ""
                if not content.strip():
                    # Blank files do not count towards the quota.
                    continue

                data_out.append({
                    "language": lang,
                    "content": content,
                    "repository": entry.get("repository_name", "unknown"),
                    "path": entry.get("path", "unknown"),
                })
                counts[lang] += 1
                pbar.update(1)

                # Quotas only change when a sample is accepted, so this is
                # the only place the stop condition needs re-checking.
                if quotas_filled():
                    break

    print(f"💾 Saving {len(data_out)} samples to {OUTPUT_FILE}...")
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for item in data_out:
            f.write(json.dumps(item) + "\n")

    print("✅ Data preparation complete!")
# Run the data preparation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    prepare_dataset()