import json

from datasets import load_dataset
from tqdm import tqdm

# --- Configuration ---
LANGUAGES = ["python", "javascript", "php"]
SAMPLE_SIZE_PER_LANG = 1000  # Adjust based on storage/cost constraints
OUTPUT_FILE = "programming_data.jsonl"


def prepare_dataset():
    """
    Streams The Stack v2 (small "ids" subset), filters for the configured
    languages, samples them, and saves the result to a JSONL file.
    """
    print(f"🚀 Starting data preparation for {LANGUAGES}...")

    # 'train' is the only split of this subset. streaming=True iterates over
    # the remote files instead of downloading the whole dataset up front.
    # NOTE: the "*-ids" variants of The Stack v2 carry blob IDs and metadata,
    # not file contents, so rows without a "content" field are skipped below.
    # See the fetch_blob() sketch at the end of this file for one way to
    # resolve blob IDs to contents.
    ds = load_dataset(
        "bigcode/the-stack-v2-train-smol-ids", split="train", streaming=True
    )

    counts = {lang: 0 for lang in LANGUAGES}
    data_out = []

    pbar = tqdm(total=SAMPLE_SIZE_PER_LANG * len(LANGUAGES), desc="Collecting samples")
    for entry in ds:
        lang = entry.get("language", "").lower()
        content = entry.get("content", "")

        if lang in LANGUAGES and counts[lang] < SAMPLE_SIZE_PER_LANG:
            if content.strip():
                data_out.append({
                    "language": lang,
                    "content": content,
                    # The Stack v2 names this field "repo_name"
                    "repository": entry.get("repo_name", "unknown"),
                    "path": entry.get("path", "unknown"),
                })
                counts[lang] += 1
                pbar.update(1)

        # Stop once we hit the target for every language
        if all(c >= SAMPLE_SIZE_PER_LANG for c in counts.values()):
            break

    pbar.close()

    print(f"💾 Saving {len(data_out)} samples to {OUTPUT_FILE}...")
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for item in data_out:
            # ensure_ascii=False keeps non-ASCII source code human-readable
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    print("✅ Data preparation complete!")


if __name__ == "__main__":
    prepare_dataset()
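

# --- Optional: resolving blob IDs to file contents (sketch) ---
# The Stack v2 "*-ids" subsets publish blob IDs rather than raw file
# contents; per the dataset card, the files are stored gzip-compressed in
# the public Software Heritage S3 bucket under content/<blob_id>. The helper
# below is a minimal sketch of that lookup under those assumptions -- it
# requires boto3 and configured AWS credentials, and the bucket/key layout
# should be checked against the current dataset card. To wire it in, move
# the helper above prepare_dataset() and replace the content lookup with:
#     content = fetch_blob(entry["blob_id"], entry.get("src_encoding", "utf-8"))
import gzip

import boto3


def fetch_blob(blob_id: str, src_encoding: str = "utf-8") -> str:
    """Download and decompress a single file blob from Software Heritage S3."""
    s3 = boto3.client("s3")
    obj = s3.get_object(Bucket="softwareheritage", Key=f"content/{blob_id}")
    # Objects are stored gzip-compressed; decompress, then decode using the
    # source encoding recorded in the dataset row.
    return gzip.decompress(obj["Body"].read()).decode(src_encoding)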