Spaces:
Sleeping
Sleeping
| import os | |
| import json | |
| from datasets import load_dataset | |
| from tqdm import tqdm | |
# --- Configuration ---
# Languages to collect (must match the dataset's lowercase `language` field).
LANGUAGES = ["python", "javascript", "php"]
# Per-language sample cap; adjust based on storage/cost constraints.
SAMPLE_SIZE_PER_LANG = 1000
# Destination JSONL file, one sample object per line.
OUTPUT_FILE = "programming_data.jsonl"
def prepare_dataset():
    """
    Stream The Stack v2 (small subset), filter for the configured LANGUAGES,
    collect up to SAMPLE_SIZE_PER_LANG non-empty samples per language, and
    write them incrementally to OUTPUT_FILE as JSONL.
    """
    print(f"Starting data preparation for {LANGUAGES}...")

    # We use the 'train' split which is the main part of the small subset.
    # streaming=True avoids downloading the entire 100GB+ dataset.
    ds = load_dataset("bigcode/the-stack-v2-train-sm", split="train", streaming=True)

    # Set for O(1) membership tests in the hot loop (LANGUAGES is a list).
    wanted = set(LANGUAGES)
    counts = {lang: 0 for lang in LANGUAGES}
    written = 0

    # Stream samples straight to disk instead of buffering them all in
    # memory — keeps peak RSS flat regardless of sample size.
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        pbar = tqdm(total=SAMPLE_SIZE_PER_LANG * len(LANGUAGES), desc="Collecting samples")
        try:
            for entry in ds:
                lang = entry.get("language", "").lower()
                content = entry.get("content", "")
                # Skip unwanted languages, full quotas, and blank files.
                if lang not in wanted or counts[lang] >= SAMPLE_SIZE_PER_LANG:
                    continue
                if not content.strip():
                    continue
                record = {
                    "language": lang,
                    "content": content,
                    "repository": entry.get("repository_name", "unknown"),
                    "path": entry.get("path", "unknown"),
                }
                # ensure_ascii=False keeps non-ASCII source code readable
                # and roughly halves the size of escaped Unicode payloads.
                f.write(json.dumps(record, ensure_ascii=False) + "\n")
                counts[lang] += 1
                written += 1
                pbar.update(1)
                # Stop once every language hit its target. Checked only after
                # a successful write, since that's the only time counts change.
                if all(c >= SAMPLE_SIZE_PER_LANG for c in counts.values()):
                    break
        finally:
            # Close the bar even if the stream raises mid-iteration.
            pbar.close()

    print(f"Saved {written} samples to {OUTPUT_FILE}...")
    print("Data preparation complete!")
# Script entry point: run the full preparation pipeline when executed directly.
if __name__ == "__main__":
    prepare_dataset()