Spaces:

aagzamov
/

rag-intelligence

Sleeping

File size: 1,896 Bytes

b3f3041

import os
import json
from datasets import load_dataset
from tqdm import tqdm

# --- Configuration ---
# Languages to keep; compared against the lower-cased "language" field
# of each streamed record.
LANGUAGES = [
    "python",
    "javascript",
    "php",
]

# Maximum number of files collected per language.
# Adjust based on storage/cost constraints.
SAMPLE_SIZE_PER_LANG = 1_000

# Destination file: one JSON object per line.
OUTPUT_FILE = "programming_data.jsonl"

def prepare_dataset():
    """
    Stream the dataset, sample files per language, and save them as JSONL.

    Iterates the streamed 'train' split, keeps up to SAMPLE_SIZE_PER_LANG
    records with non-blank content for each language in LANGUAGES, and
    writes the collected records to OUTPUT_FILE, one JSON object per line.

    Each output object has the keys "language", "content", "repository"
    and "path". Records whose language or content is missing or None are
    skipped instead of raising. If the stream ends before every language
    reaches its target, the shortfall is reported and whatever was
    collected is still saved.

    Returns:
        None. The result is the file written to OUTPUT_FILE.
    """
    print(f"🚀 Starting data preparation for {LANGUAGES}...")

    # We use the 'train' split of the small subset.
    # streaming=True iterates records lazily instead of downloading the
    # whole dataset before the first record is available.
    ds = load_dataset("bigcode/the-stack-v2-train-sm", split="train", streaming=True)

    counts = {lang: 0 for lang in LANGUAGES}
    data_out = []

    # Languages that still need samples. Empty from the start when there is
    # nothing to collect, so the stream is never touched in that case.
    pending = set(counts) if SAMPLE_SIZE_PER_LANG > 0 else set()

    # len(counts) rather than len(LANGUAGES): duplicate entries in LANGUAGES
    # share one counter and must not inflate the progress-bar total.
    total = SAMPLE_SIZE_PER_LANG * len(counts)

    # The context manager closes the bar even if iterating the stream raises.
    with tqdm(total=total, desc="Collecting samples") as pbar:
        for entry in (ds if pending else ()):
            # `or ""` also covers fields that are present but None;
            # .get()'s default only applies when the key is absent.
            lang = (entry.get("language") or "").lower()
            if lang not in pending:
                continue

            content = entry.get("content") or ""
            if not content.strip():
                continue

            # NOTE(review): assumes records expose "content",
            # "repository_name" and "path"; confirm against the dataset
            # schema. Missing metadata falls back to "unknown".
            data_out.append({
                "language": lang,
                "content": content,
                "repository": entry.get("repository_name", "unknown"),
                "path": entry.get("path", "unknown")
            })
            counts[lang] += 1
            pbar.update(1)

            # Completion can only change when a counter changes, so check
            # here instead of on every streamed record.
            if counts[lang] >= SAMPLE_SIZE_PER_LANG:
                pending.discard(lang)
                if not pending:
                    break

    # Stream exhausted early: make the shortfall visible instead of
    # silently producing a smaller file.
    for lang in sorted(pending):
        print(f"⚠️ Only collected {counts[lang]}/{SAMPLE_SIZE_PER_LANG} samples for {lang}.")

    print(f"💾 Saving {len(data_out)} samples to {OUTPUT_FILE}...")
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for item in data_out:
            f.write(json.dumps(item) + "\n")

    print("✅ Data preparation complete!")

# Run the data preparation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    prepare_dataset()