Spaces:

aagzamov
/

rag-intelligence

Sleeping

rag-intelligence / prepare_data.py

Fix bug

b3f3041 2 months ago

1.9 kB

	import os
	import json
	from datasets import load_dataset
	from tqdm import tqdm

	# --- Configuration ---
	# Languages to collect. Each record's "language" field is lower-cased before
	# it is compared against this list, so the names here must be lowercase.
	LANGUAGES = ["python", "javascript", "php"]
	# Maximum number of samples kept for each language in LANGUAGES.
	SAMPLE_SIZE_PER_LANG = 1000 # Adjust based on storage/cost constraints
	# Destination path; one JSON object is written per line (JSONL).
	OUTPUT_FILE = "programming_data.jsonl"

	def _iter_samples(entries, languages, sample_size):
	    """Yield at most ``sample_size`` output records per language from ``entries``.

	    Args:
	        entries: Iterable of dict-like dataset records.
	        languages: Language names to keep (compared case-insensitively).
	        sample_size: Per-language cap; nothing is yielded when it is <= 0.

	    Yields:
	        Dicts with the keys "language", "content", "repository" and "path".
	        Records whose content is missing or whitespace-only are skipped and
	        do not count toward the cap.
	    """
	    counts = {lang.lower(): 0 for lang in languages}
	    unfilled = len(counts) if sample_size > 0 else 0
	    if unfilled == 0:
	        # Nothing to collect: do not pull a single record from the stream.
	        return

	    for entry in entries:
	        # `.get(key, "")` only covers a *missing* key; a present key holding
	        # None would crash on `.lower()` / `.strip()`, hence the `or ""`.
	        lang = (entry.get("language") or "").lower()
	        # Unknown languages fall back to `sample_size`, i.e. "already full".
	        if counts.get(lang, sample_size) >= sample_size:
	            continue

	        content = entry.get("content") or ""
	        if not content.strip():
	            continue

	        counts[lang] += 1
	        yield {
	            "language": lang,
	            "content": content,
	            "repository": entry.get("repository_name", "unknown"),
	            "path": entry.get("path", "unknown"),
	        }

	        # Quotas only change when a sample is accepted, so the stop
	        # condition is checked here rather than on every streamed record.
	        if counts[lang] >= sample_size:
	            unfilled -= 1
	            if unfilled == 0:
	                return


	def prepare_dataset(languages=None, sample_size=None, output_file=None):
	    """
	    Streams The Stack v2 (small subset), filters for specific languages,
	    samples them, and saves to a JSONL file.

	    Args:
	        languages: Language names to keep. Defaults to LANGUAGES.
	        sample_size: Maximum samples per language. Defaults to
	            SAMPLE_SIZE_PER_LANG.
	        output_file: Path of the JSONL file to write. Defaults to
	            OUTPUT_FILE.

	    The output file is written only after collection has finished, so a
	    failure while streaming never leaves a partially written file behind.
	    """
	    # Defaults are resolved at call time so the module constants can be
	    # changed after import and still take effect.
	    languages = LANGUAGES if languages is None else languages
	    sample_size = SAMPLE_SIZE_PER_LANG if sample_size is None else sample_size
	    output_file = OUTPUT_FILE if output_file is None else output_file

	    print(f"🚀 Starting data preparation for {languages}...")

	    # We use the 'train' split which is the main part of the small subset
	    # streaming=True avoids downloading the entire 100GB+ dataset
	    ds = load_dataset("bigcode/the-stack-v2-train-sm", split="train", streaming=True)

	    # De-duplicate case-insensitively so the target matches what can be collected.
	    target_total = max(sample_size, 0) * len({lang.lower() for lang in languages})
	    data_out = []

	    # The context manager closes the progress bar even if the stream raises.
	    with tqdm(total=target_total, desc="Collecting samples") as pbar:
	        for sample in _iter_samples(ds, languages, sample_size):
	            data_out.append(sample)
	            pbar.update(1)

	    if len(data_out) < target_total:
	        print(
	            f"⚠️ Dataset stream ended early: collected {len(data_out)} "
	            f"of {target_total} requested samples."
	        )

	    print(f"💾 Saving {len(data_out)} samples to {output_file}...")
	    with open(output_file, "w", encoding="utf-8") as f:
	        f.writelines(json.dumps(item) + "\n" for item in data_out)

	    print("✅ Data preparation complete!")

	# Run the pipeline only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    prepare_dataset()