Spaces:
Sleeping
Sleeping
File size: 990 Bytes
"""Prepare a lightweight semantic search dataset from Hugging Face.

Usage:
    python prepare_stackoverflow_sample.py
"""
from pathlib import Path
import json
from datasets import load_dataset
# Hugging Face Hub dataset identifier passed to load_dataset().
DATASET_ID = "MartinElMolon/stackoverflow_preguntas_con_embeddings"
# Destination file for the exported JSON sample.
OUTPUT_PATH = Path("data/stackoverflow_sample_3000.json")
# Minimum number of rows required, and the number of rows kept in the export.
SAMPLE_SIZE = 3000
def main() -> None:
    """Export a shuffled sample of the dataset to ``OUTPUT_PATH`` as JSON.

    Loads the leading ``SAMPLE_SIZE`` rows of the ``train`` split of
    ``DATASET_ID``, shuffles them with a fixed seed, keeps only the
    ``question``, ``answer`` and ``embedding`` columns, and writes the rows
    to ``OUTPUT_PATH`` (creating parent directories as needed).

    Raises:
        ValueError: If fewer than ``SAMPLE_SIZE`` rows were loaded.
    """
    # Derive the slice from SAMPLE_SIZE instead of repeating the literal, so
    # the size check below stays meaningful if the constant is changed.
    # NOTE: only the leading rows are fetched, so the shuffle reorders that
    # prefix rather than sampling from the whole split.
    ds = load_dataset(DATASET_ID, split=f"train[:{SAMPLE_SIZE}]")

    if len(ds) < SAMPLE_SIZE:
        raise ValueError(
            f"Dataset has only {len(ds)} rows; expected at least {SAMPLE_SIZE}."
        )

    # Fixed seed keeps the exported row order reproducible between runs.
    sampled = ds.shuffle(seed=42).select(range(SAMPLE_SIZE))
    sampled = sampled.select_columns(["question", "answer", "embedding"])

    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    with OUTPUT_PATH.open("w", encoding="utf-8") as f:
        # ensure_ascii=False writes non-ASCII text as-is instead of \u escapes.
        json.dump(sampled.to_list(), f, ensure_ascii=False)

    print(f"Saved {len(sampled)} rows to {OUTPUT_PATH}")
# Run the export only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|