"""Prepare a lightweight semantic search dataset from Hugging Face. Usage: python prepare_stackoverflow_sample.py """ from pathlib import Path import json from datasets import load_dataset DATASET_ID = "MartinElMolon/stackoverflow_preguntas_con_embeddings" OUTPUT_PATH = Path("data/stackoverflow_sample_3000.json") SAMPLE_SIZE = 3000 def main() -> None: ds = load_dataset(DATASET_ID, split="train[:3000]") if len(ds) < SAMPLE_SIZE: raise ValueError(f"Dataset has only {len(ds)} rows; expected at least {SAMPLE_SIZE}.") sampled = ds.shuffle(seed=42).select(range(SAMPLE_SIZE)) sampled = sampled.select_columns(["question", "answer", "embedding"]) OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True) with OUTPUT_PATH.open("w", encoding="utf-8") as f: json.dump(sampled.to_list(), f, ensure_ascii=False) print(f"Saved {len(sampled)} rows to {OUTPUT_PATH}") if __name__ == "__main__": main()