File size: 955 Bytes
fec93c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bc0fa99
fec93c8
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
"""Prepare a lightweight semantic search dataset from Hugging Face.

Usage:
    python prepare_stackoverflow_sample.py
"""

from pathlib import Path
import json

from datasets import load_dataset


# Hugging Face Hub dataset identifier passed to `load_dataset` in main().
DATASET_ID = "MartinElMolon/stackoverflow_preguntas_con_embeddings"
# Destination JSON file; main() creates the parent directory if it is missing.
OUTPUT_PATH = Path("data/stackoverflow_sample_3000.json")
# Number of rows main() selects and writes; main() raises ValueError if fewer
# rows than this were loaded.
SAMPLE_SIZE = 3000


def main() -> None:
    """Export a shuffled slice of the dataset to a JSON file.

    Loads the first ``SAMPLE_SIZE`` rows of the ``train`` split of
    ``DATASET_ID``, shuffles them with a fixed seed, keeps only the
    ``question``, ``answer`` and ``embedding`` columns, and dumps the rows
    as JSON to ``OUTPUT_PATH`` (parent directories are created as needed).

    Raises:
        ValueError: If fewer than ``SAMPLE_SIZE`` rows were loaded.
    """
    # Build the slice from SAMPLE_SIZE instead of repeating the literal, so
    # changing the constant cannot leave the size check below permanently
    # failing (or silently loading more rows than are used).
    ds = load_dataset(DATASET_ID, split=f"train[:{SAMPLE_SIZE}]")

    if len(ds) < SAMPLE_SIZE:
        raise ValueError(f"Dataset has only {len(ds)} rows; expected at least {SAMPLE_SIZE}.")

    # NOTE: only the leading SAMPLE_SIZE rows were loaded, so this shuffle
    # reorders that slice deterministically (seed=42); it does not draw a
    # random sample from the whole split.
    sampled = ds.shuffle(seed=42).select(range(SAMPLE_SIZE))
    sampled = sampled.select_columns(["question", "answer", "embedding"])

    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    with OUTPUT_PATH.open("w", encoding="utf-8") as f:
        # ensure_ascii=False writes non-ASCII characters as-is rather than as
        # \uXXXX escapes; the file is opened as UTF-8 to match.
        json.dump(sampled.to_list(), f, ensure_ascii=False)

    print(f"Saved {len(sampled)} rows to {OUTPUT_PATH}")


# Run the export only when executed as a script, not when imported.
if __name__ == "__main__":
    main()