Spaces:
Running
Running
File size: 727 Bytes
173f28e bf74331 173f28e bf74331 db0da0a 173f28e | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 | from __future__ import annotations
from datasets import load_dataset
from dataset_config import DatasetConfig
def build_corpus(size: int, ds_cfg: DatasetConfig | None = None) -> list[str]:
    """Build a corpus of real sentences from the configured dataset.

    The sentences are the values of ``ds_cfg.query_col`` followed by the
    values of ``ds_cfg.passage_col``. That sequence is repeated cyclically
    until ``size`` entries are available, then truncated to exactly ``size``.

    Args:
        size: Number of sentences to return. Values ``<= 0`` yield ``[]``.
        ds_cfg: Dataset settings. When ``None``, a default ``DatasetConfig()``
            is used. If ``ds_cfg.data`` is not ``None`` it is used directly as
            a column-name -> values mapping; otherwise the dataset is fetched
            with ``load_dataset(ds_cfg.name, ds_cfg.config, split=ds_cfg.split)``.

    Returns:
        A list of exactly ``size`` sentences (empty when ``size <= 0``).

    Raises:
        ValueError: If ``size > 0`` but the selected columns contain no
            sentences, so the corpus can never be filled.
    """
    if ds_cfg is None:
        ds_cfg = DatasetConfig()

    if ds_cfg.data is not None:
        data = ds_cfg.data
    else:
        dataset = load_dataset(ds_cfg.name, ds_cfg.config, split=ds_cfg.split)
        data = {col: list(dataset[col]) for col in dataset.column_names}

    sentences = list(data[ds_cfg.query_col]) + list(data[ds_cfg.passage_col])

    if size <= 0:
        return []
    if not sentences:
        # Repeating an empty list never reaches `size`; the previous
        # `while len(full) < size` loop would spin forever here.
        raise ValueError(
            "Cannot build a corpus: the query and passage columns are empty."
        )

    # Ceil-divide to get the number of whole repetitions needed, then trim.
    repeats = -(-size // len(sentences))
    return (sentences * repeats)[:size]
|