File size: 961 Bytes
3d2dbcf | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 | from __future__ import annotations
from pathlib import Path
def load_jsonl_text_dataset(
    path: str | Path,
    controller_families: list[str] | None = None,
    controller_types: list[str] | None = None,
):
    """Load a JSONL file as a dataset and optionally filter it by controller.

    The file is read with the ``datasets`` JSON loader (``split="train"``)
    and must expose a ``text`` column.

    Args:
        path: Location of the JSONL file.
        controller_families: When non-empty, keep only rows whose
            ``controller_family`` value is one of these.
        controller_types: When non-empty, keep only rows whose
            ``controller_type`` value is one of these.

    Returns:
        The loaded dataset, after any requested filters have been applied.

    Raises:
        ValueError: If the dataset has no ``text`` column, or if no rows
            are left once the filters have run.
    """
    # Deferred import: `datasets` is only loaded when this function is called.
    from datasets import load_dataset

    def _value_in(column, allowed):
        # Build a row predicate; a row lacking `column` is looked up as None.
        def predicate(row):
            return row.get(column) in allowed

        return predicate

    records = load_dataset("json", data_files=str(Path(path)), split="train")
    if "text" not in records.column_names:
        raise ValueError("Expected a JSONL dataset with a 'text' field.")

    # Filters run in this order; a None or empty selection is skipped.
    selections = (
        ("controller_family", controller_families),
        ("controller_type", controller_types),
    )
    for column, wanted in selections:
        if not wanted:
            continue
        records = records.filter(_value_in(column, set(wanted)))

    remaining = len(records)
    if remaining == 0:
        raise ValueError("No dataset rows remain after applying the requested filters.")
    return records
|