File size: 961 Bytes
3d2dbcf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
from __future__ import annotations

from pathlib import Path

def load_jsonl_text_dataset(
    path: str | Path,
    controller_families: list[str] | None = None,
    controller_types: list[str] | None = None,
):
    """Load a JSON-lines file and optionally narrow it by controller metadata.

    The file at ``path`` is read with ``datasets.load_dataset`` using the
    ``"json"`` builder, and its ``"train"`` split is taken.

    Args:
        path: Location of the JSONL file (string or ``Path``).
        controller_families: When non-empty, keep only rows whose
            ``"controller_family"`` value is one of these. ``None`` or an
            empty list disables this filter.
        controller_types: When non-empty, keep only rows whose
            ``"controller_type"`` value is one of these. ``None`` or an
            empty list disables this filter.

    Returns:
        The loaded (and possibly filtered) dataset object, as produced by
        ``load_dataset`` / ``.filter``.

    Raises:
        ValueError: If the dataset has no ``"text"`` column, or if it holds
            zero rows once the requested filters have been applied.
    """
    # Imported lazily so the module can be imported without `datasets`.
    from datasets import load_dataset

    def keep_rows_matching(column, accepted):
        # Bind `column`/`accepted` per predicate; a row lacking the column
        # yields None from .get() and is compared against `accepted` as such.
        return lambda row: row.get(column) in accepted

    records = load_dataset("json", data_files=str(Path(path)), split="train")
    if "text" not in records.column_names:
        raise ValueError("Expected a JSONL dataset with a 'text' field.")

    # Applied in this order: family first, then type.
    requested_filters = (
        ("controller_family", controller_families),
        ("controller_type", controller_types),
    )
    for column, wanted in requested_filters:
        if not wanted:
            continue
        records = records.filter(keep_rows_matching(column, set(wanted)))

    remaining = len(records)
    if remaining == 0:
        raise ValueError("No dataset rows remain after applying the requested filters.")
    return records