| from __future__ import annotations | |
| from pathlib import Path | |
def load_jsonl_text_dataset(
    path: str | Path,
    controller_families: list[str] | None = None,
    controller_types: list[str] | None = None,
):
    """Load a JSONL file through ``datasets`` and optionally filter its rows.

    Args:
        path: Location of the JSONL file. It is handed to ``load_dataset`` as
            ``data_files`` and read as the ``"train"`` split.
        controller_families: Values of the ``controller_family`` column to
            keep. ``None`` or an empty list disables this filter. A bare
            string is treated as a single allowed value.
        controller_types: Values of the ``controller_type`` column to keep,
            with the same conventions as ``controller_families``.

    Returns:
        The loaded dataset, restricted to the rows that satisfy every
        requested filter (original row order is kept).

    Raises:
        ValueError: If the dataset has no ``text`` column, if a requested
            filter targets a column the dataset does not have, or if the
            resulting dataset is empty.
    """
    # Imported inside the function, so `datasets` is only needed when this
    # loader is actually called.
    from datasets import load_dataset

    dataset = load_dataset("json", data_files=str(Path(path)), split="train")
    column_names = dataset.column_names
    if "text" not in column_names:
        raise ValueError("Expected a JSONL dataset with a 'text' field.")

    requested_filters = {
        "controller_family": controller_families,
        "controller_type": controller_types,
    }
    allowed_by_column = {}
    for column, values in requested_filters.items():
        if not values:
            continue
        if column not in column_names:
            # Without this check every row would be dropped (the lookup
            # yields None for each row) and the caller would only see the
            # generic "no rows remain" error below.
            raise ValueError(
                f"Cannot filter on '{column}': the dataset has no such column."
            )
        # set("abc") would split a bare string into characters; treat a
        # string as one allowed value instead.
        allowed_by_column[column] = (
            {values} if isinstance(values, str) else set(values)
        )

    if allowed_by_column:
        # One pass over the data applies every requested filter at once.
        dataset = dataset.filter(
            lambda row: all(
                row.get(column) in allowed
                for column, allowed in allowed_by_column.items()
            )
        )

    if len(dataset) == 0:
        raise ValueError("No dataset rows remain after applying the requested filters.")
    return dataset