leaderboard / tests /test_create_nano_dataset_script.py
hotchpotch's picture
Deploy Docker leaderboard viewer
e8aa13a verified
from __future__ import annotations
from pathlib import Path
import pyarrow as pa
import pyarrow.parquet as pq
import yaml
from scripts.create_nano_dataset import main
def _write_parquet(path: Path, rows: list[dict[str, object]]) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
pq.write_table(pa.Table.from_pylist(rows), path)
def test_create_nano_dataset_script_builds_local_dataset_and_yaml(tmp_path: Path) -> None:
source_dir = tmp_path / "source"
_write_parquet(
source_dir / "SourceTask" / "corpus" / "test.parquet",
[{"_id": "d1", "text": "alpha document"}, {"_id": "d2", "text": "beta document"}],
)
_write_parquet(
source_dir / "SourceTask" / "queries" / "test.parquet",
[{"_id": "q1", "text": "alpha"}],
)
_write_parquet(
source_dir / "SourceTask" / "qrels" / "test.parquet",
[{"query-id": "q1", "corpus-id": "d1", "score": 1}],
)
output_dir = tmp_path / "NanoScript"
config_dir = tmp_path / "config" / "datasets"
main(
[
"--source-dir",
str(source_dir),
"--source-split-name",
"SourceTask",
"--dataset-name",
"NanoScript",
"--dataset-id",
"hakari-bench/NanoScript",
"--split-name",
"NanoSourceTask",
"--output-dir",
str(output_dir),
"--dataset-config-dir",
str(config_dir),
"--bm25-tokenizer",
"whitespace",
"--top-k",
"2",
"--query-limit",
"10",
"--doc-limit",
"10",
]
)
assert (output_dir / "bm25" / "NanoSourceTask.parquet").exists()
config = yaml.safe_load((config_dir / "nanoscript.yaml").read_text(encoding="utf-8"))
assert config["name"] == "NanoScript"
assert config["splits"] == ["NanoSourceTask"]