Spaces:
Running
Running
| from __future__ import annotations | |
| from pathlib import Path | |
| import pyarrow as pa | |
| import pyarrow.parquet as pq | |
| import yaml | |
| from scripts.create_nano_dataset import main | |
| def _write_parquet(path: Path, rows: list[dict[str, object]]) -> None: | |
| path.parent.mkdir(parents=True, exist_ok=True) | |
| pq.write_table(pa.Table.from_pylist(rows), path) | |
| def test_create_nano_dataset_script_builds_local_dataset_and_yaml(tmp_path: Path) -> None: | |
| source_dir = tmp_path / "source" | |
| _write_parquet( | |
| source_dir / "SourceTask" / "corpus" / "test.parquet", | |
| [{"_id": "d1", "text": "alpha document"}, {"_id": "d2", "text": "beta document"}], | |
| ) | |
| _write_parquet( | |
| source_dir / "SourceTask" / "queries" / "test.parquet", | |
| [{"_id": "q1", "text": "alpha"}], | |
| ) | |
| _write_parquet( | |
| source_dir / "SourceTask" / "qrels" / "test.parquet", | |
| [{"query-id": "q1", "corpus-id": "d1", "score": 1}], | |
| ) | |
| output_dir = tmp_path / "NanoScript" | |
| config_dir = tmp_path / "config" / "datasets" | |
| main( | |
| [ | |
| "--source-dir", | |
| str(source_dir), | |
| "--source-split-name", | |
| "SourceTask", | |
| "--dataset-name", | |
| "NanoScript", | |
| "--dataset-id", | |
| "hakari-bench/NanoScript", | |
| "--split-name", | |
| "NanoSourceTask", | |
| "--output-dir", | |
| str(output_dir), | |
| "--dataset-config-dir", | |
| str(config_dir), | |
| "--bm25-tokenizer", | |
| "whitespace", | |
| "--top-k", | |
| "2", | |
| "--query-limit", | |
| "10", | |
| "--doc-limit", | |
| "10", | |
| ] | |
| ) | |
| assert (output_dir / "bm25" / "NanoSourceTask.parquet").exists() | |
| config = yaml.safe_load((config_dir / "nanoscript.yaml").read_text(encoding="utf-8")) | |
| assert config["name"] == "NanoScript" | |
| assert config["splits"] == ["NanoSourceTask"] | |