"""Tests for Streamlit app helpers.""" from __future__ import annotations from pathlib import Path from PIL import Image from mathvision_explorer.streamlit_app import ( _choose_hf_split, _hf_dataset_id_from_ref, _record_from_hf_row, ) def test_hf_dataset_id_from_url() -> None: """Dataset URLs and repo IDs resolve to the same Hub id.""" assert ( _hf_dataset_id_from_ref("https://huggingface.co/datasets/MathLLMs/MathVision") == "MathLLMs/MathVision" ) assert _hf_dataset_id_from_ref("datasets/MathLLMs/MathVision") == "MathLLMs/MathVision" def test_record_from_hf_row_persists_image(tmp_path: Path) -> None: """HF dataset image objects are persisted so existing image-path code can use them.""" record = _record_from_hf_row( { "id": "1", "question": "How many triangles?", "answer": "3", "decoded_image": Image.new("RGB", (4, 4), color="white"), }, row_index=0, image_dir=tmp_path, ) assert record is not None assert record.problem_id == "1" assert record.image_path == tmp_path / "row-00000.png" assert record.image_path.exists() def test_choose_hf_split_prefers_test() -> None: """Automatic split selection picks a useful default without user input.""" datasets = _FakeDatasets({"train": object(), "test": object()}) assert _choose_hf_split(datasets, "org/name") == "test" def test_choose_hf_split_falls_back_to_first_available() -> None: """Datasets with custom split names still load without a visible split field.""" datasets = _FakeDatasets({"testmini": object(), "dev": object()}) assert _choose_hf_split(datasets, "org/name") == "testmini" class _FakeDatasets: def __init__(self, splits: dict[str, object]) -> None: self._splits = splits def load_dataset_builder(self, repo_id: str) -> _FakeBuilder: assert repo_id == "org/name" return _FakeBuilder(self._splits) class _FakeBuilder: def __init__(self, splits: dict[str, object]) -> None: self.info = _FakeInfo(splits) class _FakeInfo: def __init__(self, splits: dict[str, object]) -> None: self.splits = splits