File size: 2,228 Bytes
5f7d974
 
 
 
 
 
 
 
f21dc8c
 
 
 
 
5f7d974
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f21dc8c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
"""Tests for Streamlit app helpers."""

from __future__ import annotations

from pathlib import Path

from PIL import Image

from mathvision_explorer.streamlit_app import (
    _choose_hf_split,
    _hf_dataset_id_from_ref,
    _record_from_hf_row,
)


def test_hf_dataset_id_from_url() -> None:
    """A full Hub URL and a ``datasets/``-prefixed path both yield the bare repo id."""

    expected_id = "MathLLMs/MathVision"
    references = (
        "https://huggingface.co/datasets/MathLLMs/MathVision",
        "datasets/MathLLMs/MathVision",
    )

    # Checked one at a time, URL form first, so a failure points at the offending form.
    for reference in references:
        assert _hf_dataset_id_from_ref(reference) == expected_id


def test_record_from_hf_row_persists_image(tmp_path: Path) -> None:
    """A row carrying an in-memory image yields a record whose image path exists on disk."""

    hf_row = {
        "id": "1",
        "question": "How many triangles?",
        "answer": "3",
        "decoded_image": Image.new("RGB", (4, 4), color="white"),
    }
    # Expected file name for row index 0 inside the supplied image directory.
    expected_image_path = tmp_path / "row-00000.png"

    record = _record_from_hf_row(hf_row, row_index=0, image_dir=tmp_path)

    assert record is not None
    assert record.problem_id == "1"
    assert record.image_path == expected_image_path
    assert record.image_path.exists()


def test_choose_hf_split_prefers_test() -> None:
    """Given both "train" and "test" splits, the chooser is expected to return "test"."""

    available_splits = {"train": object(), "test": object()}
    fake_datasets = _FakeDatasets(available_splits)

    chosen = _choose_hf_split(fake_datasets, "org/name")

    assert chosen == "test"


def test_choose_hf_split_falls_back_to_first_available() -> None:
    """With no split named "test" on offer, the first listed split is expected back."""

    # "testmini" is listed first; neither name is exactly "test".
    available_splits = {"testmini": object(), "dev": object()}
    fake_datasets = _FakeDatasets(available_splits)

    chosen = _choose_hf_split(fake_datasets, "org/name")

    assert chosen == "testmini"


class _FakeDatasets:
    """Test double handed to ``_choose_hf_split`` as its first argument.

    Only ``load_dataset_builder`` is provided; it serves the split mapping given
    at construction time.
    """

    def __init__(self, splits: dict[str, object]) -> None:
        """Remember the split mapping (stored by reference, not copied)."""
        self._splits = splits

    def load_dataset_builder(self, repo_id: str) -> _FakeBuilder:
        """Return a fake builder exposing the stored splits.

        The tests always ask for "org/name"; any other id fails the assertion.
        """
        assert repo_id == "org/name"
        builder = _FakeBuilder(self._splits)
        return builder


class _FakeBuilder:
    """Fake builder object whose only attribute is ``info``."""

    def __init__(self, splits: dict[str, object]) -> None:
        """Wrap ``splits`` in a ``_FakeInfo`` and expose it as ``self.info``."""
        info = _FakeInfo(splits)
        self.info = info


class _FakeInfo:
    """Fake info object whose only attribute is ``splits``."""

    def __init__(self, splits: dict[str, object]) -> None:
        """Expose the given mapping unchanged as ``self.splits``."""
        # Kept by reference so the caller's key order is what consumers see.
        self.splits = splits