Spaces:

ddebree
/

mathvision-jepa-explorer

Running

File size: 2,789 Bytes

"""Tests for MathVision-style dataset loading."""

from __future__ import annotations

import json
from pathlib import Path

import pytest

from mathvision_explorer.dataset import (
    filter_records,
    load_jsonl_records,
    load_jsonl_records_from_text,
    record_from_mapping,
    summarize_records,
)


def test_record_from_mapping_accepts_problem_id_fallback() -> None:
    """A mapping keyed by ``problem_id`` rather than ``id`` still builds a record."""

    raw = {
        "problem_id": "mv-1",
        "question": "How many squares are visible?",
        "answer": "4",
        "options": ["3", "4", "5"],
        "subject": "geometry",
        "level": 2,
    }

    parsed = record_from_mapping(raw)

    # Checked in a fixed order; note the options list is expected back as a tuple.
    expected_attributes = (
        ("problem_id", "mv-1"),
        ("options", ("3", "4", "5")),
        ("subject", "geometry"),
        ("level", 2),
    )
    for attribute, wanted in expected_attributes:
        assert getattr(parsed, attribute) == wanted, attribute


def test_load_jsonl_records_resolves_relative_image_paths(tmp_path: Path) -> None:
    """A relative ``image`` value ends up anchored at the JSONL file's directory."""

    line = json.dumps(
        {
            "id": "mv-2",
            "question": "Pick the matching pattern.",
            "answer": "B",
            "image": "images/mv-2.png",
        }
    )
    source_file = tmp_path / "records.jsonl"
    source_file.write_text(f"{line}\n", encoding="utf-8")

    first = load_jsonl_records(source_file)[0]

    # The JSONL file lives directly in tmp_path, so that is the expected base.
    assert first.image_path == tmp_path.joinpath("images", "mv-2.png")


def test_load_jsonl_records_from_text_resolves_relative_image_paths(tmp_path: Path) -> None:
    """JSONL passed as text resolves relative images against the given ``source_dir``."""

    text = json.dumps(
        {
            "id": "mv-3",
            "question": "Pick the matching graph.",
            "answer": "A",
            "image": "images/mv-3.png",
        }
    )

    loaded = load_jsonl_records_from_text(text, source_dir=tmp_path)

    # No file is written here; the base directory comes only from source_dir.
    assert loaded[0].image_path == tmp_path.joinpath("images", "mv-3.png")


def test_filter_and_summary() -> None:
    """Filtering by subject and summarizing both work on an in-memory record list."""

    counting = record_from_mapping(
        {"id": "a", "question": "Q1", "answer": "1", "subject": "counting"}
    )
    geometry = record_from_mapping(
        {"id": "b", "question": "Q2", "answer": "2", "subject": "geometry", "level": 3}
    )
    records = [counting, geometry]

    matching_ids = [item.problem_id for item in filter_records(records, subject="geometry")]
    assert matching_ids == ["b"]

    # Neither mapping supplies an image, and only the second one supplies a level.
    expected_summary = {
        "records": 2,
        "images": 0,
        "subjects": ["counting", "geometry"],
        "levels": [3],
    }
    assert summarize_records(records) == expected_summary


def test_invalid_options_raise_value_error() -> None:
    """A non-string entry in ``options`` is rejected with a ``ValueError``."""

    malformed = {"id": "bad", "question": "Q", "answer": "A", "options": [1]}

    # Only the call under test sits inside the context manager.
    with pytest.raises(ValueError, match="Options"):
        record_from_mapping(malformed)