mathvision-jepa-explorer / tests /test_dataset.py
ddebree's picture
Add
45ade5d
"""Tests for MathVision-style dataset loading."""
from __future__ import annotations
import json
from pathlib import Path
import pytest
from mathvision_explorer.dataset import (
filter_records,
load_jsonl_records,
load_jsonl_records_from_text,
record_from_mapping,
summarize_records,
)
def test_record_from_mapping_accepts_problem_id_fallback() -> None:
    """A mapping that carries ``problem_id`` instead of ``id`` still parses."""
    raw_mapping = {
        "problem_id": "mv-1",
        "question": "How many squares are visible?",
        "answer": "4",
        "options": ["3", "4", "5"],
        "subject": "geometry",
        "level": 2,
    }

    parsed = record_from_mapping(raw_mapping)

    # The fallback key must populate the record's identifier.
    assert parsed.problem_id == "mv-1"
    # The list of choices is expected back as a tuple.
    assert parsed.options == ("3", "4", "5")
    assert parsed.subject == "geometry"
    assert parsed.level == 2
def test_load_jsonl_records_resolves_relative_image_paths(tmp_path: Path) -> None:
    """A relative ``image`` entry is anchored at the folder holding the JSONL file."""
    # Arrange: write a single-record JSONL file into the temporary directory.
    row = {
        "id": "mv-2",
        "question": "Pick the matching pattern.",
        "answer": "B",
        "image": "images/mv-2.png",
    }
    source_file = tmp_path / "records.jsonl"
    source_file.write_text(f"{json.dumps(row)}\n", encoding="utf-8")

    # Act
    loaded = load_jsonl_records(source_file)

    # Assert: the image path is joined onto the JSONL file's parent directory.
    expected_image = tmp_path / "images" / "mv-2.png"
    assert loaded[0].image_path == expected_image
def test_load_jsonl_records_from_text_resolves_relative_image_paths(tmp_path: Path) -> None:
    """In-memory JSONL text resolves relative images against the given ``source_dir``."""
    row = {
        "id": "mv-3",
        "question": "Pick the matching graph.",
        "answer": "A",
        "image": "images/mv-3.png",
    }
    jsonl_text = json.dumps(row)

    loaded = load_jsonl_records_from_text(jsonl_text, source_dir=tmp_path)

    # With no file on disk, the caller-supplied directory is the anchor.
    expected_image = tmp_path / "images" / "mv-3.png"
    assert loaded[0].image_path == expected_image
def test_filter_and_summary() -> None:
    """Filtering by subject and summarizing both work on a small record set."""
    raw_rows = [
        {"id": "a", "question": "Q1", "answer": "1", "subject": "counting"},
        {"id": "b", "question": "Q2", "answer": "2", "subject": "geometry", "level": 3},
    ]
    records = [record_from_mapping(row) for row in raw_rows]

    # Only the second record has the requested subject.
    geometry_ids = [item.problem_id for item in filter_records(records, subject="geometry")]
    assert geometry_ids == ["b"]

    # Neither record has an image, and only record "b" carries a level.
    expected_summary = {
        "records": 2,
        "images": 0,
        "subjects": ["counting", "geometry"],
        "levels": [3],
    }
    assert summarize_records(records) == expected_summary
def test_invalid_options_raise_value_error() -> None:
    """A non-string entry in ``options`` is rejected with a ``ValueError``."""
    malformed = {"id": "bad", "question": "Q", "answer": "A", "options": [1]}

    # The error message is expected to mention "Options".
    with pytest.raises(ValueError, match="Options"):
        record_from_mapping(malformed)