Spaces:

ddebree
/

mathvision-jepa-explorer

Running

App Files Files Community

mathvision-jepa-explorer / tests /test_dataset.py

ddebree

Add

45ade5d 22 days ago

raw

history blame contribute delete

2.79 kB

	"""Tests for MathVision-style dataset loading."""

	from __future__ import annotations

	import json
	from pathlib import Path

	import pytest

	from mathvision_explorer.dataset import (
	filter_records,
	load_jsonl_records,
	load_jsonl_records_from_text,
	record_from_mapping,
	summarize_records,
	)


	def test_record_from_mapping_accepts_problem_id_fallback() -> None:
	    """A mapping keyed by ``problem_id`` rather than ``id`` still yields a usable record."""

	    raw_mapping = {
	        "problem_id": "mv-1",
	        "question": "How many squares are visible?",
	        "answer": "4",
	        "options": ["3", "4", "5"],
	        "subject": "geometry",
	        "level": 2,
	    }

	    parsed = record_from_mapping(raw_mapping)

	    # The fallback key must end up as the record's identifier.
	    assert parsed.problem_id == "mv-1"
	    # Options were supplied as a list; the record is expected to equal a tuple.
	    assert parsed.options == ("3", "4", "5")
	    assert parsed.subject == "geometry"
	    assert parsed.level == 2


	def test_load_jsonl_records_resolves_relative_image_paths(tmp_path: Path) -> None:
	    """A relative ``image`` value is anchored at the directory holding the JSONL file."""

	    raw_record = {
	        "id": "mv-2",
	        "question": "Pick the matching pattern.",
	        "answer": "B",
	        "image": "images/mv-2.png",
	    }
	    source_file = tmp_path / "records.jsonl"
	    # A single JSON object terminated by a newline makes a one-line JSONL file.
	    source_file.write_text(f"{json.dumps(raw_record)}\n", encoding="utf-8")

	    loaded = load_jsonl_records(source_file)

	    expected_image = tmp_path / "images" / "mv-2.png"
	    assert loaded[0].image_path == expected_image


	def test_load_jsonl_records_from_text_resolves_relative_image_paths(tmp_path: Path) -> None:
	    """In-memory JSONL text resolves relative images against the given ``source_dir``."""

	    raw_record = {
	        "id": "mv-3",
	        "question": "Pick the matching graph.",
	        "answer": "A",
	        "image": "images/mv-3.png",
	    }
	    # No file is written here: the loader receives the serialized text directly.
	    jsonl_text = json.dumps(raw_record)

	    loaded = load_jsonl_records_from_text(jsonl_text, source_dir=tmp_path)

	    expected_image = tmp_path / "images" / "mv-3.png"
	    assert loaded[0].image_path == expected_image


	def test_filter_and_summary() -> None:
	    """Subject filtering and summarizing are both checked against the same two records."""

	    # Only the second record carries a level; neither record references an image.
	    counting_record = record_from_mapping(
	        {"id": "a", "question": "Q1", "answer": "1", "subject": "counting"}
	    )
	    geometry_record = record_from_mapping(
	        {"id": "b", "question": "Q2", "answer": "2", "subject": "geometry", "level": 3}
	    )
	    records = [counting_record, geometry_record]

	    matching_ids = [item.problem_id for item in filter_records(records, subject="geometry")]
	    assert matching_ids == ["b"]

	    expected_summary = {
	        "records": 2,
	        "images": 0,
	        "subjects": ["counting", "geometry"],
	        "levels": [3],
	    }
	    assert summarize_records(records) == expected_summary


	def test_invalid_options_raise_value_error() -> None:
	    """A non-string entry in ``options`` must be rejected with ``ValueError``."""

	    # The integer inside ``options`` is the only malformed part of this mapping.
	    malformed = {"id": "bad", "question": "Q", "answer": "A", "options": [1]}

	    with pytest.raises(ValueError, match="Options"):
	        record_from_mapping(malformed)