# ddebree's picture
# Add
# 45ade5d
"""Dataset loading helpers for MathVision-like JSONL exports."""
from __future__ import annotations
import json
from dataclasses import dataclass
from pathlib import Path
from typing import Any
@dataclass(frozen=True, slots=True)
class MathVisionRecord:
"""A single visual math problem with optional image and solution metadata."""
problem_id: str
question: str
answer: str
subject: str | None = None
level: int | None = None
problem_type: str | None = None
image_path: Path | None = None
options: tuple[str, ...] = ()
solution: str | None = None
def load_jsonl_records(path: Path | str) -> list[MathVisionRecord]:
    """Load MathVision-like records from a UTF-8 JSONL file.

    Args:
        path: Location of the JSONL file. Its parent directory is passed on
            as ``source_dir`` so relative image paths resolve next to the file.

    Returns:
        The records produced by ``load_jsonl_records_from_text`` for the
        file's content.

    Raises:
        OSError: If the file cannot be read.
        ValueError: If the content cannot be decoded or parsed.
    """
    path = Path(path)
    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading byte-order
    # mark, which ``json.loads`` would otherwise reject on the first line.
    content = path.read_text(encoding="utf-8-sig")
    return load_jsonl_records_from_text(content, source_dir=path.parent)
def load_jsonl_records_from_text(
    content: str,
    *,
    source_dir: Path | None = None,
) -> list[MathVisionRecord]:
    """Load MathVision-like records from UTF-8 JSONL text.

    Each non-blank line must hold one JSON object. Blank lines are skipped
    but still count towards the line numbers reported in errors.

    Args:
        content: The JSONL document as already-decoded text.
        source_dir: Passed through to ``record_from_mapping`` for every line.

    Returns:
        One record per non-blank line, in input order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message is prefixed with the 1-based JSONL line number.
        ValueError: If a line decodes to something other than a JSON object,
            or if ``record_from_mapping`` rejects the object.
    """
    # Split only on "\n", "\r\n" and "\r". ``str.splitlines`` also breaks on
    # characters such as U+2028, U+2029 and U+0085, which may appear unescaped
    # inside a JSON string and would cut a valid record in two.
    lines = content.replace("\r\n", "\n").replace("\r", "\n").split("\n")

    records: list[MathVisionRecord] = []
    for line_number, line in enumerate(lines, start=1):
        stripped = line.strip()
        if not stripped:
            continue
        try:
            payload = json.loads(stripped)
        except json.JSONDecodeError as exc:
            # Keep the exception type, but point at the JSONL line: the
            # decoder's own "line 1" refers to the single line it was given.
            raise json.JSONDecodeError(
                f"Line {line_number}: {exc.msg}", exc.doc, exc.pos
            ) from exc
        if not isinstance(payload, dict):
            msg = f"Line {line_number} must contain a JSON object."
            raise ValueError(msg)
        records.append(record_from_mapping(payload, source_dir=source_dir))
    return records
def record_from_mapping(
    payload: dict[str, Any], *, source_dir: Path | None = None
) -> MathVisionRecord:
    """Build a ``MathVisionRecord`` out of one decoded JSON object.

    Required keys are ``"id"`` (``"problem_id"`` is consulted when ``"id"``
    is absent or ``None``), ``"question"`` and ``"answer"``. Optional keys are
    ``"image"``, ``"options"``, ``"subject"``, ``"level"`` and ``"solution"``.
    The problem type is read from the first of ``"problem_type"``,
    ``"problemKind"``, ``"type"`` and ``"task"`` that holds a truthy value.

    Args:
        payload: The raw dictionary to convert.
        source_dir: Directory that relative image paths are joined onto.

    Raises:
        ValueError: If a field is missing or has the wrong type.
    """
    # First truthy alias wins; when none is truthy the last lookup ("task")
    # is what gets validated, exactly like an ``a or b or c or d`` chain.
    raw_type = None
    for alias in ("problem_type", "problemKind", "type", "task"):
        raw_type = payload.get(alias)
        if raw_type:
            break

    # The validation order below decides which error surfaces first.
    ident = _required_string(payload, "id", fallback_key="problem_id")
    question_text = _required_string(payload, "question")
    answer_text = _required_string(payload, "answer")
    image = _optional_path(payload.get("image"), source_dir=source_dir)
    choices = _options_from_value(payload.get("options"))
    subject_name = _optional_string(payload.get("subject"))
    difficulty = _optional_int(payload.get("level"))
    kind = _optional_string(raw_type)
    worked_solution = _optional_string(payload.get("solution"))

    return MathVisionRecord(
        problem_id=ident,
        question=question_text,
        answer=answer_text,
        subject=subject_name,
        level=difficulty,
        problem_type=kind,
        image_path=image,
        options=choices,
        solution=worked_solution,
    )
def filter_records(
    records: list[MathVisionRecord],
    *,
    subject: str | None = None,
    level: int | None = None,
) -> list[MathVisionRecord]:
    """Select the records that satisfy every filter that was supplied.

    A filter left at ``None`` is ignored. With both filters at ``None`` the
    result is a new list holding all records in their original order.
    """
    matches: list[MathVisionRecord] = []
    for candidate in records:
        if subject is not None and candidate.subject != subject:
            continue
        if level is not None and candidate.level != level:
            continue
        matches.append(candidate)
    return matches
def summarize_records(records: list[MathVisionRecord]) -> dict[str, object]:
    """Condense a list of records into a small summary dictionary.

    The result has four keys: ``"records"`` (total count), ``"images"``
    (records whose ``image_path`` is set), ``"subjects"`` and ``"levels"``
    (sorted distinct values that are not ``None``).
    """
    seen_subjects: set[str] = set()
    seen_levels: set[int] = set()
    with_image = 0
    for item in records:
        if item.subject is not None:
            seen_subjects.add(item.subject)
        if item.level is not None:
            seen_levels.add(item.level)
        if item.image_path is not None:
            with_image += 1

    summary: dict[str, object] = {}
    summary["records"] = len(records)
    summary["images"] = with_image
    summary["subjects"] = sorted(seen_subjects)
    summary["levels"] = sorted(seen_levels)
    return summary
def _required_string(
payload: dict[str, Any], key: str, *, fallback_key: str | None = None
) -> str:
value = payload.get(key)
if value is None and fallback_key is not None:
value = payload.get(fallback_key)
if not isinstance(value, str) or not value.strip():
msg = f"Missing required string field: {key}"
raise ValueError(msg)
return value
def _optional_string(value: object) -> str | None:
if value is None:
return None
if not isinstance(value, str):
msg = "Optional text fields must be strings when present."
raise ValueError(msg)
return value
def _optional_int(value: object) -> int | None:
if value is None:
return None
if isinstance(value, bool) or not isinstance(value, int):
msg = "Level must be an integer when present."
raise ValueError(msg)
return value
def _optional_path(value: object, *, source_dir: Path | None) -> Path | None:
if value is None:
return None
if not isinstance(value, str) or not value:
msg = "Image path must be a non-empty string when present."
raise ValueError(msg)
image_path = Path(value)
if source_dir is not None and not image_path.is_absolute():
return source_dir / image_path
return image_path
def _options_from_value(value: object) -> tuple[str, ...]:
if value is None:
return ()
if not isinstance(value, list) or not all(isinstance(option, str) for option in value):
msg = "Options must be a list of strings when present."
raise ValueError(msg)
return tuple(value)