"""Dataset loading helpers for MathVision-like JSONL exports.""" from __future__ import annotations import json from dataclasses import dataclass from pathlib import Path from typing import Any @dataclass(frozen=True, slots=True) class MathVisionRecord: """A single visual math problem with optional image and solution metadata.""" problem_id: str question: str answer: str subject: str | None = None level: int | None = None problem_type: str | None = None image_path: Path | None = None options: tuple[str, ...] = () solution: str | None = None def load_jsonl_records(path: Path) -> list[MathVisionRecord]: """Load MathVision-like records from a UTF-8 JSONL file.""" return load_jsonl_records_from_text(path.read_text(encoding="utf-8"), source_dir=path.parent) def load_jsonl_records_from_text( content: str, *, source_dir: Path | None = None, ) -> list[MathVisionRecord]: """Load MathVision-like records from UTF-8 JSONL text.""" records: list[MathVisionRecord] = [] for line_number, line in enumerate(content.splitlines(), start=1): stripped = line.strip() if not stripped: continue payload = json.loads(stripped) if not isinstance(payload, dict): msg = f"Line {line_number} must contain a JSON object." raise ValueError(msg) records.append(record_from_mapping(payload, source_dir=source_dir)) return records def record_from_mapping( payload: dict[str, Any], *, source_dir: Path | None = None ) -> MathVisionRecord: """Create a typed record from a raw dictionary.""" problem_id = _required_string(payload, "id", fallback_key="problem_id") question = _required_string(payload, "question") answer = _required_string(payload, "answer") image_path = _optional_path(payload.get("image"), source_dir=source_dir) options = _options_from_value(payload.get("options")) return MathVisionRecord( problem_id=problem_id, question=question, answer=answer, subject=_optional_string(payload.get("subject")), level=_optional_int(payload.get("level")), problem_type=_optional_string( payload.get("problem_type") or payload.get("problemKind") or payload.get("type") or payload.get("task") ), image_path=image_path, options=options, solution=_optional_string(payload.get("solution")), ) def filter_records( records: list[MathVisionRecord], *, subject: str | None = None, level: int | None = None, ) -> list[MathVisionRecord]: """Return records matching optional subject and level filters.""" return [ record for record in records if (subject is None or record.subject == subject) and (level is None or record.level == level) ] def summarize_records(records: list[MathVisionRecord]) -> dict[str, object]: """Build a compact summary for CLI output or dashboards.""" subjects = sorted({record.subject for record in records if record.subject is not None}) levels = sorted({record.level for record in records if record.level is not None}) image_count = sum(1 for record in records if record.image_path is not None) return { "records": len(records), "images": image_count, "subjects": subjects, "levels": levels, } def _required_string( payload: dict[str, Any], key: str, *, fallback_key: str | None = None ) -> str: value = payload.get(key) if value is None and fallback_key is not None: value = payload.get(fallback_key) if not isinstance(value, str) or not value.strip(): msg = f"Missing required string field: {key}" raise ValueError(msg) return value def _optional_string(value: object) -> str | None: if value is None: return None if not isinstance(value, str): msg = "Optional text fields must be strings when present." raise ValueError(msg) return value def _optional_int(value: object) -> int | None: if value is None: return None if isinstance(value, bool) or not isinstance(value, int): msg = "Level must be an integer when present." raise ValueError(msg) return value def _optional_path(value: object, *, source_dir: Path | None) -> Path | None: if value is None: return None if not isinstance(value, str) or not value: msg = "Image path must be a non-empty string when present." raise ValueError(msg) image_path = Path(value) if source_dir is not None and not image_path.is_absolute(): return source_dir / image_path return image_path def _options_from_value(value: object) -> tuple[str, ...]: if value is None: return () if not isinstance(value, list) or not all(isinstance(option, str) for option in value): msg = "Options must be a list of strings when present." raise ValueError(msg) return tuple(value)