| """Dataset loading helpers for MathVision-like JSONL exports.""" |
|
|
| from __future__ import annotations |
|
|
| import json |
| from dataclasses import dataclass |
| from pathlib import Path |
| from typing import Any |
|
|
|
|
| @dataclass(frozen=True, slots=True) |
| class MathVisionRecord: |
| """A single visual math problem with optional image and solution metadata.""" |
|
|
| problem_id: str |
| question: str |
| answer: str |
| subject: str | None = None |
| level: int | None = None |
| problem_type: str | None = None |
| image_path: Path | None = None |
| options: tuple[str, ...] = () |
| solution: str | None = None |
|
|
|
|
def load_jsonl_records(path: Path) -> list[MathVisionRecord]:
    """Read the JSONL file at ``path`` and parse it into records.

    The file is decoded as UTF-8 and handed to
    ``load_jsonl_records_from_text``, with the file's parent directory passed
    as ``source_dir``.
    """
    text = path.read_text(encoding="utf-8")
    base_dir = path.parent
    return load_jsonl_records_from_text(text, source_dir=base_dir)
|
|
|
|
def load_jsonl_records_from_text(
    content: str,
    *,
    source_dir: Path | None = None,
) -> list[MathVisionRecord]:
    """Load MathVision-like records from JSONL text.

    Each non-blank line must hold exactly one JSON object, which is converted
    with ``record_from_mapping``. Blank lines are skipped but still counted,
    so reported line numbers match the physical lines of the input.

    Args:
        content: The full JSONL document. A leading byte-order mark is
            ignored.
        source_dir: Forwarded unchanged to ``record_from_mapping`` for every
            record.

    Returns:
        The parsed records, in input order.

    Raises:
        json.JSONDecodeError: If a line is not valid JSON. The message is
            prefixed with the JSONL line number; the line/column details that
            follow are relative to that single line.
        ValueError: If a line is valid JSON but not an object, or if
            ``record_from_mapping`` rejects the object. The message names the
            offending line.
    """
    # Text decoded as plain UTF-8 keeps a leading BOM, which json.loads rejects.
    text = content.removeprefix("\ufeff")
    # JSONL records are separated by newlines only. str.splitlines() would
    # also break on U+0085, U+2028 and U+2029, which may appear unescaped
    # inside JSON strings, and would cut such records in half.
    lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")

    records: list[MathVisionRecord] = []
    for line_number, line in enumerate(lines, start=1):
        stripped = line.strip()
        if not stripped:
            continue
        try:
            payload = json.loads(stripped)
        except json.JSONDecodeError as err:
            # Re-raise the same exception type so existing handlers still match.
            msg = f"Line {line_number}: invalid JSON: {err.msg}"
            raise json.JSONDecodeError(msg, err.doc, err.pos) from err
        if not isinstance(payload, dict):
            msg = f"Line {line_number} must contain a JSON object."
            raise ValueError(msg)
        try:
            record = record_from_mapping(payload, source_dir=source_dir)
        except ValueError as err:
            msg = f"Line {line_number}: {err}"
            raise ValueError(msg) from err
        records.append(record)
    return records
|
|
|
|
def record_from_mapping(
    payload: dict[str, Any], *, source_dir: Path | None = None
) -> MathVisionRecord:
    """Build a ``MathVisionRecord`` from one decoded JSON object.

    Required keys are ``id`` (with ``problem_id`` as fallback), ``question``
    and ``answer``. Optional keys are ``image``, ``options``, ``subject``,
    ``level``, ``solution`` and a problem-type key. ``source_dir`` is passed
    to the image-path helper. Validation is delegated to the private helpers
    of this module, which raise ``ValueError`` on bad values.
    """
    # Fields are evaluated in a fixed order so that, when several values are
    # invalid, the same validation error is reported first.
    fields: dict[str, Any] = {
        "problem_id": _required_string(payload, "id", fallback_key="problem_id"),
        "question": _required_string(payload, "question"),
        "answer": _required_string(payload, "answer"),
        "image_path": _optional_path(payload.get("image"), source_dir=source_dir),
        "options": _options_from_value(payload.get("options")),
    }
    fields["subject"] = _optional_string(payload.get("subject"))
    fields["level"] = _optional_int(payload.get("level"))

    # Use the first truthy value among the accepted aliases; when none is
    # truthy, the value stored under the last alias is used as-is.
    raw_type = None
    for alias in ("problem_type", "problemKind", "type", "task"):
        raw_type = payload.get(alias)
        if raw_type:
            break
    fields["problem_type"] = _optional_string(raw_type)
    fields["solution"] = _optional_string(payload.get("solution"))

    return MathVisionRecord(**fields)
|
|
|
|
def filter_records(
    records: list[MathVisionRecord],
    *,
    subject: str | None = None,
    level: int | None = None,
) -> list[MathVisionRecord]:
    """Select the records that satisfy every filter that was supplied.

    A filter left as ``None`` is not applied. The result is a new list that
    keeps the input order.
    """
    matches: list[MathVisionRecord] = []
    for candidate in records:
        if subject is not None and candidate.subject != subject:
            continue
        if level is not None and candidate.level != level:
            continue
        matches.append(candidate)
    return matches
|
|
|
|
def summarize_records(records: list[MathVisionRecord]) -> dict[str, object]:
    """Summarize a list of records as a small dictionary.

    The result has four keys: ``records`` (total count), ``images`` (number
    of records with an image path), ``subjects`` and ``levels`` (sorted
    distinct values, ignoring ``None``).
    """
    seen_subjects: set[str] = set()
    seen_levels: set[int] = set()
    with_image = 0
    for entry in records:
        if entry.subject is not None:
            seen_subjects.add(entry.subject)
        if entry.level is not None:
            seen_levels.add(entry.level)
        if entry.image_path is not None:
            with_image += 1

    ordered_subjects = sorted(seen_subjects)
    ordered_levels = sorted(seen_levels)
    return {
        "records": len(records),
        "images": with_image,
        "subjects": ordered_subjects,
        "levels": ordered_levels,
    }
|
|
|
|
def _required_string(
    payload: dict[str, Any], key: str, *, fallback_key: str | None = None
) -> str:
    """Return the non-blank string stored under ``key``.

    ``fallback_key`` is consulted only when ``key`` is absent or maps to
    ``None``. The value is returned unmodified (not stripped).

    Raises:
        ValueError: If the chosen value is not a string or is blank.
    """
    candidate = payload.get(key)
    if candidate is None and fallback_key is not None:
        candidate = payload.get(fallback_key)

    if isinstance(candidate, str) and candidate.strip():
        return candidate

    message = f"Missing required string field: {key}"
    raise ValueError(message)
|
|
|
|
def _optional_string(value: object) -> str | None:
    """Pass through ``None`` or a string; reject anything else.

    Raises:
        ValueError: If ``value`` is neither ``None`` nor a ``str``.
    """
    if value is None or isinstance(value, str):
        return value

    message = "Optional text fields must be strings when present."
    raise ValueError(message)
|
|
|
|
def _optional_int(value: object) -> int | None:
    """Pass through ``None`` or an integer; reject anything else.

    Booleans are rejected even though ``bool`` is a subclass of ``int``.

    Raises:
        ValueError: If ``value`` is not ``None`` and not a non-bool ``int``.
    """
    if value is None:
        return None

    is_plain_int = isinstance(value, int) and not isinstance(value, bool)
    if is_plain_int:
        return value

    message = "Level must be an integer when present."
    raise ValueError(message)
|
|
|
|
def _optional_path(value: object, *, source_dir: Path | None) -> Path | None:
    """Convert an optional image reference into a ``Path``.

    ``None`` is passed through. A relative path is joined onto ``source_dir``
    when one is given; absolute paths, and all paths when ``source_dir`` is
    ``None``, are returned as-is.

    Raises:
        ValueError: If ``value`` is present but not a non-empty string.
    """
    if value is None:
        return None

    if not (isinstance(value, str) and value):
        message = "Image path must be a non-empty string when present."
        raise ValueError(message)

    candidate = Path(value)
    if source_dir is None or candidate.is_absolute():
        return candidate
    return source_dir / candidate
|
|
|
|
def _options_from_value(value: object) -> tuple[str, ...]:
    """Convert an optional list of answer options into a tuple.

    ``None`` yields an empty tuple. Only a ``list`` whose items are all
    strings is accepted.

    Raises:
        ValueError: If ``value`` is present but not a list of strings.
    """
    if value is None:
        return ()

    if isinstance(value, list) and all(isinstance(item, str) for item in value):
        return tuple(value)

    message = "Options must be a list of strings when present."
    raise ValueError(message)
|
|