Spaces:

ddebree
/

mathvision-jepa-explorer

Running

App Files Files Community

ddebree committed on May 3

Commit

45ade5d

1 Parent(s): 44f945d

Add

Browse files

Files changed (3) hide show

src/mathvision_explorer/dataset.py +19 -10
src/mathvision_explorer/streamlit_app.py +94 -3
tests/test_dataset.py +16 -0

src/mathvision_explorer/dataset.py CHANGED Viewed

@@ -26,17 +26,26 @@ class MathVisionRecord:
 def load_jsonl_records(path: Path) -> list[MathVisionRecord]:
     """Load MathVision-like records from a UTF-8 JSONL file."""
     records: list[MathVisionRecord] = []
-    with path.open("r", encoding="utf-8") as jsonl_file:
-        for line_number, line in enumerate(jsonl_file, start=1):
-            stripped = line.strip()
-            if not stripped:
-                continue
-            payload = json.loads(stripped)
-            if not isinstance(payload, dict):
-                msg = f"Line {line_number} must contain a JSON object."
-                raise ValueError(msg)
-            records.append(record_from_mapping(payload, source_dir=path.parent))
     return records

 def load_jsonl_records(path: Path) -> list[MathVisionRecord]:
     """Load MathVision-like records from a UTF-8 JSONL file."""
+    # Read the whole file as UTF-8 text and delegate parsing to the text-based
+    # loader, passing the file's parent directory as source_dir.
+    return load_jsonl_records_from_text(path.read_text(encoding="utf-8"), source_dir=path.parent)
+def load_jsonl_records_from_text(
+    content: str,
+    *,
+    source_dir: Path | None = None,
+) -> list[MathVisionRecord]:
+    """Load MathVision-like records from UTF-8 JSONL text.
+
+    Blank lines are skipped. Every other line must hold one JSON object, which is
+    converted with ``record_from_mapping`` using ``source_dir``.
+
+    Raises:
+        json.JSONDecodeError: If a line is not valid JSON. The message is prefixed
+            with the 1-based line number within ``content``.
+        ValueError: If a line parses to something other than a JSON object.
+    """
     records: list[MathVisionRecord] = []
+    # Split on "\r\n", "\r" and "\n" only. str.splitlines() also breaks on
+    # U+2028, U+2029 and U+0085, which may appear unescaped inside JSON strings
+    # and would cut a valid record in half.
+    lines = content.replace("\r\n", "\n").replace("\r", "\n").split("\n")
+    for line_number, line in enumerate(lines, start=1):
+        stripped = line.strip()
+        if not stripped:
+            continue
+        try:
+            payload = json.loads(stripped)
+        except json.JSONDecodeError as error:
+            # Each line is parsed on its own, so the decoder always reports
+            # "line 1"; prefix the real line number and keep the exception type.
+            msg = f"Line {line_number}: {error.msg}"
+            raise json.JSONDecodeError(msg, error.doc, error.pos) from error
+        if not isinstance(payload, dict):
+            msg = f"Line {line_number} must contain a JSON object."
+            raise ValueError(msg)
+        records.append(record_from_mapping(payload, source_dir=source_dir))
     return records

src/mathvision_explorer/streamlit_app.py CHANGED Viewed

@@ -2,11 +2,22 @@
 from __future__ import annotations
 from importlib import import_module
 from pathlib import Path
 from typing import Any
-from mathvision_explorer.dataset import MathVisionRecord, filter_records, load_jsonl_records
 from mathvision_explorer.embeddings import (
     ColorStatsEmbedder,
     IJepaImageEmbedder,
@@ -30,16 +41,31 @@ def main(jsonl_path: Path = Path("data/demo/demo.jsonl")) -> None:
     """Run the Streamlit explorer app."""
     st = _load_streamlit()
-    records = load_jsonl_records(jsonl_path)
     st.set_page_config(page_title="MathVision Explorer", layout="wide")
     _stabilize_layout(st)
     st.title("MathVision Explorer")
     subjects = sorted({record.subject for record in records if record.subject is not None})
     levels = sorted({record.level for record in records if record.level is not None})
     with st.sidebar:
         st.header("Filters")
         subject = st.selectbox(
             "Subject",
@@ -238,6 +264,71 @@ def _render_record(st: Any, record: MathVisionRecord, *, show_solution: bool) ->
                 st.write(record.solution)
 def _render_patch_attention(
     st: Any,
     embedder: ImageEmbedder,

 from __future__ import annotations
+import hashlib
+import io
+import shutil
+import tempfile
 from importlib import import_module
 from pathlib import Path
 from typing import Any
+from zipfile import BadZipFile, ZipFile
+from mathvision_explorer.dataset import (
+    MathVisionRecord,
+    filter_records,
+    load_jsonl_records,
+    load_jsonl_records_from_text,
+    summarize_records,
+)
 from mathvision_explorer.embeddings import (
     ColorStatsEmbedder,
     IJepaImageEmbedder,
     """Run the Streamlit explorer app."""
     st = _load_streamlit()
     st.set_page_config(page_title="MathVision Explorer", layout="wide")
     _stabilize_layout(st)
     st.title("MathVision Explorer")
+    records = _load_active_records(st, jsonl_path)
     subjects = sorted({record.subject for record in records if record.subject is not None})
     levels = sorted({record.level for record in records if record.level is not None})
     with st.sidebar:
+        st.header("Dataset")
+        uploaded_dataset = st.file_uploader(
+            "Upload dataset",
+            type=["jsonl", "zip"],
+            help=(
+                "Use a JSONL file for text-only records, or a ZIP containing one JSONL "
+                "file plus referenced images."
+            ),
+        )
+        if uploaded_dataset is not None:
+            records = _load_uploaded_records(st, uploaded_dataset)
+            subjects = sorted({record.subject for record in records if record.subject is not None})
+            levels = sorted({record.level for record in records if record.level is not None})
+        summary = summarize_records(records)
+        st.caption(f"{summary['records']} records | {summary['images']} images")
         st.header("Filters")
         subject = st.selectbox(
             "Subject",
                 st.write(record.solution)
+def _load_active_records(st: Any, jsonl_path: Path) -> list[MathVisionRecord]:
+    """Load records from ``jsonl_path``, reporting load failures in the Streamlit UI.
+
+    ``OSError`` and ``ValueError`` raised by ``load_jsonl_records`` are shown with
+    ``st.error`` and followed by ``st.stop()``.
+    """
+    try:
+        return load_jsonl_records(jsonl_path)
+    except (OSError, ValueError) as error:
+        st.error(str(error))
+        st.stop()
+        # Only reached if st.stop() returns (presumably it halts the script run in
+        # a real Streamlit session - TODO confirm); it also ensures every path
+        # through the function either returns or raises.
+        raise RuntimeError("Streamlit stopped after dataset load error.") from error
+def _load_uploaded_records(st: Any, uploaded_dataset: Any) -> list[MathVisionRecord]:
+    """Load records from an uploaded dataset file.
+
+    Names ending in ``.zip`` (case-insensitive) are handled by
+    ``_load_uploaded_zip_records``; anything else is decoded as UTF-8 and parsed
+    as JSONL text. The listed load errors are shown with ``st.error`` and
+    followed by ``st.stop()``.
+    """
+    dataset_bytes = uploaded_dataset.getvalue()
+    dataset_name = uploaded_dataset.name
+    # The key is built from the upload's name and its bytes.
+    dataset_key = _uploaded_dataset_key(dataset_name, dataset_bytes)
+    try:
+        if dataset_name.lower().endswith(".zip"):
+            return _load_uploaded_zip_records(st, dataset_key, dataset_bytes)
+        # No source_dir is passed here, so the loader's default (None) applies.
+        return load_jsonl_records_from_text(dataset_bytes.decode("utf-8"))
+    except (BadZipFile, UnicodeDecodeError, ValueError, OSError) as error:
+        st.error(str(error))
+        st.stop()
+        # Only reached if st.stop() returns.
+        raise RuntimeError("Streamlit stopped after upload load error.") from error
+def _load_uploaded_zip_records(
+    st: Any,
+    dataset_key: str,
+    dataset_bytes: bytes,
+) -> list[MathVisionRecord]:
+    """Load records from an uploaded ZIP, extracting it once per distinct upload.
+
+    The upload key and extraction directory are kept in
+    ``st.session_state["uploaded_dataset"]``; extraction is skipped while the
+    key is unchanged.
+
+    Raises:
+        ValueError: If the extracted archive contains no ``.jsonl`` file, or if
+            ``_extract_zip_safely`` rejects a member path.
+    """
+    upload_state = st.session_state.setdefault("uploaded_dataset", {})
+    if upload_state.get("key") != dataset_key:
+        extract_dir = Path(tempfile.mkdtemp(prefix="mathvision-upload-"))
+        try:
+            _extract_zip_safely(dataset_bytes, extract_dir)
+        except Exception:
+            # Extraction failed: drop the partially filled directory and leave the
+            # previously recorded upload (key and directory) intact.
+            _remove_upload_dir(str(extract_dir))
+            raise
+        # Retire the old directory only after the new archive extracted cleanly,
+        # so session state never points at a deleted directory.
+        _remove_upload_dir(upload_state.get("extract_dir"))
+        upload_state.clear()
+        upload_state.update({"key": dataset_key, "extract_dir": str(extract_dir)})
+    extract_dir = Path(upload_state["extract_dir"])
+    # If several .jsonl files are present, the first in sorted path order is used.
+    jsonl_files = sorted(extract_dir.rglob("*.jsonl"))
+    if not jsonl_files:
+        msg = "Uploaded ZIP must contain a .jsonl file."
+        raise ValueError(msg)
+    return load_jsonl_records(jsonl_files[0])
+def _extract_zip_safely(dataset_bytes: bytes, extract_dir: Path) -> None:
+    """Extract an in-memory ZIP archive into ``extract_dir``.
+
+    Members are checked and extracted one at a time, so members that precede a
+    rejected one have already been written when the error is raised.
+
+    Raises:
+        ValueError: If a member's resolved target path is not inside the
+            resolved ``extract_dir``.
+    """
+    with ZipFile(io.BytesIO(dataset_bytes)) as dataset_zip:
+        for member in dataset_zip.infolist():
+            # Resolve the would-be destination so ".." segments and absolute
+            # member names are normalized before the containment check.
+            target_path = (extract_dir / member.filename).resolve()
+            if not target_path.is_relative_to(extract_dir.resolve()):
+                msg = f"Unsafe ZIP member path: {member.filename}"
+                raise ValueError(msg)
+            dataset_zip.extract(member, extract_dir)
+def _uploaded_dataset_key(dataset_name: str, dataset_bytes: bytes) -> str:
+    """Return ``"<dataset_name>:<sha256 hex digest of dataset_bytes>"``."""
+    digest = hashlib.sha256(dataset_bytes).hexdigest()
+    return f"{dataset_name}:{digest}"
+def _remove_upload_dir(path: object) -> None:
+    """Recursively delete ``path`` when it is a string; do nothing otherwise."""
+    # Non-string values (such as None) are ignored by the isinstance guard.
+    if isinstance(path, str):
+        # ignore_errors=True: removal failures are not raised.
+        shutil.rmtree(path, ignore_errors=True)
 def _render_patch_attention(
     st: Any,
     embedder: ImageEmbedder,

tests/test_dataset.py CHANGED Viewed

@@ -10,6 +10,7 @@ import pytest
 from mathvision_explorer.dataset import (
     filter_records,
     load_jsonl_records,
     record_from_mapping,
     summarize_records,
 )
@@ -52,6 +53,21 @@ def test_load_jsonl_records_resolves_relative_image_paths(tmp_path: Path) -> Non
     assert records[0].image_path == tmp_path / "images" / "mv-2.png"
 def test_filter_and_summary() -> None:
     """Records can be filtered and summarized for explorer views."""

 from mathvision_explorer.dataset import (
     filter_records,
     load_jsonl_records,
+    load_jsonl_records_from_text,
     record_from_mapping,
     summarize_records,
 )
     assert records[0].image_path == tmp_path / "images" / "mv-2.png"
+def test_load_jsonl_records_from_text_resolves_relative_image_paths(tmp_path: Path) -> None:
+    """Uploaded JSONL content can still use a caller-provided image base directory."""
+    payload = {
+        "id": "mv-3",
+        "question": "Pick the matching graph.",
+        "answer": "A",
+        "image": "images/mv-3.png",
+    }
+    # A single JSON object serialized on one line forms a one-record JSONL text.
+    records = load_jsonl_records_from_text(json.dumps(payload), source_dir=tmp_path)
+    # The relative "image" value is expected to be joined onto source_dir.
+    assert records[0].image_path == tmp_path / "images" / "mv-3.png"
 def test_filter_and_summary() -> None:
     """Records can be filtered and summarized for explorer views."""