ddebree's picture
Upload folder using huggingface_hub
f9306c2 verified
"""High-level workflows for MathVision exploration."""
from __future__ import annotations
from pathlib import Path
from mathvision_explorer.dataset import MathVisionRecord, filter_records, load_jsonl_records
from mathvision_explorer.embeddings import ImageEmbedder, embed_record_image
from mathvision_explorer.index import Neighbor, VectorIndex
def build_image_index(records: list[MathVisionRecord], embedder: ImageEmbedder) -> VectorIndex:
    """Create a fresh vector index holding one embedding per record with an image.

    Records whose ``image_path`` is ``None`` are skipped. Every other record is
    embedded with ``embed_record_image`` and added to the index under its
    ``problem_id``, in the order the records are given.

    Args:
        records: Records to consider for indexing.
        embedder: Embedder handed through to ``embed_record_image``.

    Returns:
        The newly built ``VectorIndex``; it stays empty when no record has an
        image path.
    """
    image_index = VectorIndex()
    # Lazily narrow down to the records that actually reference an image.
    records_with_images = (item for item in records if item.image_path is not None)
    for item in records_with_images:
        key = item.problem_id
        vector = embed_record_image(item.image_path, embedder)
        image_index.add(key, vector)
    return image_index
def find_similar_records(
    records: list[MathVisionRecord],
    index: VectorIndex,
    query_id: str,
    query_vector: tuple[float, ...],
    *,
    limit: int = 5,
) -> list[tuple[MathVisionRecord, Neighbor]]:
    """Look up the index neighbors of a query vector and pair them with records.

    Args:
        records: Candidate records; neighbors are resolved against their
            ``problem_id``. If several records share an id, the last one wins.
        index: Index that is searched with ``query_vector``.
        query_id: Forwarded to ``index.search`` as ``exclude_id`` (presumably
            so the query item is not returned as its own neighbor; confirm
            against ``VectorIndex.search``).
        query_vector: Vector to search with.
        limit: Forwarded to ``index.search`` as ``limit``.

    Returns:
        ``(record, neighbor)`` pairs in the order the index returned the
        neighbors. Neighbors whose ``item_id`` matches no record in ``records``
        are dropped, so the result may be shorter than what the search yielded.
    """
    lookup = {}
    for item in records:
        lookup[item.problem_id] = item

    matches = []
    for hit in index.search(query_vector, limit=limit, exclude_id=query_id):
        # The index may know ids that are absent from ``records``
        # (e.g. after filtering); those hits are silently skipped.
        if hit.item_id not in lookup:
            continue
        matches.append((lookup[hit.item_id], hit))
    return matches
def load_filtered_records(
    path: Path, *, subject: str | None = None, level: int | None = None
) -> list[MathVisionRecord]:
    """Read records from a JSONL file and run them through the explorer filters.

    Args:
        path: File handed to ``load_jsonl_records``.
        subject: Forwarded to ``filter_records`` as ``subject`` (presumably
            ``None`` means "do not filter by subject"; confirm against
            ``filter_records``).
        level: Forwarded to ``filter_records`` as ``level`` (same assumption
            about ``None``).

    Returns:
        Whatever ``filter_records`` returns for the loaded records.
    """
    loaded = load_jsonl_records(path)
    return filter_records(loaded, subject=subject, level=level)