Make
Browse files
src/mathvision_explorer/streamlit_app.py
CHANGED
|
@@ -60,54 +60,64 @@ def main(jsonl_path: Path = Path("data/demo/demo.jsonl")) -> None:
|
|
| 60 |
f"[Example: MathLLMs/MathVision]({MATHVISION_DATASET_URL}) | "
|
| 61 |
f"[Browse HF datasets]({HF_DATASETS_URL})"
|
| 62 |
)
|
| 63 |
-
|
| 64 |
-
"
|
| 65 |
-
|
|
|
|
| 66 |
help=(
|
| 67 |
-
"
|
| 68 |
-
"
|
| 69 |
),
|
| 70 |
)
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
elif
|
| 110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
subjects = sorted({record.subject for record in records if record.subject is not None})
|
| 112 |
levels = sorted({record.level for record in records if record.level is not None})
|
| 113 |
summary = summarize_records(records)
|
|
|
|
| 60 |
f"[Example: MathLLMs/MathVision]({MATHVISION_DATASET_URL}) | "
|
| 61 |
f"[Browse HF datasets]({HF_DATASETS_URL})"
|
| 62 |
)
|
| 63 |
+
dataset_source = st.radio(
|
| 64 |
+
"Dataset source",
|
| 65 |
+
["Demo", "Hugging Face URL", "Upload file"],
|
| 66 |
+
horizontal=False,
|
| 67 |
help=(
|
| 68 |
+
"Choose whether to use the bundled demo, paste a Hub dataset link, "
|
| 69 |
+
"or upload files."
|
| 70 |
),
|
| 71 |
)
|
| 72 |
+
if dataset_source == "Hugging Face URL":
|
| 73 |
+
hf_dataset_ref = st.text_input(
|
| 74 |
+
"HF dataset URL or ID",
|
| 75 |
+
value="MathLLMs/MathVision",
|
| 76 |
+
placeholder="https://huggingface.co/datasets/MathLLMs/MathVision",
|
| 77 |
+
help="Paste a Hugging Face dataset URL or repo id.",
|
| 78 |
+
)
|
| 79 |
+
hf_split = st.text_input(
|
| 80 |
+
"HF split",
|
| 81 |
+
value="test",
|
| 82 |
+
help="Dataset split to load, such as test, train, validation, or testmini.",
|
| 83 |
+
)
|
| 84 |
+
hf_limit = st.number_input(
|
| 85 |
+
"HF max records",
|
| 86 |
+
min_value=1,
|
| 87 |
+
max_value=500,
|
| 88 |
+
value=50,
|
| 89 |
+
step=10,
|
| 90 |
+
help="Cap rows loaded from Hugging Face so exploration stays responsive.",
|
| 91 |
+
)
|
| 92 |
+
if st.button(
|
| 93 |
+
"Load HF dataset",
|
| 94 |
+
help="Download the selected split and convert compatible rows into records.",
|
| 95 |
+
):
|
| 96 |
+
try:
|
| 97 |
+
records = _load_hf_dataset_records(
|
| 98 |
+
st,
|
| 99 |
+
hf_dataset_ref,
|
| 100 |
+
split=hf_split,
|
| 101 |
+
limit=int(hf_limit),
|
| 102 |
+
)
|
| 103 |
+
except (RuntimeError, ValueError, OSError) as error:
|
| 104 |
+
st.error(str(error))
|
| 105 |
+
st.stop()
|
| 106 |
+
raise RuntimeError("Streamlit stopped after HF dataset load error.") from error
|
| 107 |
+
st.session_state["hf_dataset_records"] = records
|
| 108 |
+
elif "hf_dataset_records" in st.session_state:
|
| 109 |
+
records = st.session_state["hf_dataset_records"]
|
| 110 |
+
elif dataset_source == "Upload file":
|
| 111 |
+
uploaded_dataset = st.file_uploader(
|
| 112 |
+
"Upload dataset",
|
| 113 |
+
type=["jsonl", "zip"],
|
| 114 |
+
help=(
|
| 115 |
+
"Use a JSONL file for text-only records, or a ZIP containing one JSONL "
|
| 116 |
+
"file plus referenced images."
|
| 117 |
+
),
|
| 118 |
+
)
|
| 119 |
+
if uploaded_dataset is not None:
|
| 120 |
+
records = _load_uploaded_records(st, uploaded_dataset)
|
| 121 |
subjects = sorted({record.subject for record in records if record.subject is not None})
|
| 122 |
levels = sorted({record.level for record in records if record.level is not None})
|
| 123 |
summary = summarize_records(records)
|