ddebree committed on
Commit
cff9aa4
·
1 Parent(s): 5f7d974
src/mathvision_explorer/streamlit_app.py CHANGED
@@ -60,54 +60,64 @@ def main(jsonl_path: Path = Path("data/demo/demo.jsonl")) -> None:
60
  f"[Example: MathLLMs/MathVision]({MATHVISION_DATASET_URL}) | "
61
  f"[Browse HF datasets]({HF_DATASETS_URL})"
62
  )
63
- uploaded_dataset = st.file_uploader(
64
- "Upload dataset",
65
- type=["jsonl", "zip"],
 
66
  help=(
67
- "Use a JSONL file for text-only records, or a ZIP containing one JSONL "
68
- "file plus referenced images."
69
  ),
70
  )
71
- hf_dataset_ref = st.text_input(
72
- "HF dataset URL or ID",
73
- placeholder="MathLLMs/MathVision",
74
- help="Paste a Hugging Face dataset URL or repo id, then load a split below.",
75
- )
76
- hf_split = st.text_input(
77
- "HF split",
78
- value="test",
79
- help="Dataset split to load, such as test, train, validation, or testmini.",
80
- )
81
- hf_limit = st.number_input(
82
- "HF max records",
83
- min_value=1,
84
- max_value=500,
85
- value=50,
86
- step=10,
87
- help="Cap rows loaded from Hugging Face so exploration stays responsive.",
88
- )
89
- if st.button(
90
- "Load HF dataset",
91
- help="Download the selected split and convert compatible rows into records.",
92
- ):
93
- try:
94
- records = _load_hf_dataset_records(
95
- st,
96
- hf_dataset_ref,
97
- split=hf_split,
98
- limit=int(hf_limit),
99
- )
100
- except (RuntimeError, ValueError, OSError) as error:
101
- st.error(str(error))
102
- st.stop()
103
- raise RuntimeError("Streamlit stopped after HF dataset load error.") from error
104
- st.session_state["hf_dataset_records"] = records
105
- elif uploaded_dataset is not None:
106
- records = _load_uploaded_records(st, uploaded_dataset)
107
- subjects = sorted({record.subject for record in records if record.subject is not None})
108
- levels = sorted({record.level for record in records if record.level is not None})
109
- elif "hf_dataset_records" in st.session_state:
110
- records = st.session_state["hf_dataset_records"]
 
 
 
 
 
 
 
 
 
111
  subjects = sorted({record.subject for record in records if record.subject is not None})
112
  levels = sorted({record.level for record in records if record.level is not None})
113
  summary = summarize_records(records)
 
60
  f"[Example: MathLLMs/MathVision]({MATHVISION_DATASET_URL}) | "
61
  f"[Browse HF datasets]({HF_DATASETS_URL})"
62
  )
63
+ dataset_source = st.radio(
64
+ "Dataset source",
65
+ ["Demo", "Hugging Face URL", "Upload file"],
66
+ horizontal=False,
67
  help=(
68
+ "Choose whether to use the bundled demo, paste a Hub dataset link, "
69
+ "or upload files."
70
  ),
71
  )
72
+ if dataset_source == "Hugging Face URL":
73
+ hf_dataset_ref = st.text_input(
74
+ "HF dataset URL or ID",
75
+ value="MathLLMs/MathVision",
76
+ placeholder="https://huggingface.co/datasets/MathLLMs/MathVision",
77
+ help="Paste a Hugging Face dataset URL or repo id.",
78
+ )
79
+ hf_split = st.text_input(
80
+ "HF split",
81
+ value="test",
82
+ help="Dataset split to load, such as test, train, validation, or testmini.",
83
+ )
84
+ hf_limit = st.number_input(
85
+ "HF max records",
86
+ min_value=1,
87
+ max_value=500,
88
+ value=50,
89
+ step=10,
90
+ help="Cap rows loaded from Hugging Face so exploration stays responsive.",
91
+ )
92
+ if st.button(
93
+ "Load HF dataset",
94
+ help="Download the selected split and convert compatible rows into records.",
95
+ ):
96
+ try:
97
+ records = _load_hf_dataset_records(
98
+ st,
99
+ hf_dataset_ref,
100
+ split=hf_split,
101
+ limit=int(hf_limit),
102
+ )
103
+ except (RuntimeError, ValueError, OSError) as error:
104
+ st.error(str(error))
105
+ st.stop()
106
+ raise RuntimeError("Streamlit stopped after HF dataset load error.") from error
107
+ st.session_state["hf_dataset_records"] = records
108
+ elif "hf_dataset_records" in st.session_state:
109
+ records = st.session_state["hf_dataset_records"]
110
+ elif dataset_source == "Upload file":
111
+ uploaded_dataset = st.file_uploader(
112
+ "Upload dataset",
113
+ type=["jsonl", "zip"],
114
+ help=(
115
+ "Use a JSONL file for text-only records, or a ZIP containing one JSONL "
116
+ "file plus referenced images."
117
+ ),
118
+ )
119
+ if uploaded_dataset is not None:
120
+ records = _load_uploaded_records(st, uploaded_dataset)
121
  subjects = sorted({record.subject for record in records if record.subject is not None})
122
  levels = sorted({record.level for record in records if record.level is not None})
123
  summary = summarize_records(records)