| import streamlit as st |
| from datasets import load_dataset |
| import os |
|
|
# Optional Hugging Face access token, read from the environment; None when unset.
HF_TOKEN = os.environ.get("HF_TOKEN")


# Page-level configuration, followed by the heading and intro text of the app.
st.set_page_config(
    page_title="Synthetic textbooks inspection",
    layout="wide",
)
st.title("Synthetic textbooks inspection")
st.markdown("Inspection of synthetic textbooks generated by `Falcon-180B-chat`")
|
|
@st.cache_data()
def load_data(source="all"):
    """Load the train split of the synthetic textbooks subset, optionally filtered.

    Args:
        source: Value of the "source" column to keep. "all" (the default)
            returns the split unfiltered.

    Returns:
        The loaded dataset, restricted to rows whose "source" equals
        ``source`` unless ``source`` is "all".
    """
    # `token` replaces the deprecated `use_auth_token` keyword, which newer
    # `datasets` releases no longer accept.
    ds = load_dataset(
        "HuggingFaceTB/synthetic_textbooks_subset",
        split="train",
        token=HF_TOKEN,
    )
    if source == "all":
        return ds
    return ds.filter(lambda row: row["source"] == source)
|
|
|
|
# Let the user narrow the dataset to a single source, or keep everything.
source = st.selectbox(
    "Data source",
    ['all', 'wikihow', 'khan_academy', 'stanford_courses', 'rw_wikihow', 'rw_stanford'],
)
samples = load_data(source)
n_samples = len(samples)

# With no rows, max_value below would be -1 while min_value/value are 0, which
# the number input rejects; report the empty result and end this run instead.
if n_samples == 0:
    st.warning(f"No samples found for source: {source}")
    st.stop()

index_example = st.number_input(
    f"Index of the sample (out of {n_samples}):",
    min_value=0,
    max_value=n_samples - 1,
    value=0,
    step=1,
)

# Fetch the selected row once instead of re-indexing the dataset per field.
sample = samples[index_example]
st.markdown(f"Example belongs to source: {sample['source']}")
st.subheader("Prompt")
st.markdown(sample["prompt"])


st.subheader("Textbook")
st.markdown(sample['textbook'])