| import streamlit as st |
| from datasets import load_dataset |
| import os |
|
|
# Hugging Face access token taken from the HF_TOKEN environment variable;
# stays None when the variable is not set.
HF_TOKEN = os.getenv("HF_TOKEN")
|
|
# Page chrome: wide layout plus the page title, followed by the visible heading.
st.set_page_config(
    layout="wide",
    page_title="FW Clusters inspection",
)
st.title("FW clusters inspection")
|
|
# Introductory text shown under the title. The last paragraph explains the
# three options of the `Select Category Type` dropdown: a category can come
# from the predefined AFAIK list, be invented by the model under an AFAIK
# topic, or be invented by the model independently.
st.markdown("""
We clustered 100k FineWeb samples using [text-clustering](https://github.com/huggingface/text-clustering).

Our approach involved prompting Mixtral to evaluate whether the topics in each cluster are educational or could be considered college material.

Additionally, the model was tasked with assigning a category to each cluster from 23 predefined categories found in [AFAIK](https://afaik.io/).

Sometimes, the model may define its own category. This can happen either within the context of AFAIK topics or separately. Hence the `Select Category Type` dropdown in our interface.
""")
|
|
@st.cache_data
def load_data(educational_topic):
    """Load the clusters dataset, optionally restricted by educational label.

    Args:
        educational_topic: ``'Yes'`` or ``'No'`` to keep only the rows whose
            ``is_topic_educational`` field equals that value. Any other value
            returns the dataset unfiltered.

    Returns:
        The ``train`` split of ``HuggingFaceTB/FW_clusters_under_afaik_topics``,
        filtered as described above. The result is cached by Streamlit per
        ``educational_topic`` value.
    """
    dataset = load_dataset(
        "HuggingFaceTB/FW_clusters_under_afaik_topics",
        split="train",
        token=HF_TOKEN,
        num_proc=2,
    )

    # Anything other than an explicit Yes/No means "no filtering".
    if educational_topic not in ('Yes', 'No'):
        return dataset

    return dataset.filter(
        lambda row: row['is_topic_educational'] == educational_topic
    )
|
|
@st.cache_data
def get_categories_by_type(_ds, category_type, educational_topic=None):
    """Return the sorted, distinct categories that belong to one category type.

    Args:
        _ds: Dataset exposing ``category_type`` and ``category`` columns. The
            leading underscore makes Streamlit skip this argument when it
            builds the cache key, so the dataset itself never distinguishes
            two cached results.
        category_type: Value of the ``category_type`` column to keep.
        educational_topic: Not used in the computation. It exists only so the
            cache key changes when the caller switches to a differently
            filtered dataset; without it the categories computed for one
            dataset would be served for another.

    Returns:
        A list of the distinct ``category`` values, in a stable sorted order
        so the dropdown built from it does not reshuffle between runs.
    """
    filtered_ds = _ds.filter(lambda row: row['category_type'] == category_type)
    # key=str keeps the sort well-defined even if a category is missing (None).
    return sorted(set(filtered_ds['category']), key=str)


st.subheader("Cluster information")
col_1, col_2, col_3 = st.columns(3)
with col_1:
    educational_topic = st.selectbox(
        'Are the topics deemed educational by the LLM?', ["Yes", "No"]
    )

ds = load_data(educational_topic)

with col_2:
    category_types = ['afaik', 'defined_by_llm', 'defined_by_llm_under_afaik']
    selected_category_type = st.selectbox("Select Category Type", category_types)
with col_3:
    # Pass educational_topic so the cached category list matches the dataset
    # that is currently loaded (the dataset argument is not part of the key).
    categories = get_categories_by_type(
        ds, selected_category_type, educational_topic
    )
    selected_category = st.selectbox("Select Category", categories)

# An empty option list makes the selectbox return None; nothing to display.
if selected_category is None:
    st.warning("No categories available for the selected category type.")
    st.stop()

# NOTE(review): this filter matches on category only, not on the selected
# category type - confirm that a category name cannot occur under two types.
selected_cluster = ds.filter(lambda x: x['category'] == selected_category)

n_samples = len(selected_cluster)
if n_samples == 0:
    # number_input would be given max_value=-1 below min_value=0 otherwise.
    st.warning("No clusters found for the selected category.")
    st.stop()

last_index = n_samples - 1
index_example = st.number_input(
    f"Index of a sample: 0 - {last_index}",
    min_value=0,
    max_value=last_index,
    value=0,
    step=1,
)

# Fetch only the requested row instead of materialising the whole column.
sample = selected_cluster[int(index_example)]["examples"]
st.markdown(sample)
|
|