Spaces:

derek-thomas
/

top2vec

Paused

App Files Files Community

derek-thomas committed on Jan 22, 2023

Commit

74ce942

1 Parent(s): d23c925

Init commit

Browse files

Files changed (10) hide show

.gitattributes +2 -0
.gitignore +3 -0
README.md +1 -1
app/Top2Vec.py +26 -0
app/pages/01_Topic_Explorer_📚.py +55 -0
app/pages/02_Document_Explorer_📖.py +119 -0
app/pages/03_Semantic_Search_🔍.py +112 -0
bootstrap.py +12 -0
notebooks/explore.ipynb +0 -0
requirements.txt +9 -0

.gitattributes CHANGED Viewed

@@ -3,6 +3,7 @@
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text
 *.ckpt filter=lfs diff=lfs merge=lfs -text
 *.ftz filter=lfs diff=lfs merge=lfs -text
 *.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
@@ -23,6 +24,7 @@
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text

 *.bin filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text
 *.ckpt filter=lfs diff=lfs merge=lfs -text
+*.csv filter=lfs diff=lfs merge=lfs -text
 *.ftz filter=lfs diff=lfs merge=lfs -text
 *.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
+*.sav filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+*.pyc
+vscode
+notebooks/.ipynb_checkpoints

README.md CHANGED Viewed

@@ -5,7 +5,7 @@ colorFrom: pink
 colorTo: blue
 sdk: streamlit
 sdk_version: 1.17.0
-app_file: app.py
 pinned: false
 license: mit
 ---

 colorTo: blue
 sdk: streamlit
 sdk_version: 1.17.0
+app_file: app/Top2Vec.py
 pinned: false
 license: mit
 ---

app/Top2Vec.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import streamlit as st
+st.set_page_config(page_title="Top2Vec", layout="wide")
+st.markdown(
+    """
+    # Introduction
+    This is [space](https://huggingface.co/spaces) dedicated to using [top2vec](https://github.com/ddangelov/Top2Vec) and showing what features are available for semantic searching and topic modeling.
+    Please check out this [readme](https://github.com/ddangelov/Top2Vec#how-does-it-work) to better understand how it works.
+    > Top2Vec is an algorithm for **topic modeling** and **semantic search**. It automatically detects topics present in text and generates jointly embedded topic, document and word vectors.
+    # Setup
+    I used the [20 NewsGroups](https://huggingface.co/datasets/SetFit/20_newsgroups) dataset with `top2vec`.
+    I fit on the dataset and reduced the topics to 20.
+    The topics are created from top2vec, not the labels.
+    No analysis on the top 20 topics vs labels is provided.
+    # Usage
+    Check out
+    - The [Topic Explorer](/Topic_Explorer) page to understand what topic were detected
+    - The [Document Explorer](/Document_Explorer) page to visually explore documents
+    - The [Semantic Search](/Semantic_Search) page to search by meaning
+    """
+)

app/pages/01_Topic_Explorer_📚.py ADDED Viewed

	@@ -0,0 +1,55 @@

+from logging import getLogger
+from pathlib import Path
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+import streamlit as st
+from top2vec import Top2Vec
+def initialize_state():
+    with st.spinner('Loading App...'):
+        if 'model' not in st.session_state:
+            model = Top2Vec.load('models/model.pkl')
+            model._check_model_status()
+            model.hierarchical_topic_reduction(num_topics=20)
+            assert len(model.topic_words_reduced) == 20
+            st.session_state.model = model
+def main():
+    st.write("""
+    A way to dive into each topic. Use the slider on the left to choose the topic.
+    The `y` axis shows which words are closest to a topic centroid. The `x` axis shows how correlated they are.""")
+    topic_num = st.sidebar.slider("Topic Number", 0, 19, value=0)
+    fig = go.Figure(go.Bar(
+                x=st.session_state.model.topic_word_scores_reduced[topic_num][::-1],
+                y=st.session_state.model.topic_words_reduced[topic_num][::-1],
+                orientation='h'))
+    fig.update_layout(
+        title=f'Words for Topic {topic_num}',
+        yaxis_title='Top 20 topic words',
+        xaxis_title='Distance to topic centroid'
+        )
+    st.plotly_chart(fig, True)
+if __name__ == "__main__":
+    # Script entry point of the Topic Explorer page.
+    # Setting up Logger and proj_dir (both become module-level globals)
+    logger = getLogger(__name__)
+    # Directory two levels above the folder that contains this file.
+    proj_dir = Path(__file__).parents[2]
+    # For max width tables
+    pd.set_option('display.max_colwidth', 0)
+    # Streamlit settings
+    # NOTE(review): Streamlit documents set_page_config as having to run before
+    # other st.* commands - keep it ahead of the calls below.
+    st.set_page_config(layout="wide")
+    # The same title is rendered in the page body and in the sidebar.
+    md_title = "# Topic Explorer 📚"
+    st.markdown(md_title)
+    st.sidebar.markdown(md_title)
+    # Fill st.session_state.model first; main() reads it.
+    initialize_state()
+    main()

app/pages/02_Document_Explorer_📖.py ADDED Viewed

	@@ -0,0 +1,119 @@

+from distutils.fancy_getopt import wrap_text
+from logging import getLogger
+from pathlib import Path
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+import streamlit as st
+from st_aggrid import AgGrid, ColumnsAutoSizeMode, GridOptionsBuilder
+from streamlit_plotly_events import plotly_events
+from top2vec import Top2Vec
+def initialize_state():
+    if 'data' not in st.session_state:
+        logger.info("loading data...")
+        data = pd.read_csv(proj_dir/'data'/'data.csv')
+        data['topic_id'] = data['topic_id'].apply(lambda x: f'{x:02d}')
+        st.session_state.data = data
+        st.session_state.selected_data = data
+        st.session_state.all_topics = list(data.topic_id.unique())
+    if 'topics' not in st.session_state:
+        logger.info("loading topics...")
+        topics = pd.read_csv(proj_dir/'data'/'topics.csv')
+        topics['topic_id'] = topics['topic_id'].apply(lambda x: f'{x:02d}')
+        st.session_state.topics = topics
+        st.session_state.selected_points = []
+def reset():
+    logger.info("Resetting...")
+    st.session_state.selected_data = st.session_state.data
+    st.session_state.selected_points = []
+def filter_df():
+    if st.session_state.selected_points:
+        points_df = pd.DataFrame(st.session_state.selected_points).loc[:, ['x', 'y']]
+        st.session_state.selected_data = st.session_state.data.merge(points_df, on=['x', 'y'])
+        logger.info(f"Updates selected_data: {len(st.session_state.selected_data)}")
+    else:
+        logger.info(f"Lame")
+def reset():
+    st.session_state.selected_data = st.session_state.data
+    st.session_state.selected_points = []
+def main():
+    st.write("""
+    # Topic Modeling
+    This shows a 2d representation of documents embeded in a semantic space. Each dot is a document
+    and the dots close represent documents that are close in meaning.
+    Zoom in and explore a topic of your choice. You can see the documents you select with the `lasso` or `box`
+    tool below in the corresponding tabs."""
+            )
+    st.button("Reset", help="Will Reset the selected points and the selected topics", on_click=reset)
+    data_to_model = st.session_state.data.sort_values(by='topic_id', ascending=True) # to make legend sorted https://bioinformatics.stackexchange.com/a/18847
+    fig = px.scatter(data_to_model, x='x', y='y', color='topic_id', template='plotly_dark', hover_data=['id', 'topic_id', 'x', 'y'])
+    st.session_state.selected_points = plotly_events(fig, select_event=True, click_event=False)
+    filter_df()
+    tab1, tab2 = st.tabs(["Docs", "Topics"])
+    with tab1:
+        if st.session_state.selected_points:
+            filter_df()
+            cols = ['id', 'topic_id', 'documents']
+            data = st.session_state.selected_data[cols]
+            builder = GridOptionsBuilder.from_dataframe(data)
+            builder.configure_pagination()
+            go = builder.build()
+            AgGrid(st.session_state.selected_data[cols], theme='streamlit', gridOptions=go, columns_auto_size_mode=ColumnsAutoSizeMode.FIT_CONTENTS)
+        else:
+            st.markdown('Select points in the graph with the `lasso` or `box` select tools to populate this table.')
+    def get_topics_counts() -> pd.DataFrame:
+        topic_counts = st.session_state.selected_data["topic_id"].value_counts().to_frame()
+        merged = topic_counts.merge(st.session_state.topics, left_index=True, right_on='topic_id')
+        cleaned = merged.drop(['topic_id_y'], axis=1).rename({'topic_id_x':'topic_count'}, axis=1)
+        cols = ['topic_id'] + [col for col in cleaned.columns if col != 'topic_id']
+        return cleaned[cols]
+    with tab2:
+        if st.session_state.selected_points:
+            filter_df()
+            cols = ['topic_id', 'topic_count', 'topic_0']
+            topic_counts = get_topics_counts()
+            # st.write(topic_counts.columns)
+            builder = GridOptionsBuilder.from_dataframe(topic_counts[cols])
+            builder.configure_pagination()
+            builder.configure_column('topic_0', header_name='Topic Word', wrap_text=True)
+            go = builder.build()
+            AgGrid(topic_counts.loc[:,cols], theme='streamlit', gridOptions=go, columns_auto_size_mode=ColumnsAutoSizeMode.FIT_ALL_COLUMNS_TO_VIEW)
+        else:
+            st.markdown('Select points in the graph with the `lasso` or `box` select tools to populate this table.')
+if __name__ == "__main__":
+    # Script entry point of the Document Explorer page.
+    # Setting up Logger and proj_dir (module-level globals read by the functions above)
+    logger = getLogger(__name__)
+    # Directory two levels above the folder that contains this file.
+    proj_dir = Path(__file__).parents[2]
+    # For max width tables
+    pd.set_option('display.max_colwidth', 0)
+    # Streamlit settings
+    # NOTE(review): Streamlit documents set_page_config as having to run before
+    # other st.* commands - keep it ahead of the calls below.
+    st.set_page_config(layout="wide")
+    # The same title is rendered in the page body and in the sidebar.
+    md_title = "# Document Explorer 📖"
+    st.markdown(md_title)
+    st.sidebar.markdown(md_title)
+    # Session state has to be populated before main() reads it.
+    initialize_state()
+    main()

app/pages/03_Semantic_Search_🔍.py ADDED Viewed

	@@ -0,0 +1,112 @@

+from distutils.fancy_getopt import wrap_text
+from top2vec import Top2Vec
+import joblib
+import streamlit as st
+import pandas as pd
+from pathlib import Path
+import plotly.express as px
+import plotly.graph_objects as go
+from streamlit_plotly_events import plotly_events
+from st_aggrid import AgGrid, GridOptionsBuilder, ColumnsAutoSizeMode
+from logging import getLogger
+@st.cache(show_spinner=False)
+def initialize_state():
+    with st.spinner("Loading app..."):
+        if 'model' not in st.session_state:
+            model = Top2Vec.load('models/model.pkl')
+            model._check_model_status()
+            st.session_state.model = model
+            st.session_state.umap_model = joblib.load(proj_dir / 'models' / 'umap.sav')
+            logger.info("loading data...")
+            data = pd.read_csv(proj_dir/'data'/'data.csv')
+            data['topic_id'] = data['topic_id'].apply(lambda x: f'{x:02d}')
+            st.session_state.data = data
+            topics = pd.read_csv(proj_dir/'data'/'topics.csv')
+            topics['topic_id'] = topics['topic_id'].apply(lambda x: f'{x:02d}')
+            st.session_state.topics = topics
+def main():
+    max_docs = st.sidebar.slider("# docs", 10, 100, value=50)
+    to_search = st.text_input("Write your query here", "") or ""
+    with st.spinner('Embedding Query...'):
+        vector = st.session_state.model.embed([to_search])
+    with st.spinner('Dimension Reduction...'):
+        point = st.session_state.umap_model.transform(vector.reshape(1, -1))
+    documents, document_scores, document_ids = st.session_state.model.search_documents_by_vector(vector.flatten(), num_docs=max_docs)
+    st.session_state.search_raw_df = pd.DataFrame({'document_ids':document_ids, 'document_scores':document_scores})
+    st.session_state.data_to_model = st.session_state.data.merge(st.session_state.search_raw_df, left_on='id', right_on='document_ids').drop(['document_ids'], axis=1)
+    st.session_state.data_to_model = st.session_state.data_to_model.sort_values(by='document_scores', ascending=False) # to make legend sorted https://bioinformatics.stackexchange.com/a/18847
+    st.session_state.data_to_model.loc[len(st.session_state.data_to_model.index)] = ['Point', *point[0].tolist(), to_search, 'Query', 0]
+    st.session_state.data_to_model_with_point = st.session_state.data_to_model
+    st.session_state.data_to_model_without_point = st.session_state.data_to_model.iloc[:-1]
+    def get_topics_counts() -> pd.DataFrame:
+        topic_counts = st.session_state.data_to_model_without_point["topic_id"].value_counts().to_frame()
+        merged = topic_counts.merge(st.session_state.topics, left_index=True, right_on='topic_id')
+        cleaned = merged.drop(['topic_id_y'], axis=1).rename({'topic_id_x':'topic_count'}, axis=1)
+        cols = ['topic_id'] + [col for col in cleaned.columns if col != 'topic_id']
+        return cleaned[cols]
+    st.write("""
+    # Semantic Search
+    This shows a 2d representation of documents embeded in a semantic space. Each dot is a document
+    and the dots close represent documents that are close in meaning.
+    Note that the distance metrics were computed at a higher dimension so take the representation with
+    a grain of salt.
+    The Query is shown with the documents in yellow.
+            """
+            )
+    df = st.session_state.data_to_model_with_point.sort_values(by='topic_id', ascending=True)
+    fig = px.scatter(df.iloc[:-1], x='x', y='y', color='topic_id', template='plotly_dark', hover_data=['id', 'topic_id', 'x', 'y'])
+    fig.add_traces(px.scatter(df.tail(1), x="x", y="y").update_traces(marker_size=10, marker_color="yellow").data)
+    st.plotly_chart(fig, use_container_width=True)
+    tab1, tab2 = st.tabs(["Docs", "Topics"])
+    with tab1:
+        cols = ['id', 'document_scores', 'topic_id', 'documents']
+        builder = GridOptionsBuilder.from_dataframe(st.session_state.data_to_model_without_point.loc[:, cols])
+        builder.configure_pagination()
+        builder.configure_column('document_scores', type=["numericColumn","numberColumnFilter","customNumericFormat"], precision=2)
+        go = builder.build()
+        AgGrid(st.session_state.data_to_model_without_point.loc[:,cols], theme='streamlit', gridOptions=go, columns_auto_size_mode=ColumnsAutoSizeMode.FIT_CONTENTS)
+    with tab2:
+        cols = ['topic_id', 'topic_count', 'topic_0']
+        topic_counts = get_topics_counts()
+        builder = GridOptionsBuilder.from_dataframe(topic_counts[cols])
+        builder.configure_pagination()
+        builder.configure_column('topic_0', header_name='Topic Word', wrap_text=True)
+        go = builder.build()
+        AgGrid(topic_counts.loc[:,cols], theme='streamlit', gridOptions=go, columns_auto_size_mode=ColumnsAutoSizeMode.FIT_ALL_COLUMNS_TO_VIEW)
+if __name__ == "__main__":
+    # Script entry point of the Semantic Search page.
+    # Setting up Logger and proj_dir (module-level globals read by initialize_state)
+    logger = getLogger(__name__)
+    # Directory two levels above the folder that contains this file.
+    proj_dir = Path(__file__).parents[2]
+    # For max width tables
+    pd.set_option('display.max_colwidth', 0)
+    # Streamlit settings
+    # NOTE(review): Streamlit documents set_page_config as having to run before
+    # other st.* commands - keep it ahead of the calls below.
+    st.set_page_config(layout="wide")
+    # The same title is rendered in the page body and in the sidebar.
+    md_title = "# Semantic Search 🔍"
+    st.markdown(md_title)
+    st.sidebar.markdown(md_title)
+    # Session state has to be populated before main() reads it.
+    initialize_state()
+    main()

bootstrap.py ADDED Viewed

	@@ -0,0 +1,12 @@

+from pathlib import Path
+import streamlit.web.bootstrap
+from streamlit import config as _config
+proj_dir = Path(__file__).parent
+filename = proj_dir / "app" / "Top2Vec.py"
+_config.set_option("server.headless", True)
+args = []
+# streamlit.cli.main_run(filename, args)
+streamlit.web.bootstrap.run(str(filename), "", args, "")

notebooks/explore.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+top2vec[sentence_transformers]==1.0.27
+scikit-learn==1.1.1
+jupyter==1.0.0
+streamlit==1.16.0
+streamlit-aggrid==0.3.3
+streamlit-plotly-events==0.0.6
+plotly==5.9.0
+datasets==2.8.0
+keybert==0.7.0