Spaces:

derek-thomas
/

top2vec

Paused

App Files Files Community

derek-thomas committed on Feb 8, 2023

Commit

ea72d75

1 Parent(s): b8cbe49

Reformatting

Browse files

Files changed (4) hide show

app/Top2Vec.py +21 -21
app/pages/01_Topic_Explorer_📚.py +16 -16
app/pages/02_Document_Explorer_📖.py +22 -20
app/pages/03_Semantic_Search_🔍.py +33 -30

app/Top2Vec.py CHANGED Viewed

@@ -1,26 +1,26 @@
 import streamlit as st
-st.set_page_config(page_title="Top2Vec", layout="wide")
 st.markdown(
-    """
-    # Introduction
-    This is [space](https://huggingface.co/spaces) dedicated to using [top2vec](https://github.com/ddangelov/Top2Vec) and showing what features are available for semantic searching and topic modeling.
-    Please check out this [readme](https://github.com/ddangelov/Top2Vec#how-does-it-work) to better understand how it works.
-    > Top2Vec is an algorithm for **topic modeling** and **semantic search**. It automatically detects topics present in text and generates jointly embedded topic, document and word vectors.
-    # Setup
-    I used the [20 NewsGroups](https://huggingface.co/datasets/SetFit/20_newsgroups) dataset with `top2vec`.
-    I fit on the dataset and reduced the topics to 20.
-    The topics are created from top2vec, not the labels.
-    No analysis on the top 20 topics vs labels is provided.
-    # Usage
-    Check out
-    - The [Topic Explorer](/Topic_Explorer) page to understand what topic were detected
-    - The [Document Explorer](/Document_Explorer) page to visually explore documents
-    - The [Semantic Search](/Semantic_Search) page to search by meaning
-    """
-)

 import streamlit as st
+st.set_page_config(page_title="Top2Vec", layout="wide")
 st.markdown(
+        """
+        # Introduction
+        This is [space](https://huggingface.co/spaces) dedicated to using [top2vec](https://github.com/ddangelov/Top2Vec) and showing what features are available for semantic searching and topic modeling.
+        Please check out this [readme](https://github.com/ddangelov/Top2Vec#how-does-it-work) to better understand how it works.
+        > Top2Vec is an algorithm for **topic modeling** and **semantic search**. It automatically detects topics present in text and generates jointly embedded topic, document and word vectors.
+        # Setup
+        I used the [20 NewsGroups](https://huggingface.co/datasets/SetFit/20_newsgroups) dataset with `top2vec`.
+        I fit on the dataset and reduced the topics to 20.
+        The topics are created from top2vec, not the labels.
+        No analysis on the top 20 topics vs labels is provided.
+        # Usage
+        Check out
+        - The [Topic Explorer](/Topic_Explorer) page to understand what topic were detected
+        - The [Document Explorer](/Document_Explorer) page to visually explore documents
+        - The [Semantic Search](/Semantic_Search) page to search by meaning
+        """
+        )

app/pages/01_Topic_Explorer_📚.py CHANGED Viewed

@@ -1,12 +1,10 @@
 from logging import getLogger
 from pathlib import Path
-import joblib
 import pandas as pd
-import plotly.express as px
 import plotly.graph_objects as go
 import streamlit as st
 from top2vec import Top2Vec
@@ -22,18 +20,18 @@ def initialize_state():
             st.session_state.umap_model = joblib.load(proj_dir / 'models' / 'umap.sav')
             logger.info("loading data...")
-            data = pd.read_csv(proj_dir/'data'/'data.csv')
             data['topic_id'] = data['topic_id'].apply(lambda x: f'{x:02d}')
             st.session_state.data = data
-            topics = pd.read_csv(proj_dir/'data'/'topics.csv')
             topics['topic_id'] = topics['topic_id'].apply(lambda x: f'{x:02d}')
             st.session_state.topics = topics
         if 'data' not in st.session_state:
             logger.info("loading data...")
-            data = pd.read_csv(proj_dir/'data'/'data.csv')
             data['topic_id'] = data['topic_id'].apply(lambda x: f'{x:02d}')
             st.session_state.data = data
             st.session_state.selected_data = data
@@ -41,10 +39,11 @@ def initialize_state():
         if 'topics' not in st.session_state:
             logger.info("loading topics...")
-            topics = pd.read_csv(proj_dir/'data'/'topics.csv')
             topics['topic_id'] = topics['topic_id'].apply(lambda x: f'{x:02d}')
             st.session_state.topics = topics
 def main():
     st.write("""
     A way to dive into each topic. Use the slider on the left to choose the topic.
@@ -53,17 +52,18 @@ def main():
     topic_num = st.sidebar.slider("Topic Number", 0, 19, value=0)
     fig = go.Figure(go.Bar(
-                x=st.session_state.model.topic_word_scores_reduced[topic_num][::-1],
-                y=st.session_state.model.topic_words_reduced[topic_num][::-1],
-                orientation='h'))
     fig.update_layout(
-        title=f'Words for Topic {topic_num}',
-        yaxis_title='Top 20 topic words',
-        xaxis_title='Distance to topic centroid'
-        )
     st.plotly_chart(fig, True)
 if __name__ == "__main__":
     # Setting up Logger and proj_dir
     logger = getLogger(__name__)
@@ -73,10 +73,10 @@ if __name__ == "__main__":
     pd.set_option('display.max_colwidth', 0)
     # Streamlit settings
-    st.set_page_config(layout="wide")
     md_title = "# Topic Explorer 📚"
     st.markdown(md_title)
     st.sidebar.markdown(md_title)
     initialize_state()
-    main()

 from logging import getLogger
 from pathlib import Path
+import joblib
 import pandas as pd
 import plotly.graph_objects as go
 import streamlit as st
 from top2vec import Top2Vec
             st.session_state.umap_model = joblib.load(proj_dir / 'models' / 'umap.sav')
             logger.info("loading data...")
+            data = pd.read_csv(proj_dir / 'data' / 'data.csv')
             data['topic_id'] = data['topic_id'].apply(lambda x: f'{x:02d}')
             st.session_state.data = data
+            topics = pd.read_csv(proj_dir / 'data' / 'topics.csv')
             topics['topic_id'] = topics['topic_id'].apply(lambda x: f'{x:02d}')
             st.session_state.topics = topics
         if 'data' not in st.session_state:
             logger.info("loading data...")
+            data = pd.read_csv(proj_dir / 'data' / 'data.csv')
             data['topic_id'] = data['topic_id'].apply(lambda x: f'{x:02d}')
             st.session_state.data = data
             st.session_state.selected_data = data
         if 'topics' not in st.session_state:
             logger.info("loading topics...")
+            topics = pd.read_csv(proj_dir / 'data' / 'topics.csv')
             topics['topic_id'] = topics['topic_id'].apply(lambda x: f'{x:02d}')
             st.session_state.topics = topics
 def main():
     st.write("""
     A way to dive into each topic. Use the slider on the left to choose the topic.
     topic_num = st.sidebar.slider("Topic Number", 0, 19, value=0)
     fig = go.Figure(go.Bar(
+            x=st.session_state.model.topic_word_scores_reduced[topic_num][::-1],
+            y=st.session_state.model.topic_words_reduced[topic_num][::-1],
+            orientation='h'))
     fig.update_layout(
+            title=f'Words for Topic {topic_num}',
+            yaxis_title='Top 20 topic words',
+            xaxis_title='Distance to topic centroid'
+            )
     st.plotly_chart(fig, True)
 if __name__ == "__main__":
     # Setting up Logger and proj_dir
     logger = getLogger(__name__)
     pd.set_option('display.max_colwidth', 0)
     # Streamlit settings
+    st.set_page_config(layout="wide")
     md_title = "# Topic Explorer 📚"
     st.markdown(md_title)
     st.sidebar.markdown(md_title)
     initialize_state()
+    main()

app/pages/02_Document_Explorer_📖.py CHANGED Viewed

@@ -1,15 +1,12 @@
-from distutils.fancy_getopt import wrap_text
 from logging import getLogger
 from pathlib import Path
-import joblib
 import pandas as pd
 import plotly.express as px
-import plotly.graph_objects as go
 import streamlit as st
 from st_aggrid import AgGrid, ColumnsAutoSizeMode, GridOptionsBuilder
 from streamlit_plotly_events import plotly_events
 from top2vec import Top2Vec
@@ -25,18 +22,18 @@ def initialize_state():
             st.session_state.umap_model = joblib.load(proj_dir / 'models' / 'umap.sav')
             logger.info("loading data...")
-            data = pd.read_csv(proj_dir/'data'/'data.csv')
             data['topic_id'] = data['topic_id'].apply(lambda x: f'{x:02d}')
             st.session_state.data = data
-            topics = pd.read_csv(proj_dir/'data'/'topics.csv')
             topics['topic_id'] = topics['topic_id'].apply(lambda x: f'{x:02d}')
             st.session_state.topics = topics
         if 'data' not in st.session_state:
             logger.info("loading data...")
-            data = pd.read_csv(proj_dir/'data'/'data.csv')
             data['topic_id'] = data['topic_id'].apply(lambda x: f'{x:02d}')
             st.session_state.data = data
             st.session_state.selected_data = data
@@ -44,10 +41,11 @@ def initialize_state():
         if 'topics' not in st.session_state:
             logger.info("loading topics...")
-            topics = pd.read_csv(proj_dir/'data'/'topics.csv')
             topics['topic_id'] = topics['topic_id'].apply(lambda x: f'{x:02d}')
             st.session_state.topics = topics
 def reset():
     logger.info("Resetting...")
     st.session_state.selected_data = st.session_state.data
@@ -61,12 +59,12 @@ def filter_df():
         logger.info(f"Updates selected_data: {len(st.session_state.selected_data)}")
     else:
         logger.info(f"Lame")
 def reset():
     st.session_state.selected_data = st.session_state.data
     st.session_state.selected_points = []
 def main():
     st.write("""
@@ -76,11 +74,13 @@ def main():
     Zoom in and explore a topic of your choice. You can see the documents you select with the `lasso` or `box`
     tool below in the corresponding tabs."""
-            )
     st.button("Reset", help="Will Reset the selected points and the selected topics", on_click=reset)
-    data_to_model = st.session_state.data.sort_values(by='topic_id', ascending=True) # to make legend sorted https://bioinformatics.stackexchange.com/a/18847
-    fig = px.scatter(data_to_model, x='x', y='y', color='topic_id', template='plotly_dark', hover_data=['id', 'topic_id', 'x', 'y'])
     st.session_state.selected_points = plotly_events(fig, select_event=True, click_event=False)
     filter_df()
@@ -94,18 +94,18 @@ def main():
             builder = GridOptionsBuilder.from_dataframe(data)
             builder.configure_pagination()
             go = builder.build()
-            AgGrid(st.session_state.selected_data[cols], theme='streamlit', gridOptions=go, columns_auto_size_mode=ColumnsAutoSizeMode.FIT_CONTENTS)
         else:
             st.markdown('Select points in the graph with the `lasso` or `box` select tools to populate this table.')
     def get_topics_counts() -> pd.DataFrame:
         topic_counts = st.session_state.selected_data["topic_id"].value_counts().to_frame()
         merged = topic_counts.merge(st.session_state.topics, left_index=True, right_on='topic_id')
-        cleaned = merged.drop(['topic_id_y'], axis=1).rename({'topic_id_x':'topic_count'}, axis=1)
         cols = ['topic_id'] + [col for col in cleaned.columns if col != 'topic_id']
         return cleaned[cols]
     with tab2:
         if st.session_state.selected_points:
             filter_df()
@@ -116,10 +116,12 @@ def main():
             builder.configure_pagination()
             builder.configure_column('topic_0', header_name='Topic Word', wrap_text=True)
             go = builder.build()
-            AgGrid(topic_counts.loc[:,cols], theme='streamlit', gridOptions=go, columns_auto_size_mode=ColumnsAutoSizeMode.FIT_ALL_COLUMNS_TO_VIEW)
         else:
             st.markdown('Select points in the graph with the `lasso` or `box` select tools to populate this table.')
 if __name__ == "__main__":
     # Setting up Logger and proj_dir
     logger = getLogger(__name__)
@@ -129,10 +131,10 @@ if __name__ == "__main__":
     pd.set_option('display.max_colwidth', 0)
     # Streamlit settings
-    st.set_page_config(layout="wide")
     md_title = "# Document Explorer 📖"
     st.markdown(md_title)
     st.sidebar.markdown(md_title)
     initialize_state()
-    main()

 from logging import getLogger
 from pathlib import Path
+import joblib
 import pandas as pd
 import plotly.express as px
 import streamlit as st
 from st_aggrid import AgGrid, ColumnsAutoSizeMode, GridOptionsBuilder
 from streamlit_plotly_events import plotly_events
 from top2vec import Top2Vec
             st.session_state.umap_model = joblib.load(proj_dir / 'models' / 'umap.sav')
             logger.info("loading data...")
+            data = pd.read_csv(proj_dir / 'data' / 'data.csv')
             data['topic_id'] = data['topic_id'].apply(lambda x: f'{x:02d}')
             st.session_state.data = data
+            topics = pd.read_csv(proj_dir / 'data' / 'topics.csv')
             topics['topic_id'] = topics['topic_id'].apply(lambda x: f'{x:02d}')
             st.session_state.topics = topics
         if 'data' not in st.session_state:
             logger.info("loading data...")
+            data = pd.read_csv(proj_dir / 'data' / 'data.csv')
             data['topic_id'] = data['topic_id'].apply(lambda x: f'{x:02d}')
             st.session_state.data = data
             st.session_state.selected_data = data
         if 'topics' not in st.session_state:
             logger.info("loading topics...")
+            topics = pd.read_csv(proj_dir / 'data' / 'topics.csv')
             topics['topic_id'] = topics['topic_id'].apply(lambda x: f'{x:02d}')
             st.session_state.topics = topics
 def reset():
     logger.info("Resetting...")
     st.session_state.selected_data = st.session_state.data
         logger.info(f"Updates selected_data: {len(st.session_state.selected_data)}")
     else:
         logger.info(f"Lame")
 def reset():
     st.session_state.selected_data = st.session_state.data
     st.session_state.selected_points = []
 def main():
     st.write("""
     Zoom in and explore a topic of your choice. You can see the documents you select with the `lasso` or `box`
     tool below in the corresponding tabs."""
+             )
     st.button("Reset", help="Will Reset the selected points and the selected topics", on_click=reset)
+    data_to_model = st.session_state.data.sort_values(by='topic_id',
+                                                      ascending=True)  # to make legend sorted https://bioinformatics.stackexchange.com/a/18847
+    fig = px.scatter(data_to_model, x='x', y='y', color='topic_id', template='plotly_dark',
+                     hover_data=['id', 'topic_id', 'x', 'y'])
     st.session_state.selected_points = plotly_events(fig, select_event=True, click_event=False)
     filter_df()
             builder = GridOptionsBuilder.from_dataframe(data)
             builder.configure_pagination()
             go = builder.build()
+            AgGrid(st.session_state.selected_data[cols], theme='streamlit', gridOptions=go,
+                   columns_auto_size_mode=ColumnsAutoSizeMode.FIT_CONTENTS)
         else:
             st.markdown('Select points in the graph with the `lasso` or `box` select tools to populate this table.')
     def get_topics_counts() -> pd.DataFrame:
         topic_counts = st.session_state.selected_data["topic_id"].value_counts().to_frame()
         merged = topic_counts.merge(st.session_state.topics, left_index=True, right_on='topic_id')
+        cleaned = merged.drop(['topic_id_y'], axis=1).rename({'topic_id_x': 'topic_count'}, axis=1)
         cols = ['topic_id'] + [col for col in cleaned.columns if col != 'topic_id']
         return cleaned[cols]
     with tab2:
         if st.session_state.selected_points:
             filter_df()
             builder.configure_pagination()
             builder.configure_column('topic_0', header_name='Topic Word', wrap_text=True)
             go = builder.build()
+            AgGrid(topic_counts.loc[:, cols], theme='streamlit', gridOptions=go,
+                   columns_auto_size_mode=ColumnsAutoSizeMode.FIT_ALL_COLUMNS_TO_VIEW)
         else:
             st.markdown('Select points in the graph with the `lasso` or `box` select tools to populate this table.')
 if __name__ == "__main__":
     # Setting up Logger and proj_dir
     logger = getLogger(__name__)
     pd.set_option('display.max_colwidth', 0)
     # Streamlit settings
+    st.set_page_config(layout="wide")
     md_title = "# Document Explorer 📖"
     st.markdown(md_title)
     st.sidebar.markdown(md_title)
     initialize_state()
+    main()

app/pages/03_Semantic_Search_🔍.py CHANGED Viewed

@@ -1,14 +1,12 @@
-from distutils.fancy_getopt import wrap_text
-from top2vec import Top2Vec
 import joblib
-import streamlit as st
 import pandas as pd
-from pathlib import Path
 import plotly.express as px
-import plotly.graph_objects as go
-from streamlit_plotly_events import plotly_events
-from st_aggrid import AgGrid, GridOptionsBuilder, ColumnsAutoSizeMode
-from logging import getLogger
 @st.cache(show_spinner=False)
@@ -22,19 +20,19 @@ def initialize_state():
             st.session_state.model = model
             st.session_state.umap_model = joblib.load(proj_dir / 'models' / 'umap.sav')
             logger.info("loading data...")
-            data = pd.read_csv(proj_dir/'data'/'data.csv')
             data['topic_id'] = data['topic_id'].apply(lambda x: f'{x:02d}')
             st.session_state.data = data
-            topics = pd.read_csv(proj_dir/'data'/'topics.csv')
             topics['topic_id'] = topics['topic_id'].apply(lambda x: f'{x:02d}')
             st.session_state.topics = topics
         if 'data' not in st.session_state:
             logger.info("loading data...")
-            data = pd.read_csv(proj_dir/'data'/'data.csv')
             data['topic_id'] = data['topic_id'].apply(lambda x: f'{x:02d}')
             st.session_state.data = data
             st.session_state.selected_data = data
@@ -42,14 +40,14 @@ def initialize_state():
         if 'topics' not in st.session_state:
             logger.info("loading topics...")
-            topics = pd.read_csv(proj_dir/'data'/'topics.csv')
             topics['topic_id'] = topics['topic_id'].apply(lambda x: f'{x:02d}')
             st.session_state.topics = topics
             st.session_state.selected_points = []
-def main():
     max_docs = st.sidebar.slider("# docs", 10, 100, value=50)
     to_search = st.text_input("Write your query here", "") or ""
     with st.spinner('Embedding Query...'):
@@ -57,19 +55,23 @@ def main():
     with st.spinner('Dimension Reduction...'):
         point = st.session_state.umap_model.transform(vector.reshape(1, -1))
-    documents, document_scores, document_ids = st.session_state.model.search_documents_by_vector(vector.flatten(), num_docs=max_docs)
-    st.session_state.search_raw_df = pd.DataFrame({'document_ids':document_ids, 'document_scores':document_scores})
-    st.session_state.data_to_model = st.session_state.data.merge(st.session_state.search_raw_df, left_on='id', right_on='document_ids').drop(['document_ids'], axis=1)
-    st.session_state.data_to_model = st.session_state.data_to_model.sort_values(by='document_scores', ascending=False) # to make legend sorted https://bioinformatics.stackexchange.com/a/18847
-    st.session_state.data_to_model.loc[len(st.session_state.data_to_model.index)] = ['Point', *point[0].tolist(), to_search, 'Query', 0]
     st.session_state.data_to_model_with_point = st.session_state.data_to_model
     st.session_state.data_to_model_without_point = st.session_state.data_to_model.iloc[:-1]
     def get_topics_counts() -> pd.DataFrame:
         topic_counts = st.session_state.data_to_model_without_point["topic_id"].value_counts().to_frame()
         merged = topic_counts.merge(st.session_state.topics, left_index=True, right_on='topic_id')
-        cleaned = merged.drop(['topic_id_y'], axis=1).rename({'topic_id_x':'topic_count'}, axis=1)
         cols = ['topic_id'] + [col for col in cleaned.columns if col != 'topic_id']
         return cleaned[cols]
@@ -83,25 +85,25 @@ def main():
     The Query is shown with the documents in yellow.
             """
-            )
     df = st.session_state.data_to_model_with_point.sort_values(by='topic_id', ascending=True)
-    fig = px.scatter(df.iloc[:-1], x='x', y='y', color='topic_id', template='plotly_dark', hover_data=['id', 'topic_id', 'x', 'y'])
     fig.add_traces(px.scatter(df.tail(1), x="x", y="y").update_traces(marker_size=10, marker_color="yellow").data)
     st.plotly_chart(fig, use_container_width=True)
     tab1, tab2 = st.tabs(["Docs", "Topics"])
     with tab1:
         cols = ['id', 'document_scores', 'topic_id', 'documents']
         builder = GridOptionsBuilder.from_dataframe(st.session_state.data_to_model_without_point.loc[:, cols])
         builder.configure_pagination()
-        builder.configure_column('document_scores', type=["numericColumn","numberColumnFilter","customNumericFormat"], precision=2)
         go = builder.build()
-        AgGrid(st.session_state.data_to_model_without_point.loc[:,cols], theme='streamlit', gridOptions=go, columns_auto_size_mode=ColumnsAutoSizeMode.FIT_CONTENTS)
     with tab2:
         cols = ['topic_id', 'topic_count', 'topic_0']
         topic_counts = get_topics_counts()
@@ -109,7 +111,8 @@ def main():
         builder.configure_pagination()
         builder.configure_column('topic_0', header_name='Topic Word', wrap_text=True)
         go = builder.build()
-        AgGrid(topic_counts.loc[:,cols], theme='streamlit', gridOptions=go, columns_auto_size_mode=ColumnsAutoSizeMode.FIT_ALL_COLUMNS_TO_VIEW)
 if __name__ == "__main__":
@@ -121,10 +124,10 @@ if __name__ == "__main__":
     pd.set_option('display.max_colwidth', 0)
     # Streamlit settings
-    st.set_page_config(layout="wide")
     md_title = "# Semantic Search 🔍"
     st.markdown(md_title)
     st.sidebar.markdown(md_title)
     initialize_state()
-    main()

+from logging import getLogger
+from pathlib import Path
 import joblib
 import pandas as pd
 import plotly.express as px
+import streamlit as st
+from st_aggrid import AgGrid, ColumnsAutoSizeMode, GridOptionsBuilder
+from top2vec import Top2Vec
 @st.cache(show_spinner=False)
             st.session_state.model = model
             st.session_state.umap_model = joblib.load(proj_dir / 'models' / 'umap.sav')
             logger.info("loading data...")
+            data = pd.read_csv(proj_dir / 'data' / 'data.csv')
             data['topic_id'] = data['topic_id'].apply(lambda x: f'{x:02d}')
             st.session_state.data = data
+            topics = pd.read_csv(proj_dir / 'data' / 'topics.csv')
             topics['topic_id'] = topics['topic_id'].apply(lambda x: f'{x:02d}')
             st.session_state.topics = topics
         if 'data' not in st.session_state:
             logger.info("loading data...")
+            data = pd.read_csv(proj_dir / 'data' / 'data.csv')
             data['topic_id'] = data['topic_id'].apply(lambda x: f'{x:02d}')
             st.session_state.data = data
             st.session_state.selected_data = data
         if 'topics' not in st.session_state:
             logger.info("loading topics...")
+            topics = pd.read_csv(proj_dir / 'data' / 'topics.csv')
             topics['topic_id'] = topics['topic_id'].apply(lambda x: f'{x:02d}')
             st.session_state.topics = topics
             st.session_state.selected_points = []
+def main():
     max_docs = st.sidebar.slider("# docs", 10, 100, value=50)
     to_search = st.text_input("Write your query here", "") or ""
     with st.spinner('Embedding Query...'):
     with st.spinner('Dimension Reduction...'):
         point = st.session_state.umap_model.transform(vector.reshape(1, -1))
+    documents, document_scores, document_ids = st.session_state.model.search_documents_by_vector(vector.flatten(),
+                                                                                                 num_docs=max_docs)
+    st.session_state.search_raw_df = pd.DataFrame({'document_ids': document_ids, 'document_scores': document_scores})
+    st.session_state.data_to_model = st.session_state.data.merge(st.session_state.search_raw_df, left_on='id',
+                                                                 right_on='document_ids').drop(['document_ids'], axis=1)
+    st.session_state.data_to_model = st.session_state.data_to_model.sort_values(by='document_scores',
+                                                                                ascending=False)  # to make legend sorted https://bioinformatics.stackexchange.com/a/18847
+    st.session_state.data_to_model.loc[len(st.session_state.data_to_model.index)] = ['Point', *point[0].tolist(),
+                                                                                     to_search, 'Query', 0]
     st.session_state.data_to_model_with_point = st.session_state.data_to_model
     st.session_state.data_to_model_without_point = st.session_state.data_to_model.iloc[:-1]
     def get_topics_counts() -> pd.DataFrame:
         topic_counts = st.session_state.data_to_model_without_point["topic_id"].value_counts().to_frame()
         merged = topic_counts.merge(st.session_state.topics, left_index=True, right_on='topic_id')
+        cleaned = merged.drop(['topic_id_y'], axis=1).rename({'topic_id_x': 'topic_count'}, axis=1)
         cols = ['topic_id'] + [col for col in cleaned.columns if col != 'topic_id']
         return cleaned[cols]
     The Query is shown with the documents in yellow.
             """
+             )
     df = st.session_state.data_to_model_with_point.sort_values(by='topic_id', ascending=True)
+    fig = px.scatter(df.iloc[:-1], x='x', y='y', color='topic_id', template='plotly_dark',
+                     hover_data=['id', 'topic_id', 'x', 'y'])
     fig.add_traces(px.scatter(df.tail(1), x="x", y="y").update_traces(marker_size=10, marker_color="yellow").data)
     st.plotly_chart(fig, use_container_width=True)
     tab1, tab2 = st.tabs(["Docs", "Topics"])
     with tab1:
         cols = ['id', 'document_scores', 'topic_id', 'documents']
         builder = GridOptionsBuilder.from_dataframe(st.session_state.data_to_model_without_point.loc[:, cols])
         builder.configure_pagination()
+        builder.configure_column('document_scores', type=["numericColumn", "numberColumnFilter", "customNumericFormat"],
+                                 precision=2)
         go = builder.build()
+        AgGrid(st.session_state.data_to_model_without_point.loc[:, cols], theme='streamlit', gridOptions=go,
+               columns_auto_size_mode=ColumnsAutoSizeMode.FIT_CONTENTS)
     with tab2:
         cols = ['topic_id', 'topic_count', 'topic_0']
         topic_counts = get_topics_counts()
         builder.configure_pagination()
         builder.configure_column('topic_0', header_name='Topic Word', wrap_text=True)
         go = builder.build()
+        AgGrid(topic_counts.loc[:, cols], theme='streamlit', gridOptions=go,
+               columns_auto_size_mode=ColumnsAutoSizeMode.FIT_ALL_COLUMNS_TO_VIEW)
 if __name__ == "__main__":
     pd.set_option('display.max_colwidth', 0)
     # Streamlit settings
+    st.set_page_config(layout="wide")
     md_title = "# Semantic Search 🔍"
     st.markdown(md_title)
     st.sidebar.markdown(md_title)
     initialize_state()
+    main()