data-measurements-tool

Build error

App Files Files Community

Sasha committed on Dec 6, 2021

Commit

abff13d

1 Parent(s): 64ba64c

Changing text lengths plot to a static one, saving to .png

Browse files

Files changed (2) hide show

data_measurements/dataset_statistics.py +8 -7
data_measurements/streamlit_utils.py +2 -2

data_measurements/dataset_statistics.py CHANGED Viewed

@@ -28,6 +28,8 @@ import plotly.express as px
 import plotly.figure_factory as ff
 import plotly.graph_objects as go
 import pyarrow.feather as feather
 from datasets import load_from_disk
 from nltk.corpus import stopwords
 from sklearn.feature_extraction.text import CountVectorizer
@@ -281,7 +283,7 @@ class DatasetStatisticsCacheClass:
         # Needed for UI
         self.dup_counts_df_fid = pjoin(self.cache_path, "dup_counts_df.feather")
         # Needed for UI
-        self.fig_tok_length_fid = pjoin(self.cache_path, "fig_tok_length.json")
         ## General text stats
         # Needed for UI
@@ -363,13 +365,12 @@ class DatasetStatisticsCacheClass:
         """
         # Text length figure
         if (self.use_cache and exists(self.fig_tok_length_fid)):
-            self.fig_tok_length = read_plotly(self.fig_tok_length_fid)
         else:
             if not self.live:
                 self.prepare_fig_text_lengths()
                 if save:
-                    write_plotly(self.fig_tok_length, self.fig_tok_length_fid)
         # Text length dataframe
         if self.use_cache and exists(self.length_df_fid):
             self.length_df = feather.read_feather(self.length_df_fid)
@@ -1037,9 +1038,9 @@ def read_plotly(fid):
     return fig
 def make_fig_lengths(tokenized_df, length_field):
-    fig_tok_length = px.histogram(
-        tokenized_df, x=length_field, marginal="rug", hover_data=[length_field]
-    )
     return fig_tok_length
 def make_fig_labels(label_df, label_names, label_field):

 import plotly.figure_factory as ff
 import plotly.graph_objects as go
 import pyarrow.feather as feather
+import matplotlib.pyplot as plt
+import seaborn as sns
 from datasets import load_from_disk
 from nltk.corpus import stopwords
 from sklearn.feature_extraction.text import CountVectorizer
         # Needed for UI
         self.dup_counts_df_fid = pjoin(self.cache_path, "dup_counts_df.feather")
         # Needed for UI
+        self.fig_tok_length_fid = pjoin(self.cache_path, "fig_tok_length.png")
         ## General text stats
         # Needed for UI
         """
         # Text length figure
         if (self.use_cache and exists(self.fig_tok_length_fid)):
+            self.fig_tok_length = plt.imread(self.fig_tok_length_fid)
         else:
             if not self.live:
                 self.prepare_fig_text_lengths()
                 if save:
+                    self.fig_tok_length.savefig(self.fig_tok_length_fid)
         # Text length dataframe
         if self.use_cache and exists(self.length_df_fid):
             self.length_df = feather.read_feather(self.length_df_fid)
     return fig
 def make_fig_lengths(tokenized_df, length_field):
+    fig_tok_length, axs = plt.subplots(figsize=(15, 6), dpi=150)
+    sns.histplot(data=tokenized_df[length_field], kde=True, bins=100, ax=axs)
+    sns.rugplot(data=tokenized_df[length_field], ax=axs)
     return fig_tok_length
 def make_fig_labels(label_df, label_names, label_field):

data_measurements/streamlit_utils.py CHANGED Viewed

@@ -167,7 +167,7 @@ def expander_text_lengths(dstats, column_id):
         st.markdown(
             "### Here is the relative frequency of different text lengths in your dataset:"
         )
-        st.plotly_chart(dstats.fig_tok_length, use_container_width=True)
         st.markdown(
             "The average length of text instances is **"
             + str(dstats.avg_length)
@@ -187,7 +187,7 @@ def expander_text_lengths(dstats, column_id):
         # This is quite a large file and is breaking our ability to navigate the app development.
         # Just passing if it's not already there for launch v0
         if dstats.length_df is not None:
-            st.dataframe(dstats.length_df[dstats.length_df["length"] == start_id_show_lengths].set_index("length"))
 ### Third, use a sentence embedding model

         st.markdown(
             "### Here is the relative frequency of different text lengths in your dataset:"
         )
+        st.pyplot(dstats.fig_tok_length, use_container_width=True)
         st.markdown(
             "The average length of text instances is **"
             + str(dstats.avg_length)
         # This is quite a large file and is breaking our ability to navigate the app development.
         # Just passing if it's not already there for launch v0
         if dstats.length_df is not None:
+            st.table(dstats.length_df[dstats.length_df["length"] == start_id_show_lengths].set_index("length"))
 ### Third, use a sentence embedding model