Spaces:
Build error
Build error
Sasha
committed on
Commit
·
abff13d
1
Parent(s):
64ba64c
Changing text lengths plot to a static one, saving to .png
Browse files
data_measurements/dataset_statistics.py
CHANGED
|
@@ -28,6 +28,8 @@ import plotly.express as px
|
|
| 28 |
import plotly.figure_factory as ff
|
| 29 |
import plotly.graph_objects as go
|
| 30 |
import pyarrow.feather as feather
|
|
|
|
|
|
|
| 31 |
from datasets import load_from_disk
|
| 32 |
from nltk.corpus import stopwords
|
| 33 |
from sklearn.feature_extraction.text import CountVectorizer
|
|
@@ -281,7 +283,7 @@ class DatasetStatisticsCacheClass:
|
|
| 281 |
# Needed for UI
|
| 282 |
self.dup_counts_df_fid = pjoin(self.cache_path, "dup_counts_df.feather")
|
| 283 |
# Needed for UI
|
| 284 |
-
self.fig_tok_length_fid = pjoin(self.cache_path, "fig_tok_length.
|
| 285 |
|
| 286 |
## General text stats
|
| 287 |
# Needed for UI
|
|
@@ -363,13 +365,12 @@ class DatasetStatisticsCacheClass:
|
|
| 363 |
"""
|
| 364 |
# Text length figure
|
| 365 |
if (self.use_cache and exists(self.fig_tok_length_fid)):
|
| 366 |
-
self.fig_tok_length =
|
| 367 |
else:
|
| 368 |
if not self.live:
|
| 369 |
self.prepare_fig_text_lengths()
|
| 370 |
if save:
|
| 371 |
-
|
| 372 |
-
|
| 373 |
# Text length dataframe
|
| 374 |
if self.use_cache and exists(self.length_df_fid):
|
| 375 |
self.length_df = feather.read_feather(self.length_df_fid)
|
|
@@ -1037,9 +1038,9 @@ def read_plotly(fid):
|
|
| 1037 |
return fig
|
| 1038 |
|
| 1039 |
def make_fig_lengths(tokenized_df, length_field):
|
| 1040 |
-
fig_tok_length =
|
| 1041 |
-
|
| 1042 |
-
)
|
| 1043 |
return fig_tok_length
|
| 1044 |
|
| 1045 |
def make_fig_labels(label_df, label_names, label_field):
|
|
|
|
| 28 |
import plotly.figure_factory as ff
|
| 29 |
import plotly.graph_objects as go
|
| 30 |
import pyarrow.feather as feather
|
| 31 |
+
import matplotlib.pyplot as plt
|
| 32 |
+
import seaborn as sns
|
| 33 |
from datasets import load_from_disk
|
| 34 |
from nltk.corpus import stopwords
|
| 35 |
from sklearn.feature_extraction.text import CountVectorizer
|
|
|
|
| 283 |
# Needed for UI
|
| 284 |
self.dup_counts_df_fid = pjoin(self.cache_path, "dup_counts_df.feather")
|
| 285 |
# Needed for UI
|
| 286 |
+
self.fig_tok_length_fid = pjoin(self.cache_path, "fig_tok_length.png")
|
| 287 |
|
| 288 |
## General text stats
|
| 289 |
# Needed for UI
|
|
|
|
| 365 |
"""
|
| 366 |
# Text length figure
|
| 367 |
if (self.use_cache and exists(self.fig_tok_length_fid)):
|
| 368 |
+
self.fig_tok_length = plt.imread(self.fig_tok_length_fid)
|
| 369 |
else:
|
| 370 |
if not self.live:
|
| 371 |
self.prepare_fig_text_lengths()
|
| 372 |
if save:
|
| 373 |
+
self.fig_tok_length.savefig(self.fig_tok_length_fid)
|
|
|
|
| 374 |
# Text length dataframe
|
| 375 |
if self.use_cache and exists(self.length_df_fid):
|
| 376 |
self.length_df = feather.read_feather(self.length_df_fid)
|
|
|
|
| 1038 |
return fig
|
| 1039 |
|
| 1040 |
def make_fig_lengths(tokenized_df, length_field):
|
| 1041 |
+
fig_tok_length, axs = plt.subplots(figsize=(15, 6), dpi=150)
|
| 1042 |
+
sns.histplot(data=tokenized_df[length_field], kde=True, bins=100, ax=axs)
|
| 1043 |
+
sns.rugplot(data=tokenized_df[length_field], ax=axs)
|
| 1044 |
return fig_tok_length
|
| 1045 |
|
| 1046 |
def make_fig_labels(label_df, label_names, label_field):
|
data_measurements/streamlit_utils.py
CHANGED
|
@@ -167,7 +167,7 @@ def expander_text_lengths(dstats, column_id):
|
|
| 167 |
st.markdown(
|
| 168 |
"### Here is the relative frequency of different text lengths in your dataset:"
|
| 169 |
)
|
| 170 |
-
st.
|
| 171 |
st.markdown(
|
| 172 |
"The average length of text instances is **"
|
| 173 |
+ str(dstats.avg_length)
|
|
@@ -187,7 +187,7 @@ def expander_text_lengths(dstats, column_id):
|
|
| 187 |
# This is quite a large file and is breaking our ability to navigate the app development.
|
| 188 |
# Just passing if it's not already there for launch v0
|
| 189 |
if dstats.length_df is not None:
|
| 190 |
-
st.
|
| 191 |
|
| 192 |
|
| 193 |
### Third, use a sentence embedding model
|
|
|
|
| 167 |
st.markdown(
|
| 168 |
"### Here is the relative frequency of different text lengths in your dataset:"
|
| 169 |
)
|
| 170 |
+
st.pyplot(dstats.fig_tok_length, use_container_width=True)
|
| 171 |
st.markdown(
|
| 172 |
"The average length of text instances is **"
|
| 173 |
+ str(dstats.avg_length)
|
|
|
|
| 187 |
# This is quite a large file and is breaking our ability to navigate the app development.
|
| 188 |
# Just passing if it's not already there for launch v0
|
| 189 |
if dstats.length_df is not None:
|
| 190 |
+
st.table(dstats.length_df[dstats.length_df["length"] == start_id_show_lengths].set_index("length"))
|
| 191 |
|
| 192 |
|
| 193 |
### Third, use a sentence embedding model
|