Spaces:
Running
Running
update to gradio
Browse files- .gitattributes +0 -34
- README.md +24 -9
- app.py +252 -194
- data_viewer.py +0 -26
- options.py +14 -2
- process_text.py +0 -2
- pyproject.toml +24 -0
- requirements.txt +5 -6
- uv.lock +0 -0
.gitattributes
DELETED
|
@@ -1,34 +0,0 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
CHANGED
|
@@ -1,14 +1,29 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk:
|
| 7 |
-
sdk_version:
|
| 8 |
app_file: app.py
|
|
|
|
| 9 |
pinned: false
|
| 10 |
-
license: apache-2.0
|
| 11 |
-
tags: [NLP, feature extraction]
|
| 12 |
---
|
| 13 |
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: TextDescriptives
|
| 3 |
+
emoji: 📊
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: "5.12.0"
|
| 8 |
app_file: app.py
|
| 9 |
+
python_version: "3.10"
|
| 10 |
pinned: false
|
|
|
|
|
|
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# TextDescriptives Demo
|
| 14 |
+
|
| 15 |
+
A Gradio dashboard for extracting text metrics with TextDescriptives. Live at https://huggingface.co/spaces/HLasse/textdescriptives
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
## TODO
|
| 19 |
+
|
| 20 |
+
- [ ] Add license
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
## Installation
|
| 24 |
+
|
| 25 |
+
```shell
|
| 26 |
+
uv venv && source .venv/bin/activate
|
| 27 |
+
uv pip install -e .
|
| 28 |
+
python app.py
|
| 29 |
+
```
|
app.py
CHANGED
|
@@ -1,191 +1,32 @@
|
|
| 1 |
"""
|
| 2 |
Dashboard for showcasing extraction of text metrics with textdescriptives.
|
| 3 |
-
|
| 4 |
"""
|
| 5 |
|
| 6 |
-
|
| 7 |
|
|
|
|
| 8 |
import pandas as pd
|
| 9 |
-
import streamlit as st
|
| 10 |
import textdescriptives as td
|
| 11 |
|
| 12 |
-
from data_viewer import DataViewer
|
| 13 |
-
from process_text import text_to_metrics
|
| 14 |
from options import (
|
| 15 |
all_model_size_options_pretty_to_short,
|
| 16 |
available_model_size_options,
|
| 17 |
language_options,
|
| 18 |
metrics_options,
|
| 19 |
)
|
|
|
|
| 20 |
|
| 21 |
-
|
| 22 |
-
# Introduction #
|
| 23 |
-
################
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
col1, col2 = st.columns([9, 2])
|
| 27 |
-
with col1:
|
| 28 |
-
st.title("Extract Text Statistics")
|
| 29 |
-
with col2:
|
| 30 |
-
st.image(
|
| 31 |
-
"https://github.com/HLasse/TextDescriptives/raw/main/docs/_static/icon.png",
|
| 32 |
-
width=125,
|
| 33 |
-
)
|
| 34 |
-
|
| 35 |
-
st.write(
|
| 36 |
-
"Calculate a large variety of statistics from text via the "
|
| 37 |
-
"[**TextDescriptives**](https://github.com/HLasse/TextDescriptives) python package "
|
| 38 |
-
f"(v/{td.__version__}) and download the results as a .csv file. "
|
| 39 |
-
"Includes descriptive statistics and metrics related to readability, "
|
| 40 |
-
"information theory, text coherence and text quality."
|
| 41 |
-
)
|
| 42 |
-
|
| 43 |
-
st.write(
|
| 44 |
-
"The source code for this application can be found on [**GitHub**](https://github.com/HLasse/TextDescriptives_app). "
|
| 45 |
-
"If you have feedback, please open an [issue](https://github.com/HLasse/textdescriptives_app/issues)."
|
| 46 |
-
)
|
| 47 |
-
|
| 48 |
-
st.caption(
|
| 49 |
-
"Hansen, L., Olsen, L. R., & Enevoldsen, K. (2023). TextDescriptives: A Python package for "
|
| 50 |
-
"calculating a large variety of metrics from text. [Journal of Open Source Software, 8(84), "
|
| 51 |
-
"5153, https://doi.org/10.21105/joss.05153](https://doi.org/10.21105/joss.05153)"
|
| 52 |
-
)
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
############
|
| 56 |
-
# Settings #
|
| 57 |
-
############
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
input_choice = st.radio(
|
| 61 |
-
label="Input", options=["Enter text", "Upload file(s)"], index=0, horizontal=True
|
| 62 |
-
)
|
| 63 |
-
|
| 64 |
-
with st.form(key="settings_form"):
|
| 65 |
-
split_by_line = st.checkbox(label="Split by newline", value=True)
|
| 66 |
-
|
| 67 |
-
file_name_to_text_string = {}
|
| 68 |
-
|
| 69 |
-
if input_choice == "Upload file(s)":
|
| 70 |
-
uploaded_files = st.file_uploader(
|
| 71 |
-
label="Choose a .txt file", type=["txt"], accept_multiple_files=True
|
| 72 |
-
)
|
| 73 |
-
|
| 74 |
-
if uploaded_files is not None and len(uploaded_files) > 0:
|
| 75 |
-
# To convert to a string based IO:
|
| 76 |
-
file_name_to_text_string = {
|
| 77 |
-
file.name: StringIO(file.getvalue().decode("utf-8")).read()
|
| 78 |
-
for file in uploaded_files
|
| 79 |
-
}
|
| 80 |
-
|
| 81 |
-
else:
|
| 82 |
-
default_text = """Hello, morning dew. The grass whispers low.
|
| 83 |
I'm here to dance. The gentle breeze does show.
|
| 84 |
Good morning, world. The birds sing in delight.
|
| 85 |
Let's spread our wings. The butterflies take flight.
|
| 86 |
Nature's chorus sings, a symphony of light."""
|
| 87 |
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
)
|
| 92 |
-
}
|
| 93 |
-
|
| 94 |
-
# Row of selectors
|
| 95 |
-
col1, col2 = st.columns([1, 1])
|
| 96 |
-
|
| 97 |
-
with col1:
|
| 98 |
-
# Selection of language
|
| 99 |
-
language_pretty = st.selectbox(
|
| 100 |
-
label="Language",
|
| 101 |
-
options=list(language_options().keys()),
|
| 102 |
-
index=5,
|
| 103 |
-
key="language_selector",
|
| 104 |
-
)
|
| 105 |
-
|
| 106 |
-
language_short = language_options()[language_pretty]
|
| 107 |
-
|
| 108 |
-
with col2:
|
| 109 |
-
# Selection of model size
|
| 110 |
-
model_size_pretty = st.selectbox(
|
| 111 |
-
label="Model Size",
|
| 112 |
-
options=available_model_size_options(lang="all"),
|
| 113 |
-
index=0,
|
| 114 |
-
key="size_selector",
|
| 115 |
-
)
|
| 116 |
-
|
| 117 |
-
model_size_short = all_model_size_options_pretty_to_short()[model_size_pretty]
|
| 118 |
-
|
| 119 |
-
# Multiselection of metrics
|
| 120 |
-
metrics = st.multiselect(
|
| 121 |
-
label="Metrics", options=metrics_options(), default=metrics_options()
|
| 122 |
-
)
|
| 123 |
-
|
| 124 |
-
st.write(
|
| 125 |
-
"See the [**documentation**](https://hlasse.github.io/TextDescriptives/) for "
|
| 126 |
-
"information on the available metrics."
|
| 127 |
-
)
|
| 128 |
-
|
| 129 |
-
# This shouldn't happen but better safe than sorry
|
| 130 |
-
if isinstance(metrics, list) and not metrics:
|
| 131 |
-
metrics = None
|
| 132 |
-
|
| 133 |
-
apply_settings_button = st.form_submit_button(label="Apply")
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
#############
|
| 137 |
-
# Apply NLP #
|
| 138 |
-
#############
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
if apply_settings_button and len(file_name_to_text_string) > 0:
|
| 142 |
-
if model_size_pretty not in available_model_size_options(lang=language_short):
|
| 143 |
-
st.write(
|
| 144 |
-
"**Sorry!** The chosen *model size* is not available in this language. Please try another."
|
| 145 |
-
)
|
| 146 |
-
else:
|
| 147 |
-
# Extract metrics for each text
|
| 148 |
-
output_df = pd.concat(
|
| 149 |
-
[
|
| 150 |
-
text_to_metrics(
|
| 151 |
-
string=string,
|
| 152 |
-
language_short=language_short,
|
| 153 |
-
model_size_short=model_size_short,
|
| 154 |
-
metrics=metrics,
|
| 155 |
-
split_by_line=split_by_line,
|
| 156 |
-
filename=filename if "Upload" in input_choice else None,
|
| 157 |
-
)
|
| 158 |
-
for filename, string in file_name_to_text_string.items()
|
| 159 |
-
],
|
| 160 |
-
ignore_index=True,
|
| 161 |
-
)
|
| 162 |
-
|
| 163 |
-
###################
|
| 164 |
-
# Present Results #
|
| 165 |
-
###################
|
| 166 |
-
|
| 167 |
-
# Create 2 columns with 1) the output header
|
| 168 |
-
# and 2) a download button
|
| 169 |
-
DataViewer()._header_and_download(
|
| 170 |
-
header="The calculated metrics",
|
| 171 |
-
data=output_df,
|
| 172 |
-
file_name="text_metrics.csv",
|
| 173 |
-
)
|
| 174 |
-
|
| 175 |
-
st.write("**Note**: This data frame has been transposed for readability.")
|
| 176 |
-
output_df = output_df.transpose().reset_index()
|
| 177 |
-
output_df.columns = ["Metric"] + [str(c) for c in list(output_df.columns)[1:]]
|
| 178 |
-
st.dataframe(data=output_df, use_container_width=True)
|
| 179 |
-
|
| 180 |
|
| 181 |
-
|
| 182 |
-
# Code For Reproducibility #
|
| 183 |
-
############################
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
with st.expander("See python code"):
|
| 187 |
-
st.code(
|
| 188 |
-
"""
|
| 189 |
# Note: This is the code for a single text file
|
| 190 |
# The actual code is slightly more complex
|
| 191 |
# to allow processing multiple files at once
|
|
@@ -219,39 +60,256 @@ extracted_metrics = td.extract_metrics(
|
|
| 219 |
spacy_model_size=model_size,
|
| 220 |
metrics=metrics
|
| 221 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
|
| 223 |
-
|
| 224 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
)
|
| 226 |
|
| 227 |
-
#
|
| 228 |
-
|
| 229 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
|
| 231 |
-
st.subheader("Frequently Asked Questions (FAQ)")
|
| 232 |
|
| 233 |
-
with
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
performed separately for each paragraph. I.e. whenever there's a line break,
|
| 238 |
-
we split the text.
|
| 239 |
|
| 240 |
-
|
| 241 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 242 |
)
|
| 243 |
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
""
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
Dashboard for showcasing extraction of text metrics with textdescriptives.
|
|
|
|
| 3 |
"""
|
| 4 |
|
| 5 |
+
import tempfile
|
| 6 |
|
| 7 |
+
import gradio as gr
|
| 8 |
import pandas as pd
|
|
|
|
| 9 |
import textdescriptives as td
|
| 10 |
|
|
|
|
|
|
|
| 11 |
from options import (
|
| 12 |
all_model_size_options_pretty_to_short,
|
| 13 |
available_model_size_options,
|
| 14 |
language_options,
|
| 15 |
metrics_options,
|
| 16 |
)
|
| 17 |
+
from process_text import text_to_metrics
|
| 18 |
|
| 19 |
+
DEFAULT_TEXT = """Hello, morning dew. The grass whispers low.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
I'm here to dance. The gentle breeze does show.
|
| 21 |
Good morning, world. The birds sing in delight.
|
| 22 |
Let's spread our wings. The butterflies take flight.
|
| 23 |
Nature's chorus sings, a symphony of light."""
|
| 24 |
|
| 25 |
+
LANG_OPTIONS = language_options()
|
| 26 |
+
LANG_NAMES = list(LANG_OPTIONS.keys())
|
| 27 |
+
DEFAULT_LANG_INDEX = LANG_NAMES.index("English")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
+
CODE_SNIPPET = """\
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
# Note: This is the code for a single text file
|
| 31 |
# The actual code is slightly more complex
|
| 32 |
# to allow processing multiple files at once
|
|
|
|
| 60 |
spacy_model_size=model_size,
|
| 61 |
metrics=metrics
|
| 62 |
)
|
| 63 |
+
"""
|
| 64 |
+
|
| 65 |
+
CSS = """
|
| 66 |
+
.citation {
|
| 67 |
+
font-size: 0.85em;
|
| 68 |
+
color: #666;
|
| 69 |
+
}
|
| 70 |
+
"""
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def toggle_input(choice):
    """Show exactly one of the two input widgets for the chosen input mode.

    Returns a pair of Gradio updates in the order (text box, file uploader):
    the uploader is visible only for "Upload file(s)", the text box otherwise.
    """
    wants_files = choice == "Upload file(s)"
    return gr.update(visible=not wants_files), gr.update(visible=wants_files)
|
| 77 |
|
| 78 |
+
|
| 79 |
+
def _status_only(message):
    """Build the handler outputs for a run that produced no results.

    Shows ``message`` in the status area, hides the results table and the
    download button, and resets the stored CSV path to ``None``.
    """
    return (
        gr.update(value=message, visible=True),
        gr.update(visible=False),
        gr.update(visible=False),
        None,
    )


def process_and_display(
    input_choice,
    text_input,
    files,
    split_by_line,
    language_pretty,
    model_size_pretty,
    metrics,
):
    """Compute the selected metrics and build the updates for the result widgets.

    Validates the request (at least one metric, a model size that is available
    for the chosen language, non-empty input), runs ``text_to_metrics`` once per
    input text, and writes the combined table to a temporary CSV file.

    Args:
        input_choice: Selected input mode; ``"Upload file(s)"`` reads ``files``,
            any other value uses ``text_input``.
        text_input: Text typed into the text box.
        files: Paths of the uploaded files, or a falsy value if none were given.
        split_by_line: Forwarded unchanged to ``text_to_metrics``.
        language_pretty: Display name of the language; a key of ``LANG_OPTIONS``.
        model_size_pretty: Display name of the model size.
        metrics: Metric names to extract; must be non-empty.

    Returns:
        A 4-tuple ``(status update, table update, download-button update,
        CSV path or None)``. On a validation failure only the status message
        is shown and the last element is ``None``.
    """
    import os  # local import so this block does not depend on the file header

    if not metrics:
        return _status_only("**Please select at least one metric.**")

    language_short = LANG_OPTIONS[language_pretty]
    model_size_short = all_model_size_options_pretty_to_short()[model_size_pretty]

    if model_size_pretty not in available_model_size_options(lang=language_short):
        return _status_only(
            "**Sorry!** The chosen *model size* is not available in this language. Please try another."
        )

    from_upload = input_choice == "Upload file(s)"

    # Build mapping of filename -> text
    file_name_to_text = {}
    if from_upload:
        if not files:
            return _status_only("**Please upload at least one file.**")
        for path in files:
            # basename() understands the platform's separator, unlike a
            # hard-coded split on "/".
            name = os.path.basename(path)
            try:
                with open(path, "r", encoding="utf-8") as fh:
                    file_name_to_text[name] = fh.read()
            except UnicodeDecodeError:
                # Report undecodable uploads like every other input problem
                # instead of letting the handler raise.
                return _status_only(
                    f"**Could not read `{name}`.** Please upload UTF-8 encoded .txt files."
                )
    else:
        if not text_input or not text_input.strip():
            return _status_only("**Please enter some text.**")
        file_name_to_text["input"] = text_input

    # Extract metrics for each text
    output_df = pd.concat(
        [
            text_to_metrics(
                string=string,
                language_short=language_short,
                model_size_short=model_size_short,
                metrics=metrics,
                split_by_line=split_by_line,
                filename=filename if from_upload else None,
            )
            for filename, string in file_name_to_text.items()
        ],
        ignore_index=True,
    )

    # Transpose for readability (the CSV below keeps the original layout)
    transposed = output_df.transpose().reset_index()
    transposed.columns = ["Metric"] + [str(c) for c in list(transposed.columns)[1:]]

    # Reserve a temp file for the download. The handle is closed before pandas
    # writes to the path, so no descriptor is leaked and the path can be
    # reopened on every platform. delete=False keeps the file after closing;
    # NOTE: nothing in this function removes it afterwards.
    with tempfile.NamedTemporaryFile(
        suffix=".csv", delete=False, prefix="text_metrics_"
    ) as tmp:
        csv_path = tmp.name
    output_df.to_csv(csv_path, index=False)

    return (
        gr.update(
            value="**Note**: This data frame has been transposed for readability.",
            visible=True,
        ),
        gr.update(value=transposed, visible=True),
        gr.update(value=csv_path, visible=True),
        csv_path,
    )
|
| 168 |
|
|
|
|
| 169 |
|
| 170 |
+
with gr.Blocks(title="TextDescriptives", css=CSS) as demo:
|
| 171 |
+
################
|
| 172 |
+
# Introduction #
|
| 173 |
+
################
|
|
|
|
|
|
|
| 174 |
|
| 175 |
+
gr.HTML(
|
| 176 |
+
'<div style="display:flex;align-items:center;gap:12px;">'
|
| 177 |
+
'<img src="https://github.com/HLasse/TextDescriptives/raw/main/docs/_static/icon.png" '
|
| 178 |
+
'style="height:56px;width:auto;border-radius:8px;" />'
|
| 179 |
+
'<h1 style="margin:0;font-size:2em;">Extract Text Statistics</h1>'
|
| 180 |
+
'</div>'
|
| 181 |
)
|
| 182 |
|
| 183 |
+
gr.Markdown(
|
| 184 |
+
f"Calculate a large variety of statistics from text via the "
|
| 185 |
+
f"[**TextDescriptives**](https://github.com/HLasse/TextDescriptives) python package "
|
| 186 |
+
f"(v/{td.__version__}) and download the results as a .csv file. "
|
| 187 |
+
f"Includes descriptive statistics and metrics related to readability, "
|
| 188 |
+
f"information theory, text coherence and text quality. "
|
| 189 |
+
f"Source on [**GitHub**](https://github.com/HLasse/TextDescriptives_app) "
|
| 190 |
+
f"— [open an issue](https://github.com/HLasse/textdescriptives_app/issues) for feedback."
|
| 191 |
+
)
|
| 192 |
+
|
| 193 |
+
gr.Markdown(
|
| 194 |
+
"Hansen, L., Olsen, L. R., & Enevoldsen, K. (2023). *TextDescriptives: A Python package for "
|
| 195 |
+
"calculating a large variety of metrics from text.* "
|
| 196 |
+
"[JOSS, 8(84), 5153](https://doi.org/10.21105/joss.05153)",
|
| 197 |
+
elem_classes="citation",
|
| 198 |
+
)
|
| 199 |
+
|
| 200 |
+
############
|
| 201 |
+
# Settings #
|
| 202 |
+
############
|
| 203 |
+
|
| 204 |
+
with gr.Group():
|
| 205 |
+
input_choice = gr.Radio(
|
| 206 |
+
choices=["Enter text", "Upload file(s)"],
|
| 207 |
+
value="Enter text",
|
| 208 |
+
label="Input",
|
| 209 |
+
)
|
| 210 |
+
|
| 211 |
+
text_input = gr.Textbox(
|
| 212 |
+
label="Enter text",
|
| 213 |
+
value=DEFAULT_TEXT,
|
| 214 |
+
lines=7,
|
| 215 |
+
visible=True,
|
| 216 |
+
)
|
| 217 |
+
|
| 218 |
+
file_upload = gr.File(
|
| 219 |
+
label="Choose .txt file(s)",
|
| 220 |
+
file_types=[".txt"],
|
| 221 |
+
file_count="multiple",
|
| 222 |
+
visible=False,
|
| 223 |
+
)
|
| 224 |
+
|
| 225 |
+
split_by_line = gr.Checkbox(label="Split by newline", value=True)
|
| 226 |
+
|
| 227 |
+
input_choice.change(
|
| 228 |
+
fn=toggle_input,
|
| 229 |
+
inputs=input_choice,
|
| 230 |
+
outputs=[text_input, file_upload],
|
| 231 |
+
)
|
| 232 |
+
|
| 233 |
+
with gr.Row():
|
| 234 |
+
language_dropdown = gr.Dropdown(
|
| 235 |
+
label="Language",
|
| 236 |
+
choices=LANG_NAMES,
|
| 237 |
+
value=LANG_NAMES[DEFAULT_LANG_INDEX],
|
| 238 |
+
)
|
| 239 |
+
model_size_dropdown = gr.Dropdown(
|
| 240 |
+
label="Model Size",
|
| 241 |
+
choices=available_model_size_options(lang="all"),
|
| 242 |
+
value=available_model_size_options(lang="all")[0],
|
| 243 |
+
)
|
| 244 |
+
|
| 245 |
+
metrics_select = gr.CheckboxGroup(
|
| 246 |
+
label="Metrics",
|
| 247 |
+
choices=metrics_options(),
|
| 248 |
+
value=metrics_options(),
|
| 249 |
)
|
| 250 |
+
|
| 251 |
+
gr.Markdown(
|
| 252 |
+
"See the [**documentation**](https://hlasse.github.io/TextDescriptives/) "
|
| 253 |
+
"for information on the available metrics."
|
| 254 |
+
)
|
| 255 |
+
|
| 256 |
+
apply_btn = gr.Button("Apply", variant="primary")
|
| 257 |
+
|
| 258 |
+
#############
|
| 259 |
+
# Results #
|
| 260 |
+
#############
|
| 261 |
+
|
| 262 |
+
status_msg = gr.Markdown(visible=False)
|
| 263 |
+
results_table = gr.DataFrame(visible=False, label="Results")
|
| 264 |
+
csv_state = gr.State(value=None)
|
| 265 |
+
download_btn = gr.DownloadButton("Download CSV", visible=False, variant="primary")
|
| 266 |
+
|
| 267 |
+
apply_btn.click(
|
| 268 |
+
fn=process_and_display,
|
| 269 |
+
inputs=[
|
| 270 |
+
input_choice,
|
| 271 |
+
text_input,
|
| 272 |
+
file_upload,
|
| 273 |
+
split_by_line,
|
| 274 |
+
language_dropdown,
|
| 275 |
+
model_size_dropdown,
|
| 276 |
+
metrics_select,
|
| 277 |
+
],
|
| 278 |
+
outputs=[status_msg, results_table, download_btn, csv_state],
|
| 279 |
+
)
|
| 280 |
+
|
| 281 |
+
############################
|
| 282 |
+
# Code For Reproducibility #
|
| 283 |
+
############################
|
| 284 |
+
|
| 285 |
+
with gr.Accordion("See python code", open=False):
|
| 286 |
+
gr.Code(value=CODE_SNIPPET, language="python", interactive=False)
|
| 287 |
+
|
| 288 |
+
#######
|
| 289 |
+
# FAQ #
|
| 290 |
+
#######
|
| 291 |
+
|
| 292 |
+
gr.Markdown("## FAQ")
|
| 293 |
+
|
| 294 |
+
with gr.Accordion("What does the 'Split by newline' option do?", open=False):
|
| 295 |
+
gr.Markdown(
|
| 296 |
+
"When the `Split by newline` option is `enabled`, the metrics calculation is "
|
| 297 |
+
"performed separately for each paragraph. I.e. whenever there's a line break, "
|
| 298 |
+
"we split the text.\n\n"
|
| 299 |
+
"When this option is `disabled`, the entire text is processed at once."
|
| 300 |
+
)
|
| 301 |
+
|
| 302 |
+
with gr.Accordion(
|
| 303 |
+
"Why do I get a warning/error message for certain languages or model sizes?",
|
| 304 |
+
open=False,
|
| 305 |
+
):
|
| 306 |
+
gr.Markdown(
|
| 307 |
+
"Some combinations of languages, model sizes, and metrics are not currently supported in the app. "
|
| 308 |
+
"While we *are* working on this, you may currently see an error message after clicking `Apply`.\n\n"
|
| 309 |
+
"If you need this language and/or model size to work for your project, "
|
| 310 |
+
"please open an [issue](https://github.com/HLasse/textdescriptives_app/issues). "
|
| 311 |
+
"This may cause us to prioritize supporting your use case."
|
| 312 |
+
)
|
| 313 |
+
|
| 314 |
+
if __name__ == "__main__":
|
| 315 |
+
demo.launch()
|
data_viewer.py
DELETED
|
@@ -1,26 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Class for showing header and download button in the same row.
|
| 3 |
-
"""
|
| 4 |
-
|
| 5 |
-
import streamlit as st
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
class DataViewer:
|
| 9 |
-
def _convert_df_to_csv(self, data, **kwargs):
|
| 10 |
-
return data.to_csv(**kwargs).encode("utf-8")
|
| 11 |
-
|
| 12 |
-
def _header_and_download(
|
| 13 |
-
self, header, data, file_name, key=None, label="Download", help="Download data"
|
| 14 |
-
):
|
| 15 |
-
col1, col2 = st.columns([9, 2])
|
| 16 |
-
with col1:
|
| 17 |
-
st.subheader(header)
|
| 18 |
-
with col2:
|
| 19 |
-
st.write("")
|
| 20 |
-
st.download_button(
|
| 21 |
-
label=label,
|
| 22 |
-
data=self._convert_df_to_csv(data, index=False),
|
| 23 |
-
file_name=file_name,
|
| 24 |
-
key=key,
|
| 25 |
-
help=help,
|
| 26 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
options.py
CHANGED
|
@@ -80,9 +80,18 @@ def available_model_size_options(lang) -> List[str]:
|
|
| 80 |
|
| 81 |
|
| 82 |
class ModelAvailabilityChecker:
|
|
|
|
|
|
|
| 83 |
@staticmethod
|
| 84 |
def available_models() -> List[str]:
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
@staticmethod
|
| 88 |
def extract_language_and_size() -> List[List[str]]:
|
|
@@ -106,8 +115,11 @@ class ModelAvailabilityChecker:
|
|
| 106 |
|
| 107 |
@staticmethod
|
| 108 |
def available_model_sizes_for_language(lang: str) -> Set[str]:
|
| 109 |
-
|
| 110 |
size
|
| 111 |
for (lang_, size) in ModelAvailabilityChecker.extract_language_and_size()
|
| 112 |
if lang_ == lang and size in all_model_size_options_pretty_to_short().values()
|
| 113 |
])
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
|
| 81 |
|
| 82 |
class ModelAvailabilityChecker:
|
| 83 |
+
_compatibility_cache = None
|
| 84 |
+
|
| 85 |
@staticmethod
def available_models() -> List[str]:
    """Return the keys of ``get_compatibility()`` as a list, cached per process.

    The lookup runs at most once: its result, or an empty list if the lookup
    raised, is stored on the class and returned by every later call.
    """
    import logging  # local import so this block does not depend on the file header

    if ModelAvailabilityChecker._compatibility_cache is None:
        try:
            ModelAvailabilityChecker._compatibility_cache = list(
                get_compatibility().keys()
            )
        except Exception:
            # Best-effort by design: fall back to "no known models", but
            # record the cause instead of hiding it.
            logging.getLogger(__name__).warning(
                "Could not load the model compatibility table; "
                "continuing with an empty model list.",
                exc_info=True,
            )
            ModelAvailabilityChecker._compatibility_cache = []
    return ModelAvailabilityChecker._compatibility_cache
|
| 95 |
|
| 96 |
@staticmethod
|
| 97 |
def extract_language_and_size() -> List[List[str]]:
|
|
|
|
| 115 |
|
| 116 |
@staticmethod
def available_model_sizes_for_language(lang: str) -> Set[str]:
    """Return the short model-size codes available for ``lang``.

    A size counts only if it is paired with ``lang`` in
    ``extract_language_and_size()`` and is one of the values of
    ``all_model_size_options_pretty_to_short()``. If nothing matches, every
    known size is returned instead of an empty set.
    """
    # Compute the known sizes once instead of once per candidate pair.
    known_sizes = set(all_model_size_options_pretty_to_short().values())
    sizes = {
        size
        for (lang_, size) in ModelAvailabilityChecker.extract_language_and_size()
        if lang_ == lang and size in known_sizes
    }
    # Presumably this keeps the size selector usable when the compatibility
    # lookup yields nothing for the language — confirm with the author.
    return sizes or known_sizes
|
process_text.py
CHANGED
|
@@ -3,12 +3,10 @@ The text processing functionality.
|
|
| 3 |
"""
|
| 4 |
|
| 5 |
from typing import List, Optional
|
| 6 |
-
import streamlit as st
|
| 7 |
import pandas as pd
|
| 8 |
import textdescriptives as td
|
| 9 |
|
| 10 |
|
| 11 |
-
@st.cache_data
|
| 12 |
def text_to_metrics(
|
| 13 |
string: str,
|
| 14 |
language_short: str,
|
|
|
|
| 3 |
"""
|
| 4 |
|
| 5 |
from typing import List, Optional
|
|
|
|
| 6 |
import pandas as pd
|
| 7 |
import textdescriptives as td
|
| 8 |
|
| 9 |
|
|
|
|
| 10 |
def text_to_metrics(
|
| 11 |
string: str,
|
| 12 |
language_short: str,
|
pyproject.toml
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "textdescriptives-app"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "Dashboard for extracting text metrics with TextDescriptives"
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = ">=3.10"
|
| 7 |
+
dependencies = [
|
| 8 |
+
"textdescriptives>=2.8.2",
|
| 9 |
+
"gradio>=5.0,<6.0",
|
| 10 |
+
"pandas",
|
| 11 |
+
"pip",
|
| 12 |
+
"en-core-web-lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
|
| 13 |
+
"da-core-news-lg @ https://github.com/explosion/spacy-models/releases/download/da_core_news_lg-3.8.0/da_core_news_lg-3.8.0.tar.gz",
|
| 14 |
+
]
|
| 15 |
+
|
| 16 |
+
[tool.hatch.metadata]
|
| 17 |
+
allow-direct-references = true
|
| 18 |
+
|
| 19 |
+
[tool.hatch.build.targets.wheel]
|
| 20 |
+
packages = ["."]
|
| 21 |
+
|
| 22 |
+
[build-system]
|
| 23 |
+
requires = ["hatchling"]
|
| 24 |
+
build-backend = "hatchling.build"
|
requirements.txt
CHANGED
|
@@ -1,6 +1,5 @@
|
|
| 1 |
-
textdescriptives=
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
https://github.com/explosion/spacy-models/releases/download/
|
| 6 |
-
https://github.com/explosion/spacy-models/releases/download/da_core_news_lg-3.7.0/da_core_news_lg-3.7.0.tar.gz
|
|
|
|
| 1 |
+
textdescriptives>=2.8.2
|
| 2 |
+
gradio>=5.0,<6.0
|
| 3 |
+
pandas
|
| 4 |
+
https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
|
| 5 |
+
https://github.com/explosion/spacy-models/releases/download/da_core_news_lg-3.8.0/da_core_news_lg-3.8.0.tar.gz
|
|
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|