Spaces:

HLasse
/

textdescriptives

Running

App Files Community

HLasse committed on Feb 12

Commit

afbf56e

1 Parent(s): ba443b2

update to gradio

Browse files

Files changed (9) hide show

.gitattributes +0 -34
README.md +24 -9
app.py +252 -194
data_viewer.py +0 -26
options.py +14 -2
process_text.py +0 -2
pyproject.toml +24 -0
requirements.txt +5 -6
uv.lock +0 -0

.gitattributes DELETED Viewed

@@ -1,34 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,14 +1,29 @@
 ---
-title: Textdescriptives
-emoji: 📈
-colorFrom: green
-colorTo: red
-sdk: streamlit
-sdk_version: 1.19.0
 app_file: app.py
 pinned: false
-license: apache-2.0
-tags: [NLP, feature extraction]
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: TextDescriptives
+emoji: 📊
+colorFrom: blue
+colorTo: green
+sdk: gradio
+sdk_version: "5.12.0"
 app_file: app.py
+python_version: "3.10"
 pinned: false
 ---
+# TextDescriptives Demo
+A Gradio dashboard for extracting text metrics with TextDescriptives. Live at https://huggingface.co/spaces/HLasse/textdescriptives
+## TODO
+- [ ] Add license
+## Installation
+```shell
+uv venv && source .venv/bin/activate
+uv pip install -e ".[models]"
+python app.py
+```

app.py CHANGED Viewed

@@ -1,191 +1,32 @@
 """
 Dashboard for showcasing extraction of text metrics with textdescriptives.
 """
-from io import StringIO
 import pandas as pd
-import streamlit as st
 import textdescriptives as td
-from data_viewer import DataViewer
-from process_text import text_to_metrics
 from options import (
     all_model_size_options_pretty_to_short,
     available_model_size_options,
     language_options,
     metrics_options,
 )
-################
-# Introduction #
-################
-col1, col2 = st.columns([9, 2])
-with col1:
-    st.title("Extract Text Statistics")
-with col2:
-    st.image(
-        "https://github.com/HLasse/TextDescriptives/raw/main/docs/_static/icon.png",
-        width=125,
-    )
-st.write(
-    "Calculate a large variety of statistics from text via the "
-    "[**TextDescriptives**](https://github.com/HLasse/TextDescriptives) python package "
-    f"(v/{td.__version__}) and download the results as a .csv file. "
-    "Includes descriptive statistics and metrics related to readability, "
-    "information theory, text coherence and text quality."
-)
-st.write(
-    "The source code for this application can be found on [**GitHub**](https://github.com/HLasse/TextDescriptives_app). "
-    "If you have feedback, please open an [issue](https://github.com/HLasse/textdescriptives_app/issues)."
-)
-st.caption(
-    "Hansen, L., Olsen, L. R., & Enevoldsen, K. (2023). TextDescriptives: A Python package for "
-    "calculating a large variety of metrics from text. [Journal of Open Source Software, 8(84), "
-    "5153, https://doi.org/10.21105/joss.05153](https://doi.org/10.21105/joss.05153)"
-)
-############
-# Settings #
-############
-input_choice = st.radio(
-    label="Input", options=["Enter text", "Upload file(s)"], index=0, horizontal=True
-)
-with st.form(key="settings_form"):
-    split_by_line = st.checkbox(label="Split by newline", value=True)
-    file_name_to_text_string = {}
-    if input_choice == "Upload file(s)":
-        uploaded_files = st.file_uploader(
-            label="Choose a .txt file", type=["txt"], accept_multiple_files=True
-        )
-        if uploaded_files is not None and len(uploaded_files) > 0:
-            # To convert to a string based IO:
-            file_name_to_text_string = {
-                file.name: StringIO(file.getvalue().decode("utf-8")).read()
-                for file in uploaded_files
-            }
-    else:
-        default_text = """Hello, morning dew. The grass whispers low.
 I'm here to dance. The gentle breeze does show.
 Good morning, world. The birds sing in delight.
 Let's spread our wings. The butterflies take flight.
 Nature's chorus sings, a symphony of light."""
-        file_name_to_text_string = {
-            "input": st.text_area(
-                label="Enter text", value=default_text, height=145, max_chars=None
-            )
-        }
-    # Row of selectors
-    col1, col2 = st.columns([1, 1])
-    with col1:
-        # Selection of language
-        language_pretty = st.selectbox(
-            label="Language",
-            options=list(language_options().keys()),
-            index=5,
-            key="language_selector",
-        )
-        language_short = language_options()[language_pretty]
-    with col2:
-        # Selection of model size
-        model_size_pretty = st.selectbox(
-            label="Model Size",
-            options=available_model_size_options(lang="all"),
-            index=0,
-            key="size_selector",
-        )
-        model_size_short = all_model_size_options_pretty_to_short()[model_size_pretty]
-    # Multiselection of metrics
-    metrics = st.multiselect(
-        label="Metrics", options=metrics_options(), default=metrics_options()
-    )
-    st.write(
-        "See the [**documentation**](https://hlasse.github.io/TextDescriptives/) for "
-        "information on the available metrics."
-    )
-    # This shouldn't happen but better safe than sorry
-    if isinstance(metrics, list) and not metrics:
-        metrics = None
-    apply_settings_button = st.form_submit_button(label="Apply")
-#############
-# Apply NLP #
-#############
-if apply_settings_button and len(file_name_to_text_string) > 0:
-    if model_size_pretty not in available_model_size_options(lang=language_short):
-        st.write(
-            "**Sorry!** The chosen *model size* is not available in this language. Please try another."
-        )
-    else:
-        # Extract metrics for each text
-        output_df = pd.concat(
-            [
-                text_to_metrics(
-                    string=string,
-                    language_short=language_short,
-                    model_size_short=model_size_short,
-                    metrics=metrics,
-                    split_by_line=split_by_line,
-                    filename=filename if "Upload" in input_choice else None,
-                )
-                for filename, string in file_name_to_text_string.items()
-            ],
-            ignore_index=True,
-        )
-        ###################
-        # Present Results #
-        ###################
-        # Create 2 columns with 1) the output header
-        # and 2) a download button
-        DataViewer()._header_and_download(
-            header="The calculated metrics",
-            data=output_df,
-            file_name="text_metrics.csv",
-        )
-        st.write("**Note**: This data frame has been transposed for readability.")
-        output_df = output_df.transpose().reset_index()
-        output_df.columns = ["Metric"] + [str(c) for c in list(output_df.columns)[1:]]
-        st.dataframe(data=output_df, use_container_width=True)
-############################
-# Code For Reproducibility #
-############################
-with st.expander("See python code"):
-    st.code(
-        """
 # Note: This is the code for a single text file
 # The actual code is slightly more complex
 # to allow processing multiple files at once
@@ -219,39 +60,256 @@ extracted_metrics = td.extract_metrics(
     spacy_model_size=model_size,
     metrics=metrics
 )
-""",
-        language="python",
     )
-#######
-# FAQ #
-#######
-st.subheader("Frequently Asked Questions (FAQ)")
-with st.expander("What does the 'Split by newline' option do?"):
-    st.write(
-        """
-    When the `Split by newline` option is `enabled`, the metrics calculation is
-    performed separately for each paragraph. I.e. whenever there's a line break,
-    we split the text.
-    When this option is `disabled`, the entire text is processed at once.
-    """
     )
-with st.expander(
-    "Why do I get a warning/error message for certain languages or model sizes?"
-):
-    st.write(
-        """
-    Some combinations of languages, model sizes, and metrics are not currently supported in the app.
-    While we *are* working on this, you may currently see a red box
-    with an error message after clicking `Apply`.
-    If you need this language and/or model size to work for your project,
-    please open an [issue](https://github.com/HLasse/textdescriptives_app/issues).
-    This may cause us to prioritize supporting your use case.
-    """
     )

 """
 Dashboard for showcasing extraction of text metrics with textdescriptives.
 """
+import tempfile
+import gradio as gr
 import pandas as pd
 import textdescriptives as td
 from options import (
     all_model_size_options_pretty_to_short,
     available_model_size_options,
     language_options,
     metrics_options,
 )
+from process_text import text_to_metrics
+DEFAULT_TEXT = """Hello, morning dew. The grass whispers low.
 I'm here to dance. The gentle breeze does show.
 Good morning, world. The birds sing in delight.
 Let's spread our wings. The butterflies take flight.
 Nature's chorus sings, a symphony of light."""
+LANG_OPTIONS = language_options()
+LANG_NAMES = list(LANG_OPTIONS.keys())
+DEFAULT_LANG_INDEX = LANG_NAMES.index("English")
+CODE_SNIPPET = """\
 # Note: This is the code for a single text file
 # The actual code is slightly more complex
 # to allow processing multiple files at once
     spacy_model_size=model_size,
     metrics=metrics
 )
+"""
+CSS = """
+.citation {
+    font-size: 0.85em;
+    color: #666;
+}
+"""
+def toggle_input(choice):
+    if choice == "Upload file(s)":
+        return gr.update(visible=False), gr.update(visible=True)
+    return gr.update(visible=True), gr.update(visible=False)
+def process_and_display(
+    input_choice,
+    text_input,
+    files,
+    split_by_line,
+    language_pretty,
+    model_size_pretty,
+    metrics,
+):
+    if not metrics:
+        return (
+            gr.update(value="**Please select at least one metric.**", visible=True),
+            gr.update(visible=False),
+            gr.update(visible=False),
+            None,
+        )
+    language_short = LANG_OPTIONS[language_pretty]
+    model_size_short = all_model_size_options_pretty_to_short()[model_size_pretty]
+    if model_size_pretty not in available_model_size_options(lang=language_short):
+        return (
+            gr.update(
+                value="**Sorry!** The chosen *model size* is not available in this language. Please try another.",
+                visible=True,
+            ),
+            gr.update(visible=False),
+            gr.update(visible=False),
+            None,
+        )
+    # Build mapping of filename -> text
+    file_name_to_text = {}
+    if input_choice == "Upload file(s)":
+        if not files:
+            return (
+                gr.update(value="**Please upload at least one file.**", visible=True),
+                gr.update(visible=False),
+                gr.update(visible=False),
+                None,
+            )
+        for f in files:
+            with open(f, "r", encoding="utf-8") as fh:
+                file_name_to_text[f.rsplit("/", 1)[-1]] = fh.read()
+    else:
+        if not text_input or not text_input.strip():
+            return (
+                gr.update(value="**Please enter some text.**", visible=True),
+                gr.update(visible=False),
+                gr.update(visible=False),
+                None,
+            )
+        file_name_to_text["input"] = text_input
+    # Extract metrics for each text
+    output_df = pd.concat(
+        [
+            text_to_metrics(
+                string=string,
+                language_short=language_short,
+                model_size_short=model_size_short,
+                metrics=metrics,
+                split_by_line=split_by_line,
+                filename=filename if input_choice == "Upload file(s)" else None,
+            )
+            for filename, string in file_name_to_text.items()
+        ],
+        ignore_index=True,
     )
+    # Transpose for readability
+    transposed = output_df.transpose().reset_index()
+    transposed.columns = ["Metric"] + [str(c) for c in list(transposed.columns)[1:]]
+    # Write CSV to a temp file for download
+    csv_path = tempfile.NamedTemporaryFile(
+        suffix=".csv", delete=False, prefix="text_metrics_"
+    ).name
+    output_df.to_csv(csv_path, index=False)
+    return (
+        gr.update(
+            value="**Note**: This data frame has been transposed for readability.",
+            visible=True,
+        ),
+        gr.update(value=transposed, visible=True),
+        gr.update(value=csv_path, visible=True),
+        csv_path,
+    )
+with gr.Blocks(title="TextDescriptives", css=CSS) as demo:
+    ################
+    # Introduction #
+    ################
+    gr.HTML(
+        '<div style="display:flex;align-items:center;gap:12px;">'
+        '<img src="https://github.com/HLasse/TextDescriptives/raw/main/docs/_static/icon.png" '
+        'style="height:56px;width:auto;border-radius:8px;" />'
+        '<h1 style="margin:0;font-size:2em;">Extract Text Statistics</h1>'
+        '</div>'
     )
+    gr.Markdown(
+        f"Calculate a large variety of statistics from text via the "
+        f"[**TextDescriptives**](https://github.com/HLasse/TextDescriptives) python package "
+        f"(v/{td.__version__}) and download the results as a .csv file. "
+        f"Includes descriptive statistics and metrics related to readability, "
+        f"information theory, text coherence and text quality. "
+        f"Source on [**GitHub**](https://github.com/HLasse/TextDescriptives_app) "
+        f"— [open an issue](https://github.com/HLasse/textdescriptives_app/issues) for feedback."
+    )
+    gr.Markdown(
+        "Hansen, L., Olsen, L. R., & Enevoldsen, K. (2023). *TextDescriptives: A Python package for "
+        "calculating a large variety of metrics from text.* "
+        "[JOSS, 8(84), 5153](https://doi.org/10.21105/joss.05153)",
+        elem_classes="citation",
+    )
+    ############
+    # Settings #
+    ############
+    with gr.Group():
+        input_choice = gr.Radio(
+            choices=["Enter text", "Upload file(s)"],
+            value="Enter text",
+            label="Input",
+        )
+        text_input = gr.Textbox(
+            label="Enter text",
+            value=DEFAULT_TEXT,
+            lines=7,
+            visible=True,
+        )
+        file_upload = gr.File(
+            label="Choose .txt file(s)",
+            file_types=[".txt"],
+            file_count="multiple",
+            visible=False,
+        )
+        split_by_line = gr.Checkbox(label="Split by newline", value=True)
+        input_choice.change(
+            fn=toggle_input,
+            inputs=input_choice,
+            outputs=[text_input, file_upload],
+        )
+    with gr.Row():
+        language_dropdown = gr.Dropdown(
+            label="Language",
+            choices=LANG_NAMES,
+            value=LANG_NAMES[DEFAULT_LANG_INDEX],
+        )
+        model_size_dropdown = gr.Dropdown(
+            label="Model Size",
+            choices=available_model_size_options(lang="all"),
+            value=available_model_size_options(lang="all")[0],
+        )
+    metrics_select = gr.CheckboxGroup(
+        label="Metrics",
+        choices=metrics_options(),
+        value=metrics_options(),
     )
+    gr.Markdown(
+        "See the [**documentation**](https://hlasse.github.io/TextDescriptives/) "
+        "for information on the available metrics."
+    )
+    apply_btn = gr.Button("Apply", variant="primary")
+    #############
+    # Results   #
+    #############
+    status_msg = gr.Markdown(visible=False)
+    results_table = gr.DataFrame(visible=False, label="Results")
+    csv_state = gr.State(value=None)
+    download_btn = gr.DownloadButton("Download CSV", visible=False, variant="primary")
+    apply_btn.click(
+        fn=process_and_display,
+        inputs=[
+            input_choice,
+            text_input,
+            file_upload,
+            split_by_line,
+            language_dropdown,
+            model_size_dropdown,
+            metrics_select,
+        ],
+        outputs=[status_msg, results_table, download_btn, csv_state],
+    )
+    ############################
+    # Code For Reproducibility #
+    ############################
+    with gr.Accordion("See python code", open=False):
+        gr.Code(value=CODE_SNIPPET, language="python", interactive=False)
+    #######
+    # FAQ #
+    #######
+    gr.Markdown("## FAQ")
+    with gr.Accordion("What does the 'Split by newline' option do?", open=False):
+        gr.Markdown(
+            "When the `Split by newline` option is `enabled`, the metrics calculation is "
+            "performed separately for each paragraph. I.e. whenever there's a line break, "
+            "we split the text.\n\n"
+            "When this option is `disabled`, the entire text is processed at once."
+        )
+    with gr.Accordion(
+        "Why do I get a warning/error message for certain languages or model sizes?",
+        open=False,
+    ):
+        gr.Markdown(
+            "Some combinations of languages, model sizes, and metrics are not currently supported in the app. "
+            "While we *are* working on this, you may currently see an error message after clicking `Apply`.\n\n"
+            "If you need this language and/or model size to work for your project, "
+            "please open an [issue](https://github.com/HLasse/textdescriptives_app/issues). "
+            "This may cause us to prioritize supporting your use case."
+        )
+if __name__ == "__main__":
+    demo.launch()

data_viewer.py DELETED Viewed

@@ -1,26 +0,0 @@
-"""
-Class for showing header and download button in the same row.
-"""
-import streamlit as st
-class DataViewer:
-    def _convert_df_to_csv(self, data, **kwargs):
-        return data.to_csv(**kwargs).encode("utf-8")
-    def _header_and_download(
-        self, header, data, file_name, key=None, label="Download", help="Download data"
-    ):
-        col1, col2 = st.columns([9, 2])
-        with col1:
-            st.subheader(header)
-        with col2:
-            st.write("")
-            st.download_button(
-                label=label,
-                data=self._convert_df_to_csv(data, index=False),
-                file_name=file_name,
-                key=key,
-                help=help,
-            )

options.py CHANGED Viewed

@@ -80,9 +80,18 @@ def available_model_size_options(lang) -> List[str]:
 class ModelAvailabilityChecker:
     @staticmethod
     def available_models() -> List[str]:
-        return list(get_compatibility().keys())
     @staticmethod
     def extract_language_and_size() -> List[List[str]]:
@@ -106,8 +115,11 @@ class ModelAvailabilityChecker:
     @staticmethod
     def available_model_sizes_for_language(lang: str) -> Set[str]:
-        return set([
             size
             for (lang_, size) in ModelAvailabilityChecker.extract_language_and_size()
             if lang_ == lang and size in all_model_size_options_pretty_to_short().values()
         ])

 class ModelAvailabilityChecker:
+    _compatibility_cache = None
     @staticmethod
     def available_models() -> List[str]:
+        if ModelAvailabilityChecker._compatibility_cache is None:
+            try:
+                ModelAvailabilityChecker._compatibility_cache = list(
+                    get_compatibility().keys()
+                )
+            except Exception:
+                ModelAvailabilityChecker._compatibility_cache = []
+        return ModelAvailabilityChecker._compatibility_cache
     @staticmethod
     def extract_language_and_size() -> List[List[str]]:
     @staticmethod
     def available_model_sizes_for_language(lang: str) -> Set[str]:
+        sizes = set([
             size
             for (lang_, size) in ModelAvailabilityChecker.extract_language_and_size()
             if lang_ == lang and size in all_model_size_options_pretty_to_short().values()
         ])
+        if not sizes:
+            return set(all_model_size_options_pretty_to_short().values())
+        return sizes

process_text.py CHANGED Viewed

@@ -3,12 +3,10 @@ The text processing functionality.
 """
 from typing import List, Optional
-import streamlit as st
 import pandas as pd
 import textdescriptives as td
-@st.cache_data
 def text_to_metrics(
     string: str,
     language_short: str,

 """
 from typing import List, Optional
 import pandas as pd
 import textdescriptives as td
 def text_to_metrics(
     string: str,
     language_short: str,

pyproject.toml ADDED Viewed

	@@ -0,0 +1,24 @@

+[project]
+name = "textdescriptives-app"
+version = "0.1.0"
+description = "Dashboard for extracting text metrics with TextDescriptives"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "textdescriptives>=2.8.2",
+    "gradio>=5.0,<6.0",
+    "pandas",
+    "pip",
+    "en-core-web-lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
+    "da-core-news-lg @ https://github.com/explosion/spacy-models/releases/download/da_core_news_lg-3.8.0/da_core_news_lg-3.8.0.tar.gz",
+]
+[tool.hatch.metadata]
+allow-direct-references = true
+[tool.hatch.build.targets.wheel]
+packages = ["."]
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"

requirements.txt CHANGED Viewed

@@ -1,6 +1,5 @@
-textdescriptives==2.8.2
-streamlit
-watchdog
-altair<5.0.0
-https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.0/en_core_web_lg-3.7.0.tar.gz
-https://github.com/explosion/spacy-models/releases/download/da_core_news_lg-3.7.0/da_core_news_lg-3.7.0.tar.gz

+textdescriptives>=2.8.2
+gradio>=5.0,<6.0
+pandas
+https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
+https://github.com/explosion/spacy-models/releases/download/da_core_news_lg-3.8.0/da_core_news_lg-3.8.0.tar.gz

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff