Spaces:
Runtime error
Runtime error
| """ | |
| Dashboard for showcasing extraction of text metrics with textdescriptives. | |
| """ | |
| from io import StringIO | |
| import pandas as pd | |
| import streamlit as st | |
| import textdescriptives as td | |
| from data_viewer import DataViewer | |
| from process_text import text_to_metrics | |
| from options import ( | |
| all_model_size_options_pretty_to_short, | |
| available_model_size_options, | |
| language_options, | |
| metrics_options, | |
| ) | |
################
# Introduction #
################

# Page header: the title goes in the wide column and the package logo in the
# narrow column next to it (9:2 width ratio).
title_column, logo_column = st.columns([9, 2])
title_column.title("Extract Text Statistics")
logo_column.image(
    "https://github.com/HLasse/TextDescriptives/raw/main/docs/_static/icon.png",
    width=125,
)

# What the app does, including the version of the installed package.
app_description = (
    "Calculate a large variety of statistics from text via the "
    "[**TextDescriptives**](https://github.com/HLasse/TextDescriptives) python package "
    f"(v/{td.__version__}) and download the results as a .csv file. "
    "Includes descriptive statistics and metrics related to readability, "
    "information theory, text coherence and text quality."
)
st.write(app_description)

# Where to find the source code and how to send feedback.
source_and_feedback = (
    "The source code for this application can be found on [**GitHub**](https://github.com/HLasse/TextDescriptives_app). "
    "If you have feedback, please open an [issue](https://github.com/HLasse/textdescriptives_app/issues)."
)
st.write(source_and_feedback)

# Citation for the package, rendered as small caption text.
citation = (
    "Hansen, L., Olsen, L. R., & Enevoldsen, K. (2023). TextDescriptives: A Python package for "
    "calculating a large variety of metrics from text. [Journal of Open Source Software, 8(84), "
    "5153, https://doi.org/10.21105/joss.05153](https://doi.org/10.21105/joss.05153)"
)
st.caption(citation)
############
# Settings #
############

# The input mode is chosen outside the form; the form below renders either a
# file uploader or a text area depending on this choice.
input_choice = st.radio(
    label="Input", options=["Enter text", "Upload file(s)"], index=0, horizontal=True
)

with st.form(key="settings_form"):
    split_by_line = st.checkbox(label="Split by newline", value=True)

    # Maps a source name (the uploaded file's name, or "input" for typed
    # text) to its text. Stays empty when nothing usable was uploaded.
    file_name_to_text_string = {}

    if input_choice == "Upload file(s)":
        uploaded_files = st.file_uploader(
            label="Choose a .txt file", type=["txt"], accept_multiple_files=True
        )

        # `or []` covers both "nothing uploaded" shapes (None / empty list).
        for file in uploaded_files or []:
            try:
                # "utf-8-sig" decodes plain UTF-8 and also drops a leading
                # byte-order mark, so the BOM cannot leak into the text as
                # a stray U+FEFF character.
                file_name_to_text_string[file.name] = file.getvalue().decode(
                    "utf-8-sig"
                )
            except UnicodeDecodeError:
                # Report the unreadable file instead of aborting the whole
                # script run in the middle of rendering the form.
                st.error(
                    f"Could not read '{file.name}': the file is not valid "
                    "UTF-8 text. It has been skipped."
                )

    else:
        default_text = """Hello, morning dew. The grass whispers low.
I'm here to dance. The gentle breeze does show.
Good morning, world. The birds sing in delight.
Let's spread our wings. The butterflies take flight.
Nature's chorus sings, a symphony of light."""

        file_name_to_text_string = {
            "input": st.text_area(
                label="Enter text", value=default_text, height=145, max_chars=None
            )
        }

    # Row of selectors
    col1, col2 = st.columns([1, 1])

    with col1:
        # Selection of language (pretty name shown, short code used later)
        language_pretty_to_short = language_options()
        # NOTE(review): index=5 depends on the ordering of language_options();
        # presumably it selects the intended default language - verify if
        # that mapping ever changes.
        language_pretty = st.selectbox(
            label="Language",
            options=list(language_pretty_to_short.keys()),
            index=5,
            key="language_selector",
        )
        language_short = language_pretty_to_short[language_pretty]

    with col2:
        # Selection of model size. All sizes are offered here; whether the
        # chosen size exists for the chosen language is checked on "Apply".
        model_size_pretty = st.selectbox(
            label="Model Size",
            options=available_model_size_options(lang="all"),
            index=0,
            key="size_selector",
        )
        model_size_short = all_model_size_options_pretty_to_short()[model_size_pretty]

    # Multiselection of metrics (everything is selected by default)
    all_metrics = metrics_options()
    metrics = st.multiselect(label="Metrics", options=all_metrics, default=all_metrics)

    st.write(
        "See the [**documentation**](https://hlasse.github.io/TextDescriptives/) for "
        "information on the available metrics."
    )

    # An empty selection is passed on as None rather than as an empty list.
    # NOTE(review): presumably None makes the extraction fall back to its
    # default set of metrics - confirm against text_to_metrics.
    if isinstance(metrics, list) and not metrics:
        metrics = None

    apply_settings_button = st.form_submit_button(label="Apply")
#############
# Apply NLP #
#############

if apply_settings_button and not file_name_to_text_string:
    # "Apply" was clicked but there is nothing to process (e.g. no file was
    # uploaded). Say so instead of silently rendering nothing.
    st.warning(
        "There is no text to process. Please provide some input and click `Apply` again."
    )
elif apply_settings_button:
    if model_size_pretty not in available_model_size_options(lang=language_short):
        st.write(
            "**Sorry!** The chosen *model size* is not available in this language. Please try another."
        )
    else:
        # File names are only passed along for uploads; typed text has no
        # file name. Same equality test as used when building the form.
        is_upload = input_choice == "Upload file(s)"

        # Extract metrics for each text and stack the per-text data frames
        output_df = pd.concat(
            [
                text_to_metrics(
                    string=string,
                    language_short=language_short,
                    model_size_short=model_size_short,
                    metrics=metrics,
                    split_by_line=split_by_line,
                    filename=filename if is_upload else None,
                )
                for filename, string in file_name_to_text_string.items()
            ],
            ignore_index=True,
        )

        ###################
        # Present Results #
        ###################

        # Create 2 columns with 1) the output header
        # and 2) a download button.
        # The download gets the data frame as computed (not transposed).
        DataViewer()._header_and_download(
            header="The calculated metrics",
            data=output_df,
            file_name="text_metrics.csv",
        )

        st.write("**Note**: This data frame has been transposed for readability.")

        # One row per metric; the first column holds the metric names and
        # the remaining column labels are converted to strings.
        output_df = output_df.transpose().reset_index()
        output_df.columns = ["Metric"] + [str(c) for c in output_df.columns[1:]]
        st.dataframe(data=output_df, use_container_width=True)

        ############################
        # Code For Reproducibility #
        ############################

        # Static, simplified code sample shown to the user; it is displayed,
        # never executed by the app.
        with st.expander("See python code"):
            st.code(
                """
# Note: This is the code for a single text file
# The actual code is slightly more complex
# to allow processing multiple files at once
import textdescriptives as td
# Given a string of text and the settings
text = "..."
language = "..."
model_size = "..."
metrics = [...]
split_by_newline = True
# Remove whitespace from both ends of the string
text = text.strip()
# When asked, split by newlines
if split_by_newline:
    lines = text.split("\\n")
else:
    lines = [text]
# Remove empty lines
# E.g. due to consecutive newlines
lines = [l for l in lines if l]
# Extract metrics for each line
extracted_metrics = td.extract_metrics(
    text=lines,
    lang=language,
    spacy_model_size=model_size,
    metrics=metrics
)
""",
                language="python",
            )
#######
# FAQ #
#######

# Each entry pairs a question (used as the expander label) with the markdown
# answer that is shown when the expander is opened.
faq_entries = (
    (
        "What does the 'Split by newline' option do?",
        """
When the `Split by newline` option is `enabled`, the metrics calculation is
performed separately for each paragraph. I.e. whenever there's a line break,
we split the text.
When this option is `disabled`, the entire text is processed at once.
""",
    ),
    (
        "Why do I get a warning/error message for certain languages or model sizes?",
        """
Some combinations of languages, model sizes, and metrics are not currently supported in the app.
While we *are* working on this, you may currently see a red box
with an error message after clicking `Apply`.
If you need this language and/or model size to work for your project,
please open an [issue](https://github.com/HLasse/textdescriptives_app/issues).
This may cause us to prioritize supporting your use case.
""",
    ),
)

st.subheader("Frequently Asked Questions (FAQ)")

# Render the entries in order, one collapsible section per question.
for question, answer in faq_entries:
    with st.expander(question):
        st.write(answer)