Spaces:
Runtime error
Runtime error
| import json | |
| import streamlit as st | |
| from os.path import join as pjoin | |
| from .streamlit_utils import ( | |
| make_multiselect, | |
| make_selectbox, | |
| make_text_area, | |
| make_text_input, | |
| make_radio, | |
| ) | |
# Expected number of form fields in each sub-section of the overview page;
# the completion summary reports "<filled> of <expected> fields" against these.
N_FIELDS_WHERE = 9
N_FIELDS_LANGUAGES = 6
N_FIELDS_CREDIT = 3
N_FIELDS_STRUCTURE = 7

# Expected number of fields across the whole overview section.
N_FIELDS = sum(
    (
        N_FIELDS_WHERE,
        N_FIELDS_LANGUAGES,
        N_FIELDS_CREDIT,
        N_FIELDS_STRUCTURE,
    )
)
def _load_json_resource(filename):
    """Parse and return the JSON document stored at ``resources/<filename>``.

    The path is resolved relative to the current working directory. The file
    is opened inside a ``with`` block so the handle is closed as soon as the
    document has been parsed, instead of being left for the garbage collector.
    """
    with open(pjoin("resources", filename), encoding="utf-8") as resource_file:
        return json.load(resource_file)


# Entries of the "subtags" list in bcp47.json whose type is "language";
# their "description" values populate the language multiselect.
languages_bcp47 = [
    subtag
    for subtag in _load_json_resource("bcp47.json")["subtags"]
    if subtag["type"] == "language"
]

# Options offered by the license select box.
license_list = _load_json_resource("licenses.json")
def overview_page():
    """Render the overview form of the data card.

    The page consists of four collapsed expanders ("Where to find",
    "Languages and Intended Use", "Credit" and "Structure"). Before the
    widgets of a sub-section are created, the matching dictionary under
    ``st.session_state.card_dict["overview"]`` is created if it is missing.

    Every widget is built through one of the ``make_*`` helpers and is given
    a ``key_list`` of the form ``["overview", <sub-section>, <field>]``.
    NOTE(review): presumably the helpers store the answer at that path in
    ``card_dict`` -- confirm against ``streamlit_utils``.

    An existing sub-section dictionary is now reused. Previously each
    sub-section was looked up on the top level of ``card_dict`` (where it
    never exists), so it was replaced by an empty dictionary on every run.
    """
    card_dict = st.session_state.card_dict
    card_dict["overview"] = card_dict.get("overview", {})
    overview = card_dict["overview"]

    with st.expander("Where to find", expanded=False):
        key_pref = ["overview", "where"]
        # Look the sub-section up inside "overview", not on the top level.
        overview["where"] = overview.get("where", {})
        make_text_input(
            label="What is the webpage for the dataset (if it exists)?",
            key_list=key_pref + ["website"],
            help="[URL]",
        )
        make_text_input(
            label="What is the link to where the original dataset is hosted?",
            key_list=key_pref + ["data-url"],
            help="[URL]",
        )
        make_text_input(
            label="What is the link to the paper describing the dataset (open access preferred)?",
            key_list=key_pref + ["paper-url"],
            help="[URL]",
        )
        make_text_area(
            label="Provide the BibTex-formatted reference for the dataset.",
            key_list=key_pref + ["paper-bibtext"],
            help="[free text]",
        )
        make_radio(
            label="Does the dataset have an active leaderboard?",
            options=["no", "yes"],
            key_list=key_pref + ["has-leaderboard"],
            help="If no, enter N/A for the following two fields",
        )
        make_text_input(
            label="Provide a link to the leaderboard if it exists. Otherwise, enter N/A.",
            key_list=key_pref + ["leaderboard-url"],
            help="[URL] or N/A",
        )
        make_text_area(
            label="Briefly describe how the leaderboard evaluates models if it exists. Otherwise, enter N/A.",
            key_list=key_pref + ["leaderboard-description"],
            help="[free text; a paragraph] or N/A",
        )
        make_text_input(
            label="If known, provide the name of at least one person the reader can contact for questions about the dataset.",
            key_list=key_pref + ["contact-name"],
            help="[free text]",
        )
        make_text_input(
            label="If known, provide the email of at least one person the reader can contact for questions about the dataset.",
            key_list=key_pref + ["contact-email"],
            help="[free text]",
        )

    with st.expander("Languages and Intended Use", expanded=False):
        key_pref = ["overview", "languages"]
        overview["languages"] = overview.get("languages", {})
        make_radio(
            label="Is the dataset multilingual?",
            options=["no", "yes"],
            key_list=key_pref + ["is-multilingual"],
            help="More than one language present in all of the text fields",
        )
        make_multiselect(
            label="What languages/dialects are covered in the dataset?",
            key_list=key_pref + ["language-names"],
            # One option per language entry: its descriptions joined by ", ".
            options=[", ".join(x["description"]) for x in languages_bcp47],
            help="This is a comprehensive list of languages obtained from the BCP-47 standard list.",
        )
        make_text_area(
            label="What is the intended use of the dataset?",
            key_list=key_pref + ["intended-use"],
            help="[free text, paragraphs]",
        )
        make_selectbox(
            label="What is the license of the dataset?",
            key_list=key_pref + ["license"],
            options=license_list,
            # NOTE(review): `unkown` is written as an option label; left
            # unchanged in case it mirrors an entry of licenses.json.
            help="select `other` if missing from list, `unkown` if not provided",
        )
        make_text_input(
            label="What primary task does the dataset support?",
            key_list=key_pref + ["task"],
            help="[free text]",
        )
        make_text_area(
            label="Provide a short description of the communicative goal of a model trained for this task on this dataset.",
            key_list=key_pref + ["communicative"],
            help="[free text, a paragraph] (e.g., describe a restaurant from a structured representation of its attributes)",
        )

    with st.expander("Credit", expanded=False):
        key_pref = ["overview", "credit"]
        overview["credit"] = overview.get("credit", {})
        make_text_input(
            label="Who created the original dataset? List the people involved in collecting the dataset and their affiliation(s).",
            key_list=key_pref + ["creators"],
            help="name (affiliation); comma-separated",
        )
        make_text_input(
            label="Who funded the data creation?",
            key_list=key_pref + ["funding"],
            help="[free text] enter N/A if unknown",
        )
        make_text_input(
            label="Who contributed to the data card and adding the dataset to GEM? List the people+affiliations involved in creating this data card and who helped integrate this dataset into GEM.",
            key_list=key_pref + ["gem-added-by"],
            help="name (affiliation); comma-separated",
        )

    with st.expander("Structure", expanded=False):
        key_pref = ["overview", "structure"]
        overview["structure"] = overview.get("structure", {})
        # Multi-line help text; its lines start at column 0 on purpose so the
        # string content carries no leading indentation.
        data_fields_help = """
[free text; paragraphs]
- Mention their data type, and whether and how they are used as part of the generation pipeline.
- Describe each fields' attributes, such as whether they are at the character level or word level, whether they are contiguous or not, etc.
- If the datasets contain example IDs, state whether they have an inherent meaning, such as a mapping to other datasets or pointing to relationships between data points.
"""
        make_text_area(
            label="List and describe the fields present in the dataset.",
            key_list=key_pref + ["data-fields"],
            help=data_fields_help,
        )
        make_text_area(
            label="How was the dataset structure determined?",
            key_list=key_pref + ["structure-description"],
            help="[free text; paragraph]",
        )
        make_text_area(
            label="How were the labels chosen?",
            key_list=key_pref + ["structure-labels"],
            help="[free text; paragraph]",
        )
        make_text_area(
            label="Provide a JSON formatted example of a typical instance in the dataset.",
            key_list=key_pref + ["structure-example"],
            help="[JSON]",
        )
        make_text_area(
            label="Describe and name the splits in the dataset if there are more than one.",
            key_list=key_pref + ["structure-splits"],
            help="[free text, paragraphs] - As appropriate, provide any descriptive statistics for the features, such as size, average lengths of input and output.",
        )
        make_text_area(
            label="Describe any criteria for splitting the data, if used. If there are differences between the splits (e.g., if the training annotations are machine-generated and the dev and test ones are created by humans, or if different numbers of annotators contributed to each example), describe them here.",
            key_list=key_pref + ["structure-splits-criteria"],
            help="[free text, paragraphs]",
        )
        make_text_area(
            label="What does an outlier of the dataset in terms of length/perplexity/embedding look like?",
            key_list=key_pref + ["structure-outlier"],
            help="[free text + json formatted text/file for an example]",
        )
def overview_summary():
    """Render a completion report for the overview section of the data card.

    Inside an expanded "Dataset Overview Completion" expander, a markdown
    list shows how many entries exist under
    ``st.session_state.card_dict["overview"]`` in total and in each of the
    four sub-sections, next to the expected number of fields.

    A field counts as soon as its key is present in the sub-section
    dictionary; the stored value itself is not inspected. A missing
    "overview" entry or a missing sub-section is reported as 0.
    """
    with st.expander("Dataset Overview Completion", expanded=True):
        overview = st.session_state.card_dict.get("overview", {})

        # (title shown to the user, key inside "overview", expected count)
        sub_sections = (
            ("Where to find", "where", N_FIELDS_WHERE),
            ("Languages and Intended Use", "languages", N_FIELDS_LANGUAGES),
            ("Credit", "credit", N_FIELDS_CREDIT),
            ("Structure", "structure", N_FIELDS_STRUCTURE),
        )

        # The overall figure sums every dictionary stored under "overview".
        filled_total = sum(len(sub_section) for sub_section in overview.values())
        report_lines = [
            f"- **Overall completion:**\n - {filled_total} of {N_FIELDS} fields\n"
        ]
        for title, key, expected in sub_sections:
            filled = len(overview.get(key, {}))
            report_lines.append(
                f"- **Sub-section - {title}:**\n - {filled} of {expected} fields\n"
            )

        st.markdown("".join(report_lines))