Spaces:
Runtime error
Runtime error
Yacine Jernite
committed on
Commit
·
dd1054a
1
Parent(s):
05d58bc
considerations
Browse files- datacards/considerations.py +26 -25
datacards/considerations.py
CHANGED
|
@@ -9,8 +9,8 @@ from .streamlit_utils import (
|
|
| 9 |
)
|
| 10 |
|
| 11 |
N_FIELDS_PII = 1
|
| 12 |
-
N_FIELDS_LICENSES =
|
| 13 |
-
N_FIELDS_LIMITATIONS =
|
| 14 |
|
| 15 |
N_FIELDS = N_FIELDS_PII + N_FIELDS_LICENSES + N_FIELDS_LIMITATIONS
|
| 16 |
|
|
@@ -25,7 +25,7 @@ def considerations_page():
|
|
| 25 |
"considerations"
|
| 26 |
].get("pii", {})
|
| 27 |
make_text_area(
|
| 28 |
-
label="Considering your answers to the PII part of the Data Curation Section, describe any potential privacy risks
|
| 29 |
key_list=key_pref+["risks-description"],
|
| 30 |
help="In terms for example of having models memorize private information of data subjects or other breaches of privacy."
|
| 31 |
)
|
|
@@ -37,7 +37,7 @@ def considerations_page():
|
|
| 37 |
].get("licenses", {})
|
| 38 |
|
| 39 |
make_multiselect(
|
| 40 |
-
label="
|
| 41 |
options=[
|
| 42 |
"public domain",
|
| 43 |
"multiple licenses",
|
|
@@ -52,42 +52,43 @@ def considerations_page():
|
|
| 52 |
help="Does the license restrict how the dataset can be used?",
|
| 53 |
)
|
| 54 |
make_multiselect(
|
| 55 |
-
label="
|
| 56 |
-
options=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
key_list=key_pref + ["data-copyright"],
|
| 58 |
-
help="
|
| 59 |
)
|
| 60 |
|
| 61 |
-
with st.expander("Known
|
| 62 |
key_pref = ["considerations", "limitations"]
|
| 63 |
st.session_state.card_dict["considerations"]["limitations"] = st.session_state.card_dict[
|
| 64 |
"considerations"
|
| 65 |
].get("limitations", {})
|
| 66 |
-
|
| 67 |
-
# TODO: Form proper language
|
| 68 |
-
|
| 69 |
make_text_area(
|
| 70 |
-
label="
|
|
|
|
| 71 |
key_list=key_pref + ["data-technical-limitations"],
|
| 72 |
help="",
|
| 73 |
)
|
| 74 |
-
|
| 75 |
make_text_area(
|
| 76 |
-
label="
|
|
|
|
| 77 |
key_list=key_pref + ["data-unsuited-applications"],
|
| 78 |
-
help="",
|
| 79 |
)
|
| 80 |
-
|
| 81 |
make_text_area(
|
| 82 |
-
label="What are discouraged use cases of the dataset?"
|
|
|
|
| 83 |
key_list=key_pref + ["data-discouraged-use"],
|
| 84 |
-
help="",
|
| 85 |
-
)
|
| 86 |
-
|
| 87 |
-
make_text_area(
|
| 88 |
-
label="Citation of work identifying these limitations",
|
| 89 |
-
key_list=key_pref + ["data-citations-limitations"],
|
| 90 |
-
help="",
|
| 91 |
)
|
| 92 |
|
| 93 |
|
|
@@ -104,5 +105,5 @@ def considerations_summary():
|
|
| 104 |
)
|
| 105 |
completion_markdown += f"- **Sub-section - PII Risks and Liability:**\n - {len(st.session_state.card_dict.get('considerations', {}).get('pii', {}))} of {N_FIELDS_PII} fields\n"
|
| 106 |
completion_markdown += f"- **Sub-section - Licenses:**\n - {len(st.session_state.card_dict.get('considerations', {}).get('licenses', {}))} of {N_FIELDS_LICENSES} fields\n"
|
| 107 |
-
completion_markdown += f"- **Sub-section - Known
|
| 108 |
st.markdown(completion_markdown)
|
|
|
|
| 9 |
)
|
| 10 |
|
| 11 |
N_FIELDS_PII = 1
|
| 12 |
+
N_FIELDS_LICENSES = 2
|
| 13 |
+
N_FIELDS_LIMITATIONS = 3
|
| 14 |
|
| 15 |
N_FIELDS = N_FIELDS_PII + N_FIELDS_LICENSES + N_FIELDS_LIMITATIONS
|
| 16 |
|
|
|
|
| 25 |
"considerations"
|
| 26 |
].get("pii", {})
|
| 27 |
make_text_area(
|
| 28 |
+
label="Considering your answers to the PII part of the Data Curation Section, describe any potential privacy risks to the data subjects and creators when using the dataset.",
|
| 29 |
key_list=key_pref+["risks-description"],
|
| 30 |
help="In terms for example of having models memorize private information of data subjects or other breaches of privacy."
|
| 31 |
)
|
|
|
|
| 37 |
].get("licenses", {})
|
| 38 |
|
| 39 |
make_multiselect(
|
| 40 |
+
label="Based on your answers in the Intended Use part of the Data Overview Section, which of the following best describe the copyright and licensing status of the dataset?",
|
| 41 |
options=[
|
| 42 |
"public domain",
|
| 43 |
"multiple licenses",
|
|
|
|
| 52 |
help="Does the license restrict how the dataset can be used?",
|
| 53 |
)
|
| 54 |
make_multiselect(
|
| 55 |
+
label="Based on your answers in the Language part of the Data Curation Section, which of the following best describe the copyright and licensing status of the underlying language data?",
|
| 56 |
+
options=[
|
| 57 |
+
"public domain",
|
| 58 |
+
"multiple licenses",
|
| 59 |
+
"copyright - all rights reserved",
|
| 60 |
+
"open license - commercial use allowed",
|
| 61 |
+
"research use only",
|
| 62 |
+
"non-commercial use only",
|
| 63 |
+
"do not distribute",
|
| 64 |
+
"other",
|
| 65 |
+
],
|
| 66 |
key_list=key_pref + ["data-copyright"],
|
| 67 |
+
help="For example if the dataset uses data from Wikipedia, we are asking about the status of Wikipedia text in general.",
|
| 68 |
)
|
| 69 |
|
| 70 |
+
with st.expander("Known Technical Limitations", expanded=False):
|
| 71 |
key_pref = ["considerations", "limitations"]
|
| 72 |
st.session_state.card_dict["considerations"]["limitations"] = st.session_state.card_dict[
|
| 73 |
"considerations"
|
| 74 |
].get("limitations", {})
|
|
|
|
|
|
|
|
|
|
| 75 |
make_text_area(
|
| 76 |
+
label="Describe any known technical limitations, such as spurious correlations, train/test overlap, annotation biases, or mis-annotations? " + \
|
| 77 |
+
"Describe them and cite the works that first identified these limitations when possible.",
|
| 78 |
key_list=key_pref + ["data-technical-limitations"],
|
| 79 |
help="",
|
| 80 |
)
|
|
|
|
| 81 |
make_text_area(
|
| 82 |
+
label="When using a model trained on this dataset in a setting where users or the public may interact with its predictions, what are some pitfalls to look out for? " + \
|
| 83 |
+
"In particular, describe some applications of the general task featured in this dataset that its curation or properties make it less suitable for.",
|
| 84 |
key_list=key_pref + ["data-unsuited-applications"],
|
| 85 |
+
help="For example, outline language varieties or domains that the model might underperform for.",
|
| 86 |
)
|
|
|
|
| 87 |
make_text_area(
|
| 88 |
+
label="What are some discouraged use cases of a model trained to maximize the proposed metrics on this dataset? " +
|
| 89 |
+
"In particular, think about settings where decisions made by a model that performs reasonably well on the metric my still have strong negative consequences for user or members of the public.",
|
| 90 |
key_list=key_pref + ["data-discouraged-use"],
|
| 91 |
+
help="For example, think about application settings where certain types of mistakes (such as missing a negation) might have a particularly strong negative impact but are not particularly singled out by the aggregated evaluation.",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
)
|
| 93 |
|
| 94 |
|
|
|
|
| 105 |
)
|
| 106 |
completion_markdown += f"- **Sub-section - PII Risks and Liability:**\n - {len(st.session_state.card_dict.get('considerations', {}).get('pii', {}))} of {N_FIELDS_PII} fields\n"
|
| 107 |
completion_markdown += f"- **Sub-section - Licenses:**\n - {len(st.session_state.card_dict.get('considerations', {}).get('licenses', {}))} of {N_FIELDS_LICENSES} fields\n"
|
| 108 |
+
completion_markdown += f"- **Sub-section - Known Technical Limitations:**\n - {len(st.session_state.card_dict.get('considerations', {}).get('limitations', {}))} of {N_FIELDS_LIMITATIONS} fields\n"
|
| 109 |
st.markdown(completion_markdown)
|