Spaces:
Runtime error
Runtime error
Yacine Jernite
committed on
Commit
·
8a2ec29
1
Parent(s):
9994065
first half done
Browse files
- datacards/curation.py +128 -105
- datacards/overview.py +26 -11
datacards/curation.py
CHANGED
|
@@ -11,7 +11,7 @@ from .streamlit_utils import (
|
|
| 11 |
)
|
| 12 |
|
| 13 |
N_FIELDS_ORIGINAL = 4
|
| 14 |
-
N_FIELDS_LANGUAGE =
|
| 15 |
N_FIELDS_ANNOTATIONS = 10
|
| 16 |
N_FIELDS_CONSENT = 4
|
| 17 |
N_FIELDS_PII = 7
|
|
@@ -52,11 +52,14 @@ def curation_page():
|
|
| 52 |
key_list=key_pref + ["is-aggregated"],
|
| 53 |
help="e.g. Wikipedia, movi dialogues, etc.",
|
| 54 |
)
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
with st.expander("Language Data", expanded=False):
|
| 62 |
key_pref = ["curation", "language"]
|
|
@@ -74,38 +77,49 @@ def curation_page():
|
|
| 74 |
],
|
| 75 |
key_list=key_pref + ["obtained"],
|
| 76 |
)
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
"
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
make_text_area(
|
| 100 |
label="What further information do we have on the language producers?",
|
| 101 |
key_list=key_pref + ["producers-description"],
|
| 102 |
help="Provide a description of the context in which the language was produced and who produced it.",
|
| 103 |
)
|
| 104 |
-
make_text_input(
|
| 105 |
-
label="If text was machine-generated for the dataset, provide a link to the generation method if available (N/A otherwise).",
|
| 106 |
-
key_list=key_pref + ["machine-generated"],
|
| 107 |
-
help="if the generation code is unavailable, enter N/A",
|
| 108 |
-
)
|
| 109 |
make_selectbox(
|
| 110 |
label="Was the text validated by a different worker or a data curator?",
|
| 111 |
options=[
|
|
@@ -117,16 +131,6 @@ def curation_page():
|
|
| 117 |
key_list=key_pref + ["validated"],
|
| 118 |
help="this question is about human or human-in-the-loop validation only",
|
| 119 |
)
|
| 120 |
-
make_multiselect(
|
| 121 |
-
label="In what kind of organization did the curation happen?",
|
| 122 |
-
options=["industry", "academic", "independent", "other"],
|
| 123 |
-
key_list=key_pref + ["organization-type"],
|
| 124 |
-
)
|
| 125 |
-
make_text_input(
|
| 126 |
-
label="Name the organization(s).",
|
| 127 |
-
key_list=key_pref + ["organization-names"],
|
| 128 |
-
help="comma-separated",
|
| 129 |
-
)
|
| 130 |
make_text_area(
|
| 131 |
label="How was the text data pre-processed? (Enter N/A if the text was not pre-processed)",
|
| 132 |
key_list=key_pref + ["pre-processed"],
|
|
@@ -137,11 +141,14 @@ def curation_page():
|
|
| 137 |
options=["not filtered", "manually", "algorithmically", "hybrid"],
|
| 138 |
key_list=key_pref + ["is-filtered"],
|
| 139 |
)
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
|
|
|
|
|
|
|
|
|
| 145 |
|
| 146 |
with st.expander("Structured Annotations", expanded=False):
|
| 147 |
key_pref = ["curation", "annotations"]
|
|
@@ -149,72 +156,88 @@ def curation_page():
|
|
| 149 |
"annotations"
|
| 150 |
] = st.session_state.card_dict["curation"].get("annotations", {})
|
| 151 |
|
| 152 |
-
|
| 153 |
label="Does the dataset have additional annotations for each instance?",
|
| 154 |
options=["none", "found", "automatically created", "expert created", "crowd-sourced"],
|
| 155 |
key_list=key_pref + ["origin"],
|
| 156 |
help="Was any additional data collected?",
|
| 157 |
)
|
| 158 |
|
| 159 |
-
# TODO: If yes....
|
| 160 |
# If expert or crowdsourced, this branch
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
|
| 201 |
-
make_text_area(
|
| 202 |
-
label="Purpose and values for each annoation",
|
| 203 |
-
key_list=key_pref + ["values"],
|
| 204 |
-
help="Describe the purpose and possible values for each kind of annotation.",
|
| 205 |
-
)
|
| 206 |
-
make_multiselect(
|
| 207 |
-
label="Quality control measures?",
|
| 208 |
-
options=["none", "unknown", "validated by another rater", "validated by data curators", "validated through automated script", "other"],
|
| 209 |
-
key_list=key_pref + ["quality-control"],
|
| 210 |
-
help="How was annotation quality controlled for / what control measures were put in place to ensure annotation quality?",
|
| 211 |
-
)
|
| 212 |
-
# TODO: If not none / unknown
|
| 213 |
-
make_text_area(
|
| 214 |
-
label="Describe the quality control measures that were taken.",
|
| 215 |
-
key_list=key_pref + ["quality-control-details"],
|
| 216 |
-
help="Describe how quality was ensured in the data curation process.",
|
| 217 |
-
)
|
| 218 |
|
| 219 |
with st.expander("Consent", expanded=False):
|
| 220 |
key_pref = ["curation", "consent"]
|
|
|
|
| 11 |
)
|
| 12 |
|
| 13 |
N_FIELDS_ORIGINAL = 4
|
| 14 |
+
N_FIELDS_LANGUAGE = 10
|
| 15 |
N_FIELDS_ANNOTATIONS = 10
|
| 16 |
N_FIELDS_CONSENT = 4
|
| 17 |
N_FIELDS_PII = 7
|
|
|
|
| 52 |
key_list=key_pref + ["is-aggregated"],
|
| 53 |
help="e.g. Wikipedia, movi dialogues, etc.",
|
| 54 |
)
|
| 55 |
+
if st.session_state.card_dict["curation"]["original"]["is-aggregated"] == "yes":
|
| 56 |
+
make_text_area(
|
| 57 |
+
label="List the sources (one per line)",
|
| 58 |
+
key_list=key_pref + ["aggregated-sources"],
|
| 59 |
+
help="One source per line",
|
| 60 |
+
)
|
| 61 |
+
else:
|
| 62 |
+
st.session_state.card_dict["curation"]["original"]["aggregated-sources"] = "N/A"
|
| 63 |
|
| 64 |
with st.expander("Language Data", expanded=False):
|
| 65 |
key_pref = ["curation", "language"]
|
|
|
|
| 77 |
],
|
| 78 |
key_list=key_pref + ["obtained"],
|
| 79 |
)
|
| 80 |
+
if "Found" in st.session_state.card_dict["curation"]["language"].get("obtained", []):
|
| 81 |
+
make_multiselect(
|
| 82 |
+
label="If found, where from?",
|
| 83 |
+
options=["Multiple websites", "Single website", "Offline media collection", "Other"],
|
| 84 |
+
key_list=key_pref + ["found"],
|
| 85 |
+
help="select N/A if none of the language data was found",
|
| 86 |
+
)
|
| 87 |
+
else:
|
| 88 |
+
st.session_state.card_dict["curation"]["language"]["found"] = []
|
| 89 |
+
if "Crowdsourced" in st.session_state.card_dict["curation"]["language"].get("obtained", []):
|
| 90 |
+
make_multiselect(
|
| 91 |
+
label="If crowdsourced, where from?",
|
| 92 |
+
options=[
|
| 93 |
+
"Amazon Mechanical Turk",
|
| 94 |
+
"Other crowdworker platform",
|
| 95 |
+
"Participatory experiment",
|
| 96 |
+
"Other",
|
| 97 |
+
],
|
| 98 |
+
key_list=key_pref + ["crowdsourced"],
|
| 99 |
+
help="select N/A if none of the language data was crowdsourced",
|
| 100 |
+
)
|
| 101 |
+
else:
|
| 102 |
+
st.session_state.card_dict["curation"]["language"]["crowdsourced"] = []
|
| 103 |
+
if "Created for the dataset" in st.session_state.card_dict["curation"]["language"].get("obtained", []):
|
| 104 |
+
make_text_area(
|
| 105 |
+
label="If created for the dataset, describe the creation process.",
|
| 106 |
+
key_list=key_pref + ["created"],
|
| 107 |
+
)
|
| 108 |
+
else:
|
| 109 |
+
st.session_state.card_dict["curation"]["language"]["created"] = "N/A"
|
| 110 |
+
if "Machine-generated" in st.session_state.card_dict["curation"]["language"].get("obtained", []):
|
| 111 |
+
make_text_input(
|
| 112 |
+
label="If text was machine-generated for the dataset, provide a link to the generation method if available (N/A otherwise).",
|
| 113 |
+
key_list=key_pref + ["machine-generated"],
|
| 114 |
+
help="if the generation code is unavailable, enter N/A",
|
| 115 |
+
)
|
| 116 |
+
else:
|
| 117 |
+
st.session_state.card_dict["curation"]["language"]["machine-generated"] = "N/A"
|
| 118 |
make_text_area(
|
| 119 |
label="What further information do we have on the language producers?",
|
| 120 |
key_list=key_pref + ["producers-description"],
|
| 121 |
help="Provide a description of the context in which the language was produced and who produced it.",
|
| 122 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
make_selectbox(
|
| 124 |
label="Was the text validated by a different worker or a data curator?",
|
| 125 |
options=[
|
|
|
|
| 131 |
key_list=key_pref + ["validated"],
|
| 132 |
help="this question is about human or human-in-the-loop validation only",
|
| 133 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
make_text_area(
|
| 135 |
label="How was the text data pre-processed? (Enter N/A if the text was not pre-processed)",
|
| 136 |
key_list=key_pref + ["pre-processed"],
|
|
|
|
| 141 |
options=["not filtered", "manually", "algorithmically", "hybrid"],
|
| 142 |
key_list=key_pref + ["is-filtered"],
|
| 143 |
)
|
| 144 |
+
if st.session_state.card_dict["curation"]["language"]["is-filtered"] == "not filtered":
|
| 145 |
+
st.session_state.card_dict["curation"]["language"]["filtered-criteria"] = "N/A"
|
| 146 |
+
else:
|
| 147 |
+
make_text_area(
|
| 148 |
+
label="What were the selection criteria?",
|
| 149 |
+
key_list=key_pref + ["filtered-criteria"],
|
| 150 |
+
help="Describe the process for selecting instances to include in the dataset, including any tools used.",
|
| 151 |
+
)
|
| 152 |
|
| 153 |
with st.expander("Structured Annotations", expanded=False):
|
| 154 |
key_pref = ["curation", "annotations"]
|
|
|
|
| 156 |
"annotations"
|
| 157 |
] = st.session_state.card_dict["curation"].get("annotations", {})
|
| 158 |
|
| 159 |
+
make_selectbox(
|
| 160 |
label="Does the dataset have additional annotations for each instance?",
|
| 161 |
options=["none", "found", "automatically created", "expert created", "crowd-sourced"],
|
| 162 |
key_list=key_pref + ["origin"],
|
| 163 |
help="Was any additional data collected?",
|
| 164 |
)
|
| 165 |
|
|
|
|
| 166 |
# If expert or crowdsourced, this branch
|
| 167 |
+
if st.session_state.card_dict["curation"]["annotations"]["origin"] in ["expert created", "crowd-sourced"]:
|
| 168 |
+
make_selectbox(
|
| 169 |
+
label="What is the number of raters ",
|
| 170 |
+
options=["unknown", "1", "2<n<10", "11<n<50", "51<n<100", "n>100"],
|
| 171 |
+
key_list=key_pref + ["rater-number"],
|
| 172 |
+
help="How many raters were used to create the additional annotations?",
|
| 173 |
+
)
|
| 174 |
+
make_text_area(
|
| 175 |
+
label="Describe the qualifications required of an annotator.",
|
| 176 |
+
key_list=key_pref + ["rater-qualifications"],
|
| 177 |
+
help="e.g., languages or dialects they speak, education requirements, number of HITs (if MTurk).",
|
| 178 |
+
)
|
| 179 |
+
make_selectbox(
|
| 180 |
+
label="How many annotators saw each training example?",
|
| 181 |
+
options=["0", "1", "2", "3", "4", "5", ">5"],
|
| 182 |
+
key_list=key_pref + ["rater-training-num"],
|
| 183 |
+
help="",
|
| 184 |
+
)
|
| 185 |
+
make_selectbox(
|
| 186 |
+
label="How many annotators saw each test example?",
|
| 187 |
+
options=["0", "1", "2", "3", "4", "5", ">5"],
|
| 188 |
+
key_list=key_pref + ["rater-test-num"],
|
| 189 |
+
help="",
|
| 190 |
+
)
|
| 191 |
+
make_radio(
|
| 192 |
+
label="Was an annotation service used?",
|
| 193 |
+
options=["no", "yes", "unknown"],
|
| 194 |
+
key_list=key_pref + ["rater-annotation-service-bool"],
|
| 195 |
+
help="",
|
| 196 |
+
)
|
| 197 |
+
if st.session_state.card_dict["curation"]["annotations"]["rater-annotation-service-bool"] == "yes":
|
| 198 |
+
make_multiselect(
|
| 199 |
+
label="Which annotation services were used?",
|
| 200 |
+
options=[
|
| 201 |
+
"Amazon Mechanical Turk", "Prolific Academic",
|
| 202 |
+
"Upwork", "Appen", "Crowdflower", "other"
|
| 203 |
+
],
|
| 204 |
+
key_list=key_pref + ["rater-annotation-service"],
|
| 205 |
+
)
|
| 206 |
+
else:
|
| 207 |
+
st.session_state.card_dict["curation"]["annotations"]["rater-annotation-service"] = []
|
| 208 |
+
else:
|
| 209 |
+
st.session_state.card_dict["curation"]["annotations"]["rater-number"] = "N/A"
|
| 210 |
+
st.session_state.card_dict["curation"]["annotations"]["rater-qualifications"] = "N/A"
|
| 211 |
+
st.session_state.card_dict["curation"]["annotations"]["rater-training-num"] = "N/A"
|
| 212 |
+
st.session_state.card_dict["curation"]["annotations"]["rater-test-num"] = "N/A"
|
| 213 |
+
st.session_state.card_dict["curation"]["annotations"]["rater-annotation-service-bool"] = "no"
|
| 214 |
+
st.session_state.card_dict["curation"]["annotations"]["rater-annotation-service"] = []
|
| 215 |
|
| 216 |
+
if st.session_state.card_dict["curation"]["annotations"]["origin"] != "none":
|
| 217 |
+
make_text_area(
|
| 218 |
+
label="Purpose and values for each annoation",
|
| 219 |
+
key_list=key_pref + ["values"],
|
| 220 |
+
help="Describe the purpose and possible values for each kind of annotation.",
|
| 221 |
+
)
|
| 222 |
+
make_selectbox(
|
| 223 |
+
label="Quality control measures?",
|
| 224 |
+
options=["none", "unknown", "validated by another rater", "validated by data curators", "validated through automated script", "other"],
|
| 225 |
+
key_list=key_pref + ["quality-control"],
|
| 226 |
+
help="How was annotation quality controlled for / what control measures were put in place to ensure annotation quality?",
|
| 227 |
+
)
|
| 228 |
+
if st.session_state.card_dict["curation"]["annotations"]["quality-control"] in ["none", "unknown"]:
|
| 229 |
+
st.session_state.card_dict["curation"]["annotations"]["quality-control-details"] = "N/A"
|
| 230 |
+
else:
|
| 231 |
+
make_text_area(
|
| 232 |
+
label="Describe the quality control measures that were taken.",
|
| 233 |
+
key_list=key_pref + ["quality-control-details"],
|
| 234 |
+
help="Describe how quality was ensured in the data curation process.",
|
| 235 |
+
)
|
| 236 |
+
else:
|
| 237 |
+
st.session_state.card_dict["curation"]["annotations"]["values"] = "N/A"
|
| 238 |
+
st.session_state.card_dict["curation"]["annotations"]["quality-control"] = []
|
| 239 |
+
st.session_state.card_dict["curation"]["annotations"]["quality-control-details"] = "N/A"
|
| 240 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 241 |
|
| 242 |
with st.expander("Consent", expanded=False):
|
| 243 |
key_pref = ["curation", "consent"]
|
datacards/overview.py
CHANGED
|
@@ -13,7 +13,7 @@ from .streamlit_utils import (
|
|
| 13 |
|
| 14 |
N_FIELDS_WHERE = 9
|
| 15 |
N_FIELDS_LANGUAGES = 8
|
| 16 |
-
N_FIELDS_CREDIT =
|
| 17 |
N_FIELDS_STRUCTURE = 7
|
| 18 |
|
| 19 |
N_FIELDS = N_FIELDS_WHERE + N_FIELDS_LANGUAGES + N_FIELDS_CREDIT + N_FIELDS_STRUCTURE
|
|
@@ -65,16 +65,20 @@ def overview_page():
|
|
| 65 |
key_list=key_pref + ["has-leaderboard"],
|
| 66 |
help="If no, enter N/A for the following two fields",
|
| 67 |
)
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
make_text_input(
|
| 79 |
label="If known, provide the name of at least one person the reader can contact for questions about the dataset.",
|
| 80 |
key_list=key_pref + ["contact-name"],
|
|
@@ -127,6 +131,7 @@ def overview_page():
|
|
| 127 |
label="What primary task does the dataset support?",
|
| 128 |
key_list=key_pref + ["task"],
|
| 129 |
options=[
|
|
|
|
| 130 |
"Content Transfer",
|
| 131 |
"Data-to-Text",
|
| 132 |
"Dialog Response Generation",
|
|
@@ -150,6 +155,16 @@ def overview_page():
|
|
| 150 |
st.session_state.card_dict["overview"][
|
| 151 |
"credit"
|
| 152 |
] = st.session_state.card_dict.get("credit", {})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
make_text_input(
|
| 154 |
label="Who created the original dataset? List the people involved in collecting the dataset and their affiliation(s).",
|
| 155 |
key_list=key_pref + ["creators"],
|
|
|
|
| 13 |
|
| 14 |
N_FIELDS_WHERE = 9
|
| 15 |
N_FIELDS_LANGUAGES = 8
|
| 16 |
+
N_FIELDS_CREDIT = 5
|
| 17 |
N_FIELDS_STRUCTURE = 7
|
| 18 |
|
| 19 |
N_FIELDS = N_FIELDS_WHERE + N_FIELDS_LANGUAGES + N_FIELDS_CREDIT + N_FIELDS_STRUCTURE
|
|
|
|
| 65 |
key_list=key_pref + ["has-leaderboard"],
|
| 66 |
help="If no, enter N/A for the following two fields",
|
| 67 |
)
|
| 68 |
+
if st.session_state.card_dict["overview"]["where"]["has-leaderboard"] == "yes":
|
| 69 |
+
make_text_input(
|
| 70 |
+
label="Provide a link to the leaderboard.",
|
| 71 |
+
key_list=key_pref + ["leaderboard-url"],
|
| 72 |
+
help="[URL] or N/A",
|
| 73 |
+
)
|
| 74 |
+
make_text_area(
|
| 75 |
+
label="Briefly describe how the leaderboard evaluates models.",
|
| 76 |
+
key_list=key_pref + ["leaderboard-description"],
|
| 77 |
+
help="[free text; a paragraph] or N/A",
|
| 78 |
+
)
|
| 79 |
+
else:
|
| 80 |
+
st.session_state.card_dict["overview"]["where"]["leaderboard-url"] = "N/A"
|
| 81 |
+
st.session_state.card_dict["overview"]["where"]["leaderboard-description"] = "N/A"
|
| 82 |
make_text_input(
|
| 83 |
label="If known, provide the name of at least one person the reader can contact for questions about the dataset.",
|
| 84 |
key_list=key_pref + ["contact-name"],
|
|
|
|
| 131 |
label="What primary task does the dataset support?",
|
| 132 |
key_list=key_pref + ["task"],
|
| 133 |
options=[
|
| 134 |
+
"", # default needs to be invalid value to make sure people actually fill in
|
| 135 |
"Content Transfer",
|
| 136 |
"Data-to-Text",
|
| 137 |
"Dialog Response Generation",
|
|
|
|
| 155 |
st.session_state.card_dict["overview"][
|
| 156 |
"credit"
|
| 157 |
] = st.session_state.card_dict.get("credit", {})
|
| 158 |
+
make_multiselect(
|
| 159 |
+
label="In what kind of organization did the dataset curation happen?",
|
| 160 |
+
options=["industry", "academic", "independent", "other"],
|
| 161 |
+
key_list=key_pref + ["organization-type"],
|
| 162 |
+
)
|
| 163 |
+
make_text_input(
|
| 164 |
+
label="Name the organization(s).",
|
| 165 |
+
key_list=key_pref + ["organization-names"],
|
| 166 |
+
help="comma-separated",
|
| 167 |
+
)
|
| 168 |
make_text_input(
|
| 169 |
label="Who created the original dataset? List the people involved in collecting the dataset and their affiliation(s).",
|
| 170 |
key_list=key_pref + ["creators"],
|