Commit 28dd726
Yacine Jernite committed
Parent(s): 8a2ec29

results

Files changed: datacards/results.py (+28 -21)
datacards/results.py
CHANGED
@@ -21,6 +21,11 @@ def results_page():
     st.session_state.card_dict["results"]["results"] = st.session_state.card_dict[
         "results"
     ].get("results", {})
+    make_text_area(
+        label="What aspect of model ability can be measured with this dataset?",
+        key_list=key_pref + ["model-abilities"],
+        help="What kind of abilities should a model exhibit that performs well on the task of this dataset (e.g., reasoning capability, morphological inflection)?",
+    )
     make_multiselect(
         label="What metrics are typically used for this task?",
         key_list=key_pref + ["metrics"],
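Note: make_text_area and make_multiselect are helpers defined elsewhere in the repo, not in this diff. A minimal sketch of how such a helper might work, assuming it mirrors a Streamlit widget into the nested card_dict held in session state (the key-path walk and the widget key derived from key_list are illustrative assumptions, not the repo's actual implementation):

    import streamlit as st

    def make_text_area(label, key_list, help=None):
        # Render a text area and store its value under the nested path named
        # by key_list, e.g. ["results", "results", "model-abilities"].
        value = st.text_area(label=label, help=help, key="_".join(key_list))
        node = st.session_state.card_dict
        for key in key_list[:-1]:
            node = node.setdefault(key, {})  # walk (and create) the nested path
        node[key_list[-1]] = value

Under this reading, key_pref would be ["results", "results"], matching the card_dict["results"]["results"] initialization in the context lines above.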
@@ -39,8 +44,16 @@ def results_page():
         ],
         help="Select all metrics that are typically used when evaluating models for this task.",
     )
+    if "Other: Other Metrics" in st.session_state.card_dict["results"]["results"].get("metrics", []):
+        make_text_area(
+            label="Definitions of other metrics",
+            key_list=key_pref + ["other-metrics-definitions"],
+            help="If the evaluation strategies in the previous questions go beyond the list of metrics above, add descriptions and/or definitions for each metric.",
+        )
+    else:
+        st.session_state.card_dict["results"]["results"]["other-metrics-definitions"] = "N/A"
     make_text_area(
-        label="
+        label="List and describe the purpose of the metrics and evaluation methodology (including human evaluation) that the dataset creators used when introducing this task.",
         key_list=key_pref + ["original-evaluation"],
         help="When the generation task was not evaluated when this dataset was introduced, write N/A.",
     )
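The new conditional reads the just-stored multiselect value back out of session state, which is why the .get("metrics", []) guard matters: on the first render of the page the "metrics" key may not exist yet. The else branch still writes "N/A" so the card keeps a complete set of keys even when the follow-up question is hidden. A self-contained illustration of the pattern (plain Python; the function name is hypothetical):

    card = {"results": {"results": {}}}

    def show_other_metrics_followup(card, answer="..."):
        results = card["results"]["results"]
        # .get() guards against "metrics" not existing on the first pass.
        if "Other: Other Metrics" in results.get("metrics", []):
            results["other-metrics-definitions"] = answer
        else:
            # Keep the field present even when the question is skipped, so
            # downstream consumers of the card see every expected key.
            results["other-metrics-definitions"] = "N/A"

    show_other_metrics_followup(card)  # no "metrics" key yet -> "N/A"
    assert card["results"]["results"]["other-metrics-definitions"] == "N/A"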
@@ -50,26 +63,20 @@
         key_list=key_pref + ["has-previous-results"],
         help="Have papers evaluated models on this task? If no, write N/A for the following three questions.",
     )
-    [14 removed lines; their text did not survive the page extraction]
-    )
-    make_text_area(
-        label="What aspect of model ability can be measured with this dataset?",
-        key_list=key_pref + ["model-abilities"],
-        help="What kind of abilities should a model exhibit that performs well on the task of this dataset (e.g., reasoning capability, morphological inflection)?",
-    )
+    if st.session_state.card_dict["results"]["results"]["has-previous-results"] == "yes":
+        make_text_area(
+            label="What evaluation approaches have others used?",
+            key_list=key_pref + ["current-evaluation"],
+            help="If the current evaluation strategy diverges from the original, describe how models are being evaluated.",
+        )
+        make_text_area(
+            label="What are the most relevant previous results for this task/dataset",
+            key_list=key_pref + ["previous-results"],
+            help="List and describe the source and performance metrics for models on this dataset.",
+        )
+    else:
+        st.session_state.card_dict["results"]["results"]["current-evaluation"] = "N/A"
+        st.session_state.card_dict["results"]["results"]["previous-results"] = "N/A"
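The == "yes" comparison implies that the has-previous-results widget stores the raw option string. The else branch also matters for a Streamlit-specific reason: when the user flips the answer back to "no", the two follow-up widgets disappear, but values typed into them earlier would otherwise linger in card_dict; overwriting both fields with "N/A" keeps the exported card consistent. The same gating logic, restated as a pure function (a hypothetical helper, not code from the repo):

    def previous_results_fields(results, current, previous):
        # Fill the follow-up fields only when papers have evaluated models on
        # this task; otherwise pin both to "N/A", clobbering any stale input.
        if results.get("has-previous-results") == "yes":
            results["current-evaluation"] = current
            results["previous-results"] = previous
        else:
            results["current-evaluation"] = "N/A"
            results["previous-results"] = "N/A"
        return results

    assert previous_results_fields({}, "BLEU on new test split", "see intro paper") == {
        "current-evaluation": "N/A",
        "previous-results": "N/A",
    }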