Commit 28dd726
Yacine Jernite committed
Parent(s): 8a2ec29

results

Files changed: datacards/results.py (+28 -21)
datacards/results.py
CHANGED
@@ -21,6 +21,11 @@ def results_page():
     st.session_state.card_dict["results"]["results"] = st.session_state.card_dict[
         "results"
     ].get("results", {})
+    make_text_area(
+        label="What aspect of model ability can be measured with this dataset?",
+        key_list=key_pref + ["model-abilities"],
+        help="What kind of abilities should a model exhibit that performs well on the task of this dataset (e.g., reasoning capability, morphological inflection)?",
+    )
     make_multiselect(
         label="What metrics are typically used for this task?",
         key_list=key_pref + ["metrics"],
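Note: make_text_area and make_multiselect are helpers defined elsewhere in the repo, not in this diff. A minimal sketch of how such a helper might work, assuming it mirrors a Streamlit widget into the nested card_dict held in session state (the key-path walk and the widget key derived from key_list are illustrative assumptions, not the repo's actual implementation):

    import streamlit as st

    def make_text_area(label, key_list, help=None):
        # Render a text area and store its value under the nested path named
        # by key_list, e.g. ["results", "results", "model-abilities"].
        value = st.text_area(label=label, help=help, key="_".join(key_list))
        node = st.session_state.card_dict
        for key in key_list[:-1]:
            node = node.setdefault(key, {})  # walk (and create) the nested path
        node[key_list[-1]] = value

Under this reading, key_pref would be ["results", "results"], matching the card_dict["results"]["results"] initialization in the context lines above.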
@@ -39,8 +44,16 @@ def results_page():
         ],
         help="Select all metrics that are typically used when evaluating models for this task.",
     )
+    if "Other: Other Metrics" in st.session_state.card_dict["results"]["results"].get("metrics", []):
+        make_text_area(
+            label="Definitions of other metrics",
+            key_list=key_pref + ["other-metrics-definitions"],
+            help="If the evaluation strategies in the previous questions go beyond the list of metrics above, add descriptions and/or definitions for each metric.",
+        )
+    else:
+        st.session_state.card_dict["results"]["results"]["other-metrics-definitions"] = "N/A"
     make_text_area(
-        label="
+        label="List and describe the purpose of the metrics and evaluation methodology (including human evaluation) that the dataset creators used when introducing this task.",
         key_list=key_pref + ["original-evaluation"],
         help="When the generation task was not evaluated when this dataset was introduced, write N/A.",
     )
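The new conditional reads the just-stored multiselect value back out of session state, which is why the .get("metrics", []) guard matters: on the first render of the page the "metrics" key may not exist yet. The else branch still writes "N/A" so the card keeps a complete set of keys even when the follow-up question is hidden. A self-contained illustration of the pattern (plain Python; the function name is hypothetical):

    card = {"results": {"results": {}}}

    def show_other_metrics_followup(card, answer="..."):
        results = card["results"]["results"]
        # .get() guards against "metrics" not existing on the first pass.
        if "Other: Other Metrics" in results.get("metrics", []):
            results["other-metrics-definitions"] = answer
        else:
            # Keep the field present even when the question is skipped, so
            # downstream consumers of the card see every expected key.
            results["other-metrics-definitions"] = "N/A"

    show_other_metrics_followup(card)  # no "metrics" key yet -> "N/A"
    assert card["results"]["results"]["other-metrics-definitions"] == "N/A"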
@@ -50,26 +63,20 @@
         key_list=key_pref + ["has-previous-results"],
         help="Have papers evaluated models on this task? If no, write N/A for the following three questions.",
     )
-    [14 removed lines; their text did not survive the page extraction]
-    )
-    make_text_area(
-        label="What aspect of model ability can be measured with this dataset?",
-        key_list=key_pref + ["model-abilities"],
-        help="What kind of abilities should a model exhibit that performs well on the task of this dataset (e.g., reasoning capability, morphological inflection)?",
-    )
+    if st.session_state.card_dict["results"]["results"]["has-previous-results"] == "yes":
+        make_text_area(
+            label="What evaluation approaches have others used?",
+            key_list=key_pref + ["current-evaluation"],
+            help="If the current evaluation strategy diverges from the original, describe how models are being evaluated.",
+        )
+        make_text_area(
+            label="What are the most relevant previous results for this task/dataset",
+            key_list=key_pref + ["previous-results"],
+            help="List and describe the source and performance metrics for models on this dataset.",
+        )
+    else:
+        st.session_state.card_dict["results"]["results"]["current-evaluation"] = "N/A"
+        st.session_state.card_dict["results"]["results"]["previous-results"] = "N/A"
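The == "yes" comparison implies that the has-previous-results widget stores the raw option string. The else branch also matters for a Streamlit-specific reason: when the user flips the answer back to "no", the two follow-up widgets disappear, but values typed into them earlier would otherwise linger in card_dict; overwriting both fields with "N/A" keeps the exported card consistent. The same gating logic, restated as a pure function (a hypothetical helper, not code from the repo):

    def previous_results_fields(results, current, previous):
        # Fill the follow-up fields only when papers have evaluated models on
        # this task; otherwise pin both to "N/A", clobbering any stale input.
        if results.get("has-previous-results") == "yes":
            results["current-evaluation"] = current
            results["previous-results"] = previous
        else:
            results["current-evaluation"] = "N/A"
            results["previous-results"] = "N/A"
        return results

    assert previous_results_fields({}, "BLEU on new test split", "see intro paper") == {
        "current-evaluation": "N/A",
        "previous-results": "N/A",
    }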