Spaces:
Runtime error
Runtime error
Commit ·
7e267bf
1
Parent(s): 88838f6
update
Browse files
- src/backend/envs.py +3 -0
- src/backend/tasks/nq8/README.md +0 -0
- src/backend/tasks/nq8/nq8.yaml +32 -0
- src/backend/tasks/tqa8/README.md +51 -0
- src/backend/tasks/tqa8/tqa8.yaml +31 -0
src/backend/envs.py
CHANGED
|
@@ -37,6 +37,9 @@ class Tasks(Enum):
|
|
| 37 |
|
| 38 |
task10 = Task("memo-trap", "acc", "memo-trap", 0)
|
| 39 |
|
|
|
|
|
|
|
|
|
|
| 40 |
# NUM_FEWSHOT = 64 # Change with your few shot
|
| 41 |
|
| 42 |
|
|
|
|
| 37 |
|
| 38 |
task10 = Task("memo-trap", "acc", "memo-trap", 0)
|
| 39 |
|
| 40 |
+
task11 = Task("nq8", "em", "NQ Open 8", 8)
|
| 41 |
+
task12 = Task("tqa8", "em", "TriviaQA 8", 8)
|
| 42 |
+
|
| 43 |
# NUM_FEWSHOT = 64 # Change with your few shot
|
| 44 |
|
| 45 |
|
src/backend/tasks/nq8/README.md
ADDED
|
File without changes
|
src/backend/tasks/nq8/nq8.yaml
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
task: nq8
|
| 2 |
+
dataset_path: nq_open
|
| 3 |
+
output_type: generate_until
|
| 4 |
+
training_split: train
|
| 5 |
+
validation_split: validation
|
| 6 |
+
description: "Answer these questions:\n"
|
| 7 |
+
doc_to_text: "Q: {{question}}?\nA:"
|
| 8 |
+
doc_to_target: "{{answer}}" # TODO: should be multi-target
|
| 9 |
+
fewshot_delimiter: "\n"
|
| 10 |
+
generation_kwargs:
|
| 11 |
+
until:
|
| 12 |
+
- "\n"
|
| 13 |
+
- "."
|
| 14 |
+
- ","
|
| 15 |
+
do_sample: false
|
| 16 |
+
temperature: 0.0
|
| 17 |
+
filter_list:
|
| 18 |
+
- name: remove_whitespace
|
| 19 |
+
filter:
|
| 20 |
+
- function: remove_whitespace
|
| 21 |
+
- function: take_first
|
| 22 |
+
target_delimiter: " "
|
| 23 |
+
metric_list:
|
| 24 |
+
- metric: exact_match
|
| 25 |
+
aggregation: mean
|
| 26 |
+
higher_is_better: true
|
| 27 |
+
ignore_case: true
|
| 28 |
+
ignore_punctuation: true
|
| 29 |
+
regexes_to_ignore:
|
| 30 |
+
- "\b(an|a|the)\b"
|
| 31 |
+
metadata:
|
| 32 |
+
version: 0.0
|
src/backend/tasks/tqa8/README.md
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Trivia QA
|
| 2 |
+
|
| 3 |
+
### Paper
|
| 4 |
+
|
| 5 |
+
Title: `TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension`
|
| 6 |
+
Abstract: https://arxiv.org/abs/1705.03551
|
| 7 |
+
|
| 8 |
+
TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence
|
| 9 |
+
triples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts
|
| 10 |
+
and independently gathered evidence documents, six per question on average, that provide
|
| 11 |
+
high quality distant supervision for answering the questions.
|
| 12 |
+
|
| 13 |
+
Homepage: https://nlp.cs.washington.edu/triviaqa/
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
### Citation
|
| 17 |
+
|
| 18 |
+
```
|
| 19 |
+
@InProceedings{JoshiTriviaQA2017,
|
| 20 |
+
author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},
|
| 21 |
+
title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},
|
| 22 |
+
booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},
|
| 23 |
+
month = {July},
|
| 24 |
+
year = {2017},
|
| 25 |
+
address = {Vancouver, Canada},
|
| 26 |
+
publisher = {Association for Computational Linguistics},
|
| 27 |
+
}
|
| 28 |
+
```
|
| 29 |
+
|
| 30 |
+
### Groups and Tasks
|
| 31 |
+
|
| 32 |
+
#### Groups
|
| 33 |
+
|
| 34 |
+
* Not part of a group yet.
|
| 35 |
+
|
| 36 |
+
#### Tasks
|
| 37 |
+
|
| 38 |
+
* `tqa8`: `Generate and answer based on the question (8-shot variant of TriviaQA).`
|
| 39 |
+
|
| 40 |
+
### Checklist
|
| 41 |
+
|
| 42 |
+
For adding novel benchmarks/datasets to the library:
|
| 43 |
+
* [ ] Is the task an existing benchmark in the literature?
|
| 44 |
+
* [ ] Have you referenced the original paper that introduced the task?
|
| 45 |
+
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
If other tasks on this dataset are already supported:
|
| 49 |
+
* [ ] Is the "Main" variant of this task clearly denoted?
|
| 50 |
+
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
|
| 51 |
+
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
|
src/backend/tasks/tqa8/tqa8.yaml
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
task: tqa8
|
| 2 |
+
dataset_path: trivia_qa
|
| 3 |
+
dataset_name: rc.nocontext
|
| 4 |
+
output_type: generate_until
|
| 5 |
+
training_split: train
|
| 6 |
+
validation_split: validation
|
| 7 |
+
doc_to_text: "Question: {{question}}?\nAnswer:"
|
| 8 |
+
doc_to_target: "{{answer.aliases}}"
|
| 9 |
+
should_decontaminate: true
|
| 10 |
+
doc_to_decontamination_query: question
|
| 11 |
+
generation_kwargs:
|
| 12 |
+
until:
|
| 13 |
+
- "\n"
|
| 14 |
+
- "."
|
| 15 |
+
- ","
|
| 16 |
+
do_sample: false
|
| 17 |
+
temperature: 0.0
|
| 18 |
+
filter_list:
|
| 19 |
+
- name: remove_whitespace
|
| 20 |
+
filter:
|
| 21 |
+
- function: remove_whitespace
|
| 22 |
+
- function: take_first
|
| 23 |
+
target_delimiter: " "
|
| 24 |
+
metric_list:
|
| 25 |
+
- metric: exact_match
|
| 26 |
+
aggregation: mean
|
| 27 |
+
higher_is_better: true
|
| 28 |
+
ignore_case: true
|
| 29 |
+
ignore_punctuation: true
|
| 30 |
+
metadata:
|
| 31 |
+
version: 2.0
|