Merge branch 'main' of https://huggingface.co/spaces/CONDA-Workshop/Data-Contamination-Report into pr/13
Browse files- .gitignore +2 -1
- contamination_report.csv +42 -17
.gitignore
CHANGED
|
@@ -1,2 +1,3 @@
|
|
| 1 |
*.pyc
|
| 2 |
-
*.json
|
|
|
|
|
|
| 1 |
*.pyc
|
| 2 |
+
*.json
|
| 3 |
+
*.lock
|
contamination_report.csv
CHANGED
|
@@ -1,5 +1,22 @@
|
|
| 1 |
Evaluation Dataset;Subset;Contaminated Source;Model or corpus;Train Split;Development Split;Test Split;Approach;Reference;PR
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
allenai/ai2_arc;;CommonCrawl;corpus;;;28.7;data-based;https://arxiv.org/abs/2310.17589;5
|
| 5 |
tau/commonsense_qa;;CommonCrawl;corpus;;1.6;;data-based;https://arxiv.org/abs/2310.17589;5
|
|
@@ -436,32 +453,40 @@ zest;;EleutherAI/pile;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
|
|
| 436 |
zest;;togethercomputer/RedPajama-Data-V2;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
|
| 437 |
|
| 438 |
|
| 439 |
-
imdb;;GPT-4;model;100.0;;0.0;model-based;https://arxiv.org/
|
| 440 |
-
imdb;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/
|
| 441 |
|
| 442 |
-
ag_news;;GPT-4;model;100.0;;100.0;model-based;https://arxiv.org/
|
| 443 |
-
ag_news;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/
|
| 444 |
|
| 445 |
-
yelp_review_full;;GPT-4;model;0.0;;0.0;model-based;https://arxiv.org/
|
| 446 |
-
yelp_review_full;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/
|
| 447 |
|
| 448 |
-
nyu-mll/glue;rte;GPT-4;model;100.0;;0.0;model-based;https://arxiv.org/
|
| 449 |
-
nyu-mll/glue;rte;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/
|
| 450 |
|
| 451 |
-
nyu-mll/glue;wnli;GPT-4;model;100.0;;100.0;model-based;https://arxiv.org/
|
| 452 |
-
nyu-mll/glue;wnli;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/
|
| 453 |
|
| 454 |
-
samsum;;GPT-4;model;0.0;;0.0;model-based;https://arxiv.org/
|
| 455 |
-
samsum;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/
|
| 456 |
|
| 457 |
-
EdinburghNLP/xsum;;GPT-4;model;0.0;;100.0;model-based;https://arxiv.org/
|
| 458 |
-
EdinburghNLP/xsum;;GPT-3.5;model;0.0;;100.0;model-based;https://arxiv.org/
|
| 459 |
|
| 460 |
-
bigbio/mednli;;GPT-4;model;0.0;0.0;0.0;model-based;https://arxiv.org/
|
| 461 |
-
bigbio/mednli;;GPT-3.5;model;0.0;0.0;0.0;model-based;https://arxiv.org/
|
| 462 |
|
| 463 |
RadNLI;;GPT-4;model;0.0;0.0;0.0;model-based;https://arxiv.org/pdf/2308.08493;8
|
| 464 |
RadNLI;;GPT-3.5;model;0.0;0.0;0.0;model-based;https://arxiv.org/pdf/2308.08493;8
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 465 |
|
| 466 |
quac;;GPT-3;model;;99.0;;data-based;https://arxiv.org/abs/2005.14165;13
|
| 467 |
rajpurkar/squad_v2;;GPT-3;model;;94.0;;data-based;https://arxiv.org/abs/2005.14165;13
|
|
@@ -571,4 +596,4 @@ ibragim-bad/arc_easy;;FLAN;model;;20.2;;data-based;https://arxiv.org/abs/2109.01
|
|
| 571 |
ibragim-bad/arc_challenge;;FLAN;model;;15.6;;data-based;https://arxiv.org/abs/2109.01652;13
|
| 572 |
facebook/anli;dev_r3;FLAN;model;;40.2;;data-based;https://arxiv.org/abs/2109.01652;13
|
| 573 |
facebook/anli;dev_r2;FLAN;model;;97.9;;data-based;https://arxiv.org/abs/2109.01652;13
|
| 574 |
-
facebook/anli;dev_r1;FLAN;model;;98.6;;data-based;https://arxiv.org/abs/2109.01652;13
|
|
|
|
| 1 |
Evaluation Dataset;Subset;Contaminated Source;Model or corpus;Train Split;Development Split;Test Split;Approach;Reference;PR
|
| 2 |
|
| 3 |
+
gsm8k;;GPT-4;model;79.00;;;model-based;https://arxiv.org/abs/2311.06233;8
|
| 4 |
+
ucinlp/drop;;GPT-4;model;;44.00;;model-based;https://arxiv.org/abs/2311.06233;8
|
| 5 |
+
openai_humaneval;;GPT-4;model;;;56.71;model-based;https://arxiv.org/abs/2311.06233;8
|
| 6 |
+
imdb;;GPT-4;model;;;82.00;model-based;https://arxiv.org/abs/2311.06233;8
|
| 7 |
+
imdb;;GPT-3.5;model;;;55.00;model-based;https://arxiv.org/abs/2311.06233;8
|
| 8 |
+
ag_news;;GPT-4;model;;;91.00;model-based;https://arxiv.org/abs/2311.06233;8
|
| 9 |
+
ag_news;;GPT-3.5;model;;;82.00;model-based;https://arxiv.org/abs/2311.06233;8
|
| 10 |
+
yelp_review_full;;GPT-4;model;;;80.00;model-based;https://arxiv.org/abs/2311.06233;8
|
| 11 |
+
yelp_review_full;;GPT-3.5;model;;;13.00;model-based;https://arxiv.org/abs/2311.06233;8
|
| 12 |
+
nyu-mll/glue;rte;GPT-4;model;;60.00;;model-based;https://arxiv.org/abs/2311.06233;8
|
| 13 |
+
nyu-mll/glue;rte;GPT-3.5;model;;71.00;;model-based;https://arxiv.org/abs/2311.06233;8
|
| 14 |
+
nyu-mll/glue;wnli;GPT-4;model;;50.70;;model-based;https://arxiv.org/abs/2311.06233;8
|
| 15 |
+
nyu-mll/glue;wnli;GPT-3.5;model;;12.68;;model-based;https://arxiv.org/abs/2311.06233;8
|
| 16 |
+
samsum;;GPT-4;model;;;77.00;model-based;https://arxiv.org/abs/2311.06233;8
|
| 17 |
+
samsum;;GPT-3.5;model;;;74.00;model-based;https://arxiv.org/abs/2311.06233;8
|
| 18 |
+
EdinburghNLP/xsum;;GPT-4;model;;;95.00;model-based;https://arxiv.org/abs/2311.06233;8
|
| 19 |
+
EdinburghNLP/xsum;;GPT-3.5;model;;;79.00;model-based;https://arxiv.org/abs/2311.06233;8
|
| 20 |
|
| 21 |
allenai/ai2_arc;;CommonCrawl;corpus;;;28.7;data-based;https://arxiv.org/abs/2310.17589;5
|
| 22 |
tau/commonsense_qa;;CommonCrawl;corpus;;1.6;;data-based;https://arxiv.org/abs/2310.17589;5
|
|
|
|
| 453 |
zest;;togethercomputer/RedPajama-Data-V2;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
|
| 454 |
|
| 455 |
|
| 456 |
+
imdb;;GPT-4;model;100.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
|
| 457 |
+
imdb;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
|
| 458 |
|
| 459 |
+
ag_news;;GPT-4;model;100.0;;100.0;model-based;https://arxiv.org/abs/2308.08493;3
|
| 460 |
+
ag_news;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
|
| 461 |
|
| 462 |
+
yelp_review_full;;GPT-4;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
|
| 463 |
+
yelp_review_full;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
|
| 464 |
|
| 465 |
+
nyu-mll/glue;rte;GPT-4;model;100.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
|
| 466 |
+
nyu-mll/glue;rte;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
|
| 467 |
|
| 468 |
+
nyu-mll/glue;wnli;GPT-4;model;100.0;;100.0;model-based;https://arxiv.org/abs/2308.08493;3
|
| 469 |
+
nyu-mll/glue;wnli;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
|
| 470 |
|
| 471 |
+
samsum;;GPT-4;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
|
| 472 |
+
samsum;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
|
| 473 |
|
| 474 |
+
EdinburghNLP/xsum;;GPT-4;model;0.0;;100.0;model-based;https://arxiv.org/abs/2308.08493;3
|
| 475 |
+
EdinburghNLP/xsum;;GPT-3.5;model;0.0;;100.0;model-based;https://arxiv.org/abs/2308.08493;3
|
| 476 |
|
| 477 |
+
bigbio/mednli;;GPT-4;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
|
| 478 |
+
bigbio/mednli;;GPT-3.5;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
|
| 479 |
|
| 480 |
RadNLI;;GPT-4;model;0.0;0.0;0.0;model-based;https://arxiv.org/pdf/2308.08493;8
|
| 481 |
RadNLI;;GPT-3.5;model;0.0;0.0;0.0;model-based;https://arxiv.org/pdf/2308.08493;8
|
| 482 |
+
RadNLI;;GPT-4;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
|
| 483 |
+
RadNLI;;GPT-3.5;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
|
| 484 |
+
|
| 485 |
+
|
| 486 |
+
openai_humaneval;;EleutherAI/pile;corpus;;;12.2;data-based;https://arxiv.org/abs/2403.04811;12
|
| 487 |
+
mbpp;;EleutherAI/pile;corpus;;;3.6;data-based;https://arxiv.org/abs/2403.04811;12
|
| 488 |
+
openai_humaneval;;bigcode/the-stack;corpus;;;18.9;data-based;https://arxiv.org/abs/2403.04811;12
|
| 489 |
+
mbpp;;bigcode/the-stack;corpus;;;20.8;data-based;https://arxiv.org/abs/2403.04811;12
|
| 490 |
|
| 491 |
quac;;GPT-3;model;;99.0;;data-based;https://arxiv.org/abs/2005.14165;13
|
| 492 |
rajpurkar/squad_v2;;GPT-3;model;;94.0;;data-based;https://arxiv.org/abs/2005.14165;13
|
|
|
|
| 596 |
ibragim-bad/arc_challenge;;FLAN;model;;15.6;;data-based;https://arxiv.org/abs/2109.01652;13
|
| 597 |
facebook/anli;dev_r3;FLAN;model;;40.2;;data-based;https://arxiv.org/abs/2109.01652;13
|
| 598 |
facebook/anli;dev_r2;FLAN;model;;97.9;;data-based;https://arxiv.org/abs/2109.01652;13
|
| 599 |
+
facebook/anli;dev_r1;FLAN;model;;98.6;;data-based;https://arxiv.org/abs/2109.01652;13
|