Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_freqdist.py +7 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_metrics.py +66 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_naivebayes.py +21 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_nombank.py +27 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_pl196x.py +13 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_pos_tag.py +83 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_ribes.py +246 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_rte_classify.py +94 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_seekable_unicode_stream_reader.py +86 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_senna.py +112 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_stem.py +157 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_tag.py +23 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_tgrep.py +780 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_tokenize.py +867 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_twitter_auth.py +77 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_util.py +82 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_wordnet.py +240 -0
- build/lib/opencompass/configs/dataset_collections/chat_OC15.py +22 -0
- build/lib/opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen.py +4 -0
- build/lib/opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_ppl_8f2872.py +45 -0
- build/lib/opencompass/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_gen.py +4 -0
- build/lib/opencompass/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen.py +4 -0
- build/lib/opencompass/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen_87f4a8.py +51 -0
- build/lib/opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_gen.py +4 -0
- build/lib/opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_gen_740ea0.py +49 -0
- build/lib/opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_ppl.py +4 -0
- build/lib/opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_ppl_f1e631.py +49 -0
- build/lib/opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_ppl.py +4 -0
- build/lib/opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_ppl_9e8b3d.py +60 -0
- build/lib/opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_ppl_c08300.py +44 -0
- build/lib/opencompass/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_gen.py +4 -0
- build/lib/opencompass/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_ppl.py +4 -0
- build/lib/opencompass/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_ppl_7d1c07.py +43 -0
- build/lib/opencompass/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_ppl_d10e8a.py +48 -0
- build/lib/opencompass/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_ppl_fff486.py +48 -0
- build/lib/opencompass/configs/datasets/FinanceIQ/FinanceIQ_gen.py +4 -0
- build/lib/opencompass/configs/datasets/FinanceIQ/FinanceIQ_gen_e0e6b5.py +77 -0
- build/lib/opencompass/configs/datasets/FinanceIQ/FinanceIQ_ppl.py +4 -0
- build/lib/opencompass/configs/datasets/FinanceIQ/FinanceIQ_ppl_42b9bd.py +76 -0
- build/lib/opencompass/configs/datasets/GLUE_CoLA/GLUE_CoLA_ppl.py +4 -0
- build/lib/opencompass/configs/datasets/GLUE_CoLA/GLUE_CoLA_ppl_77d0df.py +50 -0
- build/lib/opencompass/configs/datasets/GLUE_MRPC/GLUE_MRPC_ppl.py +4 -0
- build/lib/opencompass/configs/datasets/GLUE_MRPC/GLUE_MRPC_ppl_96564c.py +51 -0
- build/lib/opencompass/configs/datasets/GLUE_QQP/GLUE_QQP_ppl.py +4 -0
- build/lib/opencompass/configs/datasets/GLUE_QQP/GLUE_QQP_ppl_250d00.py +51 -0
- build/lib/opencompass/configs/datasets/GaokaoBench/GaokaoBench_gen.py +4 -0
- build/lib/opencompass/configs/datasets/GaokaoBench/GaokaoBench_gen_5cfe9e.py +304 -0
- build/lib/opencompass/configs/datasets/GaokaoBench/GaokaoBench_mixed.py +4 -0
- build/lib/opencompass/configs/datasets/GaokaoBench/GaokaoBench_mixed_9af5ee.py +356 -0
- build/lib/opencompass/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_4c31db.py +45 -0
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_freqdist.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import nltk
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def test_iterating_returns_an_iterator_ordered_by_frequency():
|
| 5 |
+
samples = ["one", "two", "two"]
|
| 6 |
+
distribution = nltk.FreqDist(samples)
|
| 7 |
+
assert list(distribution) == ["two", "one"]
|
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_metrics.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unittest
|
| 2 |
+
|
| 3 |
+
from nltk.metrics import (
|
| 4 |
+
BigramAssocMeasures,
|
| 5 |
+
QuadgramAssocMeasures,
|
| 6 |
+
TrigramAssocMeasures,
|
| 7 |
+
)
|
| 8 |
+
|
| 9 |
+
## Test the likelihood ratio metric
|
| 10 |
+
|
| 11 |
+
_DELTA = 1e-8
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class TestLikelihoodRatio(unittest.TestCase):
|
| 15 |
+
def test_lr_bigram(self):
|
| 16 |
+
self.assertAlmostEqual(
|
| 17 |
+
BigramAssocMeasures.likelihood_ratio(2, (4, 4), 20),
|
| 18 |
+
2.4142743368419755,
|
| 19 |
+
delta=_DELTA,
|
| 20 |
+
)
|
| 21 |
+
self.assertAlmostEqual(
|
| 22 |
+
BigramAssocMeasures.likelihood_ratio(1, (1, 1), 1), 0.0, delta=_DELTA
|
| 23 |
+
)
|
| 24 |
+
self.assertRaises(
|
| 25 |
+
ValueError,
|
| 26 |
+
BigramAssocMeasures.likelihood_ratio,
|
| 27 |
+
*(0, (2, 2), 2),
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
def test_lr_trigram(self):
|
| 31 |
+
self.assertAlmostEqual(
|
| 32 |
+
TrigramAssocMeasures.likelihood_ratio(1, (1, 1, 1), (1, 1, 1), 2),
|
| 33 |
+
5.545177444479562,
|
| 34 |
+
delta=_DELTA,
|
| 35 |
+
)
|
| 36 |
+
self.assertAlmostEqual(
|
| 37 |
+
TrigramAssocMeasures.likelihood_ratio(1, (1, 1, 1), (1, 1, 1), 1),
|
| 38 |
+
0.0,
|
| 39 |
+
delta=_DELTA,
|
| 40 |
+
)
|
| 41 |
+
self.assertRaises(
|
| 42 |
+
ValueError,
|
| 43 |
+
TrigramAssocMeasures.likelihood_ratio,
|
| 44 |
+
*(1, (1, 1, 2), (1, 1, 2), 2),
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
def test_lr_quadgram(self):
|
| 48 |
+
self.assertAlmostEqual(
|
| 49 |
+
QuadgramAssocMeasures.likelihood_ratio(
|
| 50 |
+
1, (1, 1, 1, 1), (1, 1, 1, 1, 1, 1), (1, 1, 1, 1), 2
|
| 51 |
+
),
|
| 52 |
+
8.317766166719343,
|
| 53 |
+
delta=_DELTA,
|
| 54 |
+
)
|
| 55 |
+
self.assertAlmostEqual(
|
| 56 |
+
QuadgramAssocMeasures.likelihood_ratio(
|
| 57 |
+
1, (1, 1, 1, 1), (1, 1, 1, 1, 1, 1), (1, 1, 1, 1), 1
|
| 58 |
+
),
|
| 59 |
+
0.0,
|
| 60 |
+
delta=_DELTA,
|
| 61 |
+
)
|
| 62 |
+
self.assertRaises(
|
| 63 |
+
ValueError,
|
| 64 |
+
QuadgramAssocMeasures.likelihood_ratio,
|
| 65 |
+
*(1, (1, 1, 1, 1), (1, 1, 1, 1, 1, 2), (1, 1, 1, 1), 1),
|
| 66 |
+
)
|
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_naivebayes.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unittest
|
| 2 |
+
|
| 3 |
+
from nltk.classify.naivebayes import NaiveBayesClassifier
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class NaiveBayesClassifierTest(unittest.TestCase):
|
| 7 |
+
def test_simple(self):
|
| 8 |
+
training_features = [
|
| 9 |
+
({"nice": True, "good": True}, "positive"),
|
| 10 |
+
({"bad": True, "mean": True}, "negative"),
|
| 11 |
+
]
|
| 12 |
+
|
| 13 |
+
classifier = NaiveBayesClassifier.train(training_features)
|
| 14 |
+
|
| 15 |
+
result = classifier.prob_classify({"nice": True})
|
| 16 |
+
self.assertTrue(result.prob("positive") > result.prob("negative"))
|
| 17 |
+
self.assertEqual(result.max(), "positive")
|
| 18 |
+
|
| 19 |
+
result = classifier.prob_classify({"bad": True})
|
| 20 |
+
self.assertTrue(result.prob("positive") < result.prob("negative"))
|
| 21 |
+
self.assertEqual(result.max(), "negative")
|
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_nombank.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Unit tests for nltk.corpus.nombank
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import unittest
|
| 6 |
+
|
| 7 |
+
from nltk.corpus import nombank
|
| 8 |
+
|
| 9 |
+
# Load the nombank once.
|
| 10 |
+
nombank.nouns()
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class NombankDemo(unittest.TestCase):
|
| 14 |
+
def test_numbers(self):
|
| 15 |
+
# No. of instances.
|
| 16 |
+
self.assertEqual(len(nombank.instances()), 114574)
|
| 17 |
+
# No. of rolesets
|
| 18 |
+
self.assertEqual(len(nombank.rolesets()), 5577)
|
| 19 |
+
# No. of nouns.
|
| 20 |
+
self.assertEqual(len(nombank.nouns()), 4704)
|
| 21 |
+
|
| 22 |
+
def test_instance(self):
|
| 23 |
+
self.assertEqual(nombank.instances()[0].roleset, "perc-sign.01")
|
| 24 |
+
|
| 25 |
+
def test_framefiles_fileids(self):
|
| 26 |
+
self.assertEqual(len(nombank.fileids()), 4705)
|
| 27 |
+
self.assertTrue(all(fileid.endswith(".xml") for fileid in nombank.fileids()))
|
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_pl196x.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unittest
|
| 2 |
+
|
| 3 |
+
import nltk
|
| 4 |
+
from nltk.corpus.reader import pl196x
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class TestCorpusViews(unittest.TestCase):
|
| 8 |
+
def test_corpus_reader(self):
|
| 9 |
+
pl196x_dir = nltk.data.find("corpora/pl196x")
|
| 10 |
+
pl = pl196x.Pl196xCorpusReader(
|
| 11 |
+
pl196x_dir, r".*\.xml", textids="textids.txt", cat_file="cats.txt"
|
| 12 |
+
)
|
| 13 |
+
pl.tagged_words(fileids=pl.fileids(), categories="cats.txt")
|
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_pos_tag.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Tests for nltk.pos_tag
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
import unittest
|
| 7 |
+
|
| 8 |
+
from nltk import pos_tag, word_tokenize
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class TestPosTag(unittest.TestCase):
|
| 12 |
+
def test_pos_tag_eng(self):
|
| 13 |
+
text = "John's big idea isn't all that bad."
|
| 14 |
+
expected_tagged = [
|
| 15 |
+
("John", "NNP"),
|
| 16 |
+
("'s", "POS"),
|
| 17 |
+
("big", "JJ"),
|
| 18 |
+
("idea", "NN"),
|
| 19 |
+
("is", "VBZ"),
|
| 20 |
+
("n't", "RB"),
|
| 21 |
+
("all", "PDT"),
|
| 22 |
+
("that", "DT"),
|
| 23 |
+
("bad", "JJ"),
|
| 24 |
+
(".", "."),
|
| 25 |
+
]
|
| 26 |
+
assert pos_tag(word_tokenize(text)) == expected_tagged
|
| 27 |
+
|
| 28 |
+
def test_pos_tag_eng_universal(self):
|
| 29 |
+
text = "John's big idea isn't all that bad."
|
| 30 |
+
expected_tagged = [
|
| 31 |
+
("John", "NOUN"),
|
| 32 |
+
("'s", "PRT"),
|
| 33 |
+
("big", "ADJ"),
|
| 34 |
+
("idea", "NOUN"),
|
| 35 |
+
("is", "VERB"),
|
| 36 |
+
("n't", "ADV"),
|
| 37 |
+
("all", "DET"),
|
| 38 |
+
("that", "DET"),
|
| 39 |
+
("bad", "ADJ"),
|
| 40 |
+
(".", "."),
|
| 41 |
+
]
|
| 42 |
+
assert pos_tag(word_tokenize(text), tagset="universal") == expected_tagged
|
| 43 |
+
|
| 44 |
+
def test_pos_tag_rus(self):
|
| 45 |
+
text = "Илья оторопел и дважды перечитал бумажку."
|
| 46 |
+
expected_tagged = [
|
| 47 |
+
("Илья", "S"),
|
| 48 |
+
("оторопел", "V"),
|
| 49 |
+
("и", "CONJ"),
|
| 50 |
+
("дважды", "ADV"),
|
| 51 |
+
("перечитал", "V"),
|
| 52 |
+
("бумажку", "S"),
|
| 53 |
+
(".", "NONLEX"),
|
| 54 |
+
]
|
| 55 |
+
assert pos_tag(word_tokenize(text), lang="rus") == expected_tagged
|
| 56 |
+
|
| 57 |
+
def test_pos_tag_rus_universal(self):
|
| 58 |
+
text = "Илья оторопел и дважды перечитал бумажку."
|
| 59 |
+
expected_tagged = [
|
| 60 |
+
("Илья", "NOUN"),
|
| 61 |
+
("оторопел", "VERB"),
|
| 62 |
+
("и", "CONJ"),
|
| 63 |
+
("дважды", "ADV"),
|
| 64 |
+
("перечитал", "VERB"),
|
| 65 |
+
("бумажку", "NOUN"),
|
| 66 |
+
(".", "."),
|
| 67 |
+
]
|
| 68 |
+
assert (
|
| 69 |
+
pos_tag(word_tokenize(text), tagset="universal", lang="rus")
|
| 70 |
+
== expected_tagged
|
| 71 |
+
)
|
| 72 |
+
|
| 73 |
+
def test_pos_tag_unknown_lang(self):
|
| 74 |
+
text = "모르겠 습니 다"
|
| 75 |
+
self.assertRaises(NotImplementedError, pos_tag, word_tokenize(text), lang="kor")
|
| 76 |
+
# Test for default kwarg, `lang=None`
|
| 77 |
+
self.assertRaises(NotImplementedError, pos_tag, word_tokenize(text), lang=None)
|
| 78 |
+
|
| 79 |
+
def test_unspecified_lang(self):
|
| 80 |
+
# Tries to force the lang='eng' option.
|
| 81 |
+
text = "모르겠 습니 다"
|
| 82 |
+
expected_but_wrong = [("모르겠", "JJ"), ("습니", "NNP"), ("다", "NN")]
|
| 83 |
+
assert pos_tag(word_tokenize(text)) == expected_but_wrong
|
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_ribes.py
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from nltk.translate.ribes_score import corpus_ribes, word_rank_alignment
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def test_ribes_empty_worder(): # worder as in word order
|
| 5 |
+
# Verifies that these two sentences have no alignment,
|
| 6 |
+
# and hence have the lowest possible RIBES score.
|
| 7 |
+
hyp = "This is a nice sentence which I quite like".split()
|
| 8 |
+
ref = "Okay well that's neat and all but the reference's different".split()
|
| 9 |
+
|
| 10 |
+
assert word_rank_alignment(ref, hyp) == []
|
| 11 |
+
|
| 12 |
+
list_of_refs = [[ref]]
|
| 13 |
+
hypotheses = [hyp]
|
| 14 |
+
assert corpus_ribes(list_of_refs, hypotheses) == 0.0
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def test_ribes_one_worder():
|
| 18 |
+
# Verifies that these two sentences have just one match,
|
| 19 |
+
# and the RIBES score for this sentence with very little
|
| 20 |
+
# correspondence is 0.
|
| 21 |
+
hyp = "This is a nice sentence which I quite like".split()
|
| 22 |
+
ref = "Okay well that's nice and all but the reference's different".split()
|
| 23 |
+
|
| 24 |
+
assert word_rank_alignment(ref, hyp) == [3]
|
| 25 |
+
|
| 26 |
+
list_of_refs = [[ref]]
|
| 27 |
+
hypotheses = [hyp]
|
| 28 |
+
assert corpus_ribes(list_of_refs, hypotheses) == 0.0
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def test_ribes_two_worder():
|
| 32 |
+
# Verifies that these two sentences have two matches,
|
| 33 |
+
# but still get the lowest possible RIBES score due
|
| 34 |
+
# to the lack of similarity.
|
| 35 |
+
hyp = "This is a nice sentence which I quite like".split()
|
| 36 |
+
ref = "Okay well that's nice and all but the reference is different".split()
|
| 37 |
+
|
| 38 |
+
assert word_rank_alignment(ref, hyp) == [9, 3]
|
| 39 |
+
|
| 40 |
+
list_of_refs = [[ref]]
|
| 41 |
+
hypotheses = [hyp]
|
| 42 |
+
assert corpus_ribes(list_of_refs, hypotheses) == 0.0
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def test_ribes():
|
| 46 |
+
# Based on the doctest of the corpus_ribes function
|
| 47 |
+
hyp1 = [
|
| 48 |
+
"It",
|
| 49 |
+
"is",
|
| 50 |
+
"a",
|
| 51 |
+
"guide",
|
| 52 |
+
"to",
|
| 53 |
+
"action",
|
| 54 |
+
"which",
|
| 55 |
+
"ensures",
|
| 56 |
+
"that",
|
| 57 |
+
"the",
|
| 58 |
+
"military",
|
| 59 |
+
"always",
|
| 60 |
+
"obeys",
|
| 61 |
+
"the",
|
| 62 |
+
"commands",
|
| 63 |
+
"of",
|
| 64 |
+
"the",
|
| 65 |
+
"party",
|
| 66 |
+
]
|
| 67 |
+
ref1a = [
|
| 68 |
+
"It",
|
| 69 |
+
"is",
|
| 70 |
+
"a",
|
| 71 |
+
"guide",
|
| 72 |
+
"to",
|
| 73 |
+
"action",
|
| 74 |
+
"that",
|
| 75 |
+
"ensures",
|
| 76 |
+
"that",
|
| 77 |
+
"the",
|
| 78 |
+
"military",
|
| 79 |
+
"will",
|
| 80 |
+
"forever",
|
| 81 |
+
"heed",
|
| 82 |
+
"Party",
|
| 83 |
+
"commands",
|
| 84 |
+
]
|
| 85 |
+
ref1b = [
|
| 86 |
+
"It",
|
| 87 |
+
"is",
|
| 88 |
+
"the",
|
| 89 |
+
"guiding",
|
| 90 |
+
"principle",
|
| 91 |
+
"which",
|
| 92 |
+
"guarantees",
|
| 93 |
+
"the",
|
| 94 |
+
"military",
|
| 95 |
+
"forces",
|
| 96 |
+
"always",
|
| 97 |
+
"being",
|
| 98 |
+
"under",
|
| 99 |
+
"the",
|
| 100 |
+
"command",
|
| 101 |
+
"of",
|
| 102 |
+
"the",
|
| 103 |
+
"Party",
|
| 104 |
+
]
|
| 105 |
+
ref1c = [
|
| 106 |
+
"It",
|
| 107 |
+
"is",
|
| 108 |
+
"the",
|
| 109 |
+
"practical",
|
| 110 |
+
"guide",
|
| 111 |
+
"for",
|
| 112 |
+
"the",
|
| 113 |
+
"army",
|
| 114 |
+
"always",
|
| 115 |
+
"to",
|
| 116 |
+
"heed",
|
| 117 |
+
"the",
|
| 118 |
+
"directions",
|
| 119 |
+
"of",
|
| 120 |
+
"the",
|
| 121 |
+
"party",
|
| 122 |
+
]
|
| 123 |
+
|
| 124 |
+
hyp2 = [
|
| 125 |
+
"he",
|
| 126 |
+
"read",
|
| 127 |
+
"the",
|
| 128 |
+
"book",
|
| 129 |
+
"because",
|
| 130 |
+
"he",
|
| 131 |
+
"was",
|
| 132 |
+
"interested",
|
| 133 |
+
"in",
|
| 134 |
+
"world",
|
| 135 |
+
"history",
|
| 136 |
+
]
|
| 137 |
+
ref2a = [
|
| 138 |
+
"he",
|
| 139 |
+
"was",
|
| 140 |
+
"interested",
|
| 141 |
+
"in",
|
| 142 |
+
"world",
|
| 143 |
+
"history",
|
| 144 |
+
"because",
|
| 145 |
+
"he",
|
| 146 |
+
"read",
|
| 147 |
+
"the",
|
| 148 |
+
"book",
|
| 149 |
+
]
|
| 150 |
+
|
| 151 |
+
list_of_refs = [[ref1a, ref1b, ref1c], [ref2a]]
|
| 152 |
+
hypotheses = [hyp1, hyp2]
|
| 153 |
+
|
| 154 |
+
score = corpus_ribes(list_of_refs, hypotheses)
|
| 155 |
+
|
| 156 |
+
assert round(score, 4) == 0.3597
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
def test_no_zero_div():
|
| 160 |
+
# Regression test for Issue 2529, assure that no ZeroDivisionError is thrown.
|
| 161 |
+
hyp1 = [
|
| 162 |
+
"It",
|
| 163 |
+
"is",
|
| 164 |
+
"a",
|
| 165 |
+
"guide",
|
| 166 |
+
"to",
|
| 167 |
+
"action",
|
| 168 |
+
"which",
|
| 169 |
+
"ensures",
|
| 170 |
+
"that",
|
| 171 |
+
"the",
|
| 172 |
+
"military",
|
| 173 |
+
"always",
|
| 174 |
+
"obeys",
|
| 175 |
+
"the",
|
| 176 |
+
"commands",
|
| 177 |
+
"of",
|
| 178 |
+
"the",
|
| 179 |
+
"party",
|
| 180 |
+
]
|
| 181 |
+
ref1a = [
|
| 182 |
+
"It",
|
| 183 |
+
"is",
|
| 184 |
+
"a",
|
| 185 |
+
"guide",
|
| 186 |
+
"to",
|
| 187 |
+
"action",
|
| 188 |
+
"that",
|
| 189 |
+
"ensures",
|
| 190 |
+
"that",
|
| 191 |
+
"the",
|
| 192 |
+
"military",
|
| 193 |
+
"will",
|
| 194 |
+
"forever",
|
| 195 |
+
"heed",
|
| 196 |
+
"Party",
|
| 197 |
+
"commands",
|
| 198 |
+
]
|
| 199 |
+
ref1b = [
|
| 200 |
+
"It",
|
| 201 |
+
"is",
|
| 202 |
+
"the",
|
| 203 |
+
"guiding",
|
| 204 |
+
"principle",
|
| 205 |
+
"which",
|
| 206 |
+
"guarantees",
|
| 207 |
+
"the",
|
| 208 |
+
"military",
|
| 209 |
+
"forces",
|
| 210 |
+
"always",
|
| 211 |
+
"being",
|
| 212 |
+
"under",
|
| 213 |
+
"the",
|
| 214 |
+
"command",
|
| 215 |
+
"of",
|
| 216 |
+
"the",
|
| 217 |
+
"Party",
|
| 218 |
+
]
|
| 219 |
+
ref1c = [
|
| 220 |
+
"It",
|
| 221 |
+
"is",
|
| 222 |
+
"the",
|
| 223 |
+
"practical",
|
| 224 |
+
"guide",
|
| 225 |
+
"for",
|
| 226 |
+
"the",
|
| 227 |
+
"army",
|
| 228 |
+
"always",
|
| 229 |
+
"to",
|
| 230 |
+
"heed",
|
| 231 |
+
"the",
|
| 232 |
+
"directions",
|
| 233 |
+
"of",
|
| 234 |
+
"the",
|
| 235 |
+
"party",
|
| 236 |
+
]
|
| 237 |
+
|
| 238 |
+
hyp2 = ["he", "read", "the"]
|
| 239 |
+
ref2a = ["he", "was", "interested", "in", "world", "history", "because", "he"]
|
| 240 |
+
|
| 241 |
+
list_of_refs = [[ref1a, ref1b, ref1c], [ref2a]]
|
| 242 |
+
hypotheses = [hyp1, hyp2]
|
| 243 |
+
|
| 244 |
+
score = corpus_ribes(list_of_refs, hypotheses)
|
| 245 |
+
|
| 246 |
+
assert round(score, 4) == 0.1688
|
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_rte_classify.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
|
| 3 |
+
from nltk import config_megam
|
| 4 |
+
from nltk.classify.rte_classify import RTEFeatureExtractor, rte_classifier, rte_features
|
| 5 |
+
from nltk.corpus import rte as rte_corpus
|
| 6 |
+
|
| 7 |
+
expected_from_rte_feature_extration = """
|
| 8 |
+
alwayson => True
|
| 9 |
+
ne_hyp_extra => 0
|
| 10 |
+
ne_overlap => 1
|
| 11 |
+
neg_hyp => 0
|
| 12 |
+
neg_txt => 0
|
| 13 |
+
word_hyp_extra => 3
|
| 14 |
+
word_overlap => 3
|
| 15 |
+
|
| 16 |
+
alwayson => True
|
| 17 |
+
ne_hyp_extra => 0
|
| 18 |
+
ne_overlap => 1
|
| 19 |
+
neg_hyp => 0
|
| 20 |
+
neg_txt => 0
|
| 21 |
+
word_hyp_extra => 2
|
| 22 |
+
word_overlap => 1
|
| 23 |
+
|
| 24 |
+
alwayson => True
|
| 25 |
+
ne_hyp_extra => 1
|
| 26 |
+
ne_overlap => 1
|
| 27 |
+
neg_hyp => 0
|
| 28 |
+
neg_txt => 0
|
| 29 |
+
word_hyp_extra => 1
|
| 30 |
+
word_overlap => 2
|
| 31 |
+
|
| 32 |
+
alwayson => True
|
| 33 |
+
ne_hyp_extra => 1
|
| 34 |
+
ne_overlap => 0
|
| 35 |
+
neg_hyp => 0
|
| 36 |
+
neg_txt => 0
|
| 37 |
+
word_hyp_extra => 6
|
| 38 |
+
word_overlap => 2
|
| 39 |
+
|
| 40 |
+
alwayson => True
|
| 41 |
+
ne_hyp_extra => 1
|
| 42 |
+
ne_overlap => 0
|
| 43 |
+
neg_hyp => 0
|
| 44 |
+
neg_txt => 0
|
| 45 |
+
word_hyp_extra => 4
|
| 46 |
+
word_overlap => 0
|
| 47 |
+
|
| 48 |
+
alwayson => True
|
| 49 |
+
ne_hyp_extra => 1
|
| 50 |
+
ne_overlap => 0
|
| 51 |
+
neg_hyp => 0
|
| 52 |
+
neg_txt => 0
|
| 53 |
+
word_hyp_extra => 3
|
| 54 |
+
word_overlap => 1
|
| 55 |
+
"""
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class TestRTEClassifier:
|
| 59 |
+
# Test the feature extraction method.
|
| 60 |
+
def test_rte_feature_extraction(self):
|
| 61 |
+
pairs = rte_corpus.pairs(["rte1_dev.xml"])[:6]
|
| 62 |
+
test_output = [
|
| 63 |
+
f"{key:<15} => {rte_features(pair)[key]}"
|
| 64 |
+
for pair in pairs
|
| 65 |
+
for key in sorted(rte_features(pair))
|
| 66 |
+
]
|
| 67 |
+
expected_output = expected_from_rte_feature_extration.strip().split("\n")
|
| 68 |
+
# Remove null strings.
|
| 69 |
+
expected_output = list(filter(None, expected_output))
|
| 70 |
+
assert test_output == expected_output
|
| 71 |
+
|
| 72 |
+
# Test the RTEFeatureExtractor object.
|
| 73 |
+
def test_feature_extractor_object(self):
|
| 74 |
+
rtepair = rte_corpus.pairs(["rte3_dev.xml"])[33]
|
| 75 |
+
extractor = RTEFeatureExtractor(rtepair)
|
| 76 |
+
|
| 77 |
+
assert extractor.hyp_words == {"member", "China", "SCO."}
|
| 78 |
+
assert extractor.overlap("word") == set()
|
| 79 |
+
assert extractor.overlap("ne") == {"China"}
|
| 80 |
+
assert extractor.hyp_extra("word") == {"member"}
|
| 81 |
+
|
| 82 |
+
# Test the RTE classifier training.
|
| 83 |
+
def test_rte_classification_without_megam(self):
|
| 84 |
+
# Use a sample size for unit testing, since we
|
| 85 |
+
# don't need to fully train these classifiers
|
| 86 |
+
clf = rte_classifier("IIS", sample_N=100)
|
| 87 |
+
clf = rte_classifier("GIS", sample_N=100)
|
| 88 |
+
|
| 89 |
+
def test_rte_classification_with_megam(self):
|
| 90 |
+
try:
|
| 91 |
+
config_megam()
|
| 92 |
+
except (LookupError, AttributeError) as e:
|
| 93 |
+
pytest.skip("Skipping tests with dependencies on MEGAM")
|
| 94 |
+
clf = rte_classifier("megam", sample_N=100)
|
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_seekable_unicode_stream_reader.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from io import BytesIO
|
| 3 |
+
|
| 4 |
+
import pytest
|
| 5 |
+
|
| 6 |
+
from nltk.corpus.reader import SeekableUnicodeStreamReader
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def check_reader(unicode_string, encoding):
|
| 10 |
+
bytestr = unicode_string.encode(encoding)
|
| 11 |
+
stream = BytesIO(bytestr)
|
| 12 |
+
reader = SeekableUnicodeStreamReader(stream, encoding)
|
| 13 |
+
|
| 14 |
+
# Should open at the start of the file
|
| 15 |
+
assert reader.tell() == 0
|
| 16 |
+
|
| 17 |
+
# Compare original string to contents from `.readlines()`
|
| 18 |
+
assert unicode_string == "".join(reader.readlines())
|
| 19 |
+
|
| 20 |
+
# Should be at the end of the file now
|
| 21 |
+
stream.seek(0, os.SEEK_END)
|
| 22 |
+
assert reader.tell() == stream.tell()
|
| 23 |
+
|
| 24 |
+
reader.seek(0) # go back to start
|
| 25 |
+
|
| 26 |
+
# Compare original string to contents from `.read()`
|
| 27 |
+
contents = ""
|
| 28 |
+
char = None
|
| 29 |
+
while char != "":
|
| 30 |
+
char = reader.read(1)
|
| 31 |
+
contents += char
|
| 32 |
+
assert unicode_string == contents
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# Call `check_reader` with a variety of input strings and encodings.
|
| 36 |
+
ENCODINGS = ["ascii", "latin1", "greek", "hebrew", "utf-16", "utf-8"]
|
| 37 |
+
|
| 38 |
+
STRINGS = [
|
| 39 |
+
"""
|
| 40 |
+
This is a test file.
|
| 41 |
+
It is fairly short.
|
| 42 |
+
""",
|
| 43 |
+
"This file can be encoded with latin1. \x83",
|
| 44 |
+
"""\
|
| 45 |
+
This is a test file.
|
| 46 |
+
Here's a blank line:
|
| 47 |
+
|
| 48 |
+
And here's some unicode: \xee \u0123 \uffe3
|
| 49 |
+
""",
|
| 50 |
+
"""\
|
| 51 |
+
This is a test file.
|
| 52 |
+
Unicode characters: \xf3 \u2222 \u3333\u4444 \u5555
|
| 53 |
+
""",
|
| 54 |
+
"""\
|
| 55 |
+
This is a larger file. It has some lines that are longer \
|
| 56 |
+
than 72 characters. It's got lots of repetition. Here's \
|
| 57 |
+
some unicode chars: \xee \u0123 \uffe3 \ueeee \u2345
|
| 58 |
+
|
| 59 |
+
How fun! Let's repeat it twenty times.
|
| 60 |
+
"""
|
| 61 |
+
* 20,
|
| 62 |
+
]
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
@pytest.mark.parametrize("string", STRINGS)
|
| 66 |
+
def test_reader(string):
|
| 67 |
+
for encoding in ENCODINGS:
|
| 68 |
+
# skip strings that can't be encoded with the current encoding
|
| 69 |
+
try:
|
| 70 |
+
string.encode(encoding)
|
| 71 |
+
except UnicodeEncodeError:
|
| 72 |
+
continue
|
| 73 |
+
check_reader(string, encoding)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def test_reader_stream_closes_when_deleted():
|
| 77 |
+
reader = SeekableUnicodeStreamReader(BytesIO(b""), "ascii")
|
| 78 |
+
assert not reader.stream.closed
|
| 79 |
+
reader.__del__()
|
| 80 |
+
assert reader.stream.closed
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def teardown_module(module=None):
|
| 84 |
+
import gc
|
| 85 |
+
|
| 86 |
+
gc.collect()
|
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_senna.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Unit tests for Senna
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import unittest
|
| 6 |
+
from os import environ, path, sep
|
| 7 |
+
|
| 8 |
+
from nltk.classify import Senna
|
| 9 |
+
from nltk.tag import SennaChunkTagger, SennaNERTagger, SennaTagger
|
| 10 |
+
|
| 11 |
+
# Set Senna executable path for tests if it is not specified as an environment variable
|
| 12 |
+
if "SENNA" in environ:
|
| 13 |
+
SENNA_EXECUTABLE_PATH = path.normpath(environ["SENNA"]) + sep
|
| 14 |
+
else:
|
| 15 |
+
SENNA_EXECUTABLE_PATH = "/usr/share/senna-v3.0"
|
| 16 |
+
|
| 17 |
+
senna_is_installed = path.exists(SENNA_EXECUTABLE_PATH)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@unittest.skipUnless(senna_is_installed, "Requires Senna executable")
|
| 21 |
+
class TestSennaPipeline(unittest.TestCase):
|
| 22 |
+
"""Unittest for nltk.classify.senna"""
|
| 23 |
+
|
| 24 |
+
def test_senna_pipeline(self):
|
| 25 |
+
"""Senna pipeline interface"""
|
| 26 |
+
|
| 27 |
+
pipeline = Senna(SENNA_EXECUTABLE_PATH, ["pos", "chk", "ner"])
|
| 28 |
+
sent = "Dusseldorf is an international business center".split()
|
| 29 |
+
result = [
|
| 30 |
+
(token["word"], token["chk"], token["ner"], token["pos"])
|
| 31 |
+
for token in pipeline.tag(sent)
|
| 32 |
+
]
|
| 33 |
+
expected = [
|
| 34 |
+
("Dusseldorf", "B-NP", "B-LOC", "NNP"),
|
| 35 |
+
("is", "B-VP", "O", "VBZ"),
|
| 36 |
+
("an", "B-NP", "O", "DT"),
|
| 37 |
+
("international", "I-NP", "O", "JJ"),
|
| 38 |
+
("business", "I-NP", "O", "NN"),
|
| 39 |
+
("center", "I-NP", "O", "NN"),
|
| 40 |
+
]
|
| 41 |
+
self.assertEqual(result, expected)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
@unittest.skipUnless(senna_is_installed, "Requires Senna executable")
|
| 45 |
+
class TestSennaTagger(unittest.TestCase):
|
| 46 |
+
"""Unittest for nltk.tag.senna"""
|
| 47 |
+
|
| 48 |
+
def test_senna_tagger(self):
|
| 49 |
+
tagger = SennaTagger(SENNA_EXECUTABLE_PATH)
|
| 50 |
+
result = tagger.tag("What is the airspeed of an unladen swallow ?".split())
|
| 51 |
+
expected = [
|
| 52 |
+
("What", "WP"),
|
| 53 |
+
("is", "VBZ"),
|
| 54 |
+
("the", "DT"),
|
| 55 |
+
("airspeed", "NN"),
|
| 56 |
+
("of", "IN"),
|
| 57 |
+
("an", "DT"),
|
| 58 |
+
("unladen", "NN"),
|
| 59 |
+
("swallow", "NN"),
|
| 60 |
+
("?", "."),
|
| 61 |
+
]
|
| 62 |
+
self.assertEqual(result, expected)
|
| 63 |
+
|
| 64 |
+
def test_senna_chunk_tagger(self):
|
| 65 |
+
chktagger = SennaChunkTagger(SENNA_EXECUTABLE_PATH)
|
| 66 |
+
result_1 = chktagger.tag("What is the airspeed of an unladen swallow ?".split())
|
| 67 |
+
expected_1 = [
|
| 68 |
+
("What", "B-NP"),
|
| 69 |
+
("is", "B-VP"),
|
| 70 |
+
("the", "B-NP"),
|
| 71 |
+
("airspeed", "I-NP"),
|
| 72 |
+
("of", "B-PP"),
|
| 73 |
+
("an", "B-NP"),
|
| 74 |
+
("unladen", "I-NP"),
|
| 75 |
+
("swallow", "I-NP"),
|
| 76 |
+
("?", "O"),
|
| 77 |
+
]
|
| 78 |
+
|
| 79 |
+
result_2 = list(chktagger.bio_to_chunks(result_1, chunk_type="NP"))
|
| 80 |
+
expected_2 = [
|
| 81 |
+
("What", "0"),
|
| 82 |
+
("the airspeed", "2-3"),
|
| 83 |
+
("an unladen swallow", "5-6-7"),
|
| 84 |
+
]
|
| 85 |
+
self.assertEqual(result_1, expected_1)
|
| 86 |
+
self.assertEqual(result_2, expected_2)
|
| 87 |
+
|
| 88 |
+
def test_senna_ner_tagger(self):
|
| 89 |
+
nertagger = SennaNERTagger(SENNA_EXECUTABLE_PATH)
|
| 90 |
+
result_1 = nertagger.tag("Shakespeare theatre was in London .".split())
|
| 91 |
+
expected_1 = [
|
| 92 |
+
("Shakespeare", "B-PER"),
|
| 93 |
+
("theatre", "O"),
|
| 94 |
+
("was", "O"),
|
| 95 |
+
("in", "O"),
|
| 96 |
+
("London", "B-LOC"),
|
| 97 |
+
(".", "O"),
|
| 98 |
+
]
|
| 99 |
+
|
| 100 |
+
result_2 = nertagger.tag("UN headquarters are in NY , USA .".split())
|
| 101 |
+
expected_2 = [
|
| 102 |
+
("UN", "B-ORG"),
|
| 103 |
+
("headquarters", "O"),
|
| 104 |
+
("are", "O"),
|
| 105 |
+
("in", "O"),
|
| 106 |
+
("NY", "B-LOC"),
|
| 107 |
+
(",", "O"),
|
| 108 |
+
("USA", "B-LOC"),
|
| 109 |
+
(".", "O"),
|
| 110 |
+
]
|
| 111 |
+
self.assertEqual(result_1, expected_1)
|
| 112 |
+
self.assertEqual(result_2, expected_2)
|
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_stem.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unittest
|
| 2 |
+
from contextlib import closing
|
| 3 |
+
|
| 4 |
+
from nltk import data
|
| 5 |
+
from nltk.stem.porter import PorterStemmer
|
| 6 |
+
from nltk.stem.snowball import SnowballStemmer
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class SnowballTest(unittest.TestCase):
|
| 10 |
+
def test_arabic(self):
|
| 11 |
+
"""
|
| 12 |
+
this unit testing for test the snowball arabic light stemmer
|
| 13 |
+
this stemmer deals with prefixes and suffixes
|
| 14 |
+
"""
|
| 15 |
+
# Test where the ignore_stopwords=True.
|
| 16 |
+
ar_stemmer = SnowballStemmer("arabic", True)
|
| 17 |
+
assert ar_stemmer.stem("الْعَرَبِــــــيَّة") == "عرب"
|
| 18 |
+
assert ar_stemmer.stem("العربية") == "عرب"
|
| 19 |
+
assert ar_stemmer.stem("فقالوا") == "قال"
|
| 20 |
+
assert ar_stemmer.stem("الطالبات") == "طالب"
|
| 21 |
+
assert ar_stemmer.stem("فالطالبات") == "طالب"
|
| 22 |
+
assert ar_stemmer.stem("والطالبات") == "طالب"
|
| 23 |
+
assert ar_stemmer.stem("الطالبون") == "طالب"
|
| 24 |
+
assert ar_stemmer.stem("اللذان") == "اللذان"
|
| 25 |
+
assert ar_stemmer.stem("من") == "من"
|
| 26 |
+
# Test where the ignore_stopwords=False.
|
| 27 |
+
ar_stemmer = SnowballStemmer("arabic", False)
|
| 28 |
+
assert ar_stemmer.stem("اللذان") == "اللذ" # this is a stop word
|
| 29 |
+
assert ar_stemmer.stem("الطالبات") == "طالب"
|
| 30 |
+
assert ar_stemmer.stem("الكلمات") == "كلم"
|
| 31 |
+
# test where create the arabic stemmer without given init value to ignore_stopwords
|
| 32 |
+
ar_stemmer = SnowballStemmer("arabic")
|
| 33 |
+
assert ar_stemmer.stem("الْعَرَبِــــــيَّة") == "عرب"
|
| 34 |
+
assert ar_stemmer.stem("العربية") == "عرب"
|
| 35 |
+
assert ar_stemmer.stem("فقالوا") == "قال"
|
| 36 |
+
assert ar_stemmer.stem("الطالبات") == "طالب"
|
| 37 |
+
assert ar_stemmer.stem("الكلمات") == "كلم"
|
| 38 |
+
|
| 39 |
+
def test_russian(self):
|
| 40 |
+
stemmer_russian = SnowballStemmer("russian")
|
| 41 |
+
assert stemmer_russian.stem("авантненькая") == "авантненьк"
|
| 42 |
+
|
| 43 |
+
def test_german(self):
|
| 44 |
+
stemmer_german = SnowballStemmer("german")
|
| 45 |
+
stemmer_german2 = SnowballStemmer("german", ignore_stopwords=True)
|
| 46 |
+
|
| 47 |
+
assert stemmer_german.stem("Schr\xe4nke") == "schrank"
|
| 48 |
+
assert stemmer_german2.stem("Schr\xe4nke") == "schrank"
|
| 49 |
+
|
| 50 |
+
assert stemmer_german.stem("keinen") == "kein"
|
| 51 |
+
assert stemmer_german2.stem("keinen") == "keinen"
|
| 52 |
+
|
| 53 |
+
def test_spanish(self):
|
| 54 |
+
stemmer = SnowballStemmer("spanish")
|
| 55 |
+
|
| 56 |
+
assert stemmer.stem("Visionado") == "vision"
|
| 57 |
+
|
| 58 |
+
# The word 'algue' was raising an IndexError
|
| 59 |
+
assert stemmer.stem("algue") == "algu"
|
| 60 |
+
|
| 61 |
+
def test_short_strings_bug(self):
|
| 62 |
+
stemmer = SnowballStemmer("english")
|
| 63 |
+
assert stemmer.stem("y's") == "y"
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
class PorterTest(unittest.TestCase):
|
| 67 |
+
def _vocabulary(self):
|
| 68 |
+
with closing(
|
| 69 |
+
data.find("stemmers/porter_test/porter_vocabulary.txt").open(
|
| 70 |
+
encoding="utf-8"
|
| 71 |
+
)
|
| 72 |
+
) as fp:
|
| 73 |
+
return fp.read().splitlines()
|
| 74 |
+
|
| 75 |
+
def _test_against_expected_output(self, stemmer_mode, expected_stems):
|
| 76 |
+
stemmer = PorterStemmer(mode=stemmer_mode)
|
| 77 |
+
for word, true_stem in zip(self._vocabulary(), expected_stems):
|
| 78 |
+
our_stem = stemmer.stem(word)
|
| 79 |
+
assert (
|
| 80 |
+
our_stem == true_stem
|
| 81 |
+
), "{} should stem to {} in {} mode but got {}".format(
|
| 82 |
+
word,
|
| 83 |
+
true_stem,
|
| 84 |
+
stemmer_mode,
|
| 85 |
+
our_stem,
|
| 86 |
+
)
|
| 87 |
+
|
| 88 |
+
def test_vocabulary_martin_mode(self):
|
| 89 |
+
"""Tests all words from the test vocabulary provided by M Porter
|
| 90 |
+
|
| 91 |
+
The sample vocabulary and output were sourced from
|
| 92 |
+
https://tartarus.org/martin/PorterStemmer/voc.txt and
|
| 93 |
+
https://tartarus.org/martin/PorterStemmer/output.txt
|
| 94 |
+
and are linked to from the Porter Stemmer algorithm's homepage
|
| 95 |
+
at https://tartarus.org/martin/PorterStemmer/
|
| 96 |
+
"""
|
| 97 |
+
with closing(
|
| 98 |
+
data.find("stemmers/porter_test/porter_martin_output.txt").open(
|
| 99 |
+
encoding="utf-8"
|
| 100 |
+
)
|
| 101 |
+
) as fp:
|
| 102 |
+
self._test_against_expected_output(
|
| 103 |
+
PorterStemmer.MARTIN_EXTENSIONS, fp.read().splitlines()
|
| 104 |
+
)
|
| 105 |
+
|
| 106 |
+
def test_vocabulary_nltk_mode(self):
|
| 107 |
+
with closing(
|
| 108 |
+
data.find("stemmers/porter_test/porter_nltk_output.txt").open(
|
| 109 |
+
encoding="utf-8"
|
| 110 |
+
)
|
| 111 |
+
) as fp:
|
| 112 |
+
self._test_against_expected_output(
|
| 113 |
+
PorterStemmer.NLTK_EXTENSIONS, fp.read().splitlines()
|
| 114 |
+
)
|
| 115 |
+
|
| 116 |
+
def test_vocabulary_original_mode(self):
|
| 117 |
+
# The list of stems for this test was generated by taking the
|
| 118 |
+
# Martin-blessed stemmer from
|
| 119 |
+
# https://tartarus.org/martin/PorterStemmer/c.txt
|
| 120 |
+
# and removing all the --DEPARTURE-- sections from it and
|
| 121 |
+
# running it against Martin's test vocabulary.
|
| 122 |
+
|
| 123 |
+
with closing(
|
| 124 |
+
data.find("stemmers/porter_test/porter_original_output.txt").open(
|
| 125 |
+
encoding="utf-8"
|
| 126 |
+
)
|
| 127 |
+
) as fp:
|
| 128 |
+
self._test_against_expected_output(
|
| 129 |
+
PorterStemmer.ORIGINAL_ALGORITHM, fp.read().splitlines()
|
| 130 |
+
)
|
| 131 |
+
|
| 132 |
+
self._test_against_expected_output(
|
| 133 |
+
PorterStemmer.ORIGINAL_ALGORITHM,
|
| 134 |
+
data.find("stemmers/porter_test/porter_original_output.txt")
|
| 135 |
+
.open(encoding="utf-8")
|
| 136 |
+
.read()
|
| 137 |
+
.splitlines(),
|
| 138 |
+
)
|
| 139 |
+
|
| 140 |
+
def test_oed_bug(self):
|
| 141 |
+
"""Test for bug https://github.com/nltk/nltk/issues/1581
|
| 142 |
+
|
| 143 |
+
Ensures that 'oed' can be stemmed without throwing an error.
|
| 144 |
+
"""
|
| 145 |
+
assert PorterStemmer().stem("oed") == "o"
|
| 146 |
+
|
| 147 |
+
def test_lowercase_option(self):
|
| 148 |
+
"""Test for improvement on https://github.com/nltk/nltk/issues/2507
|
| 149 |
+
|
| 150 |
+
Ensures that stems are lowercased when `to_lowercase=True`
|
| 151 |
+
"""
|
| 152 |
+
porter = PorterStemmer()
|
| 153 |
+
assert porter.stem("On") == "on"
|
| 154 |
+
assert porter.stem("I") == "i"
|
| 155 |
+
assert porter.stem("I", to_lowercase=False) == "I"
|
| 156 |
+
assert porter.stem("Github") == "github"
|
| 157 |
+
assert porter.stem("Github", to_lowercase=False) == "Github"
|
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_tag.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def test_basic():
|
| 2 |
+
from nltk.tag import pos_tag
|
| 3 |
+
from nltk.tokenize import word_tokenize
|
| 4 |
+
|
| 5 |
+
result = pos_tag(word_tokenize("John's big idea isn't all that bad."))
|
| 6 |
+
assert result == [
|
| 7 |
+
("John", "NNP"),
|
| 8 |
+
("'s", "POS"),
|
| 9 |
+
("big", "JJ"),
|
| 10 |
+
("idea", "NN"),
|
| 11 |
+
("is", "VBZ"),
|
| 12 |
+
("n't", "RB"),
|
| 13 |
+
("all", "PDT"),
|
| 14 |
+
("that", "DT"),
|
| 15 |
+
("bad", "JJ"),
|
| 16 |
+
(".", "."),
|
| 17 |
+
]
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def setup_module(module):
|
| 21 |
+
import pytest
|
| 22 |
+
|
| 23 |
+
pytest.importorskip("numpy")
|
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_tgrep.py
ADDED
|
@@ -0,0 +1,780 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
#
|
| 3 |
+
# Natural Language Toolkit: TGrep search
|
| 4 |
+
#
|
| 5 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 6 |
+
# Author: Will Roberts <wildwilhelm@gmail.com>
|
| 7 |
+
# URL: <https://www.nltk.org/>
|
| 8 |
+
# For license information, see LICENSE.TXT
|
| 9 |
+
|
| 10 |
+
"""
|
| 11 |
+
Unit tests for nltk.tgrep.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
import unittest
|
| 16 |
+
|
| 17 |
+
from nltk import tgrep
|
| 18 |
+
from nltk.tree import ParentedTree
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class TestSequenceFunctions(unittest.TestCase):
|
| 22 |
+
|
| 23 |
+
"""
|
| 24 |
+
Class containing unit tests for nltk.tgrep.
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
def test_tokenize_simple(self):
|
| 28 |
+
"""
|
| 29 |
+
Simple test of tokenization.
|
| 30 |
+
"""
|
| 31 |
+
tokens = tgrep.tgrep_tokenize("A .. (B !< C . D) | ![<< (E , F) $ G]")
|
| 32 |
+
self.assertEqual(
|
| 33 |
+
tokens,
|
| 34 |
+
[
|
| 35 |
+
"A",
|
| 36 |
+
"..",
|
| 37 |
+
"(",
|
| 38 |
+
"B",
|
| 39 |
+
"!",
|
| 40 |
+
"<",
|
| 41 |
+
"C",
|
| 42 |
+
".",
|
| 43 |
+
"D",
|
| 44 |
+
")",
|
| 45 |
+
"|",
|
| 46 |
+
"!",
|
| 47 |
+
"[",
|
| 48 |
+
"<<",
|
| 49 |
+
"(",
|
| 50 |
+
"E",
|
| 51 |
+
",",
|
| 52 |
+
"F",
|
| 53 |
+
")",
|
| 54 |
+
"$",
|
| 55 |
+
"G",
|
| 56 |
+
"]",
|
| 57 |
+
],
|
| 58 |
+
)
|
| 59 |
+
|
| 60 |
+
def test_tokenize_encoding(self):
|
| 61 |
+
"""
|
| 62 |
+
Test that tokenization handles bytes and strs the same way.
|
| 63 |
+
"""
|
| 64 |
+
self.assertEqual(
|
| 65 |
+
tgrep.tgrep_tokenize(b"A .. (B !< C . D) | ![<< (E , F) $ G]"),
|
| 66 |
+
tgrep.tgrep_tokenize("A .. (B !< C . D) | ![<< (E , F) $ G]"),
|
| 67 |
+
)
|
| 68 |
+
|
| 69 |
+
def test_tokenize_link_types(self):
|
| 70 |
+
"""
|
| 71 |
+
Test tokenization of basic link types.
|
| 72 |
+
"""
|
| 73 |
+
self.assertEqual(tgrep.tgrep_tokenize("A<B"), ["A", "<", "B"])
|
| 74 |
+
self.assertEqual(tgrep.tgrep_tokenize("A>B"), ["A", ">", "B"])
|
| 75 |
+
self.assertEqual(tgrep.tgrep_tokenize("A<3B"), ["A", "<3", "B"])
|
| 76 |
+
self.assertEqual(tgrep.tgrep_tokenize("A>3B"), ["A", ">3", "B"])
|
| 77 |
+
self.assertEqual(tgrep.tgrep_tokenize("A<,B"), ["A", "<,", "B"])
|
| 78 |
+
self.assertEqual(tgrep.tgrep_tokenize("A>,B"), ["A", ">,", "B"])
|
| 79 |
+
self.assertEqual(tgrep.tgrep_tokenize("A<-3B"), ["A", "<-3", "B"])
|
| 80 |
+
self.assertEqual(tgrep.tgrep_tokenize("A>-3B"), ["A", ">-3", "B"])
|
| 81 |
+
self.assertEqual(tgrep.tgrep_tokenize("A<-B"), ["A", "<-", "B"])
|
| 82 |
+
self.assertEqual(tgrep.tgrep_tokenize("A>-B"), ["A", ">-", "B"])
|
| 83 |
+
self.assertEqual(tgrep.tgrep_tokenize("A<'B"), ["A", "<'", "B"])
|
| 84 |
+
self.assertEqual(tgrep.tgrep_tokenize("A>'B"), ["A", ">'", "B"])
|
| 85 |
+
self.assertEqual(tgrep.tgrep_tokenize("A<:B"), ["A", "<:", "B"])
|
| 86 |
+
self.assertEqual(tgrep.tgrep_tokenize("A>:B"), ["A", ">:", "B"])
|
| 87 |
+
self.assertEqual(tgrep.tgrep_tokenize("A<<B"), ["A", "<<", "B"])
|
| 88 |
+
self.assertEqual(tgrep.tgrep_tokenize("A>>B"), ["A", ">>", "B"])
|
| 89 |
+
self.assertEqual(tgrep.tgrep_tokenize("A<<,B"), ["A", "<<,", "B"])
|
| 90 |
+
self.assertEqual(tgrep.tgrep_tokenize("A>>,B"), ["A", ">>,", "B"])
|
| 91 |
+
self.assertEqual(tgrep.tgrep_tokenize("A<<'B"), ["A", "<<'", "B"])
|
| 92 |
+
self.assertEqual(tgrep.tgrep_tokenize("A>>'B"), ["A", ">>'", "B"])
|
| 93 |
+
self.assertEqual(tgrep.tgrep_tokenize("A<<:B"), ["A", "<<:", "B"])
|
| 94 |
+
self.assertEqual(tgrep.tgrep_tokenize("A>>:B"), ["A", ">>:", "B"])
|
| 95 |
+
self.assertEqual(tgrep.tgrep_tokenize("A.B"), ["A", ".", "B"])
|
| 96 |
+
self.assertEqual(tgrep.tgrep_tokenize("A,B"), ["A", ",", "B"])
|
| 97 |
+
self.assertEqual(tgrep.tgrep_tokenize("A..B"), ["A", "..", "B"])
|
| 98 |
+
self.assertEqual(tgrep.tgrep_tokenize("A,,B"), ["A", ",,", "B"])
|
| 99 |
+
self.assertEqual(tgrep.tgrep_tokenize("A$B"), ["A", "$", "B"])
|
| 100 |
+
self.assertEqual(tgrep.tgrep_tokenize("A$.B"), ["A", "$.", "B"])
|
| 101 |
+
self.assertEqual(tgrep.tgrep_tokenize("A$,B"), ["A", "$,", "B"])
|
| 102 |
+
self.assertEqual(tgrep.tgrep_tokenize("A$..B"), ["A", "$..", "B"])
|
| 103 |
+
self.assertEqual(tgrep.tgrep_tokenize("A$,,B"), ["A", "$,,", "B"])
|
| 104 |
+
self.assertEqual(tgrep.tgrep_tokenize("A!<B"), ["A", "!", "<", "B"])
|
| 105 |
+
self.assertEqual(tgrep.tgrep_tokenize("A!>B"), ["A", "!", ">", "B"])
|
| 106 |
+
self.assertEqual(tgrep.tgrep_tokenize("A!<3B"), ["A", "!", "<3", "B"])
|
| 107 |
+
self.assertEqual(tgrep.tgrep_tokenize("A!>3B"), ["A", "!", ">3", "B"])
|
| 108 |
+
self.assertEqual(tgrep.tgrep_tokenize("A!<,B"), ["A", "!", "<,", "B"])
|
| 109 |
+
self.assertEqual(tgrep.tgrep_tokenize("A!>,B"), ["A", "!", ">,", "B"])
|
| 110 |
+
self.assertEqual(tgrep.tgrep_tokenize("A!<-3B"), ["A", "!", "<-3", "B"])
|
| 111 |
+
self.assertEqual(tgrep.tgrep_tokenize("A!>-3B"), ["A", "!", ">-3", "B"])
|
| 112 |
+
self.assertEqual(tgrep.tgrep_tokenize("A!<-B"), ["A", "!", "<-", "B"])
|
| 113 |
+
self.assertEqual(tgrep.tgrep_tokenize("A!>-B"), ["A", "!", ">-", "B"])
|
| 114 |
+
self.assertEqual(tgrep.tgrep_tokenize("A!<'B"), ["A", "!", "<'", "B"])
|
| 115 |
+
self.assertEqual(tgrep.tgrep_tokenize("A!>'B"), ["A", "!", ">'", "B"])
|
| 116 |
+
self.assertEqual(tgrep.tgrep_tokenize("A!<:B"), ["A", "!", "<:", "B"])
|
| 117 |
+
self.assertEqual(tgrep.tgrep_tokenize("A!>:B"), ["A", "!", ">:", "B"])
|
| 118 |
+
self.assertEqual(tgrep.tgrep_tokenize("A!<<B"), ["A", "!", "<<", "B"])
|
| 119 |
+
self.assertEqual(tgrep.tgrep_tokenize("A!>>B"), ["A", "!", ">>", "B"])
|
| 120 |
+
self.assertEqual(tgrep.tgrep_tokenize("A!<<,B"), ["A", "!", "<<,", "B"])
|
| 121 |
+
self.assertEqual(tgrep.tgrep_tokenize("A!>>,B"), ["A", "!", ">>,", "B"])
|
| 122 |
+
self.assertEqual(tgrep.tgrep_tokenize("A!<<'B"), ["A", "!", "<<'", "B"])
|
| 123 |
+
self.assertEqual(tgrep.tgrep_tokenize("A!>>'B"), ["A", "!", ">>'", "B"])
|
| 124 |
+
self.assertEqual(tgrep.tgrep_tokenize("A!<<:B"), ["A", "!", "<<:", "B"])
|
| 125 |
+
self.assertEqual(tgrep.tgrep_tokenize("A!>>:B"), ["A", "!", ">>:", "B"])
|
| 126 |
+
self.assertEqual(tgrep.tgrep_tokenize("A!.B"), ["A", "!", ".", "B"])
|
| 127 |
+
self.assertEqual(tgrep.tgrep_tokenize("A!,B"), ["A", "!", ",", "B"])
|
| 128 |
+
self.assertEqual(tgrep.tgrep_tokenize("A!..B"), ["A", "!", "..", "B"])
|
| 129 |
+
self.assertEqual(tgrep.tgrep_tokenize("A!,,B"), ["A", "!", ",,", "B"])
|
| 130 |
+
self.assertEqual(tgrep.tgrep_tokenize("A!$B"), ["A", "!", "$", "B"])
|
| 131 |
+
self.assertEqual(tgrep.tgrep_tokenize("A!$.B"), ["A", "!", "$.", "B"])
|
| 132 |
+
self.assertEqual(tgrep.tgrep_tokenize("A!$,B"), ["A", "!", "$,", "B"])
|
| 133 |
+
self.assertEqual(tgrep.tgrep_tokenize("A!$..B"), ["A", "!", "$..", "B"])
|
| 134 |
+
self.assertEqual(tgrep.tgrep_tokenize("A!$,,B"), ["A", "!", "$,,", "B"])
|
| 135 |
+
|
| 136 |
+
def test_tokenize_examples(self):
|
| 137 |
+
"""
|
| 138 |
+
Test tokenization of the TGrep2 manual example patterns.
|
| 139 |
+
"""
|
| 140 |
+
self.assertEqual(tgrep.tgrep_tokenize("NP < PP"), ["NP", "<", "PP"])
|
| 141 |
+
self.assertEqual(tgrep.tgrep_tokenize("/^NP/"), ["/^NP/"])
|
| 142 |
+
self.assertEqual(
|
| 143 |
+
tgrep.tgrep_tokenize("NP << PP . VP"), ["NP", "<<", "PP", ".", "VP"]
|
| 144 |
+
)
|
| 145 |
+
self.assertEqual(
|
| 146 |
+
tgrep.tgrep_tokenize("NP << PP | . VP"), ["NP", "<<", "PP", "|", ".", "VP"]
|
| 147 |
+
)
|
| 148 |
+
self.assertEqual(
|
| 149 |
+
tgrep.tgrep_tokenize("NP !<< PP [> NP | >> VP]"),
|
| 150 |
+
["NP", "!", "<<", "PP", "[", ">", "NP", "|", ">>", "VP", "]"],
|
| 151 |
+
)
|
| 152 |
+
self.assertEqual(
|
| 153 |
+
tgrep.tgrep_tokenize("NP << (PP . VP)"),
|
| 154 |
+
["NP", "<<", "(", "PP", ".", "VP", ")"],
|
| 155 |
+
)
|
| 156 |
+
self.assertEqual(
|
| 157 |
+
tgrep.tgrep_tokenize("NP <' (PP <, (IN < on))"),
|
| 158 |
+
["NP", "<'", "(", "PP", "<,", "(", "IN", "<", "on", ")", ")"],
|
| 159 |
+
)
|
| 160 |
+
self.assertEqual(
|
| 161 |
+
tgrep.tgrep_tokenize("S < (A < B) < C"),
|
| 162 |
+
["S", "<", "(", "A", "<", "B", ")", "<", "C"],
|
| 163 |
+
)
|
| 164 |
+
self.assertEqual(
|
| 165 |
+
tgrep.tgrep_tokenize("S < ((A < B) < C)"),
|
| 166 |
+
["S", "<", "(", "(", "A", "<", "B", ")", "<", "C", ")"],
|
| 167 |
+
)
|
| 168 |
+
self.assertEqual(
|
| 169 |
+
tgrep.tgrep_tokenize("S < (A < B < C)"),
|
| 170 |
+
["S", "<", "(", "A", "<", "B", "<", "C", ")"],
|
| 171 |
+
)
|
| 172 |
+
self.assertEqual(tgrep.tgrep_tokenize("A<B&.C"), ["A", "<", "B", "&", ".", "C"])
|
| 173 |
+
|
| 174 |
+
def test_tokenize_quoting(self):
|
| 175 |
+
"""
|
| 176 |
+
Test tokenization of quoting.
|
| 177 |
+
"""
|
| 178 |
+
self.assertEqual(
|
| 179 |
+
tgrep.tgrep_tokenize('"A<<:B"<<:"A $.. B"<"A>3B"<C'),
|
| 180 |
+
['"A<<:B"', "<<:", '"A $.. B"', "<", '"A>3B"', "<", "C"],
|
| 181 |
+
)
|
| 182 |
+
|
| 183 |
+
def test_tokenize_nodenames(self):
|
| 184 |
+
"""
|
| 185 |
+
Test tokenization of node names.
|
| 186 |
+
"""
|
| 187 |
+
self.assertEqual(tgrep.tgrep_tokenize("Robert"), ["Robert"])
|
| 188 |
+
self.assertEqual(tgrep.tgrep_tokenize("/^[Bb]ob/"), ["/^[Bb]ob/"])
|
| 189 |
+
self.assertEqual(tgrep.tgrep_tokenize("*"), ["*"])
|
| 190 |
+
self.assertEqual(tgrep.tgrep_tokenize("__"), ["__"])
|
| 191 |
+
# test tokenization of NLTK tree position syntax
|
| 192 |
+
self.assertEqual(tgrep.tgrep_tokenize("N()"), ["N(", ")"])
|
| 193 |
+
self.assertEqual(tgrep.tgrep_tokenize("N(0,)"), ["N(", "0", ",", ")"])
|
| 194 |
+
self.assertEqual(tgrep.tgrep_tokenize("N(0,0)"), ["N(", "0", ",", "0", ")"])
|
| 195 |
+
self.assertEqual(
|
| 196 |
+
tgrep.tgrep_tokenize("N(0,0,)"), ["N(", "0", ",", "0", ",", ")"]
|
| 197 |
+
)
|
| 198 |
+
|
| 199 |
+
def test_tokenize_macros(self):
|
| 200 |
+
"""
|
| 201 |
+
Test tokenization of macro definitions.
|
| 202 |
+
"""
|
| 203 |
+
self.assertEqual(
|
| 204 |
+
tgrep.tgrep_tokenize(
|
| 205 |
+
"@ NP /^NP/;\n@ NN /^NN/;\n@NP [!< NP | < @NN] !$.. @NN"
|
| 206 |
+
),
|
| 207 |
+
[
|
| 208 |
+
"@",
|
| 209 |
+
"NP",
|
| 210 |
+
"/^NP/",
|
| 211 |
+
";",
|
| 212 |
+
"@",
|
| 213 |
+
"NN",
|
| 214 |
+
"/^NN/",
|
| 215 |
+
";",
|
| 216 |
+
"@NP",
|
| 217 |
+
"[",
|
| 218 |
+
"!",
|
| 219 |
+
"<",
|
| 220 |
+
"NP",
|
| 221 |
+
"|",
|
| 222 |
+
"<",
|
| 223 |
+
"@NN",
|
| 224 |
+
"]",
|
| 225 |
+
"!",
|
| 226 |
+
"$..",
|
| 227 |
+
"@NN",
|
| 228 |
+
],
|
| 229 |
+
)
|
| 230 |
+
|
| 231 |
+
def test_node_simple(self):
|
| 232 |
+
"""
|
| 233 |
+
Test a simple use of tgrep for finding nodes matching a given
|
| 234 |
+
pattern.
|
| 235 |
+
"""
|
| 236 |
+
tree = ParentedTree.fromstring(
|
| 237 |
+
"(S (NP (DT the) (JJ big) (NN dog)) " "(VP bit) (NP (DT a) (NN cat)))"
|
| 238 |
+
)
|
| 239 |
+
self.assertEqual(list(tgrep.tgrep_positions("NN", [tree])), [[(0, 2), (2, 1)]])
|
| 240 |
+
self.assertEqual(
|
| 241 |
+
list(tgrep.tgrep_nodes("NN", [tree])), [[tree[0, 2], tree[2, 1]]]
|
| 242 |
+
)
|
| 243 |
+
self.assertEqual(
|
| 244 |
+
list(tgrep.tgrep_positions("NN|JJ", [tree])), [[(0, 1), (0, 2), (2, 1)]]
|
| 245 |
+
)
|
| 246 |
+
|
| 247 |
+
def test_node_printing(self):
|
| 248 |
+
"""Test that the tgrep print operator ' is properly ignored."""
|
| 249 |
+
tree = ParentedTree.fromstring("(S (n x) (N x))")
|
| 250 |
+
self.assertEqual(
|
| 251 |
+
list(tgrep.tgrep_positions("N", [tree])),
|
| 252 |
+
list(tgrep.tgrep_positions("'N", [tree])),
|
| 253 |
+
)
|
| 254 |
+
self.assertEqual(
|
| 255 |
+
list(tgrep.tgrep_positions("/[Nn]/", [tree])),
|
| 256 |
+
list(tgrep.tgrep_positions("'/[Nn]/", [tree])),
|
| 257 |
+
)
|
| 258 |
+
|
| 259 |
+
def test_node_encoding(self):
|
| 260 |
+
"""
|
| 261 |
+
Test that tgrep search strings handles bytes and strs the same
|
| 262 |
+
way.
|
| 263 |
+
"""
|
| 264 |
+
tree = ParentedTree.fromstring(
|
| 265 |
+
"(S (NP (DT the) (JJ big) (NN dog)) " "(VP bit) (NP (DT a) (NN cat)))"
|
| 266 |
+
)
|
| 267 |
+
self.assertEqual(
|
| 268 |
+
list(tgrep.tgrep_positions(b"NN", [tree])),
|
| 269 |
+
list(tgrep.tgrep_positions(b"NN", [tree])),
|
| 270 |
+
)
|
| 271 |
+
self.assertEqual(
|
| 272 |
+
list(tgrep.tgrep_nodes(b"NN", [tree])),
|
| 273 |
+
list(tgrep.tgrep_nodes("NN", [tree])),
|
| 274 |
+
)
|
| 275 |
+
self.assertEqual(
|
| 276 |
+
list(tgrep.tgrep_positions(b"NN|JJ", [tree])),
|
| 277 |
+
list(tgrep.tgrep_positions("NN|JJ", [tree])),
|
| 278 |
+
)
|
| 279 |
+
|
| 280 |
+
def test_node_nocase(self):
|
| 281 |
+
"""
|
| 282 |
+
Test selecting nodes using case insensitive node names.
|
| 283 |
+
"""
|
| 284 |
+
tree = ParentedTree.fromstring("(S (n x) (N x))")
|
| 285 |
+
self.assertEqual(list(tgrep.tgrep_positions('"N"', [tree])), [[(1,)]])
|
| 286 |
+
self.assertEqual(list(tgrep.tgrep_positions('i@"N"', [tree])), [[(0,), (1,)]])
|
| 287 |
+
|
| 288 |
+
def test_node_quoted(self):
|
| 289 |
+
"""
|
| 290 |
+
Test selecting nodes using quoted node names.
|
| 291 |
+
"""
|
| 292 |
+
tree = ParentedTree.fromstring('(N ("N" x) (N" x) ("\\" x))')
|
| 293 |
+
self.assertEqual(list(tgrep.tgrep_positions('"N"', [tree])), [[()]])
|
| 294 |
+
self.assertEqual(list(tgrep.tgrep_positions('"\\"N\\""', [tree])), [[(0,)]])
|
| 295 |
+
self.assertEqual(list(tgrep.tgrep_positions('"N\\""', [tree])), [[(1,)]])
|
| 296 |
+
self.assertEqual(list(tgrep.tgrep_positions('"\\"\\\\\\""', [tree])), [[(2,)]])
|
| 297 |
+
|
| 298 |
+
def test_node_regex(self):
|
| 299 |
+
"""
|
| 300 |
+
Test regex matching on nodes.
|
| 301 |
+
"""
|
| 302 |
+
tree = ParentedTree.fromstring("(S (NP-SBJ x) (NP x) (NNP x) (VP x))")
|
| 303 |
+
# This is a regular expression that matches any node whose
|
| 304 |
+
# name starts with NP, including NP-SBJ:
|
| 305 |
+
self.assertEqual(list(tgrep.tgrep_positions("/^NP/", [tree])), [[(0,), (1,)]])
|
| 306 |
+
|
| 307 |
+
def test_node_regex_2(self):
|
| 308 |
+
"""
|
| 309 |
+
Test regex matching on nodes.
|
| 310 |
+
"""
|
| 311 |
+
tree = ParentedTree.fromstring("(S (SBJ x) (SBJ1 x) (NP-SBJ x))")
|
| 312 |
+
self.assertEqual(list(tgrep.tgrep_positions("/^SBJ/", [tree])), [[(0,), (1,)]])
|
| 313 |
+
# This is a regular expression that matches any node whose
|
| 314 |
+
# name includes SBJ, including NP-SBJ:
|
| 315 |
+
self.assertEqual(
|
| 316 |
+
list(tgrep.tgrep_positions("/SBJ/", [tree])), [[(0,), (1,), (2,)]]
|
| 317 |
+
)
|
| 318 |
+
|
| 319 |
+
def test_node_tree_position(self):
|
| 320 |
+
"""
|
| 321 |
+
Test matching on nodes based on NLTK tree position.
|
| 322 |
+
"""
|
| 323 |
+
tree = ParentedTree.fromstring("(S (NP-SBJ x) (NP x) (NNP x) (VP x))")
|
| 324 |
+
# test all tree positions that are not leaves
|
| 325 |
+
leaf_positions = {tree.leaf_treeposition(x) for x in range(len(tree.leaves()))}
|
| 326 |
+
tree_positions = [x for x in tree.treepositions() if x not in leaf_positions]
|
| 327 |
+
for position in tree_positions:
|
| 328 |
+
node_id = f"N{position}"
|
| 329 |
+
tgrep_positions = list(tgrep.tgrep_positions(node_id, [tree]))
|
| 330 |
+
self.assertEqual(len(tgrep_positions[0]), 1)
|
| 331 |
+
self.assertEqual(tgrep_positions[0][0], position)
|
| 332 |
+
|
| 333 |
+
def test_node_noleaves(self):
|
| 334 |
+
"""
|
| 335 |
+
Test node name matching with the search_leaves flag set to False.
|
| 336 |
+
"""
|
| 337 |
+
tree = ParentedTree.fromstring("(S (A (T x)) (B (N x)))")
|
| 338 |
+
self.assertEqual(
|
| 339 |
+
list(tgrep.tgrep_positions("x", [tree])), [[(0, 0, 0), (1, 0, 0)]]
|
| 340 |
+
)
|
| 341 |
+
self.assertEqual(list(tgrep.tgrep_positions("x", [tree], False)), [[]])
|
| 342 |
+
|
| 343 |
+
def tests_rel_dominance(self):
|
| 344 |
+
"""
|
| 345 |
+
Test matching nodes based on dominance relations.
|
| 346 |
+
"""
|
| 347 |
+
tree = ParentedTree.fromstring("(S (A (T x)) (B (N x)))")
|
| 348 |
+
self.assertEqual(list(tgrep.tgrep_positions("* < T", [tree])), [[(0,)]])
|
| 349 |
+
self.assertEqual(list(tgrep.tgrep_positions("* < T > S", [tree])), [[(0,)]])
|
| 350 |
+
self.assertEqual(
|
| 351 |
+
list(tgrep.tgrep_positions("* !< T", [tree])),
|
| 352 |
+
[[(), (0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0)]],
|
| 353 |
+
)
|
| 354 |
+
self.assertEqual(list(tgrep.tgrep_positions("* !< T > S", [tree])), [[(1,)]])
|
| 355 |
+
self.assertEqual(list(tgrep.tgrep_positions("* > A", [tree])), [[(0, 0)]])
|
| 356 |
+
self.assertEqual(list(tgrep.tgrep_positions("* > B", [tree])), [[(1, 0)]])
|
| 357 |
+
self.assertEqual(
|
| 358 |
+
list(tgrep.tgrep_positions("* !> B", [tree])),
|
| 359 |
+
[[(), (0,), (0, 0), (0, 0, 0), (1,), (1, 0, 0)]],
|
| 360 |
+
)
|
| 361 |
+
self.assertEqual(
|
| 362 |
+
list(tgrep.tgrep_positions("* !> B >> S", [tree])), [[(0,), (0, 0), (1,)]]
|
| 363 |
+
)
|
| 364 |
+
self.assertEqual(
|
| 365 |
+
list(tgrep.tgrep_positions("* >> S", [tree])),
|
| 366 |
+
[[(0,), (0, 0), (1,), (1, 0)]],
|
| 367 |
+
)
|
| 368 |
+
self.assertEqual(
|
| 369 |
+
list(tgrep.tgrep_positions("* >>, S", [tree])), [[(0,), (0, 0)]]
|
| 370 |
+
)
|
| 371 |
+
self.assertEqual(
|
| 372 |
+
list(tgrep.tgrep_positions("* >>' S", [tree])), [[(1,), (1, 0)]]
|
| 373 |
+
)
|
| 374 |
+
# Known issue:
|
| 375 |
+
# self.assertEqual(list(tgrep.tgrep_positions('* !>> S', [tree])),
|
| 376 |
+
# [[()]])
|
| 377 |
+
self.assertEqual(list(tgrep.tgrep_positions("* << T", [tree])), [[(), (0,)]])
|
| 378 |
+
self.assertEqual(list(tgrep.tgrep_positions("* <<' T", [tree])), [[(0,)]])
|
| 379 |
+
self.assertEqual(list(tgrep.tgrep_positions("* <<1 N", [tree])), [[(1,)]])
|
| 380 |
+
self.assertEqual(
|
| 381 |
+
list(tgrep.tgrep_positions("* !<< T", [tree])),
|
| 382 |
+
[[(0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0)]],
|
| 383 |
+
)
|
| 384 |
+
tree = ParentedTree.fromstring("(S (A (T x)) (B (T x) (N x )))")
|
| 385 |
+
self.assertEqual(list(tgrep.tgrep_positions("* <: T", [tree])), [[(0,)]])
|
| 386 |
+
self.assertEqual(list(tgrep.tgrep_positions("* < T", [tree])), [[(0,), (1,)]])
|
| 387 |
+
self.assertEqual(
|
| 388 |
+
list(tgrep.tgrep_positions("* !<: T", [tree])),
|
| 389 |
+
[[(), (0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0), (1, 1), (1, 1, 0)]],
|
| 390 |
+
)
|
| 391 |
+
self.assertEqual(list(tgrep.tgrep_positions("* !<: T > S", [tree])), [[(1,)]])
|
| 392 |
+
tree = ParentedTree.fromstring("(S (T (A x) (B x)) (T (C x)))")
|
| 393 |
+
self.assertEqual(list(tgrep.tgrep_positions("* >: T", [tree])), [[(1, 0)]])
|
| 394 |
+
self.assertEqual(
|
| 395 |
+
list(tgrep.tgrep_positions("* !>: T", [tree])),
|
| 396 |
+
[[(), (0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1,), (1, 0, 0)]],
|
| 397 |
+
)
|
| 398 |
+
tree = ParentedTree.fromstring(
|
| 399 |
+
"(S (A (B (C (D (E (T x))))))" " (A (B (C (D (E (T x))) (N x)))))"
|
| 400 |
+
)
|
| 401 |
+
self.assertEqual(
|
| 402 |
+
list(tgrep.tgrep_positions("* <<: T", [tree])),
|
| 403 |
+
[
|
| 404 |
+
[
|
| 405 |
+
(0,),
|
| 406 |
+
(0, 0),
|
| 407 |
+
(0, 0, 0),
|
| 408 |
+
(0, 0, 0, 0),
|
| 409 |
+
(0, 0, 0, 0, 0),
|
| 410 |
+
(1, 0, 0, 0),
|
| 411 |
+
(1, 0, 0, 0, 0),
|
| 412 |
+
]
|
| 413 |
+
],
|
| 414 |
+
)
|
| 415 |
+
self.assertEqual(
|
| 416 |
+
list(tgrep.tgrep_positions("* >>: A", [tree])),
|
| 417 |
+
[
|
| 418 |
+
[
|
| 419 |
+
(0, 0),
|
| 420 |
+
(0, 0, 0),
|
| 421 |
+
(0, 0, 0, 0),
|
| 422 |
+
(0, 0, 0, 0, 0),
|
| 423 |
+
(0, 0, 0, 0, 0, 0),
|
| 424 |
+
(1, 0),
|
| 425 |
+
(1, 0, 0),
|
| 426 |
+
]
|
| 427 |
+
],
|
| 428 |
+
)
|
| 429 |
+
|
| 430 |
+
def test_bad_operator(self):
|
| 431 |
+
"""
|
| 432 |
+
Test error handling of undefined tgrep operators.
|
| 433 |
+
"""
|
| 434 |
+
tree = ParentedTree.fromstring("(S (A (T x)) (B (N x)))")
|
| 435 |
+
self.assertRaises(
|
| 436 |
+
tgrep.TgrepException, list, tgrep.tgrep_positions("* >>> S", [tree])
|
| 437 |
+
)
|
| 438 |
+
|
| 439 |
+
def test_comments(self):
|
| 440 |
+
"""
|
| 441 |
+
Test that comments are correctly filtered out of tgrep search
|
| 442 |
+
strings.
|
| 443 |
+
"""
|
| 444 |
+
tree = ParentedTree.fromstring("(S (NN x) (NP x) (NN x))")
|
| 445 |
+
search1 = """
|
| 446 |
+
@ NP /^NP/;
|
| 447 |
+
@ NN /^NN/;
|
| 448 |
+
@NN
|
| 449 |
+
"""
|
| 450 |
+
self.assertEqual(list(tgrep.tgrep_positions(search1, [tree])), [[(0,), (2,)]])
|
| 451 |
+
search2 = """
|
| 452 |
+
# macros
|
| 453 |
+
@ NP /^NP/;
|
| 454 |
+
@ NN /^NN/;
|
| 455 |
+
|
| 456 |
+
# search string
|
| 457 |
+
@NN
|
| 458 |
+
"""
|
| 459 |
+
self.assertEqual(list(tgrep.tgrep_positions(search2, [tree])), [[(0,), (2,)]])
|
| 460 |
+
|
| 461 |
+
def test_rel_sister_nodes(self):
|
| 462 |
+
"""
|
| 463 |
+
Test matching sister nodes in a tree.
|
| 464 |
+
"""
|
| 465 |
+
tree = ParentedTree.fromstring("(S (A x) (B x) (C x))")
|
| 466 |
+
self.assertEqual(list(tgrep.tgrep_positions("* $. B", [tree])), [[(0,)]])
|
| 467 |
+
self.assertEqual(list(tgrep.tgrep_positions("* $.. B", [tree])), [[(0,)]])
|
| 468 |
+
self.assertEqual(list(tgrep.tgrep_positions("* $, B", [tree])), [[(2,)]])
|
| 469 |
+
self.assertEqual(list(tgrep.tgrep_positions("* $,, B", [tree])), [[(2,)]])
|
| 470 |
+
self.assertEqual(list(tgrep.tgrep_positions("* $ B", [tree])), [[(0,), (2,)]])
|
| 471 |
+
|
| 472 |
+
def tests_rel_indexed_children(self):
|
| 473 |
+
"""
|
| 474 |
+
Test matching nodes based on their index in their parent node.
|
| 475 |
+
"""
|
| 476 |
+
tree = ParentedTree.fromstring("(S (A x) (B x) (C x))")
|
| 477 |
+
self.assertEqual(list(tgrep.tgrep_positions("* >, S", [tree])), [[(0,)]])
|
| 478 |
+
self.assertEqual(list(tgrep.tgrep_positions("* >1 S", [tree])), [[(0,)]])
|
| 479 |
+
self.assertEqual(list(tgrep.tgrep_positions("* >2 S", [tree])), [[(1,)]])
|
| 480 |
+
self.assertEqual(list(tgrep.tgrep_positions("* >3 S", [tree])), [[(2,)]])
|
| 481 |
+
self.assertEqual(list(tgrep.tgrep_positions("* >' S", [tree])), [[(2,)]])
|
| 482 |
+
self.assertEqual(list(tgrep.tgrep_positions("* >-1 S", [tree])), [[(2,)]])
|
| 483 |
+
self.assertEqual(list(tgrep.tgrep_positions("* >-2 S", [tree])), [[(1,)]])
|
| 484 |
+
self.assertEqual(list(tgrep.tgrep_positions("* >-3 S", [tree])), [[(0,)]])
|
| 485 |
+
tree = ParentedTree.fromstring(
|
| 486 |
+
"(S (D (A x) (B x) (C x)) (E (B x) (C x) (A x)) " "(F (C x) (A x) (B x)))"
|
| 487 |
+
)
|
| 488 |
+
self.assertEqual(list(tgrep.tgrep_positions("* <, A", [tree])), [[(0,)]])
|
| 489 |
+
self.assertEqual(list(tgrep.tgrep_positions("* <1 A", [tree])), [[(0,)]])
|
| 490 |
+
self.assertEqual(list(tgrep.tgrep_positions("* <2 A", [tree])), [[(2,)]])
|
| 491 |
+
self.assertEqual(list(tgrep.tgrep_positions("* <3 A", [tree])), [[(1,)]])
|
| 492 |
+
self.assertEqual(list(tgrep.tgrep_positions("* <' A", [tree])), [[(1,)]])
|
| 493 |
+
self.assertEqual(list(tgrep.tgrep_positions("* <-1 A", [tree])), [[(1,)]])
|
| 494 |
+
self.assertEqual(list(tgrep.tgrep_positions("* <-2 A", [tree])), [[(2,)]])
|
| 495 |
+
self.assertEqual(list(tgrep.tgrep_positions("* <-3 A", [tree])), [[(0,)]])
|
| 496 |
+
|
| 497 |
+
def test_rel_precedence(self):
|
| 498 |
+
"""
|
| 499 |
+
Test matching nodes based on precedence relations.
|
| 500 |
+
"""
|
| 501 |
+
tree = ParentedTree.fromstring(
|
| 502 |
+
"(S (NP (NP (PP x)) (NP (AP x)))"
|
| 503 |
+
" (VP (AP (X (PP x)) (Y (AP x))))"
|
| 504 |
+
" (NP (RC (NP (AP x)))))"
|
| 505 |
+
)
|
| 506 |
+
self.assertEqual(
|
| 507 |
+
list(tgrep.tgrep_positions("* . X", [tree])), [[(0,), (0, 1), (0, 1, 0)]]
|
| 508 |
+
)
|
| 509 |
+
self.assertEqual(
|
| 510 |
+
list(tgrep.tgrep_positions("* . Y", [tree])), [[(1, 0, 0), (1, 0, 0, 0)]]
|
| 511 |
+
)
|
| 512 |
+
self.assertEqual(
|
| 513 |
+
list(tgrep.tgrep_positions("* .. X", [tree])),
|
| 514 |
+
[[(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0)]],
|
| 515 |
+
)
|
| 516 |
+
self.assertEqual(
|
| 517 |
+
list(tgrep.tgrep_positions("* .. Y", [tree])),
|
| 518 |
+
[[(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1, 0, 0), (1, 0, 0, 0)]],
|
| 519 |
+
)
|
| 520 |
+
self.assertEqual(
|
| 521 |
+
list(tgrep.tgrep_positions("* , X", [tree])), [[(1, 0, 1), (1, 0, 1, 0)]]
|
| 522 |
+
)
|
| 523 |
+
self.assertEqual(
|
| 524 |
+
list(tgrep.tgrep_positions("* , Y", [tree])),
|
| 525 |
+
[[(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]],
|
| 526 |
+
)
|
| 527 |
+
self.assertEqual(
|
| 528 |
+
list(tgrep.tgrep_positions("* ,, X", [tree])),
|
| 529 |
+
[[(1, 0, 1), (1, 0, 1, 0), (2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]],
|
| 530 |
+
)
|
| 531 |
+
self.assertEqual(
|
| 532 |
+
list(tgrep.tgrep_positions("* ,, Y", [tree])),
|
| 533 |
+
[[(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]],
|
| 534 |
+
)
|
| 535 |
+
|
| 536 |
+
def test_examples(self):
|
| 537 |
+
"""
|
| 538 |
+
Test the Basic Examples from the TGrep2 manual.
|
| 539 |
+
"""
|
| 540 |
+
tree = ParentedTree.fromstring("(S (NP (AP x)) (NP (PP x)))")
|
| 541 |
+
# This matches any NP node that immediately dominates a PP:
|
| 542 |
+
self.assertEqual(list(tgrep.tgrep_positions("NP < PP", [tree])), [[(1,)]])
|
| 543 |
+
|
| 544 |
+
tree = ParentedTree.fromstring("(S (NP x) (VP x) (NP (PP x)) (VP x))")
|
| 545 |
+
# This matches an NP that dominates a PP and is immediately
|
| 546 |
+
# followed by a VP:
|
| 547 |
+
self.assertEqual(list(tgrep.tgrep_positions("NP << PP . VP", [tree])), [[(2,)]])
|
| 548 |
+
|
| 549 |
+
tree = ParentedTree.fromstring(
|
| 550 |
+
"(S (NP (AP x)) (NP (PP x)) " "(NP (DET x) (NN x)) (VP x))"
|
| 551 |
+
)
|
| 552 |
+
# This matches an NP that dominates a PP or is immediately
|
| 553 |
+
# followed by a VP:
|
| 554 |
+
self.assertEqual(
|
| 555 |
+
list(tgrep.tgrep_positions("NP << PP | . VP", [tree])), [[(1,), (2,)]]
|
| 556 |
+
)
|
| 557 |
+
|
| 558 |
+
tree = ParentedTree.fromstring(
|
| 559 |
+
"(S (NP (NP (PP x)) (NP (AP x)))"
|
| 560 |
+
" (VP (AP (NP (PP x)) (NP (AP x))))"
|
| 561 |
+
" (NP (RC (NP (AP x)))))"
|
| 562 |
+
)
|
| 563 |
+
# This matches an NP that does not dominate a PP. Also, the NP
|
| 564 |
+
# must either have a parent that is an NP or be dominated by a
|
| 565 |
+
# VP:
|
| 566 |
+
self.assertEqual(
|
| 567 |
+
list(tgrep.tgrep_positions("NP !<< PP [> NP | >> VP]", [tree])),
|
| 568 |
+
[[(0, 1), (1, 0, 1)]],
|
| 569 |
+
)
|
| 570 |
+
|
| 571 |
+
tree = ParentedTree.fromstring(
|
| 572 |
+
"(S (NP (AP (PP x) (VP x))) " "(NP (AP (PP x) (NP x))) (NP x))"
|
| 573 |
+
)
|
| 574 |
+
# This matches an NP that dominates a PP which itself is
|
| 575 |
+
# immediately followed by a VP. Note the use of parentheses to
|
| 576 |
+
# group ". VP" with the PP rather than with the NP:
|
| 577 |
+
self.assertEqual(
|
| 578 |
+
list(tgrep.tgrep_positions("NP << (PP . VP)", [tree])), [[(0,)]]
|
| 579 |
+
)
|
| 580 |
+
|
| 581 |
+
tree = ParentedTree.fromstring(
|
| 582 |
+
"(S (NP (DET a) (NN cat) (PP (IN on) (NP x)))"
|
| 583 |
+
" (NP (DET a) (NN cat) (PP (IN on) (NP x)) (PP x))"
|
| 584 |
+
" (NP x))"
|
| 585 |
+
)
|
| 586 |
+
# This matches an NP whose last child is a PP that begins with
|
| 587 |
+
# the preposition "on":
|
| 588 |
+
self.assertEqual(
|
| 589 |
+
list(tgrep.tgrep_positions("NP <' (PP <, (IN < on))", [tree])), [[(0,)]]
|
| 590 |
+
)
|
| 591 |
+
|
| 592 |
+
tree = ParentedTree.fromstring(
|
| 593 |
+
"(S (S (C x) (A (B x))) (S (C x) (A x)) " "(S (D x) (A (B x))))"
|
| 594 |
+
)
|
| 595 |
+
# The following pattern matches an S which has a child A and
|
| 596 |
+
# another child that is a C and that the A has a child B:
|
| 597 |
+
self.assertEqual(
|
| 598 |
+
list(tgrep.tgrep_positions("S < (A < B) < C", [tree])), [[(0,)]]
|
| 599 |
+
)
|
| 600 |
+
|
| 601 |
+
tree = ParentedTree.fromstring(
|
| 602 |
+
"(S (S (A (B x) (C x))) (S (S (C x) (A (B x)))))"
|
| 603 |
+
)
|
| 604 |
+
# However, this pattern means that S has child A and that A
|
| 605 |
+
# has children B and C:
|
| 606 |
+
self.assertEqual(
|
| 607 |
+
list(tgrep.tgrep_positions("S < ((A < B) < C)", [tree])), [[(0,)]]
|
| 608 |
+
)
|
| 609 |
+
|
| 610 |
+
# It is equivalent to this:
|
| 611 |
+
self.assertEqual(
|
| 612 |
+
list(tgrep.tgrep_positions("S < (A < B < C)", [tree])), [[(0,)]]
|
| 613 |
+
)
|
| 614 |
+
|
| 615 |
+
def test_use_macros(self):
|
| 616 |
+
"""
|
| 617 |
+
Test defining and using tgrep2 macros.
|
| 618 |
+
"""
|
| 619 |
+
tree = ParentedTree.fromstring(
|
| 620 |
+
"(VP (VB sold) (NP (DET the) "
|
| 621 |
+
"(NN heiress)) (NP (NN deed) (PREP to) "
|
| 622 |
+
"(NP (DET the) (NN school) (NN house))))"
|
| 623 |
+
)
|
| 624 |
+
self.assertEqual(
|
| 625 |
+
list(
|
| 626 |
+
tgrep.tgrep_positions(
|
| 627 |
+
"@ NP /^NP/;\n@ NN /^NN/;\n@NP !< @NP !$.. @NN", [tree]
|
| 628 |
+
)
|
| 629 |
+
),
|
| 630 |
+
[[(1,), (2, 2)]],
|
| 631 |
+
)
|
| 632 |
+
# use undefined macro @CNP
|
| 633 |
+
self.assertRaises(
|
| 634 |
+
tgrep.TgrepException,
|
| 635 |
+
list,
|
| 636 |
+
tgrep.tgrep_positions(
|
| 637 |
+
"@ NP /^NP/;\n@ NN /^NN/;\n@CNP !< @NP !$.. @NN", [tree]
|
| 638 |
+
),
|
| 639 |
+
)
|
| 640 |
+
|
| 641 |
+
def test_tokenize_node_labels(self):
|
| 642 |
+
"""Test tokenization of labeled nodes."""
|
| 643 |
+
self.assertEqual(
|
| 644 |
+
tgrep.tgrep_tokenize("S < @SBJ < (@VP < (@VB $.. @OBJ))"),
|
| 645 |
+
[
|
| 646 |
+
"S",
|
| 647 |
+
"<",
|
| 648 |
+
"@SBJ",
|
| 649 |
+
"<",
|
| 650 |
+
"(",
|
| 651 |
+
"@VP",
|
| 652 |
+
"<",
|
| 653 |
+
"(",
|
| 654 |
+
"@VB",
|
| 655 |
+
"$..",
|
| 656 |
+
"@OBJ",
|
| 657 |
+
")",
|
| 658 |
+
")",
|
| 659 |
+
],
|
| 660 |
+
)
|
| 661 |
+
self.assertEqual(
|
| 662 |
+
tgrep.tgrep_tokenize("S < @SBJ=s < (@VP=v < (@VB $.. @OBJ))"),
|
| 663 |
+
[
|
| 664 |
+
"S",
|
| 665 |
+
"<",
|
| 666 |
+
"@SBJ",
|
| 667 |
+
"=",
|
| 668 |
+
"s",
|
| 669 |
+
"<",
|
| 670 |
+
"(",
|
| 671 |
+
"@VP",
|
| 672 |
+
"=",
|
| 673 |
+
"v",
|
| 674 |
+
"<",
|
| 675 |
+
"(",
|
| 676 |
+
"@VB",
|
| 677 |
+
"$..",
|
| 678 |
+
"@OBJ",
|
| 679 |
+
")",
|
| 680 |
+
")",
|
| 681 |
+
],
|
| 682 |
+
)
|
| 683 |
+
|
| 684 |
+
def test_tokenize_segmented_patterns(self):
|
| 685 |
+
"""Test tokenization of segmented patterns."""
|
| 686 |
+
self.assertEqual(
|
| 687 |
+
tgrep.tgrep_tokenize("S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v"),
|
| 688 |
+
[
|
| 689 |
+
"S",
|
| 690 |
+
"<",
|
| 691 |
+
"@SBJ",
|
| 692 |
+
"=",
|
| 693 |
+
"s",
|
| 694 |
+
"<",
|
| 695 |
+
"(",
|
| 696 |
+
"@VP",
|
| 697 |
+
"=",
|
| 698 |
+
"v",
|
| 699 |
+
"<",
|
| 700 |
+
"(",
|
| 701 |
+
"@VB",
|
| 702 |
+
"$..",
|
| 703 |
+
"@OBJ",
|
| 704 |
+
")",
|
| 705 |
+
")",
|
| 706 |
+
":",
|
| 707 |
+
"=s",
|
| 708 |
+
"..",
|
| 709 |
+
"=v",
|
| 710 |
+
],
|
| 711 |
+
)
|
| 712 |
+
|
| 713 |
+
def test_labeled_nodes(self):
|
| 714 |
+
"""
|
| 715 |
+
Test labeled nodes.
|
| 716 |
+
|
| 717 |
+
Test case from Emily M. Bender.
|
| 718 |
+
"""
|
| 719 |
+
search = """
|
| 720 |
+
# macros
|
| 721 |
+
@ SBJ /SBJ/;
|
| 722 |
+
@ VP /VP/;
|
| 723 |
+
@ VB /VB/;
|
| 724 |
+
@ VPoB /V[PB]/;
|
| 725 |
+
@ OBJ /OBJ/;
|
| 726 |
+
|
| 727 |
+
# 1 svo
|
| 728 |
+
S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v"""
|
| 729 |
+
sent1 = ParentedTree.fromstring(
|
| 730 |
+
"(S (NP-SBJ I) (VP (VB eat) (NP-OBJ (NNS apples))))"
|
| 731 |
+
)
|
| 732 |
+
sent2 = ParentedTree.fromstring(
|
| 733 |
+
"(S (VP (VB eat) (NP-OBJ (NNS apples))) (NP-SBJ I))"
|
| 734 |
+
)
|
| 735 |
+
search_firsthalf = search.split("\n\n")[0] + "S < @SBJ < (@VP < (@VB $.. @OBJ))"
|
| 736 |
+
search_rewrite = "S < (/.*SBJ/ $.. (/VP/ < (/VB/ $.. /.*OBJ/)))"
|
| 737 |
+
|
| 738 |
+
self.assertTrue(list(tgrep.tgrep_positions(search_firsthalf, [sent1]))[0])
|
| 739 |
+
self.assertTrue(list(tgrep.tgrep_positions(search, [sent1]))[0])
|
| 740 |
+
self.assertTrue(list(tgrep.tgrep_positions(search_rewrite, [sent1]))[0])
|
| 741 |
+
self.assertEqual(
|
| 742 |
+
list(tgrep.tgrep_positions(search, [sent1])),
|
| 743 |
+
list(tgrep.tgrep_positions(search_rewrite, [sent1])),
|
| 744 |
+
)
|
| 745 |
+
self.assertTrue(list(tgrep.tgrep_positions(search_firsthalf, [sent2]))[0])
|
| 746 |
+
self.assertFalse(list(tgrep.tgrep_positions(search, [sent2]))[0])
|
| 747 |
+
self.assertFalse(list(tgrep.tgrep_positions(search_rewrite, [sent2]))[0])
|
| 748 |
+
self.assertEqual(
|
| 749 |
+
list(tgrep.tgrep_positions(search, [sent2])),
|
| 750 |
+
list(tgrep.tgrep_positions(search_rewrite, [sent2])),
|
| 751 |
+
)
|
| 752 |
+
|
| 753 |
+
def test_multiple_conjs(self):
|
| 754 |
+
"""
|
| 755 |
+
Test that multiple (3 or more) conjunctions of node relations are
|
| 756 |
+
handled properly.
|
| 757 |
+
"""
|
| 758 |
+
sent = ParentedTree.fromstring("((A (B b) (C c)) (A (B b) (C c) (D d)))")
|
| 759 |
+
# search = '(A < B < C < D)'
|
| 760 |
+
# search_tworels = '(A < B < C)'
|
| 761 |
+
self.assertEqual(
|
| 762 |
+
list(tgrep.tgrep_positions("(A < B < C < D)", [sent])), [[(1,)]]
|
| 763 |
+
)
|
| 764 |
+
self.assertEqual(
|
| 765 |
+
list(tgrep.tgrep_positions("(A < B < C)", [sent])), [[(0,), (1,)]]
|
| 766 |
+
)
|
| 767 |
+
|
| 768 |
+
def test_trailing_semicolon(self):
|
| 769 |
+
"""
|
| 770 |
+
Test that semicolons at the end of a tgrep2 search string won't
|
| 771 |
+
cause a parse failure.
|
| 772 |
+
"""
|
| 773 |
+
tree = ParentedTree.fromstring(
|
| 774 |
+
"(S (NP (DT the) (JJ big) (NN dog)) " "(VP bit) (NP (DT a) (NN cat)))"
|
| 775 |
+
)
|
| 776 |
+
self.assertEqual(list(tgrep.tgrep_positions("NN", [tree])), [[(0, 2), (2, 1)]])
|
| 777 |
+
self.assertEqual(list(tgrep.tgrep_positions("NN;", [tree])), [[(0, 2), (2, 1)]])
|
| 778 |
+
self.assertEqual(
|
| 779 |
+
list(tgrep.tgrep_positions("NN;;", [tree])), [[(0, 2), (2, 1)]]
|
| 780 |
+
)
|
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_tokenize.py
ADDED
|
@@ -0,0 +1,867 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Unit tests for nltk.tokenize.
|
| 3 |
+
See also nltk/test/tokenize.doctest
|
| 4 |
+
"""
|
| 5 |
+
from typing import List, Tuple
|
| 6 |
+
|
| 7 |
+
import pytest
|
| 8 |
+
|
| 9 |
+
from nltk.tokenize import (
|
| 10 |
+
LegalitySyllableTokenizer,
|
| 11 |
+
StanfordSegmenter,
|
| 12 |
+
SyllableTokenizer,
|
| 13 |
+
TreebankWordTokenizer,
|
| 14 |
+
TweetTokenizer,
|
| 15 |
+
punkt,
|
| 16 |
+
sent_tokenize,
|
| 17 |
+
word_tokenize,
|
| 18 |
+
)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def load_stanford_segmenter():
|
| 22 |
+
try:
|
| 23 |
+
seg = StanfordSegmenter()
|
| 24 |
+
seg.default_config("ar")
|
| 25 |
+
seg.default_config("zh")
|
| 26 |
+
return True
|
| 27 |
+
except LookupError:
|
| 28 |
+
return False
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
check_stanford_segmenter = pytest.mark.skipif(
|
| 32 |
+
not load_stanford_segmenter(),
|
| 33 |
+
reason="NLTK was unable to find stanford-segmenter.jar.",
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class TestTokenize:
|
| 38 |
+
def test_tweet_tokenizer(self):
|
| 39 |
+
"""
|
| 40 |
+
Test TweetTokenizer using words with special and accented characters.
|
| 41 |
+
"""
|
| 42 |
+
|
| 43 |
+
tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
|
| 44 |
+
s9 = "@myke: Let's test these words: resumé España München français"
|
| 45 |
+
tokens = tokenizer.tokenize(s9)
|
| 46 |
+
expected = [
|
| 47 |
+
":",
|
| 48 |
+
"Let's",
|
| 49 |
+
"test",
|
| 50 |
+
"these",
|
| 51 |
+
"words",
|
| 52 |
+
":",
|
| 53 |
+
"resumé",
|
| 54 |
+
"España",
|
| 55 |
+
"München",
|
| 56 |
+
"français",
|
| 57 |
+
]
|
| 58 |
+
assert tokens == expected
|
| 59 |
+
|
| 60 |
+
@pytest.mark.parametrize(
|
| 61 |
+
"test_input, expecteds",
|
| 62 |
+
[
|
| 63 |
+
(
|
| 64 |
+
"My text 0106404243030 is great text",
|
| 65 |
+
(
|
| 66 |
+
["My", "text", "01064042430", "30", "is", "great", "text"],
|
| 67 |
+
["My", "text", "0106404243030", "is", "great", "text"],
|
| 68 |
+
),
|
| 69 |
+
),
|
| 70 |
+
(
|
| 71 |
+
"My ticket id is 1234543124123",
|
| 72 |
+
(
|
| 73 |
+
["My", "ticket", "id", "is", "12345431241", "23"],
|
| 74 |
+
["My", "ticket", "id", "is", "1234543124123"],
|
| 75 |
+
),
|
| 76 |
+
),
|
| 77 |
+
(
|
| 78 |
+
"@remy: This is waaaaayyyy too much for you!!!!!! 01064042430",
|
| 79 |
+
(
|
| 80 |
+
[
|
| 81 |
+
":",
|
| 82 |
+
"This",
|
| 83 |
+
"is",
|
| 84 |
+
"waaayyy",
|
| 85 |
+
"too",
|
| 86 |
+
"much",
|
| 87 |
+
"for",
|
| 88 |
+
"you",
|
| 89 |
+
"!",
|
| 90 |
+
"!",
|
| 91 |
+
"!",
|
| 92 |
+
"01064042430",
|
| 93 |
+
],
|
| 94 |
+
[
|
| 95 |
+
":",
|
| 96 |
+
"This",
|
| 97 |
+
"is",
|
| 98 |
+
"waaayyy",
|
| 99 |
+
"too",
|
| 100 |
+
"much",
|
| 101 |
+
"for",
|
| 102 |
+
"you",
|
| 103 |
+
"!",
|
| 104 |
+
"!",
|
| 105 |
+
"!",
|
| 106 |
+
"01064042430",
|
| 107 |
+
],
|
| 108 |
+
),
|
| 109 |
+
),
|
| 110 |
+
# Further tests from https://github.com/nltk/nltk/pull/2798#issuecomment-922533085,
|
| 111 |
+
# showing the TweetTokenizer performance for `match_phone_numbers=True` and
|
| 112 |
+
# `match_phone_numbers=False`.
|
| 113 |
+
(
|
| 114 |
+
# Some phone numbers are always tokenized, even with `match_phone_numbers=`False`
|
| 115 |
+
"My number is 06-46124080, except it's not.",
|
| 116 |
+
(
|
| 117 |
+
[
|
| 118 |
+
"My",
|
| 119 |
+
"number",
|
| 120 |
+
"is",
|
| 121 |
+
"06-46124080",
|
| 122 |
+
",",
|
| 123 |
+
"except",
|
| 124 |
+
"it's",
|
| 125 |
+
"not",
|
| 126 |
+
".",
|
| 127 |
+
],
|
| 128 |
+
[
|
| 129 |
+
"My",
|
| 130 |
+
"number",
|
| 131 |
+
"is",
|
| 132 |
+
"06-46124080",
|
| 133 |
+
",",
|
| 134 |
+
"except",
|
| 135 |
+
"it's",
|
| 136 |
+
"not",
|
| 137 |
+
".",
|
| 138 |
+
],
|
| 139 |
+
),
|
| 140 |
+
),
|
| 141 |
+
(
|
| 142 |
+
# Phone number here is only tokenized correctly if `match_phone_numbers=True`
|
| 143 |
+
"My number is 601-984-4813, except it's not.",
|
| 144 |
+
(
|
| 145 |
+
[
|
| 146 |
+
"My",
|
| 147 |
+
"number",
|
| 148 |
+
"is",
|
| 149 |
+
"601-984-4813",
|
| 150 |
+
",",
|
| 151 |
+
"except",
|
| 152 |
+
"it's",
|
| 153 |
+
"not",
|
| 154 |
+
".",
|
| 155 |
+
],
|
| 156 |
+
[
|
| 157 |
+
"My",
|
| 158 |
+
"number",
|
| 159 |
+
"is",
|
| 160 |
+
"601-984-",
|
| 161 |
+
"4813",
|
| 162 |
+
",",
|
| 163 |
+
"except",
|
| 164 |
+
"it's",
|
| 165 |
+
"not",
|
| 166 |
+
".",
|
| 167 |
+
],
|
| 168 |
+
),
|
| 169 |
+
),
|
| 170 |
+
(
|
| 171 |
+
# Phone number here is only tokenized correctly if `match_phone_numbers=True`
|
| 172 |
+
"My number is (393) 928 -3010, except it's not.",
|
| 173 |
+
(
|
| 174 |
+
[
|
| 175 |
+
"My",
|
| 176 |
+
"number",
|
| 177 |
+
"is",
|
| 178 |
+
"(393) 928 -3010",
|
| 179 |
+
",",
|
| 180 |
+
"except",
|
| 181 |
+
"it's",
|
| 182 |
+
"not",
|
| 183 |
+
".",
|
| 184 |
+
],
|
| 185 |
+
[
|
| 186 |
+
"My",
|
| 187 |
+
"number",
|
| 188 |
+
"is",
|
| 189 |
+
"(",
|
| 190 |
+
"393",
|
| 191 |
+
")",
|
| 192 |
+
"928",
|
| 193 |
+
"-",
|
| 194 |
+
"3010",
|
| 195 |
+
",",
|
| 196 |
+
"except",
|
| 197 |
+
"it's",
|
| 198 |
+
"not",
|
| 199 |
+
".",
|
| 200 |
+
],
|
| 201 |
+
),
|
| 202 |
+
),
|
| 203 |
+
(
|
| 204 |
+
# A long number is tokenized correctly only if `match_phone_numbers=False`
|
| 205 |
+
"The product identification number is 48103284512.",
|
| 206 |
+
(
|
| 207 |
+
[
|
| 208 |
+
"The",
|
| 209 |
+
"product",
|
| 210 |
+
"identification",
|
| 211 |
+
"number",
|
| 212 |
+
"is",
|
| 213 |
+
"4810328451",
|
| 214 |
+
"2",
|
| 215 |
+
".",
|
| 216 |
+
],
|
| 217 |
+
[
|
| 218 |
+
"The",
|
| 219 |
+
"product",
|
| 220 |
+
"identification",
|
| 221 |
+
"number",
|
| 222 |
+
"is",
|
| 223 |
+
"48103284512",
|
| 224 |
+
".",
|
| 225 |
+
],
|
| 226 |
+
),
|
| 227 |
+
),
|
| 228 |
+
(
|
| 229 |
+
# `match_phone_numbers=True` can have some unforeseen
|
| 230 |
+
"My favourite substraction is 240 - 1353.",
|
| 231 |
+
(
|
| 232 |
+
["My", "favourite", "substraction", "is", "240 - 1353", "."],
|
| 233 |
+
["My", "favourite", "substraction", "is", "240", "-", "1353", "."],
|
| 234 |
+
),
|
| 235 |
+
),
|
| 236 |
+
],
|
| 237 |
+
)
|
| 238 |
+
def test_tweet_tokenizer_expanded(
|
| 239 |
+
self, test_input: str, expecteds: Tuple[List[str], List[str]]
|
| 240 |
+
):
|
| 241 |
+
"""
|
| 242 |
+
Test `match_phone_numbers` in TweetTokenizer.
|
| 243 |
+
|
| 244 |
+
Note that TweetTokenizer is also passed the following for these tests:
|
| 245 |
+
* strip_handles=True
|
| 246 |
+
* reduce_len=True
|
| 247 |
+
|
| 248 |
+
:param test_input: The input string to tokenize using TweetTokenizer.
|
| 249 |
+
:type test_input: str
|
| 250 |
+
:param expecteds: A 2-tuple of tokenized sentences. The first of the two
|
| 251 |
+
tokenized is the expected output of tokenization with `match_phone_numbers=True`.
|
| 252 |
+
The second of the two tokenized lists is the expected output of tokenization
|
| 253 |
+
with `match_phone_numbers=False`.
|
| 254 |
+
:type expecteds: Tuple[List[str], List[str]]
|
| 255 |
+
"""
|
| 256 |
+
for match_phone_numbers, expected in zip([True, False], expecteds):
|
| 257 |
+
tokenizer = TweetTokenizer(
|
| 258 |
+
strip_handles=True,
|
| 259 |
+
reduce_len=True,
|
| 260 |
+
match_phone_numbers=match_phone_numbers,
|
| 261 |
+
)
|
| 262 |
+
predicted = tokenizer.tokenize(test_input)
|
| 263 |
+
assert predicted == expected
|
| 264 |
+
|
| 265 |
+
def test_sonority_sequencing_syllable_tokenizer(self):
|
| 266 |
+
"""
|
| 267 |
+
Test SyllableTokenizer tokenizer.
|
| 268 |
+
"""
|
| 269 |
+
tokenizer = SyllableTokenizer()
|
| 270 |
+
tokens = tokenizer.tokenize("justification")
|
| 271 |
+
assert tokens == ["jus", "ti", "fi", "ca", "tion"]
|
| 272 |
+
|
| 273 |
+
def test_syllable_tokenizer_numbers(self):
|
| 274 |
+
"""
|
| 275 |
+
Test SyllableTokenizer tokenizer.
|
| 276 |
+
"""
|
| 277 |
+
tokenizer = SyllableTokenizer()
|
| 278 |
+
text = "9" * 10000
|
| 279 |
+
tokens = tokenizer.tokenize(text)
|
| 280 |
+
assert tokens == [text]
|
| 281 |
+
|
| 282 |
+
def test_legality_principle_syllable_tokenizer(self):
|
| 283 |
+
"""
|
| 284 |
+
Test LegalitySyllableTokenizer tokenizer.
|
| 285 |
+
"""
|
| 286 |
+
from nltk.corpus import words
|
| 287 |
+
|
| 288 |
+
test_word = "wonderful"
|
| 289 |
+
tokenizer = LegalitySyllableTokenizer(words.words())
|
| 290 |
+
tokens = tokenizer.tokenize(test_word)
|
| 291 |
+
assert tokens == ["won", "der", "ful"]
|
| 292 |
+
|
| 293 |
+
@check_stanford_segmenter
|
| 294 |
+
def test_stanford_segmenter_arabic(self):
|
| 295 |
+
"""
|
| 296 |
+
Test the Stanford Word Segmenter for Arabic (default config)
|
| 297 |
+
"""
|
| 298 |
+
seg = StanfordSegmenter()
|
| 299 |
+
seg.default_config("ar")
|
| 300 |
+
sent = "يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلات"
|
| 301 |
+
segmented_sent = seg.segment(sent.split())
|
| 302 |
+
assert segmented_sent.split() == [
|
| 303 |
+
"يبحث",
|
| 304 |
+
"علم",
|
| 305 |
+
"الحاسوب",
|
| 306 |
+
"استخدام",
|
| 307 |
+
"الحوسبة",
|
| 308 |
+
"ب",
|
| 309 |
+
"جميع",
|
| 310 |
+
"اشكال",
|
| 311 |
+
"ها",
|
| 312 |
+
"ل",
|
| 313 |
+
"حل",
|
| 314 |
+
"المشكلات",
|
| 315 |
+
]
|
| 316 |
+
|
| 317 |
+
@check_stanford_segmenter
def test_stanford_segmenter_chinese(self):
    """Stanford Word Segmenter, Chinese default config."""
    segmenter = StanfordSegmenter()
    segmenter.default_config("zh")
    sentence = "这是斯坦福中文分词器测试"
    segmented = segmenter.segment(sentence.split())
    assert segmented.split() == ["这", "是", "斯坦福", "中文", "分词器", "测试"]
|
| 327 |
+
|
| 328 |
+
def test_phone_tokenizer(self):
    """Phone-number-like strings: extra spaces are fine, a newline breaks the match."""
    tok = TweetTokenizer()

    # Multiple spaces: still recognized as one phone-number token.
    assert tok.tokenize("(393) 928 -3010") == ["(393) 928 -3010"]

    # A newline splits it: only the trailing part still matches as a number.
    assert tok.tokenize("(393)\n928 -3010") == ["(", "393", ")", "928 -3010"]
|
| 346 |
+
|
| 347 |
+
def test_emoji_tokenizer(self):
    """Emoji ZWJ sequences and skin-tone modifiers must stay single tokens."""
    tok = TweetTokenizer()

    # A ZWJ family sequence forms a single emoji; it must not be split.
    family = "\U0001F468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F467"
    assert tok.tokenize(family) == [family]

    # Base emoji + skin-tone modifier is one emoji; it must not be split.
    toned = "\U0001F468\U0001F3FF"
    assert tok.tokenize(toned) == [toned]

    # Mixed text containing both skin-tone modifiers and ZWJ sequences.
    mixed = (
        "🤔 🙈 me así, se😌 ds 💕👭👙 hello 👩🏾\u200d🎓 emoji hello "
        "👨\u200d👩\u200d👦\u200d👦 how are 😊 you today🙅🏽🙅🏽"
    )
    assert tok.tokenize(mixed) == [
        "🤔", "🙈", "me", "así", ",", "se", "😌", "ds", "💕", "👭", "👙",
        "hello", "👩🏾\u200d🎓", "emoji", "hello",
        "👨\u200d👩\u200d👦\u200d👦", "how", "are", "😊", "you", "today",
        "🙅🏽", "🙅🏽",
    ]

    # Emoji flag sequences: regional indicators pair left-to-right (#3034).
    assert tok.tokenize("🇦🇵🇵🇱🇪") == ["🇦🇵", "🇵🇱", "🇪"]
    assert tok.tokenize("Hi 🇨🇦, 😍!!") == ["Hi", "🇨🇦", ",", "😍", "!", "!"]
    assert tok.tokenize("<3 🇨🇦 🤝 🇵🇱 <3") == ["<3", "🇨🇦", "🤝", "🇵🇱", "<3"]
|
| 411 |
+
|
| 412 |
+
def test_pad_asterisk(self):
    """word_tokenize pads (splits off) asterisks as standalone tokens."""
    text = "This is a, *weird sentence with *asterisks in it."
    expected = "This is a , * weird sentence with * asterisks in it .".split()
    assert word_tokenize(text) == expected
|
| 433 |
+
|
| 434 |
+
def test_pad_dotdot(self):
    """Dot runs of any length (.., ..., .....) are split off as their own token."""
    text = "Why did dotdot.. not get tokenized but dotdotdot... did? How about manydots....."
    expected = (
        "Why did dotdot .. not get tokenized but dotdotdot ... did ? "
        "How about manydots ....."
    ).split()
    assert word_tokenize(text) == expected
|
| 458 |
+
|
| 459 |
+
def test_remove_handle(self):
    """Edge cases for remove_handle() from casual.py (strip_handles=True)."""
    tok = TweetTokenizer(strip_handles=True)

    # Simple case: handles are stripped, including all-digit handles.
    assert tok.tokenize("@twitter hello @twi_tter_. hi @12345 @123news") == [
        "hello",
        ".",
        "hi",
    ]

    # A handle is allowed to follow any of these characters.
    test2 = "@n`@n~@n(@n)@n-@n=@n+@n\\@n|@n[@n]@n{@n}@n;@n:@n'@n\"@n/@n?@n.@n,@n<@n>@n @n\n@n ñ@n.ü@n.ç@n."
    expected2 = list("`~()-=+\\|[]{};:'\"/?.,<>") + list("ñ.ü.ç.")
    assert tok.tokenize(test2) == expected2

    # A handle is NOT allowed to follow any of these: the "@n" survives.
    prefixes = "a j z A L Z 1 4 7 9 0 _ ! @ # $ % & *".split()
    test3 = " ".join(p + "@n" for p in prefixes)
    expected3 = [t for p in prefixes for t in (p, "@n")]
    assert tok.tokenize(test3) == expected3

    # A handle may precede these characters (and is stripped).
    assert tok.tokenize("@n!a @n#a @n$a @n%a @n&a @n*a") == [
        t for c in "!#$%&*" for t in (c, "a")
    ]

    # Interactions with special symbols and consecutive '@'.
    test5 = "@n!@n @n#@n @n$@n @n%@n @n&@n @n*@n @n@n @@n @n@@n @n_@n @n7@n @nj@n"
    expected5 = (
        "! @n # @n $ @n % @n & @n * @n @n @n @ @n @n @ @n "
        "@n_ @n @n7 @n @nj @n"
    ).split()
    assert tok.tokenize(test5) == expected5

    # Handles are at most 15 characters: any tail is kept as plain text.
    test6 = "@abcdefghijklmnopqrstuvwxyz @abcdefghijklmno1234 @abcdefghijklmno_ @abcdefghijklmnoendofhandle"
    assert tok.tokenize(test6) == ["pqrstuvwxyz", "1234", "_", "endofhandle"]

    # Edge case: an '@' comes directly after a (too-)long handle.
    test7 = "@abcdefghijklmnop@abcde @abcdefghijklmno@abcde @abcdefghijklmno_@abcde @abcdefghijklmno5@abcde"
    assert tok.tokenize(test7) == [
        "p",
        "@abcde",
        "@abcdefghijklmno",
        "@abcde",
        "_",
        "@abcde",
        "5",
        "@abcde",
    ]
|
| 611 |
+
|
| 612 |
+
def test_treebank_span_tokenizer(self):
    """TreebankWordTokenizer.span_tokenize yields (start, end) character offsets."""
    tok = TreebankWordTokenizer()

    # Docstring example. The offsets imply a double space after "(York)."
    # and that only the text-final period is split off ("them." stays whole).
    text1 = (
        "Good muffins cost $3.88\nin New (York).  Please (buy) me\n"
        "two of them.\n(Thanks)."
    )
    spans1 = [
        (0, 4), (5, 12), (13, 17), (18, 19), (19, 23), (24, 26), (27, 30),
        (31, 32), (32, 36), (36, 37), (37, 38), (40, 46), (47, 48), (48, 51),
        (51, 52), (53, 55), (56, 59), (60, 62), (63, 68), (69, 70), (70, 76),
        (76, 77), (77, 78),
    ]
    assert list(tok.span_tokenize(text1)) == spans1

    # Plain double quotation marks.
    text2 = 'The DUP is similar to the "religious right" in the United States and takes a hardline stance on social issues'
    # The first 17 spans are identical for text2 and text3 below.
    shared = [
        (0, 3), (4, 7), (8, 10), (11, 18), (19, 21), (22, 25), (26, 27),
        (27, 36), (37, 42), (42, 43), (44, 46), (47, 50), (51, 57), (58, 64),
        (65, 68), (69, 74), (75, 76),
    ]
    spans2 = shared + [(77, 85), (86, 92), (93, 95), (96, 102), (103, 109)]
    assert list(tok.span_tokenize(text2)) == spans2

    # Double quotes plus already-converted `` '' quotation marks.
    text3 = "The DUP is similar to the \"religious right\" in the United States and takes a ``hardline'' stance on social issues"
    spans3 = shared + [
        (77, 79), (79, 87), (87, 89), (90, 96), (97, 99), (100, 106), (107, 113),
    ]
    assert list(tok.span_tokenize(text3)) == spans3
|
| 708 |
+
|
| 709 |
+
def test_word_tokenize(self):
    """word_tokenize splits quote-wrapped letters but keeps clitics ('ve, 'll, 're)."""
    sentence = "The 'v', I've been fooled but I'll seek revenge."
    expected = "The ' v ' , I 've been fooled but I 'll seek revenge .".split()
    assert word_tokenize(sentence) == expected

    # Bare quoted letters vs. a quoted clitic.
    assert word_tokenize("'v' 're'") == "' v ' 're '".split()
|
| 737 |
+
|
| 738 |
+
def test_punkt_pair_iter(self):
    """punkt._pair_iter yields (item, next_item) pairs, ending with (last, None)."""
    cases = {
        "12": [("1", "2"), ("2", None)],
        "123": [("1", "2"), ("2", "3"), ("3", None)],
        "1234": [("1", "2"), ("2", "3"), ("3", "4"), ("4", None)],
    }
    for text, pairs in cases.items():
        assert list(punkt._pair_iter(text)) == pairs
|
| 750 |
+
|
| 751 |
+
def test_punkt_pair_iter_handles_stop_iteration_exception(self):
    """_pair_iter must tolerate an already-exhausted iterator without raising."""
    pairs = punkt._pair_iter(iter([]))
    # Consuming the generator must not leak StopIteration.
    list(pairs)
|
| 758 |
+
|
| 759 |
+
def test_punkt_tokenize_words_handles_stop_iteration_exception(self):
    """_tokenize_words must tolerate a word tokenizer that yields nothing."""
    base = punkt.PunktBaseClass()

    class _EmptyWordTokenizer:
        # Stand-in lang_vars whose word_tokenize yields no tokens.
        def word_tokenize(self, s):
            return iter([])

    base._lang_vars = _EmptyWordTokenizer()
    # Consuming the generator must not raise.
    list(base._tokenize_words("test"))
|
| 769 |
+
|
| 770 |
+
def test_punkt_tokenize_custom_lang_vars(self):
    """Adding the Bengali danda to sent_end_chars enables proper splitting."""

    class BengaliLanguageVars(punkt.PunktLanguageVars):
        # Default end chars plus U+0964 DEVANAGARI DANDA.
        sent_end_chars = (".", "?", "!", "\u0964")

    splitter = punkt.PunktSentenceTokenizer(lang_vars=BengaliLanguageVars())

    expected = [
        "উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন।",
        "অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’ উপস্থিত ছিলেন।",
        "এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।",
    ]
    # The input is exactly the expected sentences joined by single spaces.
    assert splitter.tokenize(" ".join(expected)) == expected
|
| 787 |
+
|
| 788 |
+
def test_punkt_tokenize_no_custom_lang_vars(self):
    """Without the Bengali danda in sent_end_chars, the text stays one sentence."""
    splitter = punkt.PunktSentenceTokenizer()

    text = "উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন। অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’ উপস্থিত ছিলেন। এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।"
    # '।' is not a default sentence-end char, so no split happens.
    assert splitter.tokenize(text) == [text]
|
| 799 |
+
|
| 800 |
+
@pytest.mark.parametrize(
    "input_text,n_sents,n_splits,lang_vars",
    [
        # Two sentences split by a dot -> one split, hence one decision.
        ("Subject: Some subject. Attachments: Some attachments", 2, 1),
        # Two sentences split by an exclamation mark -> one split/decision.
        ("Subject: Some subject! Attachments: Some attachments", 2, 1),
        # A single sentence that is not split -> zero decisions.
        ("This is just a normal sentence, just like any other.", 1, 0)
    ],
)
def punkt_debug_decisions(self, input_text, n_sents, n_splits, lang_vars=None):
    """Tokenize ``input_text`` and check sentence count and debug decisions.

    NOTE(review): the name intentionally lacks the ``test_`` prefix, so pytest
    never collects it and the parametrize decorator above is inert (its argname
    string also lists ``lang_vars`` while the tuples carry only three values);
    the method is invoked directly by test_punkt_debug_decisions_custom_end.
    """
    tokenizer = punkt.PunktSentenceTokenizer()
    # Fixed: was `lang_vars != None`; identity comparison is the Python idiom
    # for None checks (PEP 8).
    if lang_vars is not None:
        tokenizer._lang_vars = lang_vars

    assert len(tokenizer.tokenize(input_text)) == n_sents
    assert len(list(tokenizer.debug_decisions(input_text))) == n_splits
|
| 824 |
+
|
| 825 |
+
def test_punkt_debug_decisions_custom_end(self):
    """A custom sentence-end char ('^') yields one split and one decision (#2519)."""

    class ExtLangVars(punkt.PunktLanguageVars):
        # Default end chars plus '^'.
        sent_end_chars = (".", "?", "!", "^")

    # Two sections -> one split -> one debug decision.
    self.punkt_debug_decisions(
        "Subject: Some subject^ Attachments: Some attachments",
        n_sents=2,
        n_splits=1,
        lang_vars=ExtLangVars(),
    )
|
| 839 |
+
|
| 840 |
+
@pytest.mark.parametrize(
    "sentences, expected",
    [
        # Lone dots, dot runs, and mixed punctuation at sentence boundaries.
        ("this is a test. . new sentence.", ["this is a test.", ".", "new sentence."]),
        ("This. . . That", ["This.", ".", ".", "That"]),
        ("This..... That", ["This..... That"]),
        ("This... That", ["This... That"]),
        ("This.. . That", ["This.. .", "That"]),
        ("This. .. That", ["This.", ".. That"]),
        ("This. ,. That", ["This.", ",.", "That"]),
        ("This!!! That", ["This!!!", "That"]),
        ("This! That", ["This!", "That"]),
        # Numbered lists separated by newlines and by tabs.
        (
            "1. This is R .\n2. This is A .\n3. That's all",
            ["1.", "This is R .", "2.", "This is A .", "3.", "That's all"],
        ),
        (
            "1. This is R .\t2. This is A .\t3. That's all",
            ["1.", "This is R .", "2.", "This is A .", "3.", "That's all"],
        ),
        ("Hello.\tThere", ["Hello.", "There"]),
    ],
)
def test_sent_tokenize(self, sentences: str, expected: List[str]):
    """sent_tokenize handles stray dots, ellipses, and whitespace variants."""
    assert sent_tokenize(sentences) == expected
|
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_twitter_auth.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Tests for static parts of Twitter package
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
import pytest
|
| 8 |
+
|
| 9 |
+
pytest.importorskip("twython")
|
| 10 |
+
|
| 11 |
+
from nltk.twitter import Authenticate
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@pytest.fixture
def auth():
    """Provide a fresh Authenticate instance for each test."""
    return Authenticate()
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class TestCredentials:
    """
    Tests that Twitter credentials from a file are handled correctly.
    """

    @classmethod
    def setup_class(cls):
        # Fixed: the classmethod's first parameter was named `self`; it receives
        # the class object, so the conventional name is `cls`.
        # Directory holding the test credential files, and the env var from
        # which Authenticate derives its default subdir name.
        cls.subdir = os.path.join(os.path.dirname(__file__), "files")
        os.environ["TWITTER"] = "twitter-files"

    def test_environment(self, auth):
        """
        Test that environment variable has been read correctly.
        """
        fn = os.path.basename(auth.creds_subdir)
        assert fn == os.environ["TWITTER"]

    @pytest.mark.parametrize(
        "kwargs",
        [
            # Each of the following scenarios should raise an error:
            # An empty subdir path
            {"subdir": ""},
            # A subdir path of None
            {"subdir": None},
            # A nonexistent directory
            {"subdir": "/nosuchdir"},
            # 'credentials.txt' is not in default subdir, as read from `os.environ['TWITTER']`
            {},
            # Nonexistent credentials file ('foobar')
            {"creds_file": "foobar"},
            # 'bad_oauth1-1.txt' is incomplete
            {"creds_file": "bad_oauth1-1.txt"},
            # The first key in credentials file 'bad_oauth1-2.txt' is ill-formed
            {"creds_file": "bad_oauth1-2.txt"},
            # The first two lines in 'bad_oauth1-3.txt' are collapsed
            {"creds_file": "bad_oauth1-3.txt"},
        ],
    )
    def test_scenarios_that_should_raise_errors(self, kwargs, auth):
        """Various scenarios that should raise errors"""
        try:
            auth.load_creds(**kwargs)
        # raises ValueError (zero length field name in format) for python 2.6
        # OSError for the rest
        except (OSError, ValueError):
            pass
        except Exception as e:
            pytest.fail("Unexpected exception thrown: %s" % e)
        else:
            pytest.fail("OSError exception not thrown.")

    def test_correct_file(self, auth):
        """Test that a proper file succeeds and is read correctly"""
        oauth = auth.load_creds(subdir=self.subdir)

        assert auth.creds_fullpath == os.path.join(self.subdir, auth.creds_file)
        assert auth.creds_file == "credentials.txt"
        assert oauth["app_key"] == "a"
|
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_util.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
|
| 3 |
+
from nltk.util import everygrams
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
@pytest.fixture
def everygram_input():
    """Provide a fresh iterator over the tokens 'a', 'b', 'c' for each test."""
    return iter(["a", "b", "c"])
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def test_everygrams_without_padding(everygram_input):
    """Default everygrams: every contiguous n-gram, n = 1..len(sequence)."""
    grams = list(everygrams(everygram_input))
    assert grams == [
        ("a",), ("a", "b"), ("a", "b", "c"),
        ("b",), ("b", "c"),
        ("c",),
    ]
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def test_everygrams_max_len(everygram_input):
    """max_len=2 caps the n-gram order at bigrams."""
    grams = list(everygrams(everygram_input, max_len=2))
    assert grams == [("a",), ("a", "b"), ("b",), ("b", "c"), ("c",)]
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def test_everygrams_min_len(everygram_input):
    """min_len=2 drops unigrams from the output."""
    grams = list(everygrams(everygram_input, min_len=2))
    assert grams == [("a", "b"), ("a", "b", "c"), ("b", "c")]
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def test_everygrams_pad_right(everygram_input):
    """pad_right=True appends None padding before building the n-grams."""
    grams = list(everygrams(everygram_input, max_len=3, pad_right=True))
    assert grams == [
        ("a",), ("a", "b"), ("a", "b", "c"),
        ("b",), ("b", "c"), ("b", "c", None),
        ("c",), ("c", None), ("c", None, None),
        (None,), (None, None),
        (None,),
    ]
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def test_everygrams_pad_left(everygram_input):
    """pad_left=True prepends None padding before building the n-grams."""
    grams = list(everygrams(everygram_input, max_len=3, pad_left=True))
    assert grams == [
        (None,), (None, None), (None, None, "a"),
        (None,), (None, "a"), (None, "a", "b"),
        ("a",), ("a", "b"), ("a", "b", "c"),
        ("b",), ("b", "c"),
        ("c",),
    ]
|
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_wordnet.py
ADDED
|
@@ -0,0 +1,240 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Unit tests for nltk.corpus.wordnet
|
| 3 |
+
See also nltk/test/wordnet.doctest
|
| 4 |
+
"""
|
| 5 |
+
import unittest
|
| 6 |
+
|
| 7 |
+
from nltk.corpus import wordnet as wn
|
| 8 |
+
from nltk.corpus import wordnet_ic as wnic
|
| 9 |
+
|
| 10 |
+
wn.ensure_loaded()
|
| 11 |
+
S = wn.synset
|
| 12 |
+
L = wn.lemma
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class WordnNetDemo(unittest.TestCase):
|
| 16 |
+
def test_retrieve_synset(self):
|
| 17 |
+
move_synset = S("go.v.21")
|
| 18 |
+
self.assertEqual(move_synset.name(), "move.v.15")
|
| 19 |
+
self.assertEqual(move_synset.lemma_names(), ["move", "go"])
|
| 20 |
+
self.assertEqual(
|
| 21 |
+
move_synset.definition(), "have a turn; make one's move in a game"
|
| 22 |
+
)
|
| 23 |
+
self.assertEqual(move_synset.examples(), ["Can I go now?"])
|
| 24 |
+
|
| 25 |
+
def test_retrieve_synsets(self):
|
| 26 |
+
self.assertEqual(sorted(wn.synsets("zap", pos="n")), [S("zap.n.01")])
|
| 27 |
+
self.assertEqual(
|
| 28 |
+
sorted(wn.synsets("zap", pos="v")),
|
| 29 |
+
[S("microwave.v.01"), S("nuke.v.01"), S("zap.v.01"), S("zap.v.02")],
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
def test_hyperhyponyms(self):
|
| 33 |
+
# Not every synset as hypernyms()
|
| 34 |
+
self.assertEqual(S("travel.v.01").hypernyms(), [])
|
| 35 |
+
self.assertEqual(S("travel.v.02").hypernyms(), [S("travel.v.03")])
|
| 36 |
+
self.assertEqual(S("travel.v.03").hypernyms(), [])
|
| 37 |
+
|
| 38 |
+
# Test hyper-/hyponyms.
|
| 39 |
+
self.assertEqual(S("breakfast.n.1").hypernyms(), [S("meal.n.01")])
|
| 40 |
+
first_five_meal_hypo = [
|
| 41 |
+
S("banquet.n.02"),
|
| 42 |
+
S("bite.n.04"),
|
| 43 |
+
S("breakfast.n.01"),
|
| 44 |
+
S("brunch.n.01"),
|
| 45 |
+
S("buffet.n.02"),
|
| 46 |
+
]
|
| 47 |
+
self.assertEqual(sorted(S("meal.n.1").hyponyms()[:5]), first_five_meal_hypo)
|
| 48 |
+
self.assertEqual(S("Austen.n.1").instance_hypernyms(), [S("writer.n.01")])
|
| 49 |
+
first_five_composer_hypo = [
|
| 50 |
+
S("ambrose.n.01"),
|
| 51 |
+
S("bach.n.01"),
|
| 52 |
+
S("barber.n.01"),
|
| 53 |
+
S("bartok.n.01"),
|
| 54 |
+
S("beethoven.n.01"),
|
| 55 |
+
]
|
| 56 |
+
self.assertEqual(
|
| 57 |
+
S("composer.n.1").instance_hyponyms()[:5], first_five_composer_hypo
|
| 58 |
+
)
|
| 59 |
+
|
| 60 |
+
# Test root hyper-/hyponyms
|
| 61 |
+
self.assertEqual(S("person.n.01").root_hypernyms(), [S("entity.n.01")])
|
| 62 |
+
self.assertEqual(S("sail.v.01").root_hypernyms(), [S("travel.v.01")])
|
| 63 |
+
self.assertEqual(
|
| 64 |
+
S("fall.v.12").root_hypernyms(), [S("act.v.01"), S("fall.v.17")]
|
| 65 |
+
)
|
| 66 |
+
|
| 67 |
+
def test_derivationally_related_forms(self):
|
| 68 |
+
# Test `derivationally_related_forms()`
|
| 69 |
+
self.assertEqual(
|
| 70 |
+
L("zap.v.03.nuke").derivationally_related_forms(),
|
| 71 |
+
[L("atomic_warhead.n.01.nuke")],
|
| 72 |
+
)
|
| 73 |
+
self.assertEqual(
|
| 74 |
+
L("zap.v.03.atomize").derivationally_related_forms(),
|
| 75 |
+
[L("atomization.n.02.atomization")],
|
| 76 |
+
)
|
| 77 |
+
self.assertEqual(
|
| 78 |
+
L("zap.v.03.atomise").derivationally_related_forms(),
|
| 79 |
+
[L("atomization.n.02.atomisation")],
|
| 80 |
+
)
|
| 81 |
+
self.assertEqual(L("zap.v.03.zap").derivationally_related_forms(), [])
|
| 82 |
+
|
| 83 |
+
def test_meronyms_holonyms(self):
|
| 84 |
+
# Test meronyms, holonyms.
|
| 85 |
+
self.assertEqual(
|
| 86 |
+
S("dog.n.01").member_holonyms(), [S("canis.n.01"), S("pack.n.06")]
|
| 87 |
+
)
|
| 88 |
+
self.assertEqual(S("dog.n.01").part_meronyms(), [S("flag.n.07")])
|
| 89 |
+
|
| 90 |
+
self.assertEqual(S("faculty.n.2").member_meronyms(), [S("professor.n.01")])
|
| 91 |
+
self.assertEqual(S("copilot.n.1").member_holonyms(), [S("crew.n.01")])
|
| 92 |
+
|
| 93 |
+
self.assertEqual(
|
| 94 |
+
S("table.n.2").part_meronyms(),
|
| 95 |
+
[S("leg.n.03"), S("tabletop.n.01"), S("tableware.n.01")],
|
| 96 |
+
)
|
| 97 |
+
self.assertEqual(S("course.n.7").part_holonyms(), [S("meal.n.01")])
|
| 98 |
+
|
| 99 |
+
self.assertEqual(
|
| 100 |
+
S("water.n.1").substance_meronyms(), [S("hydrogen.n.01"), S("oxygen.n.01")]
|
| 101 |
+
)
|
| 102 |
+
self.assertEqual(
|
| 103 |
+
S("gin.n.1").substance_holonyms(),
|
| 104 |
+
[
|
| 105 |
+
S("gin_and_it.n.01"),
|
| 106 |
+
S("gin_and_tonic.n.01"),
|
| 107 |
+
S("martini.n.01"),
|
| 108 |
+
S("pink_lady.n.01"),
|
| 109 |
+
],
|
| 110 |
+
)
|
| 111 |
+
|
| 112 |
+
def test_antonyms(self):
|
| 113 |
+
# Test antonyms.
|
| 114 |
+
self.assertEqual(
|
| 115 |
+
L("leader.n.1.leader").antonyms(), [L("follower.n.01.follower")]
|
| 116 |
+
)
|
| 117 |
+
self.assertEqual(
|
| 118 |
+
L("increase.v.1.increase").antonyms(), [L("decrease.v.01.decrease")]
|
| 119 |
+
)
|
| 120 |
+
|
| 121 |
+
def test_misc_relations(self):
|
| 122 |
+
# Test misc relations.
|
| 123 |
+
self.assertEqual(S("snore.v.1").entailments(), [S("sleep.v.01")])
|
| 124 |
+
self.assertEqual(
|
| 125 |
+
S("heavy.a.1").similar_tos(),
|
| 126 |
+
[
|
| 127 |
+
S("dense.s.03"),
|
| 128 |
+
S("doughy.s.01"),
|
| 129 |
+
S("heavier-than-air.s.01"),
|
| 130 |
+
S("hefty.s.02"),
|
| 131 |
+
S("massive.s.04"),
|
| 132 |
+
S("non-buoyant.s.01"),
|
| 133 |
+
S("ponderous.s.02"),
|
| 134 |
+
],
|
| 135 |
+
)
|
| 136 |
+
self.assertEqual(S("light.a.1").attributes(), [S("weight.n.01")])
|
| 137 |
+
self.assertEqual(S("heavy.a.1").attributes(), [S("weight.n.01")])
|
| 138 |
+
|
| 139 |
+
# Test pertainyms.
|
| 140 |
+
self.assertEqual(
|
| 141 |
+
L("English.a.1.English").pertainyms(), [L("england.n.01.England")]
|
| 142 |
+
)
|
| 143 |
+
|
| 144 |
+
def test_lch(self):
|
| 145 |
+
# Test LCH.
|
| 146 |
+
self.assertEqual(
|
| 147 |
+
S("person.n.01").lowest_common_hypernyms(S("dog.n.01")),
|
| 148 |
+
[S("organism.n.01")],
|
| 149 |
+
)
|
| 150 |
+
self.assertEqual(
|
| 151 |
+
S("woman.n.01").lowest_common_hypernyms(S("girlfriend.n.02")),
|
| 152 |
+
[S("woman.n.01")],
|
| 153 |
+
)
|
| 154 |
+
|
| 155 |
+
def test_domains(self):
|
| 156 |
+
# Test domains.
|
| 157 |
+
self.assertEqual(S("code.n.03").topic_domains(), [S("computer_science.n.01")])
|
| 158 |
+
self.assertEqual(S("pukka.a.01").region_domains(), [S("india.n.01")])
|
| 159 |
+
self.assertEqual(S("freaky.a.01").usage_domains(), [S("slang.n.02")])
|
| 160 |
+
|
| 161 |
+
def test_in_topic_domains(self):
|
| 162 |
+
# Test in domains.
|
| 163 |
+
self.assertEqual(
|
| 164 |
+
S("computer_science.n.01").in_topic_domains()[0], S("access.n.05")
|
| 165 |
+
)
|
| 166 |
+
self.assertEqual(S("germany.n.01").in_region_domains()[23], S("trillion.n.02"))
|
| 167 |
+
self.assertEqual(S("slang.n.02").in_usage_domains()[1], S("airhead.n.01"))
|
| 168 |
+
|
| 169 |
+
def test_wordnet_similarities(self):
|
| 170 |
+
# Path based similarities.
|
| 171 |
+
self.assertAlmostEqual(S("cat.n.01").path_similarity(S("cat.n.01")), 1.0)
|
| 172 |
+
self.assertAlmostEqual(S("dog.n.01").path_similarity(S("cat.n.01")), 0.2)
|
| 173 |
+
self.assertAlmostEqual(
|
| 174 |
+
S("car.n.01").path_similarity(S("automobile.v.01")),
|
| 175 |
+
S("automobile.v.01").path_similarity(S("car.n.01")),
|
| 176 |
+
)
|
| 177 |
+
self.assertAlmostEqual(
|
| 178 |
+
S("big.a.01").path_similarity(S("dog.n.01")),
|
| 179 |
+
S("dog.n.01").path_similarity(S("big.a.01")),
|
| 180 |
+
)
|
| 181 |
+
self.assertAlmostEqual(
|
| 182 |
+
S("big.a.01").path_similarity(S("long.a.01")),
|
| 183 |
+
S("long.a.01").path_similarity(S("big.a.01")),
|
| 184 |
+
)
|
| 185 |
+
self.assertAlmostEqual(
|
| 186 |
+
S("dog.n.01").lch_similarity(S("cat.n.01")), 2.028, places=3
|
| 187 |
+
)
|
| 188 |
+
self.assertAlmostEqual(
|
| 189 |
+
S("dog.n.01").wup_similarity(S("cat.n.01")), 0.8571, places=3
|
| 190 |
+
)
|
| 191 |
+
self.assertAlmostEqual(
|
| 192 |
+
S("car.n.01").wup_similarity(S("automobile.v.01")),
|
| 193 |
+
S("automobile.v.01").wup_similarity(S("car.n.01")),
|
| 194 |
+
)
|
| 195 |
+
self.assertAlmostEqual(
|
| 196 |
+
S("big.a.01").wup_similarity(S("dog.n.01")),
|
| 197 |
+
S("dog.n.01").wup_similarity(S("big.a.01")),
|
| 198 |
+
)
|
| 199 |
+
self.assertAlmostEqual(
|
| 200 |
+
S("big.a.01").wup_similarity(S("long.a.01")),
|
| 201 |
+
S("long.a.01").wup_similarity(S("big.a.01")),
|
| 202 |
+
)
|
| 203 |
+
self.assertAlmostEqual(
|
| 204 |
+
S("big.a.01").lch_similarity(S("long.a.01")),
|
| 205 |
+
S("long.a.01").lch_similarity(S("big.a.01")),
|
| 206 |
+
)
|
| 207 |
+
# Information Content similarities.
|
| 208 |
+
brown_ic = wnic.ic("ic-brown.dat")
|
| 209 |
+
self.assertAlmostEqual(
|
| 210 |
+
S("dog.n.01").jcn_similarity(S("cat.n.01"), brown_ic), 0.4497, places=3
|
| 211 |
+
)
|
| 212 |
+
semcor_ic = wnic.ic("ic-semcor.dat")
|
| 213 |
+
self.assertAlmostEqual(
|
| 214 |
+
S("dog.n.01").lin_similarity(S("cat.n.01"), semcor_ic), 0.8863, places=3
|
| 215 |
+
)
|
| 216 |
+
|
| 217 |
+
def test_omw_lemma_no_trailing_underscore(self):
|
| 218 |
+
expected = sorted(
|
| 219 |
+
[
|
| 220 |
+
"popolna_sprememba_v_mišljenju",
|
| 221 |
+
"popoln_obrat",
|
| 222 |
+
"preobrat",
|
| 223 |
+
"preobrat_v_mišljenju",
|
| 224 |
+
]
|
| 225 |
+
)
|
| 226 |
+
self.assertEqual(sorted(S("about-face.n.02").lemma_names(lang="slv")), expected)
|
| 227 |
+
|
| 228 |
+
def test_iterable_type_for_all_lemma_names(self):
|
| 229 |
+
# Duck-test for iterables.
|
| 230 |
+
# See https://stackoverflow.com/a/36230057/610569
|
| 231 |
+
cat_lemmas = wn.all_lemma_names(lang="cat")
|
| 232 |
+
eng_lemmas = wn.all_lemma_names(lang="eng")
|
| 233 |
+
|
| 234 |
+
self.assertTrue(hasattr(eng_lemmas, "__iter__"))
|
| 235 |
+
self.assertTrue(hasattr(eng_lemmas, "__next__") or hasattr(eng_lemmas, "next"))
|
| 236 |
+
self.assertTrue(eng_lemmas.__iter__() is eng_lemmas)
|
| 237 |
+
|
| 238 |
+
self.assertTrue(hasattr(cat_lemmas, "__iter__"))
|
| 239 |
+
self.assertTrue(hasattr(cat_lemmas, "__next__") or hasattr(eng_lemmas, "next"))
|
| 240 |
+
self.assertTrue(cat_lemmas.__iter__() is cat_lemmas)
|
build/lib/opencompass/configs/dataset_collections/chat_OC15.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets
|
| 5 |
+
from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
|
| 6 |
+
from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
|
| 7 |
+
from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import GaokaoBench_datasets
|
| 8 |
+
from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import triviaqa_datasets
|
| 9 |
+
from opencompass.configs.datasets.nq.nq_open_1shot_gen_2e45e5 import nq_datasets
|
| 10 |
+
from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
|
| 11 |
+
from opencompass.configs.datasets.winogrande.winogrande_5shot_gen_b36770 import winogrande_datasets
|
| 12 |
+
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import hellaswag_datasets
|
| 13 |
+
from opencompass.configs.datasets.bbh.bbh_gen_2879b0 import bbh_datasets
|
| 14 |
+
from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
|
| 15 |
+
from opencompass.configs.datasets.math.math_0shot_gen_393424 import math_datasets
|
| 16 |
+
from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import TheoremQA_datasets
|
| 17 |
+
from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
|
| 18 |
+
from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_830460 import sanitized_mbpp_datasets
|
| 19 |
+
from opencompass.configs.datasets.gpqa.gpqa_gen_4baadb import gpqa_datasets
|
| 20 |
+
from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets
|
| 21 |
+
|
| 22 |
+
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
build/lib/opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .FewCLUE_bustm_gen_634f41 import bustm_datasets # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_ppl_8f2872.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import PPLInferencer
|
| 4 |
+
from opencompass.openicl.icl_evaluator import AccEvaluator
|
| 5 |
+
from opencompass.datasets import CHIDDataset
|
| 6 |
+
|
| 7 |
+
chid_reader_cfg = dict(
|
| 8 |
+
input_columns=[f'content{i}' for i in range(7)], output_column='answer')
|
| 9 |
+
|
| 10 |
+
chid_infer_cfg = dict(
|
| 11 |
+
prompt_template=dict(
|
| 12 |
+
type=PromptTemplate,
|
| 13 |
+
template={
|
| 14 |
+
i: dict(
|
| 15 |
+
round=[
|
| 16 |
+
dict(role='HUMAN', prompt=f'以下句子是否通顺?\n{{content{i}}}'),
|
| 17 |
+
dict(role='BOT', prompt='这个句子是通顺的。'),
|
| 18 |
+
], )
|
| 19 |
+
for i in range(7)
|
| 20 |
+
}),
|
| 21 |
+
retriever=dict(type=ZeroRetriever),
|
| 22 |
+
inferencer=dict(type=PPLInferencer))
|
| 23 |
+
|
| 24 |
+
chid_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
|
| 25 |
+
|
| 26 |
+
chid_datasets = [
|
| 27 |
+
dict(
|
| 28 |
+
type=CHIDDataset,
|
| 29 |
+
path='json',
|
| 30 |
+
abbr='chid-dev',
|
| 31 |
+
data_files='./data/FewCLUE/chid/dev_few_all.json',
|
| 32 |
+
split='train',
|
| 33 |
+
reader_cfg=chid_reader_cfg,
|
| 34 |
+
infer_cfg=chid_infer_cfg,
|
| 35 |
+
eval_cfg=chid_eval_cfg),
|
| 36 |
+
dict(
|
| 37 |
+
type=CHIDDataset,
|
| 38 |
+
path='json',
|
| 39 |
+
abbr='chid-test',
|
| 40 |
+
data_files='./data/FewCLUE/chid/test_public.json',
|
| 41 |
+
split='train',
|
| 42 |
+
reader_cfg=chid_reader_cfg,
|
| 43 |
+
infer_cfg=chid_infer_cfg,
|
| 44 |
+
eval_cfg=chid_eval_cfg),
|
| 45 |
+
]
|
build/lib/opencompass/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_gen.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .FewCLUE_cluewsc_gen_c68933 import cluewsc_datasets # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .FewCLUE_csl_gen_28b223 import csl_datasets # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen_87f4a8.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.openicl.icl_evaluator import AccEvaluator
|
| 5 |
+
from opencompass.datasets import CslDatasetV2
|
| 6 |
+
from opencompass.utils.text_postprocessors import first_capital_postprocess
|
| 7 |
+
|
| 8 |
+
csl_reader_cfg = dict(
|
| 9 |
+
input_columns=['abst', 'keywords'],
|
| 10 |
+
output_column='label',
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
csl_infer_cfg = dict(
|
| 14 |
+
prompt_template=dict(
|
| 15 |
+
type=PromptTemplate,
|
| 16 |
+
template=dict(round=[
|
| 17 |
+
dict(
|
| 18 |
+
role='HUMAN',
|
| 19 |
+
prompt=
|
| 20 |
+
'摘要:{abst}\n关键词:{keywords}\n上述关键词出现在学术期刊中是否恰当?\nA. 否\nB. 是\n请从”A“,”B“中进行选择。\n答:'
|
| 21 |
+
)
|
| 22 |
+
]),
|
| 23 |
+
),
|
| 24 |
+
retriever=dict(type=ZeroRetriever),
|
| 25 |
+
inferencer=dict(type=GenInferencer),
|
| 26 |
+
)
|
| 27 |
+
|
| 28 |
+
csl_eval_cfg = dict(
|
| 29 |
+
evaluator=dict(type=AccEvaluator),
|
| 30 |
+
pred_role='BOT',
|
| 31 |
+
pred_postprocessor=dict(type=first_capital_postprocess),
|
| 32 |
+
)
|
| 33 |
+
|
| 34 |
+
csl_datasets = [
|
| 35 |
+
dict(
|
| 36 |
+
abbr='csl_dev',
|
| 37 |
+
type=CslDatasetV2,
|
| 38 |
+
path='./data/FewCLUE/csl/dev_few_all.json',
|
| 39 |
+
reader_cfg=csl_reader_cfg,
|
| 40 |
+
infer_cfg=csl_infer_cfg,
|
| 41 |
+
eval_cfg=csl_eval_cfg,
|
| 42 |
+
),
|
| 43 |
+
dict(
|
| 44 |
+
abbr='csl_test',
|
| 45 |
+
type=CslDatasetV2,
|
| 46 |
+
path='./data/FewCLUE/csl/test_public.json',
|
| 47 |
+
reader_cfg=csl_reader_cfg,
|
| 48 |
+
infer_cfg=csl_infer_cfg,
|
| 49 |
+
eval_cfg=csl_eval_cfg,
|
| 50 |
+
),
|
| 51 |
+
]
|
build/lib/opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_gen.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .FewCLUE_eprstmt_gen_740ea0 import eprstmt_datasets # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_gen_740ea0.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.openicl.icl_evaluator import AccEvaluator
|
| 5 |
+
from opencompass.datasets import EprstmtDatasetV2
|
| 6 |
+
from opencompass.utils.text_postprocessors import first_capital_postprocess
|
| 7 |
+
|
| 8 |
+
eprstmt_reader_cfg = dict(
|
| 9 |
+
input_columns=['sentence'], output_column='label', test_split='train')
|
| 10 |
+
|
| 11 |
+
eprstmt_infer_cfg = dict(
|
| 12 |
+
prompt_template=dict(
|
| 13 |
+
type=PromptTemplate,
|
| 14 |
+
template=dict(round=[
|
| 15 |
+
dict(
|
| 16 |
+
role='HUMAN',
|
| 17 |
+
prompt=
|
| 18 |
+
'内容: "{sentence}"。请对上述内容进行情绪分类。\nA. 积极\nB. 消极\n请从”A“,”B“中进行选择。\n答:'
|
| 19 |
+
),
|
| 20 |
+
]),
|
| 21 |
+
),
|
| 22 |
+
retriever=dict(type=ZeroRetriever),
|
| 23 |
+
inferencer=dict(type=GenInferencer),
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
eprstmt_eval_cfg = dict(
|
| 27 |
+
evaluator=dict(type=AccEvaluator),
|
| 28 |
+
pred_role='BOT',
|
| 29 |
+
pred_postprocessor=dict(type=first_capital_postprocess),
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
eprstmt_datasets = [
|
| 33 |
+
dict(
|
| 34 |
+
abbr='eprstmt-dev',
|
| 35 |
+
type=EprstmtDatasetV2,
|
| 36 |
+
path='./data/FewCLUE/eprstmt/dev_few_all.json',
|
| 37 |
+
reader_cfg=eprstmt_reader_cfg,
|
| 38 |
+
infer_cfg=eprstmt_infer_cfg,
|
| 39 |
+
eval_cfg=eprstmt_eval_cfg,
|
| 40 |
+
),
|
| 41 |
+
dict(
|
| 42 |
+
abbr='eprstmt-test',
|
| 43 |
+
type=EprstmtDatasetV2,
|
| 44 |
+
path='./data/FewCLUE/eprstmt/test_public.json',
|
| 45 |
+
reader_cfg=eprstmt_reader_cfg,
|
| 46 |
+
infer_cfg=eprstmt_infer_cfg,
|
| 47 |
+
eval_cfg=eprstmt_eval_cfg,
|
| 48 |
+
),
|
| 49 |
+
]
|
build/lib/opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_ppl.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .FewCLUE_eprstmt_ppl_f1e631 import eprstmt_datasets # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_ppl_f1e631.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import PPLInferencer
|
| 4 |
+
from opencompass.openicl.icl_evaluator import AccEvaluator
|
| 5 |
+
from opencompass.datasets import HFDataset
|
| 6 |
+
|
| 7 |
+
eprstmt_reader_cfg = dict(
|
| 8 |
+
input_columns=['sentence'], output_column='label', test_split='train')
|
| 9 |
+
|
| 10 |
+
eprstmt_infer_cfg = dict(
|
| 11 |
+
prompt_template=dict(
|
| 12 |
+
type=PromptTemplate,
|
| 13 |
+
template={
|
| 14 |
+
'Negative':
|
| 15 |
+
dict(round=[
|
| 16 |
+
dict(role='HUMAN', prompt='内容: "{sentence}"。情绪分类:'),
|
| 17 |
+
dict(role='BOT', prompt='消极。')
|
| 18 |
+
]),
|
| 19 |
+
'Positive':
|
| 20 |
+
dict(round=[
|
| 21 |
+
dict(role='HUMAN', prompt='内容: "{sentence}"。情绪分类:'),
|
| 22 |
+
dict(role='BOT', prompt='积极。')
|
| 23 |
+
]),
|
| 24 |
+
}),
|
| 25 |
+
retriever=dict(type=ZeroRetriever),
|
| 26 |
+
inferencer=dict(type=PPLInferencer))
|
| 27 |
+
|
| 28 |
+
eprstmt_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
|
| 29 |
+
|
| 30 |
+
eprstmt_datasets = [
|
| 31 |
+
dict(
|
| 32 |
+
type=HFDataset,
|
| 33 |
+
abbr='eprstmt-dev',
|
| 34 |
+
path='json',
|
| 35 |
+
data_files='./data/FewCLUE/eprstmt/dev_few_all.json',
|
| 36 |
+
split='train',
|
| 37 |
+
reader_cfg=eprstmt_reader_cfg,
|
| 38 |
+
infer_cfg=eprstmt_infer_cfg,
|
| 39 |
+
eval_cfg=eprstmt_eval_cfg),
|
| 40 |
+
dict(
|
| 41 |
+
type=HFDataset,
|
| 42 |
+
abbr='eprstmt-test',
|
| 43 |
+
path='json',
|
| 44 |
+
data_files='./data/FewCLUE/eprstmt/test_public.json',
|
| 45 |
+
split='train',
|
| 46 |
+
reader_cfg=eprstmt_reader_cfg,
|
| 47 |
+
infer_cfg=eprstmt_infer_cfg,
|
| 48 |
+
eval_cfg=eprstmt_eval_cfg)
|
| 49 |
+
]
|
build/lib/opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_ppl.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .FewCLUE_ocnli_fc_ppl_c08300 import ocnli_fc_datasets # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_ppl_9e8b3d.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import PPLInferencer
|
| 4 |
+
from opencompass.openicl.icl_evaluator import AccEvaluator
|
| 5 |
+
from opencompass.datasets import HFDataset
|
| 6 |
+
|
| 7 |
+
ocnli_fc_reader_cfg = dict(
|
| 8 |
+
input_columns=['sentence1', 'sentence2'],
|
| 9 |
+
output_column='label',
|
| 10 |
+
test_split='train')
|
| 11 |
+
|
| 12 |
+
ocnli_fc_infer_cfg = dict(
|
| 13 |
+
prompt_template=dict(
|
| 14 |
+
type=PromptTemplate,
|
| 15 |
+
template={
|
| 16 |
+
'contradiction':
|
| 17 |
+
dict(round=[
|
| 18 |
+
dict(
|
| 19 |
+
role='HUMAN',
|
| 20 |
+
prompt='阅读文章:{sentence1}\n根据上文,回答如下问题:{sentence2}?'),
|
| 21 |
+
dict(role='BOT', prompt='错')
|
| 22 |
+
]),
|
| 23 |
+
'entailment':
|
| 24 |
+
dict(round=[
|
| 25 |
+
dict(
|
| 26 |
+
role='HUMAN',
|
| 27 |
+
prompt='阅读文章:{sentence1}\n根据上文,回答如下问题:{sentence2}?'),
|
| 28 |
+
dict(role='BOT', prompt='对')
|
| 29 |
+
]),
|
| 30 |
+
'neutral':
|
| 31 |
+
dict(round=[
|
| 32 |
+
dict(
|
| 33 |
+
role='HUMAN', prompt='如果{sentence1}为真,那么{sentence2}也为真吗?'),
|
| 34 |
+
dict(role='BOT', prompt='可能')
|
| 35 |
+
]),
|
| 36 |
+
}),
|
| 37 |
+
retriever=dict(type=ZeroRetriever),
|
| 38 |
+
inferencer=dict(type=PPLInferencer))
|
| 39 |
+
ocnli_fc_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
|
| 40 |
+
|
| 41 |
+
ocnli_fc_datasets = [
|
| 42 |
+
dict(
|
| 43 |
+
type=HFDataset,
|
| 44 |
+
abbr='ocnli_fc-dev',
|
| 45 |
+
path='json',
|
| 46 |
+
split='train',
|
| 47 |
+
data_files='./data/FewCLUE/ocnli/dev_few_all.json',
|
| 48 |
+
reader_cfg=ocnli_fc_reader_cfg,
|
| 49 |
+
infer_cfg=ocnli_fc_infer_cfg,
|
| 50 |
+
eval_cfg=ocnli_fc_eval_cfg),
|
| 51 |
+
dict(
|
| 52 |
+
type=HFDataset,
|
| 53 |
+
abbr='ocnli_fc-test',
|
| 54 |
+
path='json',
|
| 55 |
+
split='train',
|
| 56 |
+
data_files='./data/FewCLUE/ocnli/test_public.json',
|
| 57 |
+
reader_cfg=ocnli_fc_reader_cfg,
|
| 58 |
+
infer_cfg=ocnli_fc_infer_cfg,
|
| 59 |
+
eval_cfg=ocnli_fc_eval_cfg)
|
| 60 |
+
]
|
build/lib/opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_ppl_c08300.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import PPLInferencer
|
| 4 |
+
from opencompass.openicl.icl_evaluator import AccEvaluator
|
| 5 |
+
from opencompass.datasets import HFDataset
|
| 6 |
+
|
| 7 |
+
ocnli_fc_reader_cfg = dict(
|
| 8 |
+
input_columns=['sentence1', 'sentence2'],
|
| 9 |
+
output_column='label',
|
| 10 |
+
test_split='train')
|
| 11 |
+
|
| 12 |
+
ocnli_fc_infer_cfg = dict(
|
| 13 |
+
prompt_template=dict(
|
| 14 |
+
type=PromptTemplate,
|
| 15 |
+
template={
|
| 16 |
+
'contradiction':
|
| 17 |
+
'阅读文章:{sentence1}\n根据上文,回答如下问题: {sentence2}?\n答:错',
|
| 18 |
+
'entailment': '阅读文章:{sentence1}\n根据上文,回答如下问题: {sentence2}?\n答:对',
|
| 19 |
+
'neutral': '如果{sentence1}为真,那么{sentence2}也为真吗?可能'
|
| 20 |
+
}),
|
| 21 |
+
retriever=dict(type=ZeroRetriever),
|
| 22 |
+
inferencer=dict(type=PPLInferencer))
|
| 23 |
+
ocnli_fc_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
|
| 24 |
+
|
| 25 |
+
ocnli_fc_datasets = [
|
| 26 |
+
dict(
|
| 27 |
+
type=HFDataset,
|
| 28 |
+
abbr='ocnli_fc-dev',
|
| 29 |
+
path='json',
|
| 30 |
+
split='train',
|
| 31 |
+
data_files='./data/FewCLUE/ocnli/dev_few_all.json',
|
| 32 |
+
reader_cfg=ocnli_fc_reader_cfg,
|
| 33 |
+
infer_cfg=ocnli_fc_infer_cfg,
|
| 34 |
+
eval_cfg=ocnli_fc_eval_cfg),
|
| 35 |
+
dict(
|
| 36 |
+
type=HFDataset,
|
| 37 |
+
abbr='ocnli_fc-test',
|
| 38 |
+
path='json',
|
| 39 |
+
split='train',
|
| 40 |
+
data_files='./data/FewCLUE/ocnli/test_public.json',
|
| 41 |
+
reader_cfg=ocnli_fc_reader_cfg,
|
| 42 |
+
infer_cfg=ocnli_fc_infer_cfg,
|
| 43 |
+
eval_cfg=ocnli_fc_eval_cfg)
|
| 44 |
+
]
|
build/lib/opencompass/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_gen.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .FewCLUE_tnews_gen_b90e4a import tnews_datasets # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_ppl.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .FewCLUE_tnews_ppl_d10e8a import tnews_datasets # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_ppl_7d1c07.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import PPLInferencer
|
| 4 |
+
from opencompass.openicl.icl_evaluator import AccEvaluator
|
| 5 |
+
from opencompass.datasets import TNewsDataset
|
| 6 |
+
|
| 7 |
+
tnews_reader_cfg = dict(input_columns='sentence', output_column='label_desc2')
|
| 8 |
+
|
| 9 |
+
tnews_labels = [
|
| 10 |
+
'农业新闻', '旅游新闻', '游戏新闻', '科技类别公司新闻', '体育类别新闻', '初升高教育新闻', '娱乐圈新闻', '投资资讯',
|
| 11 |
+
'军事类别常识', '车辆新闻', '楼市新闻', '环球不含中国类别新闻', '书籍文化历史类别新闻', '故事类别新闻', '股票市场类别新闻'
|
| 12 |
+
]
|
| 13 |
+
|
| 14 |
+
tnews_infer_cfg = dict(
|
| 15 |
+
prompt_template=dict(
|
| 16 |
+
type=PromptTemplate,
|
| 17 |
+
template={lb: f'{{sentence}}这篇新闻属于:{lb}'
|
| 18 |
+
for lb in tnews_labels}),
|
| 19 |
+
retriever=dict(type=ZeroRetriever),
|
| 20 |
+
inferencer=dict(type=PPLInferencer))
|
| 21 |
+
|
| 22 |
+
tnews_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
|
| 23 |
+
|
| 24 |
+
tnews_datasets = [
|
| 25 |
+
dict(
|
| 26 |
+
type=TNewsDataset,
|
| 27 |
+
path='json',
|
| 28 |
+
abbr='tnews-dev',
|
| 29 |
+
data_files='./data/FewCLUE/tnews/dev_few_all.json',
|
| 30 |
+
split='train',
|
| 31 |
+
reader_cfg=tnews_reader_cfg,
|
| 32 |
+
infer_cfg=tnews_infer_cfg,
|
| 33 |
+
eval_cfg=tnews_eval_cfg),
|
| 34 |
+
dict(
|
| 35 |
+
type=TNewsDataset,
|
| 36 |
+
path='json',
|
| 37 |
+
abbr='tnews-test',
|
| 38 |
+
data_files='./data/FewCLUE/tnews/test_public.json',
|
| 39 |
+
split='train',
|
| 40 |
+
reader_cfg=tnews_reader_cfg,
|
| 41 |
+
infer_cfg=tnews_infer_cfg,
|
| 42 |
+
eval_cfg=tnews_eval_cfg)
|
| 43 |
+
]
|
build/lib/opencompass/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_ppl_d10e8a.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import PPLInferencer
|
| 4 |
+
from opencompass.openicl.icl_evaluator import AccEvaluator
|
| 5 |
+
from opencompass.datasets import TNewsDataset
|
| 6 |
+
|
| 7 |
+
tnews_reader_cfg = dict(input_columns='sentence', output_column='label_desc2')
|
| 8 |
+
|
| 9 |
+
tnews_labels = [
|
| 10 |
+
'农业新闻', '旅游新闻', '游戏新闻', '科技类别公司新闻', '体育类别新闻', '初升高教育新闻', '娱乐圈新闻', '投资资讯',
|
| 11 |
+
'军事类别常识', '车辆新闻', '楼市新闻', '环球不含中国类别新闻', '书籍文化历史类别新闻', '故事类别新闻', '股票市场类别新闻'
|
| 12 |
+
]
|
| 13 |
+
|
| 14 |
+
tnews_infer_cfg = dict(
|
| 15 |
+
prompt_template=dict(
|
| 16 |
+
type=PromptTemplate,
|
| 17 |
+
template={
|
| 18 |
+
lb: dict(round=[
|
| 19 |
+
dict(role='HUMAN', prompt='{sentence}\n上述内容属于什么新闻?'),
|
| 20 |
+
dict(role='BOT', prompt=lb)
|
| 21 |
+
])
|
| 22 |
+
for lb in tnews_labels
|
| 23 |
+
}),
|
| 24 |
+
retriever=dict(type=ZeroRetriever),
|
| 25 |
+
inferencer=dict(type=PPLInferencer))
|
| 26 |
+
|
| 27 |
+
tnews_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
|
| 28 |
+
|
| 29 |
+
tnews_datasets = [
|
| 30 |
+
dict(
|
| 31 |
+
type=TNewsDataset,
|
| 32 |
+
path='json',
|
| 33 |
+
abbr='tnews-dev',
|
| 34 |
+
data_files='./data/FewCLUE/tnews/dev_few_all.json',
|
| 35 |
+
split='train',
|
| 36 |
+
reader_cfg=tnews_reader_cfg,
|
| 37 |
+
infer_cfg=tnews_infer_cfg,
|
| 38 |
+
eval_cfg=tnews_eval_cfg),
|
| 39 |
+
dict(
|
| 40 |
+
type=TNewsDataset,
|
| 41 |
+
path='json',
|
| 42 |
+
abbr='tnews-test',
|
| 43 |
+
data_files='./data/FewCLUE/tnews/test_public.json',
|
| 44 |
+
split='train',
|
| 45 |
+
reader_cfg=tnews_reader_cfg,
|
| 46 |
+
infer_cfg=tnews_infer_cfg,
|
| 47 |
+
eval_cfg=tnews_eval_cfg)
|
| 48 |
+
]
|
build/lib/opencompass/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_ppl_fff486.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import PPLInferencer
|
| 4 |
+
from opencompass.openicl.icl_evaluator import AccEvaluator
|
| 5 |
+
from opencompass.datasets import TNewsDataset
|
| 6 |
+
|
| 7 |
+
tnews_reader_cfg = dict(input_columns='sentence', output_column='label_desc2')
|
| 8 |
+
|
| 9 |
+
tnews_labels = [
|
| 10 |
+
'农业新闻', '旅游新闻', '游戏新闻', '科技类别公司新闻', '体育类别新闻', '初升高教育新闻', '娱乐圈新闻', '投资资讯',
|
| 11 |
+
'军事类别常识', '车辆新闻', '楼市新闻', '环球不含中国类别新闻', '书籍文化历史类别新闻', '故事类别新闻', '股票市场类别新闻'
|
| 12 |
+
]
|
| 13 |
+
|
| 14 |
+
tnews_infer_cfg = dict(
|
| 15 |
+
prompt_template=dict(
|
| 16 |
+
type=PromptTemplate,
|
| 17 |
+
template={
|
| 18 |
+
lb: dict(round=[
|
| 19 |
+
dict(role='HUMAN', prompt='以下内容属于什么新闻:{sentence}。'),
|
| 20 |
+
dict(role='BOT', prompt=lb)
|
| 21 |
+
])
|
| 22 |
+
for lb in tnews_labels
|
| 23 |
+
}),
|
| 24 |
+
retriever=dict(type=ZeroRetriever),
|
| 25 |
+
inferencer=dict(type=PPLInferencer))
|
| 26 |
+
|
| 27 |
+
tnews_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
|
| 28 |
+
|
| 29 |
+
tnews_datasets = [
|
| 30 |
+
dict(
|
| 31 |
+
type=TNewsDataset,
|
| 32 |
+
path='json',
|
| 33 |
+
abbr='tnews-dev',
|
| 34 |
+
data_files='./data/FewCLUE/tnews/dev_few_all.json',
|
| 35 |
+
split='train',
|
| 36 |
+
reader_cfg=tnews_reader_cfg,
|
| 37 |
+
infer_cfg=tnews_infer_cfg,
|
| 38 |
+
eval_cfg=tnews_eval_cfg),
|
| 39 |
+
dict(
|
| 40 |
+
type=TNewsDataset,
|
| 41 |
+
path='json',
|
| 42 |
+
abbr='tnews-test',
|
| 43 |
+
data_files='./data/FewCLUE/tnews/test_public.json',
|
| 44 |
+
split='train',
|
| 45 |
+
reader_cfg=tnews_reader_cfg,
|
| 46 |
+
infer_cfg=tnews_infer_cfg,
|
| 47 |
+
eval_cfg=tnews_eval_cfg)
|
| 48 |
+
]
|
build/lib/opencompass/configs/datasets/FinanceIQ/FinanceIQ_gen.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .FinanceIQ_gen_e0e6b5 import financeIQ_datasets # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/FinanceIQ/FinanceIQ_gen_e0e6b5.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import FixKRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.openicl.icl_evaluator import AccEvaluator
|
| 5 |
+
from opencompass.datasets import FinanceIQDataset
|
| 6 |
+
from opencompass.utils.text_postprocessors import first_capital_postprocess
|
| 7 |
+
|
| 8 |
+
financeIQ_subject_mapping_en = {
|
| 9 |
+
'certified_public_accountant': '注册会计师(CPA)',
|
| 10 |
+
'banking_qualification': '银行从业资格',
|
| 11 |
+
'securities_qualification': '证券从业资格',
|
| 12 |
+
'fund_qualification': '基金从业资格',
|
| 13 |
+
'insurance_qualification': '保险从业资格CICE',
|
| 14 |
+
'economic_analyst': '经济师',
|
| 15 |
+
'taxation_practitioner': '税务师',
|
| 16 |
+
'futures_qualification': '期货从业资格',
|
| 17 |
+
'certified_fin_planner': '理财规划师',
|
| 18 |
+
'actuary_fin_math': '精算师-金融数学',
|
| 19 |
+
}
|
| 20 |
+
|
| 21 |
+
financeIQ_subject_mapping = {
|
| 22 |
+
'注册会计师(CPA)': '注册会计师(CPA)',
|
| 23 |
+
'银行从业资格': '银行从业资格',
|
| 24 |
+
'证券从业资格': '证券从业资格',
|
| 25 |
+
'基金从业资格': '基金从业资格',
|
| 26 |
+
'保险从业资格CICE': '保险从业资格CICE',
|
| 27 |
+
'经济师': '经济师',
|
| 28 |
+
'税务师': '税务师',
|
| 29 |
+
'期货从业资格': '期货从业资格',
|
| 30 |
+
'理财规划师': '理财规划师',
|
| 31 |
+
'精算师-金融数学': '精算师-金融数学',
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
financeIQ_all_sets = list(financeIQ_subject_mapping.keys())
|
| 35 |
+
|
| 36 |
+
financeIQ_datasets = []
|
| 37 |
+
for _name in financeIQ_all_sets:
|
| 38 |
+
_ch_name = financeIQ_subject_mapping[_name]
|
| 39 |
+
financeIQ_infer_cfg = dict(
|
| 40 |
+
ice_template=dict(
|
| 41 |
+
type=PromptTemplate,
|
| 42 |
+
template=dict(
|
| 43 |
+
begin='</E>',
|
| 44 |
+
round=[
|
| 45 |
+
dict(
|
| 46 |
+
role='HUMAN',
|
| 47 |
+
prompt=
|
| 48 |
+
f'以下是关于{_ch_name}的单项选择题,请直接给出正确答案的选项。\n题目:{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}'
|
| 49 |
+
),
|
| 50 |
+
dict(role='BOT', prompt='答案是: {answer}'),
|
| 51 |
+
]),
|
| 52 |
+
ice_token='</E>',
|
| 53 |
+
),
|
| 54 |
+
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
|
| 55 |
+
inferencer=dict(type=GenInferencer),
|
| 56 |
+
)
|
| 57 |
+
|
| 58 |
+
financeIQ_eval_cfg = dict(
|
| 59 |
+
evaluator=dict(type=AccEvaluator),
|
| 60 |
+
pred_postprocessor=dict(type=first_capital_postprocess))
|
| 61 |
+
|
| 62 |
+
financeIQ_datasets.append(
|
| 63 |
+
dict(
|
| 64 |
+
type=FinanceIQDataset,
|
| 65 |
+
path='./data/FinanceIQ/',
|
| 66 |
+
name=_name,
|
| 67 |
+
abbr=f'FinanceIQ-{_name}',
|
| 68 |
+
reader_cfg=dict(
|
| 69 |
+
input_columns=['question', 'A', 'B', 'C', 'D'],
|
| 70 |
+
output_column='answer',
|
| 71 |
+
train_split='dev',
|
| 72 |
+
test_split='test'),
|
| 73 |
+
infer_cfg=financeIQ_infer_cfg,
|
| 74 |
+
eval_cfg=financeIQ_eval_cfg,
|
| 75 |
+
))
|
| 76 |
+
|
| 77 |
+
del _name, _ch_name
|
build/lib/opencompass/configs/datasets/FinanceIQ/FinanceIQ_ppl.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .FinanceIQ_ppl_42b9bd import financeIQ_datasets # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/FinanceIQ/FinanceIQ_ppl_42b9bd.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import FixKRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import PPLInferencer
|
| 4 |
+
from opencompass.openicl.icl_evaluator import AccEvaluator
|
| 5 |
+
from opencompass.datasets import FinanceIQDataset
|
| 6 |
+
|
| 7 |
+
financeIQ_subject_mapping_en = {
|
| 8 |
+
'certified_public_accountant': '注册会计师(CPA)',
|
| 9 |
+
'banking_qualification': '银行从业资格',
|
| 10 |
+
'securities_qualification': '证券从业资格',
|
| 11 |
+
'fund_qualification': '基金从业资格',
|
| 12 |
+
'insurance_qualification': '保险从业资格CICE',
|
| 13 |
+
'economic_analyst': '经济师',
|
| 14 |
+
'taxation_practitioner': '税务师',
|
| 15 |
+
'futures_qualification': '期货从业资格',
|
| 16 |
+
'certified_fin_planner': '理财规划师',
|
| 17 |
+
'actuary_fin_math': '精算师-金融数学',
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
financeIQ_subject_mapping = {
|
| 21 |
+
'注册会计师(CPA)': '注册会计师(CPA)',
|
| 22 |
+
'银行从业资格': '银行从业资格',
|
| 23 |
+
'证券从业资格': '证券从业资格',
|
| 24 |
+
'基金从业资格': '基金从业资格',
|
| 25 |
+
'保险从业资格CICE': '保险从业资格CICE',
|
| 26 |
+
'经济师': '经济师',
|
| 27 |
+
'税务师': '税务师',
|
| 28 |
+
'期货从业资格': '期货从业资格',
|
| 29 |
+
'理财规划师': '理财规划师',
|
| 30 |
+
'精算师-金融数学': '精算师-金融数学',
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
financeIQ_all_sets = list(financeIQ_subject_mapping.keys())
|
| 34 |
+
|
| 35 |
+
financeIQ_datasets = []
|
| 36 |
+
for _name in financeIQ_all_sets:
|
| 37 |
+
_ch_name = financeIQ_subject_mapping[_name]
|
| 38 |
+
financeIQ_infer_cfg = dict(
|
| 39 |
+
ice_template=dict(
|
| 40 |
+
type=PromptTemplate,
|
| 41 |
+
template={
|
| 42 |
+
answer: dict(
|
| 43 |
+
begin='</E>',
|
| 44 |
+
round=[
|
| 45 |
+
dict(
|
| 46 |
+
role='HUMAN',
|
| 47 |
+
prompt=f'以下是关于{_ch_name}的单项选择题,请直接给出正确答案的选项。\n题目:{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}'
|
| 48 |
+
),
|
| 49 |
+
dict(role='BOT', prompt=f'答案是: {answer}'),
|
| 50 |
+
])
|
| 51 |
+
for answer in ['A', 'B', 'C', 'D']
|
| 52 |
+
},
|
| 53 |
+
ice_token='</E>',
|
| 54 |
+
),
|
| 55 |
+
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
|
| 56 |
+
inferencer=dict(type=PPLInferencer),
|
| 57 |
+
)
|
| 58 |
+
|
| 59 |
+
financeIQ_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
|
| 60 |
+
|
| 61 |
+
financeIQ_datasets.append(
|
| 62 |
+
dict(
|
| 63 |
+
type=FinanceIQDataset,
|
| 64 |
+
path='./data/FinanceIQ/',
|
| 65 |
+
name=_name,
|
| 66 |
+
abbr=f'FinanceIQ-{_name}',
|
| 67 |
+
reader_cfg=dict(
|
| 68 |
+
input_columns=['question', 'A', 'B', 'C', 'D'],
|
| 69 |
+
output_column='answer',
|
| 70 |
+
train_split='dev',
|
| 71 |
+
test_split='test'),
|
| 72 |
+
infer_cfg=financeIQ_infer_cfg,
|
| 73 |
+
eval_cfg=financeIQ_eval_cfg,
|
| 74 |
+
))
|
| 75 |
+
|
| 76 |
+
del _name, _ch_name
|
build/lib/opencompass/configs/datasets/GLUE_CoLA/GLUE_CoLA_ppl.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .GLUE_CoLA_ppl_77d0df import CoLA_datasets # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/GLUE_CoLA/GLUE_CoLA_ppl_77d0df.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import FixKRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import PPLInferencer
|
| 4 |
+
from opencompass.openicl.icl_evaluator import AccEvaluator
|
| 5 |
+
from opencompass.datasets import HFDataset
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
_hint = 'The following are text classification questions. \n' \
|
| 9 |
+
'Please determine whether the following sentence is linguistically acceptable: ' \
|
| 10 |
+
'0 means unacceptable, 1 means acceptable.\n'
|
| 11 |
+
|
| 12 |
+
CoLA_infer_cfg = dict(
|
| 13 |
+
ice_template=dict(
|
| 14 |
+
type=PromptTemplate,
|
| 15 |
+
template='Sentence: {sentence}\nResult: {label}',
|
| 16 |
+
),
|
| 17 |
+
prompt_template=dict(
|
| 18 |
+
type=PromptTemplate,
|
| 19 |
+
template={
|
| 20 |
+
answer:
|
| 21 |
+
f'{_hint}</E>Sentence: {{sentence}}\nResult: {answer}'
|
| 22 |
+
for answer in [0, 1]
|
| 23 |
+
},
|
| 24 |
+
ice_token='</E>',
|
| 25 |
+
),
|
| 26 |
+
retriever=dict(type=FixKRetriever, fix_id_list=[17, 18, 19, 20, 21]),
|
| 27 |
+
inferencer=dict(type=PPLInferencer))
|
| 28 |
+
|
| 29 |
+
CoLA_eval_cfg = dict(evaluator=dict(type=AccEvaluator), )
|
| 30 |
+
|
| 31 |
+
CoLA_datasets = []
|
| 32 |
+
for _split in ['validation']:
|
| 33 |
+
|
| 34 |
+
CoLA_reader_cfg = dict(
|
| 35 |
+
input_columns=['sentence'],
|
| 36 |
+
output_column='label',
|
| 37 |
+
test_split=_split
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
+
CoLA_datasets.append(
|
| 41 |
+
dict(
|
| 42 |
+
abbr=f'CoLA-{_split}',
|
| 43 |
+
type=HFDataset,
|
| 44 |
+
path='glue',
|
| 45 |
+
name='cola',
|
| 46 |
+
reader_cfg=CoLA_reader_cfg,
|
| 47 |
+
infer_cfg=CoLA_infer_cfg,
|
| 48 |
+
eval_cfg=CoLA_eval_cfg
|
| 49 |
+
)
|
| 50 |
+
)
|
build/lib/opencompass/configs/datasets/GLUE_MRPC/GLUE_MRPC_ppl.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .GLUE_MRPC_ppl_96564c import MRPC_datasets # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/GLUE_MRPC/GLUE_MRPC_ppl_96564c.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import FixKRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import PPLInferencer
|
| 4 |
+
from opencompass.openicl.icl_evaluator import AccEvaluator
|
| 5 |
+
from opencompass.datasets import HFDataset
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
_hint = 'The following are semantic matching questions. \n' \
|
| 9 |
+
'Please determine whether the following two sentences are semantically equivalent: ' \
|
| 10 |
+
'0 means not equivalent, 1 means equivalent.\n'
|
| 11 |
+
MRPC_infer_cfg = dict(
|
| 12 |
+
ice_template=dict(
|
| 13 |
+
type=PromptTemplate,
|
| 14 |
+
template='Sentence one: {sentence1}\nSentence two: {sentence2}\nResult: {label}',
|
| 15 |
+
),
|
| 16 |
+
prompt_template=dict(
|
| 17 |
+
type=PromptTemplate,
|
| 18 |
+
template={
|
| 19 |
+
answer:
|
| 20 |
+
f'{_hint}</E>Sentence one: {{sentence1}}\nSentence two: {{sentence2}}\nResult: {answer}'
|
| 21 |
+
for answer in [0, 1]
|
| 22 |
+
},
|
| 23 |
+
ice_token='</E>',
|
| 24 |
+
),
|
| 25 |
+
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
|
| 26 |
+
inferencer=dict(type=PPLInferencer))
|
| 27 |
+
|
| 28 |
+
MRPC_eval_cfg = dict(evaluator=dict(type=AccEvaluator), )
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
MRPC_datasets = []
|
| 32 |
+
for _split in ['validation', 'test']:
|
| 33 |
+
|
| 34 |
+
MRPC_reader_cfg = dict(
|
| 35 |
+
input_columns=['sentence1', 'sentence2'],
|
| 36 |
+
output_column='label',
|
| 37 |
+
train_split='train',
|
| 38 |
+
test_split=_split
|
| 39 |
+
)
|
| 40 |
+
|
| 41 |
+
MRPC_datasets.append(
|
| 42 |
+
dict(
|
| 43 |
+
abbr=f'MRPC-{_split}',
|
| 44 |
+
type=HFDataset,
|
| 45 |
+
path='glue',
|
| 46 |
+
name='mrpc',
|
| 47 |
+
reader_cfg=MRPC_reader_cfg,
|
| 48 |
+
infer_cfg=MRPC_infer_cfg,
|
| 49 |
+
eval_cfg=MRPC_eval_cfg
|
| 50 |
+
)
|
| 51 |
+
)
|
build/lib/opencompass/configs/datasets/GLUE_QQP/GLUE_QQP_ppl.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .GLUE_QQP_ppl_250d00 import QQP_datasets # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/GLUE_QQP/GLUE_QQP_ppl_250d00.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import FixKRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import PPLInferencer
|
| 4 |
+
from opencompass.openicl.icl_evaluator import AccEvaluator
|
| 5 |
+
from opencompass.datasets import HFDataset
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
_hint = 'The following are semantic matching questions. \n' \
|
| 9 |
+
'Please determine whether the following two sentences are semantically duplicate: ' \
|
| 10 |
+
'0 means not duplicate, 1 means duplicate.\n'
|
| 11 |
+
QQP_infer_cfg = dict(
|
| 12 |
+
ice_template=dict(
|
| 13 |
+
type=PromptTemplate,
|
| 14 |
+
template='Sentence one: {question1}\nSentence two: {question2}\nResult: {label}',
|
| 15 |
+
),
|
| 16 |
+
prompt_template=dict(
|
| 17 |
+
type=PromptTemplate,
|
| 18 |
+
template={
|
| 19 |
+
answer:
|
| 20 |
+
f'{_hint}</E>Sentence one: {{question1}}\nSentence two: {{question2}}\nResult: {answer}'
|
| 21 |
+
for answer in [0, 1]
|
| 22 |
+
},
|
| 23 |
+
ice_token='</E>',
|
| 24 |
+
),
|
| 25 |
+
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
|
| 26 |
+
inferencer=dict(type=PPLInferencer))
|
| 27 |
+
|
| 28 |
+
QQP_eval_cfg = dict(evaluator=dict(type=AccEvaluator), )
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
QQP_datasets = []
|
| 32 |
+
for _split in ['validation', 'test']:
|
| 33 |
+
|
| 34 |
+
QQP_reader_cfg = dict(
|
| 35 |
+
input_columns=['question1', 'question2'],
|
| 36 |
+
output_column='label',
|
| 37 |
+
train_split='train',
|
| 38 |
+
test_split=_split
|
| 39 |
+
)
|
| 40 |
+
|
| 41 |
+
QQP_datasets.append(
|
| 42 |
+
dict(
|
| 43 |
+
abbr=f'QQP-{_split}',
|
| 44 |
+
type=HFDataset,
|
| 45 |
+
path='glue',
|
| 46 |
+
name='qqp',
|
| 47 |
+
reader_cfg=QQP_reader_cfg,
|
| 48 |
+
infer_cfg=QQP_infer_cfg,
|
| 49 |
+
eval_cfg=QQP_eval_cfg
|
| 50 |
+
)
|
| 51 |
+
)
|
build/lib/opencompass/configs/datasets/GaokaoBench/GaokaoBench_gen.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .GaokaoBench_gen_5cfe9e import GaokaoBench_datasets # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/GaokaoBench/GaokaoBench_gen_5cfe9e.py
ADDED
|
@@ -0,0 +1,304 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 4 |
+
from opencompass.datasets import GaokaoBenchDataset
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
_MCQ_prompts = [
|
| 8 |
+
{
|
| 9 |
+
'type': 'single_choice',
|
| 10 |
+
'keyword': '2010-2022_Math_II_MCQs',
|
| 11 |
+
'prefix_prompt':
|
| 12 |
+
'请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
|
| 13 |
+
'comment': ''
|
| 14 |
+
},
|
| 15 |
+
{
|
| 16 |
+
'type': 'single_choice',
|
| 17 |
+
'keyword': '2010-2022_Math_I_MCQs',
|
| 18 |
+
'prefix_prompt':
|
| 19 |
+
'请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
|
| 20 |
+
'comment': ''
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
'type':
|
| 24 |
+
'single_choice',
|
| 25 |
+
'keyword':
|
| 26 |
+
'2010-2022_History_MCQs',
|
| 27 |
+
'prefix_prompt':
|
| 28 |
+
'请你做一道历史选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
|
| 29 |
+
},
|
| 30 |
+
{
|
| 31 |
+
'type':
|
| 32 |
+
'single_choice',
|
| 33 |
+
'keyword':
|
| 34 |
+
'2010-2022_Biology_MCQs',
|
| 35 |
+
'prefix_prompt':
|
| 36 |
+
'请你做一道生物选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
|
| 37 |
+
},
|
| 38 |
+
{
|
| 39 |
+
'type':
|
| 40 |
+
'single_choice',
|
| 41 |
+
'keyword':
|
| 42 |
+
'2010-2022_Political_Science_MCQs',
|
| 43 |
+
'prefix_prompt':
|
| 44 |
+
'请你做一道政治选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
|
| 45 |
+
},
|
| 46 |
+
{
|
| 47 |
+
'type':
|
| 48 |
+
'multi_choice',
|
| 49 |
+
'keyword':
|
| 50 |
+
'2010-2022_Physics_MCQs',
|
| 51 |
+
'prefix_prompt':
|
| 52 |
+
'请你做一道物理选择题。\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出所有符合题意的答案,并写在【答案】和<eoa>之间。\n例如:【答案】 AB <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】... <eoa>\n请你严格按照上述格式作答。\n'
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
'type':
|
| 56 |
+
'single_choice',
|
| 57 |
+
'keyword':
|
| 58 |
+
'2010-2022_Chemistry_MCQs',
|
| 59 |
+
'prefix_prompt':
|
| 60 |
+
'请你做一道化学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
'type':
|
| 64 |
+
'single_choice',
|
| 65 |
+
'keyword':
|
| 66 |
+
'2010-2013_English_MCQs',
|
| 67 |
+
'prefix_prompt':
|
| 68 |
+
'请你做一道英语选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
|
| 69 |
+
},
|
| 70 |
+
{
|
| 71 |
+
'type':
|
| 72 |
+
'multi_question_choice',
|
| 73 |
+
'keyword':
|
| 74 |
+
'2010-2022_Chinese_Modern_Lit',
|
| 75 |
+
'prefix_prompt':
|
| 76 |
+
'请你做一道语文阅读理解题,其中包含三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
|
| 77 |
+
},
|
| 78 |
+
{
|
| 79 |
+
'type':
|
| 80 |
+
'multi_question_choice',
|
| 81 |
+
'keyword':
|
| 82 |
+
'2010-2022_English_Fill_in_Blanks',
|
| 83 |
+
'prefix_prompt':
|
| 84 |
+
'请你做一道英语完形填空题,其中包含二十个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
|
| 85 |
+
},
|
| 86 |
+
{
|
| 87 |
+
'type':
|
| 88 |
+
'five_out_of_seven',
|
| 89 |
+
'keyword':
|
| 90 |
+
'2012-2022_English_Cloze_Test',
|
| 91 |
+
'prefix_prompt':
|
| 92 |
+
'请回答下面的问题,将符合题意的五个选项的字母写在【答案】和<eoa>之间,例如“【答案】 A B C D E <eoa>\n请严格按照上述格式作答。\n'
|
| 93 |
+
},
|
| 94 |
+
{
|
| 95 |
+
'type':
|
| 96 |
+
'multi_question_choice',
|
| 97 |
+
'keyword':
|
| 98 |
+
'2010-2022_Geography_MCQs',
|
| 99 |
+
'prefix_prompt':
|
| 100 |
+
'请你做一道地理选择题,其中包含两到三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
|
| 101 |
+
},
|
| 102 |
+
{
|
| 103 |
+
'type':
|
| 104 |
+
'multi_question_choice',
|
| 105 |
+
'keyword':
|
| 106 |
+
'2010-2022_English_Reading_Comp',
|
| 107 |
+
'prefix_prompt':
|
| 108 |
+
'请你做一道英语阅读理解题,其中包含三到五个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
'type':
|
| 112 |
+
'multi_question_choice',
|
| 113 |
+
'keyword':
|
| 114 |
+
'2010-2022_Chinese_Lang_and_Usage_MCQs',
|
| 115 |
+
'prefix_prompt':
|
| 116 |
+
'请你做一道语文选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n(1)【解析】 ... <eoe>\n【答案】 ... <eoa>\n(2)【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答\n题目如下:'
|
| 117 |
+
},
|
| 118 |
+
]
|
| 119 |
+
_FBQ_prompts = [{
|
| 120 |
+
'type': 'cloze',
|
| 121 |
+
'keyword': '2010-2022_Math_I_Fill-in-the-Blank',
|
| 122 |
+
'prefix_prompt':
|
| 123 |
+
'请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:',
|
| 124 |
+
'comment': ''
|
| 125 |
+
}, {
|
| 126 |
+
'type': 'cloze',
|
| 127 |
+
'keyword': '2010-2022_Math_II_Fill-in-the-Blank',
|
| 128 |
+
'prefix_prompt':
|
| 129 |
+
'请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:',
|
| 130 |
+
'comment': ''
|
| 131 |
+
}, {
|
| 132 |
+
'type': 'cloze',
|
| 133 |
+
'keyword':
|
| 134 |
+
'2010-2022_Chinese_Language_Famous_Passages_and_Sentences_Dictation',
|
| 135 |
+
'prefix_prompt':
|
| 136 |
+
'请回答下面的语文填空题\n请你仔细阅读题目,先找到题目对应的中国名篇,再从名篇中找到合适的句子填写到题目的空白处。请你将思考过程写在【解析】和<eoe>之间,将最终答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n(2)【解析】 ...<eoe>\n【答案】...<eoa>\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
| 137 |
+
'comment': ''
|
| 138 |
+
}, {
|
| 139 |
+
'type': 'cloze',
|
| 140 |
+
'keyword': '2014-2022_English_Language_Cloze_Passage',
|
| 141 |
+
'prefix_prompt':
|
| 142 |
+
'请回答下面的英语短文填词题\n仔细阅读题目,空白处请填入一个适当单词或者括号内单词的正确形式。请你一步步思考,将思考过程写在【解析】和<eoe>之间,将最终答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n(2)【解析】 ...<eoe>\n【答案】...<eoa>\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
| 143 |
+
'comment': ''
|
| 144 |
+
}]
|
| 145 |
+
_OEQ_prompts = [
|
| 146 |
+
{
|
| 147 |
+
'type': 'subjective',
|
| 148 |
+
'keyword': '2010-2022_Geography_Open-ended_Questions',
|
| 149 |
+
'prefix_prompt':
|
| 150 |
+
'请解答下面的地理解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。你的答案请写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果���止一道题,请分别作答。\n题目如下:',
|
| 151 |
+
'comment': ''
|
| 152 |
+
},
|
| 153 |
+
{
|
| 154 |
+
'type': 'subjective',
|
| 155 |
+
'keyword': '2010-2022_Chemistry_Open-ended_Questions',
|
| 156 |
+
'prefix_prompt':
|
| 157 |
+
'请解答下面的化学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
| 158 |
+
'comment': ''
|
| 159 |
+
},
|
| 160 |
+
{
|
| 161 |
+
'type': 'subjective',
|
| 162 |
+
'keyword': '2010-2022_Math_I_Open-ended_Questions',
|
| 163 |
+
'prefix_prompt':
|
| 164 |
+
'请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
| 165 |
+
'comment': ''
|
| 166 |
+
},
|
| 167 |
+
{
|
| 168 |
+
'type': 'subjective',
|
| 169 |
+
'keyword': '2010-2022_History_Open-ended_Questions',
|
| 170 |
+
'prefix_prompt':
|
| 171 |
+
'请解答下面的历史解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
| 172 |
+
'comment': ''
|
| 173 |
+
},
|
| 174 |
+
{
|
| 175 |
+
'type': 'subjective',
|
| 176 |
+
'keyword': '2010-2022_Biology_Open-ended_Questions',
|
| 177 |
+
'prefix_prompt':
|
| 178 |
+
'请解答下面的生物解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,同一小题的答案用\t分隔开。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...\t...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...\t...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
| 179 |
+
'comment': ''
|
| 180 |
+
},
|
| 181 |
+
{
|
| 182 |
+
'type': 'subjective',
|
| 183 |
+
'keyword': '2010-2022_Math_II_Open-ended_Questions',
|
| 184 |
+
'prefix_prompt':
|
| 185 |
+
'请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
| 186 |
+
'comment': ''
|
| 187 |
+
},
|
| 188 |
+
{
|
| 189 |
+
'type': 'subjective',
|
| 190 |
+
'keyword': '2010-2022_Physics_Open-ended_Questions',
|
| 191 |
+
'prefix_prompt':
|
| 192 |
+
'请解答下面的物理解答题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【解析】 ...<eoe>\n【答案】 ...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:',
|
| 193 |
+
'comment': ''
|
| 194 |
+
},
|
| 195 |
+
{
|
| 196 |
+
'type': 'subjective',
|
| 197 |
+
'keyword': '2010-2022_Political_Science_Open-ended_Questions',
|
| 198 |
+
'prefix_prompt':
|
| 199 |
+
'请解答下面的政治解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
| 200 |
+
'comment': ''
|
| 201 |
+
},
|
| 202 |
+
{
|
| 203 |
+
'type': 'correction',
|
| 204 |
+
'keyword': '2012-2022_English_Language_Error_Correction',
|
| 205 |
+
'prefix_prompt':
|
| 206 |
+
'请解答下面的英语短文改错题,仔细阅读题目并充分结合你你已有的知识,找出其中10处需要改动的地方。请你一���步思考,把修改后的短文写在【答案】和<eoa>之间。\n完整的题目回答格式如下:【答案】 ...<eoa>\n 请你严格按照上述格式作答。\n题目如下:',
|
| 207 |
+
# "prefix_prompt": [
|
| 208 |
+
# "请解答下面的英语短文改错题,仔细阅读题目并充分结合你你已有的知识,找出其中10处需要改动的地方。请你一步步思考,把修改后的短文写在【答案】和<eoa>之间。\n完整的题目回答格式如下:【答案】 ...<eoa>\n 请你严格按照上述格式作答。\n题目如下:",
|
| 209 |
+
# "请比较下面两篇短文,找到第二篇和第一篇的10处不同,每处不同只涉及一个单词,请将结果写在【答案】和<eoa>之间。例如:【答案】1. 将play改为plays\n 2.增加了the\n ... <eoa>\n 完整的题目回答格式如下:【答案】(1) ... \n (2) ...\n ...(10) ...\n<eoa>\n请你严格按照上述格式作答。\n短文如下:"
|
| 210 |
+
# ],
|
| 211 |
+
'comment': ''
|
| 212 |
+
},
|
| 213 |
+
{
|
| 214 |
+
'type': 'subjective',
|
| 215 |
+
'keyword': '2010-2022_Chinese_Language_Ancient_Poetry_Reading',
|
| 216 |
+
'prefix_prompt':
|
| 217 |
+
'请解答下面的语文古代诗歌阅读题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
| 218 |
+
'comment': ''
|
| 219 |
+
},
|
| 220 |
+
{
|
| 221 |
+
'type': 'subjective',
|
| 222 |
+
'keyword': '2010-2022_Chinese_Language_Practical_Text_Reading',
|
| 223 |
+
'prefix_prompt':
|
| 224 |
+
'请解答下面的语文实用类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
| 225 |
+
'comment': ''
|
| 226 |
+
},
|
| 227 |
+
{
|
| 228 |
+
'type': 'subjective',
|
| 229 |
+
'keyword': '2010-2022_Chinese_Language_Literary_Text_Reading',
|
| 230 |
+
'prefix_prompt':
|
| 231 |
+
'请解答下面的语文文学类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
| 232 |
+
'comment': ''
|
| 233 |
+
},
|
| 234 |
+
{
|
| 235 |
+
'type': 'subjective',
|
| 236 |
+
'keyword': '2010-2022_Chinese_Language_Classical_Chinese_Reading',
|
| 237 |
+
'prefix_prompt':
|
| 238 |
+
'请解答下面的语文文言文阅读,仔细阅读题目,前三题是单选题,最后一题要将文言文翻译为现代汉语。请你一步步思考并把最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。翻译题把翻译后的现代汉语句子写在【答案】后面,例如”【答案】今天天气很好 <eoa>”\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
| 239 |
+
'comment': ''
|
| 240 |
+
},
|
| 241 |
+
{
|
| 242 |
+
'type': 'subjective',
|
| 243 |
+
'keyword':
|
| 244 |
+
'2010-2022_Chinese_Language_Language_and_Writing_Skills_Open-ended_Questions',
|
| 245 |
+
'prefix_prompt':
|
| 246 |
+
'请解答下面的语文解答题,仔细阅读题目,注意其中可能含有选择题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【解析】 ...<eoe>\n【答案】 ...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:',
|
| 247 |
+
'comment': ''
|
| 248 |
+
}
|
| 249 |
+
]
|
| 250 |
+
|
| 251 |
+
GaokaoBench_datasets = []
|
| 252 |
+
for _folder, _prompts in [
|
| 253 |
+
('Multiple-choice_Questions', _MCQ_prompts),
|
| 254 |
+
('Fill-in-the-blank_Questions', _FBQ_prompts),
|
| 255 |
+
('Open-ended_Questions', _OEQ_prompts),
|
| 256 |
+
]:
|
| 257 |
+
for _p in _prompts:
|
| 258 |
+
_reader_cfg = {
|
| 259 |
+
'input_columns': ['question'],
|
| 260 |
+
'output_column': 'answer',
|
| 261 |
+
}
|
| 262 |
+
_infer_cfg = {
|
| 263 |
+
'ice_template': {
|
| 264 |
+
'type': PromptTemplate,
|
| 265 |
+
'template': {
|
| 266 |
+
'round': [{
|
| 267 |
+
'role': 'HUMAN',
|
| 268 |
+
'prompt': _p['prefix_prompt'] + '{question}'
|
| 269 |
+
}]
|
| 270 |
+
},
|
| 271 |
+
'ice_token': '</E>'
|
| 272 |
+
},
|
| 273 |
+
'retriever': {
|
| 274 |
+
'type': ZeroRetriever
|
| 275 |
+
},
|
| 276 |
+
'inferencer': {
|
| 277 |
+
'type': GenInferencer,
|
| 278 |
+
'max_out_len': 1024,
|
| 279 |
+
}
|
| 280 |
+
}
|
| 281 |
+
_eval_cfg = {
|
| 282 |
+
'evaluator': {
|
| 283 |
+
'type': 'GaokaoBenchEvaluator' + '_' + _p['type'],
|
| 284 |
+
},
|
| 285 |
+
'pred_role': 'BOT',
|
| 286 |
+
}
|
| 287 |
+
_base_path = 'opencompass/GAOKAO-BENCH'
|
| 288 |
+
_dataset = {
|
| 289 |
+
'type': GaokaoBenchDataset,
|
| 290 |
+
'abbr': 'GaokaoBench_' + _p['keyword'],
|
| 291 |
+
'path': _base_path,
|
| 292 |
+
'filename': '/' + _folder + '/' + _p['keyword'] + '.json',
|
| 293 |
+
'name': _p['keyword'],
|
| 294 |
+
'reader_cfg': _reader_cfg,
|
| 295 |
+
'infer_cfg': _infer_cfg,
|
| 296 |
+
'eval_cfg': _eval_cfg,
|
| 297 |
+
}
|
| 298 |
+
|
| 299 |
+
GaokaoBench_datasets.append(_dataset)
|
| 300 |
+
|
| 301 |
+
_temporary_variables = [k for k in globals() if k.startswith('_')]
|
| 302 |
+
for _t in _temporary_variables:
|
| 303 |
+
del globals()[_t]
|
| 304 |
+
del _temporary_variables, _t
|
build/lib/opencompass/configs/datasets/GaokaoBench/GaokaoBench_mixed.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmengine.config import read_base
|
| 2 |
+
|
| 3 |
+
with read_base():
|
| 4 |
+
from .GaokaoBench_mixed_9af5ee import GaokaoBench_datasets # noqa: F401, F403
|
build/lib/opencompass/configs/datasets/GaokaoBench/GaokaoBench_mixed_9af5ee.py
ADDED
|
@@ -0,0 +1,356 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
|
| 4 |
+
from opencompass.datasets import GaokaoBenchDataset
|
| 5 |
+
# Prompt definitions for the GAOKAO-Bench multiple-choice question groups.
# Each entry carries:
#   type          - question category, selects the evaluator/inferencer below
#   keyword       - JSON file stem under Multiple-choice_Questions/
#   prefix_prompt - instruction text prepended to '{question}'
#   comment       - optional free-form note (unused at runtime)
_MCQ_prompts = [
    {
        'type': 'single_choice',
        'keyword': '2010-2022_Math_II_MCQs',
        'prefix_prompt': '请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
        'comment': '',
    },
    {
        'type': 'single_choice',
        'keyword': '2010-2022_Math_I_MCQs',
        'prefix_prompt': '请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
        'comment': '',
    },
    {
        'type': 'single_choice',
        'keyword': '2010-2022_History_MCQs',
        'prefix_prompt': '请你做一道历史选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
    },
    {
        'type': 'single_choice',
        'keyword': '2010-2022_Biology_MCQs',
        'prefix_prompt': '请你做一道生物选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
    },
    {
        'type': 'single_choice',
        'keyword': '2010-2022_Political_Science_MCQs',
        'prefix_prompt': '请你做一道政治选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
    },
    {
        # Physics allows multiple correct options, hence 'multi_choice'.
        'type': 'multi_choice',
        'keyword': '2010-2022_Physics_MCQs',
        'prefix_prompt': '请你做一道物理选择题。\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出所有符合题意的答案,并写在【答案】和<eoa>之间。\n例如:【答案】 AB <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】... <eoa>\n请你严格按照上述格式作答。\n',
    },
    {
        'type': 'single_choice',
        'keyword': '2010-2022_Chemistry_MCQs',
        'prefix_prompt': '请你做一道化学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
    },
    {
        'type': 'single_choice',
        'keyword': '2010-2013_English_MCQs',
        'prefix_prompt': '请你做一道英语选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
    },
    {
        'type': 'multi_question_choice',
        'keyword': '2010-2022_Chinese_Modern_Lit',
        'prefix_prompt': '请你做一道语文阅读理解题,其中包含三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n',
    },
    {
        'type': 'multi_question_choice',
        'keyword': '2010-2022_English_Fill_in_Blanks',
        'prefix_prompt': '请你做一道英语完形填空题,其中包含二十个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n',
    },
    {
        # Cloze test: pick five sentences out of seven candidates.
        'type': 'five_out_of_seven',
        'keyword': '2012-2022_English_Cloze_Test',
        'prefix_prompt': '请回答下面的问题,将符合题意的五个选项的字母写在【答案】和<eoa>之间,例如“【答案】 A B C D E <eoa>\n请严格按照上述格式作答。\n',
    },
    {
        'type': 'multi_question_choice',
        'keyword': '2010-2022_Geography_MCQs',
        'prefix_prompt': '请你做一道地理选择题,其中包含两到三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n',
    },
    {
        'type': 'multi_question_choice',
        'keyword': '2010-2022_English_Reading_Comp',
        'prefix_prompt': '请你做一道英语阅读理解题,其中包含三到五个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n',
    },
    {
        'type': 'multi_question_choice',
        'keyword': '2010-2022_Chinese_Lang_and_Usage_MCQs',
        'prefix_prompt': '请你做一道语文选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n(1)【解析】 ... <eoe>\n【答案】 ... <eoa>\n(2)【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答\n题目如下:',
    },
]
| 117 |
+
# Prompt definitions for the fill-in-the-blank (cloze) question groups.
# Same schema as _MCQ_prompts; every entry here is scored as 'cloze'.
_FBQ_prompts = [
    {
        'type': 'cloze',
        'keyword': '2010-2022_Math_I_Fill-in-the-Blank',
        'prefix_prompt': '请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:',
        'comment': '',
    },
    {
        'type': 'cloze',
        'keyword': '2010-2022_Math_II_Fill-in-the-Blank',
        'prefix_prompt': '请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:',
        'comment': '',
    },
    {
        'type': 'cloze',
        'keyword': '2010-2022_Chinese_Language_Famous_Passages_and_Sentences_Dictation',
        'prefix_prompt': '请回答下面的语文填空题\n请你仔细阅读题目,先找到题目对应的中国名篇,再从名篇中找到合适的句子填写到题目的空白处。请你将思考过程写在【解析】和<eoe>之间,将最终答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n(2)【解析】 ...<eoe>\n【答案】...<eoa>\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': '',
    },
    {
        'type': 'cloze',
        'keyword': '2014-2022_English_Language_Cloze_Passage',
        'prefix_prompt': '请回答下面的英语短文填词题\n仔细阅读题目,空白处请填入一个适当单词或者括号内单词的正确形式。请你一步步思考,将思考过程写在【解析】和<eoe>之间,将最终答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n(2)【解析】 ...<eoe>\n【答案】...<eoa>\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': '',
    },
]
| 143 |
+
# Prompt definitions for the open-ended (subjective / correction) question
# groups.  Same schema as _MCQ_prompts.
#
# Fix: two prompt strings contained U+FFFD replacement characters from a
# mis-encoded copy ("作��," in the Geography entry, "地方��请" in the
# error-correction entry).  They are restored to "作答," and "地方。请" —
# every sibling prompt uses the former phrasing, and the intact latter text
# survives verbatim in the commented-out duplicate kept below.
_OEQ_prompts = [
    {
        'type': 'subjective',
        'keyword': '2010-2022_Geography_Open-ended_Questions',
        'prefix_prompt': '请解答下面的地理解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。你的答案请写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': '',
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_Chemistry_Open-ended_Questions',
        'prefix_prompt': '请解答下面的化学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': '',
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_Math_I_Open-ended_Questions',
        'prefix_prompt': '请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': '',
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_History_Open-ended_Questions',
        'prefix_prompt': '请解答下面的历史解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': '',
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_Biology_Open-ended_Questions',
        'prefix_prompt': '请解答下面的生物解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,同一小题的答案用\t分隔开。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...\t...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...\t...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': '',
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_Math_II_Open-ended_Questions',
        'prefix_prompt': '请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': '',
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_Physics_Open-ended_Questions',
        'prefix_prompt': '请解答下面的物理解答题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【解析】 ...<eoe>\n【答案】 ...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:',
        'comment': '',
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_Political_Science_Open-ended_Questions',
        'prefix_prompt': '请解答下面的政治解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': '',
    },
    {
        'type': 'correction',
        'keyword': '2012-2022_English_Language_Error_Correction',
        # NOTE(review): "你你" below is a doubled character that also appears
        # in the commented-out variant, so it is faithful to the shipped
        # prompt; left unchanged to preserve benchmark reproducibility.
        'prefix_prompt': '请解答下面的英语短文改错题,仔细阅读题目并充分结合你你已有的知识,找出其中10处需要改动的地方。请你一步步思考,把修改后的短文写在【答案】和<eoa>之间。\n完整的题目回答格式如下:【答案】 ...<eoa>\n 请你严格按照上述格式作答。\n题目如下:',
        # "prefix_prompt": [
        #     "请解答下面的英语短文改错题,仔细阅读题目并充分结合你你已有的知识,找出其中10处需要改动的地方。请你一步步思考,把修改后的短文写在【答案】和<eoa>之间。\n完整的题目回答格式如下:【答案】 ...<eoa>\n 请你严格按照上述格式作答。\n题目如下:",
        #     "请比较下面两篇短文,找到第二篇和第一篇的10处不同,每处不同只涉及一个单词,请将结果写在【答案】和<eoa>之间。例如:【答案】1. 将play改为plays\n 2.增加了the\n ... <eoa>\n 完整的题目回答格式如下:【答案】(1) ... \n (2) ...\n ...(10) ...\n<eoa>\n请你严格按照上述格式作答。\n短文如下:"
        # ],
        'comment': '',
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_Chinese_Language_Ancient_Poetry_Reading',
        'prefix_prompt': '请解答下面的语文古代诗歌阅读题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': '',
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_Chinese_Language_Practical_Text_Reading',
        # NOTE(review): "(1)[答案】" below mixes half-width and full-width
        # brackets; the same form recurs in the next two entries, so it is
        # kept verbatim rather than "fixed" — confirm against upstream.
        'prefix_prompt': '请解答下面的语文实用类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': '',
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_Chinese_Language_Literary_Text_Reading',
        'prefix_prompt': '请解答下面的语文文学类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': '',
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_Chinese_Language_Classical_Chinese_Reading',
        'prefix_prompt': '请解答下面的语文文言文阅读,仔细阅读题目,前三题是单选题,最后一题要将文言文翻译为现代汉语。请你一步步思考并把最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。翻译题把翻译后的现代汉语句子写在【答案】后面,例如”【答案】今天天气很好 <eoa>”\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': '',
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_Chinese_Language_Language_and_Writing_Skills_Open-ended_Questions',
        'prefix_prompt': '请解答下面的语文解答题,仔细阅读题目,注意其中可能含有选择题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【解析】 ...<eoe>\n【答案】 ...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:',
        'comment': '',
    },
]
|
| 248 |
+
|
| 249 |
+
GaokaoBench_datasets = []

# Generation-style evaluation for every prompt group except the
# single-choice MCQs; those are handled by the PPL loop further below.
for _folder_name, _prompt_list in [
    ('Multiple-choice_Questions', _MCQ_prompts),
    ('Fill-in-the-blank_Questions', _FBQ_prompts),
    ('Open-ended_Questions', _OEQ_prompts),
]:
    for _prompt in _prompt_list:
        if _prompt['type'] == 'single_choice':
            continue

        _reader_cfg = {
            'input_columns': ['question'],
            'output_column': 'answer',
        }
        # Free-form generation: the instruction prefix is prepended to the
        # raw question and the model answers in the requested format.
        _infer_cfg = {
            'ice_template': {
                'type': PromptTemplate,
                'template': {
                    'round': [{
                        'role': 'HUMAN',
                        'prompt': _prompt['prefix_prompt'] + '{question}',
                    }],
                },
                'ice_token': '</E>',
            },
            'retriever': {'type': ZeroRetriever},
            'inferencer': {'type': GenInferencer, 'max_out_len': 1024},
        }
        # Evaluator class is selected by question type, e.g.
        # 'GaokaoBenchEvaluator_multi_choice'.
        _eval_cfg = {
            'evaluator': {'type': 'GaokaoBenchEvaluator_' + _prompt['type']},
            'pred_role': 'BOT',
        }
        # NOTE(review): this loop reads from './data/GAOKAO-BENCH/data' while
        # the PPL loop below uses 'opencompass/GAOKAO-BENCH' — confirm the
        # intended path layout.
        _base_path = './data/GAOKAO-BENCH/data'
        _dataset = {
            'type': GaokaoBenchDataset,
            'abbr': 'GaokaoBench_' + _prompt['keyword'],
            'path': _base_path,
            'filename': '/' + _folder_name + '/' + _prompt['keyword'] + '.json',
            'name': _prompt['keyword'],
            'reader_cfg': _reader_cfg,
            'infer_cfg': _infer_cfg,
            'eval_cfg': _eval_cfg,
        }

        GaokaoBench_datasets.append(_dataset)
|
| 300 |
+
|
| 301 |
+
# PPL-style evaluation for the single-choice MCQs: instead of generating,
# each candidate option is scored by perplexity and the best one is picked.
_folder = 'Multiple-choice_Questions'
for _prompt in _MCQ_prompts:
    if _prompt['type'] != 'single_choice':
        continue

    _reader_cfg = {
        'input_columns': ['question'],
        'output_column': 'answer',
    }
    _infer_cfg = {
        'ice_template': {
            'type': PromptTemplate,
            # One template per option: the question followed by a BOT turn
            # asserting that option, so the PPL inferencer can rank A–D.
            'template': {
                _opt: {
                    'round': [
                        {
                            'role': 'HUMAN',
                            'prompt': _prompt['prefix_prompt'] + '{question}',
                        },
                        {
                            'role': 'BOT',
                            'prompt': f'【答案】{_opt} <eoa>',
                        },
                    ],
                }
                for _opt in ['A', 'B', 'C', 'D']
            },
            'ice_token': '</E>',
        },
        'retriever': {'type': ZeroRetriever},
        'inferencer': {'type': PPLInferencer},
    }
    _eval_cfg = {
        'evaluator': {'type': 'GaokaoBenchEvaluator_' + _prompt['type']},
        'pred_role': 'BOT',
    }
    _base_path = 'opencompass/GAOKAO-BENCH'
    _dataset = {
        'type': GaokaoBenchDataset,
        'abbr': 'GaokaoBench_' + _prompt['keyword'],
        'path': _base_path,
        'filename': '/' + _folder + '/' + _prompt['keyword'] + '.json',
        'name': _prompt['keyword'],
        'reader_cfg': _reader_cfg,
        'infer_cfg': _infer_cfg,
        'eval_cfg': _eval_cfg,
    }

    GaokaoBench_datasets.append(_dataset)
|
| 352 |
+
|
| 353 |
+
# Strip every module-level name starting with '_' so that only the public
# dataset list remains when the config loader collects this module's globals.
# The snapshot list is built before '_tmp_names' itself is bound, so neither
# it nor the loop variable is in the set being deleted.
# NOTE: startswith('_') also matches dunder names such as __name__;
# presumably the config exec machinery tolerates that — confirm if reused.
_tmp_names = [_k for _k in globals() if _k.startswith('_')]
for _k in _tmp_names:
    del globals()[_k]
del _tmp_names, _k
|
build/lib/opencompass/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_4c31db.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 3 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 4 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 5 |
+
from opencompass.datasets import GaokaoBenchDataset
|
| 6 |
+
from mmengine.config import read_base
|
| 7 |
+
|
| 8 |
+
with read_base():
|
| 9 |
+
from .GaokaoBench_prompts import MCQ_prompts, FBQ_prompts
|
| 10 |
+
|
| 11 |
+
GaokaoBench_datasets = []

# Build one generation-style dataset config per prompt group; the subjective
# (open-ended) groups are deliberately excluded from this variant.
for folder, prompts in [
    ('Multiple-choice_Questions', MCQ_prompts),
    ('Fill-in-the-blank_Questions', FBQ_prompts),
]:
    for p in prompts:
        reader_cfg = {
            'input_columns': ['question'],
            'output_column': 'answer',
        }
        infer_cfg = {
            'ice_template': {
                'type': PromptTemplate,
                'template': {
                    'round': [{
                        'role': 'HUMAN',
                        'prompt': p['prefix_prompt'] + '{question}',
                    }],
                },
                'ice_token': '</E>',
            },
            'retriever': {'type': ZeroRetriever},
            'inferencer': {'type': GenInferencer, 'max_out_len': 1024},
        }
        # Evaluator class is chosen by question type, e.g.
        # 'GaokaoBenchEvaluator_cloze'.
        eval_cfg = {
            'evaluator': {'type': 'GaokaoBenchEvaluator_' + p['type']},
            'pred_role': 'BOT',
        }
        _base_path = 'opencompass/GAOKAO-BENCH'
        dataset = {
            'type': GaokaoBenchDataset,
            'abbr': 'GaokaoBench_' + p['keyword'],
            'path': _base_path,
            'filename': '/' + folder + '/' + p['keyword'] + '.json',
            'name': p['keyword'],
            'reader_cfg': reader_cfg,
            'infer_cfg': infer_cfg,
            'eval_cfg': eval_cfg,
        }
        GaokaoBench_datasets.append(dataset)
|