msj19 commited on
Commit
af6b942
·
verified ·
1 Parent(s): 9b40ad5

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_freqdist.py +7 -0
  2. .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_metrics.py +66 -0
  3. .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_naivebayes.py +21 -0
  4. .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_nombank.py +27 -0
  5. .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_pl196x.py +13 -0
  6. .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_pos_tag.py +83 -0
  7. .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_ribes.py +246 -0
  8. .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_rte_classify.py +94 -0
  9. .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_seekable_unicode_stream_reader.py +86 -0
  10. .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_senna.py +112 -0
  11. .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_stem.py +157 -0
  12. .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_tag.py +23 -0
  13. .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_tgrep.py +780 -0
  14. .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_tokenize.py +867 -0
  15. .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_twitter_auth.py +77 -0
  16. .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_util.py +82 -0
  17. .eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_wordnet.py +240 -0
  18. build/lib/opencompass/configs/dataset_collections/chat_OC15.py +22 -0
  19. build/lib/opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen.py +4 -0
  20. build/lib/opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_ppl_8f2872.py +45 -0
  21. build/lib/opencompass/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_gen.py +4 -0
  22. build/lib/opencompass/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen.py +4 -0
  23. build/lib/opencompass/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen_87f4a8.py +51 -0
  24. build/lib/opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_gen.py +4 -0
  25. build/lib/opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_gen_740ea0.py +49 -0
  26. build/lib/opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_ppl.py +4 -0
  27. build/lib/opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_ppl_f1e631.py +49 -0
  28. build/lib/opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_ppl.py +4 -0
  29. build/lib/opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_ppl_9e8b3d.py +60 -0
  30. build/lib/opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_ppl_c08300.py +44 -0
  31. build/lib/opencompass/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_gen.py +4 -0
  32. build/lib/opencompass/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_ppl.py +4 -0
  33. build/lib/opencompass/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_ppl_7d1c07.py +43 -0
  34. build/lib/opencompass/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_ppl_d10e8a.py +48 -0
  35. build/lib/opencompass/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_ppl_fff486.py +48 -0
  36. build/lib/opencompass/configs/datasets/FinanceIQ/FinanceIQ_gen.py +4 -0
  37. build/lib/opencompass/configs/datasets/FinanceIQ/FinanceIQ_gen_e0e6b5.py +77 -0
  38. build/lib/opencompass/configs/datasets/FinanceIQ/FinanceIQ_ppl.py +4 -0
  39. build/lib/opencompass/configs/datasets/FinanceIQ/FinanceIQ_ppl_42b9bd.py +76 -0
  40. build/lib/opencompass/configs/datasets/GLUE_CoLA/GLUE_CoLA_ppl.py +4 -0
  41. build/lib/opencompass/configs/datasets/GLUE_CoLA/GLUE_CoLA_ppl_77d0df.py +50 -0
  42. build/lib/opencompass/configs/datasets/GLUE_MRPC/GLUE_MRPC_ppl.py +4 -0
  43. build/lib/opencompass/configs/datasets/GLUE_MRPC/GLUE_MRPC_ppl_96564c.py +51 -0
  44. build/lib/opencompass/configs/datasets/GLUE_QQP/GLUE_QQP_ppl.py +4 -0
  45. build/lib/opencompass/configs/datasets/GLUE_QQP/GLUE_QQP_ppl_250d00.py +51 -0
  46. build/lib/opencompass/configs/datasets/GaokaoBench/GaokaoBench_gen.py +4 -0
  47. build/lib/opencompass/configs/datasets/GaokaoBench/GaokaoBench_gen_5cfe9e.py +304 -0
  48. build/lib/opencompass/configs/datasets/GaokaoBench/GaokaoBench_mixed.py +4 -0
  49. build/lib/opencompass/configs/datasets/GaokaoBench/GaokaoBench_mixed_9af5ee.py +356 -0
  50. build/lib/opencompass/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_4c31db.py +45 -0
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_freqdist.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ import nltk
2
+
3
+
4
+ def test_iterating_returns_an_iterator_ordered_by_frequency():
5
+ samples = ["one", "two", "two"]
6
+ distribution = nltk.FreqDist(samples)
7
+ assert list(distribution) == ["two", "one"]
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_metrics.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+
3
+ from nltk.metrics import (
4
+ BigramAssocMeasures,
5
+ QuadgramAssocMeasures,
6
+ TrigramAssocMeasures,
7
+ )
8
+
9
+ ## Test the likelihood ratio metric
10
+
11
+ _DELTA = 1e-8
12
+
13
+
14
+ class TestLikelihoodRatio(unittest.TestCase):
15
+ def test_lr_bigram(self):
16
+ self.assertAlmostEqual(
17
+ BigramAssocMeasures.likelihood_ratio(2, (4, 4), 20),
18
+ 2.4142743368419755,
19
+ delta=_DELTA,
20
+ )
21
+ self.assertAlmostEqual(
22
+ BigramAssocMeasures.likelihood_ratio(1, (1, 1), 1), 0.0, delta=_DELTA
23
+ )
24
+ self.assertRaises(
25
+ ValueError,
26
+ BigramAssocMeasures.likelihood_ratio,
27
+ *(0, (2, 2), 2),
28
+ )
29
+
30
+ def test_lr_trigram(self):
31
+ self.assertAlmostEqual(
32
+ TrigramAssocMeasures.likelihood_ratio(1, (1, 1, 1), (1, 1, 1), 2),
33
+ 5.545177444479562,
34
+ delta=_DELTA,
35
+ )
36
+ self.assertAlmostEqual(
37
+ TrigramAssocMeasures.likelihood_ratio(1, (1, 1, 1), (1, 1, 1), 1),
38
+ 0.0,
39
+ delta=_DELTA,
40
+ )
41
+ self.assertRaises(
42
+ ValueError,
43
+ TrigramAssocMeasures.likelihood_ratio,
44
+ *(1, (1, 1, 2), (1, 1, 2), 2),
45
+ )
46
+
47
+ def test_lr_quadgram(self):
48
+ self.assertAlmostEqual(
49
+ QuadgramAssocMeasures.likelihood_ratio(
50
+ 1, (1, 1, 1, 1), (1, 1, 1, 1, 1, 1), (1, 1, 1, 1), 2
51
+ ),
52
+ 8.317766166719343,
53
+ delta=_DELTA,
54
+ )
55
+ self.assertAlmostEqual(
56
+ QuadgramAssocMeasures.likelihood_ratio(
57
+ 1, (1, 1, 1, 1), (1, 1, 1, 1, 1, 1), (1, 1, 1, 1), 1
58
+ ),
59
+ 0.0,
60
+ delta=_DELTA,
61
+ )
62
+ self.assertRaises(
63
+ ValueError,
64
+ QuadgramAssocMeasures.likelihood_ratio,
65
+ *(1, (1, 1, 1, 1), (1, 1, 1, 1, 1, 2), (1, 1, 1, 1), 1),
66
+ )
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_naivebayes.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+
3
+ from nltk.classify.naivebayes import NaiveBayesClassifier
4
+
5
+
6
+ class NaiveBayesClassifierTest(unittest.TestCase):
7
+ def test_simple(self):
8
+ training_features = [
9
+ ({"nice": True, "good": True}, "positive"),
10
+ ({"bad": True, "mean": True}, "negative"),
11
+ ]
12
+
13
+ classifier = NaiveBayesClassifier.train(training_features)
14
+
15
+ result = classifier.prob_classify({"nice": True})
16
+ self.assertTrue(result.prob("positive") > result.prob("negative"))
17
+ self.assertEqual(result.max(), "positive")
18
+
19
+ result = classifier.prob_classify({"bad": True})
20
+ self.assertTrue(result.prob("positive") < result.prob("negative"))
21
+ self.assertEqual(result.max(), "negative")
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_nombank.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Unit tests for nltk.corpus.nombank
3
+ """
4
+
5
+ import unittest
6
+
7
+ from nltk.corpus import nombank
8
+
9
+ # Load the nombank once.
10
+ nombank.nouns()
11
+
12
+
13
+ class NombankDemo(unittest.TestCase):
14
+ def test_numbers(self):
15
+ # No. of instances.
16
+ self.assertEqual(len(nombank.instances()), 114574)
17
+ # No. of rolesets
18
+ self.assertEqual(len(nombank.rolesets()), 5577)
19
+ # No. of nouns.
20
+ self.assertEqual(len(nombank.nouns()), 4704)
21
+
22
+ def test_instance(self):
23
+ self.assertEqual(nombank.instances()[0].roleset, "perc-sign.01")
24
+
25
+ def test_framefiles_fileids(self):
26
+ self.assertEqual(len(nombank.fileids()), 4705)
27
+ self.assertTrue(all(fileid.endswith(".xml") for fileid in nombank.fileids()))
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_pl196x.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+
3
+ import nltk
4
+ from nltk.corpus.reader import pl196x
5
+
6
+
7
+ class TestCorpusViews(unittest.TestCase):
8
+ def test_corpus_reader(self):
9
+ pl196x_dir = nltk.data.find("corpora/pl196x")
10
+ pl = pl196x.Pl196xCorpusReader(
11
+ pl196x_dir, r".*\.xml", textids="textids.txt", cat_file="cats.txt"
12
+ )
13
+ pl.tagged_words(fileids=pl.fileids(), categories="cats.txt")
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_pos_tag.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tests for nltk.pos_tag
3
+ """
4
+
5
+
6
+ import unittest
7
+
8
+ from nltk import pos_tag, word_tokenize
9
+
10
+
11
+ class TestPosTag(unittest.TestCase):
12
+ def test_pos_tag_eng(self):
13
+ text = "John's big idea isn't all that bad."
14
+ expected_tagged = [
15
+ ("John", "NNP"),
16
+ ("'s", "POS"),
17
+ ("big", "JJ"),
18
+ ("idea", "NN"),
19
+ ("is", "VBZ"),
20
+ ("n't", "RB"),
21
+ ("all", "PDT"),
22
+ ("that", "DT"),
23
+ ("bad", "JJ"),
24
+ (".", "."),
25
+ ]
26
+ assert pos_tag(word_tokenize(text)) == expected_tagged
27
+
28
+ def test_pos_tag_eng_universal(self):
29
+ text = "John's big idea isn't all that bad."
30
+ expected_tagged = [
31
+ ("John", "NOUN"),
32
+ ("'s", "PRT"),
33
+ ("big", "ADJ"),
34
+ ("idea", "NOUN"),
35
+ ("is", "VERB"),
36
+ ("n't", "ADV"),
37
+ ("all", "DET"),
38
+ ("that", "DET"),
39
+ ("bad", "ADJ"),
40
+ (".", "."),
41
+ ]
42
+ assert pos_tag(word_tokenize(text), tagset="universal") == expected_tagged
43
+
44
+ def test_pos_tag_rus(self):
45
+ text = "Илья оторопел и дважды перечитал бумажку."
46
+ expected_tagged = [
47
+ ("Илья", "S"),
48
+ ("оторопел", "V"),
49
+ ("и", "CONJ"),
50
+ ("дважды", "ADV"),
51
+ ("перечитал", "V"),
52
+ ("бумажку", "S"),
53
+ (".", "NONLEX"),
54
+ ]
55
+ assert pos_tag(word_tokenize(text), lang="rus") == expected_tagged
56
+
57
+ def test_pos_tag_rus_universal(self):
58
+ text = "Илья оторопел и дважды перечитал бумажку."
59
+ expected_tagged = [
60
+ ("Илья", "NOUN"),
61
+ ("оторопел", "VERB"),
62
+ ("и", "CONJ"),
63
+ ("дважды", "ADV"),
64
+ ("перечитал", "VERB"),
65
+ ("бумажку", "NOUN"),
66
+ (".", "."),
67
+ ]
68
+ assert (
69
+ pos_tag(word_tokenize(text), tagset="universal", lang="rus")
70
+ == expected_tagged
71
+ )
72
+
73
+ def test_pos_tag_unknown_lang(self):
74
+ text = "모르겠 습니 다"
75
+ self.assertRaises(NotImplementedError, pos_tag, word_tokenize(text), lang="kor")
76
+ # Test for default kwarg, `lang=None`
77
+ self.assertRaises(NotImplementedError, pos_tag, word_tokenize(text), lang=None)
78
+
79
+ def test_unspecified_lang(self):
80
+ # Tries to force the lang='eng' option.
81
+ text = "모르겠 습니 다"
82
+ expected_but_wrong = [("모르겠", "JJ"), ("습니", "NNP"), ("다", "NN")]
83
+ assert pos_tag(word_tokenize(text)) == expected_but_wrong
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_ribes.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from nltk.translate.ribes_score import corpus_ribes, word_rank_alignment
2
+
3
+
4
+ def test_ribes_empty_worder(): # worder as in word order
5
+ # Verifies that these two sentences have no alignment,
6
+ # and hence have the lowest possible RIBES score.
7
+ hyp = "This is a nice sentence which I quite like".split()
8
+ ref = "Okay well that's neat and all but the reference's different".split()
9
+
10
+ assert word_rank_alignment(ref, hyp) == []
11
+
12
+ list_of_refs = [[ref]]
13
+ hypotheses = [hyp]
14
+ assert corpus_ribes(list_of_refs, hypotheses) == 0.0
15
+
16
+
17
+ def test_ribes_one_worder():
18
+ # Verifies that these two sentences have just one match,
19
+ # and the RIBES score for this sentence with very little
20
+ # correspondence is 0.
21
+ hyp = "This is a nice sentence which I quite like".split()
22
+ ref = "Okay well that's nice and all but the reference's different".split()
23
+
24
+ assert word_rank_alignment(ref, hyp) == [3]
25
+
26
+ list_of_refs = [[ref]]
27
+ hypotheses = [hyp]
28
+ assert corpus_ribes(list_of_refs, hypotheses) == 0.0
29
+
30
+
31
+ def test_ribes_two_worder():
32
+ # Verifies that these two sentences have two matches,
33
+ # but still get the lowest possible RIBES score due
34
+ # to the lack of similarity.
35
+ hyp = "This is a nice sentence which I quite like".split()
36
+ ref = "Okay well that's nice and all but the reference is different".split()
37
+
38
+ assert word_rank_alignment(ref, hyp) == [9, 3]
39
+
40
+ list_of_refs = [[ref]]
41
+ hypotheses = [hyp]
42
+ assert corpus_ribes(list_of_refs, hypotheses) == 0.0
43
+
44
+
45
+ def test_ribes():
46
+ # Based on the doctest of the corpus_ribes function
47
+ hyp1 = [
48
+ "It",
49
+ "is",
50
+ "a",
51
+ "guide",
52
+ "to",
53
+ "action",
54
+ "which",
55
+ "ensures",
56
+ "that",
57
+ "the",
58
+ "military",
59
+ "always",
60
+ "obeys",
61
+ "the",
62
+ "commands",
63
+ "of",
64
+ "the",
65
+ "party",
66
+ ]
67
+ ref1a = [
68
+ "It",
69
+ "is",
70
+ "a",
71
+ "guide",
72
+ "to",
73
+ "action",
74
+ "that",
75
+ "ensures",
76
+ "that",
77
+ "the",
78
+ "military",
79
+ "will",
80
+ "forever",
81
+ "heed",
82
+ "Party",
83
+ "commands",
84
+ ]
85
+ ref1b = [
86
+ "It",
87
+ "is",
88
+ "the",
89
+ "guiding",
90
+ "principle",
91
+ "which",
92
+ "guarantees",
93
+ "the",
94
+ "military",
95
+ "forces",
96
+ "always",
97
+ "being",
98
+ "under",
99
+ "the",
100
+ "command",
101
+ "of",
102
+ "the",
103
+ "Party",
104
+ ]
105
+ ref1c = [
106
+ "It",
107
+ "is",
108
+ "the",
109
+ "practical",
110
+ "guide",
111
+ "for",
112
+ "the",
113
+ "army",
114
+ "always",
115
+ "to",
116
+ "heed",
117
+ "the",
118
+ "directions",
119
+ "of",
120
+ "the",
121
+ "party",
122
+ ]
123
+
124
+ hyp2 = [
125
+ "he",
126
+ "read",
127
+ "the",
128
+ "book",
129
+ "because",
130
+ "he",
131
+ "was",
132
+ "interested",
133
+ "in",
134
+ "world",
135
+ "history",
136
+ ]
137
+ ref2a = [
138
+ "he",
139
+ "was",
140
+ "interested",
141
+ "in",
142
+ "world",
143
+ "history",
144
+ "because",
145
+ "he",
146
+ "read",
147
+ "the",
148
+ "book",
149
+ ]
150
+
151
+ list_of_refs = [[ref1a, ref1b, ref1c], [ref2a]]
152
+ hypotheses = [hyp1, hyp2]
153
+
154
+ score = corpus_ribes(list_of_refs, hypotheses)
155
+
156
+ assert round(score, 4) == 0.3597
157
+
158
+
159
+ def test_no_zero_div():
160
+ # Regression test for Issue 2529, assure that no ZeroDivisionError is thrown.
161
+ hyp1 = [
162
+ "It",
163
+ "is",
164
+ "a",
165
+ "guide",
166
+ "to",
167
+ "action",
168
+ "which",
169
+ "ensures",
170
+ "that",
171
+ "the",
172
+ "military",
173
+ "always",
174
+ "obeys",
175
+ "the",
176
+ "commands",
177
+ "of",
178
+ "the",
179
+ "party",
180
+ ]
181
+ ref1a = [
182
+ "It",
183
+ "is",
184
+ "a",
185
+ "guide",
186
+ "to",
187
+ "action",
188
+ "that",
189
+ "ensures",
190
+ "that",
191
+ "the",
192
+ "military",
193
+ "will",
194
+ "forever",
195
+ "heed",
196
+ "Party",
197
+ "commands",
198
+ ]
199
+ ref1b = [
200
+ "It",
201
+ "is",
202
+ "the",
203
+ "guiding",
204
+ "principle",
205
+ "which",
206
+ "guarantees",
207
+ "the",
208
+ "military",
209
+ "forces",
210
+ "always",
211
+ "being",
212
+ "under",
213
+ "the",
214
+ "command",
215
+ "of",
216
+ "the",
217
+ "Party",
218
+ ]
219
+ ref1c = [
220
+ "It",
221
+ "is",
222
+ "the",
223
+ "practical",
224
+ "guide",
225
+ "for",
226
+ "the",
227
+ "army",
228
+ "always",
229
+ "to",
230
+ "heed",
231
+ "the",
232
+ "directions",
233
+ "of",
234
+ "the",
235
+ "party",
236
+ ]
237
+
238
+ hyp2 = ["he", "read", "the"]
239
+ ref2a = ["he", "was", "interested", "in", "world", "history", "because", "he"]
240
+
241
+ list_of_refs = [[ref1a, ref1b, ref1c], [ref2a]]
242
+ hypotheses = [hyp1, hyp2]
243
+
244
+ score = corpus_ribes(list_of_refs, hypotheses)
245
+
246
+ assert round(score, 4) == 0.1688
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_rte_classify.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+
3
+ from nltk import config_megam
4
+ from nltk.classify.rte_classify import RTEFeatureExtractor, rte_classifier, rte_features
5
+ from nltk.corpus import rte as rte_corpus
6
+
7
+ expected_from_rte_feature_extration = """
8
+ alwayson => True
9
+ ne_hyp_extra => 0
10
+ ne_overlap => 1
11
+ neg_hyp => 0
12
+ neg_txt => 0
13
+ word_hyp_extra => 3
14
+ word_overlap => 3
15
+
16
+ alwayson => True
17
+ ne_hyp_extra => 0
18
+ ne_overlap => 1
19
+ neg_hyp => 0
20
+ neg_txt => 0
21
+ word_hyp_extra => 2
22
+ word_overlap => 1
23
+
24
+ alwayson => True
25
+ ne_hyp_extra => 1
26
+ ne_overlap => 1
27
+ neg_hyp => 0
28
+ neg_txt => 0
29
+ word_hyp_extra => 1
30
+ word_overlap => 2
31
+
32
+ alwayson => True
33
+ ne_hyp_extra => 1
34
+ ne_overlap => 0
35
+ neg_hyp => 0
36
+ neg_txt => 0
37
+ word_hyp_extra => 6
38
+ word_overlap => 2
39
+
40
+ alwayson => True
41
+ ne_hyp_extra => 1
42
+ ne_overlap => 0
43
+ neg_hyp => 0
44
+ neg_txt => 0
45
+ word_hyp_extra => 4
46
+ word_overlap => 0
47
+
48
+ alwayson => True
49
+ ne_hyp_extra => 1
50
+ ne_overlap => 0
51
+ neg_hyp => 0
52
+ neg_txt => 0
53
+ word_hyp_extra => 3
54
+ word_overlap => 1
55
+ """
56
+
57
+
58
+ class TestRTEClassifier:
59
+ # Test the feature extraction method.
60
+ def test_rte_feature_extraction(self):
61
+ pairs = rte_corpus.pairs(["rte1_dev.xml"])[:6]
62
+ test_output = [
63
+ f"{key:<15} => {rte_features(pair)[key]}"
64
+ for pair in pairs
65
+ for key in sorted(rte_features(pair))
66
+ ]
67
+ expected_output = expected_from_rte_feature_extration.strip().split("\n")
68
+ # Remove null strings.
69
+ expected_output = list(filter(None, expected_output))
70
+ assert test_output == expected_output
71
+
72
+ # Test the RTEFeatureExtractor object.
73
+ def test_feature_extractor_object(self):
74
+ rtepair = rte_corpus.pairs(["rte3_dev.xml"])[33]
75
+ extractor = RTEFeatureExtractor(rtepair)
76
+
77
+ assert extractor.hyp_words == {"member", "China", "SCO."}
78
+ assert extractor.overlap("word") == set()
79
+ assert extractor.overlap("ne") == {"China"}
80
+ assert extractor.hyp_extra("word") == {"member"}
81
+
82
+ # Test the RTE classifier training.
83
+ def test_rte_classification_without_megam(self):
84
+ # Use a sample size for unit testing, since we
85
+ # don't need to fully train these classifiers
86
+ clf = rte_classifier("IIS", sample_N=100)
87
+ clf = rte_classifier("GIS", sample_N=100)
88
+
89
+ def test_rte_classification_with_megam(self):
90
+ try:
91
+ config_megam()
92
+ except (LookupError, AttributeError) as e:
93
+ pytest.skip("Skipping tests with dependencies on MEGAM")
94
+ clf = rte_classifier("megam", sample_N=100)
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_seekable_unicode_stream_reader.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from io import BytesIO
3
+
4
+ import pytest
5
+
6
+ from nltk.corpus.reader import SeekableUnicodeStreamReader
7
+
8
+
9
+ def check_reader(unicode_string, encoding):
10
+ bytestr = unicode_string.encode(encoding)
11
+ stream = BytesIO(bytestr)
12
+ reader = SeekableUnicodeStreamReader(stream, encoding)
13
+
14
+ # Should open at the start of the file
15
+ assert reader.tell() == 0
16
+
17
+ # Compare original string to contents from `.readlines()`
18
+ assert unicode_string == "".join(reader.readlines())
19
+
20
+ # Should be at the end of the file now
21
+ stream.seek(0, os.SEEK_END)
22
+ assert reader.tell() == stream.tell()
23
+
24
+ reader.seek(0) # go back to start
25
+
26
+ # Compare original string to contents from `.read()`
27
+ contents = ""
28
+ char = None
29
+ while char != "":
30
+ char = reader.read(1)
31
+ contents += char
32
+ assert unicode_string == contents
33
+
34
+
35
+ # Call `check_reader` with a variety of input strings and encodings.
36
+ ENCODINGS = ["ascii", "latin1", "greek", "hebrew", "utf-16", "utf-8"]
37
+
38
+ STRINGS = [
39
+ """
40
+ This is a test file.
41
+ It is fairly short.
42
+ """,
43
+ "This file can be encoded with latin1. \x83",
44
+ """\
45
+ This is a test file.
46
+ Here's a blank line:
47
+
48
+ And here's some unicode: \xee \u0123 \uffe3
49
+ """,
50
+ """\
51
+ This is a test file.
52
+ Unicode characters: \xf3 \u2222 \u3333\u4444 \u5555
53
+ """,
54
+ """\
55
+ This is a larger file. It has some lines that are longer \
56
+ than 72 characters. It's got lots of repetition. Here's \
57
+ some unicode chars: \xee \u0123 \uffe3 \ueeee \u2345
58
+
59
+ How fun! Let's repeat it twenty times.
60
+ """
61
+ * 20,
62
+ ]
63
+
64
+
65
+ @pytest.mark.parametrize("string", STRINGS)
66
+ def test_reader(string):
67
+ for encoding in ENCODINGS:
68
+ # skip strings that can't be encoded with the current encoding
69
+ try:
70
+ string.encode(encoding)
71
+ except UnicodeEncodeError:
72
+ continue
73
+ check_reader(string, encoding)
74
+
75
+
76
+ def test_reader_stream_closes_when_deleted():
77
+ reader = SeekableUnicodeStreamReader(BytesIO(b""), "ascii")
78
+ assert not reader.stream.closed
79
+ reader.__del__()
80
+ assert reader.stream.closed
81
+
82
+
83
+ def teardown_module(module=None):
84
+ import gc
85
+
86
+ gc.collect()
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_senna.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Unit tests for Senna
3
+ """
4
+
5
+ import unittest
6
+ from os import environ, path, sep
7
+
8
+ from nltk.classify import Senna
9
+ from nltk.tag import SennaChunkTagger, SennaNERTagger, SennaTagger
10
+
11
+ # Set Senna executable path for tests if it is not specified as an environment variable
12
+ if "SENNA" in environ:
13
+ SENNA_EXECUTABLE_PATH = path.normpath(environ["SENNA"]) + sep
14
+ else:
15
+ SENNA_EXECUTABLE_PATH = "/usr/share/senna-v3.0"
16
+
17
+ senna_is_installed = path.exists(SENNA_EXECUTABLE_PATH)
18
+
19
+
20
+ @unittest.skipUnless(senna_is_installed, "Requires Senna executable")
21
+ class TestSennaPipeline(unittest.TestCase):
22
+ """Unittest for nltk.classify.senna"""
23
+
24
+ def test_senna_pipeline(self):
25
+ """Senna pipeline interface"""
26
+
27
+ pipeline = Senna(SENNA_EXECUTABLE_PATH, ["pos", "chk", "ner"])
28
+ sent = "Dusseldorf is an international business center".split()
29
+ result = [
30
+ (token["word"], token["chk"], token["ner"], token["pos"])
31
+ for token in pipeline.tag(sent)
32
+ ]
33
+ expected = [
34
+ ("Dusseldorf", "B-NP", "B-LOC", "NNP"),
35
+ ("is", "B-VP", "O", "VBZ"),
36
+ ("an", "B-NP", "O", "DT"),
37
+ ("international", "I-NP", "O", "JJ"),
38
+ ("business", "I-NP", "O", "NN"),
39
+ ("center", "I-NP", "O", "NN"),
40
+ ]
41
+ self.assertEqual(result, expected)
42
+
43
+
44
+ @unittest.skipUnless(senna_is_installed, "Requires Senna executable")
45
+ class TestSennaTagger(unittest.TestCase):
46
+ """Unittest for nltk.tag.senna"""
47
+
48
+ def test_senna_tagger(self):
49
+ tagger = SennaTagger(SENNA_EXECUTABLE_PATH)
50
+ result = tagger.tag("What is the airspeed of an unladen swallow ?".split())
51
+ expected = [
52
+ ("What", "WP"),
53
+ ("is", "VBZ"),
54
+ ("the", "DT"),
55
+ ("airspeed", "NN"),
56
+ ("of", "IN"),
57
+ ("an", "DT"),
58
+ ("unladen", "NN"),
59
+ ("swallow", "NN"),
60
+ ("?", "."),
61
+ ]
62
+ self.assertEqual(result, expected)
63
+
64
+ def test_senna_chunk_tagger(self):
65
+ chktagger = SennaChunkTagger(SENNA_EXECUTABLE_PATH)
66
+ result_1 = chktagger.tag("What is the airspeed of an unladen swallow ?".split())
67
+ expected_1 = [
68
+ ("What", "B-NP"),
69
+ ("is", "B-VP"),
70
+ ("the", "B-NP"),
71
+ ("airspeed", "I-NP"),
72
+ ("of", "B-PP"),
73
+ ("an", "B-NP"),
74
+ ("unladen", "I-NP"),
75
+ ("swallow", "I-NP"),
76
+ ("?", "O"),
77
+ ]
78
+
79
+ result_2 = list(chktagger.bio_to_chunks(result_1, chunk_type="NP"))
80
+ expected_2 = [
81
+ ("What", "0"),
82
+ ("the airspeed", "2-3"),
83
+ ("an unladen swallow", "5-6-7"),
84
+ ]
85
+ self.assertEqual(result_1, expected_1)
86
+ self.assertEqual(result_2, expected_2)
87
+
88
+ def test_senna_ner_tagger(self):
89
+ nertagger = SennaNERTagger(SENNA_EXECUTABLE_PATH)
90
+ result_1 = nertagger.tag("Shakespeare theatre was in London .".split())
91
+ expected_1 = [
92
+ ("Shakespeare", "B-PER"),
93
+ ("theatre", "O"),
94
+ ("was", "O"),
95
+ ("in", "O"),
96
+ ("London", "B-LOC"),
97
+ (".", "O"),
98
+ ]
99
+
100
+ result_2 = nertagger.tag("UN headquarters are in NY , USA .".split())
101
+ expected_2 = [
102
+ ("UN", "B-ORG"),
103
+ ("headquarters", "O"),
104
+ ("are", "O"),
105
+ ("in", "O"),
106
+ ("NY", "B-LOC"),
107
+ (",", "O"),
108
+ ("USA", "B-LOC"),
109
+ (".", "O"),
110
+ ]
111
+ self.assertEqual(result_1, expected_1)
112
+ self.assertEqual(result_2, expected_2)
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_stem.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ from contextlib import closing
3
+
4
+ from nltk import data
5
+ from nltk.stem.porter import PorterStemmer
6
+ from nltk.stem.snowball import SnowballStemmer
7
+
8
+
9
+ class SnowballTest(unittest.TestCase):
10
+ def test_arabic(self):
11
+ """
12
+ this unit testing for test the snowball arabic light stemmer
13
+ this stemmer deals with prefixes and suffixes
14
+ """
15
+ # Test where the ignore_stopwords=True.
16
+ ar_stemmer = SnowballStemmer("arabic", True)
17
+ assert ar_stemmer.stem("الْعَرَبِــــــيَّة") == "عرب"
18
+ assert ar_stemmer.stem("العربية") == "عرب"
19
+ assert ar_stemmer.stem("فقالوا") == "قال"
20
+ assert ar_stemmer.stem("الطالبات") == "طالب"
21
+ assert ar_stemmer.stem("فالطالبات") == "طالب"
22
+ assert ar_stemmer.stem("والطالبات") == "طالب"
23
+ assert ar_stemmer.stem("الطالبون") == "طالب"
24
+ assert ar_stemmer.stem("اللذان") == "اللذان"
25
+ assert ar_stemmer.stem("من") == "من"
26
+ # Test where the ignore_stopwords=False.
27
+ ar_stemmer = SnowballStemmer("arabic", False)
28
+ assert ar_stemmer.stem("اللذان") == "اللذ" # this is a stop word
29
+ assert ar_stemmer.stem("الطالبات") == "طالب"
30
+ assert ar_stemmer.stem("الكلمات") == "كلم"
31
+ # test where create the arabic stemmer without given init value to ignore_stopwords
32
+ ar_stemmer = SnowballStemmer("arabic")
33
+ assert ar_stemmer.stem("الْعَرَبِــــــيَّة") == "عرب"
34
+ assert ar_stemmer.stem("العربية") == "عرب"
35
+ assert ar_stemmer.stem("فقالوا") == "قال"
36
+ assert ar_stemmer.stem("الطالبات") == "طالب"
37
+ assert ar_stemmer.stem("الكلمات") == "كلم"
38
+
39
+ def test_russian(self):
40
+ stemmer_russian = SnowballStemmer("russian")
41
+ assert stemmer_russian.stem("авантненькая") == "авантненьк"
42
+
43
+ def test_german(self):
44
+ stemmer_german = SnowballStemmer("german")
45
+ stemmer_german2 = SnowballStemmer("german", ignore_stopwords=True)
46
+
47
+ assert stemmer_german.stem("Schr\xe4nke") == "schrank"
48
+ assert stemmer_german2.stem("Schr\xe4nke") == "schrank"
49
+
50
+ assert stemmer_german.stem("keinen") == "kein"
51
+ assert stemmer_german2.stem("keinen") == "keinen"
52
+
53
+ def test_spanish(self):
54
+ stemmer = SnowballStemmer("spanish")
55
+
56
+ assert stemmer.stem("Visionado") == "vision"
57
+
58
+ # The word 'algue' was raising an IndexError
59
+ assert stemmer.stem("algue") == "algu"
60
+
61
+ def test_short_strings_bug(self):
62
+ stemmer = SnowballStemmer("english")
63
+ assert stemmer.stem("y's") == "y"
64
+
65
+
66
+ class PorterTest(unittest.TestCase):
67
+ def _vocabulary(self):
68
+ with closing(
69
+ data.find("stemmers/porter_test/porter_vocabulary.txt").open(
70
+ encoding="utf-8"
71
+ )
72
+ ) as fp:
73
+ return fp.read().splitlines()
74
+
75
+ def _test_against_expected_output(self, stemmer_mode, expected_stems):
76
+ stemmer = PorterStemmer(mode=stemmer_mode)
77
+ for word, true_stem in zip(self._vocabulary(), expected_stems):
78
+ our_stem = stemmer.stem(word)
79
+ assert (
80
+ our_stem == true_stem
81
+ ), "{} should stem to {} in {} mode but got {}".format(
82
+ word,
83
+ true_stem,
84
+ stemmer_mode,
85
+ our_stem,
86
+ )
87
+
88
+ def test_vocabulary_martin_mode(self):
89
+ """Tests all words from the test vocabulary provided by M Porter
90
+
91
+ The sample vocabulary and output were sourced from
92
+ https://tartarus.org/martin/PorterStemmer/voc.txt and
93
+ https://tartarus.org/martin/PorterStemmer/output.txt
94
+ and are linked to from the Porter Stemmer algorithm's homepage
95
+ at https://tartarus.org/martin/PorterStemmer/
96
+ """
97
+ with closing(
98
+ data.find("stemmers/porter_test/porter_martin_output.txt").open(
99
+ encoding="utf-8"
100
+ )
101
+ ) as fp:
102
+ self._test_against_expected_output(
103
+ PorterStemmer.MARTIN_EXTENSIONS, fp.read().splitlines()
104
+ )
105
+
106
+ def test_vocabulary_nltk_mode(self):
107
+ with closing(
108
+ data.find("stemmers/porter_test/porter_nltk_output.txt").open(
109
+ encoding="utf-8"
110
+ )
111
+ ) as fp:
112
+ self._test_against_expected_output(
113
+ PorterStemmer.NLTK_EXTENSIONS, fp.read().splitlines()
114
+ )
115
+
116
+ def test_vocabulary_original_mode(self):
117
+ # The list of stems for this test was generated by taking the
118
+ # Martin-blessed stemmer from
119
+ # https://tartarus.org/martin/PorterStemmer/c.txt
120
+ # and removing all the --DEPARTURE-- sections from it and
121
+ # running it against Martin's test vocabulary.
122
+
123
+ with closing(
124
+ data.find("stemmers/porter_test/porter_original_output.txt").open(
125
+ encoding="utf-8"
126
+ )
127
+ ) as fp:
128
+ self._test_against_expected_output(
129
+ PorterStemmer.ORIGINAL_ALGORITHM, fp.read().splitlines()
130
+ )
131
+
132
+ self._test_against_expected_output(
133
+ PorterStemmer.ORIGINAL_ALGORITHM,
134
+ data.find("stemmers/porter_test/porter_original_output.txt")
135
+ .open(encoding="utf-8")
136
+ .read()
137
+ .splitlines(),
138
+ )
139
+
140
+ def test_oed_bug(self):
141
+ """Test for bug https://github.com/nltk/nltk/issues/1581
142
+
143
+ Ensures that 'oed' can be stemmed without throwing an error.
144
+ """
145
+ assert PorterStemmer().stem("oed") == "o"
146
+
147
+ def test_lowercase_option(self):
148
+ """Test for improvement on https://github.com/nltk/nltk/issues/2507
149
+
150
+ Ensures that stems are lowercased when `to_lowercase=True`
151
+ """
152
+ porter = PorterStemmer()
153
+ assert porter.stem("On") == "on"
154
+ assert porter.stem("I") == "i"
155
+ assert porter.stem("I", to_lowercase=False) == "I"
156
+ assert porter.stem("Github") == "github"
157
+ assert porter.stem("Github", to_lowercase=False) == "Github"
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_tag.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ def test_basic():
2
+ from nltk.tag import pos_tag
3
+ from nltk.tokenize import word_tokenize
4
+
5
+ result = pos_tag(word_tokenize("John's big idea isn't all that bad."))
6
+ assert result == [
7
+ ("John", "NNP"),
8
+ ("'s", "POS"),
9
+ ("big", "JJ"),
10
+ ("idea", "NN"),
11
+ ("is", "VBZ"),
12
+ ("n't", "RB"),
13
+ ("all", "PDT"),
14
+ ("that", "DT"),
15
+ ("bad", "JJ"),
16
+ (".", "."),
17
+ ]
18
+
19
+
20
+ def setup_module(module):
21
+ import pytest
22
+
23
+ pytest.importorskip("numpy")
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_tgrep.py ADDED
@@ -0,0 +1,780 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ #
3
+ # Natural Language Toolkit: TGrep search
4
+ #
5
+ # Copyright (C) 2001-2022 NLTK Project
6
+ # Author: Will Roberts <wildwilhelm@gmail.com>
7
+ # URL: <https://www.nltk.org/>
8
+ # For license information, see LICENSE.TXT
9
+
10
+ """
11
+ Unit tests for nltk.tgrep.
12
+ """
13
+
14
+
15
+ import unittest
16
+
17
+ from nltk import tgrep
18
+ from nltk.tree import ParentedTree
19
+
20
+
21
+ class TestSequenceFunctions(unittest.TestCase):
22
+
23
+ """
24
+ Class containing unit tests for nltk.tgrep.
25
+ """
26
+
27
+ def test_tokenize_simple(self):
28
+ """
29
+ Simple test of tokenization.
30
+ """
31
+ tokens = tgrep.tgrep_tokenize("A .. (B !< C . D) | ![<< (E , F) $ G]")
32
+ self.assertEqual(
33
+ tokens,
34
+ [
35
+ "A",
36
+ "..",
37
+ "(",
38
+ "B",
39
+ "!",
40
+ "<",
41
+ "C",
42
+ ".",
43
+ "D",
44
+ ")",
45
+ "|",
46
+ "!",
47
+ "[",
48
+ "<<",
49
+ "(",
50
+ "E",
51
+ ",",
52
+ "F",
53
+ ")",
54
+ "$",
55
+ "G",
56
+ "]",
57
+ ],
58
+ )
59
+
60
+ def test_tokenize_encoding(self):
61
+ """
62
+ Test that tokenization handles bytes and strs the same way.
63
+ """
64
+ self.assertEqual(
65
+ tgrep.tgrep_tokenize(b"A .. (B !< C . D) | ![<< (E , F) $ G]"),
66
+ tgrep.tgrep_tokenize("A .. (B !< C . D) | ![<< (E , F) $ G]"),
67
+ )
68
+
69
+ def test_tokenize_link_types(self):
70
+ """
71
+ Test tokenization of basic link types.
72
+ """
73
+ self.assertEqual(tgrep.tgrep_tokenize("A<B"), ["A", "<", "B"])
74
+ self.assertEqual(tgrep.tgrep_tokenize("A>B"), ["A", ">", "B"])
75
+ self.assertEqual(tgrep.tgrep_tokenize("A<3B"), ["A", "<3", "B"])
76
+ self.assertEqual(tgrep.tgrep_tokenize("A>3B"), ["A", ">3", "B"])
77
+ self.assertEqual(tgrep.tgrep_tokenize("A<,B"), ["A", "<,", "B"])
78
+ self.assertEqual(tgrep.tgrep_tokenize("A>,B"), ["A", ">,", "B"])
79
+ self.assertEqual(tgrep.tgrep_tokenize("A<-3B"), ["A", "<-3", "B"])
80
+ self.assertEqual(tgrep.tgrep_tokenize("A>-3B"), ["A", ">-3", "B"])
81
+ self.assertEqual(tgrep.tgrep_tokenize("A<-B"), ["A", "<-", "B"])
82
+ self.assertEqual(tgrep.tgrep_tokenize("A>-B"), ["A", ">-", "B"])
83
+ self.assertEqual(tgrep.tgrep_tokenize("A<'B"), ["A", "<'", "B"])
84
+ self.assertEqual(tgrep.tgrep_tokenize("A>'B"), ["A", ">'", "B"])
85
+ self.assertEqual(tgrep.tgrep_tokenize("A<:B"), ["A", "<:", "B"])
86
+ self.assertEqual(tgrep.tgrep_tokenize("A>:B"), ["A", ">:", "B"])
87
+ self.assertEqual(tgrep.tgrep_tokenize("A<<B"), ["A", "<<", "B"])
88
+ self.assertEqual(tgrep.tgrep_tokenize("A>>B"), ["A", ">>", "B"])
89
+ self.assertEqual(tgrep.tgrep_tokenize("A<<,B"), ["A", "<<,", "B"])
90
+ self.assertEqual(tgrep.tgrep_tokenize("A>>,B"), ["A", ">>,", "B"])
91
+ self.assertEqual(tgrep.tgrep_tokenize("A<<'B"), ["A", "<<'", "B"])
92
+ self.assertEqual(tgrep.tgrep_tokenize("A>>'B"), ["A", ">>'", "B"])
93
+ self.assertEqual(tgrep.tgrep_tokenize("A<<:B"), ["A", "<<:", "B"])
94
+ self.assertEqual(tgrep.tgrep_tokenize("A>>:B"), ["A", ">>:", "B"])
95
+ self.assertEqual(tgrep.tgrep_tokenize("A.B"), ["A", ".", "B"])
96
+ self.assertEqual(tgrep.tgrep_tokenize("A,B"), ["A", ",", "B"])
97
+ self.assertEqual(tgrep.tgrep_tokenize("A..B"), ["A", "..", "B"])
98
+ self.assertEqual(tgrep.tgrep_tokenize("A,,B"), ["A", ",,", "B"])
99
+ self.assertEqual(tgrep.tgrep_tokenize("A$B"), ["A", "$", "B"])
100
+ self.assertEqual(tgrep.tgrep_tokenize("A$.B"), ["A", "$.", "B"])
101
+ self.assertEqual(tgrep.tgrep_tokenize("A$,B"), ["A", "$,", "B"])
102
+ self.assertEqual(tgrep.tgrep_tokenize("A$..B"), ["A", "$..", "B"])
103
+ self.assertEqual(tgrep.tgrep_tokenize("A$,,B"), ["A", "$,,", "B"])
104
+ self.assertEqual(tgrep.tgrep_tokenize("A!<B"), ["A", "!", "<", "B"])
105
+ self.assertEqual(tgrep.tgrep_tokenize("A!>B"), ["A", "!", ">", "B"])
106
+ self.assertEqual(tgrep.tgrep_tokenize("A!<3B"), ["A", "!", "<3", "B"])
107
+ self.assertEqual(tgrep.tgrep_tokenize("A!>3B"), ["A", "!", ">3", "B"])
108
+ self.assertEqual(tgrep.tgrep_tokenize("A!<,B"), ["A", "!", "<,", "B"])
109
+ self.assertEqual(tgrep.tgrep_tokenize("A!>,B"), ["A", "!", ">,", "B"])
110
+ self.assertEqual(tgrep.tgrep_tokenize("A!<-3B"), ["A", "!", "<-3", "B"])
111
+ self.assertEqual(tgrep.tgrep_tokenize("A!>-3B"), ["A", "!", ">-3", "B"])
112
+ self.assertEqual(tgrep.tgrep_tokenize("A!<-B"), ["A", "!", "<-", "B"])
113
+ self.assertEqual(tgrep.tgrep_tokenize("A!>-B"), ["A", "!", ">-", "B"])
114
+ self.assertEqual(tgrep.tgrep_tokenize("A!<'B"), ["A", "!", "<'", "B"])
115
+ self.assertEqual(tgrep.tgrep_tokenize("A!>'B"), ["A", "!", ">'", "B"])
116
+ self.assertEqual(tgrep.tgrep_tokenize("A!<:B"), ["A", "!", "<:", "B"])
117
+ self.assertEqual(tgrep.tgrep_tokenize("A!>:B"), ["A", "!", ">:", "B"])
118
+ self.assertEqual(tgrep.tgrep_tokenize("A!<<B"), ["A", "!", "<<", "B"])
119
+ self.assertEqual(tgrep.tgrep_tokenize("A!>>B"), ["A", "!", ">>", "B"])
120
+ self.assertEqual(tgrep.tgrep_tokenize("A!<<,B"), ["A", "!", "<<,", "B"])
121
+ self.assertEqual(tgrep.tgrep_tokenize("A!>>,B"), ["A", "!", ">>,", "B"])
122
+ self.assertEqual(tgrep.tgrep_tokenize("A!<<'B"), ["A", "!", "<<'", "B"])
123
+ self.assertEqual(tgrep.tgrep_tokenize("A!>>'B"), ["A", "!", ">>'", "B"])
124
+ self.assertEqual(tgrep.tgrep_tokenize("A!<<:B"), ["A", "!", "<<:", "B"])
125
+ self.assertEqual(tgrep.tgrep_tokenize("A!>>:B"), ["A", "!", ">>:", "B"])
126
+ self.assertEqual(tgrep.tgrep_tokenize("A!.B"), ["A", "!", ".", "B"])
127
+ self.assertEqual(tgrep.tgrep_tokenize("A!,B"), ["A", "!", ",", "B"])
128
+ self.assertEqual(tgrep.tgrep_tokenize("A!..B"), ["A", "!", "..", "B"])
129
+ self.assertEqual(tgrep.tgrep_tokenize("A!,,B"), ["A", "!", ",,", "B"])
130
+ self.assertEqual(tgrep.tgrep_tokenize("A!$B"), ["A", "!", "$", "B"])
131
+ self.assertEqual(tgrep.tgrep_tokenize("A!$.B"), ["A", "!", "$.", "B"])
132
+ self.assertEqual(tgrep.tgrep_tokenize("A!$,B"), ["A", "!", "$,", "B"])
133
+ self.assertEqual(tgrep.tgrep_tokenize("A!$..B"), ["A", "!", "$..", "B"])
134
+ self.assertEqual(tgrep.tgrep_tokenize("A!$,,B"), ["A", "!", "$,,", "B"])
135
+
136
+ def test_tokenize_examples(self):
137
+ """
138
+ Test tokenization of the TGrep2 manual example patterns.
139
+ """
140
+ self.assertEqual(tgrep.tgrep_tokenize("NP < PP"), ["NP", "<", "PP"])
141
+ self.assertEqual(tgrep.tgrep_tokenize("/^NP/"), ["/^NP/"])
142
+ self.assertEqual(
143
+ tgrep.tgrep_tokenize("NP << PP . VP"), ["NP", "<<", "PP", ".", "VP"]
144
+ )
145
+ self.assertEqual(
146
+ tgrep.tgrep_tokenize("NP << PP | . VP"), ["NP", "<<", "PP", "|", ".", "VP"]
147
+ )
148
+ self.assertEqual(
149
+ tgrep.tgrep_tokenize("NP !<< PP [> NP | >> VP]"),
150
+ ["NP", "!", "<<", "PP", "[", ">", "NP", "|", ">>", "VP", "]"],
151
+ )
152
+ self.assertEqual(
153
+ tgrep.tgrep_tokenize("NP << (PP . VP)"),
154
+ ["NP", "<<", "(", "PP", ".", "VP", ")"],
155
+ )
156
+ self.assertEqual(
157
+ tgrep.tgrep_tokenize("NP <' (PP <, (IN < on))"),
158
+ ["NP", "<'", "(", "PP", "<,", "(", "IN", "<", "on", ")", ")"],
159
+ )
160
+ self.assertEqual(
161
+ tgrep.tgrep_tokenize("S < (A < B) < C"),
162
+ ["S", "<", "(", "A", "<", "B", ")", "<", "C"],
163
+ )
164
+ self.assertEqual(
165
+ tgrep.tgrep_tokenize("S < ((A < B) < C)"),
166
+ ["S", "<", "(", "(", "A", "<", "B", ")", "<", "C", ")"],
167
+ )
168
+ self.assertEqual(
169
+ tgrep.tgrep_tokenize("S < (A < B < C)"),
170
+ ["S", "<", "(", "A", "<", "B", "<", "C", ")"],
171
+ )
172
+ self.assertEqual(tgrep.tgrep_tokenize("A<B&.C"), ["A", "<", "B", "&", ".", "C"])
173
+
174
+ def test_tokenize_quoting(self):
175
+ """
176
+ Test tokenization of quoting.
177
+ """
178
+ self.assertEqual(
179
+ tgrep.tgrep_tokenize('"A<<:B"<<:"A $.. B"<"A>3B"<C'),
180
+ ['"A<<:B"', "<<:", '"A $.. B"', "<", '"A>3B"', "<", "C"],
181
+ )
182
+
183
+ def test_tokenize_nodenames(self):
184
+ """
185
+ Test tokenization of node names.
186
+ """
187
+ self.assertEqual(tgrep.tgrep_tokenize("Robert"), ["Robert"])
188
+ self.assertEqual(tgrep.tgrep_tokenize("/^[Bb]ob/"), ["/^[Bb]ob/"])
189
+ self.assertEqual(tgrep.tgrep_tokenize("*"), ["*"])
190
+ self.assertEqual(tgrep.tgrep_tokenize("__"), ["__"])
191
+ # test tokenization of NLTK tree position syntax
192
+ self.assertEqual(tgrep.tgrep_tokenize("N()"), ["N(", ")"])
193
+ self.assertEqual(tgrep.tgrep_tokenize("N(0,)"), ["N(", "0", ",", ")"])
194
+ self.assertEqual(tgrep.tgrep_tokenize("N(0,0)"), ["N(", "0", ",", "0", ")"])
195
+ self.assertEqual(
196
+ tgrep.tgrep_tokenize("N(0,0,)"), ["N(", "0", ",", "0", ",", ")"]
197
+ )
198
+
199
+ def test_tokenize_macros(self):
200
+ """
201
+ Test tokenization of macro definitions.
202
+ """
203
+ self.assertEqual(
204
+ tgrep.tgrep_tokenize(
205
+ "@ NP /^NP/;\n@ NN /^NN/;\n@NP [!< NP | < @NN] !$.. @NN"
206
+ ),
207
+ [
208
+ "@",
209
+ "NP",
210
+ "/^NP/",
211
+ ";",
212
+ "@",
213
+ "NN",
214
+ "/^NN/",
215
+ ";",
216
+ "@NP",
217
+ "[",
218
+ "!",
219
+ "<",
220
+ "NP",
221
+ "|",
222
+ "<",
223
+ "@NN",
224
+ "]",
225
+ "!",
226
+ "$..",
227
+ "@NN",
228
+ ],
229
+ )
230
+
231
+ def test_node_simple(self):
232
+ """
233
+ Test a simple use of tgrep for finding nodes matching a given
234
+ pattern.
235
+ """
236
+ tree = ParentedTree.fromstring(
237
+ "(S (NP (DT the) (JJ big) (NN dog)) " "(VP bit) (NP (DT a) (NN cat)))"
238
+ )
239
+ self.assertEqual(list(tgrep.tgrep_positions("NN", [tree])), [[(0, 2), (2, 1)]])
240
+ self.assertEqual(
241
+ list(tgrep.tgrep_nodes("NN", [tree])), [[tree[0, 2], tree[2, 1]]]
242
+ )
243
+ self.assertEqual(
244
+ list(tgrep.tgrep_positions("NN|JJ", [tree])), [[(0, 1), (0, 2), (2, 1)]]
245
+ )
246
+
247
+ def test_node_printing(self):
248
+ """Test that the tgrep print operator ' is properly ignored."""
249
+ tree = ParentedTree.fromstring("(S (n x) (N x))")
250
+ self.assertEqual(
251
+ list(tgrep.tgrep_positions("N", [tree])),
252
+ list(tgrep.tgrep_positions("'N", [tree])),
253
+ )
254
+ self.assertEqual(
255
+ list(tgrep.tgrep_positions("/[Nn]/", [tree])),
256
+ list(tgrep.tgrep_positions("'/[Nn]/", [tree])),
257
+ )
258
+
259
+ def test_node_encoding(self):
260
+ """
261
+ Test that tgrep search strings handles bytes and strs the same
262
+ way.
263
+ """
264
+ tree = ParentedTree.fromstring(
265
+ "(S (NP (DT the) (JJ big) (NN dog)) " "(VP bit) (NP (DT a) (NN cat)))"
266
+ )
267
+ self.assertEqual(
268
+ list(tgrep.tgrep_positions(b"NN", [tree])),
269
+ list(tgrep.tgrep_positions(b"NN", [tree])),
270
+ )
271
+ self.assertEqual(
272
+ list(tgrep.tgrep_nodes(b"NN", [tree])),
273
+ list(tgrep.tgrep_nodes("NN", [tree])),
274
+ )
275
+ self.assertEqual(
276
+ list(tgrep.tgrep_positions(b"NN|JJ", [tree])),
277
+ list(tgrep.tgrep_positions("NN|JJ", [tree])),
278
+ )
279
+
280
+ def test_node_nocase(self):
281
+ """
282
+ Test selecting nodes using case insensitive node names.
283
+ """
284
+ tree = ParentedTree.fromstring("(S (n x) (N x))")
285
+ self.assertEqual(list(tgrep.tgrep_positions('"N"', [tree])), [[(1,)]])
286
+ self.assertEqual(list(tgrep.tgrep_positions('i@"N"', [tree])), [[(0,), (1,)]])
287
+
288
+ def test_node_quoted(self):
289
+ """
290
+ Test selecting nodes using quoted node names.
291
+ """
292
+ tree = ParentedTree.fromstring('(N ("N" x) (N" x) ("\\" x))')
293
+ self.assertEqual(list(tgrep.tgrep_positions('"N"', [tree])), [[()]])
294
+ self.assertEqual(list(tgrep.tgrep_positions('"\\"N\\""', [tree])), [[(0,)]])
295
+ self.assertEqual(list(tgrep.tgrep_positions('"N\\""', [tree])), [[(1,)]])
296
+ self.assertEqual(list(tgrep.tgrep_positions('"\\"\\\\\\""', [tree])), [[(2,)]])
297
+
298
+ def test_node_regex(self):
299
+ """
300
+ Test regex matching on nodes.
301
+ """
302
+ tree = ParentedTree.fromstring("(S (NP-SBJ x) (NP x) (NNP x) (VP x))")
303
+ # This is a regular expression that matches any node whose
304
+ # name starts with NP, including NP-SBJ:
305
+ self.assertEqual(list(tgrep.tgrep_positions("/^NP/", [tree])), [[(0,), (1,)]])
306
+
307
+ def test_node_regex_2(self):
308
+ """
309
+ Test regex matching on nodes.
310
+ """
311
+ tree = ParentedTree.fromstring("(S (SBJ x) (SBJ1 x) (NP-SBJ x))")
312
+ self.assertEqual(list(tgrep.tgrep_positions("/^SBJ/", [tree])), [[(0,), (1,)]])
313
+ # This is a regular expression that matches any node whose
314
+ # name includes SBJ, including NP-SBJ:
315
+ self.assertEqual(
316
+ list(tgrep.tgrep_positions("/SBJ/", [tree])), [[(0,), (1,), (2,)]]
317
+ )
318
+
319
+ def test_node_tree_position(self):
320
+ """
321
+ Test matching on nodes based on NLTK tree position.
322
+ """
323
+ tree = ParentedTree.fromstring("(S (NP-SBJ x) (NP x) (NNP x) (VP x))")
324
+ # test all tree positions that are not leaves
325
+ leaf_positions = {tree.leaf_treeposition(x) for x in range(len(tree.leaves()))}
326
+ tree_positions = [x for x in tree.treepositions() if x not in leaf_positions]
327
+ for position in tree_positions:
328
+ node_id = f"N{position}"
329
+ tgrep_positions = list(tgrep.tgrep_positions(node_id, [tree]))
330
+ self.assertEqual(len(tgrep_positions[0]), 1)
331
+ self.assertEqual(tgrep_positions[0][0], position)
332
+
333
+ def test_node_noleaves(self):
334
+ """
335
+ Test node name matching with the search_leaves flag set to False.
336
+ """
337
+ tree = ParentedTree.fromstring("(S (A (T x)) (B (N x)))")
338
+ self.assertEqual(
339
+ list(tgrep.tgrep_positions("x", [tree])), [[(0, 0, 0), (1, 0, 0)]]
340
+ )
341
+ self.assertEqual(list(tgrep.tgrep_positions("x", [tree], False)), [[]])
342
+
343
+ def tests_rel_dominance(self):
344
+ """
345
+ Test matching nodes based on dominance relations.
346
+ """
347
+ tree = ParentedTree.fromstring("(S (A (T x)) (B (N x)))")
348
+ self.assertEqual(list(tgrep.tgrep_positions("* < T", [tree])), [[(0,)]])
349
+ self.assertEqual(list(tgrep.tgrep_positions("* < T > S", [tree])), [[(0,)]])
350
+ self.assertEqual(
351
+ list(tgrep.tgrep_positions("* !< T", [tree])),
352
+ [[(), (0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0)]],
353
+ )
354
+ self.assertEqual(list(tgrep.tgrep_positions("* !< T > S", [tree])), [[(1,)]])
355
+ self.assertEqual(list(tgrep.tgrep_positions("* > A", [tree])), [[(0, 0)]])
356
+ self.assertEqual(list(tgrep.tgrep_positions("* > B", [tree])), [[(1, 0)]])
357
+ self.assertEqual(
358
+ list(tgrep.tgrep_positions("* !> B", [tree])),
359
+ [[(), (0,), (0, 0), (0, 0, 0), (1,), (1, 0, 0)]],
360
+ )
361
+ self.assertEqual(
362
+ list(tgrep.tgrep_positions("* !> B >> S", [tree])), [[(0,), (0, 0), (1,)]]
363
+ )
364
+ self.assertEqual(
365
+ list(tgrep.tgrep_positions("* >> S", [tree])),
366
+ [[(0,), (0, 0), (1,), (1, 0)]],
367
+ )
368
+ self.assertEqual(
369
+ list(tgrep.tgrep_positions("* >>, S", [tree])), [[(0,), (0, 0)]]
370
+ )
371
+ self.assertEqual(
372
+ list(tgrep.tgrep_positions("* >>' S", [tree])), [[(1,), (1, 0)]]
373
+ )
374
+ # Known issue:
375
+ # self.assertEqual(list(tgrep.tgrep_positions('* !>> S', [tree])),
376
+ # [[()]])
377
+ self.assertEqual(list(tgrep.tgrep_positions("* << T", [tree])), [[(), (0,)]])
378
+ self.assertEqual(list(tgrep.tgrep_positions("* <<' T", [tree])), [[(0,)]])
379
+ self.assertEqual(list(tgrep.tgrep_positions("* <<1 N", [tree])), [[(1,)]])
380
+ self.assertEqual(
381
+ list(tgrep.tgrep_positions("* !<< T", [tree])),
382
+ [[(0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0)]],
383
+ )
384
+ tree = ParentedTree.fromstring("(S (A (T x)) (B (T x) (N x )))")
385
+ self.assertEqual(list(tgrep.tgrep_positions("* <: T", [tree])), [[(0,)]])
386
+ self.assertEqual(list(tgrep.tgrep_positions("* < T", [tree])), [[(0,), (1,)]])
387
+ self.assertEqual(
388
+ list(tgrep.tgrep_positions("* !<: T", [tree])),
389
+ [[(), (0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0), (1, 1), (1, 1, 0)]],
390
+ )
391
+ self.assertEqual(list(tgrep.tgrep_positions("* !<: T > S", [tree])), [[(1,)]])
392
+ tree = ParentedTree.fromstring("(S (T (A x) (B x)) (T (C x)))")
393
+ self.assertEqual(list(tgrep.tgrep_positions("* >: T", [tree])), [[(1, 0)]])
394
+ self.assertEqual(
395
+ list(tgrep.tgrep_positions("* !>: T", [tree])),
396
+ [[(), (0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1,), (1, 0, 0)]],
397
+ )
398
+ tree = ParentedTree.fromstring(
399
+ "(S (A (B (C (D (E (T x))))))" " (A (B (C (D (E (T x))) (N x)))))"
400
+ )
401
+ self.assertEqual(
402
+ list(tgrep.tgrep_positions("* <<: T", [tree])),
403
+ [
404
+ [
405
+ (0,),
406
+ (0, 0),
407
+ (0, 0, 0),
408
+ (0, 0, 0, 0),
409
+ (0, 0, 0, 0, 0),
410
+ (1, 0, 0, 0),
411
+ (1, 0, 0, 0, 0),
412
+ ]
413
+ ],
414
+ )
415
+ self.assertEqual(
416
+ list(tgrep.tgrep_positions("* >>: A", [tree])),
417
+ [
418
+ [
419
+ (0, 0),
420
+ (0, 0, 0),
421
+ (0, 0, 0, 0),
422
+ (0, 0, 0, 0, 0),
423
+ (0, 0, 0, 0, 0, 0),
424
+ (1, 0),
425
+ (1, 0, 0),
426
+ ]
427
+ ],
428
+ )
429
+
430
+ def test_bad_operator(self):
431
+ """
432
+ Test error handling of undefined tgrep operators.
433
+ """
434
+ tree = ParentedTree.fromstring("(S (A (T x)) (B (N x)))")
435
+ self.assertRaises(
436
+ tgrep.TgrepException, list, tgrep.tgrep_positions("* >>> S", [tree])
437
+ )
438
+
439
+ def test_comments(self):
440
+ """
441
+ Test that comments are correctly filtered out of tgrep search
442
+ strings.
443
+ """
444
+ tree = ParentedTree.fromstring("(S (NN x) (NP x) (NN x))")
445
+ search1 = """
446
+ @ NP /^NP/;
447
+ @ NN /^NN/;
448
+ @NN
449
+ """
450
+ self.assertEqual(list(tgrep.tgrep_positions(search1, [tree])), [[(0,), (2,)]])
451
+ search2 = """
452
+ # macros
453
+ @ NP /^NP/;
454
+ @ NN /^NN/;
455
+
456
+ # search string
457
+ @NN
458
+ """
459
+ self.assertEqual(list(tgrep.tgrep_positions(search2, [tree])), [[(0,), (2,)]])
460
+
461
+ def test_rel_sister_nodes(self):
462
+ """
463
+ Test matching sister nodes in a tree.
464
+ """
465
+ tree = ParentedTree.fromstring("(S (A x) (B x) (C x))")
466
+ self.assertEqual(list(tgrep.tgrep_positions("* $. B", [tree])), [[(0,)]])
467
+ self.assertEqual(list(tgrep.tgrep_positions("* $.. B", [tree])), [[(0,)]])
468
+ self.assertEqual(list(tgrep.tgrep_positions("* $, B", [tree])), [[(2,)]])
469
+ self.assertEqual(list(tgrep.tgrep_positions("* $,, B", [tree])), [[(2,)]])
470
+ self.assertEqual(list(tgrep.tgrep_positions("* $ B", [tree])), [[(0,), (2,)]])
471
+
472
+ def tests_rel_indexed_children(self):
473
+ """
474
+ Test matching nodes based on their index in their parent node.
475
+ """
476
+ tree = ParentedTree.fromstring("(S (A x) (B x) (C x))")
477
+ self.assertEqual(list(tgrep.tgrep_positions("* >, S", [tree])), [[(0,)]])
478
+ self.assertEqual(list(tgrep.tgrep_positions("* >1 S", [tree])), [[(0,)]])
479
+ self.assertEqual(list(tgrep.tgrep_positions("* >2 S", [tree])), [[(1,)]])
480
+ self.assertEqual(list(tgrep.tgrep_positions("* >3 S", [tree])), [[(2,)]])
481
+ self.assertEqual(list(tgrep.tgrep_positions("* >' S", [tree])), [[(2,)]])
482
+ self.assertEqual(list(tgrep.tgrep_positions("* >-1 S", [tree])), [[(2,)]])
483
+ self.assertEqual(list(tgrep.tgrep_positions("* >-2 S", [tree])), [[(1,)]])
484
+ self.assertEqual(list(tgrep.tgrep_positions("* >-3 S", [tree])), [[(0,)]])
485
+ tree = ParentedTree.fromstring(
486
+ "(S (D (A x) (B x) (C x)) (E (B x) (C x) (A x)) " "(F (C x) (A x) (B x)))"
487
+ )
488
+ self.assertEqual(list(tgrep.tgrep_positions("* <, A", [tree])), [[(0,)]])
489
+ self.assertEqual(list(tgrep.tgrep_positions("* <1 A", [tree])), [[(0,)]])
490
+ self.assertEqual(list(tgrep.tgrep_positions("* <2 A", [tree])), [[(2,)]])
491
+ self.assertEqual(list(tgrep.tgrep_positions("* <3 A", [tree])), [[(1,)]])
492
+ self.assertEqual(list(tgrep.tgrep_positions("* <' A", [tree])), [[(1,)]])
493
+ self.assertEqual(list(tgrep.tgrep_positions("* <-1 A", [tree])), [[(1,)]])
494
+ self.assertEqual(list(tgrep.tgrep_positions("* <-2 A", [tree])), [[(2,)]])
495
+ self.assertEqual(list(tgrep.tgrep_positions("* <-3 A", [tree])), [[(0,)]])
496
+
497
+ def test_rel_precedence(self):
498
+ """
499
+ Test matching nodes based on precedence relations.
500
+ """
501
+ tree = ParentedTree.fromstring(
502
+ "(S (NP (NP (PP x)) (NP (AP x)))"
503
+ " (VP (AP (X (PP x)) (Y (AP x))))"
504
+ " (NP (RC (NP (AP x)))))"
505
+ )
506
+ self.assertEqual(
507
+ list(tgrep.tgrep_positions("* . X", [tree])), [[(0,), (0, 1), (0, 1, 0)]]
508
+ )
509
+ self.assertEqual(
510
+ list(tgrep.tgrep_positions("* . Y", [tree])), [[(1, 0, 0), (1, 0, 0, 0)]]
511
+ )
512
+ self.assertEqual(
513
+ list(tgrep.tgrep_positions("* .. X", [tree])),
514
+ [[(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0)]],
515
+ )
516
+ self.assertEqual(
517
+ list(tgrep.tgrep_positions("* .. Y", [tree])),
518
+ [[(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1, 0, 0), (1, 0, 0, 0)]],
519
+ )
520
+ self.assertEqual(
521
+ list(tgrep.tgrep_positions("* , X", [tree])), [[(1, 0, 1), (1, 0, 1, 0)]]
522
+ )
523
+ self.assertEqual(
524
+ list(tgrep.tgrep_positions("* , Y", [tree])),
525
+ [[(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]],
526
+ )
527
+ self.assertEqual(
528
+ list(tgrep.tgrep_positions("* ,, X", [tree])),
529
+ [[(1, 0, 1), (1, 0, 1, 0), (2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]],
530
+ )
531
+ self.assertEqual(
532
+ list(tgrep.tgrep_positions("* ,, Y", [tree])),
533
+ [[(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]],
534
+ )
535
+
536
+ def test_examples(self):
537
+ """
538
+ Test the Basic Examples from the TGrep2 manual.
539
+ """
540
+ tree = ParentedTree.fromstring("(S (NP (AP x)) (NP (PP x)))")
541
+ # This matches any NP node that immediately dominates a PP:
542
+ self.assertEqual(list(tgrep.tgrep_positions("NP < PP", [tree])), [[(1,)]])
543
+
544
+ tree = ParentedTree.fromstring("(S (NP x) (VP x) (NP (PP x)) (VP x))")
545
+ # This matches an NP that dominates a PP and is immediately
546
+ # followed by a VP:
547
+ self.assertEqual(list(tgrep.tgrep_positions("NP << PP . VP", [tree])), [[(2,)]])
548
+
549
+ tree = ParentedTree.fromstring(
550
+ "(S (NP (AP x)) (NP (PP x)) " "(NP (DET x) (NN x)) (VP x))"
551
+ )
552
+ # This matches an NP that dominates a PP or is immediately
553
+ # followed by a VP:
554
+ self.assertEqual(
555
+ list(tgrep.tgrep_positions("NP << PP | . VP", [tree])), [[(1,), (2,)]]
556
+ )
557
+
558
+ tree = ParentedTree.fromstring(
559
+ "(S (NP (NP (PP x)) (NP (AP x)))"
560
+ " (VP (AP (NP (PP x)) (NP (AP x))))"
561
+ " (NP (RC (NP (AP x)))))"
562
+ )
563
+ # This matches an NP that does not dominate a PP. Also, the NP
564
+ # must either have a parent that is an NP or be dominated by a
565
+ # VP:
566
+ self.assertEqual(
567
+ list(tgrep.tgrep_positions("NP !<< PP [> NP | >> VP]", [tree])),
568
+ [[(0, 1), (1, 0, 1)]],
569
+ )
570
+
571
+ tree = ParentedTree.fromstring(
572
+ "(S (NP (AP (PP x) (VP x))) " "(NP (AP (PP x) (NP x))) (NP x))"
573
+ )
574
+ # This matches an NP that dominates a PP which itself is
575
+ # immediately followed by a VP. Note the use of parentheses to
576
+ # group ". VP" with the PP rather than with the NP:
577
+ self.assertEqual(
578
+ list(tgrep.tgrep_positions("NP << (PP . VP)", [tree])), [[(0,)]]
579
+ )
580
+
581
+ tree = ParentedTree.fromstring(
582
+ "(S (NP (DET a) (NN cat) (PP (IN on) (NP x)))"
583
+ " (NP (DET a) (NN cat) (PP (IN on) (NP x)) (PP x))"
584
+ " (NP x))"
585
+ )
586
+ # This matches an NP whose last child is a PP that begins with
587
+ # the preposition "on":
588
+ self.assertEqual(
589
+ list(tgrep.tgrep_positions("NP <' (PP <, (IN < on))", [tree])), [[(0,)]]
590
+ )
591
+
592
+ tree = ParentedTree.fromstring(
593
+ "(S (S (C x) (A (B x))) (S (C x) (A x)) " "(S (D x) (A (B x))))"
594
+ )
595
+ # The following pattern matches an S which has a child A and
596
+ # another child that is a C and that the A has a child B:
597
+ self.assertEqual(
598
+ list(tgrep.tgrep_positions("S < (A < B) < C", [tree])), [[(0,)]]
599
+ )
600
+
601
+ tree = ParentedTree.fromstring(
602
+ "(S (S (A (B x) (C x))) (S (S (C x) (A (B x)))))"
603
+ )
604
+ # However, this pattern means that S has child A and that A
605
+ # has children B and C:
606
+ self.assertEqual(
607
+ list(tgrep.tgrep_positions("S < ((A < B) < C)", [tree])), [[(0,)]]
608
+ )
609
+
610
+ # It is equivalent to this:
611
+ self.assertEqual(
612
+ list(tgrep.tgrep_positions("S < (A < B < C)", [tree])), [[(0,)]]
613
+ )
614
+
615
+ def test_use_macros(self):
616
+ """
617
+ Test defining and using tgrep2 macros.
618
+ """
619
+ tree = ParentedTree.fromstring(
620
+ "(VP (VB sold) (NP (DET the) "
621
+ "(NN heiress)) (NP (NN deed) (PREP to) "
622
+ "(NP (DET the) (NN school) (NN house))))"
623
+ )
624
+ self.assertEqual(
625
+ list(
626
+ tgrep.tgrep_positions(
627
+ "@ NP /^NP/;\n@ NN /^NN/;\n@NP !< @NP !$.. @NN", [tree]
628
+ )
629
+ ),
630
+ [[(1,), (2, 2)]],
631
+ )
632
+ # use undefined macro @CNP
633
+ self.assertRaises(
634
+ tgrep.TgrepException,
635
+ list,
636
+ tgrep.tgrep_positions(
637
+ "@ NP /^NP/;\n@ NN /^NN/;\n@CNP !< @NP !$.. @NN", [tree]
638
+ ),
639
+ )
640
+
641
+ def test_tokenize_node_labels(self):
642
+ """Test tokenization of labeled nodes."""
643
+ self.assertEqual(
644
+ tgrep.tgrep_tokenize("S < @SBJ < (@VP < (@VB $.. @OBJ))"),
645
+ [
646
+ "S",
647
+ "<",
648
+ "@SBJ",
649
+ "<",
650
+ "(",
651
+ "@VP",
652
+ "<",
653
+ "(",
654
+ "@VB",
655
+ "$..",
656
+ "@OBJ",
657
+ ")",
658
+ ")",
659
+ ],
660
+ )
661
+ self.assertEqual(
662
+ tgrep.tgrep_tokenize("S < @SBJ=s < (@VP=v < (@VB $.. @OBJ))"),
663
+ [
664
+ "S",
665
+ "<",
666
+ "@SBJ",
667
+ "=",
668
+ "s",
669
+ "<",
670
+ "(",
671
+ "@VP",
672
+ "=",
673
+ "v",
674
+ "<",
675
+ "(",
676
+ "@VB",
677
+ "$..",
678
+ "@OBJ",
679
+ ")",
680
+ ")",
681
+ ],
682
+ )
683
+
684
+ def test_tokenize_segmented_patterns(self):
685
+ """Test tokenization of segmented patterns."""
686
+ self.assertEqual(
687
+ tgrep.tgrep_tokenize("S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v"),
688
+ [
689
+ "S",
690
+ "<",
691
+ "@SBJ",
692
+ "=",
693
+ "s",
694
+ "<",
695
+ "(",
696
+ "@VP",
697
+ "=",
698
+ "v",
699
+ "<",
700
+ "(",
701
+ "@VB",
702
+ "$..",
703
+ "@OBJ",
704
+ ")",
705
+ ")",
706
+ ":",
707
+ "=s",
708
+ "..",
709
+ "=v",
710
+ ],
711
+ )
712
+
713
+ def test_labeled_nodes(self):
714
+ """
715
+ Test labeled nodes.
716
+
717
+ Test case from Emily M. Bender.
718
+ """
719
+ search = """
720
+ # macros
721
+ @ SBJ /SBJ/;
722
+ @ VP /VP/;
723
+ @ VB /VB/;
724
+ @ VPoB /V[PB]/;
725
+ @ OBJ /OBJ/;
726
+
727
+ # 1 svo
728
+ S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v"""
729
+ sent1 = ParentedTree.fromstring(
730
+ "(S (NP-SBJ I) (VP (VB eat) (NP-OBJ (NNS apples))))"
731
+ )
732
+ sent2 = ParentedTree.fromstring(
733
+ "(S (VP (VB eat) (NP-OBJ (NNS apples))) (NP-SBJ I))"
734
+ )
735
+ search_firsthalf = search.split("\n\n")[0] + "S < @SBJ < (@VP < (@VB $.. @OBJ))"
736
+ search_rewrite = "S < (/.*SBJ/ $.. (/VP/ < (/VB/ $.. /.*OBJ/)))"
737
+
738
+ self.assertTrue(list(tgrep.tgrep_positions(search_firsthalf, [sent1]))[0])
739
+ self.assertTrue(list(tgrep.tgrep_positions(search, [sent1]))[0])
740
+ self.assertTrue(list(tgrep.tgrep_positions(search_rewrite, [sent1]))[0])
741
+ self.assertEqual(
742
+ list(tgrep.tgrep_positions(search, [sent1])),
743
+ list(tgrep.tgrep_positions(search_rewrite, [sent1])),
744
+ )
745
+ self.assertTrue(list(tgrep.tgrep_positions(search_firsthalf, [sent2]))[0])
746
+ self.assertFalse(list(tgrep.tgrep_positions(search, [sent2]))[0])
747
+ self.assertFalse(list(tgrep.tgrep_positions(search_rewrite, [sent2]))[0])
748
+ self.assertEqual(
749
+ list(tgrep.tgrep_positions(search, [sent2])),
750
+ list(tgrep.tgrep_positions(search_rewrite, [sent2])),
751
+ )
752
+
753
+ def test_multiple_conjs(self):
754
+ """
755
+ Test that multiple (3 or more) conjunctions of node relations are
756
+ handled properly.
757
+ """
758
+ sent = ParentedTree.fromstring("((A (B b) (C c)) (A (B b) (C c) (D d)))")
759
+ # search = '(A < B < C < D)'
760
+ # search_tworels = '(A < B < C)'
761
+ self.assertEqual(
762
+ list(tgrep.tgrep_positions("(A < B < C < D)", [sent])), [[(1,)]]
763
+ )
764
+ self.assertEqual(
765
+ list(tgrep.tgrep_positions("(A < B < C)", [sent])), [[(0,), (1,)]]
766
+ )
767
+
768
+ def test_trailing_semicolon(self):
769
+ """
770
+ Test that semicolons at the end of a tgrep2 search string won't
771
+ cause a parse failure.
772
+ """
773
+ tree = ParentedTree.fromstring(
774
+ "(S (NP (DT the) (JJ big) (NN dog)) " "(VP bit) (NP (DT a) (NN cat)))"
775
+ )
776
+ self.assertEqual(list(tgrep.tgrep_positions("NN", [tree])), [[(0, 2), (2, 1)]])
777
+ self.assertEqual(list(tgrep.tgrep_positions("NN;", [tree])), [[(0, 2), (2, 1)]])
778
+ self.assertEqual(
779
+ list(tgrep.tgrep_positions("NN;;", [tree])), [[(0, 2), (2, 1)]]
780
+ )
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_tokenize.py ADDED
@@ -0,0 +1,867 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Unit tests for nltk.tokenize.
3
+ See also nltk/test/tokenize.doctest
4
+ """
5
+ from typing import List, Tuple
6
+
7
+ import pytest
8
+
9
+ from nltk.tokenize import (
10
+ LegalitySyllableTokenizer,
11
+ StanfordSegmenter,
12
+ SyllableTokenizer,
13
+ TreebankWordTokenizer,
14
+ TweetTokenizer,
15
+ punkt,
16
+ sent_tokenize,
17
+ word_tokenize,
18
+ )
19
+
20
+
21
def load_stanford_segmenter():
    """Return True iff stanford-segmenter.jar resolves for both 'ar' and 'zh'."""
    try:
        segmenter = StanfordSegmenter()
        # Either config raises LookupError when the jar cannot be found.
        for language in ("ar", "zh"):
            segmenter.default_config(language)
    except LookupError:
        return False
    return True


# Decorator that skips Stanford-dependent tests when the jar is unavailable.
check_stanford_segmenter = pytest.mark.skipif(
    not load_stanford_segmenter(),
    reason="NLTK was unable to find stanford-segmenter.jar.",
)
35
+
36
+
37
+ class TestTokenize:
38
+ def test_tweet_tokenizer(self):
39
+ """
40
+ Test TweetTokenizer using words with special and accented characters.
41
+ """
42
+
43
+ tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
44
+ s9 = "@myke: Let's test these words: resumé España München français"
45
+ tokens = tokenizer.tokenize(s9)
46
+ expected = [
47
+ ":",
48
+ "Let's",
49
+ "test",
50
+ "these",
51
+ "words",
52
+ ":",
53
+ "resumé",
54
+ "España",
55
+ "München",
56
+ "français",
57
+ ]
58
+ assert tokens == expected
59
+
60
+ @pytest.mark.parametrize(
61
+ "test_input, expecteds",
62
+ [
63
+ (
64
+ "My text 0106404243030 is great text",
65
+ (
66
+ ["My", "text", "01064042430", "30", "is", "great", "text"],
67
+ ["My", "text", "0106404243030", "is", "great", "text"],
68
+ ),
69
+ ),
70
+ (
71
+ "My ticket id is 1234543124123",
72
+ (
73
+ ["My", "ticket", "id", "is", "12345431241", "23"],
74
+ ["My", "ticket", "id", "is", "1234543124123"],
75
+ ),
76
+ ),
77
+ (
78
+ "@remy: This is waaaaayyyy too much for you!!!!!! 01064042430",
79
+ (
80
+ [
81
+ ":",
82
+ "This",
83
+ "is",
84
+ "waaayyy",
85
+ "too",
86
+ "much",
87
+ "for",
88
+ "you",
89
+ "!",
90
+ "!",
91
+ "!",
92
+ "01064042430",
93
+ ],
94
+ [
95
+ ":",
96
+ "This",
97
+ "is",
98
+ "waaayyy",
99
+ "too",
100
+ "much",
101
+ "for",
102
+ "you",
103
+ "!",
104
+ "!",
105
+ "!",
106
+ "01064042430",
107
+ ],
108
+ ),
109
+ ),
110
+ # Further tests from https://github.com/nltk/nltk/pull/2798#issuecomment-922533085,
111
+ # showing the TweetTokenizer performance for `match_phone_numbers=True` and
112
+ # `match_phone_numbers=False`.
113
+ (
114
+ # Some phone numbers are always tokenized, even with `match_phone_numbers=`False`
115
+ "My number is 06-46124080, except it's not.",
116
+ (
117
+ [
118
+ "My",
119
+ "number",
120
+ "is",
121
+ "06-46124080",
122
+ ",",
123
+ "except",
124
+ "it's",
125
+ "not",
126
+ ".",
127
+ ],
128
+ [
129
+ "My",
130
+ "number",
131
+ "is",
132
+ "06-46124080",
133
+ ",",
134
+ "except",
135
+ "it's",
136
+ "not",
137
+ ".",
138
+ ],
139
+ ),
140
+ ),
141
+ (
142
+ # Phone number here is only tokenized correctly if `match_phone_numbers=True`
143
+ "My number is 601-984-4813, except it's not.",
144
+ (
145
+ [
146
+ "My",
147
+ "number",
148
+ "is",
149
+ "601-984-4813",
150
+ ",",
151
+ "except",
152
+ "it's",
153
+ "not",
154
+ ".",
155
+ ],
156
+ [
157
+ "My",
158
+ "number",
159
+ "is",
160
+ "601-984-",
161
+ "4813",
162
+ ",",
163
+ "except",
164
+ "it's",
165
+ "not",
166
+ ".",
167
+ ],
168
+ ),
169
+ ),
170
+ (
171
+ # Phone number here is only tokenized correctly if `match_phone_numbers=True`
172
+ "My number is (393) 928 -3010, except it's not.",
173
+ (
174
+ [
175
+ "My",
176
+ "number",
177
+ "is",
178
+ "(393) 928 -3010",
179
+ ",",
180
+ "except",
181
+ "it's",
182
+ "not",
183
+ ".",
184
+ ],
185
+ [
186
+ "My",
187
+ "number",
188
+ "is",
189
+ "(",
190
+ "393",
191
+ ")",
192
+ "928",
193
+ "-",
194
+ "3010",
195
+ ",",
196
+ "except",
197
+ "it's",
198
+ "not",
199
+ ".",
200
+ ],
201
+ ),
202
+ ),
203
+ (
204
+ # A long number is tokenized correctly only if `match_phone_numbers=False`
205
+ "The product identification number is 48103284512.",
206
+ (
207
+ [
208
+ "The",
209
+ "product",
210
+ "identification",
211
+ "number",
212
+ "is",
213
+ "4810328451",
214
+ "2",
215
+ ".",
216
+ ],
217
+ [
218
+ "The",
219
+ "product",
220
+ "identification",
221
+ "number",
222
+ "is",
223
+ "48103284512",
224
+ ".",
225
+ ],
226
+ ),
227
+ ),
228
+ (
229
+ # `match_phone_numbers=True` can have some unforeseen
230
+ "My favourite substraction is 240 - 1353.",
231
+ (
232
+ ["My", "favourite", "substraction", "is", "240 - 1353", "."],
233
+ ["My", "favourite", "substraction", "is", "240", "-", "1353", "."],
234
+ ),
235
+ ),
236
+ ],
237
+ )
238
+ def test_tweet_tokenizer_expanded(
239
+ self, test_input: str, expecteds: Tuple[List[str], List[str]]
240
+ ):
241
+ """
242
+ Test `match_phone_numbers` in TweetTokenizer.
243
+
244
+ Note that TweetTokenizer is also passed the following for these tests:
245
+ * strip_handles=True
246
+ * reduce_len=True
247
+
248
+ :param test_input: The input string to tokenize using TweetTokenizer.
249
+ :type test_input: str
250
+ :param expecteds: A 2-tuple of tokenized sentences. The first of the two
251
+ tokenized is the expected output of tokenization with `match_phone_numbers=True`.
252
+ The second of the two tokenized lists is the expected output of tokenization
253
+ with `match_phone_numbers=False`.
254
+ :type expecteds: Tuple[List[str], List[str]]
255
+ """
256
+ for match_phone_numbers, expected in zip([True, False], expecteds):
257
+ tokenizer = TweetTokenizer(
258
+ strip_handles=True,
259
+ reduce_len=True,
260
+ match_phone_numbers=match_phone_numbers,
261
+ )
262
+ predicted = tokenizer.tokenize(test_input)
263
+ assert predicted == expected
264
+
265
+ def test_sonority_sequencing_syllable_tokenizer(self):
266
+ """
267
+ Test SyllableTokenizer tokenizer.
268
+ """
269
+ tokenizer = SyllableTokenizer()
270
+ tokens = tokenizer.tokenize("justification")
271
+ assert tokens == ["jus", "ti", "fi", "ca", "tion"]
272
+
273
+ def test_syllable_tokenizer_numbers(self):
274
+ """
275
+ Test SyllableTokenizer tokenizer.
276
+ """
277
+ tokenizer = SyllableTokenizer()
278
+ text = "9" * 10000
279
+ tokens = tokenizer.tokenize(text)
280
+ assert tokens == [text]
281
+
282
+ def test_legality_principle_syllable_tokenizer(self):
283
+ """
284
+ Test LegalitySyllableTokenizer tokenizer.
285
+ """
286
+ from nltk.corpus import words
287
+
288
+ test_word = "wonderful"
289
+ tokenizer = LegalitySyllableTokenizer(words.words())
290
+ tokens = tokenizer.tokenize(test_word)
291
+ assert tokens == ["won", "der", "ful"]
292
+
293
+ @check_stanford_segmenter
294
+ def test_stanford_segmenter_arabic(self):
295
+ """
296
+ Test the Stanford Word Segmenter for Arabic (default config)
297
+ """
298
+ seg = StanfordSegmenter()
299
+ seg.default_config("ar")
300
+ sent = "يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلات"
301
+ segmented_sent = seg.segment(sent.split())
302
+ assert segmented_sent.split() == [
303
+ "يبحث",
304
+ "علم",
305
+ "الحاسوب",
306
+ "استخدام",
307
+ "الحوسبة",
308
+ "ب",
309
+ "جميع",
310
+ "اشكال",
311
+ "ها",
312
+ "ل",
313
+ "حل",
314
+ "المشكلات",
315
+ ]
316
+
317
+ @check_stanford_segmenter
318
+ def test_stanford_segmenter_chinese(self):
319
+ """
320
+ Test the Stanford Word Segmenter for Chinese (default config)
321
+ """
322
+ seg = StanfordSegmenter()
323
+ seg.default_config("zh")
324
+ sent = "这是斯坦福中文分词器测试"
325
+ segmented_sent = seg.segment(sent.split())
326
+ assert segmented_sent.split() == ["这", "是", "斯坦福", "中文", "分词器", "测试"]
327
+
328
+ def test_phone_tokenizer(self):
329
+ """
330
+ Test a string that resembles a phone number but contains a newline
331
+ """
332
+
333
+ # Should be recognized as a phone number, albeit one with multiple spaces
334
+ tokenizer = TweetTokenizer()
335
+ test1 = "(393) 928 -3010"
336
+ expected = ["(393) 928 -3010"]
337
+ result = tokenizer.tokenize(test1)
338
+ assert result == expected
339
+
340
+ # Due to newline, first three elements aren't part of a phone number;
341
+ # fourth is
342
+ test2 = "(393)\n928 -3010"
343
+ expected = ["(", "393", ")", "928 -3010"]
344
+ result = tokenizer.tokenize(test2)
345
+ assert result == expected
346
+
347
+ def test_emoji_tokenizer(self):
348
+ """
349
+ Test a string that contains Emoji ZWJ Sequences and skin tone modifier
350
+ """
351
+ tokenizer = TweetTokenizer()
352
+
353
+ # A Emoji ZWJ Sequences, they together build as a single emoji, should not be split.
354
+ test1 = "👨‍👩‍👧‍👧"
355
+ expected = ["👨‍👩‍👧‍👧"]
356
+ result = tokenizer.tokenize(test1)
357
+ assert result == expected
358
+
359
+ # A Emoji with skin tone modifier, the two characters build a single emoji, should not be split.
360
+ test2 = "👨🏿"
361
+ expected = ["👨🏿"]
362
+ result = tokenizer.tokenize(test2)
363
+ assert result == expected
364
+
365
+ # A string containing both skin tone modifier and ZWJ Sequences
366
+ test3 = "🤔 🙈 me así, se😌 ds 💕👭👙 hello 👩🏾‍🎓 emoji hello 👨‍👩‍👦‍👦 how are 😊 you today🙅🏽🙅🏽"
367
+ expected = [
368
+ "🤔",
369
+ "🙈",
370
+ "me",
371
+ "así",
372
+ ",",
373
+ "se",
374
+ "😌",
375
+ "ds",
376
+ "💕",
377
+ "👭",
378
+ "👙",
379
+ "hello",
380
+ "👩🏾\u200d🎓",
381
+ "emoji",
382
+ "hello",
383
+ "👨\u200d👩\u200d👦\u200d👦",
384
+ "how",
385
+ "are",
386
+ "😊",
387
+ "you",
388
+ "today",
389
+ "🙅🏽",
390
+ "🙅🏽",
391
+ ]
392
+ result = tokenizer.tokenize(test3)
393
+ assert result == expected
394
+
395
+ # emoji flag sequences, including enclosed letter pairs
396
+ # Expected behavior from #3034
397
+ test4 = "🇦🇵🇵🇱🇪"
398
+ expected = ["🇦🇵", "🇵🇱", "🇪"]
399
+ result = tokenizer.tokenize(test4)
400
+ assert result == expected
401
+
402
+ test5 = "Hi 🇨🇦, 😍!!"
403
+ expected = ["Hi", "🇨🇦", ",", "😍", "!", "!"]
404
+ result = tokenizer.tokenize(test5)
405
+ assert result == expected
406
+
407
+ test6 = "<3 🇨🇦 🤝 🇵🇱 <3"
408
+ expected = ["<3", "🇨🇦", "🤝", "🇵🇱", "<3"]
409
+ result = tokenizer.tokenize(test6)
410
+ assert result == expected
411
+
412
+ def test_pad_asterisk(self):
413
+ """
414
+ Test padding of asterisk for word tokenization.
415
+ """
416
+ text = "This is a, *weird sentence with *asterisks in it."
417
+ expected = [
418
+ "This",
419
+ "is",
420
+ "a",
421
+ ",",
422
+ "*",
423
+ "weird",
424
+ "sentence",
425
+ "with",
426
+ "*",
427
+ "asterisks",
428
+ "in",
429
+ "it",
430
+ ".",
431
+ ]
432
+ assert word_tokenize(text) == expected
433
+
434
+ def test_pad_dotdot(self):
435
+ """
436
+ Test padding of dotdot* for word tokenization.
437
+ """
438
+ text = "Why did dotdot.. not get tokenized but dotdotdot... did? How about manydots....."
439
+ expected = [
440
+ "Why",
441
+ "did",
442
+ "dotdot",
443
+ "..",
444
+ "not",
445
+ "get",
446
+ "tokenized",
447
+ "but",
448
+ "dotdotdot",
449
+ "...",
450
+ "did",
451
+ "?",
452
+ "How",
453
+ "about",
454
+ "manydots",
455
+ ".....",
456
+ ]
457
+ assert word_tokenize(text) == expected
458
+
459
+ def test_remove_handle(self):
460
+ """
461
+ Test remove_handle() from casual.py with specially crafted edge cases
462
+ """
463
+
464
+ tokenizer = TweetTokenizer(strip_handles=True)
465
+
466
+ # Simple example. Handles with just numbers should be allowed
467
+ test1 = "@twitter hello @twi_tter_. hi @12345 @123news"
468
+ expected = ["hello", ".", "hi"]
469
+ result = tokenizer.tokenize(test1)
470
+ assert result == expected
471
+
472
+ # Handles are allowed to follow any of the following characters
473
+ test2 = "@n`@n~@n(@n)@n-@n=@n+@n\\@n|@n[@n]@n{@n}@n;@n:@n'@n\"@n/@n?@n.@n,@n<@n>@n @n\n@n ñ@n.ü@n.ç@n."
474
+ expected = [
475
+ "`",
476
+ "~",
477
+ "(",
478
+ ")",
479
+ "-",
480
+ "=",
481
+ "+",
482
+ "\\",
483
+ "|",
484
+ "[",
485
+ "]",
486
+ "{",
487
+ "}",
488
+ ";",
489
+ ":",
490
+ "'",
491
+ '"',
492
+ "/",
493
+ "?",
494
+ ".",
495
+ ",",
496
+ "<",
497
+ ">",
498
+ "ñ",
499
+ ".",
500
+ "ü",
501
+ ".",
502
+ "ç",
503
+ ".",
504
+ ]
505
+ result = tokenizer.tokenize(test2)
506
+ assert result == expected
507
+
508
+ # Handles are NOT allowed to follow any of the following characters
509
+ test3 = "a@n j@n z@n A@n L@n Z@n 1@n 4@n 7@n 9@n 0@n _@n !@n @@n #@n $@n %@n &@n *@n"
510
+ expected = [
511
+ "a",
512
+ "@n",
513
+ "j",
514
+ "@n",
515
+ "z",
516
+ "@n",
517
+ "A",
518
+ "@n",
519
+ "L",
520
+ "@n",
521
+ "Z",
522
+ "@n",
523
+ "1",
524
+ "@n",
525
+ "4",
526
+ "@n",
527
+ "7",
528
+ "@n",
529
+ "9",
530
+ "@n",
531
+ "0",
532
+ "@n",
533
+ "_",
534
+ "@n",
535
+ "!",
536
+ "@n",
537
+ "@",
538
+ "@n",
539
+ "#",
540
+ "@n",
541
+ "$",
542
+ "@n",
543
+ "%",
544
+ "@n",
545
+ "&",
546
+ "@n",
547
+ "*",
548
+ "@n",
549
+ ]
550
+ result = tokenizer.tokenize(test3)
551
+ assert result == expected
552
+
553
+ # Handles are allowed to precede the following characters
554
+ test4 = "@n!a @n#a @n$a @n%a @n&a @n*a"
555
+ expected = ["!", "a", "#", "a", "$", "a", "%", "a", "&", "a", "*", "a"]
556
+ result = tokenizer.tokenize(test4)
557
+ assert result == expected
558
+
559
+ # Tests interactions with special symbols and multiple @
560
+ test5 = "@n!@n @n#@n @n$@n @n%@n @n&@n @n*@n @n@n @@n @n@@n @n_@n @n7@n @nj@n"
561
+ expected = [
562
+ "!",
563
+ "@n",
564
+ "#",
565
+ "@n",
566
+ "$",
567
+ "@n",
568
+ "%",
569
+ "@n",
570
+ "&",
571
+ "@n",
572
+ "*",
573
+ "@n",
574
+ "@n",
575
+ "@n",
576
+ "@",
577
+ "@n",
578
+ "@n",
579
+ "@",
580
+ "@n",
581
+ "@n_",
582
+ "@n",
583
+ "@n7",
584
+ "@n",
585
+ "@nj",
586
+ "@n",
587
+ ]
588
+ result = tokenizer.tokenize(test5)
589
+ assert result == expected
590
+
591
+ # Tests that handles can have a max length of 15
592
+ test6 = "@abcdefghijklmnopqrstuvwxyz @abcdefghijklmno1234 @abcdefghijklmno_ @abcdefghijklmnoendofhandle"
593
+ expected = ["pqrstuvwxyz", "1234", "_", "endofhandle"]
594
+ result = tokenizer.tokenize(test6)
595
+ assert result == expected
596
+
597
+ # Edge case where an @ comes directly after a long handle
598
+ test7 = "@abcdefghijklmnop@abcde @abcdefghijklmno@abcde @abcdefghijklmno_@abcde @abcdefghijklmno5@abcde"
599
+ expected = [
600
+ "p",
601
+ "@abcde",
602
+ "@abcdefghijklmno",
603
+ "@abcde",
604
+ "_",
605
+ "@abcde",
606
+ "5",
607
+ "@abcde",
608
+ ]
609
+ result = tokenizer.tokenize(test7)
610
+ assert result == expected
611
+
612
+ def test_treebank_span_tokenizer(self):
613
+ """
614
+ Test TreebankWordTokenizer.span_tokenize function
615
+ """
616
+
617
+ tokenizer = TreebankWordTokenizer()
618
+
619
+ # Test case in the docstring
620
+ test1 = "Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks)."
621
+ expected = [
622
+ (0, 4),
623
+ (5, 12),
624
+ (13, 17),
625
+ (18, 19),
626
+ (19, 23),
627
+ (24, 26),
628
+ (27, 30),
629
+ (31, 32),
630
+ (32, 36),
631
+ (36, 37),
632
+ (37, 38),
633
+ (40, 46),
634
+ (47, 48),
635
+ (48, 51),
636
+ (51, 52),
637
+ (53, 55),
638
+ (56, 59),
639
+ (60, 62),
640
+ (63, 68),
641
+ (69, 70),
642
+ (70, 76),
643
+ (76, 77),
644
+ (77, 78),
645
+ ]
646
+ result = list(tokenizer.span_tokenize(test1))
647
+ assert result == expected
648
+
649
+ # Test case with double quotation
650
+ test2 = 'The DUP is similar to the "religious right" in the United States and takes a hardline stance on social issues'
651
+ expected = [
652
+ (0, 3),
653
+ (4, 7),
654
+ (8, 10),
655
+ (11, 18),
656
+ (19, 21),
657
+ (22, 25),
658
+ (26, 27),
659
+ (27, 36),
660
+ (37, 42),
661
+ (42, 43),
662
+ (44, 46),
663
+ (47, 50),
664
+ (51, 57),
665
+ (58, 64),
666
+ (65, 68),
667
+ (69, 74),
668
+ (75, 76),
669
+ (77, 85),
670
+ (86, 92),
671
+ (93, 95),
672
+ (96, 102),
673
+ (103, 109),
674
+ ]
675
+ result = list(tokenizer.span_tokenize(test2))
676
+ assert result == expected
677
+
678
+ # Test case with double qoutation as well as converted quotations
679
+ test3 = "The DUP is similar to the \"religious right\" in the United States and takes a ``hardline'' stance on social issues"
680
+ expected = [
681
+ (0, 3),
682
+ (4, 7),
683
+ (8, 10),
684
+ (11, 18),
685
+ (19, 21),
686
+ (22, 25),
687
+ (26, 27),
688
+ (27, 36),
689
+ (37, 42),
690
+ (42, 43),
691
+ (44, 46),
692
+ (47, 50),
693
+ (51, 57),
694
+ (58, 64),
695
+ (65, 68),
696
+ (69, 74),
697
+ (75, 76),
698
+ (77, 79),
699
+ (79, 87),
700
+ (87, 89),
701
+ (90, 96),
702
+ (97, 99),
703
+ (100, 106),
704
+ (107, 113),
705
+ ]
706
+ result = list(tokenizer.span_tokenize(test3))
707
+ assert result == expected
708
+
709
+ def test_word_tokenize(self):
710
+ """
711
+ Test word_tokenize function
712
+ """
713
+
714
+ sentence = "The 'v', I've been fooled but I'll seek revenge."
715
+ expected = [
716
+ "The",
717
+ "'",
718
+ "v",
719
+ "'",
720
+ ",",
721
+ "I",
722
+ "'ve",
723
+ "been",
724
+ "fooled",
725
+ "but",
726
+ "I",
727
+ "'ll",
728
+ "seek",
729
+ "revenge",
730
+ ".",
731
+ ]
732
+ assert word_tokenize(sentence) == expected
733
+
734
+ sentence = "'v' 're'"
735
+ expected = ["'", "v", "'", "'re", "'"]
736
+ assert word_tokenize(sentence) == expected
737
+
738
+ def test_punkt_pair_iter(self):
739
+
740
+ test_cases = [
741
+ ("12", [("1", "2"), ("2", None)]),
742
+ ("123", [("1", "2"), ("2", "3"), ("3", None)]),
743
+ ("1234", [("1", "2"), ("2", "3"), ("3", "4"), ("4", None)]),
744
+ ]
745
+
746
+ for (test_input, expected_output) in test_cases:
747
+ actual_output = [x for x in punkt._pair_iter(test_input)]
748
+
749
+ assert actual_output == expected_output
750
+
751
+ def test_punkt_pair_iter_handles_stop_iteration_exception(self):
752
+ # test input to trigger StopIteration from next()
753
+ it = iter([])
754
+ # call method under test and produce a generator
755
+ gen = punkt._pair_iter(it)
756
+ # unpack generator, ensure that no error is raised
757
+ list(gen)
758
+
759
+ def test_punkt_tokenize_words_handles_stop_iteration_exception(self):
760
+ obj = punkt.PunktBaseClass()
761
+
762
+ class TestPunktTokenizeWordsMock:
763
+ def word_tokenize(self, s):
764
+ return iter([])
765
+
766
+ obj._lang_vars = TestPunktTokenizeWordsMock()
767
+ # unpack generator, ensure that no error is raised
768
+ list(obj._tokenize_words("test"))
769
+
770
+ def test_punkt_tokenize_custom_lang_vars(self):
771
+
772
+ # Create LangVars including a full stop end character as used in Bengali
773
+ class BengaliLanguageVars(punkt.PunktLanguageVars):
774
+ sent_end_chars = (".", "?", "!", "\u0964")
775
+
776
+ obj = punkt.PunktSentenceTokenizer(lang_vars=BengaliLanguageVars())
777
+
778
+ # We now expect these sentences to be split up into the individual sentences
779
+ sentences = "উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন। অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’ উপস্থিত ছিলেন। এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।"
780
+ expected = [
781
+ "উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন।",
782
+ "অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’ উপস্থিত ছিলেন।",
783
+ "এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।",
784
+ ]
785
+
786
+ assert obj.tokenize(sentences) == expected
787
+
788
+ def test_punkt_tokenize_no_custom_lang_vars(self):
789
+
790
+ obj = punkt.PunktSentenceTokenizer()
791
+
792
+ # We expect these sentences to not be split properly, as the Bengali full stop '।' is not included in the default language vars
793
+ sentences = "উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন। অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’ উপস্থিত ছিলেন। এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।"
794
+ expected = [
795
+ "উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন। অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’ উপস্থিত ছিলেন। এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।"
796
+ ]
797
+
798
+ assert obj.tokenize(sentences) == expected
799
+
800
+ @pytest.mark.parametrize(
801
+ "input_text,n_sents,n_splits,lang_vars",
802
+ [
803
+ # Test debug_decisions on a text with two sentences, split by a dot.
804
+ ("Subject: Some subject. Attachments: Some attachments", 2, 1),
805
+ # The sentence should be split into two sections,
806
+ # with one split and hence one decision.
807
+ # Test debug_decisions on a text with two sentences, split by an exclamation mark.
808
+ ("Subject: Some subject! Attachments: Some attachments", 2, 1),
809
+ # The sentence should be split into two sections,
810
+ # with one split and hence one decision.
811
+ # Test debug_decisions on a text with one sentences,
812
+ # which is not split.
813
+ ("This is just a normal sentence, just like any other.", 1, 0)
814
+ # Hence just 1
815
+ ],
816
+ )
817
+ def punkt_debug_decisions(self, input_text, n_sents, n_splits, lang_vars=None):
818
+ tokenizer = punkt.PunktSentenceTokenizer()
819
+ if lang_vars != None:
820
+ tokenizer._lang_vars = lang_vars
821
+
822
+ assert len(tokenizer.tokenize(input_text)) == n_sents
823
+ assert len(list(tokenizer.debug_decisions(input_text))) == n_splits
824
+
825
+ def test_punkt_debug_decisions_custom_end(self):
826
+ # Test debug_decisions on a text with two sentences,
827
+ # split by a custom end character, based on Issue #2519
828
+ class ExtLangVars(punkt.PunktLanguageVars):
829
+ sent_end_chars = (".", "?", "!", "^")
830
+
831
+ self.punkt_debug_decisions(
832
+ "Subject: Some subject^ Attachments: Some attachments",
833
+ n_sents=2,
834
+ n_splits=1,
835
+ lang_vars=ExtLangVars(),
836
+ )
837
+ # The sentence should be split into two sections,
838
+ # with one split and hence one decision.
839
+
840
+ @pytest.mark.parametrize(
841
+ "sentences, expected",
842
+ [
843
+ (
844
+ "this is a test. . new sentence.",
845
+ ["this is a test.", ".", "new sentence."],
846
+ ),
847
+ ("This. . . That", ["This.", ".", ".", "That"]),
848
+ ("This..... That", ["This..... That"]),
849
+ ("This... That", ["This... That"]),
850
+ ("This.. . That", ["This.. .", "That"]),
851
+ ("This. .. That", ["This.", ".. That"]),
852
+ ("This. ,. That", ["This.", ",.", "That"]),
853
+ ("This!!! That", ["This!!!", "That"]),
854
+ ("This! That", ["This!", "That"]),
855
+ (
856
+ "1. This is R .\n2. This is A .\n3. That's all",
857
+ ["1.", "This is R .", "2.", "This is A .", "3.", "That's all"],
858
+ ),
859
+ (
860
+ "1. This is R .\t2. This is A .\t3. That's all",
861
+ ["1.", "This is R .", "2.", "This is A .", "3.", "That's all"],
862
+ ),
863
+ ("Hello.\tThere", ["Hello.", "There"]),
864
+ ],
865
+ )
866
+ def test_sent_tokenize(self, sentences: str, expected: List[str]):
867
+ assert sent_tokenize(sentences) == expected
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_twitter_auth.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tests for static parts of Twitter package
3
+ """
4
+
5
+ import os
6
+
7
+ import pytest
8
+
9
+ pytest.importorskip("twython")
10
+
11
+ from nltk.twitter import Authenticate
12
+
13
+
14
+ @pytest.fixture
15
+ def auth():
16
+ return Authenticate()
17
+
18
+
19
+ class TestCredentials:
20
+ """
21
+ Tests that Twitter credentials from a file are handled correctly.
22
+ """
23
+
24
+ @classmethod
25
+ def setup_class(self):
26
+ self.subdir = os.path.join(os.path.dirname(__file__), "files")
27
+ os.environ["TWITTER"] = "twitter-files"
28
+
29
+ def test_environment(self, auth):
30
+ """
31
+ Test that environment variable has been read correctly.
32
+ """
33
+ fn = os.path.basename(auth.creds_subdir)
34
+ assert fn == os.environ["TWITTER"]
35
+
36
+ @pytest.mark.parametrize(
37
+ "kwargs",
38
+ [
39
+ # Each of the following scenarios should raise an error:
40
+ # An empty subdir path
41
+ {"subdir": ""},
42
+ # A subdir path of None
43
+ {"subdir": None},
44
+ # A nonexistent directory
45
+ {"subdir": "/nosuchdir"},
46
+ # 'credentials.txt' is not in default subdir, as read from `os.environ['TWITTER']`
47
+ {},
48
+ # Nonexistent credentials file ('foobar')
49
+ {"creds_file": "foobar"},
50
+ # 'bad_oauth1-1.txt' is incomplete
51
+ {"creds_file": "bad_oauth1-1.txt"},
52
+ # The first key in credentials file 'bad_oauth1-2.txt' is ill-formed
53
+ {"creds_file": "bad_oauth1-2.txt"},
54
+ # The first two lines in 'bad_oauth1-3.txt' are collapsed
55
+ {"creds_file": "bad_oauth1-3.txt"},
56
+ ],
57
+ )
58
+ def test_scenarios_that_should_raise_errors(self, kwargs, auth):
59
+ """Various scenarios that should raise errors"""
60
+ try:
61
+ auth.load_creds(**kwargs)
62
+ # raises ValueError (zero length field name in format) for python 2.6
63
+ # OSError for the rest
64
+ except (OSError, ValueError):
65
+ pass
66
+ except Exception as e:
67
+ pytest.fail("Unexpected exception thrown: %s" % e)
68
+ else:
69
+ pytest.fail("OSError exception not thrown.")
70
+
71
+ def test_correct_file(self, auth):
72
+ """Test that a proper file succeeds and is read correctly"""
73
+ oauth = auth.load_creds(subdir=self.subdir)
74
+
75
+ assert auth.creds_fullpath == os.path.join(self.subdir, auth.creds_file)
76
+ assert auth.creds_file == "credentials.txt"
77
+ assert oauth["app_key"] == "a"
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_util.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+
3
+ from nltk.util import everygrams
4
+
5
+
6
@pytest.fixture
def everygram_input():
    """Supply the tokens 'a', 'b', 'c' as a one-shot iterator."""
    return iter(("a", "b", "c"))
10
+
11
+
12
def test_everygrams_without_padding(everygram_input):
    """All n-grams of every length, in position order, with no padding."""
    assert list(everygrams(everygram_input)) == [
        ("a",),
        ("a", "b"),
        ("a", "b", "c"),
        ("b",),
        ("b", "c"),
        ("c",),
    ]
23
+
24
+
25
def test_everygrams_max_len(everygram_input):
    """max_len=2 drops the single trigram from the output."""
    assert list(everygrams(everygram_input, max_len=2)) == [
        ("a",),
        ("a", "b"),
        ("b",),
        ("b", "c"),
        ("c",),
    ]
35
+
36
+
37
def test_everygrams_min_len(everygram_input):
    """min_len=2 drops all unigrams from the output."""
    assert list(everygrams(everygram_input, min_len=2)) == [
        ("a", "b"),
        ("a", "b", "c"),
        ("b", "c"),
    ]
45
+
46
+
47
def test_everygrams_pad_right(everygram_input):
    """pad_right=True appends max_len-1 None symbols before gram extraction."""
    # Effective sequence: a b c None None.
    assert list(everygrams(everygram_input, max_len=3, pad_right=True)) == [
        ("a",),
        ("a", "b"),
        ("a", "b", "c"),
        ("b",),
        ("b", "c"),
        ("b", "c", None),
        ("c",),
        ("c", None),
        ("c", None, None),
        (None,),
        (None, None),
        (None,),
    ]
64
+
65
+
66
def test_everygrams_pad_left(everygram_input):
    """pad_left=True prepends max_len-1 None symbols before gram extraction."""
    # Effective sequence: None None a b c.
    assert list(everygrams(everygram_input, max_len=3, pad_left=True)) == [
        (None,),
        (None, None),
        (None, None, "a"),
        (None,),
        (None, "a"),
        (None, "a", "b"),
        ("a",),
        ("a", "b"),
        ("a", "b", "c"),
        ("b",),
        ("b", "c"),
        ("c",),
    ]
.eggs/nltk-3.8-py3.10.egg/nltk/test/unit/test_wordnet.py ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Unit tests for nltk.corpus.wordnet
3
+ See also nltk/test/wordnet.doctest
4
+ """
5
+ import unittest
6
+
7
+ from nltk.corpus import wordnet as wn
8
+ from nltk.corpus import wordnet_ic as wnic
9
+
10
+ wn.ensure_loaded()
11
+ S = wn.synset
12
+ L = wn.lemma
13
+
14
+
15
+ class WordnNetDemo(unittest.TestCase):
16
+ def test_retrieve_synset(self):
17
+ move_synset = S("go.v.21")
18
+ self.assertEqual(move_synset.name(), "move.v.15")
19
+ self.assertEqual(move_synset.lemma_names(), ["move", "go"])
20
+ self.assertEqual(
21
+ move_synset.definition(), "have a turn; make one's move in a game"
22
+ )
23
+ self.assertEqual(move_synset.examples(), ["Can I go now?"])
24
+
25
+ def test_retrieve_synsets(self):
26
+ self.assertEqual(sorted(wn.synsets("zap", pos="n")), [S("zap.n.01")])
27
+ self.assertEqual(
28
+ sorted(wn.synsets("zap", pos="v")),
29
+ [S("microwave.v.01"), S("nuke.v.01"), S("zap.v.01"), S("zap.v.02")],
30
+ )
31
+
32
+ def test_hyperhyponyms(self):
33
+ # Not every synset as hypernyms()
34
+ self.assertEqual(S("travel.v.01").hypernyms(), [])
35
+ self.assertEqual(S("travel.v.02").hypernyms(), [S("travel.v.03")])
36
+ self.assertEqual(S("travel.v.03").hypernyms(), [])
37
+
38
+ # Test hyper-/hyponyms.
39
+ self.assertEqual(S("breakfast.n.1").hypernyms(), [S("meal.n.01")])
40
+ first_five_meal_hypo = [
41
+ S("banquet.n.02"),
42
+ S("bite.n.04"),
43
+ S("breakfast.n.01"),
44
+ S("brunch.n.01"),
45
+ S("buffet.n.02"),
46
+ ]
47
+ self.assertEqual(sorted(S("meal.n.1").hyponyms()[:5]), first_five_meal_hypo)
48
+ self.assertEqual(S("Austen.n.1").instance_hypernyms(), [S("writer.n.01")])
49
+ first_five_composer_hypo = [
50
+ S("ambrose.n.01"),
51
+ S("bach.n.01"),
52
+ S("barber.n.01"),
53
+ S("bartok.n.01"),
54
+ S("beethoven.n.01"),
55
+ ]
56
+ self.assertEqual(
57
+ S("composer.n.1").instance_hyponyms()[:5], first_five_composer_hypo
58
+ )
59
+
60
+ # Test root hyper-/hyponyms
61
+ self.assertEqual(S("person.n.01").root_hypernyms(), [S("entity.n.01")])
62
+ self.assertEqual(S("sail.v.01").root_hypernyms(), [S("travel.v.01")])
63
+ self.assertEqual(
64
+ S("fall.v.12").root_hypernyms(), [S("act.v.01"), S("fall.v.17")]
65
+ )
66
+
67
+ def test_derivationally_related_forms(self):
68
+ # Test `derivationally_related_forms()`
69
+ self.assertEqual(
70
+ L("zap.v.03.nuke").derivationally_related_forms(),
71
+ [L("atomic_warhead.n.01.nuke")],
72
+ )
73
+ self.assertEqual(
74
+ L("zap.v.03.atomize").derivationally_related_forms(),
75
+ [L("atomization.n.02.atomization")],
76
+ )
77
+ self.assertEqual(
78
+ L("zap.v.03.atomise").derivationally_related_forms(),
79
+ [L("atomization.n.02.atomisation")],
80
+ )
81
+ self.assertEqual(L("zap.v.03.zap").derivationally_related_forms(), [])
82
+
83
+ def test_meronyms_holonyms(self):
84
+ # Test meronyms, holonyms.
85
+ self.assertEqual(
86
+ S("dog.n.01").member_holonyms(), [S("canis.n.01"), S("pack.n.06")]
87
+ )
88
+ self.assertEqual(S("dog.n.01").part_meronyms(), [S("flag.n.07")])
89
+
90
+ self.assertEqual(S("faculty.n.2").member_meronyms(), [S("professor.n.01")])
91
+ self.assertEqual(S("copilot.n.1").member_holonyms(), [S("crew.n.01")])
92
+
93
+ self.assertEqual(
94
+ S("table.n.2").part_meronyms(),
95
+ [S("leg.n.03"), S("tabletop.n.01"), S("tableware.n.01")],
96
+ )
97
+ self.assertEqual(S("course.n.7").part_holonyms(), [S("meal.n.01")])
98
+
99
+ self.assertEqual(
100
+ S("water.n.1").substance_meronyms(), [S("hydrogen.n.01"), S("oxygen.n.01")]
101
+ )
102
+ self.assertEqual(
103
+ S("gin.n.1").substance_holonyms(),
104
+ [
105
+ S("gin_and_it.n.01"),
106
+ S("gin_and_tonic.n.01"),
107
+ S("martini.n.01"),
108
+ S("pink_lady.n.01"),
109
+ ],
110
+ )
111
+
112
+ def test_antonyms(self):
113
+ # Test antonyms.
114
+ self.assertEqual(
115
+ L("leader.n.1.leader").antonyms(), [L("follower.n.01.follower")]
116
+ )
117
+ self.assertEqual(
118
+ L("increase.v.1.increase").antonyms(), [L("decrease.v.01.decrease")]
119
+ )
120
+
121
+ def test_misc_relations(self):
122
+ # Test misc relations.
123
+ self.assertEqual(S("snore.v.1").entailments(), [S("sleep.v.01")])
124
+ self.assertEqual(
125
+ S("heavy.a.1").similar_tos(),
126
+ [
127
+ S("dense.s.03"),
128
+ S("doughy.s.01"),
129
+ S("heavier-than-air.s.01"),
130
+ S("hefty.s.02"),
131
+ S("massive.s.04"),
132
+ S("non-buoyant.s.01"),
133
+ S("ponderous.s.02"),
134
+ ],
135
+ )
136
+ self.assertEqual(S("light.a.1").attributes(), [S("weight.n.01")])
137
+ self.assertEqual(S("heavy.a.1").attributes(), [S("weight.n.01")])
138
+
139
+ # Test pertainyms.
140
+ self.assertEqual(
141
+ L("English.a.1.English").pertainyms(), [L("england.n.01.England")]
142
+ )
143
+
144
+ def test_lch(self):
145
+ # Test LCH.
146
+ self.assertEqual(
147
+ S("person.n.01").lowest_common_hypernyms(S("dog.n.01")),
148
+ [S("organism.n.01")],
149
+ )
150
+ self.assertEqual(
151
+ S("woman.n.01").lowest_common_hypernyms(S("girlfriend.n.02")),
152
+ [S("woman.n.01")],
153
+ )
154
+
155
+ def test_domains(self):
156
+ # Test domains.
157
+ self.assertEqual(S("code.n.03").topic_domains(), [S("computer_science.n.01")])
158
+ self.assertEqual(S("pukka.a.01").region_domains(), [S("india.n.01")])
159
+ self.assertEqual(S("freaky.a.01").usage_domains(), [S("slang.n.02")])
160
+
161
+ def test_in_topic_domains(self):
162
+ # Test in domains.
163
+ self.assertEqual(
164
+ S("computer_science.n.01").in_topic_domains()[0], S("access.n.05")
165
+ )
166
+ self.assertEqual(S("germany.n.01").in_region_domains()[23], S("trillion.n.02"))
167
+ self.assertEqual(S("slang.n.02").in_usage_domains()[1], S("airhead.n.01"))
168
+
169
+ def test_wordnet_similarities(self):
170
+ # Path based similarities.
171
+ self.assertAlmostEqual(S("cat.n.01").path_similarity(S("cat.n.01")), 1.0)
172
+ self.assertAlmostEqual(S("dog.n.01").path_similarity(S("cat.n.01")), 0.2)
173
+ self.assertAlmostEqual(
174
+ S("car.n.01").path_similarity(S("automobile.v.01")),
175
+ S("automobile.v.01").path_similarity(S("car.n.01")),
176
+ )
177
+ self.assertAlmostEqual(
178
+ S("big.a.01").path_similarity(S("dog.n.01")),
179
+ S("dog.n.01").path_similarity(S("big.a.01")),
180
+ )
181
+ self.assertAlmostEqual(
182
+ S("big.a.01").path_similarity(S("long.a.01")),
183
+ S("long.a.01").path_similarity(S("big.a.01")),
184
+ )
185
+ self.assertAlmostEqual(
186
+ S("dog.n.01").lch_similarity(S("cat.n.01")), 2.028, places=3
187
+ )
188
+ self.assertAlmostEqual(
189
+ S("dog.n.01").wup_similarity(S("cat.n.01")), 0.8571, places=3
190
+ )
191
+ self.assertAlmostEqual(
192
+ S("car.n.01").wup_similarity(S("automobile.v.01")),
193
+ S("automobile.v.01").wup_similarity(S("car.n.01")),
194
+ )
195
+ self.assertAlmostEqual(
196
+ S("big.a.01").wup_similarity(S("dog.n.01")),
197
+ S("dog.n.01").wup_similarity(S("big.a.01")),
198
+ )
199
+ self.assertAlmostEqual(
200
+ S("big.a.01").wup_similarity(S("long.a.01")),
201
+ S("long.a.01").wup_similarity(S("big.a.01")),
202
+ )
203
+ self.assertAlmostEqual(
204
+ S("big.a.01").lch_similarity(S("long.a.01")),
205
+ S("long.a.01").lch_similarity(S("big.a.01")),
206
+ )
207
+ # Information Content similarities.
208
+ brown_ic = wnic.ic("ic-brown.dat")
209
+ self.assertAlmostEqual(
210
+ S("dog.n.01").jcn_similarity(S("cat.n.01"), brown_ic), 0.4497, places=3
211
+ )
212
+ semcor_ic = wnic.ic("ic-semcor.dat")
213
+ self.assertAlmostEqual(
214
+ S("dog.n.01").lin_similarity(S("cat.n.01"), semcor_ic), 0.8863, places=3
215
+ )
216
+
217
+ def test_omw_lemma_no_trailing_underscore(self):
218
+ expected = sorted(
219
+ [
220
+ "popolna_sprememba_v_mišljenju",
221
+ "popoln_obrat",
222
+ "preobrat",
223
+ "preobrat_v_mišljenju",
224
+ ]
225
+ )
226
+ self.assertEqual(sorted(S("about-face.n.02").lemma_names(lang="slv")), expected)
227
+
228
+ def test_iterable_type_for_all_lemma_names(self):
229
+ # Duck-test for iterables.
230
+ # See https://stackoverflow.com/a/36230057/610569
231
+ cat_lemmas = wn.all_lemma_names(lang="cat")
232
+ eng_lemmas = wn.all_lemma_names(lang="eng")
233
+
234
+ self.assertTrue(hasattr(eng_lemmas, "__iter__"))
235
+ self.assertTrue(hasattr(eng_lemmas, "__next__") or hasattr(eng_lemmas, "next"))
236
+ self.assertTrue(eng_lemmas.__iter__() is eng_lemmas)
237
+
238
+ self.assertTrue(hasattr(cat_lemmas, "__iter__"))
239
+ self.assertTrue(hasattr(cat_lemmas, "__next__") or hasattr(eng_lemmas, "next"))
240
+ self.assertTrue(cat_lemmas.__iter__() is cat_lemmas)
build/lib/opencompass/configs/dataset_collections/chat_OC15.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# OC15 dataset collection for chat models: aggregates the standard
# OpenCompass suites covering knowledge, language understanding, reasoning,
# math, code, and instruction following.
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets
    from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
    from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
    from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import GaokaoBench_datasets
    from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import triviaqa_datasets
    from opencompass.configs.datasets.nq.nq_open_1shot_gen_2e45e5 import nq_datasets
    from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
    from opencompass.configs.datasets.winogrande.winogrande_5shot_gen_b36770 import winogrande_datasets
    from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import hellaswag_datasets
    from opencompass.configs.datasets.bbh.bbh_gen_2879b0 import bbh_datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
    from opencompass.configs.datasets.math.math_0shot_gen_393424 import math_datasets
    from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import TheoremQA_datasets
    from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
    from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_830460 import sanitized_mbpp_datasets
    from opencompass.configs.datasets.gpqa.gpqa_gen_4baadb import gpqa_datasets
    from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets

# Flatten every imported `*_datasets` list into the single `datasets` list
# that OpenCompass consumes.
datasets = [
    item
    for key, value in locals().items() if key.endswith('_datasets')
    for item in value
]
build/lib/opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# Default FewCLUE/bustm config: forward to the pinned implementation.
from mmengine.config import read_base

with read_base():
    from .FewCLUE_bustm_gen_634f41 import bustm_datasets  # noqa: F401, F403
build/lib/opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_ppl_8f2872.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CHIDDataset

# Each sample carries 7 candidate completions (content0..content6); PPL
# inference picks the candidate whose "this sentence is fluent" dialogue
# scores best.
chid_reader_cfg = dict(
    input_columns=[f'content{i}' for i in range(7)],
    output_column='answer',
)

chid_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            idx: dict(
                round=[
                    dict(role='HUMAN', prompt=f'以下句子是否通顺?\n{{content{idx}}}'),
                    dict(role='BOT', prompt='这个句子是通顺的。'),
                ],
            )
            for idx in range(7)
        },
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer),
)

chid_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

# dev/test share every setting except the abbr and the source file.
chid_datasets = [
    dict(
        type=CHIDDataset,
        path='json',
        abbr=abbr,
        data_files=data_files,
        split='train',
        reader_cfg=chid_reader_cfg,
        infer_cfg=chid_infer_cfg,
        eval_cfg=chid_eval_cfg,
    )
    for abbr, data_files in [
        ('chid-dev', './data/FewCLUE/chid/dev_few_all.json'),
        ('chid-test', './data/FewCLUE/chid/test_public.json'),
    ]
]
build/lib/opencompass/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_gen.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# Default FewCLUE/cluewsc config: forward to the pinned implementation.
from mmengine.config import read_base

with read_base():
    from .FewCLUE_cluewsc_gen_c68933 import cluewsc_datasets  # noqa: F401, F403
build/lib/opencompass/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# Default FewCLUE/csl config: forward to the pinned implementation.
from mmengine.config import read_base

with read_base():
    from .FewCLUE_csl_gen_28b223 import csl_datasets  # noqa: F401, F403
build/lib/opencompass/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen_87f4a8.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CslDatasetV2
from opencompass.utils.text_postprocessors import first_capital_postprocess

csl_reader_cfg = dict(
    input_columns=['abst', 'keywords'],
    output_column='label',
)

# Binary choice: are the given keywords appropriate for the abstract?
csl_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='摘要:{abst}\n关键词:{keywords}\n上述关键词出现在学术期刊中是否恰当?\nA. 否\nB. 是\n请从”A“,”B“中进行选择。\n答:',
                ),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

# Grade by accuracy after extracting the first capital letter (A/B).
csl_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=first_capital_postprocess),
)

csl_datasets = [
    dict(
        abbr=abbr,
        type=CslDatasetV2,
        path=path,
        reader_cfg=csl_reader_cfg,
        infer_cfg=csl_infer_cfg,
        eval_cfg=csl_eval_cfg,
    )
    for abbr, path in [
        ('csl_dev', './data/FewCLUE/csl/dev_few_all.json'),
        ('csl_test', './data/FewCLUE/csl/test_public.json'),
    ]
]
build/lib/opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_gen.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# Default FewCLUE/eprstmt gen config: forward to the pinned implementation.
from mmengine.config import read_base

with read_base():
    from .FewCLUE_eprstmt_gen_740ea0 import eprstmt_datasets  # noqa: F401, F403
build/lib/opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_gen_740ea0.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import EprstmtDatasetV2
from opencompass.utils.text_postprocessors import first_capital_postprocess

eprstmt_reader_cfg = dict(
    input_columns=['sentence'],
    output_column='label',
    test_split='train',
)

# Binary sentiment classification phrased as an A/B multiple choice.
eprstmt_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='内容: "{sentence}"。请对上述内容进行情绪分类。\nA. 积极\nB. 消极\n请从”A“,”B“中进行选择。\n答:',
                ),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

# Grade by accuracy after extracting the first capital letter (A/B).
eprstmt_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=first_capital_postprocess),
)

eprstmt_datasets = [
    dict(
        abbr=abbr,
        type=EprstmtDatasetV2,
        path=path,
        reader_cfg=eprstmt_reader_cfg,
        infer_cfg=eprstmt_infer_cfg,
        eval_cfg=eprstmt_eval_cfg,
    )
    for abbr, path in [
        ('eprstmt-dev', './data/FewCLUE/eprstmt/dev_few_all.json'),
        ('eprstmt-test', './data/FewCLUE/eprstmt/test_public.json'),
    ]
]
build/lib/opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_ppl.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# Default FewCLUE/eprstmt ppl config: forward to the pinned implementation.
from mmengine.config import read_base

with read_base():
    from .FewCLUE_eprstmt_ppl_f1e631 import eprstmt_datasets  # noqa: F401, F403
build/lib/opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_ppl_f1e631.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset

eprstmt_reader_cfg = dict(
    input_columns=['sentence'],
    output_column='label',
    test_split='train',
)

# PPL scoring: one dialogue template per label; the lower-perplexity
# continuation wins.
eprstmt_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            label: dict(
                round=[
                    dict(role='HUMAN', prompt='内容: "{sentence}"。情绪分类:'),
                    dict(role='BOT', prompt=completion),
                ]
            )
            for label, completion in [('Negative', '消极。'), ('Positive', '积极。')]
        },
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer),
)

eprstmt_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

eprstmt_datasets = [
    dict(
        type=HFDataset,
        abbr=abbr,
        path='json',
        data_files=data_files,
        split='train',
        reader_cfg=eprstmt_reader_cfg,
        infer_cfg=eprstmt_infer_cfg,
        eval_cfg=eprstmt_eval_cfg,
    )
    for abbr, data_files in [
        ('eprstmt-dev', './data/FewCLUE/eprstmt/dev_few_all.json'),
        ('eprstmt-test', './data/FewCLUE/eprstmt/test_public.json'),
    ]
]
build/lib/opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_ppl.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# Default FewCLUE/ocnli_fc ppl config: forward to the pinned implementation.
from mmengine.config import read_base

with read_base():
    from .FewCLUE_ocnli_fc_ppl_c08300 import ocnli_fc_datasets  # noqa: F401, F403
build/lib/opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_ppl_9e8b3d.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset

ocnli_fc_reader_cfg = dict(
    input_columns=['sentence1', 'sentence2'],
    output_column='label',
    test_split='train',
)

# NLI via perplexity ranking: one dialogue template per label.
# NOTE: 'neutral' deliberately uses a different question form.
ocnli_fc_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            'contradiction': dict(
                round=[
                    dict(
                        role='HUMAN',
                        prompt='阅读文章:{sentence1}\n根据上文,回答如下问题:{sentence2}?',
                    ),
                    dict(role='BOT', prompt='错'),
                ]
            ),
            'entailment': dict(
                round=[
                    dict(
                        role='HUMAN',
                        prompt='阅读文章:{sentence1}\n根据上文,回答如下问题:{sentence2}?',
                    ),
                    dict(role='BOT', prompt='对'),
                ]
            ),
            'neutral': dict(
                round=[
                    dict(
                        role='HUMAN',
                        prompt='如果{sentence1}为真,那么{sentence2}也为真吗?',
                    ),
                    dict(role='BOT', prompt='可能'),
                ]
            ),
        },
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer),
)
ocnli_fc_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

ocnli_fc_datasets = [
    dict(
        type=HFDataset,
        abbr=abbr,
        path='json',
        split='train',
        data_files=data_files,
        reader_cfg=ocnli_fc_reader_cfg,
        infer_cfg=ocnli_fc_infer_cfg,
        eval_cfg=ocnli_fc_eval_cfg,
    )
    for abbr, data_files in [
        ('ocnli_fc-dev', './data/FewCLUE/ocnli/dev_few_all.json'),
        ('ocnli_fc-test', './data/FewCLUE/ocnli/test_public.json'),
    ]
]
build/lib/opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_ppl_c08300.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset

ocnli_fc_reader_cfg = dict(
    input_columns=['sentence1', 'sentence2'],
    output_column='label',
    test_split='train',
)

# Plain-string (non-dialogue) PPL templates, one per NLI label.
ocnli_fc_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            'contradiction': '阅读文章:{sentence1}\n根据上文,回答如下问题: {sentence2}?\n答:错',
            'entailment': '阅读文章:{sentence1}\n根据上文,回答如下问题: {sentence2}?\n答:对',
            'neutral': '如果{sentence1}为真,那么{sentence2}也为真吗?可能',
        },
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer),
)
ocnli_fc_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

ocnli_fc_datasets = [
    dict(
        type=HFDataset,
        abbr=abbr,
        path='json',
        split='train',
        data_files=data_files,
        reader_cfg=ocnli_fc_reader_cfg,
        infer_cfg=ocnli_fc_infer_cfg,
        eval_cfg=ocnli_fc_eval_cfg,
    )
    for abbr, data_files in [
        ('ocnli_fc-dev', './data/FewCLUE/ocnli/dev_few_all.json'),
        ('ocnli_fc-test', './data/FewCLUE/ocnli/test_public.json'),
    ]
]
build/lib/opencompass/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_gen.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# Default FewCLUE/tnews gen config: forward to the pinned implementation.
from mmengine.config import read_base

with read_base():
    from .FewCLUE_tnews_gen_b90e4a import tnews_datasets  # noqa: F401, F403
build/lib/opencompass/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_ppl.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# Default FewCLUE/tnews ppl config: forward to the pinned implementation.
from mmengine.config import read_base

with read_base():
    from .FewCLUE_tnews_ppl_d10e8a import tnews_datasets  # noqa: F401, F403
build/lib/opencompass/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_ppl_7d1c07.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import TNewsDataset

tnews_reader_cfg = dict(input_columns='sentence', output_column='label_desc2')

# The 15 TNEWS category descriptions used as PPL continuation candidates.
tnews_labels = [
    '农业新闻', '旅游新闻', '游戏新闻', '科技类别公司新闻', '体育类别新闻', '初升高教育新闻', '娱乐圈新闻', '投资资讯',
    '军事类别常识', '车辆新闻', '楼市新闻', '环球不含中国类别新闻', '书籍文化历史类别新闻', '故事类别新闻', '股票市场类别新闻'
]

# Plain-string templates: sentence followed by each candidate category.
tnews_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            label: f'{{sentence}}这篇新闻属于:{label}'
            for label in tnews_labels
        },
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer),
)

tnews_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

tnews_datasets = [
    dict(
        type=TNewsDataset,
        path='json',
        abbr=abbr,
        data_files=data_files,
        split='train',
        reader_cfg=tnews_reader_cfg,
        infer_cfg=tnews_infer_cfg,
        eval_cfg=tnews_eval_cfg,
    )
    for abbr, data_files in [
        ('tnews-dev', './data/FewCLUE/tnews/dev_few_all.json'),
        ('tnews-test', './data/FewCLUE/tnews/test_public.json'),
    ]
]
build/lib/opencompass/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_ppl_d10e8a.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import TNewsDataset

tnews_reader_cfg = dict(input_columns='sentence', output_column='label_desc2')

# The 15 TNEWS category descriptions used as PPL continuation candidates.
tnews_labels = [
    '农业新闻', '旅游新闻', '游戏新闻', '科技类别公司新闻', '体育类别新闻', '初升高教育新闻', '娱乐圈新闻', '投资资讯',
    '军事类别常识', '车辆新闻', '楼市新闻', '环球不含中国类别新闻', '书籍文化历史类别新闻', '故事类别新闻', '股票市场类别新闻'
]

# Dialogue-style templates: the BOT answer is the candidate category.
tnews_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            label: dict(
                round=[
                    dict(role='HUMAN', prompt='{sentence}\n上述内容属于什么新闻?'),
                    dict(role='BOT', prompt=label),
                ]
            )
            for label in tnews_labels
        },
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer),
)

tnews_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

tnews_datasets = [
    dict(
        type=TNewsDataset,
        path='json',
        abbr=abbr,
        data_files=data_files,
        split='train',
        reader_cfg=tnews_reader_cfg,
        infer_cfg=tnews_infer_cfg,
        eval_cfg=tnews_eval_cfg,
    )
    for abbr, data_files in [
        ('tnews-dev', './data/FewCLUE/tnews/dev_few_all.json'),
        ('tnews-test', './data/FewCLUE/tnews/test_public.json'),
    ]
]
build/lib/opencompass/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_ppl_fff486.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import TNewsDataset

tnews_reader_cfg = dict(input_columns='sentence', output_column='label_desc2')

# The 15 TNEWS category descriptions used as PPL continuation candidates.
tnews_labels = [
    '农业新闻', '旅游新闻', '游戏新闻', '科技类别公司新闻', '体育类别新闻', '初升高教育新闻', '娱乐圈新闻', '投资资讯',
    '军事类别常识', '车辆新闻', '楼市新闻', '环球不含中国类别新闻', '书籍文化历史类别新闻', '故事类别新闻', '股票市场类别新闻'
]

# Dialogue-style templates with the question phrased before the sentence.
tnews_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            label: dict(
                round=[
                    dict(role='HUMAN', prompt='以下内容属于什么新闻:{sentence}。'),
                    dict(role='BOT', prompt=label),
                ]
            )
            for label in tnews_labels
        },
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer),
)

tnews_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

tnews_datasets = [
    dict(
        type=TNewsDataset,
        path='json',
        abbr=abbr,
        data_files=data_files,
        split='train',
        reader_cfg=tnews_reader_cfg,
        infer_cfg=tnews_infer_cfg,
        eval_cfg=tnews_eval_cfg,
    )
    for abbr, data_files in [
        ('tnews-dev', './data/FewCLUE/tnews/dev_few_all.json'),
        ('tnews-test', './data/FewCLUE/tnews/test_public.json'),
    ]
]
build/lib/opencompass/configs/datasets/FinanceIQ/FinanceIQ_gen.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# Default FinanceIQ gen config: forward to the pinned implementation.
from mmengine.config import read_base

with read_base():
    from .FinanceIQ_gen_e0e6b5 import financeIQ_datasets  # noqa: F401, F403
build/lib/opencompass/configs/datasets/FinanceIQ/FinanceIQ_gen_e0e6b5.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import FinanceIQDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess

# English-key view of the subjects (kept for reference; not used below).
financeIQ_subject_mapping_en = {
    'certified_public_accountant': '注册会计师(CPA)',
    'banking_qualification': '银行从业资格',
    'securities_qualification': '证券从业资格',
    'fund_qualification': '基金从业资格',
    'insurance_qualification': '保险从业资格CICE',
    'economic_analyst': '经济师',
    'taxation_practitioner': '税务师',
    'futures_qualification': '期货从业资格',
    'certified_fin_planner': '理财规划师',
    'actuary_fin_math': '精算师-金融数学',
}

# On-disk subset names are the Chinese subject names themselves, hence
# the identity mapping.
financeIQ_subject_mapping = {
    '注册会计师(CPA)': '注册会计师(CPA)',
    '银行从业资格': '银行从业资格',
    '证券从业资格': '证券从业资格',
    '基金从业资格': '基金从业资格',
    '保险从业资格CICE': '保险从业资格CICE',
    '经济师': '经济师',
    '税务师': '税务师',
    '期货从业资格': '期货从业资格',
    '理财规划师': '理财规划师',
    '精算师-金融数学': '精算师-金融数学',
}

financeIQ_all_sets = list(financeIQ_subject_mapping.keys())

financeIQ_datasets = []
for _name in financeIQ_all_sets:
    _ch_name = financeIQ_subject_mapping[_name]

    # 5-shot (fixed exemplars 0-4) multiple-choice generation per subject.
    financeIQ_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=dict(
                begin='</E>',
                round=[
                    dict(
                        role='HUMAN',
                        prompt=f'以下是关于{_ch_name}的单项选择题,请直接给出正确答案的选项。\n题目:{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}',
                    ),
                    dict(role='BOT', prompt='答案是: {answer}'),
                ],
            ),
            ice_token='</E>',
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
        inferencer=dict(type=GenInferencer),
    )

    # Grade by accuracy after extracting the first capital letter (A-D).
    financeIQ_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_postprocessor=dict(type=first_capital_postprocess),
    )

    financeIQ_datasets.append(
        dict(
            type=FinanceIQDataset,
            path='./data/FinanceIQ/',
            name=_name,
            abbr=f'FinanceIQ-{_name}',
            reader_cfg=dict(
                input_columns=['question', 'A', 'B', 'C', 'D'],
                output_column='answer',
                train_split='dev',
                test_split='test',
            ),
            infer_cfg=financeIQ_infer_cfg,
            eval_cfg=financeIQ_eval_cfg,
        )
    )

del _name, _ch_name
build/lib/opencompass/configs/datasets/FinanceIQ/FinanceIQ_ppl.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# Default FinanceIQ ppl config: forward to the pinned implementation.
from mmengine.config import read_base

with read_base():
    from .FinanceIQ_ppl_42b9bd import financeIQ_datasets  # noqa: F401, F403
build/lib/opencompass/configs/datasets/FinanceIQ/FinanceIQ_ppl_42b9bd.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import FinanceIQDataset

# English-key view of the subjects (kept for reference; not used below).
financeIQ_subject_mapping_en = {
    'certified_public_accountant': '注册会计师(CPA)',
    'banking_qualification': '银行从业资格',
    'securities_qualification': '证券从业资格',
    'fund_qualification': '基金从业资格',
    'insurance_qualification': '保险从业资格CICE',
    'economic_analyst': '经济师',
    'taxation_practitioner': '税务师',
    'futures_qualification': '期货从业资格',
    'certified_fin_planner': '理财规划师',
    'actuary_fin_math': '精算师-金融数学',
}

# On-disk subset names are the Chinese subject names themselves, hence
# the identity mapping.
financeIQ_subject_mapping = {
    '注册会计师(CPA)': '注册会计师(CPA)',
    '银行从业资格': '银行从业资格',
    '证券从业资格': '证券从业资格',
    '基金从业资格': '基金从业资格',
    '保险从业资格CICE': '保险从业资格CICE',
    '经济师': '经济师',
    '税务师': '税务师',
    '期货从业资格': '期货从业资格',
    '理财规划师': '理财规划师',
    '精算师-金融数学': '精算师-金融数学',
}

financeIQ_all_sets = list(financeIQ_subject_mapping.keys())

financeIQ_datasets = []
for _name in financeIQ_all_sets:
    _ch_name = financeIQ_subject_mapping[_name]

    # PPL scoring: one 5-shot template per answer letter; the lowest
    # perplexity option is taken as the prediction.
    financeIQ_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template={
                answer: dict(
                    begin='</E>',
                    round=[
                        dict(
                            role='HUMAN',
                            prompt=f'以下是关于{_ch_name}的单项选择题,请直接给出正确答案的选项。\n题目:{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}',
                        ),
                        dict(role='BOT', prompt=f'答案是: {answer}'),
                    ],
                )
                for answer in ['A', 'B', 'C', 'D']
            },
            ice_token='</E>',
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
        inferencer=dict(type=PPLInferencer),
    )

    financeIQ_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

    financeIQ_datasets.append(
        dict(
            type=FinanceIQDataset,
            path='./data/FinanceIQ/',
            name=_name,
            abbr=f'FinanceIQ-{_name}',
            reader_cfg=dict(
                input_columns=['question', 'A', 'B', 'C', 'D'],
                output_column='answer',
                train_split='dev',
                test_split='test',
            ),
            infer_cfg=financeIQ_infer_cfg,
            eval_cfg=financeIQ_eval_cfg,
        )
    )

del _name, _ch_name
build/lib/opencompass/configs/datasets/GLUE_CoLA/GLUE_CoLA_ppl.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# Default GLUE/CoLA ppl config: forward to the pinned implementation.
from mmengine.config import read_base

with read_base():
    from .GLUE_CoLA_ppl_77d0df import CoLA_datasets  # noqa: F401, F403
build/lib/opencompass/configs/datasets/GLUE_CoLA/GLUE_CoLA_ppl_77d0df.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset


# Task instruction prefixed to every scored prompt.
_hint = 'The following are text classification questions. \n' \
    'Please determine whether the following sentence is linguistically acceptable: ' \
    '0 means unacceptable, 1 means acceptable.\n'

# Few-shot PPL: fixed exemplars rendered via the ice template, then one
# full prompt per candidate label (0/1).
CoLA_infer_cfg = dict(
    ice_template=dict(
        type=PromptTemplate,
        template='Sentence: {sentence}\nResult: {label}',
    ),
    prompt_template=dict(
        type=PromptTemplate,
        template={
            answer: f'{_hint}</E>Sentence: {{sentence}}\nResult: {answer}'
            for answer in [0, 1]
        },
        ice_token='</E>',
    ),
    retriever=dict(type=FixKRetriever, fix_id_list=[17, 18, 19, 20, 21]),
    inferencer=dict(type=PPLInferencer),
)

CoLA_eval_cfg = dict(evaluator=dict(type=AccEvaluator), )

CoLA_datasets = []
for _split in ['validation']:
    CoLA_reader_cfg = dict(
        input_columns=['sentence'],
        output_column='label',
        test_split=_split,
    )

    CoLA_datasets.append(
        dict(
            abbr=f'CoLA-{_split}',
            type=HFDataset,
            path='glue',
            name='cola',
            reader_cfg=CoLA_reader_cfg,
            infer_cfg=CoLA_infer_cfg,
            eval_cfg=CoLA_eval_cfg,
        )
    )
build/lib/opencompass/configs/datasets/GLUE_MRPC/GLUE_MRPC_ppl.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# Default GLUE/MRPC ppl config: forward to the pinned implementation.
from mmengine.config import read_base

with read_base():
    from .GLUE_MRPC_ppl_96564c import MRPC_datasets  # noqa: F401, F403
build/lib/opencompass/configs/datasets/GLUE_MRPC/GLUE_MRPC_ppl_96564c.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""GLUE MRPC (paraphrase detection) evaluated with perplexity scoring.

Builds one candidate prompt per label (0 = not equivalent, 1 = equivalent)
and lets the PPLInferencer pick the better-scoring completion. The first
five train examples are used as fixed in-context shots.
"""
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset


# Task instruction shared by both label-specific prompt variants.
_hint = ('The following are semantic matching questions. \n'
         'Please determine whether the following two sentences are semantically equivalent: '
         '0 means not equivalent, 1 means equivalent.\n')

MRPC_infer_cfg = dict(
    # Renders each in-context example.
    ice_template=dict(
        type=PromptTemplate,
        template='Sentence one: {sentence1}\nSentence two: {sentence2}\nResult: {label}',
    ),
    # One prompt per candidate label; '</E>' marks where the shots go.
    prompt_template=dict(
        type=PromptTemplate,
        template={
            candidate: (f'{_hint}</E>Sentence one: {{sentence1}}\n'
                        f'Sentence two: {{sentence2}}\nResult: {candidate}')
            for candidate in [0, 1]
        },
        ice_token='</E>',
    ),
    retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
    inferencer=dict(type=PPLInferencer))

MRPC_eval_cfg = dict(evaluator=dict(type=AccEvaluator), )

MRPC_datasets = []
for _split in ['validation', 'test']:
    MRPC_reader_cfg = dict(
        input_columns=['sentence1', 'sentence2'],
        output_column='label',
        train_split='train',
        test_split=_split
    )

    MRPC_datasets.append(
        dict(
            abbr=f'MRPC-{_split}',
            type=HFDataset,
            path='glue',
            name='mrpc',
            reader_cfg=MRPC_reader_cfg,
            infer_cfg=MRPC_infer_cfg,
            eval_cfg=MRPC_eval_cfg
        ))
build/lib/opencompass/configs/datasets/GLUE_QQP/GLUE_QQP_ppl.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# Default PPL config for GLUE QQP.
# Aliases the pinned prompt variant (hash-suffixed file) so callers can import
# a stable name while the concrete prompt set stays versioned separately.
from mmengine.config import read_base

with read_base():
    from .GLUE_QQP_ppl_250d00 import QQP_datasets  # noqa: F401, F403
build/lib/opencompass/configs/datasets/GLUE_QQP/GLUE_QQP_ppl_250d00.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""GLUE QQP (duplicate-question detection) evaluated with perplexity scoring.

Builds one candidate prompt per label (0 = not duplicate, 1 = duplicate)
and lets the PPLInferencer pick the better-scoring completion. The first
five train examples are used as fixed in-context shots.
"""
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset


# Task instruction shared by both label-specific prompt variants.
_hint = ('The following are semantic matching questions. \n'
         'Please determine whether the following two sentences are semantically duplicate: '
         '0 means not duplicate, 1 means duplicate.\n')

QQP_infer_cfg = dict(
    # Renders each in-context example.
    ice_template=dict(
        type=PromptTemplate,
        template='Sentence one: {question1}\nSentence two: {question2}\nResult: {label}',
    ),
    # One prompt per candidate label; '</E>' marks where the shots go.
    prompt_template=dict(
        type=PromptTemplate,
        template={
            candidate: (f'{_hint}</E>Sentence one: {{question1}}\n'
                        f'Sentence two: {{question2}}\nResult: {candidate}')
            for candidate in [0, 1]
        },
        ice_token='</E>',
    ),
    retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
    inferencer=dict(type=PPLInferencer))

QQP_eval_cfg = dict(evaluator=dict(type=AccEvaluator), )

QQP_datasets = []
for _split in ['validation', 'test']:
    QQP_reader_cfg = dict(
        input_columns=['question1', 'question2'],
        output_column='label',
        train_split='train',
        test_split=_split
    )

    QQP_datasets.append(
        dict(
            abbr=f'QQP-{_split}',
            type=HFDataset,
            path='glue',
            name='qqp',
            reader_cfg=QQP_reader_cfg,
            infer_cfg=QQP_infer_cfg,
            eval_cfg=QQP_eval_cfg
        ))
build/lib/opencompass/configs/datasets/GaokaoBench/GaokaoBench_gen.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# Default generative config for GAOKAO-Bench.
# Aliases the pinned prompt variant (hash-suffixed file) so callers can import
# a stable name while the concrete prompt set stays versioned separately.
from mmengine.config import read_base

with read_base():
    from .GaokaoBench_gen_5cfe9e import GaokaoBench_datasets  # noqa: F401, F403
build/lib/opencompass/configs/datasets/GaokaoBench/GaokaoBench_gen_5cfe9e.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+ from opencompass.datasets import GaokaoBenchDataset
5
+
6
+
7
+ _MCQ_prompts = [
8
+ {
9
+ 'type': 'single_choice',
10
+ 'keyword': '2010-2022_Math_II_MCQs',
11
+ 'prefix_prompt':
12
+ '请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
13
+ 'comment': ''
14
+ },
15
+ {
16
+ 'type': 'single_choice',
17
+ 'keyword': '2010-2022_Math_I_MCQs',
18
+ 'prefix_prompt':
19
+ '请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
20
+ 'comment': ''
21
+ },
22
+ {
23
+ 'type':
24
+ 'single_choice',
25
+ 'keyword':
26
+ '2010-2022_History_MCQs',
27
+ 'prefix_prompt':
28
+ '请你做一道历史选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
29
+ },
30
+ {
31
+ 'type':
32
+ 'single_choice',
33
+ 'keyword':
34
+ '2010-2022_Biology_MCQs',
35
+ 'prefix_prompt':
36
+ '请你做一道生物选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
37
+ },
38
+ {
39
+ 'type':
40
+ 'single_choice',
41
+ 'keyword':
42
+ '2010-2022_Political_Science_MCQs',
43
+ 'prefix_prompt':
44
+ '请你做一道政治选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
45
+ },
46
+ {
47
+ 'type':
48
+ 'multi_choice',
49
+ 'keyword':
50
+ '2010-2022_Physics_MCQs',
51
+ 'prefix_prompt':
52
+ '请你做一道物理选择题。\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出所有符合题意的答案,并写在【答案】和<eoa>之间。\n例如:【答案】 AB <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】... <eoa>\n请你严格按照上述格式作答。\n'
53
+ },
54
+ {
55
+ 'type':
56
+ 'single_choice',
57
+ 'keyword':
58
+ '2010-2022_Chemistry_MCQs',
59
+ 'prefix_prompt':
60
+ '请你做一道化学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
61
+ },
62
+ {
63
+ 'type':
64
+ 'single_choice',
65
+ 'keyword':
66
+ '2010-2013_English_MCQs',
67
+ 'prefix_prompt':
68
+ '请你做一道英语选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
69
+ },
70
+ {
71
+ 'type':
72
+ 'multi_question_choice',
73
+ 'keyword':
74
+ '2010-2022_Chinese_Modern_Lit',
75
+ 'prefix_prompt':
76
+ '请你做一道语文阅读理解题,其中包含三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
77
+ },
78
+ {
79
+ 'type':
80
+ 'multi_question_choice',
81
+ 'keyword':
82
+ '2010-2022_English_Fill_in_Blanks',
83
+ 'prefix_prompt':
84
+ '请你做一道英语完形填空题,其中包含二十个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
85
+ },
86
+ {
87
+ 'type':
88
+ 'five_out_of_seven',
89
+ 'keyword':
90
+ '2012-2022_English_Cloze_Test',
91
+ 'prefix_prompt':
92
+ '请回答下面的问题,将符合题意的五个选项的字母写在【答案】和<eoa>之间,例如“【答案】 A B C D E <eoa>\n请严格按照上述格式作答。\n'
93
+ },
94
+ {
95
+ 'type':
96
+ 'multi_question_choice',
97
+ 'keyword':
98
+ '2010-2022_Geography_MCQs',
99
+ 'prefix_prompt':
100
+ '请你做一道地理选择题,其中包含两到三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
101
+ },
102
+ {
103
+ 'type':
104
+ 'multi_question_choice',
105
+ 'keyword':
106
+ '2010-2022_English_Reading_Comp',
107
+ 'prefix_prompt':
108
+ '请你做一道英语阅读理解题,其中包含三到五个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
109
+ },
110
+ {
111
+ 'type':
112
+ 'multi_question_choice',
113
+ 'keyword':
114
+ '2010-2022_Chinese_Lang_and_Usage_MCQs',
115
+ 'prefix_prompt':
116
+ '请你做一道语文选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n(1)【解析】 ... <eoe>\n【答案】 ... <eoa>\n(2)【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答\n题目如下:'
117
+ },
118
+ ]
119
+ _FBQ_prompts = [{
120
+ 'type': 'cloze',
121
+ 'keyword': '2010-2022_Math_I_Fill-in-the-Blank',
122
+ 'prefix_prompt':
123
+ '请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:',
124
+ 'comment': ''
125
+ }, {
126
+ 'type': 'cloze',
127
+ 'keyword': '2010-2022_Math_II_Fill-in-the-Blank',
128
+ 'prefix_prompt':
129
+ '请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:',
130
+ 'comment': ''
131
+ }, {
132
+ 'type': 'cloze',
133
+ 'keyword':
134
+ '2010-2022_Chinese_Language_Famous_Passages_and_Sentences_Dictation',
135
+ 'prefix_prompt':
136
+ '请回答下面的语文填空题\n请你仔细阅读题目,先找到题目对应的中国名篇,再从名篇中找到合适的句子填写到题目的空白处。请你将思考过程写在【解析】和<eoe>之间,将最终答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n(2)【解析】 ...<eoe>\n【答案】...<eoa>\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
137
+ 'comment': ''
138
+ }, {
139
+ 'type': 'cloze',
140
+ 'keyword': '2014-2022_English_Language_Cloze_Passage',
141
+ 'prefix_prompt':
142
+ '请回答下面的英语短文填词题\n仔细阅读题目,空白处请填入一个适当单词或者括号内单词的正确形式。请你一步步思考,将思考过程写在【解析】和<eoe>之间,将最终答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n(2)【解析】 ...<eoe>\n【答案】...<eoa>\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
143
+ 'comment': ''
144
+ }]
145
+ _OEQ_prompts = [
146
+ {
147
+ 'type': 'subjective',
148
+ 'keyword': '2010-2022_Geography_Open-ended_Questions',
149
+ 'prefix_prompt':
150
+ '请解答下面的地理解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。你的答案请写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果���止一道题,请分别作答。\n题目如下:',
151
+ 'comment': ''
152
+ },
153
+ {
154
+ 'type': 'subjective',
155
+ 'keyword': '2010-2022_Chemistry_Open-ended_Questions',
156
+ 'prefix_prompt':
157
+ '请解答下面的化学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
158
+ 'comment': ''
159
+ },
160
+ {
161
+ 'type': 'subjective',
162
+ 'keyword': '2010-2022_Math_I_Open-ended_Questions',
163
+ 'prefix_prompt':
164
+ '请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
165
+ 'comment': ''
166
+ },
167
+ {
168
+ 'type': 'subjective',
169
+ 'keyword': '2010-2022_History_Open-ended_Questions',
170
+ 'prefix_prompt':
171
+ '请解答下面的历史解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
172
+ 'comment': ''
173
+ },
174
+ {
175
+ 'type': 'subjective',
176
+ 'keyword': '2010-2022_Biology_Open-ended_Questions',
177
+ 'prefix_prompt':
178
+ '请解答下面的生物解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,同一小题的答案用\t分隔开。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...\t...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...\t...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
179
+ 'comment': ''
180
+ },
181
+ {
182
+ 'type': 'subjective',
183
+ 'keyword': '2010-2022_Math_II_Open-ended_Questions',
184
+ 'prefix_prompt':
185
+ '请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
186
+ 'comment': ''
187
+ },
188
+ {
189
+ 'type': 'subjective',
190
+ 'keyword': '2010-2022_Physics_Open-ended_Questions',
191
+ 'prefix_prompt':
192
+ '请解答下面的物理解答题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【解析】 ...<eoe>\n【答案】 ...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:',
193
+ 'comment': ''
194
+ },
195
+ {
196
+ 'type': 'subjective',
197
+ 'keyword': '2010-2022_Political_Science_Open-ended_Questions',
198
+ 'prefix_prompt':
199
+ '请解答下面的政治解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
200
+ 'comment': ''
201
+ },
202
+ {
203
+ 'type': 'correction',
204
+ 'keyword': '2012-2022_English_Language_Error_Correction',
205
+ 'prefix_prompt':
206
+ '请解答下面的英语短文改错题,仔细阅读题目并充分结合你你已有的知识,找出其中10处需要改动的地方。请你一���步思考,把修改后的短文写在【答案】和<eoa>之间。\n完整的题目回答格式如下:【答案】 ...<eoa>\n 请你严格按照上述格式作答。\n题目如下:',
207
+ # "prefix_prompt": [
208
+ # "请解答下面的英语短文改错题,仔细阅读题目并充分结合你你已有的知识,找出其中10处需要改动的地方。请你一步步思考,把修改后的短文写在【答案】和<eoa>之间。\n完整的题目回答格式如下:【答案】 ...<eoa>\n 请你严格按照上述格式作答。\n题目如下:",
209
+ # "请比较下面两篇短文,找到第二篇和第一篇的10处不同,每处不同只涉及一个单词,请将结果写在【答案】和<eoa>之间。例如:【答案】1. 将play改为plays\n 2.增加了the\n ... <eoa>\n 完整的题目回答格式如下:【答案】(1) ... \n (2) ...\n ...(10) ...\n<eoa>\n请你严格按照上述格式作答。\n短文如下:"
210
+ # ],
211
+ 'comment': ''
212
+ },
213
+ {
214
+ 'type': 'subjective',
215
+ 'keyword': '2010-2022_Chinese_Language_Ancient_Poetry_Reading',
216
+ 'prefix_prompt':
217
+ '请解答下面的语文古代诗歌阅读题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
218
+ 'comment': ''
219
+ },
220
+ {
221
+ 'type': 'subjective',
222
+ 'keyword': '2010-2022_Chinese_Language_Practical_Text_Reading',
223
+ 'prefix_prompt':
224
+ '请解答下面的语文实用类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
225
+ 'comment': ''
226
+ },
227
+ {
228
+ 'type': 'subjective',
229
+ 'keyword': '2010-2022_Chinese_Language_Literary_Text_Reading',
230
+ 'prefix_prompt':
231
+ '请解答下面的语文文学类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
232
+ 'comment': ''
233
+ },
234
+ {
235
+ 'type': 'subjective',
236
+ 'keyword': '2010-2022_Chinese_Language_Classical_Chinese_Reading',
237
+ 'prefix_prompt':
238
+ '请解答下面的语文文言文阅读,仔细阅读题目,前三题是单选题,最后一题要将文言文翻译为现代汉语。请你一步步思考并把最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。翻译题把翻译后的现代汉语句子写在【答案】后面,例如”【答案】今天天气很好 <eoa>”\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
239
+ 'comment': ''
240
+ },
241
+ {
242
+ 'type': 'subjective',
243
+ 'keyword':
244
+ '2010-2022_Chinese_Language_Language_and_Writing_Skills_Open-ended_Questions',
245
+ 'prefix_prompt':
246
+ '请解答下面的语文解答题,仔细阅读题目,注意其中可能含有选择题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【解析】 ...<eoe>\n【答案】 ...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:',
247
+ 'comment': ''
248
+ }
249
+ ]
250
+
251
# Assemble one generative dataset config per (question folder, prompt spec).
GaokaoBench_datasets = []
for _section, _specs in [
    ('Multiple-choice_Questions', _MCQ_prompts),
    ('Fill-in-the-blank_Questions', _FBQ_prompts),
    ('Open-ended_Questions', _OEQ_prompts),
]:
    for _spec in _specs:
        _reader_cfg = {
            'input_columns': ['question'],
            'output_column': 'answer',
        }
        _infer_cfg = {
            'ice_template': {
                'type': PromptTemplate,
                # Each subject's instruction is prepended to the raw question.
                'template': {
                    'round': [{
                        'role': 'HUMAN',
                        'prompt': _spec['prefix_prompt'] + '{question}'
                    }]
                },
                'ice_token': '</E>'
            },
            'retriever': {'type': ZeroRetriever},
            'inferencer': {'type': GenInferencer, 'max_out_len': 1024},
        }
        _eval_cfg = {
            # Evaluator class is looked up by question type, e.g.
            # 'GaokaoBenchEvaluator_single_choice'.
            'evaluator': {'type': 'GaokaoBenchEvaluator_' + _spec['type']},
            'pred_role': 'BOT',
        }
        _base_path = 'opencompass/GAOKAO-BENCH'
        GaokaoBench_datasets.append({
            'type': GaokaoBenchDataset,
            'abbr': 'GaokaoBench_' + _spec['keyword'],
            'path': _base_path,
            'filename': '/' + _section + '/' + _spec['keyword'] + '.json',
            'name': _spec['keyword'],
            'reader_cfg': _reader_cfg,
            'infer_cfg': _infer_cfg,
            'eval_cfg': _eval_cfg,
        })

# Strip every underscore-prefixed helper so only the dataset list is exported.
# NOTE(review): the startswith('_') filter also matches dunder names; this
# mirrors the cleanup idiom used by the sibling configs in this package.
_temporary_variables = [k for k in globals() if k.startswith('_')]
for _t in _temporary_variables:
    del globals()[_t]
del _temporary_variables, _t
build/lib/opencompass/configs/datasets/GaokaoBench/GaokaoBench_mixed.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# Default mixed (generative + PPL) config for GAOKAO-Bench.
# Aliases the pinned prompt variant (hash-suffixed file) so callers can import
# a stable name while the concrete prompt set stays versioned separately.
from mmengine.config import read_base

with read_base():
    from .GaokaoBench_mixed_9af5ee import GaokaoBench_datasets  # noqa: F401, F403
build/lib/opencompass/configs/datasets/GaokaoBench/GaokaoBench_mixed_9af5ee.py ADDED
@@ -0,0 +1,356 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
4
+ from opencompass.datasets import GaokaoBenchDataset
5
+ _MCQ_prompts = [
6
+ {
7
+ 'type': 'single_choice',
8
+ 'keyword': '2010-2022_Math_II_MCQs',
9
+ 'prefix_prompt':
10
+ '请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
11
+ 'comment': ''
12
+ },
13
+ {
14
+ 'type': 'single_choice',
15
+ 'keyword': '2010-2022_Math_I_MCQs',
16
+ 'prefix_prompt':
17
+ '请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
18
+ 'comment': ''
19
+ },
20
+ {
21
+ 'type':
22
+ 'single_choice',
23
+ 'keyword':
24
+ '2010-2022_History_MCQs',
25
+ 'prefix_prompt':
26
+ '请你做一道历史选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
27
+ },
28
+ {
29
+ 'type':
30
+ 'single_choice',
31
+ 'keyword':
32
+ '2010-2022_Biology_MCQs',
33
+ 'prefix_prompt':
34
+ '请你做一道生物选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
35
+ },
36
+ {
37
+ 'type':
38
+ 'single_choice',
39
+ 'keyword':
40
+ '2010-2022_Political_Science_MCQs',
41
+ 'prefix_prompt':
42
+ '请你做一道政治选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
43
+ },
44
+ {
45
+ 'type':
46
+ 'multi_choice',
47
+ 'keyword':
48
+ '2010-2022_Physics_MCQs',
49
+ 'prefix_prompt':
50
+ '请你做一道物理选择题。\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出所有符合题意的答案,并写在【答案】和<eoa>之间。\n例如:【答案】 AB <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】... <eoa>\n请你严格按照上述格式作答。\n'
51
+ },
52
+ {
53
+ 'type':
54
+ 'single_choice',
55
+ 'keyword':
56
+ '2010-2022_Chemistry_MCQs',
57
+ 'prefix_prompt':
58
+ '请你做一道化学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
59
+ },
60
+ {
61
+ 'type':
62
+ 'single_choice',
63
+ 'keyword':
64
+ '2010-2013_English_MCQs',
65
+ 'prefix_prompt':
66
+ '请你做一道英语选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
67
+ },
68
+ {
69
+ 'type':
70
+ 'multi_question_choice',
71
+ 'keyword':
72
+ '2010-2022_Chinese_Modern_Lit',
73
+ 'prefix_prompt':
74
+ '请你做一道语文阅读理解题,其中包含三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
75
+ },
76
+ {
77
+ 'type':
78
+ 'multi_question_choice',
79
+ 'keyword':
80
+ '2010-2022_English_Fill_in_Blanks',
81
+ 'prefix_prompt':
82
+ '请你做一道英语完形填空题,其中包含二十个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
83
+ },
84
+ {
85
+ 'type':
86
+ 'five_out_of_seven',
87
+ 'keyword':
88
+ '2012-2022_English_Cloze_Test',
89
+ 'prefix_prompt':
90
+ '请回答下面的问题,将符合题意的五个选项的字母写在【答案】和<eoa>之间,例如“【答案】 A B C D E <eoa>\n请严格按照上述格式作答。\n'
91
+ },
92
+ {
93
+ 'type':
94
+ 'multi_question_choice',
95
+ 'keyword':
96
+ '2010-2022_Geography_MCQs',
97
+ 'prefix_prompt':
98
+ '请你做一道地理选择题,其中包含两到三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
99
+ },
100
+ {
101
+ 'type':
102
+ 'multi_question_choice',
103
+ 'keyword':
104
+ '2010-2022_English_Reading_Comp',
105
+ 'prefix_prompt':
106
+ '请你做一道英语阅读理解题,其中包含三到五个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
107
+ },
108
+ {
109
+ 'type':
110
+ 'multi_question_choice',
111
+ 'keyword':
112
+ '2010-2022_Chinese_Lang_and_Usage_MCQs',
113
+ 'prefix_prompt':
114
+ '请你做一道语文选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n(1)【解析】 ... <eoe>\n【答案】 ... <eoa>\n(2)【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答\n题目如下:'
115
+ },
116
+ ]
117
+ _FBQ_prompts = [{
118
+ 'type': 'cloze',
119
+ 'keyword': '2010-2022_Math_I_Fill-in-the-Blank',
120
+ 'prefix_prompt':
121
+ '请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:',
122
+ 'comment': ''
123
+ }, {
124
+ 'type': 'cloze',
125
+ 'keyword': '2010-2022_Math_II_Fill-in-the-Blank',
126
+ 'prefix_prompt':
127
+ '请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:',
128
+ 'comment': ''
129
+ }, {
130
+ 'type': 'cloze',
131
+ 'keyword':
132
+ '2010-2022_Chinese_Language_Famous_Passages_and_Sentences_Dictation',
133
+ 'prefix_prompt':
134
+ '请回答下面的语文填空题\n请你仔细阅读题目,先找到题目对应的中国名篇,再从名篇中找到合适的句子填写到题目的空白处。请你将思考过程写在【解析】和<eoe>之间,将最终答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n(2)【解析】 ...<eoe>\n【答案】...<eoa>\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
135
+ 'comment': ''
136
+ }, {
137
+ 'type': 'cloze',
138
+ 'keyword': '2014-2022_English_Language_Cloze_Passage',
139
+ 'prefix_prompt':
140
+ '请回答下面的英语短文填词题\n仔细阅读题目,空白处请填入一个适当单词或者括号内单词的正确形式。请你一步步思考,将思考过程写在【解析】和<eoe>之间,将最终答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n(2)【解析】 ...<eoe>\n【答案】...<eoa>\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
141
+ 'comment': ''
142
+ }]
143
+ _OEQ_prompts = [
144
+ {
145
+ 'type': 'subjective',
146
+ 'keyword': '2010-2022_Geography_Open-ended_Questions',
147
+ 'prefix_prompt':
148
+ '请解答下面的地理解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。你的答案请写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作��,如果不止一道题,请分别作答。\n题目如下:',
149
+ 'comment': ''
150
+ },
151
+ {
152
+ 'type': 'subjective',
153
+ 'keyword': '2010-2022_Chemistry_Open-ended_Questions',
154
+ 'prefix_prompt':
155
+ '请解答下面的化学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
156
+ 'comment': ''
157
+ },
158
+ {
159
+ 'type': 'subjective',
160
+ 'keyword': '2010-2022_Math_I_Open-ended_Questions',
161
+ 'prefix_prompt':
162
+ '请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
163
+ 'comment': ''
164
+ },
165
+ {
166
+ 'type': 'subjective',
167
+ 'keyword': '2010-2022_History_Open-ended_Questions',
168
+ 'prefix_prompt':
169
+ '请解答下面的历史解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
170
+ 'comment': ''
171
+ },
172
+ {
173
+ 'type': 'subjective',
174
+ 'keyword': '2010-2022_Biology_Open-ended_Questions',
175
+ 'prefix_prompt':
176
+ '请解答下面的生物解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,同一小题的答案用\t分隔开。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...\t...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...\t...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
177
+ 'comment': ''
178
+ },
179
+ {
180
+ 'type': 'subjective',
181
+ 'keyword': '2010-2022_Math_II_Open-ended_Questions',
182
+ 'prefix_prompt':
183
+ '请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
184
+ 'comment': ''
185
+ },
186
+ {
187
+ 'type': 'subjective',
188
+ 'keyword': '2010-2022_Physics_Open-ended_Questions',
189
+ 'prefix_prompt':
190
+ '请解答下面的物理解答题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【解析】 ...<eoe>\n【答案】 ...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:',
191
+ 'comment': ''
192
+ },
193
+ {
194
+ 'type': 'subjective',
195
+ 'keyword': '2010-2022_Political_Science_Open-ended_Questions',
196
+ 'prefix_prompt':
197
+ '请解答下面的政治解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
198
+ 'comment': ''
199
+ },
200
+ {
201
+ 'type': 'correction',
202
+ 'keyword': '2012-2022_English_Language_Error_Correction',
203
+ 'prefix_prompt':
204
+ '请解答下面的英语短文改错题,仔细阅读题目并充分结合你你已有的知识,找出其中10处需要改动的地方��请你一步步思考,把修改后的短文写在【答案】和<eoa>之间。\n完整的题目回答格式如下:【答案】 ...<eoa>\n 请你严格按照上述格式作答。\n题目如下:',
205
+ # "prefix_prompt": [
206
+ # "请解答下面的英语短文改错题,仔细阅读题目并充分结合你你已有的知识,找出其中10处需要改动的地方。请你一步步思考,把修改后的短文写在【答案】和<eoa>之间。\n完整的题目回答格式如下:【答案】 ...<eoa>\n 请你严格按照上述格式作答。\n题目如下:",
207
+ # "请比较下面两篇短文,找到第二篇和第一篇的10处不同,每处不同只涉及一个单词,请将结果写在【答案】和<eoa>之间。例如:【答案】1. 将play改为plays\n 2.增加了the\n ... <eoa>\n 完整的题目回答格式如下:【答案】(1) ... \n (2) ...\n ...(10) ...\n<eoa>\n请你严格按照上述格式作答。\n短文如下:"
208
+ # ],
209
+ 'comment': ''
210
+ },
211
+ {
212
+ 'type': 'subjective',
213
+ 'keyword': '2010-2022_Chinese_Language_Ancient_Poetry_Reading',
214
+ 'prefix_prompt':
215
+ '请解答下面的语文古代诗歌阅读题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
216
+ 'comment': ''
217
+ },
218
+ {
219
+ 'type': 'subjective',
220
+ 'keyword': '2010-2022_Chinese_Language_Practical_Text_Reading',
221
+ 'prefix_prompt':
222
+ '请解答下面的语文实用类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
223
+ 'comment': ''
224
+ },
225
+ {
226
+ 'type': 'subjective',
227
+ 'keyword': '2010-2022_Chinese_Language_Literary_Text_Reading',
228
+ 'prefix_prompt':
229
+ '请解答下面的语文文学类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
230
+ 'comment': ''
231
+ },
232
+ {
233
+ 'type': 'subjective',
234
+ 'keyword': '2010-2022_Chinese_Language_Classical_Chinese_Reading',
235
+ 'prefix_prompt':
236
+ '请解答下面的语文文言文阅读,仔细阅读题目,前三题是单选题,最后一题要将文言文翻译为现代汉语。请你一步步思考并把最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。翻译题把翻译后的现代汉语句子写在【答案】后面,例如”【答案】今天天气很好 <eoa>”\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
237
+ 'comment': ''
238
+ },
239
+ {
240
+ 'type': 'subjective',
241
+ 'keyword':
242
+ '2010-2022_Chinese_Language_Language_and_Writing_Skills_Open-ended_Questions',
243
+ 'prefix_prompt':
244
+ '请解答下面的语文解答题,仔细阅读题目,注意其中可能含有选择题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【解析】 ...<eoe>\n【答案】 ...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:',
245
+ 'comment': ''
246
+ }
247
+ ]
248
+
249
# Loop 1: everything except single-choice MCQs is evaluated generatively
# (free-form answers scored by the type-specific evaluator).
GaokaoBench_datasets = []
for _section, _specs in [
    ('Multiple-choice_Questions', _MCQ_prompts),
    ('Fill-in-the-blank_Questions', _FBQ_prompts),
    ('Open-ended_Questions', _OEQ_prompts),
]:
    for _spec in _specs:
        if _spec['type'] == 'single_choice':
            continue  # handled by the PPL loop below
        _reader_cfg = {
            'input_columns': ['question'],
            'output_column': 'answer',
        }
        _infer_cfg = {
            'ice_template': {
                'type': PromptTemplate,
                'template': {
                    'round': [{
                        'role': 'HUMAN',
                        'prompt': _spec['prefix_prompt'] + '{question}'
                    }]
                },
                'ice_token': '</E>'
            },
            'retriever': {'type': ZeroRetriever},
            'inferencer': {'type': GenInferencer, 'max_out_len': 1024},
        }
        _eval_cfg = {
            'evaluator': {'type': 'GaokaoBenchEvaluator_' + _spec['type']},
            'pred_role': 'BOT',
        }
        # NOTE(review): this loop uses a local './data/...' path while the PPL
        # loop below uses 'opencompass/GAOKAO-BENCH' — confirm whether the two
        # paths are intentionally different.
        _base_path = './data/GAOKAO-BENCH/data'
        GaokaoBench_datasets.append({
            'type': GaokaoBenchDataset,
            'abbr': 'GaokaoBench_' + _spec['keyword'],
            'path': _base_path,
            'filename': '/' + _section + '/' + _spec['keyword'] + '.json',
            'name': _spec['keyword'],
            'reader_cfg': _reader_cfg,
            'infer_cfg': _infer_cfg,
            'eval_cfg': _eval_cfg,
        })

# Loop 2: single-choice MCQs are scored by perplexity — one candidate
# completion per option (A-D); the best-scoring option wins.
_section = 'Multiple-choice_Questions'
for _spec in _MCQ_prompts:
    if _spec['type'] != 'single_choice':
        continue
    _reader_cfg = {
        'input_columns': ['question'],
        'output_column': 'answer',
    }
    _infer_cfg = {
        'ice_template': {
            'type': PromptTemplate,
            'template': {
                _option: {
                    'round': [{
                        'role': 'HUMAN',
                        'prompt': _spec['prefix_prompt'] + '{question}'
                    }, {
                        'role': 'BOT',
                        'prompt': f'【答案】{_option} <eoa>'
                    }]
                }
                for _option in ['A', 'B', 'C', 'D']
            },
            'ice_token': '</E>'
        },
        'retriever': {'type': ZeroRetriever},
        'inferencer': {'type': PPLInferencer},
    }
    _eval_cfg = {
        'evaluator': {'type': 'GaokaoBenchEvaluator_' + _spec['type']},
        'pred_role': 'BOT',
    }
    _base_path = 'opencompass/GAOKAO-BENCH'
    GaokaoBench_datasets.append({
        'type': GaokaoBenchDataset,
        'abbr': 'GaokaoBench_' + _spec['keyword'],
        'path': _base_path,
        'filename': '/' + _section + '/' + _spec['keyword'] + '.json',
        'name': _spec['keyword'],
        'reader_cfg': _reader_cfg,
        'infer_cfg': _infer_cfg,
        'eval_cfg': _eval_cfg,
    })

# Strip every underscore-prefixed helper so only the dataset list is exported.
# NOTE(review): the startswith('_') filter also matches dunder names; this
# mirrors the cleanup idiom used by the sibling configs in this package.
_temporary_variables = [k for k in globals() if k.startswith('_')]
for _t in _temporary_variables:
    del globals()[_t]
del _temporary_variables, _t
build/lib/opencompass/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_4c31db.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""GAOKAO-Bench without subjective questions: MCQs and fill-in-the-blank only,
all evaluated generatively.

Fix: removed the unused ``import os`` (flake8 F401) — nothing in this config
references ``os``.
"""
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GaokaoBenchDataset
from mmengine.config import read_base

with read_base():
    # Shared per-subject prompt specifications (keyword, type, prefix_prompt).
    from .GaokaoBench_prompts import MCQ_prompts, FBQ_prompts

GaokaoBench_datasets = []
for folder, prompts in [
    ('Multiple-choice_Questions', MCQ_prompts),
    ('Fill-in-the-blank_Questions', FBQ_prompts),
]:
    for p in prompts:
        reader_cfg = {
            'input_columns': ['question'],
            'output_column': 'answer',
        }
        infer_cfg = {
            'ice_template': {
                'type': PromptTemplate,
                # Subject instruction prepended to the raw question.
                'template': {'round': [{'role': 'HUMAN', 'prompt': p['prefix_prompt'] + '{question}'}]},
                'ice_token': '</E>',
            },
            'retriever': {'type': ZeroRetriever},
            'inferencer': {'type': GenInferencer, 'max_out_len': 1024},
        }
        eval_cfg = {
            # Evaluator class is looked up by question type.
            'evaluator': {'type': 'GaokaoBenchEvaluator' + '_' + p['type']},
            'pred_role': 'BOT',
        }
        _base_path = 'opencompass/GAOKAO-BENCH'
        dataset = {
            'type': GaokaoBenchDataset,
            'abbr': 'GaokaoBench_' + p['keyword'],
            'path': _base_path,
            'filename': '/' + folder + '/' + p['keyword'] + '.json',
            'name': p['keyword'],
            'reader_cfg': reader_cfg,
            'infer_cfg': infer_cfg,
            'eval_cfg': eval_cfg,
        }
        GaokaoBench_datasets.append(dataset)