raptorkwok committed on
Commit
fe6f409
·
1 Parent(s): 3c9928e

suppress warnings

Browse files
.ipynb_checkpoints/README-checkpoint.md DELETED
@@ -1,33 +0,0 @@
1
- ---
2
- library_name: evaluate
3
- emoji: 🤗
4
- colorFrom: blue
5
- colorTo: red
6
- datasets:
7
- - raptorkwok/cantonese-traditional-chinese-parallel-corpus-gen3
8
- tags:
9
- - nlp
10
- - translation
11
- - chinese
12
- - meteor
13
- - jieba
14
- description: A BLEU implementation dedicated for Chinese sentences
15
- sdk: gradio
16
- sdk_version: 3.19.1
17
- app_file: app.py
18
- pinned: false
19
- ---
20
-
21
- # # Metric Card for ChineseMETEOR
22
-
23
- Chinese METEOR Implementation
24
-
25
- ```python
26
- import evaluate
27
- meteor = evaluate.load("raptorkwok/chinesemeteor")
28
- results = meteor.compute(
29
- predictions=["我在這裡吃飯"],
30
- references=["我在這裡吃飯"]
31
- )
32
- print(results)
33
- # {'meteor': 1.0}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.ipynb_checkpoints/chinesemeteor-checkpoint.py DELETED
@@ -1,222 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
- """
16
- Chinese METEOR — with Jieba pre-segmentation + CwnGraph Chinese WordNet
17
- """
18
- import jieba_fast as jieba
19
- import datasets
20
- from typing import List, Dict
21
- import numpy as np
22
- from nltk.translate import meteor_score
23
- from nltk import word_tokenize
24
- import nltk
25
- import evaluate
26
- import re
27
-
28
- # suppress WordNet warnings
29
- import warnings
30
- warnings.filterwarnings("ignore")
31
- warnings.filterwarnings(
32
- "ignore",
33
- message="more than one synset, returning the first",
34
- category=UserWarning,
35
- module="nltk.translate.meteor_score"
36
- )
37
-
38
- # Download once
39
- nltk.download("wordnet", quiet=True)
40
- nltk.download("omw-1.4", quiet=True)
41
- nltk.download("punkt", quiet=True)
42
- nltk.download('punkt_tab', quiet=True)
43
-
44
- # ------------------------------------------------------------------- #
45
- # REAL Chinese WordNet (CwnGraph) Integration
46
- # ------------------------------------------------------------------- #
47
- _cwn = None
48
- def _load_cwn():
49
- global _cwn
50
- if _cwn is None:
51
- try:
52
- from CwnGraph import CwnImage
53
- print("Loading Chinese WordNet (CwnGraph, first time only)...")
54
- _cwn = CwnImage.latest()
55
- except ImportError:
56
- raise ImportError("CwnGraph failed to load. Run: pip install CwnGraph")
57
- return _cwn
58
-
59
- # Helper to get lemma name (with fallback for API versions)
60
- def _get_lemma_name(lemma):
61
- try:
62
- return lemma.name
63
- except AttributeError:
64
- return str(lemma).split(': ')[1].split('_')[0]
65
-
66
- # Custom Lemma & Synset for NLTK compatibility
67
- class _CwnLemma:
68
- def __init__(self, name): self._name = name
69
- def name(self): return self._name
70
-
71
- class _CwnSynset:
72
- def __init__(self, lemmas, synset_id):
73
- self._lemmas = lemmas
74
- self._id = synset_id
75
- def lemmas(self):
76
- return [_CwnLemma(name) for name in self._lemmas]
77
-
78
- # ------------------------------------------------------------------- #
79
- # HuggingFace Evaluation Metric
80
- # ------------------------------------------------------------------- #
81
-
82
- _DESCRIPTION = """\
83
- This evaluation metric is tailor-made to evaluate the translation quality of Chinese translation.
84
- """
85
-
86
- _KWARGS_DESCRIPTION = """
87
- Calculates how good are predictions given some references, using certain scores
88
- Args:
89
- predictions (str): translation sentence to score.
90
- references (str): reference sentence for each translation.
91
- Returns:
92
- meteor: the average METEOR score
93
- scores: the METEOR score for each sentence pairs
94
-
95
- Examples:
96
- Examples should be written in doctest format, and should illustrate how
97
- to use the function.
98
-
99
- >>> cmeteor = evaluate.load("raptorkwok/chinesemeteor")
100
- >>> results = cmeteor.compute(references=["我在這裡吃飯"], predictions=["我在這兒吃晚飯"])
101
- >>> print(results)
102
- {'meteor': 0.5111111111111111, 'scores': [0.5111111111111111]}
103
- """
104
-
105
- # ------------------------------------------------------------------- #
106
- # HuggingFace evaluate template
107
- # ------------------------------------------------------------------- #
108
- @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
109
- class ChineseMETEOR(evaluate.Metric):
110
- """TODO: Short description not ready yet."""
111
-
112
- def _info(self):
113
- return evaluate.MetricInfo(
114
- module_type="metric",
115
- description=_DESCRIPTION,
116
- citation="""@inproceedings{denkowski-lavie-2014-meteor,
117
- title = "Meteor Universal: Language Specific Translation Evaluation for Any Target Language",
118
- author = "Denkowski, Michael and Lavie, Alon",
119
- booktitle = "Proceedings of the Ninth Workshop on Statistical Machine Translation",
120
- year = "2014"
121
- }""",
122
- inputs_description=_KWARGS_DESCRIPTION,
123
- features=datasets.Features(
124
- {
125
- "predictions": datasets.Value("string"),
126
- "references": datasets.Value("string"),
127
- }
128
- ),
129
- # Homepage of the module for documentation
130
- homepage="https://yourappapp.com",
131
- # Additional links to the codebase or references
132
- codebase_urls=["https://github.com/nltk/nltk"],
133
- reference_urls=["https://www.cs.cmu.edu/~alavie/METEOR/"],
134
- )
135
-
136
- def _download_and_prepare(self, dl_manager) -> None:
137
- """Download external resources useful to compute the scores"""
138
- import nltk
139
- nltk.download("wordnet", quiet=True)
140
- nltk.download("omw-1.4", quiet=True)
141
- nltk.download("punkt", quiet=True)
142
- nltk.download('punkt_tab', quiet=True)
143
- # CwnGraph auto-downloads on first use
144
-
145
- def _compute(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
146
- pred_seg = [" ".join(jieba.cut(p.strip())) for p in predictions]
147
- ref_seg = [" ".join(jieba.cut(r.strip())) for r in references]
148
-
149
- # --- Apply Real Chinese WordNet into METEOR algorithm ---
150
- def _cwn_synsets(self, word, pos=None): # Matches NLTK method call
151
- if not isinstance(word, str) or not word.strip():
152
- #print(f"DEBUG: Skipping non-string input: {type(word)}")
153
- return []
154
- cwn = _load_cwn()
155
- try:
156
- # Use escaped regex for exact match (CwnGraph expects string pattern)
157
- pattern = f"^{re.escape(word)}$"
158
- lemmas = cwn.find_lemma(pattern)
159
- except Exception as e:
160
- #print(f"DEBUG: Error querying CWN for '{word}': {e}")
161
- return []
162
-
163
- exact_lemmas = [l for l in lemmas if _get_lemma_name(l) == word]
164
- if not exact_lemmas:
165
- #print(f"DEBUG: No exact lemma found for '{word}'")
166
- return []
167
- synsets_list = []
168
- seen_synset_ids = set()
169
- for lemma in exact_lemmas:
170
- for sense in lemma.senses:
171
- synset = sense.synset
172
- if synset:
173
- try:
174
- synset_id = synset.id
175
- except AttributeError:
176
- synset_id = str(synset)
177
- if synset_id not in seen_synset_ids:
178
- seen_synset_ids.add(synset_id)
179
- try:
180
- synset_lemmas = synset.lemmas
181
- syn_lemma_names = [_get_lemma_name(l) for l in synset_lemmas]
182
- except AttributeError:
183
- synset_lemmas = []
184
- for s in synset.senses:
185
- try:
186
- # Access the single lemma via lemmas[0]
187
- lemma = s.lemmas[0]
188
- synset_lemmas.append(lemma)
189
- except (AttributeError, IndexError, TypeError):
190
- try:
191
- lemma = s.lemma
192
- synset_lemmas.append(lemma)
193
- except AttributeError:
194
- #print(f"DEBUG: Could not extract lemma from sense {s}")
195
- continue
196
- syn_lemma_names = [_get_lemma_name(l) for l in synset_lemmas]
197
- syn_lemmas_set = set(syn_lemma_names)
198
- if syn_lemmas_set:
199
- synsets_list.append(_CwnSynset(list(syn_lemmas_set), synset_id))
200
- #print(f"DEBUG: Found {len(synsets_list)} synsets for '{word}': {synsets_list[0]._lemmas if synsets_list else []}")
201
- return synsets_list[:1]
202
-
203
- # Use class for proper method binding
204
- class ChineseWordNet:
205
- def synsets(self, word, pos=None):
206
- return _cwn_synsets(self, word, pos)
207
-
208
- chinese_wn = ChineseWordNet()
209
-
210
- scores = [
211
- meteor_score.single_meteor_score(
212
- word_tokenize(ref),
213
- word_tokenize(hyp),
214
- wordnet=chinese_wn
215
- )
216
- for ref, hyp in zip(ref_seg, pred_seg)
217
- ]
218
-
219
- return {
220
- "meteor": float(np.mean(scores)),
221
- "scores": scores,
222
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
chinesemeteor.py CHANGED
@@ -15,16 +15,6 @@
15
  """
16
  Chinese METEOR — with Jieba pre-segmentation + CwnGraph Chinese WordNet
17
  """
18
- import jieba_fast as jieba
19
- import datasets
20
- from typing import List, Dict
21
- import numpy as np
22
- from nltk.translate import meteor_score
23
- from nltk import word_tokenize
24
- import nltk
25
- import evaluate
26
- import re
27
-
28
  # suppress WordNet warnings
29
  import warnings
30
  warnings.filterwarnings("ignore")
@@ -34,6 +24,25 @@ warnings.filterwarnings(
34
  category=UserWarning,
35
  module="nltk.translate.meteor_score"
36
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
  # Download once
39
  nltk.download("wordnet", quiet=True)
 
15
  """
16
  Chinese METEOR — with Jieba pre-segmentation + CwnGraph Chinese WordNet
17
  """
 
 
 
 
 
 
 
 
 
 
18
  # suppress WordNet warnings
19
  import warnings
20
  warnings.filterwarnings("ignore")
 
24
  category=UserWarning,
25
  module="nltk.translate.meteor_score"
26
  )
27
+ import logging
28
+ logging.getLogger("nltk").setLevel(logging.CRITICAL)
29
+
30
+ _original_warn = warnings.warn
31
+ def _no_meteor_warn(msg, *args, **kwargs):
32
+ if "more than one synset" in str(msg):
33
+ return
34
+ return _original_warn(msg, *args, **kwargs)
35
+ warnings.warn = _no_meteor_warn
36
+
37
+ import jieba_fast as jieba
38
+ import datasets
39
+ from typing import List, Dict
40
+ import numpy as np
41
+ from nltk.translate import meteor_score
42
+ from nltk import word_tokenize
43
+ import nltk
44
+ import evaluate
45
+ import re
46
 
47
  # Download once
48
  nltk.download("wordnet", quiet=True)