raptorkwok committed on
Commit
f7c53ae
·
1 Parent(s): ddeed21

re-organize warning suppress codes

Browse files
Files changed (1) hide show
  1. chinesemeteor.py +78 -83
chinesemeteor.py CHANGED
@@ -18,22 +18,10 @@ Chinese METEOR — with Jieba pre-segmentation + CwnGraph Chinese WordNet
18
  # suppress WordNet warnings
19
  import warnings
20
  warnings.filterwarnings("ignore")
21
- warnings.filterwarnings(
22
- "ignore",
23
- message="more than one synset, returning the first",
24
- category=UserWarning,
25
- module="nltk.translate.meteor_score"
26
- )
27
  import logging
28
  logging.getLogger("nltk").setLevel(logging.CRITICAL)
29
-
30
- _original_warn = warnings.warn
31
- def _no_meteor_warn(msg, *args, **kwargs):
32
- if "more than one synset" in str(msg):
33
- return
34
- return _original_warn(msg, *args, **kwargs)
35
- warnings.warn = _no_meteor_warn
36
-
37
  import jieba_fast as jieba
38
  import datasets
39
  from typing import List, Dict
@@ -157,80 +145,87 @@ class ChineseMETEOR(evaluate.Metric):
157
  return pycantonese.segment(sentence)
158
 
159
  def _compute(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
160
- pred_seg = [" ".join(jieba.cut(p.strip())) for p in predictions]
161
- ref_seg = [" ".join(jieba.cut(r.strip())) for r in references]
162
 
163
- # --- Apply Real Chinese WordNet into METEOR algorithm ---
164
- def _cwn_synsets(self, word, pos=None): # Matches NLTK method call
165
- if not isinstance(word, str) or not word.strip():
166
- #print(f"DEBUG: Skipping non-string input: {type(word)}")
167
- return []
168
- cwn = _load_cwn()
169
- try:
170
- # Use escaped regex for exact match (CwnGraph expects string pattern)
171
- pattern = f"^{re.escape(word)}$"
172
- lemmas = cwn.find_lemma(pattern)
173
- except Exception as e:
174
- #print(f"DEBUG: Error querying CWN for '{word}': {e}")
175
- return []
176
-
177
- exact_lemmas = [l for l in lemmas if _get_lemma_name(l) == word]
178
- if not exact_lemmas:
179
- #print(f"DEBUG: No exact lemma found for '{word}'")
180
- return []
181
- synsets_list = []
182
- seen_synset_ids = set()
183
- for lemma in exact_lemmas:
184
- for sense in lemma.senses:
185
- synset = sense.synset
186
- if synset:
187
- try:
188
- synset_id = synset.id
189
- except AttributeError:
190
- synset_id = str(synset)
191
- if synset_id not in seen_synset_ids:
192
- seen_synset_ids.add(synset_id)
193
  try:
194
- synset_lemmas = synset.lemmas
195
- syn_lemma_names = [_get_lemma_name(l) for l in synset_lemmas]
196
  except AttributeError:
197
- synset_lemmas = []
198
- for s in synset.senses:
199
- try:
200
- # Access the single lemma via lemmas[0]
201
- lemma = s.lemmas[0]
202
- synset_lemmas.append(lemma)
203
- except (AttributeError, IndexError, TypeError):
 
 
204
  try:
205
- lemma = s.lemma
 
206
  synset_lemmas.append(lemma)
207
- except AttributeError:
208
- #print(f"DEBUG: Could not extract lemma from sense {s}")
209
- continue
210
- syn_lemma_names = [_get_lemma_name(l) for l in synset_lemmas]
211
- syn_lemmas_set = set(syn_lemma_names)
212
- if syn_lemmas_set:
213
- synsets_list.append(_CwnSynset(list(syn_lemmas_set), synset_id))
214
- #print(f"DEBUG: Found {len(synsets_list)} synsets for '{word}': {synsets_list[0]._lemmas if synsets_list else []}")
215
- return synsets_list[:1]
216
-
217
- # Use class for proper method binding
218
- class ChineseWordNet:
219
- def synsets(self, word, pos=None):
220
- return _cwn_synsets(self, word, pos)
221
-
222
- chinese_wn = ChineseWordNet()
223
-
224
- scores = [
225
- meteor_score.single_meteor_score(
226
- #word_tokenize(ref),
227
- self._tokenize_chinese(ref),
228
- #word_tokenize(hyp),
229
- self._tokenize_chinese(hyp),
230
- wordnet=chinese_wn
231
- )
232
- for ref, hyp in zip(ref_seg, pred_seg)
233
- ]
 
 
 
 
 
 
 
234
 
235
  return {
236
  "meteor": float(np.mean(scores)),
 
18
  # suppress WordNet warnings
19
  import warnings
20
  warnings.filterwarnings("ignore")
 
 
 
 
 
 
21
  import logging
22
  logging.getLogger("nltk").setLevel(logging.CRITICAL)
23
+ import os
24
+ import sys
 
 
 
 
 
 
25
  import jieba_fast as jieba
26
  import datasets
27
  from typing import List, Dict
 
145
  return pycantonese.segment(sentence)
146
 
147
  def _compute(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
148
+ original_stdout = sys.stdout # store original output
149
+ sys.stdout = open(os.devnull, 'w')
150
 
151
+ try:
152
+ pred_seg = [" ".join(jieba.cut(p.strip())) for p in predictions]
153
+ ref_seg = [" ".join(jieba.cut(r.strip())) for r in references]
154
+
155
+ # --- Apply Real Chinese WordNet into METEOR algorithm ---
156
+ def _cwn_synsets(self, word, pos=None): # Matches NLTK method call
157
+ if not isinstance(word, str) or not word.strip():
158
+ #print(f"DEBUG: Skipping non-string input: {type(word)}")
159
+ return []
160
+ cwn = _load_cwn()
161
+ try:
162
+ # Use escaped regex for exact match (CwnGraph expects string pattern)
163
+ pattern = f"^{re.escape(word)}$"
164
+ lemmas = cwn.find_lemma(pattern)
165
+ except Exception as e:
166
+ #print(f"DEBUG: Error querying CWN for '{word}': {e}")
167
+ return []
168
+
169
+ exact_lemmas = [l for l in lemmas if _get_lemma_name(l) == word]
170
+ if not exact_lemmas:
171
+ #print(f"DEBUG: No exact lemma found for '{word}'")
172
+ return []
173
+ synsets_list = []
174
+ seen_synset_ids = set()
175
+ for lemma in exact_lemmas:
176
+ for sense in lemma.senses:
177
+ synset = sense.synset
178
+ if synset:
 
 
179
  try:
180
+ synset_id = synset.id
 
181
  except AttributeError:
182
+ synset_id = str(synset)
183
+ if synset_id not in seen_synset_ids:
184
+ seen_synset_ids.add(synset_id)
185
+ try:
186
+ synset_lemmas = synset.lemmas
187
+ syn_lemma_names = [_get_lemma_name(l) for l in synset_lemmas]
188
+ except AttributeError:
189
+ synset_lemmas = []
190
+ for s in synset.senses:
191
  try:
192
+ # Access the single lemma via lemmas[0]
193
+ lemma = s.lemmas[0]
194
  synset_lemmas.append(lemma)
195
+ except (AttributeError, IndexError, TypeError):
196
+ try:
197
+ lemma = s.lemma
198
+ synset_lemmas.append(lemma)
199
+ except AttributeError:
200
+ #print(f"DEBUG: Could not extract lemma from sense {s}")
201
+ continue
202
+ syn_lemma_names = [_get_lemma_name(l) for l in synset_lemmas]
203
+ syn_lemmas_set = set(syn_lemma_names)
204
+ if syn_lemmas_set:
205
+ synsets_list.append(_CwnSynset(list(syn_lemmas_set), synset_id))
206
+ #print(f"DEBUG: Found {len(synsets_list)} synsets for '{word}': {synsets_list[0]._lemmas if synsets_list else []}")
207
+ return synsets_list[:1]
208
+
209
+ # Use class for proper method binding
210
+ class ChineseWordNet:
211
+ def synsets(self, word, pos=None):
212
+ return _cwn_synsets(self, word, pos)
213
+
214
+ chinese_wn = ChineseWordNet()
215
+
216
+ scores = [
217
+ meteor_score.single_meteor_score(
218
+ #word_tokenize(ref),
219
+ self._tokenize_chinese(ref),
220
+ #word_tokenize(hyp),
221
+ self._tokenize_chinese(hyp),
222
+ wordnet=chinese_wn
223
+ )
224
+ for ref, hyp in zip(ref_seg, pred_seg)
225
+ ]
226
+ finally:
227
+ sys.stdout.close()
228
+ sys.stdout = original_stdout # restore original output
229
 
230
  return {
231
  "meteor": float(np.mean(scores)),