raptorkwok committed on
Commit
d1f0daf
·
1 Parent(s): 868038e

added back NLTK library

Browse files
.ipynb_checkpoints/chinesemeteor-checkpoint.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ METEOR (Chinese) — with Jieba pre-segmentation + Real CwnGraph Chinese WordNet
4
+ HuggingFace evaluate metric template
5
+ """
6
+ import jieba_fast as jieba
7
+ import datasets
8
+ from typing import List, Dict
9
+ import numpy as np
10
+ from nltk.translate import meteor_score
11
+ from nltk import word_tokenize
12
+ import nltk
13
+ import evaluate
14
+ import re
15
+
16
+ # Download once
17
+ nltk.download("wordnet", quiet=True)
18
+ nltk.download("omw-1.4", quiet=True)
19
+ nltk.download("punkt", quiet=True)
20
+
21
+ # ------------------------------------------------------------------- #
22
+ # REAL Chinese WordNet (CwnGraph) Integration
23
+ # ------------------------------------------------------------------- #
24
+ _cwn = None
25
+ def _load_cwn():
26
+ global _cwn
27
+ if _cwn is None:
28
+ try:
29
+ from CwnGraph import CwnImage
30
+ print("Loading Chinese WordNet (CwnGraph, first time only)...")
31
+ _cwn = CwnImage.latest()
32
+ except ImportError:
33
+ raise ImportError("CwnGraph failed to load. Run: pip install CwnGraph")
34
+ return _cwn
35
+
36
+ # Helper to get lemma name (with fallback for API versions)
37
+ def _get_lemma_name(lemma):
38
+ try:
39
+ return lemma.name
40
+ except AttributeError:
41
+ return str(lemma).split(': ')[1].split('_')[0]
42
+
43
+ # Custom Lemma & Synset for NLTK compatibility
44
+ class _CwnLemma:
45
+ def __init__(self, name): self._name = name
46
+ def name(self): return self._name
47
+
48
+ class _CwnSynset:
49
+ def __init__(self, lemmas, synset_id):
50
+ self._lemmas = lemmas
51
+ self._id = synset_id
52
+ def lemmas(self):
53
+ return [_CwnLemma(name) for name in self._lemmas]
54
+
55
+ # ------------------------------------------------------------------- #
56
+ # HuggingFace Evaluation Metric
57
+ # ------------------------------------------------------------------- #
58
+
59
+ _DESCRIPTION = """\
60
+ This evaluation metric is tailor-made to evaluate the translation quality of Chinese translation.
61
+ """
62
+
63
+ _KWARGS_DESCRIPTION = """
64
+ Calculates how good are predictions given some references, using certain scores
65
+ Args:
66
+ predictions (str): translation sentence to score.
67
+ references (str): reference sentence for each translation.
68
+ Returns:
69
+ meteor: the average METEOR score
70
+ scores: the METEOR score for each sentence pairs
71
+
72
+ Examples:
73
+ Examples should be written in doctest format, and should illustrate how
74
+ to use the function.
75
+
76
+ >>> cmeteor = evaluate.load("raptorkwok/chinesemeteor")
77
+ >>> results = cmeteor.compute(references=["Reference Sentence in Chinese"], predictions=["Predicted Sentence in Chinese"])
78
+ >>> print(results)
79
+ {'meteor': 0.5111111111111111, 'scores': [0.5111111111111111]}
80
+ """
81
+
82
+ # ------------------------------------------------------------------- #
83
+ # HuggingFace evaluate template
84
+ # ------------------------------------------------------------------- #
85
+ class ChineseMETEOR(evaluate.Metric):
86
+ def _info(self):
87
+ return evaluate.MetricInfo(
88
+ module_type="metric",
89
+ description=_DESCRIPTION,
90
+ citation="""@inproceedings{denkowski-lavie-2014-meteor,
91
+ title = "Meteor Universal: Language Specific Translation Evaluation for Any Target Language",
92
+ author = "Denkowski, Michael and Lavie, Alon",
93
+ booktitle = "Proceedings of the Ninth Workshop on Statistical Machine Translation",
94
+ year = "2014"
95
+ }""",
96
+ inputs_description=_KWARGS_DESCRIPTION,
97
+ features=datasets.Features(
98
+ {
99
+ "predictions": datasets.Value("string"),
100
+ "references": datasets.Value("string"),
101
+ }
102
+ ),
103
+ # Homepage of the module for documentation
104
+ homepage="https://yourappapp.com",
105
+ # Additional links to the codebase or references
106
+ codebase_urls=["https://github.com/nltk/nltk"],
107
+ reference_urls=["https://www.cs.cmu.edu/~alavie/METEOR/"],
108
+ )
109
+
110
+ def _download_and_prepare(self, dl_manager) -> None:
111
+ """Optional: download external resources useful to compute the scores"""
112
+ # CwnGraph auto-downloads on first use
113
+ import nltk
114
+ nltk.download("wordnet", quiet=True)
115
+ nltk.download("omw-1.4", quiet=True)
116
+ nltk.download("punkt", quiet=True)
117
+ pass
118
+
119
+ def _compute(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
120
+ pred_seg = [" ".join(jieba.cut(p.strip())) for p in predictions]
121
+ ref_seg = [" ".join(jieba.cut(r.strip())) for r in references]
122
+
123
+ # --- FORCE Real CWN INTO METEOR ---
124
+ def _cwn_synsets(self, word, pos=None): # Matches NLTK method call
125
+ if not isinstance(word, str) or not word.strip():
126
+ print(f"DEBUG: Skipping non-string input: {type(word)}")
127
+ return []
128
+ cwn = _load_cwn()
129
+ try:
130
+ # Use escaped regex for exact match (CwnGraph expects string pattern)
131
+ pattern = f"^{re.escape(word)}$"
132
+ lemmas = cwn.find_lemma(pattern)
133
+ except Exception as e:
134
+ print(f"DEBUG: Error querying CWN for '{word}': {e}")
135
+ return []
136
+ # FIXED: Use _get_lemma_name for comparison (handles missing .name)
137
+ exact_lemmas = [l for l in lemmas if _get_lemma_name(l) == word]
138
+ if not exact_lemmas:
139
+ print(f"DEBUG: No exact lemma found for '{word}'")
140
+ return []
141
+ synsets_list = []
142
+ seen_synset_ids = set()
143
+ for lemma in exact_lemmas:
144
+ for sense in lemma.senses:
145
+ synset = sense.synset
146
+ if synset:
147
+ try:
148
+ synset_id = synset.id
149
+ except AttributeError:
150
+ synset_id = str(synset)
151
+ if synset_id not in seen_synset_ids:
152
+ seen_synset_ids.add(synset_id)
153
+ try:
154
+ synset_lemmas = synset.lemmas
155
+ syn_lemma_names = [_get_lemma_name(l) for l in synset_lemmas]
156
+ except AttributeError:
157
+ synset_lemmas = []
158
+ for s in synset.senses:
159
+ try:
160
+ # Access the single lemma via lemmas[0]
161
+ lemma = s.lemmas[0]
162
+ synset_lemmas.append(lemma)
163
+ except (AttributeError, IndexError, TypeError):
164
+ try:
165
+ lemma = s.lemma
166
+ synset_lemmas.append(lemma)
167
+ except AttributeError:
168
+ print(f"DEBUG: Could not extract lemma from sense {s}")
169
+ continue
170
+ syn_lemma_names = [_get_lemma_name(l) for l in synset_lemmas]
171
+ syn_lemmas_set = set(syn_lemma_names)
172
+ if syn_lemmas_set:
173
+ synsets_list.append(_CwnSynset(list(syn_lemmas_set), synset_id))
174
+ print(f"DEBUG: Found {len(synsets_list)} synsets for '{word}': {synsets_list[0]._lemmas if synsets_list else []}")
175
+ return synsets_list
176
+
177
+ # Use class for proper method binding
178
+ class ChineseWordNet:
179
+ def synsets(self, word, pos=None):
180
+ return _cwn_synsets(self, word, pos)
181
+
182
+ chinese_wn = ChineseWordNet()
183
+
184
+ scores = [
185
+ meteor_score.single_meteor_score(
186
+ word_tokenize(ref),
187
+ word_tokenize(hyp),
188
+ wordnet=chinese_wn
189
+ )
190
+ for ref, hyp in zip(ref_seg, pred_seg)
191
+ ]
192
+
193
+ return {
194
+ "meteor": float(np.mean(scores)),
195
+ "scores": scores,
196
+ }
chinesemeteor.py CHANGED
@@ -9,14 +9,14 @@ from typing import List, Dict
9
  import numpy as np
10
  from nltk.translate import meteor_score
11
  from nltk import word_tokenize
12
- #import nltk
13
  import evaluate
14
  import re
15
 
16
  # Download once
17
- #nltk.download("wordnet", quiet=True)
18
- #nltk.download("omw-1.4", quiet=True)
19
- #nltk.download("punkt", quiet=True)
20
 
21
  # ------------------------------------------------------------------- #
22
  # REAL Chinese WordNet (CwnGraph) Integration
 
9
  import numpy as np
10
  from nltk.translate import meteor_score
11
  from nltk import word_tokenize
12
+ import nltk
13
  import evaluate
14
  import re
15
 
16
  # Download once
17
+ nltk.download("wordnet", quiet=True)
18
+ nltk.download("omw-1.4", quiet=True)
19
+ nltk.download("punkt", quiet=True)
20
 
21
  # ------------------------------------------------------------------- #
22
  # REAL Chinese WordNet (CwnGraph) Integration