Spaces:
Sleeping
Sleeping
File size: 9,529 Bytes
e6953e3 98cc462 e6953e3 21a1897 e6953e3 21a1897 fe6f409 f7c53ae fe6f409 ddeed21 21a1897 e6953e3 d1f0daf f751254 8f33fa3 e6953e3 8f33fa3 e6953e3 8f33fa3 e6953e3 8f33fa3 e6953e3 8f33fa3 e6953e3 21a1897 8f33fa3 e6953e3 8f33fa3 e6953e3 98cc462 8f33fa3 98cc462 8f33fa3 e6953e3 8f33fa3 e6953e3 8f33fa3 e6953e3 8f33fa3 e6953e3 8f33fa3 e6953e3 98cc462 e6953e3 f751254 98cc462 ddeed21 e6953e3 f7c53ae e6953e3 f7c53ae 29744eb f7c53ae e6953e3 f7c53ae e6953e3 f7c53ae e6953e3 f7c53ae e6953e3 f7c53ae f91e01a f7c53ae e6953e3 8f33fa3 e6953e3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 |
# -*- coding: utf-8 -*-
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Chinese METEOR — with Jieba pre-segmentation + CwnGraph Chinese WordNet
"""
# suppress WordNet warnings
import warnings
warnings.filterwarnings("ignore")
import logging
logging.getLogger("nltk").setLevel(logging.CRITICAL)
import os
import sys
import jieba_fast as jieba
import datasets
from typing import List, Dict
import numpy as np
from nltk.translate import meteor_score
from nltk import word_tokenize
import nltk
import evaluate
import re
import pycantonese
# Download once
nltk.download("wordnet", quiet=True)
nltk.download("omw-1.4", quiet=True)
nltk.download("punkt", quiet=True)
nltk.download('punkt_tab', quiet=True)
# ------------------------------------------------------------------- #
# REAL Chinese WordNet (CwnGraph) Integration
# ------------------------------------------------------------------- #
_cwn = None
def _load_cwn():
global _cwn
if _cwn is None:
try:
from CwnGraph import CwnImage
print("Loading Chinese WordNet (CwnGraph, first time only)...")
_cwn = CwnImage.latest()
except ImportError:
raise ImportError("CwnGraph failed to load. Run: pip install CwnGraph")
return _cwn
# Helper to get lemma name (with fallback for API versions)
def _get_lemma_name(lemma):
try:
return lemma.name
except AttributeError:
return str(lemma).split(': ')[1].split('_')[0]
# Custom Lemma & Synset for NLTK compatibility
class _CwnLemma:
def __init__(self, name): self._name = name
def name(self): return self._name
class _CwnSynset:
def __init__(self, lemmas, synset_id):
self._lemmas = lemmas
self._id = synset_id
def lemmas(self):
return [_CwnLemma(name) for name in self._lemmas]
# ------------------------------------------------------------------- #
# HuggingFace Evaluation Metric
# ------------------------------------------------------------------- #
_DESCRIPTION = """\
This evaluation metric is tailor-made to evaluate the translation quality of Chinese translation.
"""
_KWARGS_DESCRIPTION = """
Calculates how good are predictions given some references, using certain scores
Args:
predictions (str): translation sentence to score.
references (str): reference sentence for each translation.
Returns:
meteor: the average METEOR score
scores: the METEOR score for each sentence pairs
Examples:
Examples should be written in doctest format, and should illustrate how
to use the function.
>>> cmeteor = evaluate.load("raptorkwok/chinesemeteor")
>>> results = cmeteor.compute(references=["我在這裡吃飯"], predictions=["我在這兒吃晚飯"])
>>> print(results)
{'meteor': 0.5111111111111111, 'scores': [0.5111111111111111]}
"""
# ------------------------------------------------------------------- #
# HuggingFace evaluate template
# ------------------------------------------------------------------- #
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class ChineseMETEOR(evaluate.Metric):
    """Sentence-level METEOR metric for Chinese translation evaluation.

    Each prediction/reference pair is word-segmented with PyCantonese and
    scored with NLTK's METEOR, substituting a CwnGraph-backed Chinese
    WordNet for the synonym-matching stage.
    """
    def _info(self):
        # Metric metadata consumed by the `evaluate` framework
        # (features define the expected input columns).
        return evaluate.MetricInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation="""@inproceedings{denkowski-lavie-2014-meteor,
    title = "Meteor Universal: Language Specific Translation Evaluation for Any Target Language",
    author = "Denkowski, Michael and Lavie, Alon",
    booktitle = "Proceedings of the Ninth Workshop on Statistical Machine Translation",
    year = "2014"
}""",
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string"),
                    "references": datasets.Value("string"),
                }
            ),
            # Homepage of the module for documentation
            homepage="https://yourappapp.com",
            # Additional links to the codebase or references
            codebase_urls=["https://github.com/nltk/nltk"],
            reference_urls=["https://www.cs.cmu.edu/~alavie/METEOR/"],
        )
    def _download_and_prepare(self, dl_manager) -> None:
        """Download external resources useful to compute the scores"""
        # These NLTK corpora back the stemming/tokenization inside
        # meteor_score; `quiet=True` avoids noisy re-download logs.
        import nltk
        nltk.download("wordnet", quiet=True)
        nltk.download("omw-1.4", quiet=True)
        nltk.download("punkt", quiet=True)
        nltk.download('punkt_tab', quiet=True)
        # CwnGraph auto-downloads on first use
    def _compute(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
        """Compute per-sentence and corpus-average METEOR scores.

        Args:
            predictions: translated sentences to score.
            references: one reference sentence per prediction
                (paired positionally with `predictions`).

        Returns:
            dict with "meteor" (mean over all pairs) and "scores"
            (per-sentence METEOR values, in input order).
        """
        # Silence chatty third-party output (CwnGraph loading banners etc.)
        # by pointing stdout at /dev/null for the duration of the call.
        original_stdout = sys.stdout # store original output
        sys.stdout = open(os.devnull, 'w')
        try:
            # Tokenize using PyCantonese
            # NOTE(review): the module docstring mentions Jieba, but the
            # segmenter actually used here is PyCantonese; jieba_fast is
            # imported at module level yet unused in this method — confirm
            # which segmenter is intended.
            pred_seg = [pycantonese.segment(p.strip()) for p in predictions]
            ref_seg = [pycantonese.segment(r.strip()) for r in references]
            # --- Apply Real Chinese WordNet into METEOR algorithm ---
            def _cwn_synsets(self, word, pos=None): # Matches NLTK method call
                # `self` is accepted (unused) so the signature mirrors
                # NLTK's wordnet.synsets(word, pos) method-call shape.
                if not isinstance(word, str) or not word.strip():
                    #print(f"DEBUG: Skipping non-string input: {type(word)}")
                    return []
                cwn = _load_cwn()
                try:
                    # Use escaped regex for exact match (CwnGraph expects string pattern)
                    pattern = f"^{re.escape(word)}$"
                    lemmas = cwn.find_lemma(pattern)
                except Exception as e:
                    #print(f"DEBUG: Error querying CWN for '{word}': {e}")
                    return []
                # Keep only lemmas whose surface form equals the query word
                # (the regex can still over-match on some CwnGraph versions).
                exact_lemmas = [l for l in lemmas if _get_lemma_name(l) == word]
                if not exact_lemmas:
                    #print(f"DEBUG: No exact lemma found for '{word}'")
                    return []
                synsets_list = []
                # Dedup synsets shared by several lemmas via their ids.
                seen_synset_ids = set()
                for lemma in exact_lemmas:
                    for sense in lemma.senses:
                        synset = sense.synset
                        if synset:
                            try:
                                synset_id = synset.id
                            except AttributeError:
                                # Older API: no `.id`; fall back to the repr
                                # as a (still unique enough) dedup key.
                                synset_id = str(synset)
                            if synset_id not in seen_synset_ids:
                                seen_synset_ids.add(synset_id)
                                try:
                                    # Preferred path: synset exposes its lemmas directly.
                                    synset_lemmas = synset.lemmas
                                    syn_lemma_names = [_get_lemma_name(l) for l in synset_lemmas]
                                except AttributeError:
                                    # Fallback: rebuild the lemma set by walking
                                    # the synset's senses (API-version drift).
                                    synset_lemmas = []
                                    for s in synset.senses:
                                        try:
                                            # Access the single lemma via lemmas[0]
                                            lemma = s.lemmas[0]
                                            synset_lemmas.append(lemma)
                                        except (AttributeError, IndexError, TypeError):
                                            try:
                                                lemma = s.lemma
                                                synset_lemmas.append(lemma)
                                            except AttributeError:
                                                #print(f"DEBUG: Could not extract lemma from sense {s}")
                                                continue
                                    syn_lemma_names = [_get_lemma_name(l) for l in synset_lemmas]
                                syn_lemmas_set = set(syn_lemma_names)
                                if syn_lemmas_set:
                                    synsets_list.append(_CwnSynset(list(syn_lemmas_set), synset_id))
                #print(f"DEBUG: Found {len(synsets_list)} synsets for '{word}': {synsets_list[0]._lemmas if synsets_list else []}")
                # NOTE(review): only the FIRST synset is surfaced — presumably
                # to keep synonym expansion conservative; confirm intent.
                return synsets_list[:1]
            # Use class for proper method binding
            class ChineseWordNet:
                # Duck-types the slice of NLTK's wordnet API that
                # meteor_score actually calls.
                def synsets(self, word, pos=None):
                    return _cwn_synsets(self, word, pos)
            chinese_wn = ChineseWordNet()
            scores = [
                meteor_score.single_meteor_score(
                    ref,
                    hyp,
                    wordnet=chinese_wn
                )
                for ref, hyp in zip(ref_seg, pred_seg)
            ]
        finally:
            # Always restore stdout, even if scoring raised.
            sys.stdout.close()
            sys.stdout = original_stdout # restore original output
        # NOTE(review): np.mean([]) yields nan for empty input — confirm
        # whether empty prediction lists should be rejected upstream.
        return {
            "meteor": float(np.mean(scores)),
            "scores": scores,
        }
|