Spaces:

raptorkwok
/

chinesemeteor

Sleeping

App Files Files Community

raptorkwok committed on Nov 7, 2025

Commit

fe6f409

1 Parent(s): 3c9928e

suppress warnings

Browse files

Files changed (3) hide show

.ipynb_checkpoints/README-checkpoint.md +0 -33
.ipynb_checkpoints/chinesemeteor-checkpoint.py +0 -222
chinesemeteor.py +19 -10

.ipynb_checkpoints/README-checkpoint.md DELETED Viewed

@@ -1,33 +0,0 @@
----
-library_name: evaluate
-emoji: 🤗
-colorFrom: blue
-colorTo: red
-datasets:
-- raptorkwok/cantonese-traditional-chinese-parallel-corpus-gen3
-tags:
-  - nlp
-  - translation
-  - chinese
-  - meteor
-  - jieba
-description: A METEOR implementation dedicated to Chinese sentences
-sdk: gradio
-sdk_version: 3.19.1
-app_file: app.py
-pinned: false
----
-# Metric Card for ChineseMETEOR
-Chinese METEOR Implementation
-```python
-import evaluate
-meteor = evaluate.load("raptorkwok/chinesemeteor")
-results = meteor.compute(
-    predictions=["我在這裡吃飯"],
-    references=["我在這裡吃飯"]
-)
-print(results)
-# {'meteor': 1.0}
-```

.ipynb_checkpoints/chinesemeteor-checkpoint.py DELETED Viewed

@@ -1,222 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Chinese METEOR — with Jieba pre-segmentation + CwnGraph Chinese WordNet
-"""
-import jieba_fast as jieba
-import datasets
-from typing import List, Dict
-import numpy as np
-from nltk.translate import meteor_score
-from nltk import word_tokenize
-import nltk
-import evaluate
-import re
-# suppress WordNet warnings
-import warnings
-warnings.filterwarnings("ignore")
-warnings.filterwarnings(
-    "ignore",
-    message="more than one synset, returning the first",
-    category=UserWarning,
-    module="nltk.translate.meteor_score"
-)
-# Download once
-nltk.download("wordnet", quiet=True)
-nltk.download("omw-1.4", quiet=True)
-nltk.download("punkt", quiet=True)
-nltk.download('punkt_tab', quiet=True)
-# ------------------------------------------------------------------- #
-#  REAL Chinese WordNet (CwnGraph) Integration
-# ------------------------------------------------------------------- #
-_cwn = None
-def _load_cwn():
-    global _cwn
-    if _cwn is None:
-        try:
-            from CwnGraph import CwnImage
-            print("Loading Chinese WordNet (CwnGraph, first time only)...")
-            _cwn = CwnImage.latest()
-        except ImportError:
-            raise ImportError("CwnGraph failed to load. Run: pip install CwnGraph")
-    return _cwn
-# Helper to get lemma name (with fallback for API versions)
-def _get_lemma_name(lemma):
-    try:
-        return lemma.name
-    except AttributeError:
-        return str(lemma).split(': ')[1].split('_')[0]
-# Custom Lemma & Synset for NLTK compatibility
-class _CwnLemma:
-    def __init__(self, name): self._name = name
-    def name(self): return self._name
-class _CwnSynset:
-    def __init__(self, lemmas, synset_id):
-        self._lemmas = lemmas
-        self._id = synset_id
-    def lemmas(self):
-        return [_CwnLemma(name) for name in self._lemmas]
-# ------------------------------------------------------------------- #
-#  HuggingFace Evaluation Metric
-# ------------------------------------------------------------------- #
-_DESCRIPTION = """\
-This evaluation metric is tailor-made to evaluate the translation quality of Chinese translation.
-"""
-_KWARGS_DESCRIPTION = """
-Calculates how good are predictions given some references, using certain scores
-Args:
-    predictions (str): translation sentence to score.
-    references (str): reference sentence for each translation.
-Returns:
-    meteor: the average METEOR score
-    scores: the METEOR score for each sentence pairs
-Examples:
-    Examples should be written in doctest format, and should illustrate how
-    to use the function.
-    >>> cmeteor = evaluate.load("raptorkwok/chinesemeteor")
-    >>> results = cmeteor.compute(references=["我在這裡吃飯"], predictions=["我在這兒吃晚飯"])
-    >>> print(results)
-    {'meteor': 0.5111111111111111, 'scores': [0.5111111111111111]}
-"""
-# ------------------------------------------------------------------- #
-#  HuggingFace evaluate template
-# ------------------------------------------------------------------- #
-@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
-class ChineseMETEOR(evaluate.Metric):
-    """TODO: Short description not ready yet."""
-    def _info(self):
-        return evaluate.MetricInfo(
-            module_type="metric",
-            description=_DESCRIPTION,
-            citation="""@inproceedings{denkowski-lavie-2014-meteor,
-                title = "Meteor Universal: Language Specific Translation Evaluation for Any Target Language",
-                author = "Denkowski, Michael  and  Lavie, Alon",
-                booktitle = "Proceedings of the Ninth Workshop on Statistical Machine Translation",
-                year = "2014"
-            }""",
-            inputs_description=_KWARGS_DESCRIPTION,
-            features=datasets.Features(
-                {
-                    "predictions": datasets.Value("string"),
-                    "references": datasets.Value("string"),
-                }
-            ),
-            # Homepage of the module for documentation
-            homepage="https://yourappapp.com",
-            # Additional links to the codebase or references
-            codebase_urls=["https://github.com/nltk/nltk"],
-            reference_urls=["https://www.cs.cmu.edu/~alavie/METEOR/"],
-        )
-    def _download_and_prepare(self, dl_manager) -> None:
-        """Download external resources useful to compute the scores"""
-        import nltk
-        nltk.download("wordnet", quiet=True)
-        nltk.download("omw-1.4", quiet=True)
-        nltk.download("punkt", quiet=True)
-        nltk.download('punkt_tab', quiet=True)
-        # CwnGraph auto-downloads on first use
-    def _compute(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
-        pred_seg = [" ".join(jieba.cut(p.strip())) for p in predictions]
-        ref_seg  = [" ".join(jieba.cut(r.strip())) for r in references]
-        # --- Apply Real Chinese WordNet into METEOR algorithm ---
-        def _cwn_synsets(self, word, pos=None):  # Matches NLTK method call
-            if not isinstance(word, str) or not word.strip():
-                #print(f"DEBUG: Skipping non-string input: {type(word)}")
-                return []
-            cwn = _load_cwn()
-            try:
-                # Use escaped regex for exact match (CwnGraph expects string pattern)
-                pattern = f"^{re.escape(word)}$"
-                lemmas = cwn.find_lemma(pattern)
-            except Exception as e:
-                #print(f"DEBUG: Error querying CWN for '{word}': {e}")
-                return []
-            exact_lemmas = [l for l in lemmas if _get_lemma_name(l) == word]
-            if not exact_lemmas:
-                #print(f"DEBUG: No exact lemma found for '{word}'")
-                return []
-            synsets_list = []
-            seen_synset_ids = set()
-            for lemma in exact_lemmas:
-                for sense in lemma.senses:
-                    synset = sense.synset
-                    if synset:
-                        try:
-                            synset_id = synset.id
-                        except AttributeError:
-                            synset_id = str(synset)
-                        if synset_id not in seen_synset_ids:
-                            seen_synset_ids.add(synset_id)
-                            try:
-                                synset_lemmas = synset.lemmas
-                                syn_lemma_names = [_get_lemma_name(l) for l in synset_lemmas]
-                            except AttributeError:
-                                synset_lemmas = []
-                                for s in synset.senses:
-                                    try:
-                                        # Access the single lemma via lemmas[0]
-                                        lemma = s.lemmas[0]
-                                        synset_lemmas.append(lemma)
-                                    except (AttributeError, IndexError, TypeError):
-                                        try:
-                                            lemma = s.lemma
-                                            synset_lemmas.append(lemma)
-                                        except AttributeError:
-                                            #print(f"DEBUG: Could not extract lemma from sense {s}")
-                                            continue
-                                syn_lemma_names = [_get_lemma_name(l) for l in synset_lemmas]
-                            syn_lemmas_set = set(syn_lemma_names)
-                            if syn_lemmas_set:
-                                synsets_list.append(_CwnSynset(list(syn_lemmas_set), synset_id))
-            #print(f"DEBUG: Found {len(synsets_list)} synsets for '{word}': {synsets_list[0]._lemmas if synsets_list else []}")
-            return synsets_list[:1]
-        # Use class for proper method binding
-        class ChineseWordNet:
-            def synsets(self, word, pos=None):
-                return _cwn_synsets(self, word, pos)
-        chinese_wn = ChineseWordNet()
-        scores = [
-            meteor_score.single_meteor_score(
-                word_tokenize(ref),
-                word_tokenize(hyp),
-                wordnet=chinese_wn
-            )
-            for ref, hyp in zip(ref_seg, pred_seg)
-        ]
-        return {
-            "meteor": float(np.mean(scores)),
-            "scores": scores,
-        }

chinesemeteor.py CHANGED Viewed

@@ -15,16 +15,6 @@
 """
 Chinese METEOR — with Jieba pre-segmentation + CwnGraph Chinese WordNet
 """
-import jieba_fast as jieba
-import datasets
-from typing import List, Dict
-import numpy as np
-from nltk.translate import meteor_score
-from nltk import word_tokenize
-import nltk
-import evaluate
-import re
 # suppress WordNet warnings
 import warnings
 warnings.filterwarnings("ignore")
@@ -34,6 +24,25 @@ warnings.filterwarnings(
     category=UserWarning,
     module="nltk.translate.meteor_score"
 )
 # Download once
 nltk.download("wordnet", quiet=True)

 """
 Chinese METEOR — with Jieba pre-segmentation + CwnGraph Chinese WordNet
 """
 # suppress WordNet warnings
 import warnings
 warnings.filterwarnings("ignore")
     category=UserWarning,
     module="nltk.translate.meteor_score"
 )
+import logging
+logging.getLogger("nltk").setLevel(logging.CRITICAL)
+_original_warn = warnings.warn
+def _no_meteor_warn(msg, *args, **kwargs):
+    if "more than one synset" in str(msg):
+        return
+    return _original_warn(msg, *args, **kwargs)
+warnings.warn = _no_meteor_warn
+import jieba_fast as jieba
+import datasets
+from typing import List, Dict
+import numpy as np
+from nltk.translate import meteor_score
+from nltk import word_tokenize
+import nltk
+import evaluate
+import re
 # Download once
 nltk.download("wordnet", quiet=True)