EugeneXiang
/

prival

prompt-validation

Model card Files Files and versions

EugeneXiang committed on Apr 28, 2025

Commit

67096a2

·

verified ·

1 Parent(s): bed19e6

Upload nlp_helpers.py

Files changed (1) hide show

utils/nlp_helpers.py +42 -13

utils/nlp_helpers.py CHANGED Viewed

@@ -1,22 +1,51 @@
-# utils/nlp_helpers.py
-import spacy
-from typing import List
-# 加载小型中文模型或英文模型
 try:
-    nlp = spacy.load("zh_core_web_sm")
-except:
-    nlp = spacy.load("en_core_web_sm")
-def tokenize(text: str) -> List[str]:
-    return [token.text for token in nlp(text)]
 def sentence_length(text: str) -> int:
     return len(tokenize(text))
-def dependency_depth(text: str) -> int:
-    doc = nlp(text)
-    return max([len([t for t in token.ancestors]) for token in doc])

+"""
+NLP helper functions for PRIVAL. Guards against missing spaCy.
+"""
 try:
+    import spacy
+except ImportError:
+    spacy = None
+# Lazy-loaded spaCy model (English small by default)
+_nlp = None
+def _get_nlp(model_name: str = "en_core_web_sm"):
+    global _nlp
+    if spacy is None:
+        return None
+    if _nlp is None:
+        try:
+            _nlp = spacy.load(model_name)
+        except Exception:
+            _nlp = None
+    return _nlp
+def tokenize(text: str) -> list[str]:
+    """
+    分词：有 spaCy 则用它，否则按空白切分。
+    """
+    nlp = _get_nlp()
+    if nlp:
+        return [tok.text for tok in nlp(text)]
+    return text.split()
 def sentence_length(text: str) -> int:
+    """
+    句子长度（以词计数）：依赖 tokenize。
+    """
     return len(tokenize(text))
+def dependency_depth(doc) -> int:
+    """
+    句法依存树深度：需要传入 spaCy Doc；无 spaCy 时返回 0。
+    """
+    if spacy is None or doc is None:
+        return 0
+    # 计算最大依存链长度
+    def depth(tok):
+        if not list(tok.children):
+            return 1
+        return 1 + max(depth(child) for child in tok.children)
+    return max(depth(sent.root) for sent in doc.sents)
+# 你可以根据需要，继续添加其它工具（比如词性标注、命名实体等）