EugeneXiang committed on
Commit
67096a2
·
verified ·
1 Parent(s): bed19e6

Upload nlp_helpers.py

Browse files
Files changed (1) hide show
  1. utils/nlp_helpers.py +42 -13
utils/nlp_helpers.py CHANGED
@@ -1,22 +1,51 @@
1
- # utils/nlp_helpers.py
2
- import spacy
3
- from typing import List
4
 
5
- # 加载小型中文模型或英文模型
6
  try:
7
- nlp = spacy.load("zh_core_web_sm")
8
- except:
9
- nlp = spacy.load("en_core_web_sm")
10
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
- def tokenize(text: str) -> List[str]:
13
- return [token.text for token in nlp(text)]
14
-
 
 
 
 
 
15
 
16
  def sentence_length(text: str) -> int:
 
 
 
17
  return len(tokenize(text))
18
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
- def dependency_depth(text: str) -> int:
21
- doc = nlp(text)
22
- return max([len([t for t in token.ancestors]) for token in doc])
 
1
+ """
2
+ NLP helper functions for PRIVAL. Guards against missing spaCy.
3
+ """
4
 
 
5
  try:
6
+ import spacy
7
+ except ImportError:
8
+ spacy = None
9
 
10
# Module-level cache for the lazily loaded spaCy pipeline (English small model
# by default). Stays None until the first successful load.
_nlp = None


def _get_nlp(model_name: str = "en_core_web_sm"):
    """Return the cached spaCy pipeline, loading it on first use.

    Returns None when spaCy is not installed or when the model fails to
    load; a failed load is simply retried on the next call.
    """
    global _nlp
    # Guard clause: nothing to do if spaCy is missing or a pipeline is cached.
    if spacy is None or _nlp is not None:
        return _nlp
    try:
        _nlp = spacy.load(model_name)
    except Exception:
        # Model not downloaded / incompatible — leave the cache empty.
        _nlp = None
    return _nlp
22
 
23
def tokenize(text: str) -> list[str]:
    """Split *text* into a list of token strings.

    Uses the spaCy pipeline when one is available; otherwise falls back
    to plain whitespace splitting via ``str.split``.
    """
    pipeline = _get_nlp()
    if not pipeline:
        # No spaCy model available — whitespace fallback.
        return text.split()
    return [token.text for token in pipeline(text)]
31
 
32
def sentence_length(text: str) -> int:
    """Return the length of *text* in tokens (word count).

    Delegates to :func:`tokenize`, so the count matches whichever
    tokenization (spaCy or whitespace) is in effect.
    """
    tokens = tokenize(text)
    return len(tokens)
37
 
38
def dependency_depth(doc) -> int:
    """Return the maximum dependency-tree depth over the sentences of *doc*.

    Parameters
    ----------
    doc:
        A spaCy ``Doc`` (or None). Must expose ``.sents``; each sentence
        root must expose ``.children``.

    Returns
    -------
    int
        Depth of the deepest dependency chain, counting the sentence root
        as depth 1. Returns 0 when spaCy is unavailable, *doc* is None,
        or the document contains no sentences.
    """
    if spacy is None or doc is None:
        return 0

    def _depth(token) -> int:
        # Leaf tokens contribute depth 1; otherwise recurse into children.
        children = list(token.children)
        if not children:
            return 1
        return 1 + max(_depth(child) for child in children)

    # default=0 guards against an empty document: without it, max() over an
    # empty .sents iterator raises ValueError.
    return max((_depth(sent.root) for sent in doc.sents), default=0)
50
 
51
+ # Add further helpers here as needed (e.g. POS tagging, named-entity recognition).