HoneyTian committed on
Commit
845e414
·
1 Parent(s): ac18f6b
examples/tutorial_pyltp/ner.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://huggingface.co/LTP
5
+ """
6
+ import argparse
7
+ import os
8
+
9
+ from pyltp import Postagger, Segmentor, NamedEntityRecognizer
10
+
11
+ from project_settings import project_path
12
+
13
+
14
def get_args():
    """Parse the command line options for the pyltp NER demo script."""
    arg_parser = argparse.ArgumentParser()

    arg_parser.add_argument(
        "--text",
        type=str,
        default="元芳你怎么看?我就趴窗口上看呗!",
    )
    arg_parser.add_argument(
        "--ltp_data_dir",
        type=str,
        default=(project_path / "data/pyltp_models/ltp_data_v3.4.0").as_posix(),
    )
    return arg_parser.parse_args()
29
+
30
+
31
def main():
    """Demo: segment, POS-tag and NER-tag the input text with pyltp and print each result."""
    args = get_args()

    data_dir = args.ltp_data_dir
    segmentor = Segmentor(os.path.join(data_dir, "cws.model"))
    postagger = Postagger(os.path.join(data_dir, "pos.model"))
    recognizer = NamedEntityRecognizer(os.path.join(data_dir, "ner.model"))

    words = segmentor.segment(args.text)
    postags = postagger.postag(words)
    ner_tags = recognizer.recognize(words, postags)

    for result in (words, postags, ner_tags):
        print(result)

    # Release the native pyltp model resources explicitly.
    for model in (segmentor, postagger, recognizer):
        model.release()

    return


if __name__ == "__main__":
    main()
main.py CHANGED
@@ -21,6 +21,11 @@ log.setup(log_directory=log_directory)
21
  import gradio as gr
22
 
23
  from toolbox.os.command import Command
 
 
 
 
 
24
  from toolbox.part_of_speech.part_of_speech import (
25
  language_to_engines as pos_language_to_engines,
26
  engine_to_tagger as pos_engine_to_tagger,
@@ -38,6 +43,11 @@ main_logger = logging.getLogger("main")
38
  def get_args():
39
  parser = argparse.ArgumentParser()
40
 
 
 
 
 
 
41
  parser.add_argument(
42
  "--pos_example_json_file",
43
  default=(project_path / "pos_examples.json").as_posix(),
@@ -52,6 +62,26 @@ def get_args():
52
  return args
53
 
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  def run_pos_tag(text: str, language: str, engine: str) -> str:
56
  try:
57
  main_logger.info(f"pos tag started. text: {text}, language: {language}, engine: {engine}")
@@ -106,6 +136,8 @@ def shell(cmd: str):
106
  def main():
107
  args = get_args()
108
 
 
 
109
  with open(args.pos_example_json_file, "r", encoding="utf-8") as f:
110
  pos_examples: list = json.load(f)
111
  with open(args.srl_example_json_file, "r", encoding="utf-8") as f:
@@ -200,6 +232,47 @@ def main():
200
  fn=run_srl,
201
  )
202
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
  with gr.TabItem("shell"):
204
  shell_text = gr.Textbox(label="cmd")
205
  shell_button = gr.Button("run")
 
21
  import gradio as gr
22
 
23
  from toolbox.os.command import Command
24
+ from toolbox.named_entity_recognization.named_entity_recognization import (
25
+ language_to_engines as ner_language_to_engines,
26
+ engine_to_tagger as ner_engine_to_tagger,
27
+ ner
28
+ )
29
  from toolbox.part_of_speech.part_of_speech import (
30
  language_to_engines as pos_language_to_engines,
31
  engine_to_tagger as pos_engine_to_tagger,
 
43
  def get_args():
44
  parser = argparse.ArgumentParser()
45
 
46
+ parser.add_argument(
47
+ "--ner_example_json_file",
48
+ default=(project_path / "ner_examples.json").as_posix(),
49
+ type=str
50
+ )
51
  parser.add_argument(
52
  "--pos_example_json_file",
53
  default=(project_path / "pos_examples.json").as_posix(),
 
62
  return args
63
 
64
 
65
def run_ner(text: str, language: str, engine: str) -> str:
    """Gradio callback: run NER over *text* and render word/tag pairs as a string.

    On any failure the exception is rendered into the output string instead of
    propagating, so the UI always shows something (deliberate best-effort).
    """
    try:
        main_logger.info(f"ner started. text: {text}, language: {language}, engine: {engine}")

        begin = time.time()

        words, postags, ner_tags = ner(text, language, engine)
        # One "word/tag" cell per token, tab separated (a trailing tab is
        # kept to match the established output format).
        cells = [f"{word}/{ner_tag}" for word, ner_tag in zip(words, ner_tags)]
        result = "".join(f"{row}\t" for row in cells)

        time_cost = time.time() - begin
        result += f"\n\ntime_cost: {round(time_cost, 4)}"
        return result
    except Exception as e:
        result = f"{type(e)}\n{str(e)}"
        return result
85
  def run_pos_tag(text: str, language: str, engine: str) -> str:
86
  try:
87
  main_logger.info(f"pos tag started. text: {text}, language: {language}, engine: {engine}")
 
136
  def main():
137
  args = get_args()
138
 
139
+ with open(args.ner_example_json_file, "r", encoding="utf-8") as f:
140
+ ner_examples: list = json.load(f)
141
  with open(args.pos_example_json_file, "r", encoding="utf-8") as f:
142
  pos_examples: list = json.load(f)
143
  with open(args.srl_example_json_file, "r", encoding="utf-8") as f:
 
232
  fn=run_srl,
233
  )
234
 
235
with gr.TabItem("ner"):
    def ner_get_languages_by_engine(engine: str):
        # Rebuild the language dropdown with only the languages the
        # selected engine supports.
        language_list = list()
        for k, v in ner_language_to_engines.items():
            if engine in v:
                language_list.append(k)
        return gr.Dropdown(choices=language_list, value=language_list[0], label="language")

    ner_language_choices = list(ner_language_to_engines.keys())
    ner_engine_choices = list(ner_engine_to_tagger.keys())

    ner_text = gr.Textbox(value="学而时习之,不亦悦乎。", lines=4, max_lines=50, label="text")

    with gr.Row():
        ner_language = gr.Dropdown(
            choices=ner_language_choices, value=ner_language_choices[0],
            label="language"
        )
        ner_engine = gr.Dropdown(
            choices=ner_engine_choices, value=ner_engine_choices[0],
            label="engine"
        )
    ner_engine.change(
        ner_get_languages_by_engine,
        inputs=[ner_engine],
        outputs=[ner_language],
    )
    ner_output = gr.Textbox(lines=4, max_lines=50, label="output")
    # Fix: button label read "pos_tag" (copy-pasted from the POS tab);
    # this tab runs NER, so label it accordingly.
    ner_button = gr.Button(value="ner", variant="primary")
    ner_button.click(
        run_ner,
        inputs=[ner_text, ner_language, ner_engine],
        outputs=[ner_output],
    )
    gr.Examples(
        examples=ner_examples,
        inputs=[ner_text, ner_language, ner_engine],
        outputs=[ner_output],
        fn=run_ner,
    )
276
  with gr.TabItem("shell"):
277
  shell_text = gr.Textbox(label="cmd")
278
  shell_button = gr.Button("run")
ner_examples.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [
2
+ ["元芳你怎么看?我就趴窗口上看呗!", "chinese", "pyltp"]
3
+ ]
toolbox/named_entity_recognization/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+
5
# Package initializer: no standalone entry-point behavior.
if __name__ == "__main__":
    pass
toolbox/named_entity_recognization/named_entity_recognization.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ from typing import Callable, Dict, List, Tuple, Union
4
+
5
+ from toolbox.named_entity_recognization.pyltp_ner import pyltp_ner
6
+
7
+
8
+ language_to_engines = {
9
+ "chinese": ["pyltp"]
10
+ }
11
+
12
+
13
+ engine_to_tagger: Dict[str, Callable] = {
14
+ "pyltp": pyltp_ner
15
+ }
16
+
17
+
18
def ner(text: str, language: str, engine: str):
    """Dispatch NER to the tagger registered for *engine*.

    Returns the (words, postags, ner_tags) triple produced by the tagger.
    """
    ner_tagger = engine_to_tagger.get(engine)
    if ner_tagger is None:
        raise AssertionError(f"engine {engine} not supported.")

    # Each registered tagger yields the (words, postags, ner_tags) triple.
    return ner_tagger(text, language)
25
+
26
+
27
+ if __name__ == "__main__":
28
+ pass
toolbox/named_entity_recognization/pyltp_ner.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ from functools import lru_cache
4
+ import os
5
+
6
+ ltp_data_dir = os.environ.get("LTP_DATA_DIR")
7
+
8
+ from pyltp import Postagger, Segmentor, NamedEntityRecognizer
9
+
10
+
11
@lru_cache(maxsize=5)
def get_pyltp_srl_tagger():
    """Load and cache the pyltp segmentor / POS tagger / NER recognizer triple.

    NOTE(review): the name says "srl" but this loads the NER pipeline —
    looks copy-pasted from the SRL module; consider renaming. Kept as-is
    so any existing importers keep working.

    Returns:
        (Segmentor, Postagger, NamedEntityRecognizer) tuple, loaded once
        and memoized by lru_cache.

    Raises:
        RuntimeError: if the LTP_DATA_DIR environment variable is not set.
    """
    # The original declared `global ltp_data_dir`, which is unnecessary for
    # a read-only access and was removed.
    if ltp_data_dir is None:
        # Without this guard, os.path.join(None, ...) raises an opaque TypeError.
        raise RuntimeError("LTP_DATA_DIR environment variable is not set.")

    cws_model_path = os.path.join(ltp_data_dir, "cws.model")
    pos_model_path = os.path.join(ltp_data_dir, "pos.model")
    ner_model_path = os.path.join(ltp_data_dir, "ner.model")

    segmentor = Segmentor(cws_model_path)
    pos_tagger = Postagger(pos_model_path)
    recognizer = NamedEntityRecognizer(ner_model_path)

    return segmentor, pos_tagger, recognizer
24
+
25
+
26
def pyltp_ner(text: str, language: str) -> tuple:
    """Run the full pyltp pipeline (segment -> POS tag -> NER) on *text*.

    The *language* argument is accepted for signature symmetry with other
    engines but is not used in this body.

    Returns:
        (words, postags, ner_tags) — the three pyltp result sequences.
        The return annotation was corrected from ``list``: the function
        returns a tuple.
    """
    segmentor, pos_tagger, recognizer = get_pyltp_srl_tagger()

    words = segmentor.segment(text)
    postags = pos_tagger.postag(words)
    ner_tags = recognizer.recognize(words, postags)

    return words, postags, ner_tags
34
+
35
+
36
+ if __name__ == "__main__":
37
+ pass
toolbox/sementic_role_labeling/pyltp_srl.py CHANGED
@@ -14,14 +14,14 @@ from pyltp import Parser, Postagger, Segmentor, SementicRoleLabeller
14
  def get_pyltp_srl_tagger():
15
  global ltp_data_dir
16
 
17
- cws_model_path = os.path.join(ltp_data_dir, 'cws.model')
18
- pos_model_path = os.path.join(ltp_data_dir, 'pos.model')
19
- parser_model_path = os.path.join(ltp_data_dir, 'parser.model')
20
 
21
- if platform.system() == 'Windows':
22
- srl_model_path = os.path.join(ltp_data_dir, 'pisrl_win.model')
23
  else:
24
- srl_model_path = os.path.join(ltp_data_dir, 'pisrl.model')
25
 
26
  segmentor = Segmentor(cws_model_path)
27
  pos_tagger = Postagger(pos_model_path)
 
14
  def get_pyltp_srl_tagger():
15
  global ltp_data_dir
16
 
17
+ cws_model_path = os.path.join(ltp_data_dir, "cws.model")
18
+ pos_model_path = os.path.join(ltp_data_dir, "pos.model")
19
+ parser_model_path = os.path.join(ltp_data_dir, "parser.model")
20
 
21
+ if platform.system() == "Windows":
22
+ srl_model_path = os.path.join(ltp_data_dir, "pisrl_win.model")
23
  else:
24
+ srl_model_path = os.path.join(ltp_data_dir, "pisrl.model")
25
 
26
  segmentor = Segmentor(cws_model_path)
27
  pos_tagger = Postagger(pos_model_path)