Spaces:
Paused
Paused
update
Browse files- examples/tutorial_pyltp/ner.py +58 -0
- main.py +73 -0
- ner_examples.json +3 -0
- toolbox/named_entity_recognization/__init__.py +6 -0
- toolbox/named_entity_recognization/named_entity_recognization.py +28 -0
- toolbox/named_entity_recognization/pyltp_ner.py +37 -0
- toolbox/sementic_role_labeling/pyltp_srl.py +6 -6
examples/tutorial_pyltp/ner.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
"""
|
| 4 |
+
https://huggingface.co/LTP
|
| 5 |
+
"""
|
| 6 |
+
import argparse
|
| 7 |
+
import os
|
| 8 |
+
|
| 9 |
+
from pyltp import Postagger, Segmentor, NamedEntityRecognizer
|
| 10 |
+
|
| 11 |
+
from project_settings import project_path
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def get_args():
    """Parse the command-line options of the pyltp NER tutorial script.

    Options:
        --text: sentence to analyse; a Chinese sample sentence by default.
        --ltp_data_dir: directory holding the LTP model files; defaults to
            ``data/pyltp_models/ltp_data_v3.4.0`` under ``project_path``.

    Returns:
        The namespace produced by ``ArgumentParser.parse_args()``.
    """
    default_model_dir = project_path / "data/pyltp_models/ltp_data_v3.4.0"

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--text",
        type=str,
        default="元芳你怎么看?我就趴窗口上看呗!",
    )
    arg_parser.add_argument(
        "--ltp_data_dir",
        type=str,
        default=default_model_dir.as_posix(),
    )
    return arg_parser.parse_args()
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def main():
    """Run segmentation, POS tagging and NER on ``--text`` and print each result.

    The three pyltp models (``cws.model``, ``pos.model``, ``ner.model``) are
    loaded from ``--ltp_data_dir``. Every model that was successfully loaded
    is released on exit, including when a later load or an inference call
    raises.
    """
    # Function-scope import so this entry point stays self-contained.
    from contextlib import ExitStack

    args = get_args()

    cws_model_path = os.path.join(args.ltp_data_dir, "cws.model")
    pos_model_path = os.path.join(args.ltp_data_dir, "pos.model")
    ner_model_path = os.path.join(args.ltp_data_dir, "ner.model")

    with ExitStack() as stack:
        # Register release() right after each model is constructed, so a
        # failure further down still frees the models loaded so far.
        # Callbacks run in reverse registration order when the block exits.
        segmentor = Segmentor(cws_model_path)
        stack.callback(segmentor.release)
        postagger = Postagger(pos_model_path)
        stack.callback(postagger.release)
        recognizer = NamedEntityRecognizer(ner_model_path)
        stack.callback(recognizer.release)

        words = segmentor.segment(args.text)
        postags = postagger.postag(words)
        ner_tags = recognizer.recognize(words, postags)

        print(words)
        print(postags)
        print(ner_tags)

    return
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
if __name__ == "__main__":
|
| 58 |
+
main()
|
main.py
CHANGED
|
@@ -21,6 +21,11 @@ log.setup(log_directory=log_directory)
|
|
| 21 |
import gradio as gr
|
| 22 |
|
| 23 |
from toolbox.os.command import Command
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
from toolbox.part_of_speech.part_of_speech import (
|
| 25 |
language_to_engines as pos_language_to_engines,
|
| 26 |
engine_to_tagger as pos_engine_to_tagger,
|
|
@@ -38,6 +43,11 @@ main_logger = logging.getLogger("main")
|
|
| 38 |
def get_args():
|
| 39 |
parser = argparse.ArgumentParser()
|
| 40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
parser.add_argument(
|
| 42 |
"--pos_example_json_file",
|
| 43 |
default=(project_path / "pos_examples.json").as_posix(),
|
|
@@ -52,6 +62,26 @@ def get_args():
|
|
| 52 |
return args
|
| 53 |
|
| 54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
def run_pos_tag(text: str, language: str, engine: str) -> str:
|
| 56 |
try:
|
| 57 |
main_logger.info(f"pos tag started. text: {text}, language: {language}, engine: {engine}")
|
|
@@ -106,6 +136,8 @@ def shell(cmd: str):
|
|
| 106 |
def main():
|
| 107 |
args = get_args()
|
| 108 |
|
|
|
|
|
|
|
| 109 |
with open(args.pos_example_json_file, "r", encoding="utf-8") as f:
|
| 110 |
pos_examples: list = json.load(f)
|
| 111 |
with open(args.srl_example_json_file, "r", encoding="utf-8") as f:
|
|
@@ -200,6 +232,47 @@ def main():
|
|
| 200 |
fn=run_srl,
|
| 201 |
)
|
| 202 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
with gr.TabItem("shell"):
|
| 204 |
shell_text = gr.Textbox(label="cmd")
|
| 205 |
shell_button = gr.Button("run")
|
|
|
|
| 21 |
import gradio as gr
|
| 22 |
|
| 23 |
from toolbox.os.command import Command
|
| 24 |
+
from toolbox.named_entity_recognization.named_entity_recognization import (
|
| 25 |
+
language_to_engines as ner_language_to_engines,
|
| 26 |
+
engine_to_tagger as ner_engine_to_tagger,
|
| 27 |
+
ner
|
| 28 |
+
)
|
| 29 |
from toolbox.part_of_speech.part_of_speech import (
|
| 30 |
language_to_engines as pos_language_to_engines,
|
| 31 |
engine_to_tagger as pos_engine_to_tagger,
|
|
|
|
| 43 |
def get_args():
|
| 44 |
parser = argparse.ArgumentParser()
|
| 45 |
|
| 46 |
+
parser.add_argument(
|
| 47 |
+
"--ner_example_json_file",
|
| 48 |
+
default=(project_path / "ner_examples.json").as_posix(),
|
| 49 |
+
type=str
|
| 50 |
+
)
|
| 51 |
parser.add_argument(
|
| 52 |
"--pos_example_json_file",
|
| 53 |
default=(project_path / "pos_examples.json").as_posix(),
|
|
|
|
| 62 |
return args
|
| 63 |
|
| 64 |
|
| 65 |
+
def run_ner(text: str, language: str, engine: str) -> str:
    """Run named entity recognition and format the outcome as display text.

    Args:
        text: input passed to ``ner``.
        language: language name passed to ``ner``.
        engine: engine name passed to ``ner``.

    Returns:
        On success, every ``word/tag`` pair followed by a tab character, then
        a blank line and ``time_cost: <seconds>`` rounded to 4 decimals.
        On failure, the exception type and message separated by a newline.
    """
    try:
        main_logger.info(f"ner started. text: {text}, language: {language}, engine: {engine}")

        begin = time.time()

        # ner() also returns part-of-speech tags; they are not shown in the output.
        words, _postags, ner_tags = ner(text, language, engine)
        tagged = "".join(f"{word}/{ner_tag}\t" for word, ner_tag in zip(words, ner_tags))

        time_cost = time.time() - begin
        return f"{tagged}\n\ntime_cost: {round(time_cost, 4)}"
    except Exception as e:
        # Boundary handler: the error is reported back as text, and the
        # traceback is logged so the failure is not lost.
        main_logger.exception("ner failed.")
        return f"{type(e)}\n{str(e)}"
|
| 83 |
+
|
| 84 |
+
|
| 85 |
def run_pos_tag(text: str, language: str, engine: str) -> str:
|
| 86 |
try:
|
| 87 |
main_logger.info(f"pos tag started. text: {text}, language: {language}, engine: {engine}")
|
|
|
|
| 136 |
def main():
|
| 137 |
args = get_args()
|
| 138 |
|
| 139 |
+
with open(args.ner_example_json_file, "r", encoding="utf-8") as f:
|
| 140 |
+
ner_examples: list = json.load(f)
|
| 141 |
with open(args.pos_example_json_file, "r", encoding="utf-8") as f:
|
| 142 |
pos_examples: list = json.load(f)
|
| 143 |
with open(args.srl_example_json_file, "r", encoding="utf-8") as f:
|
|
|
|
| 232 |
fn=run_srl,
|
| 233 |
)
|
| 234 |
|
| 235 |
+
with gr.TabItem("ner"):
|
| 236 |
+
def ner_get_languages_by_engine(engine: str):
    """Build the language dropdown for the given engine.

    Args:
        engine: engine name to look for in ``ner_language_to_engines``.

    Returns:
        A ``gr.Dropdown`` labelled "language" whose choices are the keys of
        ``ner_language_to_engines`` whose engine list contains ``engine``.
        The first choice is preselected; no value is preselected when no
        language lists the engine.
    """
    language_list = [
        language
        for language, engines in ner_language_to_engines.items()
        if engine in engines
    ]
    # Guard the empty case: indexing [0] on an empty list would raise
    # IndexError inside the callback.
    default_language = language_list[0] if language_list else None
    return gr.Dropdown(choices=language_list, value=default_language, label="language")
|
| 242 |
+
|
| 243 |
+
ner_language_choices = list(ner_language_to_engines.keys())
|
| 244 |
+
ner_engine_choices = list(ner_engine_to_tagger.keys())
|
| 245 |
+
|
| 246 |
+
ner_text = gr.Textbox(value="学而时习之,不亦悦乎。", lines=4, max_lines=50, label="text")
|
| 247 |
+
|
| 248 |
+
with gr.Row():
|
| 249 |
+
ner_language = gr.Dropdown(
|
| 250 |
+
choices=ner_language_choices, value=ner_language_choices[0],
|
| 251 |
+
label="language"
|
| 252 |
+
)
|
| 253 |
+
ner_engine = gr.Dropdown(
|
| 254 |
+
choices=ner_engine_choices, value=ner_engine_choices[0],
|
| 255 |
+
label="engine"
|
| 256 |
+
)
|
| 257 |
+
ner_engine.change(
|
| 258 |
+
ner_get_languages_by_engine,
|
| 259 |
+
inputs=[ner_engine],
|
| 260 |
+
outputs=[ner_language],
|
| 261 |
+
)
|
| 262 |
+
ner_output = gr.Textbox(lines=4, max_lines=50, label="output")
|
| 263 |
+
# Button that triggers run_ner; the label names the action of this "ner" tab.
ner_button = gr.Button(value="ner", variant="primary")
|
| 264 |
+
ner_button.click(
|
| 265 |
+
run_ner,
|
| 266 |
+
inputs=[ner_text, ner_language, ner_engine],
|
| 267 |
+
outputs=[ner_output],
|
| 268 |
+
)
|
| 269 |
+
gr.Examples(
|
| 270 |
+
examples=ner_examples,
|
| 271 |
+
inputs=[ner_text, ner_language, ner_engine],
|
| 272 |
+
outputs=[ner_output],
|
| 273 |
+
fn=run_ner,
|
| 274 |
+
)
|
| 275 |
+
|
| 276 |
with gr.TabItem("shell"):
|
| 277 |
shell_text = gr.Textbox(label="cmd")
|
| 278 |
shell_button = gr.Button("run")
|
ner_examples.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
["元芳你怎么看?我就趴窗口上看呗!", "chinese", "pyltp"]
|
| 3 |
+
]
|
toolbox/named_entity_recognization/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
if __name__ == "__main__":
|
| 6 |
+
pass
|
toolbox/named_entity_recognization/named_entity_recognization.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
from typing import Callable, Dict, List, Tuple, Union
|
| 4 |
+
|
| 5 |
+
from toolbox.named_entity_recognization.pyltp_ner import pyltp_ner
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
language_to_engines = {
|
| 9 |
+
"chinese": ["pyltp"]
|
| 10 |
+
}
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
engine_to_tagger: Dict[str, Callable] = {
|
| 14 |
+
"pyltp": pyltp_ner
|
| 15 |
+
}
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def ner(text: str, language: str, engine: str):
    """Dispatch named entity recognition to the tagger registered for ``engine``.

    Args:
        text: input forwarded to the tagger.
        language: language name forwarded to the tagger.
        engine: key looked up in ``engine_to_tagger``.

    Returns:
        The tuple ``(words, postags, ner_tags)`` unpacked from the tagger's result.

    Raises:
        AssertionError: if no tagger is registered for ``engine``.
    """
    tagger = engine_to_tagger.get(engine)
    if tagger is not None:
        # Unpack before returning so a result that is not exactly three
        # items fails here rather than in the caller.
        words, postags, ner_tags = tagger(text, language)
        return words, postags, ner_tags

    raise AssertionError(f"engine {engine} not supported.")
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
if __name__ == "__main__":
|
| 28 |
+
pass
|
toolbox/named_entity_recognization/pyltp_ner.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
from functools import lru_cache
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
ltp_data_dir = os.environ.get("LTP_DATA_DIR")
|
| 7 |
+
|
| 8 |
+
from pyltp import Postagger, Segmentor, NamedEntityRecognizer
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@lru_cache(maxsize=5)
def get_pyltp_srl_tagger():
    """Load the pyltp segmentor, POS tagger and named entity recognizer.

    The model files ``cws.model``, ``pos.model`` and ``ner.model`` are looked
    up in the module-level ``ltp_data_dir``, which is read from the
    ``LTP_DATA_DIR`` environment variable. The result is memoized by
    ``lru_cache``, so repeated calls reuse the same instances.

    Returns:
        Tuple ``(segmentor, pos_tagger, recognizer)``.

    Raises:
        RuntimeError: if ``ltp_data_dir`` is ``None`` (variable not set).
    """
    # Fail with an actionable message instead of the TypeError that
    # os.path.join raises when it is handed None.
    if ltp_data_dir is None:
        raise RuntimeError("LTP_DATA_DIR environment variable is not set.")

    cws_model_path = os.path.join(ltp_data_dir, "cws.model")
    pos_model_path = os.path.join(ltp_data_dir, "pos.model")
    ner_model_path = os.path.join(ltp_data_dir, "ner.model")

    segmentor = Segmentor(cws_model_path)
    pos_tagger = Postagger(pos_model_path)
    recognizer = NamedEntityRecognizer(ner_model_path)

    return segmentor, pos_tagger, recognizer
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def pyltp_ner(text: str, language: str) -> tuple:
    """Segment, POS-tag and NER-tag ``text`` with the cached pyltp models.

    Args:
        text: sentence to analyse.
        language: accepted as part of the tagger signature; not used by this
            implementation.

    Returns:
        The 3-tuple ``(words, postags, ner_tags)``: the segmentor output, the
        POS tagger output for those words, and the recognizer output for the
        words and their POS tags.
    """
    segmentor, pos_tagger, recognizer = get_pyltp_srl_tagger()

    # Each stage consumes the output of the previous one.
    words = segmentor.segment(text)
    postags = pos_tagger.postag(words)
    ner_tags = recognizer.recognize(words, postags)

    return words, postags, ner_tags
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
if __name__ == "__main__":
|
| 37 |
+
pass
|
toolbox/sementic_role_labeling/pyltp_srl.py
CHANGED
|
@@ -14,14 +14,14 @@ from pyltp import Parser, Postagger, Segmentor, SementicRoleLabeller
|
|
| 14 |
def get_pyltp_srl_tagger():
|
| 15 |
global ltp_data_dir
|
| 16 |
|
| 17 |
-
cws_model_path = os.path.join(ltp_data_dir,
|
| 18 |
-
pos_model_path = os.path.join(ltp_data_dir,
|
| 19 |
-
parser_model_path = os.path.join(ltp_data_dir,
|
| 20 |
|
| 21 |
-
if platform.system() ==
|
| 22 |
-
srl_model_path = os.path.join(ltp_data_dir,
|
| 23 |
else:
|
| 24 |
-
srl_model_path = os.path.join(ltp_data_dir,
|
| 25 |
|
| 26 |
segmentor = Segmentor(cws_model_path)
|
| 27 |
pos_tagger = Postagger(pos_model_path)
|
|
|
|
| 14 |
def get_pyltp_srl_tagger():
|
| 15 |
global ltp_data_dir
|
| 16 |
|
| 17 |
+
cws_model_path = os.path.join(ltp_data_dir, "cws.model")
|
| 18 |
+
pos_model_path = os.path.join(ltp_data_dir, "pos.model")
|
| 19 |
+
parser_model_path = os.path.join(ltp_data_dir, "parser.model")
|
| 20 |
|
| 21 |
+
if platform.system() == "Windows":
|
| 22 |
+
srl_model_path = os.path.join(ltp_data_dir, "pisrl_win.model")
|
| 23 |
else:
|
| 24 |
+
srl_model_path = os.path.join(ltp_data_dir, "pisrl.model")
|
| 25 |
|
| 26 |
segmentor = Segmentor(cws_model_path)
|
| 27 |
pos_tagger = Postagger(pos_model_path)
|